#!/usr/bin/python
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# |             ____ _               _        __  __ _  __           |
# |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
# |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
# |           | |___| | | |  __/ (__|   <    | |  | | . \            |
# |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
# |                                                                  |
# | Copyright Mathias Kettner 2012             mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software;  you can redistribute it and/or modify it
# under the  terms of the  GNU General Public License  as published by
# the Free Software Foundation in version 2.  check_mk is  distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
# out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
# PARTICULAR PURPOSE. See the  GNU General Public License for more de-
# tails. You should have  received  a copy of the  GNU  General Public
# License along with GNU Make; see the file  COPYING.  If  not,  write
# to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
# Boston, MA 02110-1301 USA.

#######################################
# Check developed by
#######################################
# Dr. Markus Hillenbrand
# University of Kaiserslautern, Germany
# hillenbr@rhrk.uni-kl.de
#######################################

# Columns of one agent line, as they are used by the functions below:
#   0  GPU index (the check item is "GPU" + index)
#   1  GPU name (only used in the check output)
#   2  fan speed in percent
#   3  GPU utilization in percent
#   4  memory utilization in percent
#   5  single bit error counter
#   6  double bit error counter
#   7  temperature in degrees Celsius
#   8  power draw
#   9  power limit (printed as W in the check output)
# A column holding 'N/A' means the GPU does not report that value.


# shared helpers

def _nvidia_smi_inventory(info, columns):
    """Build the inventory for every GPU that reports all of *columns*.

    info    -- list of agent lines, each a list of column strings
    columns -- column indexes that must be present and not 'N/A'

    Returns a list of (item, "", None) tuples. Lines that are too short to
    contain the requested columns are skipped instead of raising IndexError.
    """
    needed = max(columns) + 1
    found = []
    for line in info:
        if len(line) < needed:
            continue  # truncated agent line, nothing to inventorize
        if 'N/A' in [line[col] for col in columns]:
            continue
        found.append(("GPU" + line[0], "", None))
    return found


def _nvidia_smi_lookup(item, info, columns, convert=int):
    """Find the agent line of *item* and convert the requested columns.

    item    -- check item, i.e. "GPU" + GPU index
    info    -- list of agent lines, each a list of column strings
    columns -- column indexes to read
    convert -- callable turning a column string into a number

    Returns (gpu_name, values, None) on success. Otherwise returns
    (None, None, result) where result is a complete UNKNOWN check result:
    either the item is missing from the agent output, or its line is too
    short / holds a value that cannot be converted (e.g. 'N/A').
    Only the first line matching *item* is considered.
    """
    for line in info:
        if not line or "GPU" + line[0] != item:
            continue
        try:
            name = line[1]
            values = [convert(line[col]) for col in columns]
        except (IndexError, ValueError):
            return None, None, (
                3, "UNKNOWN - %s reports no usable value in agent output" % item)
        return name, values, None
    # item already carries the "GPU" prefix, so it is not repeated here
    return None, None, (3, "UNKNOWN - %s not found in agent output" % item)


def _nvidia_smi_level(value, warn, crit):
    """Map *value* to (state, label); levels are exceeded with a strict '>'."""
    if value > crit:
        return 2, "CRITICAL"
    if value > warn:
        return 1, "WARNING"
    return 0, "OK"


# the inventory functions

def inventory_nvidia_smi_fan(info):
    """Inventorize all GPUs that report a fan speed (column 2)."""
    return _nvidia_smi_inventory(info, [2])


def inventory_nvidia_smi_gpuutil(info):
    """Inventorize all GPUs that report a GPU utilization (column 3)."""
    return _nvidia_smi_inventory(info, [3])


def inventory_nvidia_smi_memutil(info):
    """Inventorize all GPUs that report a memory utilization (column 4)."""
    return _nvidia_smi_inventory(info, [4])


def inventory_nvidia_smi_errors1(info):
    """Inventorize all GPUs that report a single bit error counter (column 5)."""
    return _nvidia_smi_inventory(info, [5])


def inventory_nvidia_smi_errors2(info):
    """Inventorize all GPUs that report a double bit error counter (column 6)."""
    return _nvidia_smi_inventory(info, [6])


def inventory_nvidia_smi_temp(info):
    """Inventorize all GPUs that report a temperature (column 7)."""
    return _nvidia_smi_inventory(info, [7])


def inventory_nvidia_smi_power(info):
    """Inventorize all GPUs that report power draw and power limit (columns 8, 9)."""
    return _nvidia_smi_inventory(info, [8, 9])


# the check functions
#
# All check functions take (item, params, info). params is not used: the
# levels are fixed in the code. They return (state, text) or
# (state, text, perfdata) with state 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN.

def check_nvidia_smi_fan(item, params, info):
    """Check the fan speed: WARNING above 90%, CRITICAL above 95%."""
    name, values, error = _nvidia_smi_lookup(item, info, [2])
    if error:
        return error
    value = values[0]
    state, label = _nvidia_smi_level(value, 90, 95)
    perfdata = [('fan', value, 90, 95, 0, 100)]
    return (state, "%s - %s fan speed is %d%%" % (label, name, value), perfdata)


def check_nvidia_smi_gpuutil(item, params, info):
    """Report the GPU utilization; this check never leaves the OK state."""
    name, values, error = _nvidia_smi_lookup(item, info, [3])
    if error:
        return error
    value = values[0]
    perfdata = [('gpuutil', value, 100, 100, 0, 100)]
    return (0, "OK - %s utilization is %s%%" % (name, value), perfdata)


def check_nvidia_smi_memutil(item, params, info):
    """Check the memory utilization: WARNING above 90%, CRITICAL above 95%."""
    name, values, error = _nvidia_smi_lookup(item, info, [4])
    if error:
        return error
    value = values[0]
    state, label = _nvidia_smi_level(value, 90, 95)
    # publish the levels that are really applied (formerly 100/100)
    perfdata = [('memutil', value, 90, 95, 0, 100)]
    return (state, "%s - %s memory utilization is %d%%" % (label, name, value), perfdata)


def check_nvidia_smi_errors1(item, params, info):
    """Check the single bit error counter: WARNING above 100, CRITICAL above 500."""
    name, values, error = _nvidia_smi_lookup(item, info, [5])
    if error:
        return error
    value = values[0]
    state, label = _nvidia_smi_level(value, 100, 500)
    return (state, "%s - %s single bit error counter is %d" % (label, name, value))


def check_nvidia_smi_errors2(item, params, info):
    """Check the double bit error counter: WARNING above 100, CRITICAL above 500."""
    name, values, error = _nvidia_smi_lookup(item, info, [6])
    if error:
        return error
    value = values[0]
    state, label = _nvidia_smi_level(value, 100, 500)
    return (state, "%s - %s double bit error counter is %d" % (label, name, value))


def check_nvidia_smi_temp(item, params, info):
    """Check the temperature: WARNING above 80°C, CRITICAL above 90°C."""
    name, values, error = _nvidia_smi_lookup(item, info, [7])
    if error:
        return error
    value = values[0]
    state, label = _nvidia_smi_level(value, 80, 90)
    perfdata = [('temp', value, 80, 90, 0, 95)]
    return (state, "%s - %s temperature is %d°C" % (label, name, value), perfdata)


def check_nvidia_smi_power(item, params, info):
    """Check the power draw relative to the power limit.

    WARNING above 80% of the limit, CRITICAL above 90%. The perfdata carries
    the absolute draw together with the levels derived from the limit.
    """
    name, values, error = _nvidia_smi_lookup(item, info, [8, 9], float)
    if error:
        return error
    draw, limit = values
    if limit <= 0:
        # a percentage of a zero limit would raise ZeroDivisionError
        return (3, "UNKNOWN - %s reports an invalid power limit of %sW" % (name, limit))
    value = draw * 100.0 / limit
    state, label = _nvidia_smi_level(value, 80, 90)
    perfdata = [('power', draw, limit * 0.8, limit * 0.9, 0, limit)]
    return (state,
            "%s - %s power utilization is %d%% of %dW" % (label, name, value, limit),
            perfdata)


# declare the check to Check_MK

check_info['nvidia_smi.fan']     = (check_nvidia_smi_fan,     "%s fan speed",     1, inventory_nvidia_smi_fan)
check_info['nvidia_smi.gpuutil'] = (check_nvidia_smi_gpuutil, "%s utilization",   1, inventory_nvidia_smi_gpuutil)
check_info['nvidia_smi.memutil'] = (check_nvidia_smi_memutil, "%s memory",        1, inventory_nvidia_smi_memutil)
check_info['nvidia_smi.errors1'] = (check_nvidia_smi_errors1, "%s errors single", 0, inventory_nvidia_smi_errors1)
check_info['nvidia_smi.errors2'] = (check_nvidia_smi_errors2, "%s errors double", 0, inventory_nvidia_smi_errors2)
check_info['nvidia_smi.temp']    = (check_nvidia_smi_temp,    "%s temperature",   1, inventory_nvidia_smi_temp)
check_info['nvidia_smi.power']   = (check_nvidia_smi_power,   "%s power",         1, inventory_nvidia_smi_power)

checkgroup_of['nvidia_smi.errors1'] = 'hw_errors'
checkgroup_of['nvidia_smi.errors2'] = 'hw_errors'