#!/usr/bin/python3
"""Nagios-style SMART health check for SATA/SAS/NVMe disks (check_a2_smart.py).

For every matching block device the script runs ``smartctl -a``, reads the
attributes that matter for the detected model and prints one
``Check_Smart_<device>`` status line.  SSD-class disks are additionally
compared with their RAID partner (taken from ``mdadm --detail``) so that two
mirrored drives wearing out together are escalated earlier than a single
worn drive.

The process exit status is the highest severity found:
0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
"""
import subprocess
from shlex import split
import os
import glob
import re
import sys
import time
import socket

exit_code = 0  # Default OK

# Global variables
# Raw Wear_Level readings of the '860 PRO 1TB'.  The CRIT value is treated as
# "0% life left" when the raw reading is normalised; with the percentage
# thresholds below, the WARN value (5500) works out to roughly 4% remaining.
WEAR_WARN_860_PRO_1TB = 5500
WEAR_CRIT_860_PRO_1TB = 5750
# Alert warning if single drive wear below this:
WEAR_WARN = 5
# Alert critical if single drive wear below this:
WEAR_CRIT = 2
# Alert warning if both drives in a pair below this:
PAIR_WEAR_WARN = 10
# Alert critical if both drives in a pair below this:
PAIR_WEAR_CRIT = 5
SSDREMAP = 400
REMAP_WARN = 5
REMAP_CRIT = 2
# Wait this many seconds before retrying a failed smartctl:
FAIL_DELAY = 2
# How many times smartctl is attempted before giving up on a device:
SMART_MAX_TRIES = 5
# smartctl messages that mean "no usable answer yet, try again":
SMART_RETRY_MARKERS = (
    "SMART Disabled",
    "A mandatory SMART command failed",
    "Read SMART Data failed",
)
# Deliberately out-of-bounds value stored when an attribute could not be read:
CHECK_FAILED = 9000

# Model prefixes of spinning disks, which key on pending sectors and remap count.
SEAGATE_HDD_PREFIXES = ('ST1', 'ST3', 'ST4', 'ST5', 'ST8', 'ST9', 'ST2000')
SPINNING_PREFIXES = ('WDC', 'TOSHIBA', 'GB1000') + SEAGATE_HDD_PREFIXES

# SSD-class disks collected so far; each entry is a dict with the keys
# 'device', 'model', 'wear' and 'rsvd' (RAID and status keys are added later).
disk_list = []

# Path of the smartctl binary.  Set by main(); get_smart_out() falls back to
# it when no explicit path is passed.
smart = None


def shell(cmd, outform="short", input=""):
    """Run *cmd* through ``bash -c`` and return its standard output as text.

    cmd     -- shell command line (may contain pipes).
    outform -- "long" returns the whole output; anything else returns only
               the first line.
    input   -- text fed to the command's standard input.

    Standard error is not captured, so it goes wherever the script's own
    stderr goes.
    """
    proc = subprocess.Popen(['bash', '-c', cmd],
                            stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    output, _ = proc.communicate(input=input.encode())
    # 'replace' keeps an odd vendor byte from aborting the whole check.
    text = output.decode('utf-8', errors='replace')
    if outform != "long":
        return text.split('\n')[0]
    return text


def smart_field(pipeline, smart_out):
    """Filter *smart_out* through the grep/awk *pipeline*; return the first line."""
    return str(shell(pipeline, input=smart_out))


def get_smart_out(dev, smartctl=None):
    """Return the ``smartctl -a`` output for *dev*, retrying transient failures.

    dev      -- device node, e.g. '/dev/sda'.
    smartctl -- path of the smartctl binary; defaults to the module-level
                ``smart`` set by main().

    Returns "" when no valid output was obtained after SMART_MAX_TRIES
    attempts.  An empty string ends up as a critical "Unrecognized model"
    error, which gets the drive the attention it needs if it cannot answer
    SMART queries at all.
    """
    if smartctl is None:
        smartctl = smart
    for attempt in range(SMART_MAX_TRIES):
        smart_out = str(shell(smartctl + ' -a ' + dev, outform="long"))
        if not any(marker in smart_out for marker in SMART_RETRY_MARKERS):
            return smart_out
        if attempt < SMART_MAX_TRIES - 1:
            time.sleep(FAIL_DELAY)  # and try again
    return ""


def normalize_860_pro_wear(raw_wear):
    """Convert a raw 860 PRO Wear_Level reading into percent life remaining.

    WEAR_CRIT_860_PRO_1TB raw counts as 0% left, 0 raw as 100% left.  The
    result is clamped at 0 so that an over-worn drive does not report a
    negative percentage.
    """
    remaining = 100 - (raw_wear / WEAR_CRIT_860_PRO_1TB) * 100
    return max(remaining, 0)


def int_or_failed(text):
    """Return int(text), or CHECK_FAILED when *text* is not a plain number."""
    return int(text) if text.isdigit() else CHECK_FAILED


def check_sas_disk(device, smart_out):
    """Print the status line of a SAS disk and return its severity (0 or 2).

    Raises ValueError when one of the counters is missing from *smart_out*.
    """
    defectlist = int(smart_field('grep -w "Elements in grown defect list:" | awk \'{ print $6 }\'', smart_out))
    read_error = int(smart_field('grep -w "read:" | awk \'{ print $8 }\'', smart_out))
    write_error = int(smart_field('grep -w "write:" | awk \'{ print $8 }\'', smart_out))
    verify_error = int(smart_field('grep -w "verify:" | awk \'{ print $8 }\'', smart_out))
    if max(defectlist, read_error, write_error, verify_error) > 20:
        print("Check_Smart_" + device + " - CRITICAL - " + device +
              ", Elements in defect list=" + str(defectlist) +
              ", Uncorrected read errors=" + str(read_error) +
              ", Uncorrected write errors=" + str(write_error) +
              ", Uncorrected verify errors=" + str(verify_error))
        return 2
    print("Check_Smart_" + device + " - OK")
    return 0


def check_remap_pending(device, smart_out, remap_attr, pend_attr):
    """Print the status line of a disk judged on remapped/pending counters.

    remap_attr / pend_attr -- names of the SMART attributes whose raw value
    (column 10) is read.  More than 50 remaps or any pending value is
    CRITICAL.  Returns the severity (0 or 2); raises ValueError when an
    attribute is missing from *smart_out*.
    """
    remap = int(smart_field("grep " + remap_attr + " | awk '{print $10}'", smart_out))
    pend = int(smart_field("grep " + pend_attr + " | awk '{print $10}'", smart_out))
    hours = int(smart_field("grep Power_On_Hours | awk '{print $10}'", smart_out))
    if (remap > 50) or (pend > 0):
        print("Check_Smart_" + device + " - CRITICAL - " + device + " SMART failure Hours=" +
              str(hours) + " Remap=" + str(remap) + " Pending=" + str(pend))
        return 2
    print("Check_Smart_" + device + " - OK - " + device + " clean Hours=" + str(hours))
    return 0


def is_generic_ssd(model):
    """Return True for SSD models whose normalized SMART values are acceptable."""
    if 'SSD' in model and not model.startswith('INTEL SSD'):
        return True
    return model.startswith(('Kingston SKC', 'SAMSUNG MZ7'))


def inspect_disk(device, model, protocol, mode, smart_out):
    """Check one disk according to its model and return the severity found.

    Spinning disks, KINGSTON and SAS disks are judged and printed right away.
    SSD-class disks are only appended to ``disk_list``; they are graded later
    because their verdict also depends on the RAID partner.

    Raises ValueError when a required attribute cannot be parsed as a number.
    """
    severity = 0
    if protocol == 'SAS':
        severity = max(severity, check_sas_disk(device, smart_out))

    if model.startswith(SPINNING_PREFIXES):
        severity = max(severity, check_remap_pending(
            device, smart_out, 'Reallocated_Sector_Ct', 'Current_Pending_Sector'))
    elif model.startswith('KINGSTON'):
        severity = max(severity, check_remap_pending(
            device, smart_out, 'Retired_Block_Count', 'Reported_Uncorrect'))
    elif mode == 'nvme':
        # Normalize wear to mean life remaining, like is true for SATA
        used = int(smart_field("grep 'Percentage Used' | awk '{print $3}' | cut -d '%' -f1", smart_out))
        # No rsvd block count exposed for NVMe, so put a 0 which is always
        # less than the threshold for SATA disks
        disk_list.append({'device': device, 'wear': 100 - used, 'model': model, 'rsvd': 0})
    elif '860 PRO 1TB' in model:
        # SSD relying on raw data due to normalized smartctl output data being
        # too conservative.  Tests wear level and thus cares about raid locality
        raw_wear = int(smart_field("grep Wear_Level | awk '{print $10}'", smart_out))
        rsvd = int(smart_field("grep Used_Rsvd | awk '{print $10}'", smart_out))
        disk_list.append({'device': device, 'model': model,
                          'wear': normalize_860_pro_wear(raw_wear), 'rsvd': rsvd})
    elif is_generic_ssd(model):
        # Other SSD models that have acceptable SMART values.  A failed read
        # is stored as CHECK_FAILED instead of raising.
        wear = int_or_failed(smart_field("grep Wear_Level | awk '{print $4}'", smart_out))
        rsvd = int_or_failed(smart_field("grep Used_Rsvd | awk '{print $10}'", smart_out))
        disk_list.append({'device': device, 'model': model, 'wear': wear, 'rsvd': rsvd})
    elif 'Micron_5300' in model or 'Micron_1100' in model:
        wear = int(smart_field("grep Percent_Lifetime_Remain | awk '{print $4}'", smart_out))
        rsvd = int(smart_field(
            "grep -e Unused_Rsvd_Blk_Cnt_Tot -e Reallocate_NAND_Blk_Cnt | awk '{print $10}'", smart_out))
        disk_list.append({'device': device, 'model': model, 'wear': wear, 'rsvd': rsvd})
    elif 'INTEL' in model:
        wear = int(smart_field("grep Media_Wearout_Indicator | awk '{print $4}'", smart_out))
        rsvd = int(smart_field("grep Available_Reservd_Space | awk '{print $10}'", smart_out))
        disk_list.append({'device': device, 'model': model, 'wear': wear, 'rsvd': rsvd})
    elif protocol == '':
        # if protocol isnt SAS and no models are matched above, error
        print("Check_Smart_" + device + " - ERROR: Unrecognized model: " + model)
        severity = max(severity, 2)
    return severity


def grade_disk(disk):
    """Store 'status' and 'warn_type' on *disk* and return the status.

    Status is 0 (good), 1 (warn), 2 or more (crit), or CHECK_FAILED when the
    wear value could not be read.
    """
    status = 0
    # Fail if too many remaps.
    if 'Micron_5300' in disk['model']:
        # Micron 5300 is judged against the low-water REMAP_* thresholds.
        if disk['rsvd'] < REMAP_CRIT:
            status += 2
        elif disk['rsvd'] < REMAP_WARN:
            status += 1
    elif disk['rsvd'] > SSDREMAP:
        status += 1

    # Fail independently if too much wear: permits a crit here to override a
    # simple warn from remaps.  Wear values are 99 (best) down to 0 (no
    # predicted write life left), so <= is the proper check.
    if disk['wear'] == CHECK_FAILED:
        status = CHECK_FAILED
    elif disk['wear'] <= WEAR_CRIT:
        status += 2
    elif disk['wear'] <= WEAR_WARN:
        status += 1

    if status == 0:
        warn_type = "OK"
    elif status == CHECK_FAILED:
        warn_type = "UNKNOWN"
    elif status == 1:
        warn_type = "WARNING"
    else:
        warn_type = "CRITICAL"
    disk['status'] = status
    disk['warn_type'] = warn_type
    return status


def status_line(disk, label):
    """Build the status/perfdata line of an SSD-class *disk* ending in *label*."""
    life = str(round(disk['wear']))
    return ("Check_Smart_" + disk['device'] + " wear_life_remaining=" + life +
            ";" + str(WEAR_WARN) + ";" + str(WEAR_CRIT) + " remaining_life=" + life +
            "%" + " remaps=" + str(disk['rsvd']) + " " + disk['device'] + " " + label)


def report_disks(disks, all_disks=None):
    """Grade and print every disk in *disks*; return the highest severity.

    disks     -- SSD-class entries to grade and report.
    all_disks -- entries searched for the RAID partner (defaults to *disks*).

    A disk without a RAID partner is judged on its own values only.
    """
    if all_disks is None:
        all_disks = disks
    for disk in disks:
        grade_disk(disk)

    worst = 0
    for disk in disks:
        status = disk['status']
        # report individual disk health
        if status == CHECK_FAILED:
            # something in check went wrong
            output = ("Check_Smart_" + disk['device'] +
                      " wear_life_remaining=?;?;? remaining_life=?%" +
                      " remaps=? " + disk['device'] + " UNKNOWN")
            severity = 3
        elif status == 0:
            output = status_line(disk, "OK")
            severity = 0
        elif status == 1:
            output = status_line(disk, "WARNING")
            severity = 1
        else:
            output = status_line(disk, "CRITICAL")
            severity = 2

        # now check status of pair partner (None when the disk is not paired)
        partner = find_pair(disk, all_disks)
        if partner:
            if disk['wear'] <= PAIR_WEAR_CRIT and partner['wear'] <= PAIR_WEAR_CRIT:
                # crit even if drives would be individually good or warn.
                output = status_line(disk, "CRITICAL")
                severity = max(severity, 2)
            elif (status < 2 and disk['wear'] <= PAIR_WEAR_WARN
                    and partner['wear'] <= PAIR_WEAR_WARN):
                # warn even if drives would be individually good (but don't
                # downgrade from crit).
                output = status_line(disk, "WARNING")
                severity = max(severity, 1)

        worst = max(worst, severity)
        print(output)
    return worst


# self-contained function to check the disk
# mode = sata or nvme
def disk_check(smart, file_name_pattern, mode):
    """Check every device matching *file_name_pattern* and update exit_code.

    smart             -- path of the smartctl binary.
    file_name_pattern -- glob under /sys/block; the 4th path component is
                         used as the device name.
    mode              -- 'sata' or 'nvme'.

    Only the SSD-class disks found by this call are graded and printed, so
    calling the function once per mode does not report earlier disks twice.
    """
    global exit_code
    first_new = len(disk_list)
    for sys_path in glob.iglob(file_name_pattern):
        device = sys_path.split("/")[3]
        smart_out = get_smart_out('/dev/' + device, smart)
        if mode == 'sata':
            model = smart_field('grep "Device Model" | cut -c 19-', smart_out)
            protocol = smart_field('grep -w "Transport protocol:" | awk \'{ print $3 }\'', smart_out)
        elif mode == 'nvme':
            # NOTE(review): inside double quotes bash expands $4 before awk
            # sees it, so this appears to yield the whole "Model Number" line
            # rather than one column.  Left unchanged on purpose: a bare
            # model string could start matching the spinning-disk prefixes
            # checked before the nvme branch.
            model = smart_field('grep "Model Number" | awk "{print $4}"', smart_out)
            protocol = 'nvme'
        else:
            print("Check_Smart_ " + device + " - ERROR: Unrecognized device mode: " + mode + ".")
            exit_code = max(exit_code, 2)
            return

        try:
            exit_code = max(exit_code, inspect_disk(device, model, protocol, mode, smart_out))
        except ValueError:
            # A required attribute was missing or not numeric: report the
            # device as UNKNOWN and keep checking the remaining disks.
            print("Check_Smart_" + device + " - UNKNOWN - " + device +
                  " SMART attributes could not be parsed")
            exit_code = max(exit_code, 3)
    # end of for looping over the disks

    new_disks = disk_list[first_new:]
    # Fetch RAID info from mdadm about these devices and integrate with the
    # smartctl data
    populate_raid_info(new_disks)
    # Now that health data on all disks are populated, grade each disk and
    # decide whether to alert it as good or bad.
    exit_code = max(exit_code, report_disks(new_disks, disk_list))


# Fetch the list of md arrays from the system and populates devices dictionary with them
# Only the first array reported by mdadm is used to determine which disks are in what sets.
# Area for future improvement: check all arrays instead of just the first, for sanity
# Also, it relies on adjacency to determine set info. In a 4x R10 there are two set-As
# and two set-Bs and it presumes that near=2 is the setting for deciding which to check.
def populate_raid_info(devices):
    """Add 'RaidDevice' (int) and 'set' to each entry of *devices* found in mdadm.

    Entries whose device does not appear in the array, or whose RaidDevice
    column is not a number, are left untouched.
    """
    # shell() in its default "short" form returns the first line only.
    arrays = shell("mdadm --detail --scan")
    for array in arrays.splitlines():
        fields = array.split(' ')
        if len(fields) < 2:
            continue  # not an "ARRAY <device> ..." line
        device = fields[1]
        # Query the array once and filter the same text for every disk.
        detail = shell("mdadm --detail " + device, outform="long")
        for dev in devices:
            raid_device = shell("grep " + dev['device'] + " | awk '{print $4}'", input=detail)
            if raid_device.isdigit():
                dev['RaidDevice'] = int(raid_device)
                dev['set'] = shell("grep " + dev['device'] + " | awk '{print $7}'", input=detail)


# Finds the R10 pair in a set
# Presumes near=2
def find_pair(disk, devices):
    """Return the entry of *devices* paired with *disk*, or None.

    The partner of an even RaidDevice id is id + 1, of an odd one id - 1.
    None is returned when *disk* carries no RAID information or the partner
    is not in *devices*.
    """
    raid_device = disk.get('RaidDevice')
    if raid_device is None or 'set' not in disk:
        return None
    # If even, pair is +1 id
    partner_id = raid_device + 1 if raid_device % 2 == 0 else raid_device - 1
    return fetch_disk_by_id(partner_id, devices)


def fetch_disk_by_id(id, devices):
    """Return the first entry of *devices* whose 'RaidDevice' equals *id*, else None.

    Entries without a 'RaidDevice' key are skipped.
    """
    for d in devices:
        if d.get('RaidDevice') == id:
            return d
    return None


## MAIN CODE
def main():
    """Decide whether this host needs checking, run the checks and exit."""
    global smart

    # Check if the hostname contains "vplatform" - BFENG-1670
    hostname = socket.gethostname()
    if "vplatform" in hostname:
        print("Check_Smart_sda - this is vplatform server - OK")
        sys.exit(0)

    # Let's skip mvps
    grains_role = shell("grep ^[[:space:]].role: /etc/salt/minion | awk '{print $2}'")
    if grains_role == 'mvps':
        sys.exit()

    # determine which disk type the machine uses
    sdx = os.path.isfile("/sys/block/sda/size")
    nvme_x = any(os.path.isfile("/sys/block/nvme" + str(x) + "n1/size") for x in range(6))

    # Fail silently and early out of devices that lack both. These would be VMs with
    # xvda and such, which ought to neither have SMARTmontools nor physical disks to check
    if not sdx and not nvme_x:
        sys.exit()

    # check for smartmontools
    smart = shell('which smartctl')
    if not smart:
        print("Check_Smart_sda - ERROR: Unable to detect smartmontools. Is it installed?")
        sys.exit(2)

    # execute appropriate check
    if sdx:
        disk_check(smart, '/sys/block/sd?', 'sata')
    if nvme_x:
        disk_check(smart, '/sys/block/nvme?n1/nvme?n?p1', 'nvme')

    # Exit with the highest severity discovered (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()