Edit file

File name : check_a2_smart.py.all

Content :

#!/usr/bin/python3
import subprocess
from shlex import split
import os
import glob
import re
import sys
import time
import socket

# Highest severity seen so far; the check functions raise it and the script
# finally exits with it.
exit_code = 0  # Default OK

# Check if the hostname contains "vplatform" - BFENG-1670
# Such hosts are reported OK without running any disk checks.
hostname = socket.gethostname()
if "vplatform" in hostname:
    print("Check_Smart_sda - this is vplatform server - OK")
    # sys.exit() rather than the site-provided exit(), which is meant for
    # interactive sessions and is absent when the site module is not loaded.
    sys.exit(0)

# Global variables
# Raw wear-level figures for '860 PRO 1TB' drives. disk_check divides the raw
# Wear_Level reading by WEAR_CRIT_860_PRO_1TB when normalizing wear.
# WEAR_WARN_860_PRO_1TB is not referenced in the code visible here.
WEAR_WARN_860_PRO_1TB = 5500
WEAR_CRIT_860_PRO_1TB = 5750
# Alert warning if a single drive's remaining wear is at or below this
# (compared with <=):
WEAR_WARN = 5
# Alert critical if a single drive's remaining wear is at or below this
# (compared with <=):
WEAR_CRIT = 2
# Alert warning if both drives in a pair are at or below this:
PAIR_WEAR_WARN = 10
# Alert critical if both drives in a pair are at or below this:
PAIR_WEAR_CRIT = 5
# For disks other than Micron_5300: an 'rsvd' value above this adds one
# warning point to the disk's status.
SSDREMAP = 400
# For Micron_5300 disks: an 'rsvd' value below REMAP_WARN adds a warning
# point, below REMAP_CRIT adds a critical (two points).
REMAP_WARN = 5
REMAP_CRIT = 2
# Wait this many seconds before retrying a failed smartctl:
FAIL_DELAY = 2

# Per-disk dicts ('device', 'model', 'wear', 'rsvd', ...) appended by
# disk_check for drives that are graded on wear.
disk_list = []

def shell(cmd, outform="short", input=""):
    """Run ``cmd`` through ``bash -c`` and return its standard output as text.

    ``input`` is encoded and fed to the command's standard input. With
    ``outform="long"`` the complete output is returned; with any other value
    only the text before the first newline is returned. Standard error is
    not captured.
    """
    completed = subprocess.run(
        ['bash', '-c', cmd],
        input=input.encode(),
        stdout=subprocess.PIPE,
    )
    text = completed.stdout.decode('utf-8')
    if outform == "long":
        return text
    first_line, _, _ = text.partition('\n')
    return first_line

def get_smart_out(dev):
    """Return the ``smartctl -a`` output for ``dev``, retrying transient failures.

    The command is built from the module-level ``smart`` path. Output that
    contains one of the known failure messages is treated as a failed read
    and retried after ``FAIL_DELAY`` seconds, up to five attempts in total.

    Returns the full smartctl output, or an empty string when every attempt
    failed. Callers rely on the empty string to flag a drive that cannot
    answer SMART queries at all.
    """
    failure_markers = (
        "SMART Disabled",
        "A mandatory SMART command failed",
        "Read SMART Data failed",
    )
    max_attempts = 5

    for attempt in range(max_attempts):
        smart_out = str(shell(smart + ' -a  ' + dev, outform="long"))
        if not any(marker in smart_out for marker in failure_markers):
            # We have valid SMART output
            return smart_out
        # Wait before the next try; no point sleeping after the last one.
        if attempt < max_attempts - 1:
            time.sleep(FAIL_DELAY)

    # still no valid SMART output after 5 tries!
    return ""

# Sentinel stored in 'wear'/'rsvd'/'status' when a SMART value could not be read.
_CHECK_FAILED = 9000

# Model prefixes of drives judged on remapped/pending sector counts
# (WDC, TOSHIBA, the Seagate ST* families and GB1000).
_SECTOR_COUNT_MODEL_PREFIXES = (
    'WDC', 'TOSHIBA',
    'ST1', 'ST3', 'ST4', 'ST5', 'ST8', 'ST9', 'ST2000',
    'GB1000',
)


def _raise_exit_code(level):
    """Raise the module-level exit code to ``level`` if it is currently lower."""
    global exit_code
    exit_code = max(exit_code, level)


def _report_sas_health(device, smart_out):
    """Print OK/CRITICAL for a SAS drive from its defect list and error counters."""
    defectlist = int(shell('grep -w "Elements in grown defect list:" | awk \'{ print $6 }\'', input=smart_out))
    read_error = int(shell('grep -w "read:" | awk \'{ print $8 }\'', input=smart_out))
    write_error = int(shell('grep -w "write:" | awk \'{ print $8 }\'', input=smart_out))
    verify_error = int(shell('grep -w "verify:" | awk \'{ print $8 }\'', input=smart_out))

    if max(defectlist, read_error, write_error, verify_error) > 20:
        print("Check_Smart_{0} - CRITICAL - {0}, Elements in defect list={1}"
              ", Uncorrected read errors={2}, Uncorrected write errors={3}"
              ", Uncorrected verify errors={4}".format(
                  device, defectlist, read_error, write_error, verify_error))
        _raise_exit_code(2)
    else:
        print("Check_Smart_" + device + " - OK")


def _report_sector_health(device, smart_out, remap_attr, pending_attr):
    """Print OK/CRITICAL for a drive judged on two SMART raw counters.

    ``remap_attr`` and ``pending_attr`` name the SMART attributes whose raw
    value (column 10) is read. More than 50 of the former or any of the
    latter is critical.
    """
    remap = int(shell("grep " + remap_attr + " | awk '{print $10}'", input=smart_out))
    pend = int(shell("grep " + pending_attr + " | awk '{print $10}'", input=smart_out))
    hours = int(shell("grep Power_On_Hours | awk '{print $10}'", input=smart_out))

    if remap > 50 or pend > 0:
        print("Check_Smart_{0} - CRITICAL - {0} SMART failure Hours={1} Remap={2} Pending={3}".format(
            device, hours, remap, pend))
        _raise_exit_code(2)
    else:
        print("Check_Smart_{0} - OK - {0} clean Hours={1}".format(device, hours))


def _int_or_sentinel(text):
    """Return ``text`` as an int, or the out-of-bounds sentinel if it is not all digits."""
    return int(text) if text.isdigit() else _CHECK_FAILED


def _collect_wear_entry(device, model, mode, smart_out):
    """Build the wear/remap record for an SSD or NVMe drive.

    Returns a dict with 'device', 'model', 'wear' (remaining life, higher is
    better) and 'rsvd', or None when the model matches no known family.
    """
    if mode == 'nvme':
        # Normalize wear to mean life remaining, like is true for SATA.
        # An unreadable value becomes the sentinel so the drive is reported
        # UNKNOWN instead of aborting the whole check.
        used = str(shell("grep 'Percentage Used' | awk '{print $3}' | cut -d '%' -f1", input=smart_out))
        wear = 100 - int(used) if used.isdigit() else _CHECK_FAILED
        # No rsvd block count exposed for NVMe, so put a 0 which is always
        # less than the threshold for SATA disks
        rsvd = 0

    elif '860 PRO 1TB' in model:
        # Relies on raw data because the normalized smartctl value is too
        # conservative for this model.
        raw_wear = int(shell("grep Wear_Level | awk '{print $10}'", input=smart_out))
        rsvd = int(shell("grep Used_Rsvd | awk '{print $10}'", input=smart_out))
        # Normalize manually: percentage of the critical raw count still
        # unused, floored at 0 once the raw count passes it.
        wear = max(0.0, 100 - (raw_wear * 100 / WEAR_CRIT_860_PRO_1TB))

    # Other SSD models that have acceptable SMART values
    elif (('SSD' in model and not model.startswith('INTEL SSD'))
          or model.startswith(('Kingston SKC', 'SAMSUNG MZ7'))):
        wear = _int_or_sentinel(str(shell("grep Wear_Level | awk '{print $4}'", input=smart_out)))
        rsvd = _int_or_sentinel(str(shell("grep Used_Rsvd | awk '{print $10}'", input=smart_out)))

    elif 'Micron_5300' in model or 'Micron_1100' in model:
        wear = int(shell("grep Percent_Lifetime_Remain | awk '{print $4}'", input=smart_out))
        rsvd = int(shell("grep -e Unused_Rsvd_Blk_Cnt_Tot -e Reallocate_NAND_Blk_Cnt | awk '{print $10}'", input=smart_out))

    elif 'INTEL' in model:
        wear = int(shell("grep Media_Wearout_Indicator | awk '{print $4}'", input=smart_out))
        rsvd = int(shell("grep Available_Reservd_Space | awk '{print $10}'", input=smart_out))

    else:
        return None

    return {'device': device, 'model': model, 'wear': wear, 'rsvd': rsvd}


def _grade_disk(disk):
    """Set disk['status'] (0 good, 1 warn, 2+ crit, 9000 unknown) and disk['warn_type']."""
    status = 0
    # Remap-based points. A wear-based critical below can still be added on top.
    if 'Micron_5300' in disk['model']:
        if disk['rsvd'] < REMAP_CRIT:
            status += 2
        elif disk['rsvd'] < REMAP_WARN:
            status += 1
    elif disk['rsvd'] > SSDREMAP:
        status += 1

    # Wear values are 99 (Best) down to 0 (no predicted write life left),
    # so <= is the proper check
    if disk['wear'] == _CHECK_FAILED:
        # out-of-bounds value that indicates the check malfunctioned
        status = _CHECK_FAILED
    elif disk['wear'] <= WEAR_CRIT:
        status += 2
    elif disk['wear'] <= WEAR_WARN:
        status += 1

    disk['status'] = status
    if status == 0:
        disk['warn_type'] = "OK"
    elif status == _CHECK_FAILED:
        disk['warn_type'] = "UNKNOWN"
    elif status == 1:
        disk['warn_type'] = "WARNING"
    else:
        disk['warn_type'] = "CRITICAL"


def _status_line(disk, label):
    """Format the wear/remap result line for one disk with the given state label."""
    remaining = round(disk['wear'])
    return ("Check_Smart_{dev} wear_life_remaining={life};{warn};{crit}"
            " remaining_life={life}% remaps={rsvd} {dev} {label}").format(
                dev=disk['device'], life=remaining, warn=WEAR_WARN,
                crit=WEAR_CRIT, rsvd=disk['rsvd'], label=label)


def disk_check(smart, file_name_pattern, mode):
    """Check every disk matching ``file_name_pattern`` and print one result line each.

    smart: path of smartctl. Unused here (get_smart_out reads the
        module-level ``smart``); kept so existing callers keep working.
    file_name_pattern: glob under /sys/block; the fourth '/'-separated
        component of each match is taken as the device name.
    mode: 'sata' or 'nvme'. Any other value prints an error for the first
        matched device, raises the exit code to 2 and returns.

    SAS drives and sector-count drives are reported straight away. SSD and
    NVMe drives are appended to the module-level ``disk_list``, graded on
    wear and remaps, and then compared with their RAID partner. The
    module-level ``exit_code`` is raised to the worst state seen
    (1 warning, 2 critical, 3 unknown).
    """
    # Only disks added by this call are graded and printed below, so a second
    # call (e.g. nvme after sata) does not repeat the first call's lines.
    first_new = len(disk_list)

    for sys_path in glob.iglob(file_name_pattern):
        device = sys_path.split("/")[3]

        if mode not in ('sata', 'nvme'):
            print("Check_Smart_" + device + " - ERROR: Unrecognized device mode: " + mode + ".")
            _raise_exit_code(2)
            return

        smart_out = get_smart_out('/dev/' + device)

        if mode == 'sata':
            model = str(shell('grep "Device Model" | cut -c 19-', input=smart_out))
            protocol = str(shell('grep -w "Transport protocol:" | awk \'{ print $3 }\'', input=smart_out))
        else:
            # The awk program is single-quoted so bash does not expand $4
            # to an empty string before awk sees it.
            model = str(shell('grep "Model Number" | awk \'{print $4}\'', input=smart_out))
            protocol = 'nvme'

        if protocol == 'SAS':
            _report_sas_health(device, smart_out)

        if mode != 'nvme':
            # Spinning disks, which key on pending sectors and remap count.
            if model.startswith(_SECTOR_COUNT_MODEL_PREFIXES):
                _report_sector_health(device, smart_out,
                                      'Reallocated_Sector_Ct', 'Current_Pending_Sector')
                continue
            if model.startswith('KINGSTON'):
                # NOTE(review): these counters used to be read and then
                # discarded, leaving the drive unreported. The spinning-disk
                # thresholds are applied here - confirm they suit this model.
                _report_sector_health(device, smart_out,
                                      'Retired_Block_Count', 'Reported_Uncorrect')
                continue

        entry = _collect_wear_entry(device, model, mode, smart_out)
        if entry is not None:
            disk_list.append(entry)
        elif protocol == '':
            # if protocol isnt SAS and no models are matched above, error
            print("Check_Smart_" + device + " - ERROR: Unrecognized model: " + model)
            _raise_exit_code(2)

    new_disks = disk_list[first_new:]

    # Fetch RAID info from mdadm about these devices and integrate with the
    # smartctl data
    populate_raid_info(new_disks)

    # Mark each disk good or bad based on thresholds
    for disk in new_disks:
        _grade_disk(disk)

    # Now that health data on all disks is populated, report each disk,
    # taking its pair partner into account.
    for disk in new_disks:
        if disk['status'] == _CHECK_FAILED:
            # something in check went wrong
            print("Check_Smart_" + disk['device'] + " wear_life_remaining=?;?;? remaining_life=?%"
                  + " remaps=? " + disk['device'] + " UNKNOWN")
            _raise_exit_code(3)
            continue

        label = disk['warn_type']
        level = min(disk['status'], 2)

        # now check status of pair partner; it is falsy when the disk has no
        # RAID info or no partner was found
        partner = find_pair(disk, disk_list)
        if partner:
            if disk['wear'] <= PAIR_WEAR_CRIT and partner['wear'] <= PAIR_WEAR_CRIT:
                # crit even if drives would be individually good or warn.
                label, level = "CRITICAL", 2
            elif (disk['status'] < 2 and disk['wear'] <= PAIR_WEAR_WARN
                  and partner['wear'] <= PAIR_WEAR_WARN):
                # warn even if drives would be individually good (but don't
                # downgrade from crit).
                label, level = "WARNING", max(level, 1)

        print(_status_line(disk, label))
        _raise_exit_code(level)

# Area for future improvement: check all arrays instead of just the first, for sanity
# Also, it relies on adjacency to determine set info. In a 4x R10 there are two set-As
# and two set-Bs and it presumes that near=2 is the setting for deciding which to check.
def populate_raid_info(devices):
    """Add md RAID position info to each dict in ``devices``.

    Only the first array listed by ``mdadm --detail --scan`` is inspected,
    whatever its RAID level. For every device whose name appears in that
    array's ``mdadm --detail`` output, the first matching line supplies
    'RaidDevice' (4th column, as int) and 'set' (7th column, or '' when
    absent). Devices that are not listed, or whose RaidDevice column is not
    a number, are left without those keys.
    """
    # shell() returns only the first output line here, i.e. the first array.
    first_array = shell("mdadm --detail --scan")
    scan_fields = first_array.split(' ')
    if len(scan_fields) < 2:
        # No arrays reported (or mdadm unavailable): nothing to add.
        return
    array_device = scan_fields[1]

    # Query the array once and reuse the output for every device.
    detail_lines = shell("mdadm --detail " + array_device, outform="long").splitlines()

    # Fetch detailed set information
    for dev in devices:
        member_line = next(
            (line for line in detail_lines if dev['device'] in line), None)
        if member_line is None:
            continue
        columns = member_line.split()
        raid_device = columns[3] if len(columns) > 3 else ''
        # Non-numeric columns (e.g. '-') carry no usable position.
        if not raid_device.isdigit():
            continue
        dev['RaidDevice'] = int(raid_device)
        dev['set'] = columns[6] if len(columns) > 6 else ''

# Finds the R10 pair in a set
# Presumes near=2
def find_pair(disk, devices):
    """Return the entry of ``devices`` that is paired with ``disk``.

    Disks are paired by adjacent RaidDevice numbers: an even number pairs
    with the next one up, an odd number with the one below. Returns whatever
    fetch_disk_by_id returns for that number, or None when a 'set' or
    'RaidDevice' key is missing.
    """
    try:
        # Both keys must be present; a missing one means no RAID info.
        _ = disk['set']
        position = disk['RaidDevice']
        partner_id = position + 1 if position % 2 == 0 else position - 1
        return fetch_disk_by_id(partner_id, devices)
    except KeyError:
        return None

def fetch_disk_by_id(id, devices):
    """Return the first dict in ``devices`` whose 'RaidDevice' equals ``id``.

    Entries without a 'RaidDevice' key are skipped, so one disk that is not
    an array member does not stop the search for a later match. Returns an
    empty list when nothing matches.
    """
    for d in devices:
        if d.get('RaidDevice') == id:
            return d
    return []

## MAIN CODE

# Let's skip mvps: hosts whose salt minion role is 'mvps' exit without output
grains_role = shell("grep ^[[:space:]].role: /etc/salt/minion | awk '{print $2}'")
if grains_role == 'mvps':
    sys.exit()

# determine which disk type the machine uses
sdx = os.path.isfile("/sys/block/sda/size")
# Any of nvme0n1 .. nvme5n1 counts as "has NVMe"
nvme_x = any(
    os.path.isfile("/sys/block/nvme" + str(x) + "n1/size") for x in range(6)
)

# Fail silently and early out of devices that lack both. These would be VMs with
# xvda and such, which ought to neither have SMARTmontools nor physical disks to check
if not sdx and not nvme_x:
    sys.exit()

# check for smartmontools; get_smart_out reads this module-level path
smart = shell('which smartctl')
if not smart:
    print("Check_Smart_sda - ERROR: Unable to detect smartmontools. Is it installed?")
    sys.exit(2)

# execute appropriate check(s), SATA first
if sdx:
    disk_check(smart, '/sys/block/sd?', 'sata')
if nvme_x:
    disk_check(smart, '/sys/block/nvme?n1/nvme?n?p1', 'nvme')

# Exit with the highest severity discovered (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN)
sys.exit(exit_code)

TERMOREK-IT SHELL 403

Edit file