#!/bin/bash
#
# File name : check_a2_olderdrive_health.all
#
# Ref : OPSTEAM-15119 - find old drives with reallocated sectors and ata errors
# This helps prevent crashes, grub issues, fscks etc on older shared servers
# SYSENG-25357 - refactored to run smartctl only once and added OK status
#
# Output : one status line per sd*/nvme* block device on stdout.
# Exit   : 0 - not a physical server, no drives found, or every drive is OK
#          2 - at least one drive exceeds ALL of its thresholds
#
# NOTE: 'set -e' is deliberately not used; smartctl's exit status is a bitmask
# that is frequently non-zero even when it produced usable output.
set -u

# SATA/SAS (sd*) drive thresholds - default values
readonly drive_age_min=39420        # in hours, 4.5 years in age minimum
readonly drive_realloc_min=100      # reallocated sectors minimum
readonly drive_ata_errors_min=20    # minimum ata errors

# NVMe drive thresholds - default values
readonly nvme_age_min=43800         # in hours, at least aged 5 years, NVMes are built to last longer
readonly nvme_percentage_used=90    # 'Percentage Used' attribute for NVMe
readonly nvme_media_errors=500      # 'Media and Data Integrity Errors' attribute for NVMe

#######################################
# Normalise a smartctl value to a plain base-10 integer.
# Strips thousands separators ("1,234") and keeps only the leading digits, so
# "85%", "41234h+12m+3.5s" and "41234 (12 34 0)" all become usable numbers.
# Arguments: $1 - raw value (may be empty)
# Outputs:   the integer on stdout; 0 when no leading digits are present
#######################################
to_int() {
  local raw=${1//,/}
  if [[ "${raw}" =~ ^[[:space:]]*([0-9]+) ]]; then
    # 10# prevents values with leading zeros from being read as octal.
    printf '%s\n' "$((10#${BASH_REMATCH[1]}))"
  else
    printf '0\n'
  fi
}

#######################################
# Evaluate a SATA/SAS drive against the drive_* thresholds.
# Arguments: $1 - device name (e.g. sda), $2 - 'smartctl -a' output
# Outputs:   one status line on stdout
# Returns:   2 if all three thresholds are exceeded, 0 otherwise
#######################################
check_sata_drive() {
  local drive=$1 drive_data=$2
  local realloc_sectors power_on_hours ata_errors

  # Column 10 of the attribute table is RAW_VALUE; $NF is unreliable because
  # some raw values carry a trailing "(...)" or "h+m+s" suffix.
  realloc_sectors=$(to_int "$(awk '/Reallocated_Sector/ {print $10; exit}' <<< "${drive_data}")")
  power_on_hours=$(to_int "$(awk '/Power_On_Hours/ {print $10; exit}' <<< "${drive_data}")")
  # "ATA Error Count: N (device log contains only ...)" - the count is field 4;
  # the trailing note appears once N > 5, which made $NF non-numeric.
  ata_errors=$(to_int "$(awk '/ATA Error Count/ {print $4; exit}' <<< "${drive_data}")")

  if (( realloc_sectors > drive_realloc_min \
        && power_on_hours > drive_age_min \
        && ata_errors > drive_ata_errors_min )); then
    printf '%s\n' "check_old_drive_health_$drive - /dev/$drive age > 4.5y|reallocated sectors: ${realloc_sectors}|ata errors: ${ata_errors}"
    return 2
  fi

  printf '%s\n' "check_old_drive_health_$drive - status OK"
  return 0
}

#######################################
# Evaluate an NVMe drive against the nvme_* thresholds.
# Arguments: $1 - device name (e.g. nvme0n1), $2 - 'smartctl -a' output
# Outputs:   one status line on stdout
# Returns:   2 if all three thresholds are exceeded, 0 otherwise
#######################################
check_nvme_drive() {
  local drive=$1 drive_data=$2
  local media_errors nvme_power_on_hours percentage_used

  # These values are printed as e.g. "1,234" and "85%"; to_int cleans them up
  # so the numeric comparisons below are meaningful.
  media_errors=$(to_int "$(awk '/Media and Data Integrity Errors/ {print $NF; exit}' <<< "${drive_data}")")
  nvme_power_on_hours=$(to_int "$(awk '/Power On Hours/ {print $NF; exit}' <<< "${drive_data}")")
  percentage_used=$(to_int "$(awk '/Percentage Used:/ {print $NF; exit}' <<< "${drive_data}")")

  if (( media_errors > nvme_media_errors \
        && nvme_power_on_hours > nvme_age_min \
        && percentage_used > nvme_percentage_used )); then
    printf '%s\n' "check_old_drive_health_$drive - /dev/$drive age > 5y|media errors: ${media_errors}|life used: ${percentage_used}"
    return 2
  fi

  printf '%s\n' "check_old_drive_health_$drive - status OK"
  return 0
}

#######################################
# Check every sd*/nvme* block device and exit with the worst status seen.
#######################################
main() {
  local sys_path drive drive_data
  local rc worst=0

  # If not physical server, exit
  if [[ "$(systemd-detect-virt)" != "none" ]]; then
    exit 0
  fi

  for sys_path in /sys/block/*; do
    drive=${sys_path##*/}
    # Same name filter as before: anything containing "sd" or "nvm".
    [[ "${drive}" =~ sd|nvm ]] || continue

    # Run smartctl once per drive and parse the captured output.
    drive_data=$(smartctl -a /dev/"${drive}")

    rc=0
    if [[ "${drive}" =~ sd ]]; then
      check_sata_drive "${drive}" "${drive_data}" || rc=$?
    else
      check_nvme_drive "${drive}" "${drive_data}" || rc=$?
    fi

    # Keep going so every drive is reported; remember the worst result.
    if (( rc > worst )); then
      worst=${rc}
    fi
  done

  exit "${worst}"
}

main "$@"