Add disk health check

This commit is contained in:
Yan Lin 2025-09-07 12:54:38 +02:00
parent 95b92c232e
commit d2bc0a522e
6 changed files with 413 additions and 5 deletions

113
scripts/daily-smart-report.sh Executable file
View file

@ -0,0 +1,113 @@
#!/usr/bin/env bash
# Simple daily SMART report script - plain text version.
# Only checks SMART attributes and sends the report via Gotify.
#
# Optional environment overrides:
#   GOTIFY_URL    base URL of the Gotify server
#   GOTIFY_TOKEN  Gotify application token
set -euo pipefail

# Directory holding this script; the notifier helper is looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
LOG_FILE="/var/log/daily-smart-report.log"

# Host-specific Gotify configuration. Values from the environment win; the
# defaults below keep existing deployments working unchanged.
# NOTE(review): the default token is committed in plain text - consider
# rotating it and supplying GOTIFY_TOKEN from a root-only environment file.
GOTIFY_URL="${GOTIFY_URL:-https://notify.yanlincs.com}"
GOTIFY_TOKEN="${GOTIFY_TOKEN:-Ac9qKFH5cA.7Yly}"

# Drive configurations: /dev/disk/by-id path -> label used in the report.
declare -A DRIVES=(
  ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R"]="ZFS Mirror 1"
  ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG"]="ZFS Mirror 2"
  ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB"]="Data Drive 1 (12TB)"
  ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE"]="Data Drive 2 (12TB)"
  ["/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE"]="Parity Drive (16TB)"
)
#######################################
# Check the SMART health of every drive in DRIVES, print progress, send the
# report through the Gotify helper and append a summary line to LOG_FILE.
# Globals:   DRIVES, GOTIFY_SCRIPT, GOTIFY_URL, GOTIFY_TOKEN, LOG_FILE (read)
# Arguments: none (anything passed is ignored)
# Outputs:   progress on stdout, warnings on stderr
# Returns:   does not fail merely because drives are unhealthy or the
#            notification could not be delivered
#######################################
main() {
  local report=""
  local healthy_drives=0
  local total_drives=0
  local device device_name health temp smart_out
  local priority title issues

  echo "Starting daily SMART report"
  # "\n" is deliberately a literal backslash-n: the notifier embeds this text
  # in a JSON string, where the sequence is decoded as a line break.
  report="Daily SMART Report - $(date '+%Y-%m-%d')\n\n"
  report+="Drive SMART Status:\n"

  # Check each drive
  for device in "${!DRIVES[@]}"; do
    device_name="${DRIVES[$device]}"
    total_drives=$((total_drives + 1))
    echo "Checking $device_name"

    # Quick device existence check
    if [[ ! -e "$device" ]]; then
      report+="[FAIL] $device_name: Device not found\n"
      continue
    fi

    # smartctl's exit status is a bit mask that is non-zero for a failing
    # disk and for disks with logged errors. Under `pipefail` that status
    # would hide the verdict, so ignore it and parse the text instead.
    smart_out=$(smartctl -H "$device" 2>/dev/null) || true
    health="UNKNOWN"
    if [[ "$smart_out" =~ PASSED|FAILED ]]; then
      health="${BASH_REMATCH[0]}"
    fi
    echo " Health: $health"

    # Temperature is only looked up for drives that passed.
    temp="N/A"
    if [[ "$health" == "PASSED" ]]; then
      smart_out=$(smartctl -A "$device" 2>/dev/null) || true
      # Field 10 of the attribute row is taken as the raw value (first match).
      temp=$(awk '/Temperature_Celsius/ {print $10; exit}' <<<"$smart_out") || true
      # Accept only a positive integer; 10# avoids octal parsing of "035".
      if [[ "$temp" =~ ^[0-9]+$ ]] && (( 10#$temp > 0 )); then
        temp="${temp}C"
      else
        temp="N/A"
      fi
      echo " Temperature: $temp"
    fi

    # Format output
    if [[ "$health" == "PASSED" ]]; then
      report+="[OK] $device_name: $health (Temp: $temp)\n"
      healthy_drives=$((healthy_drives + 1))
    else
      report+="[FAIL] $device_name: $health (Temp: $temp)\n"
    fi
  done

  # Add summary and pick the notification priority/title
  report+="\nSummary:\n"
  if (( healthy_drives == total_drives )); then
    report+="Status: All $total_drives drives healthy\n"
    report+="Next check: $(date -d 'tomorrow 08:00' '+%Y-%m-%d 08:00')"
    echo "Result: All drives healthy ($healthy_drives/$total_drives)"
    priority="normal"
    title="Daily SMART Report"
  else
    issues=$((total_drives - healthy_drives))
    report+="Status: $issues of $total_drives drives have issues"
    echo "Result: Issues detected ($healthy_drives/$total_drives drives healthy)"
    priority="high"
    title="Daily SMART Report - Issues Detected"
  fi

  # Send notification. A delivery problem is reported but must not abort the
  # run before the log line below is written.
  if [[ -x "$GOTIFY_SCRIPT" ]]; then
    "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$title" "$report" \
      || echo "WARNING: failed to send Gotify notification" >&2
  else
    echo "WARNING: Gotify script not found or not executable: $GOTIFY_SCRIPT" >&2
  fi

  # Simple logging (best effort: an unwritable log file is ignored)
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Daily SMART report: $healthy_drives/$total_drives drives healthy" >> "$LOG_FILE" 2>/dev/null || true
  echo "Daily SMART report completed"
}
# Entry point: run the report, forwarding any command-line arguments to main.
main "$@"

View file

@ -0,0 +1,96 @@
#!/usr/bin/env bash
# SMART daemon alert script for Gotify notifications.
# Called by smartd when SMART issues are detected.
# No arguments needed - uses the SMARTD_DEVICE environment variable.
#
# Optional environment overrides:
#   GOTIFY_URL    base URL of the Gotify server
#   GOTIFY_TOKEN  Gotify application token
set -euo pipefail

# Directory holding this script; the notifier helper is looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
LOG_FILE="/var/log/smartd-alerts.log"

# Host-specific Gotify configuration. Values from the environment win; the
# defaults below keep existing deployments working unchanged.
# NOTE(review): the default token is committed in plain text - consider
# rotating it and supplying GOTIFY_TOKEN from a root-only environment file.
GOTIFY_URL="${GOTIFY_URL:-https://notify.yanlincs.com}"
GOTIFY_TOKEN="${GOTIFY_TOKEN:-Ac9qKFH5cA.7Yly}"
# Map a device path to a human-readable drive label.
# Arguments: $1 - device path
# Outputs:   the label on stdout, or "Unknown Drive (<path>)" when the path
#            contains none of the known drive IDs
get_drive_name() {
  local dev_path="$1"

  # Parallel tables: drive ID fragment -> label. The first fragment found
  # anywhere inside the path decides the label.
  local -a id_fragments=(
    "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R"
    "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG"
    "ata-HGST_HUH721212ALE604_5PK2N4GB"
    "ata-HGST_HUH721212ALE604_5PJ7Z3LE"
    "ata-ST16000NM000J-2TW103_WRS0F8BE"
  )
  local -a labels=(
    "ZFS Mirror 1 (System)"
    "ZFS Mirror 2 (System)"
    "Data Drive 1 (12TB)"
    "Data Drive 2 (12TB)"
    "Parity Drive (16TB)"
  )

  local idx
  for idx in "${!id_fragments[@]}"; do
    if [[ "$dev_path" == *"${id_fragments[idx]}"* ]]; then
      echo "${labels[idx]}"
      return 0
    fi
  done

  echo "Unknown Drive ($dev_path)"
}
# Print a timestamped message to stdout and append it to LOG_FILE.
# Logging is best-effort: if the line cannot be written, a warning goes to
# stderr and the function still succeeds, so a logging problem cannot stop
# the caller (the script runs under `set -e`).
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
# Returns:   0
log_message() {
  local message="$1"
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] $message"

  printf '%s\n' "$line" || true
  # The braces let 2>/dev/null also silence the shell's own redirection error.
  { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null \
    || printf 'WARNING: cannot append to log file: %s\n' "$LOG_FILE" >&2 \
    || true
}
# Make smartd-supplied text safe to paste inside a JSON string literal:
# line breaks and tabs become spaces, backslashes become "/" and double
# quotes become single quotes.
# Arguments: $1 - raw text
# Outputs:   sanitised text on stdout (no trailing newline)
_smartd_json_safe() {
  local text="$1"
  local squote="'"
  text=${text//$'\r'/}
  text=${text//$'\n'/ }
  text=${text//$'\t'/ }
  text=${text//\\//}
  text=${text//\"/$squote}
  printf '%s' "$text"
}

# Log the smartd event and forward it as a Gotify notification.
# Globals:   SMARTD_DEVICE, SMARTD_FAILTYPE, SMARTD_MESSAGE (read, optional),
#            GOTIFY_SCRIPT, GOTIFY_URL, GOTIFY_TOKEN (read)
# Arguments: none
# Returns:   0 when the notification was sent or the failure was logged
send_smartd_alert() {
  # smartd provides these environment variables:
  local device="${SMARTD_DEVICE:-unknown}"
  local failtype="${SMARTD_FAILTYPE:-unknown}"
  local message="${SMARTD_MESSAGE:-No details provided}"
  local drive_name
  drive_name=$(get_drive_name "$device")
  log_message "SMART alert for $drive_name ($device): $failtype - $message"

  # Determine priority based on failure type: failures and critical events
  # are "critical", everything else (warnings, temperature, unknown) "high".
  local priority="high"
  case "$failtype" in
    *"FAILURE"* | *"failure"* | *"CRITICAL"* | *"critical"*)
      priority="critical"
      ;;
  esac

  # Create notification message. The notifier inserts title and message
  # verbatim into a JSON string, so raw line breaks or double quotes would
  # make the request body invalid. Fields are sanitised first and lines are
  # joined with a literal backslash-n, which JSON decodes as a line break.
  local safe_name safe_device safe_failtype safe_message
  safe_name=$(_smartd_json_safe "$drive_name")
  safe_device=$(_smartd_json_safe "$device")
  safe_failtype=$(_smartd_json_safe "$failtype")
  safe_message=$(_smartd_json_safe "$message")

  local notification_title="SMART Alert: ${safe_name}"
  local notification_message="Device: ${safe_device}\nFailure Type: ${safe_failtype}\nDetails: ${safe_message}\nThis alert was triggered by smartd monitoring."

  # Send Gotify notification
  if [[ -x "$GOTIFY_SCRIPT" ]]; then
    "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$notification_title" "$notification_message" \
      || log_message "Failed to send Gotify notification"
  else
    log_message "Gotify script not found or not executable: $GOTIFY_SCRIPT"
  fi
}
# Ensure a usable log destination exists. Logging must never prevent the
# alert itself from being sent, so fall back step by step instead of letting
# `set -e` abort the script.
if ! touch "$LOG_FILE" 2>/dev/null; then
  LOG_FILE="/tmp/smartd-alerts.log"
  # /tmp is world-writable: refuse a pre-planted symlink, and give up on file
  # logging altogether when the fallback cannot be created either.
  if [[ -L "$LOG_FILE" ]] || ! touch "$LOG_FILE" 2>/dev/null; then
    LOG_FILE="/dev/null"
  fi
fi

# Main execution
send_smartd_alert

74
scripts/gotify-notify.sh Executable file
View file

@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Gotify notification helper for the disk health monitoring scripts.
# Usage: gotify-notify.sh <url> <token> <priority> <title> <message>
set -euo pipefail

# Exactly five positional arguments are required; otherwise print usage.
if (( $# != 5 )); then
  printf '%s\n' "Usage: $0 <url> <token> <priority> <title> <message>"
  printf '%s\n' "Example: $0 'https://notify.yanlincs.com' 'token123' 'high' 'Disk Alert' 'Drive temperature critical'"
  exit 1
fi

# Positional parameters, in call order.
GOTIFY_URL="$1"
GOTIFY_TOKEN="$2"
priority="$3"
title="$4"
message="$5"

# Retry policy: number of delivery attempts and pause (seconds) between them.
MAX_RETRIES=3
RETRY_DELAY=5

# Priority name -> Gotify numeric priority (1=low, 5=normal, 8=high, 10=critical).
declare -A PRIORITY_MAP
PRIORITY_MAP["low"]="1"
PRIORITY_MAP["normal"]="5"
PRIORITY_MAP["high"]="8"
PRIORITY_MAP["critical"]="10"
# Escape text for use inside a JSON string literal.
# Double quotes, line breaks, tabs and other control characters are handled.
# Backslashes are passed through untouched on purpose: callers may already
# supply JSON escape sequences such as a literal "\n" for a line break.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
_gotify_json_escape() {
  local text="$1"
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  # Any control character left over is not allowed raw in JSON strings.
  text=${text//[[:cntrl:]]/ }
  printf '%s' "$text"
}

# POST a message to the Gotify server, retrying on failure.
# Globals:   GOTIFY_URL, GOTIFY_TOKEN, PRIORITY_MAP, MAX_RETRIES,
#            RETRY_DELAY (read)
# Arguments: $1 - priority name (unknown names map to 5)
#            $2 - title
#            $3 - message
# Outputs:   one status line per attempt on stdout; final error on stderr
# Returns:   0 once the server answers HTTP 200, 1 after MAX_RETRIES failures
send_notification() {
  local priority="$1"
  local title="$2"
  local message="$3"
  local attempt=1
  # Map priority to numeric value
  local numeric_priority="${PRIORITY_MAP[$priority]:-5}"
  local payload http_code

  # Build the body once; escaping keeps quotes/newlines from breaking the JSON.
  payload=$(printf '{"title": "%s", "message": "%s", "priority": %s}' \
    "$(_gotify_json_escape "$title")" \
    "$(_gotify_json_escape "$message")" \
    "$numeric_priority")

  while (( attempt <= MAX_RETRIES )); do
    # Timeouts keep an unreachable server from hanging the caller forever.
    if http_code=$(curl -s -o /dev/null -w '%{http_code}' \
        --connect-timeout 10 --max-time 30 \
        -X POST "${GOTIFY_URL}/message" \
        -H "X-Gotify-Key: ${GOTIFY_TOKEN}" \
        -H "Content-Type: application/json" \
        -d "$payload") && [[ "$http_code" == "200" ]]; then
      echo "Notification sent successfully (attempt $attempt)"
      return 0
    fi

    echo "Failed to send notification (attempt $attempt/$MAX_RETRIES)"
    if (( attempt < MAX_RETRIES )); then
      sleep "$RETRY_DELAY"
    fi
    attempt=$((attempt + 1))
  done

  echo "ERROR: Failed to send notification after $MAX_RETRIES attempts" >&2
  return 1
}
# Reject any priority name that has no entry in PRIORITY_MAP.
if [[ -z "${PRIORITY_MAP[$priority]+_}" ]]; then
  printf '%s\n' "Error: Invalid priority '$priority'. Use: low, normal, high, critical"
  exit 1
fi

# Deliver the notification; its exit status becomes the script's status.
send_notification "$priority" "$title" "$message"