Add disk health check
This commit is contained in:
parent
95b92c232e
commit
d2bc0a522e
6 changed files with 413 additions and 5 deletions
|
|
@ -6,6 +6,7 @@
|
||||||
../../../modules/podman.nix
|
../../../modules/podman.nix
|
||||||
../../../modules/traefik.nix
|
../../../modules/traefik.nix
|
||||||
../../../modules/samba.nix
|
../../../modules/samba.nix
|
||||||
|
../../../modules/disk-health.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# GRUB bootloader with ZFS support
|
# GRUB bootloader with ZFS support
|
||||||
|
|
@ -208,11 +209,6 @@
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable smartd for disk health monitoring
|
|
||||||
services.smartd = {
|
|
||||||
enable = true;
|
|
||||||
autodetect = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
# Allow unfree packages globally
|
# Allow unfree packages globally
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
|
|
||||||
127
modules/disk-health.nix
Normal file
127
modules/disk-health.nix
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
{ config, lib, pkgs, ... }:

# Disk health monitoring module.
#
# Two mechanisms are configured here:
#   * smartd watches each listed drive continuously and runs an alert script
#     when it detects a problem.
#   * A systemd timer runs a daily SMART summary script.
# Both scripts live outside the Nix store, under the user's checkout of this
# configuration, and post to Gotify via curl.

let
  # Directory holding the helper scripts referenced by smartd and systemd.
  scriptsDir = "/home/yanlin/.config/nix/scripts";

  # Directives shared by every monitored drive: auto-detect the device type,
  # enable the default checks, automatic offline testing and attribute autosave.
  commonDirectives = "-d auto -a -o on -S on";

  # smartd's `-m` directive takes a mail address, not a program. To run a
  # script instead of sending mail, `-m <nomailer>` must be combined with
  # `-M exec PATH`; the previous `-m exec=PATH` form never ran the script.
  alertDirectives = "-m <nomailer> -M exec ${scriptsDir}/disk-health-smartd-alert.sh";
in
{
  # Package requirements (available in interactive shells).
  environment.systemPackages = with pkgs; [
    smartmontools
    curl # For Gotify notifications
  ];

  # smartd configuration for real-time monitoring.
  services.smartd = {
    enable = true;
    autodetect = false; # Devices are configured explicitly below

    # Global smartd options: attribute log prefix and a 600 s check interval.
    extraOptions = [ "-A /var/log/smartd/" "-i 600" ];

    # Device-specific monitoring configurations.
    # Self-test schedule format is T/MM/DD/d/HH: a short test daily at 02:00
    # and a long test once a week at 03:00 on the given weekday.
    devices = [
      # ZFS mirror drives (SSDs; their by-id names show an ATA attachment)
      {
        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R";
        options = "${commonDirectives} -s (S/../.././02|L/../../6/03) ${alertDirectives}";
      }
      {
        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG";
        options = "${commonDirectives} -s (S/../.././02|L/../../6/03) ${alertDirectives}";
      }

      # Data drives (12TB HDDs) - temperature tracking via -W DIFF,INFO,CRIT
      {
        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB";
        options = "${commonDirectives} -s (S/../.././02|L/../../7/03) -W 4,45,55 ${alertDirectives}";
      }
      {
        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE";
        options = "${commonDirectives} -s (S/../.././02|L/../../7/03) -W 4,45,55 ${alertDirectives}";
      }

      # Parity drive (16TB HDD) - reports smaller temperature changes
      {
        device = "/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE";
        options = "${commonDirectives} -s (S/../.././02|L/../../1/03) -W 2,45,55 ${alertDirectives}";
      }
    ];
  };

  systemd.services = {
    # Daily SMART report service, started by the timer below.
    daily-smart-report = {
      description = "Daily SMART Health Report";
      after = [ "multi-user.target" ];

      # NixOS units get only a minimal PATH. The report script calls smartctl
      # and awk, and the Gotify helper it launches needs bash (for its
      # `/usr/bin/env bash` shebang) and curl.
      path = with pkgs; [ bash smartmontools gawk curl ];

      serviceConfig = {
        Type = "oneshot";
        ExecStart = "${pkgs.bash}/bin/bash ${scriptsDir}/daily-smart-report.sh";
        User = "root";
        StandardOutput = "journal";
        StandardError = "journal";
        # Add timeout to prevent hanging
        TimeoutStartSec = "300"; # 5 minutes max
      };
    };

    # The alert script executed by smartd is a bash script that calls curl,
    # so both must be resolvable from the smartd unit's PATH.
    smartd.path = with pkgs; [ bash curl ];
  };

  # Timer for the daily SMART report at 8:00 AM. `wantedBy` is what enables
  # the timer, so no separate `systemd.targets.timers.wants` entry is needed.
  systemd.timers = {
    daily-smart-report = {
      description = "Daily SMART Report Timer";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnCalendar = "08:00:00";
        Persistent = true;
        RandomizedDelaySec = "5m";
      };
    };
  };

  # Ensure log files and directories exist with proper permissions.
  systemd.tmpfiles.rules = [
    "d /var/log 0755 root root -"
    "f /var/log/daily-smart-report.log 0644 root root -"
    "f /var/log/smartd-alerts.log 0644 root root -"
    "d /var/log/smartd 0755 root root -"
  ];

  # Rotate the two script-managed log files weekly, keeping four rotations.
  services.logrotate = {
    enable = true;
    settings = {
      "/var/log/daily-smart-report.log" = {
        frequency = "weekly";
        rotate = 4;
        compress = true;
        delaycompress = true;
        missingok = true;
        notifempty = true;
        create = "644 root root";
      };
      "/var/log/smartd-alerts.log" = {
        frequency = "weekly";
        rotate = 4;
        compress = true;
        delaycompress = true;
        missingok = true;
        notifempty = true;
        create = "644 root root";
      };
    };
  };

  # Make the helper scripts executable. Each file is checked first so that a
  # missing checkout does not make system activation report an error.
  system.activationScripts.disk-health-scripts = ''
    for script in gotify-notify.sh disk-health-smartd-alert.sh daily-smart-report.sh; do
      if [ -f "${scriptsDir}/$script" ]; then
        chmod +x "${scriptsDir}/$script"
      fi
    done
  '';
}
|
||||||
|
|
@ -29,6 +29,8 @@ in
|
||||||
hms-offline = "home-manager switch --flake ~/.config/nix#$(whoami)@$(hostname) --option substitute false";
|
hms-offline = "home-manager switch --flake ~/.config/nix#$(whoami)@$(hostname) --option substitute false";
|
||||||
doss = "sudo darwin-rebuild switch --flake ~/.config/nix#$(hostname)";
|
doss = "sudo darwin-rebuild switch --flake ~/.config/nix#$(hostname)";
|
||||||
noss = "sudo nixos-rebuild switch --flake ~/.config/nix#$(hostname)";
|
noss = "sudo nixos-rebuild switch --flake ~/.config/nix#$(hostname)";
|
||||||
|
# Disk health monitoring
|
||||||
|
smart-report = "sudo /home/yanlin/.config/nix/scripts/daily-smart-report.sh";
|
||||||
|
|
||||||
} // lib.optionalAttrs pkgs.stdenv.isDarwin {
|
} // lib.optionalAttrs pkgs.stdenv.isDarwin {
|
||||||
# macOS-specific app aliases
|
# macOS-specific app aliases
|
||||||
|
|
|
||||||
113
scripts/daily-smart-report.sh
Executable file
113
scripts/daily-smart-report.sh
Executable file
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/usr/bin/env bash

# Simple daily SMART report script - plain text version
# Only checks SMART attributes and sends report via Gotify

set -euo pipefail

# Directory containing this script; the Gotify helper is expected next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
LOG_FILE="/var/log/daily-smart-report.log"

# Host-specific Gotify configuration.
# Both values can be overridden from the environment; the defaults keep the
# script working when it is started without any extra configuration.
# NOTE(review): the default token is a credential stored in the repository;
# consider supplying GOTIFY_TOKEN from a secret store and rotating this one.
GOTIFY_URL="${GOTIFY_URL:-https://notify.yanlincs.com}"
GOTIFY_TOKEN="${GOTIFY_TOKEN:-Ac9qKFH5cA.7Yly}"

# Drive configurations: stable /dev/disk/by-id path -> label used in reports.
declare -A DRIVES=(
    ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R"]="ZFS Mirror 1"
    ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG"]="ZFS Mirror 2"
    ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB"]="Data Drive 1 (12TB)"
    ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE"]="Data Drive 2 (12TB)"
    ["/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE"]="Parity Drive (16TB)"
)
|
||||||
|
|
||||||
|
# Build the daily SMART report for every drive in DRIVES, send it through the
# Gotify helper script and append a one-line summary to LOG_FILE.
#
# Globals read: DRIVES, GOTIFY_SCRIPT, GOTIFY_URL, GOTIFY_TOKEN, LOG_FILE.
# Returns: 0 on success, or the helper's exit status when the notification
#          could not be sent (the summary is still logged first).
#
# The report text uses literal "\n" sequences rather than real newlines; it is
# passed to the Gotify helper as-is.
main() {
    local report=""
    local healthy_drives=0
    local total_drives=0
    local notify_status=0
    local device device_name health temp raw_temp smart_output
    local priority title

    echo "Starting daily SMART report"

    report="Daily SMART Report - $(date '+%Y-%m-%d')\n\n"
    report+="Drive SMART Status:\n"

    # Check each drive
    for device in "${!DRIVES[@]}"; do
        device_name="${DRIVES[$device]}"
        total_drives=$((total_drives + 1))

        echo "Checking $device_name"

        # Quick device existence check
        if [[ ! -e "$device" ]]; then
            report+="[FAIL] $device_name: Device not found\n"
            continue
        fi

        # Get SMART health.
        # smartctl's exit status is a bitmask that is non-zero for failing
        # disks and can be non-zero for passing ones too, so under `pipefail`
        # it must not decide the result; only the parsed text does.
        smart_output=$(smartctl -H "$device" 2>/dev/null || true)
        health=$(grep -o "PASSED\|FAILED" <<<"$smart_output" | head -1 || true)
        health="${health:-UNKNOWN}"
        echo " Health: $health"

        # Get temperature (only queried for drives that passed)
        temp="N/A"
        if [[ "$health" == "PASSED" ]]; then
            raw_temp=$( (smartctl -A "$device" 2>/dev/null || true) \
                | awk '/Temperature_Celsius/ {print $10}' | head -1 || true)
            # Accept only a positive integer; 10# avoids octal parsing of
            # values with leading zeros.
            if [[ "$raw_temp" =~ ^[0-9]+$ ]] && (( 10#$raw_temp > 0 )); then
                temp="${raw_temp}C"
            fi
            echo " Temperature: $temp"
        fi

        # Format output
        if [[ "$health" == "PASSED" ]]; then
            report+="[OK] $device_name: $health (Temp: $temp)\n"
            healthy_drives=$((healthy_drives + 1))
        else
            report+="[FAIL] $device_name: $health (Temp: $temp)\n"
        fi
    done

    # Add summary and pick the notification priority/title
    report+="\nSummary:\n"
    if [[ $healthy_drives -eq $total_drives ]]; then
        report+="Status: All $total_drives drives healthy\n"
        report+="Next check: $(date -d 'tomorrow 08:00' '+%Y-%m-%d 08:00')"

        echo "Result: All drives healthy ($healthy_drives/$total_drives)"

        priority="normal"
        title="Daily SMART Report"
    else
        local issues=$((total_drives - healthy_drives))
        report+="Status: $issues of $total_drives drives have issues"

        echo "Result: Issues detected ($healthy_drives/$total_drives drives healthy)"

        # High priority notification for issues
        priority="high"
        title="Daily SMART Report - Issues Detected"
    fi

    # Send notification. A failure is remembered instead of aborting here so
    # that the summary line below is always written.
    if [[ -x "$GOTIFY_SCRIPT" ]]; then
        "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$title" "$report" \
            || notify_status=$?
        if [[ $notify_status -ne 0 ]]; then
            echo "Warning: failed to send Gotify notification (exit $notify_status)" >&2
        fi
    else
        echo "Warning: Gotify script not found or not executable: $GOTIFY_SCRIPT" >&2
    fi

    # Simple logging (best effort: an unwritable log must not fail the run)
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Daily SMART report: $healthy_drives/$total_drives drives healthy" >> "$LOG_FILE" 2>/dev/null || true

    echo "Daily SMART report completed"

    return "$notify_status"
}

main "$@"
|
||||||
96
scripts/disk-health-smartd-alert.sh
Executable file
96
scripts/disk-health-smartd-alert.sh
Executable file
|
|
@ -0,0 +1,96 @@
|
||||||
|
#!/usr/bin/env bash

# SMART daemon alert script for Gotify notifications
# Called by smartd when SMART issues are detected
# No arguments needed - uses SMARTD_DEVICE environment variable

set -euo pipefail

# Directory containing this script; the Gotify helper is expected next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
LOG_FILE="/var/log/smartd-alerts.log"

# Host-specific Gotify configuration.
# Both values can be overridden from the environment; the defaults keep the
# script working when smartd starts it without any extra configuration.
# NOTE(review): the default token is a credential stored in the repository;
# consider supplying GOTIFY_TOKEN from a secret store and rotating this one.
GOTIFY_URL="${GOTIFY_URL:-https://notify.yanlincs.com}"
GOTIFY_TOKEN="${GOTIFY_TOKEN:-Ac9qKFH5cA.7Yly}"
|
||||||
|
|
||||||
|
# Drive name mapping based on device path
|
||||||
|
# Print the human-readable label for a device path.
#
# Arguments:
#   $1 - device path as reported by smartd
# Output:
#   The label of the first known drive whose by-id name occurs anywhere in $1,
#   or "Unknown Drive (<path>)" when none matches.
get_drive_name() {
    local device="$1"

    # Each entry is "<by-id name>|<label>".
    local -a known_drives=(
        "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R|ZFS Mirror 1 (System)"
        "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG|ZFS Mirror 2 (System)"
        "ata-HGST_HUH721212ALE604_5PK2N4GB|Data Drive 1 (12TB)"
        "ata-HGST_HUH721212ALE604_5PJ7Z3LE|Data Drive 2 (12TB)"
        "ata-ST16000NM000J-2TW103_WRS0F8BE|Parity Drive (16TB)"
    )

    local entry
    for entry in "${known_drives[@]}"; do
        # Substring match on the id part (text before the first "|").
        if [[ "$device" == *"${entry%%|*}"* ]]; then
            echo "${entry#*|}"
            return
        fi
    done

    echo "Unknown Drive ($device)"
}
|
||||||
|
|
||||||
|
# Write a timestamped line to stdout and append the same line to LOG_FILE.
#
# Arguments:
#   $1 - message text
log_message() {
    local message="$1"

    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" | tee -a "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Log a smartd alert and forward it to Gotify.
#
# Reads the alert from the environment variables smartd exports
# (SMARTD_DEVICE, SMARTD_FAILTYPE, SMARTD_MESSAGE), falling back to
# placeholder text when one is unset.
#
# Globals read: GOTIFY_SCRIPT, GOTIFY_URL, GOTIFY_TOKEN.
# Calls: get_drive_name, log_message.
send_smartd_alert() {
    local device="${SMARTD_DEVICE:-unknown}"
    local failtype="${SMARTD_FAILTYPE:-unknown}"
    local message="${SMARTD_MESSAGE:-No details provided}"

    local drive_name
    drive_name=$(get_drive_name "$device")

    log_message "SMART alert for $drive_name ($device): $failtype - $message"

    # Determine priority based on failure type. Everything is "high" unless
    # the type indicates a failing or unreachable drive.
    # smartd's failure types are fixed words such as Health,
    # FailedHealthCheck, FailedOpenDevice, Usage or Temperature; the generic
    # FAILURE/CRITICAL patterns alone never match any of them, so the
    # relevant types are listed explicitly.
    local priority="high"
    case "$failtype" in
        Health|FailedHealthCheck|FailedOpenDevice|*"FAILURE"*|*"failure"*|*"CRITICAL"*|*"critical"*)
            priority="critical"
            ;;
    esac

    # Create notification message
    local notification_title="SMART Alert: $drive_name"
    local notification_message="Device: $device
Failure Type: $failtype
Details: $message

This alert was triggered by smartd monitoring."

    # Send Gotify notification; a delivery failure is logged, not fatal.
    if [[ -x "$GOTIFY_SCRIPT" ]]; then
        "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$notification_title" "$notification_message" || \
            log_message "Failed to send Gotify notification"
    else
        log_message "Gotify script not found or not executable: $GOTIFY_SCRIPT"
    fi
}
|
||||||
|
|
||||||
|
# Make sure the log file can be written; when the default location is not
# writable, switch to a file under /tmp instead.
if ! touch "$LOG_FILE" 2>/dev/null; then
    LOG_FILE="/tmp/smartd-alerts.log"
    touch "$LOG_FILE"
fi

# Entry point: process the alert described by smartd's environment variables.
send_smartd_alert
|
||||||
74
scripts/gotify-notify.sh
Executable file
74
scripts/gotify-notify.sh
Executable file
|
|
@ -0,0 +1,74 @@
|
||||||
|
#!/usr/bin/env bash

# Gotify notification script for disk health monitoring
# Usage: gotify-notify.sh <url> <token> <priority> <title> <message>

set -euo pipefail

# Validate arguments. Usage text is an error message here, so it goes to
# stderr rather than stdout.
if [[ $# -ne 5 ]]; then
    echo "Usage: $0 <url> <token> <priority> <title> <message>" >&2
    echo "Example: $0 'https://notify.yanlincs.com' 'token123' 'high' 'Disk Alert' 'Drive temperature critical'" >&2
    exit 1
fi

# Get parameters
GOTIFY_URL="$1"
GOTIFY_TOKEN="$2"
priority="$3"
title="$4"
message="$5"

# Delivery is attempted up to MAX_RETRIES times, RETRY_DELAY seconds apart.
MAX_RETRIES=3
RETRY_DELAY=5

# Priority mapping: 1=low, 5=normal, 8=high, 10=critical
declare -A PRIORITY_MAP=(
    ["low"]="1"
    ["normal"]="5"
    ["high"]="8"
    ["critical"]="10"
)
|
||||||
|
|
||||||
|
|
||||||
|
# Escape text for use inside a JSON string literal.
# Double quotes and real newline/carriage-return/tab characters are escaped.
# Backslashes are deliberately left untouched so that callers which already
# pass JSON escape sequences such as a literal "\n" keep working.
_gotify_json_escape() {
    local text="$1"
    local dq='"' nl=$'\n' cr=$'\r' tab=$'\t'

    text="${text//$dq/\\$dq}"
    text="${text//$nl/\\n}"
    text="${text//$cr/\\r}"
    text="${text//$tab/\\t}"
    printf '%s' "$text"
}

# Post a message to the Gotify server, retrying on failure.
#
# Arguments:
#   $1 - priority name (key of PRIORITY_MAP; unknown names map to 5)
#   $2 - notification title
#   $3 - notification message
# Globals read: GOTIFY_URL, GOTIFY_TOKEN, MAX_RETRIES, RETRY_DELAY, PRIORITY_MAP.
# Returns: 0 once the server answers HTTP 200, 1 after MAX_RETRIES failures.
send_notification() {
    local priority="$1"
    local title="$2"
    local message="$3"
    local attempt=1
    local payload http_code

    # Map priority to numeric value
    local numeric_priority="${PRIORITY_MAP[$priority]:-5}"

    # Build the JSON body once. Title and message are escaped so that quotes
    # or line breaks in them cannot produce an invalid document.
    payload=$(printf '{"title": "%s", "message": "%s", "priority": %s}' \
        "$(_gotify_json_escape "$title")" \
        "$(_gotify_json_escape "$message")" \
        "$numeric_priority")

    while [ "$attempt" -le "$MAX_RETRIES" ]; do
        # Timeouts keep a stalled connection from blocking the caller; a curl
        # error is treated like any other failed attempt.
        http_code=$(curl -s -o /dev/null -w "%{http_code}" \
            --connect-timeout 10 --max-time 30 \
            -X POST "${GOTIFY_URL}/message" \
            -H "X-Gotify-Key: ${GOTIFY_TOKEN}" \
            -H "Content-Type: application/json" \
            -d "$payload") || http_code="000"

        if [[ "$http_code" == "200" ]]; then
            echo "Notification sent successfully (attempt $attempt)"
            return 0
        fi

        echo "Failed to send notification (attempt $attempt/$MAX_RETRIES)"
        if [ "$attempt" -lt "$MAX_RETRIES" ]; then
            sleep "$RETRY_DELAY"
        fi
        # Plain assignment: unlike ((attempt++)), its exit status never
        # depends on the counter's value under `set -e`.
        attempt=$((attempt + 1))
    done

    echo "ERROR: Failed to send notification after $MAX_RETRIES attempts" >&2
    return 1
}
|
||||||
|
|
||||||
|
# Reject priority names that are not keys of PRIORITY_MAP before sending.
[[ ${PRIORITY_MAP[$priority]+_} ]] || {
    echo "Error: Invalid priority '$priority'. Use: low, normal, high, critical"
    exit 1
}

# Deliver the notification described by the command-line arguments.
send_notification "$priority" "$title" "$message"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue