diff --git a/hosts/nixos/hs/system.nix b/hosts/nixos/hs/system.nix
index af3a545..6208744 100644
--- a/hosts/nixos/hs/system.nix
+++ b/hosts/nixos/hs/system.nix
@@ -6,6 +6,7 @@
     ../../../modules/podman.nix
     ../../../modules/traefik.nix
     ../../../modules/samba.nix
+    ../../../modules/disk-health.nix
   ];
 
   # GRUB bootloader with ZFS support
@@ -208,11 +209,6 @@
     ];
   };
 
-  # Enable smartd for disk health monitoring
-  services.smartd = {
-    enable = true;
-    autodetect = true;
-  };
 
   # Allow unfree packages globally
   nixpkgs.config.allowUnfree = true;
diff --git a/modules/disk-health.nix b/modules/disk-health.nix
new file mode 100644
index 0000000..68da26b
--- /dev/null
+++ b/modules/disk-health.nix
@@ -0,0 +1,129 @@
+{ config, lib, pkgs, ... }:
+
+{
+  # Simplified disk health monitoring configuration
+  # Focus on smartd real-time monitoring + simple daily SMART reports
+
+  # Package requirements
+  environment.systemPackages = with pkgs; [
+    smartmontools
+    curl  # For Gotify notifications
+  ];
+
+  # Enhanced smartd configuration for real-time monitoring
+  services.smartd = {
+    enable = true;
+    autodetect = false;  # We'll configure devices explicitly
+
+    # Global smartd options
+    extraOptions = [ "-A /var/log/smartd/" "-i 600" ];
+
+    # Device-specific monitoring configurations
+    # "-m <nomailer> -M exec PATH" runs the alert script instead of sending mail;
+    # a bare "-m exec=PATH" would be treated as a mail address by smartd.
+    devices = [
+      # ZFS Mirror drives (NVMe SSDs) - more frequent monitoring
+      {
+        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R";
+        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../6/03) -m <nomailer> -M exec /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh";
+      }
+      {
+        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG";
+        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../6/03) -m <nomailer> -M exec /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh";
+      }
+
+      # Data drives (12TB HDDs) - standard monitoring
+      {
+        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB";
+        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../7/03) -W 4,45,55 -m <nomailer> -M exec /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh";
+      }
+      {
+        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE";
+        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../7/03) -W 4,45,55 -m <nomailer> -M exec /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh";
+      }
+
+      # Parity drive (16TB HDD) - enhanced monitoring due to criticality
+      {
+        device = "/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE";
+        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../1/03) -W 2,45,55 -m <nomailer> -M exec /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh";
+      }
+    ];
+  };
+
+  # Simple systemd service for daily SMART reports
+  systemd.services = {
+    # Daily SMART report service - simplified and reliable
+    daily-smart-report = {
+      description = "Daily SMART Health Report";
+      after = [ "multi-user.target" ];
+      serviceConfig = {
+        Type = "oneshot";
+        ExecStart = "${pkgs.bash}/bin/bash /home/yanlin/.config/nix/scripts/daily-smart-report.sh";
+        User = "root";
+        StandardOutput = "journal";
+        StandardError = "journal";
+        # Add timeout to prevent hanging
+        TimeoutStartSec = "300";  # 5 minutes max
+      };
+    };
+  };
+
+  # Simple systemd timer for daily SMART reports
+  systemd.timers = {
+    # Daily SMART report at 8:00 AM
+    daily-smart-report = {
+      description = "Daily SMART Report Timer";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnCalendar = "08:00:00";
+        Persistent = true;
+        RandomizedDelaySec = "5m";
+      };
+    };
+  };
+
+  # Ensure log directories exist with proper permissions
+  systemd.tmpfiles.rules = [
+    "d /var/log 0755 root root -"
+    "f /var/log/daily-smart-report.log 0644 root root -"
+    "f /var/log/smartd-alerts.log 0644 root root -"
+    "d /var/log/smartd 0755 root root -"
+  ];
+
+  # Enable the timer
+  systemd.targets.timers.wants = [
+    "daily-smart-report.timer"
+  ];
+
+  # Create a logrotate configuration for the logs
+  services.logrotate = {
+    enable = true;
+    settings = {
+      "/var/log/daily-smart-report.log" = {
+        frequency = "weekly";
+        rotate = 4;
+        compress = true;
+        delaycompress = true;
+        missingok = true;
+        notifempty = true;
+        create = "644 root root";
+      };
+      "/var/log/smartd-alerts.log" = {
+        frequency = "weekly";
+        rotate = 4;
+        compress = true;
+        delaycompress = true;
+        missingok = true;
+        notifempty = true;
+        create = "644 root root";
+      };
+    };
+  };
+
+  # Ensure scripts are executable and in the right location
+  system.activationScripts.disk-health-scripts = ''
+    chmod +x /home/yanlin/.config/nix/scripts/gotify-notify.sh
+    chmod +x /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh
+    chmod +x /home/yanlin/.config/nix/scripts/daily-smart-report.sh
+  '';
+}
diff --git a/modules/zsh.nix b/modules/zsh.nix
index 1b69c62..7a62d62 100644
--- a/modules/zsh.nix
+++ b/modules/zsh.nix
@@ -29,6 +29,8 @@ in
       hms-offline = "home-manager switch --flake ~/.config/nix#$(whoami)@$(hostname) --option substitute false";
       doss = "sudo darwin-rebuild switch --flake ~/.config/nix#$(hostname)";
       noss = "sudo nixos-rebuild switch --flake ~/.config/nix#$(hostname)";
+      # Disk health monitoring
+      smart-report = "sudo /home/yanlin/.config/nix/scripts/daily-smart-report.sh";
     } // lib.optionalAttrs pkgs.stdenv.isDarwin {
       # macOS-specific app aliases
 
diff --git a/scripts/daily-smart-report.sh b/scripts/daily-smart-report.sh
new file mode 100755
index 0000000..f3c955f
--- /dev/null
+++ b/scripts/daily-smart-report.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+
+# Simple daily SMART report script - plain text version
+# Only checks SMART attributes and sends report via Gotify
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
+LOG_FILE="/var/log/daily-smart-report.log"
+
+# Host-specific Gotify configuration
+# NOTE(review): the token below is committed in plain text; consider reading
+# it from a root-only file instead.
+GOTIFY_URL="https://notify.yanlincs.com"
+GOTIFY_TOKEN="Ac9qKFH5cA.7Yly"
+
+# Drive configurations
+declare -A DRIVES=(
+    ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R"]="ZFS Mirror 1"
+    ["/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG"]="ZFS Mirror 2"
+    ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB"]="Data Drive 1 (12TB)"
+    ["/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE"]="Data Drive 2 (12TB)"
+    ["/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE"]="Parity Drive (16TB)"
+)
+
+# Check every configured drive, send the report via Gotify and append a
+# one-line summary to LOG_FILE.
+main() {
+    local nl=$'\n'  # real newline; gotify-notify.sh JSON-escapes the message
+    local report=""
+    local healthy_drives=0
+    local total_drives=0
+
+    echo "Starting daily SMART report"
+
+    report="Daily SMART Report - $(date '+%Y-%m-%d')${nl}${nl}"
+    report+="Drive SMART Status:${nl}"
+
+    # Check each drive
+    for device in "${!DRIVES[@]}"; do
+        local device_name="${DRIVES[$device]}"
+        total_drives=$((total_drives + 1))
+
+        echo "Checking $device_name"
+
+        # Quick device existence check
+        if [[ ! -e "$device" ]]; then
+            report+="[FAIL] $device_name: Device not found${nl}"
+            continue
+        fi
+
+        # Get SMART health. smartctl exits non-zero for a failing disk, so
+        # with pipefail the pipeline status is ignored and only the captured
+        # text decides; otherwise FAILED would be reported as UNKNOWN.
+        local health
+        health=$(smartctl -H "$device" 2>/dev/null | grep -o "PASSED\|FAILED" | head -1 || true)
+        if [[ -z "$health" ]]; then
+            health="UNKNOWN"
+        fi
+        echo "  Health: $health"
+
+        # Get temperature (only queried for drives that passed)
+        local temp="N/A"
+        if [[ "$health" == "PASSED" ]]; then
+            temp=$(smartctl -A "$device" 2>/dev/null | awk '/Temperature_Celsius/ {print $10}' | head -1 || true)
+            if [[ "$temp" -gt 0 ]] 2>/dev/null; then
+                temp="${temp}C"
+            else
+                temp="N/A"
+            fi
+            echo "  Temperature: $temp"
+        fi
+
+        # Format output
+        if [[ "$health" == "PASSED" ]]; then
+            report+="[OK] $device_name: $health (Temp: $temp)${nl}"
+            healthy_drives=$((healthy_drives + 1))
+        else
+            report+="[FAIL] $device_name: $health (Temp: $temp)${nl}"
+        fi
+    done
+
+    # Add summary
+    report+="${nl}Summary:${nl}"
+    if [[ $healthy_drives -eq $total_drives ]]; then
+        report+="Status: All $total_drives drives healthy${nl}"
+        report+="Next check: $(date -d 'tomorrow 08:00' '+%Y-%m-%d 08:00')"
+
+        echo "Result: All drives healthy ($healthy_drives/$total_drives)"
+
+        # Send notification
+        if [[ -x "$GOTIFY_SCRIPT" ]]; then
+            "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "normal" "Daily SMART Report" "$report"
+        fi
+    else
+        local issues=$((total_drives - healthy_drives))
+        report+="Status: $issues of $total_drives drives have issues"
+
+        echo "Result: Issues detected ($healthy_drives/$total_drives drives healthy)"
+
+        # Send high priority notification for issues
+        if [[ -x "$GOTIFY_SCRIPT" ]]; then
+            "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "high" "Daily SMART Report - Issues Detected" "$report"
+        fi
+    fi
+
+    # Simple logging
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Daily SMART report: $healthy_drives/$total_drives drives healthy" >> "$LOG_FILE" 2>/dev/null || true
+
+    echo "Daily SMART report completed"
+}
+
+main "$@"
diff --git a/scripts/disk-health-smartd-alert.sh b/scripts/disk-health-smartd-alert.sh
new file mode 100755
index 0000000..78f8b3d
--- /dev/null
+++ b/scripts/disk-health-smartd-alert.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# SMART daemon alert script for Gotify notifications
+# Called by smartd when SMART issues are detected
+# No arguments needed - uses SMARTD_DEVICE environment variable
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
+LOG_FILE="/var/log/smartd-alerts.log"
+
+# Host-specific Gotify configuration
+# NOTE(review): the token below is committed in plain text; consider reading
+# it from a root-only file instead.
+GOTIFY_URL="https://notify.yanlincs.com"
+GOTIFY_TOKEN="Ac9qKFH5cA.7Yly"
+
+# Drive name mapping based on device path
+get_drive_name() {
+    local device="$1"
+    case "$device" in
+        *"ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R"*)
+            echo "ZFS Mirror 1 (System)"
+            ;;
+        *"ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG"*)
+            echo "ZFS Mirror 2 (System)"
+            ;;
+        *"ata-HGST_HUH721212ALE604_5PK2N4GB"*)
+            echo "Data Drive 1 (12TB)"
+            ;;
+        *"ata-HGST_HUH721212ALE604_5PJ7Z3LE"*)
+            echo "Data Drive 2 (12TB)"
+            ;;
+        *"ata-ST16000NM000J-2TW103_WRS0F8BE"*)
+            echo "Parity Drive (16TB)"
+            ;;
+        *)
+            echo "Unknown Drive ($device)"
+            ;;
+    esac
+}
+
+log_message() {
+    local message="$1"
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $message" | tee -a "$LOG_FILE"
+}
+
+send_smartd_alert() {
+    # smartd provides these environment variables:
+    local device="${SMARTD_DEVICE:-unknown}"
+    local failtype="${SMARTD_FAILTYPE:-unknown}"
+    local message="${SMARTD_MESSAGE:-No details provided}"
+
+    local drive_name
+    drive_name=$(get_drive_name "$device")
+
+    log_message "SMART alert for $drive_name ($device): $failtype - $message"
+
+    # Determine priority based on failure type.
+    # Per smartd.conf(5), SMARTD_FAILTYPE "Health" means the SMART health
+    # status indicates imminent failure, so it is treated as critical.
+    local priority="high"
+    case "$failtype" in
+        *"FAILURE"*|*"failure"*|*"CRITICAL"*|*"critical"*|"Health")
+            priority="critical"
+            ;;
+        *"WARNING"*|*"warning"*|*"Temperature"*)
+            priority="high"
+            ;;
+        *)
+            priority="high"
+            ;;
+    esac
+
+    # Create notification message
+    local notification_title="SMART Alert: $drive_name"
+    local notification_message="Device: $device
+Failure Type: $failtype
+Details: $message
+
+This alert was triggered by smartd monitoring."
+
+    # Send Gotify notification
+    if [[ -x "$GOTIFY_SCRIPT" ]]; then
+        "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$notification_title" "$notification_message" || \
+            log_message "Failed to send Gotify notification"
+    else
+        log_message "Gotify script not found or not executable: $GOTIFY_SCRIPT"
+    fi
+}
+
+# Ensure log file exists
+touch "$LOG_FILE" 2>/dev/null || {
+    LOG_FILE="/tmp/smartd-alerts.log"
+    touch "$LOG_FILE"
+}
+
+# Main execution
+send_smartd_alert
diff --git a/scripts/gotify-notify.sh b/scripts/gotify-notify.sh
new file mode 100755
index 0000000..2e1d9dd
--- /dev/null
+++ b/scripts/gotify-notify.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+# Gotify notification script for disk health monitoring
+# Usage: gotify-notify.sh <url> <token> <priority> <title> <message>
+
+set -euo pipefail
+
+# Validate arguments
+if [[ $# -ne 5 ]]; then
+    echo "Usage: $0 <url> <token> <priority> <title> <message>"
+    echo "Example: $0 'https://notify.yanlincs.com' 'token123' 'high' 'Disk Alert' 'Drive temperature critical'"
+    exit 1
+fi
+
+# Get parameters
+GOTIFY_URL="$1"
+GOTIFY_TOKEN="$2"
+priority="$3"
+title="$4"
+message="$5"
+MAX_RETRIES=3
+RETRY_DELAY=5
+
+# Priority mapping: 1=low, 5=normal, 8=high, 10=critical
+declare -A PRIORITY_MAP=(
+    ["low"]="1"
+    ["normal"]="5"
+    ["high"]="8"
+    ["critical"]="10"
+)
+
+# Escape a string so it can be embedded in a JSON string literal.
+# Handles backslash, double quote, newline, carriage return and tab; other
+# control characters are passed through unchanged.
+json_escape() {
+    local s="$1"
+    s=${s//\\/\\\\}
+    s=${s//\"/\\\"}
+    s=${s//$'\n'/\\n}
+    s=${s//$'\r'/\\r}
+    s=${s//$'\t'/\\t}
+    printf '%s' "$s"
+}
+
+# Post the message to Gotify, retrying up to MAX_RETRIES times.
+# Arguments: <priority-name> <title> <message>; returns 0 on HTTP 200.
+send_notification() {
+    local priority="$1"
+    local title="$2"
+    local message="$3"
+    local attempt=1
+
+    # Map priority to numeric value
+    local numeric_priority="${PRIORITY_MAP[$priority]:-5}"
+
+    # Build the JSON body once; raw quotes or newlines in the title or
+    # message would otherwise produce invalid JSON.
+    local payload
+    payload=$(printf '{"title": "%s", "message": "%s", "priority": %s}' \
+        "$(json_escape "$title")" "$(json_escape "$message")" "$numeric_priority")
+
+    while [ $attempt -le $MAX_RETRIES ]; do
+        if curl -s -o /dev/null -w "%{http_code}" \
+            -X POST "${GOTIFY_URL}/message" \
+            -H "X-Gotify-Key: ${GOTIFY_TOKEN}" \
+            -H "Content-Type: application/json" \
+            -d "$payload" | grep -q "200"; then
+            echo "Notification sent successfully (attempt $attempt)"
+            return 0
+        else
+            echo "Failed to send notification (attempt $attempt/$MAX_RETRIES)"
+            if [ $attempt -lt $MAX_RETRIES ]; then
+                sleep $RETRY_DELAY
+            fi
+            attempt=$((attempt + 1))
+        fi
+    done
+
+    echo "ERROR: Failed to send notification after $MAX_RETRIES attempts" >&2
+    return 1
+}
+
+# Validate priority
+if [[ ! ${PRIORITY_MAP[$priority]+_} ]]; then
+    echo "Error: Invalid priority '$priority'. Use: low, normal, high, critical"
+    exit 1
+fi
+
+# Send notification
+send_notification "$priority" "$title" "$message"