diff --git a/hosts/nixos/hs/disk-health.nix b/hosts/nixos/hs/disk-health.nix
deleted file mode 100644
index 54d02d6..0000000
--- a/hosts/nixos/hs/disk-health.nix
+++ /dev/null
@@ -1,167 +0,0 @@
-{ config, lib, pkgs, ... }:
-
-{
-  # Simplified disk health monitoring configuration
-  # Focus on smartd real-time monitoring + simple daily SMART reports
-
-  # Package requirements
-  environment.systemPackages = with pkgs; [
-    smartmontools
-    curl # For Gotify notifications
-  ];
-
-  # Enhanced smartd configuration for real-time monitoring
-  services.smartd = {
-    enable = true;
-    autodetect = false; # We'll configure devices explicitly
-
-    # Global smartd options
-    extraOptions = [ "-A /var/log/smartd/" "-i 600" ];
-
-    # Disable default notifications
-    notifications = {
-      mail.enable = false;
-      x11.enable = false;
-      test = false;
-    };
-
-    # Device-specific monitoring configurations
-    devices = [
-      # ZFS Mirror drives (NVMe SSDs) - more frequent monitoring
-      {
-        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R";
-        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../6/03) -M exec ${pkgs.writeShellScript "smartd-notify-hs-zfs1" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh Ac9qKFH5cA.7Yly ZFS_Mirror_1
-        ''}";
-      }
-      {
-        device = "/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG";
-        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../6/03) -M exec ${pkgs.writeShellScript "smartd-notify-hs-zfs2" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh Ac9qKFH5cA.7Yly ZFS_Mirror_2
-        ''}";
-      }
-
-      # Data drives (12TB HDDs) - standard monitoring
-      {
-        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB";
-        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../7/03) -W 4,45,55 -M exec ${pkgs.writeShellScript "smartd-notify-hs-data1" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh Ac9qKFH5cA.7Yly Data_Drive_1_12TB
-        ''}";
-      }
-      {
-        device = "/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE";
-        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../7/03) -W 4,45,55 -M exec ${pkgs.writeShellScript "smartd-notify-hs-data2" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh Ac9qKFH5cA.7Yly Data_Drive_2_12TB
-        ''}";
-      }
-
-      # Parity drive (16TB HDD) - enhanced monitoring due to criticality
-      {
-        device = "/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE";
-        options = "-d auto -a -o on -S on -s (S/../.././02|L/../../1/03) -W 2,45,55 -M exec ${pkgs.writeShellScript "smartd-notify-hs-parity" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh Ac9qKFH5cA.7Yly Parity_Drive_16TB
-        ''}";
-      }
-    ];
-  };
-
-  # Simple systemd service for daily SMART reports
-  systemd.services = {
-    # Daily SMART report service - simplified and reliable
-    daily-smart-report = {
-      description = "Daily SMART Health Report";
-      after = [ "multi-user.target" ];
-      serviceConfig = {
-        Type = "oneshot";
-        ExecStart = "${pkgs.bash}/bin/bash /home/yanlin/.config/nix/scripts/daily-smart-report.sh Ac9qKFH5cA.7Yly";
-        User = "root";
-        StandardOutput = "journal";
-        StandardError = "journal";
-        # Add timeout to prevent hanging
-        TimeoutStartSec = "300"; # 5 minutes max
-        # Set PATH and SMART_DRIVES environment variables
-        Environment = [
-          "PATH=/run/current-system/sw/bin"
-          "SMART_DRIVES=/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R:ZFS Mirror 1;/dev/disk/by-id/ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG:ZFS Mirror 2;/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PK2N4GB:Data Drive 1 (12TB);/dev/disk/by-id/ata-HGST_HUH721212ALE604_5PJ7Z3LE:Data Drive 2 (12TB);/dev/disk/by-id/ata-ST16000NM000J-2TW103_WRS0F8BE:Parity Drive (16TB)"
-        ];
-        # Allow access to block devices for SMART commands
-        DeviceAllow = [ "/dev/disk/by-id/* rw" "/dev/sd* rw" "/dev/nvme* rw" "char-* rw" "block-* rw" ];
-        DevicePolicy = "closed";
-      };
-    };
-  };
-
-  # Simple systemd timer for daily SMART reports
-  systemd.timers = {
-    # Daily SMART report at 8:00 AM
-    daily-smart-report = {
-      description = "Daily SMART Report Timer";
-      wantedBy = [ "timers.target" ];
-      timerConfig = {
-        OnCalendar = "08:00:00";
-        Persistent = true;
-        RandomizedDelaySec = "5m";
-      };
-    };
-  };
-
-  # Ensure log directories exist with proper permissions
-  systemd.tmpfiles.rules = [
-    "d /var/log 0755 root root -"
-    "f /var/log/daily-smart-report.log 0644 root root -"
-    "f /var/log/smartd-alerts.log 0644 root root -"
-    "d /var/log/smartd 0755 root root -"
-  ];
-
-  # Enable the timer
-  systemd.targets.timers.wants = [
-    "daily-smart-report.timer"
-  ];
-
-  # Create a logrotate configuration for the logs
-  services.logrotate = {
-    enable = true;
-    settings = {
-      "/var/log/daily-smart-report.log" = {
-        frequency = "weekly";
-        rotate = 4;
-        compress = true;
-        delaycompress = true;
-        missingok = true;
-        notifempty = true;
-        create = "644 root root";
-      };
-      "/var/log/smartd-alerts.log" = {
-        frequency = "weekly";
-        rotate = 4;
-        compress = true;
-        delaycompress = true;
-        missingok = true;
-        notifempty = true;
-        create = "644 root root";
-      };
-    };
-  };
-
-  # Ensure scripts are executable and in the right location
-  system.activationScripts.disk-health-scripts = ''
-    chmod +x /home/yanlin/.config/nix/scripts/gotify-notify.sh
-    chmod +x /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh
-    chmod +x /home/yanlin/.config/nix/scripts/daily-smart-report.sh
-  '';
-}
\ No newline at end of file
diff --git a/hosts/nixos/hs/system.nix b/hosts/nixos/hs/system.nix
index eb9f1c1..99cc76f 100644
--- a/hosts/nixos/hs/system.nix
+++ b/hosts/nixos/hs/system.nix
@@ -3,7 +3,6 @@
     ./hardware-configuration.nix
     ./containers.nix # Host-specific container definitions
     ./proxy.nix # Host-specific Traefik dynamic configuration
-    ./disk-health.nix # Host-specific disk health monitoring
     ../../../modules/wireguard.nix
     ../../../modules/podman.nix
     ../../../modules/traefik.nix
@@ -253,6 +252,31 @@
     hideDotFiles = true;
   };
 
+  # Daily SMART report: runs the `smart-report` alias loaded from the user's zshrc.
+  # zsh expands aliases when a command line is parsed, so an alias created by
+  # `source` is not visible later on the same line; `eval` re-parses it at run time.
+  systemd.services.daily-smart-report = {
+    description = "Daily SMART Health Report";
+    after = [ "multi-user.target" ];
+    serviceConfig = {
+      Type = "oneshot";
+      User = "root";
+      ExecStart = "${pkgs.zsh}/bin/zsh -c 'source /home/yanlin/.zshrc && eval smart-report'";
+      StandardOutput = "journal";
+      StandardError = "journal";
+    };
+  };
+
+  systemd.timers.daily-smart-report = {
+    description = "Daily SMART Report Timer";
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      OnCalendar = "08:00:00";
+      Persistent = true;
+      RandomizedDelaySec = "5m";
+    };
+  };
+
   # Borg backup configuration
   services.borgbackup-custom = {
     enable = true;
diff --git a/hosts/nixos/thinkpad/disk-health.nix b/hosts/nixos/thinkpad/disk-health.nix
deleted file mode 100644
index da4229c..0000000
--- a/hosts/nixos/thinkpad/disk-health.nix
+++ /dev/null
@@ -1,123 +0,0 @@
-{ config, lib, pkgs, ... }:
-
-{
-  # Simplified disk health monitoring for ThinkPad laptop
-  # Single NVMe SSD monitoring with laptop-friendly settings
-
-  # Package requirements
-  environment.systemPackages = with pkgs; [
-    smartmontools
-    curl # For Gotify notifications
-  ];
-
-  # Smartd configuration for laptop NVMe SSD
-  services.smartd = {
-    enable = true;
-    autodetect = false; # Explicit configuration
-
-    # Global smartd options
-    extraOptions = [ "-A /var/log/smartd/" "-i 900" ]; # Check every 15 minutes
-
-    # Disable default notifications
-    notifications = {
-      mail.enable = false;
-      x11.enable = false;
-      test = false;
-    };
-
-    # Single NVMe drive monitoring with all options inline
-    devices = [
-      {
-        device = "/dev/nvme0n1";
-        options = "-d nvme -a -o on -S on -s (S/../.././03|L/../../7/04) -W 4,60,70 -M exec ${pkgs.writeShellScript "smartd-notify-thinkpad" ''
-          export SMARTD_DEVICE="$SMARTD_DEVICE"
-          export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
-          export SMARTD_MESSAGE="$SMARTD_MESSAGE"
-          /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh AieM4SJHFcyl7TC System_SSD_ThinkPad
-        ''}";
-      }
-    ];
-  };
-
-  # Daily SMART report service
-  systemd.services = {
-    daily-smart-report = {
-      description = "Daily SMART Health Report for ThinkPad";
-      after = [ "multi-user.target" ];
-      serviceConfig = {
-        Type = "oneshot";
-        ExecStart = "${pkgs.bash}/bin/bash /home/yanlin/.config/nix/scripts/daily-smart-report.sh AieM4SJHFcyl7TC";
-        User = "root";
-        StandardOutput = "journal";
-        StandardError = "journal";
-        TimeoutStartSec = "300"; # 5 minutes max
-        # Environment with single NVMe drive
-        Environment = [
-          "PATH=/run/current-system/sw/bin"
-          "SMART_DRIVES=/dev/nvme0n1:System SSD (ThinkPad)"
-        ];
-        # Allow access to NVMe devices
-        DeviceAllow = [ "/dev/nvme* rw" "char-* rw" "block-* rw" ];
-        DevicePolicy = "closed";
-      };
-    };
-  };
-
-  # Daily SMART report timer - runs at 09:00 (later than server)
-  systemd.timers = {
-    daily-smart-report = {
-      description = "Daily SMART Report Timer for ThinkPad";
-      wantedBy = [ "timers.target" ];
-      timerConfig = {
-        OnCalendar = "09:00:00"; # Later time for laptop
-        Persistent = true;
-        RandomizedDelaySec = "10m"; # Longer randomization for laptop
-      };
-    };
-  };
-
-  # Ensure log directories exist
-  systemd.tmpfiles.rules = [
-    "d /var/log 0755 root root -"
-    "f /var/log/daily-smart-report.log 0644 root root -"
-    "f /var/log/smartd-alerts.log 0644 root root -"
-    "d /var/log/smartd 0755 root root -"
-  ];
-
-  # Enable the timer
-  systemd.targets.timers.wants = [
-    "daily-smart-report.timer"
-  ];
-
-  # Logrotate configuration
-  services.logrotate = {
-    enable = true;
-    settings = {
-      "/var/log/daily-smart-report.log" = {
-        frequency = "weekly";
-        rotate = 4;
-        compress = true;
-        delaycompress = true;
-        missingok = true;
-        notifempty = true;
-        create = "644 root root";
-      };
-      "/var/log/smartd-alerts.log" = {
-        frequency = "weekly";
-        rotate = 4;
-        compress = true;
-        delaycompress = true;
-        missingok = true;
-        notifempty = true;
-        create = "644 root root";
-      };
-    };
-  };
-
-  # Ensure scripts are executable
-  system.activationScripts.disk-health-scripts = ''
-    chmod +x /home/yanlin/.config/nix/scripts/gotify-notify.sh
-    chmod +x /home/yanlin/.config/nix/scripts/disk-health-smartd-alert.sh
-    chmod +x /home/yanlin/.config/nix/scripts/daily-smart-report.sh
-  '';
-}
\ No newline at end of file
diff --git a/hosts/nixos/thinkpad/system.nix b/hosts/nixos/thinkpad/system.nix
index d05315b..e03418b 100644
--- a/hosts/nixos/thinkpad/system.nix
+++ b/hosts/nixos/thinkpad/system.nix
@@ -1,7 +1,6 @@
 { config, pkgs, lib, ... }: {
   imports = [
     ./hardware-configuration.nix
-    ./disk-health.nix
     ../../../modules/wireguard.nix
     ../../../modules/borg-server.nix
   ];
@@ -355,6 +354,31 @@
     };
   };
 
+  # Daily SMART report: runs the `smart-report` alias loaded from the user's zshrc.
+  # zsh expands aliases when a command line is parsed, so an alias created by
+  # `source` is not visible later on the same line; `eval` re-parses it at run time.
+  systemd.services.daily-smart-report = {
+    description = "Daily SMART Health Report";
+    after = [ "multi-user.target" ];
+    serviceConfig = {
+      Type = "oneshot";
+      User = "root";
+      ExecStart = "${pkgs.zsh}/bin/zsh -c 'source /home/yanlin/.zshrc && eval smart-report'";
+      StandardOutput = "journal";
+      StandardError = "journal";
+    };
+  };
+
+  systemd.timers.daily-smart-report = {
+    description = "Daily SMART Report Timer";
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      OnCalendar = "09:00:00";
+      Persistent = true;
+      RandomizedDelaySec = "10m";
+    };
+  };
+
   # Borg backup server configuration
   services.borgbackup-server = {
     enable = true;
diff --git a/scripts/disk-health-smartd-alert.sh b/scripts/disk-health-smartd-alert.sh
deleted file mode 100755
index af96c0d..0000000
--- a/scripts/disk-health-smartd-alert.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-# SMART daemon alert script for Gotify notifications
-# Called by smartd when SMART issues are detected
-# Usage: disk-health-smartd-alert.sh
-# Uses SMARTD_DEVICE environment variable for device info
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
-LOG_FILE="/var/log/smartd-alerts.log"
-
-# Get parameters
-GOTIFY_TOKEN="${1:-}"
-DRIVE_NAME="${2:-}"
-
-# Validate parameters
-if [[ -z "$GOTIFY_TOKEN" ]]; then
-    echo "Error: Gotify token not provided"
-    echo "Usage: $0 "
-    exit 1
-fi
-
-# If drive name not provided, use device path
-if [[ -z "$DRIVE_NAME" ]]; then
-    DRIVE_NAME="${SMARTD_DEVICE:-Unknown Drive}"
-fi
-
-# Gotify configuration
-GOTIFY_URL="https://notify.yanlincs.com"
-
-log_message() {
-    local message="$1"
-    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $message" | tee -a "$LOG_FILE"
-}
-
-send_smartd_alert() {
-    # smartd provides these environment variables:
-    local device="${SMARTD_DEVICE:-unknown}"
-    local failtype="${SMARTD_FAILTYPE:-unknown}"
-    local message="${SMARTD_MESSAGE:-No details provided}"
-
-    log_message "SMART alert for $DRIVE_NAME ($device): $failtype - $message"
-
-    # Determine priority based on failure type
-    local priority="high"
-    case "$failtype" in
-        *"FAILURE"*|*"failure"*|*"CRITICAL"*|*"critical"*)
-            priority="critical"
-            ;;
-        *"WARNING"*|*"warning"*|*"Temperature"*)
-            priority="high"
-            ;;
-        *)
-            priority="high"
-            ;;
-    esac
-
-    # Create notification message
-    local notification_title="SMART Alert: $DRIVE_NAME"
-    local notification_message="Device: $device
-Failure Type: $failtype
-Details: $message
-
-This alert was triggered by smartd monitoring."
-
-    # Send Gotify notification
-    if [[ -x "$GOTIFY_SCRIPT" ]]; then
-        "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$priority" "$notification_title" "$notification_message" || \
-            log_message "Failed to send Gotify notification"
-    else
-        log_message "Gotify script not found or not executable: $GOTIFY_SCRIPT"
-    fi
-}
-
-# Ensure log file exists
-touch "$LOG_FILE" 2>/dev/null || {
-    LOG_FILE="/tmp/smartd-alerts.log"
-    touch "$LOG_FILE"
-}
-
-# Main execution
-send_smartd_alert