Simplify daily smart report

This commit is contained in:
Yan Lin 2025-09-15 12:25:20 +02:00
parent d57966b115
commit b31ac3cc90
5 changed files with 46 additions and 374 deletions

View file

@ -1,167 +0,0 @@
{ config, lib, pkgs, ... }:

let
  # Gotify application token passed as the first argument to both scripts.
  # NOTE(review): as a literal it ends up in the Nix store and in the generated
  # unit file; consider reading it from a root-only secret file instead.
  gotifyToken = "Ac9qKFH5cA.7Yly";

  # Location of the helper scripts invoked by smartd and by the daily report.
  scriptDir = "/home/yanlin/.config/nix/scripts";

  # Single source of truth for every monitored drive.
  #   id            name under /dev/disk/by-id
  #   notifyName    store name of the generated smartd "-M exec" wrapper
  #   alertLabel    drive name handed to disk-health-smartd-alert.sh
  #   reportLabel   human-readable name used in SMART_DRIVES
  #   longTestDay   smartd day-of-week field for the weekly long self-test
  #   tempDirective optional "-W DIFF,INFO,CRIT " temperature directive
  #                 (keep the trailing space when non-empty)
  drives = [
    # ZFS mirror SSDs (ata-* ids): daily short test, long test on day 6
    {
      id = "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431J4R";
      notifyName = "smartd-notify-hs-zfs1";
      alertLabel = "ZFS_Mirror_1";
      reportLabel = "ZFS Mirror 1";
      longTestDay = "6";
      tempDirective = "";
    }
    {
      id = "ata-ZHITAI_SC001_XT_1000GB_ZTB401TAB244431KEG";
      notifyName = "smartd-notify-hs-zfs2";
      alertLabel = "ZFS_Mirror_2";
      reportLabel = "ZFS Mirror 2";
      longTestDay = "6";
      tempDirective = "";
    }
    # Data drives (12TB HDDs): long test on day 7, temperature tracking enabled
    {
      id = "ata-HGST_HUH721212ALE604_5PK2N4GB";
      notifyName = "smartd-notify-hs-data1";
      alertLabel = "Data_Drive_1_12TB";
      reportLabel = "Data Drive 1 (12TB)";
      longTestDay = "7";
      tempDirective = "-W 4,45,55 ";
    }
    {
      id = "ata-HGST_HUH721212ALE604_5PJ7Z3LE";
      notifyName = "smartd-notify-hs-data2";
      alertLabel = "Data_Drive_2_12TB";
      reportLabel = "Data Drive 2 (12TB)";
      longTestDay = "7";
      tempDirective = "-W 4,45,55 ";
    }
    # Parity drive (16TB HDD): long test on day 1, tighter temperature delta
    {
      id = "ata-ST16000NM000J-2TW103_WRS0F8BE";
      notifyName = "smartd-notify-hs-parity";
      alertLabel = "Parity_Drive_16TB";
      reportLabel = "Parity Drive (16TB)";
      longTestDay = "1";
      tempDirective = "-W 2,45,55 ";
    }
  ];

  # Stable device path for a drive entry.
  devicePath = drive: "/dev/disk/by-id/${drive.id}";

  # Build one services.smartd.devices entry. The generated wrapper re-exports
  # the SMARTD_* variables and calls the alert script with token and label.
  mkSmartdDevice = drive: {
    device = devicePath drive;
    options = "-d auto -a -o on -S on -s (S/../.././02|L/../../${drive.longTestDay}/03) ${drive.tempDirective}-M exec ${pkgs.writeShellScript drive.notifyName ''
      export SMARTD_DEVICE="$SMARTD_DEVICE"
      export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
      export SMARTD_MESSAGE="$SMARTD_MESSAGE"
      ${scriptDir}/disk-health-smartd-alert.sh ${gotifyToken} ${drive.alertLabel}
    ''}";
  };

  # "path:label;path:label;..." list handed to the daily report script.
  smartDrives = lib.concatMapStringsSep ";"
    (drive: "${devicePath drive}:${drive.reportLabel}")
    drives;

  # Rotation policy shared by both log files.
  weeklyLogPolicy = {
    frequency = "weekly";
    rotate = 4;
    compress = true;
    delaycompress = true;
    missingok = true;
    notifempty = true;
    create = "644 root root";
  };
in
{
  # Simplified disk health monitoring configuration
  # Focus on smartd real-time monitoring + simple daily SMART reports

  # Package requirements
  environment.systemPackages = with pkgs; [
    smartmontools
    curl # For Gotify notifications
  ];

  # smartd configuration for real-time monitoring
  services.smartd = {
    enable = true;
    autodetect = false; # Devices are configured explicitly below

    # Global smartd options: attribute log prefix and a 600 s check interval
    extraOptions = [ "-A /var/log/smartd/" "-i 600" ];

    # Disable default notifications; alerts go through the -M exec wrappers
    notifications = {
      mail.enable = false;
      x11.enable = false;
      test = false;
    };

    # Device-specific monitoring configurations
    devices = map mkSmartdDevice drives;
  };

  # Daily SMART report service
  systemd.services.daily-smart-report = {
    description = "Daily SMART Health Report";
    after = [ "multi-user.target" ];

    # SMART_DRIVES contains spaces. Values placed in serviceConfig.Environment
    # are written to the unit unquoted, and systemd splits Environment= on
    # whitespace, which truncated the list after the first word. The
    # `environment` option quotes the assignment so the full value survives.
    environment.SMART_DRIVES = smartDrives;

    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${pkgs.bash}/bin/bash ${scriptDir}/daily-smart-report.sh ${gotifyToken}";
      User = "root";
      StandardOutput = "journal";
      StandardError = "journal";

      # Add timeout to prevent hanging
      TimeoutStartSec = "300"; # 5 minutes max

      # PATH for the report script (contains no whitespace, so it is safe here)
      Environment = [
        "PATH=/run/current-system/sw/bin"
      ];

      # Allow access to block devices for SMART commands
      # NOTE(review): systemd documents globs only for the char-/block- group
      # form; the /dev/... entries with `*` are probably not expanded, and
      # `char-*` / `block-*` admit every device class - confirm the intended
      # restriction.
      DeviceAllow = [ "/dev/disk/by-id/* rw" "/dev/sd* rw" "/dev/nvme* rw" "char-* rw" "block-* rw" ];
      DevicePolicy = "closed";
    };
  };

  # Daily SMART report at 8:00 AM
  systemd.timers.daily-smart-report = {
    description = "Daily SMART Report Timer";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "08:00:00";
      Persistent = true;
      RandomizedDelaySec = "5m";
    };
  };

  # Ensure log directories exist with proper permissions
  systemd.tmpfiles.rules = [
    "d /var/log 0755 root root -"
    "f /var/log/daily-smart-report.log 0644 root root -"
    "f /var/log/smartd-alerts.log 0644 root root -"
    "d /var/log/smartd 0755 root root -"
  ];

  # Enable the timer (in addition to the timer's own wantedBy above)
  systemd.targets.timers.wants = [
    "daily-smart-report.timer"
  ];

  # Logrotate configuration for the logs
  services.logrotate = {
    enable = true;
    settings = {
      "/var/log/daily-smart-report.log" = weeklyLogPolicy;
      "/var/log/smartd-alerts.log" = weeklyLogPolicy;
    };
  };

  # Ensure scripts are executable
  system.activationScripts.disk-health-scripts = ''
    chmod +x ${scriptDir}/gotify-notify.sh
    chmod +x ${scriptDir}/disk-health-smartd-alert.sh
    chmod +x ${scriptDir}/daily-smart-report.sh
  '';
}

View file

@ -3,7 +3,6 @@
./hardware-configuration.nix ./hardware-configuration.nix
./containers.nix # Host-specific container definitions ./containers.nix # Host-specific container definitions
./proxy.nix # Host-specific Traefik dynamic configuration ./proxy.nix # Host-specific Traefik dynamic configuration
./disk-health.nix # Host-specific disk health monitoring
../../../modules/wireguard.nix ../../../modules/wireguard.nix
../../../modules/podman.nix ../../../modules/podman.nix
../../../modules/traefik.nix ../../../modules/traefik.nix
@ -253,6 +252,29 @@
hideDotFiles = true; hideDotFiles = true;
}; };
# Daily SMART report using the `smart-report` shell alias
# (presumably defined in /home/yanlin/.zshrc - verify).
# NOTE(review): this sources a file under a user's home directory as root;
# anything that can edit that file gains root code execution via this unit.
systemd.services.daily-smart-report = {
  description = "Daily SMART Health Report";
  after = [ "multi-user.target" ];
  serviceConfig = {
    Type = "oneshot";
    User = "root";
    # zsh expands aliases while parsing, and the whole `a && b` list is parsed
    # before .zshrc is sourced, so a bare `smart-report` would never see an
    # alias defined there. `eval` re-parses the word after sourcing; it works
    # equally for an alias, a function, or a command on PATH.
    ExecStart = "${pkgs.zsh}/bin/zsh -c 'source /home/yanlin/.zshrc && eval smart-report'";
    StandardOutput = "journal";
    StandardError = "journal";
    # Oneshot units have no start timeout by default; cap the run at
    # 5 minutes so a stuck report cannot hang indefinitely.
    TimeoutStartSec = "300";
  };
};

# Fire daily at 08:00 with up to 5 minutes of jitter; Persistent catches up
# on a run that was missed while the machine was off.
systemd.timers.daily-smart-report = {
  description = "Daily SMART Report Timer";
  wantedBy = [ "timers.target" ];
  timerConfig = {
    OnCalendar = "08:00:00";
    Persistent = true;
    RandomizedDelaySec = "5m";
  };
};
# Borg backup configuration # Borg backup configuration
services.borgbackup-custom = { services.borgbackup-custom = {
enable = true; enable = true;

View file

@ -1,123 +0,0 @@
{ config, lib, pkgs, ... }:

let
  # Gotify application token passed as the first argument to both scripts.
  # NOTE(review): as a literal it ends up in the Nix store and in the generated
  # unit file; consider reading it from a root-only secret file instead.
  gotifyToken = "AieM4SJHFcyl7TC";

  # Location of the helper scripts invoked by smartd and by the daily report.
  scriptDir = "/home/yanlin/.config/nix/scripts";

  # The laptop's only monitored drive.
  systemDrive = "/dev/nvme0n1";

  # Wrapper run by smartd (-M exec): re-exports the SMARTD_* variables and
  # calls the alert script with the token and a drive label.
  notifyScript = pkgs.writeShellScript "smartd-notify-thinkpad" ''
    export SMARTD_DEVICE="$SMARTD_DEVICE"
    export SMARTD_FAILTYPE="$SMARTD_FAILTYPE"
    export SMARTD_MESSAGE="$SMARTD_MESSAGE"
    ${scriptDir}/disk-health-smartd-alert.sh ${gotifyToken} System_SSD_ThinkPad
  '';

  # Rotation policy shared by both log files.
  weeklyLogPolicy = {
    frequency = "weekly";
    rotate = 4;
    compress = true;
    delaycompress = true;
    missingok = true;
    notifempty = true;
    create = "644 root root";
  };
in
{
  # Simplified disk health monitoring for ThinkPad laptop
  # Single NVMe SSD monitoring with laptop-friendly settings

  # Package requirements
  environment.systemPackages = with pkgs; [
    smartmontools
    curl # For Gotify notifications
  ];

  # Smartd configuration for laptop NVMe SSD
  services.smartd = {
    enable = true;
    autodetect = false; # Explicit configuration

    # Global smartd options
    extraOptions = [ "-A /var/log/smartd/" "-i 900" ]; # Check every 15 minutes

    # Disable default notifications; alerts go through the -M exec wrapper
    notifications = {
      mail.enable = false;
      x11.enable = false;
      test = false;
    };

    # Single NVMe drive: short test daily at 03, long test on day 7 at 04,
    # temperature directive -W 4,60,70
    devices = [
      {
        device = systemDrive;
        options = "-d nvme -a -o on -S on -s (S/../.././03|L/../../7/04) -W 4,60,70 -M exec ${notifyScript}";
      }
    ];
  };

  # Daily SMART report service
  systemd.services.daily-smart-report = {
    description = "Daily SMART Health Report for ThinkPad";
    after = [ "multi-user.target" ];

    # The drive label contains spaces. Values placed in
    # serviceConfig.Environment are written to the unit unquoted, and systemd
    # splits Environment= on whitespace, which truncated SMART_DRIVES after
    # the first word. The `environment` option quotes the assignment.
    environment.SMART_DRIVES = "${systemDrive}:System SSD (ThinkPad)";

    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${pkgs.bash}/bin/bash ${scriptDir}/daily-smart-report.sh ${gotifyToken}";
      User = "root";
      StandardOutput = "journal";
      StandardError = "journal";
      TimeoutStartSec = "300"; # 5 minutes max

      # PATH for the report script (contains no whitespace, so it is safe here)
      Environment = [
        "PATH=/run/current-system/sw/bin"
      ];

      # Allow access to NVMe devices
      # NOTE(review): systemd documents globs only for the char-/block- group
      # form; "/dev/nvme*" is probably not expanded, and `char-*` / `block-*`
      # admit every device class - confirm the intended restriction.
      DeviceAllow = [ "/dev/nvme* rw" "char-* rw" "block-* rw" ];
      DevicePolicy = "closed";
    };
  };

  # Daily SMART report timer - runs at 09:00 (later than server)
  systemd.timers.daily-smart-report = {
    description = "Daily SMART Report Timer for ThinkPad";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "09:00:00"; # Later time for laptop
      Persistent = true;
      RandomizedDelaySec = "10m"; # Longer randomization for laptop
    };
  };

  # Ensure log directories exist
  systemd.tmpfiles.rules = [
    "d /var/log 0755 root root -"
    "f /var/log/daily-smart-report.log 0644 root root -"
    "f /var/log/smartd-alerts.log 0644 root root -"
    "d /var/log/smartd 0755 root root -"
  ];

  # Enable the timer (in addition to the timer's own wantedBy above)
  systemd.targets.timers.wants = [
    "daily-smart-report.timer"
  ];

  # Logrotate configuration
  services.logrotate = {
    enable = true;
    settings = {
      "/var/log/daily-smart-report.log" = weeklyLogPolicy;
      "/var/log/smartd-alerts.log" = weeklyLogPolicy;
    };
  };

  # Ensure scripts are executable
  system.activationScripts.disk-health-scripts = ''
    chmod +x ${scriptDir}/gotify-notify.sh
    chmod +x ${scriptDir}/disk-health-smartd-alert.sh
    chmod +x ${scriptDir}/daily-smart-report.sh
  '';
}

View file

@ -1,7 +1,6 @@
{ config, pkgs, lib, ... }: { { config, pkgs, lib, ... }: {
imports = [ imports = [
./hardware-configuration.nix ./hardware-configuration.nix
./disk-health.nix
../../../modules/wireguard.nix ../../../modules/wireguard.nix
../../../modules/borg-server.nix ../../../modules/borg-server.nix
]; ];
@ -355,6 +354,29 @@
}; };
}; };
# Daily SMART report using the `smart-report` shell alias
# (presumably defined in /home/yanlin/.zshrc - verify).
# NOTE(review): this sources a file under a user's home directory as root;
# anything that can edit that file gains root code execution via this unit.
systemd.services.daily-smart-report = {
  description = "Daily SMART Health Report";
  after = [ "multi-user.target" ];
  serviceConfig = {
    Type = "oneshot";
    User = "root";
    # zsh expands aliases while parsing, and the whole `a && b` list is parsed
    # before .zshrc is sourced, so a bare `smart-report` would never see an
    # alias defined there. `eval` re-parses the word after sourcing; it works
    # equally for an alias, a function, or a command on PATH.
    ExecStart = "${pkgs.zsh}/bin/zsh -c 'source /home/yanlin/.zshrc && eval smart-report'";
    StandardOutput = "journal";
    StandardError = "journal";
    # Oneshot units have no start timeout by default; cap the run at
    # 5 minutes so a stuck report cannot hang indefinitely.
    TimeoutStartSec = "300";
  };
};

# Fire daily at 09:00 with up to 10 minutes of jitter; Persistent catches up
# on a run that was missed while the machine was off.
systemd.timers.daily-smart-report = {
  description = "Daily SMART Report Timer";
  wantedBy = [ "timers.target" ];
  timerConfig = {
    OnCalendar = "09:00:00";
    Persistent = true;
    RandomizedDelaySec = "10m";
  };
};
# Borg backup server configuration # Borg backup server configuration
services.borgbackup-server = { services.borgbackup-server = {
enable = true; enable = true;

View file

@ -1,82 +0,0 @@
# SMART daemon alert script for Gotify notifications
# Called by smartd when SMART issues are detected
# Usage: disk-health-smartd-alert.sh <gotify_token> <drive_name>
# Uses SMARTD_DEVICE environment variable for device info
# Abort on command failure, on use of unset variables, and on failures
# anywhere inside a pipeline.
set -euo pipefail

# Directory containing this script; gotify-notify.sh is looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GOTIFY_SCRIPT="${SCRIPT_DIR}/gotify-notify.sh"
LOG_FILE="/var/log/smartd-alerts.log"

# Get parameters (default to empty so `set -u` does not trip when omitted)
GOTIFY_TOKEN="${1:-}"
DRIVE_NAME="${2:-}"

# Validate parameters: the token is mandatory. Diagnostics go to stderr so
# they are not mistaken for regular output.
if [[ -z "$GOTIFY_TOKEN" ]]; then
    echo "Error: Gotify token not provided" >&2
    echo "Usage: $0 <gotify_token> <drive_name>" >&2
    exit 1
fi

# If drive name not provided, use the device path from SMARTD_DEVICE
if [[ -z "$DRIVE_NAME" ]]; then
    DRIVE_NAME="${SMARTD_DEVICE:-Unknown Drive}"
fi

# Gotify configuration
GOTIFY_URL="https://notify.yanlincs.com"
# Write a timestamped line to stdout and append the same line to $LOG_FILE.
#   $1 - text to log
log_message() {
    local text="$1"
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$text" | tee -a "$LOG_FILE"
}
# Log the SMART event described by the SMARTD_* environment variables and
# forward it to Gotify through $GOTIFY_SCRIPT.
# Reads globals: DRIVE_NAME, GOTIFY_SCRIPT, GOTIFY_URL, GOTIFY_TOKEN.
send_smartd_alert() {
    # Event details from the environment, with placeholders when unset
    local dev="${SMARTD_DEVICE:-unknown}"
    local kind="${SMARTD_FAILTYPE:-unknown}"
    local details="${SMARTD_MESSAGE:-No details provided}"

    log_message "SMART alert for $DRIVE_NAME ($dev): $kind - $details"

    # Everything is "high" unless the failure type mentions a failure or a
    # critical condition, in which case it is escalated to "critical".
    local level="high"
    case "$kind" in
        *FAILURE*|*failure*|*CRITICAL*|*critical*)
            level="critical"
            ;;
    esac

    # Notification title and body
    local title="SMART Alert: $DRIVE_NAME"
    local body="Device: $dev
Failure Type: $kind
Details: $details
This alert was triggered by smartd monitoring."

    # Without a usable sender script, record that in the log and stop
    if [[ ! -x "$GOTIFY_SCRIPT" ]]; then
        log_message "Gotify script not found or not executable: $GOTIFY_SCRIPT"
        return
    fi

    # A failed delivery is logged but does not abort the script
    "$GOTIFY_SCRIPT" "$GOTIFY_URL" "$GOTIFY_TOKEN" "$level" "$title" "$body" || \
        log_message "Failed to send Gotify notification"
}
# Make sure the log file can be created/updated; if it cannot, switch to a
# log file under /tmp instead.
if ! touch "$LOG_FILE" 2>/dev/null; then
    LOG_FILE="/tmp/smartd-alerts.log"
    touch "$LOG_FILE"
fi

# Entry point: log and send the alert for the current smartd event
send_smartd_alert