a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
951 lines
34 KiB
Bash
951 lines
34 KiB
Bash
#!/bin/bash
################################################################################
# Script Name: smart-drive-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for SMART drive health metrics.
#              Reads SMART attributes from SATA and NVMe drives using smartctl
#              and exports temperature, reallocated sectors, pending sectors,
#              uncorrectable errors, power-on hours, wear leveling, NVMe health,
#              and overall drive health status.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - smartmontools (smartctl)
#   - Root or sudo access for smartctl
#   - netcat (nc) for HTTP mode
#
# Usage:
#   # Output to stdout
#   sudo ./smart-drive-exporter.sh
#
#   # HTTP server mode
#   sudo ./smart-drive-exporter.sh --http -p 9198
#
#   # Textfile collector mode
#   sudo ./smart-drive-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - smart_drive_up - Exporter status (1=up, 0=down)
#     - smart_drive_exporter_info{version} - Exporter version
#
#   Drive Health:
#     - smart_drive_health_ok{device,model,serial,type} - SMART health (1=passed)
#     - smart_drive_temperature_celsius{device,model,serial} - Temperature
#     - smart_drive_power_on_hours{device,model,serial} - Power-on hours
#     - smart_drive_power_cycle_count{device,model,serial} - Power cycles
#     - smart_drive_capacity_bytes{device,model,serial} - Drive capacity
#
#   SATA Attributes:
#     - smart_drive_reallocated_sectors{device,model,serial} - Reallocated sectors
#     - smart_drive_pending_sectors{device,model,serial} - Pending sectors
#     - smart_drive_uncorrectable_errors{device,model,serial} - Uncorrectable errors
#     - smart_drive_spin_retry_count{device,model,serial} - Spin retries
#     - smart_drive_command_timeout{device,model,serial} - Command timeouts
#     - smart_drive_start_stop_count{device,model,serial} - Start/stop count
#     - smart_drive_wear_leveling_count{device,model,serial} - SSD wear leveling
#     - smart_drive_interface_speed{device,model,serial,speed} - Interface speed
#
#   NVMe Attributes:
#     - smart_drive_percentage_used{device,model,serial} - NVMe percentage used
#     - smart_drive_available_spare{device,model,serial} - Available spare %
#     - smart_drive_available_spare_threshold{device,model,serial} - Spare threshold
#     - smart_drive_media_errors{device,model,serial} - Media errors
#     - smart_drive_critical_warning{device,model,serial} - Critical warning bitmap
#
#   Exporter:
#     - smart_drive_exporter_duration_seconds - Script execution time
#     - smart_drive_exporter_last_run_timestamp - Last run timestamp
#     - smart_drive_devices_total - Total drives detected
#
# Configuration:
#   Default HTTP port: 9198
#   Textfile directory: /var/lib/node_exporter
#   SMART_DRIVE_DEVICES: auto (or comma-separated, e.g., /dev/sda,/dev/nvme0n1)
#   SMART_DRIVE_SMARTCTL_PATH: /usr/sbin/smartctl
#   SMART_DRIVE_SUDO: auto (auto, yes, no)
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Directory used for the node_exporter textfile collector (--textfile).
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file for metrics; empty means "print to stdout".
OUTPUT_FILE=""
# When true, metrics are served over HTTP on HTTP_PORT.
HTTP_MODE=false
HTTP_PORT=9198

# Environment overrides; each can also be changed by a command-line option.
DEVICES="${SMART_DRIVE_DEVICES:-auto}"
SMARTCTL_PATH="${SMART_DRIVE_SMARTCTL_PATH:-/usr/sbin/smartctl}"
SUDO_MODE="${SMART_DRIVE_SUDO:-auto}"

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Escape a string so it can be placed inside a double-quoted Prometheus
# label value.
# Arguments: $1 - raw label value
# Outputs:   the value with backslash, double quote and newline rewritten as
#            \\, \" and \n, followed by a single newline
prom_escape() {
  local s="$1"
  s=${s//\\/\\\\}    # backslashes first, so the escapes added below stay intact
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  printf '%s\n' "$s"
}

# Print the command-line help text and terminate the script.
# Globals:   HTTP_PORT, SMARTCTL_PATH (read, shown as current defaults)
# Outputs:   usage text on stdout
# Returns:   never returns; exits with status 0
show_usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Export SMART drive health metrics as Prometheus metrics (v1.0).

MODES:
    --textfile          Write to node_exporter textfile collector
    --http              Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port          HTTP port (default: 9198)
    -o, --output        Output file path
    --devices LIST      Comma-separated device list (default: auto-detect)
    --smartctl PATH     Path to smartctl binary (default: $SMARTCTL_PATH)
    --sudo MODE         Sudo mode: auto, yes, no (default: auto)

EXAMPLES:
    $0 --textfile                           # Write to textfile collector
    $0 --http --port 9198                   # Run HTTP server
    $0 --devices /dev/sda,/dev/nvme0n1      # Specify drives
    $0 -o /tmp/smart_drive.prom             # Write to custom file

ENVIRONMENT VARIABLES:
    SMART_DRIVE_DEVICES         Device list: auto or comma-separated
    SMART_DRIVE_SMARTCTL_PATH   Path to smartctl binary
    SMART_DRIVE_SUDO            Sudo mode: auto, yes, no

EOF
  exit 0
}

# Parse command-line options into the global configuration variables.
# Globals:   OUTPUT_FILE, HTTP_MODE, HTTP_PORT, DEVICES, SMARTCTL_PATH,
#            SUDO_MODE (written); TEXTFILE_DIR (read)
# Arguments: the script's command-line arguments
# Returns:   0 when all options were consumed; exits 1 on an unknown option
#            or when an option is missing its value; -h/--help exits 0 via
#            show_usage
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/smart_drive.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port|-o|--output|--devices|--smartctl|--sudo)
        # "shift 2" with only one argument left fails and consumes nothing,
        # which would make this loop spin forever; reject that case up front.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        case "$1" in
          -p|--port)   HTTP_PORT="$2" ;;
          -o|--output) OUTPUT_FILE="$2" ;;
          --devices)   DEVICES="$2" ;;
          --smartctl)  SMARTCTL_PATH="$2" ;;
          --sudo)      SUDO_MODE="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# ============================================================================
# SMARTCTL COMMAND SETUP
# ============================================================================

# Build the command prefix used to invoke smartctl.
# Globals:   SUDO_MODE, SMARTCTL_PATH (read); SMARTCTL_CMD (written)
# Notes:     SMARTCTL_CMD is a plain string ("sudo <path>" or "<path>") that
#            the other functions expand unquoted, so a smartctl path that
#            contains whitespace is not supported.
setup_smartctl_cmd() {
  local use_sudo=false

  case "$SUDO_MODE" in
    yes) use_sudo=true ;;
    no)  use_sudo=false ;;
    *)
      # auto (and any unrecognised value): use sudo only when not root
      if [ "$(id -u)" -ne 0 ]; then
        use_sudo=true
      fi
      ;;
  esac

  if [ "$use_sudo" = true ]; then
    SMARTCTL_CMD="sudo $SMARTCTL_PATH"
  else
    SMARTCTL_CMD="$SMARTCTL_PATH"
  fi
}

# ============================================================================
# DRIVE DETECTION
# ============================================================================

# List the drives to inspect, one device path per line.
# Globals:   DEVICES, SMARTCTL_CMD (read)
# Outputs:   with DEVICES=auto, the first column of "smartctl --scan";
#            otherwise the comma-separated DEVICES list split into lines
detect_drives() {
  if [ "$DEVICES" = "auto" ]; then
    # Intentionally unquoted: SMARTCTL_CMD may be "sudo <path>".
    $SMARTCTL_CMD --scan 2>/dev/null | awk '{print $1}'
  else
    printf '%s\n' "$DEVICES" | tr ',' '\n'
  fi
}

# ============================================================================
# DRIVE DATA PARSING
# ============================================================================

# Parse "smartctl -iHA" output for a single drive.
# Globals:   SMARTCTL_CMD (read)
# Arguments: $1 - device path
# Outputs:   "KEY value" lines for metric generation. DEVICE, MODEL, SERIAL,
#            TYPE, HEALTH and CAPACITY are always printed; the remaining keys
#            only when the attribute was found. Prints "PARSE_ERROR 1" when
#            smartctl produced no output at all.
parse_drive_data() {
  local device="$1"
  local raw_output

  # Intentionally unquoted: SMARTCTL_CMD may be "sudo <path>".
  raw_output=$($SMARTCTL_CMD -iHA "$device" 2>/dev/null)

  if [ -z "$raw_output" ]; then
    echo "PARSE_ERROR 1"
    return
  fi

  printf '%s\n' "$raw_output" | awk -v dev="$device" '
    # Text after the first "Label:" with surrounding spaces removed.
    function value_of(line) {
      sub(/^[^:]+:[ \t]+/, "", line)
      gsub(/^ +| +$/, "", line)
      return line
    }

    # First run of digits/commas after "Label:", commas removed ("" if none).
    # Returned as a string so large byte counts are never reformatted by the
    # number-to-string conversion of the awk implementation in use.
    function digits_of(line) {
      sub(/^[^:]+:[ \t]+/, "", line)
      if (!match(line, /[0-9,]+/)) return ""
      line = substr(line, RSTART, RLENGTH)
      gsub(/,/, "", line)
      return line
    }

    # Leading number after "Label:" with thousands separators removed.
    function number_of(line) {
      sub(/^[^:]+:[ \t]+/, "", line)
      gsub(/,/, "", line)
      return line + 0
    }

    # Convert a string of hex digits to a number; unknown characters count as 0.
    function hex_to_dec(hex,    i, d, n) {
      n = 0
      for (i = 1; i <= length(hex); i++) {
        d = index("0123456789abcdef", tolower(substr(hex, i, 1))) - 1
        if (d < 0) d = 0
        n = n * 16 + d
      }
      return n
    }

    BEGIN {
      model = ""
      serial = ""
      capacity_bytes = 0
      drive_type = "unknown"
      sata_speed = ""
      # -1 marks "attribute not seen"; such values are not printed in END
      health = -1
      temperature = -1
      power_on_hours = -1
      power_cycle_count = -1
      reallocated_sectors = -1
      pending_sectors = -1
      uncorrectable = -1
      spin_retry = -1
      command_timeout = -1
      start_stop = -1
      wear_leveling = -1
      nvme_pct_used = -1
      nvme_spare = -1
      nvme_spare_thresh = -1
      nvme_media_errors = -1
      nvme_critical_warn = -1
      in_smart_attrs = 0
      in_nvme_health = 0
    }

    # Drive info section. The record itself is never modified, so every later
    # rule still sees the original line.
    /^Device Model:/ { model = value_of($0) }
    /^Model Number:/ {
      # "Model Number:" is treated as the NVMe marker
      model = value_of($0)
      drive_type = "nvme"
    }
    /^Serial Number:/ { serial = value_of($0) }

    # e.g. "User Capacity:    1,000,204,886,016 bytes [1.00 TB]"
    /^User Capacity:/ || /^Total NVM Capacity:/ || /^Namespace 1 Size\/Capacity:/ {
      digits = digits_of($0)
      if (digits != "") capacity_bytes = digits
    }

    /^SATA Version is:/ {
      # Extract speed like "6.0 Gb/s"
      s = value_of($0)
      if (match(s, /[0-9.]+ Gb\/s/)) sata_speed = substr(s, RSTART, RLENGTH)
    }

    /^Rotation Rate:/ {
      if ($0 ~ /Solid State/) drive_type = "ssd"
      else if ($0 ~ /[0-9]+ rpm/) drive_type = "hdd"
    }

    # Health status
    /SMART overall-health self-assessment test result:/ {
      health = ($0 ~ /PASSED/) ? 1 : 0
    }
    /SMART Health Status:/ {
      health = ($0 ~ /OK/) ? 1 : 0
    }

    # SATA SMART attribute table starts after its header row and ends at the
    # next blank line (which also ends the NVMe health section)
    /^ID#/ && /ATTRIBUTE_NAME/ { in_smart_attrs = 1; next }
    /^$/ { in_smart_attrs = 0; in_nvme_health = 0 }

    # SATA attribute rows: column 1 is the attribute id, column 10 the raw value
    in_smart_attrs && NF >= 10 {
      attr_id = $1 + 0
      raw_val = $10 + 0

      if (attr_id == 4) start_stop = raw_val
      if (attr_id == 5) reallocated_sectors = raw_val
      if (attr_id == 9) power_on_hours = raw_val
      if (attr_id == 10) spin_retry = raw_val
      if (attr_id == 12) power_cycle_count = raw_val
      if (attr_id == 177 || attr_id == 233) wear_leveling = raw_val
      if (attr_id == 188) command_timeout = raw_val
      if (attr_id == 190 || attr_id == 194) {
        # Raw value may look like "35 (Min/Max 22/42)"; column 10 holds the
        # first number only
        if (match($10, /^[0-9]+/)) temperature = substr($10, RSTART, RLENGTH) + 0
      }
      if (attr_id == 197) pending_sectors = raw_val
      if (attr_id == 198) uncorrectable = raw_val
    }

    # NVMe SMART/Health Information section
    /^SMART\/Health Information/ { in_nvme_health = 1; next }

    in_nvme_health {
      if ($0 ~ /^Temperature:/) temperature = number_of($0)
      if ($0 ~ /^Percentage Used:/) nvme_pct_used = number_of($0)
      if ($0 ~ /^Available Spare:/) nvme_spare = number_of($0)
      if ($0 ~ /^Available Spare Threshold:/) nvme_spare_thresh = number_of($0)
      if ($0 ~ /^Power On Hours:/) power_on_hours = number_of($0)
      if ($0 ~ /^Power Cycles:/) power_cycle_count = number_of($0)
      if ($0 ~ /^Media and Data Integrity Errors:/) nvme_media_errors = number_of($0)
      if ($0 ~ /^Critical Warning:/) {
        warn = value_of($0)
        sub(/ .*/, "", warn)
        if (substr(warn, 1, 2) == "0x") nvme_critical_warn = hex_to_dec(substr(warn, 3))
        else nvme_critical_warn = warn + 0
      }
    }

    END {
      # Fallbacks when no explicit type marker was seen
      if (drive_type == "unknown" && nvme_pct_used >= 0) drive_type = "nvme"
      if (drive_type == "unknown") drive_type = "sata"

      print "DEVICE " dev
      print "MODEL " model
      print "SERIAL " serial
      print "TYPE " drive_type
      print "HEALTH " health
      print "CAPACITY " capacity_bytes
      if (temperature >= 0) print "TEMPERATURE " temperature
      if (power_on_hours >= 0) print "POWER_ON_HOURS " power_on_hours
      if (power_cycle_count >= 0) print "POWER_CYCLE_COUNT " power_cycle_count
      if (reallocated_sectors >= 0) print "REALLOCATED_SECTORS " reallocated_sectors
      if (pending_sectors >= 0) print "PENDING_SECTORS " pending_sectors
      if (uncorrectable >= 0) print "UNCORRECTABLE " uncorrectable
      if (spin_retry >= 0) print "SPIN_RETRY " spin_retry
      if (command_timeout >= 0) print "COMMAND_TIMEOUT " command_timeout
      if (start_stop >= 0) print "START_STOP " start_stop
      if (wear_leveling >= 0) print "WEAR_LEVELING " wear_leveling
      if (nvme_pct_used >= 0) print "NVME_PCT_USED " nvme_pct_used
      if (nvme_spare >= 0) print "NVME_SPARE " nvme_spare
      if (nvme_spare_thresh >= 0) print "NVME_SPARE_THRESH " nvme_spare_thresh
      if (nvme_media_errors >= 0) print "NVME_MEDIA_ERRORS " nvme_media_errors
      if (nvme_critical_warn >= 0) print "NVME_CRITICAL_WARN " nvme_critical_warn
      if (sata_speed != "") print "SATA_SPEED " sata_speed
    }
  '
}

# ============================================================================
# METRICS GENERATION
# ============================================================================

# Look up one key in parse_drive_data output.
# Arguments: $1 - parse_drive_data output, $2 - key (first word of a line)
# Outputs:   the rest of the matching line with runs of blanks collapsed to
#            single spaces; nothing when the key is absent
_smart_field() {
  awk -v key="$2" '$1 == key { $1 = ""; sub(/^ /, ""); print }' <<<"$1"
}

# Print one metric family (HELP, TYPE, samples, blank line) if it has samples.
# Arguments: $1 - all collected sample lines, $2 - metric name, $3 - HELP text
_smart_emit_family() {
  local family
  # Match on "name{" at the start of the line so that names which are a
  # prefix of another name do not pick up the other family's samples.
  family=$(awk -v prefix="$2{" 'index($0, prefix) == 1' <<<"$1")
  [ -n "$family" ] || return 0
  printf '# HELP %s %s\n# TYPE %s gauge\n%s\n\n' "$2" "$3" "$2" "$family"
}

# Write all exporter metrics in Prometheus text format to stdout.
# Globals:   SMARTCTL_PATH (read)
# Calls:     detect_drives, parse_drive_data, prom_escape
# Outputs:   exporter status, one gauge family per attribute found on at
#            least one drive, the device count and the exporter runtime.
#            When SMARTCTL_PATH is not an available command only
#            smart_drive_up 0 and the exporter info metric are printed.
generate_metrics() {
  local script_start
  script_start=$(date +%s)

  # ========================================================================
  # Exporter Status
  # ========================================================================
  if ! command -v "$SMARTCTL_PATH" >/dev/null 2>&1; then
    cat <<EOF
# HELP smart_drive_up Exporter status (1=up, 0=down)
# TYPE smart_drive_up gauge
smart_drive_up 0

# HELP smart_drive_exporter_info Exporter version information
# TYPE smart_drive_exporter_info gauge
smart_drive_exporter_info{version="1.0"} 1

EOF
    return
  fi

  cat <<EOF
# HELP smart_drive_up Exporter status (1=up, 0=down)
# TYPE smart_drive_up gauge
smart_drive_up 1

# HELP smart_drive_exporter_info Exporter version information
# TYPE smart_drive_exporter_info gauge
smart_drive_exporter_info{version="1.0"} 1

EOF

  # ========================================================================
  # Per-drive collection
  # ========================================================================

  # "KEY metric_name" pairs for gauges that carry only the base labels.
  local -a plain_gauges=(
    "TEMPERATURE smart_drive_temperature_celsius"
    "POWER_ON_HOURS smart_drive_power_on_hours"
    "POWER_CYCLE_COUNT smart_drive_power_cycle_count"
    "REALLOCATED_SECTORS smart_drive_reallocated_sectors"
    "PENDING_SECTORS smart_drive_pending_sectors"
    "UNCORRECTABLE smart_drive_uncorrectable_errors"
    "SPIN_RETRY smart_drive_spin_retry_count"
    "COMMAND_TIMEOUT smart_drive_command_timeout"
    "START_STOP smart_drive_start_stop_count"
    "WEAR_LEVELING smart_drive_wear_leveling_count"
    "NVME_PCT_USED smart_drive_percentage_used"
    "NVME_SPARE smart_drive_available_spare"
    "NVME_SPARE_THRESH smart_drive_available_spare_threshold"
    "NVME_MEDIA_ERRORS smart_drive_media_errors"
    "NVME_CRITICAL_WARN smart_drive_critical_warning"
  )

  # "metric_name|HELP text" in the order the families are printed.
  local -a families=(
    "smart_drive_health_ok|SMART health status (1=passed, 0=failed)"
    "smart_drive_temperature_celsius|Current drive temperature in Celsius"
    "smart_drive_power_on_hours|Total power-on hours"
    "smart_drive_power_cycle_count|Total power cycle count"
    "smart_drive_capacity_bytes|Drive capacity in bytes"
    "smart_drive_reallocated_sectors|Reallocated sector count"
    "smart_drive_pending_sectors|Current pending sector count"
    "smart_drive_uncorrectable_errors|Offline uncorrectable error count"
    "smart_drive_spin_retry_count|Spin retry count"
    "smart_drive_command_timeout|Command timeout count"
    "smart_drive_start_stop_count|Start/stop count"
    "smart_drive_wear_leveling_count|SSD wear leveling count"
    "smart_drive_percentage_used|NVMe percentage used estimate"
    "smart_drive_available_spare|NVMe available spare percentage"
    "smart_drive_available_spare_threshold|NVMe available spare threshold percentage"
    "smart_drive_media_errors|NVMe media and data integrity errors"
    "smart_drive_critical_warning|NVMe critical warning bitmap"
    "smart_drive_interface_speed|SATA interface speed info metric"
  )

  local drive_list
  drive_list=$(detect_drives)

  local device_count=0
  local samples=""   # every sample line of every family, newline-terminated
  local device parsed labels spec value

  # An empty drive list yields a single empty line here, which is skipped.
  while IFS= read -r device; do
    [ -z "$device" ] && continue
    [ ! -e "$device" ] && continue

    # Parsed data is held in a variable; no temporary file to create or leak.
    parsed=$(parse_drive_data "$device")
    if grep -q '^PARSE_ERROR' <<<"$parsed"; then
      continue
    fi

    device_count=$((device_count + 1))

    labels="device=\"$(prom_escape "$device")\""
    labels+=",model=\"$(prom_escape "$(_smart_field "$parsed" MODEL)")\""
    labels+=",serial=\"$(prom_escape "$(_smart_field "$parsed" SERIAL)")\""

    # Health: -1 means smartctl reported no health verdict
    value=$(_smart_field "$parsed" HEALTH)
    if [ -n "$value" ] && [ "$value" != "-1" ]; then
      samples+="smart_drive_health_ok{${labels},type=\"$(_smart_field "$parsed" TYPE)\"} ${value}"$'\n'
    fi

    # Capacity: 0 means no capacity line was found
    value=$(_smart_field "$parsed" CAPACITY)
    if [ -n "$value" ] && [ "$value" != "0" ]; then
      samples+="smart_drive_capacity_bytes{${labels}} ${value}"$'\n'
    fi

    for spec in "${plain_gauges[@]}"; do
      value=$(_smart_field "$parsed" "${spec%% *}")
      if [ -n "$value" ]; then
        samples+="${spec#* }{${labels}} ${value}"$'\n'
      fi
    done

    # Interface speed is an info-style metric: the speed is a label, value 1
    value=$(_smart_field "$parsed" SATA_SPEED)
    if [ -n "$value" ]; then
      samples+="smart_drive_interface_speed{${labels},speed=\"$(prom_escape "$value")\"} 1"$'\n'
    fi
  done <<< "$drive_list"

  # ========================================================================
  # Devices Total and per-attribute families
  # ========================================================================
  printf '# HELP smart_drive_devices_total Total drives detected\n'
  printf '# TYPE smart_drive_devices_total gauge\n'
  printf 'smart_drive_devices_total %s\n\n' "$device_count"

  for spec in "${families[@]}"; do
    _smart_emit_family "$samples" "${spec%%|*}" "${spec#*|}"
  done

  # ========================================================================
  # Exporter Runtime
  # ========================================================================
  local script_end script_duration
  script_end=$(date +%s)
  script_duration=$((script_end - script_start))

  cat <<EOF
# HELP smart_drive_exporter_duration_seconds Time to generate all metrics
# TYPE smart_drive_exporter_duration_seconds gauge
smart_drive_exporter_duration_seconds $script_duration

# HELP smart_drive_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE smart_drive_exporter_last_run_timestamp gauge
smart_drive_exporter_last_run_timestamp $script_end
EOF

  echo ""
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Print the static HTML landing page served for every request that is not
# "GET /metrics".
_smart_http_index_page() {
  cat <<'HTMLEOF'
<!DOCTYPE html>
<html>
<head><title>SMART Drive Exporter v1.0</title></head>
<body>
<h1>SMART Drive Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections</h2>
<ul>
<li>Drive health status (PASSED/FAILED)</li>
<li>Temperature per drive</li>
<li>SATA attributes (reallocated sectors, pending sectors, etc.)</li>
<li>NVMe health (percentage used, available spare, media errors)</li>
<li>Power-on hours and power cycle count</li>
<li>SSD wear leveling</li>
</ul>
</body>
</html>
HTMLEOF
}

# Handle one HTTP exchange: read the request from stdin, write the full
# response (status line, headers, body) to stdout.
# Calls:     generate_metrics for "GET /metrics"
_smart_http_respond() {
  local request header body content_type

  # No request line at all (e.g. nc could not listen): nothing to answer.
  IFS= read -r request || [ -n "$request" ] || return 0
  request=${request%$'\r'}

  # Consume the remaining request headers up to the blank line so the
  # listener is not left writing into a closed pipe. The timeout keeps a
  # client that never finishes its headers from blocking the server.
  while IFS= read -r -t 5 header; do
    header=${header%$'\r'}
    [ -z "$header" ] && break
  done

  if [[ "$request" =~ ^GET\ /metrics ]]; then
    body=$(generate_metrics)
    content_type='text/plain; version=0.0.4'
  else
    body=$(_smart_http_index_page)
    content_type='text/html'
  fi

  # Content-Length is a byte count; in the C locale ${#body} counts bytes
  # rather than (possibly multi-byte) characters.
  local LC_ALL=C
  printf 'HTTP/1.1 200 OK\r\nContent-Type: %s\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' \
    "$content_type" "${#body}" "$body"
}

# Serve metrics over HTTP with netcat, one connection at a time, until the
# script receives INT or TERM.
# Globals:   HTTP_PORT (read); _SMART_HTTP_FIFO_DIR (written, used by the trap)
# Returns:   does not return; exits 1 when nc or the FIFO is unavailable and
#            0 on INT/TERM
run_http_server() {
  echo "Starting SMART drive exporter on port $HTTP_PORT..." >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  # nc's stdout (the client's request) must reach the handler while the
  # handler's stdout (the response) must reach nc's stdin. A plain pipeline
  # gives only one direction, so the response travels back through a FIFO.
  if ! _SMART_HTTP_FIFO_DIR=$(mktemp -d) ||
    ! mkfifo "$_SMART_HTTP_FIFO_DIR/response"; then
    echo "ERROR: cannot create FIFO for HTTP mode" >&2
    [ -n "$_SMART_HTTP_FIFO_DIR" ] && rm -rf -- "$_SMART_HTTP_FIFO_DIR"
    exit 1
  fi
  local fifo="$_SMART_HTTP_FIFO_DIR/response"

  trap 'rm -rf -- "${_SMART_HTTP_FIFO_DIR:?}"; echo "Shutting down SMART drive exporter..." >&2; exit 0' INT TERM

  # Pick the listen syntax once: GNU/traditional netcat needs -p and -q,
  # other variants take the port as a positional argument.
  local -a listen_cmd
  if nc -h 2>&1 | grep -q 'GNU\|traditional'; then
    listen_cmd=(nc -l -p "$HTTP_PORT" -q 1)
  else
    listen_cmd=(nc -l "$HTTP_PORT")
  fi

  while true; do
    "${listen_cmd[@]}" <"$fifo" 2>/dev/null | _smart_http_respond >"$fifo"
    # If nc itself failed (for example the port is busy) avoid a busy loop.
    [ "${PIPESTATUS[0]}" -eq 0 ] || sleep 1
  done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: parse options, then run in HTTP, file or stdout mode.
# Globals:   HTTP_MODE, OUTPUT_FILE (read, after parse_args has set them)
# Arguments: the script's command-line arguments
# Returns:   in file mode exits 1 if the output directory, the temporary file
#            or the metrics cannot be produced, or if the result has fewer
#            than 3 lines (the previous output file is then left in place)
main() {
  parse_args "$@"
  setup_smartctl_cmd

  if [ "$HTTP_MODE" = true ]; then
    run_http_server
    return
  fi

  if [ -z "$OUTPUT_FILE" ]; then
    generate_metrics
    return
  fi

  local output_dir temp_file file_lines

  output_dir="$(dirname -- "$OUTPUT_FILE")"
  if ! mkdir -p -- "$output_dir"; then
    echo "ERROR: Cannot create output directory $output_dir" >&2
    exit 1
  fi

  # Write to a temporary file in the same directory and rename it into place
  # so readers never see a partially written metrics file.
  if ! temp_file=$(mktemp "${output_dir}/.smart_drive_metrics.XXXXXX"); then
    echo "ERROR: Cannot create temporary file in $output_dir" >&2
    exit 1
  fi

  if ! generate_metrics > "$temp_file" 2>/dev/null; then
    rm -f -- "$temp_file"
    echo "ERROR: Failed to generate metrics" >&2
    exit 1
  fi

  file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
  file_lines=${file_lines//[[:space:]]/}   # some wc implementations pad the count

  if [ "$file_lines" -lt 3 ]; then
    rm -f -- "$temp_file"
    echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
    exit 1
  fi

  chmod 644 "$temp_file"
  if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
    rm -f -- "$temp_file"
    echo "ERROR: Cannot write $OUTPUT_FILE" >&2
    exit 1
  fi

  echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}

# Run the exporter with the arguments given on the command line.
main "$@"