#!/bin/bash ################################################################################ # Script Name: storage-health-exporter.sh # Version: 1.0 # Description: Prometheus exporter for storage health metrics covering all # common Linux filesystems. Exports inode usage, SMART disk health, # mdadm RAID status, LVM thin pool usage, btrfs/zfs health, # stale mount detection, and filesystem error counts from journal. # # Author: Phil Connor # Contact: contact@mylinux.work # Website: https://mylinux.work # License: MIT # # Prerequisites: # - Standard Unix tools (df, stat, awk, grep) # - netcat (nc) for HTTP mode # - Optional: smartctl (SMART), mdadm (RAID), lvs (LVM), # btrfs (btrfs stats), zpool (ZFS), journalctl (fs errors) # Each section is skipped gracefully if tools are missing. # # Usage: # # Output to stdout # ./storage-health-exporter.sh # # # HTTP server mode # ./storage-health-exporter.sh --http -p 9197 # # # Textfile collector mode # ./storage-health-exporter.sh --textfile # # Metrics Exported: # Core Status: # - storage_health_up - Exporter status (1=up, 0=down) # - storage_health_exporter_info{version} - Exporter version # # Filesystem Info: # - storage_health_fs_info{device,mountpoint,fstype} - Filesystem info # - storage_health_mount_readonly{device,mountpoint} - Read-only status # - storage_health_mount_stale{mountpoint} - Stale/hung mount detection # # Inodes: # - storage_health_inode_total{mountpoint} - Total inodes # - storage_health_inode_used{mountpoint} - Used inodes # - storage_health_inode_usage_percent{mountpoint} - Inode usage percentage # # SMART (if smartctl available): # - storage_health_smart_healthy{device} - 1=passed, 0=failed # - storage_health_smart_temperature_celsius{device} - Drive temperature # - storage_health_smart_power_on_hours{device} - Power on hours # - storage_health_smart_reallocated_sectors{device} - Reallocated sectors # # MD RAID (if /proc/mdstat exists): # - storage_health_mdraid_healthy{array} - 1=clean, 0=degraded 
# - storage_health_mdraid_degraded{array} - 1 if degraded # - storage_health_mdraid_devices_total{array} - Total devices # - storage_health_mdraid_devices_active{array} - Active devices # - storage_health_mdraid_sync_percent{array} - Rebuild percentage # # LVM Thin Pools (if lvs available): # - storage_health_lvm_thin_data_percent{vg,pool} - Data usage # - storage_health_lvm_thin_metadata_percent{vg,pool} - Metadata usage # # BTRFS (if btrfs mounted): # - storage_health_btrfs_errors{mountpoint,device,type} - Error counts # - storage_health_btrfs_usage_bytes{mountpoint,group,usage} - Block group usage # - storage_health_btrfs_scrub_status{mountpoint} - Scrub state # - storage_health_btrfs_scrub_age_seconds{mountpoint} - Scrub age # - storage_health_btrfs_scrub_errors{mountpoint} - Scrub error count # # ZFS (if zpool/zfs available): # - storage_health_zfs_pool_healthy{pool} - 1=ONLINE, 0=other # - storage_health_zfs_pool_errors{pool} - Error count # - storage_health_zfs_pool_size_bytes{pool} - Pool total size # - storage_health_zfs_pool_alloc_bytes{pool} - Pool allocated bytes # - storage_health_zfs_pool_free_bytes{pool} - Pool free bytes # - storage_health_zfs_pool_fragmentation_percent{pool} - Fragmentation # - storage_health_zfs_pool_capacity_percent{pool} - Capacity used # - storage_health_zfs_pool_dedup_ratio{pool} - Dedup ratio # - storage_health_zfs_scrub_status{pool} - Scrub state # - storage_health_zfs_scrub_age_seconds{pool} - Seconds since scrub # - storage_health_zfs_scrub_errors{pool} - Scrub errors found # - storage_health_zfs_scrub_progress_percent{pool} - Scrub progress # - storage_health_zfs_vdev_read_errors{pool,vdev} - Per-vdev read errors # - storage_health_zfs_vdev_write_errors{pool,vdev} - Per-vdev write errors # - storage_health_zfs_vdev_checksum_errors{pool,vdev} - Per-vdev cksum errors # - storage_health_zfs_dataset_used_bytes{dataset,type} - Dataset usage # - storage_health_zfs_dataset_avail_bytes{dataset,type} - Dataset available # - 
storage_health_zfs_dataset_refer_bytes{dataset,type} - Dataset referenced # - storage_health_zfs_dataset_compressratio{dataset,type} - Compression ratio # - storage_health_zfs_snapshot_count{dataset} - Snapshot count # - storage_health_zfs_snapshot_oldest_age_seconds{dataset} - Oldest snap age # - storage_health_zfs_snapshot_newest_age_seconds{dataset} - Newest snap age # - storage_health_zfs_arc_hits_total - ARC cache hits # - storage_health_zfs_arc_misses_total - ARC cache misses # - storage_health_zfs_arc_size_bytes - ARC cache size # # Journal: # - storage_health_journal_fs_errors_24h - FS error count from journal # # Exporter: # - storage_health_exporter_duration_seconds - Script execution time # - storage_health_exporter_last_run_timestamp - Last run timestamp # # Configuration: # Default HTTP port: 9197 # Textfile directory: /var/lib/node_exporter # ################################################################################ # ============================================================================ # CONFIGURATION VARIABLES # ============================================================================ TEXTFILE_DIR="/var/lib/node_exporter" OUTPUT_FILE="" HTTP_MODE=false HTTP_PORT=9197 # ============================================================================ # HELPER FUNCTIONS # ============================================================================ show_usage() { cat <&2; exit 1 ;; esac done } # Get mounted filesystems excluding virtual ones # Returns: Lines of "device mountpoint fstype" get_mounted_filesystems() { df -T 2>/dev/null | awk 'NR>1 && $2 !~ /^(tmpfs|devtmpfs|overlay|squashfs|efivarfs|fuse\..*|nsfs|cgroup.*)$/ { print $1, $7, $2 }' } # Get inode usage for a mountpoint # Args: $1 - mountpoint # Returns: "total used" or "0 0" on failure get_inode_usage() { local mountpoint="$1" df -i "$mountpoint" 2>/dev/null | awk 'NR==2 { print $2, $3 }' } # Check if a mountpoint is read-only # Args: $1 - mountpoint # Returns: 1 if read-only, 0 
# if read-write
# Optional: $2 - mounts table to read (default: /proc/mounts)
get_mount_readonly() {
  local mountpoint="$1"
  local mounts_file="${2:-/proc/mounts}"
  local opts
  # Mounts table columns: device mountpoint fstype options dump pass.
  # Only the first entry for the mountpoint is considered.
  opts=$(awk -v mp="$mountpoint" '$2 == mp { print $4; exit }' \
    "$mounts_file" 2>/dev/null)
  # Wrap in commas so "ro" only matches as a whole option
  # (e.g. "errors=remount-ro" must not count as read-only).
  if [[ ",${opts}," == *",ro,"* ]]; then
    echo "1"
  else
    echo "0"
  fi
}

# Check if a mount is stale/hung (e.g., NFS)
# Args: $1 - mountpoint
# Returns: 1 if stale, 0 if responsive
check_stale_mount() {
  local mountpoint="$1"
  # stat is given 2 seconds; a timeout or any stat failure counts as stale.
  if timeout 2 stat -t "$mountpoint" >/dev/null 2>&1; then
    echo "0"
  else
    echo "1"
  fi
}

# Get SMART health status for a device
# Args: $1 - device path (e.g., /dev/sda)
# Returns: 1 if the health verdict is PASSED/OK, 0 if FAILED or on any error
get_smart_health() {
  local device="$1"
  local result
  result=$(smartctl -H "$device" 2>/dev/null)
  # Match only the verdict lines (ATA/NVMe "test result: PASSED", SCSI
  # "SMART Health Status: OK"). A loose case-insensitive search for "ok"
  # could be satisfied by unrelated text in the banner or error messages.
  if grep -qE 'test result: PASSED|SMART Health Status: OK' <<<"$result"; then
    echo "1"
  else
    echo "0"
  fi
}

# Get SMART attributes for a device
# Args: $1 - device path
# Returns: "temperature power_on_hours reallocated_sectors" (integers; 0 for
#          any attribute that is not reported)
get_smart_attributes() {
  local device="$1"
  local output
  output=$(smartctl -A "$device" 2>/dev/null)

  # Single pass over the attribute table; column 10 is RAW_VALUE.
  # Adding 0 keeps only the leading number of raw values such as
  # "34 (Min/Max 20/45)" or "12345h+30m+10.500s".
  # Temperature prefers attribute 194 and falls back to 190.
  awk '
    $1 == "194" && !have194 { t194 = $10 + 0; have194 = 1 }
    $1 == "190" && !have190 { t190 = $10 + 0; have190 = 1 }
    $1 == "9"   && !have9   { poh = $10 + 0; have9 = 1 }
    $1 == "5"   && !have5   { realloc = $10 + 0; have5 = 1 }
    END {
      temp = have194 ? t194 : t190
      printf "%d %d %d\n", temp, poh, realloc
    }' <<<"$output"
}

# Get list of MD RAID arrays
# Args: $1 - (optional) mdstat file to read (default: /proc/mdstat)
# Returns: Array names (e.g., md0 md1), one per line
get_mdraid_arrays() {
  local mdstat="${1:-/proc/mdstat}"
  [ -f "$mdstat" ] || return
  awk '/^md[0-9]/ { print $1 }' "$mdstat" 2>/dev/null
}

# Get MD RAID array status
# Args: $1 - array name (e.g., md0)
#       $2 - (optional) mdstat file to read (default: /proc/mdstat)
# Returns: "state total_devices active_devices sync_percent"
#          state has all whitespace removed (e.g. "clean,degraded") so the
#          record always has exactly four space-separated fields.
#          sync_percent is 100 when no recovery/resync percentage is shown.
get_mdraid_status() {
  local array="$1"
  local mdstat="${2:-/proc/mdstat}"
  local detail state total active sync_pct
  detail=$(mdadm --detail "/dev/$array" 2>/dev/null)

  state=$(awk -F: '/^[ \t]*State :/ { gsub(/[ \t]/, "", $2); print $2; exit }' \
    <<<"$detail")
  total=$(awk '/Raid Devices :/ { print $NF; exit }' <<<"$detail")
  active=$(awk '/Active Devices :/ { print $NF; exit }' <<<"$detail")

  # Find this array in mdstat by exact name (so md1 never matches md10) and
  # pull the percentage from its recovery/resync line. Uses two-argument
  # match() with RSTART/RLENGTH so it works on awk implementations other
  # than gawk as well.
  sync_pct=$(awk -v arr="$array" '
    $1 == arr { found = 1; next }
    found && /^md/ { exit }
    found && /recovery|resync/ {
      if (match($0, /[0-9]+\.[0-9]+%/)) {
        print substr($0, RSTART, RLENGTH - 1)
      }
      exit
    }
  ' "$mdstat" 2>/dev/null)

  echo "${state:-unknown} ${total:-0} ${active:-0} ${sync_pct:-100}"
}

# Get LVM thin pool usage
# Returns: Lines of "vg_name lv_name data_percent metadata_percent"
get_lvm_thin_pools() {
  command -v lvs >/dev/null 2>&1 || return
  # One explicit selection criterion picks thin pools only. LC_ALL=C keeps
  # the decimal separator a dot. Pools at 0.00 % data usage are reported too.
  LC_ALL=C lvs --noheadings --nosuffix \
    -o vg_name,lv_name,data_percent,metadata_percent \
    --select 'segtype=thin-pool' 2>/dev/null |
    awk 'NF == 4 && $3 ~ /^[0-9.]+$/ && $4 ~ /^[0-9.]+$/ { print $1, $2, $3, $4 }'
}

# Get BTRFS device stats error counts
# Args: $1 - mountpoint
# Returns: Lines of "device error_type count"
get_btrfs_health() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return
  btrfs device stats "$mountpoint" 2>/dev/null | awk '
    # Line format: [/dev/sda1].write_io_errs    0
    # The device is everything between "[" and the first "].", so device
    # paths that contain dots are kept intact.
    /^\[/ {
      close_at = index($0, "].")
      if (close_at == 0) next
      device = substr($0, 2, close_at - 2)
      n = split(substr($0, close_at + 2), parts, " ")
      if (n < 2) next
      print device, parts[1], parts[n]
    }'
}

# Get ZFS pool health
# Returns: Lines of "pool_name state error_count"
#          error_count is 0 for "No known data errors", the reported number
#          when the errors line starts with one, and 1 for any other text.
get_zpool_health() {
  command -v zpool >/dev/null 2>&1 || return
  local pool state errors
  while read -r pool state; do
    errors=$(zpool status "$pool" 2>/dev/null | awk '
      $1 == "errors:" {
        if ($2 == "No") print 0
        else if ($2 ~ /^[0-9]+$/) print $2
        else print 1
        exit
      }')
    echo "$pool $state ${errors:-0}"
  done < <(zpool list -H -o name,health 2>/dev/null)
}

# Get ZFS pool capacity metrics
# Returns: Lines of "pool_name size_bytes alloc_bytes free_bytes frag_pct cap_pct dedup_ratio"
get_zpool_capacity() {
  command -v zpool >/dev/null 2>&1 || return
  zpool list -Hp -o name,size,alloc,free,frag,cap,dedup 2>/dev/null
}

# Get ZFS pool scrub status
# Args: $1 - pool name
# Returns: "state seconds_since_scrub errors_found progress_percent"
#   state: 0=no scrub, 1=completed, 2=in progress
#   seconds_since_scrub is 0 when the completion date cannot be parsed.
get_zpool_scrub() {
  local pool="$1"
  local status_output scan_line
  status_output=$(zpool status "$pool" 2>/dev/null)
  scan_line=$(grep "scan:" <<<"$status_output" | head -1)

  if [[ "$scan_line" == *"in progress"* ]]; then
    local pct
    pct=$(grep -oE '[0-9]+\.[0-9]+% done' <<<"$status_output" |
      grep -oE '[0-9.]+' | head -1)
    echo "2 0 0 ${pct:-0}"
  elif [[ "$scan_line" == *"scrub repaired"* ]]; then
    local scrub_date scrub_ts errors
    local seconds_since=0
    # Completion timestamp looks like "Sun Jan  5 00:34:01 2025".
    scrub_date=$(grep -oE '[A-Z][a-z]{2} [A-Z][a-z]{2} +[0-9]+ [0-9:]+ [0-9]+' \
      <<<"$scan_line" | head -1)
    # Only compute an age when date(1) actually parsed the timestamp;
    # otherwise the age would be "seconds since 1970".
    if [[ -n "$scrub_date" ]] &&
      scrub_ts=$(date -d "$scrub_date" +%s 2>/dev/null) &&
      [[ -n "$scrub_ts" ]]; then
      seconds_since=$(($(date +%s) - scrub_ts))
    fi
    errors=$(grep -oE '[0-9]+ errors' <<<"$scan_line" | awk '{ print $1; exit }')
    echo "1 ${seconds_since} ${errors:-0} 100"
  else
    echo "0 0 0 0"
  fi
}

# Get ZFS dataset metrics
# Returns: Lines of "dataset used_bytes avail_bytes refer_bytes compressratio type"
get_zfs_datasets() {
  command -v zfs >/dev/null 2>&1 || return
  zfs list -Hp -o name,used,avail,refer,compressratio,type 2>/dev/null
}

# Get ZFS snapshot count and age per dataset
# Returns: Lines of "dataset count oldest_age_seconds newest_age_seconds"
#          (dataset order is unspecified)
get_zfs_snapshot_stats() {
  command -v zfs >/dev/null 2>&1 || return
  local now
  now=$(date +%s)
  # -H output is tab-separated: "dataset@snapshot<TAB>creation_epoch".
  zfs list -t snapshot -Hp -o name,creation 2>/dev/null |
    awk -F'\t' -v now="$now" '
      {
        split($1, parts, "@")
        ds = parts[1]
        ts = $2
        count[ds]++
        if (!(ds in oldest) || ts < oldest[ds]) oldest[ds] = ts
        if (!(ds in newest) || ts > newest[ds]) newest[ds] = ts
      }
      END {
        for (ds in count) {
          oldest_age = (oldest[ds] > 0) ? now - oldest[ds] : 0
          newest_age = (newest[ds] > 0) ? now - newest[ds] : 0
          printf "%s %d %d %d\n", ds, count[ds], oldest_age, newest_age
        }
      }'
}

# Get ZFS per-vdev error counts from zpool status
# Args: $1 - pool name
# Returns: Lines of "vdev read_errors write_errors checksum_errors"
get_zpool_vdev_errors() {
  local pool="$1"
  zpool status "$pool" 2>/dev/null | awk -v pool="$pool" '
    /NAME.*STATE.*READ.*WRITE.*CKSUM/ { in_table = 1; next }
    in_table && /^$/ { exit }
    in_table && NF >= 5 {
      name = $1
      # Skip the pool-level row and container/grouping rows.
      if (name == pool) next
      if (name ~ /^(mirror|raidz|draid|replacing|log|cache|spare)/) next
      # Columns are NAME STATE READ WRITE CKSUM [note]; read them by
      # position so rows with a trailing note (such as "too many errors")
      # are still reported.
      if ($3 ~ /^[0-9]+$/ && $4 ~ /^[0-9]+$/ && $5 ~ /^[0-9]+$/) {
        print name, $3, $4, $5
      }
    }'
}

# Get ZFS ARC stats from the arcstats kstat file (Linux only)
# Args: $1 - (optional) arcstats file (default: /proc/spl/kstat/zfs/arcstats)
# Returns: "hits misses size_bytes"; nothing if the file does not exist
get_zfs_arc_stats() {
  local arcstats="${1:-/proc/spl/kstat/zfs/arcstats}"
  [ -f "$arcstats" ] || return
  # kstat rows are "name type data"; read all three counters in one pass.
  awk '
    $1 == "hits"   { hits = $3 }
    $1 == "misses" { misses = $3 }
    $1 == "size"   { size = $3 }
    END {
      printf "%s %s %s\n", (hits == "" ? 0 : hits), \
        (misses == "" ? 0 : misses), (size == "" ? 0 : size)
    }' "$arcstats" 2>/dev/null
}

# Get BTRFS filesystem usage (data/metadata/system)
# Args: $1 - mountpoint
# Returns: Lines of "type used_bytes total_bytes"
get_btrfs_usage() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return
  # Block group summary lines look like:
  #   Data,single: Size:1073741824, Used:524288 (0.05%)
  # Size and Used are on the same line as the type. Several profiles of the
  # same type (e.g. during a balance/convert) are summed.
  btrfs filesystem usage -b "$mountpoint" 2>/dev/null | awk '
    /^(Data|Metadata|System),/ {
      kind = tolower(substr($0, 1, index($0, ",") - 1))
      size = 0
      used = 0
      if (match($0, /Size:[0-9]+/)) size = substr($0, RSTART + 5, RLENGTH - 5)
      if (match($0, /Used:[0-9]+/)) used = substr($0, RSTART + 5, RLENGTH - 5)
      total_by[kind] += size
      used_by[kind] += used
      seen[kind] = 1
    }
    END {
      n = split("data metadata system", order, " ")
      for (i = 1; i <= n; i++) {
        k = order[i]
        if (k in seen) printf "%s %.0f %.0f\n", k, used_by[k], total_by[k]
      }
    }'
}

# Get BTRFS scrub status
# Args: $1 - mountpoint
# Returns: "state seconds_since_scrub errors_found"
# state: 0=never,
1=completed, 2=in progress get_btrfs_scrub() { local mountpoint="$1" command -v btrfs >/dev/null 2>&1 || return local output output=$(btrfs scrub status "$mountpoint" 2>/dev/null) if echo "$output" | grep -q "running"; then echo "2 0 0" elif echo "$output" | grep -q "finished"; then local scrub_date seconds_since errors scrub_date=$(echo "$output" | grep -oE 'finished after [0-9:]+' | head -1) local started started=$(echo "$output" | grep "Scrub started:" | sed 's/.*Scrub started:[[:space:]]*//') if [ -n "$started" ]; then local scrub_ts scrub_ts=$(date -d "$started" +%s 2>/dev/null || echo 0) seconds_since=$(( $(date +%s) - scrub_ts )) else seconds_since=0 fi errors=$(echo "$output" | awk '/errors found:/ { print $NF }' | head -1) # Handle "no errors found" case if echo "$output" | grep -qi "no errors found"; then errors=0 fi echo "1 ${seconds_since:-0} ${errors:-0}" else echo "0 0 0" fi } # Count filesystem error messages from journal (last 24h) # Returns: Number of filesystem error entries get_fs_errors_from_journal() { command -v journalctl >/dev/null 2>&1 || { echo "0"; return; } local count count=$(journalctl --since "24 hours ago" --no-pager -q 2>/dev/null | \ grep -ciE "(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|Buffer I/O error)" 2>/dev/null) echo "${count:-0}" } # ============================================================================ # METRIC GENERATION # ============================================================================ # Generate all Prometheus metrics # Returns: Prometheus text format metrics on stdout generate_metrics() { local script_start script_start=$(date +%s) cat </dev/null; then pct=$(awk "BEGIN {printf \"%.2f\", ($used / $total) * 100}" 2>/dev/null || echo "0") else pct="0" fi echo "storage_health_inode_usage_percent{mountpoint=\"$mountpoint\"} $pct" done < <(get_mounted_filesystems) echo "" # ======================================================================== # SMART Disk Health (optional) # 
======================================================================== if command -v smartctl >/dev/null 2>&1; then cat </dev/null) echo "" cat </dev/null) echo "" cat </dev/null) echo "" cat </dev/null) echo "" fi # ======================================================================== # MD RAID (optional) # ======================================================================== if [ -f /proc/mdstat ] && command -v mdadm >/dev/null 2>&1; then local arrays arrays=$(get_mdraid_arrays) if [ -n "$arrays" ]; then cat </dev/null 2>&1; then local thin_pools thin_pools=$(get_lvm_thin_pools) if [ -n "$thin_pools" ]; then cat </dev/null 2>&1; then local has_btrfs=0 while read -r device mountpoint fstype; do [ "$fstype" = "btrfs" ] && has_btrfs=1 done < <(get_mounted_filesystems) if [ "$has_btrfs" -eq 1 ]; then cat </dev/null 2>&1; then local zfs_pools zfs_pools=$(get_zpool_health) if [ -n "$zfs_pools" ]; then cat <&2 if ! command -v nc >/dev/null 2>&1; then echo "ERROR: netcat (nc) required for HTTP mode" >&2 exit 1 fi # Infinite loop accepting HTTP requests while true; do { read -r request # Check if request is for /metrics endpoint if [[ "$request" =~ ^GET\ /metrics ]]; then echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" generate_metrics else # Serve HTML landing page for other requests echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" cat < Storage Health Exporter v1.0

Storage Health Exporter v1.0

Metrics

Sections (auto-detected)

  • Filesystem info, mount status, inode usage
  • SMART disk health (requires smartctl)
  • MD RAID array status (requires mdadm)
  • LVM thin pool usage (requires lvs)
  • BTRFS device error stats (requires btrfs)
  • ZFS pool health (requires zpool)
  • Journal filesystem error count
EOF fi } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null done } # ============================================================================ # MAIN EXECUTION # ============================================================================ # Main entry point - routes to appropriate output mode main() { parse_args "$@" if [ "$HTTP_MODE" = true ]; then # Run HTTP server (blocks until killed) run_http_server elif [ -n "$OUTPUT_FILE" ]; then # Textfile collector mode: write atomically using temp file local output_dir output_dir="$(dirname "$OUTPUT_FILE")" mkdir -p "$output_dir" # Create temp file in SAME directory for atomic rename (same filesystem) local temp_file temp_file=$(mktemp "${output_dir}/.storage_health_metrics.XXXXXX") # Generate metrics to temp file if ! generate_metrics > "$temp_file" 2>/dev/null; then rm -f "$temp_file" echo "ERROR: Failed to generate metrics" >&2 exit 1 fi # Validate: file must have content local file_lines file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) if [ "$file_lines" -lt 10 ]; then rm -f "$temp_file" echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 exit 1 fi # Set permissions before move chmod 644 "$temp_file" # Atomic rename - no gap where file is missing mv -f "$temp_file" "$OUTPUT_FILE" echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 else # Default: output to stdout generate_metrics fi } # Execute main function with all script arguments main "$@"