linux-scripts/storage-health-exporter.sh

#!/bin/bash
################################################################################
# Script Name: storage-health-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for storage health metrics covering all
#              common Linux filesystems. Exports inode usage, SMART disk health,
#              mdadm RAID status, LVM thin pool usage, btrfs/zfs health,
#              stale mount detection, and filesystem error counts from journal.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - Standard Unix tools (df, stat, awk, grep)
#   - netcat (nc) for HTTP mode
#   - Optional: smartctl (SMART), mdadm (RAID), lvs (LVM),
#     btrfs (btrfs stats), zpool (ZFS), journalctl (fs errors)
#     Each section is skipped gracefully if tools are missing.
#
# Usage:
#   # Output to stdout
#   ./storage-health-exporter.sh
#
#   # HTTP server mode
#   ./storage-health-exporter.sh --http -p 9197
#
#   # Textfile collector mode
#   ./storage-health-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - storage_health_up - Exporter status (1=up, 0=down)
#     - storage_health_exporter_info{version} - Exporter version
#
#   Filesystem Info:
#     - storage_health_fs_info{device,mountpoint,fstype} - Filesystem info
#     - storage_health_mount_readonly{device,mountpoint} - Read-only status
#     - storage_health_mount_stale{mountpoint} - Stale/hung mount detection
#
#   Inodes:
#     - storage_health_inode_total{mountpoint} - Total inodes
#     - storage_health_inode_used{mountpoint} - Used inodes
#     - storage_health_inode_usage_percent{mountpoint} - Inode usage percentage
#
#   SMART (if smartctl available):
#     - storage_health_smart_healthy{device} - 1=passed, 0=failed
#     - storage_health_smart_temperature_celsius{device} - Drive temperature
#     - storage_health_smart_power_on_hours{device} - Power on hours
#     - storage_health_smart_reallocated_sectors{device} - Reallocated sectors
#
#   MD RAID (if /proc/mdstat exists):
#     - storage_health_mdraid_healthy{array} - 1=clean, 0=degraded
#     - storage_health_mdraid_degraded{array} - 1 if degraded
#     - storage_health_mdraid_devices_total{array} - Total devices
#     - storage_health_mdraid_devices_active{array} - Active devices
#     - storage_health_mdraid_sync_percent{array} - Rebuild percentage
#
#   LVM Thin Pools (if lvs available):
#     - storage_health_lvm_thin_data_percent{vg,pool} - Data usage
#     - storage_health_lvm_thin_metadata_percent{vg,pool} - Metadata usage
#
#   BTRFS (if btrfs mounted):
#     - storage_health_btrfs_errors{mountpoint,device,type} - Error counts
#     - storage_health_btrfs_usage_bytes{mountpoint,group,usage} - Block group usage
#     - storage_health_btrfs_scrub_status{mountpoint} - Scrub state
#     - storage_health_btrfs_scrub_age_seconds{mountpoint} - Scrub age
#     - storage_health_btrfs_scrub_errors{mountpoint} - Scrub error count
#
#   ZFS (if zpool/zfs available):
#     - storage_health_zfs_pool_healthy{pool} - 1=ONLINE, 0=other
#     - storage_health_zfs_pool_errors{pool} - Error count
#     - storage_health_zfs_pool_size_bytes{pool} - Pool total size
#     - storage_health_zfs_pool_alloc_bytes{pool} - Pool allocated bytes
#     - storage_health_zfs_pool_free_bytes{pool} - Pool free bytes
#     - storage_health_zfs_pool_fragmentation_percent{pool} - Fragmentation
#     - storage_health_zfs_pool_capacity_percent{pool} - Capacity used
#     - storage_health_zfs_pool_dedup_ratio{pool} - Dedup ratio
#     - storage_health_zfs_scrub_status{pool} - Scrub state
#     - storage_health_zfs_scrub_age_seconds{pool} - Seconds since scrub
#     - storage_health_zfs_scrub_errors{pool} - Scrub errors found
#     - storage_health_zfs_scrub_progress_percent{pool} - Scrub progress
#     - storage_health_zfs_vdev_read_errors{pool,vdev} - Per-vdev read errors
#     - storage_health_zfs_vdev_write_errors{pool,vdev} - Per-vdev write errors
#     - storage_health_zfs_vdev_checksum_errors{pool,vdev} - Per-vdev cksum errors
#     - storage_health_zfs_dataset_used_bytes{dataset,type} - Dataset usage
#     - storage_health_zfs_dataset_avail_bytes{dataset,type} - Dataset available
#     - storage_health_zfs_dataset_refer_bytes{dataset,type} - Dataset referenced
#     - storage_health_zfs_dataset_compressratio{dataset,type} - Compression ratio
#     - storage_health_zfs_snapshot_count{dataset} - Snapshot count
#     - storage_health_zfs_snapshot_oldest_age_seconds{dataset} - Oldest snap age
#     - storage_health_zfs_snapshot_newest_age_seconds{dataset} - Newest snap age
#     - storage_health_zfs_arc_hits_total - ARC cache hits
#     - storage_health_zfs_arc_misses_total - ARC cache misses
#     - storage_health_zfs_arc_size_bytes - ARC cache size
#
#   Journal:
#     - storage_health_journal_fs_errors_24h - FS error count from journal
#
#   Exporter:
#     - storage_health_exporter_duration_seconds - Script execution time
#     - storage_health_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9197
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Default port for --http mode; -p/--port overrides it.
HTTP_PORT=9197
# Switched to true by --http.
HTTP_MODE=false
# Metrics destination set by --textfile or -o/--output; empty when neither is given.
OUTPUT_FILE=""
# Directory used by --textfile (node_exporter textfile collector).
TEXTFILE_DIR="/var/lib/node_exporter"

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help text to stdout and exit with status 0.
# Globals: HTTP_PORT (read, shown in the --http description)
show_usage() {
    local prog="$0"
    cat <<USAGE_TEXT
Usage: $prog [OPTIONS]

Export storage health statistics as Prometheus metrics (v1.0).

MODES:
    --textfile      Write to node_exporter textfile collector
    --http          Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port      HTTP port (default: 9197)
    -o, --output    Output file path

EXAMPLES:
    $prog --textfile                    # Write to textfile collector
    $prog --http --port 9197            # Run HTTP server
    $prog -o /tmp/storage_health.prom   # Write to custom file

SECTIONS (auto-detected, skipped if tools missing):
    - Filesystem info and inode usage (always available)
    - SMART disk health (requires smartctl)
    - MD RAID status (requires /proc/mdstat + mdadm)
    - LVM thin pool usage (requires lvs)
    - BTRFS device stats (requires btrfs)
    - ZFS pool health (requires zpool)
    - Journal filesystem errors (requires journalctl)

USAGE_TEXT
    exit 0
}

# Parse command-line options into the global configuration variables.
# Globals: OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written); TEXTFILE_DIR (read)
# Args: "$@" - the command-line arguments
# Exits: 1 on an unknown option, a missing option value, or an invalid port
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/storage_health.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # "shift 2" fails and consumes nothing when the value is
                # missing, which would make this loop spin forever.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                # At most 5 digits so the arithmetic range check cannot overflow.
                if [[ ! "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "Invalid port: $2" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            -o|--output)
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# List mounted filesystems from "df -T", leaving out virtual filesystem types
# Returns: Lines of "device mountpoint fstype"
get_mounted_filesystems() {
    df -T 2>/dev/null | awk '
        # Line 1 is the df column header.
        NR == 1 { next }
        # Column 2 is the filesystem type; drop the virtual ones.
        $2 ~ /^(tmpfs|devtmpfs|overlay|squashfs|efivarfs|fuse\..*|nsfs|cgroup.*)$/ { next }
        { print $1, $7, $2 }
    '
}

# Get inode usage for a mountpoint
# Args: $1 - mountpoint
# Returns: "total used", or "0 0" when df fails or reports non-numeric counts
get_inode_usage() {
    local mountpoint="$1"
    local usage
    # -P keeps each filesystem on a single line so the data is always on row 2.
    usage=$(df -Pi -- "$mountpoint" 2>/dev/null \
        | awk 'NR == 2 && $2 ~ /^[0-9]+$/ && $3 ~ /^[0-9]+$/ { print $2, $3 }')
    # Honour the documented fallback instead of printing an empty line.
    printf '%s\n' "${usage:-0 0}"
}

# Report whether a mountpoint is mounted read-only according to /proc/mounts
# Args: $1 - mountpoint
# Returns: 1 if read-only, 0 if read-write (or the mountpoint is not listed)
get_mount_readonly() {
    local mountpoint="$1"
    local mount_opts
    # Field 2 is the mountpoint, field 4 the option list; use the first match.
    mount_opts=$(awk -v mp="$mountpoint" '$2 == mp { print $4; exit }' /proc/mounts 2>/dev/null)
    # Wrap in commas so "ro" only matches as a complete option.
    case ",${mount_opts}," in
        *,ro,*) echo "1" ;;
        *)      echo "0" ;;
    esac
}

# Probe a mountpoint with a time-limited stat to detect stale/hung mounts (e.g., NFS)
# Args: $1 - mountpoint
# Returns: 1 if stale (stat failed or did not finish in time), 0 if responsive
check_stale_mount() {
    local mountpoint="$1"
    local stale="1"
    if timeout 2 stat -t "$mountpoint" >/dev/null 2>&1; then
        stale="0"
    fi
    echo "$stale"
}

# Get SMART health status for a device
# Args: $1 - device path (e.g., /dev/sda)
# Returns: 1 if the health verdict is PASSED/OK, 0 if FAILED or on error
get_smart_health() {
    local device="$1"
    # Match only the verdict line ("... test result: PASSED" for ATA/NVMe,
    # "SMART Health Status: OK" for SCSI). A loose, case-insensitive search
    # for "ok" anywhere in the output could mark a failed drive as healthy.
    if smartctl -H "$device" 2>/dev/null \
        | grep -Eq '(test result|Health Status):[[:space:]]*(PASSED|OK)'; then
        echo "1"
    else
        echo "0"
    fi
}

# Get SMART attributes for a device
# Args: $1 - device path
# Returns: "temperature power_on_hours reallocated_sectors" (0 for any value
#          that is missing from the attribute table)
get_smart_attributes() {
    local device="$1"
    # One awk pass over the attribute table; column 10 is RAW_VALUE.
    # "+ 0" keeps only the leading number, so raw values such as
    # "36 (Min/Max 20/48)" or "21345h+12m+03.120s" still yield a valid metric.
    smartctl -A "$device" 2>/dev/null | awk '
        # Temperature: attribute ID 194 or 190, whichever is listed first
        ($1 == "194" || $1 == "190") && !seen_temp { temp = $10 + 0; seen_temp = 1 }
        # Power on hours: attribute ID 9
        $1 == "9" && !seen_hours { hours = $10 + 0; seen_hours = 1 }
        # Reallocated sectors: attribute ID 5
        $1 == "5" && !seen_realloc { realloc = $10 + 0; seen_realloc = 1 }
        END { printf "%.0f %.0f %.0f\n", temp, hours, realloc }
    '
}

# List the MD RAID arrays named in /proc/mdstat
# Returns: Array names (e.g., md0 md1), one per line; non-zero status and no
#          output when /proc/mdstat does not exist
get_mdraid_arrays() {
    if [ ! -f /proc/mdstat ]; then
        return 1
    fi
    # Array lines start with "md<digit>"; the name is the first field.
    awk '$0 ~ /^md[0-9]/ { print $1 }' /proc/mdstat 2>/dev/null
}

# Get MD RAID array status
# Args: $1 - array name (e.g., md0)
# Returns: "state total_devices active_devices sync_percent"
#   state is the mdadm "State :" value with all whitespace removed, so a
#   multi-word state such as "clean, degraded" becomes "clean,degraded" and
#   stays a single field of the space-separated output.
#   sync_percent is 100 when no recovery/resync percentage is found.
get_mdraid_status() {
    local array="$1"
    local detail
    detail=$(mdadm --detail "/dev/$array" 2>/dev/null)

    local state total active sync_pct
    state=$(printf '%s\n' "$detail" \
        | awk -F: '/State :/ { gsub(/[ \t]+/, "", $2); print $2; exit }')
    total=$(printf '%s\n' "$detail" | awk '/Raid Devices :/ { print $NF; exit }')
    active=$(printf '%s\n' "$detail" | awk '/Active Devices :/ { print $NF; exit }')

    # Check for rebuild/resync percentage in the section of /proc/mdstat that
    # belongs to this array. The name is compared exactly (so md1 does not
    # match md10) and two-argument match() keeps this working on mawk/busybox
    # awk as well as gawk.
    sync_pct=$(awk -v arr="$array" '
        $1 == arr { found = 1; next }
        found && /recovery|resync/ {
            if (match($0, /[0-9]+\.[0-9]+%/)) print substr($0, RSTART, RLENGTH - 1)
            exit
        }
        found && /^md/ { exit }
    ' /proc/mdstat 2>/dev/null)

    echo "${state:-unknown} ${total:-0} ${active:-0} ${sync_pct:-100}"
}

# Report data and metadata usage of LVM thin pools via lvs
# Returns: Lines of "vg_name lv_name data_percent metadata_percent"
#          (rows without all four columns, or with data_percent of 0, are dropped)
get_lvm_thin_pools() {
    if ! command -v lvs >/dev/null 2>&1; then
        return 1
    fi
    local columns="vg_name,lv_name,data_percent,metadata_percent"
    lvs --noheadings --nosuffix -o "$columns" \
        --select 'pool_lv=""' --select 'lv_attr=~[t]' 2>/dev/null \
        | awk '
            NF != 4 { next }
            ($3 + 0) > 0 { print $1, $2, $3, $4 }
        '
}

# Get BTRFS device stats error counts
# Args: $1 - mountpoint
# Returns: Lines of "device error_type count"
get_btrfs_health() {
    local mountpoint="$1"
    command -v btrfs >/dev/null 2>&1 || return
    btrfs device stats "$mountpoint" 2>/dev/null | awk '
        # Format: [/dev/sda1].write_io_errs    0
        /^\[/ {
            # Split at the "]." that ends the device name instead of at every
            # dot, so device paths containing dots stay intact.
            sep = index($0, "].")
            if (sep < 3) next
            device = substr($0, 2, sep - 2)
            n = split(substr($0, sep + 2), rest)
            if (n >= 2 && rest[n] ~ /^[0-9]+$/) print device, rest[1], rest[n]
        }'
}

# Report state and an error flag for every ZFS pool
# Returns: Lines of "pool_name state error_count", where error_count is 0 when
#          the "errors:" line of zpool status starts with "No" and 1 otherwise
get_zpool_health() {
    if ! command -v zpool >/dev/null 2>&1; then
        return 1
    fi
    local pool_name pool_state error_flag
    while read -r pool_name pool_state; do
        error_flag=$(zpool status "$pool_name" 2>/dev/null \
            | awk '/errors:/ { print (($2 == "No") ? 0 : 1) }')
        echo "$pool_name $pool_state ${error_flag:-0}"
    done < <(zpool list -H -o name,health 2>/dev/null)
}

# Dump capacity properties for every ZFS pool (raw zpool list -Hp output)
# Returns: Lines of "pool_name size_bytes alloc_bytes free_bytes frag_pct cap_pct dedup_ratio"
get_zpool_capacity() {
    if ! command -v zpool >/dev/null 2>&1; then
        return 1
    fi
    local props="name,size,alloc,free,frag,cap,dedup"
    zpool list -Hp -o "$props" 2>/dev/null
}

# Get ZFS pool scrub status
# Args: $1 - pool name
# Returns: "state seconds_since_scrub errors_found progress_percent"
#   state: 0=no scrub, 1=completed, 2=in progress
#   A resilver (in progress or finished) is not a scrub and reports state 0.
get_zpool_scrub() {
    local pool="$1"
    local status_output scan_line
    status_output=$(zpool status "$pool" 2>/dev/null)
    scan_line=$(printf '%s\n' "$status_output" | grep "scan:" | head -1)

    case "$scan_line" in
        *"scrub in progress"*)
            local pct
            pct=$(printf '%s\n' "$status_output" \
                | grep -oE '[0-9]+\.[0-9]+% done' | grep -oE '[0-9.]+' | head -1)
            echo "2 0 0 ${pct:-0}"
            ;;
        *"scrub repaired"*)
            local scrub_date scrub_ts seconds_since=0 errors=0
            # Completion timestamp, e.g. "Sun Jan 14 00:34:24 2024".
            scrub_date=$(printf '%s\n' "$scan_line" \
                | grep -oE '[A-Z][a-z]{2} [A-Z][a-z]{2} +[0-9]+ [0-9:]+ [0-9]+' | head -1)
            # Only compute an age when the date really parsed; falling back to
            # epoch 0 would report an age of roughly "now" seconds.
            if [ -n "$scrub_date" ] \
                && scrub_ts=$(date -d "$scrub_date" +%s 2>/dev/null) \
                && [[ "$scrub_ts" =~ ^[0-9]+$ ]]; then
                seconds_since=$(( $(date +%s) - scrub_ts ))
            fi
            if [[ "$scan_line" =~ ([0-9]+)\ errors ]]; then
                errors="${BASH_REMATCH[1]}"
            fi
            echo "1 ${seconds_since} ${errors} 100"
            ;;
        *)
            echo "0 0 0 0"
            ;;
    esac
}

# Dump usage properties for every ZFS dataset (raw zfs list -Hp output)
# Returns: Lines of "dataset used_bytes avail_bytes refer_bytes compressratio type"
get_zfs_datasets() {
    if ! command -v zfs >/dev/null 2>&1; then
        return 1
    fi
    local props="name,used,avail,refer,compressratio,type"
    zfs list -Hp -o "$props" 2>/dev/null
}

# Summarise ZFS snapshots per dataset: how many exist and how old the
# oldest and newest ones are
# Returns: Lines of "dataset count oldest_age_seconds newest_age_seconds"
get_zfs_snapshot_stats() {
    if ! command -v zfs >/dev/null 2>&1; then
        return 1
    fi
    local now
    now=$(date +%s)
    zfs list -t snapshot -Hp -o name,creation 2>/dev/null \
        | awk -v now="$now" '
            {
                # Snapshot names look like dataset@snap; keep the dataset part.
                dataset = $1
                sub(/@.*/, "", dataset)
                created = $2
                snaps[dataset]++
                if (!(dataset in first) || created < first[dataset]) first[dataset] = created
                if (!(dataset in last) || created > last[dataset]) last[dataset] = created
            }
            END {
                for (dataset in snaps) {
                    age_oldest = 0
                    age_newest = 0
                    if (first[dataset] > 0) age_oldest = now - first[dataset]
                    if (last[dataset] > 0) age_newest = now - last[dataset]
                    print dataset, snaps[dataset], age_oldest, age_newest
                }
            }'
}

# Get ZFS per-vdev error counts from zpool status
# Args: $1 - pool name
# Returns: Lines of "vdev read_errors write_errors checksum_errors"
#   The pool-level row and mirror/raidz/log/cache/spare container rows are
#   skipped, so only leaf rows of the config table are reported.
get_zpool_vdev_errors() {
    local pool="$1"
    zpool status "$pool" 2>/dev/null | awk -v pool="$pool" '
        # Turn a counter into a plain integer. Large counters may be shown
        # abbreviated (e.g. 1.5K); they are expanded assuming 1024-based
        # units, so the result is approximate. Returns "" if not a counter.
        function expand(v,    unit, mult) {
            if (v ~ /^[0-9]+$/) return v
            if (v !~ /^[0-9]+(\.[0-9]+)?[KMGT]$/) return ""
            unit = substr(v, length(v))
            mult = 1024
            if (unit == "M") mult = 1024 * 1024
            if (unit == "G") mult = 1024 * 1024 * 1024
            if (unit == "T") mult = 1024 * 1024 * 1024 * 1024
            return sprintf("%.0f", substr(v, 1, length(v) - 1) * mult)
        }
        /NAME.*STATE.*READ.*WRITE.*CKSUM/ { header = 1; next }
        header && /^$/ { exit }
        header && NF >= 5 {
            name = $1
            # Skip pool-level and mirror/raidz container lines
            if (name == pool) next
            if (name ~ /^(mirror|raidz|log|cache|spare)/) next
            # Columns are NAME STATE READ WRITE CKSUM; index from the left so
            # a trailing note such as "(resilvering)" cannot shift the values.
            read_err = expand($3)
            write_err = expand($4)
            cksum_err = expand($5)
            if (read_err == "" || write_err == "" || cksum_err == "") next
            print name, read_err, write_err, cksum_err
        }'
}

# Read ARC hit/miss counters and cache size from /proc/spl/kstat/zfs/arcstats (Linux only)
# Returns: "hits misses size_bytes"; non-zero status and no output when the
#          arcstats file does not exist
get_zfs_arc_stats() {
    local arcstats="/proc/spl/kstat/zfs/arcstats"
    if [ ! -f "$arcstats" ]; then
        return 1
    fi
    local field value
    local -a values=()
    # Each kstat row is "name type data"; take the data column per counter.
    for field in hits misses size; do
        value=$(awk -v name="$field" '$1 == name { print $3 }' "$arcstats" 2>/dev/null)
        values+=("${value:-0}")
    done
    echo "${values[0]} ${values[1]} ${values[2]}"
}

# Get BTRFS filesystem usage (data/metadata/system)
# Args: $1 - mountpoint
# Returns: Lines of "type used_bytes total_bytes", one per block group type
#          present (data, metadata, system), in that order
get_btrfs_usage() {
    local mountpoint="$1"
    command -v btrfs >/dev/null 2>&1 || return
    # "btrfs filesystem usage -b" prints one header line per block group, e.g.
    #   Data,single: Size:1073741824, Used:524288 (0.05%)
    # followed by per-device rows. Both numbers are on the header line itself.
    btrfs filesystem usage -b "$mountpoint" 2>/dev/null | awk '
        /^(Data|Metadata|System),/ {
            group = $0
            sub(/,.*/, "", group)
            group = tolower(group)
            size = 0
            used = 0
            if (match($0, /Size:[0-9]+/)) size = substr($0, RSTART + 5, RLENGTH - 5)
            if (match($0, /Used:[0-9]+/)) used = substr($0, RSTART + 5, RLENGTH - 5)
            # Sum profiles of the same type (two can coexist during a profile
            # conversion) so each type is reported exactly once.
            seen[group] = 1
            total_bytes[group] += size
            used_bytes[group] += used
        }
        END {
            n = split("data metadata system", order, " ")
            for (i = 1; i <= n; i++) {
                g = order[i]
                # %.0f avoids exponent notation for large byte counts.
                if (g in seen) printf "%s %.0f %.0f\n", g, used_bytes[g], total_bytes[g]
            }
        }'
}

# Get BTRFS scrub status
# Args: $1 - mountpoint
# Returns: "state seconds_since_scrub errors_found"
#   state: 0=never, 1=completed, 2=in progress
#   seconds_since_scrub is measured from the "Scrub started:" timestamp and is
#   0 when that line is missing or cannot be parsed by date.
get_btrfs_scrub() {
    local mountpoint="$1"
    command -v btrfs >/dev/null 2>&1 || return
    local output
    output=$(btrfs scrub status "$mountpoint" 2>/dev/null)

    if grep -q "running" <<<"$output"; then
        echo "2 0 0"
    elif grep -q "finished" <<<"$output"; then
        local started scrub_ts seconds_since=0 errors
        started=$(printf '%s\n' "$output" | grep "Scrub started:" \
            | sed 's/.*Scrub started:[[:space:]]*//' | head -1)
        # Only compute an age when the date really parsed; falling back to
        # epoch 0 would report an age of roughly "now" seconds.
        if [ -n "$started" ] \
            && scrub_ts=$(date -d "$started" +%s 2>/dev/null) \
            && [[ "$scrub_ts" =~ ^[0-9]+$ ]]; then
            seconds_since=$(( $(date +%s) - scrub_ts ))
        fi
        errors=$(printf '%s\n' "$output" | awk '
            # "Error summary:    read=2 csum=5" -> sum of the listed counters;
            # "Error summary:    no errors found" has none, so the sum is 0.
            /Error summary:/ {
                total = 0
                for (i = 1; i <= NF; i++) {
                    if ($i ~ /^[a-z_]+=[0-9]+$/) {
                        split($i, kv, "=")
                        total += kv[2]
                    }
                }
                print total
                exit
            }
            # Older layout, presumably "... scrubbed: <size> with N errors".
            / with [0-9]+ errors/ {
                match($0, / with [0-9]+ errors/)
                split(substr($0, RSTART, RLENGTH), words, " ")
                print words[2]
                exit
            }
            /errors found:/ && $NF ~ /^[0-9]+$/ { print $NF; exit }
        ')
        echo "1 ${seconds_since} ${errors:-0}"
    else
        echo "0 0 0"
    fi
}

# Count journal lines from the last 24 hours that look like filesystem or
# block I/O errors
# Returns: Number of matching journal lines ("0" when journalctl is unavailable)
get_fs_errors_from_journal() {
    if ! command -v journalctl >/dev/null 2>&1; then
        echo "0"
        return 0
    fi
    local pattern="(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|Buffer I/O error)"
    local matches
    matches=$(journalctl --since "24 hours ago" --no-pager -q 2>/dev/null \
        | grep -ciE "$pattern" 2>/dev/null)
    echo "${matches:-0}"
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Generate all Prometheus metrics
# Returns: Prometheus text format metrics on stdout
generate_metrics() {
    local script_start
    script_start=$(date +%s)

    cat <<EOF
# HELP storage_health_up Storage health exporter status
# TYPE storage_health_up gauge
storage_health_up 1

# HELP storage_health_exporter_info Storage health exporter information
# TYPE storage_health_exporter_info gauge
storage_health_exporter_info{version="1.0"} 1
EOF

    echo ""

    # ========================================================================
    # Filesystem Info and Inode Usage
    # ========================================================================
    cat <<EOF
# HELP storage_health_fs_info Filesystem information (always 1)
# TYPE storage_health_fs_info gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        echo "storage_health_fs_info{device=\"$device\",mountpoint=\"$mountpoint\",fstype=\"$fstype\"} 1"
    done < <(get_mounted_filesystems)

    echo ""

    cat <<EOF
# HELP storage_health_mount_readonly Mount read-only status (1=ro, 0=rw)
# TYPE storage_health_mount_readonly gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        local ro
        ro=$(get_mount_readonly "$mountpoint")
        echo "storage_health_mount_readonly{device=\"$device\",mountpoint=\"$mountpoint\"} $ro"
    done < <(get_mounted_filesystems)

    echo ""

    cat <<EOF
# HELP storage_health_mount_stale Mount stale/hung status (1=stale, 0=ok)
# TYPE storage_health_mount_stale gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        local stale
        stale=$(check_stale_mount "$mountpoint")
        echo "storage_health_mount_stale{mountpoint=\"$mountpoint\"} $stale"
    done < <(get_mounted_filesystems)

    echo ""

    cat <<EOF
# HELP storage_health_inode_total Total inodes per mountpoint
# TYPE storage_health_inode_total gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        local inode_info total used
        inode_info=$(get_inode_usage "$mountpoint")
        total=$(echo "$inode_info" | awk '{print $1}')
        echo "storage_health_inode_total{mountpoint=\"$mountpoint\"} ${total:-0}"
    done < <(get_mounted_filesystems)

    echo ""

    cat <<EOF
# HELP storage_health_inode_used Used inodes per mountpoint
# TYPE storage_health_inode_used gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        local inode_info used
        inode_info=$(get_inode_usage "$mountpoint")
        used=$(echo "$inode_info" | awk '{print $2}')
        echo "storage_health_inode_used{mountpoint=\"$mountpoint\"} ${used:-0}"
    done < <(get_mounted_filesystems)

    echo ""

    cat <<EOF
# HELP storage_health_inode_usage_percent Inode usage percentage per mountpoint
# TYPE storage_health_inode_usage_percent gauge
EOF

    while read -r device mountpoint fstype; do
        [ -z "$device" ] && continue
        local inode_info total used pct
        inode_info=$(get_inode_usage "$mountpoint")
        total=$(echo "$inode_info" | awk '{print $1}')
        used=$(echo "$inode_info" | awk '{print $2}')
        total=${total:-0}
        used=${used:-0}
        if [ "$total" -gt 0 ] 2>/dev/null; then
            pct=$(awk "BEGIN {printf \"%.2f\", ($used / $total) * 100}" 2>/dev/null || echo "0")
        else
            pct="0"
        fi
        echo "storage_health_inode_usage_percent{mountpoint=\"$mountpoint\"} $pct"
    done < <(get_mounted_filesystems)

    echo ""

    # ========================================================================
    # SMART Disk Health (optional)
    # ========================================================================
    if command -v smartctl >/dev/null 2>&1; then
        cat <<EOF
# HELP storage_health_smart_healthy SMART health status (1=passed, 0=failed)
# TYPE storage_health_smart_healthy gauge
EOF

        while read -r name dtype; do
            [ "$dtype" = "disk" ] || continue
            local device="/dev/$name"
            local healthy
            healthy=$(get_smart_health "$device")
            echo "storage_health_smart_healthy{device=\"$device\"} $healthy"
        done < <(lsblk -dno NAME,TYPE 2>/dev/null)

        echo ""

        cat <<EOF
# HELP storage_health_smart_temperature_celsius Drive temperature in Celsius
# TYPE storage_health_smart_temperature_celsius gauge
EOF

        while read -r name dtype; do
            [ "$dtype" = "disk" ] || continue
            local device="/dev/$name"
            local attrs temp
            attrs=$(get_smart_attributes "$device")
            temp=$(echo "$attrs" | awk '{print $1}')
            echo "storage_health_smart_temperature_celsius{device=\"$device\"} ${temp:-0}"
        done < <(lsblk -dno NAME,TYPE 2>/dev/null)

        echo ""

        cat <<EOF
# HELP storage_health_smart_power_on_hours Drive power on hours
# TYPE storage_health_smart_power_on_hours gauge
EOF

        while read -r name dtype; do
            [ "$dtype" = "disk" ] || continue
            local device="/dev/$name"
            local attrs hours
            attrs=$(get_smart_attributes "$device")
            hours=$(echo "$attrs" | awk '{print $2}')
            echo "storage_health_smart_power_on_hours{device=\"$device\"} ${hours:-0}"
        done < <(lsblk -dno NAME,TYPE 2>/dev/null)

        echo ""

        cat <<EOF
# HELP storage_health_smart_reallocated_sectors Reallocated sector count
# TYPE storage_health_smart_reallocated_sectors gauge
EOF

        while read -r name dtype; do
            [ "$dtype" = "disk" ] || continue
            local device="/dev/$name"
            local attrs realloc
            attrs=$(get_smart_attributes "$device")
            realloc=$(echo "$attrs" | awk '{print $3}')
            echo "storage_health_smart_reallocated_sectors{device=\"$device\"} ${realloc:-0}"
        done < <(lsblk -dno NAME,TYPE 2>/dev/null)

        echo ""
    fi

    # ========================================================================
    # MD RAID (optional)
    # ========================================================================
    if [ -f /proc/mdstat ] && command -v mdadm >/dev/null 2>&1; then
        local arrays
        arrays=$(get_mdraid_arrays)

        if [ -n "$arrays" ]; then
            cat <<EOF
# HELP storage_health_mdraid_healthy MD RAID array health (1=clean, 0=degraded)
# TYPE storage_health_mdraid_healthy gauge
EOF

            for array in $arrays; do
                local status state
                status=$(get_mdraid_status "$array")
                state=$(echo "$status" | awk '{print $1}')
                if echo "$state" | grep -qi "clean\|active"; then
                    echo "storage_health_mdraid_healthy{array=\"$array\"} 1"
                else
                    echo "storage_health_mdraid_healthy{array=\"$array\"} 0"
                fi
            done

            echo ""

            cat <<EOF
# HELP storage_health_mdraid_degraded MD RAID degraded status (1=degraded, 0=ok)
# TYPE storage_health_mdraid_degraded gauge
EOF

            for array in $arrays; do
                local status state
                status=$(get_mdraid_status "$array")
                state=$(echo "$status" | awk '{print $1}')
                if echo "$state" | grep -qi "degraded"; then
                    echo "storage_health_mdraid_degraded{array=\"$array\"} 1"
                else
                    echo "storage_health_mdraid_degraded{array=\"$array\"} 0"
                fi
            done

            echo ""

            cat <<EOF
# HELP storage_health_mdraid_devices_total Total devices in MD RAID array
# TYPE storage_health_mdraid_devices_total gauge
EOF

            for array in $arrays; do
                local status total
                status=$(get_mdraid_status "$array")
                total=$(echo "$status" | awk '{print $2}')
                echo "storage_health_mdraid_devices_total{array=\"$array\"} ${total:-0}"
            done

            echo ""

            cat <<EOF
# HELP storage_health_mdraid_devices_active Active devices in MD RAID array
# TYPE storage_health_mdraid_devices_active gauge
EOF

            for array in $arrays; do
                local status active
                status=$(get_mdraid_status "$array")
                active=$(echo "$status" | awk '{print $3}')
                echo "storage_health_mdraid_devices_active{array=\"$array\"} ${active:-0}"
            done

            echo ""

            cat <<EOF
# HELP storage_health_mdraid_sync_percent MD RAID sync/rebuild percentage
# TYPE storage_health_mdraid_sync_percent gauge
EOF

            for array in $arrays; do
                local status sync_pct
                status=$(get_mdraid_status "$array")
                sync_pct=$(echo "$status" | awk '{print $4}')
                echo "storage_health_mdraid_sync_percent{array=\"$array\"} ${sync_pct:-100}"
            done

            echo ""
        fi
    fi

    # ========================================================================
    # LVM Thin Pools (optional)
    # ========================================================================
    if command -v lvs >/dev/null 2>&1; then
        local thin_pools
        thin_pools=$(get_lvm_thin_pools)

        if [ -n "$thin_pools" ]; then
            cat <<EOF
# HELP storage_health_lvm_thin_data_percent LVM thin pool data usage percentage
# TYPE storage_health_lvm_thin_data_percent gauge
EOF

            echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
                echo "storage_health_lvm_thin_data_percent{vg=\"$vg\",pool=\"$pool\"} ${data_pct:-0}"
            done

            echo ""

            cat <<EOF
# HELP storage_health_lvm_thin_metadata_percent LVM thin pool metadata usage percentage
# TYPE storage_health_lvm_thin_metadata_percent gauge
EOF

            echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
                echo "storage_health_lvm_thin_metadata_percent{vg=\"$vg\",pool=\"$pool\"} ${meta_pct:-0}"
            done

            echo ""
        fi
    fi

    # ========================================================================
    # BTRFS Health (optional)
    # ========================================================================
    if command -v btrfs >/dev/null 2>&1; then
        local has_btrfs=0

        while read -r device mountpoint fstype; do
            [ "$fstype" = "btrfs" ] && has_btrfs=1
        done < <(get_mounted_filesystems)

        if [ "$has_btrfs" -eq 1 ]; then
            cat <<EOF
# HELP storage_health_btrfs_errors BTRFS device error counts
# TYPE storage_health_btrfs_errors gauge
EOF

            while read -r device mountpoint fstype; do
                [ "$fstype" = "btrfs" ] || continue
                while read -r bdev etype count; do
                    [ -z "$bdev" ] && continue
                    echo "storage_health_btrfs_errors{mountpoint=\"$mountpoint\",device=\"$bdev\",type=\"$etype\"} ${count:-0}"
                done < <(get_btrfs_health "$mountpoint")
            done < <(get_mounted_filesystems)

            echo ""

            cat <<EOF
# HELP storage_health_btrfs_usage_bytes BTRFS usage by block group type (data/metadata/system)
# TYPE storage_health_btrfs_usage_bytes gauge
EOF

            while read -r device mountpoint fstype; do
                [ "$fstype" = "btrfs" ] || continue
                while read -r btype used total; do
                    [ -z "$btype" ] && continue
                    echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"used\"} ${used:-0}"
                    echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"total\"} ${total:-0}"
                done < <(get_btrfs_usage "$mountpoint")
            done < <(get_mounted_filesystems)

            echo ""

            cat <<EOF
# HELP storage_health_btrfs_scrub_status BTRFS scrub state (0=never, 1=completed, 2=in progress)
# TYPE storage_health_btrfs_scrub_status gauge
# HELP storage_health_btrfs_scrub_age_seconds Seconds since last BTRFS scrub completed
# TYPE storage_health_btrfs_scrub_age_seconds gauge
# HELP storage_health_btrfs_scrub_errors Errors found during last BTRFS scrub
# TYPE storage_health_btrfs_scrub_errors gauge
EOF

            while read -r device mountpoint fstype; do
                [ "$fstype" = "btrfs" ] || continue
                local scrub_info state scrub_age scrub_errors
                scrub_info=$(get_btrfs_scrub "$mountpoint")
                state=$(echo "$scrub_info" | awk '{print $1}')
                scrub_age=$(echo "$scrub_info" | awk '{print $2}')
                scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
                echo "storage_health_btrfs_scrub_status{mountpoint=\"$mountpoint\"} ${state:-0}"
                echo "storage_health_btrfs_scrub_age_seconds{mountpoint=\"$mountpoint\"} ${scrub_age:-0}"
                echo "storage_health_btrfs_scrub_errors{mountpoint=\"$mountpoint\"} ${scrub_errors:-0}"
            done < <(get_mounted_filesystems)

            echo ""
        fi
    fi

    # ========================================================================
    # ZFS Pool Health (optional)
    # ========================================================================
    if command -v zpool >/dev/null 2>&1; then
        local zfs_pools
        zfs_pools=$(get_zpool_health)

        if [ -n "$zfs_pools" ]; then
            cat <<EOF
# HELP storage_health_zfs_pool_healthy ZFS pool health (1=ONLINE, 0=other)
# TYPE storage_health_zfs_pool_healthy gauge
EOF

            echo "$zfs_pools" | while read -r pool state errors; do
                if [ "$state" = "ONLINE" ]; then
                    echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 1"
                else
                    echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 0"
                fi
            done

            echo ""

            cat <<EOF
# HELP storage_health_zfs_pool_errors ZFS pool error count
# TYPE storage_health_zfs_pool_errors gauge
EOF

            echo "$zfs_pools" | while read -r pool state errors; do
                echo "storage_health_zfs_pool_errors{pool=\"$pool\"} ${errors:-0}"
            done

            echo ""

            # ZFS pool capacity
            local zfs_capacity
            zfs_capacity=$(get_zpool_capacity)

            if [ -n "$zfs_capacity" ]; then
                cat <<EOF
# HELP storage_health_zfs_pool_size_bytes ZFS pool total size in bytes
# TYPE storage_health_zfs_pool_size_bytes gauge
# HELP storage_health_zfs_pool_alloc_bytes ZFS pool allocated bytes
# TYPE storage_health_zfs_pool_alloc_bytes gauge
# HELP storage_health_zfs_pool_free_bytes ZFS pool free bytes
# TYPE storage_health_zfs_pool_free_bytes gauge
# HELP storage_health_zfs_pool_fragmentation_percent ZFS pool fragmentation percentage
# TYPE storage_health_zfs_pool_fragmentation_percent gauge
# HELP storage_health_zfs_pool_capacity_percent ZFS pool capacity percentage
# TYPE storage_health_zfs_pool_capacity_percent gauge
# HELP storage_health_zfs_pool_dedup_ratio ZFS pool deduplication ratio
# TYPE storage_health_zfs_pool_dedup_ratio gauge
EOF

                echo "$zfs_capacity" | while read -r pool size alloc free frag cap dedup; do
                    echo "storage_health_zfs_pool_size_bytes{pool=\"$pool\"} ${size:-0}"
                    echo "storage_health_zfs_pool_alloc_bytes{pool=\"$pool\"} ${alloc:-0}"
                    echo "storage_health_zfs_pool_free_bytes{pool=\"$pool\"} ${free:-0}"
                    echo "storage_health_zfs_pool_fragmentation_percent{pool=\"$pool\"} ${frag:-0}"
                    echo "storage_health_zfs_pool_capacity_percent{pool=\"$pool\"} ${cap:-0}"
                    echo "storage_health_zfs_pool_dedup_ratio{pool=\"$pool\"} ${dedup:-1}"
                done

                echo ""
            fi

            # ZFS scrub status
            cat <<EOF
# HELP storage_health_zfs_scrub_status ZFS scrub state (0=none, 1=completed, 2=in progress)
# TYPE storage_health_zfs_scrub_status gauge
# HELP storage_health_zfs_scrub_age_seconds Seconds since last ZFS scrub completed
# TYPE storage_health_zfs_scrub_age_seconds gauge
# HELP storage_health_zfs_scrub_errors Errors found during last ZFS scrub
# TYPE storage_health_zfs_scrub_errors gauge
# HELP storage_health_zfs_scrub_progress_percent ZFS scrub progress percentage (100 if not running)
# TYPE storage_health_zfs_scrub_progress_percent gauge
EOF

            echo "$zfs_pools" | while read -r pool state errors; do
                local scrub_info scrub_state scrub_age scrub_errors scrub_pct
                scrub_info=$(get_zpool_scrub "$pool")
                scrub_state=$(echo "$scrub_info" | awk '{print $1}')
                scrub_age=$(echo "$scrub_info" | awk '{print $2}')
                scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
                scrub_pct=$(echo "$scrub_info" | awk '{print $4}')
                echo "storage_health_zfs_scrub_status{pool=\"$pool\"} ${scrub_state:-0}"
                echo "storage_health_zfs_scrub_age_seconds{pool=\"$pool\"} ${scrub_age:-0}"
                echo "storage_health_zfs_scrub_errors{pool=\"$pool\"} ${scrub_errors:-0}"
                echo "storage_health_zfs_scrub_progress_percent{pool=\"$pool\"} ${scrub_pct:-0}"
            done

            echo ""

            # ZFS per-vdev errors
            cat <<EOF
# HELP storage_health_zfs_vdev_read_errors ZFS vdev read error count
# TYPE storage_health_zfs_vdev_read_errors gauge
# HELP storage_health_zfs_vdev_write_errors ZFS vdev write error count
# TYPE storage_health_zfs_vdev_write_errors gauge
# HELP storage_health_zfs_vdev_checksum_errors ZFS vdev checksum error count
# TYPE storage_health_zfs_vdev_checksum_errors gauge
EOF

            echo "$zfs_pools" | while read -r pool state errors; do
                while read -r vdev read_err write_err cksum_err; do
                    [ -z "$vdev" ] && continue
                    echo "storage_health_zfs_vdev_read_errors{pool=\"$pool\",vdev=\"$vdev\"} ${read_err:-0}"
                    echo "storage_health_zfs_vdev_write_errors{pool=\"$pool\",vdev=\"$vdev\"} ${write_err:-0}"
                    echo "storage_health_zfs_vdev_checksum_errors{pool=\"$pool\",vdev=\"$vdev\"} ${cksum_err:-0}"
                done < <(get_zpool_vdev_errors "$pool")
            done

            echo ""
        fi

        # ZFS datasets
        local zfs_datasets
        zfs_datasets=$(get_zfs_datasets)

        if [ -n "$zfs_datasets" ]; then
            cat <<EOF
# HELP storage_health_zfs_dataset_used_bytes ZFS dataset used bytes
# TYPE storage_health_zfs_dataset_used_bytes gauge
# HELP storage_health_zfs_dataset_avail_bytes ZFS dataset available bytes
# TYPE storage_health_zfs_dataset_avail_bytes gauge
# HELP storage_health_zfs_dataset_refer_bytes ZFS dataset referenced bytes
# TYPE storage_health_zfs_dataset_refer_bytes gauge
# HELP storage_health_zfs_dataset_compressratio ZFS dataset compression ratio
# TYPE storage_health_zfs_dataset_compressratio gauge
EOF

            echo "$zfs_datasets" | while read -r dataset used avail refer ratio dtype; do
                echo "storage_health_zfs_dataset_used_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${used:-0}"
                echo "storage_health_zfs_dataset_avail_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${avail:-0}"
                echo "storage_health_zfs_dataset_refer_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${refer:-0}"
                echo "storage_health_zfs_dataset_compressratio{dataset=\"$dataset\",type=\"$dtype\"} ${ratio:-1}"
            done

            echo ""
        fi

        # ZFS snapshot stats
        local zfs_snaps
        zfs_snaps=$(get_zfs_snapshot_stats)

        if [ -n "$zfs_snaps" ]; then
            cat <<EOF
# HELP storage_health_zfs_snapshot_count ZFS snapshot count per dataset
# TYPE storage_health_zfs_snapshot_count gauge
# HELP storage_health_zfs_snapshot_oldest_age_seconds Age of oldest ZFS snapshot in seconds
# TYPE storage_health_zfs_snapshot_oldest_age_seconds gauge
# HELP storage_health_zfs_snapshot_newest_age_seconds Age of newest ZFS snapshot in seconds
# TYPE storage_health_zfs_snapshot_newest_age_seconds gauge
EOF

            echo "$zfs_snaps" | while read -r dataset count oldest newest; do
                echo "storage_health_zfs_snapshot_count{dataset=\"$dataset\"} ${count:-0}"
                echo "storage_health_zfs_snapshot_oldest_age_seconds{dataset=\"$dataset\"} ${oldest:-0}"
                echo "storage_health_zfs_snapshot_newest_age_seconds{dataset=\"$dataset\"} ${newest:-0}"
            done

            echo ""
        fi

        # ZFS ARC stats (Linux only)
        local arc_stats
        arc_stats=$(get_zfs_arc_stats)

        if [ -n "$arc_stats" ]; then
            local arc_hits arc_misses arc_size
            arc_hits=$(echo "$arc_stats" | awk '{print $1}')
            arc_misses=$(echo "$arc_stats" | awk '{print $2}')
            arc_size=$(echo "$arc_stats" | awk '{print $3}')

            cat <<EOF
# HELP storage_health_zfs_arc_hits_total ZFS ARC cache hits (counter)
# TYPE storage_health_zfs_arc_hits_total counter
storage_health_zfs_arc_hits_total ${arc_hits:-0}

# HELP storage_health_zfs_arc_misses_total ZFS ARC cache misses (counter)
# TYPE storage_health_zfs_arc_misses_total counter
storage_health_zfs_arc_misses_total ${arc_misses:-0}

# HELP storage_health_zfs_arc_size_bytes ZFS ARC cache size in bytes
# TYPE storage_health_zfs_arc_size_bytes gauge
storage_health_zfs_arc_size_bytes ${arc_size:-0}
EOF

            echo ""
        fi
    fi

    # ========================================================================
    # Journal Filesystem Errors
    # ========================================================================
    local fs_errors
    fs_errors=$(get_fs_errors_from_journal)

    cat <<EOF
# HELP storage_health_journal_fs_errors_24h Filesystem error messages in journal (24h)
# TYPE storage_health_journal_fs_errors_24h gauge
storage_health_journal_fs_errors_24h ${fs_errors:-0}
EOF

    echo ""

    # ========================================================================
    # Exporter Runtime
    # ========================================================================
    local script_end script_duration
    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    cat <<EOF
# HELP storage_health_exporter_duration_seconds Time to generate all metrics
# TYPE storage_health_exporter_duration_seconds gauge
storage_health_exporter_duration_seconds $script_duration

# HELP storage_health_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE storage_health_exporter_last_run_timestamp gauge
storage_health_exporter_last_run_timestamp $script_end
EOF

    echo ""
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Run a minimal single-connection-at-a-time HTTP server using netcat.
#
# "GET /metrics..." requests receive freshly generated Prometheus metrics;
# every other request receives a small HTML landing page.
#
# Globals:
#   HTTP_PORT (read) - TCP port handed to "nc -l -p"
# Outputs:
#   Startup/error messages on stderr; HTTP responses via netcat.
# Returns:
#   Never returns normally (loops until the process is killed).
#   Exits 1 if nc is missing or the response FIFO cannot be created.
# Notes:
#   - Installs an EXIT trap (replacing any existing one) to remove the FIFO.
#   - The "-p" / "-q" flags are passed to nc unchanged; NOTE(review): not every
#     netcat variant accepts them - confirm against the deployed nc.
run_http_server() {
    echo "Starting storage health exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # The handler must READ the request from nc's stdout and WRITE the reply to
    # nc's stdin. A plain "handler | nc" pipe only provides the second half
    # (the handler would read the script's own stdin), so the reply is routed
    # back to nc through a private FIFO instead.
    local fifo_dir response_fifo request
    if ! fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/storage_health_http.XXXXXX"); then
        echo "ERROR: Failed to create temporary directory for HTTP mode" >&2
        exit 1
    fi
    response_fifo="$fifo_dir/response"
    if ! mkfifo -m 600 "$response_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Failed to create response FIFO $response_fifo" >&2
        exit 1
    fi
    # Expand the path now (shell-quoted) so cleanup does not depend on the
    # function-local variable still being in scope when the trap fires.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    # Infinite loop: one nc listener (and one handler) per connection
    while true; do
        nc -l -p "$HTTP_PORT" -q 1 <"$response_fifo" 2>/dev/null | {
            request=""
            # First line of the HTTP request, e.g. "GET /metrics HTTP/1.1"
            read -r request
            # No Content-Length is sent, so the body is delimited by the
            # connection closing; say so explicitly with "Connection: close".
            if [[ "$request" =~ ^GET\ /metrics ]]; then
                printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
                generate_metrics
            else  # Serve HTML landing page for other requests
                printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
                cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Storage Health Exporter v1.0</title></head>
<body>
<h1>Storage Health Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Filesystem info, mount status, inode usage</li>
<li>SMART disk health (requires smartctl)</li>
<li>MD RAID array status (requires mdadm)</li>
<li>LVM thin pool usage (requires lvs)</li>
<li>BTRFS device error stats (requires btrfs)</li>
<li>ZFS pool health (requires zpool)</li>
<li>Journal filesystem error count</li>
</ul>
</body>
</html>
EOF
            fi
        } >"$response_fifo"
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - parses arguments and routes to the selected output mode.
#
# Globals:
#   HTTP_MODE   (read) - "true" selects HTTP server mode
#   OUTPUT_FILE (read) - non-empty selects textfile mode and names the target
# Arguments:
#   "$@" - command-line arguments, forwarded to parse_args
# Outputs:
#   Metrics on stdout (default mode) or written to OUTPUT_FILE (textfile mode);
#   status and error messages on stderr.
# Returns:
#   Exits 1 if any textfile step fails (directory/temp file creation, metric
#   generation, size validation, or the final rename). On failure an existing
#   OUTPUT_FILE is left untouched and the temp file is removed.
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        # Run HTTP server (blocks until killed)
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        # Textfile collector mode: write atomically using temp file
        local output_dir temp_file file_lines

        output_dir="$(dirname -- "$OUTPUT_FILE")"
        if ! mkdir -p -- "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        # Create temp file in SAME directory for atomic rename (same filesystem)
        if ! temp_file=$(mktemp "${output_dir}/.storage_health_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temporary file in $output_dir" >&2
            exit 1
        fi

        # Generate metrics to temp file
        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: a suspiciously short file is treated as a failed run so the
        # previous (good) metrics file is kept
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

        if [ "$file_lines" -lt 10 ]; then
            rm -f -- "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # Set permissions before move (mktemp creates the file owner-only)
        chmod 644 "$temp_file"

        # Atomic rename - no gap where file is missing. Only report success if
        # the rename really happened; otherwise clean up the orphaned temp file.
        if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}

# Script entry point: invoke main, forwarding every command-line argument
# unchanged ("$@" keeps each argument as a separate word)
main "$@"