Files
linux-scripts/storage-health-exporter.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

1179 lines
43 KiB
Bash

#!/bin/bash
################################################################################
# Script Name: storage-health-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for storage health metrics covering all
# common Linux filesystems. Exports inode usage, SMART disk health,
# mdadm RAID status, LVM thin pool usage, btrfs/zfs health,
# stale mount detection, and filesystem error counts from journal.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
# - Standard Unix tools (df, stat, awk, grep)
# - netcat (nc) for HTTP mode
# - Optional: smartctl (SMART), mdadm (RAID), lvs (LVM),
# btrfs (btrfs stats), zpool (ZFS), journalctl (fs errors)
# Each section is skipped gracefully if tools are missing.
#
# Usage:
# # Output to stdout
# ./storage-health-exporter.sh
#
# # HTTP server mode
# ./storage-health-exporter.sh --http -p 9197
#
# # Textfile collector mode
# ./storage-health-exporter.sh --textfile
#
# Metrics Exported:
# Core Status:
# - storage_health_up - Exporter status (1=up, 0=down)
# - storage_health_exporter_info{version} - Exporter version
#
# Filesystem Info:
# - storage_health_fs_info{device,mountpoint,fstype} - Filesystem info
# - storage_health_mount_readonly{device,mountpoint} - Read-only status
# - storage_health_mount_stale{mountpoint} - Stale/hung mount detection
#
# Inodes:
# - storage_health_inode_total{mountpoint} - Total inodes
# - storage_health_inode_used{mountpoint} - Used inodes
# - storage_health_inode_usage_percent{mountpoint} - Inode usage percentage
#
# SMART (if smartctl available):
# - storage_health_smart_healthy{device} - 1=passed, 0=failed
# - storage_health_smart_temperature_celsius{device} - Drive temperature
# - storage_health_smart_power_on_hours{device} - Power on hours
# - storage_health_smart_reallocated_sectors{device} - Reallocated sectors
#
# MD RAID (if /proc/mdstat exists):
# - storage_health_mdraid_healthy{array} - 1=clean, 0=degraded
# - storage_health_mdraid_degraded{array} - 1 if degraded
# - storage_health_mdraid_devices_total{array} - Total devices
# - storage_health_mdraid_devices_active{array} - Active devices
# - storage_health_mdraid_sync_percent{array} - Rebuild percentage
#
# LVM Thin Pools (if lvs available):
# - storage_health_lvm_thin_data_percent{vg,pool} - Data usage
# - storage_health_lvm_thin_metadata_percent{vg,pool} - Metadata usage
#
# BTRFS (if btrfs mounted):
# - storage_health_btrfs_errors{mountpoint,device,type} - Error counts
# - storage_health_btrfs_usage_bytes{mountpoint,group,usage} - Block group usage
# - storage_health_btrfs_scrub_status{mountpoint} - Scrub state
# - storage_health_btrfs_scrub_age_seconds{mountpoint} - Scrub age
# - storage_health_btrfs_scrub_errors{mountpoint} - Scrub error count
#
# ZFS (if zpool/zfs available):
# - storage_health_zfs_pool_healthy{pool} - 1=ONLINE, 0=other
# - storage_health_zfs_pool_errors{pool} - Error count
# - storage_health_zfs_pool_size_bytes{pool} - Pool total size
# - storage_health_zfs_pool_alloc_bytes{pool} - Pool allocated bytes
# - storage_health_zfs_pool_free_bytes{pool} - Pool free bytes
# - storage_health_zfs_pool_fragmentation_percent{pool} - Fragmentation
# - storage_health_zfs_pool_capacity_percent{pool} - Capacity used
# - storage_health_zfs_pool_dedup_ratio{pool} - Dedup ratio
# - storage_health_zfs_scrub_status{pool} - Scrub state
# - storage_health_zfs_scrub_age_seconds{pool} - Seconds since scrub
# - storage_health_zfs_scrub_errors{pool} - Scrub errors found
# - storage_health_zfs_scrub_progress_percent{pool} - Scrub progress
# - storage_health_zfs_vdev_read_errors{pool,vdev} - Per-vdev read errors
# - storage_health_zfs_vdev_write_errors{pool,vdev} - Per-vdev write errors
# - storage_health_zfs_vdev_checksum_errors{pool,vdev} - Per-vdev cksum errors
# - storage_health_zfs_dataset_used_bytes{dataset,type} - Dataset usage
# - storage_health_zfs_dataset_avail_bytes{dataset,type} - Dataset available
# - storage_health_zfs_dataset_refer_bytes{dataset,type} - Dataset referenced
# - storage_health_zfs_dataset_compressratio{dataset,type} - Compression ratio
# - storage_health_zfs_snapshot_count{dataset} - Snapshot count
# - storage_health_zfs_snapshot_oldest_age_seconds{dataset} - Oldest snap age
# - storage_health_zfs_snapshot_newest_age_seconds{dataset} - Newest snap age
# - storage_health_zfs_arc_hits_total - ARC cache hits
# - storage_health_zfs_arc_misses_total - ARC cache misses
# - storage_health_zfs_arc_size_bytes - ARC cache size
#
# Journal:
# - storage_health_journal_fs_errors_24h - FS error count from journal
#
# Exporter:
# - storage_health_exporter_duration_seconds - Script execution time
# - storage_health_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
# Default HTTP port: 9197
# Textfile directory: /var/lib/node_exporter
#
################################################################################
# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================
# Directory used for node_exporter textfile-collector output (--textfile).
TEXTFILE_DIR='/var/lib/node_exporter'
# Metrics destination file; stays empty until --textfile or -o/--output sets it.
OUTPUT_FILE=''
# String flag ("true"/"false") switched on by --http.
HTTP_MODE='false'
# Port for --http mode; overridden by -p/--port.
HTTP_PORT='9197'
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
# Print the command-line help text on stdout, then end the script with
# exit status 0.
# Globals: HTTP_PORT (read) - interpolated into the --http description.
show_usage() {
  local self="$0"
  cat <<USAGE_TEXT
Usage: $self [OPTIONS]
Export storage health statistics as Prometheus metrics (v1.0).
MODES:
--textfile Write to node_exporter textfile collector
--http Run HTTP server on port $HTTP_PORT
OPTIONS:
-p, --port HTTP port (default: 9197)
-o, --output Output file path
EXAMPLES:
$self --textfile # Write to textfile collector
$self --http --port 9197 # Run HTTP server
$self -o /tmp/storage_health.prom # Write to custom file
SECTIONS (auto-detected, skipped if tools missing):
- Filesystem info and inode usage (always available)
- SMART disk health (requires smartctl)
- MD RAID status (requires /proc/mdstat + mdadm)
- LVM thin pool usage (requires lvs)
- BTRFS device stats (requires btrfs)
- ZFS pool health (requires zpool)
- Journal filesystem errors (requires journalctl)
USAGE_TEXT
  exit 0
}
# Parse command-line options into the global configuration variables.
# Globals:
#   TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written)
# Arguments: the script's command-line arguments ("$@")
# Exits: 0 after printing help (-h/--help, via show_usage); 1 on an unknown
#        option, a missing option value, or an invalid port number.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/storage_health.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port)
        # Without this guard "shift 2" fails when the value is missing,
        # nothing gets shifted, and the loop would spin forever.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        if [[ ! "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
          echo "Invalid port: $2" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      -o|--output)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
# List mounted filesystems as reported by `df -T`, leaving out virtual ones.
# The header row is dropped, as are rows whose type is tmpfs, devtmpfs,
# overlay, squashfs, efivarfs, nsfs, or starts with "fuse." or "cgroup".
# Outputs: one "device mountpoint fstype" line per remaining filesystem
get_mounted_filesystems() {
  df -T 2>/dev/null | awk '
    NR == 1 { next }
    $2 ~ /^(tmpfs|devtmpfs|overlay|squashfs|efivarfs|fuse\..*|nsfs|cgroup.*)$/ { next }
    { print $1, $7, $2 }
  '
}
# Get inode usage for a mountpoint
# Args: $1 - mountpoint
# Outputs: "total used"; "0 0" when df fails or produces no data row.
#          A column that is not a plain integer (df prints "-" when it has
#          no inode figure) is reported as 0 so the sample stays numeric.
get_inode_usage() {
  local mountpoint="$1"
  local usage
  # -P (POSIX format) keeps each filesystem on one line, so the data row is
  # always line 2 even when the device name is very long.
  usage=$(df -Pi "$mountpoint" 2>/dev/null | awk '
    NR == 2 {
      total = ($2 ~ /^[0-9]+$/) ? $2 : 0
      used = ($3 ~ /^[0-9]+$/) ? $3 : 0
      print total, used
    }')
  echo "${usage:-0 0}"
}
# Report whether a mountpoint is mounted read-only.
# Looks up the first /proc/mounts entry whose mount path equals $1 and checks
# its option list for a standalone "ro" token.
# Args: $1 - mountpoint
# Outputs: "1" if read-only, "0" otherwise (including when no entry is found)
get_mount_readonly() {
  local target="$1"
  local mount_opts
  mount_opts=$(awk -v mp="$target" '$2 == mp { print $4 }' /proc/mounts 2>/dev/null | head -1)
  # Wrapping in commas lets a single glob match "ro" at any list position.
  case ",${mount_opts}," in
    *,ro,*) echo "1" ;;
    *) echo "0" ;;
  esac
}
# Probe a mountpoint for staleness (e.g., a hung NFS mount).
# Runs `stat -t` on the path under a 2-second timeout; any failure of that
# command, including the timeout expiring, counts as stale.
# Args: $1 - mountpoint
# Outputs: "1" if stale, "0" if the stat call succeeded
check_stale_mount() {
  local target="$1"
  local stale=1
  timeout 2 stat -t "$target" >/dev/null 2>&1 && stale=0
  echo "$stale"
}
# Get SMART health status for a device
# Args: $1 - device path (e.g., /dev/sda)
# Outputs: "1" when smartctl reports a passing verdict, "0" otherwise
#          (failed verdict, no verdict line, or smartctl error)
get_smart_health() {
  local device="$1"
  local report
  report=$(smartctl -H "$device" 2>/dev/null)
  # Match only the verdict line itself ("... test result: PASSED" or
  # "SMART Health Status: OK"). A loose, case-insensitive search for "OK"
  # would also hit unrelated words such as "look" in a failure message.
  local verdict_re='^[[:space:]]*SMART (overall-health self-assessment test result|Health Status):[[:space:]]*(PASSED|OK)([[:space:]]|$)'
  if grep -Eq "$verdict_re" <<<"$report"; then
    echo "1"
  else
    echo "0"
  fi
}
# Get SMART attributes for a device
# Args: $1 - device path
# Outputs: "temperature power_on_hours reallocated_sectors"; each value is 0
#          when the attribute is absent or smartctl fails.
# Values come from column 10 (RAW_VALUE) of the `smartctl -A` table and are
# reduced to their leading integer, so raw strings such as
# "21345h+12m+07.123s" still yield a valid numeric sample.
get_smart_attributes() {
  local device="$1"
  smartctl -A "$device" 2>/dev/null | awk '
    # Temperature: first row with attribute ID 194 or 190
    ($1 == "194" || $1 == "190") && !have_temp { temp = $10 + 0; have_temp = 1 }
    # Power on hours: attribute ID 9
    $1 == "9" && !have_hours { hours = $10 + 0; have_hours = 1 }
    # Reallocated sectors: attribute ID 5
    $1 == "5" && !have_realloc { realloc = $10 + 0; have_realloc = 1 }
    END { printf "%d %d %d\n", temp, hours, realloc }
  '
}
# Enumerate MD RAID arrays listed in /proc/mdstat.
# Outputs: the first word of every line starting with "md" followed by a
#          digit (e.g., md0, md1), one per line
# Returns: non-zero without output when /proc/mdstat is not a regular file
get_mdraid_arrays() {
  if [ ! -f /proc/mdstat ]; then
    return 1
  fi
  awk '$0 ~ /^md[0-9]/ { print $1 }' /proc/mdstat 2>/dev/null
}
# Get MD RAID array status
# Args: $1 - array name (e.g., md0)
#       $2 - optional path of the mdstat file (default: /proc/mdstat)
# Outputs: "state total_devices active_devices sync_percent"
#   state         the "State :" value from `mdadm --detail` with all blanks
#                 removed, so "clean, degraded" becomes the single token
#                 "clean,degraded" and the following fields keep their
#                 positions; "unknown" when mdadm gives no state
#   sync_percent  progress from the array's recovery/resync line in mdstat,
#                 or 100 when no such percentage is present
get_mdraid_status() {
  local array="$1"
  local mdstat="${2:-/proc/mdstat}"
  local detail summary sync_pct

  detail=$(mdadm --detail "/dev/$array" 2>/dev/null)
  summary=$(awk '
    /State :/ && state == "" {
      s = $0
      sub(/^[^:]*:/, "", s)
      gsub(/[ \t]+/, "", s)
      state = s
    }
    /Raid Devices :/ { total = $NF }
    /Active Devices :/ { active = $NF }
    END {
      if (state == "") state = "unknown"
      printf "%s %d %d", state, total, active
    }
  ' <<<"$detail")

  # Compare the array name exactly ("md1" must not match "md10") and use the
  # two-argument match() so the program also runs on mawk and busybox awk.
  sync_pct=$(awk -v arr="$array" '
    $1 == arr { found = 1; next }
    found && /^md/ { exit }
    found && /recovery|resync/ {
      if (match($0, /[0-9]+\.[0-9]+%/)) print substr($0, RSTART, RLENGTH - 1)
      exit
    }
  ' "$mdstat" 2>/dev/null)

  echo "$summary ${sync_pct:-100}"
}
# Report LVM thin pool usage via `lvs`.
# Only report rows that have exactly four columns and a data percentage
# greater than zero are passed through.
# Outputs: lines of "vg_name lv_name data_percent metadata_percent"
# Returns: non-zero without output when lvs is not available
get_lvm_thin_pools() {
  if ! command -v lvs >/dev/null 2>&1; then
    return 1
  fi
  local -a report_args=(
    --noheadings --nosuffix
    -o vg_name,lv_name,data_percent,metadata_percent
    --select 'pool_lv=""' --select 'lv_attr=~[t]'
  )
  lvs "${report_args[@]}" 2>/dev/null | awk '
    NF != 4 { next }
    ($3 + 0) <= 0 { next }
    { print $1, $2, $3, $4 }
  '
}
# Get BTRFS device stats error counts
# Args: $1 - mountpoint
# Outputs: lines of "device error_type count", one per counter line of
#          `btrfs device stats`; lines that do not have the expected
#          "[device].counter  N" shape are ignored
# Returns: non-zero without output when btrfs is not available
get_btrfs_health() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return
  btrfs device stats "$mountpoint" 2>/dev/null | awk '
    # Format: [/dev/sda1].write_io_errs   0
    # Anchor on the closing "]." that precedes the counter name instead of
    # splitting on every dot, so device paths containing "." stay intact.
    /^\[/ && match($0, /\]\.[a-z_]+[ \t]+[0-9]+[ \t]*$/) {
      device = substr($0, 2, RSTART - 2)
      split(substr($0, RSTART + 2), rest, /[ \t]+/)
      if (device != "") print device, rest[1], rest[2]
    }
  '
}
# Summarise the health of every ZFS pool.
# For each pool from `zpool list`, the "errors:" line of `zpool status` is
# turned into a flag: 0 when its second word is "No", otherwise 1.
# Outputs: lines of "pool_name state error_flag" (flag defaults to 0 when no
#          "errors:" line is found)
# Returns: non-zero without output when zpool is not available
get_zpool_health() {
  if ! command -v zpool >/dev/null 2>&1; then
    return 1
  fi
  local name health err_flag
  while read -r name health; do
    err_flag=$(zpool status "$name" 2>/dev/null \
      | awk '/errors:/ { print (($2 == "No") ? 0 : 1) }')
    echo "$name $health ${err_flag:-0}"
  done < <(zpool list -H -o name,health 2>/dev/null)
}
# Fetch ZFS pool capacity figures straight from `zpool list -Hp`.
# Outputs: one line per pool with the columns
#          name size alloc free frag cap dedup, exactly as zpool prints them
# Returns: non-zero without output when zpool is not available
get_zpool_capacity() {
  if ! command -v zpool >/dev/null 2>&1; then
    return 1
  fi
  local columns="name,size,alloc,free,frag,cap,dedup"
  zpool list -Hp -o "$columns" 2>/dev/null
}
# Get ZFS pool scrub status
# Args: $1 - pool name
# Outputs: "state seconds_since_scrub errors_found progress_percent"
#   state: 0=no scrub, 1=completed, 2=in progress
#   seconds_since_scrub is 0 when the completion date is missing or cannot
#   be parsed by `date -d`; progress_percent is 100 for a completed scrub.
get_zpool_scrub() {
  local pool="$1"
  local status_output scan_line
  status_output=$(zpool status "$pool" 2>/dev/null)
  scan_line=$(grep "scan:" <<<"$status_output" | head -1)

  if [[ "$scan_line" == *"in progress"* ]]; then
    local pct
    pct=$(grep -oE '[0-9]+\.[0-9]+% done' <<<"$status_output" | grep -oE '[0-9.]+' | head -1)
    echo "2 0 0 ${pct:-0}"
  elif [[ "$scan_line" == *"scrub repaired"* ]]; then
    local scrub_date scrub_ts errors
    local seconds_since=0
    # Timestamp at the end of the scan line, e.g. "Sun May 12 00:34:24 2024"
    scrub_date=$(grep -oE '[A-Z][a-z]{2} [A-Z][a-z]{2} +[0-9]+ [0-9:]+ [0-9]+' <<<"$scan_line" | head -1)
    # Compute an age only when the date really parses; falling back to
    # timestamp 0 would report the whole Unix epoch as the scrub age.
    if [[ -n "$scrub_date" ]] && scrub_ts=$(date -d "$scrub_date" +%s 2>/dev/null); then
      seconds_since=$(( $(date +%s) - scrub_ts ))
      if (( seconds_since < 0 )); then
        seconds_since=0
      fi
    fi
    errors=$(grep -oE '[0-9]+ errors' <<<"$scan_line" | awk '{ print $1; exit }')
    echo "1 ${seconds_since} ${errors:-0} 100"
  else
    echo "0 0 0 0"
  fi
}
# Fetch per-dataset figures straight from `zfs list -Hp`.
# Outputs: one line per dataset with the columns
#          name used avail refer compressratio type, exactly as zfs prints them
# Returns: non-zero without output when zfs is not available
get_zfs_datasets() {
  if ! command -v zfs >/dev/null 2>&1; then
    return 1
  fi
  local -a query=(list -Hp -o name,used,avail,refer,compressratio,type)
  zfs "${query[@]}" 2>/dev/null
}
# Aggregate ZFS snapshots per dataset: how many exist and how old the oldest
# and newest ones are.
# The dataset is the part of each snapshot name before "@"; ages are the
# current epoch time minus the snapshot creation time (0 when the creation
# time is not positive).
# Outputs: lines of "dataset count oldest_age_seconds newest_age_seconds",
#          in awk array iteration order (unspecified)
# Returns: non-zero without output when zfs is not available
get_zfs_snapshot_stats() {
  if ! command -v zfs >/dev/null 2>&1; then
    return 1
  fi
  local epoch_now
  epoch_now=$(date +%s)
  awk -v now="$epoch_now" '
    {
      at = index($1, "@")
      ds = (at > 0) ? substr($1, 1, at - 1) : $1
      created = $2
      if (ds in count) {
        if (created < oldest[ds]) oldest[ds] = created
        if (created > newest[ds]) newest[ds] = created
      } else {
        oldest[ds] = created
        newest[ds] = created
      }
      count[ds]++
    }
    END {
      for (ds in count) {
        age_first = 0
        age_last = 0
        if (oldest[ds] > 0) age_first = now - oldest[ds]
        if (newest[ds] > 0) age_last = now - newest[ds]
        print ds, count[ds], age_first, age_last
      }
    }
  ' < <(zfs list -t snapshot -Hp -o name,creation 2>/dev/null)
}
# Get ZFS per-vdev error counts from zpool status
# Args: $1 - pool name
# Outputs: lines of "vdev read_errors write_errors checksum_errors"
# Rows are taken from the config table (between the NAME/STATE/READ/WRITE/
# CKSUM header and the next blank line). The pool root row and grouping rows
# (mirror, raidz, draid, replacing, log, cache, spare) are skipped, as are
# rows whose three counters are not plain integers.
get_zpool_vdev_errors() {
  local pool="$1"
  zpool status "$pool" 2>/dev/null | awk -v pool="$pool" '
    /NAME.*STATE.*READ.*WRITE.*CKSUM/ { in_table = 1; next }
    in_table && NF == 0 { exit }
    in_table && NF >= 5 {
      name = $1
      # The first table row is the pool itself, not a vdev.
      if (!root_checked) {
        root_checked = 1
        if (name == pool) next
      }
      if (name ~ /^(mirror|raidz|draid|replacing|log|cache|spare)/) next
      # Counters sit in columns 3-5. Reading them from the end of the line
      # would drop rows with a trailing note such as "too many errors".
      if ($3 ~ /^[0-9]+$/ && $4 ~ /^[0-9]+$/ && $5 ~ /^[0-9]+$/) {
        print name, $3, $4, $5
      }
    }
  '
}
# Read ZFS ARC counters from /proc/spl/kstat/zfs/arcstats (Linux only).
# For each of the rows named "hits", "misses" and "size" the third column is
# taken; a missing row is reported as 0.
# Outputs: "hits misses size_bytes"
# Returns: non-zero without output when the arcstats file does not exist
get_zfs_arc_stats() {
  local arcstats="/proc/spl/kstat/zfs/arcstats"
  if [ ! -f "$arcstats" ]; then
    return 1
  fi
  local key value
  local -a fields=()
  for key in hits misses size; do
    value=$(awk -v k="$key" '$1 == k { print $3 }' "$arcstats" 2>/dev/null)
    fields+=("${value:-0}")
  done
  echo "${fields[0]} ${fields[1]} ${fields[2]}"
}
# Get BTRFS filesystem usage (data/metadata/system)
# Args: $1 - mountpoint
# Outputs: lines of "type used_bytes total_bytes", type being data, metadata
#          or system, in that order, for every type present in the report
# Returns: non-zero without output when btrfs is not available
# Parses the block-group header lines of `btrfs filesystem usage -b`, e.g.
#   Data,single: Size:107374182400, Used:64424509440 (60.00%)
# Several headers of one type (different profiles) are added together so
# each type is printed once.
get_btrfs_usage() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return
  btrfs filesystem usage -b "$mountpoint" 2>/dev/null | awk '
    # Return the integer that directly follows key in line, or 0 if absent.
    function bytes_after(line, key,    pos, rest) {
      pos = index(line, key)
      if (pos == 0) return 0
      rest = substr(line, pos + length(key))
      return match(rest, /^[0-9]+/) ? substr(rest, 1, RLENGTH) + 0 : 0
    }
    /^(Data|Metadata|System),/ {
      kind = tolower(substr($0, 1, index($0, ",") - 1))
      seen[kind] = 1
      total[kind] += bytes_after($0, "Size:")
      used[kind] += bytes_after($0, "Used:")
    }
    END {
      n = split("data metadata system", order, " ")
      for (i = 1; i <= n; i++) {
        kind = order[i]
        # %.0f keeps large byte counts out of exponent notation.
        if (kind in seen) printf "%s %.0f %.0f\n", kind, used[kind], total[kind]
      }
    }
  '
}
# Get BTRFS scrub status
# Args: $1 - mountpoint
# Outputs: "state seconds_since_scrub errors_found"
#   state: 0=never, 1=completed, 2=in progress
#   seconds_since_scrub is measured from the "Scrub started:" timestamp and
#   is 0 when that line is missing or `date -d` cannot parse it.
#   errors_found is the sum of the name=count pairs on the "Error summary:"
#   line; if there are none, the count from an "errors found:" or
#   "with N errors" line is used; otherwise 0.
# Returns: non-zero without output when btrfs is not available
get_btrfs_scrub() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return
  local output
  output=$(btrfs scrub status "$mountpoint" 2>/dev/null)

  if grep -q "running" <<<"$output"; then
    echo "2 0 0"
  elif grep -q "finished" <<<"$output"; then
    local started scrub_ts errors
    local seconds_since=0
    started=$(sed -n 's/.*Scrub started:[[:space:]]*//p' <<<"$output" | head -1)
    # Compute an age only when the date really parses; falling back to
    # timestamp 0 would report the whole Unix epoch as the scrub age.
    if [[ -n "$started" ]] && scrub_ts=$(date -d "$started" +%s 2>/dev/null); then
      seconds_since=$(( $(date +%s) - scrub_ts ))
      if (( seconds_since < 0 )); then
        seconds_since=0
      fi
    fi
    errors=$(awk '
      /Error summary:/ {
        for (i = 1; i <= NF; i++) {
          if (split($i, kv, "=") == 2 && kv[2] ~ /^[0-9]+$/) {
            sum += kv[2]
            have_sum = 1
          }
        }
      }
      /errors found:/ && legacy == "" { legacy = $NF }
      legacy == "" && match($0, /with [0-9]+ errors/) {
        legacy = substr($0, RSTART + 5, RLENGTH - 12)
      }
      END {
        if (have_sum) print sum
        else if (legacy != "") print legacy
      }
    ' <<<"$output")
    # "no errors found" and anything non-numeric both end up as 0.
    if [[ ! "$errors" =~ ^[0-9]+$ ]]; then
      errors=0
    fi
    echo "1 ${seconds_since} ${errors}"
  else
    echo "0 0 0"
  fi
}
# Count journal lines from the last 24 hours that look like filesystem or
# block I/O errors (case-insensitive match on EXT4-fs error, XFS...error,
# BTRFS...error, I/O error, Buffer I/O error).
# Outputs: the number of matching lines; "0" when journalctl is unavailable
get_fs_errors_from_journal() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi
  local pattern="(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|Buffer I/O error)"
  local matches
  matches=$(journalctl --since "24 hours ago" --no-pager -q 2>/dev/null \
    | grep -ciE "$pattern" 2>/dev/null)
  echo "${matches:-0}"
}
# ============================================================================
# METRIC GENERATION
# ============================================================================
# Generate all Prometheus metrics
# Returns: Prometheus text format metrics on stdout
generate_metrics() {
local script_start
script_start=$(date +%s)
cat <<EOF
# HELP storage_health_up Storage health exporter status
# TYPE storage_health_up gauge
storage_health_up 1
# HELP storage_health_exporter_info Storage health exporter information
# TYPE storage_health_exporter_info gauge
storage_health_exporter_info{version="1.0"} 1
EOF
echo ""
# ========================================================================
# Filesystem Info and Inode Usage
# ========================================================================
cat <<EOF
# HELP storage_health_fs_info Filesystem information (always 1)
# TYPE storage_health_fs_info gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
echo "storage_health_fs_info{device=\"$device\",mountpoint=\"$mountpoint\",fstype=\"$fstype\"} 1"
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_mount_readonly Mount read-only status (1=ro, 0=rw)
# TYPE storage_health_mount_readonly gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
local ro
ro=$(get_mount_readonly "$mountpoint")
echo "storage_health_mount_readonly{device=\"$device\",mountpoint=\"$mountpoint\"} $ro"
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_mount_stale Mount stale/hung status (1=stale, 0=ok)
# TYPE storage_health_mount_stale gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
local stale
stale=$(check_stale_mount "$mountpoint")
echo "storage_health_mount_stale{mountpoint=\"$mountpoint\"} $stale"
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_inode_total Total inodes per mountpoint
# TYPE storage_health_inode_total gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
local inode_info total used
inode_info=$(get_inode_usage "$mountpoint")
total=$(echo "$inode_info" | awk '{print $1}')
echo "storage_health_inode_total{mountpoint=\"$mountpoint\"} ${total:-0}"
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_inode_used Used inodes per mountpoint
# TYPE storage_health_inode_used gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
local inode_info used
inode_info=$(get_inode_usage "$mountpoint")
used=$(echo "$inode_info" | awk '{print $2}')
echo "storage_health_inode_used{mountpoint=\"$mountpoint\"} ${used:-0}"
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_inode_usage_percent Inode usage percentage per mountpoint
# TYPE storage_health_inode_usage_percent gauge
EOF
while read -r device mountpoint fstype; do
[ -z "$device" ] && continue
local inode_info total used pct
inode_info=$(get_inode_usage "$mountpoint")
total=$(echo "$inode_info" | awk '{print $1}')
used=$(echo "$inode_info" | awk '{print $2}')
total=${total:-0}
used=${used:-0}
if [ "$total" -gt 0 ] 2>/dev/null; then
pct=$(awk "BEGIN {printf \"%.2f\", ($used / $total) * 100}" 2>/dev/null || echo "0")
else
pct="0"
fi
echo "storage_health_inode_usage_percent{mountpoint=\"$mountpoint\"} $pct"
done < <(get_mounted_filesystems)
echo ""
# ========================================================================
# SMART Disk Health (optional)
# ========================================================================
if command -v smartctl >/dev/null 2>&1; then
cat <<EOF
# HELP storage_health_smart_healthy SMART health status (1=passed, 0=failed)
# TYPE storage_health_smart_healthy gauge
EOF
while read -r name dtype; do
[ "$dtype" = "disk" ] || continue
local device="/dev/$name"
local healthy
healthy=$(get_smart_health "$device")
echo "storage_health_smart_healthy{device=\"$device\"} $healthy"
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
echo ""
cat <<EOF
# HELP storage_health_smart_temperature_celsius Drive temperature in Celsius
# TYPE storage_health_smart_temperature_celsius gauge
EOF
while read -r name dtype; do
[ "$dtype" = "disk" ] || continue
local device="/dev/$name"
local attrs temp
attrs=$(get_smart_attributes "$device")
temp=$(echo "$attrs" | awk '{print $1}')
echo "storage_health_smart_temperature_celsius{device=\"$device\"} ${temp:-0}"
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
echo ""
cat <<EOF
# HELP storage_health_smart_power_on_hours Drive power on hours
# TYPE storage_health_smart_power_on_hours gauge
EOF
while read -r name dtype; do
[ "$dtype" = "disk" ] || continue
local device="/dev/$name"
local attrs hours
attrs=$(get_smart_attributes "$device")
hours=$(echo "$attrs" | awk '{print $2}')
echo "storage_health_smart_power_on_hours{device=\"$device\"} ${hours:-0}"
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
echo ""
cat <<EOF
# HELP storage_health_smart_reallocated_sectors Reallocated sector count
# TYPE storage_health_smart_reallocated_sectors gauge
EOF
while read -r name dtype; do
[ "$dtype" = "disk" ] || continue
local device="/dev/$name"
local attrs realloc
attrs=$(get_smart_attributes "$device")
realloc=$(echo "$attrs" | awk '{print $3}')
echo "storage_health_smart_reallocated_sectors{device=\"$device\"} ${realloc:-0}"
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
echo ""
fi
# ========================================================================
# MD RAID (optional)
# ========================================================================
if [ -f /proc/mdstat ] && command -v mdadm >/dev/null 2>&1; then
local arrays
arrays=$(get_mdraid_arrays)
if [ -n "$arrays" ]; then
cat <<EOF
# HELP storage_health_mdraid_healthy MD RAID array health (1=clean, 0=degraded)
# TYPE storage_health_mdraid_healthy gauge
EOF
for array in $arrays; do
local status state
status=$(get_mdraid_status "$array")
state=$(echo "$status" | awk '{print $1}')
if echo "$state" | grep -qi "clean\|active"; then
echo "storage_health_mdraid_healthy{array=\"$array\"} 1"
else
echo "storage_health_mdraid_healthy{array=\"$array\"} 0"
fi
done
echo ""
cat <<EOF
# HELP storage_health_mdraid_degraded MD RAID degraded status (1=degraded, 0=ok)
# TYPE storage_health_mdraid_degraded gauge
EOF
for array in $arrays; do
local status state
status=$(get_mdraid_status "$array")
state=$(echo "$status" | awk '{print $1}')
if echo "$state" | grep -qi "degraded"; then
echo "storage_health_mdraid_degraded{array=\"$array\"} 1"
else
echo "storage_health_mdraid_degraded{array=\"$array\"} 0"
fi
done
echo ""
cat <<EOF
# HELP storage_health_mdraid_devices_total Total devices in MD RAID array
# TYPE storage_health_mdraid_devices_total gauge
EOF
for array in $arrays; do
local status total
status=$(get_mdraid_status "$array")
total=$(echo "$status" | awk '{print $2}')
echo "storage_health_mdraid_devices_total{array=\"$array\"} ${total:-0}"
done
echo ""
cat <<EOF
# HELP storage_health_mdraid_devices_active Active devices in MD RAID array
# TYPE storage_health_mdraid_devices_active gauge
EOF
for array in $arrays; do
local status active
status=$(get_mdraid_status "$array")
active=$(echo "$status" | awk '{print $3}')
echo "storage_health_mdraid_devices_active{array=\"$array\"} ${active:-0}"
done
echo ""
cat <<EOF
# HELP storage_health_mdraid_sync_percent MD RAID sync/rebuild percentage
# TYPE storage_health_mdraid_sync_percent gauge
EOF
for array in $arrays; do
local status sync_pct
status=$(get_mdraid_status "$array")
sync_pct=$(echo "$status" | awk '{print $4}')
echo "storage_health_mdraid_sync_percent{array=\"$array\"} ${sync_pct:-100}"
done
echo ""
fi
fi
# ========================================================================
# LVM Thin Pools (optional)
# ========================================================================
if command -v lvs >/dev/null 2>&1; then
local thin_pools
thin_pools=$(get_lvm_thin_pools)
if [ -n "$thin_pools" ]; then
cat <<EOF
# HELP storage_health_lvm_thin_data_percent LVM thin pool data usage percentage
# TYPE storage_health_lvm_thin_data_percent gauge
EOF
echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
echo "storage_health_lvm_thin_data_percent{vg=\"$vg\",pool=\"$pool\"} ${data_pct:-0}"
done
echo ""
cat <<EOF
# HELP storage_health_lvm_thin_metadata_percent LVM thin pool metadata usage percentage
# TYPE storage_health_lvm_thin_metadata_percent gauge
EOF
echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
echo "storage_health_lvm_thin_metadata_percent{vg=\"$vg\",pool=\"$pool\"} ${meta_pct:-0}"
done
echo ""
fi
fi
# ========================================================================
# BTRFS Health (optional)
# ========================================================================
if command -v btrfs >/dev/null 2>&1; then
local has_btrfs=0
while read -r device mountpoint fstype; do
[ "$fstype" = "btrfs" ] && has_btrfs=1
done < <(get_mounted_filesystems)
if [ "$has_btrfs" -eq 1 ]; then
cat <<EOF
# HELP storage_health_btrfs_errors BTRFS device error counts
# TYPE storage_health_btrfs_errors gauge
EOF
while read -r device mountpoint fstype; do
[ "$fstype" = "btrfs" ] || continue
while read -r bdev etype count; do
[ -z "$bdev" ] && continue
echo "storage_health_btrfs_errors{mountpoint=\"$mountpoint\",device=\"$bdev\",type=\"$etype\"} ${count:-0}"
done < <(get_btrfs_health "$mountpoint")
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_btrfs_usage_bytes BTRFS usage by block group type (data/metadata/system)
# TYPE storage_health_btrfs_usage_bytes gauge
EOF
while read -r device mountpoint fstype; do
[ "$fstype" = "btrfs" ] || continue
while read -r btype used total; do
[ -z "$btype" ] && continue
echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"used\"} ${used:-0}"
echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"total\"} ${total:-0}"
done < <(get_btrfs_usage "$mountpoint")
done < <(get_mounted_filesystems)
echo ""
cat <<EOF
# HELP storage_health_btrfs_scrub_status BTRFS scrub state (0=never, 1=completed, 2=in progress)
# TYPE storage_health_btrfs_scrub_status gauge
# HELP storage_health_btrfs_scrub_age_seconds Seconds since last BTRFS scrub completed
# TYPE storage_health_btrfs_scrub_age_seconds gauge
# HELP storage_health_btrfs_scrub_errors Errors found during last BTRFS scrub
# TYPE storage_health_btrfs_scrub_errors gauge
EOF
while read -r device mountpoint fstype; do
[ "$fstype" = "btrfs" ] || continue
local scrub_info state scrub_age scrub_errors
scrub_info=$(get_btrfs_scrub "$mountpoint")
state=$(echo "$scrub_info" | awk '{print $1}')
scrub_age=$(echo "$scrub_info" | awk '{print $2}')
scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
echo "storage_health_btrfs_scrub_status{mountpoint=\"$mountpoint\"} ${state:-0}"
echo "storage_health_btrfs_scrub_age_seconds{mountpoint=\"$mountpoint\"} ${scrub_age:-0}"
echo "storage_health_btrfs_scrub_errors{mountpoint=\"$mountpoint\"} ${scrub_errors:-0}"
done < <(get_mounted_filesystems)
echo ""
fi
fi
# ========================================================================
# ZFS Pool Health (optional)
# ========================================================================
if command -v zpool >/dev/null 2>&1; then
local zfs_pools
zfs_pools=$(get_zpool_health)
if [ -n "$zfs_pools" ]; then
cat <<EOF
# HELP storage_health_zfs_pool_healthy ZFS pool health (1=ONLINE, 0=other)
# TYPE storage_health_zfs_pool_healthy gauge
EOF
echo "$zfs_pools" | while read -r pool state errors; do
if [ "$state" = "ONLINE" ]; then
echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 1"
else
echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 0"
fi
done
echo ""
cat <<EOF
# HELP storage_health_zfs_pool_errors ZFS pool error count
# TYPE storage_health_zfs_pool_errors gauge
EOF
echo "$zfs_pools" | while read -r pool state errors; do
echo "storage_health_zfs_pool_errors{pool=\"$pool\"} ${errors:-0}"
done
echo ""
# ZFS pool capacity
local zfs_capacity
zfs_capacity=$(get_zpool_capacity)
if [ -n "$zfs_capacity" ]; then
cat <<EOF
# HELP storage_health_zfs_pool_size_bytes ZFS pool total size in bytes
# TYPE storage_health_zfs_pool_size_bytes gauge
# HELP storage_health_zfs_pool_alloc_bytes ZFS pool allocated bytes
# TYPE storage_health_zfs_pool_alloc_bytes gauge
# HELP storage_health_zfs_pool_free_bytes ZFS pool free bytes
# TYPE storage_health_zfs_pool_free_bytes gauge
# HELP storage_health_zfs_pool_fragmentation_percent ZFS pool fragmentation percentage
# TYPE storage_health_zfs_pool_fragmentation_percent gauge
# HELP storage_health_zfs_pool_capacity_percent ZFS pool capacity percentage
# TYPE storage_health_zfs_pool_capacity_percent gauge
# HELP storage_health_zfs_pool_dedup_ratio ZFS pool deduplication ratio
# TYPE storage_health_zfs_pool_dedup_ratio gauge
EOF
echo "$zfs_capacity" | while read -r pool size alloc free frag cap dedup; do
echo "storage_health_zfs_pool_size_bytes{pool=\"$pool\"} ${size:-0}"
echo "storage_health_zfs_pool_alloc_bytes{pool=\"$pool\"} ${alloc:-0}"
echo "storage_health_zfs_pool_free_bytes{pool=\"$pool\"} ${free:-0}"
echo "storage_health_zfs_pool_fragmentation_percent{pool=\"$pool\"} ${frag:-0}"
echo "storage_health_zfs_pool_capacity_percent{pool=\"$pool\"} ${cap:-0}"
echo "storage_health_zfs_pool_dedup_ratio{pool=\"$pool\"} ${dedup:-1}"
done
echo ""
fi
# ZFS scrub status
cat <<EOF
# HELP storage_health_zfs_scrub_status ZFS scrub state (0=none, 1=completed, 2=in progress)
# TYPE storage_health_zfs_scrub_status gauge
# HELP storage_health_zfs_scrub_age_seconds Seconds since last ZFS scrub completed
# TYPE storage_health_zfs_scrub_age_seconds gauge
# HELP storage_health_zfs_scrub_errors Errors found during last ZFS scrub
# TYPE storage_health_zfs_scrub_errors gauge
# HELP storage_health_zfs_scrub_progress_percent ZFS scrub progress percentage (100 if not running)
# TYPE storage_health_zfs_scrub_progress_percent gauge
EOF
echo "$zfs_pools" | while read -r pool state errors; do
local scrub_info scrub_state scrub_age scrub_errors scrub_pct
scrub_info=$(get_zpool_scrub "$pool")
scrub_state=$(echo "$scrub_info" | awk '{print $1}')
scrub_age=$(echo "$scrub_info" | awk '{print $2}')
scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
scrub_pct=$(echo "$scrub_info" | awk '{print $4}')
echo "storage_health_zfs_scrub_status{pool=\"$pool\"} ${scrub_state:-0}"
echo "storage_health_zfs_scrub_age_seconds{pool=\"$pool\"} ${scrub_age:-0}"
echo "storage_health_zfs_scrub_errors{pool=\"$pool\"} ${scrub_errors:-0}"
echo "storage_health_zfs_scrub_progress_percent{pool=\"$pool\"} ${scrub_pct:-0}"
done
echo ""
# ZFS per-vdev errors
cat <<EOF
# HELP storage_health_zfs_vdev_read_errors ZFS vdev read error count
# TYPE storage_health_zfs_vdev_read_errors gauge
# HELP storage_health_zfs_vdev_write_errors ZFS vdev write error count
# TYPE storage_health_zfs_vdev_write_errors gauge
# HELP storage_health_zfs_vdev_checksum_errors ZFS vdev checksum error count
# TYPE storage_health_zfs_vdev_checksum_errors gauge
EOF
echo "$zfs_pools" | while read -r pool state errors; do
while read -r vdev read_err write_err cksum_err; do
[ -z "$vdev" ] && continue
echo "storage_health_zfs_vdev_read_errors{pool=\"$pool\",vdev=\"$vdev\"} ${read_err:-0}"
echo "storage_health_zfs_vdev_write_errors{pool=\"$pool\",vdev=\"$vdev\"} ${write_err:-0}"
echo "storage_health_zfs_vdev_checksum_errors{pool=\"$pool\",vdev=\"$vdev\"} ${cksum_err:-0}"
done < <(get_zpool_vdev_errors "$pool")
done
echo ""
fi
# ZFS datasets
local zfs_datasets
zfs_datasets=$(get_zfs_datasets)
if [ -n "$zfs_datasets" ]; then
cat <<EOF
# HELP storage_health_zfs_dataset_used_bytes ZFS dataset used bytes
# TYPE storage_health_zfs_dataset_used_bytes gauge
# HELP storage_health_zfs_dataset_avail_bytes ZFS dataset available bytes
# TYPE storage_health_zfs_dataset_avail_bytes gauge
# HELP storage_health_zfs_dataset_refer_bytes ZFS dataset referenced bytes
# TYPE storage_health_zfs_dataset_refer_bytes gauge
# HELP storage_health_zfs_dataset_compressratio ZFS dataset compression ratio
# TYPE storage_health_zfs_dataset_compressratio gauge
EOF
echo "$zfs_datasets" | while read -r dataset used avail refer ratio dtype; do
echo "storage_health_zfs_dataset_used_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${used:-0}"
echo "storage_health_zfs_dataset_avail_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${avail:-0}"
echo "storage_health_zfs_dataset_refer_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${refer:-0}"
echo "storage_health_zfs_dataset_compressratio{dataset=\"$dataset\",type=\"$dtype\"} ${ratio:-1}"
done
echo ""
fi
# ZFS snapshot stats
local zfs_snaps
zfs_snaps=$(get_zfs_snapshot_stats)
if [ -n "$zfs_snaps" ]; then
cat <<EOF
# HELP storage_health_zfs_snapshot_count ZFS snapshot count per dataset
# TYPE storage_health_zfs_snapshot_count gauge
# HELP storage_health_zfs_snapshot_oldest_age_seconds Age of oldest ZFS snapshot in seconds
# TYPE storage_health_zfs_snapshot_oldest_age_seconds gauge
# HELP storage_health_zfs_snapshot_newest_age_seconds Age of newest ZFS snapshot in seconds
# TYPE storage_health_zfs_snapshot_newest_age_seconds gauge
EOF
echo "$zfs_snaps" | while read -r dataset count oldest newest; do
echo "storage_health_zfs_snapshot_count{dataset=\"$dataset\"} ${count:-0}"
echo "storage_health_zfs_snapshot_oldest_age_seconds{dataset=\"$dataset\"} ${oldest:-0}"
echo "storage_health_zfs_snapshot_newest_age_seconds{dataset=\"$dataset\"} ${newest:-0}"
done
echo ""
fi
# ZFS ARC stats (Linux only)
local arc_stats
arc_stats=$(get_zfs_arc_stats)
if [ -n "$arc_stats" ]; then
local arc_hits arc_misses arc_size
arc_hits=$(echo "$arc_stats" | awk '{print $1}')
arc_misses=$(echo "$arc_stats" | awk '{print $2}')
arc_size=$(echo "$arc_stats" | awk '{print $3}')
cat <<EOF
# HELP storage_health_zfs_arc_hits_total ZFS ARC cache hits (counter)
# TYPE storage_health_zfs_arc_hits_total counter
storage_health_zfs_arc_hits_total ${arc_hits:-0}
# HELP storage_health_zfs_arc_misses_total ZFS ARC cache misses (counter)
# TYPE storage_health_zfs_arc_misses_total counter
storage_health_zfs_arc_misses_total ${arc_misses:-0}
# HELP storage_health_zfs_arc_size_bytes ZFS ARC cache size in bytes
# TYPE storage_health_zfs_arc_size_bytes gauge
storage_health_zfs_arc_size_bytes ${arc_size:-0}
EOF
echo ""
fi
fi
# ========================================================================
# Journal Filesystem Errors
# ========================================================================
local fs_errors
fs_errors=$(get_fs_errors_from_journal)
cat <<EOF
# HELP storage_health_journal_fs_errors_24h Filesystem error messages in journal (24h)
# TYPE storage_health_journal_fs_errors_24h gauge
storage_health_journal_fs_errors_24h ${fs_errors:-0}
EOF
echo ""
# ========================================================================
# Exporter Runtime
# ========================================================================
local script_end script_duration
script_end=$(date +%s)
script_duration=$((script_end - script_start))
cat <<EOF
# HELP storage_health_exporter_duration_seconds Time to generate all metrics
# TYPE storage_health_exporter_duration_seconds gauge
storage_health_exporter_duration_seconds $script_duration
# HELP storage_health_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE storage_health_exporter_last_run_timestamp gauge
storage_health_exporter_last_run_timestamp $script_end
EOF
echo ""
}
# ============================================================================
# HTTP SERVER MODE
# ============================================================================
# Run a simple HTTP server using netcat, handling one connection at a time.
# Serves metrics on the /metrics endpoint and an HTML landing page for any
# other request.
#
# netcat prints the client's request on its stdout, so that stream is routed
# back to the response generator through a FIFO. A plain
# "{ read; ...; } | nc" pipeline would read the request line from the
# script's own stdin instead of from the client, and /metrics would never
# be matched.
#
# Globals:   HTTP_PORT (read)
# Outputs:   HTTP responses to connected clients; status/errors on stderr
# Exits:     1 if nc is missing or the request FIFO cannot be created;
#            otherwise loops until the script is killed
run_http_server() {
    echo "Starting storage health exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # Private directory holding the FIFO that carries requests from nc back
    # to the response generator.
    local fifo_dir request_fifo request
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Failed to create temp directory for HTTP mode" >&2
        exit 1
    fi
    request_fifo="${fifo_dir}/request"
    if ! mkfifo "$request_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Failed to create request FIFO for HTTP mode" >&2
        exit 1
    fi

    # Remove the FIFO when the script exits. The path is expanded now (and
    # shell-quoted) because the local variable may be out of scope when the
    # trap fires.
    # NOTE(review): this replaces any EXIT trap installed elsewhere in the
    # script - none is visible from this function; confirm.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    # Infinite loop accepting HTTP requests
    while true; do
        {
            # fd 3 is the FIFO fed by nc's stdout, i.e. the client's request.
            # It stays open for the whole group so nc can keep writing the
            # remaining request headers without hitting a closed pipe.
            read -r request <&3

            # Check if request is for /metrics endpoint
            if [[ "$request" =~ ^GET\ /metrics ]]; then
                echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r"
                generate_metrics
            else # Serve HTML landing page for other requests
                echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r"
                cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Storage Health Exporter v1.0</title></head>
<body>
<h1>Storage Health Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Filesystem info, mount status, inode usage</li>
<li>SMART disk health (requires smartctl)</li>
<li>MD RAID array status (requires mdadm)</li>
<li>LVM thin pool usage (requires lvs)</li>
<li>BTRFS device error stats (requires btrfs)</li>
<li>ZFS pool health (requires zpool)</li>
<li>Journal filesystem error count</li>
</ul>
</body>
</html>
EOF
            fi
        } 3< "$request_fifo" | nc -l -p "$HTTP_PORT" -q 1 > "$request_fifo" 2>/dev/null
    done
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Main entry point - routes to the appropriate output mode.
#
# Globals:   HTTP_MODE (read), OUTPUT_FILE (read)
#            NOTE(review): presumably both are set by parse_args - confirm.
# Arguments: all script arguments, forwarded to parse_args
# Outputs:   metrics on stdout (default mode) or written to OUTPUT_FILE
#            (textfile mode); status and error messages on stderr
# Exits:     1 if the textfile cannot be generated, validated or moved
#            into place
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        # Run HTTP server (blocks until killed)
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        # Textfile collector mode: write atomically using temp file
        local output_dir
        output_dir="$(dirname -- "$OUTPUT_FILE")"
        if ! mkdir -p -- "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        # Create temp file in SAME directory for atomic rename (same filesystem)
        local temp_file
        if ! temp_file=$(mktemp "${output_dir}/.storage_health_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temp file in $output_dir" >&2
            exit 1
        fi

        # Generate metrics to temp file
        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: a near-empty file means generation went wrong, so the
        # previously published file is left untouched.
        local file_lines
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        if [ "$file_lines" -lt 10 ]; then
            rm -f -- "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # Set permissions before move
        chmod 644 "$temp_file"

        # Atomic rename - no gap where file is missing. A failed rename must
        # not be reported as success, and must not leave the temp file behind.
        if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi
        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}
# Script entry point: pass every command-line argument through to main(),
# which selects the output mode (HTTP server, textfile, or stdout).
main "$@"