a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
1179 lines
43 KiB
Bash
1179 lines
43 KiB
Bash
#!/bin/bash
|
|
################################################################################
|
|
# Script Name: storage-health-exporter.sh
|
|
# Version: 1.0
|
|
# Description: Prometheus exporter for storage health metrics covering all
|
|
# common Linux filesystems. Exports inode usage, SMART disk health,
|
|
# mdadm RAID status, LVM thin pool usage, btrfs/zfs health,
|
|
# stale mount detection, and filesystem error counts from journal.
|
|
#
|
|
# Author: Phil Connor
|
|
# Contact: contact@mylinux.work
|
|
# Website: https://mylinux.work
|
|
# License: MIT
|
|
#
|
|
# Prerequisites:
|
|
# - Standard Unix tools (df, stat, awk, grep)
|
|
# - netcat (nc) for HTTP mode
|
|
# - Optional: smartctl (SMART), mdadm (RAID), lvs (LVM),
|
|
# btrfs (btrfs stats), zpool (ZFS), journalctl (fs errors)
|
|
# Each section is skipped gracefully if tools are missing.
|
|
#
|
|
# Usage:
|
|
# # Output to stdout
|
|
# ./storage-health-exporter.sh
|
|
#
|
|
# # HTTP server mode
|
|
# ./storage-health-exporter.sh --http -p 9197
|
|
#
|
|
# # Textfile collector mode
|
|
# ./storage-health-exporter.sh --textfile
|
|
#
|
|
# Metrics Exported:
|
|
# Core Status:
|
|
# - storage_health_up - Exporter status (1=up, 0=down)
|
|
# - storage_health_exporter_info{version} - Exporter version
|
|
#
|
|
# Filesystem Info:
|
|
# - storage_health_fs_info{device,mountpoint,fstype} - Filesystem info
|
|
# - storage_health_mount_readonly{device,mountpoint} - Read-only status
|
|
# - storage_health_mount_stale{mountpoint} - Stale/hung mount detection
|
|
#
|
|
# Inodes:
|
|
# - storage_health_inode_total{mountpoint} - Total inodes
|
|
# - storage_health_inode_used{mountpoint} - Used inodes
|
|
# - storage_health_inode_usage_percent{mountpoint} - Inode usage percentage
|
|
#
|
|
# SMART (if smartctl available):
|
|
# - storage_health_smart_healthy{device} - 1=passed, 0=failed
|
|
# - storage_health_smart_temperature_celsius{device} - Drive temperature
|
|
# - storage_health_smart_power_on_hours{device} - Power on hours
|
|
# - storage_health_smart_reallocated_sectors{device} - Reallocated sectors
|
|
#
|
|
# MD RAID (if /proc/mdstat exists):
|
|
# - storage_health_mdraid_healthy{array} - 1=clean, 0=degraded
|
|
# - storage_health_mdraid_degraded{array} - 1 if degraded
|
|
# - storage_health_mdraid_devices_total{array} - Total devices
|
|
# - storage_health_mdraid_devices_active{array} - Active devices
|
|
# - storage_health_mdraid_sync_percent{array} - Rebuild percentage
|
|
#
|
|
# LVM Thin Pools (if lvs available):
|
|
# - storage_health_lvm_thin_data_percent{vg,pool} - Data usage
|
|
# - storage_health_lvm_thin_metadata_percent{vg,pool} - Metadata usage
|
|
#
|
|
# BTRFS (if btrfs mounted):
|
|
# - storage_health_btrfs_errors{mountpoint,device,type} - Error counts
|
|
# - storage_health_btrfs_usage_bytes{mountpoint,group,usage} - Block group usage
|
|
# - storage_health_btrfs_scrub_status{mountpoint} - Scrub state
|
|
# - storage_health_btrfs_scrub_age_seconds{mountpoint} - Scrub age
|
|
# - storage_health_btrfs_scrub_errors{mountpoint} - Scrub error count
|
|
#
|
|
# ZFS (if zpool/zfs available):
|
|
# - storage_health_zfs_pool_healthy{pool} - 1=ONLINE, 0=other
|
|
# - storage_health_zfs_pool_errors{pool} - Error count
|
|
# - storage_health_zfs_pool_size_bytes{pool} - Pool total size
|
|
# - storage_health_zfs_pool_alloc_bytes{pool} - Pool allocated bytes
|
|
# - storage_health_zfs_pool_free_bytes{pool} - Pool free bytes
|
|
# - storage_health_zfs_pool_fragmentation_percent{pool} - Fragmentation
|
|
# - storage_health_zfs_pool_capacity_percent{pool} - Capacity used
|
|
# - storage_health_zfs_pool_dedup_ratio{pool} - Dedup ratio
|
|
# - storage_health_zfs_scrub_status{pool} - Scrub state
|
|
# - storage_health_zfs_scrub_age_seconds{pool} - Seconds since scrub
|
|
# - storage_health_zfs_scrub_errors{pool} - Scrub errors found
|
|
# - storage_health_zfs_scrub_progress_percent{pool} - Scrub progress
|
|
# - storage_health_zfs_vdev_read_errors{pool,vdev} - Per-vdev read errors
|
|
# - storage_health_zfs_vdev_write_errors{pool,vdev} - Per-vdev write errors
|
|
# - storage_health_zfs_vdev_checksum_errors{pool,vdev} - Per-vdev cksum errors
|
|
# - storage_health_zfs_dataset_used_bytes{dataset,type} - Dataset usage
|
|
# - storage_health_zfs_dataset_avail_bytes{dataset,type} - Dataset available
|
|
# - storage_health_zfs_dataset_refer_bytes{dataset,type} - Dataset referenced
|
|
# - storage_health_zfs_dataset_compressratio{dataset,type} - Compression ratio
|
|
# - storage_health_zfs_snapshot_count{dataset} - Snapshot count
|
|
# - storage_health_zfs_snapshot_oldest_age_seconds{dataset} - Oldest snap age
|
|
# - storage_health_zfs_snapshot_newest_age_seconds{dataset} - Newest snap age
|
|
# - storage_health_zfs_arc_hits_total - ARC cache hits
|
|
# - storage_health_zfs_arc_misses_total - ARC cache misses
|
|
# - storage_health_zfs_arc_size_bytes - ARC cache size
|
|
#
|
|
# Journal:
|
|
# - storage_health_journal_fs_errors_24h - FS error count from journal
|
|
#
|
|
# Exporter:
|
|
# - storage_health_exporter_duration_seconds - Script execution time
|
|
# - storage_health_exporter_last_run_timestamp - Last run timestamp
|
|
#
|
|
# Configuration:
|
|
# Default HTTP port: 9197
|
|
# Textfile directory: /var/lib/node_exporter
|
|
#
|
|
################################################################################
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION VARIABLES
|
|
# ============================================================================
|
|
|
|
# Directory used to build the --textfile output path.
TEXTFILE_DIR='/var/lib/node_exporter'
# Metrics destination; stays empty until --textfile or -o/--output sets it.
OUTPUT_FILE=''
# Switched to "true" by --http.
HTTP_MODE='false'
# Default HTTP port; replaced by -p/--port.
HTTP_PORT='9197'
|
|
|
|
# ============================================================================
|
|
# HELPER FUNCTIONS
|
|
# ============================================================================
|
|
|
|
# Print the help text to stdout, then terminate the script with status 0.
# The script name ($0) and the current HTTP_PORT value are expanded into
# the text at the time it is printed.
show_usage() {
  local prog="$0"

  cat <<USAGE
Usage: ${prog} [OPTIONS]

Export storage health statistics as Prometheus metrics (v1.0).

MODES:
--textfile Write to node_exporter textfile collector
--http Run HTTP server on port ${HTTP_PORT}

OPTIONS:
-p, --port HTTP port (default: 9197)
-o, --output Output file path

EXAMPLES:
${prog} --textfile # Write to textfile collector
${prog} --http --port 9197 # Run HTTP server
${prog} -o /tmp/storage_health.prom # Write to custom file

SECTIONS (auto-detected, skipped if tools missing):
- Filesystem info and inode usage (always available)
- SMART disk health (requires smartctl)
- MD RAID status (requires /proc/mdstat + mdadm)
- LVM thin pool usage (requires lvs)
- BTRFS device stats (requires btrfs)
- ZFS pool health (requires zpool)
- Journal filesystem errors (requires journalctl)

USAGE
  exit 0
}
|
|
|
|
# Parse command-line options into the global configuration variables.
# Args: "$@" - the script's command-line arguments
# Globals written: OUTPUT_FILE, HTTP_MODE, HTTP_PORT (TEXTFILE_DIR is read)
# Exits with status 1 on an unknown option, on an option that is missing
# its value, or on a port that is not an integer in 1-65535.
# -h/--help hands over to show_usage.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h | --help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/storage_health.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p | --port)
        # Without this check "shift 2" fails and shifts nothing when the
        # value is missing, so the loop would never terminate.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        # 10# forces base 10 so a value such as "0080" is not read as octal.
        if ! [[ "$2" =~ ^[0-9]{1,5}$ ]] || ((10#$2 < 1 || 10#$2 > 65535)); then
          echo "Invalid port: $2" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      -o | --output)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
|
|
|
|
# List mounted filesystems from "df -T", leaving out the header row and
# every row whose type column matches the virtual-filesystem pattern below.
# Returns: one line per filesystem: "device mountpoint fstype"
#          (columns 1, 7 and 2 of the df output)
get_mounted_filesystems() {
  df -T 2>/dev/null | awk '
    NR == 1 { next }
    $2 ~ /^(tmpfs|devtmpfs|overlay|squashfs|efivarfs|fuse\..*|nsfs|cgroup.*)$/ { next }
    { print $1, $7, $2 }
  '
}
|
|
|
|
# Get inode usage for a mountpoint
# Args: $1 - mountpoint
# Returns: "total used"; each value falls back to 0 when df fails or the
#          column is not a plain integer (e.g. "-"), so "0 0" on failure
get_inode_usage() {
  local mountpoint="$1"
  local total used

  # -P keeps each filesystem on a single line, so row 2 is the data row.
  read -r total used < <(
    df -Pi -- "$mountpoint" 2>/dev/null | awk 'NR == 2 { print $2, $3 }'
  )

  # Only plain integers are passed on; anything else becomes 0.
  [[ "$total" =~ ^[0-9]+$ ]] || total=0
  [[ "$used" =~ ^[0-9]+$ ]] || used=0

  echo "$total $used"
}
|
|
|
|
# Report whether a mountpoint is mounted read-only.
# Looks up the first /proc/mounts entry whose second field equals the
# mountpoint and checks its option list for a standalone "ro" option.
# Args: $1 - mountpoint
# Returns: "1" if read-only, "0" otherwise (including when no entry matches)
get_mount_readonly() {
  local mountpoint="$1"
  local mount_opts
  mount_opts=$(awk -v mp="$mountpoint" '$2 == mp { print $4 }' /proc/mounts 2>/dev/null | head -1)

  # Wrapping the list in commas lets one pattern match "ro" only as a whole
  # option, never as part of another option such as "errors=remount-ro".
  case ",${mount_opts}," in
    *,ro,*) echo "1" ;;
    *) echo "0" ;;
  esac
}
|
|
|
|
# Probe a mountpoint with "stat" under a 2-second timeout.
# Args: $1 - mountpoint
# Returns: "0" if the stat call succeeds within the timeout,
#          "1" if it fails or times out
check_stale_mount() {
  local mountpoint="$1"
  local stale=1

  timeout 2 stat -t "$mountpoint" >/dev/null 2>&1 && stale=0

  echo "$stale"
}
|
|
|
|
# Get SMART health status for a device
# Args: $1 - device path (e.g., /dev/sda)
# Returns: "1" if smartctl -H reports
#          "self-assessment test result: PASSED" or "SMART Health Status: OK",
#          "0" otherwise (failed, unreadable, or no output)
get_smart_health() {
  local device="$1"
  local report
  report=$(smartctl -H "$device" 2>/dev/null)

  # Match only the verdict lines. A bare case-insensitive "OK" would also
  # match unrelated text elsewhere in the output and hide a failed disk.
  if grep -Eq '(self-assessment test result|SMART Health Status):[[:space:]]*(PASSED|OK)' <<<"$report"; then
    echo "1"
  else
    echo "0"
  fi
}
|
|
|
|
# Get SMART attributes for a device
# Reads the tenth column (raw value) of the "smartctl -A" attribute table.
# Args: $1 - device path
# Returns: "temperature power_on_hours reallocated_sectors"
#          Each value is the leading number of the raw column (so
#          "21345h+12m" yields 21345) and is 0 when the attribute is absent.
get_smart_attributes() {
  local device="$1"

  smartctl -A "$device" 2>/dev/null | awk '
    # Temperature: first row with attribute ID 194 or 190
    ($1 == "194" || $1 == "190") && !seen_temp { temp = $10 + 0; seen_temp = 1 }
    # Power on hours: attribute ID 9
    $1 == "9" && !seen_hours { hours = $10 + 0; seen_hours = 1 }
    # Reallocated sectors: attribute ID 5
    $1 == "5" && !seen_realloc { realloc = $10 + 0; seen_realloc = 1 }
    # Runs even on empty input, so "0 0 0" is printed when smartctl fails.
    END { printf "%.0f %.0f %.0f\n", temp, hours, realloc }
  '
}
|
|
|
|
# List MD RAID arrays found in /proc/mdstat.
# Returns: first field of every line that starts with "md" followed by a
#          digit (e.g. md0, md1), one per line; returns non-zero with no
#          output when /proc/mdstat does not exist
get_mdraid_arrays() {
  if [ ! -f /proc/mdstat ]; then
    return 1
  fi

  awk '$0 ~ /^md[0-9]/ { print $1 }' /proc/mdstat 2>/dev/null
}
|
|
|
|
# Get MD RAID array status
# Args: $1 - array name (e.g., md0)
# Returns: "state total_devices active_devices sync_percent"
#          state is the "State :" value from "mdadm --detail" with all
#          whitespace removed (e.g. "clean,degraded"), so it is always one
#          token and the remaining fields keep their positions.
#          Defaults: state "unknown", counts 0, sync_percent 100.
get_mdraid_status() {
  local array="$1"
  local detail
  detail=$(mdadm --detail "/dev/$array" 2>/dev/null)

  local state total active sync_pct
  state=$(awk -F: '/State :/ { gsub(/[ \t]+/, "", $2); print $2; exit }' <<<"$detail")
  total=$(awk '/Raid Devices :/ { print $NF; exit }' <<<"$detail")
  active=$(awk '/Active Devices :/ { print $NF; exit }' <<<"$detail")

  # Check for rebuild/resync percentage in this array's /proc/mdstat stanza.
  # The array is matched by exact name (md1 must not match md10), and
  # match() is used in its portable two-argument form.
  sync_pct=$(awk -v arr="$array" '
    $1 == arr { found = 1; next }
    found && /^md/ { exit }
    found && /recovery|resync/ {
      if (match($0, /[0-9]+(\.[0-9]+)?%/)) {
        print substr($0, RSTART, RLENGTH - 1)
      }
      exit
    }
  ' /proc/mdstat 2>/dev/null)

  echo "${state:-unknown} ${total:-0} ${active:-0} ${sync_pct:-100}"
}
|
|
|
|
# Query lvs for thin pool usage.
# Only rows with exactly four columns and a data percentage greater than
# zero are passed through.
# Returns: lines of "vg_name lv_name data_percent metadata_percent";
#          returns non-zero with no output when lvs is not installed
get_lvm_thin_pools() {
  if ! command -v lvs >/dev/null 2>&1; then
    return 1
  fi

  lvs --noheadings --nosuffix -o vg_name,lv_name,data_percent,metadata_percent \
    --select 'pool_lv=""' --select 'lv_attr=~[t]' 2>/dev/null |
    awk '
      NF != 4 { next }
      ($3 + 0) > 0 { print $1, $2, $3, $4 }
    '
}
|
|
|
|
# Get BTRFS device stats error counts
# Parses "btrfs device stats" lines of the form
#   [/dev/sda1].write_io_errs    0
# Args: $1 - mountpoint
# Returns: Lines of "device error_type count"; nothing (non-zero status)
#          when the btrfs tool is not installed
get_btrfs_health() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return

  btrfs device stats "$mountpoint" 2>/dev/null | awk '
    {
      # The device is everything between the leading "[" and the first "].".
      # Splitting there, rather than on every ".", keeps device paths that
      # contain dots intact.
      sep = index($0, "].")
      if (substr($0, 1, 1) != "[" || sep == 0) next
      device = substr($0, 2, sep - 2)

      # Re-split the remainder ("error_type   count") on whitespace.
      $0 = substr($0, sep + 2)
      if (NF >= 2) print device, $1, $NF
    }
  '
}
|
|
|
|
# Report health for every ZFS pool listed by "zpool list".
# Returns: lines of "pool_name state error_flag", where error_flag comes from
#          the "errors:" line of "zpool status": 0 when its second word is
#          "No", 1 otherwise, and 0 when no such line is found.
#          Returns non-zero with no output when zpool is not installed.
get_zpool_health() {
  if ! command -v zpool >/dev/null 2>&1; then
    return 1
  fi

  local pool_name pool_state error_flag
  while read -r pool_name pool_state; do
    error_flag=$(zpool status "$pool_name" 2>/dev/null |
      awk '/errors:/ { flag = ($2 == "No") ? 0 : 1; print flag }')
    echo "$pool_name $pool_state ${error_flag:-0}"
  done < <(zpool list -H -o name,health 2>/dev/null)
}
|
|
|
|
# Print capacity properties for all ZFS pools by running "zpool list" with
# -H (no header) and -p (parsable values).
# Returns: the zpool output unchanged, one pool per line, with the columns
#          name, size, alloc, free, frag, cap, dedup;
#          returns non-zero with no output when zpool is not installed
get_zpool_capacity() {
  if ! command -v zpool >/dev/null 2>&1; then
    return 1
  fi

  zpool list -Hp -o name,size,alloc,free,frag,cap,dedup 2>/dev/null
}
|
|
|
|
# Get ZFS pool scrub status
# Parses the first "scan:" line of "zpool status".
# Args: $1 - pool name
# Returns: "state seconds_since_scrub errors_found progress_percent"
#   state: 0=no scrub, 1=completed, 2=in progress
#   seconds_since_scrub is 0 when the completion date cannot be parsed
#   progress_percent is 100 for a completed scrub, 0 when there is none
get_zpool_scrub() {
  local pool="$1"
  local status_output scan_line
  status_output=$(zpool status "$pool" 2>/dev/null)
  scan_line=$(grep "scan:" <<<"$status_output" | head -n 1)

  case "$scan_line" in
    # Require "scrub": a resilver is also reported as "in progress" on the
    # scan line and must not be counted as a running scrub.
    *"scrub in progress"*)
      local pct
      pct=$(grep -oE '[0-9]+(\.[0-9]+)?% done' <<<"$status_output" | head -n 1)
      pct=${pct%%\%*}
      echo "2 0 0 ${pct:-0}"
      ;;
    *"scrub repaired"*)
      local scrub_date scrub_ts now errors
      local seconds_since=0
      scrub_date=$(grep -oE '[A-Z][a-z]{2} [A-Z][a-z]{2} +[0-9]+ [0-9:]+ [0-9]+' <<<"$scan_line" | head -n 1)
      # The age is computed only when the date really parses. Falling back
      # to timestamp 0 would report the whole Unix epoch as the scrub age.
      if [ -n "$scrub_date" ] &&
        scrub_ts=$(date -d "$scrub_date" +%s 2>/dev/null) &&
        [[ "$scrub_ts" =~ ^[0-9]+$ ]]; then
        now=$(date +%s)
        if ((now > scrub_ts)); then
          seconds_since=$((now - scrub_ts))
        fi
      fi
      errors=$(grep -oE '[0-9]+ errors' <<<"$scan_line" | awk 'NR == 1 { print $1 }')
      echo "1 ${seconds_since} ${errors:-0} 100"
      ;;
    *)
      echo "0 0 0 0"
      ;;
  esac
}
|
|
|
|
# Print per-dataset properties by running "zfs list" with -H (no header)
# and -p (parsable values).
# Returns: the zfs output unchanged, one dataset per line, with the columns
#          name, used, avail, refer, compressratio, type;
#          returns non-zero with no output when zfs is not installed
get_zfs_datasets() {
  if ! command -v zfs >/dev/null 2>&1; then
    return 1
  fi

  zfs list -Hp -o name,used,avail,refer,compressratio,type 2>/dev/null
}
|
|
|
|
# Summarise snapshots per dataset from "zfs list -t snapshot".
# The dataset is the snapshot name up to the "@"; the second column is used
# as the creation timestamp.
# Returns: lines of "dataset count oldest_age_seconds newest_age_seconds"
#          (ages are "now - timestamp", or 0 when the timestamp is not > 0);
#          returns non-zero with no output when zfs is not installed
get_zfs_snapshot_stats() {
  if ! command -v zfs >/dev/null 2>&1; then
    return 1
  fi

  local current_epoch
  current_epoch=$(date +%s)

  zfs list -t snapshot -Hp -o name,creation 2>/dev/null |
    awk -v now="$current_epoch" '
      {
        dataset = $1
        sub(/@.*$/, "", dataset)
        created = $2

        snapshots[dataset]++
        if (!(dataset in first_ts) || created < first_ts[dataset]) {
          first_ts[dataset] = created
        }
        if (!(dataset in last_ts) || created > last_ts[dataset]) {
          last_ts[dataset] = created
        }
      }
      END {
        for (dataset in snapshots) {
          age_oldest = 0
          age_newest = 0
          if (first_ts[dataset] > 0) age_oldest = now - first_ts[dataset]
          if (last_ts[dataset] > 0) age_newest = now - last_ts[dataset]
          print dataset, snapshots[dataset], age_oldest, age_newest
        }
      }
    '
}
|
|
|
|
# Get ZFS per-vdev error counts from zpool status
# Reads the rows under the "NAME STATE READ WRITE CKSUM" header up to the
# next blank line.
# Args: $1 - pool name
# Returns: Lines of "vdev read_errors write_errors checksum_errors".
#          The pool's own row and mirror/raidz/draid/replacing/log/cache/
#          spare container rows are skipped, as are rows whose counters are
#          not plain integers.
get_zpool_vdev_errors() {
  local pool="$1"

  zpool status "$pool" 2>/dev/null | awk -v pool="$pool" '
    /NAME.*STATE.*READ.*WRITE.*CKSUM/ { in_table = 1; next }
    !in_table { next }
    /^[[:space:]]*$/ { exit }
    NF < 5 { next }
    {
      name = $1
      # Counters sit in fixed columns 3-5. Counting from the end of the line
      # breaks on rows that carry a trailing note such as "cannot open".
      read_err = $3
      write_err = $4
      cksum_err = $5

      # Skip pool-level and container lines
      if (name == pool) next
      if (name ~ /^(mirror|raidz|draid|replacing|log|cache|spare)/) next
      if (read_err !~ /^[0-9]+$/ || write_err !~ /^[0-9]+$/ || cksum_err !~ /^[0-9]+$/) next

      print name, read_err, write_err, cksum_err
    }
  '
}
|
|
|
|
# Read ZFS ARC counters from /proc/spl/kstat/zfs/arcstats.
# For each of the rows "hits", "misses" and "size" the third column is used;
# a missing row yields 0.
# Returns: "hits misses size_bytes"; returns non-zero with no output when
#          the arcstats file does not exist
get_zfs_arc_stats() {
  local arcstats="/proc/spl/kstat/zfs/arcstats"
  if [ ! -f "$arcstats" ]; then
    return 1
  fi

  local stat_name stat_value summary=""
  for stat_name in hits misses size; do
    stat_value=$(awk -v key="$stat_name" '$1 == key { print $3 }' "$arcstats" 2>/dev/null)
    summary+="${summary:+ }${stat_value:-0}"
  done

  echo "$summary"
}
|
|
|
|
# Get BTRFS filesystem usage (data/metadata/system)
# Parses the block-group summary lines of "btrfs filesystem usage -b", e.g.
#   Data,single: Size:1073741824, Used:524288000 (48.83%)
# Size and Used are read from that line itself. When a group appears with
# more than one profile, its values are summed so each type is printed once.
# Args: $1 - mountpoint
# Returns: Lines of "type used_bytes total_bytes" with type being data,
#          metadata or system; nothing (non-zero status) when the btrfs tool
#          is not installed
get_btrfs_usage() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return

  btrfs filesystem usage -b "$mountpoint" 2>/dev/null | awk '
    # Return the integer that directly follows key in line, or 0.
    function bytes_after(line, key,    pos, rest) {
      pos = index(line, key)
      if (pos == 0) return 0
      rest = substr(line, pos + length(key))
      if (match(rest, /^[0-9]+/)) return substr(rest, RSTART, RLENGTH) + 0
      return 0
    }

    /^(Data|Metadata|System),/ {
      group = tolower($0)
      sub(/,.*/, "", group)
      if (!(group in total)) order[++groups] = group
      total[group] += bytes_after($0, "Size:")
      used[group] += bytes_after($0, "Used:")
    }

    END {
      # %.0f prints large byte counts in full rather than in exponent form.
      for (i = 1; i <= groups; i++) {
        printf "%s %.0f %.0f\n", order[i], used[order[i]], total[order[i]]
      }
    }
  '
}
|
|
|
|
# Get BTRFS scrub status
# Args: $1 - mountpoint
# Returns: "state seconds_since_scrub errors_found"
#   state: 0=never, 1=completed, 2=in progress
#   seconds_since_scrub is measured from the "Scrub started:" timestamp and
#   is 0 when that timestamp is missing or cannot be parsed
#   errors_found is the sum of the key=count items on the "Error summary:"
#   line, or the last field of an "errors found:" line, else 0
#   Prints nothing (non-zero status) when the btrfs tool is not installed.
get_btrfs_scrub() {
  local mountpoint="$1"
  command -v btrfs >/dev/null 2>&1 || return

  local output
  output=$(btrfs scrub status "$mountpoint" 2>/dev/null)

  if grep -q "running" <<<"$output"; then
    echo "2 0 0"
  elif grep -q "finished" <<<"$output"; then
    local started scrub_ts now errors
    local seconds_since=0

    started=$(sed -n 's/.*Scrub started:[[:space:]]*//p' <<<"$output" | head -n 1)
    # The age is computed only when the date really parses. Falling back to
    # timestamp 0 would report the whole Unix epoch as the scrub age.
    if [ -n "$started" ] &&
      scrub_ts=$(date -d "$started" +%s 2>/dev/null) &&
      [[ "$scrub_ts" =~ ^[0-9]+$ ]]; then
      now=$(date +%s)
      if ((now > scrub_ts)); then
        seconds_since=$((now - scrub_ts))
      fi
    fi

    errors=$(awk '
      /^Error summary:/ {
        # "no errors found" has no key=count items and therefore sums to 0.
        sum = 0
        for (i = 3; i <= NF; i++) {
          if (split($i, kv, "=") == 2 && kv[2] ~ /^[0-9]+$/) sum += kv[2]
        }
        printf "%d\n", sum
        exit
      }
      /errors found:/ { print $NF; exit }
    ' <<<"$output")
    [[ "$errors" =~ ^[0-9]+$ ]] || errors=0

    echo "1 ${seconds_since} ${errors}"
  else
    echo "0 0 0"
  fi
}
|
|
|
|
# Count journal lines from the last 24 hours that match the filesystem / I/O
# error pattern below (case-insensitive).
# Returns: the number of matching lines; "0" when journalctl is not
#          installed or nothing matches
get_fs_errors_from_journal() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi

  local error_pattern="(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|Buffer I/O error)"
  local matches
  matches=$(journalctl --since "24 hours ago" --no-pager -q 2>/dev/null |
    grep -ciE "$error_pattern" 2>/dev/null)

  echo "${matches:-0}"
}
|
|
|
|
# ============================================================================
|
|
# METRIC GENERATION
|
|
# ============================================================================
|
|
|
|
# Generate all Prometheus metrics
|
|
# Returns: Prometheus text format metrics on stdout
|
|
generate_metrics() {
|
|
local script_start
|
|
script_start=$(date +%s)
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_up Storage health exporter status
|
|
# TYPE storage_health_up gauge
|
|
storage_health_up 1
|
|
|
|
# HELP storage_health_exporter_info Storage health exporter information
|
|
# TYPE storage_health_exporter_info gauge
|
|
storage_health_exporter_info{version="1.0"} 1
|
|
EOF
|
|
|
|
echo ""
|
|
|
|
# ========================================================================
|
|
# Filesystem Info and Inode Usage
|
|
# ========================================================================
|
|
cat <<EOF
|
|
# HELP storage_health_fs_info Filesystem information (always 1)
|
|
# TYPE storage_health_fs_info gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
echo "storage_health_fs_info{device=\"$device\",mountpoint=\"$mountpoint\",fstype=\"$fstype\"} 1"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mount_readonly Mount read-only status (1=ro, 0=rw)
|
|
# TYPE storage_health_mount_readonly gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
local ro
|
|
ro=$(get_mount_readonly "$mountpoint")
|
|
echo "storage_health_mount_readonly{device=\"$device\",mountpoint=\"$mountpoint\"} $ro"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mount_stale Mount stale/hung status (1=stale, 0=ok)
|
|
# TYPE storage_health_mount_stale gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
local stale
|
|
stale=$(check_stale_mount "$mountpoint")
|
|
echo "storage_health_mount_stale{mountpoint=\"$mountpoint\"} $stale"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_inode_total Total inodes per mountpoint
|
|
# TYPE storage_health_inode_total gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
local inode_info total used
|
|
inode_info=$(get_inode_usage "$mountpoint")
|
|
total=$(echo "$inode_info" | awk '{print $1}')
|
|
echo "storage_health_inode_total{mountpoint=\"$mountpoint\"} ${total:-0}"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_inode_used Used inodes per mountpoint
|
|
# TYPE storage_health_inode_used gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
local inode_info used
|
|
inode_info=$(get_inode_usage "$mountpoint")
|
|
used=$(echo "$inode_info" | awk '{print $2}')
|
|
echo "storage_health_inode_used{mountpoint=\"$mountpoint\"} ${used:-0}"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_inode_usage_percent Inode usage percentage per mountpoint
|
|
# TYPE storage_health_inode_usage_percent gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ -z "$device" ] && continue
|
|
local inode_info total used pct
|
|
inode_info=$(get_inode_usage "$mountpoint")
|
|
total=$(echo "$inode_info" | awk '{print $1}')
|
|
used=$(echo "$inode_info" | awk '{print $2}')
|
|
total=${total:-0}
|
|
used=${used:-0}
|
|
if [ "$total" -gt 0 ] 2>/dev/null; then
|
|
pct=$(awk "BEGIN {printf \"%.2f\", ($used / $total) * 100}" 2>/dev/null || echo "0")
|
|
else
|
|
pct="0"
|
|
fi
|
|
echo "storage_health_inode_usage_percent{mountpoint=\"$mountpoint\"} $pct"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
# ========================================================================
|
|
# SMART Disk Health (optional)
|
|
# ========================================================================
|
|
if command -v smartctl >/dev/null 2>&1; then
|
|
cat <<EOF
|
|
# HELP storage_health_smart_healthy SMART health status (1=passed, 0=failed)
|
|
# TYPE storage_health_smart_healthy gauge
|
|
EOF
|
|
|
|
while read -r name dtype; do
|
|
[ "$dtype" = "disk" ] || continue
|
|
local device="/dev/$name"
|
|
local healthy
|
|
healthy=$(get_smart_health "$device")
|
|
echo "storage_health_smart_healthy{device=\"$device\"} $healthy"
|
|
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_smart_temperature_celsius Drive temperature in Celsius
|
|
# TYPE storage_health_smart_temperature_celsius gauge
|
|
EOF
|
|
|
|
while read -r name dtype; do
|
|
[ "$dtype" = "disk" ] || continue
|
|
local device="/dev/$name"
|
|
local attrs temp
|
|
attrs=$(get_smart_attributes "$device")
|
|
temp=$(echo "$attrs" | awk '{print $1}')
|
|
echo "storage_health_smart_temperature_celsius{device=\"$device\"} ${temp:-0}"
|
|
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_smart_power_on_hours Drive power on hours
|
|
# TYPE storage_health_smart_power_on_hours gauge
|
|
EOF
|
|
|
|
while read -r name dtype; do
|
|
[ "$dtype" = "disk" ] || continue
|
|
local device="/dev/$name"
|
|
local attrs hours
|
|
attrs=$(get_smart_attributes "$device")
|
|
hours=$(echo "$attrs" | awk '{print $2}')
|
|
echo "storage_health_smart_power_on_hours{device=\"$device\"} ${hours:-0}"
|
|
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_smart_reallocated_sectors Reallocated sector count
|
|
# TYPE storage_health_smart_reallocated_sectors gauge
|
|
EOF
|
|
|
|
while read -r name dtype; do
|
|
[ "$dtype" = "disk" ] || continue
|
|
local device="/dev/$name"
|
|
local attrs realloc
|
|
attrs=$(get_smart_attributes "$device")
|
|
realloc=$(echo "$attrs" | awk '{print $3}')
|
|
echo "storage_health_smart_reallocated_sectors{device=\"$device\"} ${realloc:-0}"
|
|
done < <(lsblk -dno NAME,TYPE 2>/dev/null)
|
|
|
|
echo ""
|
|
fi
|
|
|
|
# ========================================================================
|
|
# MD RAID (optional)
|
|
# ========================================================================
|
|
if [ -f /proc/mdstat ] && command -v mdadm >/dev/null 2>&1; then
|
|
local arrays
|
|
arrays=$(get_mdraid_arrays)
|
|
|
|
if [ -n "$arrays" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_mdraid_healthy MD RAID array health (1=clean, 0=degraded)
|
|
# TYPE storage_health_mdraid_healthy gauge
|
|
EOF
|
|
|
|
for array in $arrays; do
|
|
local status state
|
|
status=$(get_mdraid_status "$array")
|
|
state=$(echo "$status" | awk '{print $1}')
|
|
if echo "$state" | grep -qi "clean\|active"; then
|
|
echo "storage_health_mdraid_healthy{array=\"$array\"} 1"
|
|
else
|
|
echo "storage_health_mdraid_healthy{array=\"$array\"} 0"
|
|
fi
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mdraid_degraded MD RAID degraded status (1=degraded, 0=ok)
|
|
# TYPE storage_health_mdraid_degraded gauge
|
|
EOF
|
|
|
|
for array in $arrays; do
|
|
local status state
|
|
status=$(get_mdraid_status "$array")
|
|
state=$(echo "$status" | awk '{print $1}')
|
|
if echo "$state" | grep -qi "degraded"; then
|
|
echo "storage_health_mdraid_degraded{array=\"$array\"} 1"
|
|
else
|
|
echo "storage_health_mdraid_degraded{array=\"$array\"} 0"
|
|
fi
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mdraid_devices_total Total devices in MD RAID array
|
|
# TYPE storage_health_mdraid_devices_total gauge
|
|
EOF
|
|
|
|
for array in $arrays; do
|
|
local status total
|
|
status=$(get_mdraid_status "$array")
|
|
total=$(echo "$status" | awk '{print $2}')
|
|
echo "storage_health_mdraid_devices_total{array=\"$array\"} ${total:-0}"
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mdraid_devices_active Active devices in MD RAID array
|
|
# TYPE storage_health_mdraid_devices_active gauge
|
|
EOF
|
|
|
|
for array in $arrays; do
|
|
local status active
|
|
status=$(get_mdraid_status "$array")
|
|
active=$(echo "$status" | awk '{print $3}')
|
|
echo "storage_health_mdraid_devices_active{array=\"$array\"} ${active:-0}"
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_mdraid_sync_percent MD RAID sync/rebuild percentage
|
|
# TYPE storage_health_mdraid_sync_percent gauge
|
|
EOF
|
|
|
|
for array in $arrays; do
|
|
local status sync_pct
|
|
status=$(get_mdraid_status "$array")
|
|
sync_pct=$(echo "$status" | awk '{print $4}')
|
|
echo "storage_health_mdraid_sync_percent{array=\"$array\"} ${sync_pct:-100}"
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
fi
|
|
|
|
# ========================================================================
|
|
# LVM Thin Pools (optional)
|
|
# ========================================================================
|
|
if command -v lvs >/dev/null 2>&1; then
|
|
local thin_pools
|
|
thin_pools=$(get_lvm_thin_pools)
|
|
|
|
if [ -n "$thin_pools" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_lvm_thin_data_percent LVM thin pool data usage percentage
|
|
# TYPE storage_health_lvm_thin_data_percent gauge
|
|
EOF
|
|
|
|
echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
|
|
echo "storage_health_lvm_thin_data_percent{vg=\"$vg\",pool=\"$pool\"} ${data_pct:-0}"
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_lvm_thin_metadata_percent LVM thin pool metadata usage percentage
|
|
# TYPE storage_health_lvm_thin_metadata_percent gauge
|
|
EOF
|
|
|
|
echo "$thin_pools" | while read -r vg pool data_pct meta_pct; do
|
|
echo "storage_health_lvm_thin_metadata_percent{vg=\"$vg\",pool=\"$pool\"} ${meta_pct:-0}"
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
fi
|
|
|
|
# ========================================================================
|
|
# BTRFS Health (optional)
|
|
# ========================================================================
|
|
if command -v btrfs >/dev/null 2>&1; then
|
|
local has_btrfs=0
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ "$fstype" = "btrfs" ] && has_btrfs=1
|
|
done < <(get_mounted_filesystems)
|
|
|
|
if [ "$has_btrfs" -eq 1 ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_btrfs_errors BTRFS device error counts
|
|
# TYPE storage_health_btrfs_errors gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ "$fstype" = "btrfs" ] || continue
|
|
while read -r bdev etype count; do
|
|
[ -z "$bdev" ] && continue
|
|
echo "storage_health_btrfs_errors{mountpoint=\"$mountpoint\",device=\"$bdev\",type=\"$etype\"} ${count:-0}"
|
|
done < <(get_btrfs_health "$mountpoint")
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_btrfs_usage_bytes BTRFS usage by block group type (data/metadata/system)
|
|
# TYPE storage_health_btrfs_usage_bytes gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ "$fstype" = "btrfs" ] || continue
|
|
while read -r btype used total; do
|
|
[ -z "$btype" ] && continue
|
|
echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"used\"} ${used:-0}"
|
|
echo "storage_health_btrfs_usage_bytes{mountpoint=\"$mountpoint\",group=\"$btype\",usage=\"total\"} ${total:-0}"
|
|
done < <(get_btrfs_usage "$mountpoint")
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_btrfs_scrub_status BTRFS scrub state (0=never, 1=completed, 2=in progress)
|
|
# TYPE storage_health_btrfs_scrub_status gauge
|
|
# HELP storage_health_btrfs_scrub_age_seconds Seconds since last BTRFS scrub completed
|
|
# TYPE storage_health_btrfs_scrub_age_seconds gauge
|
|
# HELP storage_health_btrfs_scrub_errors Errors found during last BTRFS scrub
|
|
# TYPE storage_health_btrfs_scrub_errors gauge
|
|
EOF
|
|
|
|
while read -r device mountpoint fstype; do
|
|
[ "$fstype" = "btrfs" ] || continue
|
|
local scrub_info state scrub_age scrub_errors
|
|
scrub_info=$(get_btrfs_scrub "$mountpoint")
|
|
state=$(echo "$scrub_info" | awk '{print $1}')
|
|
scrub_age=$(echo "$scrub_info" | awk '{print $2}')
|
|
scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
|
|
echo "storage_health_btrfs_scrub_status{mountpoint=\"$mountpoint\"} ${state:-0}"
|
|
echo "storage_health_btrfs_scrub_age_seconds{mountpoint=\"$mountpoint\"} ${scrub_age:-0}"
|
|
echo "storage_health_btrfs_scrub_errors{mountpoint=\"$mountpoint\"} ${scrub_errors:-0}"
|
|
done < <(get_mounted_filesystems)
|
|
|
|
echo ""
|
|
fi
|
|
fi
|
|
|
|
# ========================================================================
|
|
# ZFS Pool Health (optional)
|
|
# ========================================================================
|
|
if command -v zpool >/dev/null 2>&1; then
|
|
local zfs_pools
|
|
zfs_pools=$(get_zpool_health)
|
|
|
|
if [ -n "$zfs_pools" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_pool_healthy ZFS pool health (1=ONLINE, 0=other)
|
|
# TYPE storage_health_zfs_pool_healthy gauge
|
|
EOF
|
|
|
|
echo "$zfs_pools" | while read -r pool state errors; do
|
|
if [ "$state" = "ONLINE" ]; then
|
|
echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 1"
|
|
else
|
|
echo "storage_health_zfs_pool_healthy{pool=\"$pool\"} 0"
|
|
fi
|
|
done
|
|
|
|
echo ""
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_pool_errors ZFS pool error count
|
|
# TYPE storage_health_zfs_pool_errors gauge
|
|
EOF
|
|
|
|
echo "$zfs_pools" | while read -r pool state errors; do
|
|
echo "storage_health_zfs_pool_errors{pool=\"$pool\"} ${errors:-0}"
|
|
done
|
|
|
|
echo ""
|
|
|
|
# ZFS pool capacity
|
|
local zfs_capacity
|
|
zfs_capacity=$(get_zpool_capacity)
|
|
|
|
if [ -n "$zfs_capacity" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_pool_size_bytes ZFS pool total size in bytes
|
|
# TYPE storage_health_zfs_pool_size_bytes gauge
|
|
# HELP storage_health_zfs_pool_alloc_bytes ZFS pool allocated bytes
|
|
# TYPE storage_health_zfs_pool_alloc_bytes gauge
|
|
# HELP storage_health_zfs_pool_free_bytes ZFS pool free bytes
|
|
# TYPE storage_health_zfs_pool_free_bytes gauge
|
|
# HELP storage_health_zfs_pool_fragmentation_percent ZFS pool fragmentation percentage
|
|
# TYPE storage_health_zfs_pool_fragmentation_percent gauge
|
|
# HELP storage_health_zfs_pool_capacity_percent ZFS pool capacity percentage
|
|
# TYPE storage_health_zfs_pool_capacity_percent gauge
|
|
# HELP storage_health_zfs_pool_dedup_ratio ZFS pool deduplication ratio
|
|
# TYPE storage_health_zfs_pool_dedup_ratio gauge
|
|
EOF
|
|
|
|
echo "$zfs_capacity" | while read -r pool size alloc free frag cap dedup; do
|
|
echo "storage_health_zfs_pool_size_bytes{pool=\"$pool\"} ${size:-0}"
|
|
echo "storage_health_zfs_pool_alloc_bytes{pool=\"$pool\"} ${alloc:-0}"
|
|
echo "storage_health_zfs_pool_free_bytes{pool=\"$pool\"} ${free:-0}"
|
|
echo "storage_health_zfs_pool_fragmentation_percent{pool=\"$pool\"} ${frag:-0}"
|
|
echo "storage_health_zfs_pool_capacity_percent{pool=\"$pool\"} ${cap:-0}"
|
|
echo "storage_health_zfs_pool_dedup_ratio{pool=\"$pool\"} ${dedup:-1}"
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
|
|
# ZFS scrub status
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_scrub_status ZFS scrub state (0=none, 1=completed, 2=in progress)
|
|
# TYPE storage_health_zfs_scrub_status gauge
|
|
# HELP storage_health_zfs_scrub_age_seconds Seconds since last ZFS scrub completed
|
|
# TYPE storage_health_zfs_scrub_age_seconds gauge
|
|
# HELP storage_health_zfs_scrub_errors Errors found during last ZFS scrub
|
|
# TYPE storage_health_zfs_scrub_errors gauge
|
|
# HELP storage_health_zfs_scrub_progress_percent ZFS scrub progress percentage (100 if not running)
|
|
# TYPE storage_health_zfs_scrub_progress_percent gauge
|
|
EOF
|
|
|
|
echo "$zfs_pools" | while read -r pool state errors; do
|
|
local scrub_info scrub_state scrub_age scrub_errors scrub_pct
|
|
scrub_info=$(get_zpool_scrub "$pool")
|
|
scrub_state=$(echo "$scrub_info" | awk '{print $1}')
|
|
scrub_age=$(echo "$scrub_info" | awk '{print $2}')
|
|
scrub_errors=$(echo "$scrub_info" | awk '{print $3}')
|
|
scrub_pct=$(echo "$scrub_info" | awk '{print $4}')
|
|
echo "storage_health_zfs_scrub_status{pool=\"$pool\"} ${scrub_state:-0}"
|
|
echo "storage_health_zfs_scrub_age_seconds{pool=\"$pool\"} ${scrub_age:-0}"
|
|
echo "storage_health_zfs_scrub_errors{pool=\"$pool\"} ${scrub_errors:-0}"
|
|
echo "storage_health_zfs_scrub_progress_percent{pool=\"$pool\"} ${scrub_pct:-0}"
|
|
done
|
|
|
|
echo ""
|
|
|
|
# ZFS per-vdev errors
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_vdev_read_errors ZFS vdev read error count
|
|
# TYPE storage_health_zfs_vdev_read_errors gauge
|
|
# HELP storage_health_zfs_vdev_write_errors ZFS vdev write error count
|
|
# TYPE storage_health_zfs_vdev_write_errors gauge
|
|
# HELP storage_health_zfs_vdev_checksum_errors ZFS vdev checksum error count
|
|
# TYPE storage_health_zfs_vdev_checksum_errors gauge
|
|
EOF
|
|
|
|
echo "$zfs_pools" | while read -r pool state errors; do
|
|
while read -r vdev read_err write_err cksum_err; do
|
|
[ -z "$vdev" ] && continue
|
|
echo "storage_health_zfs_vdev_read_errors{pool=\"$pool\",vdev=\"$vdev\"} ${read_err:-0}"
|
|
echo "storage_health_zfs_vdev_write_errors{pool=\"$pool\",vdev=\"$vdev\"} ${write_err:-0}"
|
|
echo "storage_health_zfs_vdev_checksum_errors{pool=\"$pool\",vdev=\"$vdev\"} ${cksum_err:-0}"
|
|
done < <(get_zpool_vdev_errors "$pool")
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
|
|
# ZFS datasets
|
|
local zfs_datasets
|
|
zfs_datasets=$(get_zfs_datasets)
|
|
|
|
if [ -n "$zfs_datasets" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_dataset_used_bytes ZFS dataset used bytes
|
|
# TYPE storage_health_zfs_dataset_used_bytes gauge
|
|
# HELP storage_health_zfs_dataset_avail_bytes ZFS dataset available bytes
|
|
# TYPE storage_health_zfs_dataset_avail_bytes gauge
|
|
# HELP storage_health_zfs_dataset_refer_bytes ZFS dataset referenced bytes
|
|
# TYPE storage_health_zfs_dataset_refer_bytes gauge
|
|
# HELP storage_health_zfs_dataset_compressratio ZFS dataset compression ratio
|
|
# TYPE storage_health_zfs_dataset_compressratio gauge
|
|
EOF
|
|
|
|
echo "$zfs_datasets" | while read -r dataset used avail refer ratio dtype; do
|
|
echo "storage_health_zfs_dataset_used_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${used:-0}"
|
|
echo "storage_health_zfs_dataset_avail_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${avail:-0}"
|
|
echo "storage_health_zfs_dataset_refer_bytes{dataset=\"$dataset\",type=\"$dtype\"} ${refer:-0}"
|
|
echo "storage_health_zfs_dataset_compressratio{dataset=\"$dataset\",type=\"$dtype\"} ${ratio:-1}"
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
|
|
# ZFS snapshot stats
|
|
local zfs_snaps
|
|
zfs_snaps=$(get_zfs_snapshot_stats)
|
|
|
|
if [ -n "$zfs_snaps" ]; then
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_snapshot_count ZFS snapshot count per dataset
|
|
# TYPE storage_health_zfs_snapshot_count gauge
|
|
# HELP storage_health_zfs_snapshot_oldest_age_seconds Age of oldest ZFS snapshot in seconds
|
|
# TYPE storage_health_zfs_snapshot_oldest_age_seconds gauge
|
|
# HELP storage_health_zfs_snapshot_newest_age_seconds Age of newest ZFS snapshot in seconds
|
|
# TYPE storage_health_zfs_snapshot_newest_age_seconds gauge
|
|
EOF
|
|
|
|
echo "$zfs_snaps" | while read -r dataset count oldest newest; do
|
|
echo "storage_health_zfs_snapshot_count{dataset=\"$dataset\"} ${count:-0}"
|
|
echo "storage_health_zfs_snapshot_oldest_age_seconds{dataset=\"$dataset\"} ${oldest:-0}"
|
|
echo "storage_health_zfs_snapshot_newest_age_seconds{dataset=\"$dataset\"} ${newest:-0}"
|
|
done
|
|
|
|
echo ""
|
|
fi
|
|
|
|
# ZFS ARC stats (Linux only)
|
|
local arc_stats
|
|
arc_stats=$(get_zfs_arc_stats)
|
|
|
|
if [ -n "$arc_stats" ]; then
|
|
local arc_hits arc_misses arc_size
|
|
arc_hits=$(echo "$arc_stats" | awk '{print $1}')
|
|
arc_misses=$(echo "$arc_stats" | awk '{print $2}')
|
|
arc_size=$(echo "$arc_stats" | awk '{print $3}')
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_zfs_arc_hits_total ZFS ARC cache hits (counter)
|
|
# TYPE storage_health_zfs_arc_hits_total counter
|
|
storage_health_zfs_arc_hits_total ${arc_hits:-0}
|
|
|
|
# HELP storage_health_zfs_arc_misses_total ZFS ARC cache misses (counter)
|
|
# TYPE storage_health_zfs_arc_misses_total counter
|
|
storage_health_zfs_arc_misses_total ${arc_misses:-0}
|
|
|
|
# HELP storage_health_zfs_arc_size_bytes ZFS ARC cache size in bytes
|
|
# TYPE storage_health_zfs_arc_size_bytes gauge
|
|
storage_health_zfs_arc_size_bytes ${arc_size:-0}
|
|
EOF
|
|
|
|
echo ""
|
|
fi
|
|
fi
|
|
|
|
# ========================================================================
|
|
# Journal Filesystem Errors
|
|
# ========================================================================
|
|
local fs_errors
|
|
fs_errors=$(get_fs_errors_from_journal)
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_journal_fs_errors_24h Filesystem error messages in journal (24h)
|
|
# TYPE storage_health_journal_fs_errors_24h gauge
|
|
storage_health_journal_fs_errors_24h ${fs_errors:-0}
|
|
EOF
|
|
|
|
echo ""
|
|
|
|
# ========================================================================
|
|
# Exporter Runtime
|
|
# ========================================================================
|
|
local script_end script_duration
|
|
script_end=$(date +%s)
|
|
script_duration=$((script_end - script_start))
|
|
|
|
cat <<EOF
|
|
# HELP storage_health_exporter_duration_seconds Time to generate all metrics
|
|
# TYPE storage_health_exporter_duration_seconds gauge
|
|
storage_health_exporter_duration_seconds $script_duration
|
|
|
|
# HELP storage_health_exporter_last_run_timestamp Unix timestamp of last successful run
|
|
# TYPE storage_health_exporter_last_run_timestamp gauge
|
|
storage_health_exporter_last_run_timestamp $script_end
|
|
EOF
|
|
|
|
echo ""
|
|
}
|
|
|
|
# ============================================================================
# HTTP SERVER MODE
# ============================================================================
|
|
|
|
# Run a minimal, one-connection-at-a-time HTTP server on top of netcat.
#
# A request whose first line starts with "GET /metrics" receives the output of
# generate_metrics; every other request receives a small HTML landing page.
#
# Globals:   HTTP_PORT (read) - TCP port passed to `nc -l -p`
# Arguments: none
# Outputs:   startup and error messages on stderr; HTTP responses to the client
# Returns:   never returns normally (infinite accept loop); exits 1 when nc is
#            missing or the response FIFO cannot be created
run_http_server() {
    echo "Starting storage health exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # nc writes the client's request to its stdout and sends whatever arrives
    # on its stdin back to the client. A plain "{ handler; } | nc" only wires
    # up the response direction: the handler would read the request line from
    # the script's own stdin, never from the socket. Routing the handler's
    # output through a FIFO lets nc's stdout feed the handler instead.
    local fifo_dir fifo
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Failed to create temp directory for HTTP mode" >&2
        exit 1
    fi
    fifo="$fifo_dir/response"

    # Best-effort cleanup of the FIFO directory when the script exits. The path
    # is expanded now (and shell-quoted) because fifo_dir is a local variable.
    # NOTE(review): this replaces any EXIT trap installed earlier in the
    # script; none is visible from this function - confirm.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    if ! mkfifo -- "$fifo"; then
        echo "ERROR: Failed to create FIFO for HTTP mode" >&2
        exit 1
    fi

    local request header

    # Infinite loop: one nc listener (and one response) per iteration
    while true; do
        nc -l -p "$HTTP_PORT" -q 1 < "$fifo" 2>/dev/null | {
            request=""

            # First line of the request, e.g. "GET /metrics HTTP/1.1".
            # read fails with an empty value when nc exits without a client
            # having sent anything; in that case no response is produced.
            if read -r request || [ -n "$request" ]; then
                # Consume the remaining request headers up to the blank line so
                # nc is not still writing them when this handler closes the
                # pipe. The timeout keeps a client that never finishes its
                # headers from stalling the server indefinitely.
                while IFS= read -r -t 5 header; do
                    header=${header%$'\r'}
                    [ -n "$header" ] || break
                done

                # The body has no Content-Length, so its end is signalled by
                # closing the connection; say so explicitly.
                if [[ "$request" =~ ^GET\ /metrics ]]; then
                    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
                    generate_metrics
                else
                    # Serve HTML landing page for other requests
                    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
                    cat <<'EOF'
<!DOCTYPE html>
<html>
<head><title>Storage Health Exporter v1.0</title></head>
<body>
<h1>Storage Health Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Filesystem info, mount status, inode usage</li>
<li>SMART disk health (requires smartctl)</li>
<li>MD RAID array status (requires mdadm)</li>
<li>LVM thin pool usage (requires lvs)</li>
<li>BTRFS device error stats (requires btrfs)</li>
<li>ZFS pool health (requires zpool)</li>
<li>Journal filesystem error count</li>
</ul>
</body>
</html>
EOF
                fi
            fi
        } > "$fifo"

        # A non-zero nc status (for example, the port is already in use) would
        # otherwise turn this loop into a silent busy spin; back off briefly.
        if [ "${PIPESTATUS[0]}" -ne 0 ]; then
            sleep 1
        fi
    done
}
|
|
|
|
# ============================================================================
# MAIN EXECUTION
# ============================================================================
|
|
|
|
# Main entry point - parses the command line and routes to one output mode.
#
# Modes, checked in this order:
#   HTTP_MODE = true      -> run_http_server (blocks until killed)
#   OUTPUT_FILE non-empty -> textfile collector mode: metrics are written to a
#                            temp file in the target directory, validated, then
#                            renamed over OUTPUT_FILE
#   otherwise             -> metrics are printed to stdout
#
# Globals:   HTTP_MODE, OUTPUT_FILE (read; presumably set by parse_args, which
#            is not visible from here - confirm)
# Arguments: "$@" - forwarded unchanged to parse_args
# Outputs:   metrics on stdout in default mode; status and errors on stderr
# Returns:   exits 1 when any step of textfile mode fails; the previous
#            OUTPUT_FILE is left untouched in that case
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        # Run HTTP server (blocks until killed)
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        # Textfile collector mode: write atomically using temp file
        local output_dir temp_file file_lines

        output_dir="$(dirname -- "$OUTPUT_FILE")"
        if ! mkdir -p -- "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        # Create temp file in SAME directory for atomic rename (same filesystem)
        if ! temp_file=$(mktemp "${output_dir}/.storage_health_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temp file in $output_dir" >&2
            exit 1
        fi

        # Generate metrics to temp file
        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: file must have content; a near-empty result must not
        # replace the previous good file
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

        if [ "$file_lines" -lt 10 ]; then
            rm -f -- "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # Set permissions before the move (mktemp creates the file owner-only),
        # then rename atomically - no gap where the file is missing. A failure
        # of either step must not be reported as success or leave the temp
        # file behind.
        if ! chmod 644 -- "$temp_file" || ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
            rm -f -- "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}
|
|
|
|
# Execute main function with all script arguments
|
|
# "$@" is quoted so each command-line argument reaches main as its own word,
# even when it contains whitespace.
main "$@"
|