Add all 44 scripts, update CI: error severity baseline, PowerShell validation, multi-distro testing
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
@@ -0,0 +1,354 @@
|
||||
#!/usr/bin/env bash
|
||||
# disk-io-exporter.sh — Prometheus exporter for per-disk I/O performance
|
||||
#
|
||||
# Reads /proc/diskstats and calculates per-disk IOPS, throughput,
|
||||
# latency, utilization, and queue depth. Takes two samples with a
|
||||
# configurable interval to compute rates from the cumulative counters.
|
||||
#
|
||||
# Author: Phil Connor
|
||||
# Contact: contact@mylinux.work
|
||||
# License: MIT
|
||||
# Date: 2026-03-03
|
||||
# Version: 1.0.0
|
||||
|
||||
# Strict mode: exit on a failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
|
||||
|
||||
# ── Configuration ───────────────────────────────────────────────────
|
||||
|
||||
# Script identity.
VERSION="1.0.0"
SCRIPT_NAME="${0##*/}"

# Textfile collector directory (overridable via NODE_DIR) and the files
# written inside it. The temporary file carries this process's PID.
NODE_DIR="${NODE_DIR:-/var/lib/node_exporter}"
OUTPUT_FILE="${NODE_DIR}/disk_io.prom"
TMP_FILE="${OUTPUT_FILE}.$$"

# Sampling interval in seconds and optional disk-name regex, both taken from
# the environment when set.
SAMPLE_INTERVAL="${SAMPLE_INTERVAL:-1}"
DISK_FILTER="${DISK_FILTER:-}"

# Freeze everything above; only the runtime flags below may change later.
readonly VERSION SCRIPT_NAME NODE_DIR OUTPUT_FILE TMP_FILE SAMPLE_INTERVAL DISK_FILTER

# Runtime flags
DRY_RUN=false
# Make sure DEBUG is defined (empty means "off") so it is safe under `set -u`.
: "${DEBUG:=}"
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
# Print a "[DEBUG] ..." line on stderr, but only when DEBUG is non-empty.
# Arguments: the message words, joined with single spaces.
debug_echo() {
  [[ -z "$DEBUG" ]] || printf '%s\n' "[DEBUG] $*" >&2
}
|
||||
|
||||
# Print an "[ERROR] ..." line on stderr.
# Arguments: the message words, joined with single spaces.
log_error() {
  printf '[ERROR] %s\n' "$*" >&2
}
|
||||
|
||||
# Remove the temporary metrics file; a file that is already gone is not an error.
cleanup() {
  local leftover="$TMP_FILE"
  rm -f "$leftover"
}

# Run cleanup on every exit path of the script.
trap 'cleanup' EXIT
|
||||
|
||||
# Print the usage text on stdout and terminate the script with status 0.
# Reads SCRIPT_NAME and SAMPLE_INTERVAL to fill in the text.
show_help() {
  local -a usage_text=(
    "Usage: $SCRIPT_NAME [OPTIONS]"
    ""
    "Prometheus textfile collector exporter for per-disk I/O performance."
    "Reads /proc/diskstats, takes two samples ${SAMPLE_INTERVAL}s apart, and"
    "calculates rates per disk."
    ""
    "OPTIONS:"
    "--dry-run Output metrics to stdout instead of writing to file"
    "--debug Enable debug output"
    "--help Show this help message"
    "--version Show version"
    ""
    "ENVIRONMENT VARIABLES:"
    "DISK_FILTER Regex of disk names to include (default: all real disks)"
    'Example: DISK_FILTER="^sd[a-z]+$|^nvme[0-9]+n[0-9]+$"'
    "NODE_DIR Textfile collector directory (default: /var/lib/node_exporter)"
    "SAMPLE_INTERVAL Seconds between the two samples (default: 1)"
    "DEBUG Enable debug output when set to any value"
    ""
    "EXAMPLES:"
    "$SCRIPT_NAME --dry-run"
    'DISK_FILTER="^sda$" '"$SCRIPT_NAME"
    "SAMPLE_INTERVAL=2 $SCRIPT_NAME"
    "DEBUG=1 $SCRIPT_NAME --dry-run"
    ""
    "FILTERED DEVICES:"
    "loop*, ram* devices are excluded by default. Use DISK_FILTER to"
    "restrict to specific disks (e.g. only sd* or nvme* devices)."
    ""
  )

  # One array element per output line, including the trailing blank line.
  printf '%s\n' "${usage_text[@]}"
  exit 0
}
|
||||
|
||||
# Print "<script name> version <version>" on stdout and terminate the script
# with status 0. Reads SCRIPT_NAME and VERSION.
show_version() {
  printf '%s version %s\n' "$SCRIPT_NAME" "$VERSION"
  exit 0
}
|
||||
|
||||
# ── Snapshot /proc/diskstats ────────────────────────────────────────
|
||||
#
|
||||
# Fields from /proc/diskstats (first 14 columns; kernel 4.18+ appends discard counters after them, which are ignored here):
|
||||
# $1 major
|
||||
# $2 minor
|
||||
# $3 device name
|
||||
# $4 reads completed
|
||||
# $5 reads merged
|
||||
# $6 sectors read
|
||||
# $7 time reading (ms)
|
||||
# $8 writes completed
|
||||
# $9 writes merged
|
||||
# $10 sectors written
|
||||
# $11 time writing (ms)
|
||||
# $12 I/Os in progress (instantaneous)
|
||||
# $13 time doing I/Os (ms)
|
||||
# $14 weighted time doing I/Os (ms)
|
||||
|
||||
#######################################
# Read the per-device counters of whole disks into an associative array.
#
# For every accepted device <dev> the keys <dev>_reads, <dev>_sectors_read,
# <dev>_read_ms, <dev>_writes, <dev>_sectors_written, <dev>_write_ms,
# <dev>_inflight, <dev>_io_ms and <dev>_weighted_ms are written.
#
# Skipped devices:
#   - loop*, ram*, zram* and fd* pseudo devices;
#   - partitions. A name is treated as a partition only when it is
#     "<parent>p<N>" (parent ends in a digit) or "<parent><N>" (parent ends in
#     a non-digit) AND <parent> itself is listed in the stats file. This keeps
#     whole devices whose own name ends in a digit (md0, mmcblk0, ...) while
#     still dropping sda1, nvme0n1p1, mmcblk0p1, ...;
#   - anything not matching DISK_FILTER, when that regex is non-empty.
#
# Globals:   DISK_FILTER (read; treated as empty when unset)
# Arguments: $1 - name of the caller's associative array to fill
#            $2 - stats file to parse (optional, default /proc/diskstats)
# Returns:   non-zero if the stats file cannot be opened
#######################################
take_snapshot() {
  local -n _snapshot=$1
  local stats_file="${2:-/proc/diskstats}"
  local user_filter="${DISK_FILTER:-}"
  local dev reads sectors_read read_ms writes sectors_written write_ms
  local inflight io_ms weighted_ms parent
  local -A known=()
  local -r pseudo_re='^(loop|z?ram|fd)[0-9]'
  local -r p_part_re='^(.*[0-9])p[0-9]+$'
  local -r plain_part_re='^(.*[^0-9])[0-9]+$'

  # Pass 1: remember every device name so partitions can be recognised by the
  # presence of their parent disk, independent of line order.
  while read -r _ _ dev _; do
    if [[ -n "$dev" ]]; then
      known["$dev"]=1
    fi
  done < "$stats_file"

  # Pass 2: filter and record the counters.
  while read -r _ _ dev reads _ sectors_read read_ms writes _ sectors_written write_ms inflight io_ms weighted_ms _; do
    # Blank or truncated line
    [[ -n "$dev" ]] || continue

    # Skip loop, ram/zram and floppy pseudo devices
    [[ "$dev" =~ $pseudo_re ]] && continue

    # Skip partitions of the form <parent>p<N> (e.g. nvme0n1p1, mmcblk0p2)
    if [[ "$dev" =~ $p_part_re ]]; then
      parent="${BASH_REMATCH[1]}"
      [[ -n "${known[$parent]+x}" ]] && continue
    fi

    # Skip partitions of the form <parent><N> (e.g. sda1, vdb12)
    if [[ "$dev" =~ $plain_part_re ]]; then
      parent="${BASH_REMATCH[1]}"
      [[ -n "${known[$parent]+x}" ]] && continue
    fi

    # Apply user filter if set
    if [[ -n "$user_filter" ]] && ! [[ "$dev" =~ $user_filter ]]; then
      continue
    fi

    _snapshot["${dev}_reads"]="$reads"
    _snapshot["${dev}_sectors_read"]="$sectors_read"
    _snapshot["${dev}_read_ms"]="$read_ms"
    _snapshot["${dev}_writes"]="$writes"
    _snapshot["${dev}_sectors_written"]="$sectors_written"
    _snapshot["${dev}_write_ms"]="$write_ms"
    _snapshot["${dev}_inflight"]="$inflight"
    _snapshot["${dev}_io_ms"]="$io_ms"
    _snapshot["${dev}_weighted_ms"]="$weighted_ms"
  done < "$stats_file"
}
|
||||
|
||||
# ── Collect device list from a snapshot ─────────────────────────────
|
||||
|
||||
# Print the sorted device names contained in a snapshot, one per line.
# A device is recognised by its "<dev>_reads" key; all other keys are ignored.
# Arguments: $1 - name of the snapshot associative array
get_devices() {
  local -n _table=$1
  local entry
  for entry in "${!_table[@]}"; do
    [[ "$entry" == *_reads ]] || continue
    printf '%s\n' "${entry%_reads}"
  done | sort
}
|
||||
|
||||
# ── Metrics Collection ─────────────────────────────────────────────
|
||||
|
||||
# Run awk in the C locale so numbers are always parsed and printed with "."
# as the decimal separator, independent of the caller's locale.
_disk_io_awk() {
  LC_ALL=C awk "$@"
}

#######################################
# Take two snapshots SAMPLE_INTERVAL seconds apart and print Prometheus
# gauges (IOPS, throughput, latency, utilization, queue depth) per disk,
# followed by exporter metadata metrics.
#
# Behaviour worth knowing:
#   - SAMPLE_INTERVAL must be a positive decimal number; anything else is
#     rejected up front instead of reaching sleep/awk.
#   - All values are handed to awk with -v, never spliced into program text.
#   - A disk that is missing from the first snapshot has no baseline and is
#     skipped for this run (otherwise its lifetime counters would be reported
#     as a one-interval rate).
#   - Counter deltas are clamped at zero so a counter reset between samples
#     cannot produce negative rates.
#
# Globals:   SAMPLE_INTERVAL (read)
# Calls:     take_snapshot, get_devices, debug_echo, log_error
# Outputs:   metrics in Prometheus text format on stdout
# Returns:   1 if SAMPLE_INTERVAL is invalid, 0 otherwise
#######################################
collect_metrics() {
  local start_time
  start_time=$(date +%s%N)

  local interval="$SAMPLE_INTERVAL"
  local -r number_re='^[0-9]+([.][0-9]+)?$'
  # A well-formed number is positive exactly when it contains a non-zero digit.
  if ! [[ "$interval" =~ $number_re && "$interval" =~ [1-9] ]]; then
    log_error "SAMPLE_INTERVAL must be a positive number of seconds (got '${interval}')"
    return 1
  fi

  local -A snap1=() snap2=()

  # First snapshot
  take_snapshot snap1
  debug_echo "First snapshot taken"

  sleep "$interval"

  # Second snapshot
  take_snapshot snap2
  debug_echo "Second snapshot taken after ${interval}s interval"

  # Keep only disks that are present in both snapshots.
  local candidates dev
  local -a devices=()
  candidates=$(get_devices snap2)
  while read -r dev; do
    [[ -n "$dev" ]] || continue
    if [[ -z "${snap1[${dev}_reads]+seen}" ]]; then
      debug_echo "$dev has no baseline sample; skipped until the next run"
      continue
    fi
    devices+=("$dev")
  done <<< "$candidates"

  if (( ${#devices[@]} == 0 )); then
    log_error "No disks found after filtering"
    echo "# No disks found"
    return 0
  fi

  local first second delta value

  # ── HELP/TYPE headers and metric values ──

  echo "# HELP linux_disk_io_read_iops Read operations per second"
  echo "# TYPE linux_disk_io_read_iops gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_reads]:-0}"
    second="${snap2[${dev}_reads]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" 'BEGIN { printf "%.2f", d / i }')
    echo "linux_disk_io_read_iops{disk=\"${dev}\"} ${value}"
    debug_echo "$dev read_iops=$value"
  done

  echo "# HELP linux_disk_io_write_iops Write operations per second"
  echo "# TYPE linux_disk_io_write_iops gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_writes]:-0}"
    second="${snap2[${dev}_writes]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" 'BEGIN { printf "%.2f", d / i }')
    echo "linux_disk_io_write_iops{disk=\"${dev}\"} ${value}"
    debug_echo "$dev write_iops=$value"
  done

  echo "# HELP linux_disk_io_read_bytes_per_sec Bytes read per second"
  echo "# TYPE linux_disk_io_read_bytes_per_sec gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_sectors_read]:-0}"
    second="${snap2[${dev}_sectors_read]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    # Each sector is 512 bytes
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" 'BEGIN { printf "%.2f", (d * 512) / i }')
    echo "linux_disk_io_read_bytes_per_sec{disk=\"${dev}\"} ${value}"
    debug_echo "$dev read_bytes_per_sec=$value"
  done

  echo "# HELP linux_disk_io_write_bytes_per_sec Bytes written per second"
  echo "# TYPE linux_disk_io_write_bytes_per_sec gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_sectors_written]:-0}"
    second="${snap2[${dev}_sectors_written]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" 'BEGIN { printf "%.2f", (d * 512) / i }')
    echo "linux_disk_io_write_bytes_per_sec{disk=\"${dev}\"} ${value}"
    debug_echo "$dev write_bytes_per_sec=$value"
  done

  echo "# HELP linux_disk_io_await_ms Average I/O latency in milliseconds"
  echo "# TYPE linux_disk_io_await_ms gauge"
  local read_ops write_ops read_time write_time total_ops total_ms
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_reads]:-0}"
    second="${snap2[${dev}_reads]:-0}"
    read_ops=$(( second > first ? second - first : 0 ))

    first="${snap1[${dev}_writes]:-0}"
    second="${snap2[${dev}_writes]:-0}"
    write_ops=$(( second > first ? second - first : 0 ))

    first="${snap1[${dev}_read_ms]:-0}"
    second="${snap2[${dev}_read_ms]:-0}"
    read_time=$(( second > first ? second - first : 0 ))

    first="${snap1[${dev}_write_ms]:-0}"
    second="${snap2[${dev}_write_ms]:-0}"
    write_time=$(( second > first ? second - first : 0 ))

    total_ops=$(( read_ops + write_ops ))
    total_ms=$(( read_time + write_time ))

    # No completed I/O in the interval means no latency to report.
    if (( total_ops > 0 )); then
      value=$(_disk_io_awk -v ms="$total_ms" -v ops="$total_ops" 'BEGIN { printf "%.2f", ms / ops }')
    else
      value="0.00"
    fi
    echo "linux_disk_io_await_ms{disk=\"${dev}\"} ${value}"
    debug_echo "$dev await_ms=$value"
  done

  echo "# HELP linux_disk_io_util_percent Disk utilization percentage"
  echo "# TYPE linux_disk_io_util_percent gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_io_ms]:-0}"
    second="${snap2[${dev}_io_ms]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    # io_ms is milliseconds spent doing I/O; interval is in seconds.
    # Capped at 100 because the real gap between samples is slightly longer
    # than the nominal interval.
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" \
      'BEGIN { v = (d / (i * 1000)) * 100; if (v > 100) v = 100; printf "%.2f", v }')
    echo "linux_disk_io_util_percent{disk=\"${dev}\"} ${value}"
    debug_echo "$dev util_percent=$value"
  done

  echo "# HELP linux_disk_io_queue_depth Weighted number of I/Os in progress (avgqu-sz)"
  echo "# TYPE linux_disk_io_queue_depth gauge"
  for dev in "${devices[@]}"; do
    first="${snap1[${dev}_weighted_ms]:-0}"
    second="${snap2[${dev}_weighted_ms]:-0}"
    delta=$(( second > first ? second - first : 0 ))
    value=$(_disk_io_awk -v d="$delta" -v i="$interval" 'BEGIN { printf "%.2f", d / (i * 1000) }')
    echo "linux_disk_io_queue_depth{disk=\"${dev}\"} ${value}"
    debug_echo "$dev queue_depth=$value"
  done

  # ── Script metadata metrics ──

  local end_time elapsed_ns runtime
  end_time=$(date +%s%N)
  # Subtract as integers: nanosecond timestamps exceed what awk's floating
  # point can represent exactly.
  elapsed_ns=$(( end_time - start_time ))
  runtime=$(_disk_io_awk -v ns="$elapsed_ns" 'BEGIN { printf "%.3f", ns / 1000000000 }')

  echo ""
  echo "# HELP linux_disk_io_exporter_duration_seconds Script execution time"
  echo "# TYPE linux_disk_io_exporter_duration_seconds gauge"
  echo "linux_disk_io_exporter_duration_seconds ${runtime}"

  echo "# HELP linux_disk_io_exporter_last_run_timestamp Last successful run"
  echo "# TYPE linux_disk_io_exporter_last_run_timestamp gauge"
  echo "linux_disk_io_exporter_last_run_timestamp $(date +%s)"

  echo "# HELP linux_disk_io_exporter_success Whether the exporter ran successfully"
  echo "# TYPE linux_disk_io_exporter_success gauge"
  echo "linux_disk_io_exporter_success 1"
}
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────
|
||||
|
||||
#######################################
# Entry point: parse the command line, then either print the metrics
# (--dry-run) or write them to OUTPUT_FILE via TMP_FILE.
# Globals:   DRY_RUN, DEBUG (written); NODE_DIR, TMP_FILE, OUTPUT_FILE (read)
# Arguments: the script's command-line options
# Exits:     1 on an unknown option, when /proc/diskstats is missing, or when
#            NODE_DIR is not a directory; 0 after a dry run
#######################################
main() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run)
        DRY_RUN=true
        ;;
      --debug)
        DEBUG=1
        ;;
      -h | --help)
        show_help
        ;;
      -v | --version)
        show_version
        ;;
      *)
        log_error "Unknown option: $arg"
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done

  [[ -f /proc/diskstats ]] || {
    log_error "/proc/diskstats not found — this script requires a Linux system"
    exit 1
  }

  # Dry run: metrics go to stdout and nothing is written to disk.
  if [[ "$DRY_RUN" == true ]]; then
    collect_metrics
    exit 0
  fi

  [[ -d "$NODE_DIR" ]] || {
    log_error "Textfile collector directory does not exist: $NODE_DIR"
    exit 1
  }

  # Write to the temporary file first and rename it into place afterwards, so
  # OUTPUT_FILE is only replaced once the metrics have been fully written.
  collect_metrics > "$TMP_FILE"
  chmod 644 "$TMP_FILE"
  mv -f "$TMP_FILE" "$OUTPUT_FILE"
  debug_echo "Metrics written to $OUTPUT_FILE"
}
|
||||
|
||||
# Run the exporter, forwarding all command-line arguments unchanged.
main "$@"
|
||||
Reference in New Issue
Block a user