#!/bin/bash
################################################################################
# Script Name: memory-pressure-exporter.sh
# Version:     1.0
# Description: Prometheus exporter for memory and swap pressure metrics.
#              Exports PSI stall information, OOM kill events, swap activity
#              rates, NUMA memory balance, slab pressure, transparent hugepage
#              stats, and zone watermark proximity.
#
# Author:  Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - Standard Unix tools (awk, grep, cat)
#   - netcat (nc) for HTTP mode
#   - Optional: journalctl (OOM tracking), kernel 4.20+ (PSI),
#     multi-node NUMA. Each section is skipped gracefully if unavailable.
#
# Usage:
#   # Output to stdout
#   ./memory-pressure-exporter.sh
#
#   # HTTP server mode
#   ./memory-pressure-exporter.sh --http -p 9198
#
#   # Textfile collector mode
#   ./memory-pressure-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - memory_pressure_up                              - Exporter status (1=up, 0=down)
#     - memory_pressure_exporter_info{version}          - Exporter version
#
#   PSI Memory (if /proc/pressure/memory exists):
#     - memory_pressure_psi_some_avg10                  - PSI memory some avg10
#     - memory_pressure_psi_some_avg60                  - PSI memory some avg60
#     - memory_pressure_psi_some_avg300                 - PSI memory some avg300
#     - memory_pressure_psi_some_total_microseconds     - PSI memory some total
#     - memory_pressure_psi_full_avg10                  - PSI memory full avg10
#     - memory_pressure_psi_full_avg60                  - PSI memory full avg60
#     - memory_pressure_psi_full_avg300                 - PSI memory full avg300
#     - memory_pressure_psi_full_total_microseconds     - PSI memory full total
#
#   PSI I/O (if /proc/pressure/io exists):
#     - memory_pressure_psi_io_some_avg10               - PSI I/O some avg10
#     - memory_pressure_psi_io_some_avg60               - PSI I/O some avg60
#     - memory_pressure_psi_io_some_avg300              - PSI I/O some avg300
#     - memory_pressure_psi_io_some_total_microseconds  - PSI I/O some total
#     - memory_pressure_psi_io_full_avg10               - PSI I/O full avg10
#     - memory_pressure_psi_io_full_avg60               - PSI I/O full avg60
#     - memory_pressure_psi_io_full_avg300              - PSI I/O full avg300
#     - memory_pressure_psi_io_full_total_microseconds  - PSI I/O full total
#
#   OOM Kills (if journalctl available):
#     - memory_pressure_oom_kills_24h                   - OOM kills in last 24 hours
#     - memory_pressure_oom_last_kill_timestamp         - Unix timestamp of last OOM
#     - memory_pressure_oom_last_victim{process}        - Last killed process (1)
#
#   Swap Activity:
#     - memory_pressure_swap_in_pages_per_sec           - Swap in pages/sec
#     - memory_pressure_swap_out_pages_per_sec          - Swap out pages/sec
#     - memory_pressure_swap_in_bytes_per_sec           - Swap in bytes/sec
#     - memory_pressure_swap_out_bytes_per_sec          - Swap out bytes/sec
#
#   NUMA (if multi-node):
#     - memory_pressure_numa_total_bytes{node}          - Total memory per node
#     - memory_pressure_numa_free_bytes{node}           - Free memory per node
#     - memory_pressure_numa_used_percent{node}         - Usage percentage per node
#
#   Transparent Hugepages:
#     - memory_pressure_thp_fault_alloc_total           - THP fault allocations
#     - memory_pressure_thp_collapse_alloc_total        - THP collapse allocations
#     - memory_pressure_thp_fault_fallback_total        - THP fault fallbacks
#     - memory_pressure_compact_stall_total             - Compaction stalls
#
#   Slab:
#     - memory_pressure_slab_reclaimable_bytes          - Reclaimable slab
#     - memory_pressure_slab_unreclaimable_bytes        - Unreclaimable slab
#     - memory_pressure_slab_total_bytes                - Total slab
#     - memory_pressure_slab_unreclaimable_percent      - Unreclaimable percentage
#
#   Zone Watermarks:
#     - memory_pressure_zone_free_pages{zone}           - Current free pages
#     - memory_pressure_zone_min_pages{zone}            - Min watermark
#     - memory_pressure_zone_low_pages{zone}            - Low watermark
#     - memory_pressure_zone_high_pages{zone}           - High watermark
#     - memory_pressure_zone_free_above_low{zone}       - 1 if free > low
#
#   Exporter:
#     - memory_pressure_exporter_duration_seconds       - Script execution time
#     - memory_pressure_exporter_last_run_timestamp     - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9198
#   Textfile directory: /var/lib/node_exporter
#   SAMPLE_INTERVAL: seconds between swap activity samples (default: 1)
#
################################################################################
# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Directory used for the metrics file when --textfile is given.
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file; empty means "write metrics to stdout".
OUTPUT_FILE=""
# Set to true by --http to serve metrics over HTTP.
HTTP_MODE=false
# Listening port for HTTP mode (overridden by -p/--port).
HTTP_PORT=9198
# Seconds between the two swap-counter samples; may come from the environment.
SAMPLE_INTERVAL="${SAMPLE_INTERVAL:-1}"

# Check that a value is a strictly positive decimal number (e.g. 1, 0.5, .25).
# Args: $1 - candidate value
# Returns: 0 if valid, 1 otherwise
_is_positive_number() {
  case "$1" in
    '' | *[!0-9.]* | *.*.*) return 1 ;; # empty, foreign characters, or two dots
    *[1-9]*) return 0 ;;                # at least one non-zero digit
    *) return 1 ;;                      # zeros and/or a lone dot
  esac
}

# SAMPLE_INTERVAL is passed to sleep and used as a divisor for the swap rates,
# so anything that is not a positive number is replaced by the default.
if ! _is_positive_number "$SAMPLE_INTERVAL"; then
  echo "WARNING: invalid SAMPLE_INTERVAL '$SAMPLE_INTERVAL', using 1" >&2
  SAMPLE_INTERVAL=1
fi
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help to stdout and terminate the script.
# Globals: HTTP_PORT (read)
# Outputs: usage text on stdout
# Returns: never returns; exits with status 0
show_usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Export memory and swap pressure statistics as Prometheus metrics (v1.0).

MODES:
    --textfile          Write to node_exporter textfile collector
    --http              Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port PORT     HTTP port (default: 9198)
    -o, --output FILE   Output file path
    -h, --help          Show this help and exit

ENVIRONMENT:
    SAMPLE_INTERVAL     Seconds between swap activity samples (default: 1)

EXAMPLES:
    $0 --textfile                      # Write to textfile collector
    $0 --http --port 9198              # Run HTTP server
    $0 -o /tmp/memory_pressure.prom    # Write to custom file

SECTIONS (auto-detected, skipped if unavailable):
    - PSI memory and I/O pressure (requires kernel 4.20+)
    - OOM kill tracking (requires journalctl)
    - Swap activity rates (always available)
    - NUMA memory balance (requires multi-node system)
    - Transparent hugepage stats (always available)
    - Slab pressure (always available)
    - Zone watermark proximity (always available)

EOF
  exit 0
}
# Parse command-line options into the global configuration variables.
# Globals: TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written)
# Arguments: the script's positional parameters
# Returns: 0 on success; exits 1 on an unknown option, a missing option
#          value, or an invalid port
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h | --help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/memory_pressure.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p | --port)
        # Without this check "shift 2" fails when the value is missing,
        # nothing is shifted, and the loop never terminates.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        if ! [[ "$2" =~ ^[0-9]{1,5}$ ]] || ((10#$2 < 1 || 10#$2 > 65535)); then
          echo "Invalid port: $2" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      -o | --output)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
# ============================================================================
# PSI PRESSURE
# ============================================================================

# Parse a PSI file (/proc/pressure/memory or /proc/pressure/io).
# Args: $1 - file path
# Output: lines of "type avg10 avg60 avg300 total"
#         where type is "some" or "full"
# Returns: 1 if the file is missing or unreadable, otherwise awk's status
get_psi_stats() {
  local psi_file="$1"
  [[ -f "$psi_file" && -r "$psi_file" ]] || return 1

  # Only "some"/"full" records are emitted; blank or unexpected lines would
  # otherwise yield an empty type and malformed metric names downstream.
  # Read errors are silenced.
  # NOTE(review): a PSI file may exist yet fail to read when PSI is disabled
  # at boot - confirm against the target kernels.
  awk '
    $1 == "some" || $1 == "full" {
      avg10 = avg60 = avg300 = total = 0
      for (i = 2; i <= NF; i++) {
        if (split($i, kv, "=") != 2) continue
        if (kv[1] == "avg10") avg10 = kv[2]
        else if (kv[1] == "avg60") avg60 = kv[2]
        else if (kv[1] == "avg300") avg300 = kv[2]
        else if (kv[1] == "total") total = kv[2]
      }
      print $1, avg10, avg60, avg300, total
    }
  ' "$psi_file" 2>/dev/null
}
# ============================================================================
# OOM KILL TRACKING
# ============================================================================

# Get OOM kill count in last 24 hours.
# Matches the kernel's "Killed process" line, the same pattern
# get_oom_last_victim uses. The previous case-sensitive "Out of memory"
# pattern skipped cgroup kills ("Memory cgroup out of memory: Killed ...").
# Returns: count on stdout (0 when journalctl is unavailable)
get_oom_kill_count() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi

  local count
  # grep -c prints 0 (and exits non-zero) when nothing matches; only the
  # printed number matters here.
  count=$(journalctl -k --grep="Killed process" --since "24 hours ago" \
    --no-pager -q 2>/dev/null | grep -c -- "Killed process")
  echo "${count:-0}"
}
# Get last OOM kill timestamp (unix epoch) within the last 24 hours.
# Uses the "Killed process" pattern so that it covers the same events as
# get_oom_last_victim, including cgroup OOM kills.
# Returns: integer timestamp on stdout, or 0 if none / journalctl unavailable
get_oom_last_timestamp() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi

  local last_line epoch
  last_line=$(journalctl -k --grep="Killed process" --since "24 hours ago" \
    --no-pager -q -o short-unix 2>/dev/null | tail -n 1)

  # With -o short-unix the first field is "seconds.microseconds".
  read -r epoch _ <<< "$last_line"
  epoch=${epoch%%.*}

  # Anything that is not a plain integer (empty output, notices) means "none".
  if [[ "$epoch" =~ ^[0-9]+$ ]]; then
    echo "$epoch"
  else
    echo "0"
  fi
}
# Get last OOM victim process name within the last 24 hours.
# Extraction uses sed rather than "grep -P", so no PCRE-enabled grep is needed.
# Returns: process name on stdout, or nothing if none / journalctl unavailable
get_oom_last_victim() {
  if ! command -v journalctl >/dev/null 2>&1; then
    return
  fi

  # Kernel line: "... Killed process <pid> (<name>) ..."; capture <name> up to
  # the first closing parenthesis and keep the most recent match.
  journalctl -k --grep="Killed process" --since "24 hours ago" --no-pager -q 2>/dev/null \
    | sed -n 's/.*Killed process [0-9][0-9]* (\([^)][^)]*\).*/\1/p' \
    | tail -n 1
}
# ============================================================================
# SWAP ACTIVITY
# ============================================================================

# Read the cumulative swap-in/swap-out page counters.
# Args: $1 - vmstat-format file to read (optional, default /proc/vmstat)
# Returns: "pswpin pswpout" on stdout; a counter that is not present in the
#          file is reported as 0 instead of an empty field
get_swap_counters() {
  local vmstat_file="${1:-/proc/vmstat}"

  # Values are passed through as read, not converted to numbers.
  awk '
    BEGIN { pin = 0; pout = 0 }
    $1 == "pswpin" { pin = $2 }
    $1 == "pswpout" { pout = $2 }
    END { print pin, pout }
  ' "$vmstat_file" 2>/dev/null
}
# ============================================================================
# NUMA MEMORY
# ============================================================================

# Check if system has multiple NUMA nodes.
# Counts node directories instead of probing for "node1", so sparse numbering
# (for example node0 and node2 only) is still recognised as multi-node.
# Args: $1 - sysfs node directory (optional, default /sys/devices/system/node)
# Returns: 0 (true) if at least two nodes exist, 1 (false) otherwise
is_numa_multi_node() {
  local node_dir="${1:-/sys/devices/system/node}"
  local node_path
  local node_count=0

  for node_path in "$node_dir"/node[0-9]*; do
    # An unmatched glob stays literal; the -d test filters that out too.
    [[ -d "$node_path" ]] || continue
    node_count=$((node_count + 1))
    if ((node_count >= 2)); then
      return 0
    fi
  done
  return 1
}
# Get NUMA memory info per node.
# Args: $1 - sysfs node directory (optional, default /sys/devices/system/node)
# Output: lines of "nodeN total_kb free_kb" (0 for a value that is not found)
# Returns: 1 if the node directory does not exist
get_numa_memory() {
  local node_dir="${1:-/sys/devices/system/node}"
  [[ -d "$node_dir" ]] || return 1

  # Loop variables are local so they no longer leak into the global scope.
  local node_path meminfo
  for node_path in "$node_dir"/node[0-9]*; do
    [[ -d "$node_path" ]] || continue
    meminfo="$node_path/meminfo"
    [[ -f "$meminfo" ]] || continue

    # Per-node meminfo lines look like "Node 0 MemTotal: 16316412 kB".
    # A single awk pass replaces the former basename + two awk invocations.
    awk -v node="${node_path##*/}" '
      BEGIN { total = 0; free = 0 }
      $3 == "MemTotal:" { total = $4 }
      $3 == "MemFree:" { free = $4 }
      END { print node, total, free }
    ' "$meminfo" 2>/dev/null
  done
}
# ============================================================================
# TRANSPARENT HUGEPAGES & COMPACTION
# ============================================================================

# Get THP and compaction counters.
# Args: $1 - vmstat-format file to read (optional, default /proc/vmstat)
# Returns: "thp_fault_alloc thp_collapse_alloc thp_fault_fallback compact_stall"
#          on stdout; counters missing from the file are reported as 0
get_thp_stats() {
  local vmstat_file="${1:-/proc/vmstat}"

  # Exact name comparison keeps longer names such as
  # thp_fault_fallback_charge from being picked up.
  awk '
    BEGIN { fault = 0; collapse = 0; fallback = 0; stall = 0 }
    $1 == "thp_fault_alloc" { fault = $2 }
    $1 == "thp_collapse_alloc" { collapse = $2 }
    $1 == "thp_fault_fallback" { fallback = $2 }
    $1 == "compact_stall" { stall = $2 }
    END { print fault, collapse, fallback, stall }
  ' "$vmstat_file" 2>/dev/null
}
# ============================================================================
# SLAB MEMORY
# ============================================================================

# Get slab memory figures.
# Args: $1 - meminfo-format file to read (optional, default /proc/meminfo)
# Returns: "reclaimable_kb unreclaimable_kb" on stdout; a value missing from
#          the file is reported as 0
get_slab_stats() {
  local meminfo_file="${1:-/proc/meminfo}"

  awk '
    BEGIN { reclaimable = 0; unreclaimable = 0 }
    $1 == "SReclaimable:" { reclaimable = $2 }
    $1 == "SUnreclaim:" { unreclaimable = $2 }
    END { print reclaimable, unreclaimable }
  ' "$meminfo_file" 2>/dev/null
}
# ============================================================================
# ZONE WATERMARKS
# ============================================================================

# Parse zoneinfo for the Normal and DMA32 zones.
# On multi-node systems every node has its own zone of the same name; the
# values are summed per zone name so each zone is reported exactly once
# (one line per node would yield duplicate {zone="..."} series).
# Args: $1 - zoneinfo-format file to read (optional, default /proc/zoneinfo)
# Output: lines of "zone free min low high", in first-seen order
get_zone_watermarks() {
  local zoneinfo_file="${1:-/proc/zoneinfo}"

  awk '
    /^Node [0-9]+, zone +[A-Za-z0-9]+/ {
      zone = $NF
      wanted = (zone == "Normal" || zone == "DMA32")
      # Reset so values never carry over from a previous zone.
      free = 0; min_wm = 0; low_wm = 0
      next
    }
    !wanted { next }
    $1 == "pages" && $2 == "free" { free = $3; next }
    $1 == "min" { min_wm = $2; next }
    $1 == "low" { low_wm = $2; next }
    $1 == "high" {
      if (!(zone in seen)) {
        seen[zone] = 1
        order[++count] = zone
      }
      sum_free[zone] += free
      sum_min[zone] += min_wm
      sum_low[zone] += low_wm
      sum_high[zone] += $2
      # "high" is the last field needed; ignore the rest of this zone.
      wanted = 0
    }
    END {
      for (i = 1; i <= count; i++) {
        z = order[i]
        print z, sum_free[z] + 0, sum_min[z] + 0, sum_low[z] + 0, sum_high[z] + 0
      }
    }
  ' "$zoneinfo_file" 2>/dev/null
}
# ============================================================================
# METRICS GENERATION
# ============================================================================

# Escape a string for use inside a Prometheus label value.
# Args: $1 - raw value
# Output: the value with backslash, double quote and newline escaped
_escape_label_value() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

# Emit the avg10/avg60/avg300/total metrics for one PSI resource.
# Args: $1 - metric name prefix (e.g. memory_pressure_psi)
#       $2 - resource name used in the HELP text (e.g. memory)
#       $3 - lines of "type avg10 avg60 avg300 total"
_emit_psi_metrics() {
  local prefix="$1" label="$2" psi_data="$3"
  local type avg10 avg60 avg300 total window

  [[ -n "$psi_data" ]] || return 0

  while read -r type avg10 avg60 avg300 total; do
    [[ -n "$type" ]] || continue
    for window in avg10 avg60 avg300; do
      echo "# HELP ${prefix}_${type}_${window} PSI ${label} ${type} ${window} percentage"
      echo "# TYPE ${prefix}_${type}_${window} gauge"
      # ${!window} reads the variable whose name is stored in $window.
      echo "${prefix}_${type}_${window} ${!window}"
      echo ""
    done
    echo "# HELP ${prefix}_${type}_total_microseconds PSI ${label} ${type} total stall time in microseconds"
    echo "# TYPE ${prefix}_${type}_total_microseconds counter"
    echo "${prefix}_${type}_total_microseconds $total"
    echo ""
  done <<< "$psi_data"
}

# Emit the per-node NUMA metrics, one HELP/TYPE header per metric family.
# Args: $1 - lines of "nodeN total_kb free_kb"
_emit_numa_metrics() {
  local numa_data="$1"
  local node total_kb free_kb used_pct
  local total_lines="" free_lines="" pct_lines=""

  while read -r node total_kb free_kb; do
    [[ -n "$node" ]] || continue
    [[ "$total_kb" =~ ^[0-9]+$ ]] || total_kb=0
    [[ "$free_kb" =~ ^[0-9]+$ ]] || free_kb=0
    if ((total_kb > 0)); then
      used_pct=$(awk -v total="$total_kb" -v free="$free_kb" \
        'BEGIN { printf "%.2f", ((total - free) / total) * 100 }')
    else
      used_pct="0.00"
    fi
    total_lines+="memory_pressure_numa_total_bytes{node=\"$node\"} $((total_kb * 1024))"$'\n'
    free_lines+="memory_pressure_numa_free_bytes{node=\"$node\"} $((free_kb * 1024))"$'\n'
    pct_lines+="memory_pressure_numa_used_percent{node=\"$node\"} $used_pct"$'\n'
  done <<< "$numa_data"

  [[ -n "$total_lines" ]] || return 0

  # The exposition format allows a single HELP/TYPE line per metric name, so
  # the headers are printed once and followed by every node's sample.
  echo "# HELP memory_pressure_numa_total_bytes Total memory per NUMA node in bytes"
  echo "# TYPE memory_pressure_numa_total_bytes gauge"
  printf '%s\n' "$total_lines"
  echo "# HELP memory_pressure_numa_free_bytes Free memory per NUMA node in bytes"
  echo "# TYPE memory_pressure_numa_free_bytes gauge"
  printf '%s\n' "$free_lines"
  echo "# HELP memory_pressure_numa_used_percent Memory usage percentage per NUMA node"
  echo "# TYPE memory_pressure_numa_used_percent gauge"
  printf '%s\n' "$pct_lines"
}

# Emit the zone watermark metrics, one HELP/TYPE header per metric family.
# Args: $1 - lines of "zone free min low high"
_emit_zone_metrics() {
  local zone_data="$1"
  local zone free min_wm low_wm high_wm above_low
  local free_lines="" min_lines="" low_lines="" high_lines="" above_lines=""

  while read -r zone free min_wm low_wm high_wm; do
    [[ -n "$zone" ]] || continue
    [[ "$free" =~ ^[0-9]+$ ]] || free=0
    [[ "$min_wm" =~ ^[0-9]+$ ]] || min_wm=0
    [[ "$low_wm" =~ ^[0-9]+$ ]] || low_wm=0
    [[ "$high_wm" =~ ^[0-9]+$ ]] || high_wm=0
    above_low=1
    if ((free <= low_wm)); then
      above_low=0
    fi
    free_lines+="memory_pressure_zone_free_pages{zone=\"$zone\"} $free"$'\n'
    min_lines+="memory_pressure_zone_min_pages{zone=\"$zone\"} $min_wm"$'\n'
    low_lines+="memory_pressure_zone_low_pages{zone=\"$zone\"} $low_wm"$'\n'
    high_lines+="memory_pressure_zone_high_pages{zone=\"$zone\"} $high_wm"$'\n'
    above_lines+="memory_pressure_zone_free_above_low{zone=\"$zone\"} $above_low"$'\n'
  done <<< "$zone_data"

  [[ -n "$free_lines" ]] || return 0

  echo "# HELP memory_pressure_zone_free_pages Current free pages per zone"
  echo "# TYPE memory_pressure_zone_free_pages gauge"
  printf '%s\n' "$free_lines"
  echo "# HELP memory_pressure_zone_min_pages Min watermark pages per zone"
  echo "# TYPE memory_pressure_zone_min_pages gauge"
  printf '%s\n' "$min_lines"
  echo "# HELP memory_pressure_zone_low_pages Low watermark pages per zone"
  echo "# TYPE memory_pressure_zone_low_pages gauge"
  printf '%s\n' "$low_lines"
  echo "# HELP memory_pressure_zone_high_pages High watermark pages per zone"
  echo "# TYPE memory_pressure_zone_high_pages gauge"
  printf '%s\n' "$high_lines"
  echo "# HELP memory_pressure_zone_free_above_low Whether free pages are above the low watermark (1=above, 0=below)"
  echo "# TYPE memory_pressure_zone_free_above_low gauge"
  printf '%s\n' "$above_lines"
}

# Collect every section and print the metrics in Prometheus text format.
# Globals: SAMPLE_INTERVAL (read)
# Outputs: metrics on stdout
# Returns: 0
# Note: sleeps for SAMPLE_INTERVAL seconds between the two swap samples.
generate_metrics() {
  local script_start
  script_start=$(date +%s)

  # ========================================================================
  # Exporter Status
  # ========================================================================
  cat <<EOF
# HELP memory_pressure_up Exporter status (1=up)
# TYPE memory_pressure_up gauge
memory_pressure_up 1

# HELP memory_pressure_exporter_info Exporter version information
# TYPE memory_pressure_exporter_info gauge
memory_pressure_exporter_info{version="1.0"} 1

EOF

  # ========================================================================
  # PSI Memory and I/O Pressure
  # ========================================================================
  if [ -f /proc/pressure/memory ]; then
    _emit_psi_metrics "memory_pressure_psi" "memory" \
      "$(get_psi_stats /proc/pressure/memory)"
  fi
  if [ -f /proc/pressure/io ]; then
    _emit_psi_metrics "memory_pressure_psi_io" "I/O" \
      "$(get_psi_stats /proc/pressure/io)"
  fi

  # ========================================================================
  # OOM Kill Events
  # ========================================================================
  local oom_count oom_timestamp oom_victim
  oom_count=$(get_oom_kill_count)
  oom_timestamp=$(get_oom_last_timestamp)
  oom_victim=$(get_oom_last_victim)

  cat <<EOF
# HELP memory_pressure_oom_kills_24h OOM kills in the last 24 hours
# TYPE memory_pressure_oom_kills_24h gauge
memory_pressure_oom_kills_24h ${oom_count:-0}

# HELP memory_pressure_oom_last_kill_timestamp Unix timestamp of last OOM kill (0 if none)
# TYPE memory_pressure_oom_last_kill_timestamp gauge
memory_pressure_oom_last_kill_timestamp ${oom_timestamp:-0}

EOF

  if [ -n "$oom_victim" ]; then
    # Escape the name so quotes or backslashes cannot break the label syntax.
    oom_victim=$(_escape_label_value "$oom_victim")
    cat <<EOF
# HELP memory_pressure_oom_last_victim Last OOM-killed process (value is always 1)
# TYPE memory_pressure_oom_last_victim gauge
memory_pressure_oom_last_victim{process="$oom_victim"} 1

EOF
  fi

  # ========================================================================
  # Swap Activity (two-sample delta)
  # ========================================================================
  # The interval is a divisor and a sleep argument; fall back to 1 second
  # unless it is a strictly positive decimal number.
  local interval="$SAMPLE_INTERVAL"
  case "$interval" in
    '' | *[!0-9.]* | *.*.*) interval=1 ;;
    *[1-9]*) ;;
    *) interval=1 ;;
  esac

  # Ask the system for its page size; 4096 is only the fallback.
  local page_size
  page_size=$(getconf PAGESIZE 2>/dev/null)
  [[ "$page_size" =~ ^[0-9]+$ ]] || page_size=4096

  local swap1 swap2 pin1 pout1 pin2 pout2
  swap1=$(get_swap_counters)
  sleep "$interval"
  swap2=$(get_swap_counters)
  read -r pin1 pout1 <<< "$swap1"
  read -r pin2 pout2 <<< "$swap2"

  # Values reach awk through -v, never through the program text.
  local swap_rates swap_in_rate swap_out_rate swap_in_bytes swap_out_bytes
  swap_rates=$(awk -v in1="${pin1:-0}" -v in2="${pin2:-0}" \
    -v out1="${pout1:-0}" -v out2="${pout2:-0}" \
    -v interval="$interval" -v page="$page_size" 'BEGIN {
      printf "%.2f %.2f %.2f %.2f\n",
        (in2 - in1) / interval,
        (out2 - out1) / interval,
        ((in2 - in1) * page) / interval,
        ((out2 - out1) * page) / interval
    }')
  read -r swap_in_rate swap_out_rate swap_in_bytes swap_out_bytes <<< "$swap_rates"

  cat <<EOF
# HELP memory_pressure_swap_in_pages_per_sec Pages swapped in per second
# TYPE memory_pressure_swap_in_pages_per_sec gauge
memory_pressure_swap_in_pages_per_sec ${swap_in_rate:-0.00}

# HELP memory_pressure_swap_out_pages_per_sec Pages swapped out per second
# TYPE memory_pressure_swap_out_pages_per_sec gauge
memory_pressure_swap_out_pages_per_sec ${swap_out_rate:-0.00}

# HELP memory_pressure_swap_in_bytes_per_sec Bytes swapped in per second
# TYPE memory_pressure_swap_in_bytes_per_sec gauge
memory_pressure_swap_in_bytes_per_sec ${swap_in_bytes:-0.00}

# HELP memory_pressure_swap_out_bytes_per_sec Bytes swapped out per second
# TYPE memory_pressure_swap_out_bytes_per_sec gauge
memory_pressure_swap_out_bytes_per_sec ${swap_out_bytes:-0.00}

EOF

  # ========================================================================
  # NUMA Memory Balance
  # ========================================================================
  if is_numa_multi_node; then
    _emit_numa_metrics "$(get_numa_memory)"
  fi

  # ========================================================================
  # Transparent Hugepages & Compaction
  # ========================================================================
  local thp_stats
  thp_stats=$(get_thp_stats)

  if [ -n "$thp_stats" ]; then
    local thp_fault thp_collapse thp_fallback compact_stall
    read -r thp_fault thp_collapse thp_fallback compact_stall <<< "$thp_stats"

    cat <<EOF
# HELP memory_pressure_thp_fault_alloc_total THP fault allocations
# TYPE memory_pressure_thp_fault_alloc_total counter
memory_pressure_thp_fault_alloc_total ${thp_fault:-0}

# HELP memory_pressure_thp_collapse_alloc_total THP collapse allocations
# TYPE memory_pressure_thp_collapse_alloc_total counter
memory_pressure_thp_collapse_alloc_total ${thp_collapse:-0}

# HELP memory_pressure_thp_fault_fallback_total THP fault fallbacks to regular pages
# TYPE memory_pressure_thp_fault_fallback_total counter
memory_pressure_thp_fault_fallback_total ${thp_fallback:-0}

# HELP memory_pressure_compact_stall_total Memory compaction stalls
# TYPE memory_pressure_compact_stall_total counter
memory_pressure_compact_stall_total ${compact_stall:-0}

EOF
  fi

  # ========================================================================
  # Slab Memory
  # ========================================================================
  local slab_stats
  slab_stats=$(get_slab_stats)

  if [ -n "$slab_stats" ]; then
    local slab_reclaim_kb slab_unreclaim_kb
    read -r slab_reclaim_kb slab_unreclaim_kb <<< "$slab_stats"
    [[ "$slab_reclaim_kb" =~ ^[0-9]+$ ]] || slab_reclaim_kb=0
    [[ "$slab_unreclaim_kb" =~ ^[0-9]+$ ]] || slab_unreclaim_kb=0

    local slab_reclaim_bytes slab_unreclaim_bytes slab_total_bytes slab_unreclaim_pct
    slab_reclaim_bytes=$((slab_reclaim_kb * 1024))
    slab_unreclaim_bytes=$((slab_unreclaim_kb * 1024))
    slab_total_bytes=$((slab_reclaim_bytes + slab_unreclaim_bytes))

    if ((slab_total_bytes > 0)); then
      slab_unreclaim_pct=$(awk -v part="$slab_unreclaim_bytes" -v whole="$slab_total_bytes" \
        'BEGIN { printf "%.2f", (part / whole) * 100 }')
    else
      slab_unreclaim_pct="0.00"
    fi

    cat <<EOF
# HELP memory_pressure_slab_reclaimable_bytes Reclaimable slab memory in bytes
# TYPE memory_pressure_slab_reclaimable_bytes gauge
memory_pressure_slab_reclaimable_bytes $slab_reclaim_bytes

# HELP memory_pressure_slab_unreclaimable_bytes Unreclaimable slab memory in bytes
# TYPE memory_pressure_slab_unreclaimable_bytes gauge
memory_pressure_slab_unreclaimable_bytes $slab_unreclaim_bytes

# HELP memory_pressure_slab_total_bytes Total slab memory in bytes
# TYPE memory_pressure_slab_total_bytes gauge
memory_pressure_slab_total_bytes $slab_total_bytes

# HELP memory_pressure_slab_unreclaimable_percent Percentage of slab memory that is unreclaimable
# TYPE memory_pressure_slab_unreclaimable_percent gauge
memory_pressure_slab_unreclaimable_percent $slab_unreclaim_pct

EOF
  fi

  # ========================================================================
  # Zone Watermarks
  # ========================================================================
  _emit_zone_metrics "$(get_zone_watermarks)"

  # ========================================================================
  # Exporter Runtime
  # ========================================================================
  local script_end script_duration
  script_end=$(date +%s)
  script_duration=$((script_end - script_start))

  cat <<EOF
# HELP memory_pressure_exporter_duration_seconds Time to generate all metrics
# TYPE memory_pressure_exporter_duration_seconds gauge
memory_pressure_exporter_duration_seconds $script_duration

# HELP memory_pressure_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE memory_pressure_exporter_last_run_timestamp gauge
memory_pressure_exporter_last_run_timestamp $script_end
EOF

  echo ""
}
# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Answer one HTTP request: read it from stdin, write the response to stdout.
# "GET /metrics..." gets the metrics; every other request gets the index page.
_serve_http_request() {
  local request header

  # Block until the client sends its request line; no client, no response.
  IFS= read -r request || return 0
  request=${request%$'\r'}

  # Drain the remaining request headers up to the blank line.
  while IFS= read -r -t 5 header; do
    header=${header%$'\r'}
    [[ -n "$header" ]] || break
  done

  if [[ "$request" =~ ^GET\ /metrics ]]; then
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
    generate_metrics
  else
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
    cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Memory Pressure Exporter v1.0</title></head>
<body>
<h1>Memory Pressure Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>PSI memory and I/O pressure (requires kernel 4.20+)</li>
<li>OOM kill tracking (requires journalctl)</li>
<li>Swap activity rates</li>
<li>NUMA memory balance (requires multi-node system)</li>
<li>Transparent hugepage and compaction stats</li>
<li>Slab memory pressure</li>
<li>Zone watermark proximity</li>
</ul>
</body>
</html>
EOF
  fi
}

# Serve metrics over HTTP with netcat, one connection at a time, forever.
# Globals: HTTP_PORT (read)
# Returns: never returns normally; exits 1 if nc or the FIFO is unavailable
#
# nc's stdout feeds the request handler and the handler's output returns to
# nc through a FIFO. The earlier "{ read ...; } | nc -l" form read the request
# line from the script's own stdin instead of the socket, so /metrics could
# never be matched and the response was generated before a client connected.
run_http_server() {
  echo "Starting memory pressure exporter on port $HTTP_PORT..." >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  local fifo_dir fifo
  if ! fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/memory_pressure_http.XXXXXX"); then
    echo "ERROR: cannot create temporary directory for HTTP mode" >&2
    exit 1
  fi
  fifo="$fifo_dir/response"
  if ! mkfifo -m 600 "$fifo"; then
    rm -rf -- "$fifo_dir"
    echo "ERROR: cannot create FIFO $fifo" >&2
    exit 1
  fi

  # The path is expanded now on purpose: the local variable may be out of
  # scope by the time the trap runs.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  local nc_status
  while true; do
    # NOTE(review): "-l -p PORT -q 1" is kept from the original invocation;
    # other netcat variants may need different flags.
    nc -l -p "$HTTP_PORT" -q 1 < "$fifo" 2>/dev/null | _serve_http_request > "$fifo"
    nc_status=${PIPESTATUS[0]}
    # Avoid a busy loop when nc cannot start (e.g. the port is already in use).
    if ((nc_status != 0)); then
      sleep 1
    fi
  done
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: parse options, then serve over HTTP, write a metrics file
# atomically, or print the metrics to stdout.
# Globals: HTTP_MODE, OUTPUT_FILE (read)
# Arguments: the script's positional parameters
# Returns: exits 1 if the output directory, temp file or metrics cannot be
#          produced; otherwise the status of the selected mode
main() {
  parse_args "$@"

  if [ "$HTTP_MODE" = true ]; then
    run_http_server
    return
  fi

  if [ -z "$OUTPUT_FILE" ]; then
    generate_metrics
    return
  fi

  local output_dir temp_file file_lines
  output_dir="$(dirname -- "$OUTPUT_FILE")"
  if ! mkdir -p -- "$output_dir"; then
    echo "ERROR: Cannot create output directory $output_dir" >&2
    exit 1
  fi

  # Write to a temp file in the same directory so the final mv is a rename
  # and readers never see a half-written file.
  if ! temp_file=$(mktemp "${output_dir}/.memory_pressure_metrics.XXXXXX"); then
    echo "ERROR: Cannot create temporary file in $output_dir" >&2
    exit 1
  fi

  # Remove the temp file if the script is interrupted. The path is expanded
  # now because the local variable may be out of scope when the trap runs.
  # shellcheck disable=SC2064
  trap "rm -f -- $(printf '%q' "$temp_file")" EXIT

  if ! generate_metrics > "$temp_file" 2>/dev/null; then
    rm -f -- "$temp_file"
    echo "ERROR: Failed to generate metrics" >&2
    exit 1
  fi

  file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

  # A near-empty file means collection failed; keep the previous file.
  if [ "$file_lines" -lt 10 ]; then
    rm -f -- "$temp_file"
    echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
    exit 1
  fi

  chmod 644 "$temp_file"
  if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
    rm -f -- "$temp_file"
    echo "ERROR: Cannot write $OUTPUT_FILE" >&2
    exit 1
  fi
  trap - EXIT

  echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}
# Run the exporter with the script's command-line arguments.
main "$@"