Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
This commit is contained in:
@@ -0,0 +1,723 @@
|
||||
#!/bin/bash
|
||||
################################################################################
|
||||
# Script Name: memory-pressure-exporter.sh
|
||||
# Version: 1.0
|
||||
# Description: Prometheus exporter for memory and swap pressure metrics.
|
||||
# Exports PSI stall information, OOM kill events, swap activity
|
||||
# rates, NUMA memory balance, slab pressure, transparent hugepage
|
||||
# stats, and zone watermark proximity.
|
||||
#
|
||||
# Author: Phil Connor
|
||||
# Contact: contact@mylinux.work
|
||||
# Website: https://mylinux.work
|
||||
# License: MIT
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Standard Unix tools (awk, grep, cat)
|
||||
# - netcat (nc) for HTTP mode
|
||||
# - Optional: journalctl (OOM tracking), kernel 4.20+ (PSI),
|
||||
# multi-node NUMA. Each section is skipped gracefully if unavailable.
|
||||
#
|
||||
# Usage:
|
||||
# # Output to stdout
|
||||
# ./memory-pressure-exporter.sh
|
||||
#
|
||||
# # HTTP server mode
|
||||
# ./memory-pressure-exporter.sh --http -p 9198
|
||||
#
|
||||
# # Textfile collector mode
|
||||
# ./memory-pressure-exporter.sh --textfile
|
||||
#
|
||||
# Metrics Exported:
|
||||
# Core Status:
|
||||
# - memory_pressure_up - Exporter status (1=up, 0=down)
|
||||
# - memory_pressure_exporter_info{version} - Exporter version
|
||||
#
|
||||
# PSI Memory (if /proc/pressure/memory exists):
|
||||
# - memory_pressure_psi_some_avg10 - PSI memory some avg10
|
||||
# - memory_pressure_psi_some_avg60 - PSI memory some avg60
|
||||
# - memory_pressure_psi_some_avg300 - PSI memory some avg300
|
||||
# - memory_pressure_psi_some_total_microseconds - PSI memory some total
|
||||
# - memory_pressure_psi_full_avg10 - PSI memory full avg10
|
||||
# - memory_pressure_psi_full_avg60 - PSI memory full avg60
|
||||
# - memory_pressure_psi_full_avg300 - PSI memory full avg300
|
||||
# - memory_pressure_psi_full_total_microseconds - PSI memory full total
|
||||
#
|
||||
# PSI I/O (if /proc/pressure/io exists):
|
||||
# - memory_pressure_psi_io_some_avg10 - PSI I/O some avg10
|
||||
# - memory_pressure_psi_io_some_avg60 - PSI I/O some avg60
|
||||
# - memory_pressure_psi_io_some_avg300 - PSI I/O some avg300
|
||||
# - memory_pressure_psi_io_some_total_microseconds - PSI I/O some total
|
||||
# - memory_pressure_psi_io_full_avg10 - PSI I/O full avg10
|
||||
# - memory_pressure_psi_io_full_avg60 - PSI I/O full avg60
|
||||
# - memory_pressure_psi_io_full_avg300 - PSI I/O full avg300
|
||||
# - memory_pressure_psi_io_full_total_microseconds - PSI I/O full total
|
||||
#
|
||||
# OOM Kills (if journalctl available):
|
||||
# - memory_pressure_oom_kills_24h - OOM kills in last 24 hours
|
||||
# - memory_pressure_oom_last_kill_timestamp - Unix timestamp of last OOM
|
||||
# - memory_pressure_oom_last_victim{process} - Last killed process (1)
|
||||
#
|
||||
# Swap Activity:
|
||||
# - memory_pressure_swap_in_pages_per_sec - Swap in pages/sec
|
||||
# - memory_pressure_swap_out_pages_per_sec - Swap out pages/sec
|
||||
# - memory_pressure_swap_in_bytes_per_sec - Swap in bytes/sec
|
||||
# - memory_pressure_swap_out_bytes_per_sec - Swap out bytes/sec
|
||||
#
|
||||
# NUMA (if multi-node):
|
||||
# - memory_pressure_numa_total_bytes{node} - Total memory per node
|
||||
# - memory_pressure_numa_free_bytes{node} - Free memory per node
|
||||
# - memory_pressure_numa_used_percent{node} - Usage percentage per node
|
||||
#
|
||||
# Transparent Hugepages:
|
||||
# - memory_pressure_thp_fault_alloc_total - THP fault allocations
|
||||
# - memory_pressure_thp_collapse_alloc_total - THP collapse allocations
|
||||
# - memory_pressure_thp_fault_fallback_total - THP fault fallbacks
|
||||
# - memory_pressure_compact_stall_total - Compaction stalls
|
||||
#
|
||||
# Slab:
|
||||
# - memory_pressure_slab_reclaimable_bytes - Reclaimable slab
|
||||
# - memory_pressure_slab_unreclaimable_bytes - Unreclaimable slab
|
||||
# - memory_pressure_slab_total_bytes - Total slab
|
||||
# - memory_pressure_slab_unreclaimable_percent - Unreclaimable percentage
|
||||
#
|
||||
# Zone Watermarks:
|
||||
# - memory_pressure_zone_free_pages{zone} - Current free pages
|
||||
# - memory_pressure_zone_min_pages{zone} - Min watermark
|
||||
# - memory_pressure_zone_low_pages{zone} - Low watermark
|
||||
# - memory_pressure_zone_high_pages{zone} - High watermark
|
||||
# - memory_pressure_zone_free_above_low{zone} - 1 if free > low
|
||||
#
|
||||
# Exporter:
|
||||
# - memory_pressure_exporter_duration_seconds - Script execution time
|
||||
# - memory_pressure_exporter_last_run_timestamp - Last run timestamp
|
||||
#
|
||||
# Configuration:
|
||||
# Default HTTP port: 9198
|
||||
# Textfile directory: /var/lib/node_exporter
|
||||
# SAMPLE_INTERVAL: seconds between swap activity samples (default: 1)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION VARIABLES
|
||||
# ============================================================================
|
||||
|
||||
# Directory used for the output file when --textfile is given.
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file set by --textfile or -o/--output; empty means stdout.
OUTPUT_FILE=""
# Set to true by --http to serve metrics over HTTP instead of writing once.
HTTP_MODE=false
# TCP port for --http mode; overridden by -p/--port.
HTTP_PORT=9198
# Seconds between the two swap-counter samples; may be preset through the
# environment, otherwise defaults to 1.
SAMPLE_INTERVAL="${SAMPLE_INTERVAL:-1}"
|
||||
|
||||
# ============================================================================
|
||||
# HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
# Print the command-line help text to stdout and exit the script with
# status 0.
# Globals: HTTP_PORT (read) - shown as the port used by --http
show_usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Export memory and swap pressure statistics as Prometheus metrics (v1.0).

MODES:
  --textfile          Write to node_exporter textfile collector
  --http              Run HTTP server on port $HTTP_PORT

OPTIONS:
  -p, --port          HTTP port (default: 9198)
  -o, --output        Output file path

EXAMPLES:
  $0 --textfile                        # Write to textfile collector
  $0 --http --port 9198                # Run HTTP server
  $0 -o /tmp/memory_pressure.prom      # Write to custom file

SECTIONS (auto-detected, skipped if unavailable):
  - PSI memory and I/O pressure (requires kernel 4.20+)
  - OOM kill tracking (requires journalctl)
  - Swap activity rates (always available)
  - NUMA memory balance (requires multi-node system)
  - Transparent hugepage stats (always available)
  - Slab pressure (always available)
  - Zone watermark proximity (always available)

EOF
  exit 0
}
|
||||
|
||||
# Parse command-line options into the configuration globals.
# Globals:   TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written)
# Arguments: the script's command-line arguments
# Exits:     0 via show_usage for -h/--help; 1 on an unknown option, a
#            missing option value, or an invalid port
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/memory_pressure.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port)
        # Without this check "shift 2" fails when the value is missing,
        # nothing is shifted, and the loop never terminates.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: $1 requires a port number" >&2
          exit 1
        fi
        if ! [[ "$2" =~ ^[0-9]+$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
          echo "ERROR: invalid port: $2" >&2
          exit 1
        fi
        # 10# forces base 10 so a leading zero is not read as octal.
        HTTP_PORT=$(( 10#$2 ))
        shift 2
        ;;
      -o|--output)
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "ERROR: $1 requires a file path" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
|
||||
|
||||
# ============================================================================
|
||||
# PSI PRESSURE
|
||||
# ============================================================================
|
||||
|
||||
# Parse a PSI file (/proc/pressure/memory or /proc/pressure/io).
# Arguments: $1 - file path
# Outputs:   one line per row, "type avg10 avg60 avg300 total", where type
#            is "some" or "full"; keys missing from a row are reported as 0
# Returns:   1 without output when the file does not exist
get_psi_stats() {
  local psi_file="$1"
  [ -f "$psi_file" ] || return 1

  # Only "some"/"full" rows are emitted: callers splice the type into metric
  # names, so any other leading word would produce an unexpected name.
  awk '
    $1 == "some" || $1 == "full" {
      avg10 = avg60 = avg300 = total = 0
      for (i = 2; i <= NF; i++) {
        if (split($i, kv, "=") < 2) continue
        if (kv[1] == "avg10") avg10 = kv[2]
        else if (kv[1] == "avg60") avg60 = kv[2]
        else if (kv[1] == "avg300") avg300 = kv[2]
        else if (kv[1] == "total") total = kv[2]
      }
      print $1, avg10, avg60, avg300, total
    }
  ' "$psi_file"
}
|
||||
|
||||
# ============================================================================
|
||||
# OOM KILL TRACKING
|
||||
# ============================================================================
|
||||
|
||||
# Count OOM kills logged by the kernel in the last 24 hours.
# Outputs: the count; 0 when journalctl is unavailable or returns nothing
#
# Matches "Killed process", the same text get_oom_last_victim looks for, so
# the count and the reported victim refer to the same set of journal entries.
# journalctl --grep is case-sensitive when the pattern contains upper-case
# letters, so the former "Out of memory" pattern could not match the
# "Memory cgroup out of memory: Killed process ..." form.
get_oom_kill_count() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi

  local count
  count=$(journalctl -k --grep="Killed process" --since "24 hours ago" \
    --no-pager -q 2>/dev/null | wc -l)
  # Some wc implementations pad the number with blanks.
  count=${count//[[:space:]]/}
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "$count"
}
|
||||
|
||||
# Get the time of the most recent OOM kill in the last 24 hours.
# Outputs: Unix epoch seconds (fraction dropped), or 0 when there is none or
#          journalctl is unavailable
#
# Uses the same "Killed process" pattern as get_oom_last_victim so the
# timestamp belongs to the reported victim.
get_oom_last_timestamp() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "0"
    return
  fi

  local last_line stamp
  last_line=$(journalctl -k --grep="Killed process" --since "24 hours ago" \
    --no-pager -q -o short-unix 2>/dev/null | tail -n 1)

  # The first field is read as "seconds.fraction" (-o short-unix was
  # requested); keep only the whole seconds.
  read -r stamp _ <<< "$last_line"
  stamp=${stamp%%.*}
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$stamp"
  else
    echo "0"
  fi
}
|
||||
|
||||
# Get the name of the process named in the most recent "Killed process"
# kernel message from the last 24 hours.
# Outputs: the process name, or nothing when there is no such message or
#          journalctl is unavailable
get_oom_last_victim() {
  command -v journalctl >/dev/null 2>&1 || return 0

  local last_line
  last_line=$(journalctl -k --grep="Killed process" --since "24 hours ago" \
    --no-pager -q 2>/dev/null | tail -n 1)

  # Capture up to the LAST ")" on the line so names that themselves contain
  # parentheses are not cut short. Done with a Bash regex, so grep -P (PCRE
  # support) is no longer needed.
  # NOTE(review): assumes nothing after the name contains ")" - confirm
  # against the kernel message format in use.
  local victim_re='Killed process [0-9]+ \((.*)\)'
  if [[ "$last_line" =~ $victim_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
  return 0
}
|
||||
|
||||
# ============================================================================
|
||||
# SWAP ACTIVITY
|
||||
# ============================================================================
|
||||
|
||||
# Read the swap-in/swap-out page counters.
# Arguments: $1 - vmstat-format file to read (optional, default /proc/vmstat)
# Outputs:   "pswpin pswpout"; a counter missing from the file is reported
#            as 0
get_swap_counters() {
  local vmstat_file="${1:-/proc/vmstat}"

  # Values are printed as the text read from the file rather than coerced
  # with +0, so awk number formatting cannot alter large counters.
  awk '
    $1 == "pswpin"  { pin = $2 }
    $1 == "pswpout" { pout = $2 }
    END {
      if (pin == "") pin = 0
      if (pout == "") pout = 0
      print pin, pout
    }
  ' "$vmstat_file" 2>/dev/null
}
|
||||
|
||||
# ============================================================================
|
||||
# NUMA MEMORY
|
||||
# ============================================================================
|
||||
|
||||
# Check whether the system exposes more than one NUMA node.
# Arguments: $1 - node sysfs directory (optional, default
#                 /sys/devices/system/node)
# Returns:   0 (true) if at least two nodeN directories exist, 1 otherwise
#
# Counts node directories instead of testing for "node1", so a second node
# with any number (for example node0 + node2) is still detected.
is_numa_multi_node() {
  local node_dir="${1:-/sys/devices/system/node}"
  local node_path
  local node_count=0

  for node_path in "$node_dir"/node[0-9]*; do
    # An unmatched glob stays literal; the -d test filters it out.
    [[ -d "$node_path" ]] || continue
    node_count=$(( node_count + 1 ))
    if (( node_count >= 2 )); then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# Get total and free memory for every NUMA node.
# Arguments: $1 - node sysfs directory (optional, default
#                 /sys/devices/system/node)
# Outputs:   one line per node, "nodeN total_kb free_kb"; a value missing
#            from the node's meminfo is reported as 0
# Returns:   1 without output when the directory does not exist
get_numa_memory() {
  local node_dir="${1:-/sys/devices/system/node}"
  [ -d "$node_dir" ] || return 1

  local node_path meminfo
  for node_path in "$node_dir"/node[0-9]*; do
    [ -d "$node_path" ] || continue
    meminfo="$node_path/meminfo"
    [ -f "$meminfo" ] || continue

    # Lines are matched as "Node <n> MemTotal: <value> kB", so the key is
    # field 3 and the value field 4. One awk pass extracts both values.
    awk -v node="${node_path##*/}" '
      $3 == "MemTotal:" { total = $4 }
      $3 == "MemFree:"  { free = $4 }
      END {
        if (total == "") total = 0
        if (free == "") free = 0
        print node, total, free
      }
    ' "$meminfo" 2>/dev/null
  done
  return 0
}
|
||||
|
||||
# ============================================================================
|
||||
# TRANSPARENT HUGEPAGES & COMPACTION
|
||||
# ============================================================================
|
||||
|
||||
# Get transparent hugepage and compaction counters.
# Arguments: $1 - vmstat-format file to read (optional, default /proc/vmstat)
# Outputs:   "thp_fault_alloc thp_collapse_alloc thp_fault_fallback
#            compact_stall" on one line; a missing counter is reported as 0
get_thp_stats() {
  local vmstat_file="${1:-/proc/vmstat}"

  # Exact key comparison keeps longer names that share a prefix from
  # matching. Values are printed as read, not coerced with +0, so awk number
  # formatting cannot alter large counters.
  awk '
    $1 == "thp_fault_alloc"    { fault = $2 }
    $1 == "thp_collapse_alloc" { collapse = $2 }
    $1 == "thp_fault_fallback" { fallback = $2 }
    $1 == "compact_stall"      { stall = $2 }
    END {
      if (fault == "") fault = 0
      if (collapse == "") collapse = 0
      if (fallback == "") fallback = 0
      if (stall == "") stall = 0
      print fault, collapse, fallback, stall
    }
  ' "$vmstat_file" 2>/dev/null
}
|
||||
|
||||
# ============================================================================
|
||||
# SLAB MEMORY
|
||||
# ============================================================================
|
||||
|
||||
# Get slab memory usage.
# Arguments: $1 - meminfo-format file to read (optional, default
#                 /proc/meminfo)
# Outputs:   "reclaimable_kb unreclaimable_kb"; a missing key is reported
#            as 0
get_slab_stats() {
  local meminfo_file="${1:-/proc/meminfo}"

  awk '
    $1 == "SReclaimable:" { reclaimable = $2 }
    $1 == "SUnreclaim:"   { unreclaimable = $2 }
    END {
      if (reclaimable == "") reclaimable = 0
      if (unreclaimable == "") unreclaimable = 0
      print reclaimable, unreclaimable
    }
  ' "$meminfo_file" 2>/dev/null
}
|
||||
|
||||
# ============================================================================
|
||||
# ZONE WATERMARKS
|
||||
# ============================================================================
|
||||
|
||||
# Extract free pages and watermarks for the Normal and DMA32 zones.
# Arguments: $1 - zoneinfo-format file to read (optional, default
#                 /proc/zoneinfo)
# Outputs:   one line per matching zone block, "zone free min low high"
#            (page counts); a zone name can repeat when several
#            "Node N, zone X" blocks carry it
get_zone_watermarks() {
  local zoneinfo_file="${1:-/proc/zoneinfo}"

  awk '
    /^Node [0-9]+, zone +[A-Za-z0-9]+/ {
      zone = $NF
      # Start every zone from zero so a field missing from this block does
      # not inherit the value seen in the previous zone.
      free = min_wm = low_wm = 0
      next
    }
    zone == "Normal" || zone == "DMA32" {
      if ($1 == "pages" && $2 == "free") free = $3
      else if ($1 == "min") min_wm = $2
      else if ($1 == "low") low_wm = $2
      else if ($1 == "high") {
        # "high" is the last value needed: print the row and stop matching
        # until the next zone header, so later lines in this block are
        # ignored.
        print zone, free, min_wm, low_wm, $2
        zone = ""
      }
    }
  ' "$zoneinfo_file" 2>/dev/null
}
|
||||
|
||||
# ============================================================================
|
||||
# METRICS GENERATION
|
||||
# ============================================================================
|
||||
|
||||
# Print one labelled metric family: a single HELP/TYPE header, one sample
# per label value, then a blank separator line.
# Arguments: $1 metric name, $2 metric type, $3 help text, $4 label name,
#            then alternating label-value / sample-value pairs
_memory_pressure_emit_family() {
  local name="$1" mtype="$2" help="$3" label="$4"
  shift 4

  printf '# HELP %s %s\n' "$name" "$help"
  printf '# TYPE %s %s\n' "$name" "$mtype"
  while [[ $# -ge 2 ]]; do
    printf '%s{%s="%s"} %s\n' "$name" "$label" "$1" "$2"
    shift 2
  done
  printf '\n'
}

# Print the avg10/avg60/avg300/total metrics for each PSI row.
# Arguments: $1 metric name prefix, $2 resource name used in HELP text,
#            $3 rows in "type avg10 avg60 avg300 total" form
_memory_pressure_emit_psi() {
  local prefix="$1" resource="$2" rows="$3"
  local kind avg10 avg60 avg300 total

  [[ -n "$rows" ]] || return 0
  while read -r kind avg10 avg60 avg300 total; do
    [[ -n "$kind" ]] || continue
    cat <<EOF
# HELP ${prefix}_${kind}_avg10 PSI ${resource} ${kind} avg10 percentage
# TYPE ${prefix}_${kind}_avg10 gauge
${prefix}_${kind}_avg10 $avg10

# HELP ${prefix}_${kind}_avg60 PSI ${resource} ${kind} avg60 percentage
# TYPE ${prefix}_${kind}_avg60 gauge
${prefix}_${kind}_avg60 $avg60

# HELP ${prefix}_${kind}_avg300 PSI ${resource} ${kind} avg300 percentage
# TYPE ${prefix}_${kind}_avg300 gauge
${prefix}_${kind}_avg300 $avg300

# HELP ${prefix}_${kind}_total_microseconds PSI ${resource} ${kind} total stall time in microseconds
# TYPE ${prefix}_${kind}_total_microseconds counter
${prefix}_${kind}_total_microseconds $total

EOF
  done <<< "$rows"
}

# Collect every section and print the metrics in Prometheus text format.
# Globals: SAMPLE_INTERVAL (read) - seconds slept between the two swap
#          samples; anything that is not a positive number falls back to 1
# Outputs: metrics on stdout, warnings on stderr
#
# Labelled families (NUMA nodes, zones) are printed with one HELP/TYPE header
# followed by all their samples, so a metric name is never declared twice.
generate_metrics() {
  local script_start
  script_start=$(date +%s)

  local uint_re='^[0-9]+$'

  # ========================================================================
  # Exporter Status
  # ========================================================================
  cat <<EOF
# HELP memory_pressure_up Exporter status (1=up)
# TYPE memory_pressure_up gauge
memory_pressure_up 1

# HELP memory_pressure_exporter_info Exporter version information
# TYPE memory_pressure_exporter_info gauge
memory_pressure_exporter_info{version="1.0"} 1

EOF

  # ========================================================================
  # PSI Memory and I/O Pressure
  # ========================================================================
  # get_psi_stats prints nothing when the file is missing, so no separate
  # existence check is needed here.
  _memory_pressure_emit_psi "memory_pressure_psi" "memory" \
    "$(get_psi_stats /proc/pressure/memory)"
  _memory_pressure_emit_psi "memory_pressure_psi_io" "I/O" \
    "$(get_psi_stats /proc/pressure/io)"

  # ========================================================================
  # OOM Kill Events
  # ========================================================================
  local oom_count oom_timestamp oom_victim
  oom_count=$(get_oom_kill_count)
  oom_timestamp=$(get_oom_last_timestamp)
  oom_victim=$(get_oom_last_victim)

  cat <<EOF
# HELP memory_pressure_oom_kills_24h OOM kills in the last 24 hours
# TYPE memory_pressure_oom_kills_24h gauge
memory_pressure_oom_kills_24h ${oom_count:-0}

# HELP memory_pressure_oom_last_kill_timestamp Unix timestamp of last OOM kill (0 if none)
# TYPE memory_pressure_oom_last_kill_timestamp gauge
memory_pressure_oom_last_kill_timestamp ${oom_timestamp:-0}

EOF

  if [[ -n "$oom_victim" ]]; then
    # Backslash and double quote must be escaped inside a label value.
    local victim_label
    victim_label=${oom_victim//\\/\\\\}
    victim_label=${victim_label//\"/\\\"}
    printf '%s\n' \
      "# HELP memory_pressure_oom_last_victim Last OOM-killed process (value is always 1)" \
      "# TYPE memory_pressure_oom_last_victim gauge"
    printf 'memory_pressure_oom_last_victim{process="%s"} 1\n\n' "$victim_label"
  fi

  # ========================================================================
  # Swap Activity (two-sample delta)
  # ========================================================================
  local interval="${SAMPLE_INTERVAL:-1}"
  if ! [[ "$interval" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]] \
    || ! awk -v i="$interval" 'BEGIN { exit !(i > 0) }'; then
    echo "WARNING: invalid SAMPLE_INTERVAL '$interval', using 1" >&2
    interval=1
  fi

  # Ask the system for its page size instead of assuming 4096 bytes.
  local page_size
  page_size=$(getconf PAGESIZE 2>/dev/null)
  [[ "$page_size" =~ $uint_re ]] || page_size=4096

  local pin1 pout1 pin2 pout2
  read -r pin1 pout1 <<< "$(get_swap_counters)"
  sleep "$interval"
  read -r pin2 pout2 <<< "$(get_swap_counters)"

  # Values reach awk through -v, never by splicing them into the program.
  local swap_in_rate swap_out_rate swap_in_bytes swap_out_bytes
  read -r swap_in_rate swap_out_rate swap_in_bytes swap_out_bytes < <(
    awk -v in1="${pin1:-0}" -v in2="${pin2:-0}" \
      -v out1="${pout1:-0}" -v out2="${pout2:-0}" \
      -v secs="$interval" -v page="$page_size" '
      BEGIN {
        din = in2 - in1
        dout = out2 - out1
        printf "%.2f %.2f %.2f %.2f\n",
          din / secs, dout / secs, (din * page) / secs, (dout * page) / secs
      }'
  )

  cat <<EOF
# HELP memory_pressure_swap_in_pages_per_sec Pages swapped in per second
# TYPE memory_pressure_swap_in_pages_per_sec gauge
memory_pressure_swap_in_pages_per_sec ${swap_in_rate:-0.00}

# HELP memory_pressure_swap_out_pages_per_sec Pages swapped out per second
# TYPE memory_pressure_swap_out_pages_per_sec gauge
memory_pressure_swap_out_pages_per_sec ${swap_out_rate:-0.00}

# HELP memory_pressure_swap_in_bytes_per_sec Bytes swapped in per second
# TYPE memory_pressure_swap_in_bytes_per_sec gauge
memory_pressure_swap_in_bytes_per_sec ${swap_in_bytes:-0.00}

# HELP memory_pressure_swap_out_bytes_per_sec Bytes swapped out per second
# TYPE memory_pressure_swap_out_bytes_per_sec gauge
memory_pressure_swap_out_bytes_per_sec ${swap_out_bytes:-0.00}

EOF

  # ========================================================================
  # NUMA Memory Balance
  # ========================================================================
  if is_numa_multi_node; then
    local numa_rows node total_kb free_kb used_pct
    local -a numa_total=() numa_free=() numa_used=()
    numa_rows=$(get_numa_memory)

    while read -r node total_kb free_kb; do
      [[ -n "$node" && "$total_kb" =~ $uint_re && "$free_kb" =~ $uint_re ]] \
        || continue
      if (( 10#$total_kb > 0 )); then
        used_pct=$(awk -v t="$total_kb" -v f="$free_kb" \
          'BEGIN { printf "%.2f", ((t - f) / t) * 100 }')
      else
        used_pct="0.00"
      fi
      numa_total+=("$node" "$(( 10#$total_kb * 1024 ))")
      numa_free+=("$node" "$(( 10#$free_kb * 1024 ))")
      numa_used+=("$node" "$used_pct")
    done <<< "$numa_rows"

    if (( ${#numa_total[@]} > 0 )); then
      _memory_pressure_emit_family memory_pressure_numa_total_bytes gauge \
        "Total memory per NUMA node in bytes" node "${numa_total[@]}"
      _memory_pressure_emit_family memory_pressure_numa_free_bytes gauge \
        "Free memory per NUMA node in bytes" node "${numa_free[@]}"
      _memory_pressure_emit_family memory_pressure_numa_used_percent gauge \
        "Memory usage percentage per NUMA node" node "${numa_used[@]}"
    fi
  fi

  # ========================================================================
  # Transparent Hugepages & Compaction
  # ========================================================================
  local thp_stats
  thp_stats=$(get_thp_stats)

  if [[ -n "$thp_stats" ]]; then
    local thp_fault thp_collapse thp_fallback compact_stall
    read -r thp_fault thp_collapse thp_fallback compact_stall <<< "$thp_stats"

    cat <<EOF
# HELP memory_pressure_thp_fault_alloc_total THP fault allocations
# TYPE memory_pressure_thp_fault_alloc_total counter
memory_pressure_thp_fault_alloc_total ${thp_fault:-0}

# HELP memory_pressure_thp_collapse_alloc_total THP collapse allocations
# TYPE memory_pressure_thp_collapse_alloc_total counter
memory_pressure_thp_collapse_alloc_total ${thp_collapse:-0}

# HELP memory_pressure_thp_fault_fallback_total THP fault fallbacks to regular pages
# TYPE memory_pressure_thp_fault_fallback_total counter
memory_pressure_thp_fault_fallback_total ${thp_fallback:-0}

# HELP memory_pressure_compact_stall_total Memory compaction stalls
# TYPE memory_pressure_compact_stall_total counter
memory_pressure_compact_stall_total ${compact_stall:-0}

EOF
  fi

  # ========================================================================
  # Slab Memory
  # ========================================================================
  local slab_stats
  slab_stats=$(get_slab_stats)

  if [[ -n "$slab_stats" ]]; then
    local slab_reclaim_kb slab_unreclaim_kb
    read -r slab_reclaim_kb slab_unreclaim_kb <<< "$slab_stats"
    [[ "$slab_reclaim_kb" =~ $uint_re ]] || slab_reclaim_kb=0
    [[ "$slab_unreclaim_kb" =~ $uint_re ]] || slab_unreclaim_kb=0

    local slab_reclaim_bytes slab_unreclaim_bytes slab_total_bytes
    local slab_unreclaim_pct
    slab_reclaim_bytes=$(( 10#$slab_reclaim_kb * 1024 ))
    slab_unreclaim_bytes=$(( 10#$slab_unreclaim_kb * 1024 ))
    slab_total_bytes=$(( slab_reclaim_bytes + slab_unreclaim_bytes ))

    if (( slab_total_bytes > 0 )); then
      slab_unreclaim_pct=$(awk -v u="$slab_unreclaim_bytes" \
        -v t="$slab_total_bytes" 'BEGIN { printf "%.2f", (u / t) * 100 }')
    else
      slab_unreclaim_pct="0.00"
    fi

    cat <<EOF
# HELP memory_pressure_slab_reclaimable_bytes Reclaimable slab memory in bytes
# TYPE memory_pressure_slab_reclaimable_bytes gauge
memory_pressure_slab_reclaimable_bytes $slab_reclaim_bytes

# HELP memory_pressure_slab_unreclaimable_bytes Unreclaimable slab memory in bytes
# TYPE memory_pressure_slab_unreclaimable_bytes gauge
memory_pressure_slab_unreclaimable_bytes $slab_unreclaim_bytes

# HELP memory_pressure_slab_total_bytes Total slab memory in bytes
# TYPE memory_pressure_slab_total_bytes gauge
memory_pressure_slab_total_bytes $slab_total_bytes

# HELP memory_pressure_slab_unreclaimable_percent Percentage of slab memory that is unreclaimable
# TYPE memory_pressure_slab_unreclaimable_percent gauge
memory_pressure_slab_unreclaimable_percent $slab_unreclaim_pct

EOF
  fi

  # ========================================================================
  # Zone Watermarks
  # ========================================================================
  # get_zone_watermarks reports only the zone name, so rows sharing a name
  # (the same zone on several NUMA nodes) would become duplicate series.
  # Their page counts are summed per zone name; free_above_low is 1 only
  # when every contributing row has free pages above its low watermark.
  local zone_rows zone free min_wm low_wm high_wm idx i
  local -a zone_names=() zone_free=() zone_min=() zone_low=() zone_high=()
  local -a zone_above=()
  zone_rows=$(get_zone_watermarks)

  if [[ -n "$zone_rows" ]]; then
    while read -r zone free min_wm low_wm high_wm; do
      [[ -n "$zone" && "$free" =~ $uint_re && "$min_wm" =~ $uint_re \
        && "$low_wm" =~ $uint_re && "$high_wm" =~ $uint_re ]] || continue

      idx=-1
      for i in "${!zone_names[@]}"; do
        if [[ "${zone_names[i]}" == "$zone" ]]; then
          idx=$i
          break
        fi
      done
      if (( idx < 0 )); then
        idx=${#zone_names[@]}
        zone_names[idx]=$zone
        zone_free[idx]=0
        zone_min[idx]=0
        zone_low[idx]=0
        zone_high[idx]=0
        zone_above[idx]=1
      fi

      zone_free[idx]=$(( zone_free[idx] + 10#$free ))
      zone_min[idx]=$(( zone_min[idx] + 10#$min_wm ))
      zone_low[idx]=$(( zone_low[idx] + 10#$low_wm ))
      zone_high[idx]=$(( zone_high[idx] + 10#$high_wm ))
      if (( 10#$free <= 10#$low_wm )); then
        zone_above[idx]=0
      fi
    done <<< "$zone_rows"

    if (( ${#zone_names[@]} > 0 )); then
      local -a free_args=() min_args=() low_args=() high_args=() above_args=()
      for i in "${!zone_names[@]}"; do
        free_args+=("${zone_names[i]}" "${zone_free[i]}")
        min_args+=("${zone_names[i]}" "${zone_min[i]}")
        low_args+=("${zone_names[i]}" "${zone_low[i]}")
        high_args+=("${zone_names[i]}" "${zone_high[i]}")
        above_args+=("${zone_names[i]}" "${zone_above[i]}")
      done

      _memory_pressure_emit_family memory_pressure_zone_free_pages gauge \
        "Current free pages per zone" zone "${free_args[@]}"
      _memory_pressure_emit_family memory_pressure_zone_min_pages gauge \
        "Min watermark pages per zone" zone "${min_args[@]}"
      _memory_pressure_emit_family memory_pressure_zone_low_pages gauge \
        "Low watermark pages per zone" zone "${low_args[@]}"
      _memory_pressure_emit_family memory_pressure_zone_high_pages gauge \
        "High watermark pages per zone" zone "${high_args[@]}"
      _memory_pressure_emit_family memory_pressure_zone_free_above_low gauge \
        "Whether free pages are above the low watermark (1=above, 0=below)" \
        zone "${above_args[@]}"
    fi
  fi

  # ========================================================================
  # Exporter Runtime
  # ========================================================================
  local script_end script_duration
  script_end=$(date +%s)
  script_duration=$(( script_end - script_start ))

  cat <<EOF
# HELP memory_pressure_exporter_duration_seconds Time to generate all metrics
# TYPE memory_pressure_exporter_duration_seconds gauge
memory_pressure_exporter_duration_seconds $script_duration

# HELP memory_pressure_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE memory_pressure_exporter_last_run_timestamp gauge
memory_pressure_exporter_last_run_timestamp $script_end
EOF

  echo ""
}
|
||||
|
||||
# ============================================================================
|
||||
# HTTP SERVER MODE
|
||||
# ============================================================================
|
||||
|
||||
# Read one HTTP request from stdin and write the response to stdout.
# Serves the metrics for request lines starting with "GET /metrics" and a
# small HTML index page for everything else.
_memory_pressure_http_respond() {
  local request header_line

  # End of input before any request line means nc went away without
  # delivering a request; there is nobody to answer.
  IFS= read -r request || [[ -n "$request" ]] || return 0

  # Consume the remaining request headers up to the blank line. The timeout
  # keeps a client that never finishes its headers from stalling the server.
  while IFS= read -r -t 5 header_line; do
    header_line=${header_line%$'\r'}
    [[ -n "$header_line" ]] || break
  done

  if [[ "$request" =~ ^GET\ /metrics ]]; then
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
    generate_metrics
  else
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
    cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Memory Pressure Exporter v1.0</title></head>
<body>
<h1>Memory Pressure Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>PSI memory and I/O pressure (requires kernel 4.20+)</li>
<li>OOM kill tracking (requires journalctl)</li>
<li>Swap activity rates</li>
<li>NUMA memory balance (requires multi-node system)</li>
<li>Transparent hugepage and compaction stats</li>
<li>Slab memory pressure</li>
<li>Zone watermark proximity</li>
</ul>
</body>
</html>
EOF
  fi
}

# Serve metrics over HTTP with netcat, one connection at a time, forever.
# Globals: HTTP_PORT (read)
# Exits:   1 when nc is missing or the request FIFO cannot be created
#
# nc's stdout (the client's request) is routed through a FIFO into the
# responder and the responder's stdout is piped into nc. In the plain
# "{ read; ... } | nc" form the read takes its input from the script's own
# stdin, never from the client, so the request path could not be inspected.
run_http_server() {
  echo "Starting memory pressure exporter on port $HTTP_PORT..." >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  local fifo_dir request_fifo nc_status
  if ! fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/memory_pressure_http.XXXXXX"); then
    echo "ERROR: cannot create temporary directory for HTTP mode" >&2
    exit 1
  fi
  request_fifo="$fifo_dir/request"
  if ! mkfifo -m 600 "$request_fifo"; then
    rm -rf -- "$fifo_dir"
    echo "ERROR: cannot create request FIFO $request_fifo" >&2
    exit 1
  fi
  # The path is expanded now because the local variable no longer exists
  # when the trap runs.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

  while true; do
    _memory_pressure_http_respond < "$request_fifo" \
      | nc -l -p "$HTTP_PORT" -q 1 > "$request_fifo" 2>/dev/null
    nc_status=${PIPESTATUS[1]}
    # If nc fails straight away (port in use, unsupported option) pause
    # briefly instead of spinning at full CPU.
    if [[ "$nc_status" -ne 0 ]]; then
      sleep 1
    fi
  done
}
|
||||
|
||||
# ============================================================================
|
||||
# MAIN EXECUTION
|
||||
# ============================================================================
|
||||
|
||||
# Entry point: parse the arguments, then run in HTTP mode, write the metrics
# to a file, or print them to stdout.
# Globals:   HTTP_MODE, OUTPUT_FILE (read; set by parse_args)
# Arguments: the script's command-line arguments
# Exits:     1 when the output file cannot be produced; an existing
#            OUTPUT_FILE is left untouched in that case
main() {
  parse_args "$@"

  if [ "$HTTP_MODE" = true ]; then
    run_http_server
    return
  fi

  if [ -z "$OUTPUT_FILE" ]; then
    generate_metrics
    return
  fi

  local output_dir temp_file file_lines
  output_dir="$(dirname -- "$OUTPUT_FILE")"
  if ! mkdir -p -- "$output_dir"; then
    echo "ERROR: Cannot create output directory $output_dir" >&2
    exit 1
  fi

  # The temporary file lives in the destination directory so the final mv is
  # a rename and readers never see a partially written file.
  if ! temp_file=$(mktemp "${output_dir}/.memory_pressure_metrics.XXXXXX"); then
    echo "ERROR: Cannot create temporary file in $output_dir" >&2
    exit 1
  fi

  if ! generate_metrics > "$temp_file" 2>/dev/null; then
    rm -f -- "$temp_file"
    echo "ERROR: Failed to generate metrics" >&2
    exit 1
  fi

  file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
  file_lines=${file_lines//[[:space:]]/}

  # Fewer than 10 lines is treated as a failed collection, so the previous
  # file is kept.
  if [ "${file_lines:-0}" -lt 10 ]; then
    rm -f -- "$temp_file"
    echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
    exit 1
  fi

  if ! chmod 644 "$temp_file" || ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
    rm -f -- "$temp_file"
    echo "ERROR: Failed to install $OUTPUT_FILE" >&2
    exit 1
  fi

  echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user