#!/bin/bash
################################################################################
# Script Name: memory-pressure-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for memory and swap pressure metrics.
#              Exports PSI stall information, OOM kill events, swap activity
#              rates, NUMA memory balance, slab pressure, transparent hugepage
#              stats, and zone watermark proximity.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - Standard Unix tools (awk, grep, cat)
#   - netcat (nc) for HTTP mode
#   - Optional: journalctl (OOM tracking), kernel 4.20+ (PSI),
#     multi-node NUMA. Each section is skipped gracefully if unavailable.
#
# Usage:
#   # Output to stdout
#   ./memory-pressure-exporter.sh
#
#   # HTTP server mode
#   ./memory-pressure-exporter.sh --http -p 9198
#
#   # Textfile collector mode
#   ./memory-pressure-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - memory_pressure_up - Exporter status (1=up, 0=down)
#     - memory_pressure_exporter_info{version} - Exporter version
#
#   PSI Memory (if /proc/pressure/memory exists):
#     - memory_pressure_psi_some_avg10 - PSI memory some avg10
#     - memory_pressure_psi_some_avg60 - PSI memory some avg60
#     - memory_pressure_psi_some_avg300 - PSI memory some avg300
#     - memory_pressure_psi_some_total_microseconds - PSI memory some total
#     - memory_pressure_psi_full_avg10 - PSI memory full avg10
#     - memory_pressure_psi_full_avg60 - PSI memory full avg60
#     - memory_pressure_psi_full_avg300 - PSI memory full avg300
#     - memory_pressure_psi_full_total_microseconds - PSI memory full total
#
#   PSI I/O (if /proc/pressure/io exists):
#     - memory_pressure_psi_io_some_avg10 - PSI I/O some avg10
#     - memory_pressure_psi_io_some_avg60 - PSI I/O some avg60
#     - memory_pressure_psi_io_some_avg300 - PSI I/O some avg300
#     - memory_pressure_psi_io_some_total_microseconds - PSI I/O some total
#     - memory_pressure_psi_io_full_avg10 - PSI I/O full avg10
#     - memory_pressure_psi_io_full_avg60 - PSI I/O full avg60
#     - memory_pressure_psi_io_full_avg300 - PSI I/O full avg300
#     - memory_pressure_psi_io_full_total_microseconds - PSI I/O full total
#
#   OOM Kills (if journalctl available):
#     - memory_pressure_oom_kills_24h - OOM kills in last 24 hours
#     - memory_pressure_oom_last_kill_timestamp - Unix timestamp of last OOM
#     - memory_pressure_oom_last_victim{process} - Last killed process (1)
#
#   Swap Activity:
#     - memory_pressure_swap_in_pages_per_sec - Swap in pages/sec
#     - memory_pressure_swap_out_pages_per_sec - Swap out pages/sec
#     - memory_pressure_swap_in_bytes_per_sec - Swap in bytes/sec
#     - memory_pressure_swap_out_bytes_per_sec - Swap out bytes/sec
#
#   NUMA (if multi-node):
#     - memory_pressure_numa_total_bytes{node} - Total memory per node
#     - memory_pressure_numa_free_bytes{node} - Free memory per node
#     - memory_pressure_numa_used_percent{node} - Usage percentage per node
#
#   Transparent Hugepages:
#     - memory_pressure_thp_fault_alloc_total - THP fault allocations
#     - memory_pressure_thp_collapse_alloc_total - THP collapse allocations
#     - memory_pressure_thp_fault_fallback_total - THP fault fallbacks
#     - memory_pressure_compact_stall_total - Compaction stalls
#
#   Slab:
#     - memory_pressure_slab_reclaimable_bytes - Reclaimable slab
#     - memory_pressure_slab_unreclaimable_bytes - Unreclaimable slab
#     - memory_pressure_slab_total_bytes - Total slab
#     - memory_pressure_slab_unreclaimable_percent - Unreclaimable percentage
#
#   Zone Watermarks:
#     - memory_pressure_zone_free_pages{zone} - Current free pages
#     - memory_pressure_zone_min_pages{zone} - Min watermark
#     - memory_pressure_zone_low_pages{zone} - Low watermark
#     - memory_pressure_zone_high_pages{zone} - High watermark
#     - memory_pressure_zone_free_above_low{zone} - 1 if free > low
#
#   Exporter:
#     - memory_pressure_exporter_duration_seconds - Script execution time
#     - memory_pressure_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9198
#   Textfile directory: /var/lib/node_exporter
#   SAMPLE_INTERVAL: seconds between swap activity samples (default: 1)
#
# NOTE(review): this file was recovered from a copy in which all line breaks
# had been collapsed and every "<...>" span (heredocs, HTML) had been stripped.
# show_usage, parse_args, the body of generate_metrics and the HTML landing
# page were rebuilt from the metric list and usage notes above; sections that
# are reconstructions are marked NOTE(review) so they can be checked against
# the upstream script.
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9198
SAMPLE_INTERVAL="${SAMPLE_INTERVAL:-1}"

# Version reported by memory_pressure_exporter_info and the landing page.
EXPORTER_VERSION="1.0"

# Temporary paths removed by cleanup() when the script exits.
CLEANUP_PATHS=()

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print command-line help to stdout.
# NOTE(review): help text reconstructed from the usage notes in the header.
show_usage() {
    cat <<EOF
Usage: ${0##*/} [OPTIONS]

Prometheus exporter for memory and swap pressure metrics.

Options:
  --http              Run as an HTTP server (requires netcat)
  -p, --port PORT     HTTP port (default: 9198)
  --textfile          Write metrics into ${TEXTFILE_DIR}
  -o, --output FILE   Write metrics to FILE
  -h, --help          Show this help and exit

Environment:
  SAMPLE_INTERVAL     Seconds between swap activity samples (default: 1)
EOF
}

# Parse command-line options into HTTP_MODE, HTTP_PORT and OUTPUT_FILE.
# Args: the script's positional parameters
# Exits 0 after --help and 1 on an unknown option or invalid value.
# NOTE(review): --http, -p and --textfile come from the documented usage;
# --port, -o/--output and the textfile name "memory_pressure.prom" are
# assumptions - confirm against the upstream script.
parse_args() {
    while [ $# -gt 0 ]; do
        case "$1" in
            --http)
                HTTP_MODE=true
                ;;
            -p|--port)
                if ! [[ "${2:-}" =~ ^[0-9]+$ ]] \
                    || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "ERROR: $1 requires a port number (1-65535)" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift
                ;;
            --textfile)
                OUTPUT_FILE="${TEXTFILE_DIR}/memory_pressure.prom"
                ;;
            -o|--output)
                if [ -z "${2:-}" ]; then
                    echo "ERROR: $1 requires a file path" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                show_usage >&2; exit 1
                ;;
        esac
        shift
    done
}

# Remove every path registered in CLEANUP_PATHS (installed as an EXIT trap).
cleanup() {
    local path
    for path in "${CLEANUP_PATHS[@]}"; do
        if [ -n "$path" ]; then
            rm -rf -- "$path"
        fi
    done
}

# Print the "# HELP" and "# TYPE" lines for one metric.
# Args: $1 - metric name, $2 - help text, $3 - metric type (gauge|counter)
metric_header() {
    printf '# HELP %s %s\n# TYPE %s %s\n' "$1" "$2" "$1" "$3"
}

# Escape a string for use as a Prometheus label value: backslash, double
# quote and newline are the characters the text format requires escaping.
# Args: $1 - raw value
# Output: escaped value, no trailing newline
escape_label_value() {
    local value="$1"
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}
    printf '%s' "$value"
}

# Validate SAMPLE_INTERVAL; anything that is not a positive decimal number
# falls back to 1 so the rate calculation can never divide by zero.
# Output: the interval to use, in seconds
resolve_sample_interval() {
    local interval="$SAMPLE_INTERVAL"
    if ! awk -v v="$interval" \
        'BEGIN { exit !(v ~ /^[0-9]+(\.[0-9]+)?$/ && v + 0 > 0) }'; then
        interval=1
    fi
    printf '%s\n' "$interval"
}

# ============================================================================
# PSI PRESSURE
# ============================================================================

# Parse a PSI file (/proc/pressure/memory or /proc/pressure/io)
# Args: $1 - file path
# Output: lines of "type avg10 avg60 avg300 total"
#         where type is "some" or "full"
# Prints nothing when the file does not exist or cannot be read (the file can
# be present but unreadable, so awk's error is silenced like the other getters).
get_psi_stats() {
    local psi_file="$1"
    [ -f "$psi_file" ] || return

    awk '{
        type = $1
        avg10 = avg60 = avg300 = total = 0
        for (i = 2; i <= NF; i++) {
            split($i, kv, "=")
            if (kv[1] == "avg10") avg10 = kv[2]
            if (kv[1] == "avg60") avg60 = kv[2]
            if (kv[1] == "avg300") avg300 = kv[2]
            if (kv[1] == "total") total = kv[2]
        }
        print type, avg10, avg60, avg300, total
    }' "$psi_file" 2>/dev/null
}

# Emit the eight PSI metrics (some/full x avg10/avg60/avg300/total) for one
# PSI file. Emits nothing if get_psi_stats produces no lines.
# Args: $1 - PSI file path
#       $2 - metric name prefix (e.g. memory_pressure_psi)
#       $3 - resource name used in the help text (e.g. "memory", "I/O")
emit_psi_metrics() {
    local psi_file="$1" prefix="$2" label="$3"
    local kind avg10 avg60 avg300 total

    while read -r kind avg10 avg60 avg300 total; do
        case "$kind" in
            some|full) ;;
            *) continue ;;
        esac
        metric_header "${prefix}_${kind}_avg10" "PSI ${label} ${kind} avg10" gauge
        printf '%s %s\n' "${prefix}_${kind}_avg10" "$avg10"
        metric_header "${prefix}_${kind}_avg60" "PSI ${label} ${kind} avg60" gauge
        printf '%s %s\n' "${prefix}_${kind}_avg60" "$avg60"
        metric_header "${prefix}_${kind}_avg300" "PSI ${label} ${kind} avg300" gauge
        printf '%s %s\n' "${prefix}_${kind}_avg300" "$avg300"
        metric_header "${prefix}_${kind}_total_microseconds" \
            "PSI ${label} ${kind} total" counter
        printf '%s %s\n' "${prefix}_${kind}_total_microseconds" "$total"
    done < <(get_psi_stats "$psi_file")
}

# ============================================================================
# OOM KILL TRACKING
# ============================================================================

# Get OOM kill count in last 24 hours
# Returns: count (0 when journalctl is missing or reports nothing)
get_oom_kill_count() {
    if ! command -v journalctl >/dev/null 2>&1; then
        echo "0"
        return
    fi

    local count
    count=$(journalctl -k --grep="Out of memory" --since "24 hours ago" \
        --no-pager -q 2>/dev/null | wc -l)
    # Some wc implementations pad the number with blanks.
    count=${count//[[:space:]]/}
    echo "${count:-0}"
}

# Get last OOM kill timestamp (unix epoch)
# Returns: timestamp or 0
get_oom_last_timestamp() {
    if ! command -v journalctl >/dev/null 2>&1; then
        echo "0"
        return
    fi

    local last_line
    last_line=$(journalctl -k --grep="Out of memory" --since "24 hours ago" \
        --no-pager -q -o short-unix 2>/dev/null | tail -1)

    if [ -n "$last_line" ]; then
        # The first field of "-o short-unix" output is seconds.microseconds;
        # keep the whole seconds, and report 0 for anything not numeric.
        printf '%s\n' "$last_line" | awk '{
            if ($1 ~ /^[0-9]+(\.[0-9]+)?$/) {
                sub(/\..*/, "", $1)
                printf "%s", $1
            } else {
                printf "0"
            }
        }'
    else
        echo "0"
    fi
}

# Extract the process name from "Killed process <pid> (<name>)" log lines
# read on stdin. Uses awk rather than grep -P so no PCRE support is needed.
# Output: the name found on the last matching line, or nothing
extract_oom_victim() {
    awk '
        match($0, /Killed process [0-9]+ \([^)]+/) {
            victim = substr($0, RSTART, RLENGTH)
            sub(/^Killed process [0-9]+ \(/, "", victim)
        }
        END { if (victim != "") print victim }
    '
}

# Get last OOM victim process name
# Returns: process name or empty
get_oom_last_victim() {
    if ! command -v journalctl >/dev/null 2>&1; then
        return
    fi

    journalctl -k --grep="Killed process" --since "24 hours ago" \
        --no-pager -q 2>/dev/null \
        | extract_oom_victim
}

# ============================================================================
# SWAP ACTIVITY
# ============================================================================

# Read swap counters from /proc/vmstat
# Args: $1 - vmstat file (optional, default /proc/vmstat)
# Returns: "pswpin pswpout" (each defaults to 0 when the counter is absent)
get_swap_counters() {
    local vmstat_file="${1:-/proc/vmstat}"
    awk '
        $1 == "pswpin"  { pin = $2 }
        $1 == "pswpout" { pout = $2 }
        END { print pin + 0, pout + 0 }
    ' "$vmstat_file" 2>/dev/null
}

# ============================================================================
# NUMA MEMORY
# ============================================================================

# Check if system has multiple NUMA nodes
# Args: $1 - node sysfs directory (optional, default /sys/devices/system/node)
# Returns: 0 (true) if multi-node, 1 (false) if single
is_numa_multi_node() {
    local node_dir="${1:-/sys/devices/system/node}"
    [ -d "$node_dir/node1" ]
}

# Get NUMA memory info per node
# Args: $1 - node sysfs directory (optional, default /sys/devices/system/node)
# Output: lines of "nodeN total_kb free_kb"
get_numa_memory() {
    local node_dir="${1:-/sys/devices/system/node}"
    [ -d "$node_dir" ] || return

    local node_path node_name meminfo
    for node_path in "$node_dir"/node[0-9]*; do
        [ -d "$node_path" ] || continue
        node_name=${node_path##*/}
        meminfo="$node_path/meminfo"
        [ -f "$meminfo" ] || continue

        # Per-node meminfo lines read "Node <n> <Key>: <value> kB", so the
        # key is field 3 and the value field 4.
        awk -v node="$node_name" '
            $3 == "MemTotal:" { total = $4 }
            $3 == "MemFree:"  { free = $4 }
            END { print node, total + 0, free + 0 }
        ' "$meminfo" 2>/dev/null
    done
}

# ============================================================================
# TRANSPARENT HUGEPAGES & COMPACTION
# ============================================================================

# Get THP and compaction stats from /proc/vmstat
# Args: $1 - vmstat file (optional, default /proc/vmstat)
# Returns: "thp_fault_alloc thp_collapse_alloc thp_fault_fallback compact_stall"
get_thp_stats() {
    local vmstat_file="${1:-/proc/vmstat}"
    awk '
        $1 == "thp_fault_alloc"    { fault = $2 }
        $1 == "thp_collapse_alloc" { collapse = $2 }
        $1 == "thp_fault_fallback" { fallback = $2 }
        $1 == "compact_stall"      { stall = $2 }
        END { print fault + 0, collapse + 0, fallback + 0, stall + 0 }
    ' "$vmstat_file" 2>/dev/null
}

# ============================================================================
# SLAB MEMORY
# ============================================================================

# Get slab memory from /proc/meminfo
# Args: $1 - meminfo file (optional, default /proc/meminfo)
# Returns: "reclaimable_kb unreclaimable_kb"
get_slab_stats() {
    local meminfo_file="${1:-/proc/meminfo}"
    awk '
        $1 == "SReclaimable:" { reclaimable = $2 }
        $1 == "SUnreclaim:"   { unreclaimable = $2 }
        END { print reclaimable + 0, unreclaimable + 0 }
    ' "$meminfo_file" 2>/dev/null
}

# ============================================================================
# ZONE WATERMARKS
# ============================================================================

# Parse /proc/zoneinfo for Normal and DMA32 zones
# Args: $1 - zoneinfo file (optional, default /proc/zoneinfo)
# Output: lines of "zone free min low high", one line per zone name
# A zone name that appears on several NUMA nodes is summed across nodes so
# each {zone} label value is emitted once; one line per node would produce
# duplicate series, which Prometheus rejects.
get_zone_watermarks() {
    local zoneinfo_file="${1:-/proc/zoneinfo}"
    awk '
        /^Node [0-9]+, zone +[A-Za-z0-9]+/ {
            zone = $NF
            free = min_wm = low_wm = 0
            next
        }
        zone != "Normal" && zone != "DMA32" { next }
        $1 == "pages" && $2 == "free" { free = $3 }
        $1 == "min" { min_wm = $2 }
        $1 == "low" { low_wm = $2 }
        $1 == "high" {
            if (!(zone in seen)) {
                seen[zone] = 1
                order[++zones] = zone
            }
            sum_free[zone] += free
            sum_min[zone]  += min_wm
            sum_low[zone]  += low_wm
            sum_high[zone] += $2
            # Stop matching until the next zone header; per-cpu pageset
            # lines further down also carry "high"-like keys.
            zone = ""
        }
        END {
            for (i = 1; i <= zones; i++) {
                z = order[i]
                printf "%s %.0f %.0f %.0f %.0f\n", z, sum_free[z], sum_min[z], sum_low[z], sum_high[z]
            }
        }
    ' "$zoneinfo_file" 2>/dev/null
}

# ============================================================================
# METRICS GENERATION
# ============================================================================

# Print every metric in Prometheus text exposition format on stdout.
# Sections whose data source is unavailable are skipped. The swap section
# sleeps for SAMPLE_INTERVAL seconds between its two samples.
# NOTE(review): body reconstructed from the metric list in the file header;
# help strings are the descriptions given there.
generate_metrics() {
    local script_start
    script_start=$(date +%s)

    # ========================================================================
    # Exporter Status
    # ========================================================================
    metric_header memory_pressure_up "Exporter status (1=up, 0=down)" gauge
    printf 'memory_pressure_up 1\n'
    metric_header memory_pressure_exporter_info "Exporter version" gauge
    printf 'memory_pressure_exporter_info{version="%s"} 1\n' \
        "$(escape_label_value "$EXPORTER_VERSION")"

    # ========================================================================
    # PSI Memory and I/O
    # ========================================================================
    emit_psi_metrics /proc/pressure/memory memory_pressure_psi "memory"
    emit_psi_metrics /proc/pressure/io memory_pressure_psi_io "I/O"

    # ========================================================================
    # OOM Kills
    # ========================================================================
    if command -v journalctl >/dev/null 2>&1; then
        local oom_count oom_timestamp oom_victim
        oom_count=$(get_oom_kill_count)
        oom_timestamp=$(get_oom_last_timestamp)
        oom_victim=$(get_oom_last_victim)

        metric_header memory_pressure_oom_kills_24h \
            "OOM kills in last 24 hours" gauge
        printf 'memory_pressure_oom_kills_24h %s\n' "${oom_count:-0}"
        metric_header memory_pressure_oom_last_kill_timestamp \
            "Unix timestamp of last OOM" gauge
        printf 'memory_pressure_oom_last_kill_timestamp %s\n' "${oom_timestamp:-0}"
        if [ -n "$oom_victim" ]; then
            metric_header memory_pressure_oom_last_victim \
                "Last killed process (1)" gauge
            # Process names are arbitrary text, so escape before quoting.
            printf 'memory_pressure_oom_last_victim{process="%s"} 1\n' \
                "$(escape_label_value "$oom_victim")"
        fi
    fi

    # ========================================================================
    # Swap Activity (two samples, SAMPLE_INTERVAL seconds apart)
    # ========================================================================
    local interval page_size
    local swap_in_before swap_out_before swap_in_after swap_out_after
    local in_pps out_pps in_bps out_bps
    interval=$(resolve_sample_interval)
    page_size=$(getconf PAGESIZE 2>/dev/null)
    [[ "$page_size" =~ ^[0-9]+$ ]] || page_size=4096

    read -r swap_in_before swap_out_before <<< "$(get_swap_counters)"
    sleep "$interval"
    read -r swap_in_after swap_out_after <<< "$(get_swap_counters)"

    read -r in_pps out_pps in_bps out_bps < <(
        awk -v pin0="${swap_in_before:-0}" -v pout0="${swap_out_before:-0}" \
            -v pin1="${swap_in_after:-0}" -v pout1="${swap_out_after:-0}" \
            -v secs="$interval" -v page="$page_size" '
            BEGIN {
                in_rate  = (pin1 - pin0) / secs
                out_rate = (pout1 - pout0) / secs
                # A counter that went backwards is reported as no activity.
                if (in_rate < 0)  in_rate = 0
                if (out_rate < 0) out_rate = 0
                printf "%.2f %.2f %.2f %.2f\n", in_rate, out_rate, in_rate * page, out_rate * page
            }'
    )

    metric_header memory_pressure_swap_in_pages_per_sec "Swap in pages/sec" gauge
    printf 'memory_pressure_swap_in_pages_per_sec %s\n' "${in_pps:-0}"
    metric_header memory_pressure_swap_out_pages_per_sec "Swap out pages/sec" gauge
    printf 'memory_pressure_swap_out_pages_per_sec %s\n' "${out_pps:-0}"
    metric_header memory_pressure_swap_in_bytes_per_sec "Swap in bytes/sec" gauge
    printf 'memory_pressure_swap_in_bytes_per_sec %s\n' "${in_bps:-0}"
    metric_header memory_pressure_swap_out_bytes_per_sec "Swap out bytes/sec" gauge
    printf 'memory_pressure_swap_out_bytes_per_sec %s\n' "${out_bps:-0}"

    # ========================================================================
    # NUMA Memory (multi-node systems only)
    # ========================================================================
    # Samples are grouped per metric so each HELP/TYPE header appears once.
    # NOTE(review): the node label carries the sysfs name ("node0"); confirm
    # the label value existing dashboards expect.
    if is_numa_multi_node; then
        get_numa_memory | awk '
            NF >= 3 {
                n++
                node[n]  = $1
                total[n] = $2 * 1024
                free[n]  = $3 * 1024
            }
            END {
                if (n == 0) exit
                print "# HELP memory_pressure_numa_total_bytes Total memory per node"
                print "# TYPE memory_pressure_numa_total_bytes gauge"
                for (i = 1; i <= n; i++)
                    printf "memory_pressure_numa_total_bytes{node=\"%s\"} %.0f\n", node[i], total[i]
                print "# HELP memory_pressure_numa_free_bytes Free memory per node"
                print "# TYPE memory_pressure_numa_free_bytes gauge"
                for (i = 1; i <= n; i++)
                    printf "memory_pressure_numa_free_bytes{node=\"%s\"} %.0f\n", node[i], free[i]
                print "# HELP memory_pressure_numa_used_percent Usage percentage per node"
                print "# TYPE memory_pressure_numa_used_percent gauge"
                for (i = 1; i <= n; i++) {
                    used = (total[i] > 0) ? (total[i] - free[i]) * 100 / total[i] : 0
                    printf "memory_pressure_numa_used_percent{node=\"%s\"} %.2f\n", node[i], used
                }
            }
        '
    fi

    # ========================================================================
    # Transparent Hugepages & Compaction
    # ========================================================================
    local thp_fault thp_collapse thp_fallback compact_stall
    read -r thp_fault thp_collapse thp_fallback compact_stall <<< "$(get_thp_stats)"

    metric_header memory_pressure_thp_fault_alloc_total "THP fault allocations" counter
    printf 'memory_pressure_thp_fault_alloc_total %s\n' "${thp_fault:-0}"
    metric_header memory_pressure_thp_collapse_alloc_total "THP collapse allocations" counter
    printf 'memory_pressure_thp_collapse_alloc_total %s\n' "${thp_collapse:-0}"
    metric_header memory_pressure_thp_fault_fallback_total "THP fault fallbacks" counter
    printf 'memory_pressure_thp_fault_fallback_total %s\n' "${thp_fallback:-0}"
    metric_header memory_pressure_compact_stall_total "Compaction stalls" counter
    printf 'memory_pressure_compact_stall_total %s\n' "${compact_stall:-0}"

    # ========================================================================
    # Slab Memory
    # ========================================================================
    local slab_reclaimable_kb slab_unreclaimable_kb
    read -r slab_reclaimable_kb slab_unreclaimable_kb <<< "$(get_slab_stats)"

    awk -v rec_kb="${slab_reclaimable_kb:-0}" -v unrec_kb="${slab_unreclaimable_kb:-0}" '
        BEGIN {
            rec = rec_kb * 1024
            unrec = unrec_kb * 1024
            total = rec + unrec
            pct = (total > 0) ? unrec * 100 / total : 0
            print "# HELP memory_pressure_slab_reclaimable_bytes Reclaimable slab"
            print "# TYPE memory_pressure_slab_reclaimable_bytes gauge"
            printf "memory_pressure_slab_reclaimable_bytes %.0f\n", rec
            print "# HELP memory_pressure_slab_unreclaimable_bytes Unreclaimable slab"
            print "# TYPE memory_pressure_slab_unreclaimable_bytes gauge"
            printf "memory_pressure_slab_unreclaimable_bytes %.0f\n", unrec
            print "# HELP memory_pressure_slab_total_bytes Total slab"
            print "# TYPE memory_pressure_slab_total_bytes gauge"
            printf "memory_pressure_slab_total_bytes %.0f\n", total
            print "# HELP memory_pressure_slab_unreclaimable_percent Unreclaimable percentage"
            print "# TYPE memory_pressure_slab_unreclaimable_percent gauge"
            printf "memory_pressure_slab_unreclaimable_percent %.2f\n", pct
        }'

    # ========================================================================
    # Zone Watermarks
    # ========================================================================
    get_zone_watermarks | awk '
        NF >= 5 {
            n++
            zone[n] = $1; free[n] = $2; min_wm[n] = $3; low_wm[n] = $4; high_wm[n] = $5
        }
        END {
            if (n == 0) exit
            print "# HELP memory_pressure_zone_free_pages Current free pages"
            print "# TYPE memory_pressure_zone_free_pages gauge"
            for (i = 1; i <= n; i++)
                printf "memory_pressure_zone_free_pages{zone=\"%s\"} %.0f\n", zone[i], free[i]
            print "# HELP memory_pressure_zone_min_pages Min watermark"
            print "# TYPE memory_pressure_zone_min_pages gauge"
            for (i = 1; i <= n; i++)
                printf "memory_pressure_zone_min_pages{zone=\"%s\"} %.0f\n", zone[i], min_wm[i]
            print "# HELP memory_pressure_zone_low_pages Low watermark"
            print "# TYPE memory_pressure_zone_low_pages gauge"
            for (i = 1; i <= n; i++)
                printf "memory_pressure_zone_low_pages{zone=\"%s\"} %.0f\n", zone[i], low_wm[i]
            print "# HELP memory_pressure_zone_high_pages High watermark"
            print "# TYPE memory_pressure_zone_high_pages gauge"
            for (i = 1; i <= n; i++)
                printf "memory_pressure_zone_high_pages{zone=\"%s\"} %.0f\n", zone[i], high_wm[i]
            print "# HELP memory_pressure_zone_free_above_low 1 if free > low"
            print "# TYPE memory_pressure_zone_free_above_low gauge"
            for (i = 1; i <= n; i++)
                printf "memory_pressure_zone_free_above_low{zone=\"%s\"} %d\n", zone[i], (free[i] + 0 > low_wm[i] + 0) ? 1 : 0
        }
    '

    # ========================================================================
    # Exporter Self-Metrics
    # ========================================================================
    local script_end
    script_end=$(date +%s)
    metric_header memory_pressure_exporter_duration_seconds "Script execution time" gauge
    printf 'memory_pressure_exporter_duration_seconds %d\n' "$(( script_end - script_start ))"
    metric_header memory_pressure_exporter_last_run_timestamp "Last run timestamp" gauge
    printf 'memory_pressure_exporter_last_run_timestamp %d\n' "$script_end"
}

# ============================================================================
# HTTP SERVER
# ============================================================================

# Answer one HTTP request: read the request line from stdin and write the
# response to stdout. "GET /metrics..." gets the metrics; any other request
# (including an empty one) gets the HTML landing page.
# NOTE(review): the landing-page markup was rebuilt from its surviving text.
handle_http_request() {
    local request=""
    read -r request

    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r\n'
        generate_metrics
    else
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'
        cat <<EOF
<html>
<head><title>Memory Pressure Exporter v${EXPORTER_VERSION}</title></head>
<body>
<h1>Memory Pressure Exporter v${EXPORTER_VERSION}</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>PSI memory and I/O pressure (requires kernel 4.20+)</li>
<li>OOM kill tracking (requires journalctl)</li>
<li>Swap activity rates</li>
<li>NUMA memory balance (requires multi-node system)</li>
<li>Transparent hugepage and compaction stats</li>
<li>Slab memory pressure</li>
<li>Zone watermark proximity</li>
</ul>
</body>
</html>
EOF
    fi
}

# Serve requests forever on HTTP_PORT, one connection at a time, using nc.
# Exits 1 if nc is missing or the request FIFO cannot be created.
#
# The handler must read what nc *receives*. Piping "{ read ...; } | nc" alone
# leaves the handler reading the script's own stdin, so the request line is
# never seen and /metrics is never matched. nc's stdout is therefore fed back
# to the handler through a FIFO.
# NOTE(review): the startup message wording is a reconstruction.
run_http_server() {
    echo "Starting Memory Pressure Exporter on port $HTTP_PORT" >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    local fifo_dir request_fifo nc_status
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Failed to create temporary directory" >&2
        exit 1
    fi
    CLEANUP_PATHS+=("$fifo_dir")
    trap cleanup EXIT

    request_fifo="$fifo_dir/request"
    if ! mkfifo "$request_fifo"; then
        echo "ERROR: Failed to create request FIFO" >&2
        exit 1
    fi

    while true; do
        handle_http_request < "$request_fifo" \
            | nc -l -p "$HTTP_PORT" -q 1 > "$request_fifo" 2>/dev/null
        nc_status=${PIPESTATUS[1]}
        # Back off when nc fails (e.g. port already in use) instead of
        # spinning at full speed.
        if [ "$nc_status" -ne 0 ]; then
            sleep 1
        fi
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: parse options, then run in HTTP, textfile or stdout mode.
# Textfile mode writes to a temp file in the target directory and renames it
# into place, so a reader never sees a partial file; an output shorter than
# 10 lines is rejected and the previous file is kept.
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        local output_dir
        output_dir="$(dirname "$OUTPUT_FILE")"
        if ! mkdir -p "$output_dir"; then
            echo "ERROR: Cannot create directory $output_dir" >&2
            exit 1
        fi

        local temp_file
        if ! temp_file=$(mktemp "${output_dir}/.memory_pressure_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temporary file in $output_dir" >&2
            exit 1
        fi
        # Remove the temp file on every exit path, including interruption.
        CLEANUP_PATHS+=("$temp_file")
        trap cleanup EXIT

        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        local file_lines
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        file_lines=${file_lines//[[:space:]]/}
        if [ "${file_lines:-0}" -lt 10 ]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        chmod 644 "$temp_file"
        if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
            echo "ERROR: Failed to move metrics to $OUTPUT_FILE" >&2
            exit 1
        fi
        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        generate_metrics
    fi
}

main "$@"