#!/bin/bash
################################################################################
# Script Name: process-metrics-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for system process metrics. Collects CPU,
#              memory, virtual memory per process, process state distribution,
#              system load averages, TCP connection states, top resource
#              consumers, file handles by program, and network connections.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - ps, awk, lsof, ss (or netstat)
#   - timeout (optional; bounds lsof run time when available)
#   - netcat (nc or ncat) for HTTP mode
#
# Usage:
#   ./process-metrics-exporter.sh              # stdout
#   ./process-metrics-exporter.sh --textfile   # node_exporter textfile
#   ./process-metrics-exporter.sh --http       # HTTP server on port 9200
#
# Metrics Exported:
#   Per-Process:
#     - node_cpu_usage{process,pid,owner,state}       - CPU % per process
#     - node_memory_usage_bytes{process,pid,owner}    - RSS memory per process
#     - node_virtual_memory_bytes{process,pid,owner}  - Virtual memory per process
#
#   System:
#     - node_process_state_total{state}       - Process count by state
#     - node_system_load{period}              - Load averages (1m,5m,15m)
#     - node_network_connection_state{state}  - TCP connections by state
#
#   Top Consumers:
#     - node_top_cpu_consumer{rank,process,pid,owner}     - Top 10 CPU processes
#     - node_top_memory_consumer{rank,process,pid,owner}  - Top 10 memory processes
#
#   File Handles:
#     - node_file_handles_by_program{program}        - Open files per program
#     - node_file_handle_connection{fd,command,...}  - Network connections by process
#     - node_file_handles_open                       - System open file handles
#     - node_file_handles_allocated                  - Allocated file handles
#     - node_file_handles_max                        - Maximum file handle limit
#
#   Exporter:
#     - node_process_metrics_up                  - Exporter status (1=up)
#     - node_process_metrics_duration_seconds    - Script execution time
#     - node_process_metrics_last_run_timestamp  - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9200
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# No `-e` on purpose: individual collectors are best-effort and guard their own
# failures so one missing data source does not abort the whole scrape.
set -uo pipefail

# ============================================================================
# CONFIGURATION
# ============================================================================
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9200
PROCESS_LIMIT=1000        # max ps rows turned into per-process series
FILE_HANDLER_LIMIT=1000   # max lsof rows scanned for the per-program count
FHCONN_LIMIT=50           # max `lsof -i` connection rows considered

# ============================================================================
# USAGE
# ============================================================================

#######################################
# Print the help text and exit.
# NOTE(review): the original heredoc was lost when the file was flattened;
# this text is rebuilt from the header's Usage section - confirm against
# upstream.
# Globals:   TEXTFILE_DIR, HTTP_PORT (read)
# Arguments: $1 - exit status (optional, default 0)
# Outputs:   usage text on stdout
#######################################
show_usage() {
    cat <<EOF
Usage: ${0##*/} [OPTIONS]

Prometheus exporter for system process metrics.

Options:
  --textfile     Write metrics to ${TEXTFILE_DIR}/process_metrics.prom
  --http         Serve metrics over HTTP (default port: ${HTTP_PORT})
  --port PORT    TCP port for --http mode
  -h, --help     Show this help and exit

With no options, metrics are printed to stdout.
EOF
    exit "${1:-0}"
}

#######################################
# Parse command-line options into the configuration globals.
# NOTE(review): reconstructed. --textfile/--http/--help come from the header's
# documented usage; --port is an added convenience for the existing HTTP_PORT
# setting, and the textfile name is assumed - confirm against upstream.
# Globals:   OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written); TEXTFILE_DIR (read)
# Arguments: the script's "$@"
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --textfile)
                OUTPUT_FILE="${TEXTFILE_DIR}/process_metrics.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            --port)
                if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
                    echo "# ERROR: --port requires a numeric argument" >&2
                    show_usage 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            -h|--help)
                show_usage
                ;;
            *)
                echo "# ERROR: Unknown option: $1" >&2
                show_usage 1
                ;;
        esac
    done
}

# ============================================================================
# PREFLIGHT
# ============================================================================

#######################################
# Verify the external tools the collectors depend on.
# Exits 1 when a hard requirement is missing; only warns when neither ss nor
# netstat is present because the TCP-state section is simply skipped then.
#######################################
preflight() {
    local missing=()
    local cmd
    for cmd in ps awk lsof; do
        command -v "$cmd" &>/dev/null || missing+=("$cmd")
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        echo "# ERROR: Missing required commands: ${missing[*]}" >&2
        exit 1
    fi

    if ! command -v ss &>/dev/null && ! command -v netstat &>/dev/null; then
        echo "# WARN: Neither ss nor netstat found — network metrics unavailable" >&2
    fi
}

# ============================================================================
# METRICS COLLECTION
# ============================================================================

#######################################
# Collect every metric family and print it in Prometheus text format.
# Globals:   PROCESS_LIMIT, FILE_HANDLER_LIMIT, FHCONN_LIMIT (read)
# Outputs:   metrics on stdout
#######################################
collect_metrics() {
    local start_time
    start_time=$(date +%s%N)
    local now
    now=$(date +%s)

    echo "# HELP node_process_metrics_up Exporter status (1=up)"
    echo "# TYPE node_process_metrics_up gauge"
    echo "node_process_metrics_up 1"
    echo ""

    # --- Per-process CPU, memory, virtual memory (single-pass awk) ---
    # LC_ALL=C keeps the %CPU column using "." as decimal separator, which the
    # awk validation below (and sort -n further down) relies on.
    local ps_output
    ps_output=$(LC_ALL=C ps -eo pid,ppid,user,pcpu,rss,vsz,stat,comm --no-headers -ww 2>/dev/null) || true

    if [[ -n "$ps_output" ]]; then
        printf '%s\n' "$ps_output" | awk -v limit="$PROCESS_LIMIT" '
        {
            if (NF < 8 || NR > limit) next
            if ($1 !~ /^[0-9]+$/ || $4 !~ /^[0-9]+\.?[0-9]*$/) next
            if ($5 !~ /^[0-9]+$/ || $6 !~ /^[0-9]+$/) next

            # Reduce label values to a conservative character set.
            gsub(/[^a-zA-Z0-9_:-]/, "_", $8); gsub(/_+/, "_", $8)
            gsub(/[^a-zA-Z0-9_:-]/, "_", $3); gsub(/_+/, "_", $3)
            gsub(/[^a-zA-Z0-9_:-]/, "_", $7); gsub(/_+/, "_", $7)

            # ps reports RSS/VSZ in KiB. %.0f rather than %d: some awk
            # implementations clamp %d to 32 bits, which truncates values
            # above 2 GiB.
            n++
            cpu[n]  = sprintf("node_cpu_usage{process=\"%s\",pid=\"%s\",owner=\"%s\",state=\"%s\"} %s", $8, $1, $3, $7, $4)
            mem[n]  = sprintf("node_memory_usage_bytes{process=\"%s\",pid=\"%s\",owner=\"%s\"} %.0f", $8, $1, $3, $5 * 1024)
            vmem[n] = sprintf("node_virtual_memory_bytes{process=\"%s\",pid=\"%s\",owner=\"%s\"} %.0f", $8, $1, $3, $6 * 1024)
        }
        END {
            # Indexed loops keep the output in ps order; "for (i in arr)"
            # iterates in an unspecified order.
            print "# HELP node_cpu_usage CPU usage by process"
            print "# TYPE node_cpu_usage gauge"
            for (i = 1; i <= n; i++) print cpu[i]
            print ""
            print "# HELP node_memory_usage_bytes Physical memory (RSS) by process in bytes"
            print "# TYPE node_memory_usage_bytes gauge"
            for (i = 1; i <= n; i++) print mem[i]
            print ""
            print "# HELP node_virtual_memory_bytes Virtual memory by process in bytes"
            print "# TYPE node_virtual_memory_bytes gauge"
            for (i = 1; i <= n; i++) print vmem[i]
            print ""
        }'
    fi

    # --- Process state distribution ---
    local stat_output
    stat_output=$(ps -eo stat --no-headers 2>/dev/null) || true

    if [[ -n "$stat_output" ]]; then
        echo "# HELP node_process_state_total Process count by state"
        echo "# TYPE node_process_state_total gauge"
        printf '%s\n' "$stat_output" | awk '
        {
            # Only the first character of STAT is the state; the rest are flags.
            s = substr($1, 1, 1)
            count[s]++
        }
        END {
            for (s in count) {
                if (s == "R") name = "running"
                else if (s == "S") name = "sleeping"
                else if (s == "D") name = "uninterruptible_sleep"
                else if (s == "Z") name = "zombie"
                else if (s == "T") name = "stopped"
                else if (s == "I") name = "idle"
                else name = "other"
                printf "node_process_state_total{state=\"%s\"} %d\n", name, count[s]
            }
        }'
        echo ""
    fi

    # --- System load averages ---
    if [[ -r /proc/loadavg ]]; then
        echo "# HELP node_system_load System load averages"
        echo "# TYPE node_system_load gauge"
        awk '{
            printf "node_system_load{period=\"1m\"} %s\n", $1
            printf "node_system_load{period=\"5m\"} %s\n", $2
            printf "node_system_load{period=\"15m\"} %s\n", $3
        }' /proc/loadavg
        echo ""
    fi

    # --- TCP connection states ---
    local net_output=""
    if command -v ss &>/dev/null; then
        net_output=$(ss -nt 2>/dev/null) || true
    elif command -v netstat &>/dev/null; then
        net_output=$(netstat -nt 2>/dev/null) || true
    fi

    if [[ -n "$net_output" ]]; then
        echo "# HELP node_network_connection_state TCP connections by state"
        echo "# TYPE node_network_connection_state gauge"
        printf '%s\n' "$net_output" | awk '
        /^tcp/ || /^ESTAB/ || /^CLOSE/ || /^TIME/ || /^LISTEN/ || /^SYN/ || /^FIN/ || /^LAST/ {
            # ss puts the state in column 1; netstat rows start with "tcp"
            # and carry the state in the last column.
            state = $1
            if ($1 ~ /^tcp/) state = $NF
            count[state]++
        }
        END {
            for (s in count)
                printf "node_network_connection_state{state=\"%s\"} %d\n", s, count[s]
        }'
        echo ""
    fi

    # --- Top 10 CPU and memory consumers ---
    if [[ -n "$ps_output" ]]; then
        local top_cpu top_mem
        top_cpu=$(printf '%s\n' "$ps_output" | LC_ALL=C sort -k4,4nr | head -n 10)
        top_mem=$(printf '%s\n' "$ps_output" | LC_ALL=C sort -k5,5nr | head -n 10)

        echo "# HELP node_top_cpu_consumer Top 10 CPU consuming processes"
        echo "# TYPE node_top_cpu_consumer gauge"
        printf '%s\n' "$top_cpu" | awk '
        {
            gsub(/[^a-zA-Z0-9_:-]/, "_", $8); gsub(/_+/, "_", $8)
            gsub(/[^a-zA-Z0-9_:-]/, "_", $3); gsub(/_+/, "_", $3)
            printf "node_top_cpu_consumer{rank=\"%d\",process=\"%s\",pid=\"%s\",owner=\"%s\"} %s\n", NR, $8, $1, $3, $4
        }'
        echo ""

        echo "# HELP node_top_memory_consumer Top 10 memory consuming processes (bytes)"
        echo "# TYPE node_top_memory_consumer gauge"
        printf '%s\n' "$top_mem" | awk '
        {
            gsub(/[^a-zA-Z0-9_:-]/, "_", $8); gsub(/_+/, "_", $8)
            gsub(/[^a-zA-Z0-9_:-]/, "_", $3); gsub(/_+/, "_", $3)
            printf "node_top_memory_consumer{rank=\"%d\",process=\"%s\",pid=\"%s\",owner=\"%s\"} %.0f\n", NR, $8, $1, $3, $5 * 1024
        }'
        echo ""
    fi

    # lsof can hang on stale mounts; bound it with timeout when available, but
    # still run it unbounded rather than silently losing the metrics when
    # timeout is not installed.
    local -a lsof_cmd=(lsof)
    if command -v timeout &>/dev/null; then
        lsof_cmd=(timeout 60 lsof)
    fi

    # --- File handles by program ---
    # Only the COMMAND column is used here, so -n/-P (skip host and port name
    # lookups) do not affect the result.
    local lsof_output
    lsof_output=$("${lsof_cmd[@]}" -n -P 2>/dev/null) || true

    if [[ -n "$lsof_output" ]]; then
        echo "# HELP node_file_handles_by_program Open file handles by program"
        echo "# TYPE node_file_handles_by_program gauge"
        printf '%s\n' "$lsof_output" | awk -v limit="$FILE_HANDLER_LIMIT" '
        NR == 1 { next }       # column header
        NR > limit { exit }
        {
            if (NF >= 5 && $1 != "" && $2 ~ /^[0-9]+$/) {
                gsub(/[^a-zA-Z0-9_:-]/, "_", $1)
                gsub(/_+/, "_", $1)
                count[$1]++
            }
        }
        END {
            for (cmd in count)
                if (count[cmd] > 0)
                    printf "node_file_handles_by_program{program=\"%s\"} %d\n", cmd, count[cmd]
        }'
        echo ""
    fi

    # --- File handle connections (network, deduplicated) ---
    # +1 so the header row does not eat one slot of the connection limit.
    # head closing the pipe early makes lsof fail under pipefail; that is
    # expected, hence the `|| true`.
    local conn_output
    conn_output=$("${lsof_cmd[@]}" -i 2>/dev/null | head -n "$((FHCONN_LIMIT + 1))") || true

    if [[ -n "$conn_output" ]]; then
        echo "# HELP node_file_handle_connection Network connections by process"
        echo "# TYPE node_file_handle_connection gauge"
        printf '%s\n' "$conn_output" | awk -v limit="$FHCONN_LIMIT" '
        NR == 1 { next }
        NR > limit + 1 { exit }
        {
            if (NF < 9) next
            gsub(/[\\]/, "_", $1); gsub(/[*:]/, "_", $2); gsub(/US\\/, "", $3); gsub(/[*:\\]/, "_", $9)
            gsub(/["\n\r\t\\]/, "_", $1); gsub(/["\n\r\t\\]/, "_", $3)
            gsub(/["\n\r\t\\]/, "_", $4); gsub(/["\n\r\t\\]/, "_", $5)
            gsub(/["\n\r\t\\]/, "_", $8); gsub(/["\n\r\t\\]/, "_", $9)
            # The PID is the sample value, not a label, so it is left out of
            # the key: identical label sets must be emitted only once.
            key = $4 $1 $9 $3 $5 $8
            if (!(key in seen)) {
                seen[key] = 1
                printf "node_file_handle_connection{fd=\"%s\",command=\"%s\",connection=\"%s\",user=\"%s\",protocol=\"%s\",type=\"%s\"} %s\n", $4, $1, $9, $3, $5, $8, $2
            }
        }' | sort -u
        echo ""
    fi

    # --- System-wide file handle stats ---
    if [[ -r /proc/sys/fs/file-nr ]]; then
        local file_nr
        file_nr=$(< /proc/sys/fs/file-nr)
        if [[ "$file_nr" =~ ^[0-9]+[[:space:]]+[0-9]+[[:space:]]+[0-9]+$ ]]; then
            local fh_open fh_alloc fh_max
            read -r fh_open fh_alloc fh_max <<< "$file_nr"
            echo "# HELP node_file_handles_open Currently open file handles"
            echo "# TYPE node_file_handles_open gauge"
            echo "node_file_handles_open ${fh_open}"
            echo ""
            echo "# HELP node_file_handles_allocated Allocated file handles"
            echo "# TYPE node_file_handles_allocated gauge"
            echo "node_file_handles_allocated ${fh_alloc}"
            echo ""
            echo "# HELP node_file_handles_max Maximum file handle limit"
            echo "# TYPE node_file_handles_max gauge"
            echo "node_file_handles_max ${fh_max}"
            echo ""
        fi
    fi

    # --- Exporter metadata ---
    # Pure shell arithmetic (millisecond precision). Falls back to 0 when date
    # does not support %N and the timestamps are not plain integers.
    local end_time duration="0"
    end_time=$(date +%s%N)
    if [[ "$start_time" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
        local elapsed_ns=$(( end_time - start_time ))
        (( elapsed_ns < 0 )) && elapsed_ns=0
        printf -v duration '%d.%03d' \
            "$(( elapsed_ns / 1000000000 ))" \
            "$(( (elapsed_ns % 1000000000) / 1000000 ))"
    fi

    echo "# HELP node_process_metrics_duration_seconds Script execution time"
    echo "# TYPE node_process_metrics_duration_seconds gauge"
    echo "node_process_metrics_duration_seconds ${duration}"
    echo ""
    echo "# HELP node_process_metrics_last_run_timestamp Last successful run (unix timestamp)"
    echo "# TYPE node_process_metrics_last_run_timestamp gauge"
    echo "node_process_metrics_last_run_timestamp ${now}"
}

# ============================================================================
# OUTPUT HANDLING
# ============================================================================

#######################################
# Run one collection and write it to stdout or, when OUTPUT_FILE is set, to
# that file via a temp file + rename so a reader never sees a partial file.
# Globals:   OUTPUT_FILE (read)
# Outputs:   metrics on stdout (stdout mode); status messages on stderr
#######################################
output_metrics() {
    local metrics
    metrics=$(collect_metrics)

    if [[ -z "$OUTPUT_FILE" ]]; then
        printf '%s\n' "$metrics"
        return 0
    fi

    local output_dir
    output_dir="$(dirname -- "$OUTPUT_FILE")"
    if ! mkdir -p -- "$output_dir"; then
        echo "# ERROR: Cannot create output directory ${output_dir}" >&2
        exit 1
    fi

    local temp_file
    if ! temp_file=$(mktemp "${output_dir}/.process_metrics.XXXXXX"); then
        echo "# ERROR: Cannot create temp file in ${output_dir}" >&2
        exit 1
    fi

    printf '%s\n' "$metrics" > "$temp_file"

    # A near-empty result means collection failed; keep the previous file.
    local file_lines
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
    if [[ "$file_lines" -lt 10 ]]; then
        rm -f -- "$temp_file"
        echo "# ERROR: Metrics file too small (${file_lines} lines), keeping previous" >&2
        exit 1
    fi

    chmod 644 "$temp_file"
    if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
        rm -f -- "$temp_file"
        echo "# ERROR: Cannot move metrics into place at ${OUTPUT_FILE}" >&2
        exit 1
    fi
    echo "# Metrics written to ${OUTPUT_FILE} (${file_lines} lines)" >&2
}

#######################################
# Print a complete HTTP/1.1 200 response carrying the given metrics body.
# Content-Length is measured in bytes and includes the trailing newline that
# is sent with the body.
# Arguments: $1 - metrics text (without trailing newline)
# Outputs:   HTTP response on stdout
#######################################
_http_response() {
    local body=$1
    local length
    length=$(printf '%s\n' "$body" | wc -c)
    # printf '%s' emits the body verbatim; `echo -e` would interpret any
    # backslash sequence that happens to appear inside a metric line.
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s\n' \
        "$(( length ))" "$body"
}

#######################################
# Serve metrics over HTTP forever, one freshly collected response per
# connection, using nc/ncat as the listener.
# Globals:   HTTP_PORT (read)
#######################################
serve_http() {
    echo "# Starting HTTP server on port ${HTTP_PORT}" >&2
    echo "# Metrics endpoint: http://localhost:${HTTP_PORT}/metrics" >&2

    local nc_cmd
    if command -v ncat &>/dev/null; then
        nc_cmd="ncat"
    elif command -v nc &>/dev/null; then
        nc_cmd="nc"
    else
        echo "# ERROR: netcat (nc/ncat) required for HTTP mode" >&2
        exit 1
    fi

    local metrics
    while true; do
        metrics=$(collect_metrics)

        # netcat variants disagree on flags: try the "-l -p PORT -q 1" form
        # first, then the plain "-l PORT" form.
        if ! _http_response "$metrics" | "$nc_cmd" -l -p "$HTTP_PORT" -q 1 2>/dev/null; then
            if ! _http_response "$metrics" | "$nc_cmd" -l "$HTTP_PORT" 2>/dev/null; then
                # Both forms failed (e.g. port already in use): back off so the
                # loop does not re-run the collectors continuously.
                sleep 1
            fi
        fi
    done
}

# ============================================================================
# MAIN
# ============================================================================
main() {
    parse_args "$@"
    preflight

    if [[ "$HTTP_MODE" == "true" ]]; then
        serve_http
    else
        output_metrics
    fi
}

main "$@"