#!/bin/bash
################################################
#### Podman Container Prometheus Exporter   ####
#### for node_exporter textfile collector   ####
####                                        ####
#### Author: Phil Connor                    ####
#### Contact: contact@mylinux.work          ####
#### Version: 1.0.0.20260327                ####
################################################

set -euo pipefail

SCRIPT_NAME=$(basename "$0")
readonly SCRIPT_NAME

# Default configuration
readonly DEFAULT_NODE_DIR="/var/lib/node_exporter"
readonly DEFAULT_COLLECTION_INTERVAL=60
readonly DEFAULT_PODMAN_CMD="podman"
readonly DEFAULT_HTTP_PORT=9101

# Configuration variables (can be overridden by environment)
NODE_DIR=${NODE_DIR:-$DEFAULT_NODE_DIR}
COLLECTION_INTERVAL=${COLLECTION_INTERVAL:-$DEFAULT_COLLECTION_INTERVAL}
PODMAN_CMD=${PODMAN_CMD:-$DEFAULT_PODMAN_CMD}
HTTP_PORT=${HTTP_PORT:-$DEFAULT_HTTP_PORT}
DEBUG=${DEBUG:-}

# Runtime flags
RUN_MODE="once"

# HTTP mode PID tracking for cleanup
HTTP_PID=""

# Report the failing line and exit with the failing command's status.
# Arguments:
#   $1 - exit code of the failed command
#   $2 - line number where it failed
handle_error() {
  local exit_code=$1
  local line_number=$2
  echo "Error: $SCRIPT_NAME failed at line $line_number with exit code $exit_code" >&2
  exit "$exit_code"
}

trap 'handle_error $? $LINENO' ERR

# EXIT handler: stop the background process recorded in HTTP_PID (if it is
# still alive) and remove leftover temp files next to the metrics file.
cleanup() {
  if [[ -n "$HTTP_PID" ]] && kill -0 "$HTTP_PID" 2>/dev/null; then
    kill "$HTTP_PID" 2>/dev/null || true
  fi
  # Remove any leftover temp files
  rm -f "${NODE_DIR}/textfile_collector/podman_containers.prom."* 2>/dev/null || true
}

trap cleanup EXIT

# Print a "[DEBUG]"-prefixed message to stderr when DEBUG is non-empty.
debug_echo() {
  if [[ -n "$DEBUG" ]]; then
    echo "[DEBUG] $*" >&2
  fi
}

# Print usage to stdout and exit.
# Arguments:
#   $1 - exit status to use (optional, default 0)
show_help() {
  cat << EOF
Usage: $SCRIPT_NAME [OPTIONS]

Podman container metrics collector for Prometheus node_exporter textfile directory.
Collects per-container resource usage (CPU, memory, network, block I/O), state,
restart counts, exit codes, and uptime via podman stats/inspect and writes them
as Prometheus metrics. Supports both rootless and rootful Podman.

OPTIONS:
    --once              Run collection once and exit (default)
    --daemon            Run continuously at COLLECTION_INTERVAL
    --http              Serve metrics over HTTP on HTTP_PORT
    --help, -h          Show this help message

ENVIRONMENT VARIABLES:
    NODE_DIR              Node exporter textfile directory (default: $DEFAULT_NODE_DIR)
    COLLECTION_INTERVAL   Seconds between collections in daemon mode (default: $DEFAULT_COLLECTION_INTERVAL)
    PODMAN_CMD            Podman binary path or command (default: $DEFAULT_PODMAN_CMD)
    HTTP_PORT             Port for --http mode (default: $DEFAULT_HTTP_PORT)
    DEBUG                 Enable debug output

EXAMPLES:
    $SCRIPT_NAME --once
    $SCRIPT_NAME --daemon
    COLLECTION_INTERVAL=30 $SCRIPT_NAME --daemon
    $SCRIPT_NAME --http
    PODMAN_CMD=/usr/bin/podman HTTP_PORT=9102 $SCRIPT_NAME --http

OUTPUT:
    Writes metrics to \$NODE_DIR/textfile_collector/podman_containers.prom
EOF
  exit "${1:-0}"
}

# Parse command-line options into RUN_MODE. When several mode flags are given
# the last one wins. An unknown option is a usage error: the usage text goes
# to stderr and the script exits with status 2 instead of reporting success.
# Arguments:
#   $@ - the script's command-line arguments
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --once)
        RUN_MODE="once"
        shift
        ;;
      --daemon)
        RUN_MODE="daemon"
        shift
        ;;
      --http)
        RUN_MODE="http"
        shift
        ;;
      --help | -h)
        show_help
        ;;
      *)
        echo "Unknown option: $1" >&2
        show_help 2 >&2
        ;;
    esac
  done
}

# Parse arguments
parse_args "$@"

# Detect rootless mode: prints "true" when not running as UID 0, else "false".
detect_rootless() {
  if [[ $EUID -ne 0 ]]; then
    echo "true"
  else
    echo "false"
  fi
}

ROOTLESS=$(detect_rootless)
readonly ROOTLESS

# Validate configuration: podman and jq must be resolvable, and (except in
# http mode) the textfile collector directory must exist and be writable.
# Exits with status 1 and a message on stderr when a check fails.
validate_config() {
  if ! command -v "$PODMAN_CMD" &>/dev/null; then
    echo "Error: $PODMAN_CMD is not installed or not in PATH" >&2
    exit 1
  fi

  if ! command -v jq &>/dev/null; then
    echo "Error: jq is not installed or not in PATH" >&2
    exit 1
  fi

  if [[ "$RUN_MODE" != "http" ]]; then
    local textfile_dir="${NODE_DIR}/textfile_collector"
    if [[ ! -d "$textfile_dir" ]]; then
      echo "Error: Textfile collector directory not found: $textfile_dir" >&2
      echo "Create it: sudo mkdir -p $textfile_dir" >&2
      exit 1
    fi
    # Fail early with a clear message rather than later inside mktemp.
    if [[ ! -w "$textfile_dir" ]]; then
      echo "Error: Textfile collector directory is not writable: $textfile_dir" >&2
      exit 1
    fi
  fi
}

# Parse human-readable byte strings (e.g.
# "1.23GiB", "456.7MB", "100kB") to bytes.
#
# Arguments:
#   $1 - size string; a space between number and unit ("1.5 GiB") and any
#        letter case in the unit are accepted. Unknown or missing units are
#        treated as plain bytes.
# Outputs:
#   Integer byte count on stdout (no trailing newline); "0" when the string
#   has no numeric part (e.g. "--").
parse_bytes() {
  local raw=${1:-}
  local num unit factor

  # Digits and dots form the value; what remains, minus whitespace, is the unit.
  num=${raw//[^0-9.]/}
  unit=${raw//[0-9.]/}
  unit=${unit//[[:space:]]/}
  unit=$(printf '%s' "$unit" | tr '[:upper:]' '[:lower:]')

  if [[ -z "$num" ]]; then
    echo "0"
    return
  fi

  case "$unit" in
    kb) factor=1000 ;;
    kib) factor=1024 ;;
    mb) factor=1000000 ;;
    mib) factor=1048576 ;;
    gb) factor=1000000000 ;;
    gib) factor=1073741824 ;;
    tb) factor=1000000000000 ;;
    tib) factor=1099511627776 ;;
    *) factor=1 ;;
  esac

  # Values are passed with -v so they are never parsed as awk program text.
  awk -v num="$num" -v factor="$factor" 'BEGIN { printf "%.0f", num * factor }'
}

# Escape a string for use inside a double-quoted Prometheus label value:
# backslash, double quote and newline are backslash-escaped.
# Arguments: $1 - raw label value
# Outputs:   escaped value on stdout (no trailing newline)
escape_label() {
  local value=${1:-}
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

# Print $1 unchanged when it is a plain decimal number, otherwise print 0, so
# that placeholders such as "--" or an empty string never become sample values.
numeric_or_zero() {
  local value=${1:-}
  if [[ "$value" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    printf '%s' "$value"
  else
    printf '0'
  fi
}

# Run a jq filter in raw-output mode against a JSON document held in a string.
# Arguments: $1 - JSON text, $2 - jq filter
stats_field() {
  jq -r "$2" <<<"$1"
}

# Collect metrics for all containers and atomically replace
# $NODE_DIR/textfile_collector/podman_containers.prom.
#
# Globals read: NODE_DIR, PODMAN_CMD, ROOTLESS
# Per-container samples are buffered and written grouped by metric family,
# each group directly after its HELP/TYPE lines.
collect_all() {
  local output_dir="${NODE_DIR}/textfile_collector"
  local output_file="${output_dir}/podman_containers.prom"
  local temp_file
  temp_file=$(mktemp "${output_file}.XXXXXX")

  local start_time
  start_time=$(date +%s%N)
  local success=1

  debug_echo "Starting collection (rootless=$ROOTLESS)..."

  # Gather container list with state information
  local ps_json
  ps_json=$("$PODMAN_CMD" ps -a --format json 2>/dev/null) || {
    debug_echo "Failed to run $PODMAN_CMD ps"
    success=0
    ps_json="[]"
  }

  local container_count
  container_count=$(jq 'length' <<<"$ps_json" 2>/dev/null) || container_count=""
  if [[ ! "$container_count" =~ ^[0-9]+$ ]]; then
    # Empty or unparseable output: report failure but still write a file.
    debug_echo "Could not parse output of $PODMAN_CMD ps"
    success=0
    ps_json="[]"
    container_count=0
  fi

  if ((container_count == 0)); then
    debug_echo "No containers found"
  fi

  # Gather stats for running containers
  local stats_json="[]"
  if ((container_count > 0)); then
    stats_json=$("$PODMAN_CMD" stats --no-stream --format json 2>/dev/null) || {
      debug_echo "Failed to run $PODMAN_CMD stats"
      stats_json="[]"
    }
    [[ -n "$stats_json" ]] || stats_json="[]"
  fi

  local now
  now=$(date +%s)

  # All per-container sample lines, newline-terminated; grouped on output.
  local samples=""

  # NOTE(review): the loop header, the extraction of container_id,
  # container_name, image and state, and the start of the inspect call were
  # missing from the source text. They are reconstructed here from the
  # variables the surviving code uses; the podman ps key names (Id/ID,
  # Names/Name, Image, State) are assumed - confirm against the original.
  local i
  for ((i = 0; i < container_count; i++)); do
    local container_id container_name image state
    container_id=$(jq -r --argjson i "$i" '.[$i] | (.Id // .ID // "")' <<<"$ps_json")
    container_name=$(jq -r --argjson i "$i" \
      '.[$i] | (.Names // .Name // "") | if type == "array" then (.[0] // "") else . end' <<<"$ps_json")
    image=$(jq -r --argjson i "$i" '.[$i].Image // ""' <<<"$ps_json")
    state=$(jq -r --argjson i "$i" '.[$i].State // "unknown" | tostring | ascii_downcase' <<<"$ps_json")

    local inspect_json
    inspect_json=$("$PODMAN_CMD" inspect "$container_id" 2>/dev/null) || {
      debug_echo "Failed to inspect container: $container_name"
      success=0
      continue
    }

    local restart_count exit_code started_at pod_name
    restart_count=$(numeric_or_zero "$(jq -r '.[0].RestartCount // 0' <<<"$inspect_json")")
    exit_code=$(numeric_or_zero "$(jq -r '.[0].State.ExitCode // 0' <<<"$inspect_json")")
    started_at=$(jq -r '.[0].State.StartedAt // ""' <<<"$inspect_json")
    pod_name=$(jq -r '.[0].Pod // ""' <<<"$inspect_json")

    # If pod is a hash ID, try to resolve the pod name
    if [[ "$pod_name" =~ ^[a-f0-9]{64}$ ]]; then
      local resolved_pod
      # Accept either an object or a one-element array from pod inspect.
      resolved_pod=$("$PODMAN_CMD" pod inspect "$pod_name" 2>/dev/null \
        | jq -r '(if type == "array" then .[0] else . end) | .Name // ""' 2>/dev/null) || resolved_pod=""
      if [[ -n "$resolved_pod" ]]; then
        pod_name="$resolved_pod"
      fi
    fi

    # Common label set; values are escaped so odd names cannot break the file
    local labels
    labels="name=\"$(escape_label "$container_name")\""
    labels+=",image=\"$(escape_label "$image")\""
    labels+=",pod=\"$(escape_label "$pod_name")\""

    # Running state
    local running=0
    if [[ "$state" == "running" ]]; then
      running=1
    fi
    samples+="podman_container_running{${labels},rootless=\"${ROOTLESS}\"} ${running}"$'\n'

    # Restart count and exit code
    samples+="podman_container_restart_count{${labels}} ${restart_count}"$'\n'
    samples+="podman_container_exit_code{${labels}} ${exit_code}"$'\n'

    # Uptime calculation (only for running containers with a real start time)
    local uptime=0
    if [[ -n "$started_at" && "$started_at" != "0001-01-01T00:00:00Z" && "$running" -eq 1 ]]; then
      local started_epoch
      started_epoch=$(date -d "$started_at" +%s 2>/dev/null) || started_epoch=0
      if [[ "$started_epoch" -gt 0 ]]; then
        uptime=$((now - started_epoch))
        if ((uptime < 0)); then
          uptime=0
        fi
      fi
    fi
    samples+="podman_container_uptime_seconds{${labels}} ${uptime}"$'\n'

    # Info metric (always 1)
    local short_id="${container_id:0:12}"
    samples+="podman_container_info{${labels},id=\"${short_id}\",status=\"$(escape_label "$state")\",rootless=\"${ROOTLESS}\"} 1"$'\n'

    # Resource metrics from stats (only available for running containers).
    # The selector accepts the same capitalised and lower-case key spellings
    # as the value lookups below, and also matches a 12-character ID.
    local stats_entry
    stats_entry=$(jq -c --arg name "$container_name" --arg id "$container_id" --arg sid "$short_id" \
      '[.[] | select(((.Name // .name) == $name)
                     or ((.ContainerID // .ID // .id) as $cid
                         | $cid != null and ($cid == $id or $cid == $sid)))]
       | first // empty' <<<"$stats_json" 2>/dev/null) || stats_entry=""

    if [[ -n "$stats_entry" ]]; then
      # CPU percent - strip the % sign
      local cpu_raw cpu_percent
      cpu_raw=$(stats_field "$stats_entry" '.CPU // .cpu_percent // "0"')
      cpu_percent=$(numeric_or_zero "${cpu_raw//%/}")
      samples+="podman_container_cpu_percent{${labels}} ${cpu_percent}"$'\n'

      # Memory usage and limit ("usage / limit", or separate limit field)
      local mem_combined mem_usage_raw mem_limit_raw mem_usage mem_limit
      mem_combined=$(stats_field "$stats_entry" '.MemUsage // .mem_usage // "0"')
      if [[ "$mem_combined" == *" / "* ]]; then
        mem_usage_raw=${mem_combined%% /*}
        mem_limit_raw=${mem_combined##*/ }
      else
        mem_usage_raw=$mem_combined
        mem_limit_raw=$(stats_field "$stats_entry" '.MemLimit // .mem_limit // "0"')
      fi
      mem_usage=$(parse_bytes "$mem_usage_raw")
      mem_limit=$(parse_bytes "$mem_limit_raw")
      samples+="podman_container_memory_usage_bytes{${labels}} ${mem_usage}"$'\n'
      samples+="podman_container_memory_limit_bytes{${labels}} ${mem_limit}"$'\n'

      # Network I/O
      local net_input_raw net_output_raw net_rx net_tx
      net_input_raw=$(stats_field "$stats_entry" '.NetInput // .net_input // "0"')
      net_input_raw=${net_input_raw%% /*}
      net_output_raw=$(stats_field "$stats_entry" '.NetOutput // .net_output // "0"')
      net_output_raw=${net_output_raw##*/ }
      # Handle combined "rx / tx" field
      if [[ "$net_input_raw" == "0" ]]; then
        local net_io
        net_io=$(stats_field "$stats_entry" '.NetIO // .net_io // ""')
        if [[ -n "$net_io" && "$net_io" != "null" ]]; then
          net_input_raw=${net_io%% /*}
          net_output_raw=${net_io##*/ }
        fi
      fi
      net_rx=$(parse_bytes "$net_input_raw")
      net_tx=$(parse_bytes "$net_output_raw")
      samples+="podman_container_network_rx_bytes{${labels}} ${net_rx}"$'\n'
      samples+="podman_container_network_tx_bytes{${labels}} ${net_tx}"$'\n'

      # Block I/O
      local block_input_raw block_output_raw block_read block_write
      block_input_raw=$(stats_field "$stats_entry" '.BlockInput // .block_input // "0"')
      block_input_raw=${block_input_raw%% /*}
      block_output_raw=$(stats_field "$stats_entry" '.BlockOutput // .block_output // "0"')
      block_output_raw=${block_output_raw##*/ }
      # Handle combined "read / write" field
      if [[ "$block_input_raw" == "0" ]]; then
        local block_io
        block_io=$(stats_field "$stats_entry" '.BlockIO // .block_io // ""')
        if [[ -n "$block_io" && "$block_io" != "null" ]]; then
          block_input_raw=${block_io%% /*}
          block_output_raw=${block_io##*/ }
        fi
      fi
      block_read=$(parse_bytes "$block_input_raw")
      block_write=$(parse_bytes "$block_output_raw")
      samples+="podman_container_block_read_bytes{${labels}} ${block_read}"$'\n'
      samples+="podman_container_block_write_bytes{${labels}} ${block_write}"$'\n'

      # PIDs
      local pids
      pids=$(numeric_or_zero "$(stats_field "$stats_entry" '.PIDs // .pids // 0')")
      samples+="podman_container_pids{${labels}} ${pids}"$'\n'
    else
      # Container is not running - emit zeroed resource metrics
      local zero_metric
      for zero_metric in cpu_percent memory_usage_bytes memory_limit_bytes \
        network_rx_bytes network_tx_bytes block_read_bytes block_write_bytes pids; do
        samples+="podman_container_${zero_metric}{${labels}} 0"$'\n'
      done
    fi
  done

  # Metric families as "name|help text", in output order.
  local -a families=(
    "podman_container_cpu_percent|CPU usage percentage of the container."
    "podman_container_memory_usage_bytes|Memory usage of the container in bytes."
    "podman_container_memory_limit_bytes|Memory limit of the container in bytes."
    "podman_container_network_rx_bytes|Total network bytes received by the container."
    "podman_container_network_tx_bytes|Total network bytes transmitted by the container."
    "podman_container_block_read_bytes|Total block bytes read by the container."
    "podman_container_block_write_bytes|Total block bytes written by the container."
    "podman_container_pids|Number of PIDs in the container."
    "podman_container_running|Whether the container is running (1=running, 0=stopped)."
    "podman_container_restart_count|Number of container restarts."
    "podman_container_uptime_seconds|Seconds since the container started."
    "podman_container_exit_code|Last exit code of the container."
    "podman_container_info|Container metadata (always 1)."
  )

  # No blanket stderr redirect on this group: it would hide real errors.
  {
    local family fam_name fam_help
    for family in "${families[@]}"; do
      fam_name=${family%%|*}
      fam_help=${family#*|}
      echo "# HELP ${fam_name} ${fam_help}"
      echo "# TYPE ${fam_name} gauge"
      # Every sample line carries labels, so "name{" identifies the family.
      grep -E "^${fam_name}[{]" <<<"$samples" || true
    done

    # Aggregate: total containers by state
    echo ""
    echo "# HELP podman_containers_total Total number of containers by state."
    echo "# TYPE podman_containers_total gauge"
    # ".[] | .State // ..." applies the default per container; tab-separated
    # so a state containing spaces still parses.
    local state_name state_total
    jq -r '[.[] | .State // "unknown" | tostring | ascii_downcase]
           | group_by(.) | map("\(.[0])\t\(length)") | .[]' <<<"$ps_json" 2>/dev/null \
      | while IFS=$'\t' read -r state_name state_total; do
        echo "podman_containers_total{state=\"$(escape_label "$state_name")\"} ${state_total}"
      done || success=0

    # Exporter metadata
    local end_time duration
    end_time=$(date +%s%N)
    # Subtract in integer arithmetic; nanosecond epochs exceed double precision.
    duration=$(awk -v ns="$((end_time - start_time))" 'BEGIN { printf "%.4f", ns / 1000000000 }')

    echo ""
    echo "# HELP podman_exporter_duration_seconds Time taken to collect metrics."
    echo "# TYPE podman_exporter_duration_seconds gauge"
    echo "podman_exporter_duration_seconds ${duration}"

    echo ""
    echo "# HELP podman_exporter_last_run_timestamp Unix timestamp of last collection."
    echo "# TYPE podman_exporter_last_run_timestamp gauge"
    echo "podman_exporter_last_run_timestamp $(date +%s)"

    echo ""
    echo "# HELP podman_exporter_success Whether the last collection succeeded (1=success, 0=failure)."
    echo "# TYPE podman_exporter_success gauge"
    echo "podman_exporter_success ${success}"
  } >"$temp_file"

  # mktemp creates the file with mode 0600; make it readable for a
  # node_exporter running as a different user before moving it into place.
  chmod 0644 "$temp_file"
  mv "$temp_file" "$output_file"

  debug_echo "Collection complete. Wrote to $output_file"
}

# Serve metrics over HTTP using socat.
#
# Collection runs in a background loop writing into a private temp directory;
# socat runs a small handler script once per connection that replies with the
# current metrics file. The temp directory is removed on exit.
# Globals read: HTTP_PORT, COLLECTION_INTERVAL, SCRIPT_NAME
# Globals written: NODE_DIR, HTTP_PID, HTTP_TMP_DIR
serve_http() {
  if ! command -v socat &>/dev/null; then
    echo "Error: socat is required for --http mode but is not installed" >&2
    exit 1
  fi

  echo "$SCRIPT_NAME serving metrics on http://0.0.0.0:${HTTP_PORT}/metrics (interval: ${COLLECTION_INTERVAL}s)"

  # Ensure textfile directory exists for http mode (use a temp dir)
  HTTP_TMP_DIR=$(mktemp -d)
  local http_dir=$HTTP_TMP_DIR
  NODE_DIR="$http_dir"
  mkdir -p "${http_dir}/textfile_collector"
  # Extend the EXIT handling so the temp directory does not leak.
  trap 'cleanup; rm -rf -- "${HTTP_TMP_DIR:?}"' EXIT

  # The handler lives in its own file so its quotes, colons and commas never
  # pass through socat's address parser.
  # NOTE(review): the path is passed to socat unescaped; a TMPDIR containing
  # spaces, ':' or ',' would need escaping.
  local handler="${http_dir}/http_handler.sh"
  cat >"$handler" <<'HANDLER'
#!/bin/bash
# Minimal HTTP responder: socat runs this once per connection with the
# socket on stdin/stdout.
LC_ALL=C
metrics_file="$(dirname "$0")/textfile_collector/podman_containers.prom"

# Read the request headers up to the blank line before replying.
while IFS= read -r -t 5 line; do
  line=${line%$'\r'}
  if [[ -z "$line" ]]; then
    break
  fi
done

if [[ -f "$metrics_file" ]]; then
  # The sentinel keeps $(...) from stripping the file's trailing newline.
  body=$(cat "$metrics_file"; printf x)
  body=${body%x}
  printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' "${#body}" "$body"
else
  printf 'HTTP/1.1 503 Service Unavailable\r\nContent-Type: text/plain\r\nConnection: close\r\n\r\nMetrics not yet available\n'
fi
HANDLER

  # Background collection loop
  (
    while true; do
      collect_all
      sleep "$COLLECTION_INTERVAL"
    done
  ) &
  HTTP_PID=$!

  # Serve requests with socat; restart the listener if it ever exits.
  while true; do
    if ! socat "TCP-LISTEN:${HTTP_PORT},reuseaddr,fork" "EXEC:bash ${handler}" 2>/dev/null; then
      debug_echo "socat exited, restarting listener..."
    fi
    sleep 1
  done
}

# Main: validate the environment, then dispatch on RUN_MODE.
main() {
  validate_config

  case "$RUN_MODE" in
    once)
      collect_all
      ;;
    daemon)
      echo "$SCRIPT_NAME running in daemon mode (interval: ${COLLECTION_INTERVAL}s)"
      while true; do
        collect_all
        sleep "$COLLECTION_INTERVAL"
      done
      ;;
    http)
      serve_http
      ;;
  esac
}

main "$@"