#!/bin/bash
################################################
#### Container Health Prometheus Exporter   ####
#### for node_exporter textfile collector   ####
####                                        ####
#### Author: Phil Connor                    ####
#### Contact: contact@mylinux.work          ####
#### Version: 1.0.0.20260309                ####
################################################

set -o pipefail

SCRIPT_NAME=$(basename "$0")
readonly SCRIPT_NAME

# Default configuration
readonly DEFAULT_NODE_DIR="/var/lib/node_exporter"
readonly DEFAULT_COLLECTION_INTERVAL=60

# Configuration variables (can be overridden by environment)
NODE_DIR=${NODE_DIR:-$DEFAULT_NODE_DIR}
COLLECTION_INTERVAL=${COLLECTION_INTERVAL:-$DEFAULT_COLLECTION_INTERVAL}
DEBUG=${DEBUG:-}

# Runtime flags
RUN_MODE="once"

# Path of the temp file currently being written by collect_all (empty when
# none). Kept global so the EXIT trap can remove a half-written file.
TEMP_FILE=""

#######################################
# Report a failed command and terminate the script.
# Arguments: $1 - exit status of the failed command
#            $2 - line number where it failed
# Outputs:   error message on stderr
#######################################
handle_error() {
  local exit_code=$1
  local line_number=$2
  echo "Error: $SCRIPT_NAME failed at line $line_number with exit code $exit_code" >&2
  exit "$exit_code"
}

# errtrace (set -E) is deliberately not enabled, so bash does not run this
# trap for failures inside functions; it fires when a top-level command,
# such as the final `main` call, returns non-zero. Failures inside the
# functions below are handled explicitly instead.
trap 'handle_error $? $LINENO' ERR

#######################################
# Remove the in-progress temp file, if any, so aborted runs do not leave
# stray files in the textfile collector directory.
# Globals:   TEMP_FILE (read)
#######################################
cleanup() {
  if [[ -n "$TEMP_FILE" ]]; then
    rm -f -- "$TEMP_FILE"
  fi
}

trap cleanup EXIT

#######################################
# Print a message to stderr when DEBUG is non-empty.
# Globals:   DEBUG (read)
# Arguments: message words
#######################################
debug_echo() {
  if [[ -n "$DEBUG" ]]; then
    echo "[DEBUG] $*" >&2
  fi
}

#######################################
# Write the usage text to stdout.
#######################################
print_usage() {
  cat << EOF
Usage: $SCRIPT_NAME [OPTIONS]

Container health metrics collector for Prometheus node_exporter textfile directory.
Collects per-container health check status, image age, restart counts,
exit codes, and running state via docker inspect and writes them as Prometheus metrics.

OPTIONS:
    --once          Run collection once and exit (default)
    --daemon        Run continuously at COLLECTION_INTERVAL
    --help, -h      Show this help message

ENVIRONMENT VARIABLES:
    NODE_DIR              Node exporter textfile directory (default: $DEFAULT_NODE_DIR)
    COLLECTION_INTERVAL   Seconds between collections in daemon mode (default: $DEFAULT_COLLECTION_INTERVAL)
    DEBUG                 Enable debug output

EXAMPLES:
    $SCRIPT_NAME --once
    $SCRIPT_NAME --daemon
    COLLECTION_INTERVAL=30 $SCRIPT_NAME --daemon

OUTPUT:
    Writes metrics to \$NODE_DIR/textfile_collector/container_health.prom
EOF
}

#######################################
# Show usage and exit.
# Arguments: $1 - exit status (optional, default 0). A non-zero status
#                 sends the usage text to stderr instead of stdout.
#######################################
show_help() {
  local exit_code=${1:-0}
  if ((exit_code == 0)); then
    print_usage
  else
    print_usage >&2
  fi
  exit "$exit_code"
}

#######################################
# Parse command-line options.
# Globals:   RUN_MODE (written)
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --once)
        RUN_MODE="once"
        ;;
      --daemon)
        RUN_MODE="daemon"
        ;;
      --help | -h)
        show_help
        ;;
      *)
        # A bad option is a usage error: report on stderr, exit non-zero.
        echo "Unknown option: $1" >&2
        show_help 2
        ;;
    esac
    shift
  done
}

#######################################
# Verify that docker is available and the textfile directory exists.
# Exits with status 1 when either precondition is not met.
# Globals:   NODE_DIR (read)
#######################################
validate_config() {
  if ! command -v docker &> /dev/null; then
    echo "Error: docker is not installed or not in PATH" >&2
    exit 1
  fi

  local textfile_dir="${NODE_DIR}/textfile_collector"
  if [[ ! -d "$textfile_dir" ]]; then
    echo "Error: Textfile collector directory not found: $textfile_dir" >&2
    echo "Create it: sudo mkdir -p $textfile_dir" >&2
    exit 1
  fi
}

#######################################
# Escape a string for use as a Prometheus label value (backslash, double
# quote and newline must be escaped in the text exposition format).
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
escape_label() {
  local value=$1
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

#######################################
# Emit all per-container samples for one container.
# Arguments: $1 - container name
#            $2 - current time as a Unix epoch (seconds)
# Outputs:   metric lines on stdout; diagnostics via debug_echo
# Returns:   0 on success; 1 if the container could not be inspected
#            (in which case nothing is written to stdout)
#######################################
emit_container_metrics() {
  local container_name=$1
  local now=$2
  local inspect_data

  debug_echo "Inspecting container: $container_name"

  # Extract all container fields in a single docker inspect call.
  inspect_data=$(docker inspect --format \
    '{{.Config.Image}}|{{.Image}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}|{{.RestartCount}}|{{.State.ExitCode}}|{{.State.Running}}' \
    "$container_name" 2> /dev/null) || {
    debug_echo "Failed to inspect container: $container_name"
    return 1
  }

  local image image_id health_status restart_count exit_code running_raw
  IFS='|' read -r image image_id health_status restart_count exit_code running_raw \
    <<< "$inspect_data"

  # One malformed sample line makes the textfile collector reject the whole
  # file, so refuse to emit anything that is not a plain integer.
  if [[ ! "$restart_count" =~ ^[0-9]+$ || ! "$exit_code" =~ ^-?[0-9]+$ ]]; then
    debug_echo "Unexpected inspect output for container: $container_name"
    return 1
  fi

  local labels
  labels="name=\"$(escape_label "$container_name")\",image=\"$(escape_label "$image")\""

  # Health status: emit a 1 for the current status, 0 for the others.
  local status value
  for status in healthy unhealthy starting none; do
    value=0
    if [[ "$health_status" == "$status" ]]; then
      value=1
    fi
    printf 'container_health_status{%s,status="%s"} %d\n' "$labels" "$status" "$value"
  done

  # Image age comes from the image's own creation time; the container's
  # .Created field is when the container was created, not its image.
  # The sample is skipped when the time cannot be determined, rather than
  # reporting a bogus age.
  local image_created created_epoch
  if image_created=$(docker image inspect --format '{{.Created}}' "$image_id" 2> /dev/null) \
    && created_epoch=$(date -d "$image_created" +%s 2> /dev/null); then
    printf 'container_image_age_seconds{%s} %d\n' "$labels" "$((now - created_epoch))"
  else
    debug_echo "Could not determine image creation time for container: $container_name"
  fi

  # Convert the running boolean to 0/1.
  local running=0
  if [[ "$running_raw" == "true" ]]; then
    running=1
  fi

  printf 'container_restart_count{%s} %s\n' "$labels" "$restart_count"
  printf 'container_exit_code{%s} %s\n' "$labels" "$exit_code"
  printf 'container_running{%s} %d\n' "$labels" "$running"
}

#######################################
# Collect metrics for all containers and atomically publish them to
# $NODE_DIR/textfile_collector/container_health.prom.
# Globals:   NODE_DIR (read), TEMP_FILE (written)
# Outputs:   warnings/errors on stderr
# Returns:   0 when the metrics file was written (even if some containers
#            failed; that is reported via container_health_exporter_success);
#            1 when the file could not be created or published
#######################################
collect_all() {
  local output_dir="${NODE_DIR}/textfile_collector"
  local output_file="${output_dir}/container_health.prom"
  local start_time end_time duration now
  local containers container_name
  local success=1

  start_time=$(date +%s%N)

  # The temp file lives next to the target so the final mv is a rename on
  # the same filesystem; its name does not end in .prom while in progress.
  TEMP_FILE=$(mktemp "${output_file}.XXXXXX") || {
    TEMP_FILE=""
    echo "Error: could not create temporary file in $output_dir" >&2
    return 1
  }

  debug_echo "Starting collection..."

  if ! containers=$(docker ps -a --format '{{.Names}}'); then
    echo "Warning: failed to list containers; no per-container metrics collected" >&2
    success=0
    containers=""
  elif [[ -z "$containers" ]]; then
    debug_echo "No containers found"
  fi

  # stderr is intentionally NOT redirected here so debug output and
  # warnings produced during collection remain visible.
  {
    # Per-container metrics headers
    echo "# HELP container_health_status Health check status of the container (1 for current status)."
    echo "# TYPE container_health_status gauge"
    echo "# HELP container_image_age_seconds Age of the container image in seconds."
    echo "# TYPE container_image_age_seconds gauge"
    echo "# HELP container_restart_count Number of container restarts."
    echo "# TYPE container_restart_count gauge"
    echo "# HELP container_exit_code Exit code of the container."
    echo "# TYPE container_exit_code gauge"
    echo "# HELP container_running Whether the container is running (1=running, 0=stopped)."
    echo "# TYPE container_running gauge"

    now=$(date +%s)

    while IFS= read -r container_name; do
      [[ -z "$container_name" ]] && continue
      emit_container_metrics "$container_name" "$now" || success=0
    done <<< "$containers"

    # Exporter metadata
    end_time=$(date +%s%N)
    duration=$(awk -v start="$start_time" -v end="$end_time" \
      'BEGIN { printf "%.4f", (end - start) / 1000000000 }')

    echo ""
    echo "# HELP container_health_exporter_duration_seconds Time taken to collect metrics."
    echo "# TYPE container_health_exporter_duration_seconds gauge"
    echo "container_health_exporter_duration_seconds ${duration}"
    echo ""
    echo "# HELP container_health_exporter_last_run_timestamp Unix timestamp of last collection."
    echo "# TYPE container_health_exporter_last_run_timestamp gauge"
    echo "container_health_exporter_last_run_timestamp $(date +%s)"
    echo ""
    echo "# HELP container_health_exporter_success Whether the last collection succeeded (1=success, 0=failure)."
    echo "# TYPE container_health_exporter_success gauge"
    echo "container_health_exporter_success ${success}"
  } > "$TEMP_FILE" || {
    echo "Error: failed to write metrics to $TEMP_FILE" >&2
    rm -f -- "$TEMP_FILE"
    TEMP_FILE=""
    return 1
  }

  # mktemp creates the file with mode 0600; widen it so a node_exporter
  # running under a different user can read the published file.
  if ! chmod 0644 "$TEMP_FILE" || ! mv -f -- "$TEMP_FILE" "$output_file"; then
    echo "Error: failed to publish metrics to $output_file" >&2
    rm -f -- "$TEMP_FILE"
    TEMP_FILE=""
    return 1
  fi
  TEMP_FILE=""

  debug_echo "Collection complete. Wrote to $output_file"
}

#######################################
# Entry point: parse options, validate the environment, then collect once
# or loop forever depending on RUN_MODE.
# Globals:   RUN_MODE, COLLECTION_INTERVAL (read)
# Arguments: the script's positional parameters
#######################################
main() {
  parse_args "$@"
  validate_config

  case "$RUN_MODE" in
    once)
      collect_all
      ;;
    daemon)
      echo "$SCRIPT_NAME running in daemon mode (interval: ${COLLECTION_INTERVAL}s)"
      while true; do
        # A failed collection must not stop the daemon; try again next cycle.
        collect_all || echo "Warning: collection failed; will retry after ${COLLECTION_INTERVAL}s" >&2
        # If sleep itself fails (e.g. invalid COLLECTION_INTERVAL) the loop
        # would otherwise spin and hammer the docker daemon.
        sleep "$COLLECTION_INTERVAL" || {
          echo "Error: sleep failed; check COLLECTION_INTERVAL ('$COLLECTION_INTERVAL')" >&2
          exit 1
        }
      done
      ;;
  esac
}

main "$@"