#!/usr/bin/env bash
#
# Consul Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for Consul.
# Uses the Consul HTTP API to collect cluster health, Raft consensus,
# service catalog, health check states, KV store entry counts,
# and node membership.
#
# Usage:
#   ./consul-exporter.sh
#   ./consul-exporter.sh --textfile
#   CONSUL_TOKEN="xxx" ./consul-exporter.sh --textfile
#   ./consul-exporter.sh --install
#
# Parameters:
#   --textfile    Write to textfile collector directory
#   --install     Create cron job for automatic collection
#   --help        Show usage
#
# Environment:
#   CONSUL_URL    Consul HTTP API base URL (default: http://127.0.0.1:8500)
#   CONSUL_TOKEN  ACL token (optional, required if ACLs are enabled)
#   TEXTFILE_DIR  Textfile collector directory
#                 (default: /var/lib/node_exporter/textfile_collector)
#   CURL_TIMEOUT  API request timeout in seconds (default: 10)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
#   Core:
#     - consul_up
#     - consul_exporter_info{version}
#     - consul_peers_total
#     - consul_leader
#
#   Catalog:
#     - consul_services_total
#     - consul_nodes_total
#
#   Health:
#     - consul_health_checks_passing
#     - consul_health_checks_warning
#     - consul_health_checks_critical
#
#   KV:
#     - consul_kv_entries_total
#
#   Raft:
#     - consul_raft_commit_time_seconds
#     - consul_raft_last_contact_seconds
#
#   Exporter:
#     - consul_exporter_duration_seconds
#     - consul_exporter_last_run_timestamp

set -euo pipefail

# --- Configuration ---

readonly VERSION="1.0"
# Declared and assigned separately so a failing basename is not masked.
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME

CONSUL_URL="${CONSUL_URL:-http://127.0.0.1:8500}"
CONSUL_TOKEN="${CONSUL_TOKEN:-}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
TEXTFILE_MODE=false
OUTPUT=""
START_TIME=""

# --- Functions ---

#######################################
# Print usage information and exit.
# NOTE(review): the original help text was lost when this file was
# mangled; this text is rebuilt from the header comment above. The
# original exit status is unknown; 0 is used unless one is passed.
# Globals:   SCRIPT_NAME (read)
# Arguments: $1 - exit status (optional, default 0)
# Outputs:   usage text on stdout
#######################################
usage() {
  cat <<EOF
Usage: ${SCRIPT_NAME} [OPTIONS]

Prometheus textfile collector exporter for Consul.

Options:
  --textfile    Write to textfile collector directory
  --install     Create cron job for automatic collection
  --help        Show usage

Environment:
  CONSUL_URL    Consul HTTP API base URL (default: http://127.0.0.1:8500)
  CONSUL_TOKEN  ACL token (optional, required if ACLs are enabled)
  TEXTFILE_DIR  Textfile collector directory
                (default: /var/lib/node_exporter/textfile_collector)
  CURL_TIMEOUT  API request timeout in seconds (default: 10)
EOF
  exit "${1:-0}"
}

#######################################
# Abort with an install hint when a required external command is absent.
# NOTE(review): the original command list was lost; curl, jq and bc are
# the external tools this script invokes.
# Outputs: error text on stderr
# Returns: exits 1 when something is missing
#######################################
check_dependencies() {
  local missing=()
  local cmd
  for cmd in curl jq bc; do
    if ! command -v "$cmd" &>/dev/null; then
      missing+=("$cmd")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    echo "ERROR: Missing required commands: ${missing[*]}" >&2
    echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
    exit 1
  fi
}

#######################################
# Normalize and sanity-check configuration taken from the environment.
# Globals: CONSUL_URL (read/written), CURL_TIMEOUT (read)
# Returns: exits 1 when CURL_TIMEOUT is not a positive number
#######################################
validate_config() {
  # Strip trailing slash so "${CONSUL_URL}${endpoint}" never contains "//".
  CONSUL_URL="${CONSUL_URL%/}"

  # curl --max-time rejects non-numeric values; fail with a clear message
  # here instead of reporting every request as "Consul down".
  if [[ ! "$CURL_TIMEOUT" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "ERROR: CURL_TIMEOUT must be a number of seconds, got: ${CURL_TIMEOUT}" >&2
    exit 1
  fi
}

#######################################
# GET an endpoint from the Consul HTTP API.
# Globals:   CONSUL_URL, CONSUL_TOKEN, CURL_TIMEOUT (read)
# Arguments: $1 - endpoint path beginning with "/"
# Outputs:   response body on stdout; nothing when the request fails
# Returns:   always 0 (callers test for an empty body)
#######################################
api_get() {
  local endpoint="$1"
  local curl_args=(-sf --max-time "$CURL_TIMEOUT")

  # NOTE(review): the token is passed on curl's command line and is
  # therefore visible in the process list while the request runs.
  if [[ -n "$CONSUL_TOKEN" ]]; then
    curl_args+=(-H "X-Consul-Token: ${CONSUL_TOKEN}")
  fi

  # Best effort by design: an unreachable agent or HTTP error yields "".
  curl "${curl_args[@]}" "${CONSUL_URL}${endpoint}" 2>/dev/null || true
}

#######################################
# Evaluate a jq filter expected to yield one non-negative integer.
# A jq failure must not abort the script under "set -e"; it degrades to 0.
# Arguments: $1 - jq filter, $2 - JSON document
# Outputs:   the integer on stdout, or 0 on error / non-integer result
#######################################
json_count() {
  local filter="$1"
  local json="$2"
  local result

  result=$(jq -r "$filter" <<<"$json" 2>/dev/null) || result=""
  if [[ "$result" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$result"
  else
    printf '0\n'
  fi
}

#######################################
# Evaluate a jq filter and print its raw string output.
# Arguments: $1 - jq filter, $2 - JSON document
# Outputs:   jq output on stdout; possibly nothing when jq fails
# Returns:   always 0
#######################################
json_string() {
  local filter="$1"
  local json="$2"

  jq -r "$filter" <<<"$json" 2>/dev/null || true
}

#######################################
# Convert a single-unit duration string to seconds.
# Accepted suffixes: ns, us, µs, ms, s. A bare number is treated as
# milliseconds, matching the previous "value / 1000" behaviour.
# Arguments: $1 - duration string
# Outputs:   seconds on stdout
# Returns:   1 when the value cannot be parsed (e.g. "never", "1m2s")
#######################################
duration_to_seconds() {
  local raw="$1"
  local number divisor seconds

  # Order matters: the two-letter suffixes must be tested before "*s".
  case "$raw" in
    *ns) number="${raw%ns}"; divisor=1000000000 ;;
    *µs) number="${raw%µs}"; divisor=1000000 ;;
    *us) number="${raw%us}"; divisor=1000000 ;;
    *ms) number="${raw%ms}"; divisor=1000 ;;
    *s)  number="${raw%s}";  divisor=1 ;;
    *)   number="$raw";      divisor=1000 ;;
  esac

  if [[ ! "$number" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    return 1
  fi

  seconds=$(bc <<<"scale=6; ${number} / ${divisor}" 2>/dev/null) || return 1
  if [[ -z "$seconds" ]]; then
    return 1
  fi
  printf '%s\n' "$seconds"
}

#######################################
# Append one metric with its HELP and TYPE header to OUTPUT.
# Globals:   OUTPUT (written)
# Arguments: $1 - name, $2 - type, $3 - help text, $4 - value,
#            $5 - label list without braces (optional)
#######################################
add_metric() {
  local name="$1"
  local type="$2"
  local help="$3"
  local value="$4"
  local labels="${5:-}"

  OUTPUT+="# HELP ${name} ${help}"$'\n'
  OUTPUT+="# TYPE ${name} ${type}"$'\n'
  if [[ -n "$labels" ]]; then
    OUTPUT+="${name}{${labels}} ${value}"$'\n'
  else
    OUTPUT+="${name} ${value}"$'\n'
  fi
}

#######################################
# Append a bare sample line (no HELP/TYPE) to OUTPUT.
# Globals:   OUTPUT (written)
# Arguments: $1 - name, $2 - value, $3 - label list (optional)
#######################################
add_metric_value() {
  local name="$1"
  local value="$2"
  local labels="${3:-}"

  if [[ -n "$labels" ]]; then
    OUTPUT+="${name}{${labels}} ${value}"$'\n'
  else
    OUTPUT+="${name} ${value}"$'\n'
  fi
}

#######################################
# Probe the agent and record reachability plus the member count.
# Returns: 0 when the agent answered, 1 otherwise (consul_up is 0)
#######################################
collect_health() {
  local members_json
  members_json=$(api_get "/v1/agent/members")

  if [[ -z "$members_json" ]]; then
    add_metric "consul_up" "gauge" "Consul reachability (1=up, 0=down)" "0"
    return 1
  fi

  add_metric "consul_up" "gauge" "Consul reachability (1=up, 0=down)" "1"

  # Node count from members
  local node_count
  node_count=$(json_count 'length' "$members_json")
  add_metric "consul_nodes_total" "gauge" "Total number of cluster nodes" "$node_count"

  return 0
}

#######################################
# Record Raft peer count, leadership of the local node and Raft timings.
# Nothing is emitted when the Raft configuration cannot be fetched.
#######################################
collect_raft() {
  local raft_json
  raft_json=$(api_get "/v1/operator/raft/configuration")

  if [[ -z "$raft_json" ]]; then
    return
  fi

  # Peer count
  local peer_count
  peer_count=$(json_count '.Servers | length' "$raft_json")
  add_metric "consul_peers_total" "gauge" "Number of Raft peers in the cluster" "$peer_count"

  local self_json
  self_json=$(api_get "/v1/agent/self")
  if [[ -z "$self_json" ]]; then
    return
  fi

  # Leader detection: compare this agent's node name with the node that
  # the Raft configuration flags as leader.
  local self_name leader_name is_leader="0"
  self_name=$(json_string '.Config.NodeName // empty' "$self_json")
  leader_name=$(json_string '.Servers[] | select(.Leader == true) | .Node' "$raft_json")
  if [[ -n "$self_name" && "$self_name" == "$leader_name" ]]; then
    is_leader="1"
  fi
  add_metric "consul_leader" "gauge" \
    "Whether this node is the cluster leader (1=leader, 0=follower)" "$is_leader"

  # Raft stats from /v1/agent/self. A metric is skipped when the field is
  # absent or not a parseable duration, so no malformed sample is written.
  local raft_commit_time raft_last_contact seconds
  raft_commit_time=$(json_string '.Stats.raft.commit_time // empty' "$self_json")
  raft_last_contact=$(json_string '.Stats.raft.last_contact // empty' "$self_json")

  if [[ -n "$raft_commit_time" ]]; then
    if seconds=$(duration_to_seconds "$raft_commit_time"); then
      add_metric "consul_raft_commit_time_seconds" "gauge" \
        "Raft commit time in seconds" "$seconds"
    fi
  fi

  if [[ -n "$raft_last_contact" ]]; then
    if seconds=$(duration_to_seconds "$raft_last_contact"); then
      add_metric "consul_raft_last_contact_seconds" "gauge" \
        "Time since last Raft leader contact in seconds" "$seconds"
    fi
  fi
}

#######################################
# Record the number of services registered in the catalog.
#######################################
collect_services() {
  local services_json
  services_json=$(api_get "/v1/catalog/services")

  if [[ -z "$services_json" ]]; then
    return
  fi

  local service_count
  service_count=$(json_count 'keys | length' "$services_json")
  add_metric "consul_services_total" "gauge" "Total number of registered services" "$service_count"
}

#######################################
# Record how many health checks are passing, warning and critical.
#######################################
collect_health_checks() {
  local checks_json
  checks_json=$(api_get "/v1/health/state/any")

  if [[ -z "$checks_json" ]]; then
    return
  fi

  local passing warning critical
  passing=$(json_count '[.[] | select(.Status == "passing")] | length' "$checks_json")
  warning=$(json_count '[.[] | select(.Status == "warning")] | length' "$checks_json")
  critical=$(json_count '[.[] | select(.Status == "critical")] | length' "$checks_json")

  add_metric "consul_health_checks_passing" "gauge" "Number of passing health checks" "$passing"
  add_metric "consul_health_checks_warning" "gauge" "Number of warning health checks" "$warning"
  add_metric "consul_health_checks_critical" "gauge" "Number of critical health checks" "$critical"
}

#######################################
# Record the number of keys in the KV store.
# An empty response (including any request failure) is reported as 0.
#######################################
collect_kv() {
  local kv_json
  kv_json=$(api_get "/v1/kv/?keys")

  local kv_count="0"
  if [[ -n "$kv_json" ]]; then
    kv_count=$(json_count 'length' "$kv_json")
  fi
  add_metric "consul_kv_entries_total" "gauge" "Total number of KV store entries" "$kv_count"
}

#######################################
# Emit the collected metrics: to TEXTFILE_DIR/consul.prom in textfile
# mode (written to a temp file, then renamed), otherwise to stdout.
# Globals: OUTPUT, TEXTFILE_MODE, TEXTFILE_DIR (read)
# Returns: 1 when the textfile cannot be written
#######################################
write_output() {
  if [[ "$TEXTFILE_MODE" != true ]]; then
    printf '%s' "$OUTPUT"
    return 0
  fi

  local output_file="${TEXTFILE_DIR}/consul.prom"
  # Same directory as the target so the rename stays on one filesystem.
  local temp_file="${output_file}.$$"

  mkdir -p -- "$TEXTFILE_DIR"
  if ! printf '%s' "$OUTPUT" >"$temp_file" \
    || ! mv -f -- "$temp_file" "$output_file"; then
    # Do not leave a partial file behind.
    rm -f -- "$temp_file"
    echo "ERROR: cannot write ${output_file}" >&2
    return 1
  fi
}

#######################################
# Install /etc/cron.d/consul-exporter so the exporter runs from cron.
# NOTE(review): the original cron entry was lost when this file was
# mangled. The every-minute schedule and the propagation of the
# non-secret settings below are reconstructions; confirm before use.
# CONSUL_TOKEN is deliberately not written: the file is mode 644.
# Returns: exits 1 when not run as root
#######################################
install_cron() {
  if [[ $EUID -ne 0 ]]; then
    echo "ERROR: --install requires root" >&2
    exit 1
  fi

  local script_path
  script_path=$(readlink -f "$0")

  cat >/etc/cron.d/consul-exporter <<EOF
# Consul Prometheus metrics exporter (installed by ${SCRIPT_NAME} --install)
CONSUL_URL=${CONSUL_URL}
TEXTFILE_DIR=${TEXTFILE_DIR}
CURL_TIMEOUT=${CURL_TIMEOUT}
* * * * * root ${script_path} --textfile 2>/dev/null
EOF

  chmod 644 /etc/cron.d/consul-exporter
  echo "Installed cron job: /etc/cron.d/consul-exporter"
  echo "Metrics will be written to: ${TEXTFILE_DIR}/consul.prom"
}

# --- Main ---

#######################################
# Parse options, collect all metrics and write them out.
# Arguments: command-line options (see usage)
#######################################
main() {
  # Parse arguments
  local arg
  for arg in "$@"; do
    case "$arg" in
      --textfile)
        TEXTFILE_MODE=true
        ;;
      --install)
        check_dependencies
        validate_config
        install_cron
        exit 0
        ;;
      --help | -h)
        usage
        ;;
      *)
        echo "Unknown option: $arg" >&2
        usage 1
        ;;
    esac
  done

  check_dependencies
  validate_config

  START_TIME=$(date +%s%N)

  # Exporter info
  add_metric "consul_exporter_info" "gauge" "Exporter version information" "1" \
    "version=\"${VERSION}\""

  # Collect metrics; everything else is skipped when the agent is down.
  if collect_health; then
    collect_raft
    collect_services
    collect_health_checks
    collect_kv
  fi

  # Exporter performance
  local end_time duration
  end_time=$(date +%s%N)
  duration=$(bc <<<"scale=2; (${end_time} - ${START_TIME}) / 1000000000" 2>/dev/null) \
    || duration=""
  # Guard against an empty or garbage result (e.g. date without %N support).
  if [[ ! "$duration" =~ ^[0-9]*\.?[0-9]+$ ]]; then
    duration="0"
  fi
  add_metric "consul_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration"
  add_metric "consul_exporter_last_run_timestamp" "gauge" \
    "Unix timestamp of last successful run" "$(date +%s)"

  write_output
}

main "$@"