#!/usr/bin/env bash
#
# Docker Swarm Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for Docker Swarm.
# Uses the Docker CLI to collect node membership, service replica counts,
# task states, overlay network counts, manager leader status,
# and Raft consensus index.
#
# Usage:
#   ./docker-swarm-exporter.sh
#   ./docker-swarm-exporter.sh --textfile
#   ./docker-swarm-exporter.sh --install
#
# Parameters:
#   --textfile    Write to textfile collector directory
#   --install     Create cron job for automatic collection
#   --help        Show usage
#
# Environment:
#   DOCKER_HOST   Docker daemon socket or host (default: local socket)
#   TEXTFILE_DIR  Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
#
# Requirements:
#   docker, jq
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
#   Core:
#     - swarm_up
#     - swarm_exporter_info{version}
#
#   Nodes:
#     - swarm_node_count
#     - swarm_nodes_ready
#     - swarm_nodes_down
#     - swarm_managers_total
#     - swarm_manager_leader
#     - swarm_workers_total
#
#   Services:
#     - swarm_services_total
#     - swarm_service_replicas{service}
#     - swarm_service_replicas_running{service}
#
#   Tasks:
#     - swarm_tasks_running
#     - swarm_tasks_failed
#
#   Networks:
#     - swarm_networks_total
#
#   Raft:
#     - swarm_raft_index
#
#   Exporter:
#     - swarm_exporter_duration_seconds
#     - swarm_exporter_last_run_timestamp

set -euo pipefail

# --- Configuration ---

readonly VERSION="1.0"
readonly SCRIPT_NAME="${0##*/}"

DOCKER_HOST="${DOCKER_HOST:-}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
TEXTFILE_MODE=false
OUTPUT=""      # Accumulated Prometheus exposition text
START_TIME=""  # Collection start, nanoseconds since the epoch

# --- Functions ---

# Print the usage text and exit.
# Arguments: $1 - exit status (optional, default 0)
# NOTE(review): the usage text was damaged in the reviewed copy of this file;
# it is rebuilt here from the header comment. Confirm the wording.
usage() {
  cat <<EOF
Usage: ${SCRIPT_NAME} [--textfile] [--install] [--help]

Prometheus textfile collector exporter for Docker Swarm.

Parameters:
  --textfile    Write to textfile collector directory
  --install     Create cron job for automatic collection
  --help        Show usage

Environment:
  DOCKER_HOST   Docker daemon socket or host (default: local socket)
  TEXTFILE_DIR  Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
EOF
  exit "${1:-0}"
}

# Verify that every external command the collectors rely on is installed.
# Exits with status 1 and lists the missing commands on stderr otherwise.
check_dependencies() {
  local -a missing=()
  local cmd

  for cmd in docker jq; do
    if ! command -v "$cmd" &>/dev/null; then
      missing+=("$cmd")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    echo "ERROR: Missing required commands: ${missing[*]}" >&2
    echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
    exit 1
  fi
}

# Export DOCKER_HOST if set so the docker CLI picks it up.
validate_config() {
  if [[ -n "$DOCKER_HOST" ]]; then
    export DOCKER_HOST
  fi
}

# Run a docker command and print its stdout.
# Failures are deliberately swallowed (stderr discarded, status forced to 0):
# callers treat empty output as "no data", and a non-zero status here would
# abort the whole run under `set -e`.
docker_cmd() {
  docker "$@" 2>/dev/null || true
}

# Append a complete metric family (HELP, TYPE and one sample) to OUTPUT.
# Arguments: $1 name, $2 type, $3 help text, $4 value,
#            $5 label pairs without braces (optional)
add_metric() {
  local name="$1"
  local type="$2"
  local help="$3"
  local value="$4"
  local labels="${5:-}"

  OUTPUT+="# HELP ${name} ${help}"$'\n'
  OUTPUT+="# TYPE ${name} ${type}"$'\n'
  add_metric_value "$name" "$value" "$labels"
}

# Append a single sample line (no HELP/TYPE header) to OUTPUT.
# Arguments: $1 name, $2 value, $3 label pairs without braces (optional)
add_metric_value() {
  local name="$1"
  local value="$2"
  local labels="${3:-}"

  if [[ -n "$labels" ]]; then
    OUTPUT+="${name}{${labels}} ${value}"$'\n'
  else
    OUTPUT+="${name} ${value}"$'\n'
  fi
}

# Collect node membership metrics and the swarm_up flag.
# Returns: 0 when `docker node ls` produced output, 1 otherwise (swarm_up=0).
collect_nodes() {
  local nodes_json
  nodes_json=$(docker_cmd node ls --format '{{json .}}')

  if [[ -z "$nodes_json" ]]; then
    add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "0"
    return 1
  fi
  add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "1"

  # One jq pass yields every count as a tab-separated row:
  # total, ready, down, managers, workers, self-is-leader.
  local counts
  counts=$(jq -r -s '[
      length,
      (map(select(.Status == "Ready")) | length),
      (map(select(.Status == "Down")) | length),
      (map(select(.ManagerStatus != "")) | length),
      (map(select(.ManagerStatus == "")) | length),
      (map(select((.Self == true or .Self == "true")
                  and .ManagerStatus == "Leader")) | length)
    ] | @tsv' <<<"$nodes_json" 2>/dev/null) || counts=""

  local node_count nodes_ready nodes_down managers_total workers_total is_leader
  IFS=$'\t' read -r node_count nodes_ready nodes_down \
    managers_total workers_total is_leader <<<"$counts" || true

  # Every value falls back to 0 so a jq failure can never produce a sample
  # line with an empty value.
  add_metric "swarm_node_count" "gauge" "Total number of nodes in the swarm" "${node_count:-0}"
  add_metric "swarm_nodes_ready" "gauge" "Number of nodes in ready state" "${nodes_ready:-0}"
  add_metric "swarm_nodes_down" "gauge" "Number of nodes in down state" "${nodes_down:-0}"
  add_metric "swarm_managers_total" "gauge" "Total number of manager nodes" "${managers_total:-0}"
  add_metric "swarm_workers_total" "gauge" "Total number of worker nodes" "${workers_total:-0}"

  # Leader detection: is the node this script talks to the leader?
  local leader_flag=0
  if [[ "${is_leader:-0}" -gt 0 ]]; then
    leader_flag=1
  fi
  add_metric "swarm_manager_leader" "gauge" \
    "Whether this node is the leader (1=leader, 0=not leader)" "$leader_flag"

  return 0
}

# Collect the service count and per-service desired/running replica counts.
collect_services() {
  local services_json
  services_json=$(docker_cmd service ls --format '{{json .}}')

  # Reduce each service to "<name>\t<replicas>" with a single jq call
  # instead of forking jq several times per service.
  local rows=""
  if [[ -n "$services_json" ]]; then
    rows=$(jq -r '[.Name, .Replicas] | @tsv' <<<"$services_json" 2>/dev/null) || rows=""
  fi

  local service_count=0
  local desired_lines="" running_lines=""
  local service_name replicas_str running desired

  while IFS=$'\t' read -r service_name replicas_str; do
    [[ -n "$service_name" ]] || continue
    service_count=$((service_count + 1))

    # Replicas is "RUNNING/DESIRED" (e.g. "3/3"), optionally followed by a
    # parenthetical such as " (max 1 per node)"; strip that suffix first.
    replicas_str="${replicas_str%% (*}"
    running="${replicas_str%%/*}"
    desired="${replicas_str#*/}"

    # Anything non-numeric is reported as 0.
    [[ "$desired" =~ ^[0-9]+$ ]] || desired=0
    [[ "$running" =~ ^[0-9]+$ ]] || running=0

    desired_lines+="swarm_service_replicas{service=\"${service_name}\"} ${desired}"$'\n'
    running_lines+="swarm_service_replicas_running{service=\"${service_name}\"} ${running}"$'\n'
  done <<<"$rows"

  add_metric "swarm_services_total" "gauge" "Total number of services" "$service_count"

  if ((service_count == 0)); then
    return 0
  fi

  # Samples of one metric family must stay together, so the two families are
  # accumulated separately above and emitted one after the other here.
  OUTPUT+="# HELP swarm_service_replicas Desired replica count per service"$'\n'
  OUTPUT+="# TYPE swarm_service_replicas gauge"$'\n'
  OUTPUT+="$desired_lines"
  OUTPUT+="# HELP swarm_service_replicas_running Running replica count per service"$'\n'
  OUTPUT+="# TYPE swarm_service_replicas_running gauge"$'\n'
  OUTPUT+="$running_lines"
}

# Collect running and failed task counts.
collect_tasks() {
  # NOTE(review): `docker node ps` without a node argument lists tasks for the
  # current node only, so this value is per-node rather than swarm-wide.
  # Behaviour is kept as-is; confirm whether a cluster-wide count is wanted.
  local tasks_running
  tasks_running=$(docker_cmd node ps --format '{{json .}}' \
    --filter 'desired-state=running' | jq -s 'length' 2>/dev/null) || tasks_running=""
  [[ "$tasks_running" =~ ^[0-9]+$ ]] || tasks_running=0
  add_metric "swarm_tasks_running" "gauge" "Total number of running tasks" "$tasks_running"

  # Gather shut-down tasks of every service. docker_cmd never fails, so a
  # service that disappears between `service ls` and `service ps` simply
  # contributes nothing instead of aborting the run under `set -e`.
  local svc_id shutdown_json=""
  while IFS= read -r svc_id; do
    [[ -n "$svc_id" ]] || continue
    shutdown_json+=$(docker_cmd service ps "$svc_id" --format '{{json .}}' \
      --filter 'desired-state=shutdown')
    shutdown_json+=$'\n'
  done < <(docker_cmd service ls -q)

  local tasks_failed
  tasks_failed=$(jq -s \
    'map(select(.CurrentState | test("^Failed|^Rejected"; "i"))) | length' \
    <<<"$shutdown_json" 2>/dev/null) || tasks_failed=""
  [[ "$tasks_failed" =~ ^[0-9]+$ ]] || tasks_failed=0
  add_metric "swarm_tasks_failed" "gauge" "Total number of failed tasks" "$tasks_failed"
}

# Collect the number of overlay networks.
collect_networks() {
  local networks_json
  networks_json=$(docker_cmd network ls --filter driver=overlay --format '{{json .}}')

  local network_count=0
  if [[ -n "$networks_json" ]]; then
    network_count=$(jq -s 'length' <<<"$networks_json" 2>/dev/null) || network_count=""
    [[ "$network_count" =~ ^[0-9]+$ ]] || network_count=0
  fi

  add_metric "swarm_networks_total" "gauge" "Total number of overlay networks" "$network_count"
}

# Collect the Raft index reported by `docker info`.
# Emits nothing when `docker info` returns no output, and 0 when no index
# field is present.
collect_raft() {
  local info_json
  info_json=$(docker_cmd info --format '{{json .}}')

  if [[ -z "$info_json" ]]; then
    return 0
  fi

  # Try the candidate fields in order; the first non-null one wins.
  local raft_index
  raft_index=$(jq -r \
    '.Swarm.Cluster.RaftIndex // .Swarm.RaftIndex // .Swarm.Cluster.Version.Index // empty' \
    <<<"$info_json" 2>/dev/null) || raft_index=""

  if [[ -z "$raft_index" || "$raft_index" == "null" ]]; then
    raft_index=0
  fi
  add_metric "swarm_raft_index" "gauge" "Raft applied index" "$raft_index"
}

# Emit the collected metrics: to stdout, or in textfile mode to
# ${TEXTFILE_DIR}/docker_swarm.prom.
write_output() {
  if [[ "$TEXTFILE_MODE" != true ]]; then
    printf '%s' "$OUTPUT"
    return 0
  fi

  local output_file="${TEXTFILE_DIR}/docker_swarm.prom"
  local temp_file="${output_file}.$$"

  mkdir -p -- "$TEXTFILE_DIR"

  # Write to a temp file in the same directory and rename it into place so a
  # reader never sees a half-written file; drop the temp file on failure.
  if ! printf '%s' "$OUTPUT" >"$temp_file" ||
    ! mv -f -- "$temp_file" "$output_file"; then
    rm -f -- "$temp_file"
    echo "ERROR: failed to write ${output_file}" >&2
    return 1
  fi
}

# Install /etc/cron.d/docker-swarm-exporter so the exporter runs in textfile
# mode automatically. Requires root.
install_cron() {
  if [[ $EUID -ne 0 ]]; then
    echo "ERROR: --install requires root" >&2
    exit 1
  fi

  local script_path
  script_path=$(readlink -f "$0")

  # Carry the effective settings into the cron environment so the job writes
  # to the directory announced below.
  local env_lines=""
  if [[ -n "$DOCKER_HOST" ]]; then
    env_lines+="DOCKER_HOST=${DOCKER_HOST}"$'\n'
  fi
  env_lines+="TEXTFILE_DIR=${TEXTFILE_DIR}"$'\n'

  # NOTE(review): the cron entry was damaged in the reviewed copy of this
  # file; the every-minute schedule below is an assumption. Confirm it.
  cat >/etc/cron.d/docker-swarm-exporter <<EOF
# Docker Swarm Prometheus exporter
${env_lines}* * * * * root ${script_path} --textfile 2>/dev/null
EOF

  chmod 644 /etc/cron.d/docker-swarm-exporter
  echo "Installed cron job: /etc/cron.d/docker-swarm-exporter"
  echo "Metrics will be written to: ${TEXTFILE_DIR}/docker_swarm.prom"
}

# --- Main ---

# Parse arguments, run the collectors and emit the result.
main() {
  local do_install=false
  local arg

  for arg in "$@"; do
    case "$arg" in
      --textfile)
        TEXTFILE_MODE=true
        ;;
      --install)
        do_install=true
        ;;
      --help | -h)
        usage 0
        ;;
      *)
        echo "Unknown option: $arg" >&2
        usage 1 >&2
        ;;
    esac
  done

  check_dependencies
  validate_config

  if [[ "$do_install" == true ]]; then
    install_cron
    exit 0
  fi

  START_TIME=$(date +%s%N)

  # Exporter info
  add_metric "swarm_exporter_info" "gauge" "Exporter version information" "1" \
    "version=\"${VERSION}\""

  # The remaining collectors only make sense when the swarm answered.
  if collect_nodes; then
    collect_services
    collect_tasks
    collect_networks
    collect_raft
  fi

  # Exporter performance. Computed with integer arithmetic; falls back to 0
  # where `date +%N` is unsupported and the timestamps are not plain digits.
  local end_time elapsed_ns duration=0
  end_time=$(date +%s%N)
  if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
    elapsed_ns=$((end_time - START_TIME))
    if ((elapsed_ns < 0)); then
      elapsed_ns=0
    fi
    printf -v duration '%d.%02d' \
      "$((elapsed_ns / 1000000000))" "$(((elapsed_ns % 1000000000) / 10000000))"
  fi

  add_metric "swarm_exporter_duration_seconds" "gauge" "Script execution time" "$duration"
  add_metric "swarm_exporter_last_run_timestamp" "gauge" \
    "Unix timestamp of last successful run" "$(date +%s)"

  write_output
}

main "$@"