Files
linux-scripts/docker-swarm-exporter.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

417 lines
12 KiB
Bash

#!/usr/bin/env bash
#
# Docker Swarm Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for Docker Swarm.
# Uses the Docker CLI (with jq to parse its JSON output) to collect node
# membership, service replica counts, task states, overlay network counts,
# manager leader status, and the Raft consensus index.
#
# Usage:
# ./docker-swarm-exporter.sh
# ./docker-swarm-exporter.sh --textfile
# ./docker-swarm-exporter.sh --install
#
# Parameters:
# --textfile Write to textfile collector directory
# --install Create cron job for automatic collection
# --help Show usage
#
# Environment:
# DOCKER_HOST Docker daemon socket or host (default: local socket)
# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
# Core:
# - swarm_up
# - swarm_exporter_info{version}
#
# Nodes:
# - swarm_node_count
# - swarm_nodes_ready
# - swarm_nodes_down
# - swarm_managers_total
# - swarm_manager_leader
# - swarm_workers_total
#
# Services:
# - swarm_services_total
# - swarm_service_replicas{service}
# - swarm_service_replicas_running{service}
#
# Tasks:
# - swarm_tasks_running
# - swarm_tasks_failed
#
# Networks:
# - swarm_networks_total
#
# Raft:
# - swarm_raft_index
#
# Exporter:
# - swarm_exporter_duration_seconds
# - swarm_exporter_last_run_timestamp
# Strict mode: stop on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# --- Configuration ---
# Exporter release, reported through swarm_exporter_info{version}.
declare -r VERSION="1.0"
# Name the script was invoked as; interpolated into the --help text.
declare -r SCRIPT_NAME="$(basename "$0")"

# Environment overrides: keep a non-empty value from the environment,
# otherwise fall back to the default.
: "${DOCKER_HOST:=}"
: "${TEXTFILE_DIR:=/var/lib/node_exporter/textfile_collector}"

# Mutable run state shared by the functions below.
TEXTFILE_MODE=false # set to true by --textfile
OUTPUT=""           # accumulated exposition text
START_TIME=""       # start-of-run timestamp, filled in by main
# --- Functions ---
#######################################
# Print the command-line help and terminate the script successfully.
# Globals:   SCRIPT_NAME (read)
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 0
#######################################
usage() {
  # One argument per output line; '%s\n' is reused for every argument.
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [OPTIONS]" \
    "Docker Swarm Prometheus Metrics Exporter" \
    "Options:" \
    "--textfile Write metrics to textfile collector directory" \
    "--install Create cron job for automatic collection" \
    "--help Show this help message" \
    "Environment Variables:" \
    "DOCKER_HOST Docker daemon socket or host (default: local socket)" \
    "TEXTFILE_DIR Output directory (default: /var/lib/node_exporter/textfile_collector)" \
    "Examples:" \
    "$SCRIPT_NAME" \
    "$SCRIPT_NAME --textfile" \
    "DOCKER_HOST=\"tcp://swarm-manager:2375\" $SCRIPT_NAME --textfile" \
    "$SCRIPT_NAME --install"
  exit 0
}
#######################################
# Verify that every external command the exporter needs is available.
# Outputs:   on failure, the missing commands and an install hint on stderr
# Returns:   0 when docker and jq are both found; otherwise exits the
#            script with status 1
#######################################
check_dependencies() {
  # The loop variable is local so it cannot overwrite a caller's or a
  # global variable that happens to be named "cmd".
  local cmd
  local missing=()

  for cmd in docker jq; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      missing+=("$cmd")
    fi
  done

  if ((${#missing[@]} > 0)); then
    echo "ERROR: Missing required commands: ${missing[*]}" >&2
    echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
    exit 1
  fi
}
#######################################
# Make a configured DOCKER_HOST visible to child processes (the docker CLI).
# Globals:   DOCKER_HOST (read, exported when non-empty)
# Returns:   0
#######################################
validate_config() {
  # Nothing configured: leave the environment untouched.
  if [[ -z "$DOCKER_HOST" ]]; then
    return 0
  fi

  export DOCKER_HOST
}
#######################################
# Run the docker CLI with the given arguments, hiding its stderr.
# Arguments: passed through to docker unchanged
# Outputs:   docker's stdout; when docker fails, an additional empty line
# Returns:   0 whether or not docker succeeded
#######################################
docker_cmd() {
  if ! docker "$@" 2>/dev/null; then
    # Failure is reported to callers as (possibly partial) output followed
    # by a blank line, never as a non-zero status.
    echo ""
  fi
}
#######################################
# Append one complete metric (HELP line, TYPE line, sample) to OUTPUT.
# Globals:   OUTPUT (appended to)
# Arguments: $1 - metric name
#            $2 - metric type (e.g. gauge)
#            $3 - help text
#            $4 - sample value
#            $5 - optional label list, already formatted (key="value",...)
#######################################
add_metric() {
  local metric="$1" kind="$2" summary="$3" reading="$4"
  local label_set="${5:-}"

  # The sample line carries a {...} label block only when labels were given.
  local series="$metric"
  if [[ -n "$label_set" ]]; then
    series="${metric}{${label_set}}"
  fi

  local chunk
  printf -v chunk '# HELP %s %s\n# TYPE %s %s\n%s %s\n' \
    "$metric" "$summary" "$metric" "$kind" "$series" "$reading"
  OUTPUT+="$chunk"
}
#######################################
# Append a bare sample line (no HELP/TYPE header) to OUTPUT.
# Globals:   OUTPUT (appended to)
# Arguments: $1 - metric name
#            $2 - sample value
#            $3 - optional label list, already formatted (key="value",...)
#######################################
add_metric_value() {
  local metric="$1" reading="$2"
  local label_set="${3:-}"

  local series="$metric"
  if [[ -n "$label_set" ]]; then
    series="${metric}{${label_set}}"
  fi

  OUTPUT+="${series} ${reading}"$'\n'
}
#######################################
# Collect node-level metrics from `docker node ls`.
# Emits swarm_up, and when the listing is available: swarm_node_count,
# swarm_nodes_ready, swarm_nodes_down, swarm_managers_total,
# swarm_workers_total and swarm_manager_leader.
# Globals:   OUTPUT (appended to, via add_metric)
# Returns:   0 when the node listing was obtained; 1 when it was empty
#            (swarm_up is then reported as 0 and nothing else is emitted)
#######################################
collect_nodes() {
  local up_help="Docker Swarm reachability (1=up, 0=down)"
  local leader_help="Whether this node is the leader (1=leader, 0=not leader)"

  # One JSON object per line, one line per node.
  local listing
  listing=$(docker_cmd node ls --format '{{json .}}')
  if [[ -z "$listing" ]]; then
    add_metric "swarm_up" "gauge" "$up_help" "0"
    return 1
  fi
  add_metric "swarm_up" "gauge" "$up_help" "1"

  # Node total = number of listing lines.
  local total
  total=$(wc -l <<< "$listing")
  add_metric "swarm_node_count" "gauge" "Total number of nodes in the swarm" "${total}"

  # Split by the Status column.
  local ready down
  ready=$(jq -s 'map(select(.Status == "Ready")) | length' <<< "$listing")
  down=$(jq -s 'map(select(.Status == "Down")) | length' <<< "$listing")
  add_metric "swarm_nodes_ready" "gauge" "Number of nodes in ready state" "${ready}"
  add_metric "swarm_nodes_down" "gauge" "Number of nodes in down state" "${down}"

  # A node with a non-empty ManagerStatus is a manager; the rest are workers.
  local managers workers
  managers=$(jq -s 'map(select(.ManagerStatus != "")) | length' <<< "$listing")
  workers=$(jq -s 'map(select(.ManagerStatus == "")) | length' <<< "$listing")
  add_metric "swarm_managers_total" "gauge" "Total number of manager nodes" "${managers}"
  add_metric "swarm_workers_total" "gauge" "Total number of worker nodes" "${workers}"

  # Leader flag: the entry marked Self (boolean or the string "true") must
  # also carry ManagerStatus "Leader".
  local self_leader_rows leader_flag=0
  self_leader_rows=$(jq -s \
    'map(select(.Self == "true" or .Self == true) | select(.ManagerStatus == "Leader")) | length' \
    <<< "$listing")
  if ((self_leader_rows > 0)); then
    leader_flag=1
  fi
  add_metric "swarm_manager_leader" "gauge" "$leader_help" "${leader_flag}"

  return 0
}
#######################################
# Collect service-level metrics from `docker service ls`.
# Emits swarm_services_total plus, per service, swarm_service_replicas
# (desired) and swarm_service_replicas_running (running).
# Globals:   OUTPUT (appended to, directly and via add_metric)
# Returns:   0; when the listing is empty only swarm_services_total=0 is
#            emitted
#######################################
collect_services() {
  # One JSON object per line, one line per service.
  local listing
  listing=$(docker_cmd service ls --format '{{json .}}')
  if [[ -z "$listing" ]]; then
    add_metric "swarm_services_total" "gauge" "Total number of services" "0"
    return
  fi

  local service_count
  service_count=$(wc -l <<< "$listing")
  add_metric "swarm_services_total" "gauge" "Total number of services" "${service_count}"

  # Extract name and replica string for every service with a single jq call
  # (tab-separated), instead of forking jq several times per service.
  local rows
  rows=$(jq -r '"\(.Name)\t\(.Replicas)"' <<< "$listing")

  # Both sample lists are built in one pass and emitted afterwards, each
  # under its own HELP/TYPE header.
  local desired_samples="" running_samples=""
  local service_name replicas_str running desired
  while IFS=$'\t' read -r service_name replicas_str; do
    if [[ -z "$service_name" ]]; then
      continue
    fi

    # Replicas is "RUNNING/DESIRED" (e.g. "3/3"), optionally followed by a
    # parenthetical such as " (max 1 per node)" which is dropped here.
    replicas_str="${replicas_str%% (*}"
    running="${replicas_str%%/*}"
    desired="${replicas_str#*/}"
    desired="${desired%%/*}"

    # Anything that is not a plain non-negative integer is reported as 0.
    if ! [[ "$desired" =~ ^[0-9]+$ ]]; then
      desired=0
    fi
    if ! [[ "$running" =~ ^[0-9]+$ ]]; then
      running=0
    fi

    desired_samples+="swarm_service_replicas{service=\"${service_name}\"} ${desired}"$'\n'
    running_samples+="swarm_service_replicas_running{service=\"${service_name}\"} ${running}"$'\n'
  done <<< "$rows"

  OUTPUT+="# HELP swarm_service_replicas Desired replica count per service"$'\n'
  OUTPUT+="# TYPE swarm_service_replicas gauge"$'\n'
  OUTPUT+="$desired_samples"
  OUTPUT+="# HELP swarm_service_replicas_running Running replica count per service"$'\n'
  OUTPUT+="# TYPE swarm_service_replicas_running gauge"$'\n'
  OUTPUT+="$running_samples"
}
#######################################
# Collect swarm-wide task metrics.
# Emits swarm_tasks_running and swarm_tasks_failed.
#
# Tasks are read with a single `docker service ps` over every service, so
# the counts cover all nodes. (`docker node ps` without a node argument
# lists only the current node.)
#   - running: desired state Running AND current state "Running ..."
#   - failed:  desired state Shutdown AND current state "Failed ..." or
#              "Rejected ..." (only tasks still in the retained history)
#
# Any docker or jq failure degrades to counts of 0 instead of aborting the
# exporter under `set -e` / `pipefail`.
# Globals:   OUTPUT (appended to, via add_metric)
# Returns:   0
#######################################
collect_tasks() {
  # Gather service IDs; docker_cmd may emit blank lines, which are skipped.
  local svc_id
  local service_ids=()
  while IFS= read -r svc_id; do
    if [[ -n "$svc_id" ]]; then
      service_ids+=("$svc_id")
    fi
  done < <(docker_cmd service ls -q)

  # docker_cmd always returns 0, so a service that vanished between the
  # `ls` and the `ps` cannot terminate the script; whatever output docker
  # produced is still used.
  local tasks_json=""
  if ((${#service_ids[@]} > 0)); then
    tasks_json=$(docker_cmd service ps "${service_ids[@]}" --format '{{json .}}')
  fi

  # A single jq pass prints "<running> <failed>".
  local counts=""
  if [[ -n "$tasks_json" ]]; then
    counts=$(jq -rs '
      map(select(type == "object")) as $tasks
      | ($tasks | map(select(
          ((.DesiredState // "") | tostring | test("^running$"; "i")) and
          ((.CurrentState // "") | tostring | test("^running"; "i"))
        )) | length) as $running
      | ($tasks | map(select(
          ((.DesiredState // "") | tostring | test("^shutdown$"; "i")) and
          ((.CurrentState // "") | tostring | test("^(failed|rejected)"; "i"))
        )) | length) as $failed
      | "\($running) \($failed)"
    ' 2>/dev/null <<< "$tasks_json") || counts=""
  fi

  local tasks_running=0 tasks_failed=0
  local counts_re='^([0-9]+) ([0-9]+)$'
  if [[ "$counts" =~ $counts_re ]]; then
    tasks_running="${BASH_REMATCH[1]}"
    tasks_failed="${BASH_REMATCH[2]}"
  fi

  add_metric "swarm_tasks_running" "gauge" "Total number of running tasks" "${tasks_running}"
  add_metric "swarm_tasks_failed" "gauge" "Total number of failed tasks" "${tasks_failed}"
}
#######################################
# Count overlay networks and emit swarm_networks_total.
# Globals:   OUTPUT (appended to, via add_metric)
#######################################
collect_networks() {
  local overlay_list total
  overlay_list=$(docker_cmd network ls --filter driver=overlay --format '{{json .}}')

  # One output line per network; an empty listing counts as zero.
  if [[ -z "$overlay_list" ]]; then
    total=0
  else
    total=$(wc -l <<< "$overlay_list")
  fi

  add_metric "swarm_networks_total" "gauge" "Total number of overlay networks" "${total}"
}
#######################################
# Read the Raft index from `docker info` and emit swarm_raft_index.
# Nothing is emitted when `docker info` yields no output; when no index can
# be found in the JSON the metric is reported as 0.
# Globals:   OUTPUT (appended to, via add_metric)
#######################################
collect_raft() {
  local daemon_info
  daemon_info=$(docker_cmd info --format '{{json .}}')
  if [[ -z "$daemon_info" ]]; then
    return 0
  fi

  # Candidate locations are tried in order; the first usable value wins and
  # nothing is printed when none of them is present.
  local index
  index=$(jq -r \
    '.Swarm.Cluster.RaftIndex // .Swarm.RaftIndex // .Swarm.Cluster.Version.Index // empty' \
    2>/dev/null <<< "$daemon_info")

  if [[ -z "$index" || "$index" == "null" ]]; then
    index=0
  fi
  add_metric "swarm_raft_index" "gauge" "Raft applied index" "${index}"
}
#######################################
# Deliver the accumulated metrics.
# Without --textfile the text goes to stdout. With it, the text is written
# to a PID-suffixed sibling file and then renamed to docker_swarm.prom, so
# the final name only ever refers to a completely written file.
# Globals:   OUTPUT, TEXTFILE_MODE, TEXTFILE_DIR (all read)
#######################################
write_output() {
  if [[ "$TEXTFILE_MODE" != true ]]; then
    echo "$OUTPUT"
    return
  fi

  local target="${TEXTFILE_DIR}/docker_swarm.prom"
  local staging="${target}.$$"

  mkdir -p "$TEXTFILE_DIR"
  echo "$OUTPUT" > "$staging"
  mv "$staging" "$target"
}
#######################################
# Install /etc/cron.d/docker-swarm-exporter so the exporter runs in
# --textfile mode every two minutes as root.
# Globals:   DOCKER_HOST, TEXTFILE_DIR (read; copied into the cron file)
# Outputs:   confirmation on stdout; an error on stderr when not root
# Returns:   0 on success; exits the script with status 1 when not root
#######################################
install_cron() {
  if ((EUID != 0)); then
    echo "ERROR: --install requires root" >&2
    exit 1
  fi

  local cron_file="/etc/cron.d/docker-swarm-exporter"

  # Absolute, symlink-resolved path of this script for the cron command.
  local exporter_path
  exporter_path=$(readlink -f "$0")

  {
    printf '%s\n' "# Docker Swarm Prometheus Exporter — runs every 2 minutes"
    # DOCKER_HOST is carried over only when it is configured.
    if [[ -n "$DOCKER_HOST" ]]; then
      printf '%s\n' "DOCKER_HOST=${DOCKER_HOST}"
    fi
    printf '%s\n' "TEXTFILE_DIR=${TEXTFILE_DIR}"
    printf '%s\n' "*/2 * * * * root ${exporter_path} --textfile 2>/dev/null"
  } > "$cron_file"

  chmod 644 "$cron_file"
  echo "Installed cron job: ${cron_file}"
  echo "Metrics will be written to: ${TEXTFILE_DIR}/docker_swarm.prom"
}
# --- Main ---
#######################################
# Entry point: parse options, run the collectors, and deliver the metrics.
#
# Options:
#   --textfile   write to the textfile collector directory, not stdout
#   --install    install the cron job and exit 0
#   --help, -h   print usage and exit 0
# Any other argument is an error: the message and the usage text go to
# stderr and the script exits with status 1.
#
# Globals:   TEXTFILE_MODE, START_TIME (written); VERSION (read);
#            OUTPUT (appended to, via add_metric)
# Arguments: the script's command-line arguments
#######################################
main() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --textfile)
        TEXTFILE_MODE=true
        ;;
      --install)
        check_dependencies
        validate_config
        install_cron
        exit 0
        ;;
      --help | -h)
        usage
        ;;
      *)
        echo "Unknown option: $arg" >&2
        # usage terminates with status 0 by itself; running it in a
        # subshell lets a mistyped option still end with a failure status.
        (usage) >&2 || true
        exit 1
        ;;
    esac
  done

  check_dependencies
  validate_config

  # Nanoseconds since the epoch (GNU date).
  START_TIME=$(date +%s%N)

  # Exporter info
  add_metric "swarm_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

  # The remaining collectors only make sense when the node listing worked.
  if collect_nodes; then
    collect_services
    collect_tasks
    collect_networks
    collect_raft
  fi

  # Elapsed time in seconds with two decimals, using shell integer
  # arithmetic so no external calculator is required. Falls back to 0 when
  # `date` did not return plain nanosecond integers.
  local end_time elapsed_ns
  local duration="0"
  end_time=$(date +%s%N)
  if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]] \
    && ((end_time >= START_TIME)); then
    elapsed_ns=$((end_time - START_TIME))
    printf -v duration '%d.%02d' \
      "$((elapsed_ns / 1000000000))" \
      "$((elapsed_ns % 1000000000 / 10000000))"
  fi
  add_metric "swarm_exporter_duration_seconds" "gauge" "Script execution time" "$duration"
  add_metric "swarm_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

  write_output
}
# Script entry point: pass every command-line argument to main unchanged.
main "$@"