#!/usr/bin/env bash
#
# Docker Swarm Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for Docker Swarm.
# Uses the Docker CLI to collect node membership, service replica counts,
# task states, overlay network counts, manager leader status,
# and Raft consensus index.
#
# Usage:
#   ./docker-swarm-exporter.sh
#   ./docker-swarm-exporter.sh --textfile
#   ./docker-swarm-exporter.sh --install
#
# Parameters:
#   --textfile    Write to textfile collector directory
#   --install     Create cron job for automatic collection
#   --help        Show usage
#
# Environment:
#   DOCKER_HOST     Docker daemon socket or host (default: local socket)
#   TEXTFILE_DIR    Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
#   Core:
#     - swarm_up
#     - swarm_exporter_info{version}
#
#   Nodes:
#     - swarm_node_count
#     - swarm_nodes_ready
#     - swarm_nodes_down
#     - swarm_managers_total
#     - swarm_manager_leader
#     - swarm_workers_total
#
#   Services:
#     - swarm_services_total
#     - swarm_service_replicas{service}
#     - swarm_service_replicas_running{service}
#
#   Tasks:
#     - swarm_tasks_running
#     - swarm_tasks_failed
#
#   Networks:
#     - swarm_networks_total
#
#   Raft:
#     - swarm_raft_index
#
#   Exporter:
#     - swarm_exporter_duration_seconds
#     - swarm_exporter_last_run_timestamp

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# --- Configuration ---
readonly VERSION="1.0"

# Assign first, then freeze: `readonly X="$(cmd)"` would hide a failure of cmd.
# `--` keeps basename from treating a $0 that starts with '-' as an option.
SCRIPT_NAME="$(basename -- "$0")"
readonly SCRIPT_NAME

# Both settings may be supplied through the environment.
DOCKER_HOST="${DOCKER_HOST:-}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"

# Runtime state shared by the functions below.
TEXTFILE_MODE=false   # set to true by --textfile
OUTPUT=""             # accumulated metrics text
START_TIME=""         # collection start, filled in by main

# --- Functions ---

# Print the help text to stdout and terminate the script.
# Globals:   SCRIPT_NAME (read)
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Returns:   never returns; always exits
usage() {
    cat <<EOF
Usage: $SCRIPT_NAME [OPTIONS]

Docker Swarm Prometheus Metrics Exporter

Options:
  --textfile    Write metrics to textfile collector directory
  --install     Create cron job for automatic collection
  --help        Show this help message

Environment Variables:
  DOCKER_HOST     Docker daemon socket or host (default: local socket)
  TEXTFILE_DIR    Output directory (default: /var/lib/node_exporter/textfile_collector)

Examples:
  $SCRIPT_NAME
  $SCRIPT_NAME --textfile
  DOCKER_HOST="tcp://swarm-manager:2375" $SCRIPT_NAME --textfile
  $SCRIPT_NAME --install
EOF
    exit "${1:-0}"
}

# Verify that the external commands this script relies on are available.
# Outputs:   an error plus an install hint on stderr when something is missing
# Returns:   0 when docker and jq are both found; otherwise exits with status 1
check_dependencies() {
    local cmd   # keep the loop variable out of the global scope
    local missing=()

    for cmd in docker jq; do
        if ! command -v "$cmd" >/dev/null 2>&1; then
            missing+=("$cmd")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        echo "ERROR: Missing required commands: ${missing[*]}" >&2
        echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
        exit 1
    fi
}

# Export DOCKER_HOST, when it has a value, so the docker CLI picks it up.
# Globals:   DOCKER_HOST (read, exported when non-empty)
# Returns:   0
validate_config() {
    # The :- default keeps this safe under `set -u` when DOCKER_HOST was never set.
    if [[ -n "${DOCKER_HOST:-}" ]]; then
        export DOCKER_HOST
    fi
}

# Run a docker command, passing its stdout through and discarding stderr.
# Arguments: "$@" - arguments handed to the docker CLI unchanged
# Outputs:   docker's stdout; nothing extra is printed when docker fails
# Returns:   always 0, so a docker failure cannot trip `set -e`/`pipefail`
docker_cmd() {
    # `|| true` rather than `|| echo ""`: an echoed blank line would reach
    # line-oriented consumers (while-read loops, wc -l) as a bogus empty record.
    docker "$@" 2>/dev/null || true
}

# Append one metric (HELP line, TYPE line and a single sample) to OUTPUT.
# Globals:   OUTPUT (appended to)
# Arguments: $1 - metric name
#            $2 - metric type (e.g. gauge)
#            $3 - help text
#            $4 - sample value
#            $5 - label pairs without the surrounding braces (optional)
# Returns:   0
add_metric() {
    local name="$1"
    local type="$2"
    local help="$3"
    local value="${4-}"
    local labels="${5:-}"

    # A sample line without a value is not valid exposition text, so drop the
    # metric instead of writing a line the consumer cannot parse.
    if [[ -z "$value" ]]; then
        echo "WARNING: skipping metric ${name}: empty value" >&2
        return 0
    fi

    local series="$name"
    if [[ -n "$labels" ]]; then
        series+="{${labels}}"
    fi

    OUTPUT+="# HELP ${name} ${help}"$'\n'
    OUTPUT+="# TYPE ${name} ${type}"$'\n'
    OUTPUT+="${series} ${value}"$'\n'
}

# Append a bare sample line (no HELP/TYPE header) to OUTPUT, for further
# samples of a metric whose header has already been written.
# Globals:   OUTPUT (appended to)
# Arguments: $1 - metric name
#            $2 - sample value
#            $3 - label pairs without the surrounding braces (optional)
# Returns:   0
add_metric_value() {
    local name="$1"
    local value="${2-}"
    local labels="${3:-}"

    # A sample line without a value is not valid exposition text; skip it.
    if [[ -z "$value" ]]; then
        echo "WARNING: skipping sample for ${name}: empty value" >&2
        return 0
    fi

    local series="$name"
    if [[ -n "$labels" ]]; then
        series+="{${labels}}"
    fi

    OUTPUT+="${series} ${value}"$'\n'
}

# Collect node-level metrics from `docker node ls`.
# Emits swarm_up, swarm_node_count, swarm_nodes_ready, swarm_nodes_down,
# swarm_managers_total, swarm_workers_total and swarm_manager_leader.
# Globals:   OUTPUT (appended to, through add_metric)
# Returns:   0 when the node list was obtained; 1 (after emitting swarm_up 0)
#            when `docker node ls` produced no output
collect_nodes() {
    local nodes_json
    nodes_json=$(docker_cmd node ls --format '{{json .}}')

    if [[ -z "$nodes_json" ]]; then
        add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "0"
        return 1
    fi

    add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "1"

    # One jq pass over the slurped node objects yields every count at once,
    # printed as six space-separated integers.
    local counts
    counts=$(jq -rs '
        [ length,
          (map(select(.Status == "Ready")) | length),
          (map(select(.Status == "Down")) | length),
          (map(select(.ManagerStatus != "")) | length),
          (map(select(.ManagerStatus == "")) | length),
          (map(select((.Self == "true" or .Self == true)
                      and .ManagerStatus == "Leader")) | length)
        ] | map(tostring) | join(" ")' <<< "$nodes_json" 2>/dev/null) || counts=""

    local node_count nodes_ready nodes_down managers_total workers_total is_leader
    read -r node_count nodes_ready nodes_down managers_total workers_total is_leader \
        <<< "$counts" || true

    # This function runs as an `if` condition, where `set -e` is suspended, so
    # a jq failure would otherwise surface as empty metric values. Fall back
    # to 0 for anything that is not a plain integer.
    local field
    for field in node_count nodes_ready nodes_down managers_total workers_total is_leader; do
        [[ "${!field}" =~ ^[0-9]+$ ]] || printf -v "$field" '%s' 0
    done

    add_metric "swarm_node_count" "gauge" "Total number of nodes in the swarm" "${node_count}"
    add_metric "swarm_nodes_ready" "gauge" "Number of nodes in ready state" "${nodes_ready}"
    add_metric "swarm_nodes_down" "gauge" "Number of nodes in down state" "${nodes_down}"
    add_metric "swarm_managers_total" "gauge" "Total number of manager nodes" "${managers_total}"
    add_metric "swarm_workers_total" "gauge" "Total number of worker nodes" "${workers_total}"

    # Leader detection — is the node marked Self also the one marked Leader?
    local leader_flag=0
    if (( is_leader > 0 )); then
        leader_flag=1
    fi
    add_metric "swarm_manager_leader" "gauge" "Whether this node is the leader (1=leader, 0=not leader)" "${leader_flag}"

    return 0
}

# Collect service metrics from `docker service ls`.
# Emits swarm_services_total plus, per service, swarm_service_replicas
# (desired) and swarm_service_replicas_running.
# Globals:   OUTPUT (appended to, directly and through add_metric)
# Returns:   0
collect_services() {
    local services_json
    services_json=$(docker_cmd service ls --format '{{json .}}')

    if [[ -z "$services_json" ]]; then
        add_metric "swarm_services_total" "gauge" "Total number of services" "0"
        return
    fi

    # A single jq pass turns every service object into "Name<TAB>Replicas".
    # A jq failure degrades to "no rows" instead of aborting under `set -e`.
    local rows
    rows=$(jq -r '[.Name, (.Replicas // "")] | @tsv' <<< "$services_json" 2>/dev/null) || rows=""

    local service_count=0
    local desired_lines="" running_lines=""
    local service_name replicas_str running desired

    while IFS=$'\t' read -r service_name replicas_str; do
        [[ -n "$service_name" ]] || continue
        service_count=$((service_count + 1))

        # Replicas format is "RUNNING/DESIRED" (e.g. "3/3") or
        # "RUNNING/DESIRED (max N per node)"; strip any parenthetical suffix.
        replicas_str="${replicas_str%% (*}"
        running="${replicas_str%%/*}"
        desired="${replicas_str#*/}"
        desired="${desired%%/*}"

        # Anything that is not a plain integer is reported as 0.
        [[ "$desired" =~ ^[0-9]+$ ]] || desired=0
        [[ "$running" =~ ^[0-9]+$ ]] || running=0

        # The service name is inserted verbatim as the label value.
        desired_lines+="swarm_service_replicas{service=\"${service_name}\"} ${desired}"$'\n'
        running_lines+="swarm_service_replicas_running{service=\"${service_name}\"} ${running}"$'\n'
    done <<< "$rows"

    add_metric "swarm_services_total" "gauge" "Total number of services" "${service_count}"

    # Each metric family is written as one contiguous HELP/TYPE/samples group.
    if (( service_count > 0 )); then
        OUTPUT+="# HELP swarm_service_replicas Desired replica count per service"$'\n'
        OUTPUT+="# TYPE swarm_service_replicas gauge"$'\n'
        OUTPUT+="${desired_lines}"
        OUTPUT+="# HELP swarm_service_replicas_running Running replica count per service"$'\n'
        OUTPUT+="# TYPE swarm_service_replicas_running gauge"$'\n'
        OUTPUT+="${running_lines}"
    fi
}

# Collect task metrics: swarm_tasks_running and swarm_tasks_failed.
# Globals:   OUTPUT (appended to, through add_metric)
# Returns:   0
collect_tasks() {
    # Count tasks whose desired state is "running".
    # NOTE(review): `docker node ps` is called without a node argument, so this
    # presumably covers only the node the CLI is talking to — confirm whether a
    # cluster-wide count is intended.
    local tasks_running
    tasks_running=$(
        docker_cmd node ps --format '{{json .}}' --filter 'desired-state=running' \
            | jq -s 'length' 2>/dev/null
    ) || tasks_running=""
    [[ "$tasks_running" =~ ^[0-9]+$ ]] || tasks_running=0
    add_metric "swarm_tasks_running" "gauge" "Total number of running tasks" "${tasks_running}"

    # Count failed tasks across all services: shut-down tasks whose current
    # state starts with "Failed" or "Rejected" (case-insensitive).
    # Every docker call goes through docker_cmd and the substitution carries a
    # fallback, so one failing `service ps` (e.g. a service removed mid-run) or
    # a jq error cannot abort the whole run through pipefail + `set -e`.
    local tasks_failed svc_id
    tasks_failed=$(
        docker_cmd service ls -q \
            | while IFS= read -r svc_id; do
                [[ -n "$svc_id" ]] || continue
                docker_cmd service ps "$svc_id" --format '{{json .}}' --filter 'desired-state=shutdown'
            done \
            | jq -s 'map(select((.CurrentState // "") | test("^Failed|^Rejected"; "i"))) | length' 2>/dev/null
    ) || tasks_failed=""
    [[ "$tasks_failed" =~ ^[0-9]+$ ]] || tasks_failed=0
    add_metric "swarm_tasks_failed" "gauge" "Total number of failed tasks" "${tasks_failed}"
}

# Count overlay networks and emit swarm_networks_total.
# Globals:   OUTPUT (appended to, through add_metric)
# Returns:   0
collect_networks() {
    local networks_json
    networks_json=$(docker_cmd network ls --filter driver=overlay --format '{{json .}}')

    # One JSON object per line; count the lines in-shell instead of forking
    # echo | wc. Empty output (no networks, or docker failed) counts as 0.
    local network_count=0
    if [[ -n "$networks_json" ]]; then
        local -a network_lines=()
        mapfile -t network_lines <<< "$networks_json"
        network_count=${#network_lines[@]}
    fi

    add_metric "swarm_networks_total" "gauge" "Total number of overlay networks" "${network_count}"
}

# Emit swarm_raft_index from the JSON output of `docker info`.
# Globals:   OUTPUT (appended to, through add_metric)
# Returns:   0; emits nothing at all when `docker info` produced no output
collect_raft() {
    local info_json
    info_json=$(docker_cmd info --format '{{json .}}')

    if [[ -z "$info_json" ]]; then
        return
    fi

    # Try the candidate fields in order and take the first one that is present.
    # The fallback keeps a jq failure from aborting the run under `set -e`.
    local raft_index
    raft_index=$(jq -r '
        .Swarm.Cluster.RaftIndex
        // .Swarm.RaftIndex
        // .Swarm.Cluster.Version.Index
        // empty' <<< "$info_json" 2>/dev/null) || raft_index=""

    # Report 0 when no index was found or the value is not a plain integer.
    [[ "$raft_index" =~ ^[0-9]+$ ]] || raft_index=0

    add_metric "swarm_raft_index" "gauge" "Raft applied index" "${raft_index}"
}

# Deliver the accumulated metrics: into the textfile collector directory when
# TEXTFILE_MODE is true, otherwise to stdout.
# Globals:   TEXTFILE_MODE, TEXTFILE_DIR, OUTPUT (all read)
# Outputs:   the metrics text on stdout (stdout mode only)
# Returns:   0 on success, 1 when the metrics file could not be written
write_output() {
    if [[ "$TEXTFILE_MODE" == true ]]; then
        local output_file="${TEXTFILE_DIR}/docker_swarm.prom"
        local temp_file="${output_file}.$$"

        mkdir -p -- "$TEXTFILE_DIR"

        # Write to a temp file in the same directory and rename it into place,
        # so a reader never sees a half-written metrics file. OUTPUT already
        # ends with a newline, hence printf '%s' rather than echo.
        if ! printf '%s' "$OUTPUT" > "$temp_file"; then
            rm -f -- "$temp_file"   # do not leave a stale temp file behind
            echo "ERROR: failed to write ${temp_file}" >&2
            return 1
        fi
        mv -f -- "$temp_file" "$output_file"
    else
        printf '%s' "$OUTPUT"
    fi
}

# Install a cron.d entry that runs this script with --textfile every 2 minutes.
# Globals:   DOCKER_HOST, TEXTFILE_DIR (read; copied into the cron file)
# Outputs:   a confirmation on stdout; an error on stderr when not run as root
# Returns:   0 on success; exits with status 1 when not run as root
install_cron() {
    if [[ $EUID -ne 0 ]]; then
        echo "ERROR: --install requires root" >&2
        exit 1
    fi

    local script_path
    script_path=$(readlink -f -- "$0")

    local cron_file="/etc/cron.d/docker-swarm-exporter"

    {
        echo "# Docker Swarm Prometheus Exporter — runs every 2 minutes"
        # Carry DOCKER_HOST over only when one is configured.
        if [[ -n "${DOCKER_HOST:-}" ]]; then
            echo "DOCKER_HOST=${DOCKER_HOST}"
        fi
        echo "TEXTFILE_DIR=${TEXTFILE_DIR}"
        # Quote the path so an install location containing spaces still runs.
        echo "*/2 * * * * root \"${script_path}\" --textfile 2>/dev/null"
    } > "$cron_file"

    chmod 644 "$cron_file"
    echo "Installed cron job: ${cron_file}"
    echo "Metrics will be written to: ${TEXTFILE_DIR}/docker_swarm.prom"
}

# --- Main ---

# Entry point: parse options, collect every metric group, then emit the result.
# Globals:   TEXTFILE_MODE, START_TIME (written); VERSION (read)
# Arguments: "$@" - command-line options (--textfile, --install, --help/-h)
# Returns:   0 on success; exits 1 on an unknown option
main() {
    # Parse arguments
    local arg
    for arg in "$@"; do
        case "$arg" in
            --textfile)
                TEXTFILE_MODE=true
                ;;
            --install)
                check_dependencies
                validate_config
                install_cron
                exit 0
                ;;
            --help|-h)
                usage
                ;;
            *)
                echo "Unknown option: $arg" >&2
                # usage terminates the shell it runs in, so call it in a
                # subshell: the help goes to stderr and the script can still
                # report the bad invocation with a non-zero status.
                ( usage ) >&2
                exit 1
                ;;
        esac
    done

    check_dependencies
    validate_config

    START_TIME=$(date +%s%N)

    # Exporter info
    add_metric "swarm_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

    # Collect metrics; the remaining groups are skipped when the swarm is unreachable.
    if collect_nodes; then
        collect_services
        collect_tasks
        collect_networks
        collect_raft
    fi

    # Exporter performance: elapsed seconds with two decimals, computed with
    # shell arithmetic so no external calculator is needed. Falls back to 0
    # when `date` did not return plain nanosecond integers.
    local end_time elapsed_ns duration="0"
    end_time=$(date +%s%N)
    if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
        elapsed_ns=$(( end_time - START_TIME ))
        if (( elapsed_ns < 0 )); then
            elapsed_ns=0
        fi
        printf -v duration '%d.%02d' \
            "$(( elapsed_ns / 1000000000 ))" \
            "$(( (elapsed_ns % 1000000000) / 10000000 ))"
    fi
    add_metric "swarm_exporter_duration_seconds" "gauge" "Script execution time" "$duration"
    add_metric "swarm_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

    write_output
}

# Run the exporter with the script's command-line arguments.
main "$@"