88551536e6
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
1315 lines
46 KiB
Bash
Executable File
1315 lines
46 KiB
Bash
Executable File
#!/bin/bash
|
|
################################################################################
|
|
# Script Name: salt-master-metrics.sh
|
|
# Version: 3.1
|
|
# Author: Phil Connor, contact@mylinux.work
|
|
# License: MIT
|
|
# Description: Production Prometheus exporter for Salt Master metrics
|
|
#
|
|
# Exports metrics for:
|
|
# - Master process health (CPU, memory, uptime)
|
|
# - Minion connectivity (up, down, accepted, rejected, denied, unaccepted)
|
|
# - Per-minion last-seen timestamp (stale minions only)
|
|
# - Minion version drift (match vs mismatch count)
|
|
# - Job statistics (active, cached, recent, completed 1h/24h, failed 24h)
|
|
# - Jobs by function breakdown (top 10)
|
|
# - Per-function expected/actual responses with success/failure (24h)
|
|
# - Per-function new job counts with success/failure (24h)
|
|
# - Scheduled job returns per minion/function/state (24h)
|
|
# - Key management counts
|
|
# - Event bus health
|
|
# - ZeroMQ port status (4505, 4506)
|
|
# - Worker thread utilization
|
|
# - Salt versions (master vs minion drift)
|
|
# - Cache disk usage and inode count
|
|
# - Salt master log error rate (1h)
|
|
# - File server cache size
|
|
# - Configuration values (keep_jobs, master_stats)
|
|
# - Highstate metrics (failures, last timestamp per minion)
|
|
# - Extended configuration detection (state_events, presence_events, timeout, job_cache)
|
|
# - Salt API process status
|
|
# - Auth failure rate from logs
|
|
# - File roots total size
|
|
# - Master log file size
|
|
# - Minion auth/key events from journal
|
|
#
|
|
# Modes:
|
|
# --textfile Write to node_exporter textfile collector
|
|
# --http Run HTTP server for direct Prometheus scraping
|
|
# stdout Default: print metrics to stdout
|
|
#
|
|
# Changelog:
|
|
# 3.1 - Added per-function expected/actual response metrics, per-function
|
|
# new job metrics with success/failure, scheduled job return
|
|
# metrics per minion/function/state. All from job cache parsing.
|
|
# 3.0 - Added config detection (state_events, presence_events, timeout,
|
|
# job_cache, publish_port, ret_port), salt-api process status,
|
|
# auth failure rate, file_roots size, master log size, minion
|
|
# auth events. Pure bash (no python3 dependency).
|
|
# 2.0 - Added per-minion last-seen, version drift, jobs completed/failed,
|
|
# jobs by function, ZeroMQ port checks, cache disk/inode metrics,
|
|
# log error rate, fileserver cache size, config values, highstate
|
|
# metrics. Expanded caching layer for expensive operations.
|
|
# 1.0 - Initial release
|
|
################################################################################
|
|
|
|
# ---------------------------------------------------------------------------
# Exporter settings (OUTPUT_FILE, HTTP_MODE and HTTP_PORT are overwritten by
# the command-line parser)
# ---------------------------------------------------------------------------
SCRIPT_VERSION=3.1
TEXTFILE_DIR=/var/lib/node_exporter
OUTPUT_FILE=
HTTP_MODE=false
HTTP_PORT=9417
LOCK_FILE=/var/run/salt-master-metrics.lock

# Salt master locations
SALT_MASTER_CONFIG=/etc/salt/master
SALT_CACHE_DIR=/var/cache/salt/master
SALT_PKI_DIR=/etc/salt/pki/master
SALT_RUN_DIR=/var/run/salt/master

# Timeouts for salt commands (seconds)
SALT_CMD_TIMEOUT=15

# Cache for expensive operations (all start empty and are filled by the
# cache_* functions)
MINION_UP_CACHE= MINION_DOWN_CACHE=
KEY_LIST_CACHE= VERSIONS_CACHE=
LOG_ERRORS_CACHE= LOG_CRITICAL_CACHE=
HIGHSTATE_FAILURES_CACHE=
ACTIVE_JOBS_CACHE= JOB_LIST_CACHE=
LOG_AUTH_FAILURES_CACHE= LOG_KEY_EVENTS_CACHE=
|
|
|
|
# Print the usage text to stdout and terminate the script with status 0.
# Reads: $0, SCRIPT_VERSION, HTTP_PORT (expanded when the function runs).
show_usage() {
    local -a usage_text=(
        "Usage: $0 [OPTIONS]"
        ""
        "Export Salt Master metrics as Prometheus metrics (v${SCRIPT_VERSION})."
        ""
        "MODES:"
        "--textfile Write to node_exporter textfile collector"
        "--http Run HTTP server on port $HTTP_PORT"
        ""
        "OPTIONS:"
        "-p, --port HTTP port (default: $HTTP_PORT)"
        "-o, --output Output file"
        "-h, --help Show help"
        ""
        "REQUIREMENTS:"
        "- Must run as root or salt user"
        "- salt-run and salt-key commands must be available"
        ""
    )
    # One array element per output line; the format string keeps option-like
    # elements ("--http ...") from being parsed as printf options.
    printf '%s\n' "${usage_text[@]}"
    exit 0
}
|
|
|
|
# Parse command-line options into the global settings.
# Arguments: the script's positional parameters ("$@").
# Writes:    OUTPUT_FILE, HTTP_MODE, HTTP_PORT.
# Exits:     0 via show_usage for -h/--help; 1 on an unknown option, a
#            missing option value, or an invalid port.
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/salt_master_metrics.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # Without this check "shift 2" fails when the value is missing,
                # nothing is consumed, and the loop would spin forever.
                if (( $# < 2 )); then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                if [[ ! "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "Invalid port: $2" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            -o|--output)
                if (( $# < 2 )); then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                # Diagnostics go to stderr so they never end up in metric output.
                echo "Unknown: $1" >&2
                exit 1
                ;;
        esac
    done
}
|
|
|
|
# Take the single-instance lock by writing this shell's PID to LOCK_FILE.
# Reads:   LOCK_FILE.
# Exits:   1 when a live process already holds the lock.
# Effects: installs an EXIT trap that calls cleanup, plus INT/TERM traps that
#          exit, so a signal actually stops the script (the EXIT trap then
#          removes the lock). Trapping INT/TERM with cleanup alone would remove
#          the lock and let the script carry on running.
acquire_lock() {
    local pid

    if [[ -f "$LOCK_FILE" ]]; then
        pid=$(cat "$LOCK_FILE" 2>/dev/null)
        if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
            echo "ERROR: Another instance is already running (PID: $pid)" >&2
            exit 1
        fi
        echo "Removing stale lock file" >&2
        rm -f "$LOCK_FILE"
    fi

    # noclobber makes the create fail if another instance wrote the file
    # between the check above and this write.
    if ! ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
        if [[ -f "$LOCK_FILE" ]]; then
            pid=$(cat "$LOCK_FILE" 2>/dev/null)
            echo "ERROR: Another instance is already running (PID: $pid)" >&2
            exit 1
        fi
        # Lock location not writable: keep the previous best-effort behaviour
        # and run without a lock instead of aborting.
        echo "WARNING: cannot write lock file $LOCK_FILE; continuing without lock" >&2
    fi

    trap cleanup EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM
}
|
|
|
|
# Remove the lock file named by LOCK_FILE.
cleanup() {
    local lock_path="$LOCK_FILE"
    # -f: no error when the file is already gone.
    rm -f "$lock_path"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data collection (cached)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Store the output of "salt-key -L" in KEY_LIST_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_key_list() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-key -L)
    KEY_LIST_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Use plain-text salt-run commands (one minion per line, no python needed)
|
|
# Store the output of "salt-run manage.up" in MINION_UP_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_minion_up() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-run manage.up)
    MINION_UP_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Store the output of "salt-run manage.down" in MINION_DOWN_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_minion_down() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-run manage.down)
    MINION_DOWN_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Store the output of "salt-run manage.versions" in VERSIONS_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_versions() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions)
    VERSIONS_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Store the output of "salt-run jobs.active" in ACTIVE_JOBS_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_active_jobs() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active)
    ACTIVE_JOBS_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Store the output of "salt-run jobs.list_jobs --out=txt" in JOB_LIST_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT; its stderr is discarded and a
# failure is not propagated.
cache_job_list() {
    local -a cmd=(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt)
    JOB_LIST_CACHE="$("${cmd[@]}" 2>/dev/null || true)"
}
|
|
|
|
# Run all expensive salt-run commands in parallel using temp files
|
|
# Run all expensive salt-key / salt-run queries in parallel and load their
# output into the *_CACHE globals.
# Reads:  SALT_CMD_TIMEOUT.
# Writes: KEY_LIST_CACHE, MINION_UP_CACHE, MINION_DOWN_CACHE, VERSIONS_CACHE,
#         ACTIVE_JOBS_CACHE, JOB_LIST_CACHE.
# If the scratch directory cannot be created the queries run one after another
# instead; with an empty tmp_dir the redirections would target "/keys", "/up", ...
cache_all_salt_data() {
    local tmp_dir
    local -a pids=()

    if ! tmp_dir=$(mktemp -d /tmp/salt_metrics_cache.XXXXXX 2>/dev/null) || [[ ! -d "$tmp_dir" ]]; then
        echo "WARNING: mktemp failed; collecting salt data sequentially" >&2
        KEY_LIST_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-key -L 2>/dev/null || true)
        MINION_UP_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.up 2>/dev/null || true)
        MINION_DOWN_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.down 2>/dev/null || true)
        VERSIONS_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions 2>/dev/null || true)
        ACTIVE_JOBS_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active 2>/dev/null || true)
        JOB_LIST_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt 2>/dev/null || true)
        return 0
    fi

    timeout "$SALT_CMD_TIMEOUT" salt-key -L > "$tmp_dir/keys" 2>/dev/null &
    pids+=("$!")
    timeout "$SALT_CMD_TIMEOUT" salt-run manage.up > "$tmp_dir/up" 2>/dev/null &
    pids+=("$!")
    timeout "$SALT_CMD_TIMEOUT" salt-run manage.down > "$tmp_dir/down" 2>/dev/null &
    pids+=("$!")
    timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions > "$tmp_dir/versions" 2>/dev/null &
    pids+=("$!")
    timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active > "$tmp_dir/active" 2>/dev/null &
    pids+=("$!")
    timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt > "$tmp_dir/joblist" 2>/dev/null &
    pids+=("$!")

    # Wait only for the jobs started here; a bare "wait" would also block on
    # any unrelated background job of the calling shell.
    wait "${pids[@]}"

    KEY_LIST_CACHE=$(cat "$tmp_dir/keys" 2>/dev/null)
    MINION_UP_CACHE=$(cat "$tmp_dir/up" 2>/dev/null)
    MINION_DOWN_CACHE=$(cat "$tmp_dir/down" 2>/dev/null)
    VERSIONS_CACHE=$(cat "$tmp_dir/versions" 2>/dev/null)
    ACTIVE_JOBS_CACHE=$(cat "$tmp_dir/active" 2>/dev/null)
    JOB_LIST_CACHE=$(cat "$tmp_dir/joblist" 2>/dev/null)

    # ":?" aborts the expansion rather than ever running rm -rf on "".
    rm -rf -- "${tmp_dir:?}"
}
|
|
|
|
# Count, in a single pass over stdin, the four log-line classes the exporter
# reports. Prints "errors criticals auth_failures key_events" on one line.
# ERROR/CRITICAL are matched case-sensitively, the other two case-insensitively.
_salt_log_counts() {
    awk '
        BEGIN { err = 0; crit = 0; auth = 0; keyev = 0 }
        /ERROR/    { err++ }
        /CRITICAL/ { crit++ }
        {
            lower = tolower($0)
            if (lower ~ /authentication denied|failed to authenticate|salt\.crypt.*denied/) auth++
            if (lower ~ /salt\.key|key.*accept|key.*reject|key.*denied|new key/) keyev++
        }
        END { print err, crit, auth, keyev }
    '
}

# Fill the log-derived caches with counts from the last hour of salt-master
# logs.
# Source: journalctl (unit salt-master) when the command exists, otherwise
#         /var/log/salt/master filtered by a leading-timestamp string
#         comparison, otherwise all counts are 0.
# Writes: LOG_ERRORS_CACHE, LOG_CRITICAL_CACHE, LOG_AUTH_FAILURES_CACHE,
#         LOG_KEY_EVENTS_CACHE.
# The log is read once and streamed through one counter, so the four numbers
# come from the same snapshot.
cache_log_data() {
    local since_time cutoff counts=""

    since_time=$(date -d '1 hour ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null)

    if command -v journalctl >/dev/null 2>&1; then
        counts=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null \
            | _salt_log_counts)
    elif [[ -f /var/log/salt/master ]]; then
        cutoff=$(date -d '1 hour ago' '+%Y-%m-%d %H:%M' 2>/dev/null)
        counts=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null \
            | _salt_log_counts)
    fi

    # An empty result (no log source, or awk unavailable) becomes all zeros.
    read -r LOG_ERRORS_CACHE LOG_CRITICAL_CACHE LOG_AUTH_FAILURES_CACHE LOG_KEY_EVENTS_CACHE \
        <<< "${counts:-0 0 0 0}"
}
|
|
|
|
# Count highstate-failure log lines from the last 24 hours into
# HIGHSTATE_FAILURES_CACHE.
# Source: journalctl (unit salt-master) when available, otherwise
#         /var/log/salt/master, otherwise the count is "0".
cache_highstate_data() {
    local failure_pattern="highstate.*fail\|Highstate.*fail\|state.highstate.*False"
    local window_start file_cutoff

    window_start=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null)

    if command -v journalctl >/dev/null 2>&1; then
        HIGHSTATE_FAILURES_CACHE=$(journalctl -u salt-master --since "$window_start" --no-pager 2>/dev/null | grep -c "$failure_pattern" 2>/dev/null || true)
        return
    fi

    if [ -f /var/log/salt/master ]; then
        file_cutoff=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M' 2>/dev/null)
        HIGHSTATE_FAILURES_CACHE=$(awk -v cutoff="$file_cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -c "$failure_pattern" 2>/dev/null || true)
        return
    fi

    HIGHSTATE_FAILURES_CACHE="0"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Key metrics
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of minion keys in one category.
# Arguments: $1 - accepted | denied | unaccepted | rejected
# Output:    entry count of the matching directory under SALT_PKI_DIR;
#            "0" for an unknown category or when KEY_LIST_CACHE is empty.
get_key_count() {
    local category="$1"
    local key_subdir

    if [[ -z "$KEY_LIST_CACHE" ]]; then
        echo "0"
        return
    fi

    # Map the category name onto its PKI sub-directory.
    case "$category" in
        accepted)   key_subdir="minions" ;;
        denied)     key_subdir="minions_denied" ;;
        unaccepted) key_subdir="minions_pre" ;;
        rejected)   key_subdir="minions_rejected" ;;
        *)
            echo "0"
            return
            ;;
    esac

    find "$SALT_PKI_DIR/$key_subdir/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Minion status metrics (plain-text output: one minion per line, prefixed "- ")
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print how many lines of MINION_UP_CACHE start with "-" (one per minion);
# prints "0" when the cache is empty.
get_minions_up() {
    if [[ -n "$MINION_UP_CACHE" ]]; then
        # grep -c exits 1 on a zero count; "|| true" hides that status.
        echo "$MINION_UP_CACHE" | grep -c "^-" 2>/dev/null || true
    else
        echo "0"
    fi
}
|
|
|
|
# Print how many lines of MINION_DOWN_CACHE start with "-" (one per minion);
# prints "0" when the cache is empty.
get_minions_down() {
    if [[ -n "$MINION_DOWN_CACHE" ]]; then
        # grep -c exits 1 on a zero count; "|| true" hides that status.
        echo "$MINION_DOWN_CACHE" | grep -c "^-" 2>/dev/null || true
    else
        echo "0"
    fi
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Process metrics
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the PID of the oldest salt-master process, or an empty line if none.
# The pattern requires "salt-master" to be followed by end-of-line or by a
# character that cannot continue a name, so command lines that merely contain
# it as a prefix (this exporter, "salt-master-metrics.sh", or a
# "salt-master.log" argument) are not mistaken for the daemon.
get_master_pid() {
    pgrep -o -f 'salt-master([^[:alnum:]_.-]|$)' 2>/dev/null || echo ""
}
|
|
|
|
# Print how long the salt-master process has been running, in seconds
# ("0" when it is not running or the age cannot be determined).
# Primary source is ps "etimes" (elapsed seconds since process start). The
# modification time of /proc/<pid> is kept only as a fallback, because that
# timestamp is not guaranteed to equal the process start time.
get_master_uptime_seconds() {
    local pid elapsed start_time

    pid=$(get_master_pid)
    if [[ -z "$pid" ]]; then
        echo "0"
        return
    fi

    elapsed=$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d '[:space:]')
    if [[ "$elapsed" =~ ^[0-9]+$ ]]; then
        echo "$elapsed"
        return
    fi

    if [[ -d "/proc/$pid" ]]; then
        start_time=$(stat -c %Y "/proc/$pid" 2>/dev/null || echo "0")
        if [[ "$start_time" =~ ^[0-9]+$ ]] && (( start_time > 0 )); then
            echo $(( $(date +%s) - start_time ))
            return
        fi
    fi

    echo "0"
}
|
|
|
|
# Print the resident set size of the salt-master process in bytes, taken
# from the VmRSS line of /proc/<pid>/status (a kB figure, multiplied by 1024).
# Prints "0" when no master PID is found or the value is unavailable.
get_master_memory_bytes() {
    local pid status_file rss_kb

    pid=$(get_master_pid)
    status_file="/proc/$pid/status"

    if [[ -z "$pid" || ! -f "$status_file" ]]; then
        echo "0"
        return
    fi

    rss_kb=$(awk '/VmRSS/ {print $2}' "$status_file" 2>/dev/null)
    if [[ -z "$rss_kb" ]]; then
        echo "0"
        return
    fi

    echo $(( rss_kb * 1024 ))
}
|
|
|
|
# Print the %CPU figure ps reports for the salt-master process ("0" when the
# master is not running or ps yields nothing usable).
# The value is captured and validated explicitly: a trailing "|| echo 0" on a
# pipeline that ends in tr never fires, so a failing ps would otherwise give
# an empty metric value.
get_master_cpu_percent() {
    local pid cpu=""

    pid=$(get_master_pid)
    if [[ -n "$pid" ]]; then
        cpu=$(ps -p "$pid" -o %cpu --no-headers 2>/dev/null | tr -d ' ')
    fi

    if [[ "$cpu" =~ ^[0-9]+([.,][0-9]+)?$ ]]; then
        # Some locales print a decimal comma; Prometheus needs a dot.
        echo "${cpu/,/.}"
    else
        echo "0"
    fi
}
|
|
|
|
# Print the number of entries in /proc/<pid>/task for the salt-master
# process; "0" when no PID is found or the directory is missing.
get_master_thread_count() {
    local pid task_dir

    pid=$(get_master_pid)
    task_dir="/proc/$pid/task"

    if [[ -z "$pid" || ! -d "$task_dir" ]]; then
        echo "0"
        return
    fi

    find "$task_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
|
|
|
|
# Print the number of entries in /proc/<pid>/fd for the salt-master process;
# "0" when no PID is found or the directory is missing.
get_master_open_fds() {
    local pid fd_dir

    pid=$(get_master_pid)
    fd_dir="/proc/$pid/fd"

    if [[ -z "$pid" || ! -d "$fd_dir" ]]; then
        echo "0"
        return
    fi

    find "$fd_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
|
|
|
|
# Print the number of running salt-master processes ("0" when there are none
# or pgrep is unavailable).
# The pattern requires "salt-master" to be followed by end-of-line or by a
# character that cannot continue a name, so this exporter
# ("salt-master-metrics.sh") is not counted as a master process.
get_salt_process_count() {
    local count

    count=$(pgrep -c -f 'salt-master([^[:alnum:]_.-]|$)' 2>/dev/null)
    # pgrep -c prints 0 and exits 1 when nothing matches; anything that is
    # not a number (e.g. pgrep missing) is reported as 0.
    [[ "$count" =~ ^[0-9]+$ ]] || count=0
    echo "$count"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Worker thread config
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the configured worker_threads value.
# Looks in SALT_MASTER_CONFIG first, then in master.d/*.conf next to it
# (/etc/salt/master.d for the default config path). Prints 5 when the key is
# absent or its value is not a number.
# Only the first occurrence in a file is used, so a duplicated key cannot
# produce a multi-line value.
get_configured_workers() {
    local workers="" conf_dir conf_file
    local first_match='/^worker_threads:/ {print $2; exit}'

    workers=$(awk "$first_match" "$SALT_MASTER_CONFIG" 2>/dev/null)

    if [[ -z "$workers" ]]; then
        conf_dir="$(dirname -- "$SALT_MASTER_CONFIG")/master.d"
        for conf_file in "$conf_dir"/*.conf; do
            [[ -f "$conf_file" ]] || continue
            workers=$(awk "$first_match" "$conf_file" 2>/dev/null)
            [[ -n "$workers" ]] && break
        done
    fi

    [[ "$workers" =~ ^[0-9]+$ ]] || workers=5
    echo "$workers"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Job cache metrics
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of ".load.p" files below SALT_CACHE_DIR/jobs; "0" when
# that directory does not exist.
get_job_cache_count() {
    local jobs_dir="$SALT_CACHE_DIR/jobs"

    if [[ ! -d "$jobs_dir" ]]; then
        echo "0"
        return
    fi

    find "$jobs_dir" -name ".load.p" -type f 2>/dev/null | wc -l
}
|
|
|
|
# Print the total size in bytes of SALT_CACHE_DIR/jobs as reported by
# "du -sb". Prints "0" when the directory is missing or du produces no usable
# number, so the metric line never has an empty value.
get_job_cache_size_bytes() {
    local jobs_dir="$SALT_CACHE_DIR/jobs" size=""

    if [[ -d "$jobs_dir" ]]; then
        size=$(du -sb "$jobs_dir" 2>/dev/null | awk 'NR == 1 {print $1}')
    fi

    [[ "$size" =~ ^[0-9]+$ ]] || size=0
    echo "$size"
}
|
|
|
|
# Print how many lines of ACTIVE_JOBS_CACHE start with a digit (one per
# active job ID); prints "0" when the cache is empty.
get_active_jobs() {
    if [[ -n "$ACTIVE_JOBS_CACHE" ]]; then
        # grep -c exits 1 on a zero count; "|| true" hides that status.
        echo "$ACTIVE_JOBS_CACHE" | grep -c "^[0-9]" 2>/dev/null || true
    else
        echo "0"
    fi
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Event bus / IPC health
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print "1" when SALT_RUN_DIR/master_event_pub.ipc exists and is a socket,
# otherwise "0".
get_event_pub_socket_exists() {
    local present="0"

    [[ -S "$SALT_RUN_DIR/master_event_pub.ipc" ]] && present="1"
    echo "$present"
}
|
|
|
|
# Print "1" when SALT_RUN_DIR/master_event_pull.ipc exists and is a socket,
# otherwise "0".
get_event_pull_socket_exists() {
    local present="0"

    [[ -S "$SALT_RUN_DIR/master_event_pull.ipc" ]] && present="1"
    echo "$present"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Salt version
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the second word of the first line of "salt --version", or "unknown"
# when the command is unavailable, fails, or prints nothing usable.
# The fallback is applied to the captured value: "|| echo unknown" after a
# pipeline ending in awk never runs, because awk exits 0 on empty input.
get_salt_version() {
    local version

    version=$(salt --version 2>/dev/null | awk 'NR == 1 {print $2}')
    echo "${version:-unknown}"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Minion cache staleness
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of direct entries in SALT_CACHE_DIR/minions; "0" when
# that directory does not exist.
get_minion_cache_count() {
    local minions_dir="$SALT_CACHE_DIR/minions"

    if [[ ! -d "$minions_dir" ]]; then
        echo "0"
        return
    fi

    find "$minions_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Syndic detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of direct entries in SALT_PKI_DIR/syndics; "0" when that
# directory does not exist.
get_syndic_count() {
    local syndics_dir="$SALT_PKI_DIR/syndics"

    if [[ ! -d "$syndics_dir" ]]; then
        echo "0"
        return
    fi

    find "$syndics_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-minion last-seen (stale only, >1h)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Emit per-minion last-seen metrics from the minion cache directories.
# Output (stdout):
#   salt_master_minion_last_seen_seconds{minion="..."} <dir mtime, epoch s>
#       only for minions whose cache directory is older than the stale
#       threshold, as the file header documents ("stale minions only", >1h)
#   salt_master_minion_cache_oldest_seconds <age of the oldest directory>
#       computed over all minions
# Reads: SALT_CACHE_DIR; MINION_STALE_SECONDS (optional, default 3600).
generate_minion_last_seen_metrics() {
    local now oldest_age=0 minion_dir minion_name mtime age
    local stale_after="${MINION_STALE_SECONDS:-3600}"

    now=$(date +%s)

    if [[ ! -d "$SALT_CACHE_DIR/minions" ]]; then
        echo "salt_master_minion_cache_oldest_seconds 0"
        return
    fi

    for minion_dir in "$SALT_CACHE_DIR/minions"/*/; do
        [[ -d "$minion_dir" ]] || continue

        # Directory name without the glob's trailing slash.
        minion_name=${minion_dir%/}
        minion_name=${minion_name##*/}

        mtime=$(stat -c %Y "$minion_dir" 2>/dev/null || echo "$now")
        age=$(( now - mtime ))
        if (( age > oldest_age )); then
            oldest_age=$age
        fi

        if (( age > stale_after )); then
            # Escape backslash and double quote for the label value.
            minion_name=${minion_name//\\/\\\\}
            minion_name=${minion_name//\"/\\\"}
            echo "salt_master_minion_last_seen_seconds{minion=\"${minion_name}\"} ${mtime}"
        fi
    done

    echo "salt_master_minion_cache_oldest_seconds $oldest_age"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Minion version drift
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of "- minion" lines that directly follow the
# "Up to date:" header in VERSIONS_CACHE ("0" when the cache is empty).
# Counting stops at the next line that starts with a letter (the next
# section header). The scan is a single awk pass, so no loop variable is left
# behind in the caller's scope.
get_version_match_count() {
    if [[ -z "$VERSIONS_CACHE" ]]; then
        echo "0"
        return
    fi

    awk '
        /^Up to date:/        { in_section = 1; next }
        in_section && /^- /   { count++; next }
        in_section && /^[A-Za-z]/ { exit }
        END { print count + 0 }
    ' <<< "$VERSIONS_CACHE"
}
|
|
|
|
# Print the number of "- minion" lines in VERSIONS_CACHE that are NOT under
# the "Up to date:" header ("0" when the cache is empty).
# Any other line starting with a letter closes the "Up to date:" section.
# The scan is a single awk pass, so no loop variable is left behind in the
# caller's scope.
get_version_mismatch_count() {
    if [[ -z "$VERSIONS_CACHE" ]]; then
        echo "0"
        return
    fi

    awk '
        /^Up to date:/ { up_to_date = 1; next }
        /^[A-Za-z]/    { up_to_date = 0; next }
        !up_to_date && /^- / { count++ }
        END { print count + 0 }
    ' <<< "$VERSIONS_CACHE"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Jobs completed (1h and 24h)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of ".load.p" files under SALT_CACHE_DIR/jobs modified
# within the last 60 minutes; "0" when the directory does not exist.
get_jobs_completed_1h() {
    local jobs_dir="$SALT_CACHE_DIR/jobs"

    if [[ ! -d "$jobs_dir" ]]; then
        echo "0"
        return
    fi

    find "$jobs_dir" -name ".load.p" -type f -mmin -60 2>/dev/null | wc -l
}
|
|
|
|
# Print the number of ".load.p" files under SALT_CACHE_DIR/jobs modified
# within the last 1440 minutes; "0" when the directory does not exist.
get_jobs_completed_24h() {
    local jobs_dir="$SALT_CACHE_DIR/jobs"

    if [[ ! -d "$jobs_dir" ]]; then
        echo "0"
        return
    fi

    find "$jobs_dir" -name ".load.p" -type f -mmin -1440 2>/dev/null | wc -l
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Failed jobs (24h)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the number of JOB_LIST_CACHE lines that look like a failure: a line
# matching "result ... false" or "retcode: <non-zero digit>", compared
# case-insensitively. Prints "0" when the cache is empty.
get_jobs_failed_24h() {
    local failure_pattern="result.*false\|retcode: [1-9]"

    if [[ -n "$JOB_LIST_CACHE" ]]; then
        # grep -c exits 1 on a zero count; "|| true" hides that status.
        echo "$JOB_LIST_CACHE" | grep -ic "$failure_pattern" 2>/dev/null || true
    else
        echo "0"
    fi
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Jobs by function (top 10, last 24h)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Emit salt_master_jobs_by_function{function="..."} <count> for the ten most
# frequent function names found on "Function:" lines of JOB_LIST_CACHE.
# Emits nothing when the cache is empty.
generate_jobs_by_function_metrics() {
    local tally fn_name

    if [[ -z "$JOB_LIST_CACHE" ]]; then
        return 0
    fi

    # Pipeline: keep "Function:" lines -> take the text after the last ": "
    # -> count identical names -> highest count first -> top ten.
    echo "$JOB_LIST_CACHE" \
        | grep -i "Function:" 2>/dev/null \
        | awk -F': *' '{print $NF}' \
        | sort \
        | uniq -c \
        | sort -rn \
        | head -10 \
        | while read -r tally fn_name; do
            [ -n "$fn_name" ] && echo "salt_master_jobs_by_function{function=\"${fn_name}\"} ${tally}"
        done
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Job cache single-pass scan (collects all per-function metrics at once)
|
|
# Max 200 jobs to avoid slow scans on busy masters.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Upper bound on the number of job-cache entries inspected per run.
# Defaults to 200; a value already present in the environment is kept, so the
# limit can be tuned without editing the script.
JOB_CACHE_SCAN_MAX="${JOB_CACHE_SCAN_MAX:-200}"
|
|
|
|
# Scan recent job-cache entries once and emit per-function and per-schedule
# response metrics.
# Reads:  SALT_CACHE_DIR, JOB_LIST_CACHE, JOB_CACHE_SCAN_MAX (default 200).
# Output (stdout):
#   salt_master_expected_responses_total{function,state=""}   minion dirs seen
#   salt_master_function_responses_total{function,state="",success}
#   salt_master_scheduled_job_return_total{function,minion,state,success}
# Heuristics, unchanged: the function name comes from the job list (matched by
# the concatenated directory names) or else from printable strings in
# .load.p; a job counts as scheduled when .load.p mentions "schedule"; a
# return counts as failed when return.p contains
# false/traceback/error/exception.
generate_job_cache_metrics() {
    if [[ ! -d "$SALT_CACHE_DIR/jobs" ]]; then
        return 0
    fi

    local -A func_expected=() func_success=() func_failure=()
    local -A sched_key_success=() sched_key_failure=()
    local load_files load_file job_dir prefix_dir full_jid func_name load_strings
    local is_scheduled state_name minion_count minion_dir minion_name
    local ret_content is_fail skey func key minion state all_funcs all_keys

    load_files=$(find "$SALT_CACHE_DIR/jobs" -name ".load.p" -type f -mmin -1440 2>/dev/null \
        | head -n "${JOB_CACHE_SCAN_MAX:-200}")
    if [[ -z "$load_files" ]]; then
        return 0
    fi

    while IFS= read -r load_file; do
        [[ -n "$load_file" ]] || continue
        job_dir=${load_file%/*}

        # Preferred source for the function name: the cached job list.
        func_name=""
        if [[ -n "$JOB_LIST_CACHE" ]]; then
            prefix_dir=${job_dir%/*}
            full_jid="${prefix_dir##*/}${job_dir##*/}"
            # -F: the identifier is matched literally, not as a regex.
            func_name=$(grep -F -A5 -- "$full_jid" <<< "$JOB_LIST_CACHE" 2>/dev/null \
                | grep -i "Function:" | head -1 \
                | sed 's/.*Function:[[:space:]]*//' | tr -d '[:space:]')
        fi

        # Printable strings of the load file are needed for schedule detection
        # in every case, so they are extracted exactly once per job.
        load_strings=$(timeout 2 strings "$load_file" 2>/dev/null || true)

        if [[ -z "$func_name" ]]; then
            func_name=$(grep -oE '(cmd\.[a-z_]+|state\.[a-z_]+|test\.[a-z_]+|grains\.[a-z_]+|pillar\.[a-z_]+|saltutil\.[a-z_]+|pkg\.[a-z_]+|service\.[a-z_]+|file\.[a-z_]+|sys\.[a-z_]+)' \
                <<< "$load_strings" | head -1)
        fi
        [[ -n "$func_name" ]] || func_name="unknown"

        is_scheduled=0
        state_name=""
        if grep -qi "schedule" <<< "$load_strings" 2>/dev/null; then
            is_scheduled=1
            state_name=$(grep -oE '\b[a-z_]+\.(sls|init)\b' <<< "$load_strings" \
                | head -1 | sed -e 's/\.sls$//' -e 's/\.init$//')
        fi

        minion_count=0
        for minion_dir in "$job_dir"/*/; do
            [[ -d "$minion_dir" ]] || continue
            minion_count=$((minion_count + 1))

            minion_name=${minion_dir%/}
            minion_name=${minion_name##*/}

            [[ -f "$minion_dir/return.p" ]] || continue

            ret_content=$(timeout 2 strings "$minion_dir/return.p" 2>/dev/null || true)
            is_fail=0
            if grep -qiE "false|traceback|error|exception" <<< "$ret_content" 2>/dev/null; then
                is_fail=1
            fi

            if (( is_fail )); then
                func_failure["$func_name"]=$(( ${func_failure["$func_name"]:-0} + 1 ))
            else
                func_success["$func_name"]=$(( ${func_success["$func_name"]:-0} + 1 ))
            fi

            if (( is_scheduled )); then
                skey="${func_name}|${minion_name}|${state_name}"
                if (( is_fail )); then
                    sched_key_failure["$skey"]=$(( ${sched_key_failure["$skey"]:-0} + 1 ))
                else
                    sched_key_success["$skey"]=$(( ${sched_key_success["$skey"]:-0} + 1 ))
                fi
            fi
        done

        if (( minion_count > 0 )); then
            func_expected["$func_name"]=$(( ${func_expected["$func_name"]:-0} + minion_count ))
        fi
    done <<< "$load_files"

    for func in "${!func_expected[@]}"; do
        echo "salt_master_expected_responses_total{function=\"${func}\",state=\"\"} ${func_expected[$func]}"
    done

    # Union of functions that had at least one success or failure.
    all_funcs=$(printf '%s\n' "${!func_success[@]}" "${!func_failure[@]}" | sort -u)
    while IFS= read -r func; do
        [[ -n "$func" ]] || continue
        echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"true\"} ${func_success[$func]:-0}"
        echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"false\"} ${func_failure[$func]:-0}"
    done <<< "$all_funcs"

    all_keys=$(printf '%s\n' "${!sched_key_success[@]}" "${!sched_key_failure[@]}" | sort -u)
    while IFS= read -r key; do
        [[ -n "$key" ]] || continue
        # Keys are "function|minion|state"; state may be empty.
        IFS='|' read -r func minion state <<< "$key"
        echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"true\"} ${sched_key_success[$key]:-0}"
        echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"false\"} ${sched_key_failure[$key]:-0}"
    done <<< "$all_keys"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# New jobs by function with success/failure (from job list text output)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Emit salt_master_new_job_total{function,state="",success} from the text in
# JOB_LIST_CACHE.
# Each "Function:" line starts a new job for that function. A later
# "Result: ... False" or "retcode: <non-zero digit>" line marks the current
# job as failed. A job is counted as failed at most once, so the failed count
# can never exceed the total and the success count can never go negative.
# Emits nothing when the cache is empty.
generate_new_job_metrics() {
    if [[ -z "$JOB_LIST_CACHE" ]]; then
        return 0
    fi

    local -A func_total=() func_failed=()
    local line func current_func="" current_failed=0
    local total failed succeeded

    while IFS= read -r line; do
        if [[ "$line" == *Function:* ]]; then
            # Text after the last "Function:", with all whitespace removed.
            current_func=${line##*Function:}
            current_func=${current_func//[[:space:]]/}
            current_failed=0
            if [[ -n "$current_func" ]]; then
                func_total["$current_func"]=$(( ${func_total["$current_func"]:-0} + 1 ))
            fi
        fi
        if [[ "$line" =~ Result:.*False ]] || [[ "$line" =~ retcode:\ [1-9] ]]; then
            if [[ -n "$current_func" ]] && (( current_failed == 0 )); then
                func_failed["$current_func"]=$(( ${func_failed["$current_func"]:-0} + 1 ))
                current_failed=1
            fi
        fi
    done <<< "$JOB_LIST_CACHE"

    for func in "${!func_total[@]}"; do
        total=${func_total[$func]}
        failed=${func_failed[$func]:-0}
        succeeded=$(( total - failed ))
        echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"true\"} ${succeeded}"
        echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"false\"} ${failed}"
    done
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ZeroMQ port status
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print "1" when the output of "ss -tlnp" contains ":<port> " (colon, port,
# space), otherwise "0".
# Arguments: $1 - TCP port number
get_port_listening() {
    local port="$1"
    local listening="0"

    ss -tlnp 2>/dev/null | grep -q ":${port} " 2>/dev/null && listening="1"
    echo "$listening"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cache disk usage and inode count
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the total size in bytes of SALT_CACHE_DIR as reported by "du -sb".
# Prints "0" when the directory is missing or du produces no usable number,
# so the metric line never has an empty value.
get_cache_disk_used_bytes() {
    local size=""

    if [[ -d "$SALT_CACHE_DIR" ]]; then
        size=$(du -sb "$SALT_CACHE_DIR" 2>/dev/null | awk 'NR == 1 {print $1}')
    fi

    [[ "$size" =~ ^[0-9]+$ ]] || size=0
    echo "$size"
}
|
|
|
|
# Print the bytes available on the filesystem holding SALT_CACHE_DIR (4th
# column of the last "df -B1" data row).
# Prints "0" when the directory is missing or df gives no usable number; the
# header row alone is never taken as a value. -P keeps each filesystem on a
# single output line.
get_cache_disk_available_bytes() {
    local avail=""

    if [[ -d "$SALT_CACHE_DIR" ]]; then
        avail=$(df -P -B1 "$SALT_CACHE_DIR" 2>/dev/null \
            | awk '{ last = $4 } END { if (NR > 1) print last }')
    fi

    [[ "$avail" =~ ^[0-9]+$ ]] || avail=0
    echo "$avail"
}
|
|
|
|
# Print the number of paths find reports under SALT_CACHE_DIR (the directory
# itself included); "0" when the directory does not exist.
get_cache_inode_count() {
    if [[ ! -d "$SALT_CACHE_DIR" ]]; then
        echo "0"
        return
    fi

    find "$SALT_CACHE_DIR" 2>/dev/null | wc -l
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File server cache size
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the total size in bytes of SALT_CACHE_DIR/file_lists as reported by
# "du -sb". Prints "0" when the directory is missing or du produces no usable
# number, so the metric line never has an empty value.
get_fileserver_cache_size_bytes() {
    local lists_dir="$SALT_CACHE_DIR/file_lists" size=""

    if [[ -d "$lists_dir" ]]; then
        size=$(du -sb "$lists_dir" 2>/dev/null | awk 'NR == 1 {print $1}')
    fi

    [[ "$size" =~ ^[0-9]+$ ]] || size=0
    echo "$size"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration values
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the value of the first "<key>:" line in a config file, or nothing.
# Arguments: $1 - key, $2 - file
# A trailing "# comment", all whitespace, and one surrounding quote character
# on each side are removed.
_salt_config_lookup() {
    local key="$1" file="$2" raw

    raw=$(awk -v key="$key" '
        $0 ~ ("^[[:space:]]*" key ":") {
            sub("^[[:space:]]*" key ":[[:space:]]*", "")
            sub(/(^|[[:space:]]+)#.*$/, "")
            gsub(/[[:space:]]/, "")
            print
            exit
        }
    ' "$file" 2>/dev/null)

    raw=${raw#[\"\']}
    raw=${raw%[\"\']}
    printf '%s' "$raw"
}

# Print a scalar setting of the Salt master.
# Arguments: $1 - configuration key
#            $2 - default printed when no value is found
# Lookup order: SALT_MASTER_CONFIG, then master.d/*.conf next to it
# (/etc/salt/master.d for the default config path), then
# "salt-run config.get <key>" bounded by SALT_CMD_TIMEOUT (a result of "None"
# counts as not set).
get_config_value() {
    local key="$1" default="$2"
    local val="" conf_dir conf_file

    val=$(_salt_config_lookup "$key" "$SALT_MASTER_CONFIG")

    if [[ -z "$val" ]]; then
        conf_dir="$(dirname -- "$SALT_MASTER_CONFIG")/master.d"
        for conf_file in "$conf_dir"/*.conf; do
            [[ -f "$conf_file" ]] || continue
            val=$(_salt_config_lookup "$key" "$conf_file")
            [[ -n "$val" ]] && break
        done
    fi

    if [[ -z "$val" ]]; then
        val=$(timeout "$SALT_CMD_TIMEOUT" salt-run config.get "$key" 2>/dev/null | tr -d '[:space:]')
        if [[ "$val" == "None" ]]; then
            val=""
        fi
    fi

    echo "${val:-$default}"
}
|
|
|
|
# Print "1" when a master setting is enabled, otherwise "0".
# Arguments: $1 - configuration key, $2 - default passed to get_config_value
# Accepted as true, case-insensitively: true, yes, on, 1. "on" is included
# because YAML 1.1 defines it as a boolean true.
get_config_bool() {
    local val

    val=$(get_config_value "$1" "$2")
    case "${val,,}" in
        true|yes|on|1) echo "1" ;;
        *) echo "0" ;;
    esac
}
|
|
|
|
# Print the configured keep_jobs value, falling back to "24" when unset.
get_config_keep_jobs() {
    local setting="keep_jobs"
    local fallback="24"

    get_config_value "$setting" "$fallback"
}
|
|
|
|
# Print the result of get_config_bool for the master_stats setting, with
# "false" as the default.
get_config_master_stats_enabled() {
    local setting="master_stats"
    local fallback="false"

    get_config_bool "$setting" "$fallback"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Salt API status
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print "1" when pgrep finds a process whose command line matches
# "salt-api", otherwise "0".
get_salt_api_running() {
    local running="0"

    pgrep -f "salt-api" >/dev/null 2>&1 && running="1"
    echo "$running"
}
|
|
|
|
# Print the port from the first "ss -tlnp" row that mentions "salt-api" and
# whose 4th column (local address) ends in digits. Prints nothing when no
# such row exists.
get_salt_api_port() {
    ss -tlnp 2>/dev/null | awk '
        /salt-api/ && match($4, /[0-9]+$/) {
            print substr($4, RSTART)
            exit
        }
    '
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File roots size
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the total size in bytes of the file_roots directory ("du -sb").
# The configured file_roots value is used when it names an existing
# directory. Otherwise the default root is used (/srv/salt, or
# SALT_FILE_ROOTS_DEFAULT when set). This also covers a non-empty value that
# is not a directory (e.g. a flattened nested mapping), which would otherwise
# block the fallback and report 0.
# Prints "0" when no directory is found or du produces no usable number.
get_file_roots_size_bytes() {
    local roots_dir size=""
    local fallback_dir="${SALT_FILE_ROOTS_DEFAULT:-/srv/salt}"

    roots_dir=$(get_config_value "file_roots" "")
    if [[ -z "$roots_dir" || ! -d "$roots_dir" ]]; then
        roots_dir="$fallback_dir"
    fi

    if [[ -d "$roots_dir" ]]; then
        size=$(du -sb "$roots_dir" 2>/dev/null | awk 'NR == 1 {print $1}')
    fi

    [[ "$size" =~ ^[0-9]+$ ]] || size=0
    echo "$size"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Master log file size
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the size in bytes of the master log file: the configured log_file
# value, or /var/log/salt/master when unset. Prints "0" when the file does
# not exist or stat fails.
get_master_log_size_bytes() {
    local log_path

    log_path=$(get_config_value "log_file" "/var/log/salt/master")

    if [[ ! -f "$log_path" ]]; then
        echo "0"
        return
    fi

    stat -c %s "$log_path" 2>/dev/null || echo "0"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pillar roots size
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Print the total size in bytes of the pillar_roots directory ("du -sb").
# The configured pillar_roots value is used when it names an existing
# directory. Otherwise the default root is used (/srv/pillar, or
# SALT_PILLAR_ROOTS_DEFAULT when set). This also covers a non-empty value
# that is not a directory (e.g. a flattened nested mapping), which would
# otherwise block the fallback and report 0.
# Prints "0" when no directory is found or du produces no usable number.
get_pillar_roots_size_bytes() {
    local pillar_dir size=""
    local fallback_dir="${SALT_PILLAR_ROOTS_DEFAULT:-/srv/pillar}"

    pillar_dir=$(get_config_value "pillar_roots" "")
    if [[ -z "$pillar_dir" || ! -d "$pillar_dir" ]]; then
        pillar_dir="$fallback_dir"
    fi

    if [[ -d "$pillar_dir" ]]; then
        size=$(du -sb "$pillar_dir" 2>/dev/null | awk 'NR == 1 {print $1}')
    fi

    [[ "$size" =~ ^[0-9]+$ ]] || size=0
    echo "$size"
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Metric generation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
#######################################
# Print the complete metric set in Prometheus text exposition format.
#
# Scalar values are collected first from the get_* helpers defined elsewhere
# in this file, then written out. Every sample value is defaulted to 0 when a
# helper returned nothing, because a sample line without a value is a parse
# error that can invalidate the whole scrape/textfile.
#
# Globals (read): SALT_PKI_DIR, SCRIPT_VERSION, JOB_LIST_CACHE,
#                 LOG_ERRORS_CACHE, LOG_CRITICAL_CACHE,
#                 HIGHSTATE_FAILURES_CACHE, LOG_AUTH_FAILURES_CACHE,
#                 LOG_KEY_EVENTS_CACHE
# Outputs: metrics on stdout.
#######################################
generate_metrics() {
    local start_time
    start_time=$(date +%s)

    # --- Master process -----------------------------------------------------
    local master_pid master_running
    master_pid=$(get_master_pid)
    if [ -n "$master_pid" ]; then
        master_running=1
    else
        master_running=0
    fi

    local salt_version
    salt_version=$(get_salt_version)

    # --- Key counts: one directory entry per minion key ---------------------
    local keys_accepted keys_denied keys_unaccepted keys_rejected
    keys_accepted=$(find "$SALT_PKI_DIR/minions/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
    keys_denied=$(find "$SALT_PKI_DIR/minions_denied/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
    keys_unaccepted=$(find "$SALT_PKI_DIR/minions_pre/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
    keys_rejected=$(find "$SALT_PKI_DIR/minions_rejected/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
    local keys_total=$((keys_accepted + keys_denied + keys_unaccepted + keys_rejected))

    local minions_up minions_down
    minions_up=$(get_minions_up)
    minions_down=$(get_minions_down)

    local master_uptime master_memory master_cpu master_threads master_fds salt_procs
    master_uptime=$(get_master_uptime_seconds)
    master_memory=$(get_master_memory_bytes)
    master_cpu=$(get_master_cpu_percent)
    master_threads=$(get_master_thread_count)
    master_fds=$(get_master_open_fds)
    salt_procs=$(get_salt_process_count)

    local configured_workers
    configured_workers=$(get_configured_workers)

    local job_cache_count job_cache_size active_jobs
    job_cache_count=$(get_job_cache_count)
    job_cache_size=$(get_job_cache_size_bytes)
    active_jobs=$(get_active_jobs)

    local event_pub_socket event_pull_socket
    event_pub_socket=$(get_event_pub_socket_exists)
    event_pull_socket=$(get_event_pull_socket_exists)

    local minion_cache_count syndic_count
    minion_cache_count=$(get_minion_cache_count)
    syndic_count=$(get_syndic_count)

    local version_match version_mismatch
    version_match=$(get_version_match_count)
    version_mismatch=$(get_version_mismatch_count)

    local jobs_1h jobs_24h jobs_failed_24h
    jobs_1h=$(get_jobs_completed_1h)
    jobs_24h=$(get_jobs_completed_24h)
    jobs_failed_24h=$(get_jobs_failed_24h)

    local port_4505 port_4506
    port_4505=$(get_port_listening 4505)
    port_4506=$(get_port_listening 4506)

    local cache_disk_used cache_disk_avail cache_inode_count
    cache_disk_used=$(get_cache_disk_used_bytes)
    cache_disk_avail=$(get_cache_disk_available_bytes)
    cache_inode_count=$(get_cache_inode_count)

    local fileserver_cache_size
    fileserver_cache_size=$(get_fileserver_cache_size_bytes)

    local config_keep_jobs config_master_stats
    config_keep_jobs=$(get_config_keep_jobs)
    config_master_stats=$(get_config_master_stats_enabled)

    local config_state_events config_presence_events config_timeout config_job_cache
    config_state_events=$(get_config_bool "state_events" "false")
    config_presence_events=$(get_config_bool "presence_events" "false")
    config_timeout=$(get_config_value "timeout" "5")
    config_job_cache=$(get_config_bool "job_cache" "true")

    local config_publish_port config_ret_port
    config_publish_port=$(get_config_value "publish_port" "4505")
    config_ret_port=$(get_config_value "ret_port" "4506")

    local salt_api_running salt_api_port
    salt_api_running=$(get_salt_api_running)
    salt_api_port=$(get_salt_api_port)

    local file_roots_size pillar_roots_size master_log_size
    file_roots_size=$(get_file_roots_size_bytes)
    pillar_roots_size=$(get_pillar_roots_size_bytes)
    master_log_size=$(get_master_log_size_bytes)

    cat <<EOF
# HELP salt_master_info Salt master info and version
# TYPE salt_master_info gauge
salt_master_info{version="${salt_version}",exporter_version="${SCRIPT_VERSION}"} 1

# HELP salt_master_up Whether the salt-master process is running
# TYPE salt_master_up gauge
salt_master_up ${master_running}

# HELP salt_master_uptime_seconds Salt master process uptime in seconds
# TYPE salt_master_uptime_seconds gauge
salt_master_uptime_seconds ${master_uptime:-0}

# HELP salt_master_memory_bytes Salt master RSS memory usage in bytes
# TYPE salt_master_memory_bytes gauge
salt_master_memory_bytes ${master_memory:-0}

# HELP salt_master_cpu_percent Salt master CPU usage percentage
# TYPE salt_master_cpu_percent gauge
salt_master_cpu_percent ${master_cpu:-0}

# HELP salt_master_threads Salt master thread count
# TYPE salt_master_threads gauge
salt_master_threads ${master_threads:-0}

# HELP salt_master_open_fds Salt master open file descriptors
# TYPE salt_master_open_fds gauge
salt_master_open_fds ${master_fds:-0}

# HELP salt_master_processes Total salt-master process count (workers + parent)
# TYPE salt_master_processes gauge
salt_master_processes ${salt_procs:-0}

# HELP salt_master_worker_threads_configured Configured worker_threads value
# TYPE salt_master_worker_threads_configured gauge
salt_master_worker_threads_configured ${configured_workers:-0}

# HELP salt_master_keys_total Total minion keys by status
# TYPE salt_master_keys_total gauge
salt_master_keys_total{status="accepted"} ${keys_accepted:-0}
salt_master_keys_total{status="denied"} ${keys_denied:-0}
salt_master_keys_total{status="unaccepted"} ${keys_unaccepted:-0}
salt_master_keys_total{status="rejected"} ${keys_rejected:-0}
salt_master_keys_total{status="all"} ${keys_total:-0}

# HELP salt_master_minions_up Minions currently responding
# TYPE salt_master_minions_up gauge
salt_master_minions_up ${minions_up:-0}

# HELP salt_master_minions_down Minions currently not responding
# TYPE salt_master_minions_down gauge
salt_master_minions_down ${minions_down:-0}

# HELP salt_master_minions_connectivity_ratio Ratio of up minions to accepted keys
# TYPE salt_master_minions_connectivity_ratio gauge
EOF

    if [ "$keys_accepted" -gt 0 ]; then
        local ratio
        # Values are handed to awk with -v rather than spliced into the program
        # text: an empty or non-numeric minions_up then evaluates to 0 instead
        # of turning into an awk syntax error and an empty sample value.
        ratio=$(awk -v up="${minions_up:-0}" -v accepted="$keys_accepted" \
            'BEGIN { printf "%.4f", up / accepted }')
        echo "salt_master_minions_connectivity_ratio ${ratio:-0}"
    else
        echo "salt_master_minions_connectivity_ratio 0"
    fi

    cat <<EOF

# HELP salt_master_job_cache_count Number of jobs in the job cache
# TYPE salt_master_job_cache_count gauge
salt_master_job_cache_count ${job_cache_count:-0}

# HELP salt_master_job_cache_size_bytes Size of the job cache directory in bytes
# TYPE salt_master_job_cache_size_bytes gauge
salt_master_job_cache_size_bytes ${job_cache_size:-0}

# HELP salt_master_active_jobs Currently running jobs
# TYPE salt_master_active_jobs gauge
salt_master_active_jobs ${active_jobs:-0}

# HELP salt_master_event_socket IPC event socket availability (1=exists, 0=missing)
# TYPE salt_master_event_socket gauge
salt_master_event_socket{socket="pub"} ${event_pub_socket:-0}
salt_master_event_socket{socket="pull"} ${event_pull_socket:-0}

# HELP salt_master_minion_cache_count Minions in the master cache directory
# TYPE salt_master_minion_cache_count gauge
salt_master_minion_cache_count ${minion_cache_count:-0}

# HELP salt_master_syndic_count Number of connected syndics
# TYPE salt_master_syndic_count gauge
salt_master_syndic_count ${syndic_count:-0}

# HELP salt_master_minion_version_match Minions matching master version
# TYPE salt_master_minion_version_match gauge
salt_master_minion_version_match ${version_match:-0}

# HELP salt_master_minion_version_mismatch Minions not matching master version
# TYPE salt_master_minion_version_mismatch gauge
salt_master_minion_version_mismatch ${version_mismatch:-0}

# HELP salt_master_jobs_completed Jobs completed in time period
# TYPE salt_master_jobs_completed gauge
salt_master_jobs_completed{period="1h"} ${jobs_1h:-0}
salt_master_jobs_completed{period="24h"} ${jobs_24h:-0}

# HELP salt_master_jobs_failed_24h Jobs with non-zero retcode in last 24h
# TYPE salt_master_jobs_failed_24h gauge
salt_master_jobs_failed_24h ${jobs_failed_24h:-0}

# HELP salt_master_port_listening ZeroMQ port listening status (1=listening, 0=not)
# TYPE salt_master_port_listening gauge
salt_master_port_listening{port="4505"} ${port_4505:-0}
salt_master_port_listening{port="4506"} ${port_4506:-0}

# HELP salt_master_cache_disk_used_bytes Salt cache directory disk usage in bytes
# TYPE salt_master_cache_disk_used_bytes gauge
salt_master_cache_disk_used_bytes ${cache_disk_used:-0}

# HELP salt_master_cache_disk_available_bytes Available disk space on cache partition in bytes
# TYPE salt_master_cache_disk_available_bytes gauge
salt_master_cache_disk_available_bytes ${cache_disk_avail:-0}

# HELP salt_master_cache_inode_count Total file and directory count in salt cache
# TYPE salt_master_cache_inode_count gauge
salt_master_cache_inode_count ${cache_inode_count:-0}

# HELP salt_master_log_errors_1h Salt master ERROR log entries in last 1h
# TYPE salt_master_log_errors_1h gauge
salt_master_log_errors_1h ${LOG_ERRORS_CACHE:-0}

# HELP salt_master_log_critical_1h Salt master CRITICAL log entries in last 1h
# TYPE salt_master_log_critical_1h gauge
salt_master_log_critical_1h ${LOG_CRITICAL_CACHE:-0}

# HELP salt_master_fileserver_cache_size_bytes Size of file_lists cache in bytes
# TYPE salt_master_fileserver_cache_size_bytes gauge
salt_master_fileserver_cache_size_bytes ${fileserver_cache_size:-0}

# HELP salt_master_config_keep_jobs Configured keep_jobs value (hours)
# TYPE salt_master_config_keep_jobs gauge
salt_master_config_keep_jobs ${config_keep_jobs:-0}

# HELP salt_master_config_master_stats_enabled Whether master_stats is enabled (1=yes, 0=no)
# TYPE salt_master_config_master_stats_enabled gauge
salt_master_config_master_stats_enabled ${config_master_stats:-0}

# HELP salt_master_config_state_events Whether state_events is enabled (1=yes, 0=no)
# TYPE salt_master_config_state_events gauge
salt_master_config_state_events ${config_state_events:-0}

# HELP salt_master_config_presence_events Whether presence_events is enabled (1=yes, 0=no)
# TYPE salt_master_config_presence_events gauge
salt_master_config_presence_events ${config_presence_events:-0}

# HELP salt_master_config_timeout Default salt command timeout (seconds)
# TYPE salt_master_config_timeout gauge
salt_master_config_timeout ${config_timeout:-0}

# HELP salt_master_config_job_cache Whether job_cache is enabled (1=yes, 0=no)
# TYPE salt_master_config_job_cache gauge
salt_master_config_job_cache ${config_job_cache:-0}

# HELP salt_master_config_publish_port Configured publish port
# TYPE salt_master_config_publish_port gauge
salt_master_config_publish_port ${config_publish_port:-0}

# HELP salt_master_config_ret_port Configured return port
# TYPE salt_master_config_ret_port gauge
salt_master_config_ret_port ${config_ret_port:-0}

# HELP salt_master_highstate_failures_24h Highstate failures detected in last 24h
# TYPE salt_master_highstate_failures_24h gauge
salt_master_highstate_failures_24h ${HIGHSTATE_FAILURES_CACHE:-0}

# HELP salt_master_log_auth_failures_1h Authentication failure events in last 1h
# TYPE salt_master_log_auth_failures_1h gauge
salt_master_log_auth_failures_1h ${LOG_AUTH_FAILURES_CACHE:-0}

# HELP salt_master_log_key_events_1h Key management events in last 1h
# TYPE salt_master_log_key_events_1h gauge
salt_master_log_key_events_1h ${LOG_KEY_EVENTS_CACHE:-0}

# HELP salt_master_salt_api_running Whether salt-api process is running (1=yes, 0=no)
# TYPE salt_master_salt_api_running gauge
salt_master_salt_api_running ${salt_api_running:-0}

# HELP salt_master_salt_api_port Salt API listening port (0 if not running)
# TYPE salt_master_salt_api_port gauge
salt_master_salt_api_port ${salt_api_port:-0}

# HELP salt_master_file_roots_size_bytes Total size of file_roots directory
# TYPE salt_master_file_roots_size_bytes gauge
salt_master_file_roots_size_bytes ${file_roots_size:-0}

# HELP salt_master_pillar_roots_size_bytes Total size of pillar_roots directory
# TYPE salt_master_pillar_roots_size_bytes gauge
salt_master_pillar_roots_size_bytes ${pillar_roots_size:-0}

# HELP salt_master_log_file_size_bytes Size of the salt master log file
# TYPE salt_master_log_file_size_bytes gauge
salt_master_log_file_size_bytes ${master_log_size:-0}

EOF

    # Per-minion last-seen (stale only). The samples themselves are printed by
    # generate_minion_last_seen_metrics (defined elsewhere in this file).
    echo "# HELP salt_master_minion_last_seen_seconds Last-seen cache mtime per minion (unix epoch)"
    echo "# TYPE salt_master_minion_last_seen_seconds gauge"
    echo ""
    echo "# HELP salt_master_minion_cache_oldest_seconds Age of the stalest minion cache in seconds"
    echo "# TYPE salt_master_minion_cache_oldest_seconds gauge"
    generate_minion_last_seen_metrics

    # Jobs by function (top 10, last 24h)
    echo ""
    echo "# HELP salt_master_jobs_by_function Job count by function in last 24h (top 10)"
    echo "# TYPE salt_master_jobs_by_function gauge"
    generate_jobs_by_function_metrics

    # Per-function job cache metrics (single-pass scan)
    echo ""
    echo "# HELP salt_master_expected_responses_total Expected minion responses per function (24h gauge)"
    echo "# TYPE salt_master_expected_responses_total gauge"
    echo "# HELP salt_master_function_responses_total Minion responses per function with success status (24h gauge)"
    echo "# TYPE salt_master_function_responses_total gauge"
    echo "# HELP salt_master_scheduled_job_return_total Scheduled job returns per function/minion/state (24h gauge)"
    echo "# TYPE salt_master_scheduled_job_return_total gauge"
    generate_job_cache_metrics

    # New jobs by function with success/failure (24h, from job list)
    echo ""
    echo "# HELP salt_master_new_job_total Jobs by function with success status (24h gauge)"
    echo "# TYPE salt_master_new_job_total gauge"
    generate_new_job_metrics

    # Highstate job count from job list (based on Function: state.highstate)
    echo ""
    echo "# HELP salt_master_highstate_jobs_24h Count of highstate jobs in last 24h"
    echo "# TYPE salt_master_highstate_jobs_24h gauge"
    if [ -n "$JOB_LIST_CACHE" ]; then
        local hs_count
        # grep -c prints 0 but exits 1 when nothing matches; "|| true" keeps
        # that from being treated as a failure.
        hs_count=$(grep -ic "Function:.*state\.highstate" <<<"$JOB_LIST_CACHE" 2>/dev/null || true)
        echo "salt_master_highstate_jobs_24h ${hs_count:-0}"
    else
        echo "salt_master_highstate_jobs_24h 0"
    fi

    cat <<EOF

# HELP salt_master_scrape_timestamp_seconds Unix timestamp of metric generation
# TYPE salt_master_scrape_timestamp_seconds gauge
salt_master_scrape_timestamp_seconds $(date +%s)

# HELP salt_master_exporter_duration_seconds Time to generate all metrics
# TYPE salt_master_exporter_duration_seconds gauge
salt_master_exporter_duration_seconds $(($(date +%s) - start_time))
EOF

    echo ""
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP server
|
|
# ---------------------------------------------------------------------------
|
|
|
|
run_http_server() {
|
|
echo "Starting Salt Master exporter on port $HTTP_PORT..." >&2
|
|
|
|
while true; do
|
|
{
|
|
read -r request
|
|
if [[ "$request" =~ ^GET\ /metrics ]]; then
|
|
printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\n\r\n"
|
|
cache_all_salt_data
|
|
cache_log_data
|
|
cache_highstate_data
|
|
generate_metrics
|
|
else
|
|
printf "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n"
|
|
echo "<h1>Salt Master Exporter v${SCRIPT_VERSION}</h1><a href='/metrics'>Metrics</a>"
|
|
fi
|
|
} | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null
|
|
done
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
#######################################
# Entry point: parse arguments and run in the selected mode.
#
#   HTTP_MODE=true      -> run_http_server (no lock is taken)
#   OUTPUT_FILE set     -> write metrics to that file (textfile collector)
#   otherwise           -> print metrics to stdout
#
# In textfile mode the metrics are staged in a temporary file in the SAME
# directory as OUTPUT_FILE and renamed into place, so a reader never sees a
# missing, partial or unreadable file. The staging name does not end in
# ".prom", which is the suffix node_exporter's textfile collector looks for.
#
# Globals (read): HTTP_MODE, OUTPUT_FILE (presumably set by parse_args)
# Arguments: the script's command-line arguments
# Returns: 0 on success; 1 if the metrics file could not be written.
#######################################
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
        return
    fi

    acquire_lock

    cache_all_salt_data
    cache_log_data
    cache_highstate_data

    if [ -z "$OUTPUT_FILE" ]; then
        generate_metrics
        return
    fi

    local output_dir temp_file
    output_dir=$(dirname -- "$OUTPUT_FILE")

    if ! mkdir -p -- "$output_dir"; then
        echo "ERROR: cannot create output directory $output_dir" >&2
        return 1
    fi

    if ! temp_file=$(mktemp "$output_dir/.salt_master_metrics.XXXXXX"); then
        echo "ERROR: cannot create temporary file in $output_dir" >&2
        return 1
    fi

    # Permissions are fixed before the rename (mktemp creates the file 0600),
    # and the old file is replaced by the rename itself rather than removed
    # first, so there is no window without a readable metrics file.
    if ! generate_metrics > "$temp_file" \
        || ! chmod 644 "$temp_file" \
        || ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
        echo "ERROR: failed to write metrics to $OUTPUT_FILE" >&2
        rm -f -- "$temp_file"
        return 1
    fi
}
|
|
|
|
# Script entry point: hand every command-line argument to main unchanged.
main "$@"
|