Files
linux-scripts/salt-master-metrics.sh
T

1315 lines
46 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# Script Name: salt-master-metrics.sh
# Version: 3.1
# Author: Phil Connor, contact@mylinux.work
# License: MIT
# Description: Production Prometheus exporter for Salt Master metrics
#
# Exports metrics for:
# - Master process health (CPU, memory, uptime)
# - Minion connectivity (up, down, accepted, rejected, denied, unaccepted)
# - Per-minion last-seen timestamp (emitted for every cached minion) and oldest cache age
# - Minion version drift (match vs mismatch count)
# - Job statistics (active, cached, recent, completed 1h/24h, failed 24h)
# - Jobs by function breakdown (top 10)
# - Per-function expected/actual responses with success/failure (24h)
# - Per-function new job counts with success/failure (24h)
# - Scheduled job returns per minion/function/state (24h)
# - Key management counts
# - Event bus health
# - ZeroMQ port status (4505, 4506)
# - Worker thread utilization
# - Salt versions (master vs minion drift)
# - Cache disk usage and inode count
# - Salt master log error rate (1h)
# - File server cache size
# - Configuration values (keep_jobs, master_stats)
# - Highstate metrics (failures, last timestamp per minion)
# - Extended configuration detection (state_events, presence_events, timeout, job_cache)
# - Salt API process status
# - Auth failure rate from logs
# - File roots total size
# - Master log file size
# - Minion auth/key events from journal
#
# Modes:
# --textfile Write to node_exporter textfile collector
# --http Run HTTP server for direct Prometheus scraping
# stdout Default: print metrics to stdout
#
# Changelog:
# 3.1 - Added per-function expected/actual response metrics, per-function
# new job metrics with success/failure, scheduled job return
# metrics per minion/function/state. All from job cache parsing.
# 3.0 - Added config detection (state_events, presence_events, timeout,
# job_cache, publish_port, ret_port), salt-api process status,
# auth failure rate, file_roots size, master log size, minion
# auth events. Pure bash (no python3 dependency).
# 2.0 - Added per-minion last-seen, version drift, jobs completed/failed,
# jobs by function, ZeroMQ port checks, cache disk/inode metrics,
# log error rate, fileserver cache size, config values, highstate
# metrics. Expanded caching layer for expensive operations.
# 1.0 - Initial release
################################################################################
# Exporter version; shown in the usage text and in the salt_master_info metric.
SCRIPT_VERSION="3.1"
# Directory that --textfile mode writes salt_master_metrics.prom into.
TEXTFILE_DIR="/var/lib/node_exporter"
# Output file path; set by --textfile or -o/--output, empty otherwise.
OUTPUT_FILE=""
# Switched to true by --http.
HTTP_MODE=false
# HTTP port default; overridden by -p/--port.
HTTP_PORT=9417
# PID lock file written by acquire_lock and removed by cleanup.
LOCK_FILE="/var/run/salt-master-metrics.lock"
# Salt master paths read by the collector functions below.
SALT_MASTER_CONFIG="/etc/salt/master"
SALT_CACHE_DIR="/var/cache/salt/master"
SALT_PKI_DIR="/etc/salt/pki/master"
SALT_RUN_DIR="/var/run/salt/master"
# Timeouts for salt commands (seconds)
SALT_CMD_TIMEOUT=15
# Cache for expensive operations
# Each variable holds raw command output (or a pre-computed count for the
# LOG_* / HIGHSTATE_* entries) filled in by the cache_* functions.
MINION_UP_CACHE=""
MINION_DOWN_CACHE=""
KEY_LIST_CACHE=""
VERSIONS_CACHE=""
LOG_ERRORS_CACHE=""
LOG_CRITICAL_CACHE=""
HIGHSTATE_FAILURES_CACHE=""
ACTIVE_JOBS_CACHE=""
JOB_LIST_CACHE=""
LOG_AUTH_FAILURES_CACHE=""
LOG_KEY_EVENTS_CACHE=""
# Print the command-line help to stdout, then terminate the script with
# status 0.
# Globals read: SCRIPT_VERSION, HTTP_PORT (and $0 for the program name).
show_usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "Export Salt Master metrics as Prometheus metrics (v${SCRIPT_VERSION})." \
    "MODES:" \
    "--textfile Write to node_exporter textfile collector" \
    "--http Run HTTP server on port $HTTP_PORT" \
    "OPTIONS:" \
    "-p, --port HTTP port (default: $HTTP_PORT)" \
    "-o, --output Output file" \
    "-h, --help Show help" \
    "REQUIREMENTS:" \
    "- Must run as root or salt user" \
    "- salt-run and salt-key commands must be available"
  exit 0
}
# Parse command-line options into the global settings.
# Arguments: the script's positional parameters ("$@").
# Globals written: OUTPUT_FILE, HTTP_MODE, HTTP_PORT.
# Globals read:    TEXTFILE_DIR.
# Exits 1 (message on stderr) for an unknown option, a missing option value,
# or a port that is not an integer in 1..65535. -h/--help calls show_usage.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/salt_master_metrics.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port)
        # Without this check "shift 2" fails on a lone trailing option and
        # leaves the arguments untouched, so the loop would never terminate.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        if [[ ! "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
          echo "Invalid port: $2" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      -o|--output)
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        # Diagnostics go to stderr so they never mix with metric output.
        echo "Unknown: $1" >&2
        exit 1
        ;;
    esac
  done
}
# Take the single-instance PID lock.
# Globals read: LOCK_FILE.
# The lock file is created with noclobber so that check-and-create is one
# atomic step. If the file already exists and names a live PID the script
# exits 1; a lock whose PID is not running is treated as stale and replaced.
# If the file cannot be created at all (for example an unwritable directory)
# a warning is printed and the script continues without a lock, as it did
# before.
# On success the cleanup handler is installed for EXIT, and INT/TERM are
# turned into real exits so the script stops instead of resuming after the
# handler has run.
acquire_lock() {
  local pid attempt
  for attempt in 1 2; do
    # $$ is still the main script PID inside the subshell.
    if ( set -o noclobber; echo $$ > "$LOCK_FILE" ) 2>/dev/null; then
      trap cleanup EXIT
      trap 'exit 130' INT
      trap 'exit 143' TERM
      return 0
    fi
    # Creation failed and nothing is there: the location is not writable.
    [ -e "$LOCK_FILE" ] || break
    pid=$(cat "$LOCK_FILE" 2>/dev/null)
    if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
      echo "ERROR: Another instance is already running (PID: $pid)" >&2
      exit 1
    fi
    echo "Removing stale lock file" >&2
    rm -f "$LOCK_FILE"
  done
  echo "WARNING: could not create lock file $LOCK_FILE; continuing without a lock" >&2
  return 0
}
# Trap handler registered by acquire_lock: delete the PID lock file.
# Globals read: LOCK_FILE.
cleanup() {
  rm -f "${LOCK_FILE}"
}
# ---------------------------------------------------------------------------
# Data collection (cached)
# ---------------------------------------------------------------------------
# Store the output of "salt-key -L" in KEY_LIST_CACHE.
# The command is bounded by SALT_CMD_TIMEOUT seconds, stderr is discarded and
# a failing command is not treated as an error.
cache_key_list() {
  local listing
  listing="$(timeout "${SALT_CMD_TIMEOUT}" salt-key -L 2>/dev/null)" || :
  KEY_LIST_CACHE="${listing}"
}
# Use plain-text salt-run commands (one minion per line, no python needed)
# Store the output of "salt-run manage.up" in MINION_UP_CACHE.
# Bounded by SALT_CMD_TIMEOUT seconds; stderr is discarded and a failing
# command is not treated as an error.
cache_minion_up() {
  local listing
  listing="$(timeout "${SALT_CMD_TIMEOUT}" salt-run manage.up 2>/dev/null)" || :
  MINION_UP_CACHE="${listing}"
}
# Store the output of "salt-run manage.down" in MINION_DOWN_CACHE.
# Bounded by SALT_CMD_TIMEOUT seconds; stderr is discarded and a failing
# command is not treated as an error.
cache_minion_down() {
  local listing
  listing="$(timeout "${SALT_CMD_TIMEOUT}" salt-run manage.down 2>/dev/null)" || :
  MINION_DOWN_CACHE="${listing}"
}
# Store the output of "salt-run manage.versions" in VERSIONS_CACHE.
# Bounded by SALT_CMD_TIMEOUT seconds; stderr is discarded and a failing
# command is not treated as an error.
cache_versions() {
  local report
  report="$(timeout "${SALT_CMD_TIMEOUT}" salt-run manage.versions 2>/dev/null)" || :
  VERSIONS_CACHE="${report}"
}
# Store the output of "salt-run jobs.active" in ACTIVE_JOBS_CACHE.
# Bounded by SALT_CMD_TIMEOUT seconds; stderr is discarded and a failing
# command is not treated as an error.
cache_active_jobs() {
  local report
  report="$(timeout "${SALT_CMD_TIMEOUT}" salt-run jobs.active 2>/dev/null)" || :
  ACTIVE_JOBS_CACHE="${report}"
}
# Store the output of "salt-run jobs.list_jobs --out=txt" in JOB_LIST_CACHE.
# Bounded by SALT_CMD_TIMEOUT seconds; stderr is discarded and a failing
# command is not treated as an error.
cache_job_list() {
  local report
  report="$(timeout "${SALT_CMD_TIMEOUT}" salt-run jobs.list_jobs --out=txt 2>/dev/null)" || :
  JOB_LIST_CACHE="${report}"
}
# Run all expensive salt-run commands in parallel using temp files
# Run the salt-key / salt-run queries concurrently and load their output into
# the *_CACHE globals.
# Globals read:    SALT_CMD_TIMEOUT.
# Globals written: KEY_LIST_CACHE, MINION_UP_CACHE, MINION_DOWN_CACHE,
#                  VERSIONS_CACHE, ACTIVE_JOBS_CACHE, JOB_LIST_CACHE.
# Returns: 0 normally; 1 (with all caches emptied) when no temporary
#          directory could be created.
cache_all_salt_data() {
  local tmp_dir
  local -a pids=()
  # An unchecked mktemp failure would leave tmp_dir empty and point every
  # redirect below at the filesystem root ("/keys", "/up", ...).
  if ! tmp_dir=$(mktemp -d /tmp/salt_metrics_cache.XXXXXX) || [ ! -d "$tmp_dir" ]; then
    echo "WARNING: could not create temporary directory for salt data" >&2
    KEY_LIST_CACHE=""
    MINION_UP_CACHE=""
    MINION_DOWN_CACHE=""
    VERSIONS_CACHE=""
    ACTIVE_JOBS_CACHE=""
    JOB_LIST_CACHE=""
    return 1
  fi
  timeout "$SALT_CMD_TIMEOUT" salt-key -L > "$tmp_dir/keys" 2>/dev/null &
  pids+=("$!")
  timeout "$SALT_CMD_TIMEOUT" salt-run manage.up > "$tmp_dir/up" 2>/dev/null &
  pids+=("$!")
  timeout "$SALT_CMD_TIMEOUT" salt-run manage.down > "$tmp_dir/down" 2>/dev/null &
  pids+=("$!")
  timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions > "$tmp_dir/versions" 2>/dev/null &
  pids+=("$!")
  timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active > "$tmp_dir/active" 2>/dev/null &
  pids+=("$!")
  timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt > "$tmp_dir/joblist" 2>/dev/null &
  pids+=("$!")
  # Wait only for the jobs started here, not for unrelated background jobs.
  wait "${pids[@]}"
  KEY_LIST_CACHE=$(cat "$tmp_dir/keys" 2>/dev/null)
  MINION_UP_CACHE=$(cat "$tmp_dir/up" 2>/dev/null)
  MINION_DOWN_CACHE=$(cat "$tmp_dir/down" 2>/dev/null)
  VERSIONS_CACHE=$(cat "$tmp_dir/versions" 2>/dev/null)
  ACTIVE_JOBS_CACHE=$(cat "$tmp_dir/active" 2>/dev/null)
  JOB_LIST_CACHE=$(cat "$tmp_dir/joblist" 2>/dev/null)
  rm -rf -- "${tmp_dir:?}"
  return 0
}
# Count salt-master log events from the last hour.
# Source: "journalctl -u salt-master" when journalctl is available, otherwise
# /var/log/salt/master filtered to lines that compare >= the cutoff string.
# When neither source exists every counter is "0".
# Globals written:
#   LOG_ERRORS_CACHE        - lines containing "ERROR"
#   LOG_CRITICAL_CACHE      - lines containing "CRITICAL"
#   LOG_AUTH_FAILURES_CACHE - case-insensitive authentication-failure patterns
#   LOG_KEY_EVENTS_CACHE    - case-insensitive key accept/reject/deny patterns
# The log window is read once and reused for all four counts instead of
# invoking journalctl (or awk) separately for each pattern.
cache_log_data() {
  local since_time cutoff log_text=""
  since_time=$(date -d '1 hour ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null)
  if command -v journalctl >/dev/null 2>&1; then
    # NUL bytes cannot be stored in a shell variable; drop them explicitly.
    log_text=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | tr -d '\000')
  elif [ -f /var/log/salt/master ]; then
    cutoff=$(date -d '1 hour ago' '+%Y-%m-%d %H:%M' 2>/dev/null)
    log_text=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | tr -d '\000')
  fi
  # grep -c prints "0" and exits 1 when nothing matches, hence "|| true".
  LOG_ERRORS_CACHE=$(grep -c "ERROR" <<< "$log_text" 2>/dev/null || true)
  LOG_CRITICAL_CACHE=$(grep -c "CRITICAL" <<< "$log_text" 2>/dev/null || true)
  # Auth failures and key events
  LOG_AUTH_FAILURES_CACHE=$(grep -ic "authentication denied\|failed to authenticate\|salt\.crypt.*denied" <<< "$log_text" 2>/dev/null || true)
  LOG_KEY_EVENTS_CACHE=$(grep -ic "salt\.key\|key.*accept\|key.*reject\|key.*denied\|new key" <<< "$log_text" 2>/dev/null || true)
}
# Count highstate-failure log lines from the last 24 hours and store the
# number in HIGHSTATE_FAILURES_CACHE.
# journalctl is used when available; otherwise /var/log/salt/master is
# filtered by a cutoff string; with neither source the count is "0".
cache_highstate_data() {
  local window_start file_cutoff
  local -r fail_pattern="highstate.*fail\|Highstate.*fail\|state.highstate.*False"
  window_start=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null)

  if ! command -v journalctl >/dev/null 2>&1; then
    if [ ! -f /var/log/salt/master ]; then
      HIGHSTATE_FAILURES_CACHE="0"
      return
    fi
    file_cutoff=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M' 2>/dev/null)
    HIGHSTATE_FAILURES_CACHE=$(
      awk -v cutoff="$file_cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null \
        | grep -c "$fail_pattern" 2>/dev/null || true
    )
    return
  fi

  HIGHSTATE_FAILURES_CACHE=$(
    journalctl -u salt-master --since "$window_start" --no-pager 2>/dev/null \
      | grep -c "$fail_pattern" 2>/dev/null || true
  )
}
# ---------------------------------------------------------------------------
# Key metrics
# ---------------------------------------------------------------------------
# Print the number of minion keys in one category.
# Arguments: $1 - accepted | denied | unaccepted | rejected
# Prints "0" when KEY_LIST_CACHE is empty or the category is not recognised;
# otherwise counts the entries of the matching directory under SALT_PKI_DIR.
get_key_count() {
  local category="$1" pki_subdir
  if [ -z "$KEY_LIST_CACHE" ]; then
    echo "0"
    return
  fi
  case "$category" in
    accepted)   pki_subdir="minions" ;;
    denied)     pki_subdir="minions_denied" ;;
    unaccepted) pki_subdir="minions_pre" ;;
    rejected)   pki_subdir="minions_rejected" ;;
    *)
      echo "0"
      return
      ;;
  esac
  find "$SALT_PKI_DIR/$pki_subdir/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
# Minion status metrics (plain-text output: one minion per line, prefixed "- ")
# ---------------------------------------------------------------------------
# Print how many lines of MINION_UP_CACHE start with "-" (one per minion),
# or "0" when the cache is empty.
get_minions_up() {
  if [ -n "$MINION_UP_CACHE" ]; then
    # grep -c exits 1 on a zero count; "|| true" keeps the status clean.
    echo "$MINION_UP_CACHE" | grep -c "^-" 2>/dev/null || true
  else
    echo "0"
  fi
}
# Print how many lines of MINION_DOWN_CACHE start with "-" (one per minion),
# or "0" when the cache is empty.
get_minions_down() {
  if [ -n "$MINION_DOWN_CACHE" ]; then
    # grep -c exits 1 on a zero count; "|| true" keeps the status clean.
    echo "$MINION_DOWN_CACHE" | grep -c "^-" 2>/dev/null || true
  else
    echo "0"
  fi
}
# ---------------------------------------------------------------------------
# Process metrics
# ---------------------------------------------------------------------------
# Print the PID of the oldest process whose command line contains
# "salt-master", or an empty line when there is none.
# The pattern rejects "salt-master-..." because a bare "salt-master" match
# also hits this exporter (salt-master-metrics.sh) and its subshells, which
# made the master look alive whenever the exporter itself was running.
# Other command lines that merely mention salt-master (for example
# "journalctl -u salt-master") still match, as they did before.
get_master_pid() {
  pgrep -o -f 'salt-master([^-]|$)' 2>/dev/null || echo ""
}
# Print the salt-master uptime in whole seconds, or "0" when no master PID is
# found or its /proc entry is missing.
# The elapsed time reported by ps ("etimes") is preferred. The mtime of
# /proc/<pid> is kept only as a fallback: it records when the kernel created
# that procfs inode, which is not guaranteed to be the process start time.
get_master_uptime_seconds() {
  local pid elapsed start_time
  pid=$(get_master_pid)
  if [ -z "$pid" ] || [ ! -d "/proc/$pid" ]; then
    echo "0"
    return
  fi
  elapsed=$(ps -p "$pid" -o etimes= 2>/dev/null | tr -d '[:space:]')
  if [[ "$elapsed" =~ ^[0-9]+$ ]]; then
    echo "$elapsed"
    return
  fi
  start_time=$(stat -c %Y "/proc/$pid" 2>/dev/null || echo "0")
  if [[ "$start_time" =~ ^[0-9]+$ ]] && [ "$start_time" -gt 0 ]; then
    echo $(( $(date +%s) - start_time ))
  else
    echo "0"
  fi
}
# Print the salt-master resident set size in bytes: the VmRSS value from
# /proc/<pid>/status multiplied by 1024. Prints "0" when there is no master
# PID, no status file, or no VmRSS line.
get_master_memory_bytes() {
  local master_pid resident_kb
  master_pid=$(get_master_pid)
  if [ -z "$master_pid" ] || [ ! -f "/proc/$master_pid/status" ]; then
    echo "0"
    return
  fi
  resident_kb=$(grep VmRSS "/proc/$master_pid/status" 2>/dev/null | awk '{print $2}')
  if [ -z "$resident_kb" ]; then
    echo "0"
    return
  fi
  echo $(( resident_kb * 1024 ))
}
# Print the salt-master %CPU as reported by ps, or "0" when there is no
# master PID or ps yields nothing.
# The value is captured first and defaulted afterwards: a "|| echo 0" placed
# after the pipeline only sees the exit status of tr, which succeeds even
# when ps fails, so it could never supply the fallback.
get_master_cpu_percent() {
  local pid cpu=""
  pid=$(get_master_pid)
  if [ -n "$pid" ]; then
    cpu=$(ps -p "$pid" -o %cpu --no-headers 2>/dev/null | tr -d ' ')
  fi
  echo "${cpu:-0}"
}
# Print the number of entries in /proc/<pid>/task for the salt-master PID,
# or "0" when there is no PID or no task directory.
get_master_thread_count() {
  local master_pid task_dir
  master_pid=$(get_master_pid)
  task_dir="/proc/$master_pid/task"
  if [ -z "$master_pid" ] || [ ! -d "$task_dir" ]; then
    echo "0"
    return
  fi
  find "$task_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# Print the number of entries in /proc/<pid>/fd for the salt-master PID,
# or "0" when there is no PID or no fd directory.
get_master_open_fds() {
  local master_pid fd_dir
  master_pid=$(get_master_pid)
  fd_dir="/proc/$master_pid/fd"
  if [ -z "$master_pid" ] || [ ! -d "$fd_dir" ]; then
    echo "0"
    return
  fi
  find "$fd_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# Print the number of processes whose command line contains "salt-master".
# "salt-master-..." is excluded so that this exporter (salt-master-metrics.sh)
# and the subshells it forks are not counted as master processes.
# pgrep -c prints 0 and exits 1 when nothing matches, hence "|| true".
get_salt_process_count() {
  pgrep -c -f 'salt-master([^-]|$)' 2>/dev/null || true
}
# ---------------------------------------------------------------------------
# Worker thread config
# ---------------------------------------------------------------------------
# Print the configured worker_threads value, or "5" when none is found.
# Lookup order: SALT_MASTER_CONFIG first, then the *.conf files in the
# master.d directory next to it (/etc/salt/master.d for the default path).
# Only the first "worker_threads:" line of a file is used and the value must
# be an integer, so duplicate or malformed entries cannot produce a
# multi-line or non-numeric metric value.
get_configured_workers() {
  local workers="" f include_dir
  include_dir="$(dirname "$SALT_MASTER_CONFIG")/master.d"
  for f in "$SALT_MASTER_CONFIG" "$include_dir"/*.conf; do
    [ -f "$f" ] || continue
    workers=$(awk '/^worker_threads:/ { print $2; exit }' "$f" 2>/dev/null)
    if [[ "$workers" =~ ^[0-9]+$ ]]; then
      break
    fi
    workers=""
  done
  echo "${workers:-5}"
}
# ---------------------------------------------------------------------------
# Job cache metrics
# ---------------------------------------------------------------------------
# Print the number of ".load.p" files under SALT_CACHE_DIR/jobs, or "0" when
# that directory does not exist.
get_job_cache_count() {
  local jobs_dir="$SALT_CACHE_DIR/jobs"
  [ -d "$jobs_dir" ] || { echo "0"; return; }
  find "$jobs_dir" -name ".load.p" -type f 2>/dev/null | wc -l
}
# Print the size in bytes of SALT_CACHE_DIR/jobs as reported by "du -sb",
# or "0" when that directory does not exist.
get_job_cache_size_bytes() {
  local jobs_dir="$SALT_CACHE_DIR/jobs"
  [ -d "$jobs_dir" ] || { echo "0"; return; }
  du -sb "$jobs_dir" 2>/dev/null | awk '{print $1}'
}
# Print how many lines of ACTIVE_JOBS_CACHE begin with a digit, or "0" when
# the cache is empty.
get_active_jobs() {
  if [ -n "$ACTIVE_JOBS_CACHE" ]; then
    # grep -c exits 1 on a zero count; "|| true" keeps the status clean.
    echo "$ACTIVE_JOBS_CACHE" | grep -c "^[0-9]" 2>/dev/null || true
  else
    echo "0"
  fi
}
# ---------------------------------------------------------------------------
# Event bus / IPC health
# ---------------------------------------------------------------------------
# Print "1" when SALT_RUN_DIR/master_event_pub.ipc exists and is a socket,
# otherwise "0".
get_event_pub_socket_exists() {
  local present="0"
  [ -S "$SALT_RUN_DIR/master_event_pub.ipc" ] && present="1"
  echo "$present"
}
# Print "1" when SALT_RUN_DIR/master_event_pull.ipc exists and is a socket,
# otherwise "0".
get_event_pull_socket_exists() {
  local present="0"
  [ -S "$SALT_RUN_DIR/master_event_pull.ipc" ] && present="1"
  echo "$present"
}
# ---------------------------------------------------------------------------
# Salt version
# ---------------------------------------------------------------------------
# Print the second field of the first line of "salt --version", or "unknown"
# when the command is missing, fails, or prints nothing usable.
# The output is captured before defaulting: a "|| echo unknown" after the
# pipeline only tests awk's exit status, which is 0 even when salt is not
# installed, so the fallback was unreachable and the version came out empty.
get_salt_version() {
  local version
  version=$(salt --version 2>/dev/null | awk 'NR == 1 { print $2 }')
  echo "${version:-unknown}"
}
# ---------------------------------------------------------------------------
# Minion cache staleness
# ---------------------------------------------------------------------------
# Print the number of direct entries in SALT_CACHE_DIR/minions, or "0" when
# that directory does not exist.
get_minion_cache_count() {
  local minion_cache="$SALT_CACHE_DIR/minions"
  [ -d "$minion_cache" ] || { echo "0"; return; }
  find "$minion_cache" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
# Syndic detection
# ---------------------------------------------------------------------------
# Print the number of direct entries in SALT_PKI_DIR/syndics, or "0" when
# that directory does not exist.
get_syndic_count() {
  local syndic_dir="$SALT_PKI_DIR/syndics"
  [ -d "$syndic_dir" ] || { echo "0"; return; }
  find "$syndic_dir" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
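# Per-minion last-seen timestamp (emitted for every cached minion; no >1h
# staleness filter is applied) plus the age of the oldest cache entry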
# ---------------------------------------------------------------------------
# Emit one salt_master_minion_last_seen_seconds series per directory under
# SALT_CACHE_DIR/minions (value: the directory's mtime as a Unix timestamp),
# followed by salt_master_minion_cache_oldest_seconds holding the largest
# "now - mtime" seen (0 when the directory is missing or empty).
generate_minion_last_seen_metrics() {
  local minions_root="$SALT_CACHE_DIR/minions"
  local now_epoch max_age=0 entry trimmed minion_id modified elapsed
  now_epoch=$(date +%s)
  if [ -d "$minions_root" ]; then
    for entry in "$minions_root"/*/; do
      [ -d "$entry" ] || continue
      # Directory name without the trailing slash and leading path.
      trimmed=${entry%/}
      minion_id=${trimmed##*/}
      # If stat fails, fall back to "now" so the entry counts as age 0.
      modified=$(stat -c %Y "$entry" 2>/dev/null || echo "$now_epoch")
      elapsed=$(( now_epoch - modified ))
      if (( elapsed > max_age )); then
        max_age=$elapsed
      fi
      printf 'salt_master_minion_last_seen_seconds{minion="%s"} %s\n' "$minion_id" "$modified"
    done
  fi
  printf 'salt_master_minion_cache_oldest_seconds %s\n' "$max_age"
}
# ---------------------------------------------------------------------------
# Minion version drift
# ---------------------------------------------------------------------------
# Print the number of "- " lines that follow the first "Up to date:" header
# in VERSIONS_CACHE. Counting stops at the next line that starts with a
# letter. Prints "0" when the cache is empty.
get_version_match_count() {
  local matched=0 inside=0
  if [ -n "$VERSIONS_CACHE" ]; then
    while IFS= read -r line; do
      if [[ "$line" =~ ^"Up to date:" ]]; then
        inside=1
      elif [ "$inside" -eq 1 ]; then
        if [[ "$line" =~ ^"- " ]]; then
          matched=$((matched + 1))
        elif [[ "$line" =~ ^[A-Za-z] ]]; then
          # A new section header ends the "Up to date:" block.
          break
        fi
      fi
    done <<< "$VERSIONS_CACHE"
  fi
  echo "$matched"
}
# Print the number of "- " lines in VERSIONS_CACHE that are not inside an
# "Up to date:" section. Any other line starting with a letter is treated as
# a section header that ends the "Up to date:" section. Prints "0" when the
# cache is empty.
get_version_mismatch_count() {
  local outdated=0 in_current=0
  if [ -n "$VERSIONS_CACHE" ]; then
    while IFS= read -r line; do
      if [[ "$line" =~ ^"Up to date:" ]]; then
        in_current=1
      elif [[ "$line" =~ ^[A-Za-z] ]]; then
        in_current=0
      elif [ "$in_current" -eq 0 ] && [[ "$line" =~ ^"- " ]]; then
        outdated=$((outdated + 1))
      fi
    done <<< "$VERSIONS_CACHE"
  fi
  echo "$outdated"
}
# ---------------------------------------------------------------------------
# Jobs completed (1h and 24h)
# ---------------------------------------------------------------------------
# Print the number of ".load.p" files under SALT_CACHE_DIR/jobs modified
# within the last 60 minutes, or "0" when the directory does not exist.
get_jobs_completed_1h() {
  local jobs_dir="$SALT_CACHE_DIR/jobs"
  [ -d "$jobs_dir" ] || { echo "0"; return; }
  find "$jobs_dir" -name ".load.p" -type f -mmin -60 2>/dev/null | wc -l
}
# Print the number of ".load.p" files under SALT_CACHE_DIR/jobs modified
# within the last 1440 minutes, or "0" when the directory does not exist.
get_jobs_completed_24h() {
  local jobs_dir="$SALT_CACHE_DIR/jobs"
  [ -d "$jobs_dir" ] || { echo "0"; return; }
  find "$jobs_dir" -name ".load.p" -type f -mmin -1440 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
# Failed jobs (24h)
# ---------------------------------------------------------------------------
# Print the number of JOB_LIST_CACHE lines that match (case-insensitively)
# "result.*false" or "retcode: <non-zero digit>", or "0" when the cache is
# empty. This counts matching lines, not distinct jobs.
get_jobs_failed_24h() {
  if [ -n "$JOB_LIST_CACHE" ]; then
    # grep -c exits 1 on a zero count; "|| true" keeps the status clean.
    echo "$JOB_LIST_CACHE" | grep -ic "result.*false\|retcode: [1-9]" 2>/dev/null || true
  else
    echo "0"
  fi
}
# ---------------------------------------------------------------------------
# Jobs by function (top 10, last 24h)
# ---------------------------------------------------------------------------
# Emit salt_master_jobs_by_function series for the ten most frequent function
# names found on "Function:" lines of JOB_LIST_CACHE, most frequent first.
# Emits nothing when the cache is empty.
generate_jobs_by_function_metrics() {
  if [ -n "$JOB_LIST_CACHE" ]; then
    # Last ": "-separated field of each Function line -> frequency table.
    echo "$JOB_LIST_CACHE" \
      | grep -i "Function:" 2>/dev/null \
      | awk -F': *' '{print $NF}' \
      | sort \
      | uniq -c \
      | sort -rn \
      | head -10 \
      | while read -r count func; do
          [ -n "$func" ] && printf 'salt_master_jobs_by_function{function="%s"} %s\n' "$func" "$count"
        done
  fi
}
# ---------------------------------------------------------------------------
# Job cache single-pass scan (collects all per-function metrics at once)
# Max 200 jobs to avoid slow scans on busy masters.
# ---------------------------------------------------------------------------
# Upper bound on the number of job load files inspected per run.
JOB_CACHE_SCAN_MAX=200
# Scan up to JOB_CACHE_SCAN_MAX ".load.p" files modified in the last 24 hours
# under SALT_CACHE_DIR/jobs and emit:
#   salt_master_expected_responses_total  - minion directories seen per function
#   salt_master_function_responses_total  - returns per function, split by success
#   salt_master_scheduled_job_return_total - per function/minion/state for jobs
#                                            whose load file mentions "schedule"
# The function name comes from the JOB_LIST_CACHE entry that follows the job
# id (parent directory name + job directory name) and otherwise from the
# first known "module.function" string in the load file; "unknown" if neither
# yields one. A return counts as failed when the strings of its return.p
# contain false, traceback, error or exception (case-insensitive).
# Emits nothing when the jobs directory is missing or has no recent jobs.
# All working variables are local so the caller's variables (func, key, ...)
# are not clobbered.
generate_job_cache_metrics() {
  [ -d "$SALT_CACHE_DIR/jobs" ] || return 0
  local -A func_expected=() func_success=() func_failure=()
  local -A sched_key_success=() sched_key_failure=()
  local load_files load_file job_dir func_name full_jid load_strings
  local is_scheduled state_name minion_count minion_dir minion_name
  local ret_content is_fail skey func key minion state all_funcs all_keys

  load_files=$(find "$SALT_CACHE_DIR/jobs" -name ".load.p" -type f -mmin -1440 2>/dev/null \
    | head -n "$JOB_CACHE_SCAN_MAX")
  [ -n "$load_files" ] || return 0

  while IFS= read -r load_file; do
    [ -z "$load_file" ] && continue
    job_dir=$(dirname "$load_file")
    func_name=""
    if [ -n "$JOB_LIST_CACHE" ]; then
      full_jid="$(basename "$(dirname "$job_dir")")$(basename "$job_dir")"
      func_name=$(echo "$JOB_LIST_CACHE" | grep -A5 -F -- "$full_jid" 2>/dev/null \
        | grep -i "Function:" | head -1 | sed 's/.*Function:[[:space:]]*//' | tr -d '[:space:]')
    fi
    # The load file text is needed for schedule detection whichever way the
    # function name was resolved, so read it exactly once per job.
    load_strings=$(timeout 2 strings "$load_file" 2>/dev/null || true)
    if [ -z "$func_name" ]; then
      func_name=$(echo "$load_strings" | grep -oE '(cmd\.[a-z_]+|state\.[a-z_]+|test\.[a-z_]+|grains\.[a-z_]+|pillar\.[a-z_]+|saltutil\.[a-z_]+|pkg\.[a-z_]+|service\.[a-z_]+|file\.[a-z_]+|sys\.[a-z_]+)' | head -1)
    fi
    [ -z "$func_name" ] && func_name="unknown"

    is_scheduled=0
    state_name=""
    if echo "$load_strings" | grep -qi "schedule" 2>/dev/null; then
      is_scheduled=1
      state_name=$(echo "$load_strings" | grep -oE '\b[a-z_]+\.(sls|init)\b' | head -1 | sed 's/\.sls$//' | sed 's/\.init$//')
    fi

    minion_count=0
    for minion_dir in "$job_dir"/*/; do
      [ -d "$minion_dir" ] || continue
      minion_count=$((minion_count + 1))
      minion_name=$(basename "$minion_dir")
      [ -f "$minion_dir/return.p" ] || continue
      ret_content=$(timeout 2 strings "$minion_dir/return.p" 2>/dev/null || true)
      is_fail=0
      if echo "$ret_content" | grep -qiE "false|traceback|error|exception" 2>/dev/null; then
        is_fail=1
      fi
      if [ "$is_fail" -eq 1 ]; then
        func_failure["$func_name"]=$(( ${func_failure["$func_name"]:-0} + 1 ))
      else
        func_success["$func_name"]=$(( ${func_success["$func_name"]:-0} + 1 ))
      fi
      if [ "$is_scheduled" -eq 1 ]; then
        skey="${func_name}|${minion_name}|${state_name}"
        if [ "$is_fail" -eq 1 ]; then
          sched_key_failure["$skey"]=$(( ${sched_key_failure["$skey"]:-0} + 1 ))
        else
          sched_key_success["$skey"]=$(( ${sched_key_success["$skey"]:-0} + 1 ))
        fi
      fi
    done
    if [ "$minion_count" -gt 0 ]; then
      func_expected["$func_name"]=$(( ${func_expected["$func_name"]:-0} + minion_count ))
    fi
  done <<< "$load_files"

  for func in "${!func_expected[@]}"; do
    echo "salt_master_expected_responses_total{function=\"${func}\",state=\"\"} ${func_expected[$func]}"
  done

  # Union of functions that had at least one success or failure.
  all_funcs=$(printf '%s\n' "${!func_success[@]}" "${!func_failure[@]}" | sort -u)
  while IFS= read -r func; do
    [ -z "$func" ] && continue
    echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"true\"} ${func_success[$func]:-0}"
    echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"false\"} ${func_failure[$func]:-0}"
  done <<< "$all_funcs"

  # Scheduled-job keys have the form "function|minion|state".
  all_keys=$(printf '%s\n' "${!sched_key_success[@]}" "${!sched_key_failure[@]}" | sort -u)
  while IFS= read -r key; do
    [ -z "$key" ] && continue
    func=$(echo "$key" | cut -d'|' -f1)
    minion=$(echo "$key" | cut -d'|' -f2)
    state=$(echo "$key" | cut -d'|' -f3)
    echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"true\"} ${sched_key_success[$key]:-0}"
    echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"false\"} ${sched_key_failure[$key]:-0}"
  done <<< "$all_keys"
  return 0
}
# ---------------------------------------------------------------------------
# New jobs by function with success/failure (from job list text output)
# ---------------------------------------------------------------------------
# Emit salt_master_new_job_total{function,state="",success} from
# JOB_LIST_CACHE.
# Every "Function:" line starts a new job for that function. A following line
# matching "Result:.*False" or "retcode: <non-zero digit>" marks that job as
# failed. A job is counted as failed at most once even when several such
# lines appear, so the success count (total - failed) can never go negative.
# Emits nothing when the cache is empty. Working variables are local.
generate_new_job_metrics() {
  [ -n "$JOB_LIST_CACHE" ] || return 0
  local -A func_total=() func_failed=()
  local line func rest current_func="" job_failed=0
  local total failed succeeded

  while IFS= read -r line; do
    if [[ "$line" =~ Function: ]]; then
      # Text after the last "Function:" with all whitespace removed.
      rest=${line##*Function:}
      current_func=${rest//[[:space:]]/}
      job_failed=0
      if [ -n "$current_func" ]; then
        func_total["$current_func"]=$(( ${func_total["$current_func"]:-0} + 1 ))
      fi
    fi
    if [[ "$line" =~ Result:.*False ]] || [[ "$line" =~ retcode:\ [1-9] ]]; then
      if [ -n "$current_func" ] && [ "$job_failed" -eq 0 ]; then
        func_failed["$current_func"]=$(( ${func_failed["$current_func"]:-0} + 1 ))
        job_failed=1
      fi
    fi
  done <<< "$JOB_LIST_CACHE"

  for func in "${!func_total[@]}"; do
    total=${func_total[$func]}
    failed=${func_failed[$func]:-0}
    succeeded=$((total - failed))
    echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"true\"} ${succeeded}"
    echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"false\"} ${failed}"
  done
  return 0
}
# ---------------------------------------------------------------------------
# ZeroMQ port status
# ---------------------------------------------------------------------------
# Print "1" when the output of "ss -tlnp" contains ":<port> ", else "0".
# Arguments: $1 - TCP port number.
get_port_listening() {
  local port="$1" listening="0"
  ss -tlnp 2>/dev/null | grep -q ":${port} " 2>/dev/null && listening="1"
  echo "$listening"
}
# ---------------------------------------------------------------------------
# Cache disk usage and inode count
# ---------------------------------------------------------------------------
# Print the size in bytes of SALT_CACHE_DIR as reported by "du -sb", or "0"
# when the directory does not exist.
get_cache_disk_used_bytes() {
  [ -d "$SALT_CACHE_DIR" ] || { echo "0"; return; }
  du -sb "$SALT_CACHE_DIR" 2>/dev/null | awk '{print $1}'
}
# Print the fourth column of the last line of "df -B1 SALT_CACHE_DIR" (the
# available bytes on that filesystem), or "0" when the directory does not
# exist.
get_cache_disk_available_bytes() {
  [ -d "$SALT_CACHE_DIR" ] || { echo "0"; return; }
  df -B1 "$SALT_CACHE_DIR" 2>/dev/null \
    | awk '{ avail = $4 } END { if (NR > 0) print avail }'
}
# Print the number of paths "find" reports under SALT_CACHE_DIR (the
# directory itself included), or "0" when the directory does not exist.
get_cache_inode_count() {
  [ -d "$SALT_CACHE_DIR" ] || { echo "0"; return; }
  find "$SALT_CACHE_DIR" 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
# File server cache size
# ---------------------------------------------------------------------------
# Print the size in bytes of SALT_CACHE_DIR/file_lists as reported by
# "du -sb", or "0" when that directory does not exist.
get_fileserver_cache_size_bytes() {
  local file_lists_dir="$SALT_CACHE_DIR/file_lists"
  [ -d "$file_lists_dir" ] || { echo "0"; return; }
  du -sb "$file_lists_dir" 2>/dev/null | awk '{print $1}'
}
# ---------------------------------------------------------------------------
# Configuration values
# ---------------------------------------------------------------------------
# Print the value of a top-level scalar setting from the master config.
# Arguments: $1 - configuration key
#            $2 - default printed when no value is found
# Lookup order:
#   1. the first "<key>:" line in SALT_MASTER_CONFIG
#   2. the first "<key>:" line in each *.conf of the master.d directory next
#      to it (/etc/salt/master.d for the default path)
#   3. "salt-run config.get <key>" (bounded by SALT_CMD_TIMEOUT); an answer
#      of "None" counts as not set
# A quoted value is unquoted, an unquoted value loses any trailing
# " # comment", and all remaining whitespace is removed. Without this,
# "timeout: 5 # seconds" produced "5#seconds" and quoted paths kept their
# quotes.
# NOTE(review): the key is interpolated into regular expressions; callers in
# this file pass fixed identifiers only.
get_config_value() {
  local key="$1" default="$2"
  local val="" raw f include_dir
  include_dir="$(dirname "$SALT_MASTER_CONFIG")/master.d"
  for f in "$SALT_MASTER_CONFIG" "$include_dir"/*.conf; do
    [ -f "$f" ] || continue
    raw=$(grep -E "^[[:space:]]*${key}:" "$f" 2>/dev/null | head -1 \
      | sed "s/^[[:space:]]*${key}:[[:space:]]*//")
    case "$raw" in
      \"*\"*)
        raw=${raw#\"}
        raw=${raw%%\"*}
        ;;
      \'*\'*)
        raw=${raw#\'}
        raw=${raw%%\'*}
        ;;
      \#*)
        # "key: # comment" carries no value.
        raw=""
        ;;
      *)
        raw=${raw%%[[:space:]]#*}
        ;;
    esac
    val=${raw//[[:space:]]/}
    [ -n "$val" ] && break
  done
  if [ -z "$val" ]; then
    val=$(timeout "$SALT_CMD_TIMEOUT" salt-run config.get "$key" 2>/dev/null | tr -d '[:space:]')
    [ "$val" = "None" ] && val=""
  fi
  echo "${val:-$default}"
}
# Print "1" when the configuration value for key $1 (default $2) is, ignoring
# case, "true", "yes" or "1"; print "0" for anything else.
get_config_bool() {
  local setting lowered
  setting=$(get_config_value "$1" "$2")
  lowered=${setting,,}
  if [[ "$lowered" == "true" || "$lowered" == "yes" || "$lowered" == "1" ]]; then
    echo "1"
  else
    echo "0"
  fi
}
# Print the "keep_jobs" configuration value, defaulting to "24".
get_config_keep_jobs() {
  local -r setting="keep_jobs" fallback="24"
  get_config_value "${setting}" "${fallback}"
}
# Print "1" or "0" for the "master_stats" setting via get_config_bool,
# with "false" as the default.
get_config_master_stats_enabled() {
  local -r setting="master_stats" fallback="false"
  get_config_bool "${setting}" "${fallback}"
}
# ---------------------------------------------------------------------------
# Salt API status
# ---------------------------------------------------------------------------
# Print "1" when pgrep finds a process whose command line contains
# "salt-api", otherwise "0".
get_salt_api_running() {
  local running="0"
  pgrep -f "salt-api" >/dev/null 2>&1 && running="1"
  echo "$running"
}
# Print the port number taken from the local-address column of the first
# "ss -tlnp" line that mentions "salt-api". Prints nothing when there is no
# such line.
get_salt_api_port() {
  ss -tlnp 2>/dev/null \
    | awk '/salt-api/ { print $4 }' \
    | grep -oE '[0-9]+$' \
    | head -1
}
# ---------------------------------------------------------------------------
# File roots size
# ---------------------------------------------------------------------------
# Print the size in bytes ("du -sb") of the Salt file roots directory, or "0"
# when no usable directory is found.
# Resolution order:
#   1. the "file_roots" value from get_config_value, if it is a directory
#   2. the first absolute path embedded in that value, if it is a directory
#      (file_roots is a mapping, so the looked-up value can look like
#      "base:-/srv/salt" once whitespace is stripped)
#   3. /srv/salt, if it exists
# Previously a non-empty value that was not itself a directory gave "0" and
# skipped the /srv/salt fallback.
get_file_roots_size_bytes() {
  local configured roots_dir=""
  configured=$(get_config_value "file_roots" "")
  if [ -n "$configured" ]; then
    if [ -d "$configured" ]; then
      roots_dir="$configured"
    else
      roots_dir=$(grep -oE "/[^],}\"'[:space:]]+" <<< "$configured" | head -n 1)
    fi
  fi
  if [ -z "$roots_dir" ] || [ ! -d "$roots_dir" ]; then
    roots_dir=""
    if [ -d "/srv/salt" ]; then
      roots_dir="/srv/salt"
    fi
  fi
  if [ -n "$roots_dir" ]; then
    du -sb "$roots_dir" 2>/dev/null | awk '{print $1}'
  else
    echo "0"
  fi
}
# ---------------------------------------------------------------------------
# Master log file size
# ---------------------------------------------------------------------------
# Print the size in bytes of the master log file: the "log_file" setting,
# defaulting to /var/log/salt/master. Prints "0" when the file does not exist
# or stat fails.
get_master_log_size_bytes() {
  local master_log
  master_log=$(get_config_value "log_file" "/var/log/salt/master")
  if [ ! -f "$master_log" ]; then
    echo "0"
    return
  fi
  stat -c %s "$master_log" 2>/dev/null || echo "0"
}
# ---------------------------------------------------------------------------
# Pillar roots size
# ---------------------------------------------------------------------------
# Print the size in bytes ("du -sb") of the Salt pillar roots directory, or
# "0" when no usable directory is found.
# Resolution order:
#   1. the "pillar_roots" value from get_config_value, if it is a directory
#   2. the first absolute path embedded in that value, if it is a directory
#      (pillar_roots is a mapping, so the looked-up value can look like
#      "base:-/srv/pillar" once whitespace is stripped)
#   3. /srv/pillar, if it exists
# Previously a non-empty value that was not itself a directory gave "0" and
# skipped the /srv/pillar fallback.
get_pillar_roots_size_bytes() {
  local configured pillar_dir=""
  configured=$(get_config_value "pillar_roots" "")
  if [ -n "$configured" ]; then
    if [ -d "$configured" ]; then
      pillar_dir="$configured"
    else
      pillar_dir=$(grep -oE "/[^],}\"'[:space:]]+" <<< "$configured" | head -n 1)
    fi
  fi
  if [ -z "$pillar_dir" ] || [ ! -d "$pillar_dir" ]; then
    pillar_dir=""
    if [ -d "/srv/pillar" ]; then
      pillar_dir="/srv/pillar"
    fi
  fi
  if [ -n "$pillar_dir" ]; then
    du -sb "$pillar_dir" 2>/dev/null | awk '{print $1}'
  else
    echo "0"
  fi
}
# ---------------------------------------------------------------------------
# Metric generation
# ---------------------------------------------------------------------------
generate_metrics() {
local start_time
start_time=$(date +%s)
local master_pid master_running
master_pid=$(get_master_pid)
if [ -n "$master_pid" ]; then
master_running=1
else
master_running=0
fi
local salt_version
salt_version=$(get_salt_version)
local keys_accepted keys_denied keys_unaccepted keys_rejected
keys_accepted=$(find "$SALT_PKI_DIR/minions/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
keys_denied=$(find "$SALT_PKI_DIR/minions_denied/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
keys_unaccepted=$(find "$SALT_PKI_DIR/minions_pre/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
keys_rejected=$(find "$SALT_PKI_DIR/minions_rejected/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l)
local keys_total=$((keys_accepted + keys_denied + keys_unaccepted + keys_rejected))
local minions_up minions_down
minions_up=$(get_minions_up)
minions_down=$(get_minions_down)
local master_uptime master_memory master_cpu master_threads master_fds salt_procs
master_uptime=$(get_master_uptime_seconds)
master_memory=$(get_master_memory_bytes)
master_cpu=$(get_master_cpu_percent)
master_threads=$(get_master_thread_count)
master_fds=$(get_master_open_fds)
salt_procs=$(get_salt_process_count)
local configured_workers
configured_workers=$(get_configured_workers)
local job_cache_count job_cache_size active_jobs
job_cache_count=$(get_job_cache_count)
job_cache_size=$(get_job_cache_size_bytes)
active_jobs=$(get_active_jobs)
local event_pub_socket event_pull_socket
event_pub_socket=$(get_event_pub_socket_exists)
event_pull_socket=$(get_event_pull_socket_exists)
local minion_cache_count syndic_count
minion_cache_count=$(get_minion_cache_count)
syndic_count=$(get_syndic_count)
local version_match version_mismatch
version_match=$(get_version_match_count)
version_mismatch=$(get_version_mismatch_count)
local jobs_1h jobs_24h jobs_failed_24h
jobs_1h=$(get_jobs_completed_1h)
jobs_24h=$(get_jobs_completed_24h)
jobs_failed_24h=$(get_jobs_failed_24h)
local port_4505 port_4506
port_4505=$(get_port_listening 4505)
port_4506=$(get_port_listening 4506)
local cache_disk_used cache_disk_avail cache_inode_count
cache_disk_used=$(get_cache_disk_used_bytes)
cache_disk_avail=$(get_cache_disk_available_bytes)
cache_inode_count=$(get_cache_inode_count)
local fileserver_cache_size
fileserver_cache_size=$(get_fileserver_cache_size_bytes)
local config_keep_jobs config_master_stats
config_keep_jobs=$(get_config_keep_jobs)
config_master_stats=$(get_config_master_stats_enabled)
local config_state_events config_presence_events config_timeout config_job_cache
config_state_events=$(get_config_bool "state_events" "false")
config_presence_events=$(get_config_bool "presence_events" "false")
config_timeout=$(get_config_value "timeout" "5")
config_job_cache=$(get_config_bool "job_cache" "true")
local config_publish_port config_ret_port
config_publish_port=$(get_config_value "publish_port" "4505")
config_ret_port=$(get_config_value "ret_port" "4506")
local salt_api_running salt_api_port
salt_api_running=$(get_salt_api_running)
salt_api_port=$(get_salt_api_port)
local file_roots_size pillar_roots_size master_log_size
file_roots_size=$(get_file_roots_size_bytes)
pillar_roots_size=$(get_pillar_roots_size_bytes)
master_log_size=$(get_master_log_size_bytes)
cat <<EOF
# HELP salt_master_info Salt master info and version
# TYPE salt_master_info gauge
salt_master_info{version="${salt_version}",exporter_version="${SCRIPT_VERSION}"} 1
# HELP salt_master_up Whether the salt-master process is running
# TYPE salt_master_up gauge
salt_master_up $master_running
# HELP salt_master_uptime_seconds Salt master process uptime in seconds
# TYPE salt_master_uptime_seconds gauge
salt_master_uptime_seconds $master_uptime
# HELP salt_master_memory_bytes Salt master RSS memory usage in bytes
# TYPE salt_master_memory_bytes gauge
salt_master_memory_bytes $master_memory
# HELP salt_master_cpu_percent Salt master CPU usage percentage
# TYPE salt_master_cpu_percent gauge
salt_master_cpu_percent $master_cpu
# HELP salt_master_threads Salt master thread count
# TYPE salt_master_threads gauge
salt_master_threads $master_threads
# HELP salt_master_open_fds Salt master open file descriptors
# TYPE salt_master_open_fds gauge
salt_master_open_fds $master_fds
# HELP salt_master_processes Total salt-master process count (workers + parent)
# TYPE salt_master_processes gauge
salt_master_processes $salt_procs
# HELP salt_master_worker_threads_configured Configured worker_threads value
# TYPE salt_master_worker_threads_configured gauge
salt_master_worker_threads_configured $configured_workers
# HELP salt_master_keys_total Total minion keys by status
# TYPE salt_master_keys_total gauge
salt_master_keys_total{status="accepted"} $keys_accepted
salt_master_keys_total{status="denied"} $keys_denied
salt_master_keys_total{status="unaccepted"} $keys_unaccepted
salt_master_keys_total{status="rejected"} $keys_rejected
salt_master_keys_total{status="all"} $keys_total
# HELP salt_master_minions_up Minions currently responding
# TYPE salt_master_minions_up gauge
salt_master_minions_up $minions_up
# HELP salt_master_minions_down Minions currently not responding
# TYPE salt_master_minions_down gauge
salt_master_minions_down $minions_down
# HELP salt_master_minions_connectivity_ratio Ratio of up minions to accepted keys
# TYPE salt_master_minions_connectivity_ratio gauge
EOF
if [ "$keys_accepted" -gt 0 ]; then
local ratio
ratio=$(awk "BEGIN {printf \"%.4f\", $minions_up / $keys_accepted}")
echo "salt_master_minions_connectivity_ratio $ratio"
else
echo "salt_master_minions_connectivity_ratio 0"
fi
cat <<EOF
# HELP salt_master_job_cache_count Number of jobs in the job cache
# TYPE salt_master_job_cache_count gauge
salt_master_job_cache_count $job_cache_count
# HELP salt_master_job_cache_size_bytes Size of the job cache directory in bytes
# TYPE salt_master_job_cache_size_bytes gauge
salt_master_job_cache_size_bytes ${job_cache_size:-0}
# HELP salt_master_active_jobs Currently running jobs
# TYPE salt_master_active_jobs gauge
salt_master_active_jobs $active_jobs
# HELP salt_master_event_socket IPC event socket availability (1=exists, 0=missing)
# TYPE salt_master_event_socket gauge
salt_master_event_socket{socket="pub"} $event_pub_socket
salt_master_event_socket{socket="pull"} $event_pull_socket
# HELP salt_master_minion_cache_count Minions in the master cache directory
# TYPE salt_master_minion_cache_count gauge
salt_master_minion_cache_count $minion_cache_count
# HELP salt_master_syndic_count Number of connected syndics
# TYPE salt_master_syndic_count gauge
salt_master_syndic_count $syndic_count
# HELP salt_master_minion_version_match Minions matching master version
# TYPE salt_master_minion_version_match gauge
salt_master_minion_version_match $version_match
# HELP salt_master_minion_version_mismatch Minions not matching master version
# TYPE salt_master_minion_version_mismatch gauge
salt_master_minion_version_mismatch $version_mismatch
# HELP salt_master_jobs_completed Jobs completed in time period
# TYPE salt_master_jobs_completed gauge
salt_master_jobs_completed{period="1h"} $jobs_1h
salt_master_jobs_completed{period="24h"} $jobs_24h
# HELP salt_master_jobs_failed_24h Jobs with non-zero retcode in last 24h
# TYPE salt_master_jobs_failed_24h gauge
salt_master_jobs_failed_24h $jobs_failed_24h
# HELP salt_master_port_listening ZeroMQ port listening status (1=listening, 0=not)
# TYPE salt_master_port_listening gauge
salt_master_port_listening{port="4505"} $port_4505
salt_master_port_listening{port="4506"} $port_4506
# HELP salt_master_cache_disk_used_bytes Salt cache directory disk usage in bytes
# TYPE salt_master_cache_disk_used_bytes gauge
salt_master_cache_disk_used_bytes ${cache_disk_used:-0}
# HELP salt_master_cache_disk_available_bytes Available disk space on cache partition in bytes
# TYPE salt_master_cache_disk_available_bytes gauge
salt_master_cache_disk_available_bytes ${cache_disk_avail:-0}
# HELP salt_master_cache_inode_count Total file and directory count in salt cache
# TYPE salt_master_cache_inode_count gauge
salt_master_cache_inode_count ${cache_inode_count:-0}
# HELP salt_master_log_errors_1h Salt master ERROR log entries in last 1h
# TYPE salt_master_log_errors_1h gauge
salt_master_log_errors_1h ${LOG_ERRORS_CACHE:-0}
# HELP salt_master_log_critical_1h Salt master CRITICAL log entries in last 1h
# TYPE salt_master_log_critical_1h gauge
salt_master_log_critical_1h ${LOG_CRITICAL_CACHE:-0}
# HELP salt_master_fileserver_cache_size_bytes Size of file_lists cache in bytes
# TYPE salt_master_fileserver_cache_size_bytes gauge
salt_master_fileserver_cache_size_bytes ${fileserver_cache_size:-0}
# HELP salt_master_config_keep_jobs Configured keep_jobs value (hours)
# TYPE salt_master_config_keep_jobs gauge
salt_master_config_keep_jobs $config_keep_jobs
# HELP salt_master_config_master_stats_enabled Whether master_stats is enabled (1=yes, 0=no)
# TYPE salt_master_config_master_stats_enabled gauge
salt_master_config_master_stats_enabled $config_master_stats
# HELP salt_master_config_state_events Whether state_events is enabled (1=yes, 0=no)
# TYPE salt_master_config_state_events gauge
salt_master_config_state_events $config_state_events
# HELP salt_master_config_presence_events Whether presence_events is enabled (1=yes, 0=no)
# TYPE salt_master_config_presence_events gauge
salt_master_config_presence_events $config_presence_events
# HELP salt_master_config_timeout Default salt command timeout (seconds)
# TYPE salt_master_config_timeout gauge
salt_master_config_timeout $config_timeout
# HELP salt_master_config_job_cache Whether job_cache is enabled (1=yes, 0=no)
# TYPE salt_master_config_job_cache gauge
salt_master_config_job_cache $config_job_cache
# HELP salt_master_config_publish_port Configured publish port
# TYPE salt_master_config_publish_port gauge
salt_master_config_publish_port $config_publish_port
# HELP salt_master_config_ret_port Configured return port
# TYPE salt_master_config_ret_port gauge
salt_master_config_ret_port $config_ret_port
# HELP salt_master_highstate_failures_24h Highstate failures detected in last 24h
# TYPE salt_master_highstate_failures_24h gauge
salt_master_highstate_failures_24h ${HIGHSTATE_FAILURES_CACHE:-0}
# HELP salt_master_log_auth_failures_1h Authentication failure events in last 1h
# TYPE salt_master_log_auth_failures_1h gauge
salt_master_log_auth_failures_1h ${LOG_AUTH_FAILURES_CACHE:-0}
# HELP salt_master_log_key_events_1h Key management events in last 1h
# TYPE salt_master_log_key_events_1h gauge
salt_master_log_key_events_1h ${LOG_KEY_EVENTS_CACHE:-0}
# HELP salt_master_salt_api_running Whether salt-api process is running (1=yes, 0=no)
# TYPE salt_master_salt_api_running gauge
salt_master_salt_api_running $salt_api_running
# HELP salt_master_salt_api_port Salt API listening port (0 if not running)
# TYPE salt_master_salt_api_port gauge
salt_master_salt_api_port ${salt_api_port:-0}
# HELP salt_master_file_roots_size_bytes Total size of file_roots directory
# TYPE salt_master_file_roots_size_bytes gauge
salt_master_file_roots_size_bytes ${file_roots_size:-0}
# HELP salt_master_pillar_roots_size_bytes Total size of pillar_roots directory
# TYPE salt_master_pillar_roots_size_bytes gauge
salt_master_pillar_roots_size_bytes ${pillar_roots_size:-0}
# HELP salt_master_log_file_size_bytes Size of the salt master log file
# TYPE salt_master_log_file_size_bytes gauge
salt_master_log_file_size_bytes ${master_log_size:-0}
EOF
# Per-minion last-seen (stale only)
echo "# HELP salt_master_minion_last_seen_seconds Last-seen cache mtime per minion (unix epoch)"
echo "# TYPE salt_master_minion_last_seen_seconds gauge"
echo ""
echo "# HELP salt_master_minion_cache_oldest_seconds Age of the stalest minion cache in seconds"
echo "# TYPE salt_master_minion_cache_oldest_seconds gauge"
generate_minion_last_seen_metrics
# Jobs by function (top 10, last 24h)
echo ""
echo "# HELP salt_master_jobs_by_function Job count by function in last 24h (top 10)"
echo "# TYPE salt_master_jobs_by_function gauge"
generate_jobs_by_function_metrics
# Per-function job cache metrics (single-pass scan)
echo ""
echo "# HELP salt_master_expected_responses_total Expected minion responses per function (24h gauge)"
echo "# TYPE salt_master_expected_responses_total gauge"
echo "# HELP salt_master_function_responses_total Minion responses per function with success status (24h gauge)"
echo "# TYPE salt_master_function_responses_total gauge"
echo "# HELP salt_master_scheduled_job_return_total Scheduled job returns per function/minion/state (24h gauge)"
echo "# TYPE salt_master_scheduled_job_return_total gauge"
generate_job_cache_metrics
# New jobs by function with success/failure (24h, from job list)
echo ""
echo "# HELP salt_master_new_job_total Jobs by function with success status (24h gauge)"
echo "# TYPE salt_master_new_job_total gauge"
generate_new_job_metrics
# Highstate job count from job list (based on Function: state.highstate)
echo ""
echo "# HELP salt_master_highstate_jobs_24h Count of highstate jobs in last 24h"
echo "# TYPE salt_master_highstate_jobs_24h gauge"
if [ -n "$JOB_LIST_CACHE" ]; then
local hs_count
hs_count=$(echo "$JOB_LIST_CACHE" | grep -ic "Function:.*state\.highstate" 2>/dev/null || true)
echo "salt_master_highstate_jobs_24h ${hs_count:-0}"
else
echo "salt_master_highstate_jobs_24h 0"
fi
cat <<EOF
# HELP salt_master_scrape_timestamp_seconds Unix timestamp of metric generation
# TYPE salt_master_scrape_timestamp_seconds gauge
salt_master_scrape_timestamp_seconds $(date +%s)
# HELP salt_master_exporter_duration_seconds Time to generate all metrics
# TYPE salt_master_exporter_duration_seconds gauge
salt_master_exporter_duration_seconds $(($(date +%s) - start_time))
EOF
echo ""
}
# ---------------------------------------------------------------------------
# HTTP server
# ---------------------------------------------------------------------------
# Serve the exporter over HTTP with netcat, handling one connection per loop
# iteration.
#
# Globals:   HTTP_PORT (read), SCRIPT_VERSION (read)
# Outputs:   startup/error messages on stderr; HTTP responses go to the client
# Returns:   1 if the request FIFO cannot be created; otherwise loops forever
#
# nc's stdout (the bytes received from the client) is fed back to the request
# handler through a private FIFO. Without that loop-back the handler's `read`
# would consume the script's own stdin instead of the client's request line,
# so the requested path could never be inspected.
#
# NOTE(review): this installs an EXIT trap to remove the FIFO directory, which
# replaces any EXIT trap set earlier in the script - confirm none is required
# in HTTP mode.
run_http_server() {
    echo "Starting Salt Master exporter on port $HTTP_PORT..." >&2

    local fifo_dir request_fifo request
    fifo_dir=$(mktemp -d /tmp/salt_master_http.XXXXXX) || {
        echo "ERROR: cannot create temporary directory for the HTTP request pipe" >&2
        return 1
    }
    request_fifo="${fifo_dir}/request"
    if ! mkfifo -m 600 "$request_fifo"; then
        echo "ERROR: cannot create FIFO ${request_fifo}" >&2
        rm -rf -- "$fifo_dir"
        return 1
    fi
    # Expand the path now on purpose: fifo_dir is a function-local variable
    # and may not be in scope when the trap eventually fires.
    # shellcheck disable=SC2064
    trap "rm -rf -- '${fifo_dir}'" EXIT

    while true; do
        # Both ends of the FIFO are opened by the two pipeline stages at the
        # same time, so neither open blocks indefinitely.
        {
            # Only the request line is needed to choose the response.
            read -r request
            if [[ "$request" =~ ^GET\ /metrics ]]; then
                printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\n\r\n"
                # Refresh every cache per scrape so the metrics are current.
                cache_all_salt_data
                cache_log_data
                cache_highstate_data
                generate_metrics
            else
                printf "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n"
                echo "<h1>Salt Master Exporter v${SCRIPT_VERSION}</h1><a href='/metrics'>Metrics</a>"
            fi
        } < "$request_fifo" | nc -l -p "$HTTP_PORT" -q 1 > "$request_fifo" 2>/dev/null
    done
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Entry point: parse arguments and run the exporter in the selected mode.
#
# Globals:   HTTP_MODE (read), OUTPUT_FILE (read)
# Arguments: the script's command-line arguments, passed on to parse_args
# Outputs:   metrics on stdout (default mode) or written to OUTPUT_FILE
# Returns:   1 if the textfile output cannot be created or published;
#            otherwise the status of the last command run
#
# Modes:
#   HTTP_MODE=true      -> run the HTTP server (no lock is taken)
#   OUTPUT_FILE set     -> write metrics to that file for a textfile collector
#   neither             -> print metrics to stdout
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
        return
    fi

    acquire_lock

    # Both one-shot modes need the same cached data before generating metrics.
    cache_all_salt_data
    cache_log_data
    cache_highstate_data

    if [ -z "$OUTPUT_FILE" ]; then
        generate_metrics
        return
    fi

    local out_dir temp_file
    out_dir=$(dirname -- "$OUTPUT_FILE")
    mkdir -p -- "$out_dir"

    # Build the file next to its destination so the final mv is a same-
    # filesystem rename: readers see either the old file or the complete new
    # one, never a missing or half-written file. The dot-prefixed name without
    # a .prom suffix keeps the partial file out of a collector that only picks
    # up *.prom files.
    temp_file=$(mktemp "${out_dir}/.salt_master_metrics.XXXXXX") || {
        echo "ERROR: cannot create temporary file in ${out_dir}" >&2
        return 1
    }

    generate_metrics > "$temp_file"

    # Set the final permissions before publishing, not after.
    chmod 644 "$temp_file"

    if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
        echo "ERROR: cannot move metrics into place at ${OUTPUT_FILE}" >&2
        rm -f -- "$temp_file"
        return 1
    fi
    sync
}
# Script entry point: hand all command-line arguments to main unchanged.
main "$@"