#!/bin/bash ################################################################################ # Script Name: salt-master-metrics.sh # Version: 3.2 # Author: Phil Connor, contact@mylinux.work # License: MIT # Description: Production Prometheus exporter for Salt Master metrics # # Exports metrics for: # - Master process health (CPU, memory, uptime) # - Minion connectivity (up, down, accepted, rejected, denied, unaccepted) # - Per-minion last-seen timestamp (stale minions only) # - Minion version drift (match vs mismatch count) # - Job statistics (active, cached, recent, completed 1h/24h, failed 24h) # - Jobs by function breakdown (top 10) # - Per-function expected/actual responses with success/failure (24h) # - Per-function new job counts with success/failure (24h) # - Scheduled job returns per minion/function/state (24h) # - Key management counts # - Event bus health # - ZeroMQ port status (4505, 4506) # - Worker thread utilization # - Salt versions (master vs minion drift) # - Cache disk usage and inode count # - Salt master log error rate (1h) # - File server cache size # - Configuration values (keep_jobs, master_stats) # - Highstate metrics (failures, last timestamp per minion) # - Extended configuration detection (state_events, presence_events, timeout, job_cache) # - Salt API process status # - Auth failure rate from logs # - File roots total size # - Master log file size # - Minion auth/key events from journal # # Modes: # --textfile Write to node_exporter textfile collector # --http Run HTTP server for direct Prometheus scraping # stdout Default: print metrics to stdout # # Changelog: # 3.2 - Added per-minion highstate detail metrics (succeeded/failed/changed # state counts per minion from most recent highstate run) # 3.1 - Added per-function expected/actual response metrics, per-function # new job metrics with success/failure, scheduled job return # metrics per minion/function/state. All from job cache parsing. 
# 3.0 - Added config detection (state_events, presence_events, timeout, # job_cache, publish_port, ret_port), salt-api process status, # auth failure rate, file_roots size, master log size, minion # auth events. Pure bash (no python3 dependency). # 2.0 - Added per-minion last-seen, version drift, jobs completed/failed, # jobs by function, ZeroMQ port checks, cache disk/inode metrics, # log error rate, fileserver cache size, config values, highstate # metrics. Expanded caching layer for expensive operations. # 1.0 - Initial release ################################################################################ SCRIPT_VERSION="3.2" TEXTFILE_DIR="/var/lib/node_exporter" OUTPUT_FILE="" HTTP_MODE=false HTTP_PORT=9417 LOCK_FILE="/var/run/salt-master-metrics.lock" SALT_MASTER_CONFIG="/etc/salt/master" SALT_CACHE_DIR="/var/cache/salt/master" SALT_PKI_DIR="/etc/salt/pki/master" SALT_RUN_DIR="/var/run/salt/master" # Timeouts for salt commands (seconds) SALT_CMD_TIMEOUT=15 # Cache for expensive operations MINION_UP_CACHE="" MINION_DOWN_CACHE="" KEY_LIST_CACHE="" VERSIONS_CACHE="" LOG_ERRORS_CACHE="" LOG_CRITICAL_CACHE="" HIGHSTATE_FAILURES_CACHE="" ACTIVE_JOBS_CACHE="" JOB_LIST_CACHE="" LOG_AUTH_FAILURES_CACHE="" LOG_KEY_EVENTS_CACHE="" show_usage() { cat </dev/null) if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then echo "ERROR: Another instance is already running (PID: $pid)" >&2 exit 1 else echo "Removing stale lock file" >&2 rm -f "$LOCK_FILE" fi fi echo $$ > "$LOCK_FILE" trap cleanup EXIT INT TERM } cleanup() { rm -f "$LOCK_FILE" } # --------------------------------------------------------------------------- # Data collection (cached) # --------------------------------------------------------------------------- cache_key_list() { KEY_LIST_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-key -L 2>/dev/null || echo "") } # Use plain-text salt-run commands (one minion per line, no python needed) cache_minion_up() { MINION_UP_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.up 
2>/dev/null || echo "") } cache_minion_down() { MINION_DOWN_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.down 2>/dev/null || echo "") } cache_versions() { VERSIONS_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions 2>/dev/null || echo "") } cache_active_jobs() { ACTIVE_JOBS_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active 2>/dev/null || echo "") } cache_job_list() { JOB_LIST_CACHE=$(timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt 2>/dev/null || echo "") } # Run all expensive salt-run commands in parallel using temp files cache_all_salt_data() { local tmp_dir tmp_dir=$(mktemp -d /tmp/salt_metrics_cache.XXXXXX) timeout "$SALT_CMD_TIMEOUT" salt-key -L > "$tmp_dir/keys" 2>/dev/null & timeout "$SALT_CMD_TIMEOUT" salt-run manage.up > "$tmp_dir/up" 2>/dev/null & timeout "$SALT_CMD_TIMEOUT" salt-run manage.down > "$tmp_dir/down" 2>/dev/null & timeout "$SALT_CMD_TIMEOUT" salt-run manage.versions > "$tmp_dir/versions" 2>/dev/null & timeout "$SALT_CMD_TIMEOUT" salt-run jobs.active > "$tmp_dir/active" 2>/dev/null & timeout "$SALT_CMD_TIMEOUT" salt-run jobs.list_jobs --out=txt > "$tmp_dir/joblist" 2>/dev/null & wait KEY_LIST_CACHE=$(cat "$tmp_dir/keys" 2>/dev/null) MINION_UP_CACHE=$(cat "$tmp_dir/up" 2>/dev/null) MINION_DOWN_CACHE=$(cat "$tmp_dir/down" 2>/dev/null) VERSIONS_CACHE=$(cat "$tmp_dir/versions" 2>/dev/null) ACTIVE_JOBS_CACHE=$(cat "$tmp_dir/active" 2>/dev/null) JOB_LIST_CACHE=$(cat "$tmp_dir/joblist" 2>/dev/null) rm -rf "$tmp_dir" } cache_log_data() { local since_time since_time=$(date -d '1 hour ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null) if command -v journalctl >/dev/null 2>&1; then LOG_ERRORS_CACHE=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | grep -c "ERROR" 2>/dev/null || true) LOG_CRITICAL_CACHE=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | grep -c "CRITICAL" 2>/dev/null || true) elif [ -f /var/log/salt/master ]; then local cutoff cutoff=$(date -d '1 hour ago' 
'+%Y-%m-%d %H:%M' 2>/dev/null) LOG_ERRORS_CACHE=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -c "ERROR" 2>/dev/null || true) LOG_CRITICAL_CACHE=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -c "CRITICAL" 2>/dev/null || true) else LOG_ERRORS_CACHE="0" LOG_CRITICAL_CACHE="0" fi # Auth failures and key events if command -v journalctl >/dev/null 2>&1; then LOG_AUTH_FAILURES_CACHE=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | grep -ic "authentication denied\|failed to authenticate\|salt\.crypt.*denied" 2>/dev/null || true) LOG_KEY_EVENTS_CACHE=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | grep -ic "salt\.key\|key.*accept\|key.*reject\|key.*denied\|new key" 2>/dev/null || true) elif [ -f /var/log/salt/master ]; then LOG_AUTH_FAILURES_CACHE=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -ic "authentication denied\|failed to authenticate\|salt\.crypt.*denied" 2>/dev/null || true) LOG_KEY_EVENTS_CACHE=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -ic "salt\.key\|key.*accept\|key.*reject\|key.*denied\|new key" 2>/dev/null || true) else LOG_AUTH_FAILURES_CACHE="0" LOG_KEY_EVENTS_CACHE="0" fi } cache_highstate_data() { local since_time since_time=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M:%S' 2>/dev/null) if command -v journalctl >/dev/null 2>&1; then HIGHSTATE_FAILURES_CACHE=$(journalctl -u salt-master --since "$since_time" --no-pager 2>/dev/null | grep -c "highstate.*fail\|Highstate.*fail\|state.highstate.*False" 2>/dev/null || true) elif [ -f /var/log/salt/master ]; then local cutoff cutoff=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M' 2>/dev/null) HIGHSTATE_FAILURES_CACHE=$(awk -v cutoff="$cutoff" '$0 >= cutoff' /var/log/salt/master 2>/dev/null | grep -c "highstate.*fail\|Highstate.*fail\|state.highstate.*False" 2>/dev/null || true) else HIGHSTATE_FAILURES_CACHE="0" fi } # 
# ---------------------------------------------------------------------------
# Per-minion highstate detail metrics (succeeded/failed/changed per minion)
# Scans job cache for most recent highstate per minion, counts state results.
# ---------------------------------------------------------------------------

# Print the JID of the job stored in job-cache directory $1.
# Salt's local job cache names the directory after a *hash* of the JID and
# records the real JID in a "jid" file inside it, so the file is preferred.
# When the file is missing or empty, fall back to the directory-derived value
# (<parent dir name><dir name>) that this script used historically.
_highstate_job_jid() {
    local job_dir="$1" jid=""
    if [ -r "$job_dir/jid" ]; then
        jid=$(tr -d '[:space:]' < "$job_dir/jid")
    fi
    if [ -z "$jid" ]; then
        jid="$(basename "$(dirname "$job_dir")")$(basename "$job_dir")"
    fi
    printf '%s\n' "$jid"
}

# Emit salt_master_highstate_detail{minion,result} samples for the most recent
# highstate return found for each minion in the job cache (last 24h).
# Globals:
#   SALT_CACHE_DIR      (read) job cache root; "$SALT_CACHE_DIR/jobs" is scanned
#   JOB_LIST_CACHE      (read) optional jobs.list_jobs text used to look up the
#                       function name by JID
#   JOB_CACHE_SCAN_MAX  (read) max number of .load.p files examined (default 200)
# Outputs:
#   Three lines per minion (result="succeeded"|"failed"|"changed") on stdout;
#   nothing when the cache directory or matching jobs are absent.
generate_highstate_detail_metrics() {
    local jobs_root="$SALT_CACHE_DIR/jobs"
    if [ ! -d "$jobs_root" ]; then
        return 0
    fi

    local -A minion_succeeded=()
    local -A minion_failed=()
    local -A minion_changed=()
    local -A minion_jid=()

    local load_files
    load_files=$(find "$jobs_root" -name ".load.p" -type f -mmin -1440 2>/dev/null \
        | head -n "${JOB_CACHE_SCAN_MAX:-200}")
    if [ -z "$load_files" ]; then
        return 0
    fi

    local load_file job_dir full_jid func_name load_strings
    local minion_dir minion_name ret_content succeeded failed changed
    while IFS= read -r load_file; do
        [ -z "$load_file" ] && continue
        job_dir=$(dirname "$load_file")
        full_jid=$(_highstate_job_jid "$job_dir")

        # Preferred source for the function name: the cached job listing.
        func_name=""
        if [ -n "$JOB_LIST_CACHE" ]; then
            func_name=$(printf '%s\n' "$JOB_LIST_CACHE" \
                | grep -F -A5 -- "$full_jid" 2>/dev/null \
                | grep -i "Function:" | head -n 1 \
                | sed 's/.*Function:[[:space:]]*//' | tr -d '[:space:]')
        fi

        # Fallback: scrape printable strings out of the serialized load file.
        load_strings=""
        if [ -z "$func_name" ]; then
            load_strings=$(timeout 2 strings "$load_file" 2>/dev/null || true)
            func_name=$(printf '%s\n' "$load_strings" \
                | grep -oE '(state\.[a-z_]+)' | head -n 1)
        fi

        # Only process highstate jobs (state.highstate or state.apply with no
        # specific state named in the load).
        case "$func_name" in
            state.highstate)
                ;;
            state.apply)
                # Reuse the strings dump when the fallback above already made it.
                if [ -z "$load_strings" ]; then
                    load_strings=$(timeout 2 strings "$load_file" 2>/dev/null || true)
                fi
                if printf '%s\n' "$load_strings" \
                    | grep -qE '\b[a-z_]+\.(sls|init)\b' 2>/dev/null; then
                    continue
                fi
                ;;
            *)
                continue
                ;;
        esac

        for minion_dir in "$job_dir"/*/; do
            [ -d "$minion_dir" ] || continue
            minion_name=$(basename "$minion_dir")

            # Keep only the most recent highstate per minion. JIDs are
            # timestamp-based digit strings, so a string comparison orders them.
            if [ -n "${minion_jid[$minion_name]+x}" ] \
                && [[ "$full_jid" < "${minion_jid[$minion_name]}" ]]; then
                continue
            fi

            [ -f "$minion_dir/return.p" ] || continue
            ret_content=$(timeout 2 strings "$minion_dir/return.p" 2>/dev/null || true)

            # grep -c always prints a count; "|| true" only masks its exit
            # status when the count is zero.
            succeeded=$(printf '%s\n' "$ret_content" | grep -ciE 'Result.*True' 2>/dev/null || true)
            failed=$(printf '%s\n' "$ret_content" | grep -ciE 'Result.*False' 2>/dev/null || true)
            changed=$(printf '%s\n' "$ret_content" | grep -ciE 'Changed:.*True' 2>/dev/null || true)

            minion_succeeded["$minion_name"]=$succeeded
            minion_failed["$minion_name"]=$failed
            minion_changed["$minion_name"]=$changed
            minion_jid["$minion_name"]=$full_jid
        done
    done <<< "$load_files"

    local all_minions minion
    all_minions=$(printf '%s\n' "${!minion_succeeded[@]}" | sort -u)
    while IFS= read -r minion; do
        [ -z "$minion" ] && continue
        echo "salt_master_highstate_detail{minion=\"${minion}\",result=\"succeeded\"} ${minion_succeeded[$minion]:-0}"
        echo "salt_master_highstate_detail{minion=\"${minion}\",result=\"failed\"} ${minion_failed[$minion]:-0}"
        echo "salt_master_highstate_detail{minion=\"${minion}\",result=\"changed\"} ${minion_changed[$minion]:-0}"
    done <<< "$all_minions"
}

# ---------------------------------------------------------------------------
# Key metrics
# ---------------------------------------------------------------------------

# Print the number of minion keys in one category.
# Arguments:
#   $1 - accepted | denied | unaccepted | rejected (anything else prints 0)
# Globals:
#   SALT_PKI_DIR (read)
# The count comes straight from the PKI directory, so it does not depend on
# KEY_LIST_CACHE being populated (a timed-out `salt-key -L` must not zero the
# key counts). A missing directory counts as 0.
get_key_count() {
    local category="$1" subdir=""

    case "$category" in
        accepted)   subdir="minions" ;;
        denied)     subdir="minions_denied" ;;
        unaccepted) subdir="minions_pre" ;;
        rejected)   subdir="minions_rejected" ;;
        *)
            echo "0"
            return 0
            ;;
    esac

    find "$SALT_PKI_DIR/$subdir/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
}
# ---------------------------------------------------------------------------
# Minion status metrics (plain-text output: one minion per line, prefixed "- ")
# ---------------------------------------------------------------------------

# Print the number of "- <minion>" lines in MINION_UP_CACHE (0 when empty).
get_minions_up() {
    if [ -z "$MINION_UP_CACHE" ]; then
        echo "0"
        return
    fi
    # grep -c prints the count even when it is 0; "|| true" masks exit status 1.
    grep -c "^-" <<< "$MINION_UP_CACHE" 2>/dev/null || true
}

# Print the number of "- <minion>" lines in MINION_DOWN_CACHE (0 when empty).
get_minions_down() {
    if [ -z "$MINION_DOWN_CACHE" ]; then
        echo "0"
        return
    fi
    grep -c "^-" <<< "$MINION_DOWN_CACHE" 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# Process metrics
# ---------------------------------------------------------------------------

# Extended regex matched against full command lines by pgrep -f.
# "salt-master" must be followed by a space or end of line so that this
# exporter's own command line (salt-master-metrics.sh) is never mistaken for
# the master process.
SALT_MASTER_PROC_PATTERN='salt-master( |$)'

# Print the PID of the oldest salt-master process, or an empty line if none.
get_master_pid() {
    pgrep -o -f "$SALT_MASTER_PROC_PATTERN" 2>/dev/null || echo ""
}

# Print seconds since the master's /proc/<pid> entry was created
# (its mtime, as reported by stat); 0 when the master is not running.
get_master_uptime_seconds() {
    local pid started
    pid=$(get_master_pid)
    if [ -z "$pid" ] || [ ! -d "/proc/$pid" ]; then
        echo "0"
        return
    fi

    started=$(stat -c %Y "/proc/$pid" 2>/dev/null || echo "0")
    if [ "$started" -gt 0 ]; then
        echo $(( $(date +%s) - started ))
    else
        echo "0"
    fi
}

# Print the master's resident set size in bytes (VmRSS kB * 1024); 0 if unknown.
get_master_memory_bytes() {
    local pid rss_kb
    pid=$(get_master_pid)
    if [ -z "$pid" ] || [ ! -f "/proc/$pid/status" ]; then
        echo "0"
        return
    fi

    rss_kb=$(grep VmRSS "/proc/$pid/status" 2>/dev/null | awk '{print $2}')
    if [ -n "$rss_kb" ]; then
        echo $(( rss_kb * 1024 ))
    else
        echo "0"
    fi
}

# Print the master's %CPU as reported by ps; 0 when there is no master or ps
# yields nothing. The value is captured first because the exit status of a
# "ps | tr" pipeline is tr's, so an "|| echo 0" after it would never fire.
get_master_cpu_percent() {
    local pid cpu=""
    pid=$(get_master_pid)
    if [ -n "$pid" ]; then
        cpu=$(ps -p "$pid" -o %cpu --no-headers 2>/dev/null | tr -d ' ')
    fi
    echo "${cpu:-0}"
}

# Print the number of entries under /proc/<pid>/task for the master; 0 if none.
get_master_thread_count() {
    local pid
    pid=$(get_master_pid)
    if [ -n "$pid" ] && [ -d "/proc/$pid/task" ]; then
        find "/proc/$pid/task" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# Print the number of entries under /proc/<pid>/fd for the master; 0 if none.
get_master_open_fds() {
    local pid
    pid=$(get_master_pid)
    if [ -n "$pid" ] && [ -d "/proc/$pid/fd" ]; then
        find "/proc/$pid/fd" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# Print how many processes match the salt-master pattern (0 when none, or when
# pgrep is unavailable).
get_salt_process_count() {
    local count
    count=$(pgrep -c -f "$SALT_MASTER_PROC_PATTERN" 2>/dev/null)
    echo "${count:-0}"
}
# ---------------------------------------------------------------------------
# Worker thread config
# ---------------------------------------------------------------------------

# Print the configured worker_threads value.
# Looks for a top-level "worker_threads:" key in SALT_MASTER_CONFIG first, then
# in the master.d/*.conf files next to it (the first file that sets it wins).
# Only the first matching line of a file is used. Prints 5 when nothing is set.
get_configured_workers() {
    local workers conf_file include_dir
    local master_config="${SALT_MASTER_CONFIG:-/etc/salt/master}"

    workers=$(grep -E "^worker_threads:" "$master_config" 2>/dev/null \
        | awk 'NR == 1 {print $2}')

    if [ -z "$workers" ]; then
        # For the default config path this resolves to /etc/salt/master.d.
        include_dir="$(dirname "$master_config")/master.d"
        for conf_file in "$include_dir"/*.conf; do
            [ -f "$conf_file" ] || continue
            workers=$(grep -E "^worker_threads:" "$conf_file" 2>/dev/null \
                | awk 'NR == 1 {print $2}')
            [ -n "$workers" ] && break
        done
    fi

    echo "${workers:-5}"
}

# ---------------------------------------------------------------------------
# Job cache metrics
# ---------------------------------------------------------------------------

# Print the number of cached jobs (.load.p files) under the job cache.
get_job_cache_count() {
    if [ -d "$SALT_CACHE_DIR/jobs" ]; then
        find "$SALT_CACHE_DIR/jobs" -name ".load.p" -type f 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# Print the job cache size in bytes (du -sb); 0 when the directory is missing
# or du produces no output.
get_job_cache_size_bytes() {
    local bytes=""
    if [ -d "$SALT_CACHE_DIR/jobs" ]; then
        bytes=$(du -sb "$SALT_CACHE_DIR/jobs" 2>/dev/null | awk '{print $1}')
    fi
    echo "${bytes:-0}"
}

# Print the number of lines in ACTIVE_JOBS_CACHE that start with a digit
# (one per job ID in the jobs.active output); 0 when the cache is empty.
get_active_jobs() {
    if [ -z "$ACTIVE_JOBS_CACHE" ]; then
        echo "0"
        return
    fi
    grep -c "^[0-9]" <<< "$ACTIVE_JOBS_CACHE" 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# Event bus / IPC health
# ---------------------------------------------------------------------------

# Print 1 if the event publisher IPC socket exists, else 0.
get_event_pub_socket_exists() {
    if [ -S "$SALT_RUN_DIR/master_event_pub.ipc" ]; then
        echo "1"
    else
        echo "0"
    fi
}

# Print 1 if the event pull IPC socket exists, else 0.
get_event_pull_socket_exists() {
    if [ -S "$SALT_RUN_DIR/master_event_pull.ipc" ]; then
        echo "1"
    else
        echo "0"
    fi
}

# ---------------------------------------------------------------------------
# Salt version
# ---------------------------------------------------------------------------

# Print the second field of `salt --version`, or "unknown" when the command is
# missing, fails, or prints nothing. The output is captured first because the
# exit status of "salt | awk" is awk's, which would hide a failing salt.
get_salt_version() {
    local version
    version=$(salt --version 2>/dev/null | awk '{print $2}')
    echo "${version:-unknown}"
}

# ---------------------------------------------------------------------------
# Minion cache staleness
# ---------------------------------------------------------------------------
# Print the number of entries under the minion data cache; 0 if it is missing.
get_minion_cache_count() {
    if [ -d "$SALT_CACHE_DIR/minions" ]; then
        find "$SALT_CACHE_DIR/minions" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# ---------------------------------------------------------------------------
# Syndic detection
# ---------------------------------------------------------------------------

# Print the number of entries under the syndics PKI directory; 0 if missing.
get_syndic_count() {
    if [ -d "$SALT_PKI_DIR/syndics" ]; then
        find "$SALT_PKI_DIR/syndics" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# ---------------------------------------------------------------------------
# Per-minion last-seen (stale only, >1h)
# ---------------------------------------------------------------------------

# Emit salt_master_minion_last_seen_seconds{minion} (the cache directory's
# mtime, epoch seconds) for stale minions only, followed by
# salt_master_minion_cache_oldest_seconds (largest age across ALL minions).
# A minion is stale when its cache directory is older than
# MINION_STALE_SECONDS (default 3600). Restricting the per-minion series to
# stale minions is what the section header documents and keeps label
# cardinality bounded on large fleets.
generate_minion_last_seen_metrics() {
    local now oldest_age=0 minion_dir minion_name mtime age
    local stale_after="${MINION_STALE_SECONDS:-3600}"
    now=$(date +%s)

    if [ ! -d "$SALT_CACHE_DIR/minions" ]; then
        echo "salt_master_minion_cache_oldest_seconds 0"
        return
    fi

    for minion_dir in "$SALT_CACHE_DIR/minions"/*/; do
        [ -d "$minion_dir" ] || continue
        minion_name=$(basename "$minion_dir")
        # If stat fails, treat the minion as seen just now (age 0).
        mtime=$(stat -c %Y "$minion_dir" 2>/dev/null || echo "$now")
        age=$(( now - mtime ))
        if [ "$age" -gt "$oldest_age" ]; then
            oldest_age=$age
        fi
        if [ "$age" -gt "$stale_after" ]; then
            echo "salt_master_minion_last_seen_seconds{minion=\"${minion_name}\"} ${mtime}"
        fi
    done

    echo "salt_master_minion_cache_oldest_seconds $oldest_age"
}

# ---------------------------------------------------------------------------
# Minion version drift
# ---------------------------------------------------------------------------

# Print the number of "- <minion>" lines listed under the "Up to date:" header
# of VERSIONS_CACHE. Counting stops at the next header line (a line that
# starts with a letter). Prints 0 when the cache is empty.
get_version_match_count() {
    if [ -z "$VERSIONS_CACHE" ]; then
        echo "0"
        return
    fi

    local line in_section=0 count=0
    while IFS= read -r line; do
        if [[ "$line" == "Up to date:"* ]]; then
            in_section=1
            continue
        fi
        [ "$in_section" -eq 1 ] || continue
        if [[ "$line" == "- "* ]]; then
            count=$((count + 1))
        elif [[ "$line" =~ ^[A-Za-z] ]]; then
            break
        fi
    done <<< "$VERSIONS_CACHE"

    echo "$count"
}

# Print the number of "- <minion>" lines in VERSIONS_CACHE that are NOT under
# the "Up to date:" header. Prints 0 when the cache is empty.
get_version_mismatch_count() {
    if [ -z "$VERSIONS_CACHE" ]; then
        echo "0"
        return
    fi

    local line in_uptodate=0 count=0
    while IFS= read -r line; do
        if [[ "$line" == "Up to date:"* ]]; then
            in_uptodate=1
            continue
        elif [[ "$line" =~ ^[A-Za-z] ]]; then
            # Any other header ends the "Up to date:" section.
            in_uptodate=0
            continue
        fi
        if [ "$in_uptodate" -eq 0 ] && [[ "$line" == "- "* ]]; then
            count=$((count + 1))
        fi
    done <<< "$VERSIONS_CACHE"

    echo "$count"
}

# ---------------------------------------------------------------------------
# Jobs completed (1h and 24h)
# ---------------------------------------------------------------------------

# Print the number of .load.p files modified in the last 60 minutes.
get_jobs_completed_1h() {
    if [ -d "$SALT_CACHE_DIR/jobs" ]; then
        find "$SALT_CACHE_DIR/jobs" -name ".load.p" -type f -mmin -60 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# Print the number of .load.p files modified in the last 1440 minutes (24h).
get_jobs_completed_24h() {
    if [ -d "$SALT_CACHE_DIR/jobs" ]; then
        find "$SALT_CACHE_DIR/jobs" -name ".load.p" -type f -mmin -1440 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# ---------------------------------------------------------------------------
# Failed jobs (24h)
# ---------------------------------------------------------------------------

# Print the number of lines in JOB_LIST_CACHE that look like a failure
# (case-insensitive "result...false" or "retcode: <non-zero digit>").
# This is a line count, not a job count. Prints 0 when the cache is empty.
get_jobs_failed_24h() {
    if [ -z "$JOB_LIST_CACHE" ]; then
        echo "0"
        return
    fi
    grep -ic "result.*false\|retcode: [1-9]" <<< "$JOB_LIST_CACHE" 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# Jobs by function (top 10, last 24h)
# ---------------------------------------------------------------------------

# Emit salt_master_jobs_by_function{function} for the ten most frequent
# function names found on "Function:" lines of JOB_LIST_CACHE.
generate_jobs_by_function_metrics() {
    if [ -z "$JOB_LIST_CACHE" ]; then
        return
    fi

    local count func
    printf '%s\n' "$JOB_LIST_CACHE" \
        | grep -i "Function:" 2>/dev/null \
        | awk -F': *' '{print $NF}' \
        | sort | uniq -c | sort -rn | head -n 10 \
        | while read -r count func; do
            [ -n "$func" ] && echo "salt_master_jobs_by_function{function=\"${func}\"} ${count}"
        done
}

# ---------------------------------------------------------------------------
# Job cache single-pass scan (collects all per-function metrics at once)
# Max 200 jobs to avoid slow scans on busy masters.
# ---------------------------------------------------------------------------
JOB_CACHE_SCAN_MAX=200

# Print the JID of the job stored in job-cache directory $1.
# Salt's local job cache names the directory after a *hash* of the JID and
# records the real JID in a "jid" file inside it, so the file is preferred.
# When the file is missing or empty, fall back to the directory-derived value
# (<parent dir name><dir name>) that this script used historically.
_job_cache_jid() {
    local job_dir="$1" jid=""
    if [ -r "$job_dir/jid" ]; then
        jid=$(tr -d '[:space:]' < "$job_dir/jid")
    fi
    if [ -z "$jid" ]; then
        jid="$(basename "$(dirname "$job_dir")")$(basename "$job_dir")"
    fi
    printf '%s\n' "$jid"
}

# Scan up to JOB_CACHE_SCAN_MAX jobs from the last 24h and emit:
#   salt_master_expected_responses_total{function,state=""}
#       number of minion directories found per function
#   salt_master_function_responses_total{function,state="",success}
#       minion returns per function, split by a text heuristic on return.p
#   salt_master_scheduled_job_return_total{function,minion,state,success}
#       the same split for jobs whose load mentions "schedule"
# Globals:
#   SALT_CACHE_DIR, JOB_LIST_CACHE, JOB_CACHE_SCAN_MAX (all read)
# A return counts as failed when its printable strings contain any of
# "false", "traceback", "error" or "exception" (case-insensitive).
generate_job_cache_metrics() {
    local jobs_root="$SALT_CACHE_DIR/jobs"
    if [ ! -d "$jobs_root" ]; then
        return 0
    fi

    local -A func_expected=()
    local -A func_success=()
    local -A func_failure=()
    local -A sched_key_success=()
    local -A sched_key_failure=()

    local load_files
    load_files=$(find "$jobs_root" -name ".load.p" -type f -mmin -1440 2>/dev/null \
        | head -n "${JOB_CACHE_SCAN_MAX:-200}")
    if [ -z "$load_files" ]; then
        return 0
    fi

    local load_file job_dir full_jid func_name load_strings
    local is_scheduled state_name minion_count minion_dir minion_name
    local ret_content is_fail skey
    while IFS= read -r load_file; do
        [ -z "$load_file" ] && continue
        job_dir=$(dirname "$load_file")
        full_jid=$(_job_cache_jid "$job_dir")

        # One strings pass per job; it feeds both the function-name fallback
        # and the schedule detection below.
        load_strings=$(timeout 2 strings "$load_file" 2>/dev/null || true)

        func_name=""
        if [ -n "$JOB_LIST_CACHE" ]; then
            func_name=$(printf '%s\n' "$JOB_LIST_CACHE" \
                | grep -F -A5 -- "$full_jid" 2>/dev/null \
                | grep -i "Function:" | head -n 1 \
                | sed 's/.*Function:[[:space:]]*//' | tr -d '[:space:]')
        fi
        if [ -z "$func_name" ]; then
            func_name=$(printf '%s\n' "$load_strings" \
                | grep -oE '(cmd\.[a-z_]+|state\.[a-z_]+|test\.[a-z_]+|grains\.[a-z_]+|pillar\.[a-z_]+|saltutil\.[a-z_]+|pkg\.[a-z_]+|service\.[a-z_]+|file\.[a-z_]+|sys\.[a-z_]+)' \
                | head -n 1)
        fi
        [ -z "$func_name" ] && func_name="unknown"

        is_scheduled=0
        state_name=""
        if printf '%s\n' "$load_strings" | grep -qi "schedule" 2>/dev/null; then
            is_scheduled=1
            state_name=$(printf '%s\n' "$load_strings" \
                | grep -oE '\b[a-z_]+\.(sls|init)\b' | head -n 1 \
                | sed 's/\.sls$//' | sed 's/\.init$//')
        fi

        minion_count=0
        for minion_dir in "$job_dir"/*/; do
            [ -d "$minion_dir" ] || continue
            minion_count=$((minion_count + 1))
            minion_name=$(basename "$minion_dir")

            [ -f "$minion_dir/return.p" ] || continue
            ret_content=$(timeout 2 strings "$minion_dir/return.p" 2>/dev/null || true)

            is_fail=0
            if printf '%s\n' "$ret_content" \
                | grep -qiE "false|traceback|error|exception" 2>/dev/null; then
                is_fail=1
            fi

            if [ "$is_fail" -eq 1 ]; then
                func_failure["$func_name"]=$(( ${func_failure["$func_name"]:-0} + 1 ))
            else
                func_success["$func_name"]=$(( ${func_success["$func_name"]:-0} + 1 ))
            fi

            if [ "$is_scheduled" -eq 1 ]; then
                skey="${func_name}|${minion_name}|${state_name}"
                if [ "$is_fail" -eq 1 ]; then
                    sched_key_failure["$skey"]=$(( ${sched_key_failure["$skey"]:-0} + 1 ))
                else
                    sched_key_success["$skey"]=$(( ${sched_key_success["$skey"]:-0} + 1 ))
                fi
            fi
        done

        if [ "$minion_count" -gt 0 ]; then
            func_expected["$func_name"]=$(( ${func_expected["$func_name"]:-0} + minion_count ))
        fi
    done <<< "$load_files"

    # Keys are sorted so the output order is stable between runs.
    local func key minion state key_list
    key_list=$(printf '%s\n' "${!func_expected[@]}" | sort -u)
    while IFS= read -r func; do
        [ -z "$func" ] && continue
        echo "salt_master_expected_responses_total{function=\"${func}\",state=\"\"} ${func_expected[$func]}"
    done <<< "$key_list"

    key_list=$(printf '%s\n' "${!func_success[@]}" "${!func_failure[@]}" | sort -u)
    while IFS= read -r func; do
        [ -z "$func" ] && continue
        echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"true\"} ${func_success[$func]:-0}"
        echo "salt_master_function_responses_total{function=\"${func}\",state=\"\",success=\"false\"} ${func_failure[$func]:-0}"
    done <<< "$key_list"

    key_list=$(printf '%s\n' "${!sched_key_success[@]}" "${!sched_key_failure[@]}" | sort -u)
    while IFS= read -r key; do
        [ -z "$key" ] && continue
        func=$(echo "$key" | cut -d'|' -f1)
        minion=$(echo "$key" | cut -d'|' -f2)
        state=$(echo "$key" | cut -d'|' -f3)
        echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"true\"} ${sched_key_success[$key]:-0}"
        echo "salt_master_scheduled_job_return_total{function=\"${func}\",minion=\"${minion}\",state=\"${state}\",success=\"false\"} ${sched_key_failure[$key]:-0}"
    done <<< "$key_list"
}

# ---------------------------------------------------------------------------
# New jobs by function with success/failure (from job list text output)
# ---------------------------------------------------------------------------

# Emit salt_master_new_job_total{function,state="",success} from JOB_LIST_CACHE.
# Every "Function:" line starts a new job. A job is counted as failed at most
# once, however many "Result: ... False" / "retcode: <non-zero>" lines follow
# it, so failed can never exceed total and success can never go negative.
generate_new_job_metrics() {
    if [ -z "$JOB_LIST_CACHE" ]; then
        return
    fi

    local -A func_total=()
    local -A func_failed=()
    local line current_func="" job_failed=0

    while IFS= read -r line; do
        if [[ "$line" =~ Function: ]]; then
            # Text after the last "Function:", with all whitespace removed.
            current_func="${line##*Function:}"
            current_func="${current_func//[[:space:]]/}"
            job_failed=0
            if [ -n "$current_func" ]; then
                func_total["$current_func"]=$(( ${func_total["$current_func"]:-0} + 1 ))
            fi
        fi
        if [[ "$line" =~ Result:.*False ]] || [[ "$line" =~ retcode:\ [1-9] ]]; then
            if [ -n "$current_func" ] && [ "$job_failed" -eq 0 ]; then
                func_failed["$current_func"]=$(( ${func_failed["$current_func"]:-0} + 1 ))
                job_failed=1
            fi
        fi
    done <<< "$JOB_LIST_CACHE"

    local func total failed succeeded
    for func in "${!func_total[@]}"; do
        total=${func_total[$func]}
        failed=${func_failed[$func]:-0}
        succeeded=$((total - failed))
        echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"true\"} ${succeeded}"
        echo "salt_master_new_job_total{function=\"${func}\",state=\"\",success=\"false\"} ${failed}"
    done
}

# ---------------------------------------------------------------------------
# ZeroMQ port status
# ---------------------------------------------------------------------------

# Print 1 if `ss -tlnp` shows a listening TCP socket on port $1, else 0.
get_port_listening() {
    local port="$1"
    if ss -tlnp 2>/dev/null | grep -q ":${port} " 2>/dev/null; then
        echo "1"
    else
        echo "0"
    fi
}
# ---------------------------------------------------------------------------
# Cache disk usage and inode count
# ---------------------------------------------------------------------------

# Print the total size of SALT_CACHE_DIR in bytes (du -sb); 0 when the
# directory is missing or du produces no output.
get_cache_disk_used_bytes() {
    local bytes=""
    if [ -d "$SALT_CACHE_DIR" ]; then
        bytes=$(du -sb "$SALT_CACHE_DIR" 2>/dev/null | awk '{print $1}')
    fi
    echo "${bytes:-0}"
}

# Print the bytes available on the filesystem holding SALT_CACHE_DIR; 0 when
# unknown. df -P keeps each filesystem on a single line (long device names
# otherwise wrap and shift the columns), so field 4 of line 2 is "Available".
get_cache_disk_available_bytes() {
    local bytes=""
    if [ -d "$SALT_CACHE_DIR" ]; then
        bytes=$(df -P -B1 "$SALT_CACHE_DIR" 2>/dev/null | awk 'NR == 2 {print $4}')
    fi
    echo "${bytes:-0}"
}

# Print the number of filesystem entries under SALT_CACHE_DIR, including the
# directory itself; 0 when it is missing.
get_cache_inode_count() {
    if [ -d "$SALT_CACHE_DIR" ]; then
        find "$SALT_CACHE_DIR" 2>/dev/null | wc -l
    else
        echo "0"
    fi
}

# ---------------------------------------------------------------------------
# File server cache size
# ---------------------------------------------------------------------------

# Print the size of the file_lists cache in bytes; 0 when missing or unknown.
get_fileserver_cache_size_bytes() {
    local bytes=""
    if [ -d "$SALT_CACHE_DIR/file_lists" ]; then
        bytes=$(du -sb "$SALT_CACHE_DIR/file_lists" 2>/dev/null | awk '{print $1}')
    fi
    echo "${bytes:-0}"
}

# ---------------------------------------------------------------------------
# Configuration values
# ---------------------------------------------------------------------------

# Print the scalar value of "<key>:" from config file $2 (first match only).
# Leading/trailing whitespace, a trailing "# comment" and one pair of
# surrounding quotes are removed. Prints nothing when the key is absent or has
# no inline value.
_config_scalar_from_file() {
    local key="$1" file="$2"
    grep -E "^[[:space:]]*${key}:" "$file" 2>/dev/null \
        | head -n 1 \
        | sed -E \
            -e "s/^[[:space:]]*${key}:[[:space:]]*//" \
            -e 's/^#.*$//' \
            -e 's/[[:space:]]+#.*$//' \
            -e 's/[[:space:]]+$//' \
            -e 's/^"(.*)"$/\1/' \
            -e "s/^'(.*)'\$/\\1/"
}

# Print the value of a master config key.
# Arguments:
#   $1 - key name (used inside a regular expression; pass plain identifiers)
#   $2 - default printed when no value is found
# Lookup order: SALT_MASTER_CONFIG, then the master.d/*.conf files next to it
# (first file that sets the key wins), then `salt-run config.get <key>`
# (bounded by SALT_CMD_TIMEOUT; a result of "None" counts as unset).
get_config_value() {
    local key="$1" default="$2"
    local val conf_file include_dir
    local master_config="${SALT_MASTER_CONFIG:-/etc/salt/master}"

    val=$(_config_scalar_from_file "$key" "$master_config")

    if [ -z "$val" ]; then
        # For the default config path this resolves to /etc/salt/master.d.
        include_dir="$(dirname "$master_config")/master.d"
        for conf_file in "$include_dir"/*.conf; do
            [ -f "$conf_file" ] || continue
            val=$(_config_scalar_from_file "$key" "$conf_file")
            [ -n "$val" ] && break
        done
    fi

    if [ -z "$val" ]; then
        val=$(timeout "${SALT_CMD_TIMEOUT:-15}" salt-run config.get "$key" 2>/dev/null \
            | tr -d '[:space:]')
        if [ "$val" = "None" ]; then
            val=""
        fi
    fi

    echo "${val:-$default}"
}

# Print 1 when the config value for key $1 (default $2) is a true-like word
# (true, yes, on or 1, case-insensitive), otherwise 0.
get_config_bool() {
    local val
    val=$(get_config_value "$1" "$2")
    case "${val,,}" in
        true|yes|on|1) echo "1" ;;
        *) echo "0" ;;
    esac
}

# Print the configured keep_jobs value (default 24).
get_config_keep_jobs() {
    get_config_value "keep_jobs" "24"
}
get_config_master_stats_enabled() { get_config_bool "master_stats" "false" } # --------------------------------------------------------------------------- # Salt API status # --------------------------------------------------------------------------- get_salt_api_running() { if pgrep -f "salt-api" >/dev/null 2>&1; then echo "1" else echo "0" fi } get_salt_api_port() { ss -tlnp 2>/dev/null | grep "salt-api" | awk '{print $4}' | grep -oE '[0-9]+$' | head -1 } # --------------------------------------------------------------------------- # File roots size # --------------------------------------------------------------------------- get_file_roots_size_bytes() { local roots_dir roots_dir=$(get_config_value "file_roots" "") if [ -z "$roots_dir" ] && [ -d "/srv/salt" ]; then roots_dir="/srv/salt" fi if [ -n "$roots_dir" ] && [ -d "$roots_dir" ]; then du -sb "$roots_dir" 2>/dev/null | awk '{print $1}' else echo "0" fi } # --------------------------------------------------------------------------- # Master log file size # --------------------------------------------------------------------------- get_master_log_size_bytes() { local log_file log_file=$(get_config_value "log_file" "/var/log/salt/master") if [ -f "$log_file" ]; then stat -c %s "$log_file" 2>/dev/null || echo "0" else echo "0" fi } # --------------------------------------------------------------------------- # Pillar roots size # --------------------------------------------------------------------------- get_pillar_roots_size_bytes() { local pillar_dir pillar_dir=$(get_config_value "pillar_roots" "") if [ -z "$pillar_dir" ] && [ -d "/srv/pillar" ]; then pillar_dir="/srv/pillar" fi if [ -n "$pillar_dir" ] && [ -d "$pillar_dir" ]; then du -sb "$pillar_dir" 2>/dev/null | awk '{print $1}' else echo "0" fi } # --------------------------------------------------------------------------- # Metric generation # --------------------------------------------------------------------------- generate_metrics() { local 
start_time start_time=$(date +%s) local master_pid master_running master_pid=$(get_master_pid) if [ -n "$master_pid" ]; then master_running=1 else master_running=0 fi local salt_version salt_version=$(get_salt_version) local keys_accepted keys_denied keys_unaccepted keys_rejected keys_accepted=$(find "$SALT_PKI_DIR/minions/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l) keys_denied=$(find "$SALT_PKI_DIR/minions_denied/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l) keys_unaccepted=$(find "$SALT_PKI_DIR/minions_pre/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l) keys_rejected=$(find "$SALT_PKI_DIR/minions_rejected/" -maxdepth 1 -mindepth 1 2>/dev/null | wc -l) local keys_total=$((keys_accepted + keys_denied + keys_unaccepted + keys_rejected)) local minions_up minions_down minions_up=$(get_minions_up) minions_down=$(get_minions_down) local master_uptime master_memory master_cpu master_threads master_fds salt_procs master_uptime=$(get_master_uptime_seconds) master_memory=$(get_master_memory_bytes) master_cpu=$(get_master_cpu_percent) master_threads=$(get_master_thread_count) master_fds=$(get_master_open_fds) salt_procs=$(get_salt_process_count) local configured_workers configured_workers=$(get_configured_workers) local job_cache_count job_cache_size active_jobs job_cache_count=$(get_job_cache_count) job_cache_size=$(get_job_cache_size_bytes) active_jobs=$(get_active_jobs) local event_pub_socket event_pull_socket event_pub_socket=$(get_event_pub_socket_exists) event_pull_socket=$(get_event_pull_socket_exists) local minion_cache_count syndic_count minion_cache_count=$(get_minion_cache_count) syndic_count=$(get_syndic_count) local version_match version_mismatch version_match=$(get_version_match_count) version_mismatch=$(get_version_mismatch_count) local jobs_1h jobs_24h jobs_failed_24h jobs_1h=$(get_jobs_completed_1h) jobs_24h=$(get_jobs_completed_24h) jobs_failed_24h=$(get_jobs_failed_24h) local port_4505 port_4506 port_4505=$(get_port_listening 4505) 
port_4506=$(get_port_listening 4506) local cache_disk_used cache_disk_avail cache_inode_count cache_disk_used=$(get_cache_disk_used_bytes) cache_disk_avail=$(get_cache_disk_available_bytes) cache_inode_count=$(get_cache_inode_count) local fileserver_cache_size fileserver_cache_size=$(get_fileserver_cache_size_bytes) local config_keep_jobs config_master_stats config_keep_jobs=$(get_config_keep_jobs) config_master_stats=$(get_config_master_stats_enabled) local config_state_events config_presence_events config_timeout config_job_cache config_state_events=$(get_config_bool "state_events" "false") config_presence_events=$(get_config_bool "presence_events" "false") config_timeout=$(get_config_value "timeout" "5") config_job_cache=$(get_config_bool "job_cache" "true") local config_publish_port config_ret_port config_publish_port=$(get_config_value "publish_port" "4505") config_ret_port=$(get_config_value "ret_port" "4506") local salt_api_running salt_api_port salt_api_running=$(get_salt_api_running) salt_api_port=$(get_salt_api_port) local file_roots_size pillar_roots_size master_log_size file_roots_size=$(get_file_roots_size_bytes) pillar_roots_size=$(get_pillar_roots_size_bytes) master_log_size=$(get_master_log_size_bytes) cat </dev/null || true) echo "salt_master_highstate_jobs_24h ${hs_count:-0}" else echo "salt_master_highstate_jobs_24h 0" fi # Per-minion highstate detail (succeeded/failed/changed from most recent run) echo "" echo "# HELP salt_master_highstate_detail Per-minion highstate state counts from last run" echo "# TYPE salt_master_highstate_detail gauge" generate_highstate_detail_metrics cat <&2 while true; do { read -r request if [[ "$request" =~ ^GET\ /metrics ]]; then printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\n\r\n" cache_all_salt_data cache_log_data cache_highstate_data generate_metrics else printf "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n" echo "

Salt Master Exporter v${SCRIPT_VERSION}

Metrics" fi } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null done } # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- main() { parse_args "$@" [ "$HTTP_MODE" != true ] && acquire_lock if [ "$HTTP_MODE" = true ]; then run_http_server elif [ -n "$OUTPUT_FILE" ]; then cache_all_salt_data cache_log_data cache_highstate_data mkdir -p "$(dirname "$OUTPUT_FILE")" local temp_file temp_file=$(mktemp /tmp/salt_master_metrics.XXXXXX) generate_metrics > "$temp_file" rm -f "$OUTPUT_FILE" mv "$temp_file" "$OUTPUT_FILE" chmod 644 "$OUTPUT_FILE" sync else cache_all_salt_data cache_log_data cache_highstate_data generate_metrics fi } main "$@"