#!/bin/bash
#####################################################
###                                               ###
###  Description: Expose metrics from salt-minion ###
###                                               ###
###  Phil Connor, contact@mylinux.work            ###
###  License: MIT                                 ###
###  Version 2.28.0.20250915                      ###
###                                               ###
#####################################################
#
# Collects salt-minion health information and prints it on stdout in the
# Prometheus text exposition format. All diagnostics go to stderr so that
# stdout can be redirected straight into a node_exporter textfile.
#
# Environment overrides: CRONTAB_USER, NODE_EXPORTER_DIR, PROMETHEUS_USER,
# LOCK_DIR, UPDATE_INTERVAL, SALT_MASTER_PORT.

# Exit on any error, treat unset variables as errors, and fail pipes on first failure
set -euo pipefail

# Command line state (filled in by the argument parser below)
DRY_RUN=false
VERBOSE=false
QUIET=false
NO_CRON=false
TIMEOUT_OVERRIDE=""   # seconds; empty means "do not wrap salt-call in timeout"
SCRIPT_VERSION="2.28.0.20250915"

# Print script name, version and author.
show_version() {
    echo "Salt Status Monitor Bash Script"
    echo "Version: $SCRIPT_VERSION"
    echo "Author: Phil Connor pconnor@ara.com"
}

# Print usage information.
show_help() {
    echo "Usage: $0 [OPTIONS]"
    echo "Monitor Salt minion status and export Prometheus metrics"
    echo ""
    echo "Options:"
    echo "  --dry-run     Output metrics to console instead of file"
    echo "  --verbose     Enable verbose debug output"
    echo "  --quiet       Suppress non-error output"
    echo "  --no-cron     Skip cron job installation"
    echo "  --timeout N   Abort each salt-call after N seconds (default: no timeout)"
    echo "  --version     Show version and exit"
    echo "  --help        Show this help message"
}

# Error handling function that logs to stderr and exits with specified code.
# Arguments: $1 - message, $2 - exit code (default 1)
handle_error() {
    echo "ERROR: $1" >&2
    exit "${2:-1}"
}

# Logging function with timestamp and level.
# Writes to stderr: stdout is reserved for metric output and is captured by
# command substitutions throughout this script.
# Arguments: $1 - level, $2 - message
log() {
    printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" "$2" >&2
}

# Log a message only when --verbose was given.
# Always returns 0 so that a disabled log call cannot trip `set -e`.
log_verbose() {
    if [[ "$VERBOSE" == "true" ]]; then
        log "VERBOSE" "$1"
    fi
    return 0
}

# Log a message unless --quiet was given.
# Always returns 0 so that a disabled log call cannot trip `set -e`.
log_info() {
    if [[ "$QUIET" == "false" ]]; then
        log "INFO" "$1"
    fi
    return 0
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --verbose|-v)
            VERBOSE=true
            shift
            ;;
        --quiet|-q)
            QUIET=true
            shift
            ;;
        --no-cron)
            NO_CRON=true
            shift
            ;;
        --timeout)
            # ${2:-} keeps `set -u` from aborting when --timeout is the last argument
            if [[ "${2:-}" =~ ^[0-9]+$ ]]; then
                TIMEOUT_OVERRIDE="$2"
                shift 2
            else
                echo "Error: --timeout requires a numeric value" >&2
                exit 1
            fi
            ;;
        --version)
            show_version
            exit 0
            ;;
        -h|--help)
            show_help
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            echo "Use --help for usage information" >&2
            exit 1
            ;;
    esac
done

# Get absolute path to this script for cron job installation.
# Assignment is separate from `readonly` so a readlink failure is not masked;
# an empty path simply disables cron installation in main().
SCRIPT_PATH=$(readlink -f "$0" 2>/dev/null) || SCRIPT_PATH=""
readonly SCRIPT_PATH

# Configuration with defaults - can be overridden by environment variables
readonly CRONTAB_USER="${CRONTAB_USER:-root}"                                # User to install cron job under
readonly NODE_EXPORTER_DIR="${NODE_EXPORTER_DIR:-/var/lib/node_exporter}"    # Directory where Prometheus metrics are stored
readonly PROMETHEUS_USER="${PROMETHEUS_USER:-prometheus}"                    # User that owns the metrics directory
readonly LOCK_DIR="${LOCK_DIR:-/var/run}"                                    # Directory for the lock file
readonly UPDATE_INTERVAL="${UPDATE_INTERVAL:-*/10 * * * *}"                  # Cron schedule - every 10 minutes by default
readonly SALT_MASTER_PORT="${SALT_MASTER_PORT:-4505}"                        # Salt master communication port

# Status codes used in Prometheus metrics
readonly STATUS_SUCCESS=1     # Service is working correctly
readonly STATUS_FAILURE=0     # Service has failed or is not responding
readonly STATUS_NOT_FOUND=2   # Service/command not found on system

# Validate that critical configuration values are set
if [[ -z "$NODE_EXPORTER_DIR" || -z "$PROMETHEUS_USER" ]]; then
    echo "ERROR: Required environment variables not set" >&2
    exit 1
fi

# Find a command in PATH or fallback directories.
# Arguments: $1 - command name, $2... - fallback directories
# Outputs:   the resolved path on stdout
# Exits with an error (via handle_error) when the command is not found; when
# called inside $(...) this only terminates the substitution, so callers can
# test the result with `if ! path=$(find_command ...)`.
find_command() {
    local cmd="$1"
    shift

    # First try to find command in PATH
    if command -v "$cmd" &>/dev/null; then
        command -v "$cmd"
        return 0
    fi

    # If not in PATH, check fallback directories.
    # Iterating "$@" (rather than an array copy) is safe under `set -u`
    # even when no fallback directories were passed.
    local dir candidate
    for dir in "$@"; do
        candidate="$dir/$cmd"
        if [[ -x "$candidate" ]]; then
            echo "$candidate"
            return 0
        fi
    done

    # Command not found anywhere
    handle_error "Could not find '$cmd' executable"
}

# Run salt-call, wrapped in timeout(1) when --timeout was given.
# Arguments: $1 - path to salt-call, $2... - salt-call arguments
run_salt_call() {
    local salt_call_path="$1"
    shift

    if [[ -n "$TIMEOUT_OVERRIDE" ]] && command -v timeout &>/dev/null; then
        timeout "$TIMEOUT_OVERRIDE" "$salt_call_path" "$@"
    else
        "$salt_call_path" "$@"
    fi
}

# Install a cron job to run this script periodically.
# Only installs if the job doesn't already exist in CRONTAB_USER's crontab.
install_cron_job() {
    # Read the crontab of the SAME user we install into; a missing crontab
    # is treated as empty.
    local existing_cron
    existing_cron=$(crontab -u "$CRONTAB_USER" -l 2>/dev/null) || existing_cron=""

    # Fixed-string match: the script path is not a regular expression
    if grep -qF -- "$SCRIPT_PATH" <<<"$existing_cron"; then
        return 0
    fi

    local prom_file="$NODE_EXPORTER_DIR/salt_status.prom"
    local temp_cron
    temp_cron=$(mktemp) || handle_error "Failed to create temporary crontab file"

    # Combine existing crontab with new job. The job writes to a temporary
    # file and renames it so node_exporter never reads a partial file, and
    # stderr is deliberately NOT redirected into the metrics file.
    {
        if [[ -n "$existing_cron" ]]; then
            printf '%s\n' "$existing_cron"
        fi
        printf '%s %s > %s.tmp && mv %s.tmp %s\n' \
            "$UPDATE_INTERVAL" "$SCRIPT_PATH" "$prom_file" "$prom_file" "$prom_file"
    } > "$temp_cron"

    # Install the new crontab, then clean up on every path
    local install_status=0
    crontab -u "$CRONTAB_USER" "$temp_cron" || install_status=$?
    rm -f -- "$temp_cron"

    if (( install_status != 0 )); then
        handle_error "Failed to install cron job"
    fi
    log_info "Cron job installed successfully"
}

# Set up file locking to prevent multiple instances of this script running.
# Uses file descriptor 9 on a fixed lock file so that concurrent runs
# actually contend for the same lock.
setup_lock() {
    # Ensure lock directory exists
    [[ -d "$LOCK_DIR" ]] || handle_error "Lock directory does not exist: $LOCK_DIR"

    command -v flock &>/dev/null || handle_error "Could not find 'flock' executable"

    # Remove stale per-run lock files (salt_status.XXXXXX) left by earlier
    # versions of this script; best effort only.
    find "$LOCK_DIR" -maxdepth 1 -name 'salt_status.??????' -type f -mmin +60 -delete 2>/dev/null || true

    local lock_file="$LOCK_DIR/salt_status.lock"

    # Open lock file on file descriptor 9 and attempt to lock it
    exec 9>"$lock_file" || handle_error "Failed to create lock file: $lock_file"
    flock -n 9 || handle_error "Script is already running"

    # Release the lock on exit. The lock file itself is kept: deleting it
    # would let a concurrent instance lock a different inode.
    trap 'flock -u 9 2>/dev/null || true; exec 9>&-' EXIT
    # Make signals terminate the script (which then fires the EXIT trap)
    trap 'exit 130' INT
    trap 'exit 143' TERM
}

# Ensure the Node Exporter directory exists.
# Creates the directory if running as root and sets ownership; a directory
# that had to be created must end up writable.
setup_directories() {
    # Return early if directory already exists
    if [[ -d "$NODE_EXPORTER_DIR" ]]; then
        return 0
    fi

    # Create directory if running as root
    if [[ "$(id -u)" == "0" ]]; then
        mkdir -p -- "$NODE_EXPORTER_DIR"
        # Set ownership to prometheus user, ignore errors if user doesn't exist
        chown "$PROMETHEUS_USER:" "$NODE_EXPORTER_DIR" 2>/dev/null || true
    fi

    # Verify the directory is writable
    if [[ ! -w "$NODE_EXPORTER_DIR" ]]; then
        handle_error "$NODE_EXPORTER_DIR is not writable"
    fi
    return 0
}

# Check if Salt-minion has an established TCP connection to the Salt-master
# port. Prints STATUS_SUCCESS, STATUS_FAILURE, or STATUS_NOT_FOUND when ss
# is unavailable.
check_salt_connection() {
    local ss_path
    if ! ss_path=$(find_command ss /bin /usr/bin /sbin /usr/sbin 2>/dev/null); then
        echo "$STATUS_NOT_FOUND"
        return
    fi

    log_verbose "Checking for Salt connection on port $SALT_MASTER_PORT"

    # `ss -nt`: numeric output, TCP sockets. awk reads the whole stream (no
    # early exit, so no SIGPIPE under pipefail) and matches only ESTAB rows
    # whose peer address (last column) ends in the master port.
    if "$ss_path" -nt 2>/dev/null \
        | awk -v port="$SALT_MASTER_PORT" \
            '$1 == "ESTAB" && $NF ~ (":" port "$") { found = 1 } END { exit !found }'; then
        log_verbose "Found active connection on port $SALT_MASTER_PORT"
        echo "$STATUS_SUCCESS"
    else
        log_verbose "No active connection found on port $SALT_MASTER_PORT"
        echo "$STATUS_FAILURE"
    fi
}

# Test whether `salt-call test.ping` succeeds and reports True.
# Prints STATUS_SUCCESS, STATUS_FAILURE, or STATUS_NOT_FOUND when salt-call
# is unavailable.
check_salt_ping() {
    local salt_call_path
    if ! salt_call_path=$(find_command salt-call /bin /usr/bin /usr/sbin 2>/dev/null); then
        echo "$STATUS_NOT_FOUND"
        return
    fi

    # A non-zero exit from salt-call (or a timeout) counts as failure
    local ping_output
    ping_output=$(run_salt_call "$salt_call_path" test.ping 2>/dev/null) || ping_output=""

    if grep -qw -- 'True' <<<"$ping_output"; then
        echo "$STATUS_SUCCESS"
    else
        echo "$STATUS_FAILURE"
    fi
}

# Check if Salt-minion service is active using systemctl.
# Prints STATUS_SUCCESS, STATUS_FAILURE, or STATUS_NOT_FOUND when systemctl
# is unavailable.
check_salt_service() {
    local systemctl_path
    if ! systemctl_path=$(find_command systemctl /bin /usr/bin /sbin /usr/sbin 2>/dev/null); then
        echo "$STATUS_NOT_FOUND"
        return
    fi

    if "$systemctl_path" is-active salt-minion &>/dev/null; then
        echo "$STATUS_SUCCESS"
    else
        echo "$STATUS_FAILURE"
    fi
}

# Print the current Unix timestamp if the ping test succeeds, otherwise 0.
# Arguments: $1 - (optional) an already-known result of check_salt_ping;
#                 when omitted the ping test is executed here.
check_salt_last_communication() {
    local known_ping_status="${1:-}"

    if [[ -z "$known_ping_status" ]]; then
        known_ping_status=$(check_salt_ping)
    fi

    if [[ "$known_ping_status" == "$STATUS_SUCCESS" ]]; then
        date +%s
    else
        echo "0"
    fi
}

# Print the Salt version as MAJOR.MINOR (first such number in the output of
# `salt-call --version`), or 0 when it cannot be determined.
get_salt_version() {
    local salt_call_path
    if ! salt_call_path=$(find_command salt-call /bin /usr/bin /usr/sbin 2>/dev/null); then
        echo "0"
        return
    fi

    local raw_version version=""
    raw_version=$(run_salt_call "$salt_call_path" --version 2>/dev/null) || true
    if [[ "$raw_version" =~ [0-9]+\.[0-9]+ ]]; then
        version="${BASH_REMATCH[0]}"
    fi
    echo "${version:-0}"
}

# Print the summed RSS of all processes whose command name starts with
# "salt-minion", in bytes (0 when none are found or ps is unavailable).
get_salt_memory_usage() {
    local ps_path
    if ! ps_path=$(find_command ps /bin /usr/bin 2>/dev/null); then
        echo "0"
        return
    fi

    # ps reports RSS in KB; it is the last column of each row
    local memory_kb
    memory_kb=$("$ps_path" -eo comm,rss 2>/dev/null \
        | awk '$1 ~ /^salt-minion/ { sum += $NF } END { print sum + 0 }') || true
    [[ "$memory_kb" =~ ^[0-9]+$ ]] || memory_kb=0

    echo "$(( memory_kb * 1024 ))"
}

# Print the number of ERROR-level lines in the salt-minion log file
# (the whole file, not a time window); 0 when the file is unreadable.
count_salt_errors() {
    local log_file="/var/log/salt/minion"

    if [[ ! -r "$log_file" ]]; then
        echo "0"
        return
    fi

    # Salt pads the level name in its default log format ("[ERROR   ]"),
    # so allow optional spaces before the closing bracket.
    # grep -c exits 1 when the count is 0; that is not an error here.
    local error_count
    error_count=$(grep -c -- '\[ERROR *\]' "$log_file" 2>/dev/null) || true
    [[ "$error_count" =~ ^[0-9]+$ ]] || error_count=0

    echo "$error_count"
}

# Output a Prometheus metric in the text exposition format.
# Arguments: $1 - metric name, $2 - value, $3 - help text, $4 - metric type
output_metric() {
    local name="$1" value="$2" help="$3" type="$4"

    cat << EOF
# HELP $name $help
# TYPE $name $type
$name $value
EOF
}

# Main function that orchestrates the metric collection process.
main() {
    # Skip setup steps in dry-run mode
    if [[ "$DRY_RUN" == "false" ]]; then
        # Set up file locking to prevent concurrent execution
        setup_lock

        # Ensure output directory exists
        setup_directories

        # Install cron job for periodic execution (only if script file exists and not disabled)
        if [[ -f "$SCRIPT_PATH" && "$NO_CRON" == "false" ]]; then
            install_cron_job
        elif [[ "$NO_CRON" == "true" ]]; then
            log_info "Skipping cron job installation (--no-cron specified)"
        fi
    else
        echo "=== DRY RUN MODE - Metrics that would be written to $NODE_EXPORTER_DIR/salt_status.prom ===" >&2
    fi

    # Collect Salt status metrics
    local connection_status ping_status service_status last_comm version memory_usage error_count
    connection_status=$(check_salt_connection)
    ping_status=$(check_salt_ping)
    service_status=$(check_salt_service)
    # Reuse the ping result instead of running salt-call test.ping a second time
    last_comm=$(check_salt_last_communication "$ping_status")
    version=$(get_salt_version)
    memory_usage=$(get_salt_memory_usage)
    error_count=$(count_salt_errors)

    # Output metrics in Prometheus format
    output_metric "minion_connection_status" "$connection_status" \
        "Shows if Salt-Minion is connected to Salt-Master." "gauge"
    output_metric "minion_ping_status" "$ping_status" \
        "Shows if Salt-Minion is able to ping Salt-Master." "gauge"
    output_metric "minion_service_status" "$service_status" \
        "Shows if Salt-Minion service is active." "gauge"
    output_metric "minion_last_communication_timestamp" "$last_comm" \
        "Timestamp of last successful communication with Salt-Master." "gauge"
    output_metric "minion_version" "$version" \
        "Salt-Minion version number." "gauge"
    output_metric "minion_memory_usage_bytes" "$memory_usage" \
        "Salt-Minion process memory usage in bytes." "gauge"
    output_metric "minion_error_count" "$error_count" \
        "Number of error entries in Salt-Minion log file." "counter"

    if [[ "$DRY_RUN" == "true" ]]; then
        echo "=== END DRY RUN OUTPUT ===" >&2
    fi
}

# Execute main function (options were already consumed by the parser above)
main "$@"