Add all 44 scripts, update CI: error severity baseline, PowerShell validation, multi-distro testing
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Executable
+409
@@ -0,0 +1,409 @@
|
||||
#!/bin/bash
|
||||
|
||||
#####################################################
|
||||
### ###
|
||||
### Description: Expose metrics from salt-minion. ###
|
||||
### ###
|
||||
### Phil Connor, contact@mylinux.work ###
|
||||
### License: MIT ###
|
||||
### Version 2.28.0.20250915 ###
|
||||
### ###
|
||||
#####################################################
|
||||
|
||||
# Strict mode: abort on an unhandled command failure (errexit), treat the use
# of an unset variable as an error (nounset), and make a pipeline report
# failure when any of its stages fails (pipefail).
set -o errexit -o nounset -o pipefail

# Defaults for the command-line flags; the option parser may flip them.
DRY_RUN=false    # --dry-run
VERBOSE=false    # --verbose / -v
QUIET=false      # --quiet / -q
NO_CRON=false    # --no-cron

# Version string printed by --version.
SCRIPT_VERSION="2.28.0.20250915"
|
||||
|
||||
# Print the script name, version (from SCRIPT_VERSION) and author to stdout.
show_version() {
    printf '%s\n' \
        "Salt Status Monitor Bash Script" \
        "Version: $SCRIPT_VERSION" \
        "Author: Phil Connor pconnor@ara.com"
}
|
||||
|
||||
# Print the usage summary and the list of supported options to stdout.
# $0 is the name the script was invoked with.
show_help() {
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "Monitor Salt minion status and export Prometheus metrics" \
        "" \
        "Options:" \
        " --dry-run Output metrics to console instead of file" \
        " --verbose Enable verbose debug output" \
        " --quiet Suppress non-error output" \
        " --no-cron Skip cron job installation" \
        " --timeout N Override timeout seconds (default: varies by operation)" \
        " --version Show version and exit" \
        " --help Show this help message"
}
|
||||
|
||||
# Logging functions

# Emit a timestamped [VERBOSE] line when VERBOSE is "true".
# Arguments: $1 - message text
# Outputs:   the log line on stderr. It must not go to stdout because callers
#            are run inside $(...) and their stdout is the metric value.
# Returns:   always 0, so a disabled logger cannot trip `set -e` in a caller.
log_verbose() {
    if [[ "$VERBOSE" == "true" ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] [VERBOSE] $1" >&2
    fi
    return 0
}
|
||||
|
||||
# Emit a timestamped [INFO] line unless QUIET is set (QUIET != "false").
# Arguments: $1 - message text
# Outputs:   the log line on stderr, keeping stdout reserved for the metrics
#            that main prints.
# Returns:   always 0. The previous `[[ ... ]] && echo` form returned 1 when
#            QUIET was true, which aborted the script under `set -e`.
log_info() {
    if [[ "$QUIET" == "false" ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] [INFO] $1" >&2
    fi
    return 0
}
|
||||
|
||||
# Value given to --timeout; empty when the option was not supplied.
# Initialised here so that reading it is safe under `set -u`.
TIMEOUT_OVERRIDE=""

# Parse the command-line options into the global flags.
# Arguments: the script's argument list
# Globals:   writes DRY_RUN, VERBOSE, QUIET, NO_CRON, TIMEOUT_OVERRIDE
# Exits:     0 after --version / --help; 1 on an unknown option or when
#            --timeout has no numeric value.
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --verbose|-v)
                VERBOSE=true
                shift
                ;;
            --quiet|-q)
                QUIET=true
                shift
                ;;
            --no-cron)
                NO_CRON=true
                shift
                ;;
            --timeout)
                # ${2:-} rather than $2: with `set -u`, a trailing --timeout
                # would otherwise die with "unbound variable" before the
                # friendly message below could be printed.
                if [[ "${2:-}" =~ ^[0-9]+$ ]]; then
                    TIMEOUT_OVERRIDE="$2"
                    shift 2
                else
                    echo "Error: --timeout requires a numeric value" >&2
                    exit 1
                fi
                ;;
            --version)
                show_version
                exit 0
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
|
||||
|
||||
# Canonical path of this script; it is embedded in the cron entry.
declare -r SCRIPT_PATH="$(readlink -f "$0")"

# Tunables: each keeps the value from the environment when one is set there,
# otherwise the default shown.
declare -r CRONTAB_USER="${CRONTAB_USER:-root}"                           # user whose crontab receives the job
declare -r NODE_EXPORTER_DIR="${NODE_EXPORTER_DIR:-/var/lib/node_exporter}" # directory holding the .prom file
declare -r PROMETHEUS_USER="${PROMETHEUS_USER:-prometheus}"               # owner given to that directory
declare -r LOCK_DIR="${LOCK_DIR:-/var/run}"                               # where the lock file lives
declare -r UPDATE_INTERVAL="${UPDATE_INTERVAL:-*/10 * * * *}"             # cron schedule (default: every 10 minutes)
declare -r SALT_MASTER_PORT=4505                                          # port matched in the socket listing

# Values emitted for the status gauges.
declare -r STATUS_SUCCESS=1     # check passed
declare -r STATUS_FAILURE=0     # check failed
declare -r STATUS_NOT_FOUND=2   # required command is not available

# Refuse to continue with an empty metrics directory or owner.
if [[ -z "$NODE_EXPORTER_DIR" || -z "$PROMETHEUS_USER" ]]; then
    echo "ERROR: Required environment variables not set" >&2
    exit 1
fi
|
||||
|
||||
# Report a fatal error and terminate the current shell.
# Arguments: $1 - message, printed to stderr prefixed with "ERROR: "
#            $2 - exit status (optional, defaults to 1)
handle_error() {
    local message="$1"
    local exit_code="${2:-1}"

    printf 'ERROR: %s\n' "$message" >&2
    exit "$exit_code"
}
|
||||
|
||||
# Print one log line to stdout as "[YYYY-MM-DD HH:MM:SS] [LEVEL] message".
# Arguments: $1 - level label
#            $2 - message text
log() {
    local level="$1"
    local message="$2"
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')

    printf '[%s] [%s] %s\n' "$stamp" "$level" "$message"
}
|
||||
|
||||
# Locate an executable by name.
# Arguments: $1    - command name
#            $2... - fallback directories to probe when PATH lookup fails
# Outputs:   what `command -v` reports for the name, or the first
#            "<dir>/<name>" that is an executable regular file
# Returns:   0 when found; otherwise calls handle_error, which exits.
find_command() {
    local cmd="$1"
    shift

    # PATH lookup first.
    local resolved
    if resolved=$(command -v "$cmd" 2>/dev/null); then
        printf '%s\n' "$resolved"
        return 0
    fi

    # Iterate "$@" directly: unlike an empty array, an empty "$@" is safe
    # under `set -u` on every bash version. Both loop variables are local so
    # nothing leaks into the caller's scope.
    local dir candidate
    for dir in "$@"; do
        candidate="$dir/$cmd"
        if [[ -f "$candidate" && -x "$candidate" ]]; then
            printf '%s\n' "$candidate"
            return 0
        fi
    done

    # Not found anywhere.
    handle_error "Could not find '$cmd' executable"
}
|
||||
|
||||
# Install a cron entry that runs this script on UPDATE_INTERVAL and redirects
# its output to $NODE_EXPORTER_DIR/salt_status.prom.
# Globals:  CRONTAB_USER, SCRIPT_PATH, UPDATE_INTERVAL, NODE_EXPORTER_DIR (read)
# Returns:  0 when the entry already exists or was installed; calls
#           handle_error (which exits) when installation fails.
install_cron_job() {
    local current_crontab temp_cron cron_entry

    # Read the crontab of the SAME user we install into. Reading the invoking
    # user's table and then writing it with -u could replace CRONTAB_USER's
    # existing jobs with someone else's.
    current_crontab=$(crontab -u "$CRONTAB_USER" -l 2>/dev/null) || current_crontab=""

    # Already installed? -F matches the path literally rather than as a regex.
    if grep -qF -- "$SCRIPT_PATH" <<<"$current_crontab"; then
        return 0
    fi

    # NOTE(review): "2>&1" sends the script's stderr into the .prom file too;
    # confirm that is intended before changing the entry.
    cron_entry="$UPDATE_INTERVAL $SCRIPT_PATH > $NODE_EXPORTER_DIR/salt_status.prom 2>&1"

    temp_cron=$(mktemp) || handle_error "Failed to create temporary crontab file"

    # New table = existing entries (if any) followed by our job.
    {
        if [[ -n "$current_crontab" ]]; then
            printf '%s\n' "$current_crontab"
        fi
        printf '%s\n' "$cron_entry"
    } > "$temp_cron"

    if crontab -u "$CRONTAB_USER" "$temp_cron"; then
        rm -f "$temp_cron"
        # Logging is best effort; its status must not abort the run.
        log_info "Cron job installed successfully" || true
    else
        rm -f "$temp_cron"
        handle_error "Failed to install cron job"
    fi
}
|
||||
|
||||
# Take an exclusive, non-blocking lock so only one instance runs at a time.
# The lock is held on file descriptor 9 for the life of the process.
# Globals:  LOCK_DIR (read); lockfile (written: path of the lock file)
# Returns:  0 with the lock held; otherwise calls handle_error, which exits.
setup_lock() {
    # The lock directory must already exist.
    [[ -d "$LOCK_DIR" ]] || handle_error "Lock directory does not exist: $LOCK_DIR"

    # Earlier versions created one randomly named file per run
    # (salt_status.XXXXXX). Remove leftovers older than 60 minutes. The
    # six-character pattern cannot match the shared lock file below, and
    # -maxdepth 1 keeps find from walking the whole directory tree.
    find "$LOCK_DIR" -maxdepth 1 -name 'salt_status.??????' -type f -mmin +60 -delete 2>/dev/null || true

    # Every instance must lock the SAME file. With a fresh mktemp name per
    # run each process locked its own file and flock could never refuse a
    # second instance.
    lockfile="$LOCK_DIR/salt_status.lock"
    touch "$lockfile" 2>/dev/null || handle_error "Failed to create lock file"

    # Open the lock file on fd 9 and try to lock it without waiting.
    exec 9>>"$lockfile"
    flock -n 9 || handle_error "Script is already running"

    # Closing fd 9 releases the lock. The file is deliberately left in place:
    # unlinking a shared lock file lets a racing process lock a stale inode.
    trap 'exec 9>&-' EXIT
    # A trap on INT/TERM replaces the default action, so exit explicitly;
    # that in turn runs the EXIT trap above.
    trap 'exit 130' INT
    trap 'exit 143' TERM
}
|
||||
|
||||
# Make sure the Node Exporter directory exists.
# An existing directory is accepted as-is. A missing one is created only when
# running as root (uid 0) and is then handed to PROMETHEUS_USER.
# Globals:  NODE_EXPORTER_DIR, PROMETHEUS_USER (read)
# Returns:  0 on success; calls handle_error (which exits) when the directory
#           is missing or not writable after the creation attempt.
setup_directories() {
    # Nothing to do when the directory is already there.
    if [[ -d "$NODE_EXPORTER_DIR" ]]; then
        return 0
    fi

    if [[ "$(id -u)" == "0" ]]; then
        mkdir -p "$NODE_EXPORTER_DIR" || handle_error "Failed to create $NODE_EXPORTER_DIR"
        # Best effort: the prometheus user may not exist on this host.
        chown "$PROMETHEUS_USER:" "$NODE_EXPORTER_DIR" 2>/dev/null || true
    fi

    if [[ ! -w "$NODE_EXPORTER_DIR" ]]; then
        handle_error "$NODE_EXPORTER_DIR is not writable"
    fi

    # Explicit success: ending on a false `[[ ... ]] && cmd` test would return 1
    # and abort the caller under `set -e` right after a successful mkdir.
    return 0
}
|
||||
|
||||
# Report whether the socket listing shows a TCP connection involving
# SALT_MASTER_PORT.
# Outputs:  STATUS_SUCCESS, STATUS_FAILURE, or STATUS_NOT_FOUND (ss is not
#           available) on stdout. Diagnostics go to stderr only, because
#           callers capture stdout as the metric value.
check_salt_connection() {
    local ss_path sockets

    # Same convention as the other checks: a missing tool is NOT_FOUND.
    if ! ss_path=$(find_command ss /bin /usr/bin /usr/sbin 2>/dev/null); then
        echo "$STATUS_NOT_FOUND"
        return
    fi

    log_verbose "Checking for Salt connection on port $SALT_MASTER_PORT" >&2

    # ss flags: -n numeric addresses and ports, -t TCP sockets only.
    # Capture the listing before matching: with pipefail, `ss | grep -q` can
    # report failure when grep exits on the first match and ss is killed by
    # SIGPIPE while still writing.
    sockets=$("$ss_path" -nt 2>/dev/null) || sockets=""

    if grep -q "\b$SALT_MASTER_PORT\b" <<<"$sockets"; then
        log_verbose "Found active connection on port $SALT_MASTER_PORT" >&2
        echo "$STATUS_SUCCESS"
    else
        log_verbose "No active connection found on port $SALT_MASTER_PORT" >&2
        echo "$STATUS_FAILURE"
    fi
}
|
||||
|
||||
# Run `salt-call test.ping` and report whether it answered "True".
# Globals:  TIMEOUT_OVERRIDE (read, optional) - when non-empty and the
#           `timeout` command is available, the call is limited to that many
#           seconds, so the documented --timeout option takes effect.
# Outputs:  STATUS_SUCCESS, STATUS_FAILURE, or STATUS_NOT_FOUND (salt-call is
#           not available) on stdout.
check_salt_ping() {
    local salt_call_path

    if ! salt_call_path=$(find_command salt-call /bin /usr/bin /usr/sbin 2>/dev/null); then
        echo "$STATUS_NOT_FOUND"
        return
    fi

    # Build the command as an array so the optional timeout wrapper can be
    # prepended without any string re-parsing.
    local -a ping_cmd=("$salt_call_path" test.ping)
    if [[ -n "${TIMEOUT_OVERRIDE:-}" ]] && command -v timeout &>/dev/null; then
        ping_cmd=(timeout "$TIMEOUT_OVERRIDE" "${ping_cmd[@]}")
    fi

    # The command must succeed AND print True. Capturing first avoids a
    # pipefail/SIGPIPE false negative from `cmd | grep -q`.
    local reply
    if reply=$("${ping_cmd[@]}" 2>/dev/null) && grep -q '\bTrue\b' <<<"$reply"; then
        echo "$STATUS_SUCCESS"
    else
        echo "$STATUS_FAILURE"
    fi
}
|
||||
|
||||
# Ask systemctl whether the salt-minion unit is active.
# Outputs:  STATUS_SUCCESS when `systemctl is-active salt-minion` succeeds,
#           STATUS_FAILURE when it does not, STATUS_NOT_FOUND when systemctl
#           cannot be located.
check_salt_service() {
    local systemctl_bin verdict

    # Locate systemctl; without it the service state is unknown.
    systemctl_bin=$(find_command systemctl /bin /usr/bin /sbin /usr/sbin 2>/dev/null) || {
        echo "$STATUS_NOT_FOUND"
        return
    }

    # Assume failure and upgrade to success only on a clean is-active.
    verdict=$STATUS_FAILURE
    if "$systemctl_bin" is-active salt-minion &>/dev/null; then
        verdict=$STATUS_SUCCESS
    fi
    echo "$verdict"
}
|
||||
|
||||
# Report when Salt last answered: the current epoch time if
# `salt-call test.ping` succeeds right now, otherwise 0.
# Globals:  TIMEOUT_OVERRIDE (read, optional) - when non-empty and the
#           `timeout` command is available, the call is limited to that many
#           seconds.
# Outputs:  epoch seconds on success; "0" when salt-call is missing or the
#           ping does not print True.
check_salt_last_communication() {
    local salt_call_path

    if ! salt_call_path=$(find_command salt-call /bin /usr/bin /usr/sbin 2>/dev/null); then
        echo "0"
        return
    fi

    # Optional timeout wrapper, prepended to the command array.
    local -a ping_cmd=("$salt_call_path" test.ping)
    if [[ -n "${TIMEOUT_OVERRIDE:-}" ]] && command -v timeout &>/dev/null; then
        ping_cmd=(timeout "$TIMEOUT_OVERRIDE" "${ping_cmd[@]}")
    fi

    # Capture before matching to avoid a pipefail/SIGPIPE false negative.
    local reply
    if reply=$("${ping_cmd[@]}" 2>/dev/null) && grep -q '\bTrue\b' <<<"$reply"; then
        date +%s
    else
        echo "0"
    fi
}
|
||||
|
||||
# Print the Salt version as "<digits>.<digits>".
# Outputs:  the first such match in the output of `salt-call --version`
#           (for example 3006.1), or "0" when salt-call cannot be found or
#           the output contains no match.
get_salt_version() {
    local salt_bin

    # No salt-call means no version to report.
    salt_bin=$(find_command salt-call /bin /usr/bin /usr/sbin 2>/dev/null) || {
        echo "0"
        return
    }

    # Keep only the first "<digits>.<digits>" occurrence.
    local detected
    detected=$(
        "$salt_bin" --version 2>/dev/null \
            | grep -o '[0-9]\+\.[0-9]\+' \
            | head -1
    )
    echo "${detected:-0}"
}
|
||||
|
||||
# Print the combined resident set size, in bytes, of every process whose
# command name starts with "salt-minion".
# Outputs:  the RSS column of `ps -eo comm,rss` summed and multiplied by
#           1024; "0" when ps cannot be found or nothing matches.
get_salt_memory_usage() {
    local ps_bin

    # Without ps there is nothing to measure.
    ps_bin=$(find_command ps /bin /usr/bin 2>/dev/null) || {
        echo "0"
        return
    }

    # Sum the second column over the matching rows; "sum+0" makes awk print 0
    # when no row matched.
    local rss_total
    rss_total=$(
        "$ps_bin" -eo comm,rss \
            | grep -E '^salt-minion' \
            | awk '{sum+=$2} END {print sum+0}' 2>/dev/null
    )
    if [[ -z "$rss_total" ]]; then
        rss_total=0
    fi

    # The sum is treated as KiB, hence the factor of 1024.
    echo "$((rss_total * 1024))"
}
|
||||
|
||||
# Count the lines containing "[ERROR]" in the salt-minion log.
# The whole file is counted; no time window is applied.
# Arguments: $1 - log file to scan (optional, default /var/log/salt/minion)
# Outputs:   the number of matching lines; "0" when the file is missing or
#            unreadable.
count_salt_errors() {
    local log_file="${1:-/var/log/salt/minion}"

    if [[ ! -r "$log_file" ]]; then
        echo "0"
        return
    fi

    # -F matches the literal string "[ERROR]". grep -c still prints 0 when
    # nothing matches but exits 1, so neutralise the status to keep this
    # function safe to call under `set -e`.
    local error_count
    error_count=$(grep -cF '[ERROR]' "$log_file" 2>/dev/null) || true
    echo "${error_count:-0}"
}
|
||||
|
||||
# Print one metric as three lines: "# HELP", "# TYPE" and the sample.
# Arguments: $1 - metric name
#            $2 - sample value
#            $3 - help text
#            $4 - metric type (the callers pass gauge or counter)
output_metric() {
    local metric_name="$1"
    local metric_value="$2"
    local help_text="$3"
    local metric_type="$4"

    printf '# HELP %s %s\n' "$metric_name" "$help_text"
    printf '# TYPE %s %s\n' "$metric_name" "$metric_type"
    printf '%s %s\n' "$metric_name" "$metric_value"
}
|
||||
|
||||
# Collect every Salt status value and print the metrics to stdout.
# Globals:  DRY_RUN, NO_CRON, SCRIPT_PATH, NODE_EXPORTER_DIR (read)
# Behavior: outside dry-run mode it first takes the lock, checks the output
#           directory and (unless --no-cron) installs the cron job. In
#           dry-run mode those steps are skipped and banner lines are written
#           to stderr around the metrics.
main() {
    if [[ "$DRY_RUN" == "true" ]]; then
        echo "=== DRY RUN MODE - Metrics that would be written to $NODE_EXPORTER_DIR/salt_status.prom ===" >&2
    else
        # Prevent concurrent runs, then make sure the output directory exists.
        setup_lock
        setup_directories

        if [[ "$NO_CRON" == "true" ]]; then
            # Logging is best effort. A non-zero status from the logger (for
            # example when --quiet is set) must not abort the run under
            # `set -e` before any metric is printed.
            log_info "Skipping cron job installation (--no-cron specified)" || true
        elif [[ -f "$SCRIPT_PATH" ]]; then
            # Only install when the resolved script path is a real file.
            install_cron_job
        fi
    fi

    # Gather all values first, each from its own collector.
    local connection_status ping_status service_status last_comm version memory_usage error_count
    connection_status=$(check_salt_connection)
    ping_status=$(check_salt_ping)
    service_status=$(check_salt_service)
    last_comm=$(check_salt_last_communication)
    version=$(get_salt_version)
    memory_usage=$(get_salt_memory_usage)
    error_count=$(count_salt_errors)

    # Emit the metrics.
    output_metric "minion_connection_status" "$connection_status" \
        "Shows if Salt-Minion is connected to Salt-Master." "gauge"

    output_metric "minion_ping_status" "$ping_status" \
        "Shows if Salt-Minion is able to ping Salt-Master." "gauge"

    output_metric "minion_service_status" "$service_status" \
        "Shows if Salt-Minion service is active." "gauge"

    output_metric "minion_last_communication_timestamp" "$last_comm" \
        "Timestamp of last successful communication with Salt-Master." "gauge"

    output_metric "minion_version" "$version" \
        "Salt-Minion version number." "gauge"

    output_metric "minion_memory_usage_bytes" "$memory_usage" \
        "Salt-Minion process memory usage in bytes." "gauge"

    output_metric "minion_error_count" "$error_count" \
        "Number of error entries in Salt-Minion log file." "counter"

    if [[ "$DRY_RUN" == "true" ]]; then
        echo "=== END DRY RUN OUTPUT ===" >&2
    fi
}
|
||||
|
||||
# Execute main function with all script arguments
|
||||
# main does not read its positional parameters; every option it honors was
# already stored in globals (DRY_RUN, NO_CRON, ...) by the option parsing
# earlier in the script.
main "$@"
|
||||
Reference in New Issue
Block a user