#!/bin/bash
#############################################################
####   YUM/DNF Package Updates Exporter for Prometheus   ####
####   Expose pending yum/dnf updates as Prometheus      ####
####   metrics for RHEL, Rocky, Alma, and CentOS servers ####
####                                                     ####
####   Author:  Phil Connor                              ####
####   Contact: contact@mylinux.work                     ####
####   License: MIT                                      ####
####   Version: 3.8                                      ####
####                                                     ####
####   Usage: ./yum-updates-exporter.sh                  ####
#############################################################

# NOTE(review): the original text of show_usage and the beginning of parse_args
# was lost from this file (only "cat <" and ">&2; exit 1 ;; esac done" survived).
# The two functions below are a minimal reconstruction: the option names are an
# assumption based on OUTPUT_FILE being consumed by the execution stanza at the
# end of the script. Confirm against the upstream script before relying on them.

# Print command-line help on stdout.
show_usage() {
    printf '%s\n' \
        "Usage: ${0##*/} [-o FILE] [-h]" \
        "" \
        "Expose pending yum/dnf updates as Prometheus metrics." \
        "" \
        "Options:" \
        "  -o, --output FILE  Write metrics to FILE instead of stdout" \
        "  -h, --help         Show this help and exit"
}

# Parse command-line options.
# Args: "$@" = the script's command-line arguments
# Globals: OUTPUT_FILE (written when -o/--output is given)
# Exits: 0 after printing help, 1 on an unknown option or a missing argument
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h | --help)
                show_usage
                exit 0
                ;;
            -o | --output)
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: $1 requires a file argument" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "ERROR: Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

set -o pipefail

#############################################
####       CONFIGURATION VARIABLES       ####
#############################################

# Every setting can be overridden from the environment; the value after ":-"
# is the default used when the variable is unset or empty.

# Cron Configuration
readonly CRONTAB_USER="${CRONTAB_USER:-root}"                                # Cron User (default: root)
readonly CRON_NAME="${CRON_NAME:-yum_updates.sh}"                            # Cron Job Name (default: yum_updates.sh)
readonly UPDATE_INTERVAL="${UPDATE_INTERVAL:-0 0 * * * }"                    # Cron Update Interval (daily - default: 0 0 * * * )

# File Paths
readonly CONFIG_FILE="${CONFIG_FILE:-/etc/yum/yum_updates.conf}"             # Config File Location
readonly EXCLUSION_FILE="${EXCLUSION_FILE:-/etc/yum/updates_exclude.conf}"   # Exclusion List Location
readonly LOCK_DIR="${LOCK_DIR:-/var/run}"                                    # Lock Directory
readonly LOG_FILE="${LOG_FILE:-/var/log/yum_updates.log}"                    # Log File Location
readonly TMP_DIR="${TMP_DIR:-/tmp}"                                          # Temporary Directory

# Prometheus Configuration
readonly NODE_EXPORTER_DIR="${NODE_EXPORTER_DIR:-/var/lib/node_exporter}"    # Node Exporter Directory
readonly PROMETHEUS_USER="${PROMETHEUS_USER:-prometheus}"                    # Prometheus User

# Script Configuration
# Assigned separately from "readonly" so a readlink failure is not masked.
SCRIPT_PATH="$(readlink -f "$0")"                                            # Script Path
readonly SCRIPT_PATH
readonly DEBUG="${DEBUG:-0}"                                                 # Debug Mode (0 Off/ 1 On)
readonly AUTO_UPDATE="${AUTO_UPDATE:-0}"                                     # Auto Update Switch (default: 0 - off)
readonly AUTO_UPDATE_DELAY_DAYS="${AUTO_UPDATE_DELAY_DAYS:-3}"               # Days to wait before auto-update (default: 3)
readonly USE_EXCLUSIONS="${USE_EXCLUSIONS:-0}"                               # Exclusion List Switch (default: 0 - off)

# Runtime Variables - set during script execution
LOCKFILE=""                      # Path to process lock file
TEMP_YUM_OUTPUT=""               # Temporary file for package manager output
SKIP_REBOOT_CHECK=0              # Flag to skip reboot checking if tools unavailable
LAST_UPDATE_FILE=""              # Path to file storing last update timestamp
LAST_UPDATED_PACKAGES_FILE=""    # Path to file storing list of last auto-updated packages

# Command Paths - populated by find_commands function
# Stores full paths to required system commands
declare -A COMMANDS

# Exclusion List - populated by load_exclusions function
# Contains package patterns to exclude from updates/metrics
declare -a EXCLUSIONS

#############################################
####          UTILITY FUNCTIONS          ####
#############################################

# Error handling function that reports an error on stderr and exits
# Args: $1 = error message, $2 = exit code (defaults to 1)
handle_error() {
    local err_msg="$1"
    local exit_code="${2:-1}"

    echo "ERROR: $err_msg" >&2
    exit "$exit_code"
}

# Logging function that writes to stderr and appends to the log file
# Args: $1 = log level (INFO, WARNING, ERROR, DEBUG), $2 = message
# DEBUG messages are only emitted when DEBUG=1
log() {
    local level="$1"
    local message="$2"
    local log_entry
    log_entry="[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $message"

    # Drop DEBUG entries unless debug mode is enabled
    if [[ "$level" == "DEBUG" && "$DEBUG" -ne 1 ]]; then
        return 0
    fi

    # stderr keeps log lines out of the metrics written to stdout
    printf '%s\n' "$log_entry" >&2

    # Best-effort append to the log file. The redirect to /dev/null is placed
    # on the group so that a failure to open the log file is silenced too.
    if [[ -n "$LOG_FILE" ]]; then
        { printf '%s\n' "$log_entry" >> "$LOG_FILE"; } 2>/dev/null || true
    fi
}

# Rotate log file if it exceeds 10MB to prevent unlimited growth
# Moves the current log to "<log>.old", replacing any previous .old file
rotate_log_if_needed() {
    local max_bytes=10485760
    local size

    [[ -f "$LOG_FILE" ]] || return 0

    size=$(stat -c%s "$LOG_FILE" 2>/dev/null || echo 0)
    if [[ "$size" -gt "$max_bytes" ]]; then
        mv "$LOG_FILE" "${LOG_FILE}.old" 2>/dev/null || true
    fi
}
# Find command in PATH or fallback locations and verify it's executable
# Args: $1 = command name, $2+ = fallback directory paths to search
# Outputs: full path to the executable on stdout
# Returns: 0 on success; otherwise calls handle_error. When this function runs
#          inside $(...), that exit only ends the substitution subshell, so the
#          caller must check the status of the assignment.
find_command() {
    local command_name="$1"
    local fallback_paths=("${@:2}")
    local path fallback

    # First try to find command in PATH
    path=$(command -v "$command_name" 2>/dev/null)

    if [[ -z "$path" ]]; then
        # If not found in PATH, search fallback directories
        for fallback in "${fallback_paths[@]}"; do
            if [[ -x "$fallback/$command_name" ]]; then
                echo "$fallback/$command_name"
                return 0
            fi
        done
        handle_error "Could not find '$command_name' executable"
    fi

    # Verify the found command is executable
    if [[ ! -x "$path" ]]; then
        handle_error "Found '$command_name' at '$path' but it's not executable"
    fi

    echo "$path"
}

#############################################
####      INITIALIZATION FUNCTIONS       ####
#############################################

# Source the optional configuration file if it exists.
# NOTE(review): the settings at the top of the script are declared readonly
# before this runs, so assignments to them inside the config file cannot take
# effect; only variables that are not readonly can be set from here.
load_config() {
    if [[ -f "$CONFIG_FILE" ]]; then
        # shellcheck disable=SC1090
        if ! source "$CONFIG_FILE"; then
            log "WARNING" "Failed to load configuration from $CONFIG_FILE"
        fi
    fi
}

# Load package exclusion patterns from EXCLUSION_FILE into EXCLUSIONS.
# Blank lines and lines whose first non-blank character is '#' are skipped;
# surrounding whitespace is trimmed from each pattern.
load_exclusions() {
    local line

    EXCLUSIONS=()

    if [[ "$USE_EXCLUSIONS" -eq 1 && -f "$EXCLUSION_FILE" ]]; then
        log "INFO" "Loading package exclusions from $EXCLUSION_FILE"

        # "|| [[ -n $line ]]" keeps a final line that has no trailing newline
        while IFS= read -r line || [[ -n "$line" ]]; do
            # Trim leading, then trailing, whitespace
            line="${line#"${line%%[![:space:]]*}"}"
            line="${line%"${line##*[![:space:]]}"}"

            [[ -z "$line" || "$line" == "#"* ]] && continue
            EXCLUSIONS+=("$line")
        done < "$EXCLUSION_FILE"

        log "INFO" "Loaded ${#EXCLUSIONS[@]} package exclusions"
    elif [[ "$USE_EXCLUSIONS" -eq 0 ]]; then
        log "INFO" "Package exclusions disabled"
    else
        log "INFO" "No exclusion file found at $EXCLUSION_FILE"
    fi
}

# Locate all required system commands and store their paths in COMMANDS.
# Exits the script when a required tool or the package manager is missing;
# a missing needs-restarting only disables the reboot check.
find_commands() {
    # Essential commands required for package parsing and metrics generation
    local required_commands=(
        "awk:/usr/bin"
        "sed:/usr/bin"
        "sort:/usr/bin"
        "uniq:/usr/bin"
        "xargs:/usr/bin"
    )
    local cmd_info cmd fallback

    # find_command reports its own error, but its exit happens inside the
    # command substitution; "|| exit" propagates the failure to this shell.
    for cmd_info in "${required_commands[@]}"; do
        cmd="${cmd_info%%:*}"
        fallback="${cmd_info#*:}"
        COMMANDS["$cmd"]=$(find_command "$cmd" "$fallback") || exit 1
    done

    # Detect and locate the package manager (yum is preferred over dnf)
    if command -v yum >/dev/null 2>&1; then
        COMMANDS["pkg_mgr"]=$(find_command yum /usr/bin) || exit 1
    elif command -v dnf >/dev/null 2>&1; then
        COMMANDS["pkg_mgr"]=$(find_command dnf /usr/bin) || exit 1
    else
        handle_error "Neither yum nor dnf found"
    fi

    # Optional: Find needs-restarting for reboot requirement detection
    if command -v needs-restarting >/dev/null 2>&1; then
        COMMANDS["needs_restarting"]=$(command -v needs-restarting)
    else
        log "WARNING" "needs-restarting not found. Install yum-utils/dnf-utils to enable reboot checks"
        SKIP_REBOOT_CHECK=1
    fi
}

# Validate user names and required settings; exits via handle_error on failure.
validate_environment() {
    # Validation patterns for environment variables (var_name:regex_pattern)
    local validations=(
        "CRONTAB_USER:^[a-z_][a-z0-9_-]*$"
        "PROMETHEUS_USER:^[a-z_][a-z0-9_-]*$"
    )
    local validation var pattern value

    for validation in "${validations[@]}"; do
        var="${validation%%:*}"      # Variable name (text before the first colon)
        pattern="${validation#*:}"   # Regex pattern (text after the first colon)
        value="${!var}"              # Variable value via indirect expansion

        if [[ -n "$value" && ! "$value" =~ $pattern ]]; then
            handle_error "Invalid format for $var: $value"
        fi
    done

    # Ensure required environment variables are set
    if [[ -z "$NODE_EXPORTER_DIR" || -z "$PROMETHEUS_USER" ]]; then
        handle_error "Required environment variables not set"
    fi

    # Verify TMP directory exists and is writable (the message promises both)
    if [[ ! -d "$TMP_DIR" || ! -w "$TMP_DIR" ]]; then
        handle_error "TMP directory does not exist or is not writable"
    fi

    return 0
}

# Acquire an exclusive, non-blocking lock so only one instance runs at a time.
# The lock file has a fixed name: a per-run unique name can never conflict and
# therefore provides no mutual exclusion. LOCKFILE is set only once the lock
# is actually held.
setup_locking() {
    local lock_path="$LOCK_DIR/yum_updates_exporter.lock"

    if [[ ! -d "$LOCK_DIR" || ! -w "$LOCK_DIR" ]]; then
        handle_error "Lock directory does not exist or is not writable"
    fi

    # Open file descriptor 9 on the lock file (append mode: never truncates)
    exec 9>>"$lock_path" || handle_error "Failed to create lock file"

    if ! flock -n 9; then
        exec 9>&-
        handle_error "Script is already running (unable to acquire lock)"
    fi

    LOCKFILE="$lock_path"
}

# Create and configure the node exporter directory with proper permissions
setup_directories() {
    # Create directory if it doesn't exist (only if running as root)
    if [[ ! -d "$NODE_EXPORTER_DIR" && "$(id -u)" = "0" ]]; then
        mkdir -p "$NODE_EXPORTER_DIR" || handle_error "Failed to create $NODE_EXPORTER_DIR"
        # Hand the directory to the prometheus user
        chown "${PROMETHEUS_USER}" "$NODE_EXPORTER_DIR" 2>/dev/null \
            || log "WARNING" "Failed to set ownership of $NODE_EXPORTER_DIR"
    fi

    # Verify directory is writable before proceeding
    if [[ ! -w "$NODE_EXPORTER_DIR" ]]; then
        handle_error "$NODE_EXPORTER_DIR is not writable"
    fi
}

# Install a cron job for CRONTAB_USER that runs this script periodically and
# redirects its metrics into the node exporter directory. Does nothing when
# the script path is already present in that user's crontab.
setup_cron() {
    local existing_cron cron_entry

    [[ -f "$SCRIPT_PATH" ]] || return 0

    # Inspect the same crontab that will be modified
    existing_cron=$(crontab -u "$CRONTAB_USER" -l 2>/dev/null || true)

    # Fixed-string match: the path must not be interpreted as a regex
    if printf '%s\n' "$existing_cron" | grep -qF -- "$SCRIPT_PATH"; then
        return 0
    fi

    # Cron entry redirects metrics to the prometheus file, logs to the log file
    cron_entry="$UPDATE_INTERVAL $SCRIPT_PATH > $NODE_EXPORTER_DIR/updates.prom 2>> $LOG_FILE"

    # printf keeps backslashes in existing entries intact (echo -e would not)
    if ! {
        if [[ -n "$existing_cron" ]]; then
            printf '%s\n' "$existing_cron"
        fi
        printf '%s\n' "$cron_entry"
    } | crontab -u "$CRONTAB_USER" -; then
        log "WARNING" "Failed to install cron job for user $CRONTAB_USER"
    fi
}

# Create temporary file for capturing yum command output
setup_temp_files() {
    TEMP_YUM_OUTPUT=$(mktemp -p "$TMP_DIR" yum_output.XXXXXX) \
        || handle_error "Failed to create temporary file"
}

# Set up update tracking files for the auto-update delay functionality.
# NOTE(review): these state files have fixed names under TMP_DIR; if TMP_DIR
# is a world-writable or periodically cleaned directory the state can be
# tampered with or lost — consider a dedicated state directory.
setup_update_tracking() {
    LAST_UPDATE_FILE="$TMP_DIR/yum_updates_last_check"
    LAST_UPDATED_PACKAGES_FILE="$TMP_DIR/yum_updates_last_packages"
}

# Release the lock and remove the temporary output file on script exit.
# The lock file itself is left in place: unlinking it while other instances
# may have it open would let two processes hold "the" lock at once.
cleanup() {
    if [[ -n "$LOCKFILE" ]]; then
        flock -u 9 2>/dev/null || true
        # Group the redirect: a bare "exec 9>&- 2>/dev/null" would also
        # redirect this shell's stderr permanently.
        { exec 9>&-; } 2>/dev/null || true
        LOCKFILE=""
    fi

    # Remove temporary yum output file
    if [[ -n "$TEMP_YUM_OUTPUT" ]]; then
        rm -f "$TEMP_YUM_OUTPUT" 2>/dev/null || true
    fi
    return 0
}

#############################################
####    METRICS COLLECTION FUNCTIONS     ####
#############################################

# Wait for matching processes to finish, polling with pgrep.
# Args: $1 = pgrep match flag (-x or -f), $2 = pgrep pattern,
#       $3 = label used in the "already running"/timeout messages,
#       $4 = label used in the "waiting" message,
#       $5 = timeout in seconds, $6 = poll interval in seconds
# Exits via handle_error if the process is still present after the timeout.
_wait_for_process_exit() {
    local match_flag="$1" pattern="$2" label="$3" wait_label="$4"
    local timeout="$5" interval="$6"
    local elapsed=0

    pgrep "$match_flag" "$pattern" >/dev/null 2>&1 || return 0
    log "WARNING" "$label is already running"

    while pgrep "$match_flag" "$pattern" >/dev/null 2>&1 && [[ $elapsed -lt $timeout ]]; do
        log "INFO" "Waiting for existing $wait_label to complete..."
        sleep "$interval"
        elapsed=$((elapsed + interval))
    done

    # If still running after timeout, abort
    if pgrep "$match_flag" "$pattern" >/dev/null 2>&1; then
        handle_error "$label is still running after ${timeout}s timeout"
    fi
    return 0
}

# Check if the package manager is already running any command (not just
# check-update); waits up to 10 minutes, polling every 30 seconds.
check_package_manager_running() {
    local pkg_mgr_name
    pkg_mgr_name=$(basename "${COMMANDS[pkg_mgr]}")

    _wait_for_process_exit -x "$pkg_mgr_name" \
        "$pkg_mgr_name" "$pkg_mgr_name process" 600 30
}

# Check if the package manager is already running check-update; waits up to
# 5 minutes, polling every 10 seconds.
check_yum_running() {
    local pkg_mgr_name
    pkg_mgr_name=$(basename "${COMMANDS[pkg_mgr]}")

    _wait_for_process_exit -f "${pkg_mgr_name}.*check-update" \
        "$pkg_mgr_name check-update" "$pkg_mgr_name check-update" 300 10
}

# Run "check-update" into TEMP_YUM_OUTPUT, retrying up to 3 times on timeout.
# Returns: 0 when the check ran (exit 0 = no updates, exit 100 = updates
#          available), 1 on any other failure or after the last timeout.
run_package_check() {
    local attempts=3
    local attempt=1
    local rc

    # Check if yum/dnf is already running before we start
    check_yum_running

    while [[ $attempt -le $attempts ]]; do
        rc=0
        # Output goes to the temp file, errors to the log file, so nothing
        # pollutes the metrics on stdout
        timeout 300 "${COMMANDS[pkg_mgr]}" -q check-update \
            > "$TEMP_YUM_OUTPUT" 2>> "$LOG_FILE" || rc=$?

        case "$rc" in
            0 | 100)
                return 0
                ;;
            124)
                log "WARNING" "Attempt $attempt: Package check timed out after 300 seconds"
                attempt=$((attempt + 1))
                # No point sleeping after the final attempt
                if [[ $attempt -le $attempts ]]; then
                    sleep 10
                fi
                ;;
            *)
                return 1
                ;;
        esac
    done

    return 1
}

# Remove rows whose package name matches an exclusion.
# Args: $1 = newline-separated rows starting with the package name
# Outputs: the remaining rows on stdout
# Each exclusion is used as a sed basic regular expression anchored at the
# start of the row and followed by '.', '-', whitespace, or end of line, so
# "kernel" matches kernel.x86_64 and kernel-core.x86_64.
apply_exclusions() {
    local input="$1"
    local result="$input"
    local exclusion filtered

    if [[ "$USE_EXCLUSIONS" -eq 1 && ${#EXCLUSIONS[@]} -gt 0 ]]; then
        for exclusion in "${EXCLUSIONS[@]}"; do
            # Only accept sed's output when sed succeeded: a pattern that sed
            # rejects must not wipe the whole list.
            if filtered=$(printf '%s\n' "$result" \
                | "${COMMANDS[sed]}" "/^${exclusion}\\([.-]\\|[[:space:]]\\|$\\)/d" 2>/dev/null); then
                result="$filtered"
            else
                log "WARNING" "Ignoring invalid exclusion pattern: $exclusion"
            fi
        done
    fi

    printf '%s\n' "$result"
}

# Query the package manager and print pending updates as "name version repo"
# rows (one per line). Everything from the "Obsoleting Packages" marker on is
# dropped; xargs -n3 re-joins entries that the package manager wrapped.
# Returns: non-zero when check-update fails (exit status other than 0 or 100).
_pending_update_rows() {
    local raw rc=0

    raw=$("${COMMANDS[pkg_mgr]}" -q check-update) || rc=$?
    if [[ $rc -ne 0 && $rc -ne 100 ]]; then
        return 1
    fi

    # A range delete (rather than "q") makes sed consume all of its input, so
    # the writer cannot be killed by SIGPIPE while pipefail is active.
    printf '%s\n' "$raw" \
        | "${COMMANDS[sed]}" '/Obsoleting Packages/,$d' \
        | "${COMMANDS[xargs]}" -n3
}

# Print node_upgrades_pending samples: number of pending updates per
# repository (third column of the update rows), after exclusions.
# Returns: 0 on success (a zero sample is printed when nothing is pending),
#          1 when the package manager query failed.
collect_upgrade_counts() {
    local rows output

    if ! rows=$(_pending_update_rows); then
        log "WARNING" "Package manager check-update failed"
        echo 'node_upgrades_pending{origin=""} 0'
        return 1
    fi

    # Reduce each row to its repository before counting
    output=$(apply_exclusions "$rows" \
        | "${COMMANDS[awk]}" 'NF >= 3 { print $3 }' \
        | "${COMMANDS[sort]}" \
        | "${COMMANDS[uniq]}" -c \
        | "${COMMANDS[awk]}" '{print "node_upgrades_pending{origin=\""$2"\"} "$1}')

    if [[ -z "$output" ]]; then
        log "INFO" "No upgrade count data found"
        echo 'node_upgrades_pending{origin=""} 0'
        return 0
    fi

    echo "$output"
}

# Print one node_upgrade_list sample per pending update (name, version, repo),
# after exclusions. Rows containing '@' are skipped and duplicates collapsed.
# Returns: 0 on success (a zero sample is printed when nothing is pending),
#          1 when the package manager query failed.
collect_upgrade_list() {
    local rows output

    if ! rows=$(_pending_update_rows); then
        log "WARNING" "Package manager check-update failed"
        echo 'node_upgrade_list{pkgname=""} 0'
        return 1
    fi

    output=$(apply_exclusions "$rows" \
        | "${COMMANDS[awk]}" 'NF >= 3 && !/@/ && !seen[$0]++ {print "node_upgrade_list{pkgname=\"" $1 "\", uvers=\"" $2 "\", repo=\"" $3 "\"}", 1}')

    if [[ -z "$output" ]]; then
        echo 'node_upgrade_list{pkgname=""} 0'
        return 0
    fi

    echo "$output"
}

# Print the node_reboot_required gauge based on "needs-restarting -r".
# Exit status 0 means no reboot is needed and 1 means a reboot is required;
# any other status (timeout, tool failure) is logged and no metric is printed
# rather than reporting a reboot that may not be needed.
check_reboot_required() {
    local rc=0 value

    if [[ "$SKIP_REBOOT_CHECK" -eq 1 || ! -x "${COMMANDS[needs_restarting]:-}" ]]; then
        return 0
    fi

    timeout 60 "${COMMANDS[needs_restarting]}" -r >/dev/null 2>&1 || rc=$?

    case "$rc" in
        0) value=0 ;;
        1) value=1 ;;
        *)
            log "WARNING" "needs-restarting failed (exit code $rc), skipping reboot metric"
            return 0
            ;;
    esac

    echo '# HELP node_reboot_required Boolean indicator (0/1) if system requires reboot after updates.'
    echo '# TYPE node_reboot_required gauge'
    echo "node_reboot_required $value"
}

# Output metrics for last auto-updated packages, read from
# LAST_UPDATED_PACKAGES_FILE ("timestamp:<epoch>" and "packages:<list>" lines).
output_last_updated_packages() {
    local timestamp packages

    echo '# HELP node_auto_updated_packages Information about packages that were last auto-updated.'
    echo '# TYPE node_auto_updated_packages gauge'

    if [[ ! -f "$LAST_UPDATED_PACKAGES_FILE" ]]; then
        echo 'node_auto_updated_packages{packages="none",timestamp="0"} 0'
        return 0
    fi

    timestamp=$(grep -m1 "^timestamp:" "$LAST_UPDATED_PACKAGES_FILE" 2>/dev/null | cut -d: -f2-)
    packages=$(grep -m1 "^packages:" "$LAST_UPDATED_PACKAGES_FILE" 2>/dev/null | cut -d: -f2-)

    # Defaults are applied on emptiness so they do not depend on pipefail
    timestamp="${timestamp:-0}"
    packages="${packages:-none}"

    # Escape backslashes, then double quotes, for the Prometheus label value
    packages=$(printf '%s\n' "$packages" | "${COMMANDS[sed]}" 's/\\/\\\\/g;s/"/\\"/g')

    # Output metric with package list and timestamp as labels
    echo "node_auto_updated_packages{packages=\"$packages\",timestamp=\"$timestamp\"} 1"
}

#############################################
####        AUTO-UPDATE FUNCTIONS        ####
#############################################

# Check if sufficient time has passed since the last auto-update.
# Returns: 0 when an update may proceed (no or unusable timestamp, or the
#          delay has elapsed), 1 while the delay is still active.
check_update_delay() {
    local last_update current_time delay_seconds time_diff remaining_hours

    if [[ ! -f "$LAST_UPDATE_FILE" ]]; then
        log "INFO" "No previous update timestamp found, proceeding with update"
        return 0
    fi

    last_update=$(cat "$LAST_UPDATE_FILE" 2>/dev/null)
    # Anything but a plain integer would break the arithmetic below
    if [[ ! "$last_update" =~ ^[0-9]+$ ]]; then
        log "WARNING" "Ignoring invalid update timestamp in $LAST_UPDATE_FILE"
        return 0
    fi

    current_time=$(date +%s)
    delay_seconds=$((AUTO_UPDATE_DELAY_DAYS * 86400))
    # 10# forces base 10 so leading zeros are not read as octal
    time_diff=$((current_time - 10#$last_update))

    if [[ $time_diff -lt $delay_seconds ]]; then
        remaining_hours=$(((delay_seconds - time_diff) / 3600))
        log "INFO" "Update delay active: $remaining_hours hours remaining before next auto-update"
        return 1
    fi

    return 0
}

# Record timestamp of successful update for delay tracking
record_update_timestamp() {
    date +%s > "$LAST_UPDATE_FILE" || log "WARNING" "Failed to record update timestamp"
}

# Record list of packages that were updated during auto-update
# Args: $1 = comma-separated package list
record_updated_packages() {
    local updated_packages="$1"
    local current_time
    current_time=$(date +%s)

    # Structured record: one "timestamp:" line and one "packages:" line
    {
        echo "timestamp:$current_time"
        echo "packages:$updated_packages"
    } > "$LAST_UPDATED_PACKAGES_FILE" || log "WARNING" "Failed to record updated packages list"
}

# Extract the packages listed in the "Updated:"/"Upgraded:" section of a
# package manager transcript.
# Args: $1 = transcript file
# Outputs: one comma-separated line. Packages may follow the header on the
#          same line or on indented lines below it; the section ends at the
#          first line that is not indented.
_extract_updated_packages() {
    "${COMMANDS[awk]}" '
        /^(Updated|Upgraded):/ {
            in_section = 1
            sub(/^(Updated|Upgraded):[ \t]*/, "")
            if (NF > 0) {
                $1 = $1    # rebuild the record: trims and squeezes whitespace
                list = list (list == "" ? "" : ",") $0
            }
            next
        }
        in_section && /^[ \t]+[^ \t]/ {
            $1 = $1
            list = list (list == "" ? "" : ",") $0
            next
        }
        { in_section = 0 }
        END { print list }
    ' "$1"
}

# Automatically update packages if AUTO_UPDATE is enabled, applying the same
# exclusions as metrics collection.
# Returns: 0 when disabled, delayed or successful; 1 without root privileges
#          or when no temp file can be created; otherwise the package
#          manager's (or timeout's) exit status.
perform_auto_update() {
    local exclusion update_log temp_update_output updated_packages
    local exit_code=0 reboot_rc=0

    if [[ "$AUTO_UPDATE" -ne 1 ]]; then
        log "DEBUG" "Auto-update disabled, skipping"
        return 0
    fi

    # Check if delay period has passed since last update
    if ! check_update_delay; then
        return 0
    fi

    log "INFO" "Starting automatic package updates"

    # Verify root privileges are available for package updates
    if [[ "$(id -u)" -ne 0 ]]; then
        log "ERROR" "Auto-update requires root privileges"
        return 1
    fi

    # Build update command with automatic confirmation
    local update_command=("${COMMANDS[pkg_mgr]}" "-y" "update")

    # Apply same exclusions used in metrics collection
    if [[ "$USE_EXCLUSIONS" -eq 1 && ${#EXCLUSIONS[@]} -gt 0 ]]; then
        log "INFO" "Applying exclusions to auto-update: ${EXCLUSIONS[*]}"
        for exclusion in "${EXCLUSIONS[@]}"; do
            update_command+=("--exclude=${exclusion}")
        done
    fi

    log "INFO" "Executing: ${update_command[*]}"

    # Separate log file for update output
    update_log="${LOG_FILE%.log}_autoupdate.log"

    # Unpredictable temp name: this runs as root, possibly in a shared dir
    if ! temp_update_output=$(mktemp -p "$TMP_DIR" update_output.XXXXXX); then
        log "ERROR" "Failed to create temporary file for update output"
        return 1
    fi

    # Execute update with 30-minute timeout to prevent hanging, capture output
    timeout 1800 "${update_command[@]}" > "$temp_update_output" 2>&1 || exit_code=$?

    # Keep the transcript on success and on failure
    { cat "$temp_update_output" >> "$update_log"; } 2>/dev/null \
        || log "WARNING" "Failed to append update output to $update_log"

    if [[ $exit_code -ne 0 ]]; then
        rm -f "$temp_update_output"
        if [[ $exit_code -eq 124 ]]; then
            log "ERROR" "Auto-update timed out after 30 minutes"
        else
            log "ERROR" "Auto-update failed with exit code $exit_code"
        fi
        return "$exit_code"
    fi

    log "INFO" "Auto-update completed successfully"

    updated_packages=$(_extract_updated_packages "$temp_update_output")
    rm -f "$temp_update_output"

    # Record successful update timestamp and the list of updated packages
    record_update_timestamp
    record_updated_packages "$updated_packages"

    # Check if system reboot is required after updates (exit 1 = required)
    if [[ "$SKIP_REBOOT_CHECK" -eq 0 && -x "${COMMANDS[needs_restarting]:-}" ]]; then
        timeout 60 "${COMMANDS[needs_restarting]}" -r >/dev/null 2>&1 || reboot_rc=$?
        if [[ $reboot_rc -eq 1 ]]; then
            log "WARNING" "System requires reboot after updates"
        fi
    fi

    return 0
}

#############################################
####         MAIN EXECUTION FLOW         ####
#############################################

# Print the pending-update metrics (counts per repository and detailed list),
# falling back to a zero sample when collection fails.
output_upgrade_metrics() {
    local upgrades upgradelist

    echo '# HELP node_upgrades_pending Count of pending package updates by repository origin.'
    echo '# TYPE node_upgrades_pending gauge'
    if upgrades=$(collect_upgrade_counts); then
        echo "$upgrades"
    else
        log "ERROR" "Failed to collect upgrade counts"
        echo 'node_upgrades_pending{origin=""} 0'
    fi

    echo '# HELP node_upgrade_list Detailed list of available package updates including version info.'
    echo '# TYPE node_upgrade_list gauge'
    if upgradelist=$(collect_upgrade_list); then
        echo "$upgradelist"
    else
        log "ERROR" "Failed to collect upgrade list"
        echo 'node_upgrade_list{pkgname=""} 0'
    fi
}

# Main execution function: prepares the environment, performs updates if
# enabled, and prints the metrics on stdout.
main() {
    # cleanup runs on every exit path; signals are turned into an exit so the
    # script stops instead of resuming after the handler returns.
    trap cleanup EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM
    trap 'exit 141' PIPE

    # Initialize script environment and configuration
    rotate_log_if_needed
    load_config
    find_commands

    # Check if package manager is already running before proceeding
    check_package_manager_running

    load_exclusions
    validate_environment
    setup_locking
    setup_directories
    setup_cron
    setup_temp_files
    setup_update_tracking

    log "INFO" "Starting package update metrics collection"

    # Execute automatic updates before collecting metrics (if enabled) so the
    # metrics reflect the post-update state
    if [[ "$AUTO_UPDATE" -eq 1 ]]; then
        perform_auto_update
    fi

    # Generate Prometheus metrics for package updates
    output_upgrade_metrics
    check_reboot_required
    output_last_updated_packages

    log "INFO" "Package update metrics collection completed"
}
#############################################
####          SCRIPT EXECUTION           ####
#############################################

# Run only when executed directly (not when sourced).
# With OUTPUT_FILE set, metrics are written to a temporary file in the same
# directory and renamed over OUTPUT_FILE only when generation succeeded and
# produced at least 5 lines, so a previous good file is never replaced by a
# partial one. Without OUTPUT_FILE, metrics go to stdout.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    if [[ -n "$OUTPUT_FILE" ]]; then
        output_dir="$(dirname "$OUTPUT_FILE")"
        if ! mkdir -p "$output_dir"; then
            echo "ERROR: Failed to create output directory $output_dir" >&2
            exit 1
        fi

        if ! temp_file=$(mktemp "${output_dir}/.yum_updates_metrics.XXXXXX"); then
            echo "ERROR: Failed to create temporary metrics file in $output_dir" >&2
            exit 1
        fi

        # main runs in a subshell: an "exit" inside it then ends only the
        # subshell, so the temporary file is always cleaned up below.
        if ! ( main "$@" ) > "$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        if [[ "$file_lines" -lt 5 ]]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        chmod 644 "$temp_file"
        if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
            rm -f "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi
        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        main "$@"
    fi
fi