#!/bin/bash
################################################################################
# Script Name: systemd-timer-exporter.sh
# Version: 1.1
# Description: Prometheus exporter for systemd timers providing comprehensive
#              metrics for monitoring timer status, scheduling, associated
#              service results, and health detection
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - systemd-based system with systemctl
#   - netcat (nc) for HTTP mode
#   - Standard Unix tools (grep, awk, sort)
#
# Usage:
#   # Output to stdout
#   ./systemd-timer-exporter.sh
#
#   # HTTP server mode
#   ./systemd-timer-exporter.sh --http -p 9199
#
#   # Textfile collector mode
#   ./systemd-timer-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - systemd_timer_up - Exporter status (1=up, 0=down)
#     - systemd_timer_exporter_info{version} - Exporter info
#
#   Timer Counts:
#     - systemd_timer_count_total - Total number of timers
#     - systemd_timer_count_by_state{state} - Count per state
#
#   Per-Timer Metrics:
#     - systemd_timer_active{timer} - 1 if active, 0 if not
#     - systemd_timer_next_trigger_timestamp{timer} - Next trigger unix timestamp
#     - systemd_timer_last_trigger_timestamp{timer} - Last trigger unix timestamp
#     - systemd_timer_seconds_until_next{timer} - Seconds until next trigger
#     - systemd_timer_seconds_since_last{timer} - Seconds since last trigger
#
#   Associated Service Metrics:
#     - systemd_timer_service_result{timer,service,result} - Service result
#     - systemd_timer_service_exit_code{timer,service} - Exit code of last run
#     - systemd_timer_service_duration_seconds{timer,service} - Last run duration
#
#   Health:
#     - systemd_timer_overdue_count - Timers past their next trigger time
#     - systemd_timer_failed_service_count - Associated services in failed state
#
#   Exporter:
#     - systemd_timer_exporter_duration_seconds - Script execution time
#     - systemd_timer_exporter_last_run_timestamp - Last run unix timestamp
#
# Configuration:
#   Default HTTP port: 9199
#   Textfile directory: /var/lib/node_exporter
#
# NOTE(review): this file was recovered from a copy in which every span between
# a '<' and the following '>' had been stripped (heredoc bodies, most of
# parse_args, the tail of generate_metrics, the HTML markup). Those regions are
# reconstructed from the header above and are individually marked
# "NOTE(review)"; confirm them against version control.
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Single source of truth for the version reported in metrics and on the
# landing page (the two previously disagreed: header 1.1 vs page v1.0).
EXPORTER_VERSION="1.1"
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9199

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print command-line help to stdout.
# NOTE(review): the original help text was lost; this wording and the
# -o/--output, --port and -h/--help spellings are reconstructed. Only --http,
# -p and --textfile are attested by the header usage examples.
show_usage() {
  cat <<EOF
Usage: ${0##*/} [OPTIONS]

Prometheus exporter for systemd timers.

Options:
  --http              Serve metrics over HTTP (requires netcat)
  -p, --port PORT     HTTP listen port (default: ${HTTP_PORT})
  --textfile          Write metrics to ${TEXTFILE_DIR}/systemd_timer.prom
  -o, --output FILE   Write metrics atomically to FILE
  -h, --help          Show this help and exit

With no options, metrics are printed to stdout.
EOF
}

# Parse command-line arguments into the configuration globals.
# Args: "$@" - script arguments
# Globals written: HTTP_MODE, HTTP_PORT, OUTPUT_FILE
# Exits 1 on an unknown option or a missing/invalid option value.
# NOTE(review): only the final "unknown option -> stderr, exit 1" arm survived
# in the recovered source; the other arms are reconstructed. The textfile name
# "systemd_timer.prom" in particular is an assumption.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p | --port)
        if [[ ! "${2:-}" =~ ^[0-9]+$ ]]; then
          echo "ERROR: $1 requires a numeric port" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      --textfile)
        OUTPUT_FILE="${TEXTFILE_DIR}/systemd_timer.prom"
        shift
        ;;
      -o | --output)
        if [ -z "${2:-}" ]; then
          echo "ERROR: $1 requires a file path" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      -h | --help)
        show_usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Verify systemctl is available and systemd is running
# Returns: 0 if systemd is available, 1 otherwise
check_systemd() {
  if ! command -v systemctl >/dev/null 2>&1; then
    return 1
  fi

  # is-system-running returns "degraded" (exit 1) when any unit has failed,
  # which is normal on most production servers. Only fail if systemd is
  # genuinely unavailable (offline, unknown, etc.)
  local state
  state=$(systemctl is-system-running 2>/dev/null || true)
  case "$state" in
    running | degraded | starting | stopping) return 0 ;;
    *) return 1 ;;
  esac
}

# Escape a string for use as a Prometheus label value.
# Backslash, double quote and newline must be escaped in the text format;
# systemd unit names routinely contain backslashes (e.g. "\x2d").
# Args: $1 - raw label value
# Returns: Escaped value on stdout (no trailing newline)
escape_label_value() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

# Convert a timestamp as printed by `systemctl show` to unix seconds.
# systemctl prints wall-clock timestamp properties as formatted dates rather
# than raw microseconds, so several shapes are accepted:
#   - all digits            -> treated as microseconds since the epoch
#   - "@<seconds>"          -> seconds since the epoch
#   - anything else         -> handed to `date -d` for parsing
# Args: $1 - raw property value
# Returns: Unix timestamp on stdout (0 if empty, unset or unparseable)
timestamp_to_epoch() {
  local raw="$1"
  local epoch

  case "$raw" in
    '' | 0 | n/a | infinity)
      echo "0"
      return
      ;;
  esac

  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # More than 18 digits cannot be a sane usec value and would overflow
    # bash's signed 64-bit arithmetic (UINT64_MAX means "never").
    if [ "${#raw}" -gt 18 ]; then
      echo "0"
    else
      echo "$((10#$raw / 1000000))"
    fi
  elif [[ "$raw" =~ ^@([0-9]+)$ ]]; then
    echo "${BASH_REMATCH[1]}"
  elif epoch=$(date -d "$raw" +%s 2>/dev/null) && [[ "$epoch" =~ ^[0-9]+$ ]]; then
    echo "$epoch"
  else
    echo "0"
  fi
}

# Get list of all timer unit names
# Returns: One timer unit name per line
get_timer_list() {
  # The column holding the unit name shifts with the NEXT/LEFT/LAST/PASSED
  # columns, so pick the first field that ends in ".timer".
  systemctl list-timers --all --no-pager --no-legend 2>/dev/null \
    | awk '{for(i=1;i<=NF;i++) if($i ~ /\.timer$/) {print $i; break}}'
}

# Get the state of a timer unit (active/inactive/failed)
# Args: $1 - timer unit name
# Returns: State string ("inactive" if systemctl prints nothing)
get_timer_state() {
  local timer="$1"
  local state
  state=$(systemctl show "$timer" --property=ActiveState --value 2>/dev/null)
  echo "${state:-inactive}"
}

# Get unix timestamp of next trigger for a timer
# Args: $1 - timer unit name
# Returns: Unix timestamp (0 if unavailable)
get_timer_next_trigger() {
  local timer="$1"
  local raw
  # TZ=UTC makes systemctl format the date with an unambiguous zone that
  # `date -d` can always parse.
  raw=$(TZ=UTC systemctl show "$timer" --property=NextElapseUSecRealtime --value 2>/dev/null)
  timestamp_to_epoch "$raw"
}

# Get unix timestamp of last trigger for a timer
# Args: $1 - timer unit name
# Returns: Unix timestamp (0 if unavailable)
get_timer_last_trigger() {
  local timer="$1"
  local raw
  raw=$(TZ=UTC systemctl show "$timer" --property=LastTriggerUSec --value 2>/dev/null)
  timestamp_to_epoch "$raw"
}

# Resolve the unit a timer activates.
# Uses the timer's Unit= property and falls back to "<name>.service" when
# systemctl reports nothing.
# Args: $1 - timer unit name
# Returns: Activated unit name
get_associated_service() {
  local timer="$1"
  local service
  service=$(systemctl show "$timer" --property=Unit --value 2>/dev/null)
  printf '%s\n' "${service:-${timer%.timer}.service}"
}

# Get the result of the associated service unit
# Args: $1 - timer unit name
#       $2 - (optional) already-resolved service unit, to avoid a lookup
# Returns: Result string (success/failed/exit-code/timeout)
get_associated_service_result() {
  local timer="$1"
  local service="${2:-}"
  local result
  [ -n "$service" ] || service=$(get_associated_service "$timer")
  result=$(systemctl show "$service" --property=Result --value 2>/dev/null)
  echo "${result:-success}"
}

# Get the exit code of the associated service unit
# Args: $1 - timer unit name
#       $2 - (optional) already-resolved service unit, to avoid a lookup
# Returns: Exit code integer (0 if unavailable or non-numeric)
get_associated_service_exit_code() {
  local timer="$1"
  local service="${2:-}"
  local code
  [ -n "$service" ] || service=$(get_associated_service "$timer")
  code=$(systemctl show "$service" --property=ExecMainStatus --value 2>/dev/null)
  if [[ "$code" =~ ^-?[0-9]+$ ]]; then
    echo "$code"
  else
    echo "0"
  fi
}

# Get the last execution duration of the associated service in seconds
# Args: $1 - timer unit name
#       $2 - (optional) already-resolved service unit, to avoid a lookup
# Returns: Duration in seconds (0 if unavailable)
get_associated_service_runtime() {
  local timer="$1"
  local service="${2:-}"
  local start_usec
  local exit_usec
  [ -n "$service" ] || service=$(get_associated_service "$timer")

  start_usec=$(systemctl show "$service" --property=ExecMainStartTimestampMonotonic --value 2>/dev/null)
  exit_usec=$(systemctl show "$service" --property=ExecMainExitTimestampMonotonic --value 2>/dev/null)

  if [[ ! "$start_usec" =~ ^[0-9]+$ ]] || [[ ! "$exit_usec" =~ ^[0-9]+$ ]] \
    || [ "$start_usec" = "0" ] || [ "$exit_usec" = "0" ]; then
    echo "0"
    return
  fi

  # An exit stamp older than the start stamp would yield a negative
  # duration; report 0 instead.
  if [ "$exit_usec" -lt "$start_usec" ]; then
    echo "0"
    return
  fi

  echo "$(((exit_usec - start_usec) / 1000000))"
}

# Get count of timers where NextElapseUSecRealtime is in the past
# Returns: Number of overdue timers
get_overdue_timers() {
  local now
  now=$(date +%s)
  local count=0
  local timer next_ts

  while IFS= read -r timer; do
    [ -z "$timer" ] && continue
    next_ts=$(get_timer_next_trigger "$timer")
    if [ "$next_ts" -gt 0 ] && [ "$next_ts" -lt "$now" ]; then
      count=$((count + 1))
    fi
  done < <(get_timer_list)

  echo "$count"
}

# Count timers by state (active, inactive, failed)
# Args: $1 - state to count
# Returns: Count of timers in that state
count_timers_by_state() {
  local target_state="$1"
  local count=0
  local timer state

  while IFS= read -r timer; do
    [ -z "$timer" ] && continue
    state=$(get_timer_state "$timer")
    if [ "$state" = "$target_state" ]; then
      count=$((count + 1))
    fi
  done < <(get_timer_list)

  echo "$count"
}

# Print one metric family: HELP line, TYPE line, then its samples.
# Args: $1 - metric name
#       $2 - help text
#       $3 - metric type
#       $4 - newline-terminated sample lines (may be empty)
emit_metric_family() {
  printf '# HELP %s %s\n# TYPE %s %s\n%s' "$1" "$2" "$1" "$3" "$4"
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Generate all Prometheus metrics
# Returns: Prometheus text format metrics on stdout
# NOTE(review): everything after the per-state counting was lost in the
# recovered source. Metric names and label sets follow the header; HELP
# strings, the "0 when unknown" conventions, and the definition of a "failed"
# service (Result != success) are reconstructed assumptions.
generate_metrics() {
  local script_start
  script_start=$(date +%s)

  # Check systemd availability
  if ! check_systemd; then
    emit_metric_family systemd_timer_up "Exporter status (1=up, 0=down)" gauge \
      $'systemd_timer_up 0\n'
    # NOTE(review): the original return status of this branch is unknown.
    return 0
  fi

  # List timers once and reuse the result everywhere below.
  local timer_cache total_timers
  timer_cache=$(get_timer_list)
  total_timers=$(grep -c '\.timer$' <<<"$timer_cache" 2>/dev/null)
  total_timers=${total_timers:-0}

  # Pre-compute states for all timers (avoids repeated systemctl calls)
  local -A timer_states=()
  local timer state
  while IFS= read -r timer; do
    [ -z "$timer" ] && continue
    state=$(get_timer_state "$timer")
    timer_states["$timer"]="$state"
  done <<<"$timer_cache"

  local active_count=0
  local inactive_count=0
  local failed_count=0
  for timer in "${!timer_states[@]}"; do
    case "${timer_states[$timer]}" in
      active) active_count=$((active_count + 1)) ;;
      inactive) inactive_count=$((inactive_count + 1)) ;;
      failed) failed_count=$((failed_count + 1)) ;;
    esac
  done

  # Samples are accumulated per family so each family is emitted as one
  # contiguous group, as the text exposition format expects.
  local s_active="" s_next="" s_last="" s_until="" s_since=""
  local s_result="" s_exit="" s_duration=""
  local overdue_count=0 failed_service_count=0
  local now
  now=$(date +%s)

  local labels svc_labels service is_active
  local next_ts last_ts seconds_until seconds_since
  local result exit_code duration
  while IFS= read -r timer; do
    [ -z "$timer" ] && continue
    labels="timer=\"$(escape_label_value "$timer")\""

    if [ "${timer_states[$timer]}" = "active" ]; then
      is_active=1
    else
      is_active=0
    fi

    next_ts=$(get_timer_next_trigger "$timer")
    last_ts=$(get_timer_last_trigger "$timer")

    seconds_until=0
    if [ "$next_ts" -gt 0 ]; then
      seconds_until=$((next_ts - now))
      if [ "$next_ts" -lt "$now" ]; then
        overdue_count=$((overdue_count + 1))
      fi
    fi

    seconds_since=0
    if [ "$last_ts" -gt 0 ]; then
      seconds_since=$((now - last_ts))
    fi

    service=$(get_associated_service "$timer")
    svc_labels="${labels},service=\"$(escape_label_value "$service")\""
    result=$(get_associated_service_result "$timer" "$service")
    exit_code=$(get_associated_service_exit_code "$timer" "$service")
    duration=$(get_associated_service_runtime "$timer" "$service")
    if [ "$result" != "success" ]; then
      failed_service_count=$((failed_service_count + 1))
    fi

    s_active+="systemd_timer_active{${labels}} ${is_active}"$'\n'
    s_next+="systemd_timer_next_trigger_timestamp{${labels}} ${next_ts}"$'\n'
    s_last+="systemd_timer_last_trigger_timestamp{${labels}} ${last_ts}"$'\n'
    s_until+="systemd_timer_seconds_until_next{${labels}} ${seconds_until}"$'\n'
    s_since+="systemd_timer_seconds_since_last{${labels}} ${seconds_since}"$'\n'
    s_result+="systemd_timer_service_result{${svc_labels},result=\"$(escape_label_value "$result")\"} 1"$'\n'
    s_exit+="systemd_timer_service_exit_code{${svc_labels}} ${exit_code}"$'\n'
    s_duration+="systemd_timer_service_duration_seconds{${svc_labels}} ${duration}"$'\n'
  done <<<"$timer_cache"

  local s_by_state
  s_by_state="systemd_timer_count_by_state{state=\"active\"} ${active_count}"$'\n'
  s_by_state+="systemd_timer_count_by_state{state=\"inactive\"} ${inactive_count}"$'\n'
  s_by_state+="systemd_timer_count_by_state{state=\"failed\"} ${failed_count}"$'\n'

  # Core status
  emit_metric_family systemd_timer_up "Exporter status (1=up, 0=down)" gauge \
    $'systemd_timer_up 1\n'
  emit_metric_family systemd_timer_exporter_info "Exporter info" gauge \
    "systemd_timer_exporter_info{version=\"${EXPORTER_VERSION}\"} 1"$'\n'

  # Timer counts
  emit_metric_family systemd_timer_count_total "Total number of timers" gauge \
    "systemd_timer_count_total ${total_timers}"$'\n'
  emit_metric_family systemd_timer_count_by_state "Count per state" gauge "$s_by_state"

  # Per-timer metrics
  emit_metric_family systemd_timer_active "1 if active, 0 if not" gauge "$s_active"
  emit_metric_family systemd_timer_next_trigger_timestamp \
    "Next trigger unix timestamp" gauge "$s_next"
  emit_metric_family systemd_timer_last_trigger_timestamp \
    "Last trigger unix timestamp" gauge "$s_last"
  emit_metric_family systemd_timer_seconds_until_next \
    "Seconds until next trigger" gauge "$s_until"
  emit_metric_family systemd_timer_seconds_since_last \
    "Seconds since last trigger" gauge "$s_since"

  # Associated service metrics
  emit_metric_family systemd_timer_service_result "Service result" gauge "$s_result"
  emit_metric_family systemd_timer_service_exit_code "Exit code of last run" gauge "$s_exit"
  emit_metric_family systemd_timer_service_duration_seconds \
    "Last run duration" gauge "$s_duration"

  # Health
  emit_metric_family systemd_timer_overdue_count \
    "Timers past their next trigger time" gauge \
    "systemd_timer_overdue_count ${overdue_count}"$'\n'
  emit_metric_family systemd_timer_failed_service_count \
    "Associated services in failed state" gauge \
    "systemd_timer_failed_service_count ${failed_service_count}"$'\n'

  # Exporter self-metrics
  local script_end
  script_end=$(date +%s)
  emit_metric_family systemd_timer_exporter_duration_seconds \
    "Script execution time" gauge \
    "systemd_timer_exporter_duration_seconds $((script_end - script_start))"$'\n'
  emit_metric_family systemd_timer_exporter_last_run_timestamp \
    "Last run unix timestamp" gauge \
    "systemd_timer_exporter_last_run_timestamp ${script_end}"$'\n'
}

# ============================================================================
# HTTP SERVER
# ============================================================================

# Write the HTTP response for one request line to stdout.
# Args: $1 - first line of the HTTP request (trailing CR already removed)
# NOTE(review): the landing-page markup was stripped in the recovered source;
# only its visible text survived, so the tags here are reconstructed.
http_respond() {
  local request="$1"

  # Check if request is for /metrics endpoint
  if [[ "$request" =~ ^GET\ /metrics ]]; then
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
    generate_metrics
  else
    # Serve HTML landing page for other requests
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
    cat <<EOF
<html>
<head><title>Systemd Timer Exporter v${EXPORTER_VERSION}</title></head>
<body>
<h1>Systemd Timer Exporter v${EXPORTER_VERSION}</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Metric Categories</h2>
<ul>
<li>Core Status: exporter up/down, version info</li>
<li>Timer Counts: total timers, count by state (active/inactive/failed)</li>
<li>Per-Timer Metrics: active state, next/last trigger timestamps, time until/since trigger</li>
<li>Associated Service Metrics: service result, exit code, execution duration</li>
<li>Health: overdue timers, failed service count</li>
<li>Exporter: script duration, last run timestamp</li>
</ul>
</body>
</html>
EOF
  fi
}

# Run a minimal single-connection-at-a-time HTTP server on HTTP_PORT.
# Blocks until killed. Exits 1 if netcat or the FIFO cannot be set up.
#
# The previous shape, `{ read request; ...; } | nc -l`, read the request from
# the script's own stdin rather than from the client, so "/metrics" could
# never be matched. Here nc's output feeds the handler and the handler's
# output is fed back to nc through a FIFO.
# NOTE(review): the startup message text is reconstructed (only its ">&2"
# survived).
run_http_server() {
  echo "Starting systemd timer exporter on port ${HTTP_PORT}" >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  # Global (not local) so the EXIT trap can still see it.
  HTTP_FIFO_DIR=$(mktemp -d) || {
    echo "ERROR: Failed to create temporary directory" >&2
    exit 1
  }
  local fifo="${HTTP_FIFO_DIR}/response"
  if ! mkfifo "$fifo"; then
    rm -rf -- "${HTTP_FIFO_DIR:?}"
    echo "ERROR: Failed to create FIFO" >&2
    exit 1
  fi
  trap 'rm -rf -- "${HTTP_FIFO_DIR:?}"' EXIT
  trap 'exit 0' INT TERM

  local request nc_status
  # Infinite loop accepting HTTP requests
  while true; do
    # -q 1: wait 1 second after EOF before closing
    nc -l -p "$HTTP_PORT" -q 1 <"$fifo" 2>/dev/null | {
      IFS= read -r request
      request=${request%$'\r'}
      http_respond "$request"
    } >"$fifo"
    nc_status=${PIPESTATUS[0]}

    # If nc could not listen (port busy, unsupported flags) avoid spinning.
    if [ "$nc_status" -ne 0 ]; then
      sleep 1
    fi
  done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode
# Args: "$@" - script arguments
main() {
  parse_args "$@"

  if [ "$HTTP_MODE" = true ]; then
    # Run HTTP server (blocks until killed)
    run_http_server
  elif [ -n "$OUTPUT_FILE" ]; then
    # Textfile collector mode: write atomically using temp file
    local output_dir
    output_dir="$(dirname "$OUTPUT_FILE")"
    if ! mkdir -p "$output_dir"; then
      echo "ERROR: Cannot create directory $output_dir" >&2
      exit 1
    fi

    # Create temp file in SAME directory for atomic rename (same filesystem)
    local temp_file
    if ! temp_file=$(mktemp "${output_dir}/.systemd_timer_metrics.XXXXXX"); then
      echo "ERROR: Cannot create temporary file in $output_dir" >&2
      exit 1
    fi

    # Generate metrics to temp file
    if ! generate_metrics >"$temp_file" 2>/dev/null; then
      rm -f -- "$temp_file"
      echo "ERROR: Failed to generate metrics" >&2
      exit 1
    fi

    # Validate: file must exist, have content, and contain enough metric lines
    local file_lines
    file_lines=$(wc -l <"$temp_file" 2>/dev/null || echo 0)
    if [ "$file_lines" -lt 10 ]; then
      rm -f -- "$temp_file"
      echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
      exit 1
    fi

    # Set permissions before move
    chmod 644 "$temp_file"

    # Atomic rename - no gap where file is missing
    if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
      rm -f -- "$temp_file"
      echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
      exit 1
    fi

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
  else
    # Default: output to stdout
    generate_metrics
  fi
}

# Execute main function with all script arguments
main "$@"