#!/bin/bash
#############################################################
#### Cron Job Monitoring Exporter for Prometheus         ####
#### Tracks whether scheduled cron jobs ran successfully, ####
#### their exit codes, duration, and staleness           ####
####                                                     ####
#### Author: Phil Connor                                 ####
#### Contact: contact@mylinux.work                       ####
#### License: MIT                                        ####
#### Version: 1.0                                        ####
####                                                     ####
#### Usage: ./cron-job-exporter.sh [OPTIONS]             ####
#############################################################
#
# Monitors cron job execution by wrapping cron commands.
# Two modes of operation:
#   1. Wrapper mode:   wrap a cron command to record metrics
#   2. Collector mode: scan state files and write .prom output
#
# Metrics exported:
#   - cron_job_exit_code                    (last exit code)
#   - cron_job_duration_seconds             (last execution time)
#   - cron_job_last_run_timestamp           (unix timestamp of last run)
#   - cron_job_success                      (1 if last run exited 0, else 0)
#   - cron_job_runs_total                   (total number of runs)
#   - cron_job_stale                        (1 if last run is older than the threshold)
#   - cron_job_collector_last_run_timestamp (unix timestamp of last collector run)
#
# Requirements:
#   - Bash 4.0+
#   - node_exporter with textfile collector enabled
#

set -euo pipefail

#########################
###   Configuration   ###
#########################

NODE_DIR="${NODE_DIR:-/var/lib/node_exporter}"
STATE_DIR="${STATE_DIR:-/var/lib/cron-job-exporter}"
PROM_FILE="${NODE_DIR}/cron_jobs.prom"
STALE_THRESHOLD="${STALE_THRESHOLD:-86400}" # 24 hours
DEBUG="${DEBUG:-}"

# Path of the temp file currently being written, if any. Kept global so the
# EXIT trap can still see it after the function that created it has returned.
TMP_FILE=""

# Remove a half-written temp file on any exit path.
cleanup() {
  if [[ -n "$TMP_FILE" ]]; then
    rm -f -- "$TMP_FILE"
  fi
}
trap cleanup EXIT

#########################
###      Logging      ###
#########################

# Colour only when stderr is a terminal, so cron mail and log files do not
# fill up with raw escape sequences.
if [[ -t 2 ]]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  NC=$'\033[0m'
else
  RED=""
  GREEN=""
  YELLOW=""
  NC=""
fi

# All log helpers take the message as $1 and write it to stderr.
log_info() {
  printf '%s[INFO]%s %s\n' "$GREEN" "$NC" "$1" >&2
}

log_warn() {
  printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$1" >&2
}

log_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$1" >&2
}

# Prints only when DEBUG is non-empty. Written as an `if` on purpose: a
# `[[ ... ]] && echo` one-liner returns 1 when DEBUG is empty, and under
# `set -e` that aborts the script at the first call site.
log_debug() {
  if [[ -n "$DEBUG" ]]; then
    printf '[DEBUG] %s\n' "$1" >&2
  fi
}

#########################
###  Parse Arguments  ###
#########################

# Print usage and exit 0.
# NOTE(review): the first lines of this help text (title and usage synopsis)
# were lost from the source and have been reconstructed - confirm wording.
show_help() {
  cat <<EOF
Cron Job Monitoring Exporter for Prometheus

USAGE:
    Wrapper mode:   $0 --wrap --name <job_name> -- <command>
    Collector mode: $0 --collect

WRAPPER MODE (use in crontab):
    Wraps a cron command, records exit code, duration, and timestamp to a state file.
    Run --collect separately to generate .prom output.

    Example crontab:
        * * * * * /opt/cron-job-exporter.sh --wrap --name backup_db -- /opt/backup-db.sh
        0 * * * * /opt/cron-job-exporter.sh --wrap --name log_cleanup -- /opt/cleanup-logs.sh

COLLECTOR MODE (run on schedule or as oneshot):
    Reads all state files and writes a single .prom file for node_exporter.

    Example crontab:
        * * * * * /opt/cron-job-exporter.sh --collect

OPTIONS:
    --wrap                  Wrapper mode: run a command and record metrics
    --collect               Collector mode: generate .prom from state files
    --name NAME             Job name for wrapper mode (required with --wrap)
    --stale-threshold SEC   Seconds before a job is considered stale (default: 86400)
    --state-dir DIR         State file directory (default: /var/lib/cron-job-exporter)
    --help                  Show this help
EOF
  exit 0
}

MODE=""
JOB_NAME=""
JOB_CMD=()

# Abort with a clear message when an option that needs a value is the last
# argument (otherwise `$2` trips `set -u` with an "unbound variable" error).
#   $1 - option name, $2 - number of arguments remaining (option included)
require_value() {
  if (( $2 < 2 )); then
    log_error "Option $1 requires a value"
    exit 1
  fi
}

# Parse the command line into MODE, JOB_NAME, JOB_CMD, STALE_THRESHOLD and
# STATE_DIR. Everything after `--` becomes the wrapped command. Exits 1 on
# invalid usage.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --wrap)
        MODE="wrap"
        shift
        ;;
      --collect)
        MODE="collect"
        shift
        ;;
      --name)
        require_value "$1" "$#"
        JOB_NAME="$2"
        shift 2
        ;;
      --stale-threshold)
        require_value "$1" "$#"
        STALE_THRESHOLD="$2"
        shift 2
        ;;
      --state-dir)
        require_value "$1" "$#"
        STATE_DIR="$2"
        shift 2
        ;;
      --help)
        show_help
        ;;
      --)
        shift
        JOB_CMD=("$@")
        break
        ;;
      *)
        log_error "Unknown option: $1"
        exit 1
        ;;
    esac
  done

  if [[ -z "$MODE" ]]; then
    log_error "Must specify --wrap or --collect"
    echo "Run '$0 --help' for usage." >&2
    exit 1
  fi

  if [[ "$MODE" == "wrap" ]]; then
    if [[ -z "$JOB_NAME" ]]; then
      log_error "--name is required in wrapper mode"
      exit 1
    fi
    if (( ${#JOB_CMD[@]} == 0 )); then
      log_error "No command specified after --"
      exit 1
    fi
  fi
}

#########################
###      Sanitize     ###
#########################

# Turn an arbitrary job name into a safe identifier: lowercase, every
# character outside [a-z0-9_] replaced by "_", runs of "_" collapsed, and one
# leading / one trailing "_" removed. Prints the result on stdout; the result
# may be empty (e.g. for "!!!").
sanitize_name() {
  # Byte-wise matching so the a-z range means ASCII only in every locale.
  local LC_ALL=C
  local name="${1,,}"
  name="${name//[^a-z0-9_]/_}"
  while [[ "$name" == *__* ]]; do
    name="${name//__/_}"
  done
  name="${name#_}"
  name="${name%_}"
  printf '%s\n' "$name"
}

#########################
###   State helpers   ###
#########################

# Print the current time in nanoseconds since the epoch. Falls back to
# whole-second resolution where `date` does not support %N.
now_ns() {
  local stamp
  stamp=$(date +%s%N) || stamp=""
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$stamp"
  else
    printf '%s000000000\n' "$(date +%s)"
  fi
}

# Print the value of the first `key=` line in a state file (nothing if absent).
#   $1 - state file path, $2 - key
read_state_field() {
  local file="$1" key="$2" line
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == "$key="* ]]; then
      printf '%s\n' "${line#*=}"
      return 0
    fi
  done <"$file"
  return 0
}

# Atomically write a job's state file (temp file in the same directory, then
# rename). Returns non-zero on any failure; a leftover temp file is removed
# by the EXIT trap.
#   $1 state file, $2 name, $3 exit code, $4 duration, $5 timestamp, $6 runs
# NOTE(review): the original file body was lost from the source. The
# key=value layout is inferred from the surviving `grep '^runs='` reader and
# from the fields the collector emits - confirm against deployed state files.
write_state_file() {
  local state_file="$1" name="$2" exit_code="$3"
  local duration="$4" timestamp="$5" runs="$6"
  local success=0
  if (( exit_code == 0 )); then
    success=1
  fi

  mkdir -p -- "$STATE_DIR" || return 1
  TMP_FILE=$(mktemp "${state_file}.XXXXXX") || {
    TMP_FILE=""
    return 1
  }
  printf '%s=%s\n' \
    name "$name" \
    exit_code "$exit_code" \
    duration "$duration" \
    timestamp "$timestamp" \
    success "$success" \
    runs "$runs" >"$TMP_FILE" || return 1
  # mktemp creates 0600 files; the collector may run as a different user.
  chmod 0644 -- "$TMP_FILE" || return 1
  mv -f -- "$TMP_FILE" "$state_file" || return 1
  TMP_FILE=""
}

#########################
###    Wrapper Mode   ###
#########################

# Run JOB_CMD, time it, and record the outcome in
# ${STATE_DIR}/<sanitized name>.state. The wrapped command always runs, even
# when state cannot be recorded, and the script exits with the command's own
# exit code.
# NOTE(review): the tail of the original function was lost from the source;
# propagating the wrapped command's exit code is assumed - confirm.
run_wrapper() {
  local safe_name
  safe_name=$(sanitize_name "$JOB_NAME")
  if [[ -z "$safe_name" ]]; then
    # An empty name would create a hidden ".state" file that the collector's
    # "*.state" glob never matches.
    safe_name="unnamed"
    log_warn "Job name '$JOB_NAME' has no usable characters; recording as '$safe_name'"
  fi
  local state_file="${STATE_DIR}/${safe_name}.state"

  log_debug "Wrapping command: ${JOB_CMD[*]}"
  log_debug "Job name: $safe_name"

  # Run the command, capturing its exit code without tripping `set -e`.
  local start_ns end_ns exit_code=0
  start_ns=$(now_ns)
  "${JOB_CMD[@]}" || exit_code=$?
  end_ns=$(now_ns)

  # Seconds with millisecond precision, using shell arithmetic only.
  local elapsed_ns=$(( end_ns - start_ns ))
  if (( elapsed_ns < 0 )); then
    elapsed_ns=0 # wall clock stepped backwards during the run
  fi
  local duration
  printf -v duration '%d.%03d' \
    "$(( elapsed_ns / 1000000000 ))" \
    "$(( elapsed_ns % 1000000000 / 1000000 ))"

  # Previous run count; anything that is not a plain integer counts as 0.
  local runs=0 previous=""
  if [[ -r "$state_file" ]]; then
    previous=$(read_state_field "$state_file" runs) || previous=""
  fi
  if [[ "$previous" =~ ^[0-9]+$ ]]; then
    runs=$(( 10#$previous ))
  fi
  runs=$(( runs + 1 ))

  if write_state_file "$state_file" "$safe_name" "$exit_code" "$duration" \
    "$(( end_ns / 1000000000 ))" "$runs"; then
    log_debug "State written: $state_file (exit=$exit_code, duration=${duration}s, runs=$runs)"
  else
    log_warn "Could not record state for job '$safe_name' in $STATE_DIR"
  fi

  exit "$exit_code"
}

#########################
###   Collector Mode  ###
#########################

# Print one metric family: HELP line, TYPE line, then its samples.
#   $1 metric name, $2 type, $3 help text, $4 newline-terminated samples
emit_metric_family() {
  printf '# HELP %s %s\n# TYPE %s %s\n%s' "$1" "$3" "$1" "$2" "$4"
}

# Read every *.state file in STATE_DIR and atomically write PROM_FILE.
# Samples are grouped per metric family, as the text exposition format
# requires. Unreadable or malformed state files are skipped with a warning.
# NOTE(review): the head of the original function (HELP/TYPE text, state-file
# parsing, age computation) was lost from the source and is reconstructed.
run_collector() {
  local int_re='^[0-9]+$'
  local float_re='^[0-9]*[.]?[0-9]+$'
  local label_re='^[a-z0-9_]+$'

  if ! [[ "$STALE_THRESHOLD" =~ $int_re ]]; then
    log_error "Stale threshold must be a non-negative integer: $STALE_THRESHOLD"
    exit 1
  fi
  local threshold=$(( 10#$STALE_THRESHOLD ))

  mkdir -p -- "$NODE_DIR"

  local now
  now=$(date +%s)

  local found=0
  local exit_code_samples="" duration_samples="" timestamp_samples=""
  local success_samples="" runs_samples="" stale_samples=""
  local state_file line name exit_code duration timestamp runs success stale age
  local -A fields=()

  for state_file in "$STATE_DIR"/*.state; do
    # Also covers the no-match case, where the glob stays literal.
    [[ -f "$state_file" ]] || continue
    if [[ ! -r "$state_file" ]]; then
      log_warn "Skipping unreadable state file: $state_file"
      continue
    fi

    fields=()
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ "$line" == ?*=* ]]; then
        fields["${line%%=*}"]="${line#*=}"
      fi
    done <"$state_file"

    # The file name is the job identity: it is unique within the directory.
    name="${state_file##*/}"
    name="${name%.state}"
    if ! [[ "$name" =~ $label_re ]]; then
      name=$(sanitize_name "$name")
    fi

    exit_code="${fields[exit_code]:-}"
    duration="${fields[duration]:-}"
    timestamp="${fields[timestamp]:-}"
    runs="${fields[runs]:-}"

    if [[ -z "$name" ]] \
      || ! [[ "$exit_code" =~ $int_re ]] \
      || ! [[ "$timestamp" =~ $int_re ]] \
      || ! [[ "$runs" =~ $int_re ]] \
      || ! [[ "$duration" =~ $float_re ]]; then
      log_warn "Skipping malformed state file: $state_file"
      continue
    fi

    # 10# forces base 10 so a value like "08" is not parsed as octal.
    exit_code=$(( 10#$exit_code ))
    timestamp=$(( 10#$timestamp ))
    runs=$(( 10#$runs ))

    success=0
    if (( exit_code == 0 )); then
      success=1
    fi

    age=$(( now - timestamp ))
    stale=0
    if (( age > threshold )); then
      stale=1
    fi

    exit_code_samples+="cron_job_exit_code{job=\"${name}\"} ${exit_code}"$'\n'
    duration_samples+="cron_job_duration_seconds{job=\"${name}\"} ${duration}"$'\n'
    timestamp_samples+="cron_job_last_run_timestamp{job=\"${name}\"} ${timestamp}"$'\n'
    success_samples+="cron_job_success{job=\"${name}\"} ${success}"$'\n'
    runs_samples+="cron_job_runs_total{job=\"${name}\"} ${runs}"$'\n'
    stale_samples+="cron_job_stale{job=\"${name}\"} ${stale}"$'\n'

    found=$(( found + 1 ))
    log_debug "Collected: $name (exit=$exit_code, stale=$stale)"
  done

  if (( found == 0 )); then
    log_debug "No state files found in $STATE_DIR"
  fi

  # Atomic write: temp file in the target directory, then rename. The temp
  # name does not end in .prom, so node_exporter ignores it while it exists.
  TMP_FILE=$(mktemp "${PROM_FILE}.XXXXXX")
  {
    emit_metric_family cron_job_exit_code gauge \
      "Exit code of the last run" "$exit_code_samples"
    emit_metric_family cron_job_duration_seconds gauge \
      "Duration of the last run in seconds" "$duration_samples"
    emit_metric_family cron_job_last_run_timestamp gauge \
      "Unix timestamp of the last run" "$timestamp_samples"
    emit_metric_family cron_job_success gauge \
      "1 if the last run exited 0, else 0" "$success_samples"
    emit_metric_family cron_job_runs_total counter \
      "Total number of runs" "$runs_samples"
    emit_metric_family cron_job_stale gauge \
      "1 if the last run is older than the stale threshold, else 0" "$stale_samples"
    # Collector metadata
    emit_metric_family cron_job_collector_last_run_timestamp gauge \
      "Unix timestamp of last collector run" \
      "cron_job_collector_last_run_timestamp ${now}"$'\n'
  } >"$TMP_FILE"
  # mktemp creates 0600 files; node_exporter usually runs as another user.
  chmod 0644 -- "$TMP_FILE"
  mv -f -- "$TMP_FILE" "$PROM_FILE"
  TMP_FILE=""

  log_info "Metrics written to $PROM_FILE ($found jobs)"
}

#########################
###        Main       ###
#########################

# Entry point: parse the command line and dispatch on the selected mode.
main() {
  parse_args "$@"

  case "$MODE" in
    wrap) run_wrapper ;;
    collect) run_collector ;;
  esac
}

main "$@"