#!/bin/bash
################################################################################
# Script Name: backup-status-exporter.sh
# Version: 1.0
# Description: Prometheus textfile collector exporter for backup job status
#              Monitors backup age, size, and success/failure from multiple
#              sources including timestamp files, log files, and directories
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Date: 2026-03-03
#
# Prerequisites:
#   - node_exporter with textfile collector enabled
#   - /var/lib/node_exporter directory exists
#   - Config file at /etc/backup-status-exporter.conf
#
# Usage:
#   # Run with default config
#   sudo ./backup-status-exporter.sh
#
#   # Dry run (output to stdout)
#   ./backup-status-exporter.sh --dry-run
#
#   # Debug mode
#   DEBUG=1 sudo ./backup-status-exporter.sh
#
# Config Format (pipe-delimited, one job per line):
#   job_name|type|path|max_age_hours
#
# Types:
#   directory  - find newest file in directory, report mtime and size
#   statusfile - read unix timestamp of last success from a file
#   logfile    - grep for success/failure patterns in a log file
#
# Metrics Exported:
#   - linux_backup_last_success_timestamp{job} - Unix timestamp of last backup
#   - linux_backup_age_hours{job}              - Hours since last backup
#   - linux_backup_size_bytes{job}             - Size of last backup in bytes
#   - linux_backup_status{job}                 - 1=ok, 0=stale/failed
#
################################################################################

set -o pipefail

# ============================================================================
# CONFIGURATION
# ============================================================================

readonly VERSION="1.0"
readonly SCRIPT_NAME="${0##*/}"
readonly TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter}"
readonly OUTPUT_FILE="${TEXTFILE_DIR}/backup_status.prom"
readonly CONFIG_FILE="${CONFIG_FILE:-/etc/backup-status-exporter.conf}"
# Temp file lives next to the output so the final mv is a same-filesystem rename.
readonly TMP_FILE="${OUTPUT_FILE}.$$"

# Runtime flags
DRY_RUN=false
DEBUG=${DEBUG:-}

# Log patterns, matched case-insensitively (lines are lowercased before
# matching, so keep these lowercase).
readonly SUCCESS_PATTERNS="(completed successfully|backup successful|backup finished|success|completed without error)"
readonly FAILURE_PATTERNS="(failed|error|fatal|backup failed|aborted)"

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print a debug message to stderr when DEBUG is non-empty.
debug_echo() {
  if [[ -n "$DEBUG" ]]; then
    echo "[DEBUG] $*" >&2
  fi
}

# Print an error message to stderr.
log_error() {
  echo "[ERROR] $*" >&2
}

# Remove the temp file on any exit path.
cleanup() {
  rm -f "$TMP_FILE"
}
trap cleanup EXIT

# NOTE(review): the bodies of show_help, show_version, load_jobs and the first
# lines of check_directory were missing from the reviewed copy of this file.
# They are reconstructed from the header documentation and from how the rest of
# the script calls them — confirm against the upstream original.

# Print usage information to stdout.
show_help() {
  cat <<EOF
Usage: $SCRIPT_NAME [OPTIONS]

Prometheus textfile collector exporter for backup job status.

Options:
  --dry-run       Print metrics to stdout instead of writing $OUTPUT_FILE
  --debug         Print debug messages to stderr (same as DEBUG=1)
  --help, -h      Show this help and exit
  --version, -v   Show version and exit

Environment:
  TEXTFILE_DIR    Textfile collector directory (current: $TEXTFILE_DIR)
  CONFIG_FILE     Job configuration file (current: $CONFIG_FILE)
  DEBUG           Non-empty enables debug output

Config format (pipe-delimited, one job per line):
  job_name|type|path|max_age_hours

Types:
  directory   - find newest file in directory, report mtime and size
  statusfile  - read unix timestamp of last success from a file
  logfile     - grep for success/failure patterns in a log file
EOF
}

# Print the script name and version to stdout.
show_version() {
  echo "$SCRIPT_NAME version $VERSION"
}

# Print every job line of CONFIG_FILE to stdout, one per line.
# Blank lines and lines starting with '#' are skipped; surrounding whitespace
# and a trailing CR are stripped.
# Returns: 0 on success, 1 if the config file is missing or unreadable.
load_jobs() {
  if [[ ! -r "$CONFIG_FILE" ]]; then
    log_error "Config file not found or not readable: $CONFIG_FILE"
    return 1
  fi

  local line
  # '|| [[ -n ... ]]' keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [[ -z "$line" || "$line" == \#* ]]; then
      continue
    fi
    printf '%s\n' "$line"
  done < "$CONFIG_FILE"
  return 0
}

# Convert a number of seconds ($1) to hours with one decimal place.
# LC_ALL=C guarantees a '.' decimal separator, which Prometheus requires.
seconds_to_hours() {
  LC_ALL=C awk -v s="$1" 'BEGIN { printf "%.1f", s / 3600 }'
}

# Print 1 when age_seconds ($1) is within max_age_hours ($2), otherwise 0.
# Fails closed: a non-numeric limit yields 0. The 10# prefix stops values
# such as "08" from being parsed as (invalid) octal.
freshness_status() {
  local age_seconds="$1"
  local max_age_hours="$2"
  if [[ "$max_age_hours" =~ ^[0-9]+$ ]] \
    && (( age_seconds <= 10#$max_age_hours * 3600 )); then
    echo 1
  else
    echo 0
  fi
}

# Escape a string ($1) for use as a Prometheus label value: backslash and
# double quote are backslash-escaped.
escape_label_value() {
  local value="$1"
  local bs='\' dq='"'
  value=${value//"$bs"/"$bs$bs"}
  value=${value//"$dq"/"$bs$dq"}
  printf '%s' "$value"
}

# Print one metric family: HELP line, TYPE gauge line, then the samples.
# Arguments: $1 metric name, $2 help text, $3 newline-terminated sample lines.
emit_metric_family() {
  printf '# HELP %s %s\n# TYPE %s gauge\n%s' "$1" "$2" "$1" "$3"
}

# ============================================================================
# JOB CHECKS
#
# Each check prints "epoch|age_hours|size_bytes|status" on stdout and prints
# "0|0|0|0" when its source is missing or unusable.
# Arguments: $1 job name, $2 path, $3 max age in hours
# ============================================================================

# Report mtime and size of the newest regular file under a directory.
check_directory() {
  local job_name="$1"
  local path="$2"
  local max_age_hours="$3"

  if [[ ! -d "$path" ]]; then
    debug_echo "[$job_name] Directory not found: $path"
    echo "0|0|0|0"
    return
  fi

  # NOTE(review): the find invocation is reconstructed (GNU find -printf,
  # searching recursively); only "2>/dev/null | sort -rn | head -1" survived.
  local newest_file
  newest_file=$(find "$path" -type f -printf '%T@ %s %p\n' 2>/dev/null \
    | sort -rn | head -n 1)

  if [[ -z "$newest_file" ]]; then
    debug_echo "[$job_name] No files found in: $path"
    echo "0|0|0|0"
    return
  fi

  local file_epoch file_size file_path
  read -r file_epoch file_size file_path <<<"$newest_file"
  # Truncate the fractional mtime rather than rounding: rounding up could
  # place the file one second in the future and produce a negative age.
  file_epoch="${file_epoch%%.*}"

  local now
  now=$(date +%s)
  local age_seconds=$((now - file_epoch))
  local age_hours
  age_hours=$(seconds_to_hours "$age_seconds")
  local status
  status=$(freshness_status "$age_seconds" "$max_age_hours")

  debug_echo "[$job_name] Newest file: $file_path (age=${age_hours}h, size=${file_size}B, status=$status)"
  echo "${file_epoch}|${age_hours}|${file_size}|${status}"
}

# Report the unix timestamp stored on the first line of a status file.
check_statusfile() {
  local job_name="$1"
  local path="$2"
  local max_age_hours="$3"

  if [[ ! -f "$path" ]]; then
    debug_echo "[$job_name] Status file not found: $path"
    echo "0|0|0|0"
    return
  fi

  local timestamp
  timestamp=$(head -n 1 "$path" 2>/dev/null)
  timestamp="${timestamp//[[:space:]]/}"

  if [[ ! "$timestamp" =~ ^[0-9]+$ ]]; then
    debug_echo "[$job_name] Invalid timestamp in status file: $path"
    echo "0|0|0|0"
    return
  fi
  # Force base 10 so leading zeros are not interpreted as octal.
  timestamp=$((10#$timestamp))

  local now
  now=$(date +%s)
  local age_seconds=$((now - timestamp))
  local age_hours
  age_hours=$(seconds_to_hours "$age_seconds")

  # Status files don't have a meaningful size — report the size of the
  # status file itself.
  local file_size
  file_size=$(stat -c '%s' "$path" 2>/dev/null) || file_size=0

  local status
  status=$(freshness_status "$age_seconds" "$max_age_hours")

  debug_echo "[$job_name] Status timestamp: $timestamp (age=${age_hours}h, status=$status)"
  echo "${timestamp}|${age_hours}|${file_size}|${status}"
}

# Report status from a log file: ok when the last success line comes after
# the last failure line and the log's mtime is within the allowed age.
check_logfile() {
  local job_name="$1"
  local path="$2"
  local max_age_hours="$3"

  if [[ ! -f "$path" ]]; then
    debug_echo "[$job_name] Log file not found: $path"
    echo "0|0|0|0"
    return
  fi

  # Single pass over the log. Success phrases are removed from the line
  # before failure keywords are tested, so "completed without error" counts
  # as a success rather than matching "error". A line that still contains a
  # failure keyword is recorded as a failure.
  local line_numbers
  line_numbers=$(LC_ALL=C awk -v ok="$SUCCESS_PATTERNS" -v bad="$FAILURE_PATTERNS" '
    {
      line = tolower($0)
      hits = gsub(ok, "", line)
      if (line ~ bad)    last_bad = NR
      else if (hits > 0) last_ok = NR
    }
    END { printf "%d %d", last_ok, last_bad }
  ' "$path" 2>/dev/null) || line_numbers="0 0"

  local success_line failure_line
  read -r success_line failure_line <<<"$line_numbers"
  success_line=${success_line:-0}
  failure_line=${failure_line:-0}

  # Use the log file's mtime as the timestamp
  local file_epoch
  file_epoch=$(stat -c '%Y' "$path" 2>/dev/null) || file_epoch=0
  local file_size
  file_size=$(stat -c '%s' "$path" 2>/dev/null) || file_size=0

  local now
  now=$(date +%s)
  local age_seconds=$((now - file_epoch))
  local age_hours
  age_hours=$(seconds_to_hours "$age_seconds")

  local status=0
  if (( success_line > failure_line )); then
    status=$(freshness_status "$age_seconds" "$max_age_hours")
  fi

  if (( success_line == 0 && failure_line == 0 )); then
    debug_echo "[$job_name] No success or failure patterns found in: $path"
  fi

  debug_echo "[$job_name] Log file: $path (age=${age_hours}h, success_line=$success_line, failure_line=$failure_line, status=$status)"
  echo "${file_epoch}|${age_hours}|${file_size}|${status}"
}

# ============================================================================
# METRICS COLLECTION
# ============================================================================

# Run every configured job and print the metrics in Prometheus text format.
# Invalid config lines are reported on stderr and skipped.
# Returns: 1 if the job list cannot be loaded, 0 otherwise.
collect_metrics() {
  local job_lines
  job_lines=$(load_jobs) || return 1

  local timestamps="" ages="" sizes="" statuses=""
  local job_line job_name job_type job_path max_age_hours
  local result ts age size st label

  while IFS= read -r job_line; do
    [[ -n "$job_line" ]] || continue

    # The trailing '_' absorbs any extra fields so max_age_hours is field 4 only.
    IFS='|' read -r job_name job_type job_path max_age_hours _ <<<"$job_line"

    if [[ -z "$job_name" || -z "$job_type" || -z "$job_path" || -z "$max_age_hours" ]]; then
      log_error "Invalid config line: $job_line (expected: job_name|type|path|max_age_hours)"
      continue
    fi

    # max_age_hours feeds shell arithmetic, so it must be a plain integer.
    if [[ ! "$max_age_hours" =~ ^[0-9]+$ ]]; then
      log_error "Invalid max_age_hours '$max_age_hours' for job '$job_name' (expected a non-negative integer)"
      continue
    fi
    max_age_hours=$((10#$max_age_hours))

    case "$job_type" in
      directory)
        result=$(check_directory "$job_name" "$job_path" "$max_age_hours")
        ;;
      statusfile)
        result=$(check_statusfile "$job_name" "$job_path" "$max_age_hours")
        ;;
      logfile)
        result=$(check_logfile "$job_name" "$job_path" "$max_age_hours")
        ;;
      *)
        log_error "Unknown job type '$job_type' for job '$job_name' (expected: directory, statusfile, logfile)"
        continue
        ;;
    esac

    IFS='|' read -r ts age size st _ <<<"$result"
    label=$(escape_label_value "$job_name")

    timestamps+="linux_backup_last_success_timestamp{job=\"${label}\"} ${ts}"$'\n'
    ages+="linux_backup_age_hours{job=\"${label}\"} ${age}"$'\n'
    sizes+="linux_backup_size_bytes{job=\"${label}\"} ${size}"$'\n'
    statuses+="linux_backup_status{job=\"${label}\"} ${st}"$'\n'
  done <<<"$job_lines"

  # Samples are grouped per metric family, as the exposition format expects.
  emit_metric_family "linux_backup_last_success_timestamp" \
    "Unix timestamp of the last successful backup" "$timestamps"
  emit_metric_family "linux_backup_age_hours" \
    "Hours since the last successful backup" "$ages"
  emit_metric_family "linux_backup_size_bytes" \
    "Size of the last backup in bytes" "$sizes"
  emit_metric_family "linux_backup_status" \
    "Backup job status (1=ok, 0=stale or failed)" "$statuses"
}

# ============================================================================
# OUTPUT
# ============================================================================

# Collect metrics and either print them (dry run) or publish them by writing
# a temp file and renaming it over OUTPUT_FILE, so the collector never reads
# a half-written file.
# Returns: 0 on success, 1 on any failure.
write_metrics() {
  local metrics
  metrics=$(collect_metrics) || return 1

  if [[ "$DRY_RUN" == "true" ]]; then
    printf '%s\n' "$metrics"
    return 0
  fi

  if [[ ! -d "$TEXTFILE_DIR" ]]; then
    log_error "Textfile collector directory does not exist: $TEXTFILE_DIR"
    return 1
  fi

  if ! printf '%s\n' "$metrics" > "$TMP_FILE"; then
    log_error "Failed to write temporary file: $TMP_FILE"
    return 1
  fi

  if ! mv -f -- "$TMP_FILE" "$OUTPUT_FILE"; then
    log_error "Failed to move $TMP_FILE to $OUTPUT_FILE"
    return 1
  fi

  debug_echo "Metrics written to $OUTPUT_FILE"
}

# ============================================================================
# MAIN
# ============================================================================

# Parse command-line options and write the metrics.
# Returns the status that becomes the script's exit code.
main() {
  while (( $# > 0 )); do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        ;;
      --debug)
        DEBUG=1
        ;;
      --help|-h)
        show_help
        return 0
        ;;
      --version|-v)
        show_version
        return 0
        ;;
      *)
        log_error "Unknown option: $1"
        echo "Use --help for usage information" >&2
        return 1
        ;;
    esac
    shift
  done

  write_metrics
}

# main is the last command, so its return status is the script's exit status.
main "$@"