Add all 44 scripts, update CI: error severity baseline, PowerShell validation, multi-distro testing
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Executable
+452
@@ -0,0 +1,452 @@
|
||||
#!/bin/bash
|
||||
################################################################################
|
||||
# Script Name: backup-status-exporter.sh
|
||||
# Version: 1.0
|
||||
# Description: Prometheus textfile collector exporter for backup job status
|
||||
# Monitors backup age, size, and success/failure from multiple
|
||||
# sources including timestamp files, log files, and directories
|
||||
#
|
||||
# Author: Phil Connor
|
||||
# Contact: contact@mylinux.work
|
||||
# Website: https://mylinux.work
|
||||
# License: MIT
|
||||
# Date: 2026-03-03
|
||||
#
|
||||
# Prerequisites:
|
||||
# - node_exporter with textfile collector enabled
|
||||
# - /var/lib/node_exporter directory exists
|
||||
# - Config file at /etc/backup-status-exporter.conf
|
||||
#
|
||||
# Usage:
|
||||
# # Run with default config
|
||||
# sudo ./backup-status-exporter.sh
|
||||
#
|
||||
# # Dry run (output to stdout)
|
||||
# ./backup-status-exporter.sh --dry-run
|
||||
#
|
||||
# # Debug mode
|
||||
# DEBUG=1 sudo ./backup-status-exporter.sh
|
||||
#
|
||||
# Config Format (pipe-delimited, one job per line):
|
||||
# job_name|type|path|max_age_hours
|
||||
#
|
||||
# Types:
|
||||
# directory - find newest file in directory, report mtime and size
|
||||
# statusfile - read unix timestamp of last success from a file
|
||||
# logfile - grep for success/failure patterns in a log file
|
||||
#
|
||||
# Metrics Exported:
|
||||
# - linux_backup_last_success_timestamp{job} - Unix timestamp of last backup
|
||||
# - linux_backup_age_hours{job} - Hours since last backup
|
||||
# - linux_backup_size_bytes{job} - Size of last backup in bytes
|
||||
# - linux_backup_status{job} - 1=ok, 0=stale/failed
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# A pipeline reports failure when any of its stages fails.
set -o pipefail

# ============================================================================
# CONFIGURATION
# ============================================================================

# Script identity and file locations. TEXTFILE_DIR and CONFIG_FILE may be
# overridden from the environment; everything is frozen once computed.
VERSION="1.0"
SCRIPT_NAME="${0##*/}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter}"
OUTPUT_FILE="${TEXTFILE_DIR}/backup_status.prom"
CONFIG_FILE="${CONFIG_FILE:-/etc/backup-status-exporter.conf}"
# Temp file lives next to the output file and carries this shell's PID.
TMP_FILE="${OUTPUT_FILE}.$$"
readonly VERSION SCRIPT_NAME TEXTFILE_DIR OUTPUT_FILE CONFIG_FILE TMP_FILE

# Runtime flags (mutable: main() may change them from the command line)
DRY_RUN=false
DEBUG=${DEBUG:-}

# Extended regular expressions used by check_logfile (grep -i, so matching
# is case-insensitive)
SUCCESS_PATTERNS="(completed successfully|backup successful|backup finished|success|completed without error)"
FAILURE_PATTERNS="(failed|error|fatal|backup failed|aborted)"
readonly SUCCESS_PATTERNS FAILURE_PATTERNS
|
||||
|
||||
# ============================================================================
|
||||
# HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
# Write a "[DEBUG] ..." line to stderr when DEBUG is non-empty.
# Arguments: $* - message text
# Returns:   0 when debugging is switched off
debug_echo() {
    [[ -n "$DEBUG" ]] || return 0
    printf '[DEBUG] %s\n' "$*" >&2
}
|
||||
|
||||
# Write an "[ERROR] ..." line to stderr.
# Arguments: $* - message text
log_error() {
    printf '[ERROR] %s\n' "$*" >&2
}
|
||||
|
||||
# Remove the temporary metrics file; a missing file is not an error (rm -f).
# Globals: TMP_FILE (read)
cleanup() {
    rm -f "${TMP_FILE}"
}

# Run cleanup whenever the script exits.
trap 'cleanup' EXIT
|
||||
|
||||
# Print the usage text to stdout and exit with status 0.
# Globals: SCRIPT_NAME (read; expanded inside the here-document)
# The short options and the comment rule are documented to match what
# main() and load_jobs() actually accept.
show_help() {
    cat <<EOF
Usage: $SCRIPT_NAME [OPTIONS]

Prometheus textfile collector exporter for backup job status.
Monitors backup age, size, and success/failure from multiple sources.

OPTIONS:
    --dry-run        Output metrics to stdout instead of writing to file
    --debug          Enable debug output
    -h, --help       Show this help message
    -v, --version    Show version

CONFIGURATION:
    Jobs are configured in /etc/backup-status-exporter.conf (or set CONFIG_FILE).
    Each line defines a backup job in pipe-delimited format:

        job_name|type|path|max_age_hours

    Types:
        directory     Find the newest file in a directory, report mtime and size
        statusfile    Read a file containing a unix timestamp of last success
        logfile       Parse a log file for success/failure patterns

    Example config:
        daily_db|directory|/backups/db/|26
        rsync_home|statusfile|/var/log/rsync-home.status|26
        restic_full|logfile|/var/log/restic-backup.log|170

    Everything from a # to the end of the line is a comment, so paths must
    not contain #. Blank lines are ignored.

ENVIRONMENT VARIABLES:
    CONFIG_FILE     Path to config file (default: /etc/backup-status-exporter.conf)
    TEXTFILE_DIR    Textfile collector directory (default: /var/lib/node_exporter)
    DEBUG           Enable debug output when set to any value

EXAMPLES:
    sudo $SCRIPT_NAME
    $SCRIPT_NAME --dry-run
    DEBUG=1 sudo $SCRIPT_NAME
    CONFIG_FILE=/etc/my-backups.conf sudo $SCRIPT_NAME

EOF
    exit 0
}
|
||||
|
||||
# Print "<script> version <version>" to stdout and exit with status 0.
# Globals: SCRIPT_NAME, VERSION (read)
show_version() {
    printf '%s version %s\n' "$SCRIPT_NAME" "$VERSION"
    exit 0
}
|
||||
|
||||
# ============================================================================
|
||||
# JOB LOADING
|
||||
# ============================================================================
|
||||
|
||||
# Print every job definition from the config file, one per line, with
# comments and surrounding whitespace removed.
# Globals:  CONFIG_FILE (read)
# Outputs:  cleaned job lines on stdout; errors on stderr
# Exits:    1 when the config file is missing or contains no jobs
load_jobs() {
    if [[ ! -f "$CONFIG_FILE" ]]; then
        log_error "Config file not found: $CONFIG_FILE"
        exit 1
    fi

    local line
    local job_count=0

    # "|| [[ -n $line ]]" keeps the final line when the file does not end
    # with a newline (read returns non-zero but still fills the variable).
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Drop everything from the first '#', then trim leading and
        # trailing whitespace.
        line="${line%%#*}"
        line="${line#"${line%%[![:space:]]*}"}"
        line="${line%"${line##*[![:space:]]}"}"

        if [[ -z "$line" ]]; then
            continue
        fi

        # printf, not echo: a line such as "-n" must be printed verbatim.
        printf '%s\n' "$line"
        job_count=$((job_count + 1))
    done < "$CONFIG_FILE"

    if [[ "$job_count" -eq 0 ]]; then
        log_error "No jobs found in config file: $CONFIG_FILE"
        exit 1
    fi

    debug_echo "Loaded $job_count backup jobs from $CONFIG_FILE"
}
|
||||
|
||||
# ============================================================================
|
||||
# BACKUP CHECK FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
# Report on the newest regular file found (recursively) under a directory.
# Arguments: $1 - job name (used in debug output only)
#            $2 - directory to scan
#            $3 - maximum allowed age in whole hours
# Outputs:   "epoch|age_hours|size_bytes|status" on stdout, where status is
#            1 when the newest file is no older than $3 hours, else 0.
#            "0|0|0|0" when the directory is missing or holds no files.
# Requires GNU find (-printf).
check_directory() {
    local job_name="$1"
    local path="$2"
    local max_age_hours="$3"

    if [[ ! -d "$path" ]]; then
        debug_echo "[$job_name] Directory not found: $path"
        echo "0|0|0|0"
        return
    fi

    # One "mtime size path" record per file; newest first after the sort.
    # LC_ALL=C keeps the numeric sort independent of the user's locale.
    local newest_file
    newest_file=$(find "$path" -type f -printf '%T@ %s %p\n' 2>/dev/null \
        | LC_ALL=C sort -rn \
        | head -n 1)

    if [[ -z "$newest_file" ]]; then
        debug_echo "[$job_name] No files found in: $path"
        echo "0|0|0|0"
        return
    fi

    # Split on the first two spaces only, so the path keeps its own spacing.
    local mtime="${newest_file%% *}"
    local rest="${newest_file#* }"
    local file_size="${rest%% *}"
    local file_path="${rest#* }"

    # Truncate the fractional seconds (as stat %Y does) instead of rounding:
    # rounding up could place the file in the future and yield a negative age.
    local file_epoch="${mtime%%.*}"

    local now
    now=$(date +%s)
    local age_seconds=$((now - file_epoch))
    local age_hours
    age_hours=$(LC_ALL=C awk -v s="$age_seconds" 'BEGIN { printf "%.1f", s / 3600 }')

    local max_age_seconds=$((max_age_hours * 3600))
    local status=1
    if ((age_seconds > max_age_seconds)); then
        status=0
    fi

    debug_echo "[$job_name] Newest file: $file_path (age=${age_hours}h, size=${file_size}B, status=$status)"
    echo "${file_epoch}|${age_hours}|${file_size}|${status}"
}
|
||||
|
||||
# Report on a backup whose last success is recorded as a unix timestamp on
# the first line of a status file.
# Arguments: $1 - job name (used in debug output only)
#            $2 - path of the status file
#            $3 - maximum allowed age in whole hours
# Outputs:   "epoch|age_hours|size_bytes|status" on stdout, where status is
#            1 when the timestamp is no older than $3 hours, else 0.
#            "0|0|0|0" when the file is missing or holds no valid timestamp.
check_statusfile() {
    local job_name="$1"
    local path="$2"
    local max_age_hours="$3"

    if [[ ! -f "$path" ]]; then
        debug_echo "[$job_name] Status file not found: $path"
        echo "0|0|0|0"
        return
    fi

    local timestamp
    timestamp=$(head -n 1 -- "$path" 2>/dev/null)
    timestamp="${timestamp//[[:space:]]/}"

    # The regex also rejects an empty string.
    if ! [[ "$timestamp" =~ ^[0-9]+$ ]]; then
        debug_echo "[$job_name] Invalid timestamp in status file: $path"
        echo "0|0|0|0"
        return
    fi

    # Force base 10: a leading zero would otherwise make the shell read the
    # value as octal (an error for digits 8/9, a wrong number otherwise).
    timestamp=$((10#$timestamp))

    local now
    now=$(date +%s)
    local age_seconds=$((now - timestamp))
    local age_hours
    age_hours=$(LC_ALL=C awk -v s="$age_seconds" 'BEGIN { printf "%.1f", s / 3600 }')

    # Status files don't have a meaningful size — report file size of the status file itself
    local file_size
    file_size=$(stat -c '%s' -- "$path" 2>/dev/null) || file_size=0

    local max_age_seconds=$((max_age_hours * 3600))
    local status=1
    if ((age_seconds > max_age_seconds)); then
        status=0
    fi

    debug_echo "[$job_name] Status timestamp: $timestamp (age=${age_hours}h, status=$status)"
    echo "${timestamp}|${age_hours}|${file_size}|${status}"
}
|
||||
|
||||
# Report on a backup by scanning its log file for success/failure phrases.
# Arguments: $1 - job name (used in debug output only)
#            $2 - path of the log file
#            $3 - maximum allowed age of the log file in whole hours
# Globals:   SUCCESS_PATTERNS, FAILURE_PATTERNS (read; extended regexes)
# Outputs:   "mtime_epoch|age_hours|size_bytes|status" on stdout. status is
#            1 only when the last success line comes after the last failure
#            line and the log's mtime is no older than $3 hours.
#            "0|0|0|0" when the file is missing.
check_logfile() {
    local job_name="$1"
    local path="$2"
    local max_age_hours="$3"

    if [[ ! -f "$path" ]]; then
        debug_echo "[$job_name] Log file not found: $path"
        echo "0|0|0|0"
        return
    fi

    # grep -n prefixes each hit with "<line>:"; keep only the last hit.
    # "|| true": no match is a normal outcome, not an error.
    local last_failure last_success
    last_failure=$(grep -inE -e "$FAILURE_PATTERNS" -- "$path" 2>/dev/null | tail -n 1) || true
    last_success=$(grep -inE -e "$SUCCESS_PATTERNS" -- "$path" 2>/dev/null | tail -n 1) || true

    local failure_line=0
    local success_line=0
    if [[ -n "$last_failure" ]]; then
        failure_line="${last_failure%%:*}"
    fi
    if [[ -n "$last_success" ]]; then
        success_line="${last_success%%:*}"
    fi

    # A tie means one line matched both pattern sets, e.g. the success
    # phrase "completed without error" contains the failure word "error".
    # Such a line counts as a failure only if a failure pattern still
    # matches after the success phrases have been removed from it.
    # Case is folded with tolower() on both text and pattern, which assumes
    # the patterns contain no case-sensitive regex escapes.
    local tie_is_success=false
    if ((success_line > 0 && success_line == failure_line)); then
        if LC_ALL=C awk -v n="$success_line" \
            -v ok="$SUCCESS_PATTERNS" -v bad="$FAILURE_PATTERNS" '
            BEGIN { found = 1 }   # line not found: stay on the failure side
            NR == n {
                text = tolower($0)
                gsub(tolower(ok), "", text)
                found = (text ~ tolower(bad)) ? 1 : 0
                exit
            }
            END { exit found }
        ' "$path" 2>/dev/null; then
            tie_is_success=true
        fi
    fi

    # Use the log file's mtime as the timestamp; 0/0 when stat fails.
    local file_epoch=0
    local file_size=0
    local stat_out
    if stat_out=$(stat -c '%Y %s' -- "$path" 2>/dev/null); then
        file_epoch="${stat_out% *}"
        file_size="${stat_out#* }"
    fi

    local now
    now=$(date +%s)
    local age_seconds=$((now - file_epoch))
    local age_hours
    age_hours=$(LC_ALL=C awk -v s="$age_seconds" 'BEGIN { printf "%.1f", s / 3600 }')

    local max_age_seconds=$((max_age_hours * 3600))

    # Success needs a success line that is the most recent verdict, and a
    # log that is not stale.
    local status=0
    if ((age_seconds <= max_age_seconds)); then
        if ((success_line > failure_line)) || [[ "$tie_is_success" == "true" ]]; then
            status=1
        fi
    fi

    if ((success_line == 0 && failure_line == 0)); then
        debug_echo "[$job_name] No success or failure patterns found in: $path"
        status=0
    fi

    debug_echo "[$job_name] Log file: $path (age=${age_hours}h, success_line=$success_line, failure_line=$failure_line, status=$status)"
    echo "${file_epoch}|${age_hours}|${file_size}|${status}"
}
|
||||
|
||||
# ============================================================================
|
||||
# METRICS COLLECTION
|
||||
# ============================================================================
|
||||
|
||||
# Run the configured check for every job and print the result in the
# Prometheus text exposition format.
# Outputs:  HELP/TYPE headers and one sample per job for each of the four
#           linux_backup_* metrics on stdout; problems on stderr
# Returns:  1 when load_jobs yields no jobs, 0 otherwise
collect_metrics() {
    local jobs=()
    local job_line
    while IFS= read -r job_line; do
        jobs+=("$job_line")
    done < <(load_jobs)

    # load_jobs runs in a subshell here, so its "exit 1" cannot stop this
    # shell; an empty list is the only visible sign that it failed.
    if ((${#jobs[@]} == 0)); then
        return 1
    fi

    local nl=$'\n'
    local output=""
    local timestamps=""
    local ages=""
    local sizes=""
    local statuses=""
    local job_name job_type job_path max_age_hours _extra
    local result ts age size st label

    for job_line in "${jobs[@]}"; do
        IFS='|' read -r job_name job_type job_path max_age_hours _extra <<< "$job_line"

        if [[ -z "$job_name" ]] || [[ -z "$job_type" ]] || [[ -z "$job_path" ]] || [[ -z "$max_age_hours" ]]; then
            log_error "Invalid config line: $job_line (expected: job_name|type|path|max_age_hours)"
            continue
        fi

        # The check functions use this value in shell arithmetic, so accept
        # plain digits only and strip leading zeros (read as octal otherwise).
        if ! [[ "$max_age_hours" =~ ^[0-9]+$ ]]; then
            log_error "Invalid max_age_hours '$max_age_hours' for job '$job_name' (expected a whole number)"
            continue
        fi
        max_age_hours=$((10#$max_age_hours))

        result=""
        case "$job_type" in
            directory)
                result=$(check_directory "$job_name" "$job_path" "$max_age_hours")
                ;;
            statusfile)
                result=$(check_statusfile "$job_name" "$job_path" "$max_age_hours")
                ;;
            logfile)
                result=$(check_logfile "$job_name" "$job_path" "$max_age_hours")
                ;;
            *)
                log_error "Unknown job type '$job_type' for job '$job_name' (expected: directory, statusfile, logfile)"
                continue
                ;;
        esac

        IFS='|' read -r ts age size st <<< "$result"

        # Never emit a sample without a value.
        if [[ -z "$ts" ]] || [[ -z "$age" ]] || [[ -z "$size" ]] || [[ -z "$st" ]]; then
            log_error "Check for job '$job_name' returned an incomplete result: '$result'"
            continue
        fi

        # Escape backslash and double quote for use inside a label value.
        label=${job_name//\\/\\\\}
        label=${label//\"/\\\"}

        timestamps+="linux_backup_last_success_timestamp{job=\"${label}\"} ${ts}${nl}"
        ages+="linux_backup_age_hours{job=\"${label}\"} ${age}${nl}"
        sizes+="linux_backup_size_bytes{job=\"${label}\"} ${size}${nl}"
        statuses+="linux_backup_status{job=\"${label}\"} ${st}${nl}"
    done

    output+="# HELP linux_backup_last_success_timestamp Unix timestamp of the last successful backup${nl}"
    output+="# TYPE linux_backup_last_success_timestamp gauge${nl}"
    output+="$timestamps"
    output+="# HELP linux_backup_age_hours Hours since the last successful backup${nl}"
    output+="# TYPE linux_backup_age_hours gauge${nl}"
    output+="$ages"
    output+="# HELP linux_backup_size_bytes Size of the last backup in bytes${nl}"
    output+="# TYPE linux_backup_size_bytes gauge${nl}"
    output+="$sizes"
    output+="# HELP linux_backup_status Backup job status (1=ok, 0=stale or failed)${nl}"
    output+="# TYPE linux_backup_status gauge${nl}"
    output+="$statuses"

    # %s, not %b: the text holds real newlines already, and job names must
    # not have their backslashes interpreted.
    printf '%s' "$output"
}
|
||||
|
||||
# ============================================================================
|
||||
# OUTPUT
|
||||
# ============================================================================
|
||||
|
||||
# Collect the metrics and publish them: to stdout in dry-run mode, otherwise
# to OUTPUT_FILE via a temp file and rename so readers never see a partial
# file.
# Globals: DRY_RUN, TEXTFILE_DIR, TMP_FILE, OUTPUT_FILE (read)
# Exits:   1 when collection fails, the target directory is missing, or the
#          file cannot be written or moved into place
write_metrics() {
    local metrics
    if ! metrics=$(collect_metrics); then
        log_error "Failed to collect backup metrics"
        exit 1
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        printf '%s\n' "$metrics"
        return
    fi

    if [[ ! -d "$TEXTFILE_DIR" ]]; then
        log_error "Textfile collector directory does not exist: $TEXTFILE_DIR"
        exit 1
    fi

    if ! printf '%s\n' "$metrics" > "$TMP_FILE"; then
        log_error "Failed to write temporary file: $TMP_FILE"
        exit 1
    fi

    if ! mv -- "$TMP_FILE" "$OUTPUT_FILE"; then
        log_error "Failed to move $TMP_FILE to $OUTPUT_FILE"
        exit 1
    fi

    debug_echo "Metrics written to $OUTPUT_FILE"
}
|
||||
|
||||
# ============================================================================
|
||||
# MAIN
|
||||
# ============================================================================
|
||||
|
||||
# Entry point: apply the command-line options, then export the metrics.
# Arguments: "$@" - script arguments (--dry-run, --debug, --help|-h,
#            --version|-v)
# Exits:     1 on an unknown option; show_help/show_version exit on their own
main() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --dry-run)
                DRY_RUN=true
                ;;
            --debug)
                DEBUG=1
                ;;
            -h | --help)
                show_help
                ;;
            -v | --version)
                show_version
                ;;
            *)
                log_error "Unknown option: $arg"
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done

    write_metrics
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user