#!/usr/bin/env bash
#########################################################################################
# promstack-backup.sh — Backup/restore Prometheus, Grafana, Alertmanager,
# and Blackbox Exporter. Export dashboards, alert rules, datasources, configs,
# and TSDB snapshots for disaster recovery.
#
# Requires: bash 4+, curl, optionally jq for dashboard export
#
# Author:  Phil Connor
# Contact: contact@mylinux.work
# License: MIT
# Version 1.11
#
# Usage:
#   ./promstack-backup.sh --backup --output-dir ./backups
#
# See --help for all options.
#########################################################################################

set -euo pipefail

# ── Colors (pre-initialized) ─────────────────────────────────────────
# Empty by default so every helper can reference them safely under `set -u`
# even when setup_colors is never called or color is disabled.
RED=""
GREEN=""
YELLOW=""
BLUE=""
CYAN=""
BOLD=""
DIM=""
RESET=""

#######################################
# Enable ANSI colors unless disabled.
# Globals:   COLOR (read: never|always|auto), RED GREEN YELLOW BLUE CYAN
#            BOLD DIM RESET (written)
# Behavior:  COLOR=never leaves the variables empty; COLOR=always forces
#            colors; otherwise colors are enabled only if stdout is a TTY.
#######################################
setup_colors() {
  if [[ "${COLOR:-auto}" == "never" ]]; then
    return
  fi
  if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then
    # Store the real escape bytes (ANSI-C quoting) rather than the literal
    # text "\033…": the values then render correctly through echo, echo -e,
    # printf %s and printf %b alike.
    RED=$'\033[0;31m'
    GREEN=$'\033[0;32m'
    YELLOW=$'\033[0;33m'
    BLUE=$'\033[0;34m'
    CYAN=$'\033[0;36m'
    BOLD=$'\033[1m'
    DIM=$'\033[2m'
    RESET=$'\033[0m'
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Color variables are printed with %b, message text with %s, so backslashes
# inside a message (paths, URLs, API output) are shown literally instead of
# being interpreted as escape sequences.

# Print an informational message to stdout.
log() { printf '%b[INFO]%b %s\n' "$BLUE" "$RESET" "$*"; }

# Print a warning to stderr.
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$RESET" "$*" >&2; }

# Print an error to stderr.
err() { printf '%b[ERROR]%b %s\n' "$RED" "$RESET" "$*" >&2; }

# Print a debug message to stdout, only when VERBOSE is "true".
verbose() {
  if [[ "${VERBOSE:-false}" == "true" ]]; then
    printf '%b[DEBUG]%b %s\n' "$DIM" "$RESET" "$*"
  fi
}

# Print an error and terminate the script with status 1.
die() {
  err "$*"
  exit 1
}

# Print a section title ($1) surrounded by blank lines.
section_header() {
  printf '\n %b%b── %s ──%b\n\n' "$BOLD" "$CYAN" "$1" "$RESET"
}

# Print a "label value" row; the label ($1) is left-aligned in 22 columns and
# the value ($2) is printed verbatim.
field() {
  printf ' %b%-22s%b %s\n' "$BOLD" "$1" "$RESET" "$2"
}

# Same as field(), but escape sequences in the value ($2) are interpreted
# (%b), so callers can embed color codes in it.
field_color() {
  printf ' %b%-22s%b %b\n' "$BOLD" "$1" "$RESET" "$2"
}

#######################################
# Print the seconds elapsed since START_TIME as "<N>s".
# Globals:  START_TIME (read) — epoch seconds.
# Outputs:  "0s" when START_TIME is unset, empty or not a plain integer
#           (the timer was never started), instead of printing the raw
#           epoch or aborting under `set -u`.
#######################################
elapsed() {
  local end_time
  local started="${START_TIME:-}"
  end_time=$(date +%s)
  if [[ ! "$started" =~ ^[0-9]+$ ]]; then
    echo "0s"
    return
  fi
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  echo "$(( end_time - 10#$started ))s"
}

# ── Defaults ──────────────────────────────────────────────────────────
RUN_MODE=""
OUTPUT_DIR="${PSB_OUTPUT_DIR:-./monitoring-backups}"
RESTORE_DIR=""
GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}" GRAFANA_TOKEN="${GRAFANA_API_KEY:-}" GRAFANA_USER="${GRAFANA_USER:-admin}" GRAFANA_PASS="${GRAFANA_PASS:-}" PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9090}" ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}" AM_CONFIG_PATH="${AM_CONFIG_PATH:-/etc/alertmanager}" BLACKBOX_URL="${BLACKBOX_URL:-http://localhost:9115}" BLACKBOX_CONFIG_PATH="${BLACKBOX_CONFIG_PATH:-/etc/blackbox_exporter}" PROM_CONFIG_PATH="${PROM_CONFIG_PATH:-/etc/prometheus}" PROM_DATA_PATH="${PROM_DATA_PATH:-/var/lib/prometheus}" COMPONENTS="${PSB_COMPONENTS:-all}" VERBOSE="${VERBOSE:-false}" COLOR="${COLOR:-auto}" # ── State ───────────────────────────────────────────────────────────── SCRIPT_NAME="$(basename "$0")" readonly SCRIPT_NAME START_TIME="" BACKUP_ID="" BACKUP_COUNT=0 ERROR_COUNT=0 # ── API helpers ────────────────────────────────────────────────────── grafana_api() { local method="$1" endpoint="$2" shift 2 local -a auth_args=() if [[ -n "$GRAFANA_TOKEN" ]]; then auth_args+=(-H "Authorization: Bearer ${GRAFANA_TOKEN}") elif [[ -n "$GRAFANA_PASS" ]]; then auth_args+=(-u "${GRAFANA_USER}:${GRAFANA_PASS}") fi curl -sS -X "$method" "${auth_args[@]}" -H "Content-Type: application/json" \ "${GRAFANA_URL}${endpoint}" "$@" } backup_item() { local label="$1" outfile="$2" shift 2 verbose "Backing up: ${label}" if "$@" > "$outfile" 2>/dev/null && [[ -s "$outfile" ]]; then ((BACKUP_COUNT++)) || true echo -e " ${GREEN}✓${RESET} ${label}" else ((ERROR_COUNT++)) || true echo -e " ${YELLOW}⊘${RESET} ${label} ${DIM}(skipped)${RESET}" rm -f "$outfile" fi } component_selected() { [[ "$COMPONENTS" == "all" ]] || echo ",$COMPONENTS," | grep -qi ",$1," } # Find a config file: check primary path first, fall back to /etc/prometheus/ find_config() { local filename="$1" primary_dir="$2" if [[ -f "${primary_dir}/${filename}" ]]; then echo "${primary_dir}/${filename}" elif [[ -f "${PROM_CONFIG_PATH}/${filename}" ]]; then echo 
"${PROM_CONFIG_PATH}/${filename}" fi } # ══════════════════════════════════════════════════════════════════════ # BACKUP # ══════════════════════════════════════════════════════════════════════ backup_grafana() { local backup_dir="$1" local grafana_dir="${backup_dir}/grafana" mkdir -p "${grafana_dir}/dashboards" section_header "Grafana Backup" field "URL:" "$GRAFANA_URL" echo "" # Test connectivity if ! grafana_api GET "/api/health" > /dev/null 2>&1; then warn "Cannot reach Grafana at ${GRAFANA_URL} — skipping" ((ERROR_COUNT++)) || true return fi # Export datasources backup_item "Datasources" "${grafana_dir}/datasources.json" \ grafana_api GET "/api/datasources" # Export folders backup_item "Folders" "${grafana_dir}/folders.json" \ grafana_api GET "/api/folders" # Export dashboards local dashboard_list dashboard_list=$(grafana_api GET "/api/search?type=dash-db&limit=5000" 2>/dev/null || echo "[]") local dash_count=0 if command -v jq &>/dev/null; then local uids uids=$(echo "$dashboard_list" | jq -r '.[].uid // empty' 2>/dev/null || true) while IFS= read -r uid; do [[ -z "$uid" ]] && continue local dash_file="${grafana_dir}/dashboards/${uid}.json" if grafana_api GET "/api/dashboards/uid/${uid}" > "$dash_file" 2>/dev/null && [[ -s "$dash_file" ]]; then ((dash_count++)) || true ((BACKUP_COUNT++)) || true else rm -f "$dash_file" fi done <<< "$uids" echo -e " ${GREEN}✓${RESET} Dashboards (${dash_count} exported)" else backup_item "Dashboard list" "${grafana_dir}/dashboard-list.json" \ echo "$dashboard_list" fi # Export alert rules backup_item "Alert rules" "${grafana_dir}/alert-rules.json" \ grafana_api GET "/api/v1/provisioning/alert-rules" # Export notification policies backup_item "Notification policies" "${grafana_dir}/notification-policies.json" \ grafana_api GET "/api/v1/provisioning/policies" } backup_prometheus() { local backup_dir="$1" local prom_dir="${backup_dir}/prometheus" mkdir -p "${prom_dir}/rules" section_header "Prometheus Backup" field "Config path:" 
"$PROM_CONFIG_PATH" field "API:" "$PROMETHEUS_URL" echo "" # Copy prometheus.yml if [[ -f "${PROM_CONFIG_PATH}/prometheus.yml" ]]; then backup_item "prometheus.yml" "${prom_dir}/prometheus.yml" \ cp "${PROM_CONFIG_PATH}/prometheus.yml" "${prom_dir}/prometheus.yml" # Fix: backup_item ran cp redirected to file, but cp doesn't output to stdout # Re-copy properly if [[ -f "${PROM_CONFIG_PATH}/prometheus.yml" ]]; then cp "${PROM_CONFIG_PATH}/prometheus.yml" "${prom_dir}/prometheus.yml" 2>/dev/null || true fi else warn "prometheus.yml not found at ${PROM_CONFIG_PATH}" ((ERROR_COUNT++)) || true fi # Copy alert/recording rules local rules_count=0 if [[ -d "${PROM_CONFIG_PATH}/rules" ]]; then while IFS= read -r -d '' rf; do cp "$rf" "${prom_dir}/rules/" 2>/dev/null && ((rules_count++)) || true done < <(find "${PROM_CONFIG_PATH}/rules" \( -name '*.yml' -o -name '*.yaml' \) -print0 2>/dev/null | sort -z) elif [[ -d "${PROM_CONFIG_PATH}/rules.d" ]]; then while IFS= read -r -d '' rf; do cp "$rf" "${prom_dir}/rules/" 2>/dev/null && ((rules_count++)) || true done < <(find "${PROM_CONFIG_PATH}/rules.d" \( -name '*.yml' -o -name '*.yaml' \) -print0 2>/dev/null | sort -z) fi if [[ "$rules_count" -gt 0 ]]; then echo -e " ${GREEN}✓${RESET} Alert/recording rules (${rules_count} files)" ((BACKUP_COUNT++)) || true fi # Export current alerts via API backup_item "Active alerts" "${prom_dir}/active-alerts.json" \ curl -sS "${PROMETHEUS_URL}/api/v1/alerts" # Export targets backup_item "Scrape targets" "${prom_dir}/targets.json" \ curl -sS "${PROMETHEUS_URL}/api/v1/targets" # Export rule groups via API backup_item "Rule groups (API)" "${prom_dir}/rule-groups.json" \ curl -sS "${PROMETHEUS_URL}/api/v1/rules" } backup_alertmanager() { local backup_dir="$1" local am_dir="${backup_dir}/alertmanager" mkdir -p "$am_dir" section_header "Alertmanager Backup" field "API:" "$ALERTMANAGER_URL" echo "" # Get status (includes config) backup_item "Config (via API)" "${am_dir}/status.json" \ curl -sS 
"${ALERTMANAGER_URL}/api/v2/status" # Export silences backup_item "Silences" "${am_dir}/silences.json" \ curl -sS "${ALERTMANAGER_URL}/api/v2/silences" # Export active alerts backup_item "Active alerts" "${am_dir}/alerts.json" \ curl -sS "${ALERTMANAGER_URL}/api/v2/alerts" # Copy config file — check canonical path, then /etc/prometheus/ local am_conf am_conf=$(find_config "alertmanager.yml" "$AM_CONFIG_PATH") if [[ -n "$am_conf" ]]; then verbose "Found alertmanager.yml at ${am_conf}" backup_item "alertmanager.yml" "${am_dir}/alertmanager.yml" \ cat "$am_conf" fi # Copy templates local tmpl_dir="" local tmpl_count=0 if [[ -d "${AM_CONFIG_PATH}/templates" ]]; then tmpl_dir="${AM_CONFIG_PATH}/templates" elif [[ -d "${PROM_CONFIG_PATH}/templates" ]]; then tmpl_dir="${PROM_CONFIG_PATH}/templates" fi if [[ -n "$tmpl_dir" ]]; then mkdir -p "${am_dir}/templates" while IFS= read -r -d '' tf; do cp "$tf" "${am_dir}/templates/" 2>/dev/null && ((tmpl_count++)) || true done < <(find "$tmpl_dir" -type f -print0 2>/dev/null | sort -z) if [[ "$tmpl_count" -gt 0 ]]; then echo -e " ${GREEN}✓${RESET} Templates (${tmpl_count} files)" ((BACKUP_COUNT++)) || true fi fi } backup_blackbox() { local backup_dir="$1" local bb_dir="${backup_dir}/blackbox" mkdir -p "$bb_dir" section_header "Blackbox Exporter Backup" field "Config path:" "$BLACKBOX_CONFIG_PATH" field "API:" "$BLACKBOX_URL" echo "" # Test connectivity if ! 
curl -sS "${BLACKBOX_URL}/-/healthy" > /dev/null 2>&1; then warn "Cannot reach Blackbox Exporter at ${BLACKBOX_URL} — skipping" ((ERROR_COUNT++)) || true return fi # Copy blackbox.yml — check canonical path, then /etc/prometheus/ local bb_conf bb_conf=$(find_config "blackbox.yml" "$BLACKBOX_CONFIG_PATH") if [[ -z "$bb_conf" ]]; then bb_conf=$(find_config "config.yml" "$BLACKBOX_CONFIG_PATH") fi if [[ -n "$bb_conf" ]]; then local bb_name bb_name=$(basename "$bb_conf") verbose "Found ${bb_name} at ${bb_conf}" backup_item "${bb_name}" "${bb_dir}/${bb_name}" \ cat "$bb_conf" else warn "No blackbox config found at ${BLACKBOX_CONFIG_PATH} or ${PROM_CONFIG_PATH}" ((ERROR_COUNT++)) || true fi # Export probe config via API backup_item "Config (via API)" "${bb_dir}/config-api.json" \ curl -sS "${BLACKBOX_URL}/config" } do_backup() { BACKUP_ID="$(date +%Y%m%d-%H%M%S)" local backup_dir="${OUTPUT_DIR}/${BACKUP_ID}" mkdir -p "$backup_dir" log "Starting monitoring backup..." field "Backup ID:" "$BACKUP_ID" field "Output:" "$backup_dir" field "Components:" "$COMPONENTS" if component_selected "grafana"; then backup_grafana "$backup_dir" fi if component_selected "prometheus"; then backup_prometheus "$backup_dir" fi if component_selected "alertmanager"; then backup_alertmanager "$backup_dir" fi if component_selected "blackbox"; then backup_blackbox "$backup_dir" fi # Create manifest section_header "Finalizing" local total_size total_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}' || echo "unknown") local file_count file_count=$(find "$backup_dir" -type f 2>/dev/null | wc -l || echo 0) cat > "${backup_dir}/manifest.json" </dev/null \ | sort -z \ | xargs -0 sha256sum 2>/dev/null > "${backup_dir}/checksums.sha256" || true echo -e " ${GREEN}✓${RESET} Manifest and checksums created" section_header "Backup Summary" field "Backup ID:" "$BACKUP_ID" field "Location:" "$backup_dir" field_color "Files backed up:" "${GREEN}${BACKUP_COUNT}${RESET}" if [[ "$ERROR_COUNT" -gt 0 ]]; then 
field_color "Errors:" "${RED}${ERROR_COUNT}${RESET}" else field_color "Errors:" "${GREEN}0${RESET}" fi field "Total size:" "$total_size" field "Duration:" "$(elapsed)" } # ══════════════════════════════════════════════════════════════════════ # RESTORE # ══════════════════════════════════════════════════════════════════════ do_restore() { [[ -z "$RESTORE_DIR" ]] && die "No restore directory specified (--restore-dir)" [[ ! -d "$RESTORE_DIR" ]] && die "Restore directory not found: ${RESTORE_DIR}" [[ ! -f "${RESTORE_DIR}/manifest.json" ]] && die "No manifest.json in ${RESTORE_DIR}" log "Restoring from ${RESTORE_DIR}..." # Verify checksums first if [[ -f "${RESTORE_DIR}/checksums.sha256" ]]; then log "Verifying backup integrity..." if (cd / && sha256sum -c "${RESTORE_DIR}/checksums.sha256" > /dev/null 2>&1); then echo -e " ${GREEN}✓${RESET} Checksums verified" else warn "Some checksums failed — proceed with caution" fi fi # Restore Grafana if [[ -d "${RESTORE_DIR}/grafana" ]] && component_selected "grafana"; then section_header "Restoring Grafana" # Restore datasources if [[ -f "${RESTORE_DIR}/grafana/datasources.json" ]] && command -v jq &>/dev/null; then local ds_count=0 while IFS= read -r ds; do [[ -z "$ds" ]] && continue local ds_name ds_name=$(echo "$ds" | jq -r '.name // "unknown"') if grafana_api POST "/api/datasources" -d "$ds" > /dev/null 2>&1; then echo -e " ${GREEN}✓${RESET} Datasource: ${ds_name}" ((ds_count++)) || true else echo -e " ${YELLOW}⊘${RESET} Datasource: ${ds_name} (may already exist)" fi done < <(jq -c '.[]' "${RESTORE_DIR}/grafana/datasources.json" 2>/dev/null) log "Restored ${ds_count} datasources" fi # Restore dashboards if [[ -d "${RESTORE_DIR}/grafana/dashboards" ]]; then local dash_count=0 for df in "${RESTORE_DIR}/grafana/dashboards"/*.json; do [[ ! 
-f "$df" ]] && continue if command -v jq &>/dev/null; then local payload payload=$(jq '{dashboard: .dashboard, overwrite: true}' "$df" 2>/dev/null || cat "$df") if grafana_api POST "/api/dashboards/db" -d "$payload" > /dev/null 2>&1; then ((dash_count++)) || true fi fi done echo -e " ${GREEN}✓${RESET} Dashboards (${dash_count} imported)" fi fi # Restore Prometheus config if [[ -d "${RESTORE_DIR}/prometheus" ]] && component_selected "prometheus"; then section_header "Restoring Prometheus" if [[ -f "${RESTORE_DIR}/prometheus/prometheus.yml" ]]; then if cp "${RESTORE_DIR}/prometheus/prometheus.yml" "${PROM_CONFIG_PATH}/prometheus.yml" 2>/dev/null; then echo -e " ${GREEN}✓${RESET} prometheus.yml restored" else echo -e " ${RED}✗${RESET} Failed to restore prometheus.yml (check permissions)" fi fi if [[ -d "${RESTORE_DIR}/prometheus/rules" ]]; then local target_rules="${PROM_CONFIG_PATH}/rules" mkdir -p "$target_rules" 2>/dev/null || true local rule_count=0 for rf in "${RESTORE_DIR}/prometheus/rules"/*; do [[ ! -f "$rf" ]] && continue cp "$rf" "$target_rules/" 2>/dev/null && ((rule_count++)) || true done echo -e " ${GREEN}✓${RESET} Alert rules (${rule_count} files)" fi # Reload Prometheus if curl -sS -X POST "${PROMETHEUS_URL}/-/reload" > /dev/null 2>&1; then echo -e " ${GREEN}✓${RESET} Prometheus reloaded" else warn "Could not reload Prometheus — restart manually" fi fi # Restore Alertmanager config if [[ -d "${RESTORE_DIR}/alertmanager" ]] && component_selected "alertmanager"; then section_header "Restoring Alertmanager" if [[ -f "${RESTORE_DIR}/alertmanager/alertmanager.yml" ]]; then # Detect where alertmanager.yml lives on this system local am_target="${AM_CONFIG_PATH}/alertmanager.yml" if [[ ! 
-d "$AM_CONFIG_PATH" ]] && [[ -f "${PROM_CONFIG_PATH}/alertmanager.yml" ]]; then am_target="${PROM_CONFIG_PATH}/alertmanager.yml" fi if cp "${RESTORE_DIR}/alertmanager/alertmanager.yml" "$am_target" 2>/dev/null; then echo -e " ${GREEN}✓${RESET} alertmanager.yml restored → ${am_target}" else echo -e " ${RED}✗${RESET} Failed to restore alertmanager.yml" fi fi if curl -sS -X POST "${ALERTMANAGER_URL}/-/reload" > /dev/null 2>&1; then echo -e " ${GREEN}✓${RESET} Alertmanager reloaded" else warn "Could not reload Alertmanager — restart manually" fi fi # Restore Blackbox Exporter config if [[ -d "${RESTORE_DIR}/blackbox" ]] && component_selected "blackbox"; then section_header "Restoring Blackbox Exporter" local bb_conf="" if [[ -f "${RESTORE_DIR}/blackbox/blackbox.yml" ]]; then bb_conf="blackbox.yml" elif [[ -f "${RESTORE_DIR}/blackbox/config.yml" ]]; then bb_conf="config.yml" fi if [[ -n "$bb_conf" ]]; then # Detect where blackbox config lives on this system local bb_target="${BLACKBOX_CONFIG_PATH}/${bb_conf}" if [[ ! 
-d "$BLACKBOX_CONFIG_PATH" ]] && [[ -f "${PROM_CONFIG_PATH}/${bb_conf}" ]]; then bb_target="${PROM_CONFIG_PATH}/${bb_conf}" fi if cp "${RESTORE_DIR}/blackbox/${bb_conf}" "$bb_target" 2>/dev/null; then echo -e " ${GREEN}✓${RESET} ${bb_conf} restored → ${bb_target}" else echo -e " ${RED}✗${RESET} Failed to restore ${bb_conf}" fi fi if curl -sS -X POST "${BLACKBOX_URL}/-/reload" > /dev/null 2>&1; then echo -e " ${GREEN}✓${RESET} Blackbox Exporter reloaded" else warn "Could not reload Blackbox Exporter — restart manually" fi fi section_header "Restore Summary" field "Restored from:" "$RESTORE_DIR" field "Duration:" "$(elapsed)" log "Restore complete — verify services are healthy" } # ══════════════════════════════════════════════════════════════════════ # VERIFY # ══════════════════════════════════════════════════════════════════════ do_verify() { local verify_dir="${RESTORE_DIR:-}" [[ -z "$verify_dir" ]] && die "Specify backup directory with --restore-dir" [[ ! -d "$verify_dir" ]] && die "Directory not found: ${verify_dir}" section_header "Backup Verification" field "Directory:" "$verify_dir" echo "" local pass=0 fail=0 # Check manifest if [[ -f "${verify_dir}/manifest.json" ]]; then echo -e " ${GREEN}✓${RESET} manifest.json present" ((pass++)) || true else echo -e " ${RED}✗${RESET} manifest.json missing" ((fail++)) || true fi # Check checksums if [[ -f "${verify_dir}/checksums.sha256" ]]; then if (cd / && sha256sum -c "${verify_dir}/checksums.sha256" > /dev/null 2>&1); then echo -e " ${GREEN}✓${RESET} All checksums valid" ((pass++)) || true else echo -e " ${RED}✗${RESET} Checksum verification failed" ((fail++)) || true fi else echo -e " ${YELLOW}!${RESET} No checksums file" fi # Check components for comp in grafana prometheus alertmanager blackbox; do if [[ -d "${verify_dir}/${comp}" ]]; then local fcount fcount=$(find "${verify_dir}/${comp}" -type f 2>/dev/null | wc -l || echo 0) echo -e " ${GREEN}✓${RESET} ${comp}/ (${fcount} files)" ((pass++)) || true fi done echo 
"" field_color "Passed:" "${GREEN}${pass}${RESET}" if [[ "$fail" -gt 0 ]]; then field_color "Failed:" "${RED}${fail}${RESET}" else field_color "Failed:" "${GREEN}0${RESET}" fi } # ══════════════════════════════════════════════════════════════════════ # LIST # ══════════════════════════════════════════════════════════════════════ do_list() { [[ ! -d "$OUTPUT_DIR" ]] && die "Backup directory not found: ${OUTPUT_DIR}" section_header "Available Backups" printf " ${BOLD}%-20s %-22s %-16s %8s %6s${RESET}\n" "BACKUP ID" "TIMESTAMP" "COMPONENTS" "SIZE" "FILES" printf " %s\n" "$(printf '%.0s─' {1..76})" local count=0 while IFS= read -r d; do local manifest="${d}/manifest.json" [[ ! -f "$manifest" ]] && continue local bid ts comp sz fc bid=$(basename "$d") if command -v jq &>/dev/null; then ts=$(jq -r '.timestamp // "unknown"' "$manifest" 2>/dev/null || echo "unknown") comp=$(jq -r '.components // "unknown"' "$manifest" 2>/dev/null || echo "unknown") sz=$(jq -r '.size // "?"' "$manifest" 2>/dev/null || echo "?") fc=$(jq -r '.files // 0' "$manifest" 2>/dev/null || echo 0) else ts="(jq required)" comp="?" sz="?" fc="?" fi printf " %-20s %-22s %-16s %8s %6s\n" "$bid" "${ts:0:20}" "${comp:0:14}" "$sz" "$fc" ((count++)) || true done < <(find "$OUTPUT_DIR" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort -r) echo "" field "Total backups:" "$count" if [[ "$count" -eq 0 ]]; then warn "No backups found in ${OUTPUT_DIR}" fi } # ══════════════════════════════════════════════════════════════════════ # HELP # ══════════════════════════════════════════════════════════════════════ show_help() { cat <