#!/usr/bin/env bash
#########################################################################################
#### prometheus-smoke-tester.sh — Focused smoke tests for Prometheus + Alertmanager  ####
#### Zero external dependencies. Runs in air-gapped environments.                    ####
#### Requires: bash 4+, curl, GNU grep (-P), openssl (optional)                      ####
####                                                                                 ####
#### Author: Phil Connor                                                             ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.01                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   export PROMETHEUS_URL="http://prometheus:9090"                                ####
####   ./prometheus-smoke-tester.sh                                                  ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting can be supplied through the environment; command-line
# options parsed in main() override these values.
PROMETHEUS_URL="${PROMETHEUS_URL:-}"
PROMETHEUS_AUTH_TOKEN="${PROMETHEUS_AUTH_TOKEN:-}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-}"
EXPECTED_JOBS="${EXPECTED_JOBS:-}"
WATCHDOG_ALERT_NAME="${WATCHDOG_ALERT_NAME:-Watchdog}"
QUERY_TIMEOUT_MS="${QUERY_TIMEOUT_MS:-5000}"
TSDB_PATH="${TSDB_PATH:-}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
CURL_INSECURE="${CURL_INSECURE:-false}"
SKIP_ALERTMANAGER="${SKIP_ALERTMANAGER:-false}"
SKIP_REMOTE_WRITE="${SKIP_REMOTE_WRITE:-false}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# Run state shared by the record_* helpers and the reporters.
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()          # entries are "STATUS|name|detail"
START_TIME=0
TEST_ALERT_NAME="smoke_test_alert_$$"
TEST_ALERT_POSTED=false

# ── Colors ────────────────────────────────────────────────────────────
# Sets RED/GREEN/YELLOW/BLUE/BOLD/RESET according to COLOR:
#   never  → always empty
#   always → always ANSI escapes
#   other  → ANSI escapes only when stdout is a terminal
setup_colors() {
    if [[ "$COLOR" != "never" ]] && { [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; }; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
    else
        RED=""
        GREEN=""
        YELLOW=""
        BLUE=""
        BOLD=""
        RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
log()  { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()  { echo -e "${RED}[ERROR]${RESET} $*" >&2; }

# Debug output goes to stderr: the curl wrappers call this from inside
# command substitutions, and anything written to stdout there would be
# captured as part of the API response.
verbose() {
    if [[ "$VERBOSE" == "true" ]]; then
        echo -e "${BLUE}[DEBUG]${RESET} $*" >&2
    fi
}

# ── Test Recording ────────────────────────────────────────────────────
# Each helper takes: $1 = test name, $2 = optional detail/reason.
# They bump the counters, append to RESULTS, and print one line in either
# TAP or human-readable form.
record_pass() {
    local name="$1"
    local detail="${2:-}"
    PASS=$((PASS + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("PASS|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "ok ${TOTAL} - ${name}"
    else
        echo -e "  ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
    fi
}

record_fail() {
    local name="$1"
    local detail="${2:-}"
    FAIL=$((FAIL + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("FAIL|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "not ok ${TOTAL} - ${name}"
        # An `if` rather than `[[ … ]] && echo`: the latter would make the
        # function return 1 on an empty detail and trip `set -e` in callers.
        if [[ -n "$detail" ]]; then
            echo "  # ${detail}"
        fi
    else
        echo -e "  ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
    fi
}

record_skip() {
    local name="$1"
    local reason="${2:-}"
    SKIP=$((SKIP + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("SKIP|${name}|${reason}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
    else
        echo -e "  ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"
    fi
}

# ── TAP ───────────────────────────────────────────────────────────────
print_tap_header() {
    echo "TAP version 13"
}

print_tap_footer() {
    echo "1..${TOTAL}"
    echo "# pass ${PASS}"
    echo "# fail ${FAIL}"
    echo "# skip ${SKIP}"
}

# ── JUnit XML ─────────────────────────────────────────────────────────
# Escapes the characters that are special in XML text and attribute values.
# `&` must be replaced first so the entities produced by the later
# substitutions are not escaped a second time.
xml_escape() {
    local s="$1"
    s="${s//&/&amp;}"
    s="${s//</&lt;}"
    s="${s//>/&gt;}"
    s="${s//\"/&quot;}"
    printf '%s\n' "$s"
}

# Writes every entry of RESULTS to JUNIT_FILE as a JUnit report.
# NOTE(review): the XML element and attribute text of the original was lost
# (markup stripped from the source); the layout below is a standard JUnit
# shape rebuilt around the surviving strings — confirm against consumers.
write_junit() {
    local end_time
    end_time=$(date +%s)
    local duration=$((end_time - START_TIME))

    {
        cat <<JUNIT_EOF
<?xml version="1.0" encoding="UTF-8"?>
<testsuites>
  <testsuite name="prometheus-smoke-tests" tests="${TOTAL}" failures="${FAIL}" skipped="${SKIP}" time="${duration}">
JUNIT_EOF

        local result status rest name detail
        # The ${arr[@]+…} form keeps `set -u` quiet on an empty array (bash < 4.4).
        for result in ${RESULTS[@]+"${RESULTS[@]}"}; do
            status="${result%%|*}"
            rest="${result#*|}"
            name=$(xml_escape "${rest%%|*}")
            detail=$(xml_escape "${rest#*|}")

            printf '    <testcase name="%s" classname="prometheus">\n' "$name"
            case "$status" in
                PASS)
                    if [[ -n "$detail" ]]; then
                        printf '      <system-out>%s</system-out>\n' "$detail"
                    fi
                    ;;
                FAIL)
                    printf '      <failure message="%s">FAILED: %s — %s</failure>\n' \
                        "$detail" "$name" "$detail"
                    ;;
                SKIP)
                    printf '      <skipped message="%s"/>\n' "$detail"
                    ;;
            esac
            printf '    </testcase>\n'
        done

        echo "  </testsuite>"
        echo "</testsuites>"
    } > "$JUNIT_FILE"

    log "JUnit report written to ${JUNIT_FILE}"
}

# ── curl Wrappers ─────────────────────────────────────────────────────
# Shared implementation behind the four public wrappers.
#   $1 = "auth" to send PROMETHEUS_AUTH_TOKEN as a bearer token, else "noauth"
#   $2 = "status" to print only the HTTP status code, else "body"
#   $3 = URL; remaining arguments are passed to curl ahead of the URL
# curl's own stderr is discarded; callers decide how to report failures.
_http_request() {
    local auth="$1" mode="$2" url="$3"
    shift 3

    local -a curl_opts=(-s -S --max-time "$CURL_TIMEOUT")
    if [[ "$mode" == "status" ]]; then
        curl_opts+=(-o /dev/null -w "%{http_code}")
    fi
    if [[ "$CURL_INSECURE" == "true" ]]; then
        curl_opts+=(-k)
    fi
    if [[ "$auth" == "auth" && -n "$PROMETHEUS_AUTH_TOKEN" ]]; then
        curl_opts+=(-H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}")
    fi

    curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}

# Prometheus request; prints the response body.
prom_curl() {
    verbose "curl GET $1"
    _http_request auth body "$@"
}

# Prometheus request; prints only the HTTP status code.
prom_curl_status() {
    _http_request auth status "$@"
}

# Alertmanager request (no bearer token); prints the response body.
am_curl() {
    verbose "curl GET $1"
    _http_request noauth body "$@"
}

# Alertmanager request (no bearer token); prints only the HTTP status code.
am_curl_status() {
    _http_request noauth status "$@"
}

# ── JSON Helpers (no jq) ──────────────────────────────────────────────
# Both helpers print the first value found for "key" anywhere in the JSON
# text (regex-based, not a real parser). Dots in the key are escaped so a
# flag name such as storage.tsdb.retention.time is matched literally.
# The first match is taken with parameter expansion instead of `| head -1`,
# which could fail the pipeline with SIGPIPE under `pipefail`.

# $1 = key, $2 = JSON text. Value may be quoted or bare (number/bool).
json_value() {
    local key="${1//./\\.}"
    local json="$2"
    local matches
    matches=$(grep -oP "\"${key}\"\s*:\s*\"?\K[^\",}]+" <<<"$json" || true)
    printf '%s\n' "${matches%%$'\n'*}"
}

# $1 = key, $2 = JSON text. Value must be a quoted string (may be empty).
json_value_string() {
    local key="${1//./\\.}"
    local json="$2"
    local matches
    matches=$(grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" <<<"$json" || true)
    printf '%s\n' "${matches%%$'\n'*}"
}

# Prints how many times PCRE pattern $1 occurs in text $2.
# API responses arrive as a single line, so `grep -c` (which counts lines)
# cannot be used; matches are emitted one per line and counted.
count_matches() {
    local pattern="$1"
    local text="$2"
    { grep -oP -- "$pattern" <<<"$text" || true; } | wc -l | tr -d '[:space:]'
}

# Prints a UTC ISO-8601 timestamp offset from now.
#   $1 = GNU date relative spec (e.g. "+2 minutes")
#   $2 = BSD date -v adjustment (e.g. "+2M")
# Falls back to a fixed far-future timestamp when neither form works.
utc_timestamp_in() {
    date -u -d "$1" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
        || date -u -v"$2" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
        || echo "2099-01-01T00:00:00Z"
}

# Prints the human-readable size of directory $1 (empty on failure).
# `|| true` keeps a partial `du` failure (e.g. unreadable subdirectory)
# from aborting the script under `set -e` + `pipefail`.
dir_size() {
    { du -sh "$1" 2>/dev/null | awk '{print $1}'; } || true
}

# ── Cleanup ───────────────────────────────────────────────────────────
# EXIT trap. Best effort only; never fails the script.
# NOTE(review): this only issues a GET against the silences endpoint, which
# does not remove anything; the test alert itself expires through its
# endsAt. Kept as in the original — confirm whether more was intended.
cleanup() {
    if [[ "${TEST_ALERT_POSTED:-false}" == "true" && "$SKIP_ALERTMANAGER" != "true" && -n "$ALERTMANAGER_URL" ]]; then
        verbose "Cleaning up test alert from Alertmanager"
        am_curl "${ALERTMANAGER_URL}/api/v2/silences" >/dev/null 2>&1 || true
    fi
}
trap cleanup EXIT

# ── Usage ─────────────────────────────────────────────────────────────
# NOTE(review): the original help text was lost; this one is rebuilt from
# the options handled in main() and the environment defaults above.
usage() {
    cat <<'USAGE_EOF'
Usage: prometheus-smoke-tester.sh [OPTIONS]

Focused smoke tests for Prometheus + Alertmanager.

Options:
  --skip-alertmanager     Skip the Alertmanager integration tests
  --skip-remote-write     Skip the remote write tests
  --insecure              Do not verify TLS certificates (curl -k)
  --timeout SECONDS       Per-request curl timeout (default: 10)
  --format FORMAT         Output format: text, tap, junit (default: text)
  --junit-file PATH       JUnit report path (default: smoke-results.xml)
  --expected-jobs LIST    Comma-separated scrape jobs that must be present
  --query-timeout MS      Query latency threshold in ms (default: 5000)
  --tsdb-path PATH        Local TSDB directory for on-disk checks
  --verbose               Print debug output
  --no-color              Disable colored output
  -h, --help              Show this help

Environment:
  PROMETHEUS_URL          Prometheus base URL (required)
  PROMETHEUS_AUTH_TOKEN   Bearer token for Prometheus requests
  ALERTMANAGER_URL        Alertmanager base URL
  EXPECTED_JOBS, WATCHDOG_ALERT_NAME, QUERY_TIMEOUT_MS, TSDB_PATH,
  CURL_TIMEOUT, CURL_INSECURE, SKIP_ALERTMANAGER, SKIP_REMOTE_WRITE,
  OUTPUT_FORMAT, JUNIT_FILE, VERBOSE, COLOR
USAGE_EOF
}

# ── Section Header / Summary ──────────────────────────────────────────
# NOTE(review): section() and print_summary() are called by the surviving
# code but their bodies were lost; these are minimal reconstructions.

# Prints a heading for a group of tests ($1 = title).
section() {
    local title="$1"
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "# ${title}"
    else
        echo ""
        echo -e "${BOLD}${title}${RESET}"
    fi
}

# Prints the pass/fail/skip totals and elapsed time.
print_summary() {
    local now
    now=$(date +%s)
    local elapsed=$((now - START_TIME))
    echo ""
    echo -e "${BOLD}Summary${RESET}"
    echo -e "  Total: ${TOTAL}  ${GREEN}Passed: ${PASS}${RESET}  ${RED}Failed: ${FAIL}${RESET}  ${YELLOW}Skipped: ${SKIP}${RESET}  (${elapsed}s)"
}

# ── Targets ───────────────────────────────────────────────────────────
# NOTE(review): reconstructed — the original body was lost. Checks target
# health via /api/v1/targets and, when EXPECTED_JOBS is set, that each
# listed job appears. Treating EXPECTED_JOBS as comma-separated is an
# assumption; confirm.
test_targets() {
    section "Targets"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/targets") || {
        record_fail "Targets API reachable" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Targets API reachable" "status: ${status:-empty}"
        return
    fi
    record_pass "Targets API reachable" "status: success"

    local up_count down_count
    up_count=$(count_matches '"health"\s*:\s*"up"' "$response")
    down_count=$(count_matches '"health"\s*:\s*"down"' "$response")
    if ((up_count == 0)); then
        record_fail "Targets up" "no healthy targets found"
    elif ((down_count > 0)); then
        record_fail "Targets down" "${down_count} target(s) down, ${up_count} up"
    else
        record_pass "All targets up" "${up_count} target(s) healthy"
    fi

    if [[ -z "$EXPECTED_JOBS" ]]; then
        record_skip "Expected jobs" "EXPECTED_JOBS not set"
        return
    fi

    local -a jobs=()
    IFS=',' read -r -a jobs <<<"$EXPECTED_JOBS"
    local job
    for job in ${jobs[@]+"${jobs[@]}"}; do
        # Trim surrounding whitespace so "a, b" works as well as "a,b".
        job="${job#"${job%%[![:space:]]*}"}"
        job="${job%"${job##*[![:space:]]}"}"
        if [[ -z "$job" ]]; then
            continue
        fi
        # \Q…\E makes PCRE treat the job name literally.
        if grep -qP -- "\"job\"\s*:\s*\"\Q${job}\E\"" <<<"$response"; then
            record_pass "Expected job present" "${job}"
        else
            record_fail "Expected job missing" "${job}"
        fi
    done
}

# ── Rules ─────────────────────────────────────────────────────────────
# NOTE(review): only the slow-group check at the end survived in the
# source; the request/status handling and the evaluationTime extraction
# feeding `awk '$1 > 5.0'` are reconstructed. evaluationTime is counted
# wherever it appears in the response — confirm that matches the original.
test_rules() {
    section "Rules"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/rules") || {
        record_fail "Rules API reachable" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Rules API reachable" "status: ${status:-empty}"
        return
    fi
    record_pass "Rules API reachable" "status: success"

    local slow_groups
    slow_groups=$({ grep -oP '"evaluationTime"\s*:\s*\K[0-9.eE+-]+' <<<"$response" || true; } \
        | awk '$1 > 5.0' | wc -l | tr -d '[:space:]')
    if [[ $slow_groups -eq 0 ]]; then
        record_pass "Rule group evaluation speed" "all groups under 5s"
    else
        record_fail "Slow rule groups" "${slow_groups} group(s) over 5s evaluation time"
    fi
}

# ── Alerts ────────────────────────────────────────────────────────────
# Fails when any alert in /api/v1/alerts carries severity "critical", and
# when the always-firing watchdog alert is absent.
test_alerts() {
    section "Alerts"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/alerts") || {
        record_fail "Alerts API reachable" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Alerts API reachable" "status: ${status:-empty}"
        return
    fi

    # Check for firing critical alerts
    local critical_count
    critical_count=$(count_matches '"severity"\s*:\s*"critical"' "$response")
    if [[ $critical_count -eq 0 ]]; then
        record_pass "No critical alerts firing" "clean"
    else
        record_fail "Critical alerts firing" "${critical_count} critical alert(s) active"
    fi

    # Watchdog dead man's switch
    local watchdog
    watchdog=$(count_matches "\"alertname\"\s*:\s*\"${WATCHDOG_ALERT_NAME}\"" "$response")
    if [[ $watchdog -gt 0 ]]; then
        record_pass "Watchdog alert firing" "dead man's switch active"
    else
        record_fail "Watchdog alert not firing" "${WATCHDOG_ALERT_NAME} should always fire"
    fi
}

# ── Alertmanager Integration ──────────────────────────────────────────
# Checks Alertmanager health, the Prometheus→Alertmanager link, and a
# post/verify round trip of a short-lived test alert, which is then
# silenced.
test_alertmanager() {
    section "Alertmanager Integration"

    if [[ "$SKIP_ALERTMANAGER" == "true" ]]; then
        record_skip "Alertmanager tests" "skipped via --skip-alertmanager"
        return
    fi
    if [[ -z "$ALERTMANAGER_URL" ]]; then
        record_skip "Alertmanager tests" "ALERTMANAGER_URL not set"
        return
    fi

    # Reachability
    local am_status
    am_status=$(am_curl_status "${ALERTMANAGER_URL}/-/healthy") || am_status="000"
    if [[ "$am_status" == "200" ]]; then
        record_pass "Alertmanager reachable" "HTTP 200"
    else
        record_fail "Alertmanager reachable" "HTTP ${am_status}"
        return
    fi

    # Check Prometheus can reach Alertmanager via its own API
    local am_response
    am_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/alertmanagers") || {
        record_skip "Prometheus→Alertmanager link" "API unavailable"
        return
    }

    # Count "url" entries inside the activeAlertmanagers array. (A line
    # count would report at most 1 because the array is on one line.)
    local active_segment active_am
    active_segment=$(grep -oP '"activeAlertmanagers"\s*:\s*\[\K[^\]]*' <<<"$am_response" || true)
    active_am=$(count_matches '"url"' "$active_segment")
    if [[ $active_am -gt 0 ]]; then
        record_pass "Prometheus→Alertmanager connected" "${active_am} active Alertmanager(s)"
    else
        record_fail "Prometheus→Alertmanager disconnected" "no active Alertmanagers"
    fi

    # Fire a test alert
    # NOTE(review): only the endsAt tail of this payload survived in the
    # source; the labels/annotations are reconstructed — confirm.
    local now_ts alert_ends alert_payload
    now_ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    alert_ends=$(utc_timestamp_in "+2 minutes" "+2M")
    alert_payload=$(cat <<ALERT_EOF
[{"labels":{"alertname":"${TEST_ALERT_NAME}","severity":"info","source":"prometheus-smoke-tester"},"annotations":{"summary":"Smoke test alert - safe to ignore"},"startsAt":"${now_ts}","endsAt":"${alert_ends}"}]
ALERT_EOF
    )

    local post_status
    post_status=$(am_curl_status "${ALERTMANAGER_URL}/api/v2/alerts" \
        -X POST -H "Content-Type: application/json" -d "$alert_payload") || post_status="000"
    if [[ "$post_status" != "200" ]]; then
        record_fail "Test alert post failed" "HTTP ${post_status}"
        return
    fi

    TEST_ALERT_POSTED=true
    record_pass "Test alert posted to Alertmanager" "HTTP 200"

    # Wait briefly and verify
    sleep 2
    local verify_response
    verify_response=$(am_curl "${ALERTMANAGER_URL}/api/v2/alerts?filter=alertname%3D${TEST_ALERT_NAME}") || verify_response=""
    if grep -q -- "$TEST_ALERT_NAME" <<<"$verify_response"; then
        record_pass "Test alert visible in Alertmanager" "alert round-trip confirmed"
    else
        record_fail "Test alert not visible" "posted but not found in /api/v2/alerts"
    fi

    # Silence the test alert so it does not notify anyone.
    # NOTE(review): the matchers/startsAt part of this payload is
    # reconstructed; endsAt, createdBy and comment are from the source.
    local silence_ends silence_payload
    silence_ends=$(utc_timestamp_in "+1 minute" "+1M")
    silence_payload=$(cat <<SILENCE_EOF
{"matchers":[{"name":"alertname","value":"${TEST_ALERT_NAME}","isRegex":false}],"startsAt":"${now_ts}","endsAt":"${silence_ends}","createdBy":"prometheus-smoke-tester","comment":"auto-cleanup"}
SILENCE_EOF
    )
    am_curl_status "${ALERTMANAGER_URL}/api/v2/silences" \
        -X POST -H "Content-Type: application/json" -d "$silence_payload" >/dev/null 2>&1 || true
    verbose "Test alert silenced for cleanup"
}

# ── Configuration ─────────────────────────────────────────────────────
# Verifies the config and flags status endpoints answer with success.
test_configuration() {
    section "Configuration"

    # Config loaded
    local config_response
    config_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config") || {
        record_fail "Configuration loaded" "connection failed"
        return
    }
    local config_status
    config_status=$(json_value "status" "$config_response")
    if [[ "$config_status" == "success" ]]; then
        record_pass "Configuration loaded" "prometheus.yml active"
    else
        record_fail "Configuration loaded" "status: ${config_status:-empty}"
    fi

    # Runtime flags
    local flags_response
    flags_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags") || {
        record_fail "Runtime flags" "connection failed"
        return
    }
    local flags_status
    flags_status=$(json_value "status" "$flags_response")
    if [[ "$flags_status" != "success" ]]; then
        record_fail "Runtime flags" "status: ${flags_status:-empty}"
        return
    fi

    local retention
    retention=$(json_value_string "storage.tsdb.retention.time" "$flags_response")
    record_pass "Runtime flags accessible" "retention: ${retention:-default}"
}

# ── TSDB Health ───────────────────────────────────────────────────────
# Reports head statistics from /status/tsdb and, when available, fields
# from /status/runtimeinfo.
test_tsdb() {
    section "TSDB Health"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/tsdb") || {
        record_fail "TSDB status API" "connection failed"
        return
    }
    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "TSDB status API" "status: ${status:-empty}"
        return
    fi

    # Head stats
    local head_series
    head_series=$(json_value "numSeries" "$response")
    if [[ -n "$head_series" ]]; then
        record_pass "TSDB head series" "${head_series} active series"
    else
        record_pass "TSDB status" "responding"
    fi

    local head_chunks
    head_chunks=$(json_value "chunkCount" "$response")
    if [[ -n "$head_chunks" ]]; then
        record_pass "TSDB head chunks" "${head_chunks} chunks in head"
    fi

    # Runtime info for WAL and compaction
    local runtime_response
    runtime_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo") || {
        verbose "Runtime info unavailable"
        return
    }
    local runtime_status
    runtime_status=$(json_value "status" "$runtime_response")
    if [[ "$runtime_status" != "success" ]]; then
        return
    fi

    # WAL corruption check — storageRetention present means TSDB is functional
    local storage_retention
    storage_retention=$(json_value_string "storageRetention" "$runtime_response")
    if [[ -n "$storage_retention" ]]; then
        record_pass "TSDB WAL operational" "retention: ${storage_retention}"
    fi

    # Last compaction
    local last_gc
    last_gc=$(json_value_string "lastGarbageCollection" "$runtime_response")
    if [[ -n "$last_gc" ]]; then
        record_pass "TSDB compaction" "last GC: ${last_gc}"
    fi
}

# ── Storage ───────────────────────────────────────────────────────────
# Reports configured retention and, when TSDB_PATH is set, inspects the
# data directory on the local filesystem.
test_storage() {
    section "Storage"

    # Retention from flags
    local flags_response
    flags_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags") || {
        record_fail "Storage flags" "connection failed"
        return
    }
    local retention_time retention_size
    retention_time=$(json_value_string "storage.tsdb.retention.time" "$flags_response")
    retention_size=$(json_value_string "storage.tsdb.retention.size" "$flags_response")
    if [[ -n "$retention_time" ]]; then
        record_pass "Retention configured" "time: ${retention_time}${retention_size:+, size: ${retention_size}}"
    else
        record_pass "Retention" "default (15d)"
    fi

    # Local disk check if path provided
    if [[ -z "$TSDB_PATH" ]]; then
        record_skip "Local disk checks" "TSDB_PATH not set"
        return
    fi
    if [[ ! -d "$TSDB_PATH" ]]; then
        record_fail "TSDB path" "${TSDB_PATH} not found"
        return
    fi

    record_pass "TSDB disk usage" "$(dir_size "$TSDB_PATH") at ${TSDB_PATH}"

    # Check WAL directory
    if [[ -d "${TSDB_PATH}/wal" ]]; then
        record_pass "WAL directory" "$(dir_size "${TSDB_PATH}/wal")"
    else
        record_fail "WAL directory" "not found at ${TSDB_PATH}/wal"
    fi

    # Check chunks_head directory
    if [[ -d "${TSDB_PATH}/chunks_head" ]]; then
        record_pass "Head chunks on disk" "$(dir_size "${TSDB_PATH}/chunks_head")"
    fi

    # Check for lock file (Prometheus is running)
    if [[ -f "${TSDB_PATH}/lock" ]]; then
        record_pass "TSDB lock file" "Prometheus holds lock"
    else
        record_fail "TSDB lock file" "no lock — Prometheus may not be running"
    fi
}

# ── Remote Write ──────────────────────────────────────────────────────
# Detects a remote_write section in the loaded configuration and reports
# the WAL size from runtime info when present.
test_remote_write() {
    section "Remote Write"

    if [[ "$SKIP_REMOTE_WRITE" == "true" ]]; then
        record_skip "Remote write tests" "skipped via --skip-remote-write"
        return
    fi

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo") || {
        record_fail "Runtime info for remote write" "connection failed"
        return
    }
    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Runtime info" "status: ${status:-empty}"
        return
    fi

    # Check if remote write is configured by looking at the config
    local config_response
    config_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config") || {
        record_skip "Remote write config" "config API unavailable"
        return
    }

    # Here-string instead of `echo | grep -q`: with a large config the
    # early-exiting grep could SIGPIPE echo and, under pipefail, turn a
    # match into a false "not configured".
    if ! grep -q "remote_write" <<<"$config_response"; then
        record_skip "Remote write" "not configured"
        return
    fi
    record_pass "Remote write configured" "remote_write section present"

    # Check WAL for backlog via runtime info
    local wal_size
    wal_size=$(json_value "walSize" "$response")
    if [[ -n "$wal_size" ]]; then
        record_pass "Remote write WAL" "WAL size: ${wal_size}"
    fi
}

# ── Performance ───────────────────────────────────────────────────────
# Prints the current time in nanoseconds, or "0" when `date` cannot
# provide it (BSD date emits a literal "N" for %N, which would otherwise
# break the arithmetic below).
_now_ns() {
    local t
    t=$(date +%s%N 2>/dev/null) || t=""
    if [[ "$t" =~ ^[0-9]+$ ]]; then
        printf '%s\n' "$t"
    else
        printf '0\n'
    fi
}

# If PROMETHEUS_URL is https, reports how many days remain on the server
# certificate. Needs openssl; skipped when it is not installed.
_check_tls_certificate() {
    if [[ "$PROMETHEUS_URL" != https://* ]]; then
        return
    fi
    if ! command -v openssl >/dev/null 2>&1; then
        record_skip "TLS certificate" "openssl not installed"
        return
    fi

    local authority="${PROMETHEUS_URL#https://}"
    authority="${authority%%/*}"
    local prom_host="${authority%%:*}"
    local prom_port="443"
    if [[ "$authority" == *:* ]]; then
        prom_port="${authority##*:}"
    fi

    # A failed handshake must not abort the run (set -e + pipefail).
    local cert_expiry
    cert_expiry=$(echo \
        | openssl s_client -connect "${prom_host}:${prom_port}" -servername "$prom_host" 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null \
        | cut -d= -f2) || cert_expiry=""
    if [[ -z "$cert_expiry" ]]; then
        verbose "Could not read TLS certificate from ${prom_host}:${prom_port}"
        return
    fi

    local expiry_epoch
    expiry_epoch=$(date -d "$cert_expiry" +%s 2>/dev/null \
        || date -jf "%b %d %T %Y %Z" "$cert_expiry" +%s 2>/dev/null \
        || echo "0")
    if [[ "$expiry_epoch" == "0" ]]; then
        return
    fi

    local now_epoch
    now_epoch=$(date +%s)
    local days_left=$(((expiry_epoch - now_epoch) / 86400))
    if [[ $days_left -gt 30 ]]; then
        record_pass "TLS certificate valid" "${days_left} days remaining"
    elif [[ $days_left -gt 0 ]]; then
        record_fail "TLS certificate expiring soon" "${days_left} days remaining"
    else
        record_fail "TLS certificate expired" "expired ${cert_expiry}"
    fi
}

# Times an `up` instant query against QUERY_TIMEOUT_MS, checks that it
# returned data, then runs the TLS certificate check.
test_performance() {
    section "Performance"

    # Query up metric and measure response time
    local start_ns end_ns response
    start_ns=$(_now_ns)
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/query?query=up") || {
        record_fail "Query performance" "query failed"
        return
    }
    end_ns=$(_now_ns)

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Query engine" "status: ${status:-empty}"
        return
    fi

    # Calculate duration
    if [[ "$start_ns" != "0" && "$end_ns" != "0" ]]; then
        local duration_ms=$(((end_ns - start_ns) / 1000000))
        if ((duration_ms < QUERY_TIMEOUT_MS)); then
            record_pass "Query response time" "${duration_ms}ms (threshold: ${QUERY_TIMEOUT_MS}ms)"
        else
            record_fail "Query response slow" "${duration_ms}ms exceeds ${QUERY_TIMEOUT_MS}ms threshold"
        fi
    else
        record_pass "Query engine" "up query returned successfully"
    fi

    # Count result series
    local result_count
    result_count=$(count_matches '"__name__"\s*:\s*"up"' "$response")
    if [[ $result_count -gt 0 ]]; then
        record_pass "Query results" "${result_count} up metrics returned"
    elif grep -q '"result"' <<<"$response"; then
        # Fallback — just verify we got a result
        record_pass "Query results" "results returned"
    else
        record_fail "Query results" "no results from up query"
    fi

    # TLS check if HTTPS
    _check_tls_certificate
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
# Parses options, validates settings, runs every suite, emits the chosen
# report, and exits 1 when any test failed (0 otherwise).
main() {
    # err() uses the color variables, so they must exist before argument
    # parsing can report a problem (otherwise `set -u` aborts first).
    setup_colors

    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --skip-alertmanager) SKIP_ALERTMANAGER=true ;;
            --skip-remote-write) SKIP_REMOTE_WRITE=true ;;
            --insecure) CURL_INSECURE=true ;;
            --verbose) VERBOSE=true ;;
            --no-color) COLOR=never ;;
            --timeout | --format | --junit-file | --expected-jobs | --query-timeout | --tsdb-path)
                if [[ $# -lt 2 ]]; then
                    err "Option $1 requires a value"
                    usage
                    exit 1
                fi
                case "$1" in
                    --timeout) CURL_TIMEOUT="$2" ;;
                    --format) OUTPUT_FORMAT="$2" ;;
                    --junit-file) JUNIT_FILE="$2" ;;
                    --expected-jobs) EXPECTED_JOBS="$2" ;;
                    --query-timeout) QUERY_TIMEOUT_MS="$2" ;;
                    --tsdb-path) TSDB_PATH="$2" ;;
                esac
                shift
                ;;
            --help | -h)
                usage
                exit 0
                ;;
            *)
                err "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
        shift
    done

    # Re-evaluate now that --no-color may have changed COLOR.
    setup_colors

    # Validate required vars
    if [[ -z "$PROMETHEUS_URL" ]]; then
        err "PROMETHEUS_URL is required"
        echo ""
        usage
        exit 1
    fi
    if [[ ! "$QUERY_TIMEOUT_MS" =~ ^[0-9]+$ ]]; then
        err "Query timeout must be a whole number of milliseconds: ${QUERY_TIMEOUT_MS}"
        exit 1
    fi
    if [[ ! "$CURL_TIMEOUT" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        err "Timeout must be a number of seconds: ${CURL_TIMEOUT}"
        exit 1
    fi

    # Strip trailing slash
    PROMETHEUS_URL="${PROMETHEUS_URL%/}"
    if [[ -n "$ALERTMANAGER_URL" ]]; then
        ALERTMANAGER_URL="${ALERTMANAGER_URL%/}"
    fi

    START_TIME=$(date +%s)

    # Header
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        print_tap_header
    else
        echo -e "${BOLD}Prometheus Smoke Tester${RESET}"
        echo -e "Target: ${PROMETHEUS_URL}"
        if [[ -n "$ALERTMANAGER_URL" && "$SKIP_ALERTMANAGER" != "true" ]]; then
            echo -e "Alertmanager: ${ALERTMANAGER_URL}"
        fi
        echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    fi

    # Run test suites
    test_targets
    test_rules
    test_alerts
    test_alertmanager
    test_configuration
    test_tsdb
    test_storage
    test_remote_write
    test_performance

    # Output
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        print_tap_footer
    elif [[ "$OUTPUT_FORMAT" == "junit" ]]; then
        write_junit
    fi
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        print_summary
    fi

    # Exit code
    if [[ $FAIL -gt 0 ]]; then
        exit 1
    fi
    exit 0
}

main "$@"