Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
This commit is contained in:
Executable
+991
@@ -0,0 +1,991 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#########################################################################################
|
||||
#### prometheus-smoke-tester.sh — Focused smoke tests for Prometheus + Alertmanager ####
|
||||
#### Zero external dependencies. Runs in air-gapped environments. ####
|
||||
#### Requires: bash 4+, curl, GNU grep (-P), awk; openssl (optional) ####
|
||||
#### ####
|
||||
#### Author: Phil Connor ####
|
||||
#### Contact: contact@mylinux.work ####
|
||||
#### License: MIT ####
|
||||
#### Version 1.01 ####
|
||||
#### ####
|
||||
#### Usage: ####
|
||||
#### export PROMETHEUS_URL="http://prometheus:9090" ####
|
||||
#### ./prometheus-smoke-tester.sh ####
|
||||
#### ####
|
||||
#### See --help for all options. ####
|
||||
#########################################################################################
|
||||
|
||||
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting below keeps the value found in the environment when it is set
# and non-empty; otherwise it receives the default given after ":=".

: "${PROMETHEUS_URL:=}"
: "${PROMETHEUS_AUTH_TOKEN:=}"
: "${ALERTMANAGER_URL:=}"
: "${EXPECTED_JOBS:=}"
: "${WATCHDOG_ALERT_NAME:=Watchdog}"
: "${QUERY_TIMEOUT_MS:=5000}"
: "${TSDB_PATH:=}"
: "${CURL_TIMEOUT:=10}"
: "${CURL_INSECURE:=false}"
: "${SKIP_ALERTMANAGER:=false}"
: "${SKIP_REMOTE_WRITE:=false}"
: "${OUTPUT_FORMAT:=text}"
: "${JUNIT_FILE:=smoke-results.xml}"
: "${VERBOSE:=false}"
: "${COLOR:=auto}"

# Result counters and the per-test log; each RESULTS entry has the form
# "STATUS|name|detail".
PASS=0 FAIL=0 SKIP=0 TOTAL=0
RESULTS=()

# Epoch seconds at which the run started (assigned in main).
START_TIME=0

# Name of the synthetic alert posted to Alertmanager; the shell PID makes it
# distinct per invocation.
TEST_ALERT_NAME="smoke_test_alert_$$"
|
||||
|
||||
# ── Colors ────────────────────────────────────────────────────────────
|
||||
|
||||
# Set the global colour variables RED, GREEN, YELLOW, BLUE, BOLD and RESET
# from $COLOR:
#   never  -> all six are empty strings
#   always -> ANSI escape sequences
#   other  -> ANSI escape sequences only when stdout (fd 1) is a terminal
setup_colors() {
    if [[ "$COLOR" != "never" ]] && { [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; }; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
        return
    fi

    # Colour disabled: plain output.
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
}
|
||||
|
||||
# ── Logging ───────────────────────────────────────────────────────────
|
||||
|
||||
# log MESSAGE... — informational line on stdout.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }

# warn MESSAGE... — warning line on stderr.
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }

# err MESSAGE... — error line on stderr.
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }

# verbose MESSAGE... — debug line, emitted only when VERBOSE is "true".
# Goes to stderr like warn/err: verbose is called from inside functions whose
# stdout is captured with $(...) (prom_curl, am_curl), so writing to stdout
# would splice "[DEBUG] ..." into the captured HTTP response and into TAP
# output.
verbose() {
    if [[ "$VERBOSE" == "true" ]]; then
        echo -e "${BLUE}[DEBUG]${RESET} $*" >&2
    fi
}
|
||||
|
||||
# ── Test Recording ────────────────────────────────────────────────────
|
||||
|
||||
# record_pass NAME [DETAIL]
# Count one passing test: bumps PASS and TOTAL, appends "PASS|NAME|DETAIL" to
# RESULTS, and prints either a TAP "ok" line (OUTPUT_FORMAT=tap) or a
# check-marked text line with the optional detail.
record_pass() {
    local test_name="$1"
    local note="${2:-}"

    PASS=$(( PASS + 1 ))
    TOTAL=$(( TOTAL + 1 ))
    RESULTS+=("PASS|${test_name}|${note}")

    case "$OUTPUT_FORMAT" in
        tap) echo "ok ${TOTAL} - ${test_name}" ;;
        *)   echo -e " ${GREEN}✓${RESET} ${test_name}${note:+ — ${note}}" ;;
    esac
}
|
||||
|
||||
# record_fail NAME [DETAIL]
# Count one failing test: bumps FAIL and TOTAL, appends "FAIL|NAME|DETAIL" to
# RESULTS, and prints either a TAP "not ok" line (followed by a "# DETAIL"
# diagnostic when DETAIL is non-empty) or a cross-marked text line.
# Always returns 0 so that recording a failure never trips `set -e`.
record_fail() {
    local name="$1"
    local detail="${2:-}"

    FAIL=$(( FAIL + 1 ))
    TOTAL=$(( TOTAL + 1 ))
    RESULTS+=("FAIL|${name}|${detail}")

    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "not ok ${TOTAL} - ${name}"
        # An explicit `if` (not `[[ ... ]] && echo`) keeps the function's exit
        # status at 0 when there is no detail to print.
        if [[ -n "$detail" ]]; then
            echo " # ${detail}"
        fi
    else
        echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
    fi
    return 0
}
|
||||
|
||||
# record_skip NAME [REASON]
# Count one skipped test: bumps SKIP and TOTAL, appends "SKIP|NAME|REASON" to
# RESULTS, and prints either a TAP "ok ... # SKIP" line (OUTPUT_FORMAT=tap)
# or a text line with the optional reason.
record_skip() {
    local test_name="$1"
    local why="${2:-}"

    SKIP=$(( SKIP + 1 ))
    TOTAL=$(( TOTAL + 1 ))
    RESULTS+=("SKIP|${test_name}|${why}")

    case "$OUTPUT_FORMAT" in
        tap) echo "ok ${TOTAL} - ${test_name} # SKIP ${why}" ;;
        *)   echo -e " ${YELLOW}⊘${RESET} ${test_name}${why:+ — ${why}}" ;;
    esac
}
|
||||
|
||||
# ── TAP ───────────────────────────────────────────────────────────────
|
||||
|
||||
# Print the TAP protocol version line that opens a TAP stream.
print_tap_header() {
    printf '%s\n' "TAP version 13"
}
|
||||
|
||||
# Print the TAP plan line ("1..TOTAL") followed by comment lines carrying the
# pass, fail and skip counters.
print_tap_footer() {
    printf '%s\n' \
        "1..${TOTAL}" \
        "# pass ${PASS}" \
        "# fail ${FAIL}" \
        "# skip ${SKIP}"
}
|
||||
|
||||
# ── JUnit XML ─────────────────────────────────────────────────────────
|
||||
|
||||
# xml_escape STRING
# Print STRING with the XML-special characters & < > " replaced by their
# entities so it can be embedded in element text or a double-quoted attribute.
xml_escape() {
    local s="$1"
    local amp='&amp;' lt='&lt;' gt='&gt;' quot='&quot;'

    # The replacements are expanded from *quoted* variables: since bash 5.2
    # (patsub_replacement) an unquoted "&" in a replacement string stands for
    # the matched text, which would turn "<" into "<lt;".
    # "&" must be handled first so the entities added below are not re-escaped.
    s=${s//&/"$amp"}
    s=${s//</"$lt"}
    s=${s//>/"$gt"}
    s=${s//\"/"$quot"}

    printf '%s\n' "$s"
}
|
||||
|
||||
# Write the collected RESULTS to $JUNIT_FILE as a JUnit XML report.
# The file is truncated first, then gets one <testcase> per RESULTS entry
# ("STATUS|name|detail"): PASS entries carry an optional <system-out>, FAIL
# entries a <failure>, SKIP entries a <skipped/>. Names and details are passed
# through xml_escape. Elapsed time is measured from START_TIME in whole seconds.
write_junit() {
    local finished elapsed
    finished=$(date +%s)
    elapsed=$(( finished - START_TIME ))

    cat > "$JUNIT_FILE" <<JUNIT_EOF
<?xml version="1.0" encoding="UTF-8"?>
<testsuites tests="${TOTAL}" failures="${FAIL}" skipped="${SKIP}" time="${elapsed}">
<testsuite name="prometheus-smoke-tester" tests="${TOTAL}" failures="${FAIL}" skipped="${SKIP}" time="${elapsed}">
JUNIT_EOF

    local entry outcome remainder case_name case_detail
    for entry in "${RESULTS[@]}"; do
        # Split "STATUS|name|detail" at the first two pipes only, so the
        # detail may itself contain "|".
        outcome="${entry%%|*}"
        remainder="${entry#*|}"
        case_name=$(xml_escape "${remainder%%|*}")
        case_detail=$(xml_escape "${remainder#*|}")

        # Entries with an unrecognised status produce no output at all.
        case "$outcome" in
            PASS | FAIL | SKIP) ;;
            *) continue ;;
        esac

        {
            echo " <testcase name=\"${case_name}\" classname=\"smoke\">"
            if [[ "$outcome" == "PASS" ]]; then
                if [[ -n "$case_detail" ]]; then
                    echo " <system-out>${case_detail}</system-out>"
                fi
            elif [[ "$outcome" == "FAIL" ]]; then
                echo " <failure message=\"${case_detail}\">FAILED: ${case_name} — ${case_detail}</failure>"
            else
                echo " <skipped message=\"${case_detail}\"/>"
            fi
            echo " </testcase>"
        } >> "$JUNIT_FILE"
    done

    {
        echo " </testsuite>"
        echo "</testsuites>"
    } >> "$JUNIT_FILE"

    log "JUnit report written to ${JUNIT_FILE}"
}
|
||||
|
||||
# ── curl Wrapper ──────────────────────────────────────────────────────
|
||||
|
||||
# prom_curl URL [CURL_ARGS...]
# Fetch URL with curl and print the response body on stdout.
# Adds --max-time $CURL_TIMEOUT, -k when CURL_INSECURE is "true", and a Bearer
# Authorization header when PROMETHEUS_AUTH_TOKEN is non-empty. Extra
# arguments are passed to curl before the URL. curl's own stderr is discarded;
# the function's exit status is curl's.
prom_curl() {
    local url="$1"
    shift
    local curl_opts=(-s -S --max-time "$CURL_TIMEOUT")

    if [[ "$CURL_INSECURE" == "true" ]]; then
        curl_opts+=(-k)
    fi
    if [[ -n "$PROMETHEUS_AUTH_TOKEN" ]]; then
        curl_opts+=(-H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}")
    fi

    # Callers capture this function's stdout as the response body, so the
    # debug line is forced onto stderr to keep it out of that capture.
    verbose "curl GET ${url}" >&2
    curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}
|
||||
|
||||
# prom_curl_status URL [CURL_ARGS...]
# Request URL and print only the HTTP status code (curl -w "%{http_code}");
# the response body goes to /dev/null. Uses the same timeout, -k and Bearer
# token handling as the body-fetching wrapper. curl's stderr is discarded and
# its exit status is returned.
prom_curl_status() {
    local target="$1"
    shift

    local -a args=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
    case "$CURL_INSECURE" in
        true) args+=(-k) ;;
    esac
    if [[ -n "$PROMETHEUS_AUTH_TOKEN" ]]; then
        args+=(-H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}")
    fi

    curl "${args[@]}" "$@" "$target" 2>/dev/null
}
|
||||
|
||||
# am_curl URL [CURL_ARGS...]
# Fetch an Alertmanager URL with curl and print the response body on stdout.
# Adds --max-time $CURL_TIMEOUT and -k when CURL_INSECURE is "true"; no
# Authorization header is sent. Extra arguments are passed to curl before the
# URL. curl's stderr is discarded; the exit status is curl's.
am_curl() {
    local url="$1"
    shift
    local curl_opts=(-s -S --max-time "$CURL_TIMEOUT")

    if [[ "$CURL_INSECURE" == "true" ]]; then
        curl_opts+=(-k)
    fi

    # Callers capture this function's stdout as the response body, so the
    # debug line is forced onto stderr to keep it out of that capture.
    verbose "curl GET ${url}" >&2
    curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}
|
||||
|
||||
# am_curl_status URL [CURL_ARGS...]
# Request an Alertmanager URL and print only the HTTP status code; the body is
# sent to /dev/null. Adds --max-time $CURL_TIMEOUT and -k when CURL_INSECURE
# is "true". curl's stderr is discarded and its exit status is returned.
am_curl_status() {
    local target="$1"
    shift

    local -a args=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
    case "$CURL_INSECURE" in
        true) args+=(-k) ;;
    esac

    curl "${args[@]}" "$@" "$target" 2>/dev/null
}
|
||||
|
||||
# ── JSON Helpers (no jq) ──────────────────────────────────────────────
|
||||
|
||||
# json_value KEY JSON
# Print the first value found for "KEY" in JSON: the text after the colon (and
# an optional opening quote) up to the next quote, comma or closing brace.
# Prints nothing when the key is absent. This is a regex scan, not a JSON
# parser, so it does not understand nesting or escaped quotes.
# Requires a grep with PCRE support (-P).
json_value() {
    local key="$1"
    local json="$2"
    local matches

    # \Q...\E makes the key literal, so "a.b" does not also match "axb".
    # Capturing all matches and trimming in the shell avoids a
    # `... | head -1` pipeline, whose early exit can raise SIGPIPE upstream
    # and fail the caller under `set -o pipefail`.
    matches=$(grep -oP "\"\Q${key}\E\"\s*:\s*\"?\K[^\",}]+" <<<"$json" || true)

    if [[ -n "$matches" ]]; then
        printf '%s\n' "${matches%%$'\n'*}"
    fi
}
|
||||
|
||||
# json_value_string KEY JSON
# Print the first *string* value found for "KEY" in JSON: the text between the
# quotes that follow the colon (possibly empty). Prints nothing when the key
# is absent or its value is not a quoted string. This is a regex scan, not a
# JSON parser, so escaped quotes inside the value are not handled.
# Requires a grep with PCRE support (-P).
json_value_string() {
    local key="$1"
    local json="$2"
    local matches

    # \Q...\E makes the key literal; keys such as
    # "storage.tsdb.retention.time" contain dots that would otherwise act as
    # regex wildcards. Trimming to the first match in the shell avoids a
    # `... | head -1` pipeline and its SIGPIPE/pipefail interaction.
    matches=$(grep -oP "\"\Q${key}\E\"\s*:\s*\"\K[^\"]*" <<<"$json" || true)

    if [[ -n "$matches" ]]; then
        printf '%s\n' "${matches%%$'\n'*}"
    fi
}
|
||||
|
||||
# ── Cleanup ───────────────────────────────────────────────────────────
|
||||
|
||||
# EXIT handler. Acts only when a test alert was posted (TEST_ALERT_POSTED is
# "true"), Alertmanager tests were not skipped, and ALERTMANAGER_URL is set.
# In that case it issues a request to the Alertmanager silences endpoint via
# am_curl with no extra curl arguments, discards all output and ignores
# failure.
# NOTE(review): with no method or payload passed, that request only reads the
# silences list — it does not appear to remove the alert itself; confirm the
# intended cleanup.
cleanup() {
    [[ "${TEST_ALERT_POSTED:-false}" == "true" ]] || return 0
    [[ "$SKIP_ALERTMANAGER" != "true" ]] || return 0
    [[ -n "$ALERTMANAGER_URL" ]] || return 0

    verbose "Cleaning up test alert from Alertmanager"
    am_curl "${ALERTMANAGER_URL}/api/v2/silences" >/dev/null 2>&1 || true
}

# Run cleanup on every exit path of the script.
trap cleanup EXIT
|
||||
|
||||
# ── Usage ─────────────────────────────────────────────────────────────
|
||||
|
||||
# Print the help text (environment variables, command-line options and
# examples) on stdout. The script's base name is substituted wherever the
# text shows how to invoke it.
usage() {
    local prog
    prog=$(basename "$0")

    cat <<EOF
Usage: ${prog} [OPTIONS]

Smoke-test a Prometheus + Alertmanager stack. Zero external dependencies — bash 4+ and curl only.
Validates targets, rules, alerts, TSDB health, storage, remote write, and query performance.

Required environment variables:
PROMETHEUS_URL Prometheus base URL (http://prometheus:9090)

Optional environment variables:
PROMETHEUS_AUTH_TOKEN Bearer token for authentication
ALERTMANAGER_URL Alertmanager base URL (http://alertmanager:9093)
EXPECTED_JOBS Comma-separated list of expected scrape job names
WATCHDOG_ALERT_NAME Dead man's switch alert name (default: Watchdog)
QUERY_TIMEOUT_MS Max query response time in ms (default: 5000)
TSDB_PATH Local TSDB data path for disk checks
CURL_TIMEOUT HTTP timeout in seconds (default: 10)
CURL_INSECURE Allow self-signed certs (default: false)
SKIP_ALERTMANAGER Skip Alertmanager tests (default: false)
SKIP_REMOTE_WRITE Skip remote write checks (default: false)
OUTPUT_FORMAT Output: text (default), tap, junit
JUNIT_FILE JUnit output path (default: smoke-results.xml)

Options:
--skip-alertmanager Skip Alertmanager integration tests
--skip-remote-write Skip remote write checks
--insecure Allow self-signed TLS certificates (-k)
--timeout N curl timeout in seconds (default: 10)
--format FORMAT Output: text (default), tap, junit
--junit-file FILE JUnit output path (default: smoke-results.xml)
--expected-jobs JOBS Comma-separated expected scrape jobs
--query-timeout MS Max query response time in ms (default: 5000)
--tsdb-path PATH Local TSDB data path for disk checks
--verbose Show debug output
--no-color Disable colored output
--help Show this help

Examples:
# Basic run
export PROMETHEUS_URL="http://prometheus:9090"
./${prog}

# With expected jobs and Alertmanager
export PROMETHEUS_URL="http://prometheus:9090"
export ALERTMANAGER_URL="http://alertmanager:9093"
export EXPECTED_JOBS="node,prometheus,blackbox,cadvisor"
./${prog}

# JUnit XML for CI
./${prog} --format junit --junit-file results.xml

# Self-signed certs
./${prog} --insecure
EOF
}
|
||||
|
||||
# ── Section Header ────────────────────────────────────────────────────
|
||||
|
||||
# section TITLE
# Print a blank line followed by TITLE in bold to introduce a test suite.
# Prints nothing when OUTPUT_FORMAT is "tap".
section() {
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        return 0
    fi

    echo ""
    echo -e "${BOLD}$1${RESET}"
}
|
||||
|
||||
# ── Summary ───────────────────────────────────────────────────────────
|
||||
|
||||
# Print the closing summary: the Prometheus URL, the pass/fail/skip counters
# with the elapsed whole seconds since START_TIME, and a one-line verdict
# that depends on whether FAIL is zero.
print_summary() {
    local finished elapsed
    finished=$(date +%s)
    elapsed=$(( finished - START_TIME ))

    local rule="${BOLD}────────────────────────────────────────${RESET}"

    echo ""
    echo -e "$rule"
    echo -e "${BOLD}Summary${RESET} ${PROMETHEUS_URL}"
    echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${elapsed}s)"
    echo -e "$rule"

    local verdict="${GREEN}${BOLD}All tests passed.${RESET}"
    if [[ $FAIL -ne 0 ]]; then
        verdict="${RED}${BOLD}${FAIL} test(s) failed.${RESET}"
    fi
    echo -e "$verdict"
}
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# TEST SUITES
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ── Targets ───────────────────────────────────────────────────────────
|
||||
|
||||
# Target checks against ${PROMETHEUS_URL}/api/v1/targets:
#   - the endpoint answers with "status":"success"
#   - no target reports "health":"down"
#   - no target carries a non-empty "lastError"
#   - every job listed in EXPECTED_JOBS (comma-separated) appears as a "job"
#     value somewhere in the response
# Results are reported through record_pass / record_fail. The response is
# scanned with regular expressions, not parsed as JSON.
test_targets() {
    section "Targets"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/targets") || {
        record_fail "Targets API reachable" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" == "success" ]]; then
        record_pass "Targets API reachable" "HTTP 200"
    else
        record_fail "Targets API reachable" "status: ${status:-empty}"
        return
    fi

    # Count active targets by health value; targets whose health is neither
    # "up" nor "down" are not counted.
    local up_count down_count
    up_count=$({ grep -oP '"health"\s*:\s*"up"' <<<"$response" || true; } | wc -l)
    down_count=$({ grep -oP '"health"\s*:\s*"down"' <<<"$response" || true; } | wc -l)
    local total_targets=$(( up_count + down_count ))

    if [[ $down_count -eq 0 ]]; then
        record_pass "All targets healthy" "${up_count}/${total_targets} targets up"
    else
        record_fail "Targets health" "${down_count}/${total_targets} targets down"
    fi

    # Scrape errors: count every non-empty lastError. Counting directly (no
    # `head`) means no pipeline stage exits early, so nothing upstream can be
    # killed by SIGPIPE and fail the pipeline under pipefail.
    local error_count
    error_count=$({ grep -oP '"lastError"\s*:\s*"[^"]+"' <<<"$response" || true; } | wc -l)
    if [[ $error_count -eq 0 ]]; then
        record_pass "No scrape errors" "all targets scraping cleanly"
    else
        record_fail "Scrape errors detected" "${error_count} target(s) with errors"
    fi

    # Check expected jobs if specified
    if [[ -n "$EXPECTED_JOBS" ]]; then
        local active_jobs
        active_jobs=$({ grep -oP '"job"\s*:\s*"\K[^"]+' <<<"$response" || true; } | sort -u)

        # Locals, so the loop does not leak variables into the global scope.
        local -a expected_arr=()
        local job missing=""
        local checked=0
        IFS=',' read -ra expected_arr <<< "$EXPECTED_JOBS"

        for job in "${expected_arr[@]}"; do
            job="${job// /}"
            # Ignore empty entries produced by input such as "a,,b".
            [[ -n "$job" ]] || continue
            checked=$(( checked + 1 ))
            # -F: compare the job name literally. As a regex, "node.exporter"
            # would also accept "nodeXexporter".
            if ! grep -qxF -- "$job" <<< "$active_jobs"; then
                missing="${missing:+${missing}, }${job}"
            fi
        done

        if [[ -z "$missing" ]]; then
            record_pass "Expected jobs present" "${checked} jobs found"
        else
            record_fail "Expected jobs missing" "missing: ${missing}"
        fi
    fi
}
|
||||
|
||||
# ── Rules ─────────────────────────────────────────────────────────────
|
||||
|
||||
# Rule checks against ${PROMETHEUS_URL}/api/v1/rules:
#   - the endpoint answers with "status":"success"
#   - number of recording and alerting rules (by their "type" value)
#   - number of non-empty "lastError" values
#   - number of "evaluationTime" values above 5 seconds
# Results are reported through record_pass / record_fail / record_skip. The
# response is scanned with regular expressions, not parsed as JSON.
test_rules() {
    section "Rules"

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/rules") || {
        record_fail "Rules API reachable" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Rules API reachable" "status: ${status:-empty}"
        return
    fi

    # Count rule groups by their "file" key. Counting "name" keys would also
    # count every rule, since rules carry a "name" too.
    # NOTE(review): assumes each group object has exactly one "file" key and
    # rules have none, as in the documented /api/v1/rules response — confirm
    # against the deployed Prometheus version.
    local group_count
    group_count=$({ grep -oP '"file"\s*:\s*"' <<<"$response" || true; } | wc -l)

    # Recording rules
    local recording_count
    recording_count=$({ grep -oP '"type"\s*:\s*"recording"' <<<"$response" || true; } | wc -l)
    if [[ $recording_count -gt 0 ]]; then
        record_pass "Recording rules loaded" "${recording_count} recording rules in ${group_count} groups"
    else
        record_pass "Recording rules" "none configured"
    fi

    # Alerting rules
    local alerting_count
    alerting_count=$({ grep -oP '"type"\s*:\s*"alerting"' <<<"$response" || true; } | wc -l)
    if [[ $alerting_count -gt 0 ]]; then
        record_pass "Alerting rules loaded" "${alerting_count} alerting rules"
    else
        record_skip "Alerting rules" "none configured"
    fi

    # Rule evaluation errors: count every non-empty lastError. The count is
    # taken over all matches (not a `head -5` sample), so the reported number
    # is not capped at 5.
    local error_count
    error_count=$({ grep -oP '"lastError"\s*:\s*"\K[^"]+' <<<"$response" || true; } | wc -l)
    if [[ $error_count -eq 0 ]]; then
        record_pass "Rule evaluation healthy" "no evaluation errors"
    else
        record_fail "Rule evaluation errors" "${error_count} rule(s) with errors"
    fi

    # Evaluation duration: every "evaluationTime" value above 5 seconds.
    # NOTE(review): the pattern matches any "evaluationTime" key, so per-rule
    # timings (if present in the response) are counted alongside per-group ones.
    local slow_groups
    slow_groups=$({ grep -oP '"evaluationTime"\s*:\s*\K[0-9.]+' <<<"$response" || true; } | awk '$1 > 5.0' | wc -l)
    if [[ $slow_groups -eq 0 ]]; then
        record_pass "Rule group evaluation speed" "all groups under 5s"
    else
        record_fail "Slow rule groups" "${slow_groups} group(s) over 5s evaluation time"
    fi
}
|
||||
|
||||
# ── Alerts ────────────────────────────────────────────────────────────
|
||||
|
||||
# Alert checks against ${PROMETHEUS_URL}/api/v1/alerts:
#   - the endpoint answers with "status":"success"
#   - no alert in the response carries "severity":"critical"
#   - an alert named $WATCHDOG_ALERT_NAME is present in the response
# Results are reported through record_pass / record_fail.
test_alerts() {
    section "Alerts"

    local payload
    if ! payload=$(prom_curl "${PROMETHEUS_URL}/api/v1/alerts"); then
        record_fail "Alerts API reachable" "connection failed"
        return
    fi

    local api_status
    api_status=$(json_value "status" "$payload")
    if [[ "$api_status" != "success" ]]; then
        record_fail "Alerts API reachable" "status: ${api_status:-empty}"
        return
    fi

    # Occurrences of a critical severity label anywhere in the response.
    local critical_hits
    critical_hits=$({ grep -oP '"severity"\s*:\s*"critical"' <<<"$payload" || true; } | wc -l)
    if (( critical_hits > 0 )); then
        record_fail "Critical alerts firing" "${critical_hits} critical alert(s) active"
    else
        record_pass "No critical alerts firing" "clean"
    fi

    # Dead man's switch: the always-on alert must be listed.
    local watchdog_pattern="\"alertname\"\s*:\s*\"${WATCHDOG_ALERT_NAME}\""
    local watchdog_hits
    watchdog_hits=$({ grep -oP "$watchdog_pattern" <<<"$payload" || true; } | wc -l)
    if (( watchdog_hits > 0 )); then
        record_pass "Watchdog alert firing" "dead man's switch active"
    else
        record_fail "Watchdog alert not firing" "${WATCHDOG_ALERT_NAME} should always fire"
    fi
}
|
||||
|
||||
# ── Alertmanager Integration ──────────────────────────────────────────
|
||||
|
||||
# Alertmanager integration checks. Skipped when SKIP_ALERTMANAGER is "true" or
# ALERTMANAGER_URL is empty. Otherwise:
#   - ${ALERTMANAGER_URL}/-/healthy must answer HTTP 200
#   - ${PROMETHEUS_URL}/api/v1/alertmanagers must list at least one active
#     Alertmanager
#   - a synthetic alert named $TEST_ALERT_NAME is POSTed to /api/v2/alerts,
#     looked up again after 2 seconds, and then covered by a short silence
# Sets the global TEST_ALERT_POSTED=true once the alert has been accepted.
test_alertmanager() {
    section "Alertmanager Integration"

    if [[ "$SKIP_ALERTMANAGER" == "true" ]]; then
        record_skip "Alertmanager tests" "skipped via --skip-alertmanager"
        return
    fi

    if [[ -z "$ALERTMANAGER_URL" ]]; then
        record_skip "Alertmanager tests" "ALERTMANAGER_URL not set"
        return
    fi

    # Reachability
    local am_status
    am_status=$(am_curl_status "${ALERTMANAGER_URL}/-/healthy") || am_status="000"
    if [[ "$am_status" == "200" ]]; then
        record_pass "Alertmanager reachable" "HTTP 200"
    else
        record_fail "Alertmanager reachable" "HTTP ${am_status}"
        return
    fi

    # Check Prometheus can reach Alertmanager via its own API
    local am_response
    am_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/alertmanagers") || {
        record_skip "Prometheus→Alertmanager link" "API unavailable"
        return
    }

    # Count the "url" keys inside the activeAlertmanagers array. The array
    # arrives on a single line, so counting matching *lines* (grep -c) would
    # report at most 1 regardless of how many Alertmanagers are listed.
    local active_am
    active_am=$(
        { grep -oP '"activeAlertmanagers"\s*:\s*\[\K[^\]]*' <<<"$am_response" || true; } \
            | { grep -o '"url"' || true; } \
            | wc -l
    )
    if [[ $active_am -gt 0 ]]; then
        record_pass "Prometheus→Alertmanager connected" "${active_am} active Alertmanager(s)"
    else
        record_fail "Prometheus→Alertmanager disconnected" "no active Alertmanagers"
    fi

    # Fire a test alert. endsAt is "now + 2 minutes", trying GNU date (-d)
    # first, then BSD date (-v), then a fixed far-future timestamp.
    local alert_payload
    alert_payload=$(cat <<ALERT_EOF
[{"labels":{"alertname":"${TEST_ALERT_NAME}","severity":"none","source":"smoke-test"},"annotations":{"summary":"Prometheus smoke test alert — will auto-resolve"},"startsAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","endsAt":"$(date -u -d '+2 minutes' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v+2M +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "2099-01-01T00:00:00Z")"}]
ALERT_EOF
)

    local post_status
    post_status=$(am_curl_status "${ALERTMANAGER_URL}/api/v2/alerts" -X POST -H "Content-Type: application/json" -d "$alert_payload") || post_status="000"
    if [[ "$post_status" == "200" ]]; then
        TEST_ALERT_POSTED=true
        record_pass "Test alert posted to Alertmanager" "HTTP 200"

        # Wait briefly and verify
        sleep 2
        local verify_response
        verify_response=$(am_curl "${ALERTMANAGER_URL}/api/v2/alerts?filter=alertname%3D${TEST_ALERT_NAME}") || verify_response=""
        if [[ "$verify_response" == *"$TEST_ALERT_NAME"* ]]; then
            record_pass "Test alert visible in Alertmanager" "alert round-trip confirmed"
        else
            record_fail "Test alert not visible" "posted but not found in /api/v2/alerts"
        fi

        # Silence the test alert for one minute; failure to do so is ignored.
        local silence_payload
        silence_payload=$(cat <<SILENCE_EOF
{"matchers":[{"name":"alertname","value":"${TEST_ALERT_NAME}","isRegex":false}],"startsAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","endsAt":"$(date -u -d '+1 minute' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v+1M +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "2099-01-01T00:00:00Z")","createdBy":"prometheus-smoke-tester","comment":"auto-cleanup"}
SILENCE_EOF
)
        am_curl_status "${ALERTMANAGER_URL}/api/v2/silences" -X POST -H "Content-Type: application/json" -d "$silence_payload" >/dev/null 2>&1 || true
        verbose "Test alert silenced for cleanup"
    else
        record_fail "Test alert post failed" "HTTP ${post_status}"
    fi
}
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────
|
||||
|
||||
# Configuration checks:
#   - ${PROMETHEUS_URL}/api/v1/status/config answers with "status":"success"
#   - ${PROMETHEUS_URL}/api/v1/status/flags answers with "status":"success";
#     the storage.tsdb.retention.time flag is reported in the detail, or
#     "default" when that flag has no value in the response
# Results are reported through record_pass / record_fail.
test_configuration() {
    section "Configuration"

    # Loaded configuration
    local cfg_body
    if ! cfg_body=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config"); then
        record_fail "Configuration loaded" "connection failed"
        return
    fi

    local cfg_state
    cfg_state=$(json_value "status" "$cfg_body")
    case "$cfg_state" in
        success) record_pass "Configuration loaded" "prometheus.yml active" ;;
        *)       record_fail "Configuration loaded" "status: ${cfg_state:-empty}" ;;
    esac

    # Runtime flags — checked even when the config check above failed.
    local flags_body
    if ! flags_body=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags"); then
        record_fail "Runtime flags" "connection failed"
        return
    fi

    local flags_state
    flags_state=$(json_value "status" "$flags_body")
    if [[ "$flags_state" != "success" ]]; then
        record_fail "Runtime flags" "status: ${flags_state:-empty}"
        return
    fi

    local retention_flag
    retention_flag=$(json_value_string "storage.tsdb.retention.time" "$flags_body")
    record_pass "Runtime flags accessible" "retention: ${retention_flag:-default}"
}
|
||||
|
||||
# ── TSDB Health ───────────────────────────────────────────────────────
|
||||
|
||||
# TSDB checks:
#   - ${PROMETHEUS_URL}/api/v1/status/tsdb answers with "status":"success";
#     numSeries and chunkCount are reported when present
#   - ${PROMETHEUS_URL}/api/v1/status/runtimeinfo, when available and
#     successful, contributes storageRetention and lastGarbageCollection
# Every value that is found is recorded as a pass; a missing runtime-info
# endpoint ends the suite quietly without recording anything further.
test_tsdb() {
    section "TSDB Health"

    local tsdb_body
    if ! tsdb_body=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/tsdb"); then
        record_fail "TSDB status API" "connection failed"
        return
    fi

    local tsdb_state
    tsdb_state=$(json_value "status" "$tsdb_body")
    if [[ "$tsdb_state" != "success" ]]; then
        record_fail "TSDB status API" "status: ${tsdb_state:-empty}"
        return
    fi

    # Head stats
    local series_total
    series_total=$(json_value "numSeries" "$tsdb_body")
    if [[ -z "$series_total" ]]; then
        record_pass "TSDB status" "responding"
    else
        record_pass "TSDB head series" "${series_total} active series"
    fi

    local chunk_total
    chunk_total=$(json_value "chunkCount" "$tsdb_body")
    if [[ -n "$chunk_total" ]]; then
        record_pass "TSDB head chunks" "${chunk_total} chunks in head"
    fi

    # Runtime info
    local runtime_body
    if ! runtime_body=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo"); then
        verbose "Runtime info unavailable"
        return
    fi

    local runtime_state
    runtime_state=$(json_value "status" "$runtime_body")
    [[ "$runtime_state" == "success" ]] || return

    # The suite treats a reported storageRetention as the sign that the TSDB
    # is functional.
    local retention_value
    retention_value=$(json_value_string "storageRetention" "$runtime_body")
    if [[ -n "$retention_value" ]]; then
        record_pass "TSDB WAL operational" "retention: ${retention_value}"
    fi

    # Timestamp of the last garbage collection, reported under "compaction".
    local gc_stamp
    gc_stamp=$(json_value_string "lastGarbageCollection" "$runtime_body")
    if [[ -n "$gc_stamp" ]]; then
        record_pass "TSDB compaction" "last GC: ${gc_stamp}"
    fi
}
|
||||
|
||||
# ── Storage ───────────────────────────────────────────────────────────
|
||||
|
||||
# _tsdb_dir_size DIR
# Print the human-readable size of DIR as reported by `du -sh`, or "unknown"
# when du prints nothing. A non-zero exit from du is tolerated: it still
# prints a total in that case, and letting the failure propagate would abort
# the whole script under `set -e` with pipefail.
_tsdb_dir_size() {
    local size
    size=$(du -sh -- "$1" 2>/dev/null | awk '{print $1}') || true
    printf '%s\n' "${size:-unknown}"
}

# Storage checks:
#   - retention time/size flags from ${PROMETHEUS_URL}/api/v1/status/flags
#   - when TSDB_PATH is set: the directory exists, its disk usage, the wal/
#     and chunks_head/ subdirectories, and the presence of the lock file
# Local disk checks are skipped when TSDB_PATH is empty.
test_storage() {
    section "Storage"

    # Retention from flags
    local flags_response
    flags_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags") || {
        record_fail "Storage flags" "connection failed"
        return
    }

    local retention_time retention_size
    retention_time=$(json_value_string "storage.tsdb.retention.time" "$flags_response")
    retention_size=$(json_value_string "storage.tsdb.retention.size" "$flags_response")

    if [[ -n "$retention_time" ]]; then
        record_pass "Retention configured" "time: ${retention_time}${retention_size:+, size: ${retention_size}}"
    else
        record_pass "Retention" "default (15d)"
    fi

    # Local disk check if path provided
    if [[ -z "$TSDB_PATH" ]]; then
        record_skip "Local disk checks" "TSDB_PATH not set"
        return
    fi

    if [[ ! -d "$TSDB_PATH" ]]; then
        record_fail "TSDB path" "${TSDB_PATH} not found"
        return
    fi

    record_pass "TSDB disk usage" "$(_tsdb_dir_size "$TSDB_PATH") at ${TSDB_PATH}"

    # Check WAL directory
    if [[ -d "${TSDB_PATH}/wal" ]]; then
        record_pass "WAL directory" "$(_tsdb_dir_size "${TSDB_PATH}/wal")"
    else
        record_fail "WAL directory" "not found at ${TSDB_PATH}/wal"
    fi

    # Check chunks_head directory (reported only when it exists)
    if [[ -d "${TSDB_PATH}/chunks_head" ]]; then
        record_pass "Head chunks on disk" "$(_tsdb_dir_size "${TSDB_PATH}/chunks_head")"
    fi

    # Check for lock file (Prometheus is running)
    if [[ -f "${TSDB_PATH}/lock" ]]; then
        record_pass "TSDB lock file" "Prometheus holds lock"
    else
        record_fail "TSDB lock file" "no lock — Prometheus may not be running"
    fi
}
|
||||
|
||||
# ── Remote Write ──────────────────────────────────────────────────────
|
||||
|
||||
# Remote-write checks (skipped when SKIP_REMOTE_WRITE is "true"):
#   - ${PROMETHEUS_URL}/api/v1/status/runtimeinfo answers with "status":"success"
#   - the text returned by ${PROMETHEUS_URL}/api/v1/status/config contains
#     "remote_write"; if so, a "walSize" value from the runtime info is
#     reported when present
# Results are reported through record_pass / record_fail / record_skip.
test_remote_write() {
    section "Remote Write"

    if [[ "$SKIP_REMOTE_WRITE" == "true" ]]; then
        record_skip "Remote write tests" "skipped via --skip-remote-write"
        return
    fi

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo") || {
        record_fail "Runtime info for remote write" "connection failed"
        return
    }

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Runtime info" "status: ${status:-empty}"
        return
    fi

    # Check if remote write is configured by looking at the config
    local config_response
    config_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config") || {
        record_skip "Remote write config" "config API unavailable"
        return
    }

    # Substring test done in the shell. Piping a large config into `grep -q`
    # lets grep exit at the first match while the writer is still running; the
    # writer is then killed by SIGPIPE and, under pipefail, the pipeline is
    # reported as failed — i.e. "not configured" although the match was found.
    if [[ "$config_response" == *remote_write* ]]; then
        record_pass "Remote write configured" "remote_write section present"

        # Report the WAL size from runtime info when the response carries one.
        local wal_size
        wal_size=$(json_value "walSize" "$response")
        if [[ -n "$wal_size" ]]; then
            record_pass "Remote write WAL" "WAL size: ${wal_size}"
        fi
    else
        record_skip "Remote write" "not configured"
    fi
}
|
||||
|
||||
# ── Performance ───────────────────────────────────────────────────────
|
||||
|
||||
# Performance checks:
#   - run the instant query "up" against ${PROMETHEUS_URL}/api/v1/query and
#     compare the wall-clock time with QUERY_TIMEOUT_MS (when the local `date`
#     can provide nanosecond timestamps)
#   - the query returned at least one series, or at least a "result" field
#   - for https:// URLs, and when openssl is installed, the days remaining on
#     the server certificate (fails at 30 days or fewer)
# Results are reported through record_pass / record_fail.
test_performance() {
    section "Performance"

    # Query up metric and measure response time
    local start_ns end_ns
    start_ns=$(date +%s%N 2>/dev/null || echo "0")

    local response
    response=$(prom_curl "${PROMETHEUS_URL}/api/v1/query?query=up") || {
        record_fail "Query performance" "query failed"
        return
    }

    end_ns=$(date +%s%N 2>/dev/null || echo "0")

    local status
    status=$(json_value "status" "$response")
    if [[ "$status" != "success" ]]; then
        record_fail "Query engine" "status: ${status:-empty}"
        return
    fi

    # Calculate duration. Both timestamps must be plain positive integers: a
    # `date` without %N support may succeed yet print a literal "N", which
    # would make the arithmetic below fail.
    if [[ "$start_ns" =~ ^[1-9][0-9]*$ && "$end_ns" =~ ^[1-9][0-9]*$ ]]; then
        local duration_ms=$(( (end_ns - start_ns) / 1000000 ))
        if [[ $duration_ms -lt $QUERY_TIMEOUT_MS ]]; then
            record_pass "Query response time" "${duration_ms}ms (threshold: ${QUERY_TIMEOUT_MS}ms)"
        else
            record_fail "Query response slow" "${duration_ms}ms exceeds ${QUERY_TIMEOUT_MS}ms threshold"
        fi
    else
        record_pass "Query engine" "up query returned successfully"
    fi

    # Count result series
    local result_count
    result_count=$({ grep -oP '"__name__"\s*:\s*"up"' <<<"$response" || true; } | wc -l)
    if [[ $result_count -gt 0 ]]; then
        record_pass "Query results" "${result_count} up metrics returned"
    elif [[ "$response" == *'"result"'* ]]; then
        # Fallback — just verify we got a result. Tested in the shell rather
        # than via `echo | grep -q`, which can fail spuriously under pipefail.
        record_pass "Query results" "results returned"
    else
        record_fail "Query results" "no results from up query"
    fi

    # TLS check if HTTPS
    if [[ "$PROMETHEUS_URL" != https://* ]]; then
        return 0
    fi
    if ! command -v openssl >/dev/null 2>&1; then
        verbose "openssl not found; skipping TLS certificate check"
        return 0
    fi

    # Split "host[:port]" out of the URL; the port defaults to 443.
    local authority prom_host prom_port
    authority="${PROMETHEUS_URL#https://}"
    authority="${authority%%/*}"
    prom_host="${authority%%:*}"
    if [[ "$authority" == *:* ]]; then
        prom_port="${authority##*:}"
    else
        prom_port="443"
    fi

    # `|| true`: a failed connection or unreadable certificate leaves
    # cert_expiry empty instead of aborting the script under set -e/pipefail.
    local cert_expiry
    cert_expiry=$(echo | openssl s_client -connect "${prom_host}:${prom_port}" -servername "$prom_host" 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) || true
    if [[ -z "$cert_expiry" ]]; then
        return 0
    fi

    # Convert the expiry date to epoch seconds: GNU date first, then BSD date.
    local expiry_epoch now_epoch
    expiry_epoch=$(date -d "$cert_expiry" +%s 2>/dev/null || date -jf "%b %d %T %Y %Z" "$cert_expiry" +%s 2>/dev/null || echo "0")
    now_epoch=$(date +%s)
    if [[ "$expiry_epoch" == "0" ]]; then
        return 0
    fi

    local days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
    if [[ $days_left -gt 30 ]]; then
        record_pass "TLS certificate valid" "${days_left} days remaining"
    elif [[ $days_left -gt 0 ]]; then
        record_fail "TLS certificate expiring soon" "${days_left} days remaining"
    else
        record_fail "TLS certificate expired" "expired ${cert_expiry}"
    fi
}
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# MAIN
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
# main [OPTIONS...]
# Script entry point: parses command-line options (which override the
# environment-provided settings), validates the configuration, runs every test
# suite in order, emits the footer/report for the selected OUTPUT_FORMAT and
# exits 1 if any test failed, 0 otherwise. Exits 1 on invalid usage.
main() {
    # Colours must exist before any option error is reported: err expands
    # ${RED}/${RESET}, which are unbound until setup_colors has run and would
    # abort under `set -u` with an unrelated message.
    setup_colors

    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --skip-alertmanager) SKIP_ALERTMANAGER=true ;;
            --skip-remote-write) SKIP_REMOTE_WRITE=true ;;
            --insecure) CURL_INSECURE=true ;;
            --timeout | --format | --junit-file | --expected-jobs | --query-timeout | --tsdb-path)
                # These options consume the following argument; report a
                # missing value instead of tripping over an unbound $2.
                if [[ $# -lt 2 ]]; then
                    err "Option $1 requires an argument"
                    usage
                    exit 1
                fi
                case "$1" in
                    --timeout) CURL_TIMEOUT="$2" ;;
                    --format) OUTPUT_FORMAT="$2" ;;
                    --junit-file) JUNIT_FILE="$2" ;;
                    --expected-jobs) EXPECTED_JOBS="$2" ;;
                    --query-timeout) QUERY_TIMEOUT_MS="$2" ;;
                    --tsdb-path) TSDB_PATH="$2" ;;
                esac
                shift
                ;;
            --verbose) VERBOSE=true ;;
            --no-color) COLOR=never ;;
            --help | -h)
                usage
                exit 0
                ;;
            *)
                err "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
        shift
    done

    # Re-evaluate colours now that --no-color may have changed COLOR.
    setup_colors

    # Reject unknown formats up front: otherwise a typo silently falls back to
    # text output and, for instance, no JUnit file is written.
    case "$OUTPUT_FORMAT" in
        text | tap | junit) ;;
        *)
            err "Invalid output format: ${OUTPUT_FORMAT} (expected text, tap, or junit)"
            exit 1
            ;;
    esac

    # Validate required vars
    if [[ -z "$PROMETHEUS_URL" ]]; then
        err "PROMETHEUS_URL is required"
        echo ""
        usage
        exit 1
    fi

    # Strip trailing slash
    PROMETHEUS_URL="${PROMETHEUS_URL%/}"
    if [[ -n "$ALERTMANAGER_URL" ]]; then
        ALERTMANAGER_URL="${ALERTMANAGER_URL%/}"
    fi

    START_TIME=$(date +%s)

    # Header
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        print_tap_header
    else
        echo -e "${BOLD}Prometheus Smoke Tester${RESET}"
        echo -e "Target: ${PROMETHEUS_URL}"
        if [[ -n "$ALERTMANAGER_URL" && "$SKIP_ALERTMANAGER" != "true" ]]; then
            echo -e "Alertmanager: ${ALERTMANAGER_URL}"
        fi
        echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    fi

    # Run test suites
    test_targets
    test_rules
    test_alerts
    test_alertmanager
    test_configuration
    test_tsdb
    test_storage
    test_remote_write
    test_performance

    # Output
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        print_tap_footer
    elif [[ "$OUTPUT_FORMAT" == "junit" ]]; then
        write_junit
    fi

    # The human-readable summary is printed for every format except TAP.
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        print_summary
    fi

    # Exit code
    if [[ $FAIL -gt 0 ]]; then
        exit 1
    fi
    exit 0
}
|
||||
|
||||
# Entry point: pass every command-line argument to main unchanged.
main "$@"
|
||||
Reference in New Issue
Block a user