Sync all scripts from website downloads — 352 scripts total

Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
This commit is contained in:
2026-05-25 03:31:08 +02:00
parent dbd6bf0324
commit a1a17e81a1
332 changed files with 174509 additions and 1106 deletions
+991
View File
@@ -0,0 +1,991 @@
#!/usr/bin/env bash
#########################################################################################
#### prometheus-smoke-tester.sh — Focused smoke tests for Prometheus + Alertmanager ####
#### Zero external dependencies. Runs in air-gapped environments. ####
#### Requires: bash 4+, curl, GNU grep (-P), awk; openssl (optional) ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version 1.01 ####
#### ####
#### Usage: ####
#### export PROMETHEUS_URL="http://prometheus:9090" ####
#### ./prometheus-smoke-tester.sh ####
#### ####
#### See --help for all options. ####
#########################################################################################
# Strict mode: stop on errors, on unset variables, and on failed pipeline stages.
set -euo pipefail
# ── Defaults ──────────────────────────────────────────────────────────
# Settings may be pre-seeded from the environment; ':=' only fills in a
# value when the variable is unset or empty.
: "${PROMETHEUS_URL:=}"
: "${PROMETHEUS_AUTH_TOKEN:=}"
: "${ALERTMANAGER_URL:=}"
: "${EXPECTED_JOBS:=}"
: "${WATCHDOG_ALERT_NAME:=Watchdog}"
: "${QUERY_TIMEOUT_MS:=5000}"
: "${TSDB_PATH:=}"
: "${CURL_TIMEOUT:=10}"
: "${CURL_INSECURE:=false}"
: "${SKIP_ALERTMANAGER:=false}"
: "${SKIP_REMOTE_WRITE:=false}"
: "${OUTPUT_FORMAT:=text}"
: "${JUNIT_FILE:=smoke-results.xml}"
: "${VERBOSE:=false}"
: "${COLOR:=auto}"

# Run-state: result counters, the ordered "STATUS|name|detail" log, and the
# wall-clock start (set by main).
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()
START_TIME=0

# Alert name made unique per run by appending this shell's PID.
TEST_ALERT_NAME="smoke_test_alert_$$"
# ── Colors ────────────────────────────────────────────────────────────
# Populate the global colour variables (RED GREEN YELLOW BLUE BOLD RESET).
# COLOR=never  -> all empty
# COLOR=always -> ANSI sequences
# otherwise    -> ANSI sequences only when stdout is a terminal
setup_colors() {
  local use_color="no"
  if [[ "$COLOR" != "never" ]]; then
    if [[ "$COLOR" == "always" || -t 1 ]]; then
      use_color="yes"
    fi
  fi

  if [[ "$use_color" == "yes" ]]; then
    # Stored as literal backslash sequences; the echo -e call sites expand them.
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    RESET='\033[0m'
  else
    RED=""
    GREEN=""
    YELLOW=""
    BLUE=""
    BOLD=""
    RESET=""
  fi
}
# ── Logging ───────────────────────────────────────────────────────────
# log MESSAGE... — informational line on stdout.
log() {
  echo -e "${BLUE}[INFO]${RESET} $*"
}

# warn MESSAGE... — warning on stderr.
warn() {
  echo -e "${YELLOW}[WARN]${RESET} $*" >&2
}

# err MESSAGE... — error on stderr.
err() {
  echo -e "${RED}[ERROR]${RESET} $*" >&2
}

# verbose MESSAGE... — debug line, printed only when VERBOSE=true.
# Goes to stderr: this helper is called from functions whose stdout is
# captured with $(...), and stdout also carries the TAP stream, so a debug
# line on stdout would end up inside the captured data.
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    echo -e "${BLUE}[DEBUG]${RESET} $*" >&2
  fi
}
# ── Test Recording ────────────────────────────────────────────────────
# record_pass NAME [DETAIL]
# Count a passing check, append "PASS|NAME|DETAIL" to RESULTS and print it.
# TAP mode prints "ok N - NAME"; other modes print a marked, coloured line.
# Globals: PASS, TOTAL, RESULTS (written); OUTPUT_FORMAT, GREEN, RESET (read)
record_pass() {
  local name="$1"
  local detail="${2:-}"
  PASS=$(( PASS + 1 ))
  TOTAL=$(( TOTAL + 1 ))
  RESULTS+=("PASS|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name}"
  else
    # Explicit marker so the outcome is still readable when colour is off
    # (piped output, CI logs, --no-color).
    echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
  fi
}
# record_fail NAME [DETAIL]
# Count a failing check, append "FAIL|NAME|DETAIL" to RESULTS and print it.
# TAP mode prints "not ok N - NAME" plus an optional "# DETAIL" line.
# Always returns 0 so a bare call never trips `set -e` in the caller.
# Globals: FAIL, TOTAL, RESULTS (written); OUTPUT_FORMAT, RED, RESET (read)
record_fail() {
  local name="$1"
  local detail="${2:-}"
  FAIL=$(( FAIL + 1 ))
  TOTAL=$(( TOTAL + 1 ))
  RESULTS+=("FAIL|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "not ok ${TOTAL} - ${name}"
    # A real `if`, not `[[ ... ]] && echo`: the short-circuit form leaves
    # status 1 as the function result when DETAIL is empty.
    if [[ -n "$detail" ]]; then
      echo " # ${detail}"
    fi
  else
    # Explicit marker so failures stand out when colour is off.
    echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
  fi
  return 0
}
# record_skip NAME [REASON]
# Count a skipped check, append "SKIP|NAME|REASON" to RESULTS and print it.
# TAP mode prints "ok N - NAME # SKIP REASON".
# Globals: SKIP, TOTAL, RESULTS (written); OUTPUT_FORMAT, YELLOW, RESET (read)
record_skip() {
  local name="$1"
  local reason="${2:-}"
  SKIP=$(( SKIP + 1 ))
  TOTAL=$(( TOTAL + 1 ))
  RESULTS+=("SKIP|${name}|${reason}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
  else
    # Explicit marker so skips are distinguishable when colour is off.
    echo -e " ${YELLOW}○${RESET} ${name}${reason:+ — ${reason}}"
  fi
}
# ── TAP ───────────────────────────────────────────────────────────────
# Emit the TAP version line that opens a TAP stream.
print_tap_header() {
  printf '%s\n' "TAP version 13"
}
# Emit the trailing TAP plan ("1..N") followed by pass/fail/skip totals as
# comment lines. Reads TOTAL, PASS, FAIL and SKIP.
print_tap_footer() {
  printf '1..%s\n# pass %s\n# fail %s\n# skip %s\n' \
    "$TOTAL" "$PASS" "$FAIL" "$SKIP"
}
# ── JUnit XML ─────────────────────────────────────────────────────────
# xml_escape STRING
# Print STRING with the XML metacharacters & < > " replaced by entities, so
# it can be placed in element text or a double-quoted attribute.
xml_escape() {
  local s="$1"
  # '&' goes first so the entities added below are not escaped again.
  # Replacements are single-quoted: since bash 5.2 an unquoted '&' in a
  # ${var//pat/repl} replacement expands to the matched text.
  s=${s//&/'&amp;'}
  s=${s//</'&lt;'}
  s=${s//>/'&gt;'}
  s=${s//'"'/'&quot;'}
  printf '%s\n' "$s"
}
# write_junit
# Write every entry of RESULTS ("STATUS|name|detail") as a JUnit XML report
# to JUNIT_FILE, then log the path.
# Globals: RESULTS, TOTAL, FAIL, SKIP, START_TIME, JUNIT_FILE (read)
write_junit() {
  local end_time duration
  end_time=$(date +%s)
  duration=$(( end_time - START_TIME ))

  local result status rest name detail
  # One redirected group: the file is opened and truncated once, not
  # re-opened for append on every line.
  {
    cat <<JUNIT_EOF
<?xml version="1.0" encoding="UTF-8"?>
<testsuites tests="${TOTAL}" failures="${FAIL}" skipped="${SKIP}" time="${duration}">
<testsuite name="prometheus-smoke-tester" tests="${TOTAL}" failures="${FAIL}" skipped="${SKIP}" time="${duration}">
JUNIT_EOF
    # The ${arr[@]+...} form keeps an empty RESULTS from tripping `set -u`
    # on bash releases older than 4.4.
    for result in ${RESULTS[@]+"${RESULTS[@]}"}; do
      # Split "STATUS|name|detail" on the first two '|' characters.
      status="${result%%|*}"
      rest="${result#*|}"
      name=$(xml_escape "${rest%%|*}")
      detail=$(xml_escape "${rest#*|}")
      case "$status" in
        PASS)
          echo " <testcase name=\"${name}\" classname=\"smoke\">"
          if [[ -n "$detail" ]]; then
            echo " <system-out>${detail}</system-out>"
          fi
          echo " </testcase>"
          ;;
        FAIL)
          echo " <testcase name=\"${name}\" classname=\"smoke\">"
          # Separator added between name and detail; they used to run together.
          echo " <failure message=\"${detail}\">FAILED: ${name}${detail:+ — ${detail}}</failure>"
          echo " </testcase>"
          ;;
        SKIP)
          echo " <testcase name=\"${name}\" classname=\"smoke\">"
          echo " <skipped message=\"${detail}\"/>"
          echo " </testcase>"
          ;;
      esac
    done
    echo " </testsuite>"
    echo "</testsuites>"
  } > "$JUNIT_FILE"

  log "JUnit report written to ${JUNIT_FILE}"
}
# ── curl Wrapper ──────────────────────────────────────────────────────
# prom_curl URL [CURL_ARGS...]
# Fetch URL with the Prometheus settings and print the response body.
# Adds -k when CURL_INSECURE=true and a Bearer header when
# PROMETHEUS_AUTH_TOKEN is set. curl's stderr is discarded; its exit status
# is returned.
prom_curl() {
  local url="$1"
  shift
  local curl_opts=(-s -S --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  if [[ -n "$PROMETHEUS_AUTH_TOKEN" ]]; then
    curl_opts+=(-H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}")
  fi
  # Callers capture stdout as the response body, so the debug trace is
  # forced onto stderr to keep it out of the captured data.
  verbose "curl GET ${url}" >&2
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}
# prom_curl_status URL [CURL_ARGS...]
# Like prom_curl, but discards the body and prints only the HTTP status
# code (via curl's %{http_code} write-out).
prom_curl_status() {
  local target="$1"
  shift
  local -a flags=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
  [[ "$CURL_INSECURE" == "true" ]] && flags+=(-k)
  [[ -n "$PROMETHEUS_AUTH_TOKEN" ]] \
    && flags+=(-H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}")
  curl "${flags[@]}" "$@" "$target" 2>/dev/null
}
# am_curl URL [CURL_ARGS...]
# Fetch an Alertmanager URL and print the response body. No Authorization
# header is sent; -k is added when CURL_INSECURE=true. curl's stderr is
# discarded; its exit status is returned.
am_curl() {
  local url="$1"
  shift
  local curl_opts=(-s -S --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  # Callers capture stdout as the response body, so the debug trace is
  # forced onto stderr to keep it out of the captured data.
  verbose "curl GET ${url}" >&2
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}
# am_curl_status URL [CURL_ARGS...]
# Request an Alertmanager URL, discard the body, and print only the HTTP
# status code (via curl's %{http_code} write-out).
am_curl_status() {
  local target="$1"
  shift
  local -a flags=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
  [[ "$CURL_INSECURE" == "true" ]] && flags+=(-k)
  curl "${flags[@]}" "$@" "$target" 2>/dev/null
}
# ── JSON Helpers (no jq) ──────────────────────────────────────────────
# json_value KEY JSON
# Print the first scalar value found for "KEY" in JSON (string values lose
# their quotes; numbers and booleans are printed as written). Prints nothing
# when the key is absent. This is a regex scan, not a JSON parser: it does
# not handle nested values or strings containing '"', ',' or '}'.
json_value() {
  local key="$1"
  local json="$2"
  # \Q...\E makes KEY literal, so keys such as "storage.tsdb.retention.time"
  # cannot match through their dots. The here-string removes the echo stage,
  # which could die from SIGPIPE under pipefail once head exits.
  { grep -oP "\"\Q${key}\E\"\s*:\s*\"?\K[^\",}]+" <<<"$json" || true; } | head -n 1
}
# json_value_string KEY JSON
# Print the first *string* value found for "KEY" in JSON, up to the next
# double quote. Prints nothing when the key is absent or its value is not a
# non-empty string. Regex scan only; escaped quotes inside the value are not
# handled.
json_value_string() {
  local key="$1"
  local json="$2"
  # \Q...\E makes KEY literal (flag names contain dots). The here-string
  # removes the echo stage that could die from SIGPIPE under pipefail.
  { grep -oP "\"\Q${key}\E\"\s*:\s*\"\K[^\"]*" <<<"$json" || true; } | head -n 1
}
# ── Cleanup ───────────────────────────────────────────────────────────
# cleanup — EXIT trap handler.
# If a test alert was posted during this run, post it again with endsAt set
# to the current time so Alertmanager marks it resolved now rather than
# waiting for the original endsAt. Best effort: failures are ignored so the
# script's own exit status is left untouched.
# Globals: TEST_ALERT_POSTED, SKIP_ALERTMANAGER, ALERTMANAGER_URL,
#          TEST_ALERT_NAME (read)
cleanup() {
  if [[ "${TEST_ALERT_POSTED:-false}" != "true" \
    || "$SKIP_ALERTMANAGER" == "true" \
    || -z "$ALERTMANAGER_URL" ]]; then
    return 0
  fi
  verbose "Cleaning up test alert from Alertmanager"
  local now payload
  now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  # The label set must match the alert posted by test_alertmanager exactly.
  printf -v payload \
    '[{"labels":{"alertname":"%s","severity":"none","source":"smoke-test"},"endsAt":"%s"}]' \
    "$TEST_ALERT_NAME" "$now"
  am_curl "${ALERTMANAGER_URL}/api/v2/alerts" \
    -X POST -H "Content-Type: application/json" -d "$payload" \
    >/dev/null 2>&1 || true
}
trap cleanup EXIT
# ── Usage ─────────────────────────────────────────────────────────────
# Print the help text (environment variables, options, examples) to stdout.
usage() {
  # Resolve the script name once instead of once per occurrence.
  local prog
  prog=$(basename "$0")
  cat <<EOF
Usage: ${prog} [OPTIONS]
Smoke-test a Prometheus + Alertmanager stack. Zero external dependencies — bash 4+ and curl only.
Validates targets, rules, alerts, TSDB health, storage, remote write, and query performance.
Required environment variables:
PROMETHEUS_URL Prometheus base URL (http://prometheus:9090)
Optional environment variables:
PROMETHEUS_AUTH_TOKEN Bearer token for authentication
ALERTMANAGER_URL Alertmanager base URL (http://alertmanager:9093)
EXPECTED_JOBS Comma-separated list of expected scrape job names
WATCHDOG_ALERT_NAME Dead man's switch alert name (default: Watchdog)
QUERY_TIMEOUT_MS Max query response time in ms (default: 5000)
TSDB_PATH Local TSDB data path for disk checks
CURL_TIMEOUT HTTP timeout in seconds (default: 10)
CURL_INSECURE Allow self-signed certs (default: false)
SKIP_ALERTMANAGER Skip Alertmanager tests (default: false)
SKIP_REMOTE_WRITE Skip remote write checks (default: false)
OUTPUT_FORMAT Output: text (default), tap, junit
JUNIT_FILE JUnit output path (default: smoke-results.xml)
Options:
--skip-alertmanager Skip Alertmanager integration tests
--skip-remote-write Skip remote write checks
--insecure Allow self-signed TLS certificates (-k)
--timeout N curl timeout in seconds (default: 10)
--format FORMAT Output: text (default), tap, junit
--junit-file FILE JUnit output path (default: smoke-results.xml)
--expected-jobs JOBS Comma-separated expected scrape jobs
--query-timeout MS Max query response time in ms (default: 5000)
--tsdb-path PATH Local TSDB data path for disk checks
--verbose Show debug output
--no-color Disable colored output
--help Show this help
Examples:
# Basic run
export PROMETHEUS_URL="http://prometheus:9090"
./${prog}
# With expected jobs and Alertmanager
export PROMETHEUS_URL="http://prometheus:9090"
export ALERTMANAGER_URL="http://alertmanager:9093"
export EXPECTED_JOBS="node,prometheus,blackbox,cadvisor"
./${prog}
# JUnit XML for CI
./${prog} --format junit --junit-file results.xml
# Self-signed certs
./${prog} --insecure
EOF
}
# ── Section Header ────────────────────────────────────────────────────
# section TITLE
# Print a blank line and a bold section heading. Prints nothing in TAP
# mode, where stdout carries the TAP stream.
section() {
  [[ "$OUTPUT_FORMAT" == "tap" ]] && return 0
  echo ""
  echo -e "${BOLD}$1${RESET}"
}
# ── Summary ───────────────────────────────────────────────────────────
# Print the closing summary: target URL, pass/fail/skip counts, elapsed
# seconds since START_TIME, and a one-line verdict.
print_summary() {
  local finished elapsed
  finished=$(date +%s)
  elapsed=$(( finished - START_TIME ))

  local rule="${BOLD}────────────────────────────────────────${RESET}"

  echo ""
  echo -e "$rule"
  echo -e "${BOLD}Summary${RESET} ${PROMETHEUS_URL}"
  echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${elapsed}s)"
  echo -e "$rule"

  if (( FAIL != 0 )); then
    echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"
  else
    echo -e "${GREEN}${BOLD}All tests passed.${RESET}"
  fi
}
# ══════════════════════════════════════════════════════════════════════
# TEST SUITES
# ══════════════════════════════════════════════════════════════════════
# ── Targets ───────────────────────────────────────────────────────────
# test_targets
# Checks the scrape targets reported by /api/v1/targets:
#   - the API answers with status "success"
#   - no target reports health "down"
#   - no target carries a non-empty lastError
#   - every job listed in EXPECTED_JOBS (comma-separated) is present
# Outcomes are reported through record_pass / record_fail.
test_targets() {
  section "Targets"

  local response
  # state=active leaves dropped targets out of the response, so their
  # discovered "job" labels cannot satisfy the expected-jobs check.
  response=$(prom_curl "${PROMETHEUS_URL}/api/v1/targets?state=active") || {
    record_fail "Targets API reachable" "connection failed"
    return 0
  }

  local status
  status=$(json_value "status" "$response")
  if [[ "$status" != "success" ]]; then
    record_fail "Targets API reachable" "status: ${status:-empty}"
    return 0
  fi
  record_pass "Targets API reachable" "HTTP 200"

  # grep -o | wc -l counts occurrences; the body is normally one long line,
  # so grep -c (a line count) would report at most 1.
  local up_count down_count total_targets
  up_count=$( { grep -oP '"health"\s*:\s*"up"' <<<"$response" || true; } | wc -l )
  down_count=$( { grep -oP '"health"\s*:\s*"down"' <<<"$response" || true; } | wc -l )
  # Arithmetic round-trip strips the padding some wc implementations emit.
  up_count=$(( up_count + 0 ))
  down_count=$(( down_count + 0 ))
  total_targets=$(( up_count + down_count ))
  if (( down_count == 0 )); then
    record_pass "All targets healthy" "${up_count}/${total_targets} targets up"
  else
    record_fail "Targets health" "${down_count}/${total_targets} targets down"
  fi

  # Only a non-empty lastError counts as a scrape error.
  local error_count
  error_count=$( { grep -oP '"lastError"\s*:\s*"[^"]+' <<<"$response" || true; } | wc -l )
  error_count=$(( error_count + 0 ))
  if (( error_count == 0 )); then
    record_pass "No scrape errors" "all targets scraping cleanly"
  else
    record_fail "Scrape errors detected" "${error_count} target(s) with errors"
  fi

  if [[ -z "$EXPECTED_JOBS" ]]; then
    return 0
  fi

  local active_jobs
  active_jobs=$( { grep -oP '"job"\s*:\s*"\K[^"]+' <<<"$response" || true; } | sort -u )

  local -a expected_arr=()
  local job missing="" checked=0
  IFS=',' read -ra expected_arr <<< "$EXPECTED_JOBS"
  for job in "${expected_arr[@]}"; do
    job="${job// /}"
    # Ignore blank entries such as those produced by "a,,b".
    [[ -n "$job" ]] || continue
    checked=$(( checked + 1 ))
    # -F: job names are literal strings, not regular expressions.
    if ! grep -Fqx -- "$job" <<<"$active_jobs"; then
      missing="${missing:+${missing}, }${job}"
    fi
  done

  if [[ -z "$missing" ]]; then
    record_pass "Expected jobs present" "${checked} jobs found"
  else
    record_fail "Expected jobs missing" "missing: ${missing}"
  fi
}
# ── Rules ─────────────────────────────────────────────────────────────
# test_rules
# Inspects /api/v1/rules: counts recording and alerting rules, reports rules
# with a non-empty lastError, and flags evaluation times above 5 seconds.
# Outcomes are reported through record_pass / record_fail / record_skip.
test_rules() {
  section "Rules"

  local response
  response=$(prom_curl "${PROMETHEUS_URL}/api/v1/rules") || {
    record_fail "Rules API reachable" "connection failed"
    return 0
  }

  local status
  status=$(json_value "status" "$response")
  if [[ "$status" != "success" ]]; then
    record_fail "Rules API reachable" "status: ${status:-empty}"
    return 0
  fi

  # Each group object holds exactly one "rules" array. Counting "name" keys
  # would also count every rule inside the groups.
  local group_count
  group_count=$( { grep -oP '"rules"\s*:\s*\[' <<<"$response" || true; } | wc -l )
  group_count=$(( group_count + 0 ))

  local recording_count
  recording_count=$( { grep -oP '"type"\s*:\s*"recording"' <<<"$response" || true; } | wc -l )
  recording_count=$(( recording_count + 0 ))
  if (( recording_count > 0 )); then
    record_pass "Recording rules loaded" "${recording_count} recording rules in ${group_count} groups"
  else
    record_pass "Recording rules" "none configured"
  fi

  local alerting_count
  alerting_count=$( { grep -oP '"type"\s*:\s*"alerting"' <<<"$response" || true; } | wc -l )
  alerting_count=$(( alerting_count + 0 ))
  if (( alerting_count > 0 )); then
    record_pass "Alerting rules loaded" "${alerting_count} alerting rules"
  else
    record_skip "Alerting rules" "none configured"
  fi

  # Every non-empty lastError is counted, with no cap on the total.
  local error_count
  error_count=$( { grep -oP '"lastError"\s*:\s*"[^"]+' <<<"$response" || true; } | wc -l )
  error_count=$(( error_count + 0 ))
  if (( error_count == 0 )); then
    record_pass "Rule evaluation healthy" "no evaluation errors"
  else
    record_fail "Rule evaluation errors" "${error_count} rule(s) with errors"
  fi

  # The pattern accepts exponent notation (6.1e-05); without it the mantissa
  # alone would be compared and a sub-millisecond value would look like 6.1 s.
  # NOTE(review): "evaluationTime" appears on rules as well as groups, so the
  # reported number can include individual rules.
  local slow_groups
  slow_groups=$( { grep -oP '"evaluationTime"\s*:\s*\K[0-9.]+(?:[eE][+-]?[0-9]+)?' <<<"$response" || true; } \
    | awk '($1 + 0) > 5.0 { n++ } END { print n + 0 }' )
  if (( slow_groups == 0 )); then
    record_pass "Rule group evaluation speed" "all groups under 5s"
  else
    record_fail "Slow rule groups" "${slow_groups} group(s) over 5s evaluation time"
  fi
}
# ── Alerts ────────────────────────────────────────────────────────────
# test_alerts
# Inspects /api/v1/alerts: fails when any returned alert has
# severity="critical", and requires the watchdog alert named by
# WATCHDOG_ALERT_NAME to be present.
test_alerts() {
  section "Alerts"

  local response
  response=$(prom_curl "${PROMETHEUS_URL}/api/v1/alerts") || {
    record_fail "Alerts API reachable" "connection failed"
    return 0
  }

  local status
  status=$(json_value "status" "$response")
  if [[ "$status" != "success" ]]; then
    record_fail "Alerts API reachable" "status: ${status:-empty}"
    return 0
  fi

  # NOTE(review): this counts every alert in the response with
  # severity="critical"; the alert's "state" field is not inspected.
  local critical_count
  critical_count=$( { grep -oP '"severity"\s*:\s*"critical"' <<<"$response" || true; } | wc -l )
  critical_count=$(( critical_count + 0 ))
  if (( critical_count == 0 )); then
    record_pass "No critical alerts firing" "clean"
  else
    record_fail "Critical alerts firing" "${critical_count} critical alert(s) active"
  fi

  # \Q...\E matches the configured name literally, so regex metacharacters
  # in WATCHDOG_ALERT_NAME cannot match a different alert.
  local watchdog
  watchdog=$( { grep -oP "\"alertname\"\s*:\s*\"\Q${WATCHDOG_ALERT_NAME}\E\"" <<<"$response" || true; } | wc -l )
  watchdog=$(( watchdog + 0 ))
  if (( watchdog > 0 )); then
    record_pass "Watchdog alert firing" "dead man's switch active"
  else
    record_fail "Watchdog alert not firing" "${WATCHDOG_ALERT_NAME} should always fire"
  fi
}
# ── Alertmanager Integration ──────────────────────────────────────────
# test_alertmanager
# End-to-end Alertmanager check:
#   1. Alertmanager answers on /-/healthy
#   2. Prometheus lists at least one active Alertmanager
#   3. a short-lived test alert can be posted and read back
#   4. a short silence is created for that test alert
# Skipped when SKIP_ALERTMANAGER=true or ALERTMANAGER_URL is empty.
# Sets the global TEST_ALERT_POSTED=true once the test alert was accepted.
test_alertmanager() {
  section "Alertmanager Integration"

  if [[ "$SKIP_ALERTMANAGER" == "true" ]]; then
    record_skip "Alertmanager tests" "skipped via --skip-alertmanager"
    return 0
  fi
  if [[ -z "$ALERTMANAGER_URL" ]]; then
    record_skip "Alertmanager tests" "ALERTMANAGER_URL not set"
    return 0
  fi

  # Reachability
  local am_status
  am_status=$(am_curl_status "${ALERTMANAGER_URL}/-/healthy") || am_status="000"
  if [[ "$am_status" != "200" ]]; then
    record_fail "Alertmanager reachable" "HTTP ${am_status}"
    return 0
  fi
  record_pass "Alertmanager reachable" "HTTP 200"

  # Prometheus' own view of its Alertmanagers
  local am_response
  am_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/alertmanagers") || {
    record_skip "Prometheus→Alertmanager link" "API unavailable"
    return 0
  }
  # Count "url" occurrences inside the activeAlertmanagers array. A line
  # count (grep -c) would stop at 1 because the array sits on one line.
  local active_am
  active_am=$( { grep -oP '"activeAlertmanagers"\s*:\s*\[\K[^\]]*' <<<"$am_response" || true; } \
    | { grep -o '"url"' || true; } | wc -l )
  active_am=$(( active_am + 0 ))
  if (( active_am > 0 )); then
    record_pass "Prometheus→Alertmanager connected" "${active_am} active Alertmanager(s)"
  else
    record_fail "Prometheus→Alertmanager disconnected" "no active Alertmanagers"
  fi

  # Fire a test alert. Timestamps: GNU date first, then BSD date, then a
  # fixed far-future fallback.
  local starts_at ends_at alert_payload
  starts_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  ends_at=$(date -u -d '+2 minutes' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
    || date -u -v+2M +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
    || echo "2099-01-01T00:00:00Z")
  printf -v alert_payload \
    '[{"labels":{"alertname":"%s","severity":"none","source":"smoke-test"},"annotations":{"summary":"Prometheus smoke test alert — will auto-resolve"},"startsAt":"%s","endsAt":"%s"}]' \
    "$TEST_ALERT_NAME" "$starts_at" "$ends_at"

  local post_status
  post_status=$(am_curl_status "${ALERTMANAGER_URL}/api/v2/alerts" \
    -X POST -H "Content-Type: application/json" -d "$alert_payload") || post_status="000"
  if [[ "$post_status" != "200" ]]; then
    record_fail "Test alert post failed" "HTTP ${post_status}"
    return 0
  fi
  TEST_ALERT_POSTED=true
  record_pass "Test alert posted to Alertmanager" "HTTP 200"

  # Wait briefly before reading the alert back.
  sleep 2
  local verify_response
  verify_response=$(am_curl "${ALERTMANAGER_URL}/api/v2/alerts?filter=alertname%3D${TEST_ALERT_NAME}") || verify_response=""
  if [[ "$verify_response" == *"$TEST_ALERT_NAME"* ]]; then
    record_pass "Test alert visible in Alertmanager" "alert round-trip confirmed"
  else
    record_fail "Test alert not visible" "posted but not found in /api/v2/alerts"
  fi

  # Silence the test alert for its remaining lifetime (best effort).
  local silence_start silence_end silence_payload
  silence_start=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  silence_end=$(date -u -d '+1 minute' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
    || date -u -v+1M +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
    || echo "2099-01-01T00:00:00Z")
  printf -v silence_payload \
    '{"matchers":[{"name":"alertname","value":"%s","isRegex":false}],"startsAt":"%s","endsAt":"%s","createdBy":"prometheus-smoke-tester","comment":"auto-cleanup"}' \
    "$TEST_ALERT_NAME" "$silence_start" "$silence_end"
  am_curl_status "${ALERTMANAGER_URL}/api/v2/silences" \
    -X POST -H "Content-Type: application/json" -d "$silence_payload" >/dev/null 2>&1 || true
  verbose "Test alert silenced for cleanup"
}
# ── Configuration ─────────────────────────────────────────────────────
# test_configuration
# Confirms that /api/v1/status/config and /api/v1/status/flags both answer
# with status "success", and reports the retention-time flag value.
test_configuration() {
  section "Configuration"

  # --- Active configuration -------------------------------------------
  local cfg_json cfg_state
  if ! cfg_json=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config"); then
    record_fail "Configuration loaded" "connection failed"
    return
  fi
  cfg_state=$(json_value "status" "$cfg_json")
  case "$cfg_state" in
    success) record_pass "Configuration loaded" "prometheus.yml active" ;;
    *) record_fail "Configuration loaded" "status: ${cfg_state:-empty}" ;;
  esac

  # --- Runtime flags (still checked when the config check failed) ------
  local flags_json flags_state
  if ! flags_json=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags"); then
    record_fail "Runtime flags" "connection failed"
    return
  fi
  flags_state=$(json_value "status" "$flags_json")
  if [[ "$flags_state" != "success" ]]; then
    record_fail "Runtime flags" "status: ${flags_state:-empty}"
    return
  fi

  # An empty or absent flag value is reported as "default".
  local retention_flag
  retention_flag=$(json_value_string "storage.tsdb.retention.time" "$flags_json")
  record_pass "Runtime flags accessible" "retention: ${retention_flag:-default}"
}
# ── TSDB Health ───────────────────────────────────────────────────────
# test_tsdb
# Reports head-block statistics from /api/v1/status/tsdb and, when
# available, selected fields from /api/v1/status/runtimeinfo. Every finding
# here is informational: only an unreachable or non-success TSDB status
# endpoint is recorded as a failure.
test_tsdb() {
  section "TSDB Health"

  local tsdb_json tsdb_state
  if ! tsdb_json=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/tsdb"); then
    record_fail "TSDB status API" "connection failed"
    return
  fi
  tsdb_state=$(json_value "status" "$tsdb_json")
  if [[ "$tsdb_state" != "success" ]]; then
    record_fail "TSDB status API" "status: ${tsdb_state:-empty}"
    return
  fi

  # Head stats
  local series_total chunk_total
  series_total=$(json_value "numSeries" "$tsdb_json")
  if [[ -z "$series_total" ]]; then
    record_pass "TSDB status" "responding"
  else
    record_pass "TSDB head series" "${series_total} active series"
  fi
  chunk_total=$(json_value "chunkCount" "$tsdb_json")
  if [[ -n "$chunk_total" ]]; then
    record_pass "TSDB head chunks" "${chunk_total} chunks in head"
  fi

  # Runtime info is optional: when it is unavailable or not "success" the
  # remaining checks are dropped without recording anything.
  local runtime_json runtime_state
  if ! runtime_json=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo"); then
    verbose "Runtime info unavailable"
    return
  fi
  runtime_state=$(json_value "status" "$runtime_json")
  if [[ "$runtime_state" != "success" ]]; then
    return 0
  fi

  # The presence of storageRetention is taken as the sign that the TSDB is
  # functional; corruption itself is not inspected.
  local retention_info
  retention_info=$(json_value_string "storageRetention" "$runtime_json")
  if [[ -n "$retention_info" ]]; then
    record_pass "TSDB WAL operational" "retention: ${retention_info}"
  fi

  # Reported only when the response carries a lastGarbageCollection field.
  local gc_stamp
  gc_stamp=$(json_value_string "lastGarbageCollection" "$runtime_json")
  if [[ -n "$gc_stamp" ]]; then
    record_pass "TSDB compaction" "last GC: ${gc_stamp}"
  fi
}
# ── Storage ───────────────────────────────────────────────────────────
# _tsdb_dir_size PATH
# Print the human-readable size of PATH as reported by `du -sh`, or
# "unknown" when du yields nothing. du exits non-zero when it cannot read
# part of the tree; that status is absorbed here so it cannot abort the
# caller under `set -e` + pipefail.
_tsdb_dir_size() {
  local size=""
  size=$(du -sh -- "$1" 2>/dev/null | awk 'NR == 1 { print $1 }') || true
  printf '%s\n' "${size:-unknown}"
}

# test_storage
# Reports the configured retention (from /api/v1/status/flags) and, when
# TSDB_PATH points at a local data directory, its disk usage and the
# presence of the wal/, chunks_head/ and lock entries.
test_storage() {
  section "Storage"

  # Retention from flags
  local flags_response
  flags_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/flags") || {
    record_fail "Storage flags" "connection failed"
    return 0
  }
  local retention_time retention_size
  retention_time=$(json_value_string "storage.tsdb.retention.time" "$flags_response")
  retention_size=$(json_value_string "storage.tsdb.retention.size" "$flags_response")
  if [[ -n "$retention_time" ]]; then
    record_pass "Retention configured" "time: ${retention_time}${retention_size:+, size: ${retention_size}}"
  else
    record_pass "Retention" "default (15d)"
  fi

  # Local disk checks only run when a path was provided.
  if [[ -z "$TSDB_PATH" ]]; then
    record_skip "Local disk checks" "TSDB_PATH not set"
    return 0
  fi
  if [[ ! -d "$TSDB_PATH" ]]; then
    record_fail "TSDB path" "${TSDB_PATH} not found"
    return 0
  fi

  record_pass "TSDB disk usage" "$(_tsdb_dir_size "$TSDB_PATH") at ${TSDB_PATH}"

  if [[ -d "${TSDB_PATH}/wal" ]]; then
    record_pass "WAL directory" "$(_tsdb_dir_size "${TSDB_PATH}/wal")"
  else
    record_fail "WAL directory" "not found at ${TSDB_PATH}/wal"
  fi

  # chunks_head is optional: its absence is not reported.
  if [[ -d "${TSDB_PATH}/chunks_head" ]]; then
    record_pass "Head chunks on disk" "$(_tsdb_dir_size "${TSDB_PATH}/chunks_head")"
  fi

  # A lock file is taken to mean a Prometheus process owns the directory.
  if [[ -f "${TSDB_PATH}/lock" ]]; then
    record_pass "TSDB lock file" "Prometheus holds lock"
  else
    record_fail "TSDB lock file" "no lock — Prometheus may not be running"
  fi
}
# ── Remote Write ──────────────────────────────────────────────────────
# test_remote_write
# Reports whether the loaded configuration mentions remote_write and, if the
# runtime info exposes a walSize field, its value.
# Skipped when SKIP_REMOTE_WRITE=true.
test_remote_write() {
  section "Remote Write"

  if [[ "$SKIP_REMOTE_WRITE" == "true" ]]; then
    record_skip "Remote write tests" "skipped via --skip-remote-write"
    return 0
  fi

  local response
  response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/runtimeinfo") || {
    record_fail "Runtime info for remote write" "connection failed"
    return 0
  }
  local status
  status=$(json_value "status" "$response")
  if [[ "$status" != "success" ]]; then
    record_fail "Runtime info" "status: ${status:-empty}"
    return 0
  fi

  # Remote write counts as configured when the config mentions it.
  local config_response
  config_response=$(prom_curl "${PROMETHEUS_URL}/api/v1/status/config") || {
    record_skip "Remote write config" "config API unavailable"
    return 0
  }

  # In-shell substring match. `echo ... | grep -q` under pipefail can report
  # a false "not configured" on a large body: grep exits at the first match
  # and the still-writing echo stage then fails with SIGPIPE.
  if [[ "$config_response" != *remote_write* ]]; then
    record_skip "Remote write" "not configured"
    return 0
  fi
  record_pass "Remote write configured" "remote_write section present"

  # Reported only when the runtime info carries a walSize field.
  local wal_size
  wal_size=$(json_value "walSize" "$response")
  if [[ -n "$wal_size" ]]; then
    record_pass "Remote write WAL" "WAL size: ${wal_size}"
  fi
}
# ── Performance ───────────────────────────────────────────────────────
# _now_ns
# Print the current time in nanoseconds since the epoch, or "0" when the
# local date(1) cannot produce it. A date without %N support can still exit
# 0 and print a non-numeric string, so the output is validated rather than
# the exit status trusted.
_now_ns() {
  local stamp
  stamp=$(date +%s%N 2>/dev/null) || stamp=""
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$stamp"
  else
    printf '0\n'
  fi
}

# test_performance
# Runs the instant query `up`, compares its wall-clock latency with
# QUERY_TIMEOUT_MS, checks that results came back, and, for https:// URLs,
# reports how many days remain on the server certificate.
test_performance() {
  section "Performance"

  # Time the query
  local start_ns end_ns response
  start_ns=$(_now_ns)
  response=$(prom_curl "${PROMETHEUS_URL}/api/v1/query?query=up") || {
    record_fail "Query performance" "query failed"
    return 0
  }
  end_ns=$(_now_ns)

  local status
  status=$(json_value "status" "$response")
  if [[ "$status" != "success" ]]; then
    record_fail "Query engine" "status: ${status:-empty}"
    return 0
  fi

  # Without a usable nanosecond clock only the success of the query is
  # reported.
  if [[ "$start_ns" != "0" && "$end_ns" != "0" ]]; then
    local duration_ms=$(( (end_ns - start_ns) / 1000000 ))
    if (( duration_ms < QUERY_TIMEOUT_MS )); then
      record_pass "Query response time" "${duration_ms}ms (threshold: ${QUERY_TIMEOUT_MS}ms)"
    else
      record_fail "Query response slow" "${duration_ms}ms exceeds ${QUERY_TIMEOUT_MS}ms threshold"
    fi
  else
    record_pass "Query engine" "up query returned successfully"
  fi

  # Count result series
  local result_count
  result_count=$( { grep -oP '"__name__"\s*:\s*"up"' <<<"$response" || true; } | wc -l )
  result_count=$(( result_count + 0 ))
  if (( result_count > 0 )); then
    record_pass "Query results" "${result_count} up metrics returned"
  elif [[ "$response" == *'"result"'* ]]; then
    # Fallback — a result key is present even though no series matched above.
    record_pass "Query results" "results returned"
  else
    record_fail "Query results" "no results from up query"
  fi

  # The TLS expiry check applies to HTTPS endpoints only.
  if [[ "$PROMETHEUS_URL" != https://* ]]; then
    return 0
  fi
  # openssl is optional: skip visibly rather than abort when it is missing.
  if ! command -v openssl >/dev/null 2>&1; then
    record_skip "TLS certificate" "openssl not installed"
    return 0
  fi

  local authority prom_host prom_port
  authority="${PROMETHEUS_URL#https://}"
  authority="${authority%%/*}"
  prom_host="${authority%%:*}"
  if [[ "$authority" == *:* ]]; then
    prom_port="${authority##*:}"
  else
    prom_port="443"
  fi

  # A failed handshake makes the pipeline non-zero under pipefail; it is
  # treated as "no certificate obtained" instead of aborting the script.
  local cert_expiry
  cert_expiry=$(openssl s_client -connect "${prom_host}:${prom_port}" \
    -servername "$prom_host" </dev/null 2>/dev/null \
    | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) || cert_expiry=""
  if [[ -z "$cert_expiry" ]]; then
    return 0
  fi

  # GNU date first, then BSD date; "0" means the expiry could not be parsed.
  local expiry_epoch now_epoch
  expiry_epoch=$(date -d "$cert_expiry" +%s 2>/dev/null \
    || date -jf "%b %d %T %Y %Z" "$cert_expiry" +%s 2>/dev/null \
    || echo "0")
  now_epoch=$(date +%s)
  if [[ "$expiry_epoch" == "0" ]]; then
    return 0
  fi

  local days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
  if (( days_left > 30 )); then
    record_pass "TLS certificate valid" "${days_left} days remaining"
  elif (( days_left > 0 )); then
    record_fail "TLS certificate expiring soon" "${days_left} days remaining"
  else
    record_fail "TLS certificate expired" "expired ${cert_expiry}"
  fi
}
# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
# main [OPTIONS...]
# Parse command-line options (they override the environment defaults),
# validate the settings, run every test suite in order, emit the selected
# report, and exit 1 if any check failed (0 otherwise).
main() {
  # err() expands the colour variables, so they must exist before the first
  # possible error message; under `set -u` an unset RED would abort instead.
  setup_colors

  # Parse arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-alertmanager) SKIP_ALERTMANAGER=true ;;
      --skip-remote-write) SKIP_REMOTE_WRITE=true ;;
      --insecure) CURL_INSECURE=true ;;
      --timeout | --format | --junit-file | --expected-jobs | --query-timeout | --tsdb-path)
        # Value-taking options: report a missing value explicitly rather
        # than failing on an unbound $2.
        if [[ $# -lt 2 ]]; then
          err "Option $1 requires a value"
          usage
          exit 1
        fi
        case "$1" in
          --timeout) CURL_TIMEOUT="$2" ;;
          --format) OUTPUT_FORMAT="$2" ;;
          --junit-file) JUNIT_FILE="$2" ;;
          --expected-jobs) EXPECTED_JOBS="$2" ;;
          --query-timeout) QUERY_TIMEOUT_MS="$2" ;;
          --tsdb-path) TSDB_PATH="$2" ;;
        esac
        shift
        ;;
      --verbose) VERBOSE=true ;;
      --no-color) COLOR=never ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        err "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
    shift
  done

  # Re-evaluate colours now that --no-color may have been seen.
  setup_colors

  # Validate settings
  case "$OUTPUT_FORMAT" in
    text | tap | junit) ;;
    *)
      err "Unsupported output format: ${OUTPUT_FORMAT} (expected text, tap, or junit)"
      exit 1
      ;;
  esac
  if [[ ! "$QUERY_TIMEOUT_MS" =~ ^[0-9]+$ ]]; then
    err "Query timeout must be a whole number of milliseconds: ${QUERY_TIMEOUT_MS}"
    exit 1
  fi
  if [[ -z "$PROMETHEUS_URL" ]]; then
    err "PROMETHEUS_URL is required"
    echo ""
    usage
    exit 1
  fi

  # Strip one trailing slash so endpoint paths can be appended directly.
  PROMETHEUS_URL="${PROMETHEUS_URL%/}"
  if [[ -n "$ALERTMANAGER_URL" ]]; then
    ALERTMANAGER_URL="${ALERTMANAGER_URL%/}"
  fi

  START_TIME=$(date +%s)

  # Header
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    print_tap_header
  else
    echo -e "${BOLD}Prometheus Smoke Tester${RESET}"
    echo -e "Target: ${PROMETHEUS_URL}"
    if [[ -n "$ALERTMANAGER_URL" && "$SKIP_ALERTMANAGER" != "true" ]]; then
      echo -e "Alertmanager: ${ALERTMANAGER_URL}"
    fi
    echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  fi

  # Run test suites
  test_targets
  test_rules
  test_alerts
  test_alertmanager
  test_configuration
  test_tsdb
  test_storage
  test_remote_write
  test_performance

  # Output: TAP gets only its footer; junit writes the XML file and, like
  # text mode, also prints the human-readable summary.
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    print_tap_footer
  elif [[ "$OUTPUT_FORMAT" == "junit" ]]; then
    write_junit
  fi
  if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
    print_summary
  fi

  # Exit code
  if [[ $FAIL -gt 0 ]]; then
    exit 1
  fi
  exit 0
}
# Script entry point: pass every command-line argument through to main().
main "$@"