#!/usr/bin/env bash
#########################################################################################
#### infra-smoke-tests.sh — Verify Prometheus/Grafana/Alertmanager/Loki stack health ####
#### Zero external dependencies. Runs in air-gapped environments.                    ####
#### Requires: bash 4+, curl, openssl (optional)                                     ####
####                                                                                 ####
#### Author: Phil Connor                                                             ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.01                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   export PROMETHEUS_URL="http://prometheus:9090"                                ####
####   export GRAFANA_URL="http://grafana:3000"                                      ####
####   ./infra-smoke-tests.sh                                                        ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

# Strict mode must live on its own line: anything that shares a physical
# line with the shebang is part of a comment and never runs.
set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting can be supplied through the environment; the value after
# ":-" is used when the variable is unset or empty.
PROMETHEUS_URL="${PROMETHEUS_URL:-}"
GRAFANA_URL="${GRAFANA_URL:-}"
GRAFANA_TOKEN="${GRAFANA_TOKEN:-}"
ALERTMANAGER_URL="${ALERTMANAGER_URL:-}"
LOKI_URL="${LOKI_URL:-}"
EXPECTED_JOBS="${EXPECTED_JOBS:-}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
CURL_INSECURE="${CURL_INSECURE:-false}"
SKIP_LOKI="${SKIP_LOKI:-false}"
SKIP_ALERTMANAGER="${SKIP_ALERTMANAGER:-false}"
SKIP_GRAFANA="${SKIP_GRAFANA:-false}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" # text, tap, junit
JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
# Counters and the result list are updated by the record_* functions;
# START_TIME is read by print_summary and write_junit.
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()
START_TIME=""

# ── Colors ────────────────────────────────────────────────────────────

#######################################
# Choose the escape sequences used for colored output.
# Globals:
#   COLOR (read) - "never" disables color, "always" forces it, any other
#                  value enables color only when stdout is a terminal
#   RED GREEN YELLOW BLUE BOLD RESET (written) - escape sequences in the
#                  form expected by `echo -e`, or empty strings
#######################################
setup_colors() {
  # Start from "no color" so every variable is defined on all paths
  # (the script runs under `set -u`).
  RED=""
  GREEN=""
  YELLOW=""
  BLUE=""
  BOLD=""
  RESET=""

  if [[ "$COLOR" == "never" ]]; then
    return 0
  fi

  if [[ "$COLOR" == "always" || -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    RESET='\033[0m'
  fi
}
# ── Logging ───────────────────────────────────────────────────────────
# BLUE/YELLOW/RED/RESET hold the escape sequences chosen by setup_colors
# (empty strings when color is disabled).

# Print an informational message to stdout.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }

# Print a warning to stderr.
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }

# Print an error to stderr.
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }

# Print a debug message when VERBOSE=true.
# Written to stderr on purpose: the http_* helpers call verbose while their
# stdout is being captured with $(...), so a debug line on stdout would end
# up inside the captured response body / status code.
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    echo -e "${BLUE}[DEBUG]${RESET} $*" >&2
  fi
}

# ── Test Result Recording ─────────────────────────────────────────────
# Each record_* function bumps its own counter and TOTAL, appends
# "STATUS|name|detail" to RESULTS (read back by write_junit) and prints one
# line: a TAP line when OUTPUT_FORMAT=tap, a decorated text line otherwise.
# Counters use plain assignments, which cannot return non-zero under set -e.

#######################################
# Record a passing test.
# Arguments: $1 - test name, $2 - optional detail text
#######################################
record_pass() {
  local name="$1"
  local detail="${2:-}"
  PASS=$((PASS + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("PASS|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name}"
  else
    echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

#######################################
# Record a failing test.
# Arguments: $1 - test name, $2 - optional detail text
# Returns:   always 0, so a bare call never aborts the script under set -e
#######################################
record_fail() {
  local name="$1"
  local detail="${2:-}"
  FAIL=$((FAIL + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("FAIL|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "not ok ${TOTAL} - ${name}"
    # An `if`, not `[[ ... ]] && echo`: as the last command of the function
    # the short-circuit form would make record_fail return 1 whenever the
    # detail is empty.
    if [[ -n "$detail" ]]; then
      echo " # ${detail}"
    fi
  else
    echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

#######################################
# Record a skipped test.
# Arguments: $1 - test name, $2 - optional reason
#######################################
record_skip() {
  local name="$1"
  local reason="${2:-}"
  SKIP=$((SKIP + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("SKIP|${name}|${reason}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
  else
    echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"
  fi
}

# ── curl wrappers ─────────────────────────────────────────────────────
# All wrappers take the URL first and pass any remaining arguments straight
# to curl. They honor CURL_TIMEOUT (--max-time) and CURL_INSECURE (-k) and
# discard curl's stderr; the exit status is curl's.

# GET a URL and print the response body.
http_get() {
  local url="$1"
  shift
  local -a curl_opts=(-s -S --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  verbose "curl GET ${url}"
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}

# GET a URL and print only the HTTP status code (body is discarded).
http_get_status() {
  local url="$1"
  shift
  local -a curl_opts=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  verbose "curl GET (status) ${url}"
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}

# GET a URL and print the body, a newline, then the HTTP status code.
# Split the result with _resp_status / _resp_body.
http_get_with_status() {
  local url="$1"
  shift
  local -a curl_opts=(-s -S -w "\n%{http_code}" --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  verbose "curl GET (body+status) ${url}"
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}

# Print the status code (last line) of an http_get_with_status result;
# "000" when the last line is not a three-digit code.
_resp_status() {
  local code="${1##*$'\n'}"
  if [[ ! "$code" =~ ^[0-9]{3}$ ]]; then
    code="000"
  fi
  printf '%s' "$code"
}

# Print the body (everything before the last newline) of an
# http_get_with_status result; nothing when there is no newline at all.
_resp_body() {
  if [[ "$1" == *$'\n'* ]]; then
    printf '%s' "${1%$'\n'*}"
  fi
}

# The grafana_* wrappers take an endpoint path, prefix it with GRAFANA_URL
# and add a Bearer Authorization header when GRAFANA_TOKEN is non-empty.

# GET a Grafana endpoint and print the body.
grafana_get() {
  local endpoint="$1"
  local url="${GRAFANA_URL}${endpoint}"
  if [[ -n "$GRAFANA_TOKEN" ]]; then
    http_get "$url" -H "Authorization: Bearer ${GRAFANA_TOKEN}"
  else
    http_get "$url"
  fi
}

# GET a Grafana endpoint and print only the HTTP status code.
grafana_get_status() {
  local endpoint="$1"
  local url="${GRAFANA_URL}${endpoint}"
  if [[ -n "$GRAFANA_TOKEN" ]]; then
    http_get_status "$url" -H "Authorization: Bearer ${GRAFANA_TOKEN}"
  else
    http_get_status "$url"
  fi
}

# GET a Grafana endpoint and print body + newline + status in one request.
grafana_get_with_status() {
  local endpoint="$1"
  local url="${GRAFANA_URL}${endpoint}"
  if [[ -n "$GRAFANA_TOKEN" ]]; then
    http_get_with_status "$url" -H "Authorization: Bearer ${GRAFANA_TOKEN}"
  else
    http_get_with_status "$url"
  fi
}

# ── JSON parsing (no jq required) ─────────────────────────────────────
# Regex-based extraction with grep -P. These are heuristics, not a JSON
# parser: they look at the raw text and ignore nesting.
# The text is fed with a here-string and grep is wrapped in `{ ...|| true; }`
# so that neither "no match" nor an early-exiting `head` fails the pipeline
# under `set -o pipefail`.

# Print how many times PCRE pattern $1 matches in text $2.
_count_matches() {
  local n
  n=$({ grep -oP -- "$1" <<<"$2" || true; } | wc -l)
  # Arithmetic expansion strips any padding some wc implementations add.
  printf '%s\n' "$((n))"
}

# Print the first value stored under key $1 in JSON text $2. A leading
# quote is skipped and the value ends at the first '"', ',' or '}'.
json_value() {
  local key="$1"
  local json="$2"
  { grep -oP "\"${key}\"\s*:\s*\"?\K[^\",}]+" <<<"$json" || true; } | head -1
}

# Print the first *string* value stored under key $1 in JSON text $2
# (may be empty; ends at the next double quote).
json_value_string() {
  local key="$1"
  local json="$2"
  { grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" <<<"$json" || true; } | head -1
}

# Print how many times the quoted key $1 occurs in JSON text $2.
json_count() {
  local key="$1"
  local json="$2"
  _count_matches "\"${key}\"" "$json"
}

# ── TLS certificate check ─────────────────────────────────────────────

#######################################
# Record a pass/fail/skip for the expiry date of a server certificate.
# Skips when openssl is missing or the URL is not https://. Fails when the
# certificate cannot be fetched or parsed, is expired, or expires in fewer
# than 14 days; passes otherwise (with a "(warning)" note under 30 days).
# Arguments: $1 - base URL of the service, $2 - display name
# NOTE(review): expiry parsing relies on `date -d` (GNU date).
#######################################
check_tls_cert() {
  local url="$1"
  local name="$2"
  local label="${name} TLS certificate"

  if ! command -v openssl &>/dev/null; then
    record_skip "$label" "openssl not available"
    return 0
  fi
  if [[ "$url" != https://* ]]; then
    record_skip "$label" "not HTTPS"
    return 0
  fi

  # authority = URL without scheme and path, i.e. host or host:port.
  local authority host port
  authority="${url#https://}"
  authority="${authority%%/*}"
  host="${authority%%:*}"
  port="443"
  if [[ "$authority" =~ :([0-9]+)$ ]]; then
    port="${BASH_REMATCH[1]}"
  fi

  # stdin is /dev/null so s_client ends the session right after the
  # handshake instead of waiting for input.
  local cert_info
  cert_info=$(openssl s_client -servername "$host" -connect "${host}:${port}" \
    </dev/null 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null) || {
    record_fail "$label" "could not retrieve certificate"
    return 0
  }

  # openssl prints "notAfter=<date>"; keep only the date on that line.
  local end_date=""
  if [[ "$cert_info" == *notAfter=* ]]; then
    end_date="${cert_info#*notAfter=}"
    end_date="${end_date%%$'\n'*}"
  fi
  if [[ -z "$end_date" ]]; then
    record_fail "$label" "could not parse expiry"
    return 0
  fi

  local end_epoch now_epoch
  end_epoch=$(date -d "$end_date" +%s 2>/dev/null) || {
    record_fail "$label" "could not parse date: ${end_date}"
    return 0
  }
  now_epoch=$(date +%s)

  # Decide "expired" on seconds, not whole days: integer division truncates
  # toward zero, so a certificate that expired a few hours ago would
  # otherwise show up as "0 days" left instead of expired.
  local secs_left=$((end_epoch - now_epoch))
  local days_left=$((secs_left / 86400))
  if ((secs_left < 0)); then
    record_fail "$label" "EXPIRED $((-secs_left / 86400)) days ago"
  elif ((days_left < 14)); then
    record_fail "$label" "expires in ${days_left} days (critical)"
  elif ((days_left < 30)); then
    record_pass "$label" "${days_left} days remaining (warning)"
  else
    record_pass "$label" "${days_left} days remaining"
  fi
}

# ── Output Functions ──────────────────────────────────────────────────

# Print a blank line and a bold section title (text format only).
section_header() {
  local name="$1"
  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    echo ""
    echo -e "${BOLD}${name}${RESET}"
  fi
}

# Print the banner listing the configured endpoints and the current UTC
# time (text format only).
print_header() {
  if [[ "$OUTPUT_FORMAT" != "text" ]]; then
    return 0
  fi
  echo -e "${BOLD}Infra Smoke Tests${RESET}"
  if [[ -n "$PROMETHEUS_URL" ]]; then
    echo "Prometheus: ${PROMETHEUS_URL}"
  fi
  if [[ -n "$GRAFANA_URL" ]]; then
    echo "Grafana: ${GRAFANA_URL}"
  fi
  if [[ -n "$ALERTMANAGER_URL" ]]; then
    echo "Alertmanager: ${ALERTMANAGER_URL}"
  fi
  if [[ -n "$LOKI_URL" ]]; then
    echo "Loki: ${LOKI_URL}"
  fi
  echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
}

# Print pass/fail/skip totals and the elapsed seconds since START_TIME
# (text format only). The "target" shown is the first configured URL.
print_summary() {
  local end_time
  end_time=$(date +%s)
  local duration=$((end_time - START_TIME))
  local target="${PROMETHEUS_URL:-${GRAFANA_URL:-${ALERTMANAGER_URL:-${LOKI_URL:-unknown}}}}"

  if [[ "$OUTPUT_FORMAT" != "text" ]]; then
    return 0
  fi
  echo ""
  echo -e "${BOLD}────────────────────────────────────────${RESET}"
  echo -e "${BOLD}Summary${RESET} ${target}"
  echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)"
  echo -e "${BOLD}────────────────────────────────────────${RESET}"
  if ((FAIL == 0)); then
    echo -e "${GREEN}${BOLD}All tests passed.${RESET}"
  else
    echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"
  fi
}

# Print the TAP version line.
print_tap_header() { echo "TAP version 13"; }

# Print the TAP plan line followed by pass/fail/skip totals as comments.
print_tap_footer() {
  echo "1..${TOTAL}"
  echo "# pass ${PASS}"
  echo "# fail ${FAIL}"
  echo "# skip ${SKIP}"
}

# Escape &, <, > and " in $1 for use in XML text and attribute values.
_xml_escape() {
  printf '%s' "$1" |
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

#######################################
# Write all recorded results to JUNIT_FILE as a JUnit-style XML report.
# Globals: RESULTS, TOTAL, FAIL, SKIP, START_TIME (read), JUNIT_FILE (path)
# NOTE(review): the markup (XML prologue, element tags, entity escapes) had
# been stripped from the text this function was restored from; only the
# statement skeleton and the literal "FAILED: name — detail" text survived.
# Element and attribute names below follow the common JUnit XML layout —
# confirm against the upstream script / your CI consumer.
#######################################
write_junit() {
  local end_time duration
  end_time=$(date +%s)
  duration=$((end_time - START_TIME))

  local result status name detail
  {
    echo '<?xml version="1.0" encoding="UTF-8"?>'
    echo "<testsuites tests=\"${TOTAL}\" failures=\"${FAIL}\" skipped=\"${SKIP}\" time=\"${duration}\">"
    echo "  <testsuite name=\"infra-smoke-tests\" tests=\"${TOTAL}\" failures=\"${FAIL}\" skipped=\"${SKIP}\" time=\"${duration}\">"

    # The ${arr[@]+...} form keeps an empty RESULTS from tripping `set -u`
    # on bash versions older than 4.4.
    for result in ${RESULTS[@]+"${RESULTS[@]}"}; do
      # Records are "STATUS|name|detail"; the last field takes the rest of
      # the line, so a '|' inside the detail is kept.
      IFS='|' read -r status name detail <<<"$result"
      name=$(_xml_escape "$name")
      detail=$(_xml_escape "$detail")

      case "$status" in
        PASS)
          echo "    <testcase name=\"${name}\" classname=\"smoke\">"
          if [[ -n "$detail" ]]; then
            echo "      <system-out>${detail}</system-out>"
          fi
          echo "    </testcase>"
          ;;
        FAIL)
          echo "    <testcase name=\"${name}\" classname=\"smoke\">"
          echo "      <failure message=\"${detail}\">FAILED: ${name} — ${detail}</failure>"
          echo "    </testcase>"
          ;;
        SKIP)
          echo "    <testcase name=\"${name}\" classname=\"smoke\">"
          echo "      <skipped message=\"${detail}\"/>"
          echo "    </testcase>"
          ;;
      esac
    done

    echo "  </testsuite>"
    echo "</testsuites>"
  } >"$JUNIT_FILE"
}

# ── Test Suites ───────────────────────────────────────────────────────

# Probe one health endpoint (pass only on HTTP 200), then check the TLS
# certificate of the same service.
# Arguments: $1 - service name, $2 - base URL, $3 - health path,
#            $4 - test name to record
_check_health() {
  local name="$1"
  local base_url="$2"
  local path="$3"
  local label="$4"

  local status
  status=$(http_get_status "${base_url}${path}") || status="000"
  if [[ "$status" == "200" ]]; then
    record_pass "$label" "HTTP ${status}"
  else
    record_fail "$label" "HTTP ${status}"
  fi
  check_tls_cert "$base_url" "$name"
}

# Health/readiness probe plus TLS check for every configured service that
# is not disabled through its SKIP_* switch.
test_connectivity() {
  section_header "Connectivity"

  if [[ -n "$PROMETHEUS_URL" ]]; then
    _check_health "Prometheus" "$PROMETHEUS_URL" "/-/healthy" "Prometheus health endpoint"
  fi
  if [[ -n "$GRAFANA_URL" && "$SKIP_GRAFANA" != "true" ]]; then
    _check_health "Grafana" "$GRAFANA_URL" "/api/health" "Grafana health endpoint"
  fi
  if [[ -n "$ALERTMANAGER_URL" && "$SKIP_ALERTMANAGER" != "true" ]]; then
    _check_health "Alertmanager" "$ALERTMANAGER_URL" "/-/healthy" "Alertmanager health endpoint"
  fi
  if [[ -n "$LOKI_URL" && "$SKIP_LOKI" != "true" ]]; then
    _check_health "Loki" "$LOKI_URL" "/ready" "Loki ready endpoint"
  fi
}

# Prometheus API checks: targets, expected jobs (EXPECTED_JOBS, comma
# separated), alerting rules, TSDB stats and config. No-op without
# PROMETHEUS_URL.
test_prometheus() {
  if [[ -z "$PROMETHEUS_URL" ]]; then
    return 0
  fi
  section_header "Prometheus"

  # Targets
  local body
  body=$(http_get "${PROMETHEUS_URL}/api/v1/targets?state=active") || body=""
  if [[ -n "$body" ]]; then
    local active_count total_count
    active_count=$(_count_matches '"health"\s*:\s*"up"' "$body")
    total_count=$(_count_matches '"health"\s*:\s*"' "$body")
    if ((total_count > 0)); then
      record_pass "Prometheus targets" "${active_count}/${total_count} targets up"
    else
      record_fail "Prometheus targets" "no targets found"
    fi
  else
    record_fail "Prometheus targets" "could not query targets API"
  fi

  # Check expected jobs if configured
  if [[ -n "$EXPECTED_JOBS" ]]; then
    local targets_body
    targets_body=$(http_get "${PROMETHEUS_URL}/api/v1/targets") || targets_body=""

    local -a jobs=()
    local job job_re job_health
    IFS=',' read -ra jobs <<<"$EXPECTED_JOBS"
    for job in ${jobs[@]+"${jobs[@]}"}; do
      # Trim surrounding whitespace; ignore empty entries ("a,,b", "a,").
      job="${job#"${job%%[![:space:]]*}"}"
      job="${job%"${job##*[![:space:]]}"}"
      if [[ -z "$job" ]]; then
        continue
      fi

      # \Q...\E makes PCRE treat the job name literally (dots etc.).
      job_re="\"job\"\s*:\s*\"\Q${job}\E\""
      if grep -qP -- "$job_re" <<<"$targets_body"; then
        # In the targets response "health" comes after the closing brace of
        # the "labels" object that holds "job", so the gap must be allowed
        # to contain '}'. Forbidding '{' instead keeps the match inside the
        # same target (the next target starts with '{').
        job_health=$({ grep -oP -- "${job_re}[^{]*\"health\"\s*:\s*\"\K[^\"]*" <<<"$targets_body" || true; } | head -1)
        if [[ "$job_health" == "up" ]]; then
          record_pass "Expected job: ${job}" "up"
        else
          record_fail "Expected job: ${job}" "health: ${job_health:-unknown}"
        fi
      else
        record_fail "Expected job: ${job}" "not found in targets"
      fi
    done
  fi

  # Alerting rules
  local rules_body
  rules_body=$(http_get "${PROMETHEUS_URL}/api/v1/rules") || rules_body=""
  if [[ -n "$rules_body" ]]; then
    local rule_groups firing_count
    rule_groups=$(_count_matches '"type"\s*:\s*"alerting"' "$rules_body")
    firing_count=$(_count_matches '"state"\s*:\s*"firing"' "$rules_body")
    record_pass "Prometheus alerting rules" "${rule_groups} rules loaded, ${firing_count} firing"
  else
    record_fail "Prometheus alerting rules" "could not query rules API"
  fi

  # TSDB stats
  local tsdb_body
  tsdb_body=$(http_get "${PROMETHEUS_URL}/api/v1/status/tsdb") || tsdb_body=""
  if [[ -n "$tsdb_body" ]]; then
    local num_series
    num_series=$(json_value "numSeries" "$tsdb_body")
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if [[ "$num_series" =~ ^[0-9]+$ ]] && ((10#$num_series > 0)); then
      record_pass "Prometheus TSDB stats" "${num_series} time series"
    else
      record_pass "Prometheus TSDB stats" "responding"
    fi
  else
    record_fail "Prometheus TSDB stats" "could not query TSDB status"
  fi

  # Config reload check
  local config_body
  config_body=$(http_get "${PROMETHEUS_URL}/api/v1/status/config") || config_body=""
  if [[ -n "$config_body" ]]; then
    local config_status
    config_status=$(json_value "status" "$config_body")
    if [[ "$config_status" == "success" ]]; then
      record_pass "Prometheus config" "loaded successfully"
    else
      record_fail "Prometheus config" "status: ${config_status:-unknown}"
    fi
  else
    record_fail "Prometheus config" "could not query config API"
  fi
}

# Grafana API checks: datasources, dashboard search and org (auth) info.
# No-op without GRAFANA_URL or when SKIP_GRAFANA=true.
#
# Body and status come from the same request. An HTTP 401/403 answer still
# carries a non-empty JSON error body, so the status has to be looked at
# before the body; otherwise an auth failure is counted as "0 datasources".
test_grafana() {
  if [[ -z "$GRAFANA_URL" || "$SKIP_GRAFANA" == "true" ]]; then
    return 0
  fi
  section_header "Grafana"

  local raw status body

  # Datasources
  raw=$(grafana_get_with_status "/api/datasources") || raw=""
  status=$(_resp_status "$raw")
  body=$(_resp_body "$raw")
  if [[ "$status" == "401" || "$status" == "403" ]]; then
    record_skip "Grafana datasources" "authentication required (HTTP ${status})"
  elif [[ -n "$body" && "$body" != "null" ]]; then
    local ds_count
    ds_count=$(_count_matches '"id"\s*:' "$body")
    if ((ds_count > 0)); then
      record_pass "Grafana datasources" "${ds_count} configured"
    else
      record_pass "Grafana datasources" "API responding (0 datasources)"
    fi
  else
    record_fail "Grafana datasources" "could not query datasources API"
  fi

  # Dashboards search
  raw=$(grafana_get_with_status "/api/search?type=dash-db&limit=1000") || raw=""
  status=$(_resp_status "$raw")
  body=$(_resp_body "$raw")
  if [[ "$status" == "401" || "$status" == "403" ]]; then
    record_skip "Grafana dashboards" "authentication required (HTTP ${status})"
  elif [[ -n "$body" && "$body" != "null" ]]; then
    local dash_count
    dash_count=$(_count_matches '"uid"\s*:' "$body")
    record_pass "Grafana dashboards" "${dash_count} found"
  else
    record_fail "Grafana dashboards" "could not query search API"
  fi

  # Auth check (org info)
  raw=$(grafana_get_with_status "/api/org") || raw=""
  status=$(_resp_status "$raw")
  body=$(_resp_body "$raw")
  if [[ -z "$body" ]]; then
    record_skip "Grafana authentication" "could not query org API"
  else
    local org_name
    org_name=$(json_value_string "name" "$body")
    if [[ -n "$org_name" ]]; then
      record_pass "Grafana authentication" "org: ${org_name}"
    elif [[ "$status" == "401" || "$status" == "403" ]]; then
      record_skip "Grafana authentication" "token not provided or invalid"
    else
      record_pass "Grafana authentication" "API responding"
    fi
  fi
}

# Alertmanager API checks: cluster status, active alerts and receivers.
# No-op without ALERTMANAGER_URL or when SKIP_ALERTMANAGER=true.
test_alertmanager() {
  if [[ -z "$ALERTMANAGER_URL" || "$SKIP_ALERTMANAGER" == "true" ]]; then
    return 0
  fi
  section_header "Alertmanager"

  # Cluster status
  local cluster_body
  cluster_body=$(http_get "${ALERTMANAGER_URL}/api/v2/status") || cluster_body=""
  if [[ -n "$cluster_body" ]]; then
    local cluster_status peers
    cluster_status=$(json_value_string "status" "$cluster_body")
    peers=$(_count_matches '"address"\s*:' "$cluster_body")
    # Any non-empty status is reported as-is; it is informational only.
    if [[ -n "$cluster_status" ]]; then
      record_pass "Alertmanager cluster status" "${cluster_status}, ${peers} peer(s)"
    else
      record_pass "Alertmanager cluster status" "responding"
    fi
  else
    record_fail "Alertmanager cluster status" "could not query status API"
  fi

  # Active alerts
  local alerts_body
  alerts_body=$(http_get "${ALERTMANAGER_URL}/api/v2/alerts?active=true") || alerts_body=""
  if [[ -n "$alerts_body" ]]; then
    local alert_count
    alert_count=$(_count_matches '"fingerprint"\s*:' "$alerts_body")
    record_pass "Alertmanager active alerts" "${alert_count} active"
  else
    record_fail "Alertmanager active alerts" "could not query alerts API"
  fi

  # Receivers
  local receivers_body
  receivers_body=$(http_get "${ALERTMANAGER_URL}/api/v2/receivers") || receivers_body=""
  if [[ -n "$receivers_body" ]]; then
    local receiver_count
    receiver_count=$(_count_matches '"name"\s*:' "$receivers_body")
    record_pass "Alertmanager receivers" "${receiver_count} configured"
  else
    record_fail "Alertmanager receivers" "could not query receivers API"
  fi
}

# Loki API checks: label listing and a minimal instant query.
# No-op without LOKI_URL or when SKIP_LOKI=true.
test_loki() {
  if [[ -z "$LOKI_URL" || "$SKIP_LOKI" == "true" ]]; then
    return 0
  fi
  section_header "Loki"

  # Labels
  local labels_body
  labels_body=$(http_get "${LOKI_URL}/loki/api/v1/labels") || labels_body=""
  if [[ -n "$labels_body" ]]; then
    local labels_status
    labels_status=$(json_value "status" "$labels_body")
    if [[ "$labels_status" == "success" ]]; then
      # Count quoted strings inside the "data" array only; counting every
      # string followed by ',' or ']' would also count "success".
      local label_list label_count
      label_list=$(tr -d '\n' <<<"$labels_body" | { grep -oP '"data"\s*:\s*\[\K[^\]]*' || true; } | head -1)
      label_count=$(_count_matches '"[^"]+"' "$label_list")
      record_pass "Loki labels" "${label_count} labels found"
    else
      record_fail "Loki labels" "status: ${labels_status:-unknown}"
    fi
  else
    record_fail "Loki labels" "could not query labels API"
  fi

  # Basic query
  # Percent-encoded form of the LogQL selector {job=~".+"}. The selector is
  # a fixed literal, so its encoding is a constant and needs no helper
  # process.
  local encoded_query='%7Bjob%3D~%22.%2B%22%7D'
  local query_body
  query_body=$(http_get "${LOKI_URL}/loki/api/v1/query?query=${encoded_query}&limit=1") || query_body=""
  if [[ -n "$query_body" ]]; then
    local query_status
    query_status=$(json_value "status" "$query_body")
    if [[ "$query_status" == "success" ]]; then
      record_pass "Loki query" "query engine responding"
    else
      record_fail "Loki query" "status: ${query_status:-unknown}"
    fi
  else
    record_fail "Loki query" "could not query Loki"
  fi
}

# End-to-end check: run the PromQL query `up` through Grafana's datasource
# proxy. Needs both PROMETHEUS_URL and GRAFANA_URL; no-op when
# SKIP_GRAFANA=true.
test_integration() {
  if [[ -z "$PROMETHEUS_URL" || -z "$GRAFANA_URL" || "$SKIP_GRAFANA" == "true" ]]; then
    return 0
  fi
  section_header "Integration"

  # Grafana → Prometheus query via Grafana proxy
  local ds_body
  ds_body=$(grafana_get "/api/datasources") || ds_body=""
  if [[ -z "$ds_body" || "$ds_body" == "null" ]]; then
    record_skip "Grafana → Prometheus query" "could not list datasources"
    return 0
  fi

  # Find a Prometheus datasource UID. Within one datasource object the
  # "uid" field may come before or after "type", so both orders are tried;
  # [^{}] keeps each match inside a single object.
  local prom_uid
  prom_uid=$({ grep -oP '"uid"\s*:\s*"\K[^"]*(?="[^{}]*"type"\s*:\s*"prometheus")' <<<"$ds_body" || true; } | head -1)
  if [[ -z "$prom_uid" ]]; then
    prom_uid=$({ grep -oP '"type"\s*:\s*"prometheus"[^}]*"uid"\s*:\s*"\K[^"]*' <<<"$ds_body" || true; } | head -1)
  fi
  if [[ -z "$prom_uid" ]]; then
    # Last resort: first UID of any datasource.
    prom_uid=$({ grep -oP '"uid"\s*:\s*"\K[^"]*' <<<"$ds_body" || true; } | head -1)
  fi
  if [[ -z "$prom_uid" ]]; then
    record_skip "Grafana → Prometheus query" "no Prometheus datasource found"
    return 0
  fi

  local proxy_body
  proxy_body=$(grafana_get "/api/datasources/proxy/uid/${prom_uid}/api/v1/query?query=up") || proxy_body=""
  if [[ -n "$proxy_body" ]]; then
    local proxy_status
    proxy_status=$(json_value "status" "$proxy_body")
    if [[ "$proxy_status" == "success" ]]; then
      record_pass "Grafana → Prometheus query" "proxy query succeeded via datasource ${prom_uid}"
    else
      record_fail "Grafana → Prometheus query" "proxy returned: ${proxy_status:-unknown}"
    fi
  else
    record_skip "Grafana → Prometheus query" "proxy query returned empty (may need auth)"
  fi
}

# ── Main / Argument Parsing ───────────────────────────────────────────
# NOTE(review): the text this section was restored from is cut off right
# after the opening of usage(), shown verbatim on the next line. The rest
# of usage() and whatever follows it is not visible here and has to be
# restored from the upstream script.
# usage() { cat <