Files
linux-scripts/alertmanager-exporter.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

684 lines
21 KiB
Bash

#!/bin/bash
################################################################################
# Script Name: alertmanager-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for Alertmanager operational overview.
# Queries the Alertmanager API for active alerts, silences,
# cluster health, and config status. Complements the built-in
# /metrics endpoint with higher-level operational metrics.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
# - curl
# - jq
# - Alertmanager running and accessible
# - netcat (nc) for HTTP mode
#
# Usage:
# # Output to stdout
# ./alertmanager-exporter.sh
#
# # HTTP server mode
# ./alertmanager-exporter.sh --http -p 9094
#
# # Textfile collector mode
# ./alertmanager-exporter.sh --textfile
#
# Metrics Exported:
# - alertmanager_overview_up - Exporter status (1=up, 0=down)
# - alertmanager_overview_info - Alertmanager version info
# - alertmanager_overview_alerts_active_total - Total active alerts
# - alertmanager_overview_alerts_by_state - Alerts by state
# - alertmanager_overview_alerts_by_severity - Alerts by severity
# - alertmanager_overview_alerts_by_receiver - Alerts by receiver
# - alertmanager_overview_alert_groups_total - Alert group count
# - alertmanager_overview_silences_active - Active silences
# - alertmanager_overview_silences_pending - Pending silences
# - alertmanager_overview_silences_expired - Expired silences
# - alertmanager_overview_silence_coverage_ratio - Silence coverage
# - alertmanager_overview_cluster_peers - Peer count
# - alertmanager_overview_cluster_peer_healthy - Per-peer health
# - alertmanager_overview_config_hash - Config hash for drift detection
# - alertmanager_overview_uptime_seconds - Uptime
# - alertmanager_overview_last_config_reload_timestamp - Last reload
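# - alertmanager_overview_exporter_duration_seconds - Script duration
# - alertmanager_overview_exporter_last_run_timestamp - Last run time
# - alertmanager_overview_alert_resolution_rate - Suppressed/total alert ratio
# - alertmanager_overview_notification_rate - Notifications sent per integration
# - alertmanager_overview_notification_failures - Notification failures per integration
# - alertmanager_overview_notification_latency_seconds - Notification latency sum per integration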
#
# Configuration:
# Default HTTP port: 9094
# Textfile directory: /var/lib/node_exporter
# Alertmanager URL: http://localhost:9093
#
################################################################################
# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================
# Directory used for --textfile mode; the output file is created inside it.
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file for the metrics; when empty, metrics go to stdout.
OUTPUT_FILE=""
# Set to true by --http to serve metrics over HTTP instead of a one-shot run.
HTTP_MODE=false
# Listening port for --http mode (overridden by -p/--port).
HTTP_PORT=9094
# Base URL of the Alertmanager instance to query (overridden by -u/--url).
AM_URL="http://localhost:9093"
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
# Print the command-line help text to stdout and terminate the script.
# Globals:  HTTP_PORT, AM_URL (read; shown as the current defaults)
# Outputs:  usage text on stdout
# Returns:  never returns; exits with status 0
show_usage() {
  local prog="$0"

  printf '%s\n' \
    "Usage: ${prog} [OPTIONS]" \
    "Export Alertmanager operational overview as Prometheus metrics." \
    "MODES:" \
    "--textfile Write to node_exporter textfile collector" \
    "--http Run HTTP server on port ${HTTP_PORT}" \
    "OPTIONS:" \
    "-p, --port HTTP port (default: 9094)" \
    "-u, --url Alertmanager URL (default: ${AM_URL})" \
    "-o, --output Output file path" \
    "EXAMPLES:" \
    "${prog} --textfile # Write to textfile collector" \
    "${prog} --http --port 9094 # Run HTTP server" \
    "${prog} --url http://alertmanager:9093 # Custom Alertmanager URL" \
    "${prog} -o /tmp/alertmanager-overview.prom # Write to custom file"

  exit 0
}
# Parse command-line options into the global configuration variables.
# Globals:   TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT, AM_URL (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 1 on an unknown option, a missing option
#            value, or an invalid port; -h/--help prints usage and exits 0
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/alertmanager-overview.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port|-u|--url|-o|--output)
        # "shift 2" with a single remaining argument fails WITHOUT shifting,
        # which would spin this loop forever; reject the missing value first.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: Option $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          -p|--port)
            # 10# forces base 10 so a leading zero is not read as octal.
            if ! [[ "$2" =~ ^[0-9]{1,5}$ ]] \
              || (( 10#$2 < 1 || 10#$2 > 65535 )); then
              echo "ERROR: Invalid port: $2" >&2
              exit 1
            fi
            HTTP_PORT="$2"
            ;;
          -u|--url)
            AM_URL="$2"
            ;;
          -o|--output)
            OUTPUT_FILE="$2"
            ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
# Verify that the external tools this script depends on are installed.
# Outputs:  one "ERROR: <tool> not found" line on stderr per missing tool
# Returns:  0 when curl and jq are both available, 1 otherwise
check_requirements() {
  local tool status=0

  for tool in curl jq; do
    command -v "$tool" >/dev/null 2>&1 && continue
    echo "ERROR: $tool not found" >&2
    status=1
  done

  return "$status"
}
# Fetch one Alertmanager API endpoint.
# Globals:   AM_URL (read)
# Arguments: $1 - endpoint path appended to AM_URL (e.g. /api/v2/alerts)
# Outputs:   the response body on stdout; curl's stderr is discarded
# Returns:   curl's exit status (-f makes HTTP errors non-zero)
am_api() {
  local -a curl_opts=(-sf --connect-timeout 5 --max-time 10)
  local url="${AM_URL}$1"

  curl "${curl_opts[@]}" "$url" 2>/dev/null
}
# ============================================================================
# METRIC COLLECTION FUNCTIONS
# ============================================================================
# Fetch /api/v2/alerts and count the alerts per state.
# Globals written: ALERTS_JSON (raw response, "[]" on failure), ALERTS_TOTAL,
#                  ALERTS_ACTIVE, ALERTS_SUPPRESSED, ALERTS_UNPROCESSED
# Returns: 0 on success; 1 when the request fails or the body is not a JSON
#          array, in which case every counter is 0 and ALERTS_JSON is "[]"
collect_alerts() {
  local alerts_json counts

  # Start from safe defaults so a failure never leaves an empty counter
  # behind (an empty value would produce an invalid metric line).
  ALERTS_TOTAL=0
  ALERTS_ACTIVE=0
  ALERTS_SUPPRESSED=0
  ALERTS_UNPROCESSED=0
  ALERTS_JSON="[]"

  alerts_json=$(am_api "/api/v2/alerts")
  if [[ -z "$alerts_json" ]]; then
    return 1
  fi

  # One jq pass: reject anything that is not an array (e.g. an HTML error
  # page served by a proxy), then emit the four counts tab-separated.
  counts=$(printf '%s\n' "$alerts_json" | jq -r '
    if type != "array" then error("expected a JSON array") else . end
    | [ length,
        (map(select(.status.state == "active")) | length),
        (map(select(.status.state == "suppressed")) | length),
        (map(select(.status.state == "unprocessed")) | length) ]
    | @tsv' 2>/dev/null) || return 1
  if [[ -z "$counts" ]]; then
    return 1
  fi

  ALERTS_JSON="$alerts_json"
  IFS=$'\t' read -r ALERTS_TOTAL ALERTS_ACTIVE ALERTS_SUPPRESSED \
    ALERTS_UNPROCESSED <<<"$counts"
}
# Emit one metric line per well-known severity, plus an "other" bucket.
# Globals:  ALERTS_JSON (read) - alert array as returned by the API
# Outputs:  alertmanager_overview_alerts_by_severity{severity="..."} lines for
#           critical, warning and info (always), and for "other" only when at
#           least one alert has a missing or different severity label
collect_alerts_by_severity() {
  local level matched
  local -a known_levels=(critical warning info)

  for level in "${known_levels[@]}"; do
    matched=$(jq --arg sev "$level" \
      'map(select(.labels.severity == $sev)) | length' <<<"$ALERTS_JSON")
    printf '%s\n' \
      "alertmanager_overview_alerts_by_severity{severity=\"${level}\"} ${matched:-0}"
  done

  # Everything that is not one of the three known severities (including
  # alerts that carry no severity label at all).
  matched=$(jq 'map(select(
      (.labels.severity == "critical"
        or .labels.severity == "warning"
        or .labels.severity == "info") | not
    )) | length' <<<"$ALERTS_JSON")
  if [ "$matched" -gt 0 ]; then
    printf '%s\n' \
      "alertmanager_overview_alerts_by_severity{severity=\"other\"} ${matched}"
  fi
}
# Emit one metric line per receiver with the number of alerts routed to it.
# Globals:  ALERTS_JSON (read) - alert array as returned by the API
# Outputs:  alertmanager_overview_alerts_by_receiver{receiver="..."} <count>
#           lines; an alert without any receiver is counted under "unknown"
# Notes:    receiver names are escaped for the Prometheus text format
#           (backslash, double quote, newline) so an unusual name cannot
#           corrupt the exposition output. jq errors are discarded on purpose:
#           on bad input the function simply prints nothing.
collect_alerts_by_receiver() {
  printf '%s\n' "$ALERTS_JSON" | jq -r '
    def prom_escape:
      tostring
      | gsub("\\\\"; "\\\\")
      | gsub("\""; "\\\"")
      | gsub("\n"; "\\n");
    [.[] | .receivers[]?.name // "unknown"]
    | group_by(.)
    | .[]
    | "alertmanager_overview_alerts_by_receiver{receiver=\"\(.[0] | prom_escape)\"} \(length)"
  ' 2>/dev/null
}
# Print the number of alert groups reported by /api/v2/alerts/groups.
# Outputs:  a single non-negative integer on stdout; "0" when the request
#           fails or the response is not a JSON array, so the caller never
#           ends up with an empty metric value
collect_alert_groups() {
  local groups_json group_count

  groups_json=$(am_api "/api/v2/alerts/groups")
  if [[ -z "$groups_json" ]]; then
    echo "0"
    return
  fi

  group_count=$(printf '%s\n' "$groups_json" \
    | jq 'if type == "array" then length else error("expected a JSON array") end' \
      2>/dev/null) || group_count=""
  if ! [[ "$group_count" =~ ^[0-9]+$ ]]; then
    group_count=0
  fi
  echo "$group_count"
}
# Fetch /api/v2/silences and count the silences per state.
# Globals written: SILENCES_ACTIVE, SILENCES_PENDING, SILENCES_EXPIRED
# Returns: 0 on success; 1 when the request fails or the body is not a JSON
#          array, in which case all three counters are 0
collect_silences() {
  local silences_json counts

  # Safe defaults first: a failed parse must not leave an empty counter.
  SILENCES_ACTIVE=0
  SILENCES_PENDING=0
  SILENCES_EXPIRED=0

  silences_json=$(am_api "/api/v2/silences")
  if [[ -z "$silences_json" ]]; then
    return 1
  fi

  # One jq pass producing the three counts tab-separated.
  counts=$(printf '%s\n' "$silences_json" | jq -r '
    if type != "array" then error("expected a JSON array") else . end
    | [ (map(select(.status.state == "active")) | length),
        (map(select(.status.state == "pending")) | length),
        (map(select(.status.state == "expired")) | length) ]
    | @tsv' 2>/dev/null) || return 1
  if [[ -z "$counts" ]]; then
    return 1
  fi

  IFS=$'\t' read -r SILENCES_ACTIVE SILENCES_PENDING SILENCES_EXPIRED \
    <<<"$counts"
}
# Compute the share of alerts that are suppressed.
# Globals:  ALERTS_SUPPRESSED, ALERTS_TOTAL (read)
# Outputs:  suppressed / total with four decimals (e.g. 0.2500), or "0" when
#           there are no alerts or either counter is unset / not an integer
calculate_silence_coverage() {
  local total="${ALERTS_TOTAL:-0}"
  local suppressed="${ALERTS_SUPPRESSED:-0}"

  if [[ "$total" =~ ^[0-9]+$ && "$suppressed" =~ ^[0-9]+$ ]] \
    && (( 10#$total > 0 )); then
    # Values are handed to awk with -v rather than spliced into the program
    # text, so they can only ever be treated as data.
    awk -v s="$suppressed" -v t="$total" 'BEGIN { printf "%.4f", s / t }'
  else
    echo "0"
  fi
}
# Collect version, cluster, uptime, config-hash and last-reload information.
# Globals read:    AM_URL
# Globals written: AM_VERSION, CLUSTER_STATUS, CLUSTER_PEERS,
#                  CLUSTER_PEERS_JSON, AM_UPTIME_SECONDS, CONFIG_HASH,
#                  LAST_RELOAD
# Returns: 0 on success; 1 when /api/v2/status cannot be fetched, in which
#          case every global above holds its default value
# Notes:   uptime parsing relies on `date -d`, and the config hash on
#          `sha256sum`; when either fails the matching value falls back to 0.
collect_cluster_status() {
  local status_json start_time start_epoch now_epoch config_text reload_ts

  # Defaults first, so every global (including CLUSTER_PEERS_JSON) is defined
  # even when the status request fails.
  CLUSTER_PEERS=0
  CLUSTER_PEERS_JSON="[]"
  # shellcheck disable=SC2034 # reserved for future use
  CLUSTER_STATUS="unknown"
  AM_VERSION="unknown"
  AM_UPTIME_SECONDS=0
  CONFIG_HASH="0"
  LAST_RELOAD=0

  status_json=$(am_api "/api/v2/status")
  if [[ -z "$status_json" ]]; then
    return 1
  fi

  AM_VERSION=$(jq -r '.versionInfo.version // "unknown"' <<<"$status_json")
  AM_VERSION=${AM_VERSION:-unknown}

  # Cluster info
  # shellcheck disable=SC2034 # reserved for future use
  CLUSTER_STATUS=$(jq -r '.cluster.status // "disabled"' <<<"$status_json")
  CLUSTER_PEERS=$(jq '.cluster.peers // [] | length' <<<"$status_json")
  CLUSTER_PEERS=${CLUSTER_PEERS:-0}
  # Peer details (for per-peer health metrics)
  CLUSTER_PEERS_JSON=$(jq '.cluster.peers // []' <<<"$status_json")
  CLUSTER_PEERS_JSON=${CLUSTER_PEERS_JSON:-[]}

  # Uptime: the status payload's .uptime field is parsed as a timestamp and
  # subtracted from the current time.
  start_time=$(jq -r '.uptime // empty' <<<"$status_json" 2>/dev/null)
  if [[ -n "$start_time" ]]; then
    start_epoch=$(date -d "$start_time" +%s 2>/dev/null || echo 0)
    now_epoch=$(date +%s)
    if [ "$start_epoch" -gt 0 ]; then
      AM_UPTIME_SECONDS=$((now_epoch - start_epoch))
    fi
  fi

  # Config hash: first 16 hex characters of the SHA-256 of the original
  # configuration text, used for drift detection.
  config_text=$(jq -r '.config.original // ""' <<<"$status_json")
  if [[ -n "$config_text" ]]; then
    CONFIG_HASH=$(printf '%s\n' "$config_text" | sha256sum | cut -c1-16)
    CONFIG_HASH=${CONFIG_HASH:-0}
  fi

  # Last config reload is not part of /api/v2/status, so read it from the
  # built-in /metrics endpoint. Use the same timeouts as am_api so a stalled
  # endpoint cannot hang the whole exporter.
  reload_ts=$(curl -sf --connect-timeout 5 --max-time 10 \
      "${AM_URL}/metrics" 2>/dev/null \
    | awk '/^alertmanager_config_last_reload_success_timestamp_seconds/ && !seen++ { print $2 }')
  LAST_RELOAD=${reload_ts:-0}
}
# Print one "peer healthy" metric line per cluster peer.
# Globals:  CLUSTER_PEERS (read), CLUSTER_PEERS_JSON (read)
# Outputs:  alertmanager_overview_cluster_peer_healthy{peer="<address>"} 1 for
#           every listed peer ("unknown" when a peer has no address); the
#           value is always the constant 1. Nothing is printed when there are
#           no peers or no peer data. jq errors are discarded.
output_peer_metrics() {
  [ "$CLUSTER_PEERS" -eq 0 ] && return
  [ -z "$CLUSTER_PEERS_JSON" ] && return

  jq -r '.[]
    | "alertmanager_overview_cluster_peer_healthy{peer=\"\(.address // "unknown")\"} 1"' \
    <<<"$CLUSTER_PEERS_JSON" 2>/dev/null
}
# Download Alertmanager's built-in /metrics page for later filtering.
# Globals:  AM_URL (read); NOTIFICATION_METRICS (written: raw page text, or
#           empty when the download fails)
# Returns:  0 when a non-empty page was fetched, 1 otherwise
collect_notification_metrics() {
  # Clear first so a failed fetch can never leave an earlier value behind.
  NOTIFICATION_METRICS=""

  local metrics_raw
  # Same timeouts as am_api: without them a stalled endpoint would block the
  # whole exporter run.
  metrics_raw=$(curl -sf --connect-timeout 5 --max-time 10 \
    "${AM_URL}/metrics" 2>/dev/null)
  if [[ -z "$metrics_raw" ]]; then
    return 1
  fi
  NOTIFICATION_METRICS="$metrics_raw"
}
# Re-export the built-in per-integration notification counters under this
# exporter's own metric name.
# Globals:  NOTIFICATION_METRICS (read) - raw /metrics text
# Outputs:  every alertmanager_notifications_total{...} sample, renamed to
#           alertmanager_overview_notification_rate{...}; nothing when
#           NOTIFICATION_METRICS is empty
output_notification_rates() {
  [ -z "$NOTIFICATION_METRICS" ] && return

  # Select and rename in one step: only lines that start with the labelled
  # series name are printed.
  sed -n \
    's/^alertmanager_notifications_total{/alertmanager_overview_notification_rate{/p' \
    <<<"$NOTIFICATION_METRICS" 2>/dev/null
}
# Re-export the built-in per-integration notification failure counters under
# this exporter's own metric name.
# Globals:  NOTIFICATION_METRICS (read) - raw /metrics text
# Outputs:  every alertmanager_notifications_failed_total{...} sample, renamed
#           to alertmanager_overview_notification_failures{...}; nothing when
#           NOTIFICATION_METRICS is empty
output_notification_failures() {
  [ -z "$NOTIFICATION_METRICS" ] && return

  # Select and rename in one step: only lines that start with the labelled
  # series name are printed.
  sed -n \
    's/^alertmanager_notifications_failed_total{/alertmanager_overview_notification_failures{/p' \
    <<<"$NOTIFICATION_METRICS" 2>/dev/null
}
# Re-export the built-in per-integration notification latency sums under this
# exporter's own metric name.
# Globals:  NOTIFICATION_METRICS (read) - raw /metrics text
# Outputs:  every alertmanager_notification_latency_seconds_sum{...} sample,
#           renamed to alertmanager_overview_notification_latency_seconds{...};
#           nothing when NOTIFICATION_METRICS is empty
# Note:     only the _sum series is forwarded; no average is computed here.
output_notification_latency() {
  [ -z "$NOTIFICATION_METRICS" ] && return

  # Select and rename in one step: only lines that start with the labelled
  # _sum series name are printed.
  sed -n \
    's/^alertmanager_notification_latency_seconds_sum{/alertmanager_overview_notification_latency_seconds{/p' \
    <<<"$NOTIFICATION_METRICS" 2>/dev/null
}
# ============================================================================
# METRIC OUTPUT
# ============================================================================
# Produce the complete metrics document on stdout.
# Probes /api/v2/status first: when Alertmanager is unreachable only the "up"
# metric (0) and the exporter self-metrics are printed. Otherwise the
# collect_* functions are run to fill the global counters, and each metric
# family is printed with its HELP/TYPE header. Families whose data is derived
# from optional sources (receivers, peers, built-in notification metrics) are
# printed only when they have at least one sample.
# Globals read: the ALERTS_*, SILENCES_*, CLUSTER_PEERS, AM_VERSION,
#               AM_UPTIME_SECONDS, CONFIG_HASH and LAST_RELOAD values set by
#               the collect_* functions
generate_metrics() {
local script_start
script_start=$(date +%s)
# Check if Alertmanager is reachable
local am_up=1
if ! am_api "/api/v2/status" >/dev/null 2>&1; then
am_up=0
fi
cat <<EOF
# HELP alertmanager_overview_up Alertmanager exporter status (1=up, 0=down)
# TYPE alertmanager_overview_up gauge
alertmanager_overview_up $am_up
EOF
# If Alertmanager is down, output minimal metrics and return early
if [ "$am_up" -eq 0 ]; then
local script_end script_duration
script_end=$(date +%s)
script_duration=$((script_end - script_start))
cat <<EOF
# HELP alertmanager_overview_exporter_duration_seconds Time to generate all metrics
# TYPE alertmanager_overview_exporter_duration_seconds gauge
alertmanager_overview_exporter_duration_seconds $script_duration
# HELP alertmanager_overview_exporter_last_run_timestamp Unix timestamp of last run
# TYPE alertmanager_overview_exporter_last_run_timestamp gauge
alertmanager_overview_exporter_last_run_timestamp $script_end
EOF
return
fi
# Collect data from API (return codes are ignored; each collector falls back
# to default values on its own failure path)
collect_alerts
collect_silences
collect_cluster_status
collect_notification_metrics
echo ""
# Version info
cat <<EOF
# HELP alertmanager_overview_info Alertmanager version and exporter version
# TYPE alertmanager_overview_info gauge
alertmanager_overview_info{version="$AM_VERSION",exporter_version="1.0"} 1
EOF
echo ""
# Active alerts total (reports ALERTS_TOTAL, i.e. every alert returned by the
# alerts endpoint regardless of its state)
cat <<EOF
# HELP alertmanager_overview_alerts_active_total Total active alerts
# TYPE alertmanager_overview_alerts_active_total gauge
alertmanager_overview_alerts_active_total $ALERTS_TOTAL
EOF
echo ""
# Alerts by state
cat <<EOF
# HELP alertmanager_overview_alerts_by_state Active alerts by state
# TYPE alertmanager_overview_alerts_by_state gauge
alertmanager_overview_alerts_by_state{state="active"} $ALERTS_ACTIVE
alertmanager_overview_alerts_by_state{state="suppressed"} $ALERTS_SUPPRESSED
alertmanager_overview_alerts_by_state{state="unprocessed"} $ALERTS_UNPROCESSED
EOF
echo ""
# Alerts by severity (header printed here, samples by the helper)
cat <<EOF
# HELP alertmanager_overview_alerts_by_severity Active alerts by severity label
# TYPE alertmanager_overview_alerts_by_severity gauge
EOF
collect_alerts_by_severity
echo ""
# Alerts by receiver (whole family omitted when there are no samples)
local receiver_lines
receiver_lines=$(collect_alerts_by_receiver)
if [ -n "$receiver_lines" ]; then
cat <<EOF
# HELP alertmanager_overview_alerts_by_receiver Active alerts by receiver
# TYPE alertmanager_overview_alerts_by_receiver gauge
$receiver_lines
EOF
echo ""
fi
# Alert groups
local group_count
group_count=$(collect_alert_groups)
cat <<EOF
# HELP alertmanager_overview_alert_groups_total Total alert group count
# TYPE alertmanager_overview_alert_groups_total gauge
alertmanager_overview_alert_groups_total $group_count
EOF
echo ""
# Alert "resolution rate": computed as suppressed / total, i.e. the same
# formula calculate_silence_coverage uses for the coverage ratio below
local resolution_rate
if [ "$ALERTS_TOTAL" -gt 0 ]; then
resolution_rate=$(awk "BEGIN {printf \"%.4f\", $ALERTS_SUPPRESSED / $ALERTS_TOTAL}")
else
resolution_rate="0"
fi
cat <<EOF
# HELP alertmanager_overview_alert_resolution_rate Ratio of suppressed to total alerts
# TYPE alertmanager_overview_alert_resolution_rate gauge
alertmanager_overview_alert_resolution_rate $resolution_rate
EOF
echo ""
# Silences
cat <<EOF
# HELP alertmanager_overview_silences_active Number of active silences
# TYPE alertmanager_overview_silences_active gauge
alertmanager_overview_silences_active $SILENCES_ACTIVE
# HELP alertmanager_overview_silences_pending Number of pending silences
# TYPE alertmanager_overview_silences_pending gauge
alertmanager_overview_silences_pending $SILENCES_PENDING
# HELP alertmanager_overview_silences_expired Number of expired silences
# TYPE alertmanager_overview_silences_expired gauge
alertmanager_overview_silences_expired $SILENCES_EXPIRED
EOF
echo ""
# Silence coverage
local silence_coverage
silence_coverage=$(calculate_silence_coverage)
cat <<EOF
# HELP alertmanager_overview_silence_coverage_ratio Ratio of silenced alerts to total alerts
# TYPE alertmanager_overview_silence_coverage_ratio gauge
alertmanager_overview_silence_coverage_ratio $silence_coverage
EOF
echo ""
# Cluster health
cat <<EOF
# HELP alertmanager_overview_cluster_peers Number of cluster peers
# TYPE alertmanager_overview_cluster_peers gauge
alertmanager_overview_cluster_peers $CLUSTER_PEERS
EOF
echo ""
# Per-peer health (whole family omitted when there are no peers)
local peer_lines
peer_lines=$(output_peer_metrics)
if [ -n "$peer_lines" ]; then
cat <<EOF
# HELP alertmanager_overview_cluster_peer_healthy Peer health status (1=healthy)
# TYPE alertmanager_overview_cluster_peer_healthy gauge
$peer_lines
EOF
echo ""
fi
# Config hash (carried as a label; the sample value is always 1)
cat <<EOF
# HELP alertmanager_overview_config_hash Config hash for drift detection (first 16 chars of SHA256)
# TYPE alertmanager_overview_config_hash gauge
alertmanager_overview_config_hash{hash="$CONFIG_HASH"} 1
EOF
echo ""
# Notification rate per receiver (omitted when the built-in metrics page
# yielded no matching samples)
local notif_rates
notif_rates=$(output_notification_rates)
if [ -n "$notif_rates" ]; then
cat <<EOF
# HELP alertmanager_overview_notification_rate Total notifications sent per integration
# TYPE alertmanager_overview_notification_rate counter
$notif_rates
EOF
echo ""
fi
# Notification failures per receiver (omitted when there are no samples)
local notif_failures
notif_failures=$(output_notification_failures)
if [ -n "$notif_failures" ]; then
cat <<EOF
# HELP alertmanager_overview_notification_failures Total notification failures per integration
# TYPE alertmanager_overview_notification_failures counter
$notif_failures
EOF
echo ""
fi
# Notification latency per receiver (omitted when there are no samples)
local notif_latency
notif_latency=$(output_notification_latency)
if [ -n "$notif_latency" ]; then
cat <<EOF
# HELP alertmanager_overview_notification_latency_seconds Notification latency sum per integration
# TYPE alertmanager_overview_notification_latency_seconds counter
$notif_latency
EOF
echo ""
fi
# Uptime
cat <<EOF
# HELP alertmanager_overview_uptime_seconds Alertmanager uptime in seconds
# TYPE alertmanager_overview_uptime_seconds gauge
alertmanager_overview_uptime_seconds $AM_UPTIME_SECONDS
EOF
echo ""
# Last config reload
cat <<EOF
# HELP alertmanager_overview_last_config_reload_timestamp Unix timestamp of last successful config reload
# TYPE alertmanager_overview_last_config_reload_timestamp gauge
alertmanager_overview_last_config_reload_timestamp $LAST_RELOAD
EOF
echo ""
# Exporter runtime (whole-second resolution, measured with date +%s)
local script_end script_duration
script_end=$(date +%s)
script_duration=$((script_end - script_start))
cat <<EOF
# HELP alertmanager_overview_exporter_duration_seconds Time to generate all metrics
# TYPE alertmanager_overview_exporter_duration_seconds gauge
alertmanager_overview_exporter_duration_seconds $script_duration
# HELP alertmanager_overview_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE alertmanager_overview_exporter_last_run_timestamp gauge
alertmanager_overview_exporter_last_run_timestamp $script_end
EOF
echo ""
}
# ============================================================================
# HTTP SERVER MODE
# ============================================================================
# Answer a single HTTP request: read it from stdin, write the response to
# stdout. "GET /metrics..." gets the metrics document; anything else gets a
# small HTML index page.
# Globals:  AM_URL (read)
_serve_http_request() {
  local request="" header

  # The timeout keeps a client that connects but never sends a request from
  # blocking this single-threaded server forever.
  if IFS= read -r -t 5 request; then
    # Consume the remaining request headers up to the blank separator line.
    while IFS= read -r -t 5 header; do
      header=${header%$'\r'}
      [[ -n "$header" ]] || break
    done
  fi

  if [[ "$request" =~ ^GET\ /metrics ]]; then
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
    generate_metrics
  else
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
    cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Alertmanager Overview Exporter</title></head>
<body>
<h1>Alertmanager Overview Exporter v1.0</h1>
<p>Alertmanager URL: $AM_URL</p>
<p><a href="/metrics">Metrics</a></p>
</body>
</html>
EOF
  fi
}

# Serve metrics over HTTP using netcat, one connection at a time, forever.
# Globals:  HTTP_PORT, AM_URL (read)
# Returns:  never returns normally; exits 1 when nc is missing or the
#           response FIFO cannot be created
#
# netcat's stdout (the client's request) is piped into the request handler,
# and the handler's stdout is fed back to netcat's stdin through a FIFO. This
# is what lets the handler see the real request line and build the response
# only after a client has connected; a plain "{ ...; } | nc" group would read
# the script's own stdin instead of the client's request.
run_http_server() {
  echo "Starting alertmanager-overview exporter on port $HTTP_PORT..." >&2
  echo "Alertmanager URL: $AM_URL" >&2
  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  local fifo_dir response_fifo
  if ! fifo_dir=$(mktemp -d) || ! mkfifo "$fifo_dir/response"; then
    echo "ERROR: cannot create response FIFO" >&2
    [[ -n "$fifo_dir" ]] && rm -rf -- "$fifo_dir"
    exit 1
  fi
  response_fifo="$fifo_dir/response"
  # shellcheck disable=SC2064 # expand the path now; the local is gone at exit
  trap "rm -rf -- '$fifo_dir'" EXIT

  while true; do
    nc -l -p "$HTTP_PORT" -q 1 <"$response_fifo" 2>/dev/null \
      | _serve_http_request >"$response_fifo"
    # If nc itself failed (e.g. the port is already in use), pause briefly
    # instead of spinning in a tight loop.
    if (( PIPESTATUS[0] != 0 )); then
      sleep 1
    fi
  done
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Script entry point: parse options, verify prerequisites, then run in one of
# three modes - HTTP server, write-to-file, or print-to-stdout.
# Globals:   HTTP_MODE, OUTPUT_FILE (read, after parse_args has set them)
# Arguments: the script's command-line arguments
# Returns:   exits 1 when prerequisites are missing, metric generation fails,
#            or the generated file has fewer than 5 lines
main() {
  parse_args "$@"
  check_requirements || exit 1

  if [ "$HTTP_MODE" = true ]; then
    run_http_server
    return
  fi

  if [ -z "$OUTPUT_FILE" ]; then
    generate_metrics
    return
  fi

  # File mode: write to a temp file in the destination directory, then move
  # it into place so readers never see a partially written file.
  local target_dir staging line_count
  target_dir="$(dirname "$OUTPUT_FILE")"
  mkdir -p "$target_dir"
  staging=$(mktemp "${target_dir}/.alertmanager_overview.XXXXXX")

  if ! generate_metrics > "$staging" 2>/dev/null; then
    rm -f "$staging"
    echo "ERROR: Failed to generate metrics" >&2
    exit 1
  fi

  # Refuse to replace the previous file with a suspiciously short one.
  line_count=$(wc -l < "$staging" 2>/dev/null || echo 0)
  if [ "$line_count" -lt 5 ]; then
    rm -f "$staging"
    echo "ERROR: Metrics file too small ($line_count lines), keeping previous" >&2
    exit 1
  fi

  chmod 644 "$staging"
  mv -f "$staging" "$OUTPUT_FILE"
  echo "Metrics written to $OUTPUT_FILE ($line_count lines)" >&2
}
# Run the script, forwarding every command-line argument unchanged.
main "$@"