#!/bin/bash
################################################################################
# Script Name: alertmanager-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for Alertmanager operational overview.
#              Queries the Alertmanager API for active alerts, silences,
#              cluster health, and config status. Complements the built-in
#              /metrics endpoint with higher-level operational metrics.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - curl
#   - jq
#   - Alertmanager running and accessible
#   - netcat (nc) for HTTP mode
#
# Usage:
#   # Output to stdout
#   ./alertmanager-exporter.sh
#
#   # HTTP server mode
#   ./alertmanager-exporter.sh --http -p 9094
#
#   # Textfile collector mode
#   ./alertmanager-exporter.sh --textfile
#
# Metrics Exported:
#   - alertmanager_overview_up - Exporter status (1=up, 0=down)
#   - alertmanager_overview_info - Alertmanager version info
#   - alertmanager_overview_alerts_active_total - Total active alerts
#   - alertmanager_overview_alerts_by_state - Alerts by state
#   - alertmanager_overview_alerts_by_severity - Alerts by severity
#   - alertmanager_overview_alerts_by_receiver - Alerts by receiver
#   - alertmanager_overview_alert_groups_total - Alert group count
#   - alertmanager_overview_silences_active - Active silences
#   - alertmanager_overview_silences_pending - Pending silences
#   - alertmanager_overview_silences_expired - Expired silences
#   - alertmanager_overview_silence_coverage_ratio - Silence coverage
#   - alertmanager_overview_cluster_peers - Peer count
#   - alertmanager_overview_cluster_peer_healthy - Per-peer health
#   - alertmanager_overview_config_hash - Config hash for drift detection
#   - alertmanager_overview_uptime_seconds - Uptime
#   - alertmanager_overview_last_config_reload_timestamp - Last reload
#   - alertmanager_overview_notification_rate - Notifications per integration
#   - alertmanager_overview_notification_failures - Failed notifications
#   - alertmanager_overview_notification_latency_seconds - Latency (sum)
#   - alertmanager_overview_exporter_duration_seconds - Script duration
#   - alertmanager_overview_exporter_last_run_timestamp - Last run time
#
# Configuration:
#   Default HTTP port: 9094
#   Textfile directory: /var/lib/node_exporter
#   Alertmanager URL: http://localhost:9093
#
# NOTE(review): the copy of this script that was reviewed had lost its line
# breaks and the contents of every here-document. show_usage, parse_args, the
# output section of generate_metrics, the HTTP startup banner and the HTML
# index page were reconstructed from the surviving fragments and from the
# header above. Confirm option names and the exact metric layout against the
# upstream script before release.
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9094
AM_URL="http://localhost:9093"

# ============================================================================
# RUNTIME STATE (populated by the collect_* functions)
# ============================================================================

ALERTS_JSON="[]"
ALERTS_TOTAL=0
ALERTS_ACTIVE=0
ALERTS_SUPPRESSED=0
ALERTS_UNPROCESSED=0
SILENCES_ACTIVE=0
SILENCES_PENDING=0
SILENCES_EXPIRED=0
CLUSTER_PEERS=0
CLUSTER_PEERS_JSON="[]"
CLUSTER_STATUS="unknown"
AM_VERSION="unknown"
AM_UPTIME_SECONDS=0
CONFIG_HASH="0"
LAST_RELOAD=0
NOTIFICATION_METRICS=""

# Paths removed by cleanup() when the script exits.
TEXTFILE_TMP=""
HTTP_FIFO_DIR=""

# jq helper that escapes a value for use inside a Prometheus label
# (backslash, double quote and newline). It uses split/join instead of gsub
# so it also works on jq builds compiled without regex support.
JQ_LABEL_ESCAPE='def esc: tostring | split("\\") | join("\\\\") | split("\"") | join("\\\"") | split("\n") | join("\\n");'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print command-line help to stdout.
# NOTE(review): text reconstructed; the original here-document was lost.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Prometheus exporter for Alertmanager operational overview.

Options:
    -h, --help          Show this help message and exit
    --http              Run as an HTTP server (requires netcat)
    -p, --port PORT     HTTP port (default: $HTTP_PORT)
    --textfile          Write metrics to $TEXTFILE_DIR/alertmanager_overview.prom
    -o, --output FILE   Write metrics to FILE
    -u, --url URL       Alertmanager URL (default: $AM_URL)

Examples:
    $0
    $0 --http -p 9094
    $0 --textfile
EOF
}

# Parse command-line options into the configuration globals.
# Globals:   HTTP_MODE, HTTP_PORT, OUTPUT_FILE, AM_URL (written)
# Arguments: the script's command-line arguments
# Exits:     0 after --help; 1 on an unknown option, a missing option
#            argument or an invalid port
# NOTE(review): option set reconstructed from the usage examples in the file
# header; only the unknown-option arm survived in the reviewed copy.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                exit 0
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            --textfile)
                OUTPUT_FILE="${TEXTFILE_DIR}/alertmanager_overview.prom"
                shift
                ;;
            -p|--port|-o|--output|-u|--url)
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: option $1 requires an argument" >&2
                    exit 1
                fi
                case "$1" in
                    -p|--port)   HTTP_PORT="$2" ;;
                    -o|--output) OUTPUT_FILE="$2" ;;
                    -u|--url)    AM_URL="${2%/}" ;;  # endpoints start with "/"
                esac
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done

    # 10# forces base 10 so a value such as "09094" is not parsed as octal.
    if ! [[ "$HTTP_PORT" =~ ^[0-9]+$ ]] ||
        (( 10#$HTTP_PORT < 1 || 10#$HTTP_PORT > 65535 )); then
        echo "ERROR: invalid port: $HTTP_PORT" >&2
        exit 1
    fi
}

# Check prerequisites.
# Returns: 0 when curl and jq are both available, 1 otherwise
check_requirements() {
    local missing=0 tool
    for tool in curl jq; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            echo "ERROR: $tool not found" >&2
            missing=1
        fi
    done
    return "$missing"
}

# Query an Alertmanager API endpoint.
# Args:    $1 - endpoint path (e.g., /api/v2/alerts)
# Outputs: the response body on stdout (nothing on failure)
# Returns: curl's exit status (non-zero on connection or HTTP errors)
am_api() {
    local endpoint="$1"
    curl -sf --connect-timeout 5 --max-time 10 "${AM_URL}${endpoint}" 2>/dev/null
}

# Fetch Alertmanager's built-in Prometheus /metrics page, using the same
# timeouts as am_api so an unresponsive server cannot stall the exporter.
am_builtin_metrics() {
    curl -sf --connect-timeout 5 --max-time 10 "${AM_URL}/metrics" 2>/dev/null
}

# Escape a string for use as a Prometheus label value.
# Args:    $1 - raw value
# Outputs: the value with backslash, double quote and newline escaped
escape_label_value() {
    local value=$1
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}
    printf '%s' "$value"
}

# Print the "# HELP" and "# TYPE" lines for a metric family.
# Args: $1 - metric name, $2 - metric type, $3 - help text
metric_header() {
    printf '# HELP %s %s\n# TYPE %s %s\n' "$1" "$3" "$1" "$2"
}

# Print a metric family (header plus sample lines), or nothing at all when
# there are no samples.
# Args: $1 - metric name, $2 - metric type, $3 - help text, $4 - sample lines
emit_section() {
    [[ -n "$4" ]] || return 0
    metric_header "$1" "$2" "$3"
    printf '%s\n' "$4"
}

# Remove temporary files; installed as an EXIT trap by the modes that
# create them.
cleanup() {
    if [[ -n "${TEXTFILE_TMP:-}" ]]; then
        rm -f -- "$TEXTFILE_TMP"
    fi
    if [[ -n "${HTTP_FIFO_DIR:-}" ]]; then
        rm -rf -- "$HTTP_FIFO_DIR"
    fi
    return 0
}

# ============================================================================
# METRIC COLLECTION FUNCTIONS
# ============================================================================

# Get alert counts by state.
# Globals: ALERTS_JSON, ALERTS_TOTAL, ALERTS_ACTIVE, ALERTS_SUPPRESSED,
#          ALERTS_UNPROCESSED (written; reset to "[]"/0 on failure)
# Returns: 0 on success, 1 when the API call fails or the response is not a
#          JSON array of alerts
collect_alerts() {
    local alerts_json counts

    ALERTS_JSON="[]"
    ALERTS_TOTAL=0
    ALERTS_ACTIVE=0
    ALERTS_SUPPRESSED=0
    ALERTS_UNPROCESSED=0

    alerts_json=$(am_api "/api/v2/alerts")
    [[ -n "$alerts_json" ]] || return 1

    # One jq pass yields all four counters as a tab-separated row.
    counts=$(jq -r '
        if type != "array" then error("expected an array") else . end
        | [ length,
            (map(select(.status.state == "active")) | length),
            (map(select(.status.state == "suppressed")) | length),
            (map(select(.status.state == "unprocessed")) | length)
          ]
        | @tsv
    ' <<<"$alerts_json" 2>/dev/null) || return 1

    ALERTS_JSON="$alerts_json"
    IFS=$'\t' read -r ALERTS_TOTAL ALERTS_ACTIVE ALERTS_SUPPRESSED \
        ALERTS_UNPROCESSED <<<"$counts"
}

# Get alert counts by severity label.
# Globals: ALERTS_JSON (read)
# Outputs: one sample line for each of critical/warning/info, plus an
#          "other" line when any alert has a different or missing severity
collect_alerts_by_severity() {
    local severity count
    for severity in critical warning info; do
        count=$(jq --arg sev "$severity" \
            '[.[] | select(.labels.severity == $sev)] | length' \
            <<<"${ALERTS_JSON:-[]}" 2>/dev/null)
        echo "alertmanager_overview_alerts_by_severity{severity=\"$severity\"} ${count:-0}"
    done

    # Count alerts with no severity or other severity values
    count=$(jq \
        '[.[] | select(.labels.severity != "critical" and .labels.severity != "warning" and .labels.severity != "info")] | length' \
        <<<"${ALERTS_JSON:-[]}" 2>/dev/null)
    # Default to 0 so a jq failure cannot turn into a test(1) syntax error.
    if [[ "${count:-0}" -gt 0 ]]; then
        echo "alertmanager_overview_alerts_by_severity{severity=\"other\"} $count"
    fi
}

# Get alert counts by receiver.
# Globals: ALERTS_JSON (read)
# Outputs: one sample line per receiver name; alerts without any receiver
#          are counted under "unknown"
collect_alerts_by_receiver() {
    jq -r "${JQ_LABEL_ESCAPE}"'
        [.[] | .receivers[]?.name // "unknown"]
        | group_by(.)
        | map({receiver: .[0], count: length})
        | .[]
        | "alertmanager_overview_alerts_by_receiver{receiver=\"\(.receiver | esc)\"} \(.count)"
    ' <<<"${ALERTS_JSON:-[]}" 2>/dev/null
}

# Get alert group count.
# Outputs: the number of alert groups (0 when the API call fails)
collect_alert_groups() {
    local groups_json count
    groups_json=$(am_api "/api/v2/alerts/groups")
    count=$(jq 'length' <<<"${groups_json:-[]}" 2>/dev/null)
    echo "${count:-0}"
}

# Get silence counts by state.
# Globals: SILENCES_ACTIVE, SILENCES_PENDING, SILENCES_EXPIRED (written;
#          reset to 0 on failure)
# Returns: 0 on success, 1 when the API call fails or the response cannot be
#          parsed
collect_silences() {
    local silences_json counts

    SILENCES_ACTIVE=0
    SILENCES_PENDING=0
    SILENCES_EXPIRED=0

    silences_json=$(am_api "/api/v2/silences")
    [[ -n "$silences_json" ]] || return 1

    counts=$(jq -r '
        if type != "array" then error("expected an array") else . end
        | [ (map(select(.status.state == "active")) | length),
            (map(select(.status.state == "pending")) | length),
            (map(select(.status.state == "expired")) | length)
          ]
        | @tsv
    ' <<<"$silences_json" 2>/dev/null) || return 1

    IFS=$'\t' read -r SILENCES_ACTIVE SILENCES_PENDING SILENCES_EXPIRED \
        <<<"$counts"
}

# Calculate silence coverage ratio.
# Globals: ALERTS_TOTAL, ALERTS_SUPPRESSED (read)
# Outputs: suppressed / total with four decimals, or "0" when there are no
#          alerts or the counters are unset
calculate_silence_coverage() {
    local total="${ALERTS_TOTAL:-0}" suppressed="${ALERTS_SUPPRESSED:-0}"
    if [[ "$total" =~ ^[0-9]+$ ]] && [ "$total" -gt 0 ]; then
        # Values are passed with -v rather than pasted into the awk program;
        # LC_ALL=C keeps "." as the decimal separator in every locale.
        LC_ALL=C awk -v s="$suppressed" -v t="$total" \
            'BEGIN { printf "%.4f", s / t }'
    else
        echo "0"
    fi
}

# Get notification metrics from the built-in /metrics endpoint.
# Globals: NOTIFICATION_METRICS (written; empty on failure)
# Returns: 0 on success, 1 when the endpoint is unreachable or empty
collect_notification_metrics() {
    NOTIFICATION_METRICS=$(am_builtin_metrics)
    [[ -n "$NOTIFICATION_METRICS" ]]
}

# Get cluster status, version, uptime, config hash and last reload time.
# Globals: CLUSTER_PEERS, CLUSTER_PEERS_JSON, CLUSTER_STATUS, AM_VERSION,
#          AM_UPTIME_SECONDS, CONFIG_HASH, LAST_RELOAD (written; reset to
#          their "unknown"/0 defaults on failure)
#          NOTIFICATION_METRICS (read; reused when already fetched)
# Returns: 0 on success, 1 when /api/v2/status is unreachable or does not
#          return a JSON object
collect_cluster_status() {
    local status_json start_time start_epoch now_epoch
    local config_text builtin_metrics reload_ts

    CLUSTER_PEERS=0
    CLUSTER_PEERS_JSON="[]"
    CLUSTER_STATUS="unknown"
    AM_VERSION="unknown"
    AM_UPTIME_SECONDS=0
    CONFIG_HASH="0"
    LAST_RELOAD=0

    status_json=$(am_api "/api/v2/status")
    [[ -n "$status_json" ]] || return 1
    jq -e 'type == "object"' <<<"$status_json" >/dev/null 2>&1 || return 1

    AM_VERSION=$(jq -r '.versionInfo.version // "unknown"' \
        <<<"$status_json" 2>/dev/null)
    AM_VERSION=${AM_VERSION:-unknown}

    # Cluster info
    # shellcheck disable=SC2034  # reserved for future use
    CLUSTER_STATUS=$(jq -r '.cluster.status // "disabled"' \
        <<<"$status_json" 2>/dev/null)

    # Peer details (for per-peer health metrics)
    CLUSTER_PEERS_JSON=$(jq -c '.cluster.peers // []' \
        <<<"$status_json" 2>/dev/null)
    CLUSTER_PEERS_JSON=${CLUSTER_PEERS_JSON:-[]}
    CLUSTER_PEERS=$(jq 'length' <<<"$CLUSTER_PEERS_JSON" 2>/dev/null)
    CLUSTER_PEERS=${CLUSTER_PEERS:-0}

    # Uptime: the status document's "uptime" field is handed to `date -d`
    # as a start time. `date -d` is a GNU extension; where it is not
    # available the uptime stays 0.
    start_time=$(jq -r '.uptime // empty' <<<"$status_json" 2>/dev/null)
    if [[ -n "$start_time" ]]; then
        start_epoch=$(date -d "$start_time" +%s 2>/dev/null) || start_epoch=0
        now_epoch=$(date +%s)
        if [[ "$start_epoch" =~ ^[0-9]+$ ]] &&
            (( start_epoch > 0 && now_epoch >= start_epoch )); then
            AM_UPTIME_SECONDS=$(( now_epoch - start_epoch ))
        fi
    fi

    # Config hash: first 16 hex digits of the SHA-256 of the original
    # configuration text, for drift detection.
    config_text=$(jq -r '.config.original // ""' <<<"$status_json" 2>/dev/null)
    if [[ -n "$config_text" ]] && command -v sha256sum >/dev/null 2>&1; then
        CONFIG_HASH=$(printf '%s\n' "$config_text" | sha256sum | cut -c1-16)
    fi

    # Last config reload is not available from /api/v2/status, so it is
    # taken from the built-in /metrics page (fetched here only when
    # collect_notification_metrics has not already cached it).
    builtin_metrics=${NOTIFICATION_METRICS:-$(am_builtin_metrics)}
    reload_ts=$(awk '
        $1 == "alertmanager_config_last_reload_success_timestamp_seconds" {
            print $2
            exit
        }' <<<"$builtin_metrics")
    LAST_RELOAD=${reload_ts:-0}

    return 0
}

# Output per-peer health metrics.
# Globals: CLUSTER_PEERS, CLUSTER_PEERS_JSON (read)
# Outputs: one sample line with value 1 for every peer listed in the status
#          document
output_peer_metrics() {
    if [[ "${CLUSTER_PEERS:-0}" -eq 0 || -z "${CLUSTER_PEERS_JSON:-}" ]]; then
        return
    fi

    jq -r "${JQ_LABEL_ESCAPE}"'
        .[]
        | "alertmanager_overview_cluster_peer_healthy{peer=\"\(.address // "unknown" | esc)\"} 1"
    ' <<<"$CLUSTER_PEERS_JSON" 2>/dev/null
}

# Re-export the labelled samples of one built-in metric under a new name.
# Globals: NOTIFICATION_METRICS (read)
# Args:    $1 - built-in metric name, $2 - exported metric name
relabel_builtin_metric() {
    local from=$1 to=$2
    if [[ -z "${NOTIFICATION_METRICS:-}" ]]; then
        return
    fi
    sed -n "s/^${from}{/${to}{/p" <<<"$NOTIFICATION_METRICS" 2>/dev/null
}

# Output notification totals per integration (from built-in metrics).
# The exported values are the raw counters; apply rate() in PromQL.
output_notification_rates() {
    relabel_builtin_metric \
        "alertmanager_notifications_total" \
        "alertmanager_overview_notification_rate"
}

# Output notification failures per integration (from built-in metrics).
output_notification_failures() {
    relabel_builtin_metric \
        "alertmanager_notifications_failed_total" \
        "alertmanager_overview_notification_failures"
}

# Output notification latency per integration (from built-in metrics).
# This re-exports the cumulative _sum series unchanged; it is not an average.
output_notification_latency() {
    relabel_builtin_metric \
        "alertmanager_notification_latency_seconds_sum" \
        "alertmanager_overview_notification_latency_seconds"
}

# ============================================================================
# METRIC OUTPUT
# ============================================================================

# Collect everything and print it in Prometheus text exposition format.
# When Alertmanager is unreachable only the "up" and exporter self-metrics
# are printed, so consumers never see fabricated zero counts.
# NOTE(review): the output section was lost in the reviewed copy. Metric
# names follow the list in the file header; types, help strings and the
# label layout of alertmanager_overview_config_hash are reconstructed.
generate_metrics() {
    local script_start run_end
    script_start=$(date +%s)

    # Fetch /metrics first so collect_cluster_status can reuse it.
    collect_notification_metrics

    # Check if Alertmanager is reachable
    local am_up=1
    if ! collect_cluster_status; then
        am_up=0
    fi

    metric_header alertmanager_overview_up gauge \
        "Exporter status (1=up, 0=down)"
    echo "alertmanager_overview_up $am_up"

    if (( am_up == 1 )); then
        local group_count coverage
        collect_alerts
        collect_silences
        group_count=$(collect_alert_groups)
        coverage=$(calculate_silence_coverage)

        metric_header alertmanager_overview_info gauge \
            "Alertmanager version info"
        echo "alertmanager_overview_info{version=\"$(escape_label_value "$AM_VERSION")\"} 1"

        metric_header alertmanager_overview_alerts_active_total gauge \
            "Total active alerts"
        echo "alertmanager_overview_alerts_active_total ${ALERTS_ACTIVE:-0}"

        metric_header alertmanager_overview_alerts_by_state gauge \
            "Alerts by state"
        echo "alertmanager_overview_alerts_by_state{state=\"active\"} ${ALERTS_ACTIVE:-0}"
        echo "alertmanager_overview_alerts_by_state{state=\"suppressed\"} ${ALERTS_SUPPRESSED:-0}"
        echo "alertmanager_overview_alerts_by_state{state=\"unprocessed\"} ${ALERTS_UNPROCESSED:-0}"

        emit_section alertmanager_overview_alerts_by_severity gauge \
            "Alerts by severity" "$(collect_alerts_by_severity)"
        emit_section alertmanager_overview_alerts_by_receiver gauge \
            "Alerts by receiver" "$(collect_alerts_by_receiver)"

        metric_header alertmanager_overview_alert_groups_total gauge \
            "Alert group count"
        echo "alertmanager_overview_alert_groups_total ${group_count:-0}"

        metric_header alertmanager_overview_silences_active gauge \
            "Active silences"
        echo "alertmanager_overview_silences_active ${SILENCES_ACTIVE:-0}"
        metric_header alertmanager_overview_silences_pending gauge \
            "Pending silences"
        echo "alertmanager_overview_silences_pending ${SILENCES_PENDING:-0}"
        metric_header alertmanager_overview_silences_expired gauge \
            "Expired silences"
        echo "alertmanager_overview_silences_expired ${SILENCES_EXPIRED:-0}"

        metric_header alertmanager_overview_silence_coverage_ratio gauge \
            "Silence coverage"
        echo "alertmanager_overview_silence_coverage_ratio ${coverage:-0}"

        metric_header alertmanager_overview_cluster_peers gauge \
            "Peer count"
        echo "alertmanager_overview_cluster_peers ${CLUSTER_PEERS:-0}"
        emit_section alertmanager_overview_cluster_peer_healthy gauge \
            "Per-peer health" "$(output_peer_metrics)"

        # The hash is hexadecimal text, which is not a valid sample value,
        # so it is carried in a label.
        metric_header alertmanager_overview_config_hash gauge \
            "Config hash for drift detection"
        echo "alertmanager_overview_config_hash{hash=\"$(escape_label_value "$CONFIG_HASH")\"} 1"

        metric_header alertmanager_overview_uptime_seconds gauge \
            "Uptime"
        echo "alertmanager_overview_uptime_seconds ${AM_UPTIME_SECONDS:-0}"

        metric_header alertmanager_overview_last_config_reload_timestamp gauge \
            "Last reload"
        echo "alertmanager_overview_last_config_reload_timestamp ${LAST_RELOAD:-0}"

        emit_section alertmanager_overview_notification_rate counter \
            "Notifications sent per integration" \
            "$(output_notification_rates)"
        emit_section alertmanager_overview_notification_failures counter \
            "Failed notifications per integration" \
            "$(output_notification_failures)"
        emit_section alertmanager_overview_notification_latency_seconds untyped \
            "Cumulative notification latency per integration" \
            "$(output_notification_latency)"
    fi

    run_end=$(date +%s)
    metric_header alertmanager_overview_exporter_duration_seconds gauge \
        "Script duration"
    echo "alertmanager_overview_exporter_duration_seconds $(( run_end - script_start ))"
    metric_header alertmanager_overview_exporter_last_run_timestamp gauge \
        "Last run time"
    echo "alertmanager_overview_exporter_last_run_timestamp $run_end"

    return 0
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Serve one HTTP request: read it from stdin, write the response to stdout.
# "GET /metrics..." returns the metrics; anything else returns an HTML
# index page.
# NOTE(review): the HTML markup was lost in the reviewed copy and is
# reconstructed around the surviving text.
handle_http_request() {
    local request header

    # End of input without a request line: the listener went away.
    IFS= read -r request || [[ -n "$request" ]] || return 0

    # Consume the remaining request headers so the listener is never left
    # writing into a closed pipe. The timeout covers clients that do not
    # send the terminating blank line.
    while IFS= read -r -t 2 header; do
        header=${header%$'\r'}
        [[ -n "$header" ]] || break
    done

    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
        generate_metrics
    else
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
        cat <<EOF
<html>
<head><title>Alertmanager Overview Exporter</title></head>
<body>
<h1>Alertmanager Overview Exporter v1.0</h1>
<p>Alertmanager URL: $AM_URL</p>
<p><a href="/metrics">Metrics</a></p>
</body>
</html>
EOF
    fi
}

# Run a minimal single-connection HTTP server on HTTP_PORT using netcat.
# Does not return; exits 1 when nc is missing or the FIFO cannot be created.
run_http_server() {
    echo "Starting Alertmanager Overview Exporter on port $HTTP_PORT" >&2
    echo "Alertmanager URL: $AM_URL" >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # The handler must read what the client sent, which is nc's *output*.
    # Piping the handler into nc only connects handler output to nc input,
    # so a FIFO carries the request back from nc to the handler.
    trap cleanup EXIT
    HTTP_FIFO_DIR=$(mktemp -d) || {
        echo "ERROR: cannot create temporary directory" >&2
        exit 1
    }
    local fifo="${HTTP_FIFO_DIR}/request.fifo"
    mkfifo "$fifo" || {
        echo "ERROR: cannot create FIFO $fifo" >&2
        exit 1
    }

    while true; do
        # Pause after a listener failure (e.g. port already in use) instead
        # of spinning.
        handle_http_request <"$fifo" |
            nc -l -p "$HTTP_PORT" -q 1 >"$fifo" 2>/dev/null ||
            sleep 1
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: parse options, verify prerequisites, then run in HTTP,
# textfile or stdout mode.
main() {
    parse_args "$@"

    if ! check_requirements; then
        exit 1
    fi

    if [[ "$HTTP_MODE" == true ]]; then
        run_http_server
    elif [[ -n "$OUTPUT_FILE" ]]; then
        local output_dir file_lines
        output_dir="$(dirname "$OUTPUT_FILE")"
        if ! mkdir -p "$output_dir"; then
            echo "ERROR: cannot create directory $output_dir" >&2
            exit 1
        fi

        # Write to a temp file in the destination directory and rename it
        # into place, so readers never see a partial file. The EXIT trap
        # removes the temp file on every failure path.
        trap cleanup EXIT
        TEXTFILE_TMP=$(mktemp "${output_dir}/.alertmanager_overview.XXXXXX") || {
            echo "ERROR: cannot create temporary file in $output_dir" >&2
            exit 1
        }

        if ! generate_metrics >"$TEXTFILE_TMP" 2>/dev/null; then
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        file_lines=$(wc -l <"$TEXTFILE_TMP" 2>/dev/null || echo 0)
        file_lines=${file_lines//[[:space:]]/}  # some wc builds pad the count
        if [[ "${file_lines:-0}" -lt 5 ]]; then
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        chmod 644 "$TEXTFILE_TMP"
        if ! mv -f "$TEXTFILE_TMP" "$OUTPUT_FILE"; then
            echo "ERROR: cannot write $OUTPUT_FILE" >&2
            exit 1
        fi
        TEXTFILE_TMP=""
        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        generate_metrics
    fi
}

main "$@"