Files
linux-scripts/alb-health-reporter.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

575 lines
22 KiB
Bash
Executable File

#!/usr/bin/env bash
#########################################################################################
#### alb-health-reporter.sh — Check AWS ALB/NLB target group health and alert ####
#### Reports unhealthy targets, CloudWatch metrics, and sends SNS alerts ####
#### Requires: bash 4+, aws-cli v2, jq ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version 1.00 ####
#### ####
#### Usage: ####
#### ./alb-health-reporter.sh --check ####
#### ####
#### See --help for all options. ####
#########################################################################################
# Strict mode: stop on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# ── Defaults (each one may be pre-set through the environment) ────────
: "${AWS_REGION:=}"
: "${ALB_NAME:=}"
: "${TARGET_GROUP:=}"
: "${SNS_TOPIC_ARN:=}"
: "${OUTPUT_FORMAT:=text}"
: "${VERBOSE:=false}"
: "${COLOR:=auto}"
# ── State ─────────────────────────────────────────────────────────────
# Name this script was invoked as; used in the help text.
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
# Selected mode and the epoch second at which the run started (set in main).
RUN_MODE=""
START_TIME=""
# Target tallies accumulated by do_check.
TOTAL_TARGETS=0
HEALTHY_COUNT=0
UNHEALTHY_COUNT=0
DRAINING_COUNT=0
# ── Colors ────────────────────────────────────────────────────────────
# Set the global color variables RED, GREEN, YELLOW, BLUE, BOLD, DIM, RESET.
# Globals: COLOR (read); the seven color variables (written).
# Escape sequences are assigned when COLOR is "always", or when COLOR is
# anything other than "never" and stdout is a terminal. In every other case
# all seven variables become empty strings.
setup_colors() {
  local use_color="no"
  if [[ "$COLOR" != "never" ]]; then
    if [[ "$COLOR" == "always" || -t 1 ]]; then
      use_color="yes"
    fi
  fi

  if [[ "$use_color" == "yes" ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  else
    RED=""
    GREEN=""
    YELLOW=""
    BLUE=""
    BOLD=""
    DIM=""
    RESET=""
  fi
}
# ── Logging ───────────────────────────────────────────────────────────
# Logging helpers. All of them read the color globals set by setup_colors.
#   log     - informational message on stdout
#   warn    - warning on stderr
#   err     - error on stderr
#   verbose - debug message, printed only when VERBOSE is "true"
#   die     - print an error and terminate the script with status 1
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
# Debug output must go to stderr: verbose is called from aws_cmd, whose stdout
# is captured with $(...) and parsed as JSON. Writing the trace to stdout
# would prepend "[DEBUG] ..." to that JSON and break every caller.
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*" >&2; fi; }
die() { err "$*"; exit 1; }
# Print the number of whole seconds since START_TIME, followed by "s".
# Globals: START_TIME (read) - epoch seconds recorded when the run began.
elapsed() {
  local now
  now=$(date +%s)
  printf '%ss\n' "$(( now - START_TIME ))"
}
# ── AWS CLI wrapper ───────────────────────────────────────────────────
# Run the AWS CLI with the given arguments, appending "--region <AWS_REGION>"
# when AWS_REGION is non-empty.
# Globals:   AWS_REGION (read)
# Arguments: the aws sub-command and its options
# Outputs:   whatever the aws CLI prints; the debug trace goes to stderr
# Returns:   the exit status of the aws CLI
aws_cmd() {
  local args=("$@")
  if [[ -n "$AWS_REGION" ]]; then
    args+=(--region "$AWS_REGION")
  fi
  # Callers capture this function's stdout and parse it, so the trace line is
  # forced onto stderr to keep it out of the captured data.
  verbose "aws ${args[*]}" >&2
  aws "${args[@]}"
}
# ── Dependency check ──────────────────────────────────────────────────
# Verify that the script can run: required tools are installed, AWS
# credentials work, and a region is known.
# Globals: AWS_REGION (read; written when empty, from `aws configure get region`)
# Exits (through die) with status 1 when a tool is missing, credentials are
# rejected, or no region can be determined.
check_deps() {
  # Keep the loop variable local so it does not leak into global scope.
  local tool
  for tool in aws jq; do
    if ! command -v "$tool" &>/dev/null; then
      die "${tool} is required but not installed"
    fi
  done
  if ! aws sts get-caller-identity &>/dev/null; then
    die "AWS credentials not configured or expired"
  fi
  if [[ -z "$AWS_REGION" ]]; then
    # Fall back to the region stored in the AWS CLI configuration.
    AWS_REGION=$(aws configure get region 2>/dev/null || echo "")
    if [[ -z "$AWS_REGION" ]]; then
      die "AWS_REGION is required"
    fi
  fi
  verbose "Using region: ${AWS_REGION}"
}
# ── Get load balancers ────────────────────────────────────────────────
# Print a JSON array describing load balancers, one object per balancer with
# the keys ARN, Name, Type, State and DNSName.
# Globals: ALB_NAME (read) - when non-empty, only that balancer is requested.
# Returns: the status of the underlying aws_cmd call; its stderr is discarded.
get_load_balancers() {
  local lb_fields='LoadBalancers[*].{ARN:LoadBalancerArn,Name:LoadBalancerName,Type:Type,State:State.Code,DNSName:DNSName}'
  local -a request=(elbv2 describe-load-balancers)
  if [[ -n "$ALB_NAME" ]]; then
    request+=(--names "$ALB_NAME")
  fi
  request+=(--query "$lb_fields" --output json)
  aws_cmd "${request[@]}" 2>/dev/null
}
# ── Get target groups for a load balancer ─────────────────────────────
# Print a JSON array of target groups, one object per group with the keys
# ARN, Name, Protocol, Port and HealthPath.
# Globals:   TARGET_GROUP (read) - when non-empty, that ARN is looked up
#            directly and the load balancer argument is not used for the query.
# Arguments: $1 - load balancer ARN whose target groups are listed otherwise
# Returns:   the status of the underlying aws_cmd call; its stderr is discarded.
get_target_groups() {
  local lb_arn="$1"
  local tg_fields='TargetGroups[*].{ARN:TargetGroupArn,Name:TargetGroupName,Protocol:Protocol,Port:Port,HealthPath:HealthCheckPath}'
  local -a selector
  if [[ -n "$TARGET_GROUP" ]]; then
    selector=(--target-group-arns "$TARGET_GROUP")
  else
    selector=(--load-balancer-arn "$lb_arn")
  fi
  aws_cmd elbv2 describe-target-groups "${selector[@]}" \
    --query "$tg_fields" \
    --output json 2>/dev/null
}
# ══════════════════════════════════════════════════════════════════════
# CHECK MODE
# ══════════════════════════════════════════════════════════════════════
# Report the health of every target behind every load balancer.
#
# Walks load balancers -> target groups -> targets, prints one table per
# target group, then a summary.
#
# Globals: TOTAL_TARGETS, HEALTHY_COUNT, UNHEALTHY_COUNT, DRAINING_COUNT
#          (incremented); color variables (read).
# Returns: 0  all targets healthy, or nothing to check
#          1  at least one target draining, none unhealthy
#          2  at least one target unhealthy
#          3  an AWS query failed and no unhealthy target was seen
#
# The loops read from process substitution rather than a pipeline. A
# `cmd | while ...` loop runs in a subshell, so every counter update would be
# discarded when it ends, leaving the summary and return code at zero.
do_check() {
  log "Checking target group health..."

  local lbs_json lb_count
  if ! lbs_json=$(get_load_balancers); then
    # An explicit check is required: when this function is called as
    # `do_check || ...`, errexit is suspended and a failed query would
    # otherwise look like "no load balancers".
    err "Failed to list load balancers"
    return 3
  fi
  lb_count=$(jq 'length' <<<"$lbs_json")
  if [[ "$lb_count" -eq 0 ]]; then
    log "No load balancers found"
    return 0
  fi
  log "Found ${lb_count} load balancer(s)"

  local query_failed=0
  # Target group ARNs already reported. When TARGET_GROUP is set, the same
  # group is returned for every load balancer and must be counted only once.
  local -A seen_tgs=()
  local lb lb_arn lb_name lb_type lb_state tgs_json tg_count
  local tg tg_arn tg_name tg_proto tg_port health_json target_count
  local target tid tport tstate treason icon color

  while IFS= read -r lb; do
    lb_arn=$(jq -r '.ARN' <<<"$lb")
    lb_name=$(jq -r '.Name' <<<"$lb")
    lb_type=$(jq -r '.Type' <<<"$lb")
    lb_state=$(jq -r '.State' <<<"$lb")
    echo ""
    echo -e " ${BOLD}${lb_name}${RESET} (${lb_type}, ${lb_state})"

    if ! tgs_json=$(get_target_groups "$lb_arn"); then
      warn "Failed to list target groups for ${lb_name}"
      query_failed=1
      continue
    fi
    tg_count=$(jq 'length' <<<"$tgs_json")
    if [[ "$tg_count" -eq 0 ]]; then
      echo " No target groups"
      continue
    fi

    while IFS= read -r tg; do
      tg_arn=$(jq -r '.ARN' <<<"$tg")
      if [[ -n "$tg_arn" ]]; then
        if [[ -n "${seen_tgs[$tg_arn]:-}" ]]; then
          continue
        fi
        seen_tgs[$tg_arn]=1
      fi
      tg_name=$(jq -r '.Name' <<<"$tg")
      tg_proto=$(jq -r '.Protocol' <<<"$tg")
      tg_port=$(jq -r '.Port' <<<"$tg")
      echo ""
      echo -e " ${BOLD}Target Group: ${tg_name}${RESET} (${tg_proto}:${tg_port})"

      if ! health_json=$(aws_cmd elbv2 describe-target-health \
        --target-group-arn "$tg_arn" \
        --query 'TargetHealthDescriptions[*].{Id:Target.Id,Port:Target.Port,State:TargetHealth.State,Reason:TargetHealth.Reason,Desc:TargetHealth.Description}' \
        --output json 2>/dev/null); then
        warn "Failed to read target health for ${tg_name}"
        query_failed=1
        continue
      fi
      target_count=$(jq 'length' <<<"$health_json")
      if [[ -z "$target_count" || "$target_count" -eq 0 ]]; then
        echo -e " ${YELLOW}No registered targets${RESET}"
        continue
      fi
      TOTAL_TARGETS=$((TOTAL_TARGETS + target_count))

      printf " ${BOLD}%-22s %-8s %-12s %s${RESET}\n" "TARGET" "PORT" "STATE" "REASON"
      printf " %s\n" "$(printf '%.0s─' {1..60})"

      while IFS= read -r target; do
        tid=$(jq -r '.Id' <<<"$target")
        tport=$(jq -r '.Port' <<<"$target")
        tstate=$(jq -r '.State' <<<"$target")
        treason=$(jq -r '.Reason // "-"' <<<"$target")
        # NOTE(review): the icon strings for the three known states hold no
        # glyph between the color codes in this copy of the file - confirm
        # whether a symbol was lost.
        case "$tstate" in
          healthy)
            icon="${GREEN}${RESET}"
            color="$GREEN"
            HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
            ;;
          unhealthy)
            icon="${RED}${RESET}"
            color="$RED"
            UNHEALTHY_COUNT=$((UNHEALTHY_COUNT + 1))
            ;;
          draining)
            icon="${YELLOW}${RESET}"
            color="$YELLOW"
            DRAINING_COUNT=$((DRAINING_COUNT + 1))
            ;;
          *)
            icon="${DIM}?${RESET}"
            color="$DIM"
            ;;
        esac
        printf " ${icon} %-20s %-8s ${color}%-12s${RESET} %s\n" "$tid" "$tport" "$tstate" "$treason"
      done < <(jq -c '.[]' <<<"$health_json")
    done < <(jq -c '.[]' <<<"$tgs_json")
  done < <(jq -c '.[]' <<<"$lbs_json")

  echo ""
  echo -e " ${BOLD}Summary${RESET}"
  echo " Total targets: ${TOTAL_TARGETS}"
  echo -e " Healthy: ${GREEN}${HEALTHY_COUNT}${RESET}"
  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    echo -e " Unhealthy: ${RED}${UNHEALTHY_COUNT}${RESET}"
  else
    echo " Unhealthy: 0"
  fi
  if [[ "$DRAINING_COUNT" -gt 0 ]]; then
    echo -e " Draining: ${YELLOW}${DRAINING_COUNT}${RESET}"
  else
    echo " Draining: 0"
  fi
  log "Completed in $(elapsed)"

  # Unhealthy targets take priority so that alerting still fires when some
  # other query failed along the way.
  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    return 2
  elif [[ "$query_failed" -ne 0 ]]; then
    return 3
  elif [[ "$DRAINING_COUNT" -gt 0 ]]; then
    return 1
  fi
  return 0
}
# ══════════════════════════════════════════════════════════════════════
# LIST MODE
# ══════════════════════════════════════════════════════════════════════
# List load balancers, either as a table (default) or as raw JSON.
# Globals: OUTPUT_FORMAT (read); GREEN, YELLOW, BOLD, RESET (read).
# Outputs: in json mode stdout carries only the JSON document and the
#          progress message goes to stderr, so the output can be piped into
#          another JSON consumer. In text mode everything goes to stdout.
do_list() {
  local lbs_json
  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    log "Listing load balancers..." >&2
    lbs_json=$(get_load_balancers)
    jq '.' <<<"$lbs_json"
    return
  fi

  log "Listing load balancers..."
  lbs_json=$(get_load_balancers)
  echo ""
  printf " ${BOLD}%-30s %-12s %-10s %s${RESET}\n" "NAME" "TYPE" "STATE" "DNS NAME"
  printf " %s\n" "$(printf '%.0s─' {1..90})"

  local lb lb_name lb_type lb_state dns_name color
  while IFS= read -r lb; do
    lb_name=$(jq -r '.Name' <<<"$lb")
    lb_type=$(jq -r '.Type' <<<"$lb")
    lb_state=$(jq -r '.State' <<<"$lb")
    dns_name=$(jq -r '.DNSName' <<<"$lb")
    # Anything other than "active" is highlighted as a warning.
    color="$GREEN"
    if [[ "$lb_state" != "active" ]]; then
      color="$YELLOW"
    fi
    # Name and DNS name are truncated to keep the columns aligned.
    printf " %-30s %-12s ${color}%-10s${RESET} %s\n" \
      "${lb_name:0:30}" "$lb_type" "$lb_state" "${dns_name:0:50}"
  done < <(jq -c '.[]' <<<"$lbs_json")

  local count
  count=$(jq 'length' <<<"$lbs_json")
  echo ""
  log "Total: ${count} load balancer(s)"
}
# ══════════════════════════════════════════════════════════════════════
# METRICS MODE
# ══════════════════════════════════════════════════════════════════════
# Fetch one CloudWatch statistic for a load balancer over a single
# 3600-second period.
# Arguments: $1 namespace, $2 metric name, $3 statistic (Sum, Average, ...),
#            $4 LoadBalancer dimension value, $5 start time, $6 end time
# Outputs:   the value as printed by the aws CLI in text mode ("None" when
#            there is no datapoint), or "N/A" when the call fails
_lb_metric_1h() {
  local namespace="$1" metric="$2" statistic="$3" lb_dim="$4" start="$5" end="$6"
  local value
  value=$(aws_cmd cloudwatch get-metric-statistics \
    --namespace "$namespace" \
    --metric-name "$metric" \
    --dimensions "Name=LoadBalancer,Value=${lb_dim}" \
    --start-time "$start" \
    --end-time "$end" \
    --period 3600 \
    --statistics "$statistic" \
    --query "Datapoints[0].${statistic}" \
    --output text 2>/dev/null) || value="N/A"
  printf '%s\n' "$value"
}

# Print request count, target 5XX count and average target response time for
# the last hour, for every load balancer returned by get_load_balancers.
# Globals: RED, BOLD, RESET (read).
# NOTE(review): the same three metric names are requested from
# AWS/NetworkELB for network load balancers - confirm they exist in that
# namespace.
do_metrics() {
  log "Fetching CloudWatch metrics (last 1 hour)..."
  local lbs_json
  lbs_json=$(get_load_balancers)

  local now one_hour_ago
  now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  # Try the `-d` form first, then the `-v` form; if neither works fall back
  # to an empty window rather than failing.
  one_hour_ago=$(date -u -d "-1 hour" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || one_hour_ago=$(date -u -v-1H +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || one_hour_ago="$now"

  local lb lb_arn lb_name lb_type lb_suffix namespace
  local req_count err_5xx resp_time resp_ms
  while IFS= read -r lb; do
    lb_arn=$(jq -r '.ARN' <<<"$lb")
    lb_name=$(jq -r '.Name' <<<"$lb")
    lb_type=$(jq -r '.Type' <<<"$lb")
    # The CloudWatch dimension is the ARN part after "loadbalancer/".
    lb_suffix=${lb_arn##*loadbalancer/}
    echo ""
    echo -e " ${BOLD}${lb_name}${RESET}"

    namespace="AWS/ApplicationELB"
    if [[ "$lb_type" == "network" ]]; then
      namespace="AWS/NetworkELB"
    fi

    # Request count: no datapoint means no traffic.
    req_count=$(_lb_metric_1h "$namespace" "RequestCount" Sum "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$req_count" == "None" ]]; then
      req_count="0"
    fi
    echo " Request count (1h): ${req_count}"

    # 5xx errors: highlight only when the value is numerically above zero.
    # The CLI prints floats, so "0.0" must not be treated as an error count.
    err_5xx=$(_lb_metric_1h "$namespace" "HTTPCode_Target_5XX_Count" Sum "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$err_5xx" == "None" ]]; then
      err_5xx="0"
    fi
    if [[ "$err_5xx" != "N/A" ]] && awk -v v="$err_5xx" 'BEGIN { exit !(v + 0 > 0) }'; then
      echo -e " 5XX errors (1h): ${RED}${err_5xx}${RESET}"
    else
      echo " 5XX errors (1h): ${err_5xx}"
    fi

    # Response time is reported in seconds; show milliseconds.
    resp_time=$(_lb_metric_1h "$namespace" "TargetResponseTime" Average "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$resp_time" == "None" ]]; then
      resp_time="N/A"
    fi
    if [[ "$resp_time" != "N/A" ]]; then
      # The value is passed with -v so it is never parsed as awk code.
      resp_ms=$(awk -v t="$resp_time" 'BEGIN { printf "%.1f", t * 1000 }' 2>/dev/null || echo "$resp_time")
      echo " Avg response time: ${resp_ms}ms"
    else
      echo " Avg response time: N/A"
    fi
  done < <(jq -c '.[]' <<<"$lbs_json")

  echo ""
  log "Metrics collected in $(elapsed)"
}
# ══════════════════════════════════════════════════════════════════════
# ALERT MODE
# ══════════════════════════════════════════════════════════════════════
# Run the health check and, when it reports unhealthy targets (status 2),
# publish a summary to the configured SNS topic.
# Globals: SNS_TOPIC_ARN, AWS_REGION, TOTAL_TARGETS, HEALTHY_COUNT,
#          UNHEALTHY_COUNT, DRAINING_COUNT, GREEN, RESET (all read).
# Always terminates the script, using do_check's status as the exit code.
do_alert() {
  local check_exit=0
  do_check || check_exit=$?

  # Only status 2 (unhealthy targets) leads to a notification.
  if [[ "$check_exit" -ne 2 ]]; then
    exit "$check_exit"
  fi
  if [[ -z "$SNS_TOPIC_ARN" ]]; then
    warn "Unhealthy targets found but no --sns-topic specified"
    exit "$check_exit"
  fi

  log "Sending SNS alert for ${UNHEALTHY_COUNT} unhealthy target(s)..."
  local subject="ALB Health Alert: ${UNHEALTHY_COUNT} unhealthy target(s) in ${AWS_REGION}"

  local sent_at origin_host
  sent_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  origin_host=$(hostname -f 2>/dev/null || hostname)
  local -a body=(
    "ALB Health Reporter Alert"
    "Region: ${AWS_REGION}"
    "Time: ${sent_at}"
    "Hostname: ${origin_host}"
    "Summary:"
    "Total targets: ${TOTAL_TARGETS}"
    "Healthy: ${HEALTHY_COUNT}"
    "Unhealthy: ${UNHEALTHY_COUNT}"
    "Draining: ${DRAINING_COUNT}"
    "Action required: ${UNHEALTHY_COUNT} target(s) are unhealthy."
    "Run: alb-health-reporter.sh --check for details."
  )
  local message
  printf -v message '%s\n' "${body[@]}"
  # Drop the newline printf added after the final line.
  message="${message%$'\n'}"

  # The subject is cut to 100 characters before publishing.
  if ! aws_cmd sns publish \
    --topic-arn "$SNS_TOPIC_ARN" \
    --subject "${subject:0:100}" \
    --message "$message" \
    --output text &>/dev/null; then
    warn "Failed to send SNS alert"
  else
    echo -e " ${GREEN}${RESET} SNS alert sent to ${SNS_TOPIC_ARN}"
  fi
  exit "$check_exit"
}
# ══════════════════════════════════════════════════════════════════════
# PROMETHEUS OUTPUT
# ══════════════════════════════════════════════════════════════════════
# Emit the target tallies as gauges in Prometheus text format.
# Globals: AWS_REGION, TOTAL_TARGETS, HEALTHY_COUNT, UNHEALTHY_COUNT,
#          DRAINING_COUNT (read after do_check has run).
# Each metric is written as a HELP line, a TYPE line and one sample labelled
# with the region.
print_prometheus() {
  # The check is run only for its effect on the counters; all of its output
  # and its exit status are discarded.
  do_check > /dev/null 2>&1 || true

  local ts
  ts=$(date +%s)

  local region_label="{region=\"${AWS_REGION}\"}"
  local spec metric help value
  # Each entry is "metric|help text|value".
  for spec in \
    "alb_targets_total|Total registered targets|${TOTAL_TARGETS}" \
    "alb_targets_healthy|Healthy targets|${HEALTHY_COUNT}" \
    "alb_targets_unhealthy|Unhealthy targets|${UNHEALTHY_COUNT}" \
    "alb_targets_draining|Draining targets|${DRAINING_COUNT}" \
    "alb_health_check_timestamp_seconds|Last health check timestamp|${ts}"; do
    IFS='|' read -r metric help value <<<"$spec"
    printf '# HELP %s %s\n' "$metric" "$help"
    printf '# TYPE %s gauge\n' "$metric"
    printf '%s%s %s\n' "$metric" "$region_label" "$value"
  done
}
# ══════════════════════════════════════════════════════════════════════
# HELP
# ══════════════════════════════════════════════════════════════════════
# Print the usage text to stdout.
# Globals: SCRIPT_NAME (read) - substituted into the usage line and examples.
show_help() {
  local me="$SCRIPT_NAME"
  cat <<HELP_TEXT
Usage: ${me} [MODE] [OPTIONS]
Check AWS ALB/NLB target group health and alert on failures.
MODES:
--check Check target health (default)
--list List load balancers
--metrics Show CloudWatch metrics (last 1 hour)
--alert Check health and send SNS alert if unhealthy
OPTIONS:
--alb-name NAME Filter by load balancer name
--target-group ARN Check a specific target group
--sns-topic ARN SNS topic ARN for alerts (used with --alert)
--format FORMAT Output: text (default), json, prometheus
--verbose Debug output
--no-color Disable colored output
--help, -h Show this help
EXIT CODES:
0 All targets healthy
1 Targets draining (warning)
2 Unhealthy targets found
ENVIRONMENT VARIABLES:
AWS_PROFILE AWS CLI profile
AWS_REGION AWS region
ALB_NAME Load balancer name filter
TARGET_GROUP Target group ARN
SNS_TOPIC_ARN SNS topic for alerts
OUTPUT_FORMAT Output format (default: text)
VERBOSE Debug output (default: false)
COLOR Color mode: auto, always, never
EXAMPLES:
# Check all ALBs
./${me} --check
# Check specific ALB
./${me} --check --alb-name my-web-alb
# List load balancers
./${me} --list
# CloudWatch metrics
./${me} --metrics
# Alert on unhealthy targets
./${me} --alert --sns-topic arn:aws:sns:us-east-1:123456789:alerts
# Prometheus output
./${me} --check --format prometheus
HELP_TEXT
}
# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
# Entry point: parse the command line, print the run banner, verify the
# environment and dispatch to the selected mode.
# Globals:   RUN_MODE, ALB_NAME, TARGET_GROUP, SNS_TOPIC_ARN, OUTPUT_FORMAT,
#            VERBOSE, COLOR, START_TIME (written); AWS_REGION (read).
# Arguments: the script's command-line arguments
# Returns:   the status of the selected mode function. Exits 0 after --help
#            and 1 (through die) on a bad option.
main() {
  # The color variables must exist before the first possible die/err call.
  # Under `set -u`, reporting an unknown option would otherwise abort with
  # "RED: unbound variable" instead of the intended message.
  setup_colors

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --check) RUN_MODE="check"; shift ;;
      --list) RUN_MODE="list"; shift ;;
      --metrics) RUN_MODE="metrics"; shift ;;
      --alert) RUN_MODE="alert"; shift ;;
      --alb-name | --target-group | --sns-topic | --format)
        # Give a clear message instead of an unbound "$2" error when the
        # option is the last word on the command line.
        if [[ $# -lt 2 ]]; then
          die "Option $1 requires a value (see --help)"
        fi
        case "$1" in
          --alb-name) ALB_NAME="$2" ;;
          --target-group) TARGET_GROUP="$2" ;;
          --sns-topic) SNS_TOPIC_ARN="$2" ;;
          --format) OUTPUT_FORMAT="$2" ;;
        esac
        shift 2
        ;;
      --verbose) VERBOSE="true"; shift ;;
      --no-color) COLOR="never"; shift ;;
      --help | -h) show_help; exit 0 ;;
      *) die "Unknown option: $1 (see --help)" ;;
    esac
  done

  # Evaluate colors again now that --no-color may have changed COLOR.
  setup_colors

  if [[ -z "$RUN_MODE" ]]; then
    RUN_MODE="check"
  fi
  START_TIME=$(date +%s)

  # For machine-readable formats the banner goes to stderr so that stdout
  # carries nothing but the requested document.
  local banner_fd=1
  if [[ "$OUTPUT_FORMAT" != "text" ]]; then
    banner_fd=2
  fi
  {
    echo ""
    echo -e "${BOLD}ALB Health Reporter${RESET}"
    echo "Region: ${AWS_REGION:-$(aws configure get region 2>/dev/null || echo 'default')}"
    echo "Mode: ${RUN_MODE}"
    echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
  } >&"$banner_fd"

  check_deps

  # Prometheus output replaces whichever mode was selected.
  if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then
    print_prometheus
    return
  fi
  case "$RUN_MODE" in
    check) do_check ;;
    list) do_list ;;
    metrics) do_metrics ;;
    alert) do_alert ;;
  esac
}
# Run the script: pass every command-line argument through to main unchanged.
main "$@"