#!/usr/bin/env bash
#########################################################################################
#### alb-health-reporter.sh — Check AWS ALB/NLB target group health and alert        ####
#### Reports unhealthy targets, CloudWatch metrics, and sends SNS alerts             ####
#### Requires: bash 4+, aws-cli v2, jq                                               ####
####                                                                                 ####
#### Author:  Phil Connor                                                            ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.00                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   ./alb-health-reporter.sh --check                                              ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

set -euo pipefail

# ── Defaults (each may be preset through the environment) ─────────────
AWS_REGION="${AWS_REGION:-}"
ALB_NAME="${ALB_NAME:-}"
TARGET_GROUP="${TARGET_GROUP:-}"
SNS_TOPIC_ARN="${SNS_TOPIC_ARN:-}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
START_TIME=""
UNHEALTHY_COUNT=0
HEALTHY_COUNT=0
DRAINING_COUNT=0
TOTAL_TARGETS=0

# Colour variables start empty so the logging helpers are safe under
# `set -u` even when they run before setup_colors (e.g. a die during
# argument parsing).
RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""

# ── Colors ────────────────────────────────────────────────────────────

#######################################
# Populate the colour variables according to COLOR.
# Globals:  COLOR (read); RED GREEN YELLOW BLUE BOLD DIM RESET (written)
# COLOR=never  -> always empty
# COLOR=always -> always ANSI sequences
# otherwise    -> ANSI sequences only when stdout is a terminal
#######################################
setup_colors() {
  RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""
  if [[ "$COLOR" == "never" ]]; then
    return 0
  fi
  if [[ "$COLOR" == "always" || -t 1 ]]; then
    # $'…' stores the real ESC byte, so the values work with plain
    # printf '%s' and never have to be part of a format string.
    RED=$'\033[0;31m'
    GREEN=$'\033[0;32m'
    YELLOW=$'\033[0;33m'
    BLUE=$'\033[0;34m'
    BOLD=$'\033[1m'
    DIM=$'\033[2m'
    RESET=$'\033[0m'
  fi
}

# ── Logging ───────────────────────────────────────────────────────────

# Informational message. Goes to stdout for the human-readable "text"
# format, and to stderr otherwise so json/prometheus output on stdout
# stays machine-parseable.
log() {
  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    printf '%s[INFO]%s %s\n' "$BLUE" "$RESET" "$*"
  else
    printf '%s[INFO]%s %s\n' "$BLUE" "$RESET" "$*" >&2
  fi
}

warn() { printf '%s[WARN]%s %s\n' "$YELLOW" "$RESET" "$*" >&2; }
err() { printf '%s[ERROR]%s %s\n' "$RED" "$RESET" "$*" >&2; }

# Debug message, only when VERBOSE=true. Must write to stderr: aws_cmd
# calls this while its stdout is being captured as JSON.
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf '%s[DEBUG]%s %s\n' "$DIM" "$RESET" "$*" >&2
  fi
}

die() {
  err "$*"
  exit 1
}

# Print the seconds elapsed since START_TIME as "<n>s".
# An empty START_TIME yields "0s" instead of an arithmetic syntax error.
elapsed() {
  local end_time
  end_time=$(date +%s)
  echo "$((end_time - ${START_TIME:-$end_time}))s"
}

# ── AWS CLI wrapper ───────────────────────────────────────────────────

#######################################
# Run `aws` with the given arguments, appending --region when AWS_REGION
# is set.
# Arguments: the aws sub-command and its options
# Outputs:   whatever aws prints; the command line goes to the debug log
# Returns:   exit status of aws
#######################################
aws_cmd() {
  local args=("$@")
  if [[ -n "$AWS_REGION" ]]; then
    args+=(--region "$AWS_REGION")
  fi
  verbose "aws ${args[*]}"
  aws "${args[@]}"
}

# ── Dependency check ──────────────────────────────────────────────────

# Verify that aws and jq exist, that credentials work, and that a region
# is known (falling back to the CLI's configured region). Exits via die
# on any failure.
check_deps() {
  local cmd
  for cmd in aws jq; do
    if ! command -v "$cmd" &>/dev/null; then
      die "${cmd} is required but not installed"
    fi
  done

  if ! aws sts get-caller-identity &>/dev/null; then
    die "AWS credentials not configured or expired"
  fi

  if [[ -z "$AWS_REGION" ]]; then
    AWS_REGION=$(aws configure get region 2>/dev/null || echo "")
    if [[ -z "$AWS_REGION" ]]; then
      die "AWS_REGION is required"
    fi
  fi

  verbose "Using region: ${AWS_REGION}"
}

# ── Get load balancers ────────────────────────────────────────────────

# Print a JSON array of {ARN,Name,Type,State,DNSName} objects, limited to
# ALB_NAME when that is set. aws diagnostics are left on stderr so a
# failure is explainable; callers check the exit status.
get_load_balancers() {
  local query_args=(elbv2 describe-load-balancers)
  if [[ -n "$ALB_NAME" ]]; then
    query_args+=(--names "$ALB_NAME")
  fi

  aws_cmd "${query_args[@]}" \
    --query 'LoadBalancers[*].{ARN:LoadBalancerArn,Name:LoadBalancerName,Type:Type,State:State.Code,DNSName:DNSName}' \
    --output json
}

# ── Get target groups for a load balancer ─────────────────────────────

#######################################
# Print a JSON array of {ARN,Name,Protocol,Port,HealthPath} objects for
# one load balancer.
# Arguments: $1 - load balancer ARN
# Globals:   TARGET_GROUP (read) - optional target group ARN filter
# Returns:   exit status of aws
#######################################
get_target_groups() {
  local lb_arn="$1"
  local fields='{ARN:TargetGroupArn,Name:TargetGroupName,Protocol:Protocol,Port:Port,HealthPath:HealthCheckPath}'

  if [[ -n "$TARGET_GROUP" ]]; then
    # Keep the requested group only when it is attached to this load
    # balancer; otherwise the same group would be reported (and its
    # targets counted) once per load balancer.
    aws_cmd elbv2 describe-target-groups \
      --target-group-arns "$TARGET_GROUP" \
      --query "TargetGroups[?contains(LoadBalancerArns, '${lb_arn}')].${fields}" \
      --output json
    return
  fi

  aws_cmd elbv2 describe-target-groups \
    --load-balancer-arn "$lb_arn" \
    --query "TargetGroups[*].${fields}" \
    --output json
}

# ══════════════════════════════════════════════════════════════════════
# CHECK MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Print the health table for one target group and add its targets to the
# global counters.
# Arguments: $1 ARN, $2 name, $3 protocol, $4 port
# Globals:   TOTAL_TARGETS HEALTHY_COUNT UNHEALTHY_COUNT DRAINING_COUNT (written)
# Returns:   0 on success, 1 when target health could not be read
#######################################
_report_target_group() {
  local tg_arn="$1" tg_name="$2" tg_proto="$3" tg_port="$4"
  local health_json target_count

  printf '\n    %sTarget Group: %s%s (%s:%s)\n' \
    "$BOLD" "$tg_name" "$RESET" "$tg_proto" "$tg_port"

  if ! health_json=$(aws_cmd elbv2 describe-target-health \
    --target-group-arn "$tg_arn" \
    --query 'TargetHealthDescriptions[*].{Id:Target.Id,Port:Target.Port,State:TargetHealth.State,Reason:TargetHealth.Reason,Desc:TargetHealth.Description}' \
    --output json) ||
    ! target_count=$(jq 'length' <<<"$health_json") ||
    [[ -z "$target_count" ]]; then
    warn "Could not read target health for ${tg_name}"
    return 1
  fi

  TOTAL_TARGETS=$((TOTAL_TARGETS + target_count))
  if [[ "$target_count" -eq 0 ]]; then
    printf '      %sNo registered targets%s\n' "$YELLOW" "$RESET"
    return 0
  fi

  printf '      %s%-22s %-8s %-12s %s%s\n' \
    "$BOLD" "TARGET" "PORT" "STATE" "REASON" "$RESET"
  printf '      %s\n' "$(printf '%.0s─' {1..60})"

  local tid tport tstate treason icon color
  # The loop reads from process substitution on its own descriptor:
  # a `cmd | while` pipeline would run the body in a subshell and throw
  # the counter updates away, and a private fd keeps the rows safe from
  # anything in the body that reads stdin. Null fields become "-" so the
  # tab-separated rows never contain an empty column.
  while IFS=$'\t' read -r -u 5 tid tport tstate treason; do
    case "$tstate" in
      healthy)
        icon="${GREEN}✓${RESET}"
        color="$GREEN"
        HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
        ;;
      unhealthy)
        icon="${RED}✗${RESET}"
        color="$RED"
        UNHEALTHY_COUNT=$((UNHEALTHY_COUNT + 1))
        ;;
      draining)
        icon="${YELLOW}⊘${RESET}"
        color="$YELLOW"
        DRAINING_COUNT=$((DRAINING_COUNT + 1))
        ;;
      *)
        # Any other state is shown but counted only in TOTAL_TARGETS.
        icon="${DIM}?${RESET}"
        color="$DIM"
        ;;
    esac
    printf '      %s %-20s %-8s %s%-12s%s %s\n' \
      "$icon" "$tid" "$tport" "$color" "$tstate" "$RESET" "$treason"
  done 5< <(jq -r '.[] | [.Id, .Port, .State, .Reason] | map(. // "-" | tostring) | @tsv' <<<"$health_json")

  return 0
}

#######################################
# Report target health for every selected load balancer and print a
# summary.
# Globals: TOTAL_TARGETS HEALTHY_COUNT UNHEALTHY_COUNT DRAINING_COUNT (reset, written)
# Returns: 0 all targets healthy (or nothing to check)
#          1 at least one draining target, none unhealthy
#          2 at least one unhealthy target
#          3 an AWS query failed and no unhealthy target was seen
#######################################
do_check() {
  log "Checking target group health..."

  TOTAL_TARGETS=0 HEALTHY_COUNT=0 UNHEALTHY_COUNT=0 DRAINING_COUNT=0
  local api_errors=0

  # Failures are checked explicitly: when do_check is called as
  # `do_check || …`, set -e is suspended and an unnoticed AWS error would
  # otherwise read as "no load balancers", i.e. healthy.
  local lbs_json lb_count
  if ! lbs_json=$(get_load_balancers) ||
    ! lb_count=$(jq 'length' <<<"$lbs_json") ||
    [[ -z "$lb_count" ]]; then
    err "Failed to list load balancers"
    return 3
  fi

  if [[ "$lb_count" -eq 0 ]]; then
    log "No load balancers found"
    return 0
  fi
  log "Found ${lb_count} load balancer(s)"

  local lb_arn lb_name lb_type lb_state tgs_json tg_count
  local tg_arn tg_name tg_proto tg_port
  while IFS=$'\t' read -r -u 3 lb_arn lb_name lb_type lb_state; do
    printf '\n  %s%s%s (%s, %s)\n' "$BOLD" "$lb_name" "$RESET" "$lb_type" "$lb_state"

    if ! tgs_json=$(get_target_groups "$lb_arn") ||
      ! tg_count=$(jq 'length' <<<"$tgs_json") ||
      [[ -z "$tg_count" ]]; then
      warn "Could not list target groups for ${lb_name}"
      api_errors=$((api_errors + 1))
      continue
    fi

    if [[ "$tg_count" -eq 0 ]]; then
      echo "    No target groups"
      continue
    fi

    while IFS=$'\t' read -r -u 4 tg_arn tg_name tg_proto tg_port; do
      if ! _report_target_group "$tg_arn" "$tg_name" "$tg_proto" "$tg_port"; then
        api_errors=$((api_errors + 1))
      fi
    done 4< <(jq -r '.[] | [.ARN, .Name, .Protocol, .Port] | map(. // "-" | tostring) | @tsv' <<<"$tgs_json")
  done 3< <(jq -r '.[] | [.ARN, .Name, .Type, .State] | map(. // "-" | tostring) | @tsv' <<<"$lbs_json")

  echo ""
  printf '  %sSummary%s\n' "$BOLD" "$RESET"
  echo "    Total targets: ${TOTAL_TARGETS}"
  printf '    Healthy:       %s%s%s\n' "$GREEN" "$HEALTHY_COUNT" "$RESET"
  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    printf '    Unhealthy:     %s%s%s\n' "$RED" "$UNHEALTHY_COUNT" "$RESET"
  else
    echo "    Unhealthy:     0"
  fi
  if [[ "$DRAINING_COUNT" -gt 0 ]]; then
    printf '    Draining:      %s%s%s\n' "$YELLOW" "$DRAINING_COUNT" "$RESET"
  else
    echo "    Draining:      0"
  fi

  log "Completed in $(elapsed)"

  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    return 2
  elif [[ "$api_errors" -gt 0 ]]; then
    err "${api_errors} AWS query/queries failed; results are incomplete"
    return 3
  elif [[ "$DRAINING_COUNT" -gt 0 ]]; then
    return 1
  fi
  return 0
}

# ══════════════════════════════════════════════════════════════════════
# LIST MODE
# ══════════════════════════════════════════════════════════════════════

# List the selected load balancers, as pretty-printed JSON when
# OUTPUT_FORMAT=json and as a table otherwise.
do_list() {
  log "Listing load balancers..."

  local lbs_json
  lbs_json=$(get_load_balancers) || die "Failed to list load balancers"

  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    jq '.' <<<"$lbs_json"
    return
  fi

  echo ""
  printf '  %s%-30s %-12s %-10s %s%s\n' \
    "$BOLD" "NAME" "TYPE" "STATE" "DNS NAME" "$RESET"
  printf '  %s\n' "$(printf '%.0s─' {1..90})"

  local lb_name lb_type lb_state dns_name color
  while IFS=$'\t' read -r -u 3 lb_name lb_type lb_state dns_name; do
    color="$GREEN"
    if [[ "$lb_state" != "active" ]]; then
      color="$YELLOW"
    fi
    # Name and DNS name are truncated to keep the columns aligned.
    printf '  %-30s %-12s %s%-10s%s %s\n' \
      "${lb_name:0:30}" "$lb_type" "$color" "$lb_state" "$RESET" "${dns_name:0:50}"
  done 3< <(jq -r '.[] | [.Name, .Type, .State, .DNSName] | map(. // "-" | tostring) | @tsv' <<<"$lbs_json")

  local count
  count=$(jq 'length' <<<"$lbs_json")
  echo ""
  log "Total: ${count} load balancer(s)"
}

# ══════════════════════════════════════════════════════════════════════
# METRICS MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Fetch one CloudWatch statistic for a load balancer as a single
# 3600-second datapoint.
# Arguments: $1 namespace, $2 metric name, $3 statistic (Sum, Average…),
#            $4 LoadBalancer dimension value, $5 start time, $6 end time
# Outputs:   the value as printed by the CLI ("None" when there is no
#            datapoint), or "N/A" when the call fails
#######################################
_lb_metric() {
  local namespace="$1" metric="$2" stat="$3" lb_dim="$4" start="$5" end="$6"
  local value
  # Best effort by design: a failed metric lookup is reported as N/A
  # rather than aborting the whole report.
  value=$(aws_cmd cloudwatch get-metric-statistics \
    --namespace "$namespace" \
    --metric-name "$metric" \
    --dimensions "Name=LoadBalancer,Value=${lb_dim}" \
    --start-time "$start" \
    --end-time "$end" \
    --period 3600 \
    --statistics "$stat" \
    --query "Datapoints[0].${stat}" \
    --output text 2>/dev/null) || value="N/A"
  printf '%s\n' "$value"
}

# Print request count, target 5XX count and average response time over
# the last hour for every selected load balancer.
do_metrics() {
  log "Fetching CloudWatch metrics (last 1 hour)..."

  local lbs_json
  lbs_json=$(get_load_balancers) || die "Failed to list load balancers"

  local now one_hour_ago
  now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  # GNU date first, then BSD/macOS date; fall back to "now" if neither
  # relative-time syntax is supported.
  one_hour_ago=$(date -u -d "-1 hour" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) ||
    one_hour_ago=$(date -u -v-1H +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) ||
    one_hour_ago="$now"

  local lb_arn lb_name lb_type lb_suffix namespace
  local req_count err_5xx resp_time resp_ms
  while IFS=$'\t' read -r -u 3 lb_arn lb_name lb_type; do
    # The CloudWatch LoadBalancer dimension is the part of the ARN after
    # "loadbalancer/".
    lb_suffix=${lb_arn##*loadbalancer/}

    echo ""
    printf '  %s%s%s\n' "$BOLD" "$lb_name" "$RESET"

    namespace="AWS/ApplicationELB"
    if [[ "$lb_type" == "network" ]]; then
      # NOTE(review): the three metric names queried below are the ALB
      # ones; confirm they exist in the NLB namespace, otherwise network
      # load balancers will always show 0 / 0 / N/A.
      namespace="AWS/NetworkELB"
    fi

    # Request count
    req_count=$(_lb_metric "$namespace" "RequestCount" Sum "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$req_count" == "None" ]]; then
      req_count="0"
    fi
    echo "    Request count (1h): ${req_count}"

    # 5xx errors — compared numerically because the CLI prints a zero sum
    # as "0.0", which a string comparison with "0" would treat as errors.
    err_5xx=$(_lb_metric "$namespace" "HTTPCode_Target_5XX_Count" Sum "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$err_5xx" == "None" ]]; then
      err_5xx="0"
    fi
    if awk -v v="$err_5xx" 'BEGIN { exit !(v + 0 > 0) }'; then
      printf '    5XX errors (1h):    %s%s%s\n' "$RED" "$err_5xx" "$RESET"
    else
      echo "    5XX errors (1h):    ${err_5xx}"
    fi

    # Response time (reported by CloudWatch in seconds, shown in ms)
    resp_time=$(_lb_metric "$namespace" "TargetResponseTime" Average "$lb_suffix" "$one_hour_ago" "$now")
    if [[ "$resp_time" == "None" || "$resp_time" == "N/A" ]]; then
      echo "    Avg response time:  N/A"
    else
      # The value is passed with -v, never spliced into the awk program.
      resp_ms=$(awk -v t="$resp_time" 'BEGIN { printf "%.1f", t * 1000 }' 2>/dev/null || echo "$resp_time")
      echo "    Avg response time:  ${resp_ms}ms"
    fi
  done 3< <(jq -r '.[] | [.ARN, .Name, .Type] | map(. // "-" | tostring) | @tsv' <<<"$lbs_json")

  echo ""
  log "Metrics collected in $(elapsed)"
}

# ══════════════════════════════════════════════════════════════════════
# ALERT MODE
# ══════════════════════════════════════════════════════════════════════

# Run the health check and, when it reports unhealthy targets (status 2),
# publish a summary to SNS_TOPIC_ARN. Always exits with do_check's status.
do_alert() {
  local check_exit=0
  do_check || check_exit=$?

  if [[ "$check_exit" -eq 2 && -n "$SNS_TOPIC_ARN" ]]; then
    log "Sending SNS alert for ${UNHEALTHY_COUNT} unhealthy target(s)..."

    local subject="ALB Health Alert: ${UNHEALTHY_COUNT} unhealthy target(s) in ${AWS_REGION}"
    local message
    message="ALB Health Reporter Alert

Region:   ${AWS_REGION}
Time:     $(date -u +%Y-%m-%dT%H:%M:%SZ)
Hostname: $(hostname -f 2>/dev/null || hostname)

Summary:
  Total targets: ${TOTAL_TARGETS}
  Healthy:       ${HEALTHY_COUNT}
  Unhealthy:     ${UNHEALTHY_COUNT}
  Draining:      ${DRAINING_COUNT}

Action required: ${UNHEALTHY_COUNT} target(s) are unhealthy.
Run: alb-health-reporter.sh --check for details."

    # The subject is cut to 100 characters before publishing.
    if aws_cmd sns publish \
      --topic-arn "$SNS_TOPIC_ARN" \
      --subject "${subject:0:100}" \
      --message "$message" \
      --output text &>/dev/null; then
      printf '  %s✓%s SNS alert sent to %s\n' "$GREEN" "$RESET" "$SNS_TOPIC_ARN"
    else
      warn "Failed to send SNS alert"
    fi
  elif [[ "$check_exit" -eq 2 && -z "$SNS_TOPIC_ARN" ]]; then
    warn "Unhealthy targets found but no --sns-topic specified"
  fi

  exit "$check_exit"
}

# ══════════════════════════════════════════════════════════════════════
# PROMETHEUS OUTPUT
# ══════════════════════════════════════════════════════════════════════

# Emit the target counters in Prometheus text exposition format.
# NOTE(review): the heredoc body of this function was lost in the copy
# under review (everything after "cat <"). The metric names and labels
# below are a reconstruction built from the four counters and the `ts`
# value the surviving code computes — confirm them against the original
# before relying on existing dashboards or alert rules.
print_prometheus() {
  # Run check silently to collect counts
  do_check >/dev/null 2>&1 || true

  local ts
  ts=$(date +%s)

  cat <<EOF
# HELP alb_targets_total Total number of registered targets
# TYPE alb_targets_total gauge
alb_targets_total{region="${AWS_REGION}"} ${TOTAL_TARGETS}
# HELP alb_targets_healthy Number of healthy targets
# TYPE alb_targets_healthy gauge
alb_targets_healthy{region="${AWS_REGION}"} ${HEALTHY_COUNT}
# HELP alb_targets_unhealthy Number of unhealthy targets
# TYPE alb_targets_unhealthy gauge
alb_targets_unhealthy{region="${AWS_REGION}"} ${UNHEALTHY_COUNT}
# HELP alb_targets_draining Number of draining targets
# TYPE alb_targets_draining gauge
alb_targets_draining{region="${AWS_REGION}"} ${DRAINING_COUNT}
# HELP alb_health_last_check_timestamp_seconds Unix time of the last check
# TYPE alb_health_last_check_timestamp_seconds gauge
alb_health_last_check_timestamp_seconds{region="${AWS_REGION}"} ${ts}
EOF
}

# ══════════════════════════════════════════════════════════════════════
# USAGE / ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════
# NOTE(review): usage() and parse_args() were missing from the copy under
# review. Only --check and --help (file header) and --sns-topic (do_alert
# warning) are named by surviving code; the other option spellings are
# reconstructed from the variables and RUN_MODE values the script uses,
# with common aliases accepted. Confirm against the original.

usage() {
  cat <<EOF
Usage: ${SCRIPT_NAME} MODE [OPTIONS]

Modes:
  --check               Report target health for each target group
  --list                List load balancers
  --metrics             Show CloudWatch metrics for the last hour
  --alert               Run --check and publish an SNS alert when targets are unhealthy

Options:
  --region REGION       AWS region (env: AWS_REGION)
  --alb NAME            Limit to one load balancer by name (env: ALB_NAME)
  --target-group ARN    Limit to one target group ARN (env: TARGET_GROUP)
  --sns-topic ARN       SNS topic for --alert (env: SNS_TOPIC_ARN)
  --format FORMAT       text, json or prometheus (env: OUTPUT_FORMAT)
  --color WHEN          auto, always or never (env: COLOR)
  --no-color            Same as --color never
  -v, --verbose         Print debug output on stderr (env: VERBOSE)
  -h, --help            Show this help

Exit status of --check / --alert:
  0 healthy, 1 draining targets, 2 unhealthy targets, 3 AWS query failed
EOF
}

# Abort unless the option in $1 is followed by a value.
_require_value() {
  [[ $# -ge 2 ]] || die "Option $1 requires a value (see --help)"
}

# Parse the command line into RUN_MODE and the configuration globals.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --check) RUN_MODE="check" ;;
      --list) RUN_MODE="list" ;;
      --metrics) RUN_MODE="metrics" ;;
      --alert) RUN_MODE="alert" ;;
      -r | --region)
        _require_value "$@"
        AWS_REGION="$2"
        shift
        ;;
      --alb | --alb-name | --name)
        _require_value "$@"
        ALB_NAME="$2"
        shift
        ;;
      --target-group | --tg)
        _require_value "$@"
        TARGET_GROUP="$2"
        shift
        ;;
      --sns-topic)
        _require_value "$@"
        SNS_TOPIC_ARN="$2"
        shift
        ;;
      --format | --output)
        _require_value "$@"
        OUTPUT_FORMAT="$2"
        shift
        ;;
      --color)
        _require_value "$@"
        COLOR="$2"
        shift
        ;;
      --no-color) COLOR="never" ;;
      -v | --verbose) VERBOSE="true" ;;
      -h | --help)
        usage
        exit 0
        ;;
      *) die "Unknown option: $1 (see --help)" ;;
    esac
    shift
  done

  case "$OUTPUT_FORMAT" in
    text | json | prometheus) ;;
    *) die "Unsupported format: ${OUTPUT_FORMAT} (expected text, json or prometheus)" ;;
  esac

  # Prometheus output is produced regardless of the mode (see main), so a
  # mode is only mandatory for the other formats.
  if [[ -z "$RUN_MODE" && "$OUTPUT_FORMAT" != "prometheus" ]]; then
    usage >&2
    exit 1
  fi
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════

# Print the run banner. The first line is reconstructed; Region/Mode/Time
# follow the surviving code.
_print_banner() {
  printf '%sALB Health Reporter%s\n' "$BOLD" "$RESET"
  echo "Region: ${AWS_REGION:-$(aws configure get region 2>/dev/null || echo 'default')}"
  echo "Mode: ${RUN_MODE}"
  echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo ""
}

# Entry point: parse arguments, print the banner, verify dependencies and
# dispatch to the selected mode. The mode's return status becomes the
# script's exit status.
main() {
  parse_args "$@"
  setup_colors
  START_TIME=$(date +%s)

  # Keep stdout clean for machine-readable formats.
  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    _print_banner
  else
    _print_banner >&2
  fi

  check_deps

  if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then
    print_prometheus
    return
  fi

  case "$RUN_MODE" in
    check) do_check ;;
    list) do_list ;;
    metrics) do_metrics ;;
    alert) do_alert ;;
  esac
}

main "$@"