linux-scripts/prometheus-performance-analyzer.sh

#!/bin/bash
################################################################################
# Script Name: prometheus-performance-analyzer.sh
# Version: 1.01
# Description: Diagnostic tool that analyzes Prometheus server performance.
#              Queries TSDB status, runtime info, flags, config, targets,
#              rules, and internal metrics to produce a detailed report
#              with actionable recommendations.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - curl
#   - jq
#   - awk
#   - bc (for calculations; auto-installed via apt-get/dnf/yum if missing)
#   - Network access to Prometheus API
#
# Usage:
#   # Analyze local Prometheus
#   ./prometheus-performance-analyzer.sh
#
#   # Analyze remote Prometheus
#   ./prometheus-performance-analyzer.sh --url http://prometheus:9090
#
#   # JSON output for automation
#   ./prometheus-performance-analyzer.sh --json
#
#   # Analyze specific section only
#   ./prometheus-performance-analyzer.sh --section cardinality
#
#   # Save report to file (auto-disables color)
#   ./prometheus-performance-analyzer.sh -o report.txt
#
#   # Custom series threshold
#   ./prometheus-performance-analyzer.sh --threshold-series 5000000
#
# Sections:
#   overview     - Version, uptime, series counts, retention
#   tsdb         - TSDB head stats, compaction, WAL health
#   cardinality  - High cardinality metrics, labels, label-value pairs
#   queries      - Query engine performance and latency
#   scrapes      - Scrape target health and duration analysis
#   rules        - Recording/alerting rule evaluation
#   storage      - Disk, retention, WAL, growth estimation
#   memory       - RSS, Go heap, GC, goroutines
#   config       - Configuration review and best practices
#   summary      - Health score and all recommendations
#
################################################################################

# ============================================================================
# CONFIGURATION & DEFAULTS
# ============================================================================

# --- Runtime options (defaults; parse_args overrides these from the CLI) ---
PROM_URL='http://localhost:9090'
OUTPUT_FILE=''
SECTION=''
JSON_MODE=false
NO_COLOR=false
THRESHOLD_SERIES=1000000

# --- ANSI colour sequences ---
# Kept as literal backslash escapes; they only turn into real escape bytes
# when they pass through `echo -e` or a printf format string.
NC='\033[0m'
BOLD='\033[1m'
DIM='\033[2m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'

# --- Findings accumulated by add_recommendation ---
declare -a RECOMMENDATIONS=()
CRITICAL_COUNT=0 WARNING_COUNT=0 INFO_COUNT=0

# --- API responses cached by collect_all_data ---
TSDB_STATUS='' RUNTIME_INFO='' FLAGS_DATA='' CONFIG_DATA=''
TARGETS_DATA='' RULES_DATA='' METRICS_RAW=''

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help to stdout and terminate the script with
# status 0. The text interpolates $0 and the current defaults of PROM_URL
# and THRESHOLD_SERIES. The version shown matches the "Version" line in the
# file header.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Analyze Prometheus server performance and produce recommendations (v1.01).

OPTIONS:
    --url URL             Prometheus URL (default: $PROM_URL)
    --section NAME        Run a specific section only
                          (overview|tsdb|cardinality|queries|scrapes|rules|
                           storage|memory|config|summary)
    --json                Output as JSON
    --no-color            Disable colored output
    --threshold-series N  Warning threshold for total series (default: $THRESHOLD_SERIES)
    -o, --output FILE     Write report to file (auto-disables color)
    -h, --help            Show this help

EXAMPLES:
    $0                                              # Analyze localhost:9090
    $0 --url http://prometheus:9090                 # Remote server
    $0 --json                                       # JSON output
    $0 --section cardinality                        # Single section
    $0 -o report.txt                                # Save to file
    $0 --threshold-series 5000000                   # Custom threshold

EOF
    exit 0
}

# Parse command-line options into the global settings.
# Globals written: PROM_URL, SECTION, JSON_MODE, NO_COLOR, THRESHOLD_SERIES,
#                  OUTPUT_FILE and (when colour is disabled) the colour vars.
# Arguments: the script's positional parameters ("$@").
# Exits with status 1 on an unknown option, on a value option that is
# missing its argument, or on a non-integer --threshold-series.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --url|--section|--threshold-series|-o|--output)
                # `shift 2` with a single remaining argument fails WITHOUT
                # shifting, which would spin this loop forever -- so check
                # for the value explicitly.
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: option $1 requires an argument" >&2
                    exit 1
                fi
                case "$1" in
                    --url)              PROM_URL="$2" ;;
                    --section)          SECTION="$2" ;;
                    --threshold-series) THRESHOLD_SERIES="$2" ;;
                    -o|--output)        OUTPUT_FILE="$2"; NO_COLOR=true ;;
                esac
                shift 2
                ;;
            --json) JSON_MODE=true; shift ;;
            --no-color) NO_COLOR=true; shift ;;
            *) echo "Unknown option: $1" >&2; exit 1 ;;
        esac
    done

    # The threshold is later used inside $(( ... )) and integer tests, so it
    # must be a plain decimal integer (no leading zeros, which bash would
    # read as octal).
    if ! [[ "$THRESHOLD_SERIES" =~ ^(0|[1-9][0-9]*)$ ]]; then
        echo "ERROR: --threshold-series must be a non-negative integer (got '$THRESHOLD_SERIES')" >&2
        exit 1
    fi

    # Strip trailing slash
    PROM_URL="${PROM_URL%/}"

    # Disable colors if requested
    if [ "$NO_COLOR" = true ]; then
        RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""
    fi
}

# Verify that the external tools the script relies on are installed.
# curl, jq and awk are only checked; bc is additionally installed through
# apt-get, dnf or yum (via sudo) when it is missing.
# Outputs: diagnostics on stderr.
# Returns: 0 when everything is available, 1 otherwise.
check_requirements() {
    local missing=0
    local cmd   # keep the loop variable out of the global scope

    for cmd in curl jq awk; do
        if ! command -v "$cmd" >/dev/null 2>&1; then
            echo "ERROR: $cmd not found" >&2
            missing=1
        fi
    done

    # Auto-install bc if missing
    if ! command -v bc >/dev/null 2>&1; then
        echo "bc not found -- installing..." >&2
        if command -v apt-get >/dev/null 2>&1; then
            sudo apt-get install -y bc >/dev/null 2>&1
        elif command -v dnf >/dev/null 2>&1; then
            sudo dnf install -y bc >/dev/null 2>&1
        elif command -v yum >/dev/null 2>&1; then
            sudo yum install -y bc >/dev/null 2>&1
        fi
        # Re-check: the install may have failed or no package manager matched.
        if ! command -v bc >/dev/null 2>&1; then
            echo "ERROR: failed to install bc -- install it manually" >&2
            missing=1
        fi
    fi

    return "$missing"
}

# Query Prometheus API endpoint -- returns JSON body
# GET ${PROM_URL}<endpoint> and write the response body to stdout.
# curl runs silently, fails on HTTP errors (-f), and is bounded by a 5 s
# connect timeout and a 15 s total timeout; its stderr is discarded.
# Arguments: $1 - path (and query string) appended to PROM_URL.
# Returns:   curl's exit status.
prom_api() {
    local path="$1"
    local -a curl_opts=(-sf --connect-timeout 5 --max-time 15)
    curl "${curl_opts[@]}" "${PROM_URL}${path}" 2>/dev/null
}

# Query a Prometheus metric via instant query API -- returns numeric value
# Run an instant query against /api/v1/query and print the value of the
# first result sample.
# Arguments: $1 - PromQL expression (URI-encoded here with jq's @uri).
# Outputs:   the sample value, or "0" when the result set is empty.
# Returns:   1 (after printing "0") when the API returned no body;
#            otherwise the exit status of the jq extraction.
prom_query() {
    local query="$1"
    local encoded result

    encoded=$(printf '%s' "$query" | jq -sRr @uri)
    result=$(prom_api "/api/v1/query?query=${encoded}")

    [ -n "$result" ] || { echo "0"; return 1; }

    jq -r '.data.result[0].value[1] // "0"' <<<"$result" 2>/dev/null
}

# Extract a metric value from raw /metrics text
# Look up an unlabelled sample in the cached /metrics text (METRICS_RAW).
# Only lines that start with "<metric> " match, so labelled series
# ("name{...} value") are not considered.
# Arguments: $1 - metric name.
# Outputs:   the value of the first matching line, or "0" when nothing is
#            cached or no line matches.
prom_metric() {
    local metric="$1"
    local first_line sample

    [ -n "$METRICS_RAW" ] || { echo "0"; return; }

    first_line=$(grep -m1 "^${metric} " <<<"$METRICS_RAW")
    sample=$(awk '{print $2}' <<<"$first_line")
    echo "${sample:-0}"
}

# Extract a metric with labels from raw /metrics text
# Print every line of the cached /metrics text (METRICS_RAW) that starts
# with the given pattern; typically a metric name followed by "{" to select
# its labelled series.
# Arguments: $1 - pattern, anchored at the start of the line.
# Returns:   0 without output when nothing is cached; otherwise grep's
#            status (1 when no line matches).
prom_metric_labeled() {
    local pattern="$1"

    [ -n "$METRICS_RAW" ] || return 0

    grep "^${pattern}" <<<"$METRICS_RAW" 2>/dev/null
}

# Format bytes to human-readable
# Render a byte count with a binary unit (B, KB, MB, GB, TB).
# Arguments: $1 - number of bytes (integer, float or exponent notation).
# Outputs:   e.g. "512 B", "1.5 KB", "2.0 GB"; empty, "0" and "null"
#            inputs all print "0 B". No trailing newline is written for
#            non-zero values.
format_bytes() {
    local bytes="$1"

    case "$bytes" in
        ""|0|null)
            echo "0 B"
            return
            ;;
    esac

    awk '
        BEGIN { KB = 1024; MB = KB * 1024; GB = MB * 1024; TB = GB * 1024 }
        {
            if      ($1 >= TB) printf "%.1f TB", $1 / TB
            else if ($1 >= GB) printf "%.1f GB", $1 / GB
            else if ($1 >= MB) printf "%.1f MB", $1 / MB
            else if ($1 >= KB) printf "%.1f KB", $1 / KB
            else               printf "%d B", $1
        }
    ' <<<"$bytes"
}

# Format seconds to human-readable duration
# Render a number of seconds as "Nd Nh Nm Ns", omitting zero components.
# Arguments: $1 - seconds; fractional input is truncated to whole seconds.
# Outputs:   e.g. "1d 2h", "5m 3s"; "0s" for empty, zero or sub-second input.
format_duration() {
    local total="$1"

    case "$total" in
        ""|0)
            echo "0s"
            return
            ;;
    esac

    # Truncate floats / exponent notation to an integer number of seconds.
    local whole
    whole=$(awk '{printf "%d", $1}' <<<"$total")

    local d=$((whole / 86400))
    local h=$(((whole % 86400) / 3600))
    local m=$(((whole % 3600) / 60))
    local sec=$((whole % 60))

    local out=""
    if [ "$d" -gt 0 ]; then out="${d}d "; fi
    if [ "$h" -gt 0 ]; then out="${out}${h}h "; fi
    if [ "$m" -gt 0 ]; then out="${out}${m}m "; fi
    # Seconds are shown when non-zero, or when nothing else was printed.
    if [ "$sec" -gt 0 ] || [ -z "$out" ]; then
        out="${out}${sec}s"
    fi

    echo "${out% }"
}

# Format number with commas
# Format a number without decimals, using the locale's thousands separator
# when the locale defines one.
# Arguments: $1 - numeric string (integer, float or exponent notation).
# Outputs:   the formatted number; "0" for empty or "null" input; the input
#            unchanged when it is not a number.
format_number() {
    local n="$1"
    local formatted

    if [ -z "$n" ] || [ "$n" = "null" ]; then
        echo "0"
        return
    fi

    # Capture printf's output instead of printing directly: on a non-numeric
    # argument bash's printf still emits "0" (or the numeric prefix) before
    # returning failure, which would otherwise be glued in front of the
    # fallback value.
    if formatted=$(printf "%'.0f" "$n" 2>/dev/null); then
        printf '%s' "$formatted"
    else
        echo "$n"
    fi
}

# Add a recommendation
# Record a finding and bump the counter for its severity.
# Globals written: RECOMMENDATIONS (appended with "severity|section|message")
#                  and one of CRITICAL_COUNT / WARNING_COUNT / INFO_COUNT.
# Arguments: $1 - severity (CRITICAL, WARNING or INFO; any other value is
#                 stored but not counted)
#            $2 - section the finding belongs to
#            $3 - human-readable message
add_recommendation() {
    local severity="$1" section="$2" message="$3"

    if [ "$severity" = "CRITICAL" ]; then
        CRITICAL_COUNT=$((CRITICAL_COUNT + 1))
    elif [ "$severity" = "WARNING" ]; then
        WARNING_COUNT=$((WARNING_COUNT + 1))
    elif [ "$severity" = "INFO" ]; then
        INFO_COUNT=$((INFO_COUNT + 1))
    fi

    local entry="${severity}|${section}|${message}"
    RECOMMENDATIONS+=("$entry")
}

# Print section header
# Print a section banner: a blank line, the title framed by two rules in
# bold cyan, then another blank line.
# Arguments: $1 - section title.
print_header() {
    local title="$1"
    local rule="${BOLD}${CYAN}====================================================${NC}"

    printf '\n%b\n%b\n%b\n\n' \
        "$rule" \
        "${BOLD}${CYAN}  ${title}${NC}" \
        "$rule"
}

# Print a metric line (label + value, aligned)
# Print one "label: value" report line; the label is dimmed and padded to
# 30 columns so that values line up.
# Arguments: $1 - label (a colon is appended)
#            $2 - value; backslash escapes in it are expanded (%b) so that
#                 callers can wrap the value in the colour variables, which
#                 hold literal "\033[..." sequences.
print_metric() {
    local label="$1"
    local value="$2"
    # Colours are passed as %b arguments rather than spliced into the
    # format string, so they can never be misread as format directives.
    printf '  %b%-30s%b %b\n' "$DIM" "$label:" "$NC" "$value"
}

# Print severity tag
# Print a coloured "[SEVERITY]" tag.
# Arguments: $1 - CRITICAL (red), WARNING (yellow), INFO or OK (green).
# Outputs:   the tag followed by a newline; nothing for any other value.
severity_tag() {
    local sev="$1"
    local color

    case "$sev" in
        CRITICAL) color="$RED" ;;
        WARNING)  color="$YELLOW" ;;
        INFO|OK)  color="$GREEN" ;;
        *)        return 0 ;;
    esac

    echo -e "${color}[${sev}]${NC}"
}

# Print a status line
# Print an indented status line: the severity tag followed by the message.
# Arguments: $1 - severity understood by severity_tag
#            $2 - message text (backslash escapes are expanded by echo -e)
print_status() {
    local severity="$1" message="$2"
    local tag

    tag=$(severity_tag "$severity")
    echo -e "  ${tag} ${message}"
}

# ============================================================================
# DATA COLLECTION
# ============================================================================

# Fetch every API response the report sections need and cache them.
# Globals written: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, CONFIG_DATA,
#                  TARGETS_DATA, RULES_DATA, METRICS_RAW.
# Outputs: progress and error messages on stderr.
# Exits with status 1 when the runtimeinfo endpoint cannot be fetched.
collect_all_data() {
    echo -e "${DIM}Collecting data from ${PROM_URL}...${NC}" >&2

    # The runtimeinfo request doubles as the connectivity check, so the
    # endpoint is only fetched once.
    if ! RUNTIME_INFO=$(prom_api "/api/v1/status/runtimeinfo"); then
        echo -e "${RED}ERROR: Cannot reach Prometheus at ${PROM_URL}${NC}" >&2
        echo "Check the URL and ensure Prometheus is running." >&2
        exit 1
    fi

    # The remaining endpoints are best-effort: a failed request leaves the
    # corresponding cache variable empty.
    TSDB_STATUS=$(prom_api "/api/v1/status/tsdb")
    FLAGS_DATA=$(prom_api "/api/v1/status/flags")
    CONFIG_DATA=$(prom_api "/api/v1/status/config")
    TARGETS_DATA=$(prom_api "/api/v1/targets")
    RULES_DATA=$(prom_api "/api/v1/rules")
    # /metrics goes through prom_api too, sharing its curl options/timeouts.
    METRICS_RAW=$(prom_api "/metrics")

    echo -e "${DIM}Data collection complete.${NC}" >&2
}

# ============================================================================
# SECTION: OVERVIEW
# ============================================================================

# Print the "Overview" section: version, uptime, GOMAXPROCS, goroutines,
# head series/chunks, retention flags and resident memory.
# Globals read: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, METRICS_RAW,
#               THRESHOLD_SERIES.
# Side effects: records a WARNING when head series exceed THRESHOLD_SERIES
#               and a CRITICAL when they exceed five times that value.
analyze_overview() {
    print_header "Overview"

    local version goroutines gomaxprocs

    # The runtimeinfo endpoint is not documented to carry a version field,
    # so fall back to the version label of prometheus_build_info in /metrics.
    version=$(jq -r '.data.version // empty' <<<"$RUNTIME_INFO")
    if [ -z "$version" ]; then
        local build_line
        local build_re='[{,]version="([^"]*)"'   # [{,] avoids matching goversion=
        build_line=$(grep -m1 '^prometheus_build_info{' <<<"$METRICS_RAW")
        if [[ $build_line =~ $build_re ]]; then
            version=${BASH_REMATCH[1]}
        fi
    fi
    version=${version:-unknown}

    # runtimeinfo documents the field as "goroutineCount"; the original key
    # is kept as a fallback.
    goroutines=$(jq -r '.data.goroutineCount // .data.goroutines // "0"' <<<"$RUNTIME_INFO")
    gomaxprocs=$(jq -r '.data.GOMAXPROCS // "0"' <<<"$RUNTIME_INFO")

    # Uptime from process_start_time_seconds. /metrics prints large values
    # in exponent notation (e.g. 1.7e+09), which bc cannot parse, so the
    # subtraction is done in awk.
    local start_time now_time uptime_secs
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" \
            'BEGIN { d = now - start; if (d < 0) d = 0; printf "%d", d }')
    else
        uptime_secs=0
    fi

    # Series and chunks from TSDB
    local head_series head_chunks
    head_series=$(jq -r '.data.headStats.numSeries // 0' <<<"$TSDB_STATUS")
    head_chunks=$(jq -r '.data.headStats.chunkCount // 0' <<<"$TSDB_STATUS")

    # Retention from flags
    local retention_time retention_size
    retention_time=$(jq -r '.data["storage.tsdb.retention.time"] // "not set"' <<<"$FLAGS_DATA")
    retention_size=$(jq -r '.data["storage.tsdb.retention.size"] // "not set"' <<<"$FLAGS_DATA")

    # Memory
    local rss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    print_metric "Prometheus version" "$version"
    print_metric "Uptime" "$(format_duration "$uptime_secs")"
    print_metric "GOMAXPROCS" "$gomaxprocs"
    print_metric "Goroutines" "$(format_number "$goroutines")"
    print_metric "Head series" "$(format_number "$head_series")"
    print_metric "Head chunks" "$(format_number "$head_chunks")"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    print_metric "Memory (RSS)" "$(format_bytes "$rss_bytes")"

    # Series threshold check. 2>/dev/null hides the test error raised when
    # head_series is empty or non-numeric (TSDB status unavailable).
    local critical_limit=$((THRESHOLD_SERIES * 5))
    if [ "$head_series" -gt "$critical_limit" ] 2>/dev/null; then
        add_recommendation "CRITICAL" "overview" "Head series count $(format_number "$head_series") is very high (>$(format_number "$critical_limit")) -- investigate high cardinality metrics immediately"
    elif [ "$head_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        add_recommendation "WARNING" "overview" "Head series count $(format_number "$head_series") exceeds threshold $(format_number "$THRESHOLD_SERIES") -- review cardinality section"
    fi
}

# ============================================================================
# SECTION: TSDB
# ============================================================================

# Convert a raw /metrics sample ("3", "1.2e+06", "") to a plain integer
# string so that it can be used in shell integer tests. Non-numeric input
# yields 0.
_tsdb_int() {
    awk -v v="${1:-0}" 'BEGIN { printf "%d", v }'
}

# Print the "TSDB Health" section: head block stats, compaction, WAL,
# out-of-order samples, head GC, checkpoints and tombstone cleanup.
# Globals read: TSDB_STATUS, METRICS_RAW (via prom_metric).
# Side effects: records CRITICAL recommendations for compaction failures and
#               WAL corruptions, and WARNINGs for WAL truncation failures,
#               more than 1000 out-of-order samples and checkpoint failures.
analyze_tsdb() {
    print_header "TSDB Health"

    if [ -z "$TSDB_STATUS" ]; then
        echo "  (TSDB status endpoint not available)"
        return
    fi

    # Head stats
    local num_series chunk_count min_time max_time num_label_pairs
    num_series=$(jq -r '.data.headStats.numSeries // 0' <<<"$TSDB_STATUS")
    chunk_count=$(jq -r '.data.headStats.chunkCount // 0' <<<"$TSDB_STATUS")
    min_time=$(jq -r '.data.headStats.minTime // 0' <<<"$TSDB_STATUS")
    max_time=$(jq -r '.data.headStats.maxTime // 0' <<<"$TSDB_STATUS")
    num_label_pairs=$(jq -r '.data.headStats.numLabelPairs // 0' <<<"$TSDB_STATUS")

    # Head block time range; the difference is divided by 1000 before being
    # formatted as seconds. Both tests silence the error raised by a
    # non-integer value.
    local head_range_secs=0
    if [ "$min_time" -gt 0 ] 2>/dev/null && [ "$max_time" -gt 0 ] 2>/dev/null; then
        head_range_secs=$(( (max_time - min_time) / 1000 ))
    fi

    print_metric "Head series" "$(format_number "$num_series")"
    print_metric "Head chunks" "$(format_number "$chunk_count")"
    print_metric "Head label pairs" "$(format_number "$num_label_pairs")"
    print_metric "Head block range" "$(format_duration "$head_range_secs")"
    echo ""

    # Compaction metrics from /metrics
    local compactions_total compactions_failed compaction_duration
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    compaction_duration=$(prom_metric "prometheus_tsdb_compaction_duration_seconds_sum")

    print_metric "Compactions total" "$(format_number "$compactions_total")"
    print_metric "Compaction failures" "$compactions_failed"
    if [ -n "$compaction_duration" ] && [ "$compaction_duration" != "0" ]; then
        print_metric "Compaction time (total)" "$(format_duration "$compaction_duration")"
    fi

    # Every counter below is normalised with _tsdb_int first: /metrics may
    # render values in exponent notation, which `[ ... -gt 0 ]` rejects.
    local failed_count
    failed_count=$(_tsdb_int "$compactions_failed")
    if [ "$failed_count" -gt 0 ] 2>/dev/null; then
        add_recommendation "CRITICAL" "tsdb" "TSDB has $failed_count compaction failures -- investigate storage health (disk I/O, free space)"
    fi

    echo ""

    # WAL stats
    local wal_corruptions wal_truncate_total wal_truncate_failed
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    wal_truncate_total=$(prom_metric "prometheus_tsdb_wal_truncations_total")
    wal_truncate_failed=$(prom_metric "prometheus_tsdb_wal_truncations_failed_total")

    print_metric "WAL corruptions" "$wal_corruptions"
    print_metric "WAL truncations" "$(format_number "$wal_truncate_total")"
    print_metric "WAL truncation failures" "$wal_truncate_failed"

    local corruption_count truncate_fail_count
    corruption_count=$(_tsdb_int "$wal_corruptions")
    if [ "$corruption_count" -gt 0 ] 2>/dev/null; then
        # NOTE(review): confirm that "promtool tsdb clean-tombstones" exists
        # in the targeted Prometheus release before relying on this advice.
        add_recommendation "CRITICAL" "tsdb" "WAL has $corruption_count corruption(s) -- check disk health, consider running promtool tsdb clean-tombstones"
    fi

    truncate_fail_count=$(_tsdb_int "$wal_truncate_failed")
    if [ "$truncate_fail_count" -gt 0 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "WAL has $truncate_fail_count truncation failure(s) -- may cause WAL growth"
    fi

    echo ""

    # Out-of-order samples
    local ooo_total ooo_count
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")
    print_metric "Out-of-order samples" "$(format_number "$ooo_total")"

    ooo_count=$(_tsdb_int "$ooo_total")
    if [ "$ooo_count" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "$(format_number "$ooo_total") out-of-order samples -- check NTP sync across targets or look for duplicate scraper configs"
    fi

    # Head GC
    local head_gc_duration
    head_gc_duration=$(prom_metric "prometheus_tsdb_head_gc_duration_seconds_sum")
    if [ -n "$head_gc_duration" ] && [ "$head_gc_duration" != "0" ]; then
        print_metric "Head GC time (total)" "$(format_duration "$head_gc_duration")"
    fi

    # Checkpoint creations
    local checkpoint_total checkpoint_failed checkpoint_fail_count
    checkpoint_total=$(prom_metric "prometheus_tsdb_checkpoint_creations_total")
    checkpoint_failed=$(prom_metric "prometheus_tsdb_checkpoint_creations_failed_total")
    print_metric "Checkpoints created" "$(format_number "$checkpoint_total")"

    checkpoint_fail_count=$(_tsdb_int "$checkpoint_failed")
    if [ "$checkpoint_fail_count" -gt 0 ] 2>/dev/null; then
        print_metric "Checkpoint failures" "$checkpoint_fail_count"
        add_recommendation "WARNING" "tsdb" "$checkpoint_fail_count checkpoint creation failure(s) -- investigate disk health"
    fi

    # Tombstone cleanup
    local tombstones
    tombstones=$(prom_metric "prometheus_tsdb_tombstone_cleanup_seconds_sum")
    if [ -n "$tombstones" ] && [ "$tombstones" != "0" ]; then
        print_metric "Tombstone cleanup time" "$(format_duration "$tombstones")"
    fi
}

# ============================================================================
# SECTION: CARDINALITY
# ============================================================================

# Emit up to ten "<value><TAB><name>" rows from one of the arrays under
# .data in TSDB_STATUS, skipping entries without a name.
# Arguments: $1 - array key, e.g. seriesCountByMetricName.
_cardinality_top10() {
    jq -r --arg key "$1" '
        (.data[$key] // [])[:10][]
        | select((.name // "") != "")
        | "\(.value // 0)\t\(.name)"
    ' <<<"$TSDB_STATUS"
}

# Print the "High Cardinality Analysis" section: the top ten metrics by
# series count, labels by value count, label-value pairs by series count and
# (when reported) labels by memory usage, plus an overall assessment.
# Globals read: TSDB_STATUS, THRESHOLD_SERIES, BOLD, NC.
# Side effects: records a WARNING for every listed metric holding at least
#               10% of all series and for every listed label with more than
#               10000 distinct values.
analyze_cardinality() {
    print_header "High Cardinality Analysis"

    if [ -z "$TSDB_STATUS" ]; then
        echo "  (TSDB status endpoint not available)"
        return
    fi

    local total_series
    total_series=$(jq -r '.data.headStats.numSeries // 0' <<<"$TSDB_STATUS")

    local rank name count pct pct_int bytes

    # Top metrics by series count
    echo -e "  ${BOLD}Top Metrics by Series Count${NC}"
    echo ""

    rank=0
    # Process substitution (not a pipe) keeps the loop in the current shell,
    # so add_recommendation can update the global state.
    while IFS=$'\t' read -r count name; do
        rank=$((rank + 1))

        # Share of all series, truncated to one decimal place and always
        # printed with a leading zero ("0.5", not ".5").
        pct=$(awk -v c="$count" -v t="$total_series" 'BEGIN {
            if (t + 0 > 0) printf "%.1f", int(c * 1000 / t) / 10
            else printf "0"
        }')
        printf "  %2d. %-45s %10s  (%5s%%)\n" "$rank" "$name" "$(format_number "$count")" "$pct"

        # Flag metrics consuming >= 10% of total
        pct_int=${pct%.*}
        if [ "$pct_int" -ge 10 ] 2>/dev/null; then
            add_recommendation "WARNING" "cardinality" "$name has $(format_number "$count") series (${pct}% of total) -- consider adding metric_relabel_configs to drop unused label dimensions"
        fi
    done < <(_cardinality_top10 "seriesCountByMetricName")
    if [ "$rank" -eq 0 ]; then
        echo "  (no data available)"
    fi

    echo ""

    # Top labels by value count
    echo -e "  ${BOLD}Top Labels by Value Count${NC}"
    echo ""

    rank=0
    while IFS=$'\t' read -r count name; do
        rank=$((rank + 1))
        printf "  %2d. %-45s %10s values\n" "$rank" "$name" "$(format_number "$count")"

        # Flag labels with very high value counts
        if [ "$count" -gt 10000 ] 2>/dev/null; then
            add_recommendation "WARNING" "cardinality" "Label '$name' has $(format_number "$count") unique values -- high cardinality label, consider relabeling or dropping"
        fi
    done < <(_cardinality_top10 "labelValueCountByLabelName")
    if [ "$rank" -eq 0 ]; then
        echo "  (no data available)"
    fi

    echo ""

    # Top label-value pairs by series count
    echo -e "  ${BOLD}Top Label-Value Pairs by Series Count${NC}"
    echo ""

    rank=0
    while IFS=$'\t' read -r count name; do
        rank=$((rank + 1))
        printf "  %2d. %-45s %10s series\n" "$rank" "$name" "$(format_number "$count")"
    done < <(_cardinality_top10 "seriesCountByLabelValuePair")
    if [ "$rank" -eq 0 ]; then
        echo "  (no data available)"
    fi

    echo ""

    # Overall cardinality assessment
    if [ "$total_series" -gt "$((THRESHOLD_SERIES * 5))" ] 2>/dev/null; then
        print_status "CRITICAL" "Total series $(format_number "$total_series") -- well above recommended limits"
    elif [ "$total_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        print_status "WARNING" "Total series $(format_number "$total_series") -- above threshold $(format_number "$THRESHOLD_SERIES")"
    else
        print_status "OK" "Total series $(format_number "$total_series") -- within normal range"
    fi

    # Memory usage by label name; the heading is printed lazily so that the
    # table is omitted entirely when the API reports no such data.
    rank=0
    while IFS=$'\t' read -r bytes name; do
        if [ "$rank" -eq 0 ]; then
            echo ""
            echo -e "  ${BOLD}Top Labels by Memory Usage${NC}"
            echo ""
        fi
        rank=$((rank + 1))
        printf "  %2d. %-45s %10s\n" "$rank" "$name" "$(format_bytes "$bytes")"
    done < <(_cardinality_top10 "memoryInBytesByLabelName")
}

# ============================================================================
# SECTION: QUERIES
# ============================================================================

# Print the "Query Performance" section: query engine flags, the quantiles
# of prometheus_engine_query_duration_seconds per slice, and the number of
# active queries.
# Globals read: FLAGS_DATA, METRICS_RAW (via prom_metric*), BOLD, NC.
# Side effects: records a WARNING when the p99 inner_eval latency is above
#               10 s and when active queries reach query.max-concurrency.
analyze_queries() {
    print_header "Query Performance"

    # Query engine settings from flags
    local max_concurrency query_timeout lookback_delta
    max_concurrency=$(jq -r '.data["query.max-concurrency"] // "unknown"' <<<"$FLAGS_DATA")
    query_timeout=$(jq -r '.data["query.timeout"] // "unknown"' <<<"$FLAGS_DATA")
    lookback_delta=$(jq -r '.data["query.lookback-delta"] // "unknown"' <<<"$FLAGS_DATA")

    print_metric "Max concurrent queries" "$max_concurrency"
    print_metric "Query timeout" "$query_timeout"
    print_metric "Lookback delta" "$lookback_delta"
    echo ""

    # Query duration quantiles from /metrics
    echo -e "  ${BOLD}Query Duration Percentiles${NC}"
    echo ""

    # Labelled samples of prometheus_engine_query_duration_seconds; the code
    # below expects quantile="..." and slice="..." labels on each line.
    local query_durations
    query_durations=$(prom_metric_labeled "prometheus_engine_query_duration_seconds{")

    if [ -n "$query_durations" ]; then
        local line quantile slice_name value label
        # Labels are extracted with bash regexes rather than `grep -P`,
        # which only GNU grep provides.
        local re_quantile='quantile="([^"]+)"'
        local re_slice='slice="([^"]+)"'

        while IFS= read -r line; do
            quantile=""
            slice_name=""
            if [[ $line =~ $re_quantile ]]; then quantile=${BASH_REMATCH[1]}; fi
            if [[ $line =~ $re_slice ]]; then slice_name=${BASH_REMATCH[1]}; fi
            value=$(awk '{print $NF}' <<<"$line")

            if [ -n "$quantile" ] && [ -n "$value" ] && [ "$value" != "NaN" ]; then
                # 0.99 -> "p99"
                label="p$(awk -v q="$quantile" 'BEGIN { printf "%g", q * 100 }')"
                if [ -n "$slice_name" ]; then
                    label="${label} (${slice_name})"
                fi
                printf "  %-35s %s\n" "$label" "$(awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}' <<<"$value")"
            fi
        done <<<"$query_durations"
    else
        echo "  (query duration metrics not available)"
    fi

    echo ""

    # Query performance from instant metrics
    local queries_active
    queries_active=$(prom_metric "prometheus_engine_queries")
    print_metric "Active queries (now)" "$queries_active"

    local queries_total
    queries_total=$(prom_metric "prometheus_engine_query_samples_total")
    if [ "$queries_total" != "0" ] && [ -n "$queries_total" ]; then
        print_metric "Total query samples" "$(format_number "$queries_total")"
    fi

    # Check for slow queries: p99 of the inner_eval slice above 10 seconds.
    local p99_inner
    p99_inner=$(awk '/quantile="0\.99"/ && /inner_eval/ { print $NF; exit }' <<<"$query_durations")

    if [ -n "$p99_inner" ] && [ "$p99_inner" != "NaN" ]; then
        # Compare as a float so that e.g. 10.5 s is not truncated to 10.
        if awk -v v="$p99_inner" 'BEGIN { exit !((v + 0) > 10) }'; then
            add_recommendation "WARNING" "queries" "p99 inner eval query latency is ${p99_inner}s -- consider adding recording rules for complex queries or reducing cardinality"
        fi
    fi

    # Concurrent query check
    if [ "$max_concurrency" != "unknown" ] && [ "$queries_active" != "0" ]; then
        local active_num max_num
        active_num=$(awk '{printf "%d", $1}' <<<"$queries_active")
        max_num=$(awk '{printf "%d", $1}' <<<"$max_concurrency")
        if [ "$active_num" -ge "$max_num" ] 2>/dev/null; then
            add_recommendation "WARNING" "queries" "Active queries ($active_num) at or near max-concurrency ($max_num) -- consider increasing --query.max-concurrency"
        fi
    fi
}

# ============================================================================
# SECTION: SCRAPES
# ============================================================================

# Convert a Prometheus duration string to seconds.
# Accepts one or more <integer><unit> groups with the units ms, s, m, h, d,
# w and y (for example "15s", "1m30s", "500ms").
# Arguments: $1 - duration string.
# Outputs:   seconds with three decimals, e.g. "90.000".
# Returns:   non-zero (and prints nothing) when the string cannot be parsed.
_scrape_duration_secs() {
    awk -v d="$1" 'BEGIN {
        unit["ms"] = 0.001; unit["s"] = 1; unit["m"] = 60; unit["h"] = 3600
        unit["d"] = 86400; unit["w"] = 604800; unit["y"] = 31536000
        if (d == "") exit 1
        total = 0
        while (d != "") {
            if (!match(d, /^[0-9]+/)) exit 1
            n = substr(d, 1, RLENGTH) + 0
            d = substr(d, RLENGTH + 1)
            # "ms" has to be tried before the single-letter units.
            u = (substr(d, 1, 2) == "ms") ? "ms" : substr(d, 1, 1)
            if (!(u in unit)) exit 1
            total += n * unit[u]
            d = substr(d, length(u) + 1)
        }
        printf "%.3f", total
    }'
}

# Print the "Scrape Performance" section: target health counts, the list of
# down targets, per-job scrape durations and sample-limit statistics.
# Globals read: TARGETS_DATA, CONFIG_DATA, METRICS_RAW (via prom_metric) and
#               the colour variables.
# Side effects: records a CRITICAL when targets are down, and WARNINGs for
#               jobs whose slowest scrape exceeds 80% of the scrape interval
#               and for scrapes that exceeded sample_limit.
analyze_scrapes() {
    print_header "Scrape Performance"

    if [ -z "$TARGETS_DATA" ]; then
        echo "  (targets endpoint not available)"
        return
    fi

    # Count targets by health
    local total_targets up_targets down_targets unknown_targets
    total_targets=$(jq '[.data.activeTargets // [] | .[]] | length' <<<"$TARGETS_DATA")
    up_targets=$(jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length' <<<"$TARGETS_DATA")
    down_targets=$(jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length' <<<"$TARGETS_DATA")
    unknown_targets=$(jq '[.data.activeTargets // [] | .[] | select(.health != "up" and .health != "down")] | length' <<<"$TARGETS_DATA")

    print_metric "Total targets" "$total_targets"
    print_metric "Targets up" "${GREEN}${up_targets}${NC}"
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets down" "${RED}${down_targets}${NC}"
    else
        print_metric "Targets down" "$down_targets"
    fi
    if [ "$unknown_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets unknown" "$unknown_targets"
    fi

    echo ""

    # List down targets (at most 20)
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        echo -e "  ${BOLD}${RED}Down Targets${NC}"
        echo ""

        jq -r '
            .data.activeTargets // [] |
            .[] | select(.health == "down") |
            "  \(.labels.job // "unknown")  \(.labels.instance // .scrapeUrl)  \(.lastError // "no error")"
        ' <<<"$TARGETS_DATA" 2>/dev/null | head -20

        echo ""

        add_recommendation "CRITICAL" "scrapes" "$down_targets scrape target(s) are down -- check target availability"
    fi

    # Scrape duration analysis per job
    echo -e "  ${BOLD}Scrape Duration by Job${NC}"
    echo ""
    printf "  %-30s %10s %10s %10s %8s\n" "Job" "Avg" "Max" "Interval" "Status"
    printf "  %-30s %10s %10s %10s %8s\n" "---" "---" "---" "--------" "------"

    # Global scrape interval: the first scrape_interval entry found in the
    # configuration YAML; 60s when none is available.
    local global_interval=""
    if [ -n "$CONFIG_DATA" ]; then
        global_interval=$(jq -r '.data.yaml' <<<"$CONFIG_DATA" 2>/dev/null \
            | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'\"")
    fi
    [ -z "$global_interval" ] && global_interval="60s"

    local global_interval_secs
    global_interval_secs=$(_scrape_duration_secs "$global_interval") || global_interval_secs=60

    local job avg max interval
    local avg_fmt max_fmt effective_interval interval_secs status
    # The rows are fed through process substitution instead of a pipe: a
    # piped `while` runs in a subshell, which would silently discard every
    # add_recommendation call made inside the loop.
    while IFS='|' read -r job avg max interval; do
        [ -z "$job" ] && continue

        avg_fmt=$(awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}' <<<"$avg")
        max_fmt=$(awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}' <<<"$max")

        effective_interval="${interval:-$global_interval}"
        interval_secs=$(_scrape_duration_secs "$effective_interval") || interval_secs="$global_interval_secs"

        status="OK"
        # "+ 0" forces a numeric comparison even when jq printed "null".
        if awk -v m="$max" -v i="$interval_secs" 'BEGIN { exit !((m + 0) > (i + 0) * 0.8) }'; then
            status="${RED}SLOW${NC}"
            add_recommendation "WARNING" "scrapes" "Job '$job' max scrape duration (${max_fmt}) exceeds 80% of scrape interval (${effective_interval}) -- increase interval or optimize target"
        fi

        printf "  %-30s %10s %10s %10s %b\n" "$job" "$avg_fmt" "$max_fmt" "$effective_interval" "$status"
    done < <(jq -r '
        [.data.activeTargets // [] | .[] | select(.health == "up") |
         {job: (.labels.job // "unknown"), duration: .lastScrapeDuration, interval: .scrapeInterval}] |
        group_by(.job) |
        .[] |
        {
            job: .[0].job,
            avg: ([.[].duration] | add / length),
            max: ([.[].duration] | max),
            interval: .[0].interval,
            count: length
        } |
        "\(.job)|\(.avg)|\(.max)|\(.interval // "")"
    ' <<<"$TARGETS_DATA" 2>/dev/null)

    echo ""

    # Scrape sample stats
    local exceeded duplicates
    exceeded=$(prom_metric "prometheus_target_scrapes_exceeded_sample_limit_total")
    duplicates=$(prom_metric "prometheus_target_scrapes_sample_duplicate_timestamp_total")

    if [ "$exceeded" != "0" ] && [ -n "$exceeded" ]; then
        local exceeded_num
        exceeded_num=$(awk '{printf "%d", $1}' <<<"$exceeded")
        if [ "$exceeded_num" -gt 0 ] 2>/dev/null; then
            print_metric "Sample limit exceeded" "$(format_number "$exceeded")"
            add_recommendation "WARNING" "scrapes" "$(format_number "$exceeded") scrapes exceeded sample_limit -- increase sample_limit in scrape config or reduce target metrics"
        fi
    fi

    if [ "$duplicates" != "0" ] && [ -n "$duplicates" ]; then
        local duplicates_num
        duplicates_num=$(awk '{printf "%d", $1}' <<<"$duplicates")
        if [ "$duplicates_num" -gt 0 ] 2>/dev/null; then
            print_metric "Duplicate timestamps" "$(format_number "$duplicates")"
        fi
    fi
}

# ============================================================================
# SECTION: RULES
# ============================================================================

# Report rule counts, evaluation failures, and per-group evaluation duration.
#
# Globals read:  RULES_DATA (rules JSON), RED, YELLOW, BOLD, NC
# Helpers used:  print_header, print_metric, prom_metric, format_number,
#                add_recommendation
# Output:        text section on stdout. Adds a WARNING recommendation when
#                rule evaluations have failed and for every group whose last
#                evaluation took more than 50% of its interval.
analyze_rules() {
    print_header "Rule Evaluation"

    if [ -z "$RULES_DATA" ]; then
        echo "  (rules endpoint not available)"
        return
    fi

    # Count rules
    local recording_rules alerting_rules total_groups
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')
    total_groups=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[]] | length')

    print_metric "Rule groups" "$total_groups"
    print_metric "Recording rules" "$recording_rules"
    print_metric "Alerting rules" "$alerting_rules"
    echo ""

    # Rule evaluation failures
    local eval_failures
    eval_failures=$(prom_metric "prometheus_rule_evaluation_failures_total")
    if [ "$eval_failures" != "0" ] && [ -n "$eval_failures" ]; then
        local fail_num
        fail_num=$(echo "$eval_failures" | awk '{printf "%d", $1}')
        if [ "$fail_num" -gt 0 ] 2>/dev/null; then
            print_metric "Evaluation failures" "${RED}$(format_number "$eval_failures")${NC}"
            add_recommendation "WARNING" "rules" "$(format_number "$eval_failures") rule evaluation failures -- check rule syntax and query targets"
        fi
    fi

    # Group evaluation durations
    echo -e "  ${BOLD}Rule Group Evaluation Duration${NC}"
    echo ""
    printf "  %-40s %12s %12s %8s\n" "Group" "Last Eval" "Interval" "Status"
    printf "  %-40s %12s %12s %8s\n" "-----" "---------" "--------" "------"

    # The loop is fed through process substitution rather than a pipe: a
    # piped `while` runs in a subshell, so recommendations recorded inside it
    # would never reach the caller. The group name is emitted as the LAST
    # field so that a name containing '|' cannot shift the numeric columns
    # (read assigns the unsplit remainder to the final variable).
    local eval_time interval name
    while IFS='|' read -r eval_time interval name; do
        [ -z "$name" ] && continue

        local eval_fmt interval_fmt status eval_pct
        eval_fmt=$(echo "$eval_time" | awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')
        interval_fmt="${interval}s"

        status="OK"
        # Empty result means the interval was missing or zero: skip the check
        # instead of dividing by zero.
        eval_pct=$(awk -v t="$eval_time" -v i="$interval" 'BEGIN { if (i + 0 > 0) printf "%d", (t / i) * 100 }')
        if [ -n "$eval_pct" ] && [ "$eval_pct" -gt 50 ] 2>/dev/null; then
            status="${YELLOW}SLOW${NC}"
            add_recommendation "WARNING" "rules" "Rule group '$name' evaluation (${eval_fmt}) exceeds 50% of interval (${interval_fmt}) -- consider splitting group or adding recording rules"
        fi

        printf "  %-40s %12s %12s %b\n" "${name:0:40}" "$eval_fmt" "$interval_fmt" "$status"
    done < <(echo "$RULES_DATA" | jq -r '
        .data.groups // [] | .[] |
        "\(.evaluationTime // 0)|\(.interval // 0)|\(.name)"
    ' 2>/dev/null)

    echo ""
}

# ============================================================================
# SECTION: STORAGE
# ============================================================================

# Report TSDB storage settings, on-disk size, and a rough growth estimate.
#
# Globals read:  FLAGS_DATA (flags JSON), BOLD, NC
# Helpers used:  print_header, print_metric, prom_metric, format_number,
#                format_bytes, add_recommendation
# Output:        text section on stdout. Adds a WARNING recommendation when
#                blocks + WAL exceed 90% of the configured retention size.
analyze_storage() {
    print_header "Storage"

    # Retention settings
    local retention_time retention_size storage_path
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    storage_path=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.path"] // "/data"')

    print_metric "Storage path" "$storage_path"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    echo ""

    # Block stats from /metrics
    local blocks_loaded
    blocks_loaded=$(prom_metric "prometheus_tsdb_blocks_loaded")
    print_metric "Blocks loaded" "$(format_number "$blocks_loaded")"

    # Storage size from metrics
    local storage_size_bytes
    storage_size_bytes=$(prom_metric "prometheus_tsdb_storage_blocks_bytes")
    if [ "$storage_size_bytes" != "0" ] && [ -n "$storage_size_bytes" ]; then
        print_metric "Block storage size" "$(format_bytes "$storage_size_bytes")"
    fi

    # WAL size
    local wal_size
    wal_size=$(prom_metric "prometheus_tsdb_wal_storage_size_bytes")
    if [ "$wal_size" != "0" ] && [ -n "$wal_size" ]; then
        print_metric "WAL size" "$(format_bytes "$wal_size")"
    fi

    # Total storage (blocks + WAL)
    local total_storage
    total_storage=$(echo "${storage_size_bytes:-0} ${wal_size:-0}" | awk '{printf "%.0f", $1 + $2}')
    if [ "$total_storage" -gt 0 ] 2>/dev/null; then
        print_metric "Total TSDB size" "$(format_bytes "$total_storage")"
    fi

    echo ""

    # WAL segments
    local wal_segments
    wal_segments=$(prom_metric "prometheus_tsdb_wal_segment_current")
    if [ "$wal_segments" != "0" ] && [ -n "$wal_segments" ]; then
        print_metric "WAL current segment" "$(format_number "$wal_segments")"
    fi

    # Growth estimation
    echo ""
    echo -e "  ${BOLD}Growth Estimation${NC}"
    echo ""

    local samples_appended
    samples_appended=$(prom_metric "prometheus_tsdb_head_samples_appended_total")

    local start_time now_time uptime_secs=0
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk, not bc: metric values may be in scientific notation
        # (e.g. 1.7e+09), which bc rejects as a syntax error.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" \
            'BEGIN { d = now - start; printf "%.0f", (d > 0 ? int(d) : 0) }')
        uptime_secs=${uptime_secs:-0}
    fi

    if [ "${uptime_secs:-0}" -gt 0 ] 2>/dev/null && [ -n "$samples_appended" ] && [ "$samples_appended" != "0" ]; then
        local samples_per_sec samples_per_day
        samples_per_sec=$(echo "$samples_appended $uptime_secs" | awk '{printf "%.1f", $1 / $2}')
        samples_per_day=$(echo "$samples_per_sec" | awk '{printf "%.0f", $1 * 86400}')

        print_metric "Samples/second" "$(format_number "$samples_per_sec")"
        print_metric "Samples/day" "$(format_number "$samples_per_day")"

        # Estimate bytes per sample (~1-2 bytes compressed)
        local bytes_per_day_low bytes_per_day_high
        bytes_per_day_low=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 1.0}')
        bytes_per_day_high=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 2.0}')

        print_metric "Estimated disk/day" "$(format_bytes "$bytes_per_day_low") -- $(format_bytes "$bytes_per_day_high")"

        # Estimate 30-day storage
        local monthly_low monthly_high
        monthly_low=$(echo "$bytes_per_day_low" | awk '{printf "%.0f", $1 * 30}')
        monthly_high=$(echo "$bytes_per_day_high" | awk '{printf "%.0f", $1 * 30}')
        print_metric "Estimated disk/30 days" "$(format_bytes "$monthly_low") -- $(format_bytes "$monthly_high")"
    else
        echo "  (insufficient uptime data for growth estimation)"
    fi

    # Retention size check
    if [ "$retention_size" != "not set" ] && [ "$retention_size" != "0" ] && [ -n "$total_storage" ]; then
        local ret_bytes
        # Parse retention size (e.g. "512MB", "1GB", "2PB"). The first letter
        # of the suffix selects the power of 1024, so "GB" and "GiB" are both
        # accepted; a bare number or a plain "B" suffix means bytes.
        ret_bytes=$(printf '%s\n' "$retention_size" | awk '{
            num = $1;  sub(/[A-Za-z]+$/, "", num)
            unit = $1; sub(/^[0-9.]+/, "", unit)
            pos = index("KMGTPE", toupper(substr(unit, 1, 1)))
            mult = (pos > 0) ? 1024 ^ pos : 1
            printf "%.0f", num * mult
        }')

        if [ "$ret_bytes" -gt 0 ] 2>/dev/null; then
            local usage_pct
            usage_pct=$(echo "$total_storage $ret_bytes" | awk '{printf "%d", ($1 / $2) * 100}')
            print_metric "Retention usage" "${usage_pct}%"

            if [ "$usage_pct" -gt 90 ] 2>/dev/null; then
                add_recommendation "WARNING" "storage" "Storage at ${usage_pct}% of retention size limit ($retention_size) -- data will be dropped soon, consider increasing retention.size"
            fi
        fi
    fi
}

# ============================================================================
# SECTION: MEMORY
# ============================================================================

# Report process memory, Go heap/GC statistics, goroutines and file descriptors.
#
# Globals read:  RUNTIME_INFO (runtime info JSON), BOLD, NC
# Helpers used:  print_header, print_metric, prom_metric, prom_metric_labeled,
#                format_number, format_bytes, add_recommendation
# Output:        text section on stdout. Adds WARNING recommendations when
#                RSS is >= 3x the Go heap, goroutines exceed 1000, or more
#                than 80% of the file-descriptor limit is in use.
analyze_memory() {
    print_header "Memory"

    # Process memory
    local rss_bytes vss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")
    vss_bytes=$(prom_metric "process_virtual_memory_bytes")

    print_metric "Process RSS" "$(format_bytes "$rss_bytes")"
    print_metric "Process virtual" "$(format_bytes "$vss_bytes")"
    echo ""

    # Go runtime memory
    local heap_alloc heap_sys heap_inuse
    heap_alloc=$(prom_metric "go_memstats_heap_alloc_bytes")
    heap_sys=$(prom_metric "go_memstats_heap_sys_bytes")
    heap_inuse=$(prom_metric "go_memstats_heap_inuse_bytes")

    echo -e "  ${BOLD}Go Runtime Memory${NC}"
    echo ""
    print_metric "Heap alloc" "$(format_bytes "$heap_alloc")"
    print_metric "Heap sys" "$(format_bytes "$heap_sys")"
    print_metric "Heap in use" "$(format_bytes "$heap_inuse")"

    # RSS vs Go heap ratio
    if [ -n "$rss_bytes" ] && [ "$rss_bytes" != "0" ] && [ -n "$heap_alloc" ] && [ "$heap_alloc" != "0" ]; then
        local ratio
        ratio=$(echo "$rss_bytes $heap_alloc" | awk '{printf "%.1f", $1 / $2}')
        print_metric "RSS / Heap ratio" "${ratio}x"

        local ratio_int
        ratio_int=$(echo "$ratio" | awk '{printf "%d", $1}')
        if [ "$ratio_int" -ge 3 ] 2>/dev/null; then
            add_recommendation "WARNING" "memory" "RSS is ${ratio}x Go heap -- indicates memory fragmentation or mmap overhead, consider restarting Prometheus during a maintenance window"
        fi
    fi

    echo ""

    # GC stats
    echo -e "  ${BOLD}Garbage Collection${NC}"
    echo ""

    local gc_count gc_pause_total
    gc_count=$(prom_metric "go_gc_duration_seconds_count")
    gc_pause_total=$(prom_metric "go_gc_duration_seconds_sum")

    print_metric "GC cycles (total)" "$(format_number "$gc_count")"
    if [ -n "$gc_pause_total" ] && [ "$gc_pause_total" != "0" ]; then
        print_metric "GC pause (total)" "$(echo "$gc_pause_total" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"

        # Average GC pause (in milliseconds)
        if [ -n "$gc_count" ] && [ "$gc_count" != "0" ]; then
            local avg_pause
            avg_pause=$(echo "$gc_pause_total $gc_count" | awk '{printf "%.3f", ($1 / $2) * 1000}')
            print_metric "GC avg pause" "${avg_pause} ms"
        fi
    fi

    # quantile="1" of the GC summary is the maximum observed pause; the value
    # is taken from the last whitespace-separated field of the matched line.
    local gc_max
    gc_max=$(prom_metric_labeled 'go_gc_duration_seconds{quantile="1"}' | awk '{print $NF}')
    if [ -n "$gc_max" ] && [ "$gc_max" != "0" ]; then
        print_metric "GC max pause" "$(echo "$gc_max" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
    fi

    echo ""

    # Goroutines. The runtimeinfo API exposes the count as "goroutineCount";
    # the older "goroutines" key is kept as a fallback so any caller that
    # pre-shaped RUNTIME_INFO with that key keeps working.
    local goroutines
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // 0')
    print_metric "Goroutines" "$(format_number "$goroutines")"

    if [ "$goroutines" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "memory" "Goroutine count is $goroutines (>1000) -- may indicate resource leak or excessive concurrency"
    fi

    # Open file descriptors
    local open_fds max_fds
    open_fds=$(prom_metric "process_open_fds")
    max_fds=$(prom_metric "process_max_fds")

    if [ -n "$open_fds" ] && [ "$open_fds" != "0" ]; then
        print_metric "Open file descriptors" "$(format_number "$open_fds")"
        if [ -n "$max_fds" ] && [ "$max_fds" != "0" ]; then
            print_metric "Max file descriptors" "$(format_number "$max_fds")"
            local fd_pct
            fd_pct=$(echo "$open_fds $max_fds" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$fd_pct" -gt 80 ] 2>/dev/null; then
                add_recommendation "WARNING" "memory" "File descriptor usage at ${fd_pct}% -- approaching limit, increase ulimit -n"
            fi
        fi
    fi
}

# ============================================================================
# SECTION: CONFIG
# ============================================================================

# Convert a Prometheus duration string ("15s", "1m30s", "500ms", "2h") to
# milliseconds on stdout. Prints nothing when the string cannot be parsed, so
# callers can substitute a default.
_config_duration_ms() {
    printf '%s\n' "$1" | awk '
        {
            rest = $0; total = 0; seen = 0
            while (match(rest, /^[0-9]+(ms|[smhdwy])/)) {
                tok  = substr(rest, 1, RLENGTH)
                rest = substr(rest, RLENGTH + 1)
                num  = tok; sub(/[a-z]+$/, "", num)
                unit = tok; sub(/^[0-9]+/, "", unit)
                if      (unit == "ms") mult = 1
                else if (unit == "s")  mult = 1000
                else if (unit == "m")  mult = 60000
                else if (unit == "h")  mult = 3600000
                else if (unit == "d")  mult = 86400000
                else if (unit == "w")  mult = 604800000
                else                   mult = 31536000000
                total += num * mult
                seen = 1
            }
            if (seen && rest == "") printf "%.0f", total
        }'
}

# Review global scrape settings, external labels, remote endpoints, scrape
# jobs and key command-line flags.
#
# Globals read:  CONFIG_DATA (config JSON with the YAML under .data.yaml),
#                FLAGS_DATA, TARGETS_DATA, BOLD, NC
# Helpers used:  print_header, print_metric, add_recommendation
# Output:        text section on stdout. May add recommendations for
#                timeout >= interval, missing external labels, more than
#                three jobs scraping faster than 10s, and disabled WAL
#                compression.
analyze_config() {
    print_header "Configuration Review"

    if [ -z "$CONFIG_DATA" ] && [ -z "$FLAGS_DATA" ]; then
        echo "  (configuration data not available)"
        return
    fi

    # Global config from YAML
    local config_yaml=""
    if [ -n "$CONFIG_DATA" ]; then
        config_yaml=$(echo "$CONFIG_DATA" | jq -r '.data.yaml // ""' 2>/dev/null)
    fi

    # First occurrence of each key in the document is used as the value.
    local scrape_interval scrape_timeout eval_interval
    scrape_interval=$(printf '%s\n' "$config_yaml" | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    scrape_timeout=$(printf '%s\n' "$config_yaml" | grep -m1 'scrape_timeout:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    eval_interval=$(printf '%s\n' "$config_yaml" | grep -m1 'evaluation_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')

    [ -z "$scrape_interval" ] && scrape_interval="1m"
    [ -z "$scrape_timeout" ] && scrape_timeout="10s"
    [ -z "$eval_interval" ] && eval_interval="1m"

    echo -e "  ${BOLD}Global Settings${NC}"
    echo ""
    print_metric "Scrape interval" "$scrape_interval"
    print_metric "Scrape timeout" "$scrape_timeout"
    print_metric "Evaluation interval" "$eval_interval"

    # Compare in milliseconds so sub-second ("500ms") and compound ("1m30s")
    # durations are handled; unparseable values fall back to 60s / 10s.
    local scrape_int_ms scrape_to_ms
    scrape_int_ms=$(_config_duration_ms "$scrape_interval")
    scrape_to_ms=$(_config_duration_ms "$scrape_timeout")
    [ -n "$scrape_int_ms" ] || scrape_int_ms=60000
    [ -n "$scrape_to_ms" ] || scrape_to_ms=10000

    if [ "$scrape_to_ms" -ge "$scrape_int_ms" ] 2>/dev/null; then
        add_recommendation "WARNING" "config" "scrape_timeout ($scrape_timeout) >= scrape_interval ($scrape_interval) -- timeout should be less than interval"
    fi

    echo ""

    # External labels: collect the key/value lines nested under the
    # two-space-indented "external_labels:" key. An awk range pattern cannot
    # be used here because the start line itself matches the "next key"
    # pattern and would close the range immediately. The block ends at the
    # first line indented by fewer than three spaces.
    local external_labels
    external_labels=$(printf '%s\n' "$config_yaml" | awk '
        /^  external_labels:/ { inside = 1; next }
        inside && !/^   /     { inside = 0 }
        inside && /:/         { print }
    ')

    echo -e "  ${BOLD}External Labels${NC}"
    echo ""

    if [ -n "$external_labels" ]; then
        printf '%s\n' "$external_labels" | while IFS= read -r line; do
            echo "  $line"
        done
    else
        echo "  (none configured)"
        add_recommendation "INFO" "config" "No external labels configured -- recommended for remote write, federation, and cross-cluster identification"
    fi

    echo ""

    # Remote write/read
    echo -e "  ${BOLD}Remote Endpoints${NC}"
    echo ""

    # grep -c prints 0 but exits non-zero when nothing matches; `|| true`
    # keeps that from being treated as a failure.
    local remote_write_count remote_read_count
    remote_write_count=$(printf '%s\n' "$config_yaml" | grep -c 'remote_write:' 2>/dev/null || true)
    remote_read_count=$(printf '%s\n' "$config_yaml" | grep -c 'remote_read:' 2>/dev/null || true)

    local has_remote_write="no"
    local has_remote_read="no"
    [ "$remote_write_count" -gt 0 ] 2>/dev/null && has_remote_write="yes"
    [ "$remote_read_count" -gt 0 ] 2>/dev/null && has_remote_read="yes"

    print_metric "Remote write" "$has_remote_write"
    print_metric "Remote read" "$has_remote_read"

    echo ""

    # Job count and interval distribution
    echo -e "  ${BOLD}Scrape Jobs${NC}"
    echo ""

    local job_count
    job_count=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[].labels.job] | unique | length' 2>/dev/null)
    print_metric "Total scrape jobs" "${job_count:-0}"

    # Check for aggressive scrape intervals (whole-second intervals below 10s)
    if [ -n "$TARGETS_DATA" ]; then
        local fast_scrape_jobs
        fast_scrape_jobs=$(echo "$TARGETS_DATA" | jq -r '
            [.data.activeTargets // [] | .[] |
             select(.scrapeInterval != null) |
             {job: .labels.job, interval: .scrapeInterval}] |
            unique_by(.job) |
            .[] |
            select(
                (.interval | test("^[0-9]+s$")) and
                (.interval | gsub("s$"; "") | tonumber) < 10
            ) |
            .job
        ' 2>/dev/null)

        if [ -n "$fast_scrape_jobs" ]; then
            local fast_count
            fast_count=$(echo "$fast_scrape_jobs" | wc -l)
            if [ "$fast_count" -gt 3 ] 2>/dev/null; then
                add_recommendation "WARNING" "config" "$fast_count jobs have scrape_interval < 10s -- high scrape frequency increases storage cost and cardinality"
            fi
        fi
    fi

    # Key flags
    echo ""
    echo -e "  ${BOLD}Key Flags${NC}"
    echo ""

    if [ -n "$FLAGS_DATA" ]; then
        local web_listen log_level tsdb_wal_compression
        web_listen=$(echo "$FLAGS_DATA" | jq -r '.data["web.listen-address"] // "unknown"')
        log_level=$(echo "$FLAGS_DATA" | jq -r '.data["log.level"] // "unknown"')
        tsdb_wal_compression=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.wal-compression"] // "unknown"')

        print_metric "Listen address" "$web_listen"
        print_metric "Log level" "$log_level"
        print_metric "WAL compression" "$tsdb_wal_compression"

        if [ "$tsdb_wal_compression" = "false" ]; then
            add_recommendation "INFO" "config" "WAL compression is disabled -- enabling it (--storage.tsdb.wal-compression) can reduce WAL size by ~50%"
        fi
    fi
}

# ============================================================================
# SECTION: SUMMARY
# ============================================================================

# Print every recorded recommendation (most severe first), followed by an
# issue tally and a 0-100 health score.
#
# Globals read:  RECOMMENDATIONS (entries shaped "SEVERITY|section|message"),
#                CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT, colour variables
# Helpers used:  print_header, print_status, severity_tag
# Scoring:       start at 100; subtract 15 per critical, 5 per warning and
#                1 per info entry; never below 0.
analyze_summary() {
    print_header "Recommendations"

    # Nothing recorded: report a clean bill of health and stop.
    if [ ${#RECOMMENDATIONS[@]} -eq 0 ]; then
        print_status "OK" "No issues detected -- Prometheus appears healthy"
        echo ""
        echo -e "  ${BOLD}${GREEN}Health Score: 100 / 100${NC}"
        return
    fi

    # One pass over the list per severity yields CRITICAL, then WARNING,
    # then INFO, keeping insertion order within each level.
    local entry_sev entry_section entry_msg remainder
    for severity in CRITICAL WARNING INFO; do
        for rec in "${RECOMMENDATIONS[@]}"; do
            entry_sev=${rec%%|*}
            [ "$entry_sev" = "$severity" ] || continue

            # Split off the section; everything after the second '|' is the
            # message, which may itself contain '|'.
            remainder=${rec#*|}
            entry_section=${remainder%%|*}
            entry_msg=${remainder#*|}

            echo -e "  $(severity_tag "$entry_sev") ${BOLD}[${entry_section}]${NC} $entry_msg"
            echo ""
        done
    done

    # Health score, clamped at zero.
    local score
    score=$((100 - (CRITICAL_COUNT * 15) - (WARNING_COUNT * 5) - (INFO_COUNT * 1)))
    if [ "$score" -lt 0 ]; then
        score=0
    fi

    echo -e "${BOLD}${CYAN}====================================================${NC}"

    echo ""
    printf "  Issues found: "
    [ "$CRITICAL_COUNT" -gt 0 ] && printf "${RED}%d critical${NC}  " "$CRITICAL_COUNT"
    [ "$WARNING_COUNT" -gt 0 ] && printf "${YELLOW}%d warning${NC}  " "$WARNING_COUNT"
    [ "$INFO_COUNT" -gt 0 ] && printf "${GREEN}%d info${NC}" "$INFO_COUNT"
    echo ""
    echo ""

    # Green from 80 up, yellow from 50 up, red below that.
    local score_color="$RED"
    if [ "$score" -ge 80 ]; then
        score_color="$GREEN"
    elif [ "$score" -ge 50 ]; then
        score_color="$YELLOW"
    fi

    echo -e "  ${BOLD}Health Score: ${score_color}${score} / 100${NC}"
}

# ============================================================================
# JSON OUTPUT
# ============================================================================

# Normalise a metric value to a plain integer literal that is safe to hand to
# `jq --argjson`. Accepts decimal and scientific notation; anything empty or
# non-numeric becomes 0. Fractions are truncated toward zero.
_json_int() {
    printf '%s\n' "${1:-0}" | awk 'NR == 1 {
        if ($1 ~ /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/) printf "%.0f", int($1)
        else printf "0"
    }'
}

# Emit the whole report as a single JSON document on stdout.
#
# Globals read:    RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, TARGETS_DATA,
#                  RULES_DATA, RECOMMENDATIONS, CRITICAL_COUNT, WARNING_COUNT,
#                  INFO_COUNT
# Globals written: NO_COLOR and the colour variables (cleared so that
#                  recommendation text carries no escape sequences)
# Helpers used:    collect_all_data, prom_metric, and every analyze_* section
#                  (run with output discarded, only to record recommendations)
output_json() {
    collect_all_data

    # Build JSON from all sections
    local version head_series head_chunks retention_time retention_size rss_bytes

    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"')

    local start_time now_time uptime_secs=0
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk, not bc: the metric may be in scientific notation (1.7e+09),
        # which bc rejects, leaving an empty value that breaks --argjson.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" \
            'BEGIN { d = now - start; printf "%.0f", (d > 0 ? int(d) : 0) }')
    fi
    uptime_secs=${uptime_secs:-0}

    head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    head_chunks=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    local compactions_total compactions_failed wal_corruptions ooo_total
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")

    local total_targets up_targets down_targets
    total_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length')
    up_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length')
    down_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length')

    local recording_rules alerting_rules
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')

    # Top metrics
    local top_metrics
    top_metrics=$(echo "$TSDB_STATUS" | jq '[.data.seriesCountByMetricName // [] | .[:10] | .[] | {name: .name, series: .value}]')

    # Run all checks to populate recommendations
    NO_COLOR=true
    RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""

    # Suppress output, just collect recommendations
    analyze_overview >/dev/null 2>&1
    analyze_tsdb >/dev/null 2>&1
    analyze_cardinality >/dev/null 2>&1
    analyze_queries >/dev/null 2>&1
    analyze_scrapes >/dev/null 2>&1
    analyze_rules >/dev/null 2>&1
    analyze_storage >/dev/null 2>&1
    analyze_memory >/dev/null 2>&1
    analyze_config >/dev/null 2>&1

    # Build the recommendations array in one jq pass. Each entry is
    # "SEVERITY|section|message"; the message is everything after the second
    # '|' and is JSON-escaped by jq itself, with no trailing newline.
    local rec_json="[]"
    if [ "${#RECOMMENDATIONS[@]}" -gt 0 ]; then
        rec_json=$(printf '%s\n' "${RECOMMENDATIONS[@]}" | jq -Rn '
            [inputs | select(length > 0) | split("|") |
             {severity: .[0], section: .[1], message: (.[2:] | join("|"))}]')
    fi
    [ -n "$rec_json" ] || rec_json="[]"

    local score=100
    score=$((score - (CRITICAL_COUNT * 15)))
    score=$((score - (WARNING_COUNT * 5)))
    score=$((score - (INFO_COUNT * 1)))
    [ "$score" -lt 0 ] && score=0

    # Every --argjson value is normalised first: an empty or non-numeric
    # string would make jq reject the whole document.
    jq -n \
       --arg version "$version" \
       --argjson uptime "$(_json_int "$uptime_secs")" \
       --argjson head_series "$(_json_int "$head_series")" \
       --argjson head_chunks "$(_json_int "$head_chunks")" \
       --arg retention_time "$retention_time" \
       --arg retention_size "$retention_size" \
       --argjson rss_bytes "$(_json_int "$rss_bytes")" \
       --argjson compactions_total "$(_json_int "$compactions_total")" \
       --argjson compactions_failed "$(_json_int "$compactions_failed")" \
       --argjson wal_corruptions "$(_json_int "$wal_corruptions")" \
       --argjson ooo_samples "$(_json_int "$ooo_total")" \
       --argjson total_targets "$(_json_int "$total_targets")" \
       --argjson up_targets "$(_json_int "$up_targets")" \
       --argjson down_targets "$(_json_int "$down_targets")" \
       --argjson recording_rules "$(_json_int "$recording_rules")" \
       --argjson alerting_rules "$(_json_int "$alerting_rules")" \
       --argjson top_metrics "${top_metrics:-[]}" \
       --argjson recommendations "$rec_json" \
       --argjson score "$score" \
       --argjson critical "$(_json_int "$CRITICAL_COUNT")" \
       --argjson warnings "$(_json_int "$WARNING_COUNT")" \
       --argjson info "$(_json_int "$INFO_COUNT")" \
        '{
            prometheus: {
                version: $version,
                uptime_seconds: $uptime,
                memory_rss_bytes: $rss_bytes
            },
            tsdb: {
                head_series: $head_series,
                head_chunks: $head_chunks,
                retention_time: $retention_time,
                retention_size: $retention_size,
                compactions_total: $compactions_total,
                compactions_failed: $compactions_failed,
                wal_corruptions: $wal_corruptions,
                out_of_order_samples: $ooo_samples
            },
            targets: {
                total: $total_targets,
                up: $up_targets,
                down: $down_targets
            },
            rules: {
                recording: $recording_rules,
                alerting: $alerting_rules
            },
            cardinality: {
                top_metrics: $top_metrics
            },
            health: {
                score: $score,
                critical: $critical,
                warnings: $warnings,
                info: $info
            },
            recommendations: $recommendations
        }'
}

# ============================================================================
# MAIN
# ============================================================================

# Program entry point: parse arguments, verify prerequisites, then emit
# either the JSON document or the text report.
#
# Arguments:     the script's command-line arguments (forwarded to parse_args)
# Globals read:  JSON_MODE, OUTPUT_FILE, SECTION, PROM_URL, BOLD, DIM, NC
# Exits:         1 when requirements are missing or --section is unknown
main() {
    parse_args "$@"

    if ! check_requirements; then
        exit 1
    fi

    # JSON mode
    if [ "$JSON_MODE" = true ]; then
        if [ -n "$OUTPUT_FILE" ]; then
            output_json > "$OUTPUT_FILE"
            echo "JSON report written to $OUTPUT_FILE" >&2
        else
            output_json
        fi
        return
    fi

    # Validate the requested section up front. The report body below runs as
    # one stage of a pipeline, i.e. in a subshell, so an `exit 1` issued from
    # in there would not stop the script or change its exit status.
    case "$SECTION" in
        ""|overview|tsdb|cardinality|queries|scrapes|rules|storage|memory|config|summary)
            ;;
        *)
            echo "Unknown section: $SECTION" >&2
            echo "Valid sections: overview tsdb cardinality queries scrapes rules storage memory config summary" >&2
            exit 1
            ;;
    esac

    # Text report mode
    collect_all_data

    {
        # Banner version matches the "Version" field in the file header.
        echo -e "${BOLD}Prometheus Performance Analyzer v1.01${NC}"
        echo -e "${DIM}Target: ${PROM_URL}${NC}"
        echo -e "${DIM}Date:   $(date '+%Y-%m-%d %H:%M:%S %Z')${NC}"

        if [ -n "$SECTION" ]; then
            case "$SECTION" in
                overview)    analyze_overview ;;
                tsdb)        analyze_tsdb ;;
                cardinality) analyze_cardinality ;;
                queries)     analyze_queries ;;
                scrapes)     analyze_scrapes ;;
                rules)       analyze_rules ;;
                storage)     analyze_storage ;;
                memory)      analyze_memory ;;
                config)      analyze_config ;;
                summary)     analyze_summary ;;
            esac
            # Always show summary if not already shown
            if [ "$SECTION" != "summary" ]; then
                analyze_summary
            fi
        else
            analyze_overview
            analyze_tsdb
            analyze_cardinality
            analyze_queries
            analyze_scrapes
            analyze_rules
            analyze_storage
            analyze_memory
            analyze_config
            analyze_summary
        fi

        echo ""
    } | if [ -n "$OUTPUT_FILE" ]; then
        cat > "$OUTPUT_FILE"
        echo "Report written to $OUTPUT_FILE" >&2
    else
        cat
    fi
}

# Script entry point: forward every command-line argument, unmodified, to main.
main "$@"