#!/bin/bash ################################################################################ # Script Name: prometheus-performance-analyzer.sh # Version: 1.01 # Description: Diagnostic tool that analyzes Prometheus server performance. # Queries TSDB status, runtime info, flags, config, targets, # rules, and internal metrics to produce a detailed report # with actionable recommendations. # # Author: Phil Connor # Contact: contact@mylinux.work # Website: https://mylinux.work # License: MIT # # Prerequisites: # - curl # - jq # - bc (for calculations) # - Network access to Prometheus API # # Usage: # # Analyze local Prometheus # ./prometheus-performance-analyzer.sh # # # Analyze remote Prometheus # ./prometheus-performance-analyzer.sh --url http://prometheus:9090 # # # JSON output for automation # ./prometheus-performance-analyzer.sh --json # # # Analyze specific section only # ./prometheus-performance-analyzer.sh --section cardinality # # # Save report to file (auto-disables color) # ./prometheus-performance-analyzer.sh -o report.txt # # # Custom series threshold # ./prometheus-performance-analyzer.sh --threshold-series 5000000 # # Sections: # overview - Version, uptime, series counts, retention # tsdb - TSDB head stats, compaction, WAL health # cardinality - High cardinality metrics, labels, label-value pairs # queries - Query engine performance and latency # scrapes - Scrape target health and duration analysis # rules - Recording/alerting rule evaluation # storage - Disk, retention, WAL, growth estimation # memory - RSS, Go heap, GC, goroutines # config - Configuration review and best practices # summary - Health score and all recommendations # ################################################################################ # ============================================================================ # CONFIGURATION & DEFAULTS # ============================================================================ PROM_URL="http://localhost:9090" OUTPUT_FILE="" 
JSON_MODE=false NO_COLOR=false SECTION="" THRESHOLD_SERIES=1000000 # Colors RED='\033[0;31m' YELLOW='\033[1;33m' GREEN='\033[0;32m' CYAN='\033[0;36m' BOLD='\033[1m' DIM='\033[2m' NC='\033[0m' # Global state declare -a RECOMMENDATIONS=() CRITICAL_COUNT=0 WARNING_COUNT=0 INFO_COUNT=0 # Cached API responses TSDB_STATUS="" RUNTIME_INFO="" FLAGS_DATA="" CONFIG_DATA="" TARGETS_DATA="" RULES_DATA="" METRICS_RAW="" # ============================================================================ # HELPER FUNCTIONS # ============================================================================ show_usage() { cat <&2; exit 1 ;; esac done # Strip trailing slash PROM_URL="${PROM_URL%/}" # Disable colors if requested if [ "$NO_COLOR" = true ]; then RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC="" fi } check_requirements() { local missing=0 for cmd in curl jq awk; do if ! command -v "$cmd" >/dev/null 2>&1; then echo "ERROR: $cmd not found" >&2 missing=1 fi done # Auto-install bc if missing if ! command -v bc >/dev/null 2>&1; then echo "bc not found -- installing..." >&2 if command -v apt-get >/dev/null 2>&1; then sudo apt-get install -y bc >/dev/null 2>&1 elif command -v dnf >/dev/null 2>&1; then sudo dnf install -y bc >/dev/null 2>&1 elif command -v yum >/dev/null 2>&1; then sudo yum install -y bc >/dev/null 2>&1 fi if ! 
# Verify that every external tool the analyzer relies on is available.
# curl, jq and awk are hard requirements. bc is installed on the fly with
# sudo through apt-get, dnf or yum (first one found) when it is missing.
# Outputs: diagnostics on stderr
# Returns: 0 when all tools are present, 1 otherwise
check_requirements() {
    local missing=0
    local cmd

    for cmd in curl jq awk; do
        if ! command -v "$cmd" >/dev/null 2>&1; then
            echo "ERROR: $cmd not found" >&2
            missing=1
        fi
    done

    # Auto-install bc if missing
    if ! command -v bc >/dev/null 2>&1; then
        echo "bc not found -- installing..." >&2
        if command -v apt-get >/dev/null 2>&1; then
            sudo apt-get install -y bc >/dev/null 2>&1
        elif command -v dnf >/dev/null 2>&1; then
            sudo dnf install -y bc >/dev/null 2>&1
        elif command -v yum >/dev/null 2>&1; then
            sudo yum install -y bc >/dev/null 2>&1
        fi
        if ! command -v bc >/dev/null 2>&1; then
            echo "ERROR: failed to install bc -- install it manually" >&2
            missing=1
        fi
    fi

    return "$missing"
}

# Query Prometheus API endpoint -- returns JSON body
# Globals:   PROM_URL (read)
# Arguments: $1 - path (plus optional query string) appended to PROM_URL
# Outputs:   response body on stdout
# Returns:   curl's exit status; -f makes HTTP error responses non-zero
prom_api() {
    local endpoint="$1"
    curl -sf --connect-timeout 5 --max-time 15 "${PROM_URL}${endpoint}" 2>/dev/null
}

# Query a Prometheus metric via instant query API -- returns numeric value
# Arguments: $1 - PromQL expression
# Outputs:   value of the first result sample, or "0" when there is none
# Returns:   1 when the request fails or the response cannot be parsed
prom_query() {
    local query="$1"
    local encoded result value

    encoded=$(printf '%s' "$query" | jq -sRr @uri)
    result=$(prom_api "/api/v1/query?query=${encoded}")
    if [ -z "$result" ]; then
        echo "0"
        return 1
    fi

    # Always print a number: an unparsable body used to produce empty output.
    if ! value=$(printf '%s\n' "$result" \
        | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null); then
        echo "0"
        return 1
    fi
    echo "${value:-0}"
}

# Extract a metric value from raw /metrics text
# Only label-less samples are matched (the first field must equal the name).
# Globals:   METRICS_RAW (read)
# Arguments: $1 - exact metric name
# Outputs:   the sample value as written by Prometheus, or "0" when absent
prom_metric() {
    local metric="$1"
    if [ -z "$METRICS_RAW" ]; then
        echo "0"
        return
    fi

    local val
    # Exact string comparison in a single awk pass; the whole input is
    # consumed so the producer never sees a closed pipe.
    val=$(printf '%s\n' "$METRICS_RAW" \
        | awk -v name="$metric" '!seen && $1 == name { print $2; seen = 1 }')
    echo "${val:-0}"
}

# Extract a metric with labels from raw /metrics text
# Globals:   METRICS_RAW (read)
# Arguments: $1 - line prefix to match, e.g. 'some_metric{'
# Outputs:   every matching exposition line, unchanged
prom_metric_labeled() {
    local pattern="$1"
    if [ -z "$METRICS_RAW" ]; then
        return
    fi
    printf '%s\n' "$METRICS_RAW" | grep "^${pattern}" 2>/dev/null
}

# Format bytes to human-readable
# Arguments: $1 - byte count (empty, "0" and "null" all render as "0 B")
# Outputs:   value scaled by powers of 1024 with a B/KB/MB/GB/TB suffix
format_bytes() {
    local bytes="$1"
    if [ -z "$bytes" ] || [ "$bytes" = "0" ] || [ "$bytes" = "null" ]; then
        echo "0 B"
        return
    fi
    echo "$bytes" | awk '{
        if ($1 >= 1099511627776) printf "%.1f TB", $1/1099511627776
        else if ($1 >= 1073741824) printf "%.1f GB", $1/1073741824
        else if ($1 >= 1048576) printf "%.1f MB", $1/1048576
        else if ($1 >= 1024) printf "%.1f KB", $1/1024
        else printf "%d B", $1
    }'
}
# Format seconds to human-readable duration
# Arguments: $1 - seconds (fractions are truncated)
# Outputs:   e.g. "1d 2h 3m 4s"; zero-valued units are omitted, "0s" for
#            empty/zero input
format_duration() {
    local total="$1"
    if [ -z "$total" ] || [ "$total" = "0" ]; then
        echo "0s"
        return
    fi

    # Handle float seconds
    local secs
    secs=$(echo "$total" | awk '{printf "%d", $1}')

    local days=$((secs / 86400))
    local hours=$(( (secs % 86400) / 3600 ))
    local mins=$(( (secs % 3600) / 60 ))
    local s=$((secs % 60))

    local result=""
    if [ "$days" -gt 0 ]; then
        result="${days}d "
    fi
    if [ "$hours" -gt 0 ]; then
        result="${result}${hours}h "
    fi
    if [ "$mins" -gt 0 ]; then
        result="${result}${mins}m "
    fi
    # Seconds are shown when non-zero, or when nothing else was printed.
    if [ "$s" -gt 0 ] || [ -z "$result" ]; then
        result="${result}${s}s"
    fi
    echo "${result% }"
}

# Format number with commas
# Grouping comes from printf's "'" flag, so it depends on the active locale.
# Arguments: $1 - number (empty and "null" render as "0")
# Outputs:   the rounded, grouped number; non-numeric input is echoed as is
format_number() {
    local n="$1"
    if [ -z "$n" ] || [ "$n" = "null" ]; then
        echo "0"
        return
    fi

    # Capture first: on invalid input printf still writes "0" before failing,
    # which used to be glued in front of the fallback text.
    local formatted
    if formatted=$(printf "%'.0f" "$n" 2>/dev/null); then
        printf '%s' "$formatted"
    else
        echo "$n"
    fi
}

# Add a recommendation
# Globals:   RECOMMENDATIONS, CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT (written)
# Arguments: $1 - severity (CRITICAL, WARNING or INFO are counted)
#            $2 - section name
#            $3 - message
add_recommendation() {
    local severity="$1"
    local section="$2"
    local message="$3"

    case "$severity" in
        CRITICAL) CRITICAL_COUNT=$((CRITICAL_COUNT + 1)) ;;
        WARNING) WARNING_COUNT=$((WARNING_COUNT + 1)) ;;
        INFO) INFO_COUNT=$((INFO_COUNT + 1)) ;;
    esac

    RECOMMENDATIONS+=("${severity}|${section}|${message}")
}

# Print section header
# Arguments: $1 - title
print_header() {
    local title="$1"
    echo ""
    echo -e "${BOLD}${CYAN}====================================================${NC}"
    echo -e "${BOLD}${CYAN} ${title}${NC}"
    echo -e "${BOLD}${CYAN}====================================================${NC}"
    echo ""
}

# Print a metric line (label + value, aligned)
# The value goes through %s, so escape sequences in it are NOT interpreted.
# Arguments: $1 - label, $2 - value
print_metric() {
    local label="$1"
    local value="$2"
    printf " ${DIM}%-30s${NC} %s\n" "$label:" "$value"
}

# Print severity tag
# Arguments: $1 - CRITICAL, WARNING, INFO or OK (anything else prints nothing)
severity_tag() {
    local sev="$1"
    case "$sev" in
        CRITICAL) echo -e "${RED}[CRITICAL]${NC}" ;;
        WARNING) echo -e "${YELLOW}[WARNING]${NC}" ;;
        INFO) echo -e "${GREEN}[INFO]${NC}" ;;
        OK) echo -e "${GREEN}[OK]${NC}" ;;
    esac
}

# Print a status line
# Arguments: $1 - severity (see severity_tag), $2 - message
print_status() {
    local severity="$1"
    local message="$2"
    echo -e " $(severity_tag "$severity") $message"
}
# Fetch every API document and the raw /metrics text once and cache them in
# globals for the analysis sections.
# Globals:   PROM_URL, DIM, RED, NC (read); RUNTIME_INFO, TSDB_STATUS,
#            FLAGS_DATA, CONFIG_DATA, TARGETS_DATA, RULES_DATA,
#            METRICS_RAW (written)
# Outputs:   progress and error messages on stderr
# Exits:     1 when Prometheus cannot be reached
collect_all_data() {
    echo -e "${DIM}Collecting data from ${PROM_URL}...${NC}" >&2

    # Check connectivity first. The probe response is kept, so runtimeinfo
    # is requested only once.
    if ! RUNTIME_INFO=$(prom_api "/api/v1/status/runtimeinfo"); then
        echo -e "${RED}ERROR: Cannot reach Prometheus at ${PROM_URL}${NC}" >&2
        echo "Check the URL and ensure Prometheus is running." >&2
        exit 1
    fi

    TSDB_STATUS=$(prom_api "/api/v1/status/tsdb")
    FLAGS_DATA=$(prom_api "/api/v1/status/flags")
    CONFIG_DATA=$(prom_api "/api/v1/status/config")
    TARGETS_DATA=$(prom_api "/api/v1/targets")
    RULES_DATA=$(prom_api "/api/v1/rules")
    # prom_api only prefixes PROM_URL, so it serves /metrics as well.
    METRICS_RAW=$(prom_api "/metrics")

    echo -e "${DIM}Data collection complete.${NC}" >&2
}
# ============================================================================
# SECTION: OVERVIEW
# ============================================================================

# Print version, uptime, runtime and head statistics, and flag a head series
# count above THRESHOLD_SERIES (WARNING) or five times it (CRITICAL).
# Globals: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, THRESHOLD_SERIES (read)
# Outputs: the "Overview" report section on stdout
analyze_overview() {
    print_header "Overview"

    # Runtime info. goroutineCount is the field name documented for
    # /api/v1/status/runtimeinfo; the old lookup is kept as a fallback.
    local version goroutines gomaxprocs
    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"')
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // "0"')
    gomaxprocs=$(echo "$RUNTIME_INFO" | jq -r '.data.GOMAXPROCS // "0"')

    # runtimeinfo does not carry the version; take it from the
    # prometheus_build_info sample in the cached /metrics text.
    if [ -z "$version" ] || [ "$version" = "unknown" ]; then
        local build_version
        build_version=$(prom_metric_labeled "prometheus_build_info{" \
            | sed -n 's/.*[{,]version="\([^"]*\)".*/\1/p' \
            | head -1)
        version="${build_version:-unknown}"
    fi

    # Uptime from process_start_time_seconds. awk is used because the value
    # is normally written in exponent form (1.7e+09), which bc cannot parse.
    local start_time now_time uptime_secs
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" \
            'BEGIN { printf "%d", now - start }')
    else
        uptime_secs=0
    fi

    # Series and samples from TSDB
    local head_series head_chunks
    head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    head_chunks=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')

    # Retention from flags
    local retention_time retention_size
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')

    # Memory
    local rss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    print_metric "Prometheus version" "$version"
    print_metric "Uptime" "$(format_duration "$uptime_secs")"
    print_metric "GOMAXPROCS" "$gomaxprocs"
    print_metric "Goroutines" "$(format_number "$goroutines")"
    print_metric "Head series" "$(format_number "$head_series")"
    print_metric "Head chunks" "$(format_number "$head_chunks")"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    print_metric "Memory (RSS)" "$(format_bytes "$rss_bytes")"

    # Series threshold check (2>/dev/null hides non-integer comparisons)
    if [ "$head_series" -gt "$((THRESHOLD_SERIES * 5))" ] 2>/dev/null; then
        add_recommendation "CRITICAL" "overview" "Head series count $(format_number "$head_series") is very high (>$(format_number "$((THRESHOLD_SERIES * 5))")) -- investigate high cardinality metrics immediately"
    elif [ "$head_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        add_recommendation "WARNING" "overview" "Head series count $(format_number "$head_series") exceeds threshold $(format_number "$THRESHOLD_SERIES") -- review cardinality section"
    fi
}
# ============================================================================
# SECTION: TSDB
# ============================================================================

# Truncate a value read from /metrics to a plain integer so it is safe for
# `[ -gt ]`; exponent-formatted values (2e+06) would otherwise make the test
# fail silently.
# Arguments: $1 - metric value (empty counts as 0)
_tsdb_counter_int() {
    echo "${1:-0}" | awk '{printf "%d", $1}'
}

# Report TSDB head statistics plus compaction, WAL, out-of-order, head GC,
# checkpoint and tombstone metrics, and raise recommendations for failures.
# Globals: TSDB_STATUS (read)
# Outputs: the "TSDB Health" report section on stdout
analyze_tsdb() {
    print_header "TSDB Health"

    if [ -z "$TSDB_STATUS" ]; then
        echo " (TSDB status endpoint not available)"
        return
    fi

    # Head stats
    local num_series chunk_count min_time max_time num_label_pairs
    num_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    chunk_count=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')
    min_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.minTime // 0')
    max_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.maxTime // 0')
    num_label_pairs=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numLabelPairs // 0')

    # Calculate head block time range (the difference is divided by 1000 to
    # get seconds)
    local head_range_secs=0
    if [ "$min_time" -gt 0 ] 2>/dev/null && [ "$max_time" -gt 0 ] 2>/dev/null; then
        head_range_secs=$(( (max_time - min_time) / 1000 ))
    fi

    print_metric "Head series" "$(format_number "$num_series")"
    print_metric "Head chunks" "$(format_number "$chunk_count")"
    print_metric "Head label pairs" "$(format_number "$num_label_pairs")"
    print_metric "Head block range" "$(format_duration "$head_range_secs")"
    echo ""

    # Compaction metrics from /metrics
    local compactions_total compactions_failed compaction_duration
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    compaction_duration=$(prom_metric "prometheus_tsdb_compaction_duration_seconds_sum")

    print_metric "Compactions total" "$(format_number "$compactions_total")"
    print_metric "Compaction failures" "$compactions_failed"
    if [ -n "$compaction_duration" ] && [ "$compaction_duration" != "0" ]; then
        print_metric "Compaction time (total)" "$(format_duration "$compaction_duration")"
    fi

    if [ "$(_tsdb_counter_int "$compactions_failed")" -gt 0 ] 2>/dev/null; then
        add_recommendation "CRITICAL" "tsdb" "TSDB has $compactions_failed compaction failures -- investigate storage health (disk I/O, free space)"
    fi
    echo ""

    # WAL stats
    local wal_corruptions wal_truncate_total wal_truncate_failed
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    wal_truncate_total=$(prom_metric "prometheus_tsdb_wal_truncations_total")
    wal_truncate_failed=$(prom_metric "prometheus_tsdb_wal_truncations_failed_total")

    print_metric "WAL corruptions" "$wal_corruptions"
    print_metric "WAL truncations" "$(format_number "$wal_truncate_total")"
    print_metric "WAL truncation failures" "$wal_truncate_failed"

    if [ "$(_tsdb_counter_int "$wal_corruptions")" -gt 0 ] 2>/dev/null; then
        add_recommendation "CRITICAL" "tsdb" "WAL has $wal_corruptions corruption(s) -- check disk health, consider running promtool tsdb clean-tombstones"
    fi
    if [ "$(_tsdb_counter_int "$wal_truncate_failed")" -gt 0 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "WAL has $wal_truncate_failed truncation failure(s) -- may cause WAL growth"
    fi
    echo ""

    # Out-of-order samples
    local ooo_total
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")
    print_metric "Out-of-order samples" "$(format_number "$ooo_total")"
    if [ "$(_tsdb_counter_int "$ooo_total")" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "$(format_number "$ooo_total") out-of-order samples -- check NTP sync across targets or look for duplicate scraper configs"
    fi

    # Head GC
    local head_gc_duration
    head_gc_duration=$(prom_metric "prometheus_tsdb_head_gc_duration_seconds_sum")
    if [ -n "$head_gc_duration" ] && [ "$head_gc_duration" != "0" ]; then
        print_metric "Head GC time (total)" "$(format_duration "$head_gc_duration")"
    fi

    # Checkpoint creations
    local checkpoint_total checkpoint_failed
    checkpoint_total=$(prom_metric "prometheus_tsdb_checkpoint_creations_total")
    checkpoint_failed=$(prom_metric "prometheus_tsdb_checkpoint_creations_failed_total")
    print_metric "Checkpoints created" "$(format_number "$checkpoint_total")"
    if [ "$(_tsdb_counter_int "$checkpoint_failed")" -gt 0 ] 2>/dev/null; then
        print_metric "Checkpoint failures" "$checkpoint_failed"
        add_recommendation "WARNING" "tsdb" "$checkpoint_failed checkpoint creation failure(s) -- investigate disk health"
    fi

    # Tombstone cleanup
    local tombstones
    tombstones=$(prom_metric "prometheus_tsdb_tombstone_cleanup_seconds_sum")
    if [ -n "$tombstones" ] && [ "$tombstones" != "0" ]; then
        print_metric "Tombstone cleanup time" "$(format_duration "$tombstones")"
    fi
}
# ============================================================================
# SECTION: CARDINALITY
# ============================================================================

# Emit the first ten entries of one TSDB status table as "value<TAB>name"
# lines, using a single jq pass instead of two jq calls per row.
# Globals:   TSDB_STATUS (read)
# Arguments: $1 - key under .data, e.g. seriesCountByMetricName
_cardinality_top_rows() {
    echo "$TSDB_STATUS" | jq -r --arg key "$1" '
        (.data[$key] // [])[:10][]
        | "\(.value // 0)\t\((.name // "") | tostring | gsub("\n"; " "))"
    '
}

# Print the highest-cardinality metrics, labels and label-value pairs plus
# per-label memory use, and recommend action for heavy hitters.
# Globals: TSDB_STATUS, THRESHOLD_SERIES, BOLD, NC (read)
# Outputs: the "High Cardinality Analysis" report section on stdout
analyze_cardinality() {
    print_header "High Cardinality Analysis"

    if [ -z "$TSDB_STATUS" ]; then
        echo " (TSDB status endpoint not available)"
        return
    fi

    local total_series
    total_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')

    local rows idx name count pct

    # Top metrics by series count
    echo -e " ${BOLD}Top Metrics by Series Count${NC}"
    echo ""
    rows=$(_cardinality_top_rows "seriesCountByMetricName")
    if [ -n "$rows" ]; then
        idx=0
        while IFS=$'\t' read -r count name; do
            idx=$((idx + 1))
            [ -n "$name" ] || continue
            if [ "$total_series" -gt 0 ] 2>/dev/null; then
                # Truncated to one decimal place; always has a leading digit.
                pct=$(awk -v c="$count" -v t="$total_series" \
                    'BEGIN { printf "%.1f", int(c * 1000 / t) / 10 }')
            else
                pct="0"
            fi
            printf " %2d. %-45s %10s (%5s%%)\n" "$idx" "$name" "$(format_number "$count")" "$pct"
            # Flag metrics consuming > 10% of total
            if [ "${pct%%.*}" -ge 10 ] 2>/dev/null; then
                add_recommendation "WARNING" "cardinality" "$name has $(format_number "$count") series (${pct}% of total) -- consider adding metric_relabel_configs to drop unused label dimensions"
            fi
        done <<< "$rows"
    else
        echo " (no data available)"
    fi
    echo ""

    # Top labels by value count
    echo -e " ${BOLD}Top Labels by Value Count${NC}"
    echo ""
    rows=$(_cardinality_top_rows "labelValueCountByLabelName")
    if [ -n "$rows" ]; then
        idx=0
        while IFS=$'\t' read -r count name; do
            idx=$((idx + 1))
            [ -n "$name" ] || continue
            printf " %2d. %-45s %10s values\n" "$idx" "$name" "$(format_number "$count")"
            # Flag labels with very high value counts
            if [ "$count" -gt 10000 ] 2>/dev/null; then
                add_recommendation "WARNING" "cardinality" "Label '$name' has $(format_number "$count") unique values -- high cardinality label, consider relabeling or dropping"
            fi
        done <<< "$rows"
    else
        echo " (no data available)"
    fi
    echo ""

    # Top label-value pairs by series count
    echo -e " ${BOLD}Top Label-Value Pairs by Series Count${NC}"
    echo ""
    rows=$(_cardinality_top_rows "seriesCountByLabelValuePair")
    if [ -n "$rows" ]; then
        idx=0
        while IFS=$'\t' read -r count name; do
            idx=$((idx + 1))
            [ -n "$name" ] || continue
            printf " %2d. %-45s %10s series\n" "$idx" "$name" "$(format_number "$count")"
        done <<< "$rows"
    else
        echo " (no data available)"
    fi
    echo ""

    # Overall cardinality assessment
    if [ "$total_series" -gt "$((THRESHOLD_SERIES * 5))" ] 2>/dev/null; then
        print_status "CRITICAL" "Total series $(format_number "$total_series") -- well above recommended limits"
    elif [ "$total_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        print_status "WARNING" "Total series $(format_number "$total_series") -- above threshold $(format_number "$THRESHOLD_SERIES")"
    else
        print_status "OK" "Total series $(format_number "$total_series") -- within normal range"
    fi

    # Memory by label name; the table is omitted entirely when absent
    rows=$(_cardinality_top_rows "memoryInBytesByLabelName")
    if [ -n "$rows" ]; then
        echo ""
        echo -e " ${BOLD}Top Labels by Memory Usage${NC}"
        echo ""
        idx=0
        while IFS=$'\t' read -r count name; do
            idx=$((idx + 1))
            [ -n "$name" ] || continue
            printf " %2d. %-45s %10s\n" "$idx" "$name" "$(format_bytes "$count")"
        done <<< "$rows"
    fi
}
# ============================================================================
# SECTION: QUERIES
# ============================================================================

# Report query engine settings, query duration quantiles and current load,
# and warn about slow inner evaluation or exhausted concurrency.
# Globals: FLAGS_DATA, BOLD, NC (read)
# Outputs: the "Query Performance" report section on stdout
analyze_queries() {
    print_header "Query Performance"

    # Query engine settings from flags
    local max_concurrency query_timeout lookback_delta
    max_concurrency=$(echo "$FLAGS_DATA" | jq -r '.data["query.max-concurrency"] // "unknown"')
    query_timeout=$(echo "$FLAGS_DATA" | jq -r '.data["query.timeout"] // "unknown"')
    lookback_delta=$(echo "$FLAGS_DATA" | jq -r '.data["query.lookback-delta"] // "unknown"')

    print_metric "Max concurrent queries" "$max_concurrency"
    print_metric "Query timeout" "$query_timeout"
    print_metric "Lookback delta" "$lookback_delta"
    echo ""

    # Query duration quantiles from /metrics
    echo -e " ${BOLD}Query Duration Percentiles${NC}"
    echo ""

    local has_query_metrics=false

    # prometheus_engine_query_duration_seconds samples carrying quantile and
    # slice labels
    local query_durations
    query_durations=$(prom_metric_labeled "prometheus_engine_query_duration_seconds{")

    if [ -n "$query_durations" ]; then
        has_query_metrics=true
        # Label extraction uses bash regexes kept in variables, so no GNU-only
        # `grep -P` is required.
        local re_quantile='quantile="([^"]+)"'
        local re_slice='slice="([^"]+)"'
        local line quantile slice_name value label
        while IFS= read -r line; do
            quantile=""
            slice_name=""
            if [[ "$line" =~ $re_quantile ]]; then
                quantile="${BASH_REMATCH[1]}"
            fi
            if [[ "$line" =~ $re_slice ]]; then
                slice_name="${BASH_REMATCH[1]}"
            fi
            # The sample value is the last whitespace-separated field.
            value="${line##*[[:space:]]}"

            if [ -n "$quantile" ] && [ -n "$value" ] && [ "$value" != "NaN" ]; then
                label="p$(echo "$quantile" | awk '{printf "%g", $1 * 100}')"
                if [ -n "$slice_name" ]; then
                    label="${label} (${slice_name})"
                fi
                printf " %-35s %s\n" "$label" "$(echo "$value" | awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
            fi
        done <<< "$query_durations"
    fi

    if [ "$has_query_metrics" = false ]; then
        echo " (query duration metrics not available)"
    fi
    echo ""

    # Query performance from instant metrics
    local queries_active
    queries_active=$(prom_metric "prometheus_engine_queries")
    print_metric "Active queries (now)" "$queries_active"

    local queries_total
    queries_total=$(prom_metric "prometheus_engine_query_samples_total")
    if [ "$queries_total" != "0" ] && [ -n "$queries_total" ]; then
        print_metric "Total query samples" "$(format_number "$queries_total")"
    fi

    # Check for slow queries
    local p99_inner
    p99_inner=$(echo "$query_durations" | grep 'quantile="0.99"' | grep 'inner_eval' | awk '{print $NF}' | head -1)
    if [ -n "$p99_inner" ] && [ "$p99_inner" != "NaN" ]; then
        local p99_secs
        p99_secs=$(echo "$p99_inner" | awk '{printf "%d", $1}')
        if [ "$p99_secs" -gt 10 ] 2>/dev/null; then
            add_recommendation "WARNING" "queries" "p99 inner eval query latency is ${p99_inner}s -- consider adding recording rules for complex queries or reducing cardinality"
        fi
    fi

    # Concurrent query check
    if [ "$max_concurrency" != "unknown" ] && [ "$queries_active" != "0" ]; then
        local active_num max_num
        active_num=$(echo "$queries_active" | awk '{printf "%d", $1}')
        max_num=$(echo "$max_concurrency" | awk '{printf "%d", $1}')
        if [ "$active_num" -ge "$max_num" ] 2>/dev/null; then
            add_recommendation "WARNING" "queries" "Active queries ($active_num) at or near max-concurrency ($max_num) -- consider increasing --query.max-concurrency"
        fi
    fi
}
# ============================================================================
# SECTION: SCRAPES
# ============================================================================

# Convert a Prometheus duration string to seconds. Compound forms such as
# "1m30s" and sub-second forms such as "500ms" are handled; a bare number is
# taken as seconds.
# Arguments: $1 - duration string
# Outputs:   number of seconds (may be fractional)
# Returns:   1 when the string is empty or not a valid duration
_scrape_duration_secs() {
    local rest="$1"
    local total_ms=0
    local amount unit factor

    [ -n "$rest" ] || return 1
    if [[ "$rest" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        printf '%s' "$rest"
        return 0
    fi

    while [ -n "$rest" ]; do
        [[ "$rest" =~ ^([0-9]+)([a-z]+)(.*)$ ]] || return 1
        amount="${BASH_REMATCH[1]}"
        unit="${BASH_REMATCH[2]}"
        rest="${BASH_REMATCH[3]}"
        case "$unit" in
            ms) factor=1 ;;
            s) factor=1000 ;;
            m) factor=60000 ;;
            h) factor=3600000 ;;
            d) factor=86400000 ;;
            w) factor=604800000 ;;
            y) factor=31536000000 ;;
            *) return 1 ;;
        esac
        # 10# keeps zero-padded amounts from being read as octal.
        total_ms=$((total_ms + 10#$amount * factor))
    done

    awk -v ms="$total_ms" 'BEGIN { printf "%g", ms / 1000 }'
}

# Summarize scrape target health, list down targets, compare each job's scrape
# duration with its interval, and report sample-limit problems.
# Globals: TARGETS_DATA, CONFIG_DATA, GREEN, RED, BOLD, NC (read)
# Outputs: the "Scrape Performance" report section on stdout
analyze_scrapes() {
    print_header "Scrape Performance"

    if [ -z "$TARGETS_DATA" ]; then
        echo " (targets endpoint not available)"
        return
    fi

    # Count targets by health
    local total_targets up_targets down_targets unknown_targets
    total_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length')
    up_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length')
    down_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length')
    unknown_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health != "up" and .health != "down")] | length')

    # print_metric writes its value with %s, so the color escapes are
    # rendered here first; otherwise they appear as literal "\033[...".
    print_metric "Total targets" "$total_targets"
    print_metric "Targets up" "$(printf '%b' "${GREEN}${up_targets}${NC}")"
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets down" "$(printf '%b' "${RED}${down_targets}${NC}")"
    else
        print_metric "Targets down" "$down_targets"
    fi
    if [ "$unknown_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets unknown" "$unknown_targets"
    fi
    echo ""

    # List down targets
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        echo -e " ${BOLD}${RED}Down Targets${NC}"
        echo ""
        echo "$TARGETS_DATA" | jq -r '
            .data.activeTargets // [] | .[] | select(.health == "down") |
            " \(.labels.job // "unknown") \(.labels.instance // .scrapeUrl) \(.lastError // "no error")"
        ' 2>/dev/null | head -20
        echo ""
        add_recommendation "CRITICAL" "scrapes" "$down_targets scrape target(s) are down -- check target availability"
    fi

    # Scrape duration analysis per job
    echo -e " ${BOLD}Scrape Duration by Job${NC}"
    echo ""
    printf " %-30s %10s %10s %10s %8s\n" "Job" "Avg" "Max" "Interval" "Status"
    printf " %-30s %10s %10s %10s %8s\n" "---" "---" "---" "--------" "------"

    # Get global scrape interval from config
    local global_interval=""
    if [ -n "$CONFIG_DATA" ]; then
        global_interval=$(echo "$CONFIG_DATA" | jq -r '.data.yaml' 2>/dev/null | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    fi
    [ -z "$global_interval" ] && global_interval="60s"

    # Parse interval to seconds
    local global_interval_secs
    global_interval_secs=$(_scrape_duration_secs "$global_interval") || global_interval_secs=60

    # Process substitution keeps the loop in the current shell, so the
    # recommendations it adds are not lost with a pipeline subshell.
    local job avg max interval
    local avg_fmt max_fmt effective_interval interval_secs status max_num interval_80
    while IFS='|' read -r job avg max interval; do
        [ -z "$job" ] && continue

        avg_fmt=$(echo "$avg" | awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}')
        max_fmt=$(echo "$max" | awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}')

        effective_interval="${interval:-$global_interval}"
        interval_secs=$(_scrape_duration_secs "$effective_interval") || interval_secs="$global_interval_secs"

        # Both sides are scaled by 100 so the 80% test is an integer compare.
        status="OK"
        max_num=$(echo "$max" | awk '{printf "%.0f", $1 * 100}')
        interval_80=$(echo "$interval_secs" | awk '{printf "%.0f", $1 * 80}')
        if [ "$max_num" -gt "$interval_80" ] 2>/dev/null; then
            status="${RED}SLOW${NC}"
            add_recommendation "WARNING" "scrapes" "Job '$job' max scrape duration (${max_fmt}) exceeds 80% of scrape interval (${effective_interval}) -- increase interval or optimize target"
        fi

        printf " %-30s %10s %10s %10s %b\n" "$job" "$avg_fmt" "$max_fmt" "$effective_interval" "$status"
    done < <(echo "$TARGETS_DATA" | jq -r '
        [.data.activeTargets // [] | .[] | select(.health == "up") |
         {job: (.labels.job // "unknown"), duration: .lastScrapeDuration, interval: .scrapeInterval}] |
        group_by(.job) | .[] |
        {
            job: .[0].job,
            avg: ([.[].duration] | add / length),
            max: ([.[].duration] | max),
            interval: .[0].interval,
            count: length
        } |
        "\(.job)|\(.avg)|\(.max)|\(.interval // "")"
    ' 2>/dev/null)
    echo ""

    # Scrape sample stats
    local exceeded dropped
    exceeded=$(prom_metric "prometheus_target_scrapes_exceeded_sample_limit_total")
    dropped=$(prom_metric "prometheus_target_scrapes_sample_duplicate_timestamp_total")

    if [ "$exceeded" != "0" ] && [ -n "$exceeded" ]; then
        local exceeded_num
        exceeded_num=$(echo "$exceeded" | awk '{printf "%d", $1}')
        if [ "$exceeded_num" -gt 0 ] 2>/dev/null; then
            print_metric "Sample limit exceeded" "$(format_number "$exceeded")"
            add_recommendation "WARNING" "scrapes" "$(format_number "$exceeded") scrapes exceeded sample_limit -- increase sample_limit in scrape config or reduce target metrics"
        fi
    fi
    if [ "$dropped" != "0" ] && [ -n "$dropped" ]; then
        local dropped_num
        dropped_num=$(echo "$dropped" | awk '{printf "%d", $1}')
        if [ "$dropped_num" -gt 0 ] 2>/dev/null; then
            print_metric "Duplicate timestamps" "$(format_number "$dropped")"
        fi
    fi
}
# ============================================================================
# SECTION: RULES
# ============================================================================

# Count rule groups and rules, report evaluation failures, and flag groups
# whose last evaluation took more than half of their interval.
# Globals: RULES_DATA, RED, YELLOW, BOLD, NC (read)
# Outputs: the "Rule Evaluation" report section on stdout
analyze_rules() {
    print_header "Rule Evaluation"

    if [ -z "$RULES_DATA" ]; then
        echo " (rules endpoint not available)"
        return
    fi

    # Count rules
    local recording_rules alerting_rules
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')

    local total_groups
    total_groups=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[]] | length')

    print_metric "Rule groups" "$total_groups"
    print_metric "Recording rules" "$recording_rules"
    print_metric "Alerting rules" "$alerting_rules"
    echo ""

    # Rule evaluation failures
    local eval_failures
    eval_failures=$(prom_metric "prometheus_rule_evaluation_failures_total")
    if [ "$eval_failures" != "0" ] && [ -n "$eval_failures" ]; then
        local fail_num
        fail_num=$(echo "$eval_failures" | awk '{printf "%d", $1}')
        if [ "$fail_num" -gt 0 ] 2>/dev/null; then
            # print_metric writes its value with %s, so the color escapes are
            # rendered here first.
            print_metric "Evaluation failures" "$(printf '%b' "${RED}$(format_number "$eval_failures")${NC}")"
            add_recommendation "WARNING" "rules" "$(format_number "$eval_failures") rule evaluation failures -- check rule syntax and query targets"
        fi
    fi

    # Group evaluation durations
    echo -e " ${BOLD}Rule Group Evaluation Duration${NC}"
    echo ""
    printf " %-40s %12s %12s %8s\n" "Group" "Last Eval" "Interval" "Status"
    printf " %-40s %12s %12s %8s\n" "-----" "---------" "--------" "------"

    # Process substitution keeps the loop in the current shell, so the
    # recommendations it adds are not lost with a pipeline subshell.
    local name _last_eval eval_time interval
    local eval_fmt interval_fmt status eval_pct
    while IFS='|' read -r name _last_eval eval_time interval; do
        [ -z "$name" ] && continue

        eval_fmt=$(echo "$eval_time" | awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')
        interval_fmt="${interval}s"
        status="OK"

        if [ -n "$interval" ] && [ "$interval" != "0" ]; then
            eval_pct=$(echo "$eval_time $interval" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$eval_pct" -gt 50 ] 2>/dev/null; then
                status="${YELLOW}SLOW${NC}"
                add_recommendation "WARNING" "rules" "Rule group '$name' evaluation (${eval_fmt}) exceeds 50% of interval (${interval_fmt}) -- consider splitting group or adding recording rules"
            fi
        fi

        printf " %-40s %12s %12s %b\n" "${name:0:40}" "$eval_fmt" "$interval_fmt" "$status"
    done < <(echo "$RULES_DATA" | jq -r '
        .data.groups // [] | .[] |
        "\(.name)|\(.lastEvaluation // "")|\(.evaluationTime // 0)|\(.interval // 0)"
    ' 2>/dev/null)
    echo ""
}
"prometheus_tsdb_wal_storage_size_bytes") if [ "$wal_size" != "0" ] && [ -n "$wal_size" ]; then print_metric "WAL size" "$(format_bytes "$wal_size")" fi # Total storage local total_storage total_storage=$(echo "${storage_size_bytes:-0} ${wal_size:-0}" | awk '{printf "%.0f", $1 + $2}') if [ "$total_storage" -gt 0 ] 2>/dev/null; then print_metric "Total TSDB size" "$(format_bytes "$total_storage")" fi echo "" # WAL segments local wal_segments wal_segments=$(prom_metric "prometheus_tsdb_wal_segment_current") if [ "$wal_segments" != "0" ] && [ -n "$wal_segments" ]; then print_metric "WAL current segment" "$(format_number "$wal_segments")" fi # Growth estimation echo "" echo -e " ${BOLD}Growth Estimation${NC}" echo "" local head_series samples_appended head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0') samples_appended=$(prom_metric "prometheus_tsdb_head_samples_appended_total") local start_time uptime_secs=0 start_time=$(prom_metric "process_start_time_seconds") local now_time now_time=$(date +%s) if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then uptime_secs=$(echo "$now_time - $start_time" | bc 2>/dev/null | awk '{printf "%d", $1}') uptime_secs=${uptime_secs:-0} fi if [ "${uptime_secs:-0}" -gt 0 ] 2>/dev/null && [ -n "$samples_appended" ] && [ "$samples_appended" != "0" ]; then local samples_per_sec samples_per_day samples_per_sec=$(echo "$samples_appended $uptime_secs" | awk '{printf "%.1f", $1 / $2}') samples_per_day=$(echo "$samples_per_sec" | awk '{printf "%.0f", $1 * 86400}') print_metric "Samples/second" "$(format_number "$samples_per_sec")" print_metric "Samples/day" "$(format_number "$samples_per_day")" # Estimate bytes per sample (~1-2 bytes compressed) local bytes_per_day_low bytes_per_day_high bytes_per_day_low=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 1.0}') bytes_per_day_high=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 2.0}') print_metric "Estimated disk/day" "$(format_bytes "$bytes_per_day_low") -- 
# ============================================================================
# SECTION: MEMORY
# ============================================================================

# Print the memory report: process RSS/virtual size, Go heap figures, GC
# pause statistics, goroutine count and file-descriptor usage, adding
# recommendations when a ratio or count crosses its threshold.
# Globals:   RUNTIME_INFO (read), BOLD/NC (read)
# Calls:     print_header, print_metric, prom_metric, prom_metric_labeled,
#            format_bytes, format_number, add_recommendation
analyze_memory() {
    print_header "Memory"

    # Process memory
    local rss_bytes vss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")
    vss_bytes=$(prom_metric "process_virtual_memory_bytes")

    print_metric "Process RSS" "$(format_bytes "$rss_bytes")"
    print_metric "Process virtual" "$(format_bytes "$vss_bytes")"
    echo ""

    # Go runtime memory
    local heap_alloc heap_sys heap_inuse
    heap_alloc=$(prom_metric "go_memstats_heap_alloc_bytes")
    heap_sys=$(prom_metric "go_memstats_heap_sys_bytes")
    heap_inuse=$(prom_metric "go_memstats_heap_inuse_bytes")

    echo -e " ${BOLD}Go Runtime Memory${NC}"
    echo ""
    print_metric "Heap alloc" "$(format_bytes "$heap_alloc")"
    print_metric "Heap sys" "$(format_bytes "$heap_sys")"
    print_metric "Heap in use" "$(format_bytes "$heap_inuse")"

    # RSS vs Go heap ratio
    if [ -n "$rss_bytes" ] && [ "$rss_bytes" != "0" ] && [ -n "$heap_alloc" ] && [ "$heap_alloc" != "0" ]; then
        local ratio
        ratio=$(echo "$rss_bytes $heap_alloc" | awk '{printf "%.1f", $1 / $2}')
        print_metric "RSS / Heap ratio" "${ratio}x"

        local ratio_int
        ratio_int=$(echo "$ratio" | awk '{printf "%d", $1}')
        if [ "$ratio_int" -ge 3 ] 2>/dev/null; then
            add_recommendation "WARNING" "memory" "RSS is ${ratio}x Go heap -- indicates memory fragmentation or mmap overhead, consider restarting Prometheus during a maintenance window"
        fi
    fi
    echo ""

    # GC stats
    echo -e " ${BOLD}Garbage Collection${NC}"
    echo ""

    local gc_count gc_pause_total
    gc_count=$(prom_metric "go_gc_duration_seconds_count")
    gc_pause_total=$(prom_metric "go_gc_duration_seconds_sum")

    print_metric "GC cycles (total)" "$(format_number "$gc_count")"
    if [ -n "$gc_pause_total" ] && [ "$gc_pause_total" != "0" ]; then
        print_metric "GC pause (total)" "$(echo "$gc_pause_total" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"

        # Average GC pause
        if [ -n "$gc_count" ] && [ "$gc_count" != "0" ]; then
            local avg_pause
            avg_pause=$(echo "$gc_pause_total $gc_count" | awk '{printf "%.3f", ($1 / $2) * 1000}')
            print_metric "GC avg pause" "${avg_pause} ms"
        fi
    fi

    # GC quantiles: quantile="1" is the maximum observed pause
    local gc_max
    gc_max=$(prom_metric_labeled 'go_gc_duration_seconds{quantile="1"}' | awk '{print $NF}')
    if [ -n "$gc_max" ] && [ "$gc_max" != "0" ]; then
        print_metric "GC max pause" "$(echo "$gc_max" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
    fi
    echo ""

    # Goroutines. The runtimeinfo endpoint names this field "goroutineCount";
    # the older ".data.goroutines" spelling is kept as a fallback, then the
    # go_goroutines metric is consulted if the API gave nothing usable.
    local goroutines
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // 0' 2>/dev/null)
    if [ -z "$goroutines" ] || [ "$goroutines" = "0" ]; then
        goroutines=$(prom_metric "go_goroutines" | awk '{printf "%.0f", $1}')
        goroutines=${goroutines:-0}
    fi
    print_metric "Goroutines" "$(format_number "$goroutines")"

    if [ "$goroutines" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "memory" "Goroutine count is $goroutines (>1000) -- may indicate resource leak or excessive concurrency"
    fi

    # Open file descriptors
    local open_fds max_fds
    open_fds=$(prom_metric "process_open_fds")
    max_fds=$(prom_metric "process_max_fds")

    if [ -n "$open_fds" ] && [ "$open_fds" != "0" ]; then
        print_metric "Open file descriptors" "$(format_number "$open_fds")"
        if [ -n "$max_fds" ] && [ "$max_fds" != "0" ]; then
            print_metric "Max file descriptors" "$(format_number "$max_fds")"
            local fd_pct
            fd_pct=$(echo "$open_fds $max_fds" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$fd_pct" -gt 80 ] 2>/dev/null; then
                add_recommendation "WARNING" "memory" "File descriptor usage at ${fd_pct}% -- approaching limit, increase ulimit -n"
            fi
        fi
    fi
}
# ============================================================================
# SECTION: CONFIG
# ============================================================================

# Convert a Prometheus duration ("15s", "1m30s", "500ms", "2h") to seconds.
# Units understood: ms, s, m, h, d, w, y (y = 365d). Prints the value with
# millisecond precision; prints nothing when the string cannot be parsed so
# the caller can skip comparisons rather than act on a wrong number.
# Arguments: $1 - duration string
_duration_to_seconds() {
    printf '%s\n' "$1" | awk '
        BEGIN {
            f["ms"] = 0.001; f["s"] = 1; f["m"] = 60; f["h"] = 3600
            f["d"] = 86400; f["w"] = 604800; f["y"] = 31536000
        }
        {
            s = $1
            total = 0
            seen = 0
            # Each token is digits followed by letters; letters stop at the
            # next digit, so "1m30s" yields "1m" then "30s".
            while (match(s, /^[0-9.]+[a-z]+/)) {
                tok = substr(s, 1, RLENGTH)
                s = substr(s, RLENGTH + 1)
                num = tok
                sub(/[a-z]+$/, "", num)
                unit = tok
                sub(/^[0-9.]+/, "", unit)
                if (!(unit in f)) exit
                total += num * f[unit]
                seen = 1
            }
            if (seen && s == "") printf "%.3f", total
        }'
}

# Print the configuration review: global intervals, external labels, remote
# endpoints, scrape-job counts and key command-line flags, adding
# recommendations for risky settings.
# Globals:   CONFIG_DATA, FLAGS_DATA, TARGETS_DATA (read), BOLD/NC (read)
# Calls:     print_header, print_metric, add_recommendation
analyze_config() {
    print_header "Configuration Review"

    if [ -z "$CONFIG_DATA" ] && [ -z "$FLAGS_DATA" ]; then
        echo " (configuration data not available)"
        return
    fi

    # Global config from YAML
    local config_yaml=""
    if [ -n "$CONFIG_DATA" ]; then
        config_yaml=$(echo "$CONFIG_DATA" | jq -r '.data.yaml // ""' 2>/dev/null)
    fi

    # First occurrence of each key wins.
    local scrape_interval scrape_timeout eval_interval
    scrape_interval=$(echo "$config_yaml" | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    scrape_timeout=$(echo "$config_yaml" | grep -m1 'scrape_timeout:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    eval_interval=$(echo "$config_yaml" | grep -m1 'evaluation_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')

    [ -z "$scrape_interval" ] && scrape_interval="1m"
    [ -z "$scrape_timeout" ] && scrape_timeout="10s"
    [ -z "$eval_interval" ] && eval_interval="1m"

    echo -e " ${BOLD}Global Settings${NC}"
    echo ""
    print_metric "Scrape interval" "$scrape_interval"
    print_metric "Scrape timeout" "$scrape_timeout"
    print_metric "Evaluation interval" "$eval_interval"

    # Parse intervals to seconds for comparison. The comparison is done in
    # awk because sub-second durations produce fractional values.
    local scrape_int_secs scrape_to_secs
    scrape_int_secs=$(_duration_to_seconds "$scrape_interval")
    scrape_to_secs=$(_duration_to_seconds "$scrape_timeout")

    if [ -n "$scrape_int_secs" ] && [ -n "$scrape_to_secs" ] &&
        awk -v t="$scrape_to_secs" -v i="$scrape_int_secs" 'BEGIN { exit !(t + 0 >= i + 0) }'; then
        add_recommendation "WARNING" "config" "scrape_timeout ($scrape_timeout) >= scrape_interval ($scrape_interval) -- timeout should be less than interval"
    fi
    echo ""

    # External labels: collect the "key: value" lines nested under the first
    # external_labels key, i.e. lines indented deeper than the key itself,
    # stopping at the first line that is not.
    local external_labels
    external_labels=$(printf '%s\n' "$config_yaml" | awk '
        function indent_of(line,    rest) {
            rest = line
            sub(/^[ \t]+/, "", rest)
            return length(line) - length(rest)
        }
        !inside && /^[ \t]*external_labels:/ {
            base = indent_of($0)
            inside = 1
            next
        }
        inside {
            if ($0 ~ /^[ \t]*$/) next
            if (indent_of($0) <= base) exit
            if (index($0, ":") > 0) print
        }')

    echo -e " ${BOLD}External Labels${NC}"
    echo ""
    if [ -n "$external_labels" ]; then
        echo "$external_labels" | while IFS= read -r line; do
            echo " $line"
        done
    else
        echo " (none configured)"
        add_recommendation "INFO" "config" "No external labels configured -- recommended for remote write, federation, and cross-cluster identification"
    fi
    echo ""

    # Remote write/read
    echo -e " ${BOLD}Remote Endpoints${NC}"
    echo ""

    local remote_write_count remote_read_count
    remote_write_count=$(echo "$config_yaml" | grep -c 'remote_write:' 2>/dev/null || true)
    remote_read_count=$(echo "$config_yaml" | grep -c 'remote_read:' 2>/dev/null || true)

    local has_remote_write="no"
    local has_remote_read="no"
    [ "$remote_write_count" -gt 0 ] 2>/dev/null && has_remote_write="yes"
    [ "$remote_read_count" -gt 0 ] 2>/dev/null && has_remote_read="yes"

    print_metric "Remote write" "$has_remote_write"
    print_metric "Remote read" "$has_remote_read"
    echo ""

    # Job count and interval distribution
    echo -e " ${BOLD}Scrape Jobs${NC}"
    echo ""

    local job_count
    job_count=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[].labels.job] | unique | length' 2>/dev/null)
    print_metric "Total scrape jobs" "${job_count:-0}"

    # Check for aggressive scrape intervals
    if [ -n "$TARGETS_DATA" ]; then
        local fast_scrape_jobs
        fast_scrape_jobs=$(echo "$TARGETS_DATA" | jq -r '
            [.data.activeTargets // [] | .[] | select(.scrapeInterval != null) | {job: .labels.job, interval: .scrapeInterval}]
            | unique_by(.job)
            | .[]
            | select( (.interval | test("^[0-9]+s$")) and (.interval | gsub("s$"; "") | tonumber) < 10 )
            | .job
        ' 2>/dev/null)

        if [ -n "$fast_scrape_jobs" ]; then
            local fast_count
            fast_count=$(echo "$fast_scrape_jobs" | wc -l)
            if [ "$fast_count" -gt 3 ] 2>/dev/null; then
                add_recommendation "WARNING" "config" "$fast_count jobs have scrape_interval < 10s -- high scrape frequency increases storage cost and cardinality"
            fi
        fi
    fi

    # Key flags
    echo ""
    echo -e " ${BOLD}Key Flags${NC}"
    echo ""

    if [ -n "$FLAGS_DATA" ]; then
        local web_listen log_level tsdb_wal_compression
        web_listen=$(echo "$FLAGS_DATA" | jq -r '.data["web.listen-address"] // "unknown"')
        log_level=$(echo "$FLAGS_DATA" | jq -r '.data["log.level"] // "unknown"')
        tsdb_wal_compression=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.wal-compression"] // "unknown"')

        print_metric "Listen address" "$web_listen"
        print_metric "Log level" "$log_level"
        print_metric "WAL compression" "$tsdb_wal_compression"

        if [ "$tsdb_wal_compression" = "false" ]; then
            add_recommendation "INFO" "config" "WAL compression is disabled -- enabling it (--storage.tsdb.wal-compression) can reduce WAL size by ~50%"
        fi
    fi
}
# ============================================================================
# SECTION: SUMMARY
# ============================================================================

# Print every collected recommendation grouped by severity (CRITICAL, then
# WARNING, then INFO), followed by the issue counts and a 0-100 health score
# (each critical costs 15 points, each warning 5, each info 1).
# Globals:   RECOMMENDATIONS (read; entries are "SEVERITY|section|message"),
#            CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT (read), colour vars
# Calls:     print_header, print_status, severity_tag
analyze_summary() {
    print_header "Recommendations"

    if [ ${#RECOMMENDATIONS[@]} -eq 0 ]; then
        print_status "OK" "No issues detected -- Prometheus appears healthy"
        echo ""
        echo -e " ${BOLD}${GREEN}Health Score: 100 / 100${NC}"
        return
    fi

    # Print recommendations grouped by severity
    # Sort: CRITICAL first, then WARNING, then INFO
    local severity rec sev section message
    for severity in CRITICAL WARNING INFO; do
        for rec in "${RECOMMENDATIONS[@]}"; do
            # Split in-shell; the message is the last field so it keeps any
            # further '|' characters it contains.
            IFS='|' read -r sev section message <<< "$rec"
            if [ "$sev" = "$severity" ]; then
                echo -e " $(severity_tag "$sev") ${BOLD}[${section}]${NC} $message"
                echo ""
            fi
        done
    done

    # Calculate health score
    local score=100
    score=$((score - (CRITICAL_COUNT * 15)))
    score=$((score - (WARNING_COUNT * 5)))
    score=$((score - (INFO_COUNT * 1)))
    [ "$score" -lt 0 ] && score=0

    echo -e "${BOLD}${CYAN}====================================================${NC}"
    echo ""
    printf " Issues found: "
    # Colours go through %b arguments rather than the format string.
    if [ "$CRITICAL_COUNT" -gt 0 ]; then
        printf '%b%d critical%b ' "$RED" "$CRITICAL_COUNT" "$NC"
    fi
    if [ "$WARNING_COUNT" -gt 0 ]; then
        printf '%b%d warning%b ' "$YELLOW" "$WARNING_COUNT" "$NC"
    fi
    if [ "$INFO_COUNT" -gt 0 ]; then
        printf '%b%d info%b' "$GREEN" "$INFO_COUNT" "$NC"
    fi
    echo ""
    echo ""

    local score_color
    if [ "$score" -ge 80 ]; then
        score_color="$GREEN"
    elif [ "$score" -ge 50 ]; then
        score_color="$YELLOW"
    else
        score_color="$RED"
    fi

    echo -e " ${BOLD}Health Score: ${score_color}${score} / 100${NC}"
}
# ============================================================================
# JSON OUTPUT
# ============================================================================

# Normalise a metric value to a plain integer for use with jq --argjson.
# Empty or non-numeric input becomes 0. "%.0f" is used instead of "%d"
# because some awk builds clamp "%d" at 2^31-1, which would corrupt byte
# counts above 2 GiB.
# Arguments: $1 - numeric string (may be empty or in scientific notation)
_json_int() {
    printf '%s\n' "${1:-0}" | awk '{printf "%.0f", $1 + 0}'
}

# Emit the whole analysis as a single JSON document on stdout.
# Collects data, runs every analyze_* section silently so that
# RECOMMENDATIONS and the severity counters are populated, then assembles
# the report with jq. Every value handed to --argjson is defaulted so one
# missing API response cannot abort the entire report.
# Globals:   RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, TARGETS_DATA, RULES_DATA,
#            METRICS_RAW, RECOMMENDATIONS, *_COUNT (read);
#            NO_COLOR and the colour variables (written: cleared)
output_json() {
    collect_all_data

    # Build JSON from all sections
    local version head_series head_chunks retention_time retention_size rss_bytes
    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"' 2>/dev/null)
    if [ -z "$version" ] || [ "$version" = "unknown" ]; then
        # runtimeinfo carries no version field; fall back to the
        # prometheus_build_info metric.
        # NOTE(review): assumes METRICS_RAW holds the raw /metrics text --
        # confirm against collect_all_data. If not, this stays "unknown".
        local build_version
        build_version=$(printf '%s\n' "$METRICS_RAW" |
            sed -n 's/^prometheus_build_info{\(.*,\)\{0,1\}version="\([^"]*\)".*/\2/p' |
            head -n 1)
        version="${build_version:-unknown}"
    fi

    local start_time now_time
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    local uptime_secs=0
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk instead of bc: understands scientific notation and always
        # prints a number.
        uptime_secs=$(echo "$now_time $start_time" | awk '{printf "%.0f", $1 - $2}')
    fi
    uptime_secs=${uptime_secs:-0}

    head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0' 2>/dev/null)
    head_chunks=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0' 2>/dev/null)
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"' 2>/dev/null)
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"' 2>/dev/null)
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    local compactions_total compactions_failed wal_corruptions ooo_total
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")

    local total_targets up_targets down_targets
    total_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length' 2>/dev/null)
    up_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length' 2>/dev/null)
    down_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length' 2>/dev/null)

    local recording_rules alerting_rules
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length' 2>/dev/null)
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length' 2>/dev/null)

    # Top metrics
    local top_metrics
    top_metrics=$(echo "$TSDB_STATUS" | jq '[.data.seriesCountByMetricName // [] | .[:10] | .[] | {name: .name, series: .value}]' 2>/dev/null)

    # Run all checks to populate recommendations
    NO_COLOR=true
    RED=""
    YELLOW=""
    GREEN=""
    CYAN=""
    BOLD=""
    DIM=""
    NC=""

    # Suppress output, just collect recommendations
    analyze_overview >/dev/null 2>&1
    analyze_tsdb >/dev/null 2>&1
    analyze_cardinality >/dev/null 2>&1
    analyze_queries >/dev/null 2>&1
    analyze_scrapes >/dev/null 2>&1
    analyze_rules >/dev/null 2>&1
    analyze_storage >/dev/null 2>&1
    analyze_memory >/dev/null 2>&1
    analyze_config >/dev/null 2>&1

    # Build recommendations JSON. Every field goes through --arg, so jq does
    # the escaping and no stray trailing newline ends up in the message.
    local rec_json rec sev section message
    rec_json=$(
        for rec in "${RECOMMENDATIONS[@]}"; do
            IFS='|' read -r sev section message <<< "$rec"
            jq -n --arg severity "$sev" --arg section "$section" --arg message "$message" \
                '{severity: $severity, section: $section, message: $message}'
        done | jq -s '.'
    )

    local score=100
    score=$((score - (CRITICAL_COUNT * 15)))
    score=$((score - (WARNING_COUNT * 5)))
    score=$((score - (INFO_COUNT * 1)))
    [ "$score" -lt 0 ] && score=0

    jq -n \
        --arg version "$version" \
        --argjson uptime "$(_json_int "$uptime_secs")" \
        --argjson head_series "$(_json_int "$head_series")" \
        --argjson head_chunks "$(_json_int "$head_chunks")" \
        --arg retention_time "${retention_time:-not set}" \
        --arg retention_size "${retention_size:-not set}" \
        --argjson rss_bytes "$(_json_int "$rss_bytes")" \
        --argjson compactions_total "$(_json_int "$compactions_total")" \
        --argjson compactions_failed "$(_json_int "$compactions_failed")" \
        --argjson wal_corruptions "$(_json_int "$wal_corruptions")" \
        --argjson ooo_samples "$(_json_int "$ooo_total")" \
        --argjson total_targets "${total_targets:-0}" \
        --argjson up_targets "${up_targets:-0}" \
        --argjson down_targets "${down_targets:-0}" \
        --argjson recording_rules "${recording_rules:-0}" \
        --argjson alerting_rules "${alerting_rules:-0}" \
        --argjson top_metrics "${top_metrics:-[]}" \
        --argjson recommendations "${rec_json:-[]}" \
        --argjson score "$score" \
        --argjson critical "${CRITICAL_COUNT:-0}" \
        --argjson warnings "${WARNING_COUNT:-0}" \
        --argjson info "${INFO_COUNT:-0}" \
        '{
            prometheus: {
                version: $version,
                uptime_seconds: $uptime,
                memory_rss_bytes: $rss_bytes
            },
            tsdb: {
                head_series: $head_series,
                head_chunks: $head_chunks,
                retention_time: $retention_time,
                retention_size: $retention_size,
                compactions_total: $compactions_total,
                compactions_failed: $compactions_failed,
                wal_corruptions: $wal_corruptions,
                out_of_order_samples: $ooo_samples
            },
            targets: {
                total: $total_targets,
                up: $up_targets,
                down: $down_targets
            },
            rules: {
                recording: $recording_rules,
                alerting: $alerting_rules
            },
            cardinality: {
                top_metrics: $top_metrics
            },
            health: {
                score: $score,
                critical: $critical,
                warnings: $warnings,
                info: $info
            },
            recommendations: $recommendations
        }'
}
# ============================================================================
# MAIN
# ============================================================================

# Entry point: parse arguments, verify required tools, then produce either
# the JSON document or the text report (optionally limited to one section).
# Globals:   JSON_MODE, OUTPUT_FILE, SECTION, PROM_URL (read), colour vars
# Exits:     1 when requirements are missing, the section name is unknown,
#            or the JSON report cannot be produced for --output
main() {
    parse_args "$@"

    if ! check_requirements; then
        exit 1
    fi

    # JSON mode
    if [ "$JSON_MODE" = true ]; then
        if [ -n "$OUTPUT_FILE" ]; then
            # Do not claim success when the report could not be generated.
            output_json > "$OUTPUT_FILE" || exit 1
            echo "JSON report written to $OUTPUT_FILE" >&2
        else
            output_json
        fi
        return
    fi

    # Validate the section before doing any work. This must happen outside
    # the report pipeline below: the left side of a pipeline runs in a
    # subshell, so an "exit 1" there would neither stop the script nor set
    # its exit status.
    case "$SECTION" in
        "" | overview | tsdb | cardinality | queries | scrapes | rules | storage | memory | config | summary) ;;
        *)
            echo "Unknown section: $SECTION" >&2
            echo "Valid sections: overview tsdb cardinality queries scrapes rules storage memory config summary" >&2
            exit 1
            ;;
    esac

    # Text report mode
    collect_all_data

    {
        echo -e "${BOLD}Prometheus Performance Analyzer v1.01${NC}"
        echo -e "${DIM}Target: ${PROM_URL}${NC}"
        echo -e "${DIM}Date: $(date '+%Y-%m-%d %H:%M:%S %Z')${NC}"

        if [ -n "$SECTION" ]; then
            # SECTION was validated above, so the function name is safe.
            "analyze_${SECTION}"
            # Always show summary if not already shown
            if [ "$SECTION" != "summary" ]; then
                analyze_summary
            fi
        else
            analyze_overview
            analyze_tsdb
            analyze_cardinality
            analyze_queries
            analyze_scrapes
            analyze_rules
            analyze_storage
            analyze_memory
            analyze_config
            analyze_summary
        fi

        echo ""
    } | if [ -n "$OUTPUT_FILE" ]; then
        cat > "$OUTPUT_FILE"
        echo "Report written to $OUTPUT_FILE" >&2
    else
        cat
    fi
}

main "$@"