Files
linux-scripts/prometheus-performance-analyzer.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

1562 lines
59 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# Script Name: prometheus-performance-analyzer.sh
# Version: 1.01
# Description: Diagnostic tool that analyzes Prometheus server performance.
# Queries TSDB status, runtime info, flags, config, targets,
# rules, and internal metrics to produce a detailed report
# with actionable recommendations.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
# - curl
# - jq
# - awk
# - bc (for calculations)
# - Network access to Prometheus API
#
# Usage:
# # Analyze local Prometheus
# ./prometheus-performance-analyzer.sh
#
# # Analyze remote Prometheus
# ./prometheus-performance-analyzer.sh --url http://prometheus:9090
#
# # JSON output for automation
# ./prometheus-performance-analyzer.sh --json
#
# # Analyze specific section only
# ./prometheus-performance-analyzer.sh --section cardinality
#
# # Save report to file (auto-disables color)
# ./prometheus-performance-analyzer.sh -o report.txt
#
# # Custom series threshold
# ./prometheus-performance-analyzer.sh --threshold-series 5000000
#
# Sections:
# overview - Version, uptime, series counts, retention
# tsdb - TSDB head stats, compaction, WAL health
# cardinality - High cardinality metrics, labels, label-value pairs
# queries - Query engine performance and latency
# scrapes - Scrape target health and duration analysis
# rules - Recording/alerting rule evaluation
# storage - Disk, retention, WAL, growth estimation
# memory - RSS, Go heap, GC, goroutines
# config - Configuration review and best practices
# summary - Health score and all recommendations
#
################################################################################
# ============================================================================
# CONFIGURATION & DEFAULTS
# ============================================================================
# Runtime options; parse_args overrides these from the command line.
PROM_URL='http://localhost:9090'
OUTPUT_FILE=
JSON_MODE=false
NO_COLOR=false
SECTION=
THRESHOLD_SERIES=1000000

# ANSI color sequences, stored as literal backslash escapes and expanded
# later by echo -e / printf. parse_args blanks them when color is disabled.
RED="\033[0;31m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
CYAN="\033[0;36m"
BOLD="\033[1m"
DIM="\033[2m"
NC="\033[0m"

# Findings recorded by add_recommendation as "SEVERITY|section|message",
# plus one running counter per severity.
RECOMMENDATIONS=()
CRITICAL_COUNT=0 WARNING_COUNT=0 INFO_COUNT=0

# Raw responses cached by collect_all_data and read by the analyze_* sections.
TSDB_STATUS= RUNTIME_INFO= FLAGS_DATA= CONFIG_DATA=
TARGETS_DATA= RULES_DATA= METRICS_RAW=
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
# Print the command-line help to stdout, then terminate the script with
# exit status 0. The text interpolates $0 and the current values of PROM_URL
# and THRESHOLD_SERIES as the documented defaults.
# The version shown here is kept in step with the "Version:" line of the
# file header (1.01).
show_usage() {
  cat <<USAGE
Usage: $0 [OPTIONS]
Analyze Prometheus server performance and produce recommendations (v1.01).
OPTIONS:
--url URL Prometheus URL (default: $PROM_URL)
--section NAME Run a specific section only
(overview|tsdb|cardinality|queries|scrapes|rules|
storage|memory|config|summary)
--json Output as JSON
--no-color Disable colored output
--threshold-series N Warning threshold for total series (default: $THRESHOLD_SERIES)
-o, --output FILE Write report to file (auto-disables color)
-h, --help Show this help
EXAMPLES:
$0 # Analyze localhost:9090
$0 --url http://prometheus:9090 # Remote server
$0 --json # JSON output
$0 --section cardinality # Single section
$0 -o report.txt # Save to file
$0 --threshold-series 5000000 # Custom threshold
USAGE
  exit 0
}
# Parse command-line options into the global configuration variables.
# Arguments: the script's positional parameters ("$@").
# Globals written: PROM_URL, SECTION, JSON_MODE, NO_COLOR, THRESHOLD_SERIES,
#                  OUTPUT_FILE, and the color variables (blanked when color
#                  is disabled).
# Exits with status 1 on an unknown option, on an option that is missing its
# value, or on a non-integer --threshold-series.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --json)
        JSON_MODE=true
        shift
        ;;
      --no-color)
        NO_COLOR=true
        shift
        ;;
      --url|--section|--threshold-series|-o|--output)
        # Without this check "shift 2" fails when the value is missing, the
        # positional parameters never shrink, and the loop spins forever.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: option $1 requires an argument" >&2
          exit 1
        fi
        case "$1" in
          --url) PROM_URL="$2" ;;
          --section) SECTION="$2" ;;
          --threshold-series) THRESHOLD_SERIES="$2" ;;
          -o|--output) OUTPUT_FILE="$2"; NO_COLOR=true ;;
        esac
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # The threshold is later used inside $(( )) and with -gt, so it must be a
  # plain integer. Force base 10 so leading zeros are not read as octal.
  if ! [[ "$THRESHOLD_SERIES" =~ ^[0-9]+$ ]]; then
    echo "ERROR: --threshold-series expects a non-negative integer, got '$THRESHOLD_SERIES'" >&2
    exit 1
  fi
  THRESHOLD_SERIES=$((10#$THRESHOLD_SERIES))

  # Strip trailing slash
  PROM_URL="${PROM_URL%/}"

  # Disable colors if requested
  if [ "$NO_COLOR" = true ]; then
    RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""
  fi
}
# Verify that the external tools the script relies on are available.
# curl, jq and awk are only checked; a missing bc is installed through the
# first package manager found (apt-get, dnf, then yum) using sudo.
# Returns: 0 when everything is available, 1 when at least one tool is
# missing (an error line per missing tool is written to stderr).
check_requirements() {
  local missing=0
  for cmd in curl jq awk; do
    command -v "$cmd" >/dev/null 2>&1 && continue
    echo "ERROR: $cmd not found" >&2
    missing=1
  done

  # Auto-install bc if missing
  if ! command -v bc >/dev/null 2>&1; then
    echo "bc not found -- installing..." >&2
    local pkg_mgr
    for pkg_mgr in apt-get dnf yum; do
      if command -v "$pkg_mgr" >/dev/null 2>&1; then
        sudo "$pkg_mgr" install -y bc >/dev/null 2>&1
        break
      fi
    done
    # Re-check: the installation may have failed or no manager was found.
    command -v bc >/dev/null 2>&1 || {
      echo "ERROR: failed to install bc -- install it manually" >&2
      missing=1
    }
  fi
  return $missing
}
# Fetch a Prometheus HTTP endpoint and write the response body to stdout.
# Arguments: $1 - path (and query string) appended to PROM_URL
# Returns:   curl's exit status; -f makes HTTP errors count as failures and
#            curl's own diagnostics are discarded.
prom_api() {
  local -a curl_opts=(-sf --connect-timeout 5 --max-time 15)
  curl "${curl_opts[@]}" "${PROM_URL}${1}" 2>/dev/null
}
# Run an instant query and print the value of the first result sample.
# Arguments: $1 - PromQL expression (URI-encoded with jq before sending)
# Outputs:   the sample value, or "0" when the result set is empty
# Returns:   1 (after printing "0") when the API call yields no body,
#            otherwise the exit status of the extracting jq call.
prom_query() {
  local expr="$1"
  local uri_expr response
  uri_expr=$(printf '%s' "$expr" | jq -sRr @uri)
  response=$(prom_api "/api/v1/query?query=${uri_expr}")
  [ -n "$response" ] || { echo "0"; return 1; }
  jq -r '.data.result[0].value[1] // "0"' <<<"$response" 2>/dev/null
}
# Look up an unlabeled metric in the cached /metrics text (METRICS_RAW).
# Arguments: $1 - metric name; matched at line start and followed by a space
# Outputs:   the second field of the first matching line, or "0" when the
#            cache is empty, nothing matches, or the value is blank.
prom_metric() {
  local metric="$1" found
  [ -n "$METRICS_RAW" ] || { echo "0"; return; }
  found=$(grep -m1 "^${metric} " <<<"$METRICS_RAW" | awk '{print $2}')
  echo "${found:-0}"
}
# Print every line of the cached /metrics text (METRICS_RAW) that starts
# with the given prefix, e.g. "metric_name{" to select labeled samples.
# Arguments: $1 - line prefix (used as a grep pattern anchored with ^)
# Returns:   0 without output when the cache is empty, else grep's status.
prom_metric_labeled() {
  local pattern="$1"
  [ -n "$METRICS_RAW" ] || return 0
  grep "^${pattern}" <<<"$METRICS_RAW" 2>/dev/null
}
# Render a byte count with a binary unit (B, KB, MB, GB, TB).
# Arguments: $1 - byte count; empty, "0" and "null" all print "0 B"
# Outputs:   one decimal place for KB and above, a whole number for bytes;
#            no trailing newline is printed for non-zero values.
format_bytes() {
  local bytes="$1"
  case "$bytes" in
    ""|0|null)
      echo "0 B"
      return
      ;;
  esac
  # Walk the units from largest to smallest and stop at the first that fits.
  awk '{
    n = split("TB GB MB KB", unit, " ")
    limit = 1099511627776
    for (i = 1; i <= n; i++) {
      if ($1 >= limit) {
        printf "%.1f %s", $1 / limit, unit[i]
        next
      }
      limit = limit / 1024
    }
    printf "%d B", $1
  }' <<<"$bytes"
}
# Render a number of seconds as "Nd Nh Nm Ns", leaving out zero components.
# Arguments: $1 - seconds; fractional input is truncated to whole seconds
# Outputs:   e.g. "1h 1m 1s"; "0s" for empty or zero input
format_duration() {
  local total="$1"
  case "$total" in
    ""|0)
      echo "0s"
      return
      ;;
  esac

  # awk truncates floats (and exponent notation) to an integer for $(( )).
  local whole
  whole=$(awk '{printf "%d", $1}' <<<"$total")

  local d=$((whole / 86400))
  local h=$(((whole % 86400) / 3600))
  local m=$(((whole % 3600) / 60))
  local sec=$((whole % 60))

  local out=""
  if [ "$d" -gt 0 ]; then out="${d}d "; fi
  if [ "$h" -gt 0 ]; then out="${out}${h}h "; fi
  if [ "$m" -gt 0 ]; then out="${out}${m}m "; fi
  # Seconds are shown when non-zero, or when nothing else was printed.
  if [ "$sec" -gt 0 ] || [ -z "$out" ]; then out="${out}${sec}s"; fi
  echo "${out% }"
}
# Format a number with comma thousands separators, rounded to an integer.
# Arguments: $1 - number (integer, decimal or exponent notation such as
#                 "1.5e+06"); empty and "null" print "0"
# Outputs:   the grouped number, e.g. "1,234,567". Input that is not a
#            number is echoed back unchanged.
# Grouping is done here instead of with printf's %' flag, which only groups
# when the active locale defines a separator (the C/POSIX locale does not)
# and which printed a stray "0" in front of non-numeric input.
format_number() {
  local n="$1"
  if [ -z "$n" ] || [ "$n" = "null" ]; then
    echo "0"
    return
  fi

  local numeric_re='^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$'
  if ! [[ $n =~ $numeric_re ]]; then
    printf '%s\n' "$n"
    return
  fi

  # awk expands exponent notation and rounds to a whole number.
  local digits
  digits=$(LC_ALL=C awk -v v="$n" 'BEGIN { printf "%.0f", v + 0 }') || digits=""
  if [ -z "$digits" ]; then
    printf '%s\n' "$n"
    return
  fi

  local sign=""
  case "$digits" in
    -*) sign="-"; digits=${digits#-} ;;
  esac

  # Peel three digits at a time off the right-hand side.
  local grouped=""
  while [ "${#digits}" -gt 3 ]; do
    grouped=",${digits: -3}${grouped}"
    digits=${digits%???}
  done
  printf '%s\n' "${sign}${digits}${grouped}"
}
# Record a finding and bump the counter for its severity.
# Arguments: $1 - severity (CRITICAL, WARNING or INFO; anything else is
#                 stored without being counted)
#            $2 - section the finding belongs to
#            $3 - message text
# Globals written: RECOMMENDATIONS (entry "severity|section|message"),
#                  CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT
add_recommendation() {
  local severity="$1" section="$2" message="$3"
  if [ "$severity" = "CRITICAL" ]; then
    CRITICAL_COUNT=$((CRITICAL_COUNT + 1))
  elif [ "$severity" = "WARNING" ]; then
    WARNING_COUNT=$((WARNING_COUNT + 1))
  elif [ "$severity" = "INFO" ]; then
    INFO_COUNT=$((INFO_COUNT + 1))
  fi
  local entry
  printf -v entry '%s|%s|%s' "$severity" "$section" "$message"
  RECOMMENDATIONS+=("$entry")
}
# Print a section banner: a blank line, the title framed by two rules of
# "=" characters (all in bold cyan), then another blank line.
# Arguments: $1 - section title
print_header() {
  local title="$1"
  local rule="===================================================="
  local row
  echo ""
  for row in "$rule" " ${title}" "$rule"; do
    echo -e "${BOLD}${CYAN}${row}${NC}"
  done
  echo ""
}
# Print one "label: value" line with the label padded to 30 columns.
# Arguments: $1 - label (a colon is appended)
#            $2 - value; may contain the script's color escapes
# The value is printed with %b rather than %s because callers pass values
# wrapped in ${GREEN}/${RED}/${NC}, which hold literal "\033[..." text;
# with %s those sequences were printed verbatim instead of coloring.
# NOTE(review): %b also expands other backslash escapes in the value.
print_metric() {
  local label="$1"
  local value="$2"
  printf " ${DIM}%-30s${NC} %b\n" "${label}:" "$value"
}
# Print a colored "[SEVERITY]" tag.
# Arguments: $1 - CRITICAL (red), WARNING (yellow), INFO or OK (green);
#                 any other value prints nothing
severity_tag() {
  local sev="$1" color
  case "$sev" in
    CRITICAL) color="$RED" ;;
    WARNING) color="$YELLOW" ;;
    INFO|OK) color="$GREEN" ;;
    *) return 0 ;;
  esac
  echo -e "${color}[${sev}]${NC}"
}
# Print a message prefixed with the tag produced by severity_tag.
# Arguments: $1 - severity passed to severity_tag
#            $2 - message; backslash escapes in it are expanded by echo -e
print_status() {
  local severity="$1" message="$2"
  local tag
  tag=$(severity_tag "$severity")
  echo -e " ${tag} ${message}"
}
# ============================================================================
# DATA COLLECTION
# ============================================================================
# Fetch every API payload the analysis sections read and cache them in the
# globals TSDB_STATUS, RUNTIME_INFO, FLAGS_DATA, CONFIG_DATA, TARGETS_DATA,
# RULES_DATA and METRICS_RAW. Progress messages go to stderr.
# Exits with status 1 when the runtimeinfo endpoint cannot be reached.
collect_all_data() {
  echo -e "${DIM}Collecting data from ${PROM_URL}...${NC}" >&2

  # Connectivity probe: abort early rather than report on empty data.
  prom_api "/api/v1/status/runtimeinfo" >/dev/null 2>&1 || {
    echo -e "${RED}ERROR: Cannot reach Prometheus at ${PROM_URL}${NC}" >&2
    echo "Check the URL and ensure Prometheus is running." >&2
    exit 1
  }

  # "GLOBAL:endpoint" pairs, fetched in this order.
  local spec
  for spec in \
    "TSDB_STATUS:/api/v1/status/tsdb" \
    "RUNTIME_INFO:/api/v1/status/runtimeinfo" \
    "FLAGS_DATA:/api/v1/status/flags" \
    "CONFIG_DATA:/api/v1/status/config" \
    "TARGETS_DATA:/api/v1/targets" \
    "RULES_DATA:/api/v1/rules"; do
    printf -v "${spec%%:*}" '%s' "$(prom_api "${spec#*:}")"
  done

  METRICS_RAW=$(curl -sf --connect-timeout 5 --max-time 15 "${PROM_URL}/metrics" 2>/dev/null)
  echo -e "${DIM}Data collection complete.${NC}" >&2
}
# ============================================================================
# SECTION: OVERVIEW
# ============================================================================
# Print the "Overview" section: version, uptime, GOMAXPROCS, goroutines,
# head series/chunks, retention flags and resident memory.
# Globals read: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, METRICS_RAW (through
#               prom_metric / prom_metric_labeled), THRESHOLD_SERIES
# Side effects: adds a WARNING when head series exceed THRESHOLD_SERIES and
#               a CRITICAL when they exceed five times that value.
analyze_overview() {
  print_header "Overview"

  # Version: the runtimeinfo payload documented by Prometheus carries no
  # version field, so fall back to the version label of the
  # prometheus_build_info sample in /metrics.
  local version goroutines gomaxprocs
  version=$(jq -r '.data.version // "unknown"' <<<"$RUNTIME_INFO")
  if [ -z "$version" ] || [ "$version" = "unknown" ]; then
    local build_line
    # Anchor on "{" or "," so the goversion label is not matched instead.
    local version_re='[{,]version="([^"]*)"'
    build_line=$(prom_metric_labeled "prometheus_build_info{" | head -1)
    if [[ $build_line =~ $version_re ]]; then
      version=${BASH_REMATCH[1]}
    fi
  fi
  version=${version:-unknown}

  # The documented key is goroutineCount; the older lookup is kept as a
  # fallback.
  goroutines=$(jq -r '.data.goroutineCount // .data.goroutines // "0"' <<<"$RUNTIME_INFO")
  gomaxprocs=$(jq -r '.data.GOMAXPROCS // "0"' <<<"$RUNTIME_INFO")

  # Uptime from process_start_time_seconds. /metrics prints large floats in
  # exponent notation (e.g. 1.7e+09), which bc rejects, so awk does the
  # subtraction.
  local start_time now_time uptime_secs=0
  start_time=$(prom_metric "process_start_time_seconds")
  now_time=$(date +%s)
  if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
    uptime_secs=$(awk -v now="$now_time" -v start="$start_time" 'BEGIN {
      elapsed = now - start
      if (elapsed < 0) elapsed = 0
      printf "%d", elapsed
    }')
    uptime_secs=${uptime_secs:-0}
  fi

  # Series and chunks from TSDB
  local head_series head_chunks
  head_series=$(jq -r '.data.headStats.numSeries // 0' <<<"$TSDB_STATUS")
  head_chunks=$(jq -r '.data.headStats.chunkCount // 0' <<<"$TSDB_STATUS")

  # Retention from flags
  local retention_time retention_size
  retention_time=$(jq -r '.data["storage.tsdb.retention.time"] // "not set"' <<<"$FLAGS_DATA")
  retention_size=$(jq -r '.data["storage.tsdb.retention.size"] // "not set"' <<<"$FLAGS_DATA")

  # Memory
  local rss_bytes
  rss_bytes=$(prom_metric "process_resident_memory_bytes")

  print_metric "Prometheus version" "$version"
  print_metric "Uptime" "$(format_duration "$uptime_secs")"
  print_metric "GOMAXPROCS" "$gomaxprocs"
  print_metric "Goroutines" "$(format_number "$goroutines")"
  print_metric "Head series" "$(format_number "$head_series")"
  print_metric "Head chunks" "$(format_number "$head_chunks")"
  print_metric "Retention (time)" "$retention_time"
  print_metric "Retention (size)" "$retention_size"
  print_metric "Memory (RSS)" "$(format_bytes "$rss_bytes")"

  # Series threshold check; 2>/dev/null hides the error test prints when
  # head_series is empty because the TSDB payload was unavailable.
  local critical_limit=$((THRESHOLD_SERIES * 5))
  if [ "$head_series" -gt "$critical_limit" ] 2>/dev/null; then
    add_recommendation "CRITICAL" "overview" "Head series count $(format_number "$head_series") is very high (>$(format_number "$critical_limit")) -- investigate high cardinality metrics immediately"
  elif [ "$head_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
    add_recommendation "WARNING" "overview" "Head series count $(format_number "$head_series") exceeds threshold $(format_number "$THRESHOLD_SERIES") -- review cardinality section"
  fi
}
# ============================================================================
# SECTION: TSDB
# ============================================================================
# Convert a metric value to a plain integer string. /metrics prints large
# values in exponent notation (e.g. "2e+06"), which test's -gt rejects.
_tsdb_metric_int() {
  awk -v v="${1:-0}" 'BEGIN { printf "%d", v + 0 }'
}

# Print the "TSDB Health" section: head block stats, compaction, WAL,
# out-of-order samples, head GC, checkpoints and tombstone cleanup.
# Globals read: TSDB_STATUS, METRICS_RAW (through prom_metric)
# Side effects: adds CRITICAL recommendations for compaction failures and
#               WAL corruptions, WARNING ones for WAL truncation failures,
#               more than 1000 out-of-order samples and checkpoint failures.
analyze_tsdb() {
  print_header "TSDB Health"
  if [ -z "$TSDB_STATUS" ]; then
    echo " (TSDB status endpoint not available)"
    return
  fi

  # Head stats
  local num_series chunk_count min_time max_time num_label_pairs
  num_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
  chunk_count=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')
  min_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.minTime // 0')
  max_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.maxTime // 0')
  num_label_pairs=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numLabelPairs // 0')

  # Head block time range (timestamps are divided by 1000 to get seconds).
  # Both tests are silenced, and a max below min leaves the range at 0.
  local head_range_secs=0
  if [ "$min_time" -gt 0 ] 2>/dev/null && [ "$max_time" -gt "$min_time" ] 2>/dev/null; then
    head_range_secs=$(( (max_time - min_time) / 1000 ))
  fi

  print_metric "Head series" "$(format_number "$num_series")"
  print_metric "Head chunks" "$(format_number "$chunk_count")"
  print_metric "Head label pairs" "$(format_number "$num_label_pairs")"
  print_metric "Head block range" "$(format_duration "$head_range_secs")"
  echo ""

  # Compaction metrics from /metrics
  local compactions_total compactions_failed compaction_duration failed_n
  compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
  compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
  compaction_duration=$(prom_metric "prometheus_tsdb_compaction_duration_seconds_sum")
  print_metric "Compactions total" "$(format_number "$compactions_total")"
  print_metric "Compaction failures" "$compactions_failed"
  if [ -n "$compaction_duration" ] && [ "$compaction_duration" != "0" ]; then
    print_metric "Compaction time (total)" "$(format_duration "$compaction_duration")"
  fi
  failed_n=$(_tsdb_metric_int "$compactions_failed")
  if [ "$failed_n" -gt 0 ] 2>/dev/null; then
    add_recommendation "CRITICAL" "tsdb" "TSDB has $failed_n compaction failures -- investigate storage health (disk I/O, free space)"
  fi
  echo ""

  # WAL stats
  local wal_corruptions wal_truncate_total wal_truncate_failed
  local corruptions_n truncate_failed_n
  wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
  wal_truncate_total=$(prom_metric "prometheus_tsdb_wal_truncations_total")
  wal_truncate_failed=$(prom_metric "prometheus_tsdb_wal_truncations_failed_total")
  print_metric "WAL corruptions" "$wal_corruptions"
  print_metric "WAL truncations" "$(format_number "$wal_truncate_total")"
  print_metric "WAL truncation failures" "$wal_truncate_failed"
  corruptions_n=$(_tsdb_metric_int "$wal_corruptions")
  if [ "$corruptions_n" -gt 0 ] 2>/dev/null; then
    add_recommendation "CRITICAL" "tsdb" "WAL has $corruptions_n corruption(s) -- check disk health, consider running promtool tsdb clean-tombstones"
  fi
  truncate_failed_n=$(_tsdb_metric_int "$wal_truncate_failed")
  if [ "$truncate_failed_n" -gt 0 ] 2>/dev/null; then
    add_recommendation "WARNING" "tsdb" "WAL has $truncate_failed_n truncation failure(s) -- may cause WAL growth"
  fi
  echo ""

  # Out-of-order samples
  local ooo_total ooo_n
  ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")
  print_metric "Out-of-order samples" "$(format_number "$ooo_total")"
  ooo_n=$(_tsdb_metric_int "$ooo_total")
  if [ "$ooo_n" -gt 1000 ] 2>/dev/null; then
    add_recommendation "WARNING" "tsdb" "$(format_number "$ooo_total") out-of-order samples -- check NTP sync across targets or look for duplicate scraper configs"
  fi

  # Head GC
  local head_gc_duration
  head_gc_duration=$(prom_metric "prometheus_tsdb_head_gc_duration_seconds_sum")
  if [ -n "$head_gc_duration" ] && [ "$head_gc_duration" != "0" ]; then
    print_metric "Head GC time (total)" "$(format_duration "$head_gc_duration")"
  fi

  # Checkpoint creations
  local checkpoint_total checkpoint_failed checkpoint_failed_n
  checkpoint_total=$(prom_metric "prometheus_tsdb_checkpoint_creations_total")
  checkpoint_failed=$(prom_metric "prometheus_tsdb_checkpoint_creations_failed_total")
  print_metric "Checkpoints created" "$(format_number "$checkpoint_total")"
  checkpoint_failed_n=$(_tsdb_metric_int "$checkpoint_failed")
  if [ "$checkpoint_failed_n" -gt 0 ] 2>/dev/null; then
    print_metric "Checkpoint failures" "$checkpoint_failed"
    add_recommendation "WARNING" "tsdb" "$checkpoint_failed_n checkpoint creation failure(s) -- investigate disk health"
  fi

  # Tombstone cleanup
  local tombstones
  tombstones=$(prom_metric "prometheus_tsdb_tombstone_cleanup_seconds_sum")
  if [ -n "$tombstones" ] && [ "$tombstones" != "0" ]; then
    print_metric "Tombstone cleanup time" "$(format_duration "$tombstones")"
  fi
}
# ============================================================================
# SECTION: CARDINALITY
# ============================================================================
# Emit up to ten "value<TAB>name" rows for one of the ranked lists in the
# cached TSDB status payload. The value comes first so that an empty name
# cannot shift the fields when the row is read back.
# Arguments: $1 - key under .data (e.g. seriesCountByMetricName)
_cardinality_top_rows() {
  jq -r --arg key "$1" \
    '.data[$key] // [] | .[:10][] | "\(.value // 0)\t\(.name // "")"' \
    <<<"$TSDB_STATUS" 2>/dev/null
}

# Print the "High Cardinality Analysis" section: the top ten metrics by
# series count, labels by value count, label-value pairs by series count
# and labels by memory usage, plus an overall verdict.
# Globals read: TSDB_STATUS, THRESHOLD_SERIES
# Side effects: adds a WARNING for every listed metric holding at least 10%
#               of all head series and for every listed label with more
#               than 10000 values.
# Each list is extracted with a single jq call instead of two per row.
analyze_cardinality() {
  print_header "High Cardinality Analysis"
  if [ -z "$TSDB_STATUS" ]; then
    echo " (TSDB status endpoint not available)"
    return
  fi

  local total_series
  total_series=$(jq -r '.data.headStats.numSeries // 0' <<<"$TSDB_STATUS")

  local rows count name idx

  # Top metrics by series count
  echo -e " ${BOLD}Top Metrics by Series Count${NC}"
  echo ""
  rows=$(_cardinality_top_rows seriesCountByMetricName)
  if [ -n "$rows" ]; then
    local pct pct_int
    idx=0
    while IFS=$'\t' read -r count name; do
      idx=$((idx + 1))
      [ -n "$name" ] || continue
      if [ "$total_series" -gt 0 ] 2>/dev/null; then
        # Truncated to one decimal, as bc scale=1 did, but awk keeps the
        # leading zero ("0.5", where bc printed ".5").
        read -r pct pct_int < <(awk -v c="$count" -v t="$total_series" 'BEGIN {
          p = int(c * 1000 / t) / 10
          printf "%.1f %d\n", p, p
        }')
      else
        pct="0"
        pct_int=0
      fi
      printf " %2d. %-45s %10s (%5s%%)\n" "$idx" "$name" "$(format_number "$count")" "$pct"
      # Flag metrics consuming >= 10% of total
      if [ "$pct_int" -ge 10 ] 2>/dev/null; then
        add_recommendation "WARNING" "cardinality" "$name has $(format_number "$count") series (${pct}% of total) -- consider adding metric_relabel_configs to drop unused label dimensions"
      fi
    done <<<"$rows"
  else
    echo " (no data available)"
  fi
  echo ""

  # Top labels by value count
  echo -e " ${BOLD}Top Labels by Value Count${NC}"
  echo ""
  rows=$(_cardinality_top_rows labelValueCountByLabelName)
  if [ -n "$rows" ]; then
    idx=0
    while IFS=$'\t' read -r count name; do
      idx=$((idx + 1))
      [ -n "$name" ] || continue
      printf " %2d. %-45s %10s values\n" "$idx" "$name" "$(format_number "$count")"
      # Flag labels with very high value counts
      if [ "$count" -gt 10000 ] 2>/dev/null; then
        add_recommendation "WARNING" "cardinality" "Label '$name' has $(format_number "$count") unique values -- high cardinality label, consider relabeling or dropping"
      fi
    done <<<"$rows"
  else
    echo " (no data available)"
  fi
  echo ""

  # Top label-value pairs by series count
  echo -e " ${BOLD}Top Label-Value Pairs by Series Count${NC}"
  echo ""
  rows=$(_cardinality_top_rows seriesCountByLabelValuePair)
  if [ -n "$rows" ]; then
    idx=0
    while IFS=$'\t' read -r count name; do
      idx=$((idx + 1))
      [ -n "$name" ] || continue
      printf " %2d. %-45s %10s series\n" "$idx" "$name" "$(format_number "$count")"
    done <<<"$rows"
  else
    echo " (no data available)"
  fi
  echo ""

  # Overall cardinality assessment
  if [ "$total_series" -gt "$((THRESHOLD_SERIES * 5))" ] 2>/dev/null; then
    print_status "CRITICAL" "Total series $(format_number "$total_series") -- well above recommended limits"
  elif [ "$total_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
    print_status "WARNING" "Total series $(format_number "$total_series") -- above threshold $(format_number "$THRESHOLD_SERIES")"
  else
    print_status "OK" "Total series $(format_number "$total_series") -- within normal range"
  fi

  # Memory usage per label; the whole block is omitted when the payload
  # has no such list.
  rows=$(_cardinality_top_rows memoryInBytesByLabelName)
  if [ -n "$rows" ]; then
    echo ""
    echo -e " ${BOLD}Top Labels by Memory Usage${NC}"
    echo ""
    idx=0
    while IFS=$'\t' read -r count name; do
      idx=$((idx + 1))
      [ -n "$name" ] || continue
      printf " %2d. %-45s %10s\n" "$idx" "$name" "$(format_bytes "$count")"
    done <<<"$rows"
  fi
}
# ============================================================================
# SECTION: QUERIES
# ============================================================================
# Print the "Query Performance" section: query engine flags, the
# prometheus_engine_query_duration_seconds quantiles from /metrics, and the
# number of active queries.
# Globals read: FLAGS_DATA, METRICS_RAW (through prom_metric and
#               prom_metric_labeled)
# Side effects: adds a WARNING when the p99 of the inner_eval slice exceeds
#               10 seconds, and when active queries reach max-concurrency.
# Labels are extracted with bash regular expressions, so no grep with PCRE
# support (-P) is required.
analyze_queries() {
  print_header "Query Performance"

  # Query engine settings from flags
  local max_concurrency query_timeout lookback_delta
  max_concurrency=$(jq -r '.data["query.max-concurrency"] // "unknown"' <<<"$FLAGS_DATA")
  query_timeout=$(jq -r '.data["query.timeout"] // "unknown"' <<<"$FLAGS_DATA")
  lookback_delta=$(jq -r '.data["query.lookback-delta"] // "unknown"' <<<"$FLAGS_DATA")
  print_metric "Max concurrent queries" "$max_concurrency"
  print_metric "Query timeout" "$query_timeout"
  print_metric "Lookback delta" "$lookback_delta"
  echo ""

  # Query duration quantiles from /metrics
  echo -e " ${BOLD}Query Duration Percentiles${NC}"
  echo ""
  local query_durations
  query_durations=$(prom_metric_labeled "prometheus_engine_query_duration_seconds{")
  if [ -n "$query_durations" ]; then
    local line quantile slice_name value
    local quantile_re='quantile="([^"]+)"'
    local slice_re='slice="([^"]+)"'
    while IFS= read -r line; do
      quantile=""
      slice_name=""
      if [[ $line =~ $quantile_re ]]; then quantile=${BASH_REMATCH[1]}; fi
      if [[ $line =~ $slice_re ]]; then slice_name=${BASH_REMATCH[1]}; fi
      # The sample value is the last whitespace-separated field.
      value=${line##*[[:space:]]}
      if [ -z "$quantile" ] || [ -z "$value" ] || [ "$value" = "NaN" ]; then
        continue
      fi
      # One awk call builds the "pNN (slice)" label and scales the duration.
      awk -v q="$quantile" -v s="$slice_name" -v v="$value" 'BEGIN {
        label = sprintf("p%g", q * 100)
        if (s != "") label = label " (" s ")"
        if (v < 0.001) dur = sprintf("%.3f ms", v * 1000)
        else if (v < 1) dur = sprintf("%.1f ms", v * 1000)
        else dur = sprintf("%.2f s", v)
        printf " %-35s %s\n", label, dur
      }'
    done <<<"$query_durations"
  else
    echo " (query duration metrics not available)"
  fi
  echo ""

  # Query performance from instant metrics
  local queries_active queries_total
  queries_active=$(prom_metric "prometheus_engine_queries")
  print_metric "Active queries (now)" "$queries_active"
  queries_total=$(prom_metric "prometheus_engine_query_samples_total")
  if [ "$queries_total" != "0" ] && [ -n "$queries_total" ]; then
    print_metric "Total query samples" "$(format_number "$queries_total")"
  fi

  # Check for slow queries: first inner_eval sample at quantile 0.99
  local p99_inner
  p99_inner=$(awk '/quantile="0.99"/ && /inner_eval/ { print $NF; exit }' <<<"$query_durations")
  if [ -n "$p99_inner" ] && [ "$p99_inner" != "NaN" ]; then
    local p99_secs
    p99_secs=$(awk '{printf "%d", $1}' <<<"$p99_inner")
    if [ "$p99_secs" -gt 10 ] 2>/dev/null; then
      add_recommendation "WARNING" "queries" "p99 inner eval query latency is ${p99_inner}s -- consider adding recording rules for complex queries or reducing cardinality"
    fi
  fi

  # Concurrent query check
  if [ "$max_concurrency" != "unknown" ] && [ "$queries_active" != "0" ]; then
    local active_num max_num
    active_num=$(awk '{printf "%d", $1}' <<<"$queries_active")
    max_num=$(awk '{printf "%d", $1}' <<<"$max_concurrency")
    if [ "$active_num" -ge "$max_num" ] 2>/dev/null; then
      add_recommendation "WARNING" "queries" "Active queries ($active_num) at or near max-concurrency ($max_num) -- consider increasing --query.max-concurrency"
    fi
  fi
}
# ============================================================================
# SECTION: SCRAPES
# ============================================================================
# Convert a Prometheus duration string to seconds.
# Accepts one or more <number><unit> groups (ms, s, m, h, d, w, y), e.g.
# "15s", "1m30s", "500ms"; a bare integer is taken as seconds.
# Outputs: the number of seconds (with a millisecond fraction when needed)
# Returns: 1 without output when the string cannot be parsed.
_scrape_duration_secs() {
  local rest="$1" total_ms=0 num unit
  local group_re='^([0-9]+)(ms|[smhdwy])(.*)$'
  [ -n "$rest" ] || return 1
  if [[ $rest =~ ^[0-9]+$ ]]; then
    echo "$((10#$rest))"
    return 0
  fi
  while [ -n "$rest" ]; do
    [[ $rest =~ $group_re ]] || return 1
    num=$((10#${BASH_REMATCH[1]}))
    unit=${BASH_REMATCH[2]}
    rest=${BASH_REMATCH[3]}
    # Guard against the regex engine splitting "ms" into "m" + "s".
    if [ "$unit" = "m" ] && [[ $rest == s* ]]; then
      unit="ms"
      rest=${rest#s}
    fi
    case "$unit" in
      ms) total_ms=$((total_ms + num)) ;;
      s) total_ms=$((total_ms + num * 1000)) ;;
      m) total_ms=$((total_ms + num * 60000)) ;;
      h) total_ms=$((total_ms + num * 3600000)) ;;
      d) total_ms=$((total_ms + num * 86400000)) ;;
      w) total_ms=$((total_ms + num * 604800000)) ;;
      y) total_ms=$((total_ms + num * 31536000000)) ;;
    esac
  done
  if ((total_ms % 1000 == 0)); then
    echo "$((total_ms / 1000))"
  else
    printf '%d.%03d\n' "$((total_ms / 1000))" "$((total_ms % 1000))"
  fi
}

# Print the "Scrape Performance" section: target health counts, the list of
# down targets, per-job scrape durations and sample-limit statistics.
# Globals read: TARGETS_DATA, CONFIG_DATA, METRICS_RAW (through prom_metric)
# Side effects: adds a CRITICAL recommendation when targets are down and
#               WARNING ones for jobs whose slowest scrape exceeds 80% of
#               their interval or when sample limits were exceeded.
analyze_scrapes() {
  print_header "Scrape Performance"
  if [ -z "$TARGETS_DATA" ]; then
    echo " (targets endpoint not available)"
    return
  fi

  # Count targets by health
  local total_targets up_targets down_targets unknown_targets
  total_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length')
  up_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length')
  down_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length')
  unknown_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health != "up" and .health != "down")] | length')
  print_metric "Total targets" "$total_targets"
  print_metric "Targets up" "${GREEN}${up_targets}${NC}"
  if [ "$down_targets" -gt 0 ] 2>/dev/null; then
    print_metric "Targets down" "${RED}${down_targets}${NC}"
  else
    print_metric "Targets down" "$down_targets"
  fi
  if [ "$unknown_targets" -gt 0 ] 2>/dev/null; then
    print_metric "Targets unknown" "$unknown_targets"
  fi
  echo ""

  # List down targets (first 20)
  if [ "$down_targets" -gt 0 ] 2>/dev/null; then
    echo -e " ${BOLD}${RED}Down Targets${NC}"
    echo ""
    echo "$TARGETS_DATA" | jq -r '
      .data.activeTargets // [] |
      .[] | select(.health == "down") |
      " \(.labels.job // "unknown") \(.labels.instance // .scrapeUrl) \(.lastError // "no error")"
    ' 2>/dev/null | head -20
    echo ""
    add_recommendation "CRITICAL" "scrapes" "$down_targets scrape target(s) are down -- check target availability"
  fi

  # Scrape duration analysis per job
  echo -e " ${BOLD}Scrape Duration by Job${NC}"
  echo ""
  printf " %-30s %10s %10s %10s %8s\n" "Job" "Avg" "Max" "Interval" "Status"
  printf " %-30s %10s %10s %10s %8s\n" "---" "---" "---" "--------" "------"

  # Global scrape interval: first scrape_interval entry in the config YAML
  local global_interval=""
  if [ -n "$CONFIG_DATA" ]; then
    global_interval=$(echo "$CONFIG_DATA" | jq -r '.data.yaml' 2>/dev/null | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
  fi
  [ -z "$global_interval" ] && global_interval="60s"
  local global_interval_secs
  global_interval_secs=$(_scrape_duration_secs "$global_interval") || global_interval_secs=60

  # The loop is fed through process substitution, not a pipe: a pipe would
  # run it in a subshell and discard every add_recommendation call made
  # inside it. The job name is the last field so a "|" in it is harmless.
  local job avg max interval avg_fmt max_fmt effective_interval
  local interval_secs status max_num interval_80
  while IFS='|' read -r avg max interval job; do
    [ -z "$job" ] && continue
    avg_fmt=$(echo "$avg" | awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}')
    max_fmt=$(echo "$max" | awk '{if ($1 < 1) printf "%.0f ms", $1*1000; else printf "%.2f s", $1}')
    effective_interval="${interval:-$global_interval}"
    interval_secs=$(_scrape_duration_secs "$effective_interval") || interval_secs="$global_interval_secs"
    status="OK"
    # Compare max*100 with interval*80, i.e. max against 80% of the interval.
    max_num=$(echo "$max" | awk '{printf "%.0f", $1 * 100}')
    interval_80=$(echo "$interval_secs" | awk '{printf "%.0f", $1 * 80}')
    if [ "$max_num" -gt "$interval_80" ] 2>/dev/null; then
      status="${RED}SLOW${NC}"
      add_recommendation "WARNING" "scrapes" "Job '$job' max scrape duration (${max_fmt}) exceeds 80% of scrape interval (${effective_interval}) -- increase interval or optimize target"
    fi
    printf " %-30s %10s %10s %10s %b\n" "$job" "$avg_fmt" "$max_fmt" "$effective_interval" "$status"
  done < <(echo "$TARGETS_DATA" | jq -r '
    [.data.activeTargets // [] | .[] | select(.health == "up") |
    {job: (.labels.job // "unknown"), duration: .lastScrapeDuration, interval: .scrapeInterval}] |
    group_by(.job) |
    .[] |
    {
      job: .[0].job,
      avg: ([.[].duration] | add / length),
      max: ([.[].duration] | max),
      interval: .[0].interval
    } |
    "\(.avg)|\(.max)|\(.interval // "")|\(.job)"
  ' 2>/dev/null)
  echo ""

  # Scrape sample stats
  local exceeded dropped
  exceeded=$(prom_metric "prometheus_target_scrapes_exceeded_sample_limit_total")
  dropped=$(prom_metric "prometheus_target_scrapes_sample_duplicate_timestamp_total")
  if [ "$exceeded" != "0" ] && [ -n "$exceeded" ]; then
    local exceeded_num
    exceeded_num=$(echo "$exceeded" | awk '{printf "%d", $1}')
    if [ "$exceeded_num" -gt 0 ] 2>/dev/null; then
      print_metric "Sample limit exceeded" "$(format_number "$exceeded")"
      add_recommendation "WARNING" "scrapes" "$(format_number "$exceeded") scrapes exceeded sample_limit -- increase sample_limit in scrape config or reduce target metrics"
    fi
  fi
  if [ "$dropped" != "0" ] && [ -n "$dropped" ]; then
    local dropped_num
    dropped_num=$(echo "$dropped" | awk '{printf "%d", $1}')
    if [ "$dropped_num" -gt 0 ] 2>/dev/null; then
      print_metric "Duplicate timestamps" "$(format_number "$dropped")"
    fi
  fi
}
# ============================================================================
# SECTION: RULES
# ============================================================================
# Print the "Rule Evaluation" section: rule and group counts, evaluation
# failures and a per-group table of last evaluation time versus interval.
# Globals read: RULES_DATA, METRICS_RAW (through prom_metric)
# Side effects: adds WARNING recommendations for evaluation failures and
#               for groups whose evaluation takes more than 50% of their
#               interval.
analyze_rules() {
  print_header "Rule Evaluation"
  if [ -z "$RULES_DATA" ]; then
    echo " (rules endpoint not available)"
    return
  fi

  # Count rules
  local recording_rules alerting_rules total_groups
  recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
  alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')
  total_groups=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[]] | length')
  print_metric "Rule groups" "$total_groups"
  print_metric "Recording rules" "$recording_rules"
  print_metric "Alerting rules" "$alerting_rules"
  echo ""

  # Rule evaluation failures
  local eval_failures
  eval_failures=$(prom_metric "prometheus_rule_evaluation_failures_total")
  if [ "$eval_failures" != "0" ] && [ -n "$eval_failures" ]; then
    local fail_num
    fail_num=$(echo "$eval_failures" | awk '{printf "%d", $1}')
    if [ "$fail_num" -gt 0 ] 2>/dev/null; then
      print_metric "Evaluation failures" "${RED}$(format_number "$eval_failures")${NC}"
      add_recommendation "WARNING" "rules" "$(format_number "$eval_failures") rule evaluation failures -- check rule syntax and query targets"
    fi
  fi

  # Group evaluation durations
  echo -e " ${BOLD}Rule Group Evaluation Duration${NC}"
  echo ""
  printf " %-40s %12s %12s %8s\n" "Group" "Last Eval" "Interval" "Status"
  printf " %-40s %12s %12s %8s\n" "-----" "---------" "--------" "------"

  # The loop is fed through process substitution, not a pipe: a pipe would
  # run it in a subshell and discard every add_recommendation call made
  # inside it. The group name is the last field so a "|" in it is harmless.
  local name _last_eval eval_time interval
  local eval_fmt interval_fmt status eval_pct
  while IFS='|' read -r _last_eval eval_time interval name; do
    [ -z "$name" ] && continue
    eval_fmt=$(echo "$eval_time" | awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')
    interval_fmt="${interval}s"
    status="OK"
    # Skip the ratio when the interval is missing or zero (no division by 0).
    if [ -n "$interval" ] && [ "$interval" != "0" ]; then
      eval_pct=$(echo "$eval_time $interval" | awk '{printf "%d", ($1 / $2) * 100}')
      if [ "$eval_pct" -gt 50 ] 2>/dev/null; then
        status="${YELLOW}SLOW${NC}"
        add_recommendation "WARNING" "rules" "Rule group '$name' evaluation (${eval_fmt}) exceeds 50% of interval (${interval_fmt}) -- consider splitting group or adding recording rules"
      fi
    fi
    printf " %-40s %12s %12s %b\n" "${name:0:40}" "$eval_fmt" "$interval_fmt" "$status"
  done < <(echo "$RULES_DATA" | jq -r '
    .data.groups // [] | .[] |
    "\(.lastEvaluation // "")|\(.evaluationTime // 0)|\(.interval // 0)|\(.name)"
  ' 2>/dev/null)
  echo ""
}
# ============================================================================
# SECTION: STORAGE
# ============================================================================
# Storage section of the report.
# Prints retention flags, block/WAL sizes, a rough growth estimate derived
# from the samples-appended counter, and usage against the configured
# retention size limit.
# Globals:   FLAGS_DATA (read), BOLD, NC (read)
# Outputs:   report text on stdout; may call add_recommendation
analyze_storage() {
    print_header "Storage"

    # Retention settings
    local retention_time retention_size storage_path
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    storage_path=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.path"] // "/data"')
    print_metric "Storage path" "$storage_path"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    echo ""

    # Block stats from /metrics
    local blocks_loaded
    blocks_loaded=$(prom_metric "prometheus_tsdb_blocks_loaded")
    print_metric "Blocks loaded" "$(format_number "$blocks_loaded")"

    # Storage size from metrics
    local storage_size_bytes
    storage_size_bytes=$(prom_metric "prometheus_tsdb_storage_blocks_bytes")
    if [ "$storage_size_bytes" != "0" ] && [ -n "$storage_size_bytes" ]; then
        print_metric "Block storage size" "$(format_bytes "$storage_size_bytes")"
    fi

    # WAL size
    local wal_size
    wal_size=$(prom_metric "prometheus_tsdb_wal_storage_size_bytes")
    if [ "$wal_size" != "0" ] && [ -n "$wal_size" ]; then
        print_metric "WAL size" "$(format_bytes "$wal_size")"
    fi

    # Total storage (blocks + WAL); awk copes with exponent notation
    local total_storage
    total_storage=$(echo "${storage_size_bytes:-0} ${wal_size:-0}" | awk '{printf "%.0f", $1 + $2}')
    if [ "$total_storage" -gt 0 ] 2>/dev/null; then
        print_metric "Total TSDB size" "$(format_bytes "$total_storage")"
    fi
    echo ""

    # WAL segments
    local wal_segments
    wal_segments=$(prom_metric "prometheus_tsdb_wal_segment_current")
    if [ "$wal_segments" != "0" ] && [ -n "$wal_segments" ]; then
        print_metric "WAL current segment" "$(format_number "$wal_segments")"
    fi

    # Growth estimation
    echo ""
    echo -e " ${BOLD}Growth Estimation${NC}"
    echo ""
    local samples_appended start_time now_time uptime_secs=0
    samples_appended=$(prom_metric "prometheus_tsdb_head_samples_appended_total")
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # Use awk rather than bc: the start time may be rendered in exponent
        # notation (e.g. 1.7e+09), which bc cannot parse.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" 'BEGIN {
            if (start ~ /^[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?$/) printf "%d", now - start
            else printf "0"
        }')
        uptime_secs=${uptime_secs:-0}
    fi

    if [ "${uptime_secs:-0}" -gt 0 ] 2>/dev/null && [ -n "$samples_appended" ] && [ "$samples_appended" != "0" ]; then
        local samples_per_sec samples_per_day
        samples_per_sec=$(echo "$samples_appended $uptime_secs" | awk '{printf "%.1f", $1 / $2}')
        samples_per_day=$(echo "$samples_per_sec" | awk '{printf "%.0f", $1 * 86400}')
        print_metric "Samples/second" "$(format_number "$samples_per_sec")"
        print_metric "Samples/day" "$(format_number "$samples_per_day")"

        # Estimate bytes per sample (~1-2 bytes compressed)
        local bytes_per_day_low bytes_per_day_high
        bytes_per_day_low=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 1.0}')
        bytes_per_day_high=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 2.0}')
        print_metric "Estimated disk/day" "$(format_bytes "$bytes_per_day_low") -- $(format_bytes "$bytes_per_day_high")"

        # Estimate 30-day storage
        local monthly_low monthly_high
        monthly_low=$(echo "$bytes_per_day_low" | awk '{printf "%.0f", $1 * 30}')
        monthly_high=$(echo "$bytes_per_day_high" | awk '{printf "%.0f", $1 * 30}')
        print_metric "Estimated disk/30 days" "$(format_bytes "$monthly_low") -- $(format_bytes "$monthly_high")"
    else
        echo " (insufficient uptime data for growth estimation)"
    fi

    # Retention size check
    if [ "$retention_size" != "not set" ] && [ "$retention_size" != "0" ] \
        && [ "$retention_size" != "0B" ] && [ -n "$total_storage" ]; then
        local ret_bytes
        # Parse the retention size into bytes. Accepts a bare byte count,
        # "512MB"-style values, IEC-style values such as "50GiB", and
        # compound values such as "1GiB512MiB". All units are powers of 1024.
        ret_bytes=$(echo "$retention_size" | awk '{
            s = $1
            total = 0
            while (match(s, /^[0-9]+(\.[0-9]+)?/)) {
                num = substr(s, 1, RLENGTH) + 0
                s = substr(s, RLENGTH + 1)
                unit = ""
                if (match(s, /^[A-Za-z]+/)) {
                    unit = toupper(substr(s, 1, RLENGTH))
                    s = substr(s, RLENGTH + 1)
                }
                sub(/IB$/, "B", unit)    # KiB/MiB/... -> KB/MB/...
                mult = 1
                if (unit == "KB") mult = 1024
                else if (unit == "MB") mult = 1048576
                else if (unit == "GB") mult = 1073741824
                else if (unit == "TB") mult = 1099511627776
                else if (unit == "PB") mult = 1125899906842624
                else if (unit == "EB") mult = 1152921504606846976
                total += num * mult
            }
            printf "%.0f", total
        }')
        if [ "$ret_bytes" -gt 0 ] 2>/dev/null; then
            local usage_pct
            usage_pct=$(echo "$total_storage $ret_bytes" | awk '{printf "%d", ($1 / $2) * 100}')
            print_metric "Retention usage" "${usage_pct}%"
            if [ "$usage_pct" -gt 90 ] 2>/dev/null; then
                add_recommendation "WARNING" "storage" "Storage at ${usage_pct}% of retention size limit ($retention_size) -- data will be dropped soon, consider increasing retention.size"
            fi
        fi
    fi
}
# ============================================================================
# SECTION: MEMORY
# ============================================================================
# Memory section of the report.
# Prints process RSS/virtual size, Go heap statistics, GC pause figures,
# the goroutine count and file-descriptor usage, and records warnings for
# a high RSS/heap ratio, a high goroutine count, or near-exhausted FDs.
# Globals:   RUNTIME_INFO (read), BOLD, NC (read)
# Outputs:   report text on stdout; may call add_recommendation
analyze_memory() {
    print_header "Memory"

    # Process memory
    local rss_bytes vss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")
    vss_bytes=$(prom_metric "process_virtual_memory_bytes")
    print_metric "Process RSS" "$(format_bytes "$rss_bytes")"
    print_metric "Process virtual" "$(format_bytes "$vss_bytes")"
    echo ""

    # Go runtime memory
    local heap_alloc heap_sys heap_inuse
    heap_alloc=$(prom_metric "go_memstats_heap_alloc_bytes")
    heap_sys=$(prom_metric "go_memstats_heap_sys_bytes")
    heap_inuse=$(prom_metric "go_memstats_heap_inuse_bytes")
    echo -e " ${BOLD}Go Runtime Memory${NC}"
    echo ""
    print_metric "Heap alloc" "$(format_bytes "$heap_alloc")"
    print_metric "Heap sys" "$(format_bytes "$heap_sys")"
    print_metric "Heap in use" "$(format_bytes "$heap_inuse")"

    # RSS vs Go heap ratio (skipped when either value is missing or zero)
    if [ -n "$rss_bytes" ] && [ "$rss_bytes" != "0" ] && [ -n "$heap_alloc" ] && [ "$heap_alloc" != "0" ]; then
        local ratio ratio_int
        ratio=$(echo "$rss_bytes $heap_alloc" | awk '{printf "%.1f", $1 / $2}')
        print_metric "RSS / Heap ratio" "${ratio}x"
        # Integer part only, so the shell can compare it
        ratio_int=$(echo "$ratio" | awk '{printf "%d", $1}')
        if [ "$ratio_int" -ge 3 ] 2>/dev/null; then
            add_recommendation "WARNING" "memory" "RSS is ${ratio}x Go heap -- indicates memory fragmentation or mmap overhead, consider restarting Prometheus during a maintenance window"
        fi
    fi
    echo ""

    # GC stats
    echo -e " ${BOLD}Garbage Collection${NC}"
    echo ""
    local gc_count gc_pause_total
    gc_count=$(prom_metric "go_gc_duration_seconds_count")
    gc_pause_total=$(prom_metric "go_gc_duration_seconds_sum")
    print_metric "GC cycles (total)" "$(format_number "$gc_count")"
    if [ -n "$gc_pause_total" ] && [ "$gc_pause_total" != "0" ]; then
        print_metric "GC pause (total)" "$(echo "$gc_pause_total" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
        # Average GC pause
        if [ -n "$gc_count" ] && [ "$gc_count" != "0" ]; then
            local avg_pause
            avg_pause=$(echo "$gc_pause_total $gc_count" | awk '{printf "%.3f", ($1 / $2) * 1000}')
            print_metric "GC avg pause" "${avg_pause} ms"
        fi
    fi

    # quantile="1" of the GC duration summary is the maximum observed pause
    local gc_max
    gc_max=$(prom_metric_labeled 'go_gc_duration_seconds{quantile="1"}' | awk '{print $NF}')
    if [ -n "$gc_max" ] && [ "$gc_max" != "0" ]; then
        print_metric "GC max pause" "$(echo "$gc_max" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
    fi
    echo ""

    # Goroutines. The runtimeinfo API exposes the count as "goroutineCount";
    # the legacy "goroutines" key is kept as a fallback, and if neither yields
    # a value the go_goroutines gauge is used instead.
    local goroutines
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // 0')
    if [ -z "$goroutines" ] || [ "$goroutines" = "0" ]; then
        goroutines=$(prom_metric "go_goroutines")
    fi
    goroutines=${goroutines:-0}
    print_metric "Goroutines" "$(format_number "$goroutines")"
    if [ "$goroutines" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "memory" "Goroutine count is $goroutines (>1000) -- may indicate resource leak or excessive concurrency"
    fi

    # Open file descriptors
    local open_fds max_fds
    open_fds=$(prom_metric "process_open_fds")
    max_fds=$(prom_metric "process_max_fds")
    if [ -n "$open_fds" ] && [ "$open_fds" != "0" ]; then
        print_metric "Open file descriptors" "$(format_number "$open_fds")"
        if [ -n "$max_fds" ] && [ "$max_fds" != "0" ]; then
            print_metric "Max file descriptors" "$(format_number "$max_fds")"
            local fd_pct
            fd_pct=$(echo "$open_fds $max_fds" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$fd_pct" -gt 80 ] 2>/dev/null; then
                add_recommendation "WARNING" "memory" "File descriptor usage at ${fd_pct}% -- approaching limit, increase ulimit -n"
            fi
        fi
    fi
}
# ============================================================================
# SECTION: CONFIG
# ============================================================================
# Convert a Prometheus duration string to seconds.
# Accepts single or compound durations built from the units
# ms, s, m, h, d, w, y (e.g. "15s", "1m30s", "500ms"); a bare number is
# treated as seconds.
# Arguments: $1 - duration string
# Outputs:   seconds with millisecond precision on stdout
# Returns:   non-zero (and prints nothing) when the string cannot be parsed
_pa_duration_to_secs() {
    awk -v d="$1" 'BEGIN {
        s = d
        total = 0
        seen = 0
        while (s != "") {
            if (!match(s, /^[0-9]+(\.[0-9]+)?/)) exit 1
            num = substr(s, 1, RLENGTH) + 0
            s = substr(s, RLENGTH + 1)
            # "ms" must be checked before the single-letter "m"
            if (substr(s, 1, 2) == "ms") {
                mult = 0.001
                s = substr(s, 3)
            } else {
                unit = substr(s, 1, 1)
                s = substr(s, 2)
                if (unit == "s" || unit == "") mult = 1
                else if (unit == "m") mult = 60
                else if (unit == "h") mult = 3600
                else if (unit == "d") mult = 86400
                else if (unit == "w") mult = 604800
                else if (unit == "y") mult = 31536000
                else exit 1
            }
            total += num * mult
            seen = 1
        }
        if (!seen) exit 1
        printf "%.3f", total
    }'
}

# Configuration section of the report.
# Reviews global scrape/evaluation settings, external labels, remote
# endpoints, scrape-job intervals and a few key flags, recording
# recommendations for questionable settings.
# Globals:   CONFIG_DATA, FLAGS_DATA, TARGETS_DATA (read), BOLD, NC (read)
# Outputs:   report text on stdout; may call add_recommendation
analyze_config() {
    print_header "Configuration Review"
    if [ -z "$CONFIG_DATA" ] && [ -z "$FLAGS_DATA" ]; then
        echo " (configuration data not available)"
        return
    fi

    # Global config from YAML
    local config_yaml=""
    if [ -n "$CONFIG_DATA" ]; then
        config_yaml=$(echo "$CONFIG_DATA" | jq -r '.data.yaml // ""' 2>/dev/null)
    fi

    # First occurrence of each key is taken; fall back to the values shown
    # below when the key is absent from the YAML.
    local scrape_interval scrape_timeout eval_interval
    scrape_interval=$(echo "$config_yaml" | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    scrape_timeout=$(echo "$config_yaml" | grep -m1 'scrape_timeout:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    eval_interval=$(echo "$config_yaml" | grep -m1 'evaluation_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    [ -z "$scrape_interval" ] && scrape_interval="1m"
    [ -z "$scrape_timeout" ] && scrape_timeout="10s"
    [ -z "$eval_interval" ] && eval_interval="1m"

    echo -e " ${BOLD}Global Settings${NC}"
    echo ""
    print_metric "Scrape interval" "$scrape_interval"
    print_metric "Scrape timeout" "$scrape_timeout"
    print_metric "Evaluation interval" "$eval_interval"

    # Parse intervals to seconds for comparison. Unparsable values leave the
    # variable empty, in which case the check is skipped.
    local scrape_int_secs scrape_to_secs
    scrape_int_secs=$(_pa_duration_to_secs "$scrape_interval") || scrape_int_secs=""
    scrape_to_secs=$(_pa_duration_to_secs "$scrape_timeout") || scrape_to_secs=""
    if [ -n "$scrape_int_secs" ] && [ -n "$scrape_to_secs" ] \
        && awk -v t="$scrape_to_secs" -v i="$scrape_int_secs" 'BEGIN { exit !(t + 0 >= i + 0) }'; then
        add_recommendation "WARNING" "config" "scrape_timeout ($scrape_timeout) >= scrape_interval ($scrape_interval) -- timeout should be less than interval"
    fi
    echo ""

    # External labels: collect the "key: value" lines nested under
    # external_labels:. Nesting is decided by indentation -- the block ends
    # at the first non-blank line indented no deeper than the key itself.
    # (An awk range pattern cannot be used here: the external_labels: line
    # would also satisfy a "next sibling key" end pattern, collapsing the
    # range to that single line.)
    local external_labels
    external_labels=$(echo "$config_yaml" | awk '
        in_block {
            if ($0 ~ /^[ \t]*$/) next
            rest = $0
            sub(/^ +/, "", rest)
            if (length($0) - length(rest) <= base) exit
            if (index($0, ":")) print
            next
        }
        /^ *external_labels:/ {
            rest = $0
            sub(/^ +/, "", rest)
            base = length($0) - length(rest)
            in_block = 1
        }
    ')
    echo -e " ${BOLD}External Labels${NC}"
    echo ""
    if [ -n "$external_labels" ]; then
        echo "$external_labels" | while IFS= read -r line; do
            echo " $line"
        done
    else
        echo " (none configured)"
        add_recommendation "INFO" "config" "No external labels configured -- recommended for remote write, federation, and cross-cluster identification"
    fi
    echo ""

    # Remote write/read
    echo -e " ${BOLD}Remote Endpoints${NC}"
    echo ""
    local remote_write_count remote_read_count
    # grep -c exits non-zero on zero matches; "|| true" keeps that from
    # being treated as a failure while still capturing the printed "0".
    remote_write_count=$(echo "$config_yaml" | grep -c 'remote_write:' 2>/dev/null || true)
    remote_read_count=$(echo "$config_yaml" | grep -c 'remote_read:' 2>/dev/null || true)
    local has_remote_write="no"
    local has_remote_read="no"
    [ "$remote_write_count" -gt 0 ] 2>/dev/null && has_remote_write="yes"
    [ "$remote_read_count" -gt 0 ] 2>/dev/null && has_remote_read="yes"
    print_metric "Remote write" "$has_remote_write"
    print_metric "Remote read" "$has_remote_read"
    echo ""

    # Job count and interval distribution
    echo -e " ${BOLD}Scrape Jobs${NC}"
    echo ""
    local job_count
    job_count=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[].labels.job] | unique | length' 2>/dev/null)
    print_metric "Total scrape jobs" "${job_count:-0}"

    # Check for aggressive scrape intervals (whole-second intervals < 10s)
    if [ -n "$TARGETS_DATA" ]; then
        local fast_scrape_jobs
        fast_scrape_jobs=$(echo "$TARGETS_DATA" | jq -r '
            [.data.activeTargets // [] | .[] |
            select(.scrapeInterval != null) |
            {job: .labels.job, interval: .scrapeInterval}] |
            unique_by(.job) |
            .[] |
            select(
            (.interval | test("^[0-9]+s$")) and
            (.interval | gsub("s$"; "") | tonumber) < 10
            ) |
            .job
            ' 2>/dev/null)
        if [ -n "$fast_scrape_jobs" ]; then
            local fast_count
            fast_count=$(echo "$fast_scrape_jobs" | wc -l)
            if [ "$fast_count" -gt 3 ] 2>/dev/null; then
                add_recommendation "WARNING" "config" "$fast_count jobs have scrape_interval < 10s -- high scrape frequency increases storage cost and cardinality"
            fi
        fi
    fi

    # Key flags
    echo ""
    echo -e " ${BOLD}Key Flags${NC}"
    echo ""
    if [ -n "$FLAGS_DATA" ]; then
        local web_listen log_level tsdb_wal_compression
        web_listen=$(echo "$FLAGS_DATA" | jq -r '.data["web.listen-address"] // "unknown"')
        log_level=$(echo "$FLAGS_DATA" | jq -r '.data["log.level"] // "unknown"')
        tsdb_wal_compression=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.wal-compression"] // "unknown"')
        print_metric "Listen address" "$web_listen"
        print_metric "Log level" "$log_level"
        print_metric "WAL compression" "$tsdb_wal_compression"
        if [ "$tsdb_wal_compression" = "false" ]; then
            add_recommendation "INFO" "config" "WAL compression is disabled -- enabling it (--storage.tsdb.wal-compression) can reduce WAL size by ~50%"
        fi
    fi
}
# ============================================================================
# SECTION: SUMMARY
# ============================================================================
# Summary section of the report.
# Lists every collected recommendation ordered by severity (CRITICAL, then
# WARNING, then INFO), followed by per-severity counts and a 0-100 health
# score: 100 minus 15 per critical, 5 per warning and 1 per info finding.
# Globals:   RECOMMENDATIONS, CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT (read)
#            BOLD, RED, YELLOW, GREEN, CYAN, NC (read)
# Outputs:   report text on stdout
analyze_summary() {
    print_header "Recommendations"

    # Nothing was recorded: report a clean bill of health and stop.
    if (( ${#RECOMMENDATIONS[@]} == 0 )); then
        print_status "OK" "No issues detected -- Prometheus appears healthy"
        echo ""
        echo -e " ${BOLD}${GREEN}Health Score: 100 / 100${NC}"
        return
    fi

    # One pass over the list per severity level yields the grouped ordering.
    # Each entry has the form "SEVERITY|section|message"; the message may
    # itself contain "|" characters, hence -f3- for the last field.
    local entry_sev entry_section entry_msg
    for severity in CRITICAL WARNING INFO; do
        for rec in "${RECOMMENDATIONS[@]}"; do
            entry_sev=$(cut -d'|' -f1 <<< "$rec")
            [ "$entry_sev" = "$severity" ] || continue
            entry_section=$(cut -d'|' -f2 <<< "$rec")
            entry_msg=$(cut -d'|' -f3- <<< "$rec")
            echo -e " $(severity_tag "$entry_sev") ${BOLD}[${entry_section}]${NC} $entry_msg"
            echo ""
        done
    done

    # Health score, clamped so it never drops below zero.
    local score=$(( 100 - CRITICAL_COUNT * 15 - WARNING_COUNT * 5 - INFO_COUNT * 1 ))
    if [ "$score" -lt 0 ]; then
        score=0
    fi

    echo -e "${BOLD}${CYAN}====================================================${NC}"
    echo ""
    printf " Issues found: "
    if [ "$CRITICAL_COUNT" -gt 0 ]; then
        printf "${RED}%d critical${NC} " "$CRITICAL_COUNT"
    fi
    if [ "$WARNING_COUNT" -gt 0 ]; then
        printf "${YELLOW}%d warning${NC} " "$WARNING_COUNT"
    fi
    if [ "$INFO_COUNT" -gt 0 ]; then
        printf "${GREEN}%d info${NC}" "$INFO_COUNT"
    fi
    echo ""
    echo ""

    # Red by default; yellow from 50 upwards, green from 80 upwards.
    local score_color="$RED"
    if [ "$score" -ge 80 ]; then
        score_color="$GREEN"
    elif [ "$score" -ge 50 ]; then
        score_color="$YELLOW"
    fi
    echo -e " ${BOLD}Health Score: ${score_color}${score} / 100${NC}"
}
# ============================================================================
# JSON OUTPUT
# ============================================================================
# Normalise a value to an integer literal that is safe for `jq --argjson`.
# Arguments: $1 - candidate number (plain, decimal or exponent notation)
# Outputs:   the truncated integer on stdout, or 0 for empty / non-numeric input
_pa_json_int() {
    printf '%s\n' "$1" | awk '
        NR == 1 && $1 ~ /^[-+]?[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?$/ { printf "%d", $1; ok = 1 }
        END { if (!ok) printf "0" }'
}

# Emit the whole analysis as a single JSON document on stdout.
# Collects data, derives headline figures, runs every analyze_* section
# with output discarded (only to populate RECOMMENDATIONS and the severity
# counters), then assembles the document with jq.
# Every value handed to `--argjson` is normalised first, so a missing API
# response or an unparsable metric yields 0 / [] instead of making jq abort
# with no output at all.
# Globals:   RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, TARGETS_DATA, RULES_DATA,
#            RECOMMENDATIONS, CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT (read)
#            NO_COLOR and the colour variables (written: colours disabled)
# Outputs:   JSON report on stdout
output_json() {
    collect_all_data

    # Build JSON from all sections
    local version head_series head_chunks retention_time retention_size rss_bytes
    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"')
    version=${version:-unknown}

    local start_time now_time
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    local uptime_secs=0
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk instead of bc: the start time may use exponent notation
        # (e.g. 1.7e+09), which bc cannot parse.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" 'BEGIN {
            if (start ~ /^[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?$/) printf "%d", now - start
            else printf "0"
        }')
    fi
    uptime_secs=$(_pa_json_int "$uptime_secs")

    head_series=$(_pa_json_int "$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')")
    head_chunks=$(_pa_json_int "$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')")
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    retention_time=${retention_time:-not set}
    retention_size=${retention_size:-not set}
    rss_bytes=$(_pa_json_int "$(prom_metric "process_resident_memory_bytes")")

    local compactions_total compactions_failed wal_corruptions ooo_total
    compactions_total=$(_pa_json_int "$(prom_metric "prometheus_tsdb_compactions_total")")
    compactions_failed=$(_pa_json_int "$(prom_metric "prometheus_tsdb_compactions_failed_total")")
    wal_corruptions=$(_pa_json_int "$(prom_metric "prometheus_tsdb_wal_corruptions_total")")
    ooo_total=$(_pa_json_int "$(prom_metric "prometheus_tsdb_out_of_order_samples_total")")

    local total_targets up_targets down_targets
    total_targets=$(_pa_json_int "$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length')")
    up_targets=$(_pa_json_int "$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length')")
    down_targets=$(_pa_json_int "$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length')")

    local recording_rules alerting_rules
    recording_rules=$(_pa_json_int "$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')")
    alerting_rules=$(_pa_json_int "$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')")

    # Top metrics (first ten entries of the per-metric series counts)
    local top_metrics
    top_metrics=$(echo "$TSDB_STATUS" | jq '[.data.seriesCountByMetricName // [] | .[:10] | .[] | {name: .name, series: .value}]')
    [ -n "$top_metrics" ] || top_metrics='[]'

    # Run all checks to populate recommendations
    NO_COLOR=true
    RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""
    # Suppress output, just collect recommendations. These run in the
    # current shell (no pipe/subshell) so their additions are kept.
    analyze_overview >/dev/null 2>&1
    analyze_tsdb >/dev/null 2>&1
    analyze_cardinality >/dev/null 2>&1
    analyze_queries >/dev/null 2>&1
    analyze_scrapes >/dev/null 2>&1
    analyze_rules >/dev/null 2>&1
    analyze_storage >/dev/null 2>&1
    analyze_memory >/dev/null 2>&1
    analyze_config >/dev/null 2>&1

    # Build recommendations JSON. Each entry is "SEVERITY|section|message";
    # all three fields go through jq --arg so they are escaped properly and
    # the message carries no trailing newline. An empty list slurps to [].
    local rec_json sev section message
    rec_json=$(
        for rec in "${RECOMMENDATIONS[@]}"; do
            sev=$(echo "$rec" | cut -d'|' -f1)
            section=$(echo "$rec" | cut -d'|' -f2)
            message=$(echo "$rec" | cut -d'|' -f3-)
            jq -cn --arg severity "$sev" --arg section "$section" --arg message "$message" \
                '{severity: $severity, section: $section, message: $message}'
        done | jq -cs '.'
    )
    [ -n "$rec_json" ] || rec_json='[]'

    local score=100
    score=$((score - (CRITICAL_COUNT * 15)))
    score=$((score - (WARNING_COUNT * 5)))
    score=$((score - (INFO_COUNT * 1)))
    [ "$score" -lt 0 ] && score=0

    jq -n \
        --arg version "$version" \
        --argjson uptime "$uptime_secs" \
        --argjson head_series "$head_series" \
        --argjson head_chunks "$head_chunks" \
        --arg retention_time "$retention_time" \
        --arg retention_size "$retention_size" \
        --argjson rss_bytes "$rss_bytes" \
        --argjson compactions_total "$compactions_total" \
        --argjson compactions_failed "$compactions_failed" \
        --argjson wal_corruptions "$wal_corruptions" \
        --argjson ooo_samples "$ooo_total" \
        --argjson total_targets "$total_targets" \
        --argjson up_targets "$up_targets" \
        --argjson down_targets "$down_targets" \
        --argjson recording_rules "$recording_rules" \
        --argjson alerting_rules "$alerting_rules" \
        --argjson top_metrics "$top_metrics" \
        --argjson recommendations "$rec_json" \
        --argjson score "$score" \
        --argjson critical "$(_pa_json_int "$CRITICAL_COUNT")" \
        --argjson warnings "$(_pa_json_int "$WARNING_COUNT")" \
        --argjson info "$(_pa_json_int "$INFO_COUNT")" \
        '{
        prometheus: {
        version: $version,
        uptime_seconds: $uptime,
        memory_rss_bytes: $rss_bytes
        },
        tsdb: {
        head_series: $head_series,
        head_chunks: $head_chunks,
        retention_time: $retention_time,
        retention_size: $retention_size,
        compactions_total: $compactions_total,
        compactions_failed: $compactions_failed,
        wal_corruptions: $wal_corruptions,
        out_of_order_samples: $ooo_samples
        },
        targets: {
        total: $total_targets,
        up: $up_targets,
        down: $down_targets
        },
        rules: {
        recording: $recording_rules,
        alerting: $alerting_rules
        },
        cardinality: {
        top_metrics: $top_metrics
        },
        health: {
        score: $score,
        critical: $critical,
        warnings: $warnings,
        info: $info
        },
        recommendations: $recommendations
        }'
}
# ============================================================================
# MAIN
# ============================================================================
# Script entry point.
# Parses arguments, checks prerequisites, then either emits the JSON report
# or renders the text report (all sections, or the single section named by
# SECTION followed by the summary). Output goes to OUTPUT_FILE when set,
# otherwise to stdout.
# Globals:   JSON_MODE, OUTPUT_FILE, SECTION, PROM_URL, BOLD, DIM, NC (read)
# Arguments: the script's command-line arguments
# Returns:   exits 1 on failed requirements or an unknown section name
main() {
    parse_args "$@"
    if ! check_requirements; then
        exit 1
    fi

    # JSON mode
    if [ "$JSON_MODE" = true ]; then
        if [ -n "$OUTPUT_FILE" ]; then
            output_json > "$OUTPUT_FILE"
            echo "JSON report written to $OUTPUT_FILE" >&2
        else
            output_json
        fi
        return
    fi

    # Validate the section name up front. The report below is produced inside
    # a pipeline, whose left-hand side runs in a subshell: an `exit 1` there
    # would neither stop the script nor set its exit status, and the output
    # file would still be written.
    if [ -n "$SECTION" ]; then
        case "$SECTION" in
            overview|tsdb|cardinality|queries|scrapes|rules|storage|memory|config|summary)
                ;;
            *)
                echo "Unknown section: $SECTION" >&2
                echo "Valid sections: overview tsdb cardinality queries scrapes rules storage memory config summary" >&2
                exit 1
                ;;
        esac
    fi

    # Text report mode
    collect_all_data
    {
        echo -e "${BOLD}Prometheus Performance Analyzer v1.0${NC}"
        echo -e "${DIM}Target: ${PROM_URL}${NC}"
        echo -e "${DIM}Date: $(date '+%Y-%m-%d %H:%M:%S %Z')${NC}"
        if [ -n "$SECTION" ]; then
            # SECTION was validated above, so it names an analyze_* function.
            "analyze_${SECTION}"
            # Always show summary if not already shown
            if [ "$SECTION" != "summary" ]; then
                analyze_summary
            fi
        else
            analyze_overview
            analyze_tsdb
            analyze_cardinality
            analyze_queries
            analyze_scrapes
            analyze_rules
            analyze_storage
            analyze_memory
            analyze_config
            analyze_summary
        fi
        echo ""
    } | if [ -n "$OUTPUT_FILE" ]; then
        cat > "$OUTPUT_FILE"
        echo "Report written to $OUTPUT_FILE" >&2
    else
        cat
    fi
}
# Run the analyzer, forwarding all command-line arguments to main.
main "$@"