a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
1562 lines
59 KiB
Bash
Executable File
1562 lines
59 KiB
Bash
Executable File
#!/bin/bash
|
|
################################################################################
|
|
# Script Name: prometheus-performance-analyzer.sh
|
|
# Version: 1.01
|
|
# Description: Diagnostic tool that analyzes Prometheus server performance.
|
|
# Queries TSDB status, runtime info, flags, config, targets,
|
|
# rules, and internal metrics to produce a detailed report
|
|
# with actionable recommendations.
|
|
#
|
|
# Author: Phil Connor
|
|
# Contact: contact@mylinux.work
|
|
# Website: https://mylinux.work
|
|
# License: MIT
|
|
#
|
|
# Prerequisites:
# - curl
# - jq
# - bc (for calculations)
# - awk, grep, sed (standard text utilities)
# - Network access to Prometheus API
|
|
#
|
|
# Usage:
|
|
# # Analyze local Prometheus
|
|
# ./prometheus-performance-analyzer.sh
|
|
#
|
|
# # Analyze remote Prometheus
|
|
# ./prometheus-performance-analyzer.sh --url http://prometheus:9090
|
|
#
|
|
# # JSON output for automation
|
|
# ./prometheus-performance-analyzer.sh --json
|
|
#
|
|
# # Analyze specific section only
|
|
# ./prometheus-performance-analyzer.sh --section cardinality
|
|
#
|
|
# # Save report to file (auto-disables color)
|
|
# ./prometheus-performance-analyzer.sh -o report.txt
|
|
#
|
|
# # Custom series threshold
|
|
# ./prometheus-performance-analyzer.sh --threshold-series 5000000
|
|
#
|
|
# Sections:
|
|
# overview - Version, uptime, series counts, retention
|
|
# tsdb - TSDB head stats, compaction, WAL health
|
|
# cardinality - High cardinality metrics, labels, label-value pairs
|
|
# queries - Query engine performance and latency
|
|
# scrapes - Scrape target health and duration analysis
|
|
# rules - Recording/alerting rule evaluation
|
|
# storage - Disk, retention, WAL, growth estimation
|
|
# memory - RSS, Go heap, GC, goroutines
|
|
# config - Configuration review and best practices
|
|
# summary - Health score and all recommendations
|
|
#
|
|
################################################################################
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION & DEFAULTS
|
|
# ============================================================================
|
|
|
|
# Runtime options. Each one can be overridden by the matching command-line
# flag handled in parse_args.
PROM_URL="http://localhost:9090"   # --url
OUTPUT_FILE=""                     # -o / --output
JSON_MODE=false                    # --json
NO_COLOR=false                     # --no-color (also forced by --output)
SECTION=""                         # --section
THRESHOLD_SERIES=1000000           # --threshold-series

# Colors
# ANSI-C quoting ($'...') stores the real ESC byte rather than the four
# characters "\033", so the codes render correctly wherever they are printed
# (echo -e, printf format strings, %b and plain %s arguments alike).
RED=$'\033[0;31m'
YELLOW=$'\033[1;33m'
GREEN=$'\033[0;32m'
CYAN=$'\033[0;36m'
BOLD=$'\033[1m'
DIM=$'\033[2m'
NC=$'\033[0m'

# Global state
# Each RECOMMENDATIONS entry is "SEVERITY|section|message" (see add_recommendation).
declare -a RECOMMENDATIONS=()
CRITICAL_COUNT=0
WARNING_COUNT=0
INFO_COUNT=0

# Cached API responses (filled in by collect_all_data)
TSDB_STATUS=""
RUNTIME_INFO=""
FLAGS_DATA=""
CONFIG_DATA=""
TARGETS_DATA=""
RULES_DATA=""
METRICS_RAW=""
|
|
|
|
# ============================================================================
|
|
# HELPER FUNCTIONS
|
|
# ============================================================================
|
|
|
|
# Print the command-line help to stdout and terminate the script with status 0.
# The text expands $0 and the current values of PROM_URL and THRESHOLD_SERIES,
# so the defaults shown reflect whatever is set when the function runs.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Analyze Prometheus server performance and produce recommendations (v1.01).

OPTIONS:
    --url URL               Prometheus URL (default: $PROM_URL)
    --section NAME          Run a specific section only
                            (overview|tsdb|cardinality|queries|scrapes|rules|
                            storage|memory|config|summary)
    --json                  Output as JSON
    --no-color              Disable colored output
    --threshold-series N    Warning threshold for total series (default: $THRESHOLD_SERIES)
    -o, --output FILE       Write report to file (auto-disables color)
    -h, --help              Show this help

EXAMPLES:
    $0                                   # Analyze localhost:9090
    $0 --url http://prometheus:9090      # Remote server
    $0 --json                            # JSON output
    $0 --section cardinality             # Single section
    $0 -o report.txt                     # Save to file
    $0 --threshold-series 5000000        # Custom threshold

EOF
    exit 0
}
|
|
|
|
# Parse command-line options into the global settings.
# Globals written: PROM_URL, SECTION, JSON_MODE, NO_COLOR, THRESHOLD_SERIES,
#                  OUTPUT_FILE, and the colour variables (blanked when colour
#                  is disabled).
# Arguments: the script's positional parameters ("$@").
# Exits with status 1 on an unknown option, on a value-taking option that has
# no value, or on a non-numeric --threshold-series.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --url|--section|--threshold-series|-o|--output)
                # Without this guard "shift 2" fails when the option is the
                # last argument, nothing is shifted, and the loop never ends.
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: option $1 requires an argument" >&2
                    exit 1
                fi
                case "$1" in
                    --url)
                        PROM_URL="$2"
                        ;;
                    --section)
                        SECTION="$2"
                        ;;
                    --threshold-series)
                        # The value is used in shell arithmetic later, so it
                        # must be a plain non-negative integer.
                        if ! [[ "$2" =~ ^[0-9]+$ ]]; then
                            echo "ERROR: --threshold-series expects a non-negative integer, got: $2" >&2
                            exit 1
                        fi
                        # Force base 10 so leading zeros are not read as octal.
                        THRESHOLD_SERIES=$((10#$2))
                        ;;
                    -o|--output)
                        OUTPUT_FILE="$2"
                        NO_COLOR=true
                        ;;
                esac
                shift 2
                ;;
            --json)
                JSON_MODE=true
                shift
                ;;
            --no-color)
                NO_COLOR=true
                shift
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done

    # Strip trailing slash
    PROM_URL="${PROM_URL%/}"

    # Disable colors if requested
    if [ "$NO_COLOR" = true ]; then
        RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""
    fi
}
|
|
|
|
# Verify that the external tools the script depends on are available.
# curl, jq and awk must already be installed. If bc is missing, one install
# attempt is made with the first package manager found (apt-get, dnf, yum).
# Outputs: diagnostics on stderr.
# Returns: 0 when every tool is available, 1 otherwise.
check_requirements() {
    local missing=0
    local cmd

    for cmd in curl jq awk; do
        if ! command -v "$cmd" >/dev/null 2>&1; then
            echo "ERROR: $cmd not found" >&2
            missing=1
        fi
    done

    # Auto-install bc if missing
    if ! command -v bc >/dev/null 2>&1; then
        echo "bc not found -- installing..." >&2

        # Only go through sudo when it is needed and present; root on a host
        # without sudo can run the package manager directly.
        local -a elevate=()
        if [ "$(id -u)" -ne 0 ] && command -v sudo >/dev/null 2>&1; then
            elevate=(sudo)
        fi

        if command -v apt-get >/dev/null 2>&1; then
            "${elevate[@]}" apt-get install -y bc >/dev/null 2>&1
        elif command -v dnf >/dev/null 2>&1; then
            "${elevate[@]}" dnf install -y bc >/dev/null 2>&1
        elif command -v yum >/dev/null 2>&1; then
            "${elevate[@]}" yum install -y bc >/dev/null 2>&1
        fi

        # Re-check rather than trusting the installer's exit status.
        if ! command -v bc >/dev/null 2>&1; then
            echo "ERROR: failed to install bc -- install it manually" >&2
            missing=1
        fi
    fi

    return "$missing"
}
|
|
|
|
# Query a Prometheus HTTP endpoint and print the response body.
# Globals:   PROM_URL (read); optional PROM_CONNECT_TIMEOUT and PROM_MAX_TIME
#            (seconds) override the default 5s connect / 15s total timeouts.
# Arguments: $1 - path (and query string) appended to PROM_URL
# Outputs:   response body on stdout; curl's own diagnostics are discarded
# Returns:   curl's exit status (-f makes HTTP error responses a failure)
prom_api() {
    local endpoint="$1"
    curl -sf \
        --connect-timeout "${PROM_CONNECT_TIMEOUT:-5}" \
        --max-time "${PROM_MAX_TIME:-15}" \
        "${PROM_URL}${endpoint}" 2>/dev/null
}
|
|
|
|
# Run a PromQL instant query and print the value of the first result sample.
# Arguments: $1 - PromQL expression (URL-encoded here via jq's @uri)
# Outputs:   the sample value on stdout, or "0" when there is no result
# Returns:   1 when the request yields no body or the body cannot be parsed
#            (in both cases "0" is still printed); 0 otherwise
prom_query() {
    local query="$1"
    local encoded response value

    encoded=$(printf '%s' "$query" | jq -sRr @uri)
    response=$(prom_api "/api/v1/query?query=${encoded}")
    if [ -z "$response" ]; then
        echo "0"
        return 1
    fi

    # A body that is not valid JSON makes jq print nothing; fall back to "0"
    # so callers always receive a value.
    if ! value=$(printf '%s\n' "$response" | jq -r '.data.result[0].value[1] // "0"' 2>/dev/null) \
        || [ -z "$value" ]; then
        echo "0"
        return 1
    fi

    printf '%s\n' "$value"
}
|
|
|
|
# Extract the value of a sample from the cached /metrics text.
# Globals:   METRICS_RAW (read)
# Arguments: $1 - series name exactly as it appears at the start of the line
# Outputs:   the value of the first line that starts with "<name> ", or "0"
#            when METRICS_RAW is empty or no line matches
# NOTE: the name is compared literally, so a bare metric name does not match
# series that carry a {label} set.
prom_metric() {
    local metric="$1"
    if [ -z "$METRICS_RAW" ]; then
        echo "0"
        return
    fi

    local val
    # The name travels through the environment so awk does not apply escape
    # processing to it, and index() compares it as plain text, not as a regex.
    val=$(printf '%s\n' "$METRICS_RAW" | METRIC_NAME="$metric" awk '
        BEGIN { prefix = ENVIRON["METRIC_NAME"] " " }
        index($0, prefix) == 1 {
            split(substr($0, length(prefix) + 1), fields, " ")
            print fields[1]
            exit
        }')
    echo "${val:-0}"
}
|
|
|
|
# Print every line of the cached /metrics text that starts with a pattern.
# Globals:   METRICS_RAW (read)
# Arguments: $1 - grep basic regular expression, anchored at line start
# Outputs:   matching lines on stdout; nothing when METRICS_RAW is empty
# Returns:   grep's exit status (non-zero when no line matches)
prom_metric_labeled() {
    local pattern="$1"
    if [ -z "$METRICS_RAW" ]; then
        return
    fi
    printf '%s\n' "$METRICS_RAW" | grep "^${pattern}" 2>/dev/null
}
|
|
|
|
# Format a byte count as a human-readable size using 1024-based units.
# Arguments: $1 - byte count (integer, float or exponent notation)
# Outputs:   e.g. "512 B", "1.5 KB", "2.0 GB"; "0 B" for empty, "0", "null"
#            or non-numeric input
format_bytes() {
    local bytes="$1"
    if [ -z "$bytes" ] || [ "$bytes" = "0" ] || [ "$bytes" = "null" ]; then
        echo "0 B"
        return
    fi

    awk -v raw="$bytes" 'BEGIN {
        # Adding 0 forces a numeric value; otherwise non-numeric input is
        # compared as a string and lands in the TB branch.
        b = raw + 0
        if (b >= 1099511627776)   printf "%.1f TB\n", b / 1099511627776
        else if (b >= 1073741824) printf "%.1f GB\n", b / 1073741824
        else if (b >= 1048576)    printf "%.1f MB\n", b / 1048576
        else if (b >= 1024)       printf "%.1f KB\n", b / 1024
        else                      printf "%d B\n", b
    }'
}
|
|
|
|
# Format a number of seconds as a compact duration such as "1d 2h 3m 4s".
# Arguments: $1 - seconds (integer or float; the fraction is truncated)
# Outputs:   zero-valued units are omitted; "0s" for empty, zero or sub-second
#            input; negative input is formatted by magnitude with a leading "-"
format_duration() {
    local total="$1"
    if [ -z "$total" ] || [ "$total" = "0" ]; then
        echo "0s"
        return
    fi

    # Handle float seconds
    local secs sign=""
    secs=$(awk -v t="$total" 'BEGIN { printf "%d", t }')

    # Split on the magnitude: shell division and modulo on a negative number
    # would otherwise give negative parts that the checks below drop.
    if [ "$secs" -lt 0 ]; then
        sign="-"
        secs=$(( -secs ))
    fi

    local days=$(( secs / 86400 ))
    local hours=$(( (secs % 86400) / 3600 ))
    local mins=$(( (secs % 3600) / 60 ))
    local s=$(( secs % 60 ))

    local result=""
    if [ "$days" -gt 0 ]; then
        result="${days}d "
    fi
    if [ "$hours" -gt 0 ]; then
        result="${result}${hours}h "
    fi
    if [ "$mins" -gt 0 ]; then
        result="${result}${mins}m "
    fi
    # Seconds are shown when non-zero, or when nothing else was printed.
    if [ "$s" -gt 0 ] || [ -z "$result" ]; then
        result="${result}${s}s"
    fi

    echo "${sign}${result% }"
}
|
|
|
|
# Format a number with the locale's thousands separator and no decimals.
# Arguments: $1 - number (integer, float or exponent notation)
# Outputs:   the formatted number; "0" for empty or "null" input; the input
#            unchanged when printf cannot parse it as a number
format_number() {
    local n="$1"
    if [ -z "$n" ] || [ "$n" = "null" ]; then
        echo "0"
        return
    fi

    # Capture the output first: on invalid input printf still writes "0"
    # before failing, which would otherwise be glued onto the fallback.
    local formatted
    if formatted=$(printf "%'.0f" "$n" 2>/dev/null); then
        printf '%s\n' "$formatted"
    else
        printf '%s\n' "$n"
    fi
}
|
|
|
|
# Record a recommendation and update the per-severity counters.
# Globals:   RECOMMENDATIONS (appended), CRITICAL_COUNT / WARNING_COUNT /
#            INFO_COUNT (incremented for the matching severity)
# Arguments: $1 - severity; CRITICAL, WARNING and INFO are counted, any other
#                 value is stored without touching a counter
#            $2 - section name
#            $3 - message text
# The entry is stored as "severity|section|message".
# NOTE: this changes shell variables, so it has no lasting effect when called
# from a subshell (for example inside a "cmd | while ..." pipeline).
add_recommendation() {
    local severity="$1"
    local section="$2"
    local message="$3"

    case "$severity" in
        CRITICAL) CRITICAL_COUNT=$((CRITICAL_COUNT + 1)) ;;
        WARNING)  WARNING_COUNT=$((WARNING_COUNT + 1)) ;;
        INFO)     INFO_COUNT=$((INFO_COUNT + 1)) ;;
    esac

    RECOMMENDATIONS+=("${severity}|${section}|${message}")
}
|
|
|
|
# Print a section banner: a blank line, a rule, the title, a rule, a blank line.
# Globals:   BOLD, CYAN, NC (read)
# Arguments: $1 - section title, printed verbatim
print_header() {
    local title="$1"
    local rule="===================================================="

    # Colours go through %b so either form of colour variable renders; the
    # title goes through %s so backslashes in it are never interpreted.
    printf '\n'
    printf '%b%s%b\n' "${BOLD}${CYAN}" "$rule" "$NC"
    printf '%b%s%b\n' "${BOLD}${CYAN}" " ${title}" "$NC"
    printf '%b%s%b\n' "${BOLD}${CYAN}" "$rule" "$NC"
    printf '\n'
}
|
|
|
|
# Print one "label: value" line with the label padded to a 30-column field.
# Globals:   DIM, NC (read)
# Arguments: $1 - label (a ":" is appended)
#            $2 - value; backslash escapes in it are interpreted, so callers
#                 can pass values wrapped in the colour variables
print_metric() {
    local label="$1"
    local value="$2"
    # %b for the value: callers pass colour-wrapped values, and with %s a
    # colour stored as literal "\033[...m" text is printed as raw characters.
    printf ' %b%-30s%b %b\n' "$DIM" "$label:" "$NC" "$value"
}
|
|
|
|
# Print the coloured "[SEVERITY]" tag for a severity level.
# Globals:   RED, YELLOW, GREEN, NC (read)
# Arguments: $1 - CRITICAL, WARNING, INFO or OK
# Outputs:   the tag on stdout; nothing for any other value
severity_tag() {
    local sev="$1"
    local colour

    case "$sev" in
        CRITICAL) colour="$RED" ;;
        WARNING)  colour="$YELLOW" ;;
        INFO|OK)  colour="$GREEN" ;;
        *)        return 0 ;;
    esac

    printf '%b[%s]%b\n' "$colour" "$sev" "$NC"
}
|
|
|
|
# Print a status line made of a severity tag followed by a message.
# Arguments: $1 - severity, passed to severity_tag
#            $2 - message; backslash escapes in it are interpreted
print_status() {
    local severity="$1"
    local message="$2"
    local tag

    tag=$(severity_tag "$severity")
    # %b on both parts keeps the escape handling the line has always had.
    printf ' %b %b\n' "$tag" "$message"
}
|
|
|
|
# ============================================================================
|
|
# DATA COLLECTION
|
|
# ============================================================================
|
|
|
|
# Fetch every API response the analysis sections read and cache it in globals.
# Globals written: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, CONFIG_DATA,
#                  TARGETS_DATA, RULES_DATA, METRICS_RAW
# Outputs: progress and error messages on stderr.
# Exits the script with status 1 when the first request (runtimeinfo) fails.
# The remaining requests are best-effort: a failed one leaves its variable
# empty.
collect_all_data() {
    echo -e "${DIM}Collecting data from ${PROM_URL}...${NC}" >&2

    # The runtimeinfo request doubles as the connectivity check, so the
    # endpoint is fetched once instead of twice.
    if ! RUNTIME_INFO=$(prom_api "/api/v1/status/runtimeinfo"); then
        echo -e "${RED}ERROR: Cannot reach Prometheus at ${PROM_URL}${NC}" >&2
        echo "Check the URL and ensure Prometheus is running." >&2
        exit 1
    fi

    TSDB_STATUS=$(prom_api "/api/v1/status/tsdb")
    FLAGS_DATA=$(prom_api "/api/v1/status/flags")
    CONFIG_DATA=$(prom_api "/api/v1/status/config")
    TARGETS_DATA=$(prom_api "/api/v1/targets")
    RULES_DATA=$(prom_api "/api/v1/rules")
    # /metrics goes through prom_api too, so it shares the same curl options.
    METRICS_RAW=$(prom_api "/metrics")

    echo -e "${DIM}Data collection complete.${NC}" >&2
}
|
|
|
|
# ============================================================================
|
|
# SECTION: OVERVIEW
|
|
# ============================================================================
|
|
|
|
# Print the "Overview" section: version, uptime, Go runtime figures, head
# series and chunk counts, retention settings and resident memory.
# Globals: RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, METRICS_RAW (read through the
#          helpers), THRESHOLD_SERIES (read).
# Adds a WARNING when head series exceed THRESHOLD_SERIES and a CRITICAL when
# they exceed five times that value.
analyze_overview() {
    print_header "Overview"

    # Runtime details. The runtimeinfo endpoint names the goroutine field
    # "goroutineCount"; the older key is kept as a fallback.
    local version goroutines gomaxprocs
    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"')
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // "0"')
    gomaxprocs=$(echo "$RUNTIME_INFO" | jq -r '.data.GOMAXPROCS // "0"')

    # When runtimeinfo carries no version, read it from the version label of
    # the prometheus_build_info series in /metrics.
    if [ -z "$version" ] || [ "$version" = "unknown" ]; then
        local build_info
        # The leading [{,] stops the pattern matching inside goversion="...".
        local version_re='[{,]version="([^"]*)"'
        build_info=$(prom_metric_labeled "prometheus_build_info{")
        if [[ "$build_info" =~ $version_re ]]; then
            version="${BASH_REMATCH[1]}"
        else
            version="unknown"
        fi
    fi

    # Uptime from process_start_time_seconds
    local start_time now_time uptime_secs
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk rather than bc: the sample is written in exponent notation
        # (e.g. 1.7e+09), which bc cannot parse. A negative difference
        # (clock skew) is clamped to 0.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" \
            'BEGIN { d = now - start; if (d < 0) d = 0; printf "%d", d }')
    else
        uptime_secs=0
    fi

    # Series and samples from TSDB
    local head_series head_chunks
    head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    head_chunks=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')

    # Retention from flags
    local retention_time retention_size
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')

    # Memory
    local rss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    print_metric "Prometheus version" "$version"
    print_metric "Uptime" "$(format_duration "$uptime_secs")"
    print_metric "GOMAXPROCS" "$gomaxprocs"
    print_metric "Goroutines" "$(format_number "$goroutines")"
    print_metric "Head series" "$(format_number "$head_series")"
    print_metric "Head chunks" "$(format_number "$head_chunks")"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    print_metric "Memory (RSS)" "$(format_bytes "$rss_bytes")"

    # Series threshold check. Errors from a non-numeric head_series are
    # deliberately silenced; the check is then simply skipped.
    local critical_limit=$((THRESHOLD_SERIES * 5))
    if [ "$head_series" -gt "$critical_limit" ] 2>/dev/null; then
        add_recommendation "CRITICAL" "overview" "Head series count $(format_number "$head_series") is very high (>$(format_number "$critical_limit")) -- investigate high cardinality metrics immediately"
    elif [ "$head_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        add_recommendation "WARNING" "overview" "Head series count $(format_number "$head_series") exceeds threshold $(format_number "$THRESHOLD_SERIES") -- review cardinality section"
    fi
}
|
|
|
|
# ============================================================================
|
|
# SECTION: TSDB
|
|
# ============================================================================
|
|
|
|
# Truncate a /metrics sample value (plain, float or exponent notation) to an
# integer so it can be used in shell integer tests. Empty input yields 0.
_tsdb_sample_to_int() {
    awk -v v="$1" 'BEGIN { printf "%d", v }'
}

# Print the "TSDB Health" section: head block statistics, compaction, WAL,
# out-of-order samples, head GC, checkpoints and tombstone cleanup.
# Globals: TSDB_STATUS and METRICS_RAW (read through the helpers).
# Adds CRITICAL recommendations for compaction failures and WAL corruptions,
# and WARNING recommendations for WAL truncation failures, more than 1000
# out-of-order samples and checkpoint creation failures.
analyze_tsdb() {
    print_header "TSDB Health"

    if [ -z "$TSDB_STATUS" ]; then
        echo " (TSDB status endpoint not available)"
        return
    fi

    # Head stats
    local num_series chunk_count min_time max_time num_label_pairs
    num_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    chunk_count=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')
    min_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.minTime // 0')
    max_time=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.maxTime // 0')
    num_label_pairs=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numLabelPairs // 0')

    # Calculate head block time range. The difference is divided by 1000
    # before being formatted as seconds. Both tests silence their own
    # errors so a non-numeric timestamp just skips the calculation.
    local head_range_secs=0
    if [ "$min_time" -gt 0 ] 2>/dev/null && [ "$max_time" -gt 0 ] 2>/dev/null; then
        head_range_secs=$(( (max_time - min_time) / 1000 ))
    fi

    print_metric "Head series" "$(format_number "$num_series")"
    print_metric "Head chunks" "$(format_number "$chunk_count")"
    print_metric "Head label pairs" "$(format_number "$num_label_pairs")"
    print_metric "Head block range" "$(format_duration "$head_range_secs")"
    echo ""

    # Compaction metrics from /metrics
    local compactions_total compactions_failed compaction_duration
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    compaction_duration=$(prom_metric "prometheus_tsdb_compaction_duration_seconds_sum")

    print_metric "Compactions total" "$(format_number "$compactions_total")"
    print_metric "Compaction failures" "$compactions_failed"
    if [ -n "$compaction_duration" ] && [ "$compaction_duration" != "0" ]; then
        print_metric "Compaction time (total)" "$(format_duration "$compaction_duration")"
    fi

    # Counters are normalised to integers before comparing: a raw sample such
    # as "3.0" or "1e+06" makes the integer test error out and would hide it.
    local failed_count
    failed_count=$(_tsdb_sample_to_int "$compactions_failed")
    if [ "$failed_count" -gt 0 ] 2>/dev/null; then
        add_recommendation "CRITICAL" "tsdb" "TSDB has $failed_count compaction failures -- investigate storage health (disk I/O, free space)"
    fi

    echo ""

    # WAL stats
    local wal_corruptions wal_truncate_total wal_truncate_failed
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    wal_truncate_total=$(prom_metric "prometheus_tsdb_wal_truncations_total")
    wal_truncate_failed=$(prom_metric "prometheus_tsdb_wal_truncations_failed_total")

    print_metric "WAL corruptions" "$wal_corruptions"
    print_metric "WAL truncations" "$(format_number "$wal_truncate_total")"
    print_metric "WAL truncation failures" "$wal_truncate_failed"

    local corruption_count truncate_failed_count
    corruption_count=$(_tsdb_sample_to_int "$wal_corruptions")
    if [ "$corruption_count" -gt 0 ] 2>/dev/null; then
        add_recommendation "CRITICAL" "tsdb" "WAL has $corruption_count corruption(s) -- check disk health and review the Prometheus logs for WAL repair errors"
    fi

    truncate_failed_count=$(_tsdb_sample_to_int "$wal_truncate_failed")
    if [ "$truncate_failed_count" -gt 0 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "WAL has $truncate_failed_count truncation failure(s) -- may cause WAL growth"
    fi

    echo ""

    # Out-of-order samples
    local ooo_total ooo_num
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")
    print_metric "Out-of-order samples" "$(format_number "$ooo_total")"

    ooo_num=$(_tsdb_sample_to_int "$ooo_total")
    if [ "$ooo_num" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "tsdb" "$(format_number "$ooo_total") out-of-order samples -- check NTP sync across targets or look for duplicate scraper configs"
    fi

    # Head GC
    local head_gc_duration
    head_gc_duration=$(prom_metric "prometheus_tsdb_head_gc_duration_seconds_sum")
    if [ -n "$head_gc_duration" ] && [ "$head_gc_duration" != "0" ]; then
        print_metric "Head GC time (total)" "$(format_duration "$head_gc_duration")"
    fi

    # Checkpoint creations
    local checkpoint_total checkpoint_failed checkpoint_failed_count
    checkpoint_total=$(prom_metric "prometheus_tsdb_checkpoint_creations_total")
    checkpoint_failed=$(prom_metric "prometheus_tsdb_checkpoint_creations_failed_total")
    print_metric "Checkpoints created" "$(format_number "$checkpoint_total")"

    checkpoint_failed_count=$(_tsdb_sample_to_int "$checkpoint_failed")
    if [ "$checkpoint_failed_count" -gt 0 ] 2>/dev/null; then
        print_metric "Checkpoint failures" "$checkpoint_failed_count"
        add_recommendation "WARNING" "tsdb" "$checkpoint_failed_count checkpoint creation failure(s) -- investigate disk health"
    fi

    # Tombstone cleanup
    local tombstones
    tombstones=$(prom_metric "prometheus_tsdb_tombstone_cleanup_seconds_sum")
    if [ -n "$tombstones" ] && [ "$tombstones" != "0" ]; then
        print_metric "Tombstone cleanup time" "$(format_duration "$tombstones")"
    fi
}
|
|
|
|
# ============================================================================
|
|
# SECTION: CARDINALITY
|
|
# ============================================================================
|
|
|
|
# Print the first ten entries of one TSDB status list as "value<TAB>name"
# lines, using a single jq process for the whole list.
# Arguments: $1 - key under .data in TSDB_STATUS
# Line breaks and tabs inside a name are replaced with spaces so that every
# entry stays on one line.
_cardinality_top10() {
    printf '%s\n' "$TSDB_STATUS" | jq -r --arg key "$1" '
        (.data[$key] // [])[:10][]
        | "\(.value // 0)\t\(.name // "" | tostring | gsub("[\n\r\t]"; " "))"
    ' 2>/dev/null
}

# Print the "High Cardinality Analysis" section: the top ten metrics by series
# count, labels by value count, label-value pairs by series count and labels
# by memory usage, followed by an overall assessment of the total series.
# Globals: TSDB_STATUS, THRESHOLD_SERIES (read).
# Adds a WARNING for every listed metric holding at least 10% of all series
# and for every listed label with more than 10000 distinct values.
analyze_cardinality() {
    print_header "High Cardinality Analysis"

    if [ -z "$TSDB_STATUS" ]; then
        echo " (TSDB status endpoint not available)"
        return
    fi

    local total_series
    total_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')

    local rows rank name count pct

    # Top metrics by series count
    echo -e " ${BOLD}Top Metrics by Series Count${NC}"
    echo ""

    rows=$(_cardinality_top10 "seriesCountByMetricName")
    if [ -n "$rows" ]; then
        rank=0
        # Read from a here-string, not a pipeline, so that add_recommendation
        # runs in this shell and its state changes are kept.
        while IFS=$'\t' read -r count name; do
            rank=$((rank + 1))
            [ -n "$name" ] || continue

            if [ "$total_series" -gt 0 ] 2>/dev/null; then
                # awk prints a leading zero ("0.5"), unlike bc (".5").
                pct=$(awk -v c="$count" -v t="$total_series" 'BEGIN { printf "%.1f", c * 100 / t }')
            else
                pct="0"
            fi
            printf " %2d. %-45s %10s (%5s%%)\n" "$rank" "$name" "$(format_number "$count")" "$pct"

            # Flag metrics consuming >= 10% of total. The raw counts are
            # compared so rounding of the displayed figure cannot tip it.
            if [ "$total_series" -gt 0 ] 2>/dev/null &&
                awk -v c="$count" -v t="$total_series" 'BEGIN { exit !(c * 100 >= t * 10) }'; then
                add_recommendation "WARNING" "cardinality" "$name has $(format_number "$count") series (${pct}% of total) -- consider adding metric_relabel_configs to drop unused label dimensions"
            fi
        done <<< "$rows"
    else
        echo " (no data available)"
    fi

    echo ""

    # Top labels by value count
    echo -e " ${BOLD}Top Labels by Value Count${NC}"
    echo ""

    rows=$(_cardinality_top10 "labelValueCountByLabelName")
    if [ -n "$rows" ]; then
        rank=0
        while IFS=$'\t' read -r count name; do
            rank=$((rank + 1))
            [ -n "$name" ] || continue

            printf " %2d. %-45s %10s values\n" "$rank" "$name" "$(format_number "$count")"

            # Flag labels with very high value counts
            if [ "$count" -gt 10000 ] 2>/dev/null; then
                add_recommendation "WARNING" "cardinality" "Label '$name' has $(format_number "$count") unique values -- high cardinality label, consider relabeling or dropping"
            fi
        done <<< "$rows"
    else
        echo " (no data available)"
    fi

    echo ""

    # Top label-value pairs by series count
    echo -e " ${BOLD}Top Label-Value Pairs by Series Count${NC}"
    echo ""

    rows=$(_cardinality_top10 "seriesCountByLabelValuePair")
    if [ -n "$rows" ]; then
        rank=0
        while IFS=$'\t' read -r count name; do
            rank=$((rank + 1))
            [ -n "$name" ] || continue
            printf " %2d. %-45s %10s series\n" "$rank" "$name" "$(format_number "$count")"
        done <<< "$rows"
    else
        echo " (no data available)"
    fi

    echo ""

    # Overall cardinality assessment
    if [ "$total_series" -gt "$((THRESHOLD_SERIES * 5))" ] 2>/dev/null; then
        print_status "CRITICAL" "Total series $(format_number "$total_series") -- well above recommended limits"
    elif [ "$total_series" -gt "$THRESHOLD_SERIES" ] 2>/dev/null; then
        print_status "WARNING" "Total series $(format_number "$total_series") -- above threshold $(format_number "$THRESHOLD_SERIES")"
    else
        print_status "OK" "Total series $(format_number "$total_series") -- within normal range"
    fi

    # Memory usage by label name; the whole sub-section is omitted when the
    # list is absent or empty.
    rows=$(_cardinality_top10 "memoryInBytesByLabelName")
    if [ -n "$rows" ]; then
        echo ""
        echo -e " ${BOLD}Top Labels by Memory Usage${NC}"
        echo ""
        rank=0
        while IFS=$'\t' read -r count name; do
            rank=$((rank + 1))
            [ -n "$name" ] || continue
            printf " %2d. %-45s %10s\n" "$rank" "$name" "$(format_bytes "$count")"
        done <<< "$rows"
    fi
}
|
|
|
|
# ============================================================================
|
|
# SECTION: QUERIES
|
|
# ============================================================================
|
|
|
|
# Print the "Query Performance" section: query engine flags, the quantiles of
# prometheus_engine_query_duration_seconds, and current query activity.
# Globals: FLAGS_DATA and METRICS_RAW (read through the helpers).
# Adds a WARNING when the p99 inner_eval latency is above 10 seconds and when
# the number of active queries has reached query.max-concurrency.
analyze_queries() {
    print_header "Query Performance"

    # Query engine settings from flags
    local max_concurrency query_timeout lookback_delta
    max_concurrency=$(echo "$FLAGS_DATA" | jq -r '.data["query.max-concurrency"] // "unknown"')
    query_timeout=$(echo "$FLAGS_DATA" | jq -r '.data["query.timeout"] // "unknown"')
    lookback_delta=$(echo "$FLAGS_DATA" | jq -r '.data["query.lookback-delta"] // "unknown"')

    print_metric "Max concurrent queries" "$max_concurrency"
    print_metric "Query timeout" "$query_timeout"
    print_metric "Lookback delta" "$lookback_delta"
    echo ""

    # Query duration quantiles from /metrics
    echo -e " ${BOLD}Query Duration Percentiles${NC}"
    echo ""

    # prometheus_engine_query_duration_seconds is read here as a summary: one
    # line per slice/quantile pair. The trailing "{" keeps the _sum and
    # _count series out of the match.
    local query_durations
    query_durations=$(prom_metric_labeled "prometheus_engine_query_duration_seconds{")

    if [ -n "$query_durations" ]; then
        # Label values are pulled out with bash regular expressions, so no
        # GNU-only grep options are needed.
        local quantile_re='quantile="([^"]+)"'
        local slice_re='slice="([^"]+)"'
        local line quantile slice_name value label value_fmt

        while IFS= read -r line; do
            quantile=""
            slice_name=""
            if [[ "$line" =~ $quantile_re ]]; then
                quantile="${BASH_REMATCH[1]}"
            fi
            if [[ "$line" =~ $slice_re ]]; then
                slice_name="${BASH_REMATCH[1]}"
            fi
            # The sample value is the last whitespace-separated field.
            value="${line##*[[:space:]]}"

            if [ -n "$quantile" ] && [ -n "$value" ] && [ "$value" != "NaN" ]; then
                label="p$(awk -v q="$quantile" 'BEGIN { printf "%g", q * 100 }')"
                if [ -n "$slice_name" ]; then
                    label="${label} (${slice_name})"
                fi
                value_fmt=$(awk -v v="$value" 'BEGIN {
                    if (v < 0.001)  printf "%.3f ms", v * 1000
                    else if (v < 1) printf "%.1f ms", v * 1000
                    else            printf "%.2f s", v
                }')
                printf " %-35s %s\n" "$label" "$value_fmt"
            fi
        done <<< "$query_durations"
    else
        echo " (query duration metrics not available)"
    fi

    echo ""

    # Query performance from instant metrics
    local queries_active
    queries_active=$(prom_metric "prometheus_engine_queries")
    print_metric "Active queries (now)" "$queries_active"

    local queries_total
    queries_total=$(prom_metric "prometheus_engine_query_samples_total")
    if [ "$queries_total" != "0" ] && [ -n "$queries_total" ]; then
        print_metric "Total query samples" "$(format_number "$queries_total")"
    fi

    # Check for slow queries
    local p99_inner
    p99_inner=$(printf '%s\n' "$query_durations" |
        awk '/quantile="0.99"/ && /inner_eval/ { print $NF; exit }')

    if [ -n "$p99_inner" ] && [ "$p99_inner" != "NaN" ]; then
        # Compared as a float: truncating first would let 10.9s slip through.
        if awk -v v="$p99_inner" 'BEGIN { exit !(v > 10) }'; then
            add_recommendation "WARNING" "queries" "p99 inner eval query latency is ${p99_inner}s -- consider adding recording rules for complex queries or reducing cardinality"
        fi
    fi

    # Concurrent query check
    if [ "$max_concurrency" != "unknown" ] && [ "$queries_active" != "0" ]; then
        local active_num max_num
        active_num=$(awk -v v="$queries_active" 'BEGIN { printf "%d", v }')
        max_num=$(awk -v v="$max_concurrency" 'BEGIN { printf "%d", v }')
        if [ "$active_num" -ge "$max_num" ] 2>/dev/null; then
            add_recommendation "WARNING" "queries" "Active queries ($active_num) at or near max-concurrency ($max_num) -- consider increasing --query.max-concurrency"
        fi
    fi
}
|
|
|
|
# ============================================================================
|
|
# SECTION: SCRAPES
|
|
# ============================================================================
|
|
|
|
# Convert a Prometheus duration string (e.g. "15s", "1m30s", "500ms") or a
# bare number of seconds to seconds, printed on stdout.
# Returns non-zero, printing nothing, when the value is empty or not parseable.
_scrape_interval_to_secs() {
    awk -v d="$1" 'BEGIN {
        unit["ms"] = 0.001; unit["s"] = 1; unit["m"] = 60; unit["h"] = 3600
        unit["d"] = 86400; unit["w"] = 604800; unit["y"] = 31536000

        if (d == "") exit 1
        if (d ~ /^[0-9]+(\.[0-9]+)?$/) { print d + 0; exit 0 }

        total = 0
        rest = d
        while (rest != "") {
            if (!match(rest, /^[0-9]+/)) exit 1
            n = substr(rest, 1, RLENGTH)
            rest = substr(rest, RLENGTH + 1)
            # "ms" has to be tried before the single-letter units.
            u = (substr(rest, 1, 2) == "ms") ? "ms" : substr(rest, 1, 1)
            if (!(u in unit)) exit 1
            total += n * unit[u]
            rest = substr(rest, length(u) + 1)
        }
        print total
    }'
}

# Print the "Scrape Performance" section: target health counts, the list of
# down targets, a per-job scrape duration table and scrape sample statistics.
# Globals: TARGETS_DATA, CONFIG_DATA, METRICS_RAW (read through the helpers).
# Adds a CRITICAL recommendation when targets are down, and a WARNING for each
# job whose slowest scrape exceeds 80% of its interval and when scrapes have
# exceeded sample_limit.
analyze_scrapes() {
    print_header "Scrape Performance"

    if [ -z "$TARGETS_DATA" ]; then
        echo " (targets endpoint not available)"
        return
    fi

    # Count targets by health (one jq pass yields all four numbers)
    local total_targets up_targets down_targets unknown_targets
    read -r total_targets up_targets down_targets unknown_targets < <(
        echo "$TARGETS_DATA" | jq -r '
            (.data.activeTargets // []) as $t
            | [ ($t | length),
                ($t | map(select(.health == "up")) | length),
                ($t | map(select(.health == "down")) | length),
                ($t | map(select(.health != "up" and .health != "down")) | length) ]
            | map(tostring) | join(" ")'
    )

    # Coloured values are rendered to real escape sequences here so they
    # display correctly however print_metric prints its value argument.
    print_metric "Total targets" "$total_targets"
    print_metric "Targets up" "$(printf '%b' "${GREEN}${up_targets}${NC}")"
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets down" "$(printf '%b' "${RED}${down_targets}${NC}")"
    else
        print_metric "Targets down" "$down_targets"
    fi
    if [ "$unknown_targets" -gt 0 ] 2>/dev/null; then
        print_metric "Targets unknown" "$unknown_targets"
    fi

    echo ""

    # List down targets (at most 20 lines)
    if [ "$down_targets" -gt 0 ] 2>/dev/null; then
        echo -e " ${BOLD}${RED}Down Targets${NC}"
        echo ""

        echo "$TARGETS_DATA" | jq -r '
            .data.activeTargets // [] |
            .[] | select(.health == "down") |
            " \(.labels.job // "unknown") \(.labels.instance // .scrapeUrl) \(.lastError // "no error")"
        ' 2>/dev/null | head -20

        echo ""

        add_recommendation "CRITICAL" "scrapes" "$down_targets scrape target(s) are down -- check target availability"
    fi

    # Scrape duration analysis per job
    echo -e " ${BOLD}Scrape Duration by Job${NC}"
    echo ""
    printf " %-30s %10s %10s %10s %8s\n" "Job" "Avg" "Max" "Interval" "Status"
    printf " %-30s %10s %10s %10s %8s\n" "---" "---" "---" "--------" "------"

    # Get global scrape interval from config: the first scrape_interval entry
    # found in the YAML text is taken.
    local global_interval=""
    if [ -n "$CONFIG_DATA" ]; then
        global_interval=$(echo "$CONFIG_DATA" | jq -r '.data.yaml' 2>/dev/null |
            grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    fi
    [ -z "$global_interval" ] && global_interval="60s"

    # Parse interval to seconds
    local global_interval_secs
    global_interval_secs=$(_scrape_interval_to_secs "$global_interval") || global_interval_secs=60

    local job avg max interval
    local avg_fmt max_fmt effective_interval interval_secs status max_num interval_80

    # The loop reads from process substitution rather than a pipeline: in a
    # pipeline it would run in a subshell and every add_recommendation call
    # made inside it would be lost.
    while IFS='|' read -r job avg max interval; do
        [ -z "$job" ] && continue

        avg_fmt=$(awk -v v="$avg" 'BEGIN { if (v < 1) printf "%.0f ms", v * 1000; else printf "%.2f s", v }')
        max_fmt=$(awk -v v="$max" 'BEGIN { if (v < 1) printf "%.0f ms", v * 1000; else printf "%.2f s", v }')

        effective_interval="${interval:-$global_interval}"
        interval_secs=$(_scrape_interval_to_secs "$effective_interval") || interval_secs="$global_interval_secs"

        # Both sides are scaled by 100 so that "max > 80% of interval" can be
        # checked with an integer test.
        status="OK"
        max_num=$(awk -v v="$max" 'BEGIN { printf "%.0f", v * 100 }')
        interval_80=$(awk -v v="$interval_secs" 'BEGIN { printf "%.0f", v * 80 }')

        if [ "$max_num" -gt "$interval_80" ] 2>/dev/null; then
            status="${RED}SLOW${NC}"
            add_recommendation "WARNING" "scrapes" "Job '$job' max scrape duration (${max_fmt}) exceeds 80% of scrape interval (${effective_interval}) -- increase interval or optimize target"
        fi

        printf " %-30s %10s %10s %10s %b\n" "$job" "$avg_fmt" "$max_fmt" "$effective_interval" "$status"
    done < <(
        echo "$TARGETS_DATA" | jq -r '
            [.data.activeTargets // [] | .[] | select(.health == "up") |
             {job: (.labels.job // "unknown"), duration: .lastScrapeDuration, interval: .scrapeInterval}] |
            group_by(.job) |
            .[] |
            {
                job: .[0].job,
                avg: ([.[].duration] | add / length),
                max: ([.[].duration] | max),
                interval: .[0].interval,
                count: length
            } |
            "\(.job)|\(.avg)|\(.max)|\(.interval // "")"
        ' 2>/dev/null
    )

    echo ""

    # Scrape sample stats
    local exceeded dropped
    exceeded=$(prom_metric "prometheus_target_scrapes_exceeded_sample_limit_total")
    dropped=$(prom_metric "prometheus_target_scrapes_sample_duplicate_timestamp_total")

    if [ "$exceeded" != "0" ] && [ -n "$exceeded" ]; then
        local exceeded_num
        exceeded_num=$(awk -v v="$exceeded" 'BEGIN { printf "%d", v }')
        if [ "$exceeded_num" -gt 0 ] 2>/dev/null; then
            print_metric "Sample limit exceeded" "$(format_number "$exceeded")"
            add_recommendation "WARNING" "scrapes" "$(format_number "$exceeded") scrapes exceeded sample_limit -- increase sample_limit in scrape config or reduce target metrics"
        fi
    fi

    if [ "$dropped" != "0" ] && [ -n "$dropped" ]; then
        local dropped_num
        dropped_num=$(awk -v v="$dropped" 'BEGIN { printf "%d", v }')
        if [ "$dropped_num" -gt 0 ] 2>/dev/null; then
            print_metric "Duplicate timestamps" "$(format_number "$dropped")"
        fi
    fi
}

# ============================================================================
# SECTION: RULES
# ============================================================================

# Print the "Rule Evaluation" section: rule/group counts, the total number of
# rule evaluation failures, and a per-group table comparing the last
# evaluation duration with the group's interval.
#
# Globals read:  RULES_DATA (JSON body of the rules API response),
#                RED, YELLOW, BOLD, NC (colour escapes)
# Helpers used:  print_header, print_metric, prom_metric, format_number,
#                add_recommendation (all defined elsewhere in this file)
# Output:        report text on stdout
analyze_rules() {
    print_header "Rule Evaluation"

    if [ -z "$RULES_DATA" ]; then
        echo " (rules endpoint not available)"
        return
    fi

    # Count rules
    local recording_rules alerting_rules total_groups
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')
    total_groups=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[]] | length')

    print_metric "Rule groups" "$total_groups"
    print_metric "Recording rules" "$recording_rules"
    print_metric "Alerting rules" "$alerting_rules"
    echo ""

    # Rule evaluation failures
    local eval_failures fail_num
    eval_failures=$(prom_metric "prometheus_rule_evaluation_failures_total")
    if [ -n "$eval_failures" ] && [ "$eval_failures" != "0" ]; then
        fail_num=$(echo "$eval_failures" | awk '{printf "%d", $1}')
        if [ "$fail_num" -gt 0 ] 2>/dev/null; then
            print_metric "Evaluation failures" "${RED}$(format_number "$eval_failures")${NC}"
            add_recommendation "WARNING" "rules" "$(format_number "$eval_failures") rule evaluation failures -- check rule syntax and query targets"
        fi
    fi

    # Group evaluation durations
    echo -e " ${BOLD}Rule Group Evaluation Duration${NC}"
    echo ""
    printf " %-40s %12s %12s %8s\n" "Group" "Last Eval" "Interval" "Status"
    printf " %-40s %12s %12s %8s\n" "-----" "---------" "--------" "------"

    # The loop reads from a process substitution instead of being the last
    # stage of a pipeline: a pipeline stage runs in a subshell, so anything
    # add_recommendation changes in the shell's state would be discarded when
    # the loop ends.
    # The group name is emitted as the LAST field so that a name containing
    # "|" stays intact (read assigns the unsplit remainder to the last var).
    local name eval_time interval eval_fmt interval_fmt status eval_pct
    while IFS='|' read -r eval_time interval name; do
        if [ -z "$name" ]; then
            continue
        fi

        eval_fmt=$(echo "$eval_time" | awk '{if ($1 < 0.001) printf "%.3f ms", $1*1000; else if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')
        interval_fmt="${interval}s"

        status="OK"
        if [ -n "$interval" ] && [ "$interval" != "0" ]; then
            # Percentage of the interval consumed by the last evaluation.
            eval_pct=$(echo "$eval_time $interval" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$eval_pct" -gt 50 ] 2>/dev/null; then
                status="${YELLOW}SLOW${NC}"
                add_recommendation "WARNING" "rules" "Rule group '$name' evaluation (${eval_fmt}) exceeds 50% of interval (${interval_fmt}) -- consider splitting group or adding recording rules"
            fi
        fi

        printf " %-40s %12s %12s %b\n" "${name:0:40}" "$eval_fmt" "$interval_fmt" "$status"
    done < <(echo "$RULES_DATA" | jq -r '
        .data.groups // [] | .[] |
        "\(.evaluationTime // 0)|\(.interval // 0)|\(.name)"
    ' 2>/dev/null)

    echo ""
}

# ============================================================================
# SECTION: STORAGE
# ============================================================================

# Print the "Storage" section: retention flags, block/WAL sizes, a rough
# disk-growth estimate derived from the samples-appended counter and process
# uptime, and usage relative to the configured retention size.
#
# Globals read:  FLAGS_DATA (JSON body of the flags API response), BOLD, NC
# Helpers used:  print_header, print_metric, prom_metric, format_number,
#                format_bytes, add_recommendation (defined elsewhere)
# Output:        report text on stdout
analyze_storage() {
    print_header "Storage"

    # Retention settings
    local retention_time retention_size storage_path
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    storage_path=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.path"] // "/data"')

    print_metric "Storage path" "$storage_path"
    print_metric "Retention (time)" "$retention_time"
    print_metric "Retention (size)" "$retention_size"
    echo ""

    # Block stats from /metrics
    local blocks_loaded
    blocks_loaded=$(prom_metric "prometheus_tsdb_blocks_loaded")
    print_metric "Blocks loaded" "$(format_number "$blocks_loaded")"

    # Storage size from metrics
    local storage_size_bytes
    storage_size_bytes=$(prom_metric "prometheus_tsdb_storage_blocks_bytes")
    if [ -n "$storage_size_bytes" ] && [ "$storage_size_bytes" != "0" ]; then
        print_metric "Block storage size" "$(format_bytes "$storage_size_bytes")"
    fi

    # WAL size
    local wal_size
    wal_size=$(prom_metric "prometheus_tsdb_wal_storage_size_bytes")
    if [ -n "$wal_size" ] && [ "$wal_size" != "0" ]; then
        print_metric "WAL size" "$(format_bytes "$wal_size")"
    fi

    # Total storage
    local total_storage
    total_storage=$(echo "${storage_size_bytes:-0} ${wal_size:-0}" | awk '{printf "%.0f", $1 + $2}')
    if [ "$total_storage" -gt 0 ] 2>/dev/null; then
        print_metric "Total TSDB size" "$(format_bytes "$total_storage")"
    fi

    echo ""

    # WAL segments
    local wal_segments
    wal_segments=$(prom_metric "prometheus_tsdb_wal_segment_current")
    if [ -n "$wal_segments" ] && [ "$wal_segments" != "0" ]; then
        print_metric "WAL current segment" "$(format_number "$wal_segments")"
    fi

    # Growth estimation
    echo ""
    echo -e " ${BOLD}Growth Estimation${NC}"
    echo ""

    local samples_appended start_time now_time uptime_secs=0
    samples_appended=$(prom_metric "prometheus_tsdb_head_samples_appended_total")
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk (not bc) does the subtraction: metric values may be written in
        # exponent notation (e.g. 1.7e+09), which bc cannot parse. A value
        # that is not a plain number yields 0, i.e. "no uptime data".
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" 'BEGIN {
            if (start !~ /^[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$/) { print 0; exit }
            d = int(now - start)
            printf "%.0f", (d > 0 ? d : 0)
        }')
        uptime_secs=${uptime_secs:-0}
    fi

    if [ "${uptime_secs:-0}" -gt 0 ] 2>/dev/null && [ -n "$samples_appended" ] && [ "$samples_appended" != "0" ]; then
        local samples_per_sec samples_per_day
        samples_per_sec=$(echo "$samples_appended $uptime_secs" | awk '{printf "%.1f", $1 / $2}')
        samples_per_day=$(echo "$samples_per_sec" | awk '{printf "%.0f", $1 * 86400}')

        print_metric "Samples/second" "$(format_number "$samples_per_sec")"
        print_metric "Samples/day" "$(format_number "$samples_per_day")"

        # Estimate bytes per sample (~1-2 bytes compressed)
        local bytes_per_day_low bytes_per_day_high
        bytes_per_day_low=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 1.0}')
        bytes_per_day_high=$(echo "$samples_per_day" | awk '{printf "%.0f", $1 * 2.0}')

        print_metric "Estimated disk/day" "$(format_bytes "$bytes_per_day_low") -- $(format_bytes "$bytes_per_day_high")"

        # Estimate 30-day storage
        local monthly_low monthly_high
        monthly_low=$(echo "$bytes_per_day_low" | awk '{printf "%.0f", $1 * 30}')
        monthly_high=$(echo "$bytes_per_day_high" | awk '{printf "%.0f", $1 * 30}')
        print_metric "Estimated disk/30 days" "$(format_bytes "$monthly_low") -- $(format_bytes "$monthly_high")"
    else
        echo " (insufficient uptime data for growth estimation)"
    fi

    # Retention size check
    if [ "$retention_size" != "not set" ] && [ "$retention_size" != "0" ] && [ -n "$total_storage" ]; then
        local ret_bytes
        # Parse retention size. Accepts every unit from B to EB, both the
        # "GB" and "GiB" spellings (all treated as powers of 1024), and
        # compound values such as "1GiB512MiB"; each number+unit component
        # is converted and summed.
        ret_bytes=$(echo "$retention_size" | awk '{
            s = toupper($1)
            total = 0
            while (match(s, /^[0-9.]+[A-Z]*/)) {
                tok = substr(s, 1, RLENGTH)
                s = substr(s, RLENGTH + 1)
                num = tok + 0
                unit = tok
                sub(/^[0-9.]+/, "", unit)
                sub(/IB$/, "B", unit)
                mult = 1
                if (unit == "KB") mult = 1024
                else if (unit == "MB") mult = 1048576
                else if (unit == "GB") mult = 1073741824
                else if (unit == "TB") mult = 1099511627776
                else if (unit == "PB") mult = 1125899906842624
                else if (unit == "EB") mult = 1152921504606846976
                total += num * mult
            }
            printf "%.0f", total
        }')

        if [ "$ret_bytes" -gt 0 ] 2>/dev/null; then
            local usage_pct
            usage_pct=$(echo "$total_storage $ret_bytes" | awk '{printf "%d", ($1 / $2) * 100}')
            print_metric "Retention usage" "${usage_pct}%"

            if [ "$usage_pct" -gt 90 ] 2>/dev/null; then
                add_recommendation "WARNING" "storage" "Storage at ${usage_pct}% of retention size limit ($retention_size) -- data will be dropped soon, consider increasing retention.size"
            fi
        fi
    fi
}

# ============================================================================
# SECTION: MEMORY
# ============================================================================

# Print the "Memory" section: process RSS/virtual size, Go heap figures,
# RSS-to-heap ratio, garbage-collection pause statistics, goroutine count and
# file-descriptor usage, adding recommendations when thresholds are crossed.
#
# Globals read:  RUNTIME_INFO (JSON body of the runtime-info API response),
#                BOLD, NC
# Helpers used:  print_header, print_metric, prom_metric, prom_metric_labeled,
#                format_number, format_bytes, add_recommendation
#                (all defined elsewhere in this file)
# Output:        report text on stdout
analyze_memory() {
    print_header "Memory"

    # Process memory
    local rss_bytes vss_bytes
    rss_bytes=$(prom_metric "process_resident_memory_bytes")
    vss_bytes=$(prom_metric "process_virtual_memory_bytes")

    print_metric "Process RSS" "$(format_bytes "$rss_bytes")"
    print_metric "Process virtual" "$(format_bytes "$vss_bytes")"
    echo ""

    # Go runtime memory
    local heap_alloc heap_sys heap_inuse
    heap_alloc=$(prom_metric "go_memstats_heap_alloc_bytes")
    heap_sys=$(prom_metric "go_memstats_heap_sys_bytes")
    heap_inuse=$(prom_metric "go_memstats_heap_inuse_bytes")

    echo -e " ${BOLD}Go Runtime Memory${NC}"
    echo ""
    print_metric "Heap alloc" "$(format_bytes "$heap_alloc")"
    print_metric "Heap sys" "$(format_bytes "$heap_sys")"
    print_metric "Heap in use" "$(format_bytes "$heap_inuse")"

    # RSS vs Go heap ratio
    if [ -n "$rss_bytes" ] && [ "$rss_bytes" != "0" ] && [ -n "$heap_alloc" ] && [ "$heap_alloc" != "0" ]; then
        local ratio ratio_int
        ratio=$(echo "$rss_bytes $heap_alloc" | awk '{printf "%.1f", $1 / $2}')
        print_metric "RSS / Heap ratio" "${ratio}x"

        # Integer part only, so the shell can compare it.
        ratio_int=$(echo "$ratio" | awk '{printf "%d", $1}')
        if [ "$ratio_int" -ge 3 ] 2>/dev/null; then
            add_recommendation "WARNING" "memory" "RSS is ${ratio}x Go heap -- indicates memory fragmentation or mmap overhead, consider restarting Prometheus during a maintenance window"
        fi
    fi

    echo ""

    # GC stats
    echo -e " ${BOLD}Garbage Collection${NC}"
    echo ""

    local gc_count gc_pause_total
    gc_count=$(prom_metric "go_gc_duration_seconds_count")
    gc_pause_total=$(prom_metric "go_gc_duration_seconds_sum")

    print_metric "GC cycles (total)" "$(format_number "$gc_count")"
    if [ -n "$gc_pause_total" ] && [ "$gc_pause_total" != "0" ]; then
        print_metric "GC pause (total)" "$(echo "$gc_pause_total" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"

        # Average GC pause
        if [ -n "$gc_count" ] && [ "$gc_count" != "0" ]; then
            local avg_pause
            avg_pause=$(echo "$gc_pause_total $gc_count" | awk '{printf "%.3f", ($1 / $2) * 1000}')
            print_metric "GC avg pause" "${avg_pause} ms"
        fi
    fi

    # GC quantiles: quantile="1" of the summary is the maximum observed pause.
    local gc_max_pause
    gc_max_pause=$(prom_metric_labeled 'go_gc_duration_seconds{quantile="1"}' | awk '{print $NF}')
    if [ -n "$gc_max_pause" ] && [ "$gc_max_pause" != "0" ]; then
        print_metric "GC max pause" "$(echo "$gc_max_pause" | awk '{if ($1 < 1) printf "%.1f ms", $1*1000; else printf "%.2f s", $1}')"
    fi

    echo ""

    # Goroutines. The runtime-info endpoint reports the count under
    # "goroutineCount"; the previously used "goroutines" key is kept as a
    # secondary lookup. If neither yields a value, fall back to the
    # go_goroutines gauge from /metrics.
    local goroutines
    goroutines=$(echo "$RUNTIME_INFO" | jq -r '.data.goroutineCount // .data.goroutines // 0' 2>/dev/null)
    if [ -z "$goroutines" ] || [ "$goroutines" = "0" ]; then
        goroutines=$(prom_metric "go_goroutines" | awk '{printf "%.0f", $1}')
    fi
    goroutines=${goroutines:-0}
    print_metric "Goroutines" "$(format_number "$goroutines")"

    if [ "$goroutines" -gt 1000 ] 2>/dev/null; then
        add_recommendation "WARNING" "memory" "Goroutine count is $goroutines (>1000) -- may indicate resource leak or excessive concurrency"
    fi

    # Open file descriptors
    local open_fds max_fds
    open_fds=$(prom_metric "process_open_fds")
    max_fds=$(prom_metric "process_max_fds")

    if [ -n "$open_fds" ] && [ "$open_fds" != "0" ]; then
        print_metric "Open file descriptors" "$(format_number "$open_fds")"
        if [ -n "$max_fds" ] && [ "$max_fds" != "0" ]; then
            print_metric "Max file descriptors" "$(format_number "$max_fds")"
            local fd_pct
            fd_pct=$(echo "$open_fds $max_fds" | awk '{printf "%d", ($1 / $2) * 100}')
            if [ "$fd_pct" -gt 80 ] 2>/dev/null; then
                add_recommendation "WARNING" "memory" "File descriptor usage at ${fd_pct}% -- approaching limit, increase ulimit -n"
            fi
        fi
    fi
}

# ============================================================================
# SECTION: CONFIG
# ============================================================================

# Convert a duration string made of <number><unit> components (units: ms, s,
# m, h, d, w, y), e.g. "15s", "1m30s" or "500ms", to whole milliseconds.
#
# Arguments: $1 - duration string
#            $2 - value (in ms) to print when $1 is empty or not a valid
#                 duration (default: 0)
# Output:    integer number of milliseconds on stdout
_duration_to_ms() {
    awk -v dur="$1" -v fallback="${2:-0}" 'BEGIN {
        total = 0
        rest = dur
        ok = (rest != "")
        while (rest != "") {
            # "ms" is listed first so it is never read as "m" followed by "s".
            if (!match(rest, /^[0-9]+(\.[0-9]+)?(ms|[smhdwy])/)) { ok = 0; break }
            tok = substr(rest, 1, RLENGTH)
            rest = substr(rest, RLENGTH + 1)
            num = tok + 0
            unit = tok
            sub(/^[0-9.]+/, "", unit)
            if (unit == "ms") mult = 1
            else if (unit == "s") mult = 1000
            else if (unit == "m") mult = 60000
            else if (unit == "h") mult = 3600000
            else if (unit == "d") mult = 86400000
            else if (unit == "w") mult = 604800000
            else mult = 31536000000
            total += num * mult
        }
        printf "%.0f", (ok ? total : fallback)
    }'
}

# Print the "Configuration Review" section: global scrape/evaluation
# settings, external labels, remote read/write presence, scrape-job
# statistics and a few key command-line flags.
#
# Globals read:  CONFIG_DATA (JSON with the YAML config under .data.yaml),
#                FLAGS_DATA, TARGETS_DATA (JSON API responses), BOLD, NC
# Helpers used:  print_header, print_metric, add_recommendation
#                (defined elsewhere), _duration_to_ms
# Output:        report text on stdout
analyze_config() {
    print_header "Configuration Review"

    if [ -z "$CONFIG_DATA" ] && [ -z "$FLAGS_DATA" ]; then
        echo " (configuration data not available)"
        return
    fi

    # Global config from YAML
    local config_yaml=""
    if [ -n "$CONFIG_DATA" ]; then
        config_yaml=$(echo "$CONFIG_DATA" | jq -r '.data.yaml // ""' 2>/dev/null)
    fi

    # First occurrence of each key in the YAML text is used.
    local scrape_interval scrape_timeout eval_interval
    scrape_interval=$(echo "$config_yaml" | grep -m1 'scrape_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    scrape_timeout=$(echo "$config_yaml" | grep -m1 'scrape_timeout:' | awk '{print $2}' | tr -d "'" | tr -d '"')
    eval_interval=$(echo "$config_yaml" | grep -m1 'evaluation_interval:' | awk '{print $2}' | tr -d "'" | tr -d '"')

    [ -z "$scrape_interval" ] && scrape_interval="1m"
    [ -z "$scrape_timeout" ] && scrape_timeout="10s"
    [ -z "$eval_interval" ] && eval_interval="1m"

    echo -e " ${BOLD}Global Settings${NC}"
    echo ""
    print_metric "Scrape interval" "$scrape_interval"
    print_metric "Scrape timeout" "$scrape_timeout"
    print_metric "Evaluation interval" "$eval_interval"

    # Parse intervals for comparison. Milliseconds are used so compound
    # ("1m30s") and sub-second ("500ms") durations compare correctly;
    # unparseable values fall back to 60s / 10s.
    local scrape_int_ms scrape_to_ms
    scrape_int_ms=$(_duration_to_ms "$scrape_interval" 60000)
    scrape_to_ms=$(_duration_to_ms "$scrape_timeout" 10000)

    if [ "$scrape_to_ms" -ge "$scrape_int_ms" ] 2>/dev/null; then
        add_recommendation "WARNING" "config" "scrape_timeout ($scrape_timeout) >= scrape_interval ($scrape_interval) -- timeout should be less than interval"
    fi

    echo ""

    # External labels: collect the "key: value" lines nested under the first
    # "external_labels:" key, i.e. the following lines indented deeper than
    # the key itself. (An awk range pattern cannot be used here: the key line
    # would also match an "any key at this indent" end pattern, which closes
    # the range on that same line.)
    local external_labels
    external_labels=$(echo "$config_yaml" | awk '
        finished { next }
        in_block {
            if ($0 ~ /^ *$/) next
            if (match($0, /[^ ]/) - 1 <= base_indent) { in_block = 0; finished = 1; next }
            if (index($0, ":")) print
            next
        }
        /^ *external_labels:/ {
            base_indent = match($0, /[^ ]/) - 1
            in_block = 1
        }
    ')

    echo -e " ${BOLD}External Labels${NC}"
    echo ""

    if [ -n "$external_labels" ]; then
        local line
        while IFS= read -r line; do
            echo " $line"
        done <<< "$external_labels"
    else
        echo " (none configured)"
        add_recommendation "INFO" "config" "No external labels configured -- recommended for remote write, federation, and cross-cluster identification"
    fi

    echo ""

    # Remote write/read
    echo -e " ${BOLD}Remote Endpoints${NC}"
    echo ""

    local remote_write_count remote_read_count
    # grep -c exits non-zero on zero matches; "|| true" keeps that from
    # being treated as a failure.
    remote_write_count=$(echo "$config_yaml" | grep -c 'remote_write:' 2>/dev/null || true)
    remote_read_count=$(echo "$config_yaml" | grep -c 'remote_read:' 2>/dev/null || true)

    local has_remote_write="no"
    local has_remote_read="no"
    [ "$remote_write_count" -gt 0 ] 2>/dev/null && has_remote_write="yes"
    [ "$remote_read_count" -gt 0 ] 2>/dev/null && has_remote_read="yes"

    print_metric "Remote write" "$has_remote_write"
    print_metric "Remote read" "$has_remote_read"

    echo ""

    # Job count and interval distribution
    echo -e " ${BOLD}Scrape Jobs${NC}"
    echo ""

    local job_count
    job_count=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[].labels.job] | unique | length' 2>/dev/null)
    print_metric "Total scrape jobs" "${job_count:-0}"

    # Check for aggressive scrape intervals: jobs scraping more often than
    # every 10s, including sub-second ("<n>ms") intervals.
    if [ -n "$TARGETS_DATA" ]; then
        local fast_scrape_jobs
        fast_scrape_jobs=$(echo "$TARGETS_DATA" | jq -r '
            [.data.activeTargets // [] | .[] |
             select(.scrapeInterval != null) |
             {job: .labels.job, interval: .scrapeInterval}] |
            unique_by(.job) |
            .[] |
            select(
                (.interval | test("^[0-9]+ms$")) or
                ((.interval | test("^[0-9]+s$")) and
                 (.interval | gsub("s$"; "") | tonumber) < 10)
            ) |
            .job
        ' 2>/dev/null)

        if [ -n "$fast_scrape_jobs" ]; then
            local fast_count
            fast_count=$(echo "$fast_scrape_jobs" | grep -c '')
            if [ "$fast_count" -gt 3 ] 2>/dev/null; then
                add_recommendation "WARNING" "config" "$fast_count jobs have scrape_interval < 10s -- high scrape frequency increases storage cost and cardinality"
            fi
        fi
    fi

    # Key flags
    echo ""
    echo -e " ${BOLD}Key Flags${NC}"
    echo ""

    if [ -n "$FLAGS_DATA" ]; then
        local web_listen log_level tsdb_wal_compression
        web_listen=$(echo "$FLAGS_DATA" | jq -r '.data["web.listen-address"] // "unknown"')
        log_level=$(echo "$FLAGS_DATA" | jq -r '.data["log.level"] // "unknown"')
        tsdb_wal_compression=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.wal-compression"] // "unknown"')

        print_metric "Listen address" "$web_listen"
        print_metric "Log level" "$log_level"
        print_metric "WAL compression" "$tsdb_wal_compression"

        if [ "$tsdb_wal_compression" = "false" ]; then
            add_recommendation "INFO" "config" "WAL compression is disabled -- enabling it (--storage.tsdb.wal-compression) can reduce WAL size by ~50%"
        fi
    fi
}

# ============================================================================
# SECTION: SUMMARY
# ============================================================================

# Print the "Recommendations" section: every collected recommendation grouped
# by severity (CRITICAL, then WARNING, then INFO), the per-severity issue
# counts, and a 0-100 health score.
#
# Score: 100 - 15 per critical - 5 per warning - 1 per info, floored at 0.
#
# Globals read:  RECOMMENDATIONS (array of "SEVERITY|section|message"),
#                CRITICAL_COUNT, WARNING_COUNT, INFO_COUNT,
#                RED, YELLOW, GREEN, CYAN, BOLD, NC
# Helpers used:  print_header, print_status, severity_tag (defined elsewhere)
# Output:        report text on stdout
analyze_summary() {
    print_header "Recommendations"

    if [ ${#RECOMMENDATIONS[@]} -eq 0 ]; then
        print_status "OK" "No issues detected -- Prometheus appears healthy"
        echo ""
        echo -e " ${BOLD}${GREEN}Health Score: 100 / 100${NC}"
        return
    fi

    # Print recommendations grouped by severity
    # Sort: CRITICAL first, then WARNING, then INFO
    local severity rec sev rest section message
    for severity in CRITICAL WARNING INFO; do
        for rec in "${RECOMMENDATIONS[@]}"; do
            # Split "SEVERITY|section|message" with parameter expansion; the
            # message keeps any further "|" characters it contains.
            sev=${rec%%|*}
            if [ "$sev" != "$severity" ]; then
                continue
            fi
            rest=${rec#*|}
            section=${rest%%|*}
            message=${rest#*|}

            echo -e " $(severity_tag "$sev") ${BOLD}[${section}]${NC} $message"
            echo ""
        done
    done

    # Calculate health score
    local score=$((100 - CRITICAL_COUNT * 15 - WARNING_COUNT * 5 - INFO_COUNT * 1))
    if [ "$score" -lt 0 ]; then
        score=0
    fi

    echo -e "${BOLD}${CYAN}====================================================${NC}"

    echo ""
    printf " Issues found: "
    # Colours are passed as %b arguments rather than embedded in the format
    # string, so their content can never be read as a format directive.
    if [ "$CRITICAL_COUNT" -gt 0 ]; then
        printf '%b%d critical%b ' "$RED" "$CRITICAL_COUNT" "$NC"
    fi
    if [ "$WARNING_COUNT" -gt 0 ]; then
        printf '%b%d warning%b ' "$YELLOW" "$WARNING_COUNT" "$NC"
    fi
    if [ "$INFO_COUNT" -gt 0 ]; then
        printf '%b%d info%b' "$GREEN" "$INFO_COUNT" "$NC"
    fi
    echo ""
    echo ""

    local score_color
    if [ "$score" -ge 80 ]; then
        score_color="$GREEN"
    elif [ "$score" -ge 50 ]; then
        score_color="$YELLOW"
    else
        score_color="$RED"
    fi

    echo -e " ${BOLD}Health Score: ${score_color}${score} / 100${NC}"
}

# ============================================================================
# JSON OUTPUT
# ============================================================================

# Print "$1" as a whole number (fraction truncated); prints 0 when "$1" is
# empty or not numeric. Exponent notation such as 3.5e+09 is accepted, and
# "%.0f" is used so large values are not clamped by awks with a narrow "%d".
_json_int() {
    awk -v v="$1" 'BEGIN { printf "%.0f", int(v + 0) }'
}

# Emit the whole report as a single JSON document on stdout.
#
# Collects the data, runs every analyze_* section with output discarded (only
# the recommendations they register are kept), then assembles the result
# with jq. Every numeric value is normalised to a valid JSON number first, so
# an endpoint that returned nothing yields 0 instead of aborting the report.
#
# Globals read:     RUNTIME_INFO, TSDB_STATUS, FLAGS_DATA, TARGETS_DATA,
#                   RULES_DATA, RECOMMENDATIONS, CRITICAL_COUNT,
#                   WARNING_COUNT, INFO_COUNT
# Globals written:  NO_COLOR and all colour variables (blanked)
# Helpers used:     collect_all_data, prom_metric, analyze_* (defined
#                   elsewhere), _json_int
output_json() {
    collect_all_data

    # Build JSON from all sections
    local version head_series head_chunks retention_time retention_size rss_bytes

    version=$(echo "$RUNTIME_INFO" | jq -r '.data.version // "unknown"')
    version=${version:-unknown}

    local start_time now_time uptime_secs=0
    start_time=$(prom_metric "process_start_time_seconds")
    now_time=$(date +%s)
    if [ -n "$start_time" ] && [ "$start_time" != "0" ]; then
        # awk (not bc) so that exponent notation (1.7e+09) is understood; a
        # value that is not a plain number yields 0.
        uptime_secs=$(awk -v now="$now_time" -v start="$start_time" 'BEGIN {
            if (start !~ /^[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$/) { print 0; exit }
            d = int(now - start)
            printf "%.0f", (d > 0 ? d : 0)
        }')
    fi
    uptime_secs=${uptime_secs:-0}

    head_series=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.numSeries // 0')
    head_chunks=$(echo "$TSDB_STATUS" | jq -r '.data.headStats.chunkCount // 0')
    retention_time=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.time"] // "not set"')
    retention_size=$(echo "$FLAGS_DATA" | jq -r '.data["storage.tsdb.retention.size"] // "not set"')
    rss_bytes=$(prom_metric "process_resident_memory_bytes")

    local compactions_total compactions_failed wal_corruptions ooo_total
    compactions_total=$(prom_metric "prometheus_tsdb_compactions_total")
    compactions_failed=$(prom_metric "prometheus_tsdb_compactions_failed_total")
    wal_corruptions=$(prom_metric "prometheus_tsdb_wal_corruptions_total")
    ooo_total=$(prom_metric "prometheus_tsdb_out_of_order_samples_total")

    local total_targets up_targets down_targets
    total_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[]] | length')
    up_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "up")] | length')
    down_targets=$(echo "$TARGETS_DATA" | jq '[.data.activeTargets // [] | .[] | select(.health == "down")] | length')

    local recording_rules alerting_rules
    recording_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "recording")] | length')
    alerting_rules=$(echo "$RULES_DATA" | jq '[.data.groups // [] | .[].rules[] | select(.type == "alerting")] | length')

    # Top metrics
    local top_metrics
    top_metrics=$(echo "$TSDB_STATUS" | jq '[.data.seriesCountByMetricName // [] | .[:10] | .[] | {name: .name, series: .value}]')
    # jq prints nothing for empty input; --argjson needs valid JSON.
    top_metrics=${top_metrics:-[]}

    # Run all checks to populate recommendations
    NO_COLOR=true
    RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""

    # Suppress output, just collect recommendations
    analyze_overview >/dev/null 2>&1
    analyze_tsdb >/dev/null 2>&1
    analyze_cardinality >/dev/null 2>&1
    analyze_queries >/dev/null 2>&1
    analyze_scrapes >/dev/null 2>&1
    analyze_rules >/dev/null 2>&1
    analyze_storage >/dev/null 2>&1
    analyze_memory >/dev/null 2>&1
    analyze_config >/dev/null 2>&1

    # Build recommendations JSON in one jq pass. Each entry is
    # "SEVERITY|section|message"; jq does all string escaping and the message
    # keeps any "|" it contains (and gets no trailing newline).
    local rec_json='[]'
    if [ ${#RECOMMENDATIONS[@]} -gt 0 ]; then
        rec_json=$(printf '%s\n' "${RECOMMENDATIONS[@]}" | jq -R -n '
            [inputs | select(length > 0) | split("|") |
             {severity: .[0], section: .[1], message: (.[2:] | join("|"))}]')
        rec_json=${rec_json:-[]}
    fi

    local score=100
    score=$((score - (CRITICAL_COUNT * 15)))
    score=$((score - (WARNING_COUNT * 5)))
    score=$((score - (INFO_COUNT * 1)))
    if [ "$score" -lt 0 ]; then
        score=0
    fi

    jq -n \
        --arg version "$version" \
        --argjson uptime "$(_json_int "$uptime_secs")" \
        --argjson head_series "$(_json_int "$head_series")" \
        --argjson head_chunks "$(_json_int "$head_chunks")" \
        --arg retention_time "$retention_time" \
        --arg retention_size "$retention_size" \
        --argjson rss_bytes "$(_json_int "$rss_bytes")" \
        --argjson compactions_total "$(_json_int "$compactions_total")" \
        --argjson compactions_failed "$(_json_int "$compactions_failed")" \
        --argjson wal_corruptions "$(_json_int "$wal_corruptions")" \
        --argjson ooo_samples "$(_json_int "$ooo_total")" \
        --argjson total_targets "$(_json_int "$total_targets")" \
        --argjson up_targets "$(_json_int "$up_targets")" \
        --argjson down_targets "$(_json_int "$down_targets")" \
        --argjson recording_rules "$(_json_int "$recording_rules")" \
        --argjson alerting_rules "$(_json_int "$alerting_rules")" \
        --argjson top_metrics "$top_metrics" \
        --argjson recommendations "$rec_json" \
        --argjson score "$score" \
        --argjson critical "$(_json_int "$CRITICAL_COUNT")" \
        --argjson warnings "$(_json_int "$WARNING_COUNT")" \
        --argjson info "$(_json_int "$INFO_COUNT")" \
        '{
            prometheus: {
                version: $version,
                uptime_seconds: $uptime,
                memory_rss_bytes: $rss_bytes
            },
            tsdb: {
                head_series: $head_series,
                head_chunks: $head_chunks,
                retention_time: $retention_time,
                retention_size: $retention_size,
                compactions_total: $compactions_total,
                compactions_failed: $compactions_failed,
                wal_corruptions: $wal_corruptions,
                out_of_order_samples: $ooo_samples
            },
            targets: {
                total: $total_targets,
                up: $up_targets,
                down: $down_targets
            },
            rules: {
                recording: $recording_rules,
                alerting: $alerting_rules
            },
            cardinality: {
                top_metrics: $top_metrics
            },
            health: {
                score: $score,
                critical: $critical,
                warnings: $warnings,
                info: $info
            },
            recommendations: $recommendations
        }'
}

# ============================================================================
# MAIN
# ============================================================================

# Script entry point: parse arguments, verify requirements, then produce
# either the JSON report or the text report (all sections, or one section
# followed by the summary).
#
# Arguments:     command-line arguments, forwarded to parse_args
# Globals read:  JSON_MODE, OUTPUT_FILE, SECTION, PROM_URL, BOLD, DIM, NC
# Helpers used:  parse_args, check_requirements, collect_all_data,
#                output_json, analyze_* (all defined elsewhere in this file)
# Exit status:   1 when requirements are missing or --section names an
#                unknown section; non-zero if the report file cannot be
#                written
main() {
    parse_args "$@"

    if ! check_requirements; then
        exit 1
    fi

    # JSON mode
    if [ "$JSON_MODE" = true ]; then
        if [ -n "$OUTPUT_FILE" ]; then
            output_json > "$OUTPUT_FILE"
            echo "JSON report written to $OUTPUT_FILE" >&2
        else
            output_json
        fi
        return
    fi

    # Validate the requested section up front. Doing this inside the report
    # pipeline below would only terminate the pipeline's subshell: the script
    # would still exit 0 and, with an output file, report a partial file as
    # written. Validating first also avoids collecting data for nothing.
    case "$SECTION" in
        "" | overview | tsdb | cardinality | queries | scrapes | rules | storage | memory | config | summary)
            ;;
        *)
            echo "Unknown section: $SECTION" >&2
            echo "Valid sections: overview tsdb cardinality queries scrapes rules storage memory config summary" >&2
            exit 1
            ;;
    esac

    # Text report mode
    collect_all_data

    {
        echo -e "${BOLD}Prometheus Performance Analyzer v1.0${NC}"
        echo -e "${DIM}Target: ${PROM_URL}${NC}"
        echo -e "${DIM}Date: $(date '+%Y-%m-%d %H:%M:%S %Z')${NC}"

        if [ -n "$SECTION" ]; then
            # SECTION was validated above, so this resolves to one of the
            # known analyze_* functions.
            "analyze_${SECTION}"
            # Always show summary if not already shown
            if [ "$SECTION" != "summary" ]; then
                analyze_summary
            fi
        else
            analyze_overview
            analyze_tsdb
            analyze_cardinality
            analyze_queries
            analyze_scrapes
            analyze_rules
            analyze_storage
            analyze_memory
            analyze_config
            analyze_summary
        fi

        echo ""
    } | if [ -n "$OUTPUT_FILE" ]; then
        # Only claim success when the file was actually written.
        if cat > "$OUTPUT_FILE"; then
            echo "Report written to $OUTPUT_FILE" >&2
        else
            echo "Failed to write report to $OUTPUT_FILE" >&2
            false
        fi
    else
        cat
    fi
}
|
|
|
|
# Run the analyzer, forwarding all command-line arguments to main.
main "$@"
|