#!/usr/bin/env bash
#
# PostgreSQL HA Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for PostgreSQL HA clusters
# managed by Patroni. Queries PostgreSQL system views via psql and
# the Patroni REST API via curl to collect replication lag, WAL
# generation rate, standby count, leader/follower state, failover
# history, timeline ID, connection statistics, and database sizes.
#
# Usage:
#   ./postgresql-ha-exporter.sh
#   ./postgresql-ha-exporter.sh --textfile
#   ./postgresql-ha-exporter.sh --install
#
# Parameters:
#   --textfile    Write to textfile collector directory
#   --install     Create cron job for automatic collection
#   --help        Show usage
#
# Environment:
#   PG_HOST       PostgreSQL host (default: localhost)
#   PG_PORT       PostgreSQL port (default: 5432)
#   PG_USER       PostgreSQL user (default: postgres)
#   PG_DATABASE   PostgreSQL database (default: postgres)
#   PATRONI_URL   Patroni REST API URL (default: http://localhost:8008)
#   TEXTFILE_DIR  Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
#   CURL_TIMEOUT  API request timeout in seconds (default: 10)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
#   Core:
#     - postgresql_ha_up
#     - postgresql_ha_exporter_info{version}
#
#   Replication:
#     - postgresql_ha_replication_lag_bytes{standby}
#     - postgresql_ha_replication_lag_seconds{standby}
#     - postgresql_ha_standby_count
#
#   WAL:
#     - postgresql_ha_wal_bytes_total
#
#   Patroni:
#     - postgresql_ha_patroni_role{role}
#     - postgresql_ha_patroni_timeline
#     - postgresql_ha_patroni_failover_count
#
#   Connections:
#     - postgresql_ha_connections_active
#     - postgresql_ha_connections_idle
#     - postgresql_ha_connections_waiting
#     - postgresql_ha_connections_max
#
#   Databases:
#     - postgresql_ha_database_size_bytes{database}
#
#   Exporter:
#     - postgresql_ha_exporter_duration_seconds
#     - postgresql_ha_exporter_last_run_timestamp

set -euo pipefail

# --- Configuration ---

# Exporter release string.
readonly VERSION="1.0"

# Name the script was invoked as. Assigned first and marked readonly
# afterwards so a failing basename is not masked by the readonly builtin.
SCRIPT_NAME="$(basename -- "$0")"
readonly SCRIPT_NAME

# Settings that may be overridden from the environment; the value after
# ":-" is used when the variable is unset or empty.
PG_HOST="${PG_HOST:-localhost}"
PG_PORT="${PG_PORT:-5432}"
PG_USER="${PG_USER:-postgres}"
PG_DATABASE="${PG_DATABASE:-postgres}"
PATRONI_URL="${PATRONI_URL:-http://localhost:8008}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"

# Mutable run state shared by the functions below.
TEXTFILE_MODE=false
OUTPUT=""
START_TIME=""

# --- Functions ---

#######################################
# Print the help text and terminate the script with status 0.
# Globals:   SCRIPT_NAME (read)
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  cat <<EOF
Usage: $SCRIPT_NAME [OPTIONS]

PostgreSQL HA Prometheus Metrics Exporter

Options:
  --textfile    Write metrics to textfile collector directory
  --install     Create cron job for automatic collection
  --help        Show this help message

Environment Variables:
  PG_HOST       PostgreSQL host (default: localhost)
  PG_PORT       PostgreSQL port (default: 5432)
  PG_USER       PostgreSQL user (default: postgres)
  PG_DATABASE   PostgreSQL database (default: postgres)
  PATRONI_URL   Patroni REST API URL (default: http://localhost:8008)
  TEXTFILE_DIR  Output directory (default: /var/lib/node_exporter/textfile_collector)
  CURL_TIMEOUT  Request timeout in seconds (default: 10)

Examples:
  $SCRIPT_NAME
  $SCRIPT_NAME --textfile
  PG_HOST="db.example.com" PATRONI_URL="http://db.example.com:8008" $SCRIPT_NAME --install
EOF
  exit 0
}

#######################################
# Verify that the external commands this script invokes are on PATH.
# Globals:   none
# Arguments: none
# Outputs:   on failure, the missing commands and an install hint on stderr
# Returns:   0 when psql, curl and jq are all found; otherwise exits 1
#######################################
check_dependencies() {
  # 'cmd' is declared local so the loop variable does not leak to callers.
  local cmd
  local -a missing=()

  for cmd in psql curl jq; do
    command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd")
  done

  if (( ${#missing[@]} > 0 )); then
    echo "ERROR: Missing required commands: ${missing[*]}" >&2
    echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
    exit 1
  fi
}

#######################################
# Normalise and sanity-check the runtime configuration.
# Globals:   PATRONI_URL (read, rewritten without trailing slashes),
#            PG_HOST, PG_USER, CURL_TIMEOUT (read)
# Arguments: none
# Outputs:   an "ERROR: ..." line on stderr for the first invalid setting
# Returns:   0 when the configuration is usable; otherwise exits 1
#######################################
validate_config() {
  # Endpoints are appended as "/path", so drop every trailing slash
  # (not just one) to avoid building URLs such as "host//patroni".
  while [[ "$PATRONI_URL" == */ ]]; do
    PATRONI_URL="${PATRONI_URL%/}"
  done

  if [[ -z "$PG_HOST" ]]; then
    echo "ERROR: PG_HOST cannot be empty" >&2
    exit 1
  fi

  if [[ -z "$PG_USER" ]]; then
    echo "ERROR: PG_USER cannot be empty" >&2
    exit 1
  fi

  # The timeout is handed to curl --max-time; a malformed value would make
  # every Patroni request fail with its error output discarded, so reject
  # it here with a clear message instead.
  if [[ ! "${CURL_TIMEOUT:-10}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "ERROR: CURL_TIMEOUT must be a non-negative number of seconds" >&2
    exit 1
  fi
}

#######################################
# Run one SQL statement through psql and print its rows.
# Globals:   PG_HOST, PG_PORT, PG_USER, PG_DATABASE (read),
#            PGCONNECT_TIMEOUT (read, defaults to 10 seconds)
# Arguments: $1 - SQL text to execute
# Outputs:   tuples-only, unaligned rows on stdout; an empty line when psql
#            fails
# Returns:   0 always (best effort; callers test for an empty result)
#######################################
pg_query() {
  local query="$1"
  local rows

  # -X  ignore ~/.psqlrc so local settings cannot change the output format
  # -w  never prompt for a password (a prompt would hang an unattended run)
  # Output is buffered so that a statement failing part-way through yields
  # an empty result rather than a truncated one. stderr is discarded on
  # purpose: failures are reported to callers as an empty result.
  if rows=$(PGCONNECT_TIMEOUT="${PGCONNECT_TIMEOUT:-10}" \
    psql -X -w -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DATABASE" \
    -tAc "$query" 2>/dev/null); then
    printf '%s\n' "$rows"
  else
    echo ""
  fi
}

#######################################
# GET an endpoint of the Patroni REST API.
# Globals:   PATRONI_URL, CURL_TIMEOUT (read)
# Arguments: $1 - endpoint path appended to PATRONI_URL (e.g. "/patroni")
# Outputs:   response body on stdout; an empty line when the request fails
# Returns:   0 always (best effort; callers test for an empty result)
#######################################
api_get() {
  local endpoint="$1"
  local -a curl_args=(-sf --max-time "$CURL_TIMEOUT")
  local body

  # Buffer the body: if the transfer fails after some bytes were received
  # (timeout, reset), callers get an empty result instead of truncated
  # JSON. stderr is discarded on purpose, matching the empty-result contract.
  if body=$(curl "${curl_args[@]}" "${PATRONI_URL}${endpoint}" 2>/dev/null); then
    printf '%s\n' "$body"
  else
    echo ""
  fi
}

#######################################
# Append one complete metric (HELP line, TYPE line and a single sample)
# to the OUTPUT buffer.
# Globals:   OUTPUT (appended)
# Arguments: $1 - metric name
#            $2 - metric type (e.g. gauge)
#            $3 - help text
#            $4 - sample value
#            $5 - optional, pre-formatted label list without the braces,
#                 e.g. 'version="1.0"'
#######################################
add_metric() {
  local name="$1"
  local type="$2"
  local help="$3"
  local value="$4"
  local labels="${5:-}"

  local series="$name"
  if [[ -n "$labels" ]]; then
    series+="{${labels}}"
  fi

  # Lines are joined with explicit newlines rather than a multi-line string
  # literal, so nothing but these three lines can end up in the output.
  OUTPUT+="# HELP ${name} ${help}"$'\n'
  OUTPUT+="# TYPE ${name} ${type}"$'\n'
  OUTPUT+="${series} ${value}"$'\n'
}

#######################################
# Append a single sample line (no HELP/TYPE header) to the OUTPUT buffer.
# Used for metrics with several label sets whose header is written once
# by the caller.
# Globals:   OUTPUT (appended)
# Arguments: $1 - metric name
#            $2 - sample value
#            $3 - optional, pre-formatted label list without the braces
#######################################
add_metric_value() {
  local name="$1"
  local value="$2"
  local labels="${3:-}"

  local series="$name"
  if [[ -n "$labels" ]]; then
    series+="{${labels}}"
  fi

  # Explicit newline instead of a multi-line literal keeps the sample on
  # exactly one output line.
  OUTPUT+="${series} ${value}"$'\n'
}

#######################################
# Collect per-standby replication lag (bytes and seconds) and the number
# of connected standbys from pg_stat_replication.
# Globals:   OUTPUT (appended, directly and via add_metric/add_metric_value)
# Arguments: none
# Outputs:   none (metrics are appended to OUTPUT)
#######################################
collect_replication() {
  # Both lag figures come from a single query so they describe the same
  # snapshot and only one connection is opened.
  #  - Numeric columns come first and the free-text label last: psql
  #    separates columns with '|', and a '|' inside application_name then
  #    ends up in the last field instead of shifting the numbers.
  #  - client_addr is NULL for standbys connected over a Unix socket; without
  #    coalesce the whole label would be NULL and the standby dropped.
  local rows
  rows=$(pg_query "SELECT coalesce(pg_wal_lsn_diff(sent_lsn, replay_lsn), 0)::bigint, coalesce(EXTRACT(EPOCH FROM replay_lag), 0)::float, coalesce(host(client_addr), 'local') || ':' || coalesce(application_name, 'unknown') FROM pg_stat_replication")

  if [[ -z "$rows" ]]; then
    add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "0"
    return
  fi

  local lag_bytes lag_seconds standby label
  local -a labels=() bytes=() seconds=()

  while IFS='|' read -r lag_bytes lag_seconds standby; do
    [[ -n "$standby" ]] || continue
    # Escape backslash and double quote as required for label values in the
    # Prometheus text format.
    label=${standby//\\/\\\\}
    label=${label//\"/\\\"}
    labels+=("standby=\"${label}\"")
    bytes+=("${lag_bytes:-0}")
    seconds+=("${lag_seconds:-0}")
  done <<< "$rows"

  local i
  local total=${#labels[@]}

  OUTPUT+="# HELP postgresql_ha_replication_lag_bytes Replication lag in bytes per standby"$'\n'
  OUTPUT+="# TYPE postgresql_ha_replication_lag_bytes gauge"$'\n'
  for (( i = 0; i < total; i++ )); do
    add_metric_value "postgresql_ha_replication_lag_bytes" "${bytes[i]}" "${labels[i]}"
  done

  OUTPUT+="# HELP postgresql_ha_replication_lag_seconds Replication lag in seconds per standby"$'\n'
  OUTPUT+="# TYPE postgresql_ha_replication_lag_seconds gauge"$'\n'
  for (( i = 0; i < total; i++ )); do
    add_metric_value "postgresql_ha_replication_lag_seconds" "${seconds[i]}" "${labels[i]}"
  done

  add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "$total"
}

#######################################
# Export the current WAL position as a byte offset from LSN 0/0.
# Globals:   OUTPUT (appended via add_metric)
# Arguments: none
# Outputs:   none; the metric is skipped when no numeric value is returned
#######################################
collect_wal() {
  local wal_lsn

  # pg_current_wal_lsn() raises an error while the server is in recovery,
  # which made this metric disappear on every replica. On a replica the
  # last replayed LSN is reported instead.
  wal_lsn=$(pg_query "SELECT pg_wal_lsn_diff(CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END, '0/0')::bigint")

  # Only emit a sample for a plain integer; anything else (empty result,
  # NULL) would produce an unparseable exposition line.
  if [[ "$wal_lsn" =~ ^[0-9]+$ ]]; then
    add_metric "postgresql_ha_wal_bytes_total" "gauge" "Total WAL bytes generated since WAL origin" "${wal_lsn}"
  fi
}

#######################################
# Collect role, timeline and history length from the Patroni REST API.
# Globals:   OUTPUT (appended, directly and via add_metric/add_metric_value)
# Arguments: none
# Outputs:   none; nothing is emitted when GET /patroni returns no body
#######################################
collect_patroni() {
  local patroni_json
  patroni_json=$(api_get "/patroni")

  if [[ -z "$patroni_json" ]]; then
    return
  fi

  # Role detection.
  # GET /patroni reports the primary as "master" (older releases) or
  # "primary" (newer releases), not "leader", so both are mapped onto the
  # "leader" label exported here.
  # NOTE(review): the sync_standby mapping assumes Patroni sets
  # "sync_standby": true on a synchronous replica -- confirm against the
  # deployed Patroni version. When the field is absent the role stays
  # "replica".
  # Every jq call has an explicit fallback: under 'set -e' with pipefail, a
  # non-JSON body would otherwise abort the whole exporter run.
  local role
  role=$(jq -r '
      (.role // "") as $r
      | if $r == "master" or $r == "primary" then "leader"
        elif $r == "replica" and .sync_standby == true then "sync_standby"
        else $r end
    ' <<< "$patroni_json" 2>/dev/null) || role=""

  if [[ -n "$role" ]]; then
    OUTPUT+="# HELP postgresql_ha_patroni_role Current Patroni role (1=active)"$'\n'
    OUTPUT+="# TYPE postgresql_ha_patroni_role gauge"$'\n'

    local r state
    for r in leader replica sync_standby; do
      state=0
      if [[ "$role" == "$r" ]]; then
        state=1
      fi
      add_metric_value "postgresql_ha_patroni_role" "$state" "role=\"${r}\""
    done
  fi

  # Timeline; falls back to 0 when missing or not a plain integer.
  local timeline
  timeline=$(jq '.timeline // 0' <<< "$patroni_json" 2>/dev/null) || timeline=0
  if [[ ! "$timeline" =~ ^[0-9]+$ ]]; then
    timeline=0
  fi
  add_metric "postgresql_ha_patroni_timeline" "gauge" "Current Patroni timeline ID" "$timeline"

  # Failover count: number of entries in the GET /history array.
  local history_json
  local failover_count=0
  history_json=$(api_get "/history")

  if [[ -n "$history_json" ]]; then
    failover_count=$(jq 'if type == "array" then length else 0 end' \
      <<< "$history_json" 2>/dev/null) || failover_count=0
    if [[ ! "$failover_count" =~ ^[0-9]+$ ]]; then
      failover_count=0
    fi
  fi
  add_metric "postgresql_ha_patroni_failover_count" "gauge" "Total number of failovers from Patroni history" "$failover_count"
}

#######################################
# Collect connection counts (active, idle, waiting) and max_connections.
# Globals:   OUTPUT (appended via add_metric)
# Arguments: none
# Outputs:   none; every value falls back to 0 when the query returns nothing
#######################################
collect_connections() {
  # One query instead of four: a single connection, and all counters are
  # taken from the same pg_stat_activity snapshot. The FILTER conditions
  # are the same predicates the separate queries used as WHERE clauses.
  local row
  row=$(pg_query "SELECT count(*) FILTER (WHERE state = 'active'), count(*) FILTER (WHERE state = 'idle'), count(*) FILTER (WHERE wait_event_type IS NOT NULL AND state != 'idle'), current_setting('max_connections') FROM pg_stat_activity")

  local active idle waiting max_conn
  IFS='|' read -r active idle waiting max_conn <<< "$row"

  add_metric "postgresql_ha_connections_active" "gauge" "Number of active connections" "${active:-0}"
  add_metric "postgresql_ha_connections_idle" "gauge" "Number of idle connections" "${idle:-0}"
  add_metric "postgresql_ha_connections_waiting" "gauge" "Number of waiting connections" "${waiting:-0}"
  add_metric "postgresql_ha_connections_max" "gauge" "Maximum number of connections (max_connections)" "${max_conn:-0}"
}

#######################################
# Collect the on-disk size of every connectable, non-template database.
# Globals:   OUTPUT (appended, directly and via add_metric_value)
# Arguments: none
# Outputs:   none; nothing is emitted when the query returns no rows
#######################################
collect_databases() {
  # pg_database_size() raises an error for a database the monitoring role
  # may not inspect, which would fail the whole statement and lose every
  # size. Such databases are filtered out up front (CONNECT privilege or
  # membership in pg_read_all_stats is what the function requires).
  # The size column comes first so that a '|' inside a database name lands
  # in the last field instead of shifting the columns.
  local db_result
  db_result=$(pg_query "SELECT pg_database_size(oid)::bigint, datname FROM pg_database WHERE NOT datistemplate AND datallowconn AND (has_database_privilege(oid, 'CONNECT') OR pg_has_role('pg_read_all_stats', 'MEMBER'))")

  if [[ -z "$db_result" ]]; then
    return
  fi

  OUTPUT+="# HELP postgresql_ha_database_size_bytes Database size in bytes"$'\n'
  OUTPUT+="# TYPE postgresql_ha_database_size_bytes gauge"$'\n'

  local dbsize dbname label
  while IFS='|' read -r dbsize dbname; do
    [[ -n "$dbname" ]] || continue
    # Escape backslash and double quote as required for label values in the
    # Prometheus text format.
    label=${dbname//\\/\\\\}
    label=${label//\"/\\\"}
    add_metric_value "postgresql_ha_database_size_bytes" "${dbsize:-0}" "database=\"${label}\""
  done <<< "$db_result"
}

#######################################
# Emit the collected metrics: to stdout, or atomically to
# "${TEXTFILE_DIR}/postgresql-ha.prom" when textfile mode is enabled.
# Globals:   TEXTFILE_MODE, TEXTFILE_DIR, OUTPUT (read)
# Arguments: none
# Outputs:   OUTPUT on stdout when not in textfile mode
# Returns:   0 on success, 1 when the metrics file could not be written
#######################################
write_output() {
  if [[ "$TEXTFILE_MODE" != true ]]; then
    printf '%s\n' "$OUTPUT"
    return 0
  fi

  local output_file="${TEXTFILE_DIR}/postgresql-ha.prom"
  local temp_file

  mkdir -p -- "$TEXTFILE_DIR"

  # The temp file lives in the target directory so the final mv is a rename
  # on the same filesystem, and gets an unpredictable name from mktemp.
  temp_file=$(mktemp "${output_file}.XXXXXX") || return 1

  # mktemp creates the file owner-only; widen it to 0644 so a collector
  # running as a different user can read the published file.
  if printf '%s\n' "$OUTPUT" > "$temp_file" \
    && chmod 0644 "$temp_file" \
    && mv -f -- "$temp_file" "$output_file"; then
    return 0
  fi

  # Do not leave a half-written temp file behind on failure.
  rm -f -- "$temp_file"
  return 1
}

#######################################
# Install a cron.d entry that runs this script with --textfile every two
# minutes, carrying over the current configuration.
# Globals:   PG_HOST, PG_PORT, PG_USER, PG_DATABASE, PATRONI_URL,
#            TEXTFILE_DIR, CURL_TIMEOUT (read)
# Arguments: $1 - cron file to write (optional,
#                 default /etc/cron.d/postgresql-ha-exporter)
# Outputs:   confirmation on stdout; error on stderr when not run as root
# Returns:   0 on success; exits 1 when not running as root
#######################################
install_cron() {
  local cron_file="${1:-/etc/cron.d/postgresql-ha-exporter}"

  if [[ $EUID -ne 0 ]]; then
    echo "ERROR: --install requires root" >&2
    exit 1
  fi

  local script_path
  script_path=$(readlink -f -- "$0")

  # CURL_TIMEOUT is written alongside the other settings; previously it was
  # the only documented variable that did not reach the scheduled runs.
  cat > "$cron_file" <<EOF
# PostgreSQL HA Prometheus Exporter — runs every 2 minutes
PG_HOST=${PG_HOST}
PG_PORT=${PG_PORT}
PG_USER=${PG_USER}
PG_DATABASE=${PG_DATABASE}
PATRONI_URL=${PATRONI_URL}
TEXTFILE_DIR=${TEXTFILE_DIR}
CURL_TIMEOUT=${CURL_TIMEOUT:-10}
*/2 * * * * root ${script_path} --textfile 2>/dev/null
EOF

  chmod 644 "$cron_file"
  echo "Installed cron job: ${cron_file}"
  echo "Metrics will be written to: ${TEXTFILE_DIR}/postgresql-ha.prom"
}

# --- Main ---

#######################################
# Entry point: parse options, collect all metrics and write them out.
# Globals:   TEXTFILE_MODE, START_TIME (written), VERSION (read)
# Arguments: command-line options (--textfile, --install, --help/-h)
# Outputs:   metrics via write_output; diagnostics on stderr
# Returns:   0 on success; exits 1 on an unknown option
#######################################
main() {
  # Parse arguments
  local arg
  for arg in "$@"; do
    case "$arg" in
      --textfile)
        TEXTFILE_MODE=true
        ;;
      --install)
        check_dependencies
        validate_config
        install_cron
        exit 0
        ;;
      --help | -h)
        usage
        ;;
      *)
        echo "Unknown option: $arg" >&2
        # usage terminates with status 0; run it in a subshell so the help
        # is still shown (on stderr) but a bad invocation fails.
        (usage) >&2 || true
        exit 1
        ;;
    esac
  done

  check_dependencies
  validate_config

  START_TIME=$(date +%s%N)

  # Exporter info
  add_metric "postgresql_ha_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

  # Check PostgreSQL connectivity; the collectors only run when it answers.
  local pg_test
  pg_test=$(pg_query "SELECT 1")

  if [[ "$pg_test" != "1" ]]; then
    add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "0"
  else
    add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "1"

    # Collect metrics
    collect_replication
    collect_wal
    collect_patroni
    collect_connections
    collect_databases
  fi

  # Exporter performance.
  # The duration is computed with shell integer arithmetic (no bc needed).
  # Both timestamps must be plain digits: a date that does not expand %N
  # would otherwise produce an empty or garbage sample value, so 0 is
  # reported in that case.
  local end_time duration elapsed_cs
  end_time=$(date +%s%N)
  if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]] \
    && ((end_time >= START_TIME)); then
    # nanoseconds -> hundredths of a second
    elapsed_cs=$(((end_time - START_TIME) / 10000000))
    printf -v duration '%d.%02d' "$((elapsed_cs / 100))" "$((elapsed_cs % 100))"
  else
    duration=0
  fi
  add_metric "postgresql_ha_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration"
  add_metric "postgresql_ha_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

  write_output
}

# Run the exporter with the script's command-line arguments.
main "$@"