Files
linux-scripts/postgresql-ha-exporter.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

415 lines
12 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# PostgreSQL HA Prometheus Metrics Exporter
#
# Prometheus textfile collector exporter for PostgreSQL HA clusters
# managed by Patroni. Queries PostgreSQL system views via psql and
# the Patroni REST API via curl to collect replication lag, WAL
# generation rate, standby count, leader/follower state, failover
# history, timeline ID, connection statistics, and database sizes.
#
# Usage:
# ./postgresql-ha-exporter.sh
# ./postgresql-ha-exporter.sh --textfile
# ./postgresql-ha-exporter.sh --install
#
# Parameters:
# --textfile Write to textfile collector directory
# --install Create cron job for automatic collection
# --help Show usage
#
# Environment:
# PG_HOST PostgreSQL host (default: localhost)
# PG_PORT PostgreSQL port (default: 5432)
# PG_USER PostgreSQL user (default: postgres)
# PG_DATABASE PostgreSQL database (default: postgres)
# PATRONI_URL Patroni REST API URL (default: http://localhost:8008)
# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
# CURL_TIMEOUT API request timeout in seconds (default: 10)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.0
#
# Metrics Exported:
# Core:
# - postgresql_ha_up
# - postgresql_ha_exporter_info{version}
#
# Replication:
# - postgresql_ha_replication_lag_bytes{standby}
# - postgresql_ha_replication_lag_seconds{standby}
# - postgresql_ha_standby_count
#
# WAL:
# - postgresql_ha_wal_bytes_total
#
# Patroni:
# - postgresql_ha_patroni_role{role}
# - postgresql_ha_patroni_timeline
# - postgresql_ha_patroni_failover_count
#
# Connections:
# - postgresql_ha_connections_active
# - postgresql_ha_connections_idle
# - postgresql_ha_connections_waiting
# - postgresql_ha_connections_max
#
# Databases:
# - postgresql_ha_database_size_bytes{database}
#
# Exporter:
# - postgresql_ha_exporter_duration_seconds
# - postgresql_ha_exporter_last_run_timestamp
set -euo pipefail
# --- Configuration ---
# Exporter release string, published via postgresql_ha_exporter_info.
readonly VERSION="1.0"
# File name the script was invoked under; interpolated into the help text.
readonly SCRIPT_NAME="$(basename "$0")"
# Tunables: a non-empty value inherited from the environment wins,
# otherwise the documented default is assigned.
: "${PG_HOST:=localhost}"
: "${PG_PORT:=5432}"
: "${PG_USER:=postgres}"
: "${PG_DATABASE:=postgres}"
: "${PATRONI_URL:=http://localhost:8008}"
: "${TEXTFILE_DIR:=/var/lib/node_exporter/textfile_collector}"
: "${CURL_TIMEOUT:=10}"
# Mutable run state: output mode flag, accumulated exposition text, and
# the start timestamp recorded by main().
TEXTFILE_MODE=false
OUTPUT=""
START_TIME=""
# --- Functions ---
# Print the command-line help on stdout and terminate the script with
# status 0. Only $SCRIPT_NAME is interpolated; every other line is fixed.
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [OPTIONS]" \
    "PostgreSQL HA Prometheus Metrics Exporter" \
    "Options:" \
    "--textfile Write metrics to textfile collector directory" \
    "--install Create cron job for automatic collection" \
    "--help Show this help message" \
    "Environment Variables:" \
    "PG_HOST PostgreSQL host (default: localhost)" \
    "PG_PORT PostgreSQL port (default: 5432)" \
    "PG_USER PostgreSQL user (default: postgres)" \
    "PG_DATABASE PostgreSQL database (default: postgres)" \
    "PATRONI_URL Patroni REST API URL (default: http://localhost:8008)" \
    "TEXTFILE_DIR Output directory (default: /var/lib/node_exporter/textfile_collector)" \
    "CURL_TIMEOUT Request timeout in seconds (default: 10)" \
    "Examples:" \
    "$SCRIPT_NAME" \
    "$SCRIPT_NAME --textfile" \
    "PG_HOST=\"db.example.com\" PATRONI_URL=\"http://db.example.com:8008\" $SCRIPT_NAME --install"
  exit 0
}
# Confirm that psql, curl and jq can be resolved with `command -v`.
# When any are absent, list them with an install hint on stderr and
# exit 1; otherwise return quietly.
check_dependencies() {
  local tool
  local -a absent=()
  for tool in psql curl jq; do
    command -v "$tool" &>/dev/null || absent+=("$tool")
  done
  if (( ${#absent[@]} == 0 )); then
    return 0
  fi
  echo "ERROR: Missing required commands: ${absent[*]}" >&2
  echo "Install with: apt install ${absent[*]} OR dnf install ${absent[*]}" >&2
  exit 1
}
# Normalise and sanity-check the configuration globals.
# Removes one trailing "/" from PATRONI_URL, then exits 1 with an error
# on stderr if PG_HOST or PG_USER (checked in that order) is empty.
validate_config() {
  PATRONI_URL="${PATRONI_URL%/}"
  local setting
  for setting in PG_HOST PG_USER; do
    # ${!setting} reads the variable whose name is stored in $setting.
    if [[ -z "${!setting}" ]]; then
      echo "ERROR: ${setting} cannot be empty" >&2
      exit 1
    fi
  done
}
# Run one SQL statement through psql and print the result on stdout.
# Arguments: $1 - SQL text to execute
# Globals:   PG_HOST, PG_PORT, PG_USER, PG_DATABASE (read);
#            PGCONNECT_TIMEOUT, CURL_TIMEOUT (read, optional)
# Outputs:   tuples-only, unaligned rows (psql -tA, fields joined by "|");
#            a single empty line when psql fails, so callers see "".
# Returns:   always 0 - failures are reported through the empty output.
pg_query() {
  local query="$1"
  # -X: skip ~/.psqlrc so user settings (e.g. \timing) cannot add text to
  #     the output that callers parse.
  # -w: never prompt for a password; an unattended run must fail fast
  #     instead of blocking on a prompt.
  # PGCONNECT_TIMEOUT: bound the connection attempt. A caller-provided
  #     value wins, otherwise the HTTP timeout is reused (10 s fallback).
  PGCONNECT_TIMEOUT="${PGCONNECT_TIMEOUT:-${CURL_TIMEOUT:-10}}" \
    psql -X -w -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DATABASE" \
    -tAc "$query" 2>/dev/null || echo ""
}
# Fetch one Patroni REST endpoint and print the response body.
# Arguments: $1 - path appended verbatim to PATRONI_URL (e.g. "/patroni")
# Globals:   PATRONI_URL, CURL_TIMEOUT (read)
# Outputs:   the body on success; one empty line when curl fails
#            (-f makes HTTP error statuses count as failures).
api_get() {
  local path="$1"
  local url="${PATRONI_URL}${path}"
  if ! curl -sf --max-time "$CURL_TIMEOUT" "$url" 2>/dev/null; then
    echo ""
  fi
}
# Append a complete metric (HELP line, TYPE line and one sample) to OUTPUT.
# Arguments: $1 - metric name
#            $2 - metric type as written on the TYPE line
#            $3 - help text
#            $4 - sample value
#            $5 - optional label list, already formatted (key="value",...)
# Globals:   OUTPUT (appended to)
add_metric() {
  local metric="$1" kind="$2" description="$3" sample="$4" label_set="${5:-}"
  # The sample line carries "{labels}" only when a label list was given.
  local series="$metric"
  if [[ -n "$label_set" ]]; then
    series+="{${label_set}}"
  fi
  local block
  printf -v block '# HELP %s %s\n# TYPE %s %s\n%s %s\n' \
    "$metric" "$description" "$metric" "$kind" "$series" "$sample"
  OUTPUT+="$block"
}
# Append a single sample line (no HELP/TYPE header) to OUTPUT. Used for
# the second and later series of a metric whose header was already written.
# Arguments: $1 - metric name
#            $2 - sample value
#            $3 - optional label list, already formatted (key="value",...)
# Globals:   OUTPUT (appended to)
add_metric_value() {
  local metric="$1" sample="$2" label_set="${3:-}"
  local series="$metric"
  if [[ -n "$label_set" ]]; then
    series+="{${label_set}}"
  fi
  OUTPUT+="${series} ${sample}"$'\n'
}
# Export per-standby replication lag (bytes and seconds) plus the number
# of connected standbys, read from pg_stat_replication.
# Globals: OUTPUT (appended to, directly and via add_metric*)
# Notes:
#   - Both lag figures come from ONE query so they describe the same rows.
#   - client_addr is NULL for standbys connected over a Unix socket; it is
#     mapped to "local" so such standbys still get a label and are counted
#     (a NULL operand would turn the whole concatenated label into NULL).
#   - An empty query result yields only postgresql_ha_standby_count 0.
collect_replication() {
  local rows
  rows=$(pg_query "SELECT coalesce(host(client_addr), 'local') || ':' || coalesce(application_name, 'unknown'), coalesce(pg_wal_lsn_diff(sent_lsn, replay_lsn), 0)::bigint, coalesce(EXTRACT(EPOCH FROM replay_lag), 0)::float FROM pg_stat_replication")
  if [[ -z "$rows" ]]; then
    add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "0"
    return
  fi

  local -a standbys=() lag_bytes=() lag_seconds=()
  local row rest name
  local backslash='\' dquote='"'
  while IFS= read -r row; do
    # A usable row has three "|"-separated fields.
    if [[ "$row" != *"|"*"|"* ]]; then
      continue
    fi
    # Split from the right: the two numeric columns cannot contain "|",
    # whereas application_name (part of the label) can.
    rest=${row%|*}
    name=${rest%|*}
    if [[ -z "$name" ]]; then
      continue
    fi
    # Escape backslash and double quote as the exposition format requires
    # for label values; an unescaped quote would make the line unparsable.
    name=${name//"$backslash"/"$backslash$backslash"}
    name=${name//"$dquote"/"$backslash$dquote"}
    standbys+=("$name")
    lag_bytes+=("${rest##*|}")
    lag_seconds+=("${row##*|}")
  done <<< "$rows"

  local count=${#standbys[@]}
  local i

  OUTPUT+="# HELP postgresql_ha_replication_lag_bytes Replication lag in bytes per standby"$'\n'
  OUTPUT+="# TYPE postgresql_ha_replication_lag_bytes gauge"$'\n'
  for (( i = 0; i < count; i++ )); do
    add_metric_value "postgresql_ha_replication_lag_bytes" "${lag_bytes[i]:-0}" "standby=\"${standbys[i]}\""
  done

  OUTPUT+="# HELP postgresql_ha_replication_lag_seconds Replication lag in seconds per standby"$'\n'
  OUTPUT+="# TYPE postgresql_ha_replication_lag_seconds gauge"$'\n'
  for (( i = 0; i < count; i++ )); do
    add_metric_value "postgresql_ha_replication_lag_seconds" "${lag_seconds[i]:-0}" "standby=\"${standbys[i]}\""
  done

  add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "$count"
}
# Export the current WAL write position as a byte offset from LSN 0/0.
# Nothing is emitted when the query returns no value.
# Globals: OUTPUT (appended to via add_metric)
collect_wal() {
  local position
  position=$(pg_query "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::bigint")
  [[ -n "$position" ]] || return 0
  add_metric "postgresql_ha_wal_bytes_total" "gauge" "Total WAL bytes generated since WAL origin" "${position}"
}
# Export Patroni state from the REST API: one 0/1 gauge per known role,
# the timeline ID, and the number of entries returned by /history.
# Globals: OUTPUT (appended to, directly and via add_metric*)
# Notes:
#   - Returns without emitting anything when /patroni gives no body.
#   - Every jq call is guarded: under `set -e` + pipefail an unparsable
#     response would otherwise abort the whole exporter run.
collect_patroni() {
  local patroni_json
  patroni_json=$(api_get "/patroni")
  if [[ -z "$patroni_json" ]]; then
    return
  fi

  # Role detection. Per the Patroni REST API docs the leader's role is
  # reported as "master" (older releases) or "primary", so both are folded
  # into the exported "leader" label; other role strings pass through.
  # NOTE(review): "sync_standby" is kept for output compatibility, but the
  # /patroni role field may never carry that value - confirm whether the
  # response's sync_standby flag should drive it instead.
  local role
  role=$(jq -r '(.role // "") | if . == "master" or . == "primary" then "leader" else . end' <<< "$patroni_json" 2>/dev/null) || role=""
  if [[ -n "$role" ]]; then
    OUTPUT+="# HELP postgresql_ha_patroni_role Current Patroni role (1=active)"$'\n'
    OUTPUT+="# TYPE postgresql_ha_patroni_role gauge"$'\n'
    local r
    for r in leader replica sync_standby; do
      if [[ "$role" == "$r" ]]; then
        add_metric_value "postgresql_ha_patroni_role" "1" "role=\"${r}\""
      else
        add_metric_value "postgresql_ha_patroni_role" "0" "role=\"${r}\""
      fi
    done
  fi

  # Timeline: anything that is not a plain non-negative integer becomes 0
  # so a surprising payload cannot produce an invalid sample line.
  local timeline
  timeline=$(jq '.timeline // 0' <<< "$patroni_json" 2>/dev/null) || timeline=""
  if [[ ! "$timeline" =~ ^[0-9]+$ ]]; then
    timeline=0
  fi
  add_metric "postgresql_ha_patroni_timeline" "gauge" "Current Patroni timeline ID" "$timeline"

  # Failover count: length of the /history document; 0 when the endpoint
  # returns nothing or something jq cannot measure.
  local history_json
  history_json=$(api_get "/history")
  local failover_count=0
  if [[ -n "$history_json" ]]; then
    failover_count=$(jq 'length // 0' <<< "$history_json" 2>/dev/null) || failover_count=0
    if [[ ! "$failover_count" =~ ^[0-9]+$ ]]; then
      failover_count=0
    fi
  fi
  add_metric "postgresql_ha_patroni_failover_count" "gauge" "Total number of failovers from Patroni history" "$failover_count"
}
# Export connection gauges: active, idle and waiting session counts from
# pg_stat_activity, plus the configured max_connections.
# Each metric is one pg_query call; an empty result is reported as 0.
# Globals: OUTPUT (appended to via add_metric)
collect_connections() {
  # Parallel tables: entry i of each array describes the same metric.
  local -a metrics=(
    "postgresql_ha_connections_active"
    "postgresql_ha_connections_idle"
    "postgresql_ha_connections_waiting"
    "postgresql_ha_connections_max"
  )
  local -a descriptions=(
    "Number of active connections"
    "Number of idle connections"
    "Number of waiting connections"
    "Maximum number of connections (max_connections)"
  )
  local -a statements=(
    "SELECT count(*) FROM pg_stat_activity WHERE state = 'active'"
    "SELECT count(*) FROM pg_stat_activity WHERE state = 'idle'"
    "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type IS NOT NULL AND state != 'idle'"
    "SHOW max_connections"
  )
  local i reading
  for i in "${!metrics[@]}"; do
    reading=$(pg_query "${statements[i]}")
    add_metric "${metrics[i]}" "gauge" "${descriptions[i]}" "${reading:-0}"
  done
}
# Export the on-disk size of every connectable, non-template database.
# Emits nothing when the query returns no rows.
# Globals: OUTPUT (appended to, directly and via add_metric_value)
collect_databases() {
  local db_result
  db_result=$(pg_query "SELECT datname, pg_database_size(datname)::bigint FROM pg_database WHERE NOT datistemplate AND datallowconn")
  if [[ -z "$db_result" ]]; then
    return
  fi

  OUTPUT+="# HELP postgresql_ha_database_size_bytes Database size in bytes"$'\n'
  OUTPUT+="# TYPE postgresql_ha_database_size_bytes gauge"$'\n'

  local row dbname dbsize
  local backslash='\' dquote='"'
  while IFS= read -r row; do
    # Rows are "name|size". Split on the LAST "|" because the size is
    # numeric while a database name may itself contain "|".
    if [[ "$row" != *"|"* ]]; then
      continue
    fi
    dbsize=${row##*|}
    dbname=${row%|*}
    if [[ -z "$dbname" ]]; then
      continue
    fi
    # Escape backslash and double quote as the exposition format requires
    # for label values; a raw quote in a name would make the line invalid.
    dbname=${dbname//"$backslash"/"$backslash$backslash"}
    dbname=${dbname//"$dquote"/"$backslash$dquote"}
    add_metric_value "postgresql_ha_database_size_bytes" "${dbsize:-0}" "database=\"${dbname}\""
  done <<< "$db_result"
}
# Deliver the accumulated metrics held in OUTPUT.
# Default mode prints them on stdout. In textfile mode they are written to
# a PID-suffixed sibling of the target and then renamed onto
# ${TEXTFILE_DIR}/postgresql-ha.prom; the directory is created if needed.
# Globals: TEXTFILE_MODE, TEXTFILE_DIR, OUTPUT (read)
write_output() {
  if [[ "$TEXTFILE_MODE" != true ]]; then
    echo "$OUTPUT"
    return
  fi
  local target="${TEXTFILE_DIR}/postgresql-ha.prom"
  local staging="${target}.$$"
  mkdir -p "$TEXTFILE_DIR"
  echo "$OUTPUT" > "$staging"
  mv "$staging" "$target"
}
# Install a cron.d entry that runs this script with --textfile every two
# minutes as root, persisting the current configuration as cron
# environment lines.
# Arguments: $1 - optional path of the cron file to write
#                 (default: /etc/cron.d/postgresql-ha-exporter)
# Globals:   PG_HOST, PG_PORT, PG_USER, PG_DATABASE, PATRONI_URL,
#            TEXTFILE_DIR, CURL_TIMEOUT (read)
# Exits 1 with an error on stderr when not running as root.
install_cron() {
  if [[ $EUID -ne 0 ]]; then
    echo "ERROR: --install requires root" >&2
    exit 1
  fi
  local cron_file="${1:-/etc/cron.d/postgresql-ha-exporter}"
  local script_path
  # Resolve symlinks so the cron entry points at the real script file.
  script_path=$(readlink -f "$0")
  # CURL_TIMEOUT is written alongside the other settings; without it a
  # customised timeout would silently revert to the default under cron.
  cat > "$cron_file" <<EOF
# PostgreSQL HA Prometheus Exporter — runs every 2 minutes
PG_HOST=${PG_HOST}
PG_PORT=${PG_PORT}
PG_USER=${PG_USER}
PG_DATABASE=${PG_DATABASE}
PATRONI_URL=${PATRONI_URL}
TEXTFILE_DIR=${TEXTFILE_DIR}
CURL_TIMEOUT=${CURL_TIMEOUT}
*/2 * * * * root ${script_path} --textfile 2>/dev/null
EOF
  chmod 644 "$cron_file"
  echo "Installed cron job: ${cron_file}"
  echo "Metrics will be written to: ${TEXTFILE_DIR}/postgresql-ha.prom"
}
# --- Main ---
# Entry point: parse options, collect all metrics and write them out.
# Arguments: any of --textfile, --install, --help/-h
# Globals:   TEXTFILE_MODE, START_TIME, OUTPUT (written); VERSION (read)
# Exit codes: 0 on success, after --help and after --install;
#             1 for an unknown option (help text is sent to stderr).
main() {
  # Parse arguments
  local arg
  for arg in "$@"; do
    case "$arg" in
      --textfile) TEXTFILE_MODE=true ;;
      --install)
        check_dependencies
        validate_config
        install_cron
        exit 0
        ;;
      --help|-h) usage ;;
      *)
        echo "Unknown option: $arg" >&2
        # usage always exits 0, so run it in a subshell: the help goes to
        # stderr and this invocation still fails, letting callers detect
        # the mistyped option.
        ( usage ) >&2 || true
        exit 1
        ;;
    esac
  done
  check_dependencies
  validate_config
  START_TIME=$(date +%s%N)

  # Exporter info
  add_metric "postgresql_ha_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

  # Check PostgreSQL connectivity; the collectors only run when the probe
  # query returns exactly "1".
  local pg_test
  pg_test=$(pg_query "SELECT 1")
  if [[ "$pg_test" != "1" ]]; then
    add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "0"
  else
    add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "1"
    # Collect metrics
    collect_replication
    collect_wal
    collect_patroni
    collect_connections
    collect_databases
  fi

  # Exporter performance. The elapsed time is computed with shell integer
  # arithmetic (truncated to 1/100 s) so it does not depend on bc, which
  # check_dependencies does not verify. If either timestamp is not purely
  # numeric (a date without %N support) the duration is reported as 0.00.
  local end_time duration="0.00"
  end_time=$(date +%s%N)
  if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]] \
    && (( end_time >= START_TIME )); then
    local elapsed_ns=$(( end_time - START_TIME ))
    printf -v duration '%d.%02d' \
      "$(( elapsed_ns / 1000000000 ))" \
      "$(( elapsed_ns % 1000000000 / 10000000 ))"
  fi
  add_metric "postgresql_ha_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration"
  add_metric "postgresql_ha_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"
  write_output
}
# Script entry point: pass every command-line argument through to main.
main "$@"