#!/usr/bin/env bash # # PostgreSQL HA Prometheus Metrics Exporter # # Prometheus textfile collector exporter for PostgreSQL HA clusters # managed by Patroni. Queries PostgreSQL system views via psql and # the Patroni REST API via curl to collect replication lag, WAL # generation rate, standby count, leader/follower state, failover # history, timeline ID, connection statistics, and database sizes. # # Usage: # ./postgresql-ha-exporter.sh # ./postgresql-ha-exporter.sh --textfile # ./postgresql-ha-exporter.sh --install # # Parameters: # --textfile Write to textfile collector directory # --install Create cron job for automatic collection # --help Show usage # # Environment: # PG_HOST PostgreSQL host (default: localhost) # PG_PORT PostgreSQL port (default: 5432) # PG_USER PostgreSQL user (default: postgres) # PG_DATABASE PostgreSQL database (default: postgres) # PATRONI_URL Patroni REST API URL (default: http://localhost:8008) # TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) # CURL_TIMEOUT API request timeout in seconds (default: 10) # # Author: Phil Connor # Contact: contact@mylinux.work # Website: https://mylinux.work # License: MIT # Version: 1.0 # # Metrics Exported: # Core: # - postgresql_ha_up # - postgresql_ha_exporter_info{version} # # Replication: # - postgresql_ha_replication_lag_bytes{standby} # - postgresql_ha_replication_lag_seconds{standby} # - postgresql_ha_standby_count # # WAL: # - postgresql_ha_wal_bytes_total # # Patroni: # - postgresql_ha_patroni_role{role} # - postgresql_ha_patroni_timeline # - postgresql_ha_patroni_failover_count # # Connections: # - postgresql_ha_connections_active # - postgresql_ha_connections_idle # - postgresql_ha_connections_waiting # - postgresql_ha_connections_max # # Databases: # - postgresql_ha_database_size_bytes{database} # # Exporter: # - postgresql_ha_exporter_duration_seconds # - postgresql_ha_exporter_last_run_timestamp set -euo pipefail # --- Configuration --- readonly VERSION="1.0" readonly SCRIPT_NAME="$(basename "$0")" PG_HOST="${PG_HOST:-localhost}" PG_PORT="${PG_PORT:-5432}" PG_USER="${PG_USER:-postgres}" PG_DATABASE="${PG_DATABASE:-postgres}" PATRONI_URL="${PATRONI_URL:-http://localhost:8008}" TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" CURL_TIMEOUT="${CURL_TIMEOUT:-10}" TEXTFILE_MODE=false OUTPUT="" START_TIME="" # --- Functions --- usage() { cat </dev/null; then missing+=("$cmd") fi done if [[ ${#missing[@]} -gt 0 ]]; then echo "ERROR: Missing required commands: ${missing[*]}" >&2 echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 exit 1 fi } validate_config() { # Strip trailing slash PATRONI_URL="${PATRONI_URL%/}" if [[ -z "$PG_HOST" ]]; then echo "ERROR: PG_HOST cannot be empty" >&2 exit 1 fi if [[ -z "$PG_USER" ]]; then echo "ERROR: PG_USER cannot be empty" >&2 exit 1 fi } pg_query() { local query="$1" psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DATABASE" -tAc "$query" 2>/dev/null || echo "" } api_get() { local endpoint="$1" local curl_args=(-sf --max-time "$CURL_TIMEOUT") curl "${curl_args[@]}" "${PATRONI_URL}${endpoint}" 2>/dev/null || echo "" } add_metric() { local name="$1" local type="$2" local help="$3" local value="$4" local labels="${5:-}" if [[ -n "$labels" ]]; then OUTPUT+="# HELP ${name} ${help} # TYPE ${name} ${type} ${name}{${labels}} ${value} " else OUTPUT+="# HELP ${name} ${help} # TYPE ${name} ${type} ${name} ${value} " fi } add_metric_value() { local name="$1" local value="$2" local labels="${3:-}" if [[ -n "$labels" ]]; then OUTPUT+="${name}{${labels}} ${value} " else OUTPUT+="${name} ${value} " fi } collect_replication() { # Replication lag in bytes per standby local lag_bytes_result lag_bytes_result=$(pg_query "SELECT client_addr || ':' || coalesce(application_name, 'unknown'), coalesce(pg_wal_lsn_diff(sent_lsn, replay_lsn), 0)::bigint FROM pg_stat_replication") if [[ -z "$lag_bytes_result" ]]; then add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "0" return fi local standby_count=0 OUTPUT+="# HELP postgresql_ha_replication_lag_bytes Replication lag in bytes per standby # TYPE postgresql_ha_replication_lag_bytes gauge " while IFS='|' read -r standby lag; do if [[ -n "$standby" ]]; then add_metric_value "postgresql_ha_replication_lag_bytes" "${lag:-0}" "standby=\"${standby}\"" standby_count=$((standby_count + 1)) fi done <<< "$lag_bytes_result" # Replication lag in seconds per standby local lag_seconds_result lag_seconds_result=$(pg_query "SELECT client_addr || ':' || coalesce(application_name, 'unknown'), coalesce(EXTRACT(EPOCH FROM replay_lag), 0)::float FROM pg_stat_replication") if [[ -n "$lag_seconds_result" ]]; then OUTPUT+="# HELP postgresql_ha_replication_lag_seconds Replication lag in seconds per standby # TYPE postgresql_ha_replication_lag_seconds gauge " while IFS='|' read -r standby lag; do if [[ -n "$standby" ]]; then add_metric_value "postgresql_ha_replication_lag_seconds" "${lag:-0}" "standby=\"${standby}\"" fi done <<< "$lag_seconds_result" fi add_metric "postgresql_ha_standby_count" "gauge" "Number of connected standby servers" "$standby_count" } collect_wal() { local wal_lsn wal_lsn=$(pg_query "SELECT pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::bigint") if [[ -n "$wal_lsn" ]]; then add_metric "postgresql_ha_wal_bytes_total" "gauge" "Total WAL bytes generated since WAL origin" "${wal_lsn}" fi } collect_patroni() { local patroni_json patroni_json=$(api_get "/patroni") if [[ -z "$patroni_json" ]]; then return fi # Role detection local role role=$(echo "$patroni_json" | jq -r '.role // ""' 2>/dev/null) if [[ -n "$role" ]]; then OUTPUT+="# HELP postgresql_ha_patroni_role Current Patroni role (1=active) # TYPE postgresql_ha_patroni_role gauge " for r in leader replica sync_standby; do if [[ "$role" == "$r" ]]; then add_metric_value "postgresql_ha_patroni_role" "1" "role=\"${r}\"" else add_metric_value "postgresql_ha_patroni_role" "0" "role=\"${r}\"" fi done fi # Timeline local timeline timeline=$(echo "$patroni_json" | jq '.timeline // 0' 2>/dev/null) add_metric "postgresql_ha_patroni_timeline" "gauge" "Current Patroni timeline ID" "${timeline:-0}" # Failover count from history local history_json history_json=$(api_get "/history") if [[ -n "$history_json" ]]; then local failover_count failover_count=$(echo "$history_json" | jq 'length // 0' 2>/dev/null) add_metric "postgresql_ha_patroni_failover_count" "gauge" "Total number of failovers from Patroni history" "${failover_count:-0}" else add_metric "postgresql_ha_patroni_failover_count" "gauge" "Total number of failovers from Patroni history" "0" fi } collect_connections() { # Active connections local active active=$(pg_query "SELECT count(*) FROM pg_stat_activity WHERE state = 'active'") add_metric "postgresql_ha_connections_active" "gauge" "Number of active connections" "${active:-0}" # Idle connections local idle idle=$(pg_query "SELECT count(*) FROM pg_stat_activity WHERE state = 'idle'") add_metric "postgresql_ha_connections_idle" "gauge" "Number of idle connections" "${idle:-0}" # Waiting connections local waiting waiting=$(pg_query "SELECT count(*) FROM pg_stat_activity WHERE wait_event_type IS NOT NULL AND state != 'idle'") add_metric "postgresql_ha_connections_waiting" "gauge" "Number of waiting connections" "${waiting:-0}" # Max connections local max_conn max_conn=$(pg_query "SHOW max_connections") add_metric "postgresql_ha_connections_max" "gauge" "Maximum number of connections (max_connections)" "${max_conn:-0}" } collect_databases() { local db_result db_result=$(pg_query "SELECT datname, pg_database_size(datname)::bigint FROM pg_database WHERE NOT datistemplate AND datallowconn") if [[ -z "$db_result" ]]; then return fi OUTPUT+="# HELP postgresql_ha_database_size_bytes Database size in bytes # TYPE postgresql_ha_database_size_bytes gauge " while IFS='|' read -r dbname dbsize; do if [[ -n "$dbname" ]]; then add_metric_value "postgresql_ha_database_size_bytes" "${dbsize:-0}" "database=\"${dbname}\"" fi done <<< "$db_result" } write_output() { if [[ "$TEXTFILE_MODE" == true ]]; then local output_file="${TEXTFILE_DIR}/postgresql-ha.prom" local temp_file="${output_file}.$$" mkdir -p "$TEXTFILE_DIR" echo "$OUTPUT" > "$temp_file" mv "$temp_file" "$output_file" else echo "$OUTPUT" fi } install_cron() { if [[ $EUID -ne 0 ]]; then echo "ERROR: --install requires root" >&2 exit 1 fi local script_path script_path=$(readlink -f "$0") cat > /etc/cron.d/postgresql-ha-exporter </dev/null EOF chmod 644 /etc/cron.d/postgresql-ha-exporter echo "Installed cron job: /etc/cron.d/postgresql-ha-exporter" echo "Metrics will be written to: ${TEXTFILE_DIR}/postgresql-ha.prom" } # --- Main --- main() { # Parse arguments for arg in "$@"; do case "$arg" in --textfile) TEXTFILE_MODE=true ;; --install) check_dependencies validate_config install_cron exit 0 ;; --help|-h) usage ;; *) echo "Unknown option: $arg" >&2; usage ;; esac done check_dependencies validate_config START_TIME=$(date +%s%N) # Exporter info add_metric "postgresql_ha_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" # Check PostgreSQL connectivity local pg_test pg_test=$(pg_query "SELECT 1") if [[ "$pg_test" != "1" ]]; then add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "0" else add_metric "postgresql_ha_up" "gauge" "PostgreSQL HA cluster reachability (1=up, 0=down)" "1" # Collect metrics collect_replication collect_wal collect_patroni collect_connections collect_databases fi # Exporter performance local end_time duration end_time=$(date +%s%N) duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") add_metric "postgresql_ha_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration" add_metric "postgresql_ha_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" write_output } main "$@"