#!/bin/bash
#############################################################
####  GitLab Migration Metrics Exporter for Prometheus   ####
####  Monitors database migration progress during updates ####
####                                                     ####
####  Author: Phil Connor                                ####
####  Contact: contact@mylinux.work                      ####
####  License: MIT                                       ####
####  Version: 1.0                                       ####
####                                                     ####
####  Usage: ./gitlab-migration-exporter.sh [OPTIONS]    ####
#############################################################
#
# Metrics collected:
#   - Migration status: total, completed, pending, failed, running
#   - Current migration: name, version, elapsed time
#   - Background migrations: batched job queue status
#   - GitLab services: up/down status per service
#   - GitLab version: current installed version
#   - Reconfigure status: whether gitlab-ctl reconfigure is running
#   - Upgrade progress: overall percentage estimate
#
# Requirements:
#   - GitLab Omnibus installation
#   - socat (for HTTP server)
#   - Root or gitlab-psql access
#

set -euo pipefail

#########################
###   Configuration   ###
#########################

# Values with a ${VAR:-default} form can be overridden from the environment.
LISTEN_PORT="${GITLAB_EXPORTER_PORT:-9177}"
SCRAPE_INTERVAL="${SCRAPE_INTERVAL:-30}"
RAKE_INTERVAL="${RAKE_INTERVAL:-300}"
GITLAB_RAILS_CMD="${GITLAB_RAILS_CMD:-gitlab-rails}"
GITLAB_RAKE_CMD="${GITLAB_RAKE_CMD:-gitlab-rake}"
GITLAB_CTL_CMD="${GITLAB_CTL_CMD:-gitlab-ctl}"
GITLAB_PSQL_CMD="${GITLAB_PSQL_CMD:-gitlab-psql}"
GITLAB_DB_CONFIG="/var/opt/gitlab/gitlab-rails/etc/database.yml"
LOG_DIR="${GITLAB_LOG_DIR:-/var/log/gitlab}"
LOGFILE="/var/log/gitlab-migration-exporter.log"

# Output mode
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false

# Cache state
STATE_DIR="/tmp/gitlab-migration-metrics"
METRICS_CACHE="$STATE_DIR/metrics_cache"
RAKE_CACHE="$STATE_DIR/rake_cache"
LOCKFILE="$STATE_DIR/exporter.pid"
LAST_SCRAPE=0

#########################
###      Logging      ###
#########################

#######################################
# Write a timestamped message to stderr and append it to $LOGFILE.
# The file append is best-effort: when $LOGFILE cannot be opened (for
# example a non-root run, or a missing directory) the message still
# reaches stderr and the function still succeeds, so a logging problem
# never aborts the script under `set -e` / `pipefail`.
# Globals:   LOGFILE (read)
# Arguments: $* - message words, joined with single spaces
# Outputs:   the formatted line on stderr; the same line appended to $LOGFILE
# Returns:   0 always
#######################################
log() {
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$line" >&2
  # Grouping lets 2>/dev/null also swallow the shell's own "cannot open"
  # diagnostic when the redirect fails; `|| true` keeps set -e from firing.
  { printf '%s\n' "$line" >>"$LOGFILE"; } 2>/dev/null || true
}

#########################
###  Parse Arguments  ###
# NOTE(review): this file arrives collapsed onto a handful of very long
# physical lines, and several here-documents appear to have lost their bodies
# (for example, the --help text below runs straight into the redirect that
# writes the systemd unit). The code is kept byte-for-byte here; restore it
# from a clean copy before running it.
#
# On the next physical line, in order:
#   parse_args            - walks "$@"; --textfile sets OUTPUT_FILE under
#                           $TEXTFILE_DIR, --http and --port/-p enable
#                           HTTP_MODE, --output/-o, --interval,
#                           --rake-interval and --log-dir take their value
#                           from "$2", --handle-request calls handle_request
#                           and exits 0. The --help arm is truncated.
#   (service installer)   - presumably the tail of install_service, which
#                           setup calls; only the part that writes the unit
#                           to "$service_path", reloads systemd and enables
#                           the service is visible.
#   check_already_running - PID lockfile in $LOCKFILE: exits 0 when the
#                           recorded PID is alive (kill -0), otherwise
#                           removes the stale file and records $$.
#   cleanup_lock          - removes $LOCKFILE; trapped on EXIT, INT and TERM.
#   setup                 - creates $STATE_DIR, takes the lock, and when
#                           socat is missing tries apt-get, dnf, then yum as
#                           root (continues on the following line).
# NOTE(review): the EXIT trap removes $LOCKFILE unconditionally, so the
# "Already running" exit and the --handle-request exit both look like they
# delete a lockfile owned by another live process - confirm and guard on PID.
# NOTE(review): options that read "$2" do not check that a value was supplied;
# under `set -u` a trailing --port would abort with an unbound-variable error.
######################### parse_args() { while [[ $# -gt 0 ]]; do case "$1" in --textfile) OUTPUT_FILE="$TEXTFILE_DIR/gitlab_migration.prom" shift ;; --http) HTTP_MODE=true shift ;; --port|-p) LISTEN_PORT="$2" HTTP_MODE=true shift 2 ;; --output|-o) OUTPUT_FILE="$2" shift 2 ;; --interval) SCRAPE_INTERVAL="$2" shift 2 ;; --rake-interval) RAKE_INTERVAL="$2" shift 2 ;; --log-dir) LOG_DIR="$2" shift 2 ;; --handle-request) handle_request exit 0 ;; --help|-h) cat < "$service_path" <<'UNIT' [Unit] Description=GitLab Migration Metrics Exporter for Prometheus Wants=network-online.target After=network-online.target [Service] User=root Group=root Type=simple ExecStart=/usr/local/bin/gitlab-migration-exporter.sh Restart=always RestartSec=10 [Install] WantedBy=multi-user.target UNIT systemctl daemon-reload systemctl enable gitlab-migration-exporter log "Service installed and enabled" fi } ######################### ### Setup ### ######################### check_already_running() { if [[ -f "$LOCKFILE" ]]; then local old_pid old_pid=$(cat "$LOCKFILE" 2>/dev/null) if [[ -n "$old_pid" ]] && kill -0 "$old_pid" 2>/dev/null; then log "ERROR: Already running (PID $old_pid). Exiting." exit 0 fi # Stale lockfile, remove it rm -f "$LOCKFILE" fi echo $$ > "$LOCKFILE" } cleanup_lock() { rm -f "$LOCKFILE" } trap cleanup_lock EXIT INT TERM setup() { mkdir -p "$STATE_DIR" check_already_running if ! command -v socat &>/dev/null; then log "socat not found, attempting to install..." if [[ $EUID -eq 0 ]]; then if command -v apt-get &>/dev/null; then apt-get update -qq && apt-get install -y -qq socat elif command -v dnf &>/dev/null; then dnf install -y -q socat elif command -v yum &>/dev/null; then yum install -y -q socat else log "ERROR: Cannot auto-install socat. Install manually." exit 1 fi log "socat installed successfully" else log "ERROR: socat is required. Run as root to auto-install, or install manually." 
# On the next physical line, in order:
#   setup (rest)             - as root in HTTP mode calls install_service;
#                              warns when $GITLAB_CTL_CMD is not found; runs
#                              detect_database_config and logs $DB_TYPE.
#   collect_migration_status - replays $RAKE_CACHE while its mtime is younger
#                              than $RAKE_INTERVAL seconds; otherwise runs
#                              "$GITLAB_RAKE_CMD db:migrate:status", counts
#                              the up/down rows and derives a completion
#                              percentage with awk. The metrics output that
#                              follows `{ cat` is truncated.
#   (current migration)      - function header and the rake PID lookup are
#                              lost; the visible part reads elapsed seconds
#                              from `ps -o etimes=`, takes the last
#                              "Migrating to" entry of
#                              $LOG_DIR/gitlab-rails/production.log, and
#                              looks for a "gitlab-ctl upgrade" process.
# NOTE(review): progress is computed by interpolating shell variables into the
# awk program text; both values come from wc/grep -c here, so they are numeric.
exit 1 fi fi # Self-install on first run if running in HTTP mode as root if [[ "$HTTP_MODE" == "true" && $EUID -eq 0 ]]; then install_service fi if ! command -v "$GITLAB_CTL_CMD" &>/dev/null; then log "WARNING: $GITLAB_CTL_CMD not found - is GitLab Omnibus installed?" fi detect_database_config log "Database type: $DB_TYPE" } ######################### ### Migration Status ### ######################### collect_migration_status() { local now now=$(date +%s) # Rake is expensive (can take 2+ mins), use separate cache interval # Check file mtime so caching works across invocations (textfile mode) if [[ -f "$RAKE_CACHE" ]]; then local cache_age cache_age=$(( now - $(stat -c %Y "$RAKE_CACHE" 2>/dev/null || echo "0") )) if [[ $cache_age -lt $RAKE_INTERVAL ]]; then cat "$RAKE_CACHE" return fi fi local total=0 up_count=0 down_count=0 migrate_output="" if command -v "$GITLAB_RAKE_CMD" &>/dev/null; then migrate_output=$("$GITLAB_RAKE_CMD" db:migrate:status 2>/dev/null | grep -E '^\s*(up|down)\s' || echo "") if [[ -n "$migrate_output" ]]; then total=$(echo "$migrate_output" | wc -l) up_count=$(echo "$migrate_output" | grep -cE '^\s*up\s' || true) down_count=$(echo "$migrate_output" | grep -cE '^\s*down\s' || true) fi fi local progress=0 if [[ $total -gt 0 ]]; then progress=$(awk "BEGIN {printf \"%.2f\", ($up_count / $total) * 100}") fi { cat </dev/null | head -1 || echo "") if [[ -n "$rake_pid" ]]; then migration_running=1 migration_pid=$rake_pid # Get elapsed time in seconds migration_elapsed=$(ps -o etimes= -p "$rake_pid" 2>/dev/null | tr -d ' ' || echo "0") # Try to find the current migration from the log if [[ -f "$LOG_DIR/gitlab-rails/production.log" ]]; then current_migration_name=$(grep -oP 'Migrating to \K\S+' "$LOG_DIR/gitlab-rails/production.log" 2>/dev/null | tail -1 || echo "") fi fi # Also check for gitlab-ctl upgrade processes local upgrade_pid upgrade_pid=$(pgrep -f "gitlab-ctl upgrade" 2>/dev/null | head -1 || echo "") if [[ -n "$upgrade_pid" && 
# On the next physical line, in order:
#   (current migration, rest) - the upgrade PID is only used when no rake
#                               migration was detected; output is truncated.
#   (reconfigure status)      - header and PID lookup lost; sets
#                               reconfigure_running and its elapsed seconds
#                               from $reconf_pid; output is truncated.
#   (service status)          - header lost; an early-return branch (its
#                               condition is truncated) emits
#                               gitlab_service_up{service="unknown"} 0.
#                               Otherwise each line of
#                               "$GITLAB_CTL_CMD status" is matched against a
#                               run:/down: regex and emitted as
#                               gitlab_service_up, gitlab_service_pid and
#                               gitlab_service_uptime_seconds samples.
"$migration_running" -eq 0 ]]; then migration_running=1 migration_pid=$upgrade_pid migration_elapsed=$(ps -o etimes= -p "$upgrade_pid" 2>/dev/null | tr -d ' ' || echo "0") fi cat </dev/null | head -1 || echo "") if [[ -n "$reconf_pid" ]]; then reconfigure_running=1 reconfigure_elapsed=$(ps -o etimes= -p "$reconf_pid" 2>/dev/null | tr -d ' ' || echo "0") fi cat </dev/null; then echo "" echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)" echo "# TYPE gitlab_service_up gauge" echo "gitlab_service_up{service=\"unknown\"} 0" return fi local ctl_output ctl_output=$("$GITLAB_CTL_CMD" status 2>/dev/null || echo "") echo "" echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)" echo "# TYPE gitlab_service_up gauge" echo "# HELP gitlab_service_pid PID of the GitLab service" echo "# TYPE gitlab_service_pid gauge" echo "# HELP gitlab_service_uptime_seconds Uptime of the GitLab service in seconds" echo "# TYPE gitlab_service_uptime_seconds gauge" # gitlab-ctl status output format: # run: alertmanager: (pid 7163) 22805249s; run: log: (pid 7160) 22805249s; # down: sidekiq: 0s, normally up; run: log: (pid 7000) 22805249s; if [[ -n "$ctl_output" ]]; then while IFS= read -r line; do if [[ "$line" =~ ^run:\ ([^:]+):\ \(pid\ ([0-9]+)\)\ ([0-9]+)s ]]; then local service="${BASH_REMATCH[1]}" local pid="${BASH_REMATCH[2]}" local uptime_str="${BASH_REMATCH[3]}" service=$(echo "$service" | tr -d ' ') echo "gitlab_service_up{service=\"$service\"} 1" echo "gitlab_service_pid{service=\"$service\"} $pid" echo "gitlab_service_uptime_seconds{service=\"$service\"} $uptime_str" elif [[ "$line" =~ ^down:\ ([^:]+): ]]; then local service="${BASH_REMATCH[1]}" service=$(echo "$service" | tr -d ' ') echo "gitlab_service_up{service=\"$service\"} 0" echo "gitlab_service_pid{service=\"$service\"} 0" echo "gitlab_service_uptime_seconds{service=\"$service\"} 0" fi done <<< "$ctl_output" fi # Count services local total_services up_services 
# On the next physical line, in order:
#   (service status, rest) - counts run:/down: lines of the status output
#                            with grep -c; output is truncated.
#   (version)              - header lost; extracts the number after "GitLab:"
#                            from `gitlab-rake gitlab:env:info`, falling back
#                            to "unknown".
#   (run_db_query, tail)   - header and the first branch are lost; the name is
#                            presumably run_db_query, which later code calls.
#                            For DB_TYPE "local" it tries $GITLAB_PSQL_CMD
#                            with -d, then via `sudo -u gitlab-psql`, then
#                            with the database as a positional argument; any
#                            other DB_TYPE reaching the else arm returns 1.
#   parse_yaml_value       - strips the leading "key:" part, surrounding
#                            whitespace, and a leading/trailing single or
#                            double quote from one YAML line; prints the rest.
#   install_psql_client    - non-root: logs an error and sets DB_TYPE="none";
#                            root: tries amazon-linux-extras, apt-get, dnf,
#                            then yum (continues on the following line).
# NOTE(review): the version lookup invokes `gitlab-rake` literally instead of
# "$GITLAB_RAKE_CMD", so the GITLAB_RAKE_CMD override is not honoured there.
down_services total_services=$(echo "$ctl_output" | grep -cE '^(run|down):' || true) up_services=$(echo "$ctl_output" | grep -cE '^run:' || true) down_services=$(echo "$ctl_output" | grep -cE '^down:' || true) cat </dev/null; then version=$(gitlab-rake gitlab:env:info 2>/dev/null | grep -oP 'GitLab:\s+\K[\d.]+' || echo "unknown") fi cat </dev/null elif [[ "$DB_TYPE" == "local" ]]; then # Local Omnibus DB — gitlab-psql wraps psql with correct socket/user # It does not accept -d; the database is passed via the positional arg "$GITLAB_PSQL_CMD" -t -A -c "$query" -d gitlabhq_production 2>/dev/null \ || sudo -u gitlab-psql "$GITLAB_PSQL_CMD" -t -A -c "$query" -d gitlabhq_production 2>/dev/null \ || "$GITLAB_PSQL_CMD" -t -A -c "$query" gitlabhq_production 2>/dev/null else return 1 fi } parse_yaml_value() { # Extract a YAML value, handling quoted strings and stripping whitespace local raw="$1" raw=$(echo "$raw" | sed 's/^[[:space:]]*[^:]*:[[:space:]]*//' | sed 's/[[:space:]]*$//') # Strip surrounding quotes (single or double) raw=$(echo "$raw" | sed -e 's/^"//' -e 's/"$//' -e "s/^'//" -e "s/'$//") echo "$raw" } install_psql_client() { if [[ $EUID -ne 0 ]]; then log "ERROR: psql client not found. Run as root to auto-install, or install manually." DB_TYPE="none" return fi log "Attempting to install PostgreSQL client..." if command -v amazon-linux-extras &>/dev/null; then amazon-linux-extras install postgresql14 -y &>/dev/null && log "Installed psql via amazon-linux-extras" elif command -v apt-get &>/dev/null; then apt-get update -qq && apt-get install -y -qq postgresql-client &>/dev/null && log "Installed postgresql-client via apt" elif command -v dnf &>/dev/null; then dnf install -y -q postgresql &>/dev/null && log "Installed postgresql via dnf" elif command -v yum &>/dev/null; then yum install -y -q postgresql &>/dev/null && log "Installed postgresql via yum" else log "ERROR: Cannot auto-install psql client. Install manually." 
# On the next physical line, in order:
#   install_psql_client (rest) - sets DB_TYPE="none" when no package manager
#                                is available or psql is still missing after
#                                the install attempt.
#   upgrade_psql_client        - non-root: logs manual upgrade commands and
#                                returns. Root: tries amazon-linux-extras
#                                (postgresql14); then dnf or yum with the
#                                pgdg repo RPM followed by postgresql16, or
#                                apt-get postgresql-client-16; logs an error
#                                when none of them succeeds. Its closing
#                                brace is on the following line.
DB_TYPE="none" return fi # Verify install worked if ! command -v psql &>/dev/null; then log "ERROR: psql still not found after install attempt" DB_TYPE="none" fi } upgrade_psql_client() { if [[ $EUID -ne 0 ]]; then log "ERROR: Cannot upgrade psql — run as root. Or upgrade manually:" log " Amazon Linux 2: amazon-linux-extras install postgresql14 -y" log " RHEL/CentOS: yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-\$(rpm -E %%{rhel})-x86_64/pgdg-redhat-repo-latest.noarch.rpm && yum install -y postgresql16" log " Ubuntu/Debian: apt install -y postgresql-client-16" return fi log "Attempting to upgrade PostgreSQL client for SCRAM support..." if command -v amazon-linux-extras &>/dev/null; then amazon-linux-extras install postgresql14 -y &>/dev/null && log "Upgraded psql via amazon-linux-extras" && return fi if command -v dnf &>/dev/null; then # shellcheck disable=SC1083 dnf install -y -q "https://download.postgresql.org/pub/repos/yum/reporpms/EL-$(rpm -E '%{rhel}')-x86_64/pgdg-redhat-repo-latest.noarch.rpm" &>/dev/null dnf install -y -q postgresql16 &>/dev/null && log "Upgraded to postgresql16 via pgdg repo" && return elif command -v yum &>/dev/null; then # shellcheck disable=SC1083 yum install -y -q "https://download.postgresql.org/pub/repos/yum/reporpms/EL-$(rpm -E '%{rhel}')-x86_64/pgdg-redhat-repo-latest.noarch.rpm" &>/dev/null yum install -y -q postgresql16 &>/dev/null && log "Upgraded to postgresql16 via pgdg repo" && return elif command -v apt-get &>/dev/null; then apt-get update -qq && apt-get install -y -qq postgresql-client-16 &>/dev/null && log "Upgraded to postgresql-client-16 via apt" && return fi log "ERROR: Auto-upgrade failed. Please upgrade manually." 
# On the next physical line:
#   detect_database_config - resets the DB_HOST/DB_PORT/DB_NAME/DB_USER/
#                            DB_PASS/DB_TYPE globals, then reads host, port,
#                            database, username and password from the
#                            "production:" section of $GITLAB_DB_CONFIG with
#                            sed/grep. A host other than empty, localhost,
#                            127.0.0.1 or a path starting with "/" is treated
#                            as an external database: with psql present it
#                            sets DB_TYPE="external" and runs "SELECT 1;" as
#                            a connectivity test (continues on the following
#                            line).
# NOTE(review): the "DB config parsed" log line ends in a bare `pass=`; it
# looks like a redaction placeholder was lost in extraction - confirm.
} detect_database_config() { DB_HOST="" DB_PORT="5432" DB_NAME="gitlabhq_production" DB_USER="" DB_PASS="" DB_TYPE="none" # Parse GitLab's database.yml to detect external DB if [[ -f "$GITLAB_DB_CONFIG" ]]; then local prod_section prod_section=$(sed -n '/^production:/,/^[a-z]/p' "$GITLAB_DB_CONFIG" | head -n -1) local raw_host raw_host=$(echo "$prod_section" | grep '^\s*host:' | head -1) [[ -n "$raw_host" ]] && DB_HOST=$(parse_yaml_value "$raw_host") local raw_port raw_name raw_user raw_pass raw_port=$(echo "$prod_section" | grep '^\s*port:' | head -1) raw_name=$(echo "$prod_section" | grep '^\s*database:' | head -1) raw_user=$(echo "$prod_section" | grep '^\s*username:' | head -1) raw_pass=$(echo "$prod_section" | grep '^\s*password:' | head -1) [[ -n "$raw_port" ]] && DB_PORT=$(parse_yaml_value "$raw_port") [[ -n "$raw_name" ]] && DB_NAME=$(parse_yaml_value "$raw_name") [[ -n "$raw_user" ]] && DB_USER=$(parse_yaml_value "$raw_user") [[ -n "$raw_pass" ]] && DB_PASS=$(parse_yaml_value "$raw_pass") log "DB config parsed: host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER pass=" # If host is set and not localhost/socket path, treat as external DB if [[ -n "$DB_HOST" && "$DB_HOST" != "localhost" && "$DB_HOST" != "127.0.0.1" && ! "$DB_HOST" =~ ^/ ]]; then if command -v psql &>/dev/null; then DB_TYPE="external" log "Detected external database at $DB_HOST:$DB_PORT" # Verify connectivity local test_result if test_result=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT 1;" 2>&1); then log "External DB connection test: OK" elif echo "$test_result" | grep -qi "SCRAM.*libpq"; then log "ERROR: $test_result" log "The installed psql client is too old for SCRAM authentication." 
# On the next physical line, in order:
#   detect_database_config (rest) - a SCRAM/libpq failure triggers
#                                   upgrade_psql_client; a missing psql
#                                   triggers install_psql_client; the
#                                   external branch then returns. Otherwise
#                                   falls back to $GITLAB_PSQL_CMD and sets
#                                   DB_TYPE="local" when any of three
#                                   "SELECT 1;" invocations succeeds.
#   collect_database_health       - returns immediately when DB_TYPE is
#                                   "none"; otherwise uses run_db_query for
#                                   connectivity, database size, connection
#                                   count, ungranted lock count and
#                                   MAX(version) of schema_migrations, each
#                                   defaulting to 0. The metrics output after
#                                   the final `cat` is truncated.
upgrade_psql_client else log "WARNING: External DB connection test failed: $test_result" fi else log "WARNING: External DB detected at $DB_HOST but psql client not found" install_psql_client fi return fi else log "WARNING: Database config not found at $GITLAB_DB_CONFIG" fi # Fall back to local gitlab-psql if command -v "$GITLAB_PSQL_CMD" &>/dev/null; then if "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d gitlabhq_production &>/dev/null \ || sudo -u gitlab-psql "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d gitlabhq_production &>/dev/null \ || "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" gitlabhq_production &>/dev/null; then DB_TYPE="local" else log "WARNING: gitlab-psql found but cannot connect — check permissions (run as root?)" DB_TYPE="none" fi fi } collect_database_health() { if [[ "$DB_TYPE" == "none" ]]; then return fi # Check database connectivity local db_up=0 if run_db_query "SELECT 1;" &>/dev/null; then db_up=1 fi # Get database size local db_size=0 if [[ $db_up -eq 1 ]]; then db_size=$(run_db_query "SELECT pg_database_size(current_database());" || true) db_size=$(echo "$db_size" | tr -d ' ') [[ -z "$db_size" ]] && db_size=0 fi # Active connections local db_connections=0 if [[ $db_up -eq 1 ]]; then db_connections=$(run_db_query "SELECT count(*) FROM pg_stat_activity WHERE datname = current_database();" || true) db_connections=$(echo "$db_connections" | tr -d ' ') [[ -z "$db_connections" ]] && db_connections=0 fi # Active locks (high lock count can indicate migration issues) local db_locks=0 if [[ $db_up -eq 1 ]]; then db_locks=$(run_db_query "SELECT count(*) FROM pg_locks WHERE NOT granted;" || true) db_locks=$(echo "$db_locks" | tr -d ' ') [[ -z "$db_locks" ]] && db_locks=0 fi # Schema migration version (latest applied) local schema_version="0" if [[ $db_up -eq 1 ]]; then schema_version=$(run_db_query "SELECT MAX(version) FROM schema_migrations;" || true) schema_version=$(echo "$schema_version" | tr -d ' ') [[ -z "$schema_version" ]] && schema_version=0 fi cat 
# On the next physical line, in order (most function headers are lost):
#   (log error counters) - case-insensitive grep -c over the last 1000 lines
#                          of $LOG_DIR/gitlab-rails/production.log for
#                          migration warnings (the matching error count is
#                          partly truncated), and over the last 500 lines of
#                          $LOG_DIR/reconfigure/latest for
#                          error/fatal/failed.
#   (metrics / HTTP)     - fragments only: a hostname lookup, a write to
#                          $METRICS_CACHE that updates LAST_SCRAPE, a
#                          body_length taken from ${#metrics}, and a server
#                          loop that logs and sleeps 5 seconds before
#                          retrying after a failure.
#   main                 - parse_args, then setup; in HTTP mode calls
#                          start_server; with OUTPUT_FILE set writes
#                          collect_all_metrics to a mktemp file in the target
#                          directory, rejects results under 10 lines, then
#                          chmod 644 and mv -f into place; otherwise prints
#                          collect_all_metrics to stdout.
# NOTE(review): body_length uses ${#metrics}, which counts characters rather
# than bytes; if it feeds Content-Length, non-ASCII output would be
# mis-sized - confirm against the missing response template.
</dev/null \ | grep -ciE 'migration.*error|error.*migration|ActiveRecord::StatementInvalid' || true) migration_warnings=$(tail -1000 "$LOG_DIR/gitlab-rails/production.log" 2>/dev/null \ | grep -ciE 'migration.*warning|warning.*migration|deprecated' || true) fi # Check reconfigure log local reconfigure_errors=0 if [[ -f "$LOG_DIR/reconfigure/latest" ]]; then reconfigure_errors=$(tail -500 "$LOG_DIR/reconfigure/latest" 2>/dev/null \ | grep -ciE 'error|fatal|failed' || true) fi cat </dev/null || hostname) cat < "$METRICS_CACHE" LAST_SCRAPE=$now fi local body_length=${#metrics} cat </dev/null || { log "Server error, restarting in 5 seconds..." sleep 5 } done } ######################### ### Main ### ######################### main() { parse_args "$@" setup if [[ "$HTTP_MODE" == "true" ]]; then start_server elif [[ -n "$OUTPUT_FILE" ]]; then # Textfile collector mode: write atomically using temp file local output_dir output_dir="$(dirname "$OUTPUT_FILE")" mkdir -p "$output_dir" local temp_file temp_file=$(mktemp "${output_dir}/.gitlab_migration_metrics.XXXXXX") if ! collect_all_metrics > "$temp_file" 2>/dev/null; then rm -f "$temp_file" log "ERROR: Failed to generate metrics" exit 1 fi local file_lines file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) if [[ "$file_lines" -lt 10 ]]; then rm -f "$temp_file" log "ERROR: Metrics file too small ($file_lines lines), keeping previous" exit 1 fi chmod 644 "$temp_file" mv -f "$temp_file" "$OUTPUT_FILE" log "Metrics written to $OUTPUT_FILE ($file_lines lines)" else # Default: output to stdout collect_all_metrics fi } main "$@"