Files
linux-scripts/gitlab-migration-exporter.sh

1028 lines
34 KiB
Bash

#!/bin/bash
#############################################################
#### GitLab Migration Metrics Exporter for Prometheus ####
#### Monitors database migration progress during updates ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version: 1.0 ####
#### ####
#### Usage: ./gitlab-migration-exporter.sh [OPTIONS] ####
#############################################################
#
# Metrics collected:
# - Migration status: total, completed, pending, failed, running
# - Current migration: name, version, elapsed time
# - Background migrations: batched job queue status
# - GitLab services: up/down status per service
# - GitLab version: current installed version
# - Reconfigure status: whether gitlab-ctl reconfigure is running
# - Upgrade progress: overall percentage estimate
#
# Requirements:
# - GitLab Omnibus installation
# - socat (for HTTP server)
# - Root or gitlab-psql access
#
# Strict mode: stop on command failure, unset variables and failed pipeline stages.
set -euo pipefail
#########################
### Configuration ###
#########################
# Tunables that can be overridden from the environment (see --help).
LISTEN_PORT=${GITLAB_EXPORTER_PORT:-9177}
SCRAPE_INTERVAL=${SCRAPE_INTERVAL:-30}
RAKE_INTERVAL=${RAKE_INTERVAL:-300}
LOG_DIR=${GITLAB_LOG_DIR:-/var/log/gitlab}
# GitLab Omnibus command names (or full paths).
GITLAB_RAILS_CMD=${GITLAB_RAILS_CMD:-gitlab-rails}
GITLAB_RAKE_CMD=${GITLAB_RAKE_CMD:-gitlab-rake}
GITLAB_CTL_CMD=${GITLAB_CTL_CMD:-gitlab-ctl}
GITLAB_PSQL_CMD=${GITLAB_PSQL_CMD:-gitlab-psql}
# Fixed file locations.
GITLAB_DB_CONFIG=/var/opt/gitlab/gitlab-rails/etc/database.yml
LOGFILE=/var/log/gitlab-migration-exporter.log
# Output mode
TEXTFILE_DIR=/var/lib/node_exporter
OUTPUT_FILE=
HTTP_MODE=false
# Cache state
STATE_DIR=/tmp/gitlab-migration-metrics
METRICS_CACHE=${STATE_DIR}/metrics_cache
RAKE_CACHE=${STATE_DIR}/rake_cache
LOCKFILE=${STATE_DIR}/exporter.pid
LAST_SCRAPE=0
#########################
### Logging ###
#########################
#######################################
# Write a timestamped message to stderr and append it to $LOGFILE.
# Globals:   LOGFILE (read)
# Arguments: message words
# Outputs:   the formatted line on stderr
# Returns:   always 0 - logging must never abort the script
#######################################
log() {
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$entry" >&2
  # The log file is best-effort: when it is not writable (e.g. running as a
  # non-root user) a failing append must not trip `set -e` / `pipefail`.
  { printf '%s\n' "$entry" >> "$LOGFILE"; } 2>/dev/null || true
}
#########################
### Parse Arguments ###
#########################
#######################################
# Abort unless the option being parsed is followed by a value.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
#######################################
_require_option_value() {
  if (( $2 < 2 )); then
    log "Option $1 requires a value"
    exit 1
  fi
}

#######################################
# Abort unless the supplied value is a non-negative integer.
# Arguments: $1 - option name, $2 - value
#######################################
_require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    log "Option $1 expects a non-negative integer, got: $2"
    exit 1
  fi
}

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   OUTPUT_FILE, HTTP_MODE, LISTEN_PORT, SCRAPE_INTERVAL,
#            RAKE_INTERVAL, LOG_DIR (written); TEXTFILE_DIR (read)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help
# Returns:   exits 0 after --help / --handle-request, exits 1 on a bad option
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --textfile)
        OUTPUT_FILE="$TEXTFILE_DIR/gitlab_migration.prom"
        shift
        ;;
      --http)
        HTTP_MODE=true
        shift
        ;;
      --port|-p)
        _require_option_value "$1" "$#"
        _require_uint "$1" "$2"
        LISTEN_PORT="$2"
        # Choosing a port implies serving over HTTP.
        HTTP_MODE=true
        shift 2
        ;;
      --output|-o)
        _require_option_value "$1" "$#"
        OUTPUT_FILE="$2"
        shift 2
        ;;
      --interval)
        _require_option_value "$1" "$#"
        _require_uint "$1" "$2"
        SCRAPE_INTERVAL="$2"
        shift 2
        ;;
      --rake-interval)
        _require_option_value "$1" "$#"
        _require_uint "$1" "$2"
        RAKE_INTERVAL="$2"
        shift 2
        ;;
      --log-dir)
        _require_option_value "$1" "$#"
        LOG_DIR="$2"
        shift 2
        ;;
      --handle-request)
        # Internal mode: serve one HTTP request on stdin/stdout, then quit.
        handle_request
        exit 0
        ;;
      --help|-h)
        cat <<EOF
GitLab Migration Metrics Exporter for Prometheus
Usage: $0 [OPTIONS]
MODES:
--textfile Write to node_exporter textfile collector
--http Run HTTP server on port $LISTEN_PORT
(no flag) Output to stdout
OPTIONS:
-p, --port PORT HTTP port (default: 9177)
-o, --output PATH Output file path
--interval SECS Minimum seconds between metric collections (default: 30)
--rake-interval SECS Minimum seconds between gitlab-rake calls (default: 300)
--log-dir PATH GitLab log directory (default: /var/log/gitlab)
--help Show this help
EXAMPLES:
$0 --textfile # Write to textfile collector
$0 --http --port 9177 # Run HTTP server
$0 -o /tmp/gitlab_migration.prom # Write to custom file
$0 # Output to stdout
Environment Variables:
GITLAB_EXPORTER_PORT Same as --port
SCRAPE_INTERVAL Same as --interval
RAKE_INTERVAL Same as --rake-interval
GITLAB_RAILS_CMD Path to gitlab-rails (default: gitlab-rails)
GITLAB_RAKE_CMD Path to gitlab-rake (default: gitlab-rake)
GITLAB_CTL_CMD Path to gitlab-ctl (default: gitlab-ctl)
GITLAB_PSQL_CMD Path to gitlab-psql (default: gitlab-psql)
GITLAB_LOG_DIR Same as --log-dir
EOF
        exit 0
        ;;
      *)
        log "Unknown option: $1"
        exit 1
        ;;
    esac
  done
}
#########################
### Self-Install ###
#########################
#######################################
# Install this script under /usr/local/bin and register a systemd unit for it.
# Globals:   LISTEN_PORT (read)
# Outputs:   progress messages via log
# Returns:   0 on success; aborts (set -e) if copying or systemctl fails
#######################################
install_service() {
  local script_path="/usr/local/bin/gitlab-migration-exporter.sh"
  local service_path="/etc/systemd/system/gitlab-migration-exporter.service"
  local self
  self=$(readlink -f "$0")
  # Copy script into place if not already there
  if [[ "$self" != "$script_path" ]]; then
    log "Installing script to $script_path"
    cp "$self" "$script_path"
    chmod 755 "$script_path"
  fi
  # Create systemd service unit
  if [[ ! -f "$service_path" ]]; then
    log "Creating systemd service at $service_path"
    # ExecStart must request HTTP mode explicitly: without a mode flag the
    # script prints the metrics once and exits, so the unit would never serve
    # anything and would just be restarted every RestartSec seconds.
    cat > "$service_path" <<UNIT
[Unit]
Description=GitLab Migration Metrics Exporter for Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Type=simple
ExecStart=${script_path} --http --port ${LISTEN_PORT}
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
UNIT
    systemctl daemon-reload
    systemctl enable gitlab-migration-exporter
    log "Service installed and enabled"
  fi
}
#########################
### Setup ###
#########################
#######################################
# Single-instance guard based on a PID file.
# Globals:   LOCKFILE (read; file is rewritten with this shell's PID)
# Returns:   exits 0 when the PID recorded in the lock file is still alive;
#            otherwise drops the stale file and claims the lock
#######################################
check_already_running() {
  local holder
  if [[ -f "$LOCKFILE" ]]; then
    holder=$(cat "$LOCKFILE" 2>/dev/null)
    if [[ -z "$holder" ]] || ! kill -0 "$holder" 2>/dev/null; then
      # Nobody alive owns the lock: discard the leftover file.
      rm -f "$LOCKFILE"
    else
      log "ERROR: Already running (PID $holder). Exiting."
      exit 0
    fi
  fi
  echo $$ > "$LOCKFILE"
}
#######################################
# Remove the PID lock file, but only when this process owns it.
# The function is installed as an EXIT handler, so it also runs for
# invocations that never took the lock (--help, --handle-request children,
# the "already running" exit); those must not delete a live instance's lock.
# Globals:   LOCKFILE (read)
# Returns:   always 0
#######################################
cleanup_lock() {
  local owner=""
  [[ -f "$LOCKFILE" ]] || return 0
  owner=$(cat "$LOCKFILE" 2>/dev/null) || owner=""
  if [[ "$owner" == "$$" ]]; then
    rm -f "$LOCKFILE"
  fi
  return 0
}
# Release the lock on every exit path. INT/TERM need handlers that actually
# terminate: a handler that only cleans up lets execution continue, so the
# HTTP loop would simply restart socat. `exit` then fires the EXIT trap.
trap cleanup_lock EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
#######################################
# Prepare the runtime environment: state directory, single-instance lock,
# socat (HTTP mode only), optional systemd self-install, database detection.
# Globals:   STATE_DIR, HTTP_MODE, GITLAB_CTL_CMD (read); DB_* (written by
#            detect_database_config)
# Returns:   exits 1 when HTTP mode is requested but socat cannot be provided
#######################################
setup() {
  mkdir -p "$STATE_DIR"
  check_already_running
  # socat only backs the HTTP listener; textfile and stdout runs must neither
  # require it nor install packages as a side effect.
  if [[ "$HTTP_MODE" == "true" ]] && ! command -v socat &>/dev/null; then
    log "socat not found, attempting to install..."
    if [[ $EUID -ne 0 ]]; then
      log "ERROR: socat is required. Run as root to auto-install, or install manually."
      exit 1
    fi
    if command -v apt-get &>/dev/null; then
      apt-get update -qq && apt-get install -y -qq socat
    elif command -v dnf &>/dev/null; then
      dnf install -y -q socat
    elif command -v yum &>/dev/null; then
      yum install -y -q socat
    else
      log "ERROR: Cannot auto-install socat. Install manually."
      exit 1
    fi
    # Confirm the package manager really delivered the binary.
    if ! command -v socat &>/dev/null; then
      log "ERROR: socat is still missing after the install attempt. Install manually."
      exit 1
    fi
    log "socat installed successfully"
  fi
  # Self-install on first run if running in HTTP mode as root
  if [[ "$HTTP_MODE" == "true" && $EUID -eq 0 ]]; then
    install_service
  fi
  if ! command -v "$GITLAB_CTL_CMD" &>/dev/null; then
    log "WARNING: $GITLAB_CTL_CMD not found - is GitLab Omnibus installed?"
  fi
  detect_database_config
  log "Database type: $DB_TYPE"
}
#########################
### Migration Status ###
#########################
#######################################
# Emit migration totals (and one info sample per pending migration) derived
# from `gitlab-rake db:migrate:status`.
# The rake call is expensive, so its rendered output is cached in $RAKE_CACHE
# and reused while the file is younger than $RAKE_INTERVAL seconds. The file
# mtime is used so the cache also works across separate invocations.
# Globals:   RAKE_CACHE, RAKE_INTERVAL, GITLAB_RAKE_CMD (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_migration_status() {
  local now cache_mtime
  now=$(date +%s)
  if [[ -f "$RAKE_CACHE" ]]; then
    cache_mtime=$(stat -c %Y "$RAKE_CACHE" 2>/dev/null || echo "0")
    if (( now - cache_mtime < RAKE_INTERVAL )); then
      cat "$RAKE_CACHE"
      return
    fi
  fi

  local total=0 up_count=0 down_count=0 migrate_output=""
  if command -v "$GITLAB_RAKE_CMD" &>/dev/null; then
    # Keep only the "up"/"down" status rows; headers and rulers are dropped.
    migrate_output=$("$GITLAB_RAKE_CMD" db:migrate:status 2>/dev/null | grep -E '^\s*(up|down)\s' || echo "")
    if [[ -n "$migrate_output" ]]; then
      total=$(grep -c '' <<< "$migrate_output" || true)
      up_count=$(grep -cE '^\s*up\s' <<< "$migrate_output" || true)
      down_count=$(grep -cE '^\s*down\s' <<< "$migrate_output" || true)
    fi
  fi

  local progress=0
  if (( total > 0 )); then
    # Values are passed with -v rather than spliced into the awk program.
    progress=$(awk -v up="$up_count" -v all="$total" 'BEGIN {printf "%.2f", (up / all) * 100}')
  fi

  # Render into a temp file and rename it into place, so a concurrent reader
  # (forked HTTP request handler) never sees a half-written cache.
  local tmp_cache
  mkdir -p "$(dirname "$RAKE_CACHE")"
  tmp_cache=$(mktemp "${RAKE_CACHE}.XXXXXX")
  {
    cat <<EOF
# HELP gitlab_migrations_total Total number of database migrations
# TYPE gitlab_migrations_total gauge
gitlab_migrations_total $total
# HELP gitlab_migrations_completed Number of completed (up) migrations
# TYPE gitlab_migrations_completed gauge
gitlab_migrations_completed $up_count
# HELP gitlab_migrations_pending Number of pending (down) migrations
# TYPE gitlab_migrations_pending gauge
gitlab_migrations_pending $down_count
# HELP gitlab_migrations_progress_percent Percentage of migrations completed
# TYPE gitlab_migrations_progress_percent gauge
gitlab_migrations_progress_percent $progress
EOF
    # Emit individual pending migration details
    if [[ $down_count -gt 0 && -n "$migrate_output" ]]; then
      echo ""
      echo "# HELP gitlab_migration_pending_info Info about each pending migration"
      echo "# TYPE gitlab_migration_pending_info gauge"
      local version name
      grep -E '^\s*down\s' <<< "$migrate_output" | while read -r _ version name; do
        name="${name:-unknown}"
        # Reduce the name to characters that are safe inside a label value.
        name=$(printf '%s' "$name" | tr -cd '[:alnum:]_')
        echo "gitlab_migration_pending_info{version=\"$version\",name=\"$name\"} 1"
      done
    fi
  } > "$tmp_cache"
  chmod 644 "$tmp_cache"
  mv -f "$tmp_cache" "$RAKE_CACHE"
  cat "$RAKE_CACHE"
}
#########################
### Running Migration ###
#########################
#######################################
# Report whether a migration (rake db:migrate, or failing that a
# `gitlab-ctl upgrade`) is running, with its PID and elapsed seconds.
# When a rake migration is found, the most recent "Migrating to" entry of
# production.log is exposed as the current migration name.
# Globals:   LOG_DIR (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_running_migration() {
  local is_running=0 active_pid=0 elapsed_secs=0 migration_label=""
  local rails_log="$LOG_DIR/gitlab-rails/production.log"
  local pid_rake pid_upgrade

  # First choice: an actual rake db:migrate process.
  pid_rake=$(pgrep -f "rake.*db:migrate" 2>/dev/null | head -1 || echo "")
  if [[ -n "$pid_rake" ]]; then
    is_running=1
    active_pid=$pid_rake
    elapsed_secs=$(ps -o etimes= -p "$pid_rake" 2>/dev/null | tr -d ' ' || echo "0")
    if [[ -f "$rails_log" ]]; then
      migration_label=$(grep -oP 'Migrating to \K\S+' "$rails_log" 2>/dev/null | tail -1 || echo "")
    fi
  fi

  # Second choice: the upgrade wrapper, only when no rake process was seen.
  pid_upgrade=$(pgrep -f "gitlab-ctl upgrade" 2>/dev/null | head -1 || echo "")
  if [[ "$is_running" -eq 0 && -n "$pid_upgrade" ]]; then
    is_running=1
    active_pid=$pid_upgrade
    elapsed_secs=$(ps -o etimes= -p "$pid_upgrade" 2>/dev/null | tr -d ' ' || echo "0")
  fi

  echo "# HELP gitlab_migration_running Whether a database migration is currently running"
  echo "# TYPE gitlab_migration_running gauge"
  echo "gitlab_migration_running $is_running"
  echo "# HELP gitlab_migration_pid PID of the running migration process"
  echo "# TYPE gitlab_migration_pid gauge"
  echo "gitlab_migration_pid $active_pid"
  echo "# HELP gitlab_migration_elapsed_seconds Elapsed time of the current migration in seconds"
  echo "# TYPE gitlab_migration_elapsed_seconds gauge"
  echo "gitlab_migration_elapsed_seconds $elapsed_secs"

  [[ -n "$migration_label" ]] || return 0
  echo ""
  echo "# HELP gitlab_migration_current_info Currently running migration name"
  echo "# TYPE gitlab_migration_current_info gauge"
  echo "gitlab_migration_current_info{name=\"$migration_label\"} 1"
}
#########################
### Reconfigure Status ###
#########################
#######################################
# Report whether `gitlab-ctl reconfigure` (or the configuration-management
# client it launches) is running, and for how long.
# Outputs:   Prometheus text format on stdout
#######################################
collect_reconfigure_status() {
  local reconfigure_running=0
  local reconfigure_elapsed=0
  local reconf_pid
  # pgrep patterns are extended regular expressions, where alternation is a
  # bare `|`; the escaped form `\|` only matches a literal pipe character and
  # therefore never matched any process.
  # NOTE(review): cinc-client is included because newer Omnibus packages are
  # believed to ship it in place of chef-client - confirm for your version.
  reconf_pid=$(pgrep -f '(chef|cinc)-client.*reconfigure|gitlab-ctl reconfigure' 2>/dev/null | head -1 || echo "")
  if [[ -n "$reconf_pid" ]]; then
    reconfigure_running=1
    reconfigure_elapsed=$(ps -o etimes= -p "$reconf_pid" 2>/dev/null | tr -d ' ' || echo "0")
    # The process may vanish between pgrep and ps; never emit a non-number.
    [[ "$reconfigure_elapsed" =~ ^[0-9]+$ ]] || reconfigure_elapsed=0
  fi
  cat <<EOF
# HELP gitlab_reconfigure_running Whether gitlab-ctl reconfigure is currently running
# TYPE gitlab_reconfigure_running gauge
gitlab_reconfigure_running $reconfigure_running
# HELP gitlab_reconfigure_elapsed_seconds Elapsed time of reconfigure in seconds
# TYPE gitlab_reconfigure_elapsed_seconds gauge
gitlab_reconfigure_elapsed_seconds $reconfigure_elapsed
EOF
}
#########################
### Background Migrations ###
#########################
#######################################
# Summarise the batched_background_migrations table: per-status counts, a
# completion percentage, and one detail sample per migration whose status
# code is 1, 2 or 4 (labelled running / pending / failed).
# When DB_TYPE is "none", or a query returns nothing, the counters stay 0.
# Globals:   DB_TYPE (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_background_migrations() {
  # Print the HELP / TYPE / sample triplet of a single gauge.
  _bbm_gauge() {
    echo "# HELP $1 $2"
    echo "# TYPE $1 gauge"
    echo "$1 $3"
  }

  local n_total=0 n_finished=0 n_running=0 n_pending=0 n_failed=0 n_paused=0
  local rows state tally
  if [[ "$DB_TYPE" != "none" ]]; then
    # Query batched_background_migrations table
    rows=$(run_db_query "
SELECT status, COUNT(*)
FROM batched_background_migrations
GROUP BY status;
" || echo "")
    if [[ -n "$rows" ]]; then
      while IFS='|' read -r state tally; do
        tally=${tally// /}
        if [[ "$state" == "0" ]]; then
          n_paused=$tally
        elif [[ "$state" == "1" ]]; then
          n_running=$tally   # active
        elif [[ "$state" == "2" ]]; then
          n_pending=$tally   # queued
        elif [[ "$state" == "3" ]]; then
          n_finished=$tally
        elif [[ "$state" == "4" ]]; then
          n_failed=$tally
        fi
        # Rows with any other status code still count towards the total.
        n_total=$((n_total + tally))
      done <<< "$rows"
    fi
  fi

  local pct_done=0
  if [[ $n_total -gt 0 ]]; then
    pct_done=$(awk -v done_count="$n_finished" -v all="$n_total" 'BEGIN {printf "%.2f", (done_count / all) * 100}')
  fi

  _bbm_gauge gitlab_batched_migrations_total "Total batched background migrations" "$n_total"
  _bbm_gauge gitlab_batched_migrations_finished "Finished batched background migrations" "$n_finished"
  _bbm_gauge gitlab_batched_migrations_running "Currently running batched background migrations" "$n_running"
  _bbm_gauge gitlab_batched_migrations_pending "Pending batched background migrations" "$n_pending"
  _bbm_gauge gitlab_batched_migrations_failed "Failed batched background migrations" "$n_failed"
  _bbm_gauge gitlab_batched_migrations_paused "Paused batched background migrations" "$n_paused"
  _bbm_gauge gitlab_batched_migrations_progress_percent "Percentage of batched migrations finished" "$pct_done"

  # Per-migration detail for running/pending ones
  [[ "$DB_TYPE" != "none" ]] || return 0
  local detail_rows
  detail_rows=$(run_db_query "
SELECT id, job_class_name, table_name, status,
COALESCE(min_value, 0), COALESCE(max_value, 0), COALESCE(batch_size, 0),
EXTRACT(EPOCH FROM (NOW() - created_at))::int as age_seconds
FROM batched_background_migrations
WHERE status IN (1, 2, 4)
ORDER BY id;
" || echo "")
  [[ -n "$detail_rows" ]] || return 0

  echo ""
  echo "# HELP gitlab_batched_migration_detail Details of active/pending/failed batched migrations"
  echo "# TYPE gitlab_batched_migration_detail gauge"
  echo "# HELP gitlab_batched_migration_age_seconds Age of batched migration in seconds"
  echo "# TYPE gitlab_batched_migration_age_seconds gauge"
  local mig_id class_name tbl state_code batch age_secs state_name
  # Columns 5 and 6 (min/max value) are selected but not exported.
  while IFS='|' read -r mig_id class_name tbl state_code _ _ batch age_secs; do
    [[ -n "$mig_id" ]] || continue
    case "$state_code" in
      1) state_name="running" ;;
      2) state_name="pending" ;;
      4) state_name="failed" ;;
      *) state_name="unknown" ;;
    esac
    echo "gitlab_batched_migration_detail{id=\"$mig_id\",job_class=\"$class_name\",table_name=\"$tbl\",status=\"$state_name\"} $batch"
    echo "gitlab_batched_migration_age_seconds{id=\"$mig_id\",job_class=\"$class_name\"} $age_secs"
  done <<< "$detail_rows"
}
#########################
### Service Status ###
#########################
#######################################
# Export per-service state parsed from `gitlab-ctl status`, plus totals.
# Samples are buffered per metric family and printed family by family,
# because the Prometheus text format expects all lines of one metric to form
# a single group directly after its HELP/TYPE lines.
# Globals:   GITLAB_CTL_CMD (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_service_status() {
  if ! command -v "$GITLAB_CTL_CMD" &>/dev/null; then
    echo ""
    echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)"
    echo "# TYPE gitlab_service_up gauge"
    echo "gitlab_service_up{service=\"unknown\"} 0"
    return
  fi
  local ctl_output
  # The command's exit status is deliberately ignored; only its output is parsed.
  ctl_output=$("$GITLAB_CTL_CMD" status 2>/dev/null || echo "")

  # gitlab-ctl status output format:
  # run: alertmanager: (pid 7163) 22805249s; run: log: (pid 7160) 22805249s;
  # down: sidekiq: 0s, normally up; run: log: (pid 7000) 22805249s;
  local run_re='^run: ([^:]+): \(pid ([0-9]+)\) ([0-9]+)s'
  local down_re='^down: ([^:]+):'
  local up_samples="" pid_samples="" uptime_samples=""
  local line service
  if [[ -n "$ctl_output" ]]; then
    while IFS= read -r line; do
      if [[ "$line" =~ $run_re ]]; then
        service=${BASH_REMATCH[1]// /}
        up_samples+="gitlab_service_up{service=\"$service\"} 1"$'\n'
        pid_samples+="gitlab_service_pid{service=\"$service\"} ${BASH_REMATCH[2]}"$'\n'
        uptime_samples+="gitlab_service_uptime_seconds{service=\"$service\"} ${BASH_REMATCH[3]}"$'\n'
      elif [[ "$line" =~ $down_re ]]; then
        service=${BASH_REMATCH[1]// /}
        up_samples+="gitlab_service_up{service=\"$service\"} 0"$'\n'
        pid_samples+="gitlab_service_pid{service=\"$service\"} 0"$'\n'
        uptime_samples+="gitlab_service_uptime_seconds{service=\"$service\"} 0"$'\n'
      fi
    done <<< "$ctl_output"
  fi

  echo ""
  echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)"
  echo "# TYPE gitlab_service_up gauge"
  printf '%s' "$up_samples"
  echo "# HELP gitlab_service_pid PID of the GitLab service"
  echo "# TYPE gitlab_service_pid gauge"
  printf '%s' "$pid_samples"
  echo "# HELP gitlab_service_uptime_seconds Uptime of the GitLab service in seconds"
  echo "# TYPE gitlab_service_uptime_seconds gauge"
  printf '%s' "$uptime_samples"

  # Count services
  local total_services up_services down_services
  total_services=$(grep -cE '^(run|down):' <<< "$ctl_output" || true)
  up_services=$(grep -cE '^run:' <<< "$ctl_output" || true)
  down_services=$(grep -cE '^down:' <<< "$ctl_output" || true)
  cat <<EOF
# HELP gitlab_services_total Total number of GitLab services
# TYPE gitlab_services_total gauge
gitlab_services_total $total_services
# HELP gitlab_services_up Number of running GitLab services
# TYPE gitlab_services_up gauge
gitlab_services_up $up_services
# HELP gitlab_services_down Number of down GitLab services
# TYPE gitlab_services_down gauge
gitlab_services_down $down_services
EOF
}
#########################
### Version Info ###
#########################
#######################################
# Export the installed GitLab version as an info-style gauge.
# The version is read from the Omnibus version manifest when present,
# otherwise from `$GITLAB_RAKE_CMD gitlab:env:info`; "unknown" is reported
# when neither source yields a value.
# Globals:   GITLAB_RAKE_CMD (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_version_info() {
  local version=""
  if [[ -f /opt/gitlab/version-manifest.txt ]]; then
    # Second field of the manifest's first line.
    version=$(awk 'NR == 1 {print $2; exit}' /opt/gitlab/version-manifest.txt 2>/dev/null || true)
  elif command -v "$GITLAB_RAKE_CMD" &>/dev/null; then
    # Honour the configured rake command; keep only the first match so the
    # label can never contain a line break.
    version=$("$GITLAB_RAKE_CMD" gitlab:env:info 2>/dev/null | grep -oP 'GitLab:\s+\K[\d.]+' | head -1 || true)
  fi
  # A successful but empty extraction must not produce an empty label.
  [[ -n "$version" ]] || version="unknown"
  cat <<EOF
# HELP gitlab_version_info GitLab version information
# TYPE gitlab_version_info gauge
gitlab_version_info{version="$version"} 1
EOF
}
#########################
### Database Health ###
#########################
#######################################
# Run one SQL statement against the GitLab database and print the result in
# psql's unaligned, tuples-only format (-t -A).
# Globals:   DB_TYPE, DB_HOST, DB_PORT, DB_USER, DB_PASS, DB_NAME,
#            GITLAB_PSQL_CMD (read)
# Arguments: $1 - SQL text
# Outputs:   query result on stdout; psql diagnostics are discarded
# Returns:   psql's status; 1 when no database access is configured
#######################################
run_db_query() {
  local query="$1"
  # Use the database name parsed from database.yml, falling back to the
  # Omnibus default when detection has not run.
  local db_name="${DB_NAME:-gitlabhq_production}"
  case "${DB_TYPE:-none}" in
    external)
      # External DB (AWS RDS, etc) — use psql with individual flags
      PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$db_name" -t -A -c "$query" 2>/dev/null
      ;;
    local)
      # Local Omnibus DB — gitlab-psql wraps psql with correct socket/user.
      # Three invocation styles are tried in turn: -d as the current user,
      # -d as the gitlab-psql user, and the database as a positional argument.
      # sudo runs with -n so it can never block waiting for a password on
      # stdin (which is the client socket when serving an HTTP request).
      "$GITLAB_PSQL_CMD" -t -A -c "$query" -d "$db_name" 2>/dev/null \
        || sudo -n -u gitlab-psql "$GITLAB_PSQL_CMD" -t -A -c "$query" -d "$db_name" 2>/dev/null \
        || "$GITLAB_PSQL_CMD" -t -A -c "$query" "$db_name" 2>/dev/null
      ;;
    *)
      return 1
      ;;
  esac
}
#######################################
# Extract the scalar value from a single "key: value" YAML line.
# Surrounding whitespace is trimmed and one pair of *matching* quotes is
# removed. Quotes inside the value are kept: stripping the two quote kinds
# one after the other would corrupt values such as "abc'".
# This is not a YAML parser: escapes and inline comments are not interpreted.
# Arguments: $1 - raw line, e.g. '  host: "db.example.com"'
# Outputs:   the value on stdout
#######################################
parse_yaml_value() {
  local value="$1"
  # Drop everything up to and including the first colon (the key).
  value="${value#*:}"
  # Trim leading, then trailing, whitespace.
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"
  if (( ${#value} >= 2 )); then
    local first="${value:0:1}" last="${value: -1}"
    if [[ "$first" == "$last" && ( "$first" == '"' || "$first" == "'" ) ]]; then
      value="${value:1:${#value}-2}"
    fi
  fi
  printf '%s\n' "$value"
}
#######################################
# Try to install a PostgreSQL client with the first package manager found
# (amazon-linux-extras, apt-get, dnf, yum - in that order).
# Globals:   DB_TYPE (set to "none" whenever psql is still unavailable)
# Outputs:   progress and error messages via log
#######################################
install_psql_client() {
  if (( EUID != 0 )); then
    log "ERROR: psql client not found. Run as root to auto-install, or install manually."
    DB_TYPE="none"
    return
  fi
  log "Attempting to install PostgreSQL client..."

  # Select the package manager first, then dispatch on it.
  local manager="" candidate
  for candidate in amazon-linux-extras apt-get dnf yum; do
    if command -v "$candidate" &>/dev/null; then
      manager=$candidate
      break
    fi
  done

  case "$manager" in
    amazon-linux-extras)
      if amazon-linux-extras install postgresql14 -y &>/dev/null; then
        log "Installed psql via amazon-linux-extras"
      fi
      ;;
    apt-get)
      if apt-get update -qq && apt-get install -y -qq postgresql-client &>/dev/null; then
        log "Installed postgresql-client via apt"
      fi
      ;;
    dnf)
      if dnf install -y -q postgresql &>/dev/null; then
        log "Installed postgresql via dnf"
      fi
      ;;
    yum)
      if yum install -y -q postgresql &>/dev/null; then
        log "Installed postgresql via yum"
      fi
      ;;
    *)
      log "ERROR: Cannot auto-install psql client. Install manually."
      DB_TYPE="none"
      return
      ;;
  esac

  # Verify install worked
  if ! command -v psql &>/dev/null; then
    log "ERROR: psql still not found after install attempt"
    DB_TYPE="none"
  fi
}
#######################################
# Try to install a newer PostgreSQL client after a SCRAM authentication
# failure. Without root privileges only manual instructions are logged.
# Outputs:   progress and error messages via log
# Returns:   0 in all cases (best effort)
#######################################
upgrade_psql_client() {
  if [[ $EUID -ne 0 ]]; then
    log "ERROR: Cannot upgrade psql — run as root. Or upgrade manually:"
    log " Amazon Linux 2: amazon-linux-extras install postgresql14 -y"
    # The macro is written with a single '%': these lines are echoed, not
    # passed through printf, and `rpm -E %%{rhel}` would print the literal
    # text "%{rhel}" instead of the release number.
    log " RHEL/CentOS: yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-\$(rpm -E %{rhel})-x86_64/pgdg-redhat-repo-latest.noarch.rpm && yum install -y postgresql16"
    log " Ubuntu/Debian: apt install -y postgresql-client-16"
    return
  fi
  log "Attempting to upgrade PostgreSQL client for SCRAM support..."
  if command -v amazon-linux-extras &>/dev/null; then
    if amazon-linux-extras install postgresql14 -y &>/dev/null; then
      log "Upgraded psql via amazon-linux-extras"
      return
    fi
  fi

  local rpm_tool=""
  if command -v dnf &>/dev/null; then
    rpm_tool="dnf"
  elif command -v yum &>/dev/null; then
    rpm_tool="yum"
  fi

  if [[ -n "$rpm_tool" ]]; then
    local rhel_major
    rhel_major=$(rpm -E '%{rhel}' 2>/dev/null || true)
    # Registering the PGDG repository is best effort: under `set -e` a failure
    # here (for example the repo package is already installed) must not abort
    # the script before the client install is even attempted.
    "$rpm_tool" install -y -q "https://download.postgresql.org/pub/repos/yum/reporpms/EL-${rhel_major}-x86_64/pgdg-redhat-repo-latest.noarch.rpm" &>/dev/null || true
    if "$rpm_tool" install -y -q postgresql16 &>/dev/null; then
      log "Upgraded to postgresql16 via pgdg repo"
      return
    fi
  elif command -v apt-get &>/dev/null; then
    if apt-get update -qq && apt-get install -y -qq postgresql-client-16 &>/dev/null; then
      log "Upgraded to postgresql-client-16 via apt"
      return
    fi
  fi
  log "ERROR: Auto-upgrade failed. Please upgrade manually."
}
#######################################
# Decide how the GitLab database is reached and publish the result in DB_*.
#   external - database.yml names a host that is neither localhost,
#              127.0.0.1 nor a socket path, and a psql client is available
#   local    - the Omnibus gitlab-psql wrapper can run a test query
#   none     - no usable access
# Globals:   GITLAB_DB_CONFIG, GITLAB_PSQL_CMD (read);
#            DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASS, DB_TYPE (written)
# Outputs:   diagnostics via log (the password is never logged)
#######################################
detect_database_config() {
  DB_HOST=""
  DB_PORT="5432"
  DB_NAME="gitlabhq_production"
  DB_USER=""
  DB_PASS=""
  DB_TYPE="none"
  # Parse GitLab's database.yml to detect external DB
  if [[ -f "$GITLAB_DB_CONFIG" ]]; then
    local prod_section
    # Everything from "production:" up to (not including) the next top-level
    # key. When production is the last section the whole tail is kept.
    prod_section=$(awk '
      /^production:/ { inside = 1; print; next }
      inside && /^[a-z]/ { exit }
      inside { print }
    ' "$GITLAB_DB_CONFIG" 2>/dev/null || true)

    local key raw value
    for key in host port database username password; do
      # A missing key is normal: grep's "no match" status must not trip set -e.
      raw=$(grep -m1 -E "^[[:space:]]*${key}:" <<< "$prod_section" || true)
      [[ -n "$raw" ]] || continue
      value=$(parse_yaml_value "$raw")
      case "$key" in
        host) DB_HOST=$value ;;
        # Keep the built-in defaults when the file leaves these blank.
        port) DB_PORT=${value:-$DB_PORT} ;;
        database) DB_NAME=${value:-$DB_NAME} ;;
        username) DB_USER=$value ;;
        password) DB_PASS=$value ;;
      esac
    done
    log "DB config parsed: host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER pass=<redacted>"

    # If host is set and not localhost/socket path, treat as external DB
    if [[ -n "$DB_HOST" && "$DB_HOST" != "localhost" && "$DB_HOST" != "127.0.0.1" && ! "$DB_HOST" =~ ^/ ]]; then
      if ! command -v psql &>/dev/null; then
        log "WARNING: External DB detected at $DB_HOST but psql client not found"
        install_psql_client
      fi
      # Re-check after the install attempt so a freshly installed client is
      # used right away instead of leaving DB_TYPE at "none".
      if command -v psql &>/dev/null; then
        DB_TYPE="external"
        log "Detected external database at $DB_HOST:$DB_PORT"
        # Verify connectivity
        local test_result
        if test_result=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT 1;" 2>&1); then
          log "External DB connection test: OK"
        elif grep -qi "SCRAM.*libpq" <<< "$test_result"; then
          log "ERROR: $test_result"
          log "The installed psql client is too old for SCRAM authentication."
          upgrade_psql_client
        else
          log "WARNING: External DB connection test failed: $test_result"
        fi
      else
        DB_TYPE="none"
      fi
      return
    fi
  else
    log "WARNING: Database config not found at $GITLAB_DB_CONFIG"
  fi

  # Fall back to local gitlab-psql
  if command -v "$GITLAB_PSQL_CMD" &>/dev/null; then
    # sudo -n: never wait for a password prompt.
    if "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d "$DB_NAME" &>/dev/null \
      || sudo -n -u gitlab-psql "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d "$DB_NAME" &>/dev/null \
      || "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" "$DB_NAME" &>/dev/null; then
      DB_TYPE="local"
    else
      log "WARNING: gitlab-psql found but cannot connect — check permissions (run as root?)"
      DB_TYPE="none"
    fi
  fi
}
#######################################
# Export database reachability, size, connection count, ungranted lock count
# and the highest schema_migrations version.
# Prints nothing when DB_TYPE is "none"; when the connectivity probe fails
# all numeric values are reported as 0.
# Globals:   DB_TYPE (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_database_health() {
  if [[ "$DB_TYPE" == "none" ]]; then
    return
  fi

  # Check database connectivity
  local reachable=0
  run_db_query "SELECT 1;" &>/dev/null && reachable=1

  local size_bytes=0 open_connections=0 waiting_locks=0 latest_schema="0"
  if (( reachable == 1 )); then
    # Each result has its spaces removed; an empty result falls back to 0.
    size_bytes=$(run_db_query "SELECT pg_database_size(current_database());" || true)
    size_bytes=${size_bytes// /}
    size_bytes=${size_bytes:-0}

    open_connections=$(run_db_query "SELECT count(*) FROM pg_stat_activity WHERE datname = current_database();" || true)
    open_connections=${open_connections// /}
    open_connections=${open_connections:-0}

    # Active locks (high lock count can indicate migration issues)
    waiting_locks=$(run_db_query "SELECT count(*) FROM pg_locks WHERE NOT granted;" || true)
    waiting_locks=${waiting_locks// /}
    waiting_locks=${waiting_locks:-0}

    # Schema migration version (latest applied)
    latest_schema=$(run_db_query "SELECT MAX(version) FROM schema_migrations;" || true)
    latest_schema=${latest_schema// /}
    latest_schema=${latest_schema:-0}
  fi

  # Numeric encoding of the access type: 1 for local, 2 for anything else.
  local type_code=2
  if [ "$DB_TYPE" = "local" ]; then
    type_code=1
  fi

  cat <<EOF
# HELP gitlab_database_up Whether the GitLab database is reachable
# TYPE gitlab_database_up gauge
gitlab_database_up $reachable
# HELP gitlab_database_type Database type (1=local, 2=external)
# TYPE gitlab_database_type gauge
gitlab_database_type{type="$DB_TYPE"} $type_code
# HELP gitlab_database_size_bytes Size of the GitLab database in bytes
# TYPE gitlab_database_size_bytes gauge
gitlab_database_size_bytes $size_bytes
# HELP gitlab_database_connections Active database connections
# TYPE gitlab_database_connections gauge
gitlab_database_connections $open_connections
# HELP gitlab_database_waiting_locks Number of waiting (not granted) locks
# TYPE gitlab_database_waiting_locks gauge
gitlab_database_waiting_locks $waiting_locks
# HELP gitlab_database_schema_version Latest applied schema migration version
# TYPE gitlab_database_schema_version gauge
gitlab_database_schema_version $latest_schema
EOF
}
#########################
### Log Tail Metrics ###
#########################
#######################################
# Count migration-related error and warning lines in the last 1000 lines of
# production.log, and error lines in the last 500 lines of the reconfigure
# log. A missing log file leaves the corresponding counter at 0.
# Globals:   LOG_DIR (read)
# Outputs:   Prometheus text format on stdout
#######################################
collect_log_metrics() {
  local rails_log="$LOG_DIR/gitlab-rails/production.log"
  local reconf_log="$LOG_DIR/reconfigure/latest"
  local err_lines=0 warn_lines=0 reconf_err_lines=0

  # Check production.log for recent migration activity
  if [[ -f "$rails_log" ]]; then
    # grep -c exits non-zero when the count is 0, hence the `|| true`.
    err_lines=$(tail -n 1000 "$rails_log" 2>/dev/null \
      | grep -ciE 'migration.*error|error.*migration|ActiveRecord::StatementInvalid' || true)
    warn_lines=$(tail -n 1000 "$rails_log" 2>/dev/null \
      | grep -ciE 'migration.*warning|warning.*migration|deprecated' || true)
  fi

  # Check reconfigure log
  if [[ -f "$reconf_log" ]]; then
    reconf_err_lines=$(tail -n 500 "$reconf_log" 2>/dev/null \
      | grep -ciE 'error|fatal|failed' || true)
  fi

  echo "# HELP gitlab_migration_log_errors Recent migration-related errors in logs"
  echo "# TYPE gitlab_migration_log_errors gauge"
  echo "gitlab_migration_log_errors $err_lines"
  echo "# HELP gitlab_migration_log_warnings Recent migration-related warnings in logs"
  echo "# TYPE gitlab_migration_log_warnings gauge"
  echo "gitlab_migration_log_warnings $warn_lines"
  echo "# HELP gitlab_reconfigure_log_errors Recent errors in reconfigure log"
  echo "# TYPE gitlab_reconfigure_log_errors gauge"
  echo "gitlab_reconfigure_log_errors $reconf_err_lines"
}
#########################
### Collect All Metrics ###
#########################
#######################################
# Produce the complete metrics document: a three-line comment header followed
# by the output of every collector, separated by one blank line each.
# Outputs:   Prometheus text format on stdout
#######################################
collect_all_metrics() {
  local host
  host=$(hostname -f 2>/dev/null || hostname)
  printf '# GitLab Migration Metrics Exporter\n# Host: %s\n# Collected at: %s\n' \
    "$host" "$(date -Iseconds)"

  # Collectors run in this fixed order; the first one follows the header
  # directly, every later one is preceded by a blank line.
  local collector is_first=1
  for collector in \
    collect_version_info \
    collect_migration_status \
    collect_running_migration \
    collect_reconfigure_status \
    collect_background_migrations \
    collect_service_status \
    collect_database_health \
    collect_log_metrics; do
    if (( is_first )); then
      is_first=0
    else
      echo ""
    fi
    "$collector"
  done
}
#########################
### HTTP Server ###
#########################
#######################################
# Write a complete HTTP/1.1 response to stdout.
# Header lines end in CRLF and are separated from the body by an empty line;
# Content-Length is the body's size in bytes.
# Arguments: $1 - status (e.g. "200 OK"), $2 - content type, $3 - body
#######################################
_http_respond() {
  local status=$1 content_type=$2 body=$3
  local length
  length=$(printf '%s' "$body" | wc -c)
  printf 'HTTP/1.1 %s\r\nContent-Type: %s\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' \
    "$status" "$content_type" "$((length))" "$body"
}

#######################################
# Serve exactly one HTTP request read from stdin (socat starts a fresh
# process per connection via --handle-request).
#   /metrics, /        metrics document, cached for SCRAPE_INTERVAL seconds
#   /health, /healthz  liveness probe
#   anything else      404
# Because each request runs in its own process, cache freshness is judged by
# the cache file's mtime; a shell variable would not survive between requests.
# Globals:   METRICS_CACHE, SCRAPE_INTERVAL (read); DB_* (written when the
#            database configuration has not been detected in this process)
# Outputs:   HTTP response on stdout
#######################################
handle_request() {
  local request_line="" line path=""
  # Read the request head up to the empty line; only the first line matters.
  while IFS= read -r line; do
    line="${line%$'\r'}"
    [[ -z "$line" ]] && break
    if [[ -z "$request_line" ]]; then
      request_line="$line"
    fi
  done
  read -r _ path _ <<< "$request_line" || true
  # Ignore any query string when routing.
  path="${path%%\?*}"

  case "$path" in
    /metrics|/)
      local now cache_mtime metrics tmp_cache
      now=$(date +%s)
      cache_mtime=$(stat -c %Y "$METRICS_CACHE" 2>/dev/null || echo 0)
      if [[ -s "$METRICS_CACHE" ]] && (( now - cache_mtime < SCRAPE_INTERVAL )); then
        metrics=$(cat "$METRICS_CACHE")
      else
        # This process did not go through setup(): without detection the
        # collectors would hit an unset DB_TYPE under `set -u`.
        if [[ -z "${DB_TYPE:-}" ]]; then
          detect_database_config >&2
        fi
        if ! metrics=$(collect_all_metrics); then
          _http_respond "500 Internal Server Error" "text/plain" $'Metric collection failed\n'
          return 0
        fi
        # Refresh the cache atomically; a failure to cache is not fatal.
        mkdir -p "$(dirname "$METRICS_CACHE")" 2>/dev/null || true
        if tmp_cache=$(mktemp "${METRICS_CACHE}.XXXXXX" 2>/dev/null); then
          { printf '%s\n' "$metrics" > "$tmp_cache" && mv -f "$tmp_cache" "$METRICS_CACHE"; } \
            || rm -f "$tmp_cache"
        fi
      fi
      _http_respond "200 OK" "text/plain; version=0.0.4; charset=utf-8" "$metrics"$'\n'
      ;;
    /health|/healthz)
      _http_respond "200 OK" "text/plain" $'OK\n'
      ;;
    *)
      _http_respond "404 Not Found" "text/plain" $'Not Found\n'
      ;;
  esac
}
#######################################
# Run the HTTP listener: socat accepts connections on LISTEN_PORT and starts
# this script with --handle-request for each one. The listener is restarted
# after 5 seconds whenever socat exits with an error.
# Globals:   LISTEN_PORT, SCRAPE_INTERVAL, RAKE_INTERVAL, LOG_DIR, DB_* (read,
#            exported to the per-request child processes)
# Returns:   never returns under normal operation
#######################################
start_server() {
  # Each request is served by a fresh process that re-reads its settings from
  # the environment. Export what was set on the command line or detected
  # during setup so the children neither fall back to defaults nor need to
  # repeat database detection. DB_PASS is therefore visible in the
  # environment of socat and its children.
  export SCRAPE_INTERVAL RAKE_INTERVAL
  export GITLAB_EXPORTER_PORT="$LISTEN_PORT"
  export GITLAB_LOG_DIR="$LOG_DIR"
  export DB_TYPE DB_HOST DB_PORT DB_NAME DB_USER DB_PASS

  # Use an absolute path so the handler is found regardless of how the script
  # was invoked.
  local self
  self=$(readlink -f "$0" 2>/dev/null || echo "$0")

  log "Starting GitLab Migration Metrics Exporter on port $LISTEN_PORT"
  log "Metrics available at http://localhost:$LISTEN_PORT/metrics"
  log "Collection interval: ${SCRAPE_INTERVAL}s"
  while true; do
    socat TCP-LISTEN:"$LISTEN_PORT",reuseaddr,fork EXEC:"$self --handle-request" 2>/dev/null || {
      log "Server error, restarting in 5 seconds..."
      sleep 5
    }
  done
}
#########################
### Main ###
#########################
#######################################
# Entry point: parse options, run setup, then act according to the mode.
#   HTTP mode       - start the listener
#   OUTPUT_FILE set - write the metrics to that file via a temp file + rename
#   otherwise       - print the metrics to stdout
# Globals:   HTTP_MODE, OUTPUT_FILE (read)
# Arguments: the script's command-line arguments
# Returns:   exits 1 when collection fails or yields fewer than 10 lines
#######################################
main() {
  parse_args "$@"
  setup

  if [[ "$HTTP_MODE" == "true" ]]; then
    start_server
    return
  fi

  # Default: output to stdout
  if [[ -z "$OUTPUT_FILE" ]]; then
    collect_all_metrics
    return
  fi

  # Textfile collector mode: write atomically using temp file
  local target_dir staging line_count
  target_dir="$(dirname "$OUTPUT_FILE")"
  mkdir -p "$target_dir"
  staging=$(mktemp "${target_dir}/.gitlab_migration_metrics.XXXXXX")
  collect_all_metrics > "$staging" 2>/dev/null || {
    rm -f "$staging"
    log "ERROR: Failed to generate metrics"
    exit 1
  }

  # An implausibly short result is discarded so the previous file survives.
  line_count=$(wc -l < "$staging" 2>/dev/null || echo 0)
  if (( line_count < 10 )); then
    rm -f "$staging"
    log "ERROR: Metrics file too small ($line_count lines), keeping previous"
    exit 1
  fi

  chmod 644 "$staging"
  mv -f "$staging" "$OUTPUT_FILE"
  log "Metrics written to $OUTPUT_FILE ($line_count lines)"
}
# Script entry point: hand all command-line arguments to main.
main "$@"