88551536e6
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
1028 lines
34 KiB
Bash
1028 lines
34 KiB
Bash
#!/bin/bash
|
|
#############################################################
#### GitLab Migration Metrics Exporter for Prometheus    ####
#### Monitors database migration progress during updates ####
####                                                     ####
#### Author: Phil Connor                                 ####
#### Contact: contact@mylinux.work                       ####
#### License: MIT                                        ####
#### Version: 1.0                                        ####
####                                                     ####
#### Usage: ./gitlab-migration-exporter.sh [OPTIONS]     ####
#############################################################
#
# Metrics collected:
#   - Migration status: total, completed, pending, failed, running
#   - Current migration: name, version, elapsed time
#   - Background migrations: batched job queue status
#   - GitLab services: up/down status per service
#   - GitLab version: current installed version
#   - Reconfigure status: whether gitlab-ctl reconfigure is running
#   - Upgrade progress: overall percentage estimate
#
# Requirements:
#   - GitLab Omnibus installation
#   - socat (for HTTP server)
#   - Root or gitlab-psql access
#
|
set -euo pipefail
|
|
|
|
#########################
|
|
### Configuration ###
|
|
#########################
|
|
|
|
# Runtime settings. Every value keeps its previous default; the ones that used
# to be hard-coded (database.yml path, exporter log file, textfile directory,
# state directory) can now also be overridden from the environment, which lets
# the exporter run without root and against non-standard layouts.
LISTEN_PORT="${GITLAB_EXPORTER_PORT:-9177}"
SCRAPE_INTERVAL="${SCRAPE_INTERVAL:-30}"
RAKE_INTERVAL="${RAKE_INTERVAL:-300}"
GITLAB_RAILS_CMD="${GITLAB_RAILS_CMD:-gitlab-rails}"
GITLAB_RAKE_CMD="${GITLAB_RAKE_CMD:-gitlab-rake}"
GITLAB_CTL_CMD="${GITLAB_CTL_CMD:-gitlab-ctl}"
GITLAB_PSQL_CMD="${GITLAB_PSQL_CMD:-gitlab-psql}"
GITLAB_DB_CONFIG="${GITLAB_DB_CONFIG:-/var/opt/gitlab/gitlab-rails/etc/database.yml}"
LOG_DIR="${GITLAB_LOG_DIR:-/var/log/gitlab}"
LOGFILE="${GITLAB_EXPORTER_LOGFILE:-/var/log/gitlab-migration-exporter.log}"

# Output mode
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter}"
OUTPUT_FILE=""
HTTP_MODE=false

# Cache state
# NOTE(review): the default lives under /tmp with a predictable name. When the
# exporter runs as root on a multi-user host, consider pointing
# GITLAB_EXPORTER_STATE_DIR at a root-owned directory instead.
STATE_DIR="${GITLAB_EXPORTER_STATE_DIR:-/tmp/gitlab-migration-metrics}"
METRICS_CACHE="$STATE_DIR/metrics_cache"
RAKE_CACHE="$STATE_DIR/rake_cache"
LOCKFILE="$STATE_DIR/exporter.pid"
LAST_SCRAPE=0
|
|
|
|
#########################
|
|
### Logging ###
|
|
#########################
|
|
|
|
# Write a timestamped message to stderr and append it to $LOGFILE.
# Globals:   LOGFILE (read)
# Arguments: message words, joined with spaces
# Returns:   always 0. Appending to the log file is best effort, so an
#            unwritable $LOGFILE (e.g. when not running as root) can no
#            longer abort the whole script under `set -e` / `pipefail`.
log() {
    local entry
    entry="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
    printf '%s\n' "$entry" >&2
    # The group redirect hides the shell's own "cannot open" diagnostic.
    { printf '%s\n' "$entry" >> "$LOGFILE"; } 2>/dev/null || true
}
|
|
|
|
#########################
|
|
### Parse Arguments ###
|
|
#########################
|
|
|
|
# Abort with a usage error when option "$1" is not followed by a value.
# $2 is the number of arguments still to be parsed, the option included.
_require_option_value() {
    if [[ "$2" -lt 2 ]]; then
        log "ERROR: option $1 requires a value"
        exit 1
    fi
}

# Abort with a usage error unless "$2" (the value given to option "$1") is a
# non-negative integer.
_require_integer() {
    if [[ ! "$2" =~ ^[0-9]+$ ]]; then
        log "ERROR: option $1 expects a non-negative integer, got '$2'"
        exit 1
    fi
}

# Parse command-line options into the global settings.
# Globals:   OUTPUT_FILE, HTTP_MODE, LISTEN_PORT, SCRAPE_INTERVAL,
#            RAKE_INTERVAL, LOG_DIR (written); TEXTFILE_DIR (read)
# Arguments: the script's command line ("$@")
# Exits:     0 after --help or --handle-request; 1 on an unknown option, a
#            missing option value, or a non-numeric port/interval.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/gitlab_migration.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            --port|-p)
                _require_option_value "$1" "$#"
                _require_integer "$1" "$2"
                LISTEN_PORT="$2"
                HTTP_MODE=true
                shift 2
                ;;
            --output|-o)
                _require_option_value "$1" "$#"
                OUTPUT_FILE="$2"
                shift 2
                ;;
            --interval)
                _require_option_value "$1" "$#"
                _require_integer "$1" "$2"
                SCRAPE_INTERVAL="$2"
                shift 2
                ;;
            --rake-interval)
                _require_option_value "$1" "$#"
                _require_integer "$1" "$2"
                RAKE_INTERVAL="$2"
                shift 2
                ;;
            --log-dir)
                _require_option_value "$1" "$#"
                LOG_DIR="$2"
                shift 2
                ;;
            --handle-request)
                # Internal mode: socat runs one such process per connection.
                handle_request
                exit 0
                ;;
            --help|-h)
                cat <<EOF
GitLab Migration Metrics Exporter for Prometheus

Usage: $0 [OPTIONS]

MODES:
    --textfile              Write to node_exporter textfile collector
    --http                  Run HTTP server on port $LISTEN_PORT
    (no flag)               Output to stdout

OPTIONS:
    -p, --port PORT         HTTP port (default: 9177)
    -o, --output PATH       Output file path
    --interval SECS         Minimum seconds between metric collections (default: 30)
    --rake-interval SECS    Minimum seconds between gitlab-rake calls (default: 300)
    --log-dir PATH          GitLab log directory (default: /var/log/gitlab)
    --help                  Show this help

EXAMPLES:
    $0 --textfile                       # Write to textfile collector
    $0 --http --port 9177               # Run HTTP server
    $0 -o /tmp/gitlab_migration.prom    # Write to custom file
    $0                                  # Output to stdout

Environment Variables:
    GITLAB_EXPORTER_PORT    Same as --port
    SCRAPE_INTERVAL         Same as --interval
    RAKE_INTERVAL           Same as --rake-interval
    GITLAB_RAILS_CMD        Path to gitlab-rails (default: gitlab-rails)
    GITLAB_RAKE_CMD         Path to gitlab-rake (default: gitlab-rake)
    GITLAB_CTL_CMD          Path to gitlab-ctl (default: gitlab-ctl)
    GITLAB_PSQL_CMD         Path to gitlab-psql (default: gitlab-psql)
    GITLAB_LOG_DIR          Same as --log-dir

EOF
                exit 0
                ;;
            *)
                log "Unknown option: $1"
                exit 1
                ;;
        esac
    done
}
|
|
|
|
#########################
|
|
### Self-Install ###
|
|
#########################
|
|
|
|
# Install this script under /usr/local/bin and register a systemd unit for it.
# Globals:   LISTEN_PORT (read)
# Outputs:   progress messages through log()
# Notes:     - the copy is skipped when the script already runs from the
#              install location; an existing unit file is never overwritten
#            - the unit starts the exporter with --http and the current port.
#              Without a mode flag the script prints the metrics once and
#              exits, so Restart=always would relaunch it every 10 seconds
#              instead of serving scrapes.
install_service() {
    local script_path="/usr/local/bin/gitlab-migration-exporter.sh"
    local service_path="/etc/systemd/system/gitlab-migration-exporter.service"
    local running_from
    running_from="$(readlink -f "$0")"

    # Copy script into place if not already there
    if [[ "$running_from" != "$script_path" ]]; then
        log "Installing script to $script_path"
        cp "$running_from" "$script_path"
        chmod 755 "$script_path"
    fi

    # Create systemd service unit
    if [[ ! -f "$service_path" ]]; then
        log "Creating systemd service at $service_path"
        cat > "$service_path" <<UNIT
[Unit]
Description=GitLab Migration Metrics Exporter for Prometheus
Wants=network-online.target
After=network-online.target

[Service]
User=root
Group=root
Type=simple
ExecStart=${script_path} --http --port ${LISTEN_PORT}
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
UNIT
        systemctl daemon-reload
        systemctl enable gitlab-migration-exporter
        log "Service installed and enabled"
    fi
}
|
|
|
|
#########################
|
|
### Setup ###
|
|
#########################
|
|
|
|
# Single-instance guard based on a PID file.
# Globals:   LOCKFILE (read; the file is rewritten)
# Behavior:  if $LOCKFILE names a process that still answers `kill -0`, log
#            the conflict and exit 0; otherwise drop the stale file and
#            record our own PID in it.
check_already_running() {
    local holder

    if [[ -f "$LOCKFILE" ]]; then
        holder=$(cat "$LOCKFILE" 2>/dev/null)
        if [[ -z "$holder" ]] || ! kill -0 "$holder" 2>/dev/null; then
            # Nobody alive behind this lock: it is stale
            rm -f "$LOCKFILE"
        else
            log "ERROR: Already running (PID $holder). Exiting."
            exit 0
        fi
    fi

    printf '%s\n' "$$" > "$LOCKFILE"
}
|
|
|
|
# Remove the PID lockfile, but only when it belongs to this process.
# Globals:   LOCKFILE (read)
# Returns:   always 0
# Why:       the EXIT trap below fires for every invocation of the script,
#            including the short-lived `--handle-request` children, `--help`,
#            and a second instance that refuses to start. Deleting the file
#            unconditionally would remove the lock of the running exporter.
cleanup_lock() {
    local owner=""
    [[ -n "${LOCKFILE:-}" && -f "$LOCKFILE" ]] || return 0
    owner=$(cat "$LOCKFILE" 2>/dev/null) || return 0
    if [[ "$owner" == "$$" ]]; then
        rm -f "$LOCKFILE"
    fi
    return 0
}

# Clean up on every exit path. INT/TERM must end the process explicitly: a
# handler that simply returns lets the script carry on (the server loop would
# restart socat after being signalled). `exit` then triggers the EXIT trap.
trap cleanup_lock EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
|
|
# Prepare the runtime environment before any metrics are collected.
# Globals:   STATE_DIR, HTTP_MODE, GITLAB_CTL_CMD (read); DB_TYPE is set by
#            detect_database_config
# Steps:     create the state directory, take the single-instance lock, make
#            sure socat exists when (and only when) the HTTP server will be
#            started, self-install the systemd unit for root in HTTP mode,
#            then detect the database configuration.
# Exits:     1 when HTTP mode is requested and socat is missing and cannot be
#            installed.
setup() {
    mkdir -p "$STATE_DIR"
    check_already_running

    # socat is only used by the HTTP server; stdout and textfile modes must
    # not fail (or install packages) because it is absent.
    if [[ "$HTTP_MODE" == "true" ]] && ! command -v socat &>/dev/null; then
        log "socat not found, attempting to install..."
        if [[ $EUID -ne 0 ]]; then
            log "ERROR: socat is required. Run as root to auto-install, or install manually."
            exit 1
        fi

        if command -v apt-get &>/dev/null; then
            { apt-get update -qq && apt-get install -y -qq socat; } || true
        elif command -v dnf &>/dev/null; then
            dnf install -y -q socat || true
        elif command -v yum &>/dev/null; then
            yum install -y -q socat || true
        else
            log "ERROR: Cannot auto-install socat. Install manually."
            exit 1
        fi

        # Trust the result, not the package manager's exit status
        if ! command -v socat &>/dev/null; then
            log "ERROR: socat installation failed. Install manually."
            exit 1
        fi
        log "socat installed successfully"
    fi

    # Self-install on first run if running in HTTP mode as root
    if [[ "$HTTP_MODE" == "true" && $EUID -eq 0 ]]; then
        install_service
    fi

    if ! command -v "$GITLAB_CTL_CMD" &>/dev/null; then
        log "WARNING: $GITLAB_CTL_CMD not found - is GitLab Omnibus installed?"
    fi

    detect_database_config
    log "Database type: $DB_TYPE"
}
|
|
|
|
#########################
|
|
### Migration Status ###
|
|
#########################
|
|
|
|
# Render the schema-migration metrics in Prometheus text format.
# Arguments: $1 total, $2 completed, $3 pending, $4 progress percentage,
#            $5 filtered `db:migrate:status` lines ("up"/"down" rows only)
_render_migration_status() {
    local total="$1" up_count="$2" down_count="$3" progress="$4" rows="$5"
    local version name

    cat <<EOF
# HELP gitlab_migrations_total Total number of database migrations
# TYPE gitlab_migrations_total gauge
gitlab_migrations_total $total

# HELP gitlab_migrations_completed Number of completed (up) migrations
# TYPE gitlab_migrations_completed gauge
gitlab_migrations_completed $up_count

# HELP gitlab_migrations_pending Number of pending (down) migrations
# TYPE gitlab_migrations_pending gauge
gitlab_migrations_pending $down_count

# HELP gitlab_migrations_progress_percent Percentage of migrations completed
# TYPE gitlab_migrations_progress_percent gauge
gitlab_migrations_progress_percent $progress
EOF

    # Emit individual pending migration details
    if [[ $down_count -gt 0 && -n "$rows" ]]; then
        echo ""
        echo "# HELP gitlab_migration_pending_info Info about each pending migration"
        echo "# TYPE gitlab_migration_pending_info gauge"
        grep -E '^\s*down\s' <<< "$rows" | while read -r _ version name; do
            name="${name:-unknown}"
            # Keep only characters that are safe inside a label value
            name=$(printf '%s' "$name" | tr -cd '[:alnum:]_')
            echo "gitlab_migration_pending_info{version=\"$version\",name=\"$name\"} 1"
        done
    fi
}

# Print schema-migration metrics, running `gitlab-rake db:migrate:status` at
# most once per $RAKE_INTERVAL seconds.
# Globals:   RAKE_CACHE, RAKE_INTERVAL, GITLAB_RAKE_CMD (read)
# Outputs:   metrics on stdout; a fresh result is also stored in $RAKE_CACHE
# Notes:     - cache age is taken from the file's mtime, so caching also works
#              across separate invocations (textfile mode)
#            - the cache is replaced with a rename so that a concurrent reader
#              never sees a half-written file; failing to store the cache does
#              not suppress the metrics
#            - when the rake command is missing or prints no status rows, all
#              counters are reported (and cached) as 0
collect_migration_status() {
    local now cache_mtime cache_age
    now=$(date +%s)

    # Rake is expensive (can take 2+ mins), use separate cache interval
    if [[ -f "$RAKE_CACHE" ]]; then
        cache_mtime=$(stat -c %Y "$RAKE_CACHE" 2>/dev/null || echo "0")
        cache_age=$(( now - cache_mtime ))
        if [[ $cache_age -lt $RAKE_INTERVAL ]]; then
            cat "$RAKE_CACHE"
            return
        fi
    fi

    local total=0 up_count=0 down_count=0 migrate_output="" progress=0

    if command -v "$GITLAB_RAKE_CMD" &>/dev/null; then
        migrate_output=$("$GITLAB_RAKE_CMD" db:migrate:status 2>/dev/null \
            | grep -E '^\s*(up|down)\s' || echo "")

        if [[ -n "$migrate_output" ]]; then
            total=$(grep -c '' <<< "$migrate_output" || true)
            up_count=$(grep -cE '^\s*up\s' <<< "$migrate_output" || true)
            down_count=$(grep -cE '^\s*down\s' <<< "$migrate_output" || true)
        fi
    fi

    if [[ $total -gt 0 ]]; then
        progress=$(awk -v u="$up_count" -v t="$total" \
            'BEGIN {printf "%.2f", (u / t) * 100}')
    fi

    local report staged
    report=$(_render_migration_status \
        "$total" "$up_count" "$down_count" "$progress" "$migrate_output")
    printf '%s\n' "$report"

    # Stage next to the cache, then rename into place
    if staged=$(mktemp "${RAKE_CACHE}.XXXXXX" 2>/dev/null); then
        if printf '%s\n' "$report" > "$staged" && mv -f "$staged" "$RAKE_CACHE"; then
            :
        else
            rm -f "$staged"
        fi
    fi
}
|
|
|
|
#########################
|
|
### Running Migration ###
|
|
#########################
|
|
|
|
# Report whether a database migration (or a `gitlab-ctl upgrade`) is running.
# Globals:   LOG_DIR (read)
# Outputs:   gitlab_migration_running / _pid / _elapsed_seconds and, when a
#            name can be found in production.log, gitlab_migration_current_info
# Notes:     - `rake db:migrate:status` processes are ignored: the exporter
#              starts those itself and they would otherwise be reported as a
#              running migration
#            - an empty `ps` answer (process already gone) is reported as 0
#              instead of producing a sample without a value
collect_running_migration() {
    local migration_running=0 migration_pid=0 migration_elapsed=0
    local current_migration_name=""
    local rake_pid upgrade_pid

    # Check if rake db:migrate is running; -a adds the command line so the
    # status query can be filtered out.
    rake_pid=$(pgrep -af "rake.*db:migrate" 2>/dev/null \
        | awk '!/db:migrate:status/ {print $1; exit}' || true)

    if [[ -n "$rake_pid" ]]; then
        migration_running=1
        migration_pid=$rake_pid
        # Get elapsed time in seconds
        migration_elapsed=$(ps -o etimes= -p "$rake_pid" 2>/dev/null | tr -d ' ' || true)
        migration_elapsed=${migration_elapsed:-0}

        # Try to find the current migration from the log
        if [[ -f "$LOG_DIR/gitlab-rails/production.log" ]]; then
            current_migration_name=$(grep -oP 'Migrating to \K\S+' \
                "$LOG_DIR/gitlab-rails/production.log" 2>/dev/null | tail -1 || echo "")
        fi
    fi

    # Also check for gitlab-ctl upgrade processes
    upgrade_pid=$(pgrep -f "gitlab-ctl upgrade" 2>/dev/null | head -1 || true)
    if [[ -n "$upgrade_pid" && "$migration_running" -eq 0 ]]; then
        migration_running=1
        migration_pid=$upgrade_pid
        migration_elapsed=$(ps -o etimes= -p "$upgrade_pid" 2>/dev/null | tr -d ' ' || true)
        migration_elapsed=${migration_elapsed:-0}
    fi

    cat <<EOF

# HELP gitlab_migration_running Whether a database migration is currently running
# TYPE gitlab_migration_running gauge
gitlab_migration_running $migration_running

# HELP gitlab_migration_pid PID of the running migration process
# TYPE gitlab_migration_pid gauge
gitlab_migration_pid $migration_pid

# HELP gitlab_migration_elapsed_seconds Elapsed time of the current migration in seconds
# TYPE gitlab_migration_elapsed_seconds gauge
gitlab_migration_elapsed_seconds $migration_elapsed
EOF

    if [[ -n "$current_migration_name" ]]; then
        # The name comes from a log file: escape what would break a label value
        current_migration_name=${current_migration_name//\\/\\\\}
        current_migration_name=${current_migration_name//\"/\\\"}
        echo ""
        echo "# HELP gitlab_migration_current_info Currently running migration name"
        echo "# TYPE gitlab_migration_current_info gauge"
        echo "gitlab_migration_current_info{name=\"$current_migration_name\"} 1"
    fi
}
|
|
|
|
#########################
|
|
### Reconfigure Status ###
|
|
#########################
|
|
|
|
# Report whether `gitlab-ctl reconfigure` (or its chef-client run) is active.
# Outputs:   gitlab_reconfigure_running and gitlab_reconfigure_elapsed_seconds
# Notes:     pgrep takes an extended regular expression, where alternation is
#            a bare `|`; the backslashed form `\|` is a literal pipe character
#            and never matched any process.
collect_reconfigure_status() {
    local reconfigure_running=0
    local reconfigure_elapsed=0
    local reconf_pid

    reconf_pid=$(pgrep -f "chef-client.*reconfigure|gitlab-ctl reconfigure" 2>/dev/null \
        | head -1 || true)

    if [[ -n "$reconf_pid" ]]; then
        reconfigure_running=1
        reconfigure_elapsed=$(ps -o etimes= -p "$reconf_pid" 2>/dev/null | tr -d ' ' || true)
        # Process may have exited between pgrep and ps
        reconfigure_elapsed=${reconfigure_elapsed:-0}
    fi

    cat <<EOF

# HELP gitlab_reconfigure_running Whether gitlab-ctl reconfigure is currently running
# TYPE gitlab_reconfigure_running gauge
gitlab_reconfigure_running $reconfigure_running

# HELP gitlab_reconfigure_elapsed_seconds Elapsed time of reconfigure in seconds
# TYPE gitlab_reconfigure_elapsed_seconds gauge
gitlab_reconfigure_elapsed_seconds $reconfigure_elapsed
EOF
}
|
|
|
|
#########################
|
|
### Background Migrations ###
|
|
#########################
|
|
|
|
# Report batched background migration counters and per-migration details.
# Globals:   DB_TYPE (read; treated as "none" when unset)
# Outputs:   gitlab_batched_migrations_* totals, plus one
#            gitlab_batched_migration_detail and one
#            gitlab_batched_migration_age_seconds sample per migration whose
#            status is 1, 2 or 4
# Notes:     - samples are emitted grouped by metric name, as the Prometheus
#              text format requires, instead of interleaving both families
#            - rows whose count is not a plain integer are skipped
#            - NOTE(review): the status codes (0 paused, 1 active, 2 queued,
#              3 finished, 4 failed) are taken as-is from the existing
#              mapping; other codes only add to the total. Confirm against
#              the GitLab version being monitored.
collect_background_migrations() {
    local batched_total=0 batched_finished=0 batched_running=0
    local batched_pending=0 batched_failed=0 batched_paused=0
    local batched_progress=0
    local db_type="${DB_TYPE:-none}"
    local query_result="" detail_result=""
    local status count

    if [[ "$db_type" != "none" ]]; then
        # Query batched_background_migrations table
        query_result=$(run_db_query "
            SELECT status, COUNT(*)
            FROM batched_background_migrations
            GROUP BY status;
        " || echo "")

        if [[ -n "$query_result" ]]; then
            while IFS='|' read -r status count; do
                count=${count// /}
                [[ "$count" =~ ^[0-9]+$ ]] || continue
                case "$status" in
                    0) batched_paused=$count ;;
                    1) batched_running=$count ;;   # active
                    2) batched_pending=$count ;;   # queued
                    3) batched_finished=$count ;;
                    4) batched_failed=$count ;;
                    *) ;;
                esac
                batched_total=$((batched_total + count))
            done <<< "$query_result"
        fi
    fi

    if [[ $batched_total -gt 0 ]]; then
        batched_progress=$(awk -v f="$batched_finished" -v t="$batched_total" \
            'BEGIN {printf "%.2f", (f / t) * 100}')
    fi

    cat <<EOF

# HELP gitlab_batched_migrations_total Total batched background migrations
# TYPE gitlab_batched_migrations_total gauge
gitlab_batched_migrations_total $batched_total

# HELP gitlab_batched_migrations_finished Finished batched background migrations
# TYPE gitlab_batched_migrations_finished gauge
gitlab_batched_migrations_finished $batched_finished

# HELP gitlab_batched_migrations_running Currently running batched background migrations
# TYPE gitlab_batched_migrations_running gauge
gitlab_batched_migrations_running $batched_running

# HELP gitlab_batched_migrations_pending Pending batched background migrations
# TYPE gitlab_batched_migrations_pending gauge
gitlab_batched_migrations_pending $batched_pending

# HELP gitlab_batched_migrations_failed Failed batched background migrations
# TYPE gitlab_batched_migrations_failed gauge
gitlab_batched_migrations_failed $batched_failed

# HELP gitlab_batched_migrations_paused Paused batched background migrations
# TYPE gitlab_batched_migrations_paused gauge
gitlab_batched_migrations_paused $batched_paused

# HELP gitlab_batched_migrations_progress_percent Percentage of batched migrations finished
# TYPE gitlab_batched_migrations_progress_percent gauge
gitlab_batched_migrations_progress_percent $batched_progress
EOF

    # Per-migration detail for running/pending ones
    if [[ "$db_type" != "none" ]]; then
        detail_result=$(run_db_query "
            SELECT id, job_class_name, table_name, status,
                   COALESCE(min_value, 0), COALESCE(max_value, 0), COALESCE(batch_size, 0),
                   EXTRACT(EPOCH FROM (NOW() - created_at))::int as age_seconds
            FROM batched_background_migrations
            WHERE status IN (1, 2, 4)
            ORDER BY id;
        " || echo "")
    fi

    local detail_samples="" age_samples=""
    local id job_class table_name batch_size age status_label
    if [[ -n "$detail_result" ]]; then
        while IFS='|' read -r id job_class table_name status _ _ batch_size age; do
            [[ -z "$id" ]] && continue
            case "$status" in
                1) status_label="running" ;;
                2) status_label="pending" ;;
                4) status_label="failed" ;;
                *) status_label="unknown" ;;
            esac
            detail_samples+="gitlab_batched_migration_detail{id=\"$id\",job_class=\"$job_class\",table_name=\"$table_name\",status=\"$status_label\"} $batch_size"$'\n'
            age_samples+="gitlab_batched_migration_age_seconds{id=\"$id\",job_class=\"$job_class\"} $age"$'\n'
        done <<< "$detail_result"
    fi

    if [[ -n "$detail_samples" ]]; then
        echo ""
        echo "# HELP gitlab_batched_migration_detail Details of active/pending/failed batched migrations"
        echo "# TYPE gitlab_batched_migration_detail gauge"
        printf '%s' "$detail_samples"
        echo "# HELP gitlab_batched_migration_age_seconds Age of batched migration in seconds"
        echo "# TYPE gitlab_batched_migration_age_seconds gauge"
        printf '%s' "$age_samples"
    fi
}
|
|
|
|
#########################
|
|
### Service Status ###
|
|
#########################
|
|
|
|
# Report per-service state parsed from `gitlab-ctl status`.
# Globals:   GITLAB_CTL_CMD (read)
# Outputs:   gitlab_service_up / _pid / _uptime_seconds per service, followed
#            by gitlab_services_total / _up / _down
# Notes:     - when the ctl command is missing, a single
#              gitlab_service_up{service="unknown"} 0 sample is emitted
#            - samples are grouped by metric name, as the Prometheus text
#              format requires, instead of interleaving the three families
#            - totals count every line starting with "run:" or "down:", even
#              if the rest of the line could not be parsed
collect_service_status() {
    if ! command -v "$GITLAB_CTL_CMD" &>/dev/null; then
        echo ""
        echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)"
        echo "# TYPE gitlab_service_up gauge"
        echo "gitlab_service_up{service=\"unknown\"} 0"
        return
    fi

    # gitlab-ctl status output format:
    #   run: alertmanager: (pid 7163) 22805249s; run: log: (pid 7160) 22805249s;
    #   down: sidekiq: 0s, normally up; run: log: (pid 7000) 22805249s;
    local run_re='^run: ([^:]+): \(pid ([0-9]+)\) ([0-9]+)s'
    local down_re='^down: ([^:]+):'

    local ctl_output line service
    local up_samples="" pid_samples="" uptime_samples=""
    local up_services=0 down_services=0 total_services=0

    # A non-zero exit status is tolerated here; whatever was printed is parsed
    ctl_output=$("$GITLAB_CTL_CMD" status 2>/dev/null || echo "")

    while IFS= read -r line; do
        case "$line" in
            run:*) up_services=$((up_services + 1)) ;;
            down:*) down_services=$((down_services + 1)) ;;
        esac

        if [[ "$line" =~ $run_re ]]; then
            service="${BASH_REMATCH[1]// /}"
            up_samples+="gitlab_service_up{service=\"$service\"} 1"$'\n'
            pid_samples+="gitlab_service_pid{service=\"$service\"} ${BASH_REMATCH[2]}"$'\n'
            uptime_samples+="gitlab_service_uptime_seconds{service=\"$service\"} ${BASH_REMATCH[3]}"$'\n'
        elif [[ "$line" =~ $down_re ]]; then
            service="${BASH_REMATCH[1]// /}"
            up_samples+="gitlab_service_up{service=\"$service\"} 0"$'\n'
            pid_samples+="gitlab_service_pid{service=\"$service\"} 0"$'\n'
            uptime_samples+="gitlab_service_uptime_seconds{service=\"$service\"} 0"$'\n'
        fi
    done <<< "$ctl_output"
    total_services=$((up_services + down_services))

    echo ""
    echo "# HELP gitlab_service_up Whether a GitLab service is running (1=up, 0=down)"
    echo "# TYPE gitlab_service_up gauge"
    printf '%s' "$up_samples"
    echo "# HELP gitlab_service_pid PID of the GitLab service"
    echo "# TYPE gitlab_service_pid gauge"
    printf '%s' "$pid_samples"
    echo "# HELP gitlab_service_uptime_seconds Uptime of the GitLab service in seconds"
    echo "# TYPE gitlab_service_uptime_seconds gauge"
    printf '%s' "$uptime_samples"

    cat <<EOF

# HELP gitlab_services_total Total number of GitLab services
# TYPE gitlab_services_total gauge
gitlab_services_total $total_services

# HELP gitlab_services_up Number of running GitLab services
# TYPE gitlab_services_up gauge
gitlab_services_up $up_services

# HELP gitlab_services_down Number of down GitLab services
# TYPE gitlab_services_down gauge
gitlab_services_down $down_services
EOF
}
|
|
|
|
#########################
|
|
### Version Info ###
|
|
#########################
|
|
|
|
# Report the installed GitLab version as an info-style metric.
# Globals:   GITLAB_RAKE_CMD (read)
# Outputs:   gitlab_version_info{version="..."} 1
# Notes:     - the version is the second field of the first line of
#              /opt/gitlab/version-manifest.txt when that file exists,
#              otherwise it is parsed from `gitlab:env:info`
#            - the rake fallback honours $GITLAB_RAKE_CMD like the rest of the
#              script instead of a hard-coded `gitlab-rake`
#            - an empty result is reported as "unknown", and only the first
#              match is used so the label can never span several lines
collect_version_info() {
    local version=""

    # Try multiple methods to get the version
    if [[ -f /opt/gitlab/version-manifest.txt ]]; then
        version=$(head -1 /opt/gitlab/version-manifest.txt | awk '{print $2}' || true)
    elif command -v "$GITLAB_RAKE_CMD" &>/dev/null; then
        version=$("$GITLAB_RAKE_CMD" gitlab:env:info 2>/dev/null \
            | grep -oP 'GitLab:\s+\K[\d.]+' | head -1 || true)
    fi
    version=${version:-unknown}

    cat <<EOF

# HELP gitlab_version_info GitLab version information
# TYPE gitlab_version_info gauge
gitlab_version_info{version="$version"} 1
EOF
}
|
|
|
|
#########################
|
|
### Database Health ###
|
|
#########################
|
|
|
|
# Execute one SQL statement and print the result as unaligned, tuples-only
# rows (-t -A).
# Globals:   DB_TYPE, DB_HOST, DB_PORT, DB_USER, DB_NAME, DB_PASS,
#            GITLAB_PSQL_CMD (read)
# Arguments: $1 - SQL text
# Returns:   status of the last client attempted; 1 when DB_TYPE is neither
#            "external" nor "local"
run_db_query() {
    local sql="$1"
    local -a flags=(-t -A -c "$sql")

    case "$DB_TYPE" in
        external)
            # External DB (AWS RDS, etc) - plain psql with explicit connection flags
            PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" "${flags[@]}" 2>/dev/null
            ;;
        local)
            # Local Omnibus DB - gitlab-psql wraps psql with the right socket
            # and user. Three invocation styles are tried in turn: with -d,
            # with -d through sudo as gitlab-psql, and finally with the
            # database name as a positional argument.
            if "$GITLAB_PSQL_CMD" "${flags[@]}" -d gitlabhq_production 2>/dev/null; then
                return 0
            fi
            if sudo -u gitlab-psql "$GITLAB_PSQL_CMD" "${flags[@]}" -d gitlabhq_production 2>/dev/null; then
                return 0
            fi
            "$GITLAB_PSQL_CMD" "${flags[@]}" gitlabhq_production 2>/dev/null
            ;;
        *)
            return 1
            ;;
    esac
}
|
|
|
|
# Print the value part of a single "key: value" YAML line.
# Arguments: $1 - the raw line
# Outputs:   the value with everything up to the first colon removed,
#            surrounding whitespace trimmed, and one leading/trailing double
#            quote and then one leading/trailing single quote stripped.
#            A line without a colon only loses trailing whitespace and quotes.
parse_yaml_value() {
    local value="$1"

    # Drop "<indent>key:" and the whitespace after the colon
    if [[ "$value" == *:* ]]; then
        value="${value#*:}"
        value="${value#"${value%%[![:space:]]*}"}"
    fi

    # Trim trailing whitespace
    value="${value%"${value##*[![:space:]]}"}"

    # Strip surrounding quotes (double first, then single)
    value="${value#\"}"
    value="${value%\"}"
    value="${value#\'}"
    value="${value%\'}"

    echo "$value"
}
|
|
|
|
# Try to install a PostgreSQL client with whichever package manager exists.
# Globals:   DB_TYPE (set to "none" when no usable psql is available afterwards)
# Outputs:   progress and errors through log()
# Notes:     requires root; package managers are probed in the order
#            amazon-linux-extras, apt-get, dnf, yum.
install_psql_client() {
    if (( EUID != 0 )); then
        log "ERROR: psql client not found. Run as root to auto-install, or install manually."
        DB_TYPE="none"
        return
    fi

    log "Attempting to install PostgreSQL client..."

    # Pick the first package manager that is present
    local candidate manager=""
    for candidate in amazon-linux-extras apt-get dnf yum; do
        if command -v "$candidate" &>/dev/null; then
            manager="$candidate"
            break
        fi
    done

    case "$manager" in
        amazon-linux-extras)
            amazon-linux-extras install postgresql14 -y &>/dev/null && log "Installed psql via amazon-linux-extras"
            ;;
        apt-get)
            apt-get update -qq && apt-get install -y -qq postgresql-client &>/dev/null && log "Installed postgresql-client via apt"
            ;;
        dnf)
            dnf install -y -q postgresql &>/dev/null && log "Installed postgresql via dnf"
            ;;
        yum)
            yum install -y -q postgresql &>/dev/null && log "Installed postgresql via yum"
            ;;
        *)
            log "ERROR: Cannot auto-install psql client. Install manually."
            DB_TYPE="none"
            return
            ;;
    esac

    # Verify install worked
    if command -v psql &>/dev/null; then
        return
    fi
    log "ERROR: psql still not found after install attempt"
    DB_TYPE="none"
}
|
|
|
|
# Try to install a newer PostgreSQL client (needed for SCRAM authentication).
# Outputs:   progress and errors through log(); without root only manual
#            upgrade hints are logged
# Returns:   always 0 - a failed upgrade is logged, never fatal
# Notes:     - installing the PGDG repository package is now checked instead
#              of being a bare command: under `set -e` a failure there (no
#              network, package already present, ...) aborted the exporter
#            - the manual hint shows the rpm macro as %{rhel}; the doubled
#              percent sign was printed literally
upgrade_psql_client() {
    if [[ $EUID -ne 0 ]]; then
        log "ERROR: Cannot upgrade psql — run as root. Or upgrade manually:"
        log " Amazon Linux 2: amazon-linux-extras install postgresql14 -y"
        log " RHEL/CentOS: yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-\$(rpm -E %{rhel})-x86_64/pgdg-redhat-repo-latest.noarch.rpm && yum install -y postgresql16"
        log " Ubuntu/Debian: apt install -y postgresql-client-16"
        return
    fi

    log "Attempting to upgrade PostgreSQL client for SCRAM support..."
    if command -v amazon-linux-extras &>/dev/null; then
        amazon-linux-extras install postgresql14 -y &>/dev/null && log "Upgraded psql via amazon-linux-extras" && return
    fi

    local pgdg_repo_rpm
    if command -v dnf &>/dev/null; then
        # shellcheck disable=SC1083
        pgdg_repo_rpm="https://download.postgresql.org/pub/repos/yum/reporpms/EL-$(rpm -E '%{rhel}')-x86_64/pgdg-redhat-repo-latest.noarch.rpm"
        if ! dnf install -y -q "$pgdg_repo_rpm" &>/dev/null; then
            log "WARNING: Could not install the PGDG repository package, trying postgresql16 anyway"
        fi
        dnf install -y -q postgresql16 &>/dev/null && log "Upgraded to postgresql16 via pgdg repo" && return
    elif command -v yum &>/dev/null; then
        # shellcheck disable=SC1083
        pgdg_repo_rpm="https://download.postgresql.org/pub/repos/yum/reporpms/EL-$(rpm -E '%{rhel}')-x86_64/pgdg-redhat-repo-latest.noarch.rpm"
        if ! yum install -y -q "$pgdg_repo_rpm" &>/dev/null; then
            log "WARNING: Could not install the PGDG repository package, trying postgresql16 anyway"
        fi
        yum install -y -q postgresql16 &>/dev/null && log "Upgraded to postgresql16 via pgdg repo" && return
    elif command -v apt-get &>/dev/null; then
        apt-get update -qq && apt-get install -y -qq postgresql-client-16 &>/dev/null && log "Upgraded to postgresql-client-16 via apt" && return
    fi

    log "ERROR: Auto-upgrade failed. Please upgrade manually."
}
|
|
|
|
# Work out how to reach the GitLab database.
# Globals:   GITLAB_DB_CONFIG, GITLAB_PSQL_CMD (read);
#            DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASS, DB_TYPE (written)
# Result:    DB_TYPE is "external" (remote host from database.yml, queried
#            with psql), "local" (Omnibus gitlab-psql works) or "none".
# Notes:     - the production section is everything from the `production:`
#              line up to, but not including, the next line that starts with
#              a lowercase letter; if production is the last section nothing
#              is cut off any more (the previous `head -n -1` dropped the
#              final line of the file, e.g. the password)
#            - a key missing from database.yml keeps its default instead of
#              aborting the script through `set -e` / `pipefail`
#            - when psql had to be installed first and is available
#              afterwards, the external database is used right away
detect_database_config() {
    DB_HOST=""
    DB_PORT="5432"
    DB_NAME="gitlabhq_production"
    DB_USER=""
    DB_PASS=""
    DB_TYPE="none"

    # Parse GitLab's database.yml to detect external DB
    if [[ -f "$GITLAB_DB_CONFIG" ]]; then
        local prod_section
        prod_section=$(awk '
            /^production:/   { inside = 1; print; next }
            inside && /^[a-z]/ { exit }
            inside           { print }
        ' "$GITLAB_DB_CONFIG" 2>/dev/null || true)

        local raw_host raw_port raw_name raw_user raw_pass
        raw_host=$(grep -m1 '^\s*host:' <<< "$prod_section" || true)
        raw_port=$(grep -m1 '^\s*port:' <<< "$prod_section" || true)
        raw_name=$(grep -m1 '^\s*database:' <<< "$prod_section" || true)
        raw_user=$(grep -m1 '^\s*username:' <<< "$prod_section" || true)
        raw_pass=$(grep -m1 '^\s*password:' <<< "$prod_section" || true)

        if [[ -n "$raw_host" ]]; then DB_HOST=$(parse_yaml_value "$raw_host"); fi
        if [[ -n "$raw_port" ]]; then DB_PORT=$(parse_yaml_value "$raw_port"); fi
        if [[ -n "$raw_name" ]]; then DB_NAME=$(parse_yaml_value "$raw_name"); fi
        if [[ -n "$raw_user" ]]; then DB_USER=$(parse_yaml_value "$raw_user"); fi
        if [[ -n "$raw_pass" ]]; then DB_PASS=$(parse_yaml_value "$raw_pass"); fi

        log "DB config parsed: host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER pass=<redacted>"

        # If host is set and not localhost/socket path, treat as external DB
        if [[ -n "$DB_HOST" && "$DB_HOST" != "localhost" && "$DB_HOST" != "127.0.0.1" && ! "$DB_HOST" =~ ^/ ]]; then
            if ! command -v psql &>/dev/null; then
                log "WARNING: External DB detected at $DB_HOST but psql client not found"
                install_psql_client
                if ! command -v psql &>/dev/null; then
                    DB_TYPE="none"
                    return 0
                fi
            fi

            DB_TYPE="external"
            log "Detected external database at $DB_HOST:$DB_PORT"

            # Verify connectivity
            local test_result
            if test_result=$(PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT 1;" 2>&1); then
                log "External DB connection test: OK"
            elif grep -qi "SCRAM.*libpq" <<< "$test_result"; then
                log "ERROR: $test_result"
                log "The installed psql client is too old for SCRAM authentication."
                upgrade_psql_client
            else
                log "WARNING: External DB connection test failed: $test_result"
            fi
            return 0
        fi
    else
        log "WARNING: Database config not found at $GITLAB_DB_CONFIG"
    fi

    # Fall back to local gitlab-psql
    if command -v "$GITLAB_PSQL_CMD" &>/dev/null; then
        if "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d gitlabhq_production &>/dev/null \
            || sudo -u gitlab-psql "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" -d gitlabhq_production &>/dev/null \
            || "$GITLAB_PSQL_CMD" -t -A -c "SELECT 1;" gitlabhq_production &>/dev/null; then
            DB_TYPE="local"
        else
            log "WARNING: gitlab-psql found but cannot connect — check permissions (run as root?)"
            DB_TYPE="none"
        fi
    fi
}
|
|
|
|
# Print database health gauges: reachability, type, size, connection count,
# waiting locks and the newest applied schema version.
# Globals:   DB_TYPE (read)
# Outputs:   nothing when DB_TYPE is "none"; otherwise the gitlab_database_*
#            metrics. Every figure stays 0 when the connectivity probe fails,
#            and an empty query answer is reported as 0 as well.
collect_database_health() {
    [[ "$DB_TYPE" == "none" ]] && return

    local db_up=0 db_size=0 db_connections=0 db_locks=0
    local schema_version="0"
    local type_code=2

    # Everything below depends on the connectivity probe succeeding
    if run_db_query "SELECT 1;" &>/dev/null; then
        db_up=1

        # Database size
        db_size=$(run_db_query "SELECT pg_database_size(current_database());" || true)
        db_size=${db_size// /}
        db_size=${db_size:-0}

        # Active connections
        db_connections=$(run_db_query "SELECT count(*) FROM pg_stat_activity WHERE datname = current_database();" || true)
        db_connections=${db_connections// /}
        db_connections=${db_connections:-0}

        # Waiting locks (a high count can indicate migration issues)
        db_locks=$(run_db_query "SELECT count(*) FROM pg_locks WHERE NOT granted;" || true)
        db_locks=${db_locks// /}
        db_locks=${db_locks:-0}

        # Latest applied schema migration
        schema_version=$(run_db_query "SELECT MAX(version) FROM schema_migrations;" || true)
        schema_version=${schema_version// /}
        schema_version=${schema_version:-0}
    fi

    if [[ "$DB_TYPE" == "local" ]]; then
        type_code=1
    fi

    cat <<EOF

# HELP gitlab_database_up Whether the GitLab database is reachable
# TYPE gitlab_database_up gauge
gitlab_database_up $db_up

# HELP gitlab_database_type Database type (1=local, 2=external)
# TYPE gitlab_database_type gauge
gitlab_database_type{type="$DB_TYPE"} $type_code

# HELP gitlab_database_size_bytes Size of the GitLab database in bytes
# TYPE gitlab_database_size_bytes gauge
gitlab_database_size_bytes $db_size

# HELP gitlab_database_connections Active database connections
# TYPE gitlab_database_connections gauge
gitlab_database_connections $db_connections

# HELP gitlab_database_waiting_locks Number of waiting (not granted) locks
# TYPE gitlab_database_waiting_locks gauge
gitlab_database_waiting_locks $db_locks

# HELP gitlab_database_schema_version Latest applied schema migration version
# TYPE gitlab_database_schema_version gauge
gitlab_database_schema_version $schema_version
EOF
}
|
|
|
|
#########################
|
|
### Log Tail Metrics ###
|
|
#########################
|
|
|
|
# Count, case-insensitively, the lines among the last $2 lines of file $1
# that match the extended regular expression $3. Prints the count.
_count_recent_matches() {
    tail -n "$2" "$1" 2>/dev/null | grep -ciE "$3" || true
}

# Report how many recent log lines look like migration or reconfigure trouble.
# Globals:   LOG_DIR (read)
# Outputs:   gitlab_migration_log_errors and gitlab_migration_log_warnings
#            (last 1000 lines of gitlab-rails/production.log) and
#            gitlab_reconfigure_log_errors (last 500 lines of
#            reconfigure/latest); a missing file counts as 0.
collect_log_metrics() {
    local rails_log="$LOG_DIR/gitlab-rails/production.log"
    local reconfigure_log="$LOG_DIR/reconfigure/latest"
    local migration_errors=0 migration_warnings=0 reconfigure_errors=0

    # Check production.log for recent migration activity
    if [[ -f "$rails_log" ]]; then
        migration_errors=$(_count_recent_matches "$rails_log" 1000 \
            'migration.*error|error.*migration|ActiveRecord::StatementInvalid')
        migration_warnings=$(_count_recent_matches "$rails_log" 1000 \
            'migration.*warning|warning.*migration|deprecated')
    fi

    # Check reconfigure log
    if [[ -f "$reconfigure_log" ]]; then
        reconfigure_errors=$(_count_recent_matches "$reconfigure_log" 500 \
            'error|fatal|failed')
    fi

    cat <<EOF

# HELP gitlab_migration_log_errors Recent migration-related errors in logs
# TYPE gitlab_migration_log_errors gauge
gitlab_migration_log_errors $migration_errors

# HELP gitlab_migration_log_warnings Recent migration-related warnings in logs
# TYPE gitlab_migration_log_warnings gauge
gitlab_migration_log_warnings $migration_warnings

# HELP gitlab_reconfigure_log_errors Recent errors in reconfigure log
# TYPE gitlab_reconfigure_log_errors gauge
gitlab_reconfigure_log_errors $reconfigure_errors
EOF
}
|
|
|
|
#########################
|
|
### Collect All Metrics ###
|
|
#########################
|
|
|
|
# Print the complete metrics document: a three-line comment header (title,
# host, collection time) and a blank line, followed by the output of every
# collector, with one empty line between consecutive collectors.
collect_all_metrics() {
    local host_label
    host_label=$(hostname -f 2>/dev/null || hostname)

    printf '%s\n' \
        "# GitLab Migration Metrics Exporter" \
        "# Host: $host_label" \
        "# Collected at: $(date -Iseconds)" \
        ""

    # Order matters: it is the order of the sections in the output
    local -a collectors=(
        collect_version_info
        collect_migration_status
        collect_running_migration
        collect_reconfigure_status
        collect_background_migrations
        collect_service_status
        collect_database_health
        collect_log_metrics
    )

    local idx
    for idx in "${!collectors[@]}"; do
        if [[ $idx -gt 0 ]]; then
            echo ""
        fi
        "${collectors[idx]}"
    done
}
|
|
|
|
#########################
|
|
### HTTP Server ###
|
|
#########################
|
|
|
|
# Write a complete HTTP/1.1 response to stdout.
# Arguments: $1 status line text (e.g. "200 OK"), $2 content type, $3 body
# Notes:     header lines end in CRLF and Content-Length is the byte length
#            of exactly the body that is sent.
_http_respond() {
    local status="$1" content_type="$2" body="$3" length
    length=$(printf '%s' "$body" | wc -c | tr -d '[:space:]')
    printf 'HTTP/1.1 %s\r\nContent-Type: %s\r\nContent-Length: %s\r\nConnection: close\r\n\r\n%s' \
        "$status" "$content_type" "$length" "$body"
}

# Serve one HTTP request read from stdin (socat runs this once per
# connection, each time in a fresh process).
# Globals:   METRICS_CACHE, SCRAPE_INTERVAL (read); DB_TYPE and friends are
#            detected on demand when this process does not have them yet
# Routes:    /metrics and /  -> metrics document (200)
#            /health, /healthz -> "OK" (200)
#            anything else   -> 404; a query string is ignored
# Notes:     - cache freshness is judged by the mtime of $METRICS_CACHE; an
#              in-memory timestamp cannot work because every request is a new
#              process, so the cache was never actually used
#            - the cache is replaced via rename so parallel requests never
#              read a partial file; failing to store it is not fatal
#            - a failing collector run yields a 500 instead of an empty reply
handle_request() {
    local request_line="" line path=""

    # Read the request head; only the first line matters
    while IFS= read -r line; do
        line="${line%%$'\r'}"
        if [[ -z "$line" ]]; then
            break
        fi
        if [[ -z "$request_line" ]]; then
            request_line="$line"
        fi
    done

    read -r _ path _ <<< "$request_line" || true
    path="${path%%\?*}"

    case "$path" in
        /metrics|/)
            local now cache_mtime=0 metrics="" staged
            now=$(date +%s)

            if [[ -f "$METRICS_CACHE" ]]; then
                cache_mtime=$(stat -c %Y "$METRICS_CACHE" 2>/dev/null || echo "0")
            fi

            # Use cached metrics if within interval
            if [[ -s "$METRICS_CACHE" ]] && (( now - cache_mtime < SCRAPE_INTERVAL )); then
                metrics=$(cat "$METRICS_CACHE")
            else
                # This process never ran setup(); make sure the collectors
                # find the database settings they read.
                if [[ -z "${DB_TYPE:-}" ]]; then
                    detect_database_config >/dev/null
                fi

                if ! metrics=$(collect_all_metrics); then
                    _http_respond "500 Internal Server Error" "text/plain" \
                        "metrics collection failed"$'\n'
                    return 0
                fi

                mkdir -p "${METRICS_CACHE%/*}" 2>/dev/null || true
                if staged=$(mktemp "${METRICS_CACHE}.XXXXXX" 2>/dev/null); then
                    if printf '%s\n' "$metrics" > "$staged" && mv -f "$staged" "$METRICS_CACHE"; then
                        :
                    else
                        rm -f "$staged"
                    fi
                fi
            fi

            _http_respond "200 OK" "text/plain; charset=utf-8" "$metrics"$'\n'
            ;;
        /health|/healthz)
            _http_respond "200 OK" "text/plain" "OK"$'\n'
            ;;
        *)
            _http_respond "404 Not Found" "text/plain" "Not Found"$'\n'
            ;;
    esac
}
|
|
|
|
# Run the HTTP endpoint: socat accepts connections and starts this script
# with --handle-request for each one. Never returns on its own; socat is
# restarted after a 5 second pause whenever it exits with an error.
# Globals:   LISTEN_PORT, SCRAPE_INTERVAL, RAKE_INTERVAL, LOG_DIR, DB_* (read
#            and exported)
# Notes:     the per-request processes start from scratch and only see the
#            environment. Settings given on the command line and the database
#            configuration detected during setup are therefore exported under
#            the names the script reads at start-up; otherwise the handlers
#            would fall back to defaults and have no DB_TYPE at all.
start_server() {
    log "Starting GitLab Migration Metrics Exporter on port $LISTEN_PORT"
    log "Metrics available at http://localhost:$LISTEN_PORT/metrics"
    log "Collection interval: ${SCRAPE_INTERVAL}s"

    export SCRAPE_INTERVAL RAKE_INTERVAL
    export GITLAB_EXPORTER_PORT="$LISTEN_PORT"
    export GITLAB_LOG_DIR="$LOG_DIR"
    export DB_TYPE DB_HOST DB_PORT DB_NAME DB_USER DB_PASS

    while true; do
        socat TCP-LISTEN:"$LISTEN_PORT",reuseaddr,fork EXEC:"$0 --handle-request" 2>/dev/null || {
            log "Server error, restarting in 5 seconds..."
            sleep 5
        }
    done
}
|
|
|
|
#########################
|
|
### Main ###
|
|
#########################
|
|
|
|
# Entry point: parse options, prepare the environment, then run in one of
# three modes - HTTP server, file output (atomic replace), or stdout.
# Arguments: the script's command line
# Exits:     1 in file mode when collection fails or yields fewer than 10
#            lines; the previous output file is left untouched in that case.
main() {
    parse_args "$@"
    setup

    if [[ "$HTTP_MODE" == "true" ]]; then
        start_server
        return
    fi

    if [[ -z "$OUTPUT_FILE" ]]; then
        # Default: output to stdout
        collect_all_metrics
        return
    fi

    # Textfile collector mode: write atomically using temp file
    local target_dir staging line_count
    target_dir="$(dirname "$OUTPUT_FILE")"
    mkdir -p "$target_dir"
    staging=$(mktemp "${target_dir}/.gitlab_migration_metrics.XXXXXX")

    if ! collect_all_metrics > "$staging" 2>/dev/null; then
        rm -f "$staging"
        log "ERROR: Failed to generate metrics"
        exit 1
    fi

    # Guard against publishing a truncated document
    line_count=$(wc -l < "$staging" 2>/dev/null || echo 0)
    if [[ "$line_count" -lt 10 ]]; then
        rm -f "$staging"
        log "ERROR: Metrics file too small ($line_count lines), keeping previous"
        exit 1
    fi

    chmod 644 "$staging"
    mv -f "$staging" "$OUTPUT_FILE"
    log "Metrics written to $OUTPUT_FILE ($line_count lines)"
}
|
|
|
|
main "$@"
|