#!/bin/bash
###############################################################################
# configure-openshift-metrics.sh
#
# Configure an external Prometheus server to receive metrics from OpenShift.
# Supports federation (pull) and remote write (push) modes.
#
# Usage:
#   sudo ./configure-openshift-metrics.sh --method federation \
#     --openshift-url ROUTE --cluster-name NAME
#
#   sudo ./configure-openshift-metrics.sh --method remote-write \
#     --prometheus-url URL --cluster-name NAME
#
# Requirements:
#   - Root or sudo access on the Prometheus server
#   - oc CLI logged in with cluster-admin (unless --skip-openshift)
#   - Prometheus installed via binary (not containerized)
#
# https://mylinux.work/guides/openshift-metrics-to-external-prometheus/
###############################################################################

# Abort on unhandled errors, unset variables, and failures inside pipelines.
set -euo pipefail

VERSION="1.0"

#------------------------------------------------------------------------------
# Defaults
#------------------------------------------------------------------------------
METHOD="federation"                 # "federation" or "remote-write"; main dispatches on it
OPENSHIFT_URL=""                    # federation source (see --openshift-url in the header)
PROMETHEUS_URL=""                   # remote-write target (see --prometheus-url in the header)
CLUSTER_NAME="openshift"            # value of the cluster="..." label used in rules and queries
PROMETHEUS_CONFIG="/etc/prometheus/prometheus.yml"  # config that is backed up and appended to
PROMETHEUS_SERVICE="prometheus"     # systemd unit that is reloaded/restarted
RULES_DIR="/etc/prometheus/rules"   # where openshift-rules.yml / openshift-alerts.yml are written
TOKEN_FILE="/etc/prometheus/openshift-token"        # where the service-account token is stored
PROMETHEUS_USER="prometheus"        # user and group given ownership of generated files
SKIP_OPENSHIFT=false                # true: reuse the token in TOKEN_FILE instead of calling oc
SKIP_RULES=false                    # true: do not generate or validate rule files
DRY_RUN=false                       # true: report intended actions without applying them
OC_NAMESPACE="openshift-monitoring" # namespace of the service account
SA_NAME="prometheus-external"       # service account created for the external Prometheus
TOKEN_DURATION="8760h"              # passed to `oc create token --duration`

#------------------------------------------------------------------------------
# Colors and logging
#------------------------------------------------------------------------------
# ANSI escape sequences, stored with a literal backslash and expanded at print
# time by printf's %b conversion.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Each helper prints one line: a colored "[openshift-metrics]" tag, a space,
# then the message ($1). Backslash escapes in the message are expanded, exactly
# as `echo -e` would. A missing argument prints the bare tag instead of
# tripping `set -u`.
#   log   - green,  stdout
#   warn  - yellow, stdout
#   error - red,    stderr
#   info  - blue,   stdout
log()   { printf '%b\n' "${GREEN}[openshift-metrics]${NC} ${1-}"; }
warn()  { printf '%b\n' "${YELLOW}[openshift-metrics]${NC} ${1-}"; }
error() { printf '%b\n' "${RED}[openshift-metrics]${NC} ${1-}" >&2; }
info()  { printf '%b\n' "${BLUE}[openshift-metrics]${NC} ${1-}"; }
# NOTE(review): everything from the "Usage" banner through print_summary is
# stored with its line breaks collapsed, and three spans are visibly truncated.
# They must be restored from the upstream script before this file can run:
#   1. usage(): the text after `cat <` is missing up to the promtool check of
#      the prerequisite-validation code that follows it.
#   2. generate_federation_config(): the text after `cat <` is missing up to
#      the `> 0.9` threshold of the first surviving alert expression.
#   3. apply_remote_write(): the text after `cat > "$web_config_file" <` is
#      missing up to the redirect into "$RULES_DIR/openshift-rules.yml".
# Functions that survive intact in this span: backup_config,
# setup_openshift_sa, apply_federation, validate_and_reload, print_summary.
#
# NOTE(review), to confirm once the file is restored:
#   - backup_config runs `mkdir -p` before it tests DRY_RUN, so a dry run still
#     creates the backups directory.
#   - setup_openshift_sa looks for a clusterrolebinding named
#     "${SA_NAME}-monitoring-view", but the grant command passes no binding
#     name; presumably the check never matches — verify against the oc docs.
#   - setup_openshift_sa writes the token with `echo >` and only then runs
#     chmod 600, so the file briefly has default-umask permissions.
#   - apply_remote_write logs the generated password and passes it to htpasswd
#     on the command line, with "" as the user name.
#   - validate_and_reload restores the newest backup only when the config check
#     fails; a failed rule check exits without restoring anything.
#------------------------------------------------------------------------------ # Usage #------------------------------------------------------------------------------ usage() { cat </dev/null; then warn "promtool not found — config validation will be skipped" fi if [[ "$SKIP_OPENSHIFT" == false ]] && ! command -v oc &>/dev/null; then error "oc CLI not found. Install it or use --skip-openshift with an existing token" exit 1 fi } #------------------------------------------------------------------------------ # Backup existing config #------------------------------------------------------------------------------ backup_config() { local backup_dir backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups" mkdir -p "$backup_dir" local timestamp timestamp=$(date +%F_%H%M%S) local backup_file="${backup_dir}/prometheus.yml.${timestamp}" if $DRY_RUN; then info "[dry-run] Would backup $PROMETHEUS_CONFIG to $backup_file" else cp "$PROMETHEUS_CONFIG" "$backup_file" log "Backed up config to $backup_file" fi } #------------------------------------------------------------------------------ # OpenShift: Create service account and token #------------------------------------------------------------------------------ setup_openshift_sa() { if $SKIP_OPENSHIFT; then if [[ -f "$TOKEN_FILE" ]]; then log "Using existing token from $TOKEN_FILE" else error "No token found at $TOKEN_FILE. Provide a token or remove --skip-openshift." exit 1 fi return fi log "Setting up OpenShift service account..." # Check oc is logged in if ! oc whoami &>/dev/null; then error "Not logged into OpenShift. 
Run: oc login " exit 1 fi local cluster_info cluster_info=$(oc whoami --show-server 2>/dev/null || echo "unknown") log "Connected to: $cluster_info" if $DRY_RUN; then info "[dry-run] Would create service account $SA_NAME in $OC_NAMESPACE" info "[dry-run] Would grant cluster-monitoring-view role" info "[dry-run] Would generate token with duration $TOKEN_DURATION" return fi # Create service account (ignore if exists) if oc get serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" &>/dev/null; then warn "Service account $SA_NAME already exists in $OC_NAMESPACE" else oc create serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" log "Created service account: $SA_NAME" fi # Grant cluster-monitoring-view role if oc get clusterrolebinding "${SA_NAME}-monitoring-view" &>/dev/null 2>&1; then warn "Role binding already exists" else oc adm policy add-cluster-role-to-user cluster-monitoring-view \ -z "$SA_NAME" -n "$OC_NAMESPACE" log "Granted cluster-monitoring-view role" fi # Generate token local token token=$(oc create token "$SA_NAME" -n "$OC_NAMESPACE" --duration="$TOKEN_DURATION") echo "$token" > "$TOKEN_FILE" chmod 600 "$TOKEN_FILE" chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$TOKEN_FILE" log "Token saved to $TOKEN_FILE (expires in $TOKEN_DURATION)" } #------------------------------------------------------------------------------ # Generate federation scrape config #------------------------------------------------------------------------------ generate_federation_config() { cat < 0.9 for: 10m labels: severity: warning annotations: summary: "High CPU on OpenShift node {{ \$labels.instance }}" description: "CPU usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})." - alert: OpenShiftNodeHighMemory expr: openshift:node_memory_utilization:ratio > 0.9 for: 10m labels: severity: warning annotations: summary: "High memory on OpenShift node {{ \$labels.instance }}" description: "Memory usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})." 
- alert: OpenShiftPodCrashLooping expr: rate(kube_pod_container_status_restarts_total{cluster="${CLUSTER_NAME}"}[15m]) * 60 * 5 > 0 for: 5m labels: severity: warning annotations: summary: "Pod {{ \$labels.namespace }}/{{ \$labels.pod }} is crash looping" description: "Pod has restarted {{ \$value | humanize }} times in the last 15 minutes." - alert: OpenShiftDeploymentReplicasMismatch expr: | kube_deployment_spec_replicas{cluster="${CLUSTER_NAME}"} != kube_deployment_status_ready_replicas{cluster="${CLUSTER_NAME}"} for: 10m labels: severity: warning annotations: summary: "Deployment {{ \$labels.namespace }}/{{ \$labels.deployment }} replica mismatch" description: "Deployment does not have expected number of ready replicas." - alert: OpenShiftEtcdLeaderChanges expr: increase(etcd_server_leader_changes_seen_total{cluster="${CLUSTER_NAME}"}[1h]) > 3 for: 5m labels: severity: warning annotations: summary: "Frequent etcd leader changes on {{ \$labels.cluster }}" description: "etcd leader changed {{ \$value | humanize }} times in the last hour." YAML } #------------------------------------------------------------------------------ # Apply federation configuration #------------------------------------------------------------------------------ apply_federation() { log "Configuring federation from $OPENSHIFT_URL..." # Set up OpenShift service account and token setup_openshift_sa # Backup existing config backup_config # Generate and append federation scrape config local federation_config federation_config=$(generate_federation_config) if $DRY_RUN; then info "[dry-run] Would append to $PROMETHEUS_CONFIG:" echo "$federation_config" else # Check if the job already exists if grep -q 'job_name: "openshift-federate"' "$PROMETHEUS_CONFIG" 2>/dev/null; then warn "Federation job 'openshift-federate' already exists in $PROMETHEUS_CONFIG" warn "Remove the existing job first or edit it manually." 
return 1 fi echo "$federation_config" >> "$PROMETHEUS_CONFIG" chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG" log "Federation scrape job added to $PROMETHEUS_CONFIG" fi # Generate rules if [[ "$SKIP_RULES" == false ]]; then generate_rules fi # Validate and reload validate_and_reload } #------------------------------------------------------------------------------ # Apply remote write configuration #------------------------------------------------------------------------------ apply_remote_write() { log "Configuring remote write to $PROMETHEUS_URL..." # Backup existing config backup_config # Enable remote write receiver local service_file="/etc/systemd/system/${PROMETHEUS_SERVICE}.service" if [[ -f "$service_file" ]]; then if grep -q "web.enable-remote-write-receiver" "$service_file"; then log "Remote write receiver already enabled" else if $DRY_RUN; then info "[dry-run] Would add --web.enable-remote-write-receiver to $service_file" else warn "You need to add --web.enable-remote-write-receiver to your Prometheus service." 
warn "Edit $service_file and add the flag to ExecStart, then run:" warn " sudo systemctl daemon-reload && sudo systemctl restart prometheus" echo "" fi fi fi # Generate basic auth credentials local rw_password rw_password=$(openssl rand -base64 24 2>/dev/null || head -c 24 /dev/urandom | base64) local rw_user="openshift" log "Generated remote write credentials:" log " Username: $rw_user" log " Password: $rw_password" echo "" # Generate web.yml with basic auth local web_config_file web_config_file="$(dirname "$PROMETHEUS_CONFIG")/web.yml" if command -v htpasswd &>/dev/null; then local hash hash=$(htpasswd -nbBC 12 "" "$rw_password" | tr -d ':\n') if $DRY_RUN; then info "[dry-run] Would create $web_config_file with basic_auth_users" else if [[ -f "$web_config_file" ]]; then warn "$web_config_file already exists — add this entry manually:" echo " $rw_user: \"$hash\"" else cat > "$web_config_file" < "$RULES_DIR/openshift-rules.yml" chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$RULES_DIR/openshift-rules.yml" log "Created $RULES_DIR/openshift-rules.yml" generate_alert_rules > "$RULES_DIR/openshift-alerts.yml" chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$RULES_DIR/openshift-alerts.yml" log "Created $RULES_DIR/openshift-alerts.yml" } #------------------------------------------------------------------------------ # Validate config and reload Prometheus #------------------------------------------------------------------------------ validate_and_reload() { if $DRY_RUN; then info "[dry-run] Would validate config and reload Prometheus" return fi # Validate with promtool if command -v promtool &>/dev/null; then log "Validating Prometheus configuration..." if ! promtool check config "$PROMETHEUS_CONFIG"; then error "Config validation failed. Restoring backup..." 
local backup_dir backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups" local latest_backup latest_backup=$(ls -t "$backup_dir"/prometheus.yml.* 2>/dev/null | head -1) if [[ -n "$latest_backup" ]]; then cp "$latest_backup" "$PROMETHEUS_CONFIG" log "Restored from $latest_backup" fi exit 1 fi log "Config validation passed" # Validate rules if [[ "$SKIP_RULES" == false ]]; then for rule_file in "$RULES_DIR"/openshift-*.yml; do if [[ -f "$rule_file" ]]; then if ! promtool check rules "$rule_file"; then error "Rule validation failed: $rule_file" exit 1 fi fi done log "Rule validation passed" fi fi # Reload Prometheus if systemctl is-active --quiet "$PROMETHEUS_SERVICE"; then systemctl reload "$PROMETHEUS_SERVICE" 2>/dev/null || \ systemctl restart "$PROMETHEUS_SERVICE" log "Prometheus reloaded" else warn "Prometheus service is not running. Start it with: sudo systemctl start $PROMETHEUS_SERVICE" fi } #------------------------------------------------------------------------------ # Print summary #------------------------------------------------------------------------------ print_summary() { echo "" echo "============================================" echo " OpenShift Metrics Configuration Complete" echo "============================================" echo "" echo " Method: $METHOD" echo " Cluster name: $CLUSTER_NAME" if [[ "$METHOD" == "federation" ]]; then echo " OpenShift URL: $OPENSHIFT_URL" echo " Token file: $TOKEN_FILE" else echo " Prometheus URL: $PROMETHEUS_URL" fi echo " Config file: $PROMETHEUS_CONFIG" if [[ "$SKIP_RULES" == false ]]; then echo " Rules dir: $RULES_DIR" fi echo "" echo " Verify:" echo " - Check targets: http://localhost:9090/targets" if [[ "$METHOD" == "federation" ]]; then echo " - Test query: node_memory_MemAvailable_bytes{cluster=\"${CLUSTER_NAME}\"}" else echo " - Test query: up{cluster=\"${CLUSTER_NAME}\"}" fi echo "" } #------------------------------------------------------------------------------ # Main 
#------------------------------------------------------------------------------
# main: script entry point.
#
# Prints the version banner, runs the prerequisite check (validate), then
# dispatches on METHOD to apply_federation or apply_remote_write. The summary
# is printed only for real (non-dry-run) executions.
#
# Globals (read): VERSION, DRY_RUN, METHOD
# Arguments:      none used; "$@" is forwarded for convention only
# Exits:          1 when METHOD is neither "federation" nor "remote-write";
#                 otherwise whatever the called helpers propagate under set -e
main() {
  echo ""
  log "configure-openshift-metrics.sh v${VERSION}"
  echo ""

  validate

  # Compare as a string rather than executing the variable's value as a
  # command, so an unexpected value can never run an arbitrary program.
  if [[ "$DRY_RUN" == true ]]; then
    warn "DRY RUN — no changes will be made"
    echo ""
  fi

  case "$METHOD" in
    federation)
      apply_federation
      ;;
    remote-write)
      apply_remote_write
      ;;
    *)
      # Without this arm an unrecognised method would skip all work and still
      # print the "Complete" summary and "Done.".
      error "Unknown method: $METHOD (expected 'federation' or 'remote-write')"
      exit 1
      ;;
  esac

  if [[ "$DRY_RUN" != true ]]; then
    print_summary
  fi

  log "Done."
}

main "$@"