# linux-scripts/configure-openshift-metrics.sh

#!/bin/bash
###############################################################################
# configure-openshift-metrics.sh
#
# Configure an external Prometheus server to receive metrics from OpenShift.
# Supports federation (pull) and remote write (push) modes.
#
# Usage:
#   sudo ./configure-openshift-metrics.sh --method federation \
#     --openshift-url ROUTE --cluster-name NAME
#
#   sudo ./configure-openshift-metrics.sh --method remote-write \
#     --prometheus-url URL --cluster-name NAME
#
# Requirements:
#   - Root or sudo access on the Prometheus server
#   - oc CLI logged in with cluster-admin (unless --skip-openshift)
#   - Prometheus installed via binary (not containerized)
#
# https://mylinux.work/guides/openshift-metrics-to-external-prometheus/
###############################################################################

# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

VERSION='1.0'

#------------------------------------------------------------------------------
# Defaults
#------------------------------------------------------------------------------
# Transfer mode, endpoints and label (each has a command-line option).
METHOD='federation'
OPENSHIFT_URL=''
PROMETHEUS_URL=''
CLUSTER_NAME='openshift'

# Paths on the Prometheus host (each has a command-line option).
PROMETHEUS_CONFIG='/etc/prometheus/prometheus.yml'
RULES_DIR='/etc/prometheus/rules'
TOKEN_FILE='/etc/prometheus/openshift-token'

# Local service name and file owner (no command-line option).
PROMETHEUS_SERVICE='prometheus'
PROMETHEUS_USER='prometheus'

# Behavior switches, flipped to true by --skip-openshift / --skip-rules /
# --dry-run.
SKIP_OPENSHIFT=false
SKIP_RULES=false
DRY_RUN=false

# OpenShift-side objects used when creating the scrape token (no
# command-line option).
OC_NAMESPACE='openshift-monitoring'
SA_NAME='prometheus-external'
TOKEN_DURATION='8760h'

#------------------------------------------------------------------------------
# Colors and logging
#------------------------------------------------------------------------------
# ANSI color escape sequences, stored with a literal backslash and expanded
# at print time via printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# _log_line COLOR [MESSAGE]
# Print "[openshift-metrics] MESSAGE" with the tag in COLOR. Only the color
# codes go through %b; the message is printed with %s so that backslashes in
# paths, URLs or tokens are shown verbatim instead of being interpreted as
# escapes. A missing MESSAGE prints an empty message rather than tripping
# `set -u`.
_log_line() { printf '%b[openshift-metrics]%b %s\n' "$1" "$NC" "${2-}"; }

# Severity helpers: log/warn/info write to stdout, error writes to stderr.
log()   { _log_line "$GREEN" "${1-}"; }
warn()  { _log_line "$YELLOW" "${1-}"; }
error() { _log_line "$RED" "${1-}" >&2; }
info()  { _log_line "$BLUE" "${1-}"; }

#------------------------------------------------------------------------------
# Usage
#------------------------------------------------------------------------------
# Print the help text (version banner, option list, one example per mode) to
# stdout and terminate the script with status 0.
usage() {
    # Name the script was invoked as, echoed back in the usage examples.
    local invoked_as="$0"

    cat <<HELP
configure-openshift-metrics.sh v${VERSION}

Configure an external Prometheus to receive OpenShift metrics.

Usage:
  sudo ${invoked_as} [OPTIONS]

Options:
  --method METHOD          federation or remote-write (default: federation)
  --openshift-url URL      OpenShift Prometheus route hostname (federation)
  --prometheus-url URL     External Prometheus URL (remote-write)
  --cluster-name NAME      Label for metrics (default: openshift)
  --prometheus-config PATH Path to prometheus.yml (default: /etc/prometheus/prometheus.yml)
  --rules-dir PATH         Directory for rule files (default: /etc/prometheus/rules)
  --token-file PATH        Bearer token file path (default: /etc/prometheus/openshift-token)
  --skip-openshift         Skip oc commands (use existing token)
  --skip-rules             Skip recording/alert rule generation
  --dry-run                Show what would be done without making changes
  --help                   Show this help message

Federation example:
  sudo ${invoked_as} --method federation \\
    --openshift-url prometheus-k8s-openshift-monitoring.apps.cluster.example.com \\
    --cluster-name production

Remote write example:
  sudo ${invoked_as} --method remote-write \\
    --prometheus-url https://prometheus.example.com:9090 \\
    --cluster-name production
HELP

    exit 0
}

#------------------------------------------------------------------------------
# Parse arguments
#------------------------------------------------------------------------------
# require_value OPTION [VALUE ...]
# Called with the remaining command line. Aborts with a readable message when
# OPTION is the last argument and therefore has no value; without this check
# the script would die on an "unbound variable" error from `set -u`.
require_value() {
    if [[ $# -lt 2 ]]; then
        error "Option $1 requires a value"
        echo "Use --help for usage." >&2
        exit 1
    fi
}

# parse_args ARGS...
# Translate command-line options into the global settings. Options that take
# a value consume two arguments, flags consume one. --help/-h prints the usage
# text via usage(); an unknown option aborts with exit status 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --method)            require_value "$@"; METHOD="$2"; shift 2 ;;
            --openshift-url)     require_value "$@"; OPENSHIFT_URL="$2"; shift 2 ;;
            --prometheus-url)    require_value "$@"; PROMETHEUS_URL="$2"; shift 2 ;;
            --cluster-name)      require_value "$@"; CLUSTER_NAME="$2"; shift 2 ;;
            --prometheus-config) require_value "$@"; PROMETHEUS_CONFIG="$2"; shift 2 ;;
            --rules-dir)         require_value "$@"; RULES_DIR="$2"; shift 2 ;;
            --token-file)        require_value "$@"; TOKEN_FILE="$2"; shift 2 ;;
            --skip-openshift)    SKIP_OPENSHIFT=true; shift ;;
            --skip-rules)        SKIP_RULES=true; shift ;;
            --dry-run)           DRY_RUN=true; shift ;;
            --help|-h)           usage ;;
            *)
                error "Unknown option: $1"
                # Diagnostics belong on stderr together with the error line.
                echo "Use --help for usage." >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

#------------------------------------------------------------------------------
# Validation
#------------------------------------------------------------------------------
# Check the parsed settings and the local environment before anything is
# changed. Exits with status 1 on the first failed check:
#   - METHOD must be "federation" or "remote-write"
#   - the URL option belonging to the chosen method must be non-empty
#   - the script must run with EUID 0
#   - PROMETHEUS_CONFIG must be an existing regular file
#   - oc must be installed when federation mode will call it
# A missing promtool only produces a warning.
validate() {
    if [[ "$METHOD" != "federation" && "$METHOD" != "remote-write" ]]; then
        error "Invalid method: $METHOD (must be federation or remote-write)"
        exit 1
    fi

    if [[ "$METHOD" == "federation" && -z "$OPENSHIFT_URL" ]]; then
        error "--openshift-url is required for federation mode"
        exit 1
    fi

    if [[ "$METHOD" == "remote-write" && -z "$PROMETHEUS_URL" ]]; then
        error "--prometheus-url is required for remote-write mode"
        exit 1
    fi

    if [[ "$EUID" -ne 0 ]]; then
        error "This script must be run as root or with sudo"
        exit 1
    fi

    if [[ ! -f "$PROMETHEUS_CONFIG" ]]; then
        error "Prometheus config not found: $PROMETHEUS_CONFIG"
        exit 1
    fi

    if ! command -v promtool &>/dev/null; then
        warn "promtool not found — config validation will be skipped"
    fi

    # Only the federation path runs oc (to create the service account and
    # token). Remote-write mode just prints the oc commands for the operator,
    # so it must not fail on a Prometheus host that has no oc installed.
    if [[ "$METHOD" == "federation" && "$SKIP_OPENSHIFT" == false ]] \
        && ! command -v oc &>/dev/null; then
        error "oc CLI not found. Install it or use --skip-openshift with an existing token"
        exit 1
    fi
}

#------------------------------------------------------------------------------
# Backup existing config
#------------------------------------------------------------------------------
# Copy PROMETHEUS_CONFIG to <config dir>/backups/prometheus.yml.<timestamp>,
# where the timestamp is `date +%F_%H%M%S`. In dry-run mode nothing is
# touched: the backup directory is not created and only the intended action
# is reported.
backup_config() {
    local backup_dir timestamp backup_file
    backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"
    timestamp=$(date +%F_%H%M%S)
    backup_file="${backup_dir}/prometheus.yml.${timestamp}"

    if $DRY_RUN; then
        info "[dry-run] Would backup $PROMETHEUS_CONFIG to $backup_file"
        return
    fi

    # Create the directory only on a real run so that --dry-run leaves the
    # filesystem untouched.
    mkdir -p "$backup_dir"
    cp "$PROMETHEUS_CONFIG" "$backup_file"
    log "Backed up config to $backup_file"
}

#------------------------------------------------------------------------------
# OpenShift: Create service account and token
#------------------------------------------------------------------------------
# Make sure a bearer token for scraping OpenShift exists in TOKEN_FILE.
#
# With --skip-openshift the function only checks that TOKEN_FILE exists and
# exits 1 otherwise. Without it, the function uses the logged-in oc session to
#   1. create service account SA_NAME in OC_NAMESPACE (unless present),
#   2. bind cluster-monitoring-view to it under a fixed binding name,
#   3. mint a token valid for TOKEN_DURATION and store it in TOKEN_FILE,
#      readable only by PROMETHEUS_USER.
# In dry-run mode the oc login is still verified, but nothing is created.
setup_openshift_sa() {
    if $SKIP_OPENSHIFT; then
        if [[ -f "$TOKEN_FILE" ]]; then
            log "Using existing token from $TOKEN_FILE"
        else
            error "No token found at $TOKEN_FILE. Provide a token or remove --skip-openshift."
            exit 1
        fi
        return
    fi

    log "Setting up OpenShift service account..."

    # Check oc is logged in
    if ! oc whoami &>/dev/null; then
        error "Not logged into OpenShift. Run: oc login <cluster-url>"
        exit 1
    fi

    local cluster_info
    cluster_info=$(oc whoami --show-server 2>/dev/null || echo "unknown")
    log "Connected to: $cluster_info"

    if $DRY_RUN; then
        info "[dry-run] Would create service account $SA_NAME in $OC_NAMESPACE"
        info "[dry-run] Would grant cluster-monitoring-view role"
        info "[dry-run] Would generate token with duration $TOKEN_DURATION"
        return
    fi

    # Create service account (ignore if exists)
    if oc get serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" &>/dev/null; then
        warn "Service account $SA_NAME already exists in $OC_NAMESPACE"
    else
        oc create serviceaccount "$SA_NAME" -n "$OC_NAMESPACE"
        log "Created service account: $SA_NAME"
    fi

    # Grant cluster-monitoring-view role. The binding is created under an
    # explicit name so that the existence check on the next run looks for the
    # same object; with the default (auto-generated) name the check could
    # never match.
    local binding_name="${SA_NAME}-monitoring-view"
    if oc get clusterrolebinding "$binding_name" &>/dev/null; then
        warn "Role binding already exists"
    else
        oc adm policy add-cluster-role-to-user cluster-monitoring-view \
            -z "$SA_NAME" -n "$OC_NAMESPACE" \
            --rolebinding-name="$binding_name"
        log "Granted cluster-monitoring-view role"
    fi

    # Generate token
    local token
    token=$(oc create token "$SA_NAME" -n "$OC_NAMESPACE" --duration="$TOKEN_DURATION")
    if [[ -z "$token" ]]; then
        error "oc returned an empty token for $SA_NAME"
        exit 1
    fi

    # Write under a restrictive umask so a newly created file is never
    # world-readable, not even between creation and chmod. chmod still runs
    # to tighten a pre-existing file.
    ( umask 077; printf '%s\n' "$token" > "$TOKEN_FILE" )
    chmod 600 "$TOKEN_FILE"
    chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$TOKEN_FILE"
    log "Token saved to $TOKEN_FILE (expires in $TOKEN_DURATION)"
}

#------------------------------------------------------------------------------
# Generate federation scrape config
#------------------------------------------------------------------------------
# Print the "openshift-federate" scrape job as YAML on stdout.
#
# Reads TOKEN_FILE, OPENSHIFT_URL and CLUSTER_NAME. The output starts with a
# blank line and is indented as a list item, because the caller appends it to
# the end of prometheus.yml. The token path and the cluster label are emitted
# as double-quoted YAML strings (like the target) so that values containing
# characters such as " #" or ": " are not misparsed.
generate_federation_config() {
    cat <<YAML

  - job_name: "openshift-federate"
    honor_labels: true
    metrics_path: /federate
    scrape_interval: 30s
    scrape_timeout: 25s
    params:
      'match[]':
        - '{job="node-exporter"}'
        - '{job="kube-state-metrics"}'
        - '{__name__=~"container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes"}'
        - '{__name__=~"etcd_server_leader_changes_seen_total|etcd_disk_wal_fsync_duration_seconds_bucket|etcd_mvcc_db_total_size_in_bytes"}'
        - '{__name__=~"apiserver_request_total|apiserver_request_duration_seconds_bucket"}'
        - '{__name__="up"}'
    scheme: https
    bearer_token_file: "${TOKEN_FILE}"
    tls_config:
      insecure_skip_verify: true
    static_configs:
      - targets:
          - "${OPENSHIFT_URL}"
        labels:
          cluster: "${CLUSTER_NAME}"
YAML
}

#------------------------------------------------------------------------------
# Generate recording rules
#------------------------------------------------------------------------------
# Print the recording-rule file (YAML) for cluster CLUSTER_NAME on stdout.
#
# Every rule is restricted to series carrying cluster="<CLUSTER_NAME>".
# The running-pod count uses sum(), not count(): kube_pod_status_phase is a
# 0/1 gauge exposed for every phase of every pod, so counting the
# phase="Running" series would count all pods regardless of their state.
generate_recording_rules() {
    cat <<YAML
# OpenShift recording rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_recording_rules
    interval: 30s
    rules:
      - record: openshift:node_cpu_utilization:ratio
        expr: |
          1 - avg by(instance, cluster) (
            rate(node_cpu_seconds_total{mode="idle", cluster="${CLUSTER_NAME}"}[5m])
          )

      - record: openshift:node_memory_utilization:ratio
        expr: |
          1 - (
            node_memory_MemAvailable_bytes{cluster="${CLUSTER_NAME}"}
            / node_memory_MemTotal_bytes{cluster="${CLUSTER_NAME}"}
          )

      - record: openshift:namespace_pod_count:sum
        expr: |
          sum by(namespace, cluster) (
            kube_pod_status_phase{phase="Running", cluster="${CLUSTER_NAME}"}
          )

      - record: openshift:namespace_cpu_usage:sum
        expr: |
          sum by(namespace, cluster) (
            rate(container_cpu_usage_seconds_total{cluster="${CLUSTER_NAME}", container!=""}[5m])
          )

      - record: openshift:namespace_memory_usage:sum
        expr: |
          sum by(namespace, cluster) (
            container_memory_working_set_bytes{cluster="${CLUSTER_NAME}", container!=""}
          )
YAML
}

#------------------------------------------------------------------------------
# Generate alert rules
#------------------------------------------------------------------------------
# Print the alert-rule file (YAML) for cluster CLUSTER_NAME on stdout.
#
# \$ is escaped inside the here-document so that Prometheus template
# variables ({{ \$labels.x }}, {{ \$value }}) reach the file literally, while
# \${CLUSTER_NAME} is expanded by the shell.
# The crash-loop alert uses increase(...[15m]) so that the value shown in the
# description really is the number of restarts over the last 15 minutes; it
# fires under the same condition as a non-zero rate over that window.
generate_alert_rules() {
    cat <<YAML
# OpenShift alert rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_alerts
    rules:
      - alert: OpenShiftFederationDown
        expr: up{job="openshift-federate", cluster="${CLUSTER_NAME}"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "OpenShift federation target is down"
          description: "Cannot scrape metrics from OpenShift cluster {{ \$labels.cluster }} for 5 minutes."

      - alert: OpenShiftNodeHighCPU
        expr: openshift:node_cpu_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU on OpenShift node {{ \$labels.instance }}"
          description: "CPU usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."

      - alert: OpenShiftNodeHighMemory
        expr: openshift:node_memory_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory on OpenShift node {{ \$labels.instance }}"
          description: "Memory usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."

      - alert: OpenShiftPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total{cluster="${CLUSTER_NAME}"}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ \$labels.namespace }}/{{ \$labels.pod }} is crash looping"
          description: "Pod has restarted {{ \$value | humanize }} times in the last 15 minutes."

      - alert: OpenShiftDeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas{cluster="${CLUSTER_NAME}"}
          != kube_deployment_status_ready_replicas{cluster="${CLUSTER_NAME}"}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Deployment {{ \$labels.namespace }}/{{ \$labels.deployment }} replica mismatch"
          description: "Deployment does not have expected number of ready replicas."

      - alert: OpenShiftEtcdLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{cluster="${CLUSTER_NAME}"}[1h]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent etcd leader changes on {{ \$labels.cluster }}"
          description: "etcd leader changed {{ \$value | humanize }} times in the last hour."
YAML
}

#------------------------------------------------------------------------------
# Apply federation configuration
#------------------------------------------------------------------------------
# Federation (pull) workflow:
#   1. obtain/verify the OpenShift bearer token,
#   2. back up prometheus.yml,
#   3. append the "openshift-federate" scrape job to prometheus.yml,
#   4. write rule files (unless --skip-rules),
#   5. validate the result and reload Prometheus.
# Returns 1 without modifying prometheus.yml when the job already exists or
# when the file layout does not allow a safe append. In dry-run mode the job
# is only printed.
apply_federation() {
    log "Configuring federation from $OPENSHIFT_URL..."

    # Set up OpenShift service account and token
    setup_openshift_sa

    # Backup existing config
    backup_config

    # Generate and append federation scrape config
    local federation_config
    federation_config=$(generate_federation_config)

    if $DRY_RUN; then
        info "[dry-run] Would append to $PROMETHEUS_CONFIG:"
        echo "$federation_config"
    else
        # Check if the job already exists
        if grep -q 'job_name: "openshift-federate"' "$PROMETHEUS_CONFIG" 2>/dev/null; then
            warn "Federation job 'openshift-federate' already exists in $PROMETHEUS_CONFIG"
            warn "Remove the existing job first or edit it manually."
            return 1
        fi

        # The job is appended to the end of the file, so it only becomes part
        # of the scrape_configs list when that is the last top-level section
        # and is written in block style. Anything else would attach the job
        # to a different section (or break the YAML), which is fatal when
        # promtool is not available to catch it.
        local last_section
        last_section=$(grep -E '^[A-Za-z_][A-Za-z0-9_]*:' "$PROMETHEUS_CONFIG" | tail -n 1) || true
        if [[ ! "$last_section" =~ ^scrape_configs:[[:space:]]*(#.*)?$ ]]; then
            error "Cannot append the federation job: 'scrape_configs:' is not the last top-level section of $PROMETHEUS_CONFIG"
            error "Move scrape_configs to the end of the file or add the job manually."
            return 1
        fi

        echo "$federation_config" >> "$PROMETHEUS_CONFIG"
        chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG"
        log "Federation scrape job added to $PROMETHEUS_CONFIG"
    fi

    # Generate rules
    if [[ "$SKIP_RULES" == false ]]; then
        generate_rules
    fi

    # Validate and reload
    validate_and_reload
}

#------------------------------------------------------------------------------
# Apply remote write configuration
#------------------------------------------------------------------------------
# Remote-write (push) workflow for the receiving Prometheus host:
#   1. back up prometheus.yml,
#   2. check the systemd unit for --web.enable-remote-write-receiver and tell
#      the operator to add it when missing,
#   3. generate basic-auth credentials and, when htpasswd is available, a
#      web.yml containing the bcrypt hash (an existing web.yml is not touched),
#   4. print the oc commands and ConfigMap to apply on the OpenShift cluster,
#   5. write rule files (unless --skip-rules), validate and reload.
#
# The printed remote-write config stamps every sample with
# cluster="<CLUSTER_NAME>". Without that label neither the generated rules nor
# the test query from the summary (both filter on cluster=...) would match any
# pushed series.
apply_remote_write() {
    log "Configuring remote write to $PROMETHEUS_URL..."

    # Backup existing config
    backup_config

    # Enable remote write receiver
    local service_file="/etc/systemd/system/${PROMETHEUS_SERVICE}.service"
    if [[ -f "$service_file" ]]; then
        if grep -q "web.enable-remote-write-receiver" "$service_file"; then
            log "Remote write receiver already enabled"
        else
            if $DRY_RUN; then
                info "[dry-run] Would add --web.enable-remote-write-receiver to $service_file"
            else
                warn "You need to add --web.enable-remote-write-receiver to your Prometheus service."
                warn "Edit $service_file and add the flag to ExecStart, then run:"
                # Name the configured unit rather than a hard-coded one.
                warn "  sudo systemctl daemon-reload && sudo systemctl restart $PROMETHEUS_SERVICE"
                echo ""
            fi
        fi
    fi

    # Generate basic auth credentials (falls back to /dev/urandom when
    # openssl is unavailable)
    local rw_password
    rw_password=$(openssl rand -base64 24 2>/dev/null || head -c 24 /dev/urandom | base64)
    local rw_user="openshift"

    log "Generated remote write credentials:"
    log "  Username: $rw_user"
    log "  Password: $rw_password"
    echo ""

    # Generate web.yml with basic auth
    local web_config_file
    web_config_file="$(dirname "$PROMETHEUS_CONFIG")/web.yml"

    if command -v htpasswd &>/dev/null; then
        # htpasswd prints ":<hash>" for an empty user name; strip the colon
        # and the trailing newline to keep only the bcrypt hash.
        local hash
        hash=$(htpasswd -nbBC 12 "" "$rw_password" | tr -d ':\n')

        if $DRY_RUN; then
            info "[dry-run] Would create $web_config_file with basic_auth_users"
        else
            if [[ -f "$web_config_file" ]]; then
                warn "$web_config_file already exists — add this entry manually:"
                echo "  $rw_user: \"$hash\""
            else
                cat > "$web_config_file" <<EOF
basic_auth_users:
  ${rw_user}: "${hash}"
EOF
                chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$web_config_file"
                chmod 600 "$web_config_file"
                log "Created $web_config_file"
            fi
        fi
    else
        warn "htpasswd not found — install apache2-utils (Debian) or httpd-tools (RHEL)"
        warn "Then generate a hash: htpasswd -nbBC 12 '' 'PASSWORD'"
    fi

    # A trailing slash on --prometheus-url would otherwise produce
    # ".../​/api/v1/write"; strip it before building the endpoint.
    local write_url="${PROMETHEUS_URL%/}/api/v1/write"

    # Print OpenShift-side commands
    echo ""
    log "Run the following on your OpenShift cluster:"
    echo ""
    echo "  # Create the auth secret"
    echo "  oc create secret generic remote-write-auth \\"
    echo "    -n openshift-monitoring \\"
    echo "    --from-literal=username=${rw_user} \\"
    echo "    --from-literal=password='${rw_password}'"
    echo ""
    echo "  # Apply the remote write config"
    echo "  oc apply -f - <<'OCEOF'"

    cat <<OCEOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    prometheusK8s:
      remoteWrite:
        - url: "${write_url}"
          basicAuth:
            username:
              name: remote-write-auth
              key: username
            password:
              name: remote-write-auth
              key: password
          tlsConfig:
            insecureSkipVerify: true
          writeRelabelConfigs:
            - sourceLabels: [__name__]
              regex: "node_.*|kube_.*|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes|etcd_.*|apiserver_request_total|apiserver_request_duration_seconds_bucket|up"
              action: keep
            - regex: "prometheus_replica"
              action: labeldrop
            - targetLabel: cluster
              replacement: "${CLUSTER_NAME}"
              action: replace
          queueConfig:
            maxSamplesPerSend: 5000
            batchSendDeadline: 5s
            maxShards: 10
OCEOF
    echo "OCEOF"
    echo ""

    # Generate rules
    if [[ "$SKIP_RULES" == false ]]; then
        generate_rules
    fi

    # Validate and reload
    validate_and_reload
}

#------------------------------------------------------------------------------
# Generate recording and alert rules
#------------------------------------------------------------------------------
# Write the recording rules and the alert rules into RULES_DIR as
# openshift-rules.yml and openshift-alerts.yml, each owned by PROMETHEUS_USER.
# In dry-run mode no file or directory is created; both rule sets are printed
# to stdout instead.
generate_rules() {
    local recording_path="$RULES_DIR/openshift-rules.yml"
    local alerts_path="$RULES_DIR/openshift-alerts.yml"

    log "Generating recording and alert rules..."

    if ! $DRY_RUN; then
        # Real run: create the directory, then write and chown each file.
        mkdir -p "$RULES_DIR"

        generate_recording_rules > "$recording_path"
        chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$recording_path"
        log "Created $recording_path"

        generate_alert_rules > "$alerts_path"
        chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$alerts_path"
        log "Created $alerts_path"
        return
    fi

    # Dry run: announce the target files and show what they would contain.
    info "[dry-run] Would create $recording_path"
    info "[dry-run] Would create $alerts_path"
    echo ""
    info "Recording rules:"
    generate_recording_rules
    echo ""
    info "Alert rules:"
    generate_alert_rules
}

#------------------------------------------------------------------------------
# Validate config and reload Prometheus
#------------------------------------------------------------------------------
# Validate the Prometheus configuration and reload the service.
#
# Skipped entirely in dry-run mode. When promtool is installed:
#   - `promtool check config` is run on PROMETHEUS_CONFIG; on failure the most
#     recently modified file in <config dir>/backups/prometheus.yml.* is
#     copied back and the script exits 1;
#   - unless --skip-rules, every RULES_DIR/openshift-*.yml is checked with
#     `promtool check rules`; a failure exits 1.
# Afterwards the service is reloaded (restarted if the reload fails) when it
# is active; otherwise a hint on how to start it is printed.
validate_and_reload() {
    if $DRY_RUN; then
        info "[dry-run] Would validate config and reload Prometheus"
        return
    fi

    # Validate with promtool
    if command -v promtool &>/dev/null; then
        log "Validating Prometheus configuration..."

        if ! promtool check config "$PROMETHEUS_CONFIG"; then
            error "Config validation failed. Restoring backup..."
            local backup_dir latest_backup="" candidate
            backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"

            # Pick the newest backup by modification time using a glob and
            # -nt instead of parsing `ls` output. An unmatched glob stays
            # literal and is skipped by the -f test, so a missing backup no
            # longer aborts the script before the message below.
            for candidate in "$backup_dir"/prometheus.yml.*; do
                [[ -f "$candidate" ]] || continue
                if [[ -z "$latest_backup" || "$candidate" -nt "$latest_backup" ]]; then
                    latest_backup="$candidate"
                fi
            done

            if [[ -n "$latest_backup" ]]; then
                cp "$latest_backup" "$PROMETHEUS_CONFIG"
                log "Restored from $latest_backup"
            else
                warn "No backup found in $backup_dir — $PROMETHEUS_CONFIG was not restored"
            fi
            exit 1
        fi
        log "Config validation passed"

        # Validate rules
        if [[ "$SKIP_RULES" == false ]]; then
            local rule_file
            for rule_file in "$RULES_DIR"/openshift-*.yml; do
                if [[ -f "$rule_file" ]]; then
                    if ! promtool check rules "$rule_file"; then
                        error "Rule validation failed: $rule_file"
                        exit 1
                    fi
                fi
            done
            log "Rule validation passed"
        fi
    fi

    # Reload Prometheus
    if systemctl is-active --quiet "$PROMETHEUS_SERVICE"; then
        systemctl reload "$PROMETHEUS_SERVICE" 2>/dev/null || \
            systemctl restart "$PROMETHEUS_SERVICE"
        log "Prometheus reloaded"
    else
        warn "Prometheus service is not running. Start it with: sudo systemctl start $PROMETHEUS_SERVICE"
    fi
}

#------------------------------------------------------------------------------
# Print summary
#------------------------------------------------------------------------------
# Print the closing report: chosen method and cluster name, the
# method-specific endpoint (plus token file for federation), the config file,
# the rules directory (unless --skip-rules) and two verification hints.
print_summary() {
    local banner_rule="============================================"
    local test_query

    printf '\n%s\n%s\n%s\n\n' \
        "$banner_rule" \
        "  OpenShift Metrics Configuration Complete" \
        "$banner_rule"

    printf '  Method:       %s\n' "$METHOD"
    printf '  Cluster name: %s\n' "$CLUSTER_NAME"

    # Method-specific details; the matching sample query is chosen here and
    # printed in the "Verify" section further down.
    if [[ "$METHOD" == "federation" ]]; then
        printf '  OpenShift URL: %s\n' "$OPENSHIFT_URL"
        printf '  Token file:    %s\n' "$TOKEN_FILE"
        test_query="node_memory_MemAvailable_bytes{cluster=\"${CLUSTER_NAME}\"}"
    else
        printf '  Prometheus URL: %s\n' "$PROMETHEUS_URL"
        test_query="up{cluster=\"${CLUSTER_NAME}\"}"
    fi

    printf '  Config file:   %s\n' "$PROMETHEUS_CONFIG"

    if [[ "$SKIP_RULES" == false ]]; then
        printf '  Rules dir:     %s\n' "$RULES_DIR"
    fi

    printf '\n  Verify:\n'
    printf '    - Check targets: http://localhost:9090/targets\n'
    printf '    - Test query:  %s\n\n' "$test_query"
}

#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------
# Orchestrate a run: print the version banner, validate settings, announce
# dry-run mode when active, dispatch to the workflow for METHOD, then print
# the summary (real runs only) and a final "Done." line.
main() {
    printf '\n'
    log "configure-openshift-metrics.sh v${VERSION}"
    printf '\n'

    validate

    if $DRY_RUN; then
        warn "DRY RUN — no changes will be made"
        printf '\n'
    fi

    # validate has already rejected any other METHOD value.
    if [[ "$METHOD" == "federation" ]]; then
        apply_federation
    elif [[ "$METHOD" == "remote-write" ]]; then
        apply_remote_write
    fi

    # The summary describes applied changes, so it is skipped for dry runs.
    $DRY_RUN || print_summary

    log "Done."
}

# Entry point. Command-line options have already been parsed into the global
# settings at top level, so main is called without arguments.
main