Files
linux-scripts/configure-openshift-metrics.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

688 lines
23 KiB
Bash

#!/bin/bash
###############################################################################
# configure-openshift-metrics.sh
#
# Configure an external Prometheus server to receive metrics from OpenShift.
# Supports federation (pull) and remote write (push) modes.
#
# Usage:
# sudo ./configure-openshift-metrics.sh --method federation \
# --openshift-url ROUTE --cluster-name NAME
#
# sudo ./configure-openshift-metrics.sh --method remote-write \
# --prometheus-url URL --cluster-name NAME
#
# Requirements:
# - Root or sudo access on the Prometheus server
# - oc CLI logged in with cluster-admin (unless --skip-openshift)
# - Prometheus installed via binary (not containerized)
#
# https://mylinux.work/guides/openshift-metrics-to-external-prometheus/
###############################################################################
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

VERSION='1.0'

#------------------------------------------------------------------------------
# Defaults
#------------------------------------------------------------------------------
# Settings that have a matching command-line option.
METHOD='federation'
OPENSHIFT_URL=''
PROMETHEUS_URL=''
CLUSTER_NAME='openshift'
PROMETHEUS_CONFIG='/etc/prometheus/prometheus.yml'
RULES_DIR='/etc/prometheus/rules'
TOKEN_FILE='/etc/prometheus/openshift-token'
SKIP_OPENSHIFT=false
SKIP_RULES=false
DRY_RUN=false

# Settings without a command-line option; edit here to change them.
PROMETHEUS_SERVICE='prometheus'
PROMETHEUS_USER='prometheus'
OC_NAMESPACE='openshift-monitoring'
SA_NAME='prometheus-external'
TOKEN_DURATION='8760h'
#------------------------------------------------------------------------------
# Colors and logging
#------------------------------------------------------------------------------
# ANSI escape sequences for the coloured "[openshift-metrics]" prefix; NC
# switches the colour off again.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# _osm_say COLOR MESSAGE
# Shared formatter for the four helpers below. echo -e is what turns the
# \033 sequences above into real escape characters (it also expands
# backslash escapes that appear in MESSAGE).
_osm_say() {
  echo -e "${1}[openshift-metrics]${NC} ${2}"
}

# Each helper takes the message as its first argument. Only error() writes
# to stderr; the others write to stdout.
log()   { _osm_say "${GREEN}" "$1"; }
warn()  { _osm_say "${YELLOW}" "$1"; }
error() { _osm_say "${RED}" "$1" >&2; }
info()  { _osm_say "${BLUE}" "$1"; }
#------------------------------------------------------------------------------
# Usage
#------------------------------------------------------------------------------
#######################################
# Print the command-line help and terminate the script.
# Globals:   VERSION (read)
# Outputs:   help text on stdout
# Returns:   does not return; exits with status 0
#######################################
usage() {
  # Unquoted delimiter on purpose: ${VERSION} and $0 are expanded, and each
  # "\\" below is emitted as a single backslash (shell line continuation in
  # the printed examples).
  cat <<EOF
configure-openshift-metrics.sh v${VERSION}

Configure an external Prometheus to receive OpenShift metrics.

Usage:
  sudo $0 [OPTIONS]

Options:
  --method METHOD           federation or remote-write (default: federation)
  --openshift-url URL       OpenShift Prometheus route hostname (federation)
  --prometheus-url URL      External Prometheus URL (remote-write)
  --cluster-name NAME       Label for metrics (default: openshift)
  --prometheus-config PATH  Path to prometheus.yml (default: /etc/prometheus/prometheus.yml)
  --rules-dir PATH          Directory for rule files (default: /etc/prometheus/rules)
  --token-file PATH         Bearer token file path (default: /etc/prometheus/openshift-token)
  --skip-openshift          Skip oc commands (use existing token)
  --skip-rules              Skip recording/alert rule generation
  --dry-run                 Show what would be done without making changes
  -h, --help                Show this help message

Federation example:
  sudo $0 --method federation \\
    --openshift-url prometheus-k8s-openshift-monitoring.apps.cluster.example.com \\
    --cluster-name production

Remote write example:
  sudo $0 --method remote-write \\
    --prometheus-url https://prometheus.example.com:9090 \\
    --cluster-name production
EOF
  exit 0
}
#------------------------------------------------------------------------------
# Parse arguments
#------------------------------------------------------------------------------
#######################################
# Abort with a usage error when an option that takes a value is the last
# word on the command line. Without this check "$2" is unset and, under
# `set -u`, the script dies with an "unbound variable" message instead.
# Arguments: $1 - option name
#            $2 - number of words left to parse, the option included
#######################################
_require_value() {
  if (( $2 < 2 )); then
    error "Option $1 requires a value"
    echo "Use --help for usage." >&2
    exit 1
  fi
}

#######################################
# Parse the command line into the global settings.
# Globals:   METHOD, OPENSHIFT_URL, PROMETHEUS_URL, CLUSTER_NAME,
#            PROMETHEUS_CONFIG, RULES_DIR, TOKEN_FILE, SKIP_OPENSHIFT,
#            SKIP_RULES, DRY_RUN (written)
# Arguments: the script's command-line words
# Returns:   0 on success; exits 1 on an unknown option or a missing value;
#            --help/-h hands over to usage
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --method)            _require_value "$1" "$#"; METHOD="$2"; shift 2 ;;
      --openshift-url)     _require_value "$1" "$#"; OPENSHIFT_URL="$2"; shift 2 ;;
      --prometheus-url)    _require_value "$1" "$#"; PROMETHEUS_URL="$2"; shift 2 ;;
      --cluster-name)      _require_value "$1" "$#"; CLUSTER_NAME="$2"; shift 2 ;;
      --prometheus-config) _require_value "$1" "$#"; PROMETHEUS_CONFIG="$2"; shift 2 ;;
      --rules-dir)         _require_value "$1" "$#"; RULES_DIR="$2"; shift 2 ;;
      --token-file)        _require_value "$1" "$#"; TOKEN_FILE="$2"; shift 2 ;;
      --skip-openshift)    SKIP_OPENSHIFT=true; shift ;;
      --skip-rules)        SKIP_RULES=true; shift ;;
      --dry-run)           DRY_RUN=true; shift ;;
      --help|-h)           usage ;;
      *)
        error "Unknown option: $1"
        echo "Use --help for usage." >&2
        exit 1
        ;;
    esac
  done
}

# Parsing still happens at file scope, before any other work is done.
parse_args "$@"
#------------------------------------------------------------------------------
# Validation
#------------------------------------------------------------------------------
#######################################
# Check the parsed settings and the local environment before any change is
# made. Every failed check prints an error and exits with status 1.
# Globals:   METHOD, OPENSHIFT_URL, PROMETHEUS_URL, PROMETHEUS_CONFIG,
#            SKIP_OPENSHIFT (read)
# Returns:   0 when all checks pass
#######################################
validate() {
  if [[ "$METHOD" != "federation" && "$METHOD" != "remote-write" ]]; then
    error "Invalid method: $METHOD (must be federation or remote-write)"
    exit 1
  fi

  if [[ "$METHOD" == "federation" && -z "$OPENSHIFT_URL" ]]; then
    error "--openshift-url is required for federation mode"
    exit 1
  fi

  if [[ "$METHOD" == "remote-write" && -z "$PROMETHEUS_URL" ]]; then
    error "--prometheus-url is required for remote-write mode"
    exit 1
  fi

  # Effective UID, same value as $EUID.
  if [[ "$(id -u)" -ne 0 ]]; then
    error "This script must be run as root or with sudo"
    exit 1
  fi

  if [[ ! -f "$PROMETHEUS_CONFIG" ]]; then
    error "Prometheus config not found: $PROMETHEUS_CONFIG"
    exit 1
  fi

  # promtool is optional: without it the later validation step is skipped.
  if ! command -v promtool &>/dev/null; then
    warn "promtool not found — config validation will be skipped"
  fi

  # Only the federation path runs oc commands; remote-write mode merely
  # prints commands for the user to run, so it must not demand a local oc.
  if [[ "$METHOD" == "federation" && "$SKIP_OPENSHIFT" == false ]] \
    && ! command -v oc &>/dev/null; then
    error "oc CLI not found. Install it or use --skip-openshift with an existing token"
    exit 1
  fi
}
#------------------------------------------------------------------------------
# Backup existing config
#------------------------------------------------------------------------------
#######################################
# Copy the Prometheus config to a timestamped file in a "backups" directory
# next to it.
# Globals:   PROMETHEUS_CONFIG, DRY_RUN (read)
# Outputs:   one status line via info (dry run) or log
# Returns:   0 on success; non-zero if mkdir or cp fails
#######################################
backup_config() {
  local backup_dir timestamp backup_file
  backup_dir="$(dirname -- "$PROMETHEUS_CONFIG")/backups"
  timestamp=$(date +%F_%H%M%S)
  backup_file="${backup_dir}/prometheus.yml.${timestamp}"

  if [[ "$DRY_RUN" == true ]]; then
    # A dry run must not touch the filesystem, so the backup directory is
    # not created here either.
    info "[dry-run] Would backup $PROMETHEUS_CONFIG to $backup_file"
    return 0
  fi

  mkdir -p -- "$backup_dir"
  cp -- "$PROMETHEUS_CONFIG" "$backup_file"
  log "Backed up config to $backup_file"
}
#------------------------------------------------------------------------------
# OpenShift: Create service account and token
#------------------------------------------------------------------------------
#######################################
# Make sure a bearer token for scraping OpenShift exists in TOKEN_FILE.
# With SKIP_OPENSHIFT the existing file is only checked. Otherwise a service
# account is created (if missing), granted cluster-monitoring-view (if the
# binding is missing) and a fresh token is written to TOKEN_FILE.
# Globals:   SKIP_OPENSHIFT, DRY_RUN, TOKEN_FILE, SA_NAME, OC_NAMESPACE,
#            TOKEN_DURATION, PROMETHEUS_USER (read)
# Returns:   0 on success; exits 1 when the token is missing, oc is not
#            logged in, or the token cannot be obtained or stored
#######################################
setup_openshift_sa() {
  if [[ "$SKIP_OPENSHIFT" == true ]]; then
    if [[ ! -f "$TOKEN_FILE" ]]; then
      error "No token found at $TOKEN_FILE. Provide a token or remove --skip-openshift."
      exit 1
    fi
    log "Using existing token from $TOKEN_FILE"
    return 0
  fi

  log "Setting up OpenShift service account..."

  # Check oc is logged in
  if ! oc whoami &>/dev/null; then
    error "Not logged into OpenShift. Run: oc login <cluster-url>"
    exit 1
  fi

  local cluster_info
  cluster_info=$(oc whoami --show-server 2>/dev/null || echo "unknown")
  log "Connected to: $cluster_info"

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would create service account $SA_NAME in $OC_NAMESPACE"
    info "[dry-run] Would grant cluster-monitoring-view role"
    info "[dry-run] Would generate token with duration $TOKEN_DURATION"
    return 0
  fi

  # Create service account (ignore if exists)
  if oc get serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" &>/dev/null; then
    warn "Service account $SA_NAME already exists in $OC_NAMESPACE"
  else
    oc create serviceaccount "$SA_NAME" -n "$OC_NAMESPACE"
    log "Created service account: $SA_NAME"
  fi

  # Grant cluster-monitoring-view role. The binding is created under an
  # explicit name so that the existence check looks for the same object the
  # grant creates; otherwise every run would add another binding.
  local binding_name="${SA_NAME}-monitoring-view"
  if oc get clusterrolebinding "$binding_name" &>/dev/null; then
    warn "Role binding already exists"
  else
    oc adm policy add-cluster-role-to-user cluster-monitoring-view \
      -z "$SA_NAME" -n "$OC_NAMESPACE" \
      --rolebinding-name="$binding_name"
    log "Granted cluster-monitoring-view role"
  fi

  # Generate token
  local token
  token=$(oc create token "$SA_NAME" -n "$OC_NAMESPACE" --duration="$TOKEN_DURATION")
  if [[ -z "$token" ]]; then
    error "oc returned an empty token for $SA_NAME"
    exit 1
  fi

  # Tighten permissions before the secret is written: umask covers a newly
  # created file, chmod covers a file that already existed with wider modes.
  (
    umask 077 \
      && : > "$TOKEN_FILE" \
      && chmod 600 "$TOKEN_FILE" \
      && printf '%s\n' "$token" > "$TOKEN_FILE"
  ) || {
    error "Could not write token to $TOKEN_FILE"
    exit 1
  }
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$TOKEN_FILE"
  log "Token saved to $TOKEN_FILE (expires in $TOKEN_DURATION)"
}
#------------------------------------------------------------------------------
# Generate federation scrape config
#------------------------------------------------------------------------------
#######################################
# Print the scrape job that federates selected series from the OpenShift
# Prometheus.
# The job is indented as a two-space list item because apply_federation
# appends it to the end of prometheus.yml, where it has to continue the
# scrape_configs list. NOTE(review): this presumes scrape_configs is the
# last section of that file and that its items use two-space indentation.
# Globals:   TOKEN_FILE, OPENSHIFT_URL, CLUSTER_NAME (read)
# Outputs:   YAML fragment on stdout
#######################################
generate_federation_config() {
  # String values are quoted so that a cluster name such as "123" or "true"
  # stays a YAML string.
  # NOTE(review): insecure_skip_verify disables TLS certificate checks for
  # the route; kept as in the original, replace with ca_file where possible.
  cat <<YAML
  - job_name: "openshift-federate"
    honor_labels: true
    metrics_path: /federate
    scrape_interval: 30s
    scrape_timeout: 25s
    params:
      'match[]':
        - '{job="node-exporter"}'
        - '{job="kube-state-metrics"}'
        - '{__name__=~"container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes"}'
        - '{__name__=~"etcd_server_leader_changes_seen_total|etcd_disk_wal_fsync_duration_seconds_bucket|etcd_mvcc_db_total_size_in_bytes"}'
        - '{__name__=~"apiserver_request_total|apiserver_request_duration_seconds_bucket"}'
        - '{__name__="up"}'
    scheme: https
    bearer_token_file: "${TOKEN_FILE}"
    tls_config:
      insecure_skip_verify: true
    static_configs:
      - targets:
          - "${OPENSHIFT_URL}"
        labels:
          cluster: "${CLUSTER_NAME}"
YAML
}
#------------------------------------------------------------------------------
# Generate recording rules
#------------------------------------------------------------------------------
#######################################
# Print the recording-rule file for the configured cluster.
# Every rule selects series carrying cluster="<CLUSTER_NAME>".
# Globals:   CLUSTER_NAME (read)
# Outputs:   YAML rule file on stdout
#######################################
generate_recording_rules() {
  # The nesting below is significant: groups -> rules -> record/expr, and
  # each "expr: |" body must be indented deeper than its key.
  cat <<YAML
# OpenShift recording rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_recording_rules
    interval: 30s
    rules:
      - record: openshift:node_cpu_utilization:ratio
        expr: |
          1 - avg by(instance, cluster) (
            rate(node_cpu_seconds_total{mode="idle", cluster="${CLUSTER_NAME}"}[5m])
          )
      - record: openshift:node_memory_utilization:ratio
        expr: |
          1 - (
            node_memory_MemAvailable_bytes{cluster="${CLUSTER_NAME}"}
            / node_memory_MemTotal_bytes{cluster="${CLUSTER_NAME}"}
          )
      - record: openshift:namespace_pod_count:sum
        expr: |
          count by(namespace, cluster) (
            kube_pod_status_phase{phase="Running", cluster="${CLUSTER_NAME}"}
          )
      - record: openshift:namespace_cpu_usage:sum
        expr: |
          sum by(namespace, cluster) (
            rate(container_cpu_usage_seconds_total{cluster="${CLUSTER_NAME}", container!=""}[5m])
          )
      - record: openshift:namespace_memory_usage:sum
        expr: |
          sum by(namespace, cluster) (
            container_memory_working_set_bytes{cluster="${CLUSTER_NAME}", container!=""}
          )
YAML
}
#------------------------------------------------------------------------------
# Generate alert rules
#------------------------------------------------------------------------------
#######################################
# Print the alerting-rule file for the configured cluster.
# Two alerts read the openshift:node_*_utilization:ratio series produced by
# generate_recording_rules; the rest query raw series with
# cluster="<CLUSTER_NAME>".
# Globals:   CLUSTER_NAME (read)
# Outputs:   YAML rule file on stdout
#######################################
generate_alert_rules() {
  # "\$" keeps Prometheus template variables ($labels, $value) literal in
  # this unquoted heredoc; only ${CLUSTER_NAME} is expanded by the shell.
  #
  # OpenShiftPodCrashLooping uses increase(...[15m]) so that the reported
  # value is the restart count over the 15 minutes named in the description
  # (the former rate(...)*60*5 reported restarts per 5 minutes).
  cat <<YAML
# OpenShift alert rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_alerts
    rules:
      - alert: OpenShiftFederationDown
        expr: up{job="openshift-federate", cluster="${CLUSTER_NAME}"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "OpenShift federation target is down"
          description: "Cannot scrape metrics from OpenShift cluster {{ \$labels.cluster }} for 5 minutes."
      - alert: OpenShiftNodeHighCPU
        expr: openshift:node_cpu_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU on OpenShift node {{ \$labels.instance }}"
          description: "CPU usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."
      - alert: OpenShiftNodeHighMemory
        expr: openshift:node_memory_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory on OpenShift node {{ \$labels.instance }}"
          description: "Memory usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."
      - alert: OpenShiftPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total{cluster="${CLUSTER_NAME}"}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ \$labels.namespace }}/{{ \$labels.pod }} is crash looping"
          description: "Pod has restarted {{ \$value | humanize }} times in the last 15 minutes."
      - alert: OpenShiftDeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas{cluster="${CLUSTER_NAME}"}
          != kube_deployment_status_ready_replicas{cluster="${CLUSTER_NAME}"}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Deployment {{ \$labels.namespace }}/{{ \$labels.deployment }} replica mismatch"
          description: "Deployment does not have expected number of ready replicas."
      - alert: OpenShiftEtcdLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{cluster="${CLUSTER_NAME}"}[1h]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent etcd leader changes on {{ \$labels.cluster }}"
          description: "etcd leader changed {{ \$value | humanize }} times in the last hour."
YAML
}
#------------------------------------------------------------------------------
# Apply federation configuration
#------------------------------------------------------------------------------
#######################################
# Federation workflow: obtain the token, back up the config, append the
# federation scrape job, write rule files (unless skipped), then validate
# and reload Prometheus.
# Globals:   OPENSHIFT_URL, PROMETHEUS_CONFIG, PROMETHEUS_USER, DRY_RUN,
#            SKIP_RULES (read)
# Returns:   0 on success; 1 when an "openshift-federate" job is already
#            present in the config (nothing is appended in that case)
#######################################
apply_federation() {
  log "Configuring federation from $OPENSHIFT_URL..."

  # Set up OpenShift service account and token
  setup_openshift_sa

  # Backup existing config
  backup_config

  # Generate and append federation scrape config
  local federation_config
  federation_config=$(generate_federation_config)

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would append to $PROMETHEUS_CONFIG:"
    printf '%s\n' "$federation_config"
  else
    # Refuse to add the job twice. The name may be written with double
    # quotes, single quotes or no quotes at all in hand-edited configs.
    local job_re="job_name:[[:space:]]*[\"']?openshift-federate[\"']?[[:space:]]*(#.*)?\$"
    if grep -Eq -- "$job_re" "$PROMETHEUS_CONFIG" 2>/dev/null; then
      warn "Federation job 'openshift-federate' already exists in $PROMETHEUS_CONFIG"
      warn "Remove the existing job first or edit it manually."
      return 1
    fi

    # If the file does not end in a newline, the first appended line would
    # be glued onto the last existing line and corrupt the YAML.
    if [[ -s "$PROMETHEUS_CONFIG" && -n "$(tail -c 1 -- "$PROMETHEUS_CONFIG")" ]]; then
      printf '\n' >> "$PROMETHEUS_CONFIG"
    fi

    printf '%s\n' "$federation_config" >> "$PROMETHEUS_CONFIG"
    chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG"
    log "Federation scrape job added to $PROMETHEUS_CONFIG"
  fi

  # Generate rules
  if [[ "$SKIP_RULES" == false ]]; then
    generate_rules
  fi

  # Validate and reload
  validate_and_reload
}
#------------------------------------------------------------------------------
# Apply remote write configuration
#------------------------------------------------------------------------------
#######################################
# Remote-write workflow: back up the config, check the receiver flag in the
# systemd unit, generate basic-auth credentials (and web.yml when htpasswd
# is available), print the commands to run on the OpenShift side, write
# rule files (unless skipped), then validate and reload Prometheus.
# Globals:   PROMETHEUS_URL, PROMETHEUS_SERVICE, PROMETHEUS_CONFIG,
#            PROMETHEUS_USER, CLUSTER_NAME, DRY_RUN, SKIP_RULES (read)
# Outputs:   status lines, the generated credentials and an `oc` command
#            sequence on stdout
#######################################
apply_remote_write() {
  log "Configuring remote write to $PROMETHEUS_URL..."

  # Backup existing config
  backup_config

  # Enable remote write receiver. The unit file is only inspected; this
  # script never edits it, so both modes just report what is missing.
  local service_file="/etc/systemd/system/${PROMETHEUS_SERVICE}.service"
  if [[ -f "$service_file" ]]; then
    if grep -q -- "web.enable-remote-write-receiver" "$service_file"; then
      log "Remote write receiver already enabled"
    elif [[ "$DRY_RUN" == true ]]; then
      info "[dry-run] $service_file lacks --web.enable-remote-write-receiver; a real run asks you to add it"
    else
      warn "You need to add --web.enable-remote-write-receiver to your Prometheus service."
      warn "Edit $service_file and add the flag to ExecStart, then run:"
      warn "  sudo systemctl daemon-reload && sudo systemctl restart $PROMETHEUS_SERVICE"
      echo ""
    fi
  fi

  # Generate basic auth credentials (openssl preferred, /dev/urandom as
  # fallback).
  local rw_user="openshift"
  local rw_password
  rw_password=$(openssl rand -base64 24 2>/dev/null || head -c 24 /dev/urandom | base64)
  log "Generated remote write credentials:"
  log "  Username: $rw_user"
  log "  Password: $rw_password"
  echo ""

  # Generate web.yml with basic auth
  local web_config_file
  web_config_file="$(dirname -- "$PROMETHEUS_CONFIG")/web.yml"
  if command -v htpasswd &>/dev/null; then
    local pw_hash
    # htpasswd -n prints "user:hash"; with an empty user name, stripping the
    # colon and the newline leaves just the bcrypt hash.
    pw_hash=$(htpasswd -nbBC 12 "" "$rw_password" | tr -d ':\n')
    if [[ "$DRY_RUN" == true ]]; then
      info "[dry-run] Would create $web_config_file with basic_auth_users"
    elif [[ -f "$web_config_file" ]]; then
      warn "$web_config_file already exists — add this entry manually:"
      echo "  $rw_user: \"$pw_hash\""
    else
      # The user entry must be nested under basic_auth_users.
      (
        umask 077 \
          && printf 'basic_auth_users:\n  %s: "%s"\n' "$rw_user" "$pw_hash" > "$web_config_file"
      )
      chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$web_config_file"
      chmod 600 "$web_config_file"
      log "Created $web_config_file"
      info "Prometheus only reads it when started with --web.config.file=$web_config_file"
    fi
  else
    warn "htpasswd not found — install apache2-utils (Debian) or httpd-tools (RHEL)"
    warn "Then generate a hash: htpasswd -nbBC 12 '' 'PASSWORD'"
  fi

  # Print OpenShift-side commands
  echo ""
  log "Run the following on your OpenShift cluster:"
  echo ""
  echo "  # Create the auth secret"
  echo "  oc create secret generic remote-write-auth \\"
  echo "    -n openshift-monitoring \\"
  echo "    --from-literal=username=${rw_user} \\"
  echo "    --from-literal=password='${rw_password}'"
  echo ""
  echo "  # Apply the remote write config"
  echo "  oc apply -f - <<'OCEOF'"
  # The manifest is printed with real YAML nesting so it can be pasted
  # as-is. Differences from a plain relay of the flags:
  #  - a trailing "/" on PROMETHEUS_URL is dropped to avoid "//api/v1/write";
  #  - the last writeRelabelConfigs entry stamps cluster="<CLUSTER_NAME>" on
  #    every sample, which the generated rules and the summary's test query
  #    select on.
  # NOTE(review): insecureSkipVerify disables TLS verification; kept as in
  # the original.
  cat <<OCEOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    prometheusK8s:
      remoteWrite:
        - url: "${PROMETHEUS_URL%/}/api/v1/write"
          basicAuth:
            username:
              name: remote-write-auth
              key: username
            password:
              name: remote-write-auth
              key: password
          tlsConfig:
            insecureSkipVerify: true
          writeRelabelConfigs:
            - sourceLabels: [__name__]
              regex: "node_.*|kube_.*|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes|etcd_.*|apiserver_request_total|apiserver_request_duration_seconds_bucket|up"
              action: keep
            - regex: "prometheus_replica"
              action: labeldrop
            - targetLabel: cluster
              replacement: "${CLUSTER_NAME}"
              action: replace
          queueConfig:
            maxSamplesPerSend: 5000
            batchSendDeadline: 5s
            maxShards: 10
OCEOF
  echo "OCEOF"
  echo ""

  # Generate rules
  if [[ "$SKIP_RULES" == false ]]; then
    generate_rules
  fi

  # Validate and reload
  validate_and_reload
}
#------------------------------------------------------------------------------
# Generate recording and alert rules
#------------------------------------------------------------------------------
#######################################
# Write the recording and alert rule files into RULES_DIR, or print them in
# dry-run mode.
# Globals:   RULES_DIR, PROMETHEUS_USER, PROMETHEUS_CONFIG, DRY_RUN (read)
# Outputs:   status lines; in dry-run mode also both rule documents
# Returns:   0 on success
#######################################
generate_rules() {
  log "Generating recording and alert rules..."

  local rules_file="$RULES_DIR/openshift-rules.yml"
  local alerts_file="$RULES_DIR/openshift-alerts.yml"

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would create $rules_file"
    info "[dry-run] Would create $alerts_file"
    echo ""
    info "Recording rules:"
    generate_recording_rules
    echo ""
    info "Alert rules:"
    generate_alert_rules
    return 0
  fi

  mkdir -p -- "$RULES_DIR"

  generate_recording_rules > "$rules_file"
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$rules_file"
  log "Created $rules_file"

  generate_alert_rules > "$alerts_file"
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$alerts_file"
  log "Created $alerts_file"

  # This script never edits rule_files in prometheus.yml, so the new files
  # are only loaded if the config already points at this directory. The
  # check is a plain text search for "<dir-name>/" (matches absolute and
  # relative entries alike) and therefore only produces a hint.
  local dir_name=${RULES_DIR%/}
  dir_name=${dir_name##*/}
  if [[ -f "$PROMETHEUS_CONFIG" ]] \
    && ! grep -qF -- "${dir_name}/" "$PROMETHEUS_CONFIG"; then
    warn "$PROMETHEUS_CONFIG does not appear to load rules from $RULES_DIR"
    warn "Add the directory under rule_files, e.g.: rule_files: [\"${RULES_DIR%/}/*.yml\"]"
  fi
}
#------------------------------------------------------------------------------
# Validate config and reload Prometheus
#------------------------------------------------------------------------------
#######################################
# Validate the Prometheus config and the generated rule files with promtool
# (when it is installed), then reload the service, falling back to a
# restart when the reload fails.
# If the config is invalid, the newest backup is copied back over it and
# the script exits with status 1.
# Globals:   DRY_RUN, PROMETHEUS_CONFIG, RULES_DIR, SKIP_RULES,
#            PROMETHEUS_SERVICE (read)
# Returns:   0 on success; exits 1 on a validation failure
#######################################
validate_and_reload() {
  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would validate config and reload Prometheus"
    return 0
  fi

  # Validate with promtool
  if command -v promtool &>/dev/null; then
    log "Validating Prometheus configuration..."
    if ! promtool check config "$PROMETHEUS_CONFIG"; then
      error "Config validation failed. Restoring backup..."

      # Backup names end in a %F_%H%M%S timestamp, so the glob's sorted
      # order is chronological and the last match is the newest backup.
      local backup_dir latest_backup="" candidate
      backup_dir="$(dirname -- "$PROMETHEUS_CONFIG")/backups"
      for candidate in "$backup_dir"/prometheus.yml.*; do
        if [[ -f "$candidate" ]]; then
          latest_backup=$candidate
        fi
      done

      if [[ -n "$latest_backup" ]]; then
        cp -- "$latest_backup" "$PROMETHEUS_CONFIG"
        log "Restored from $latest_backup"
      else
        error "No backup found in $backup_dir; $PROMETHEUS_CONFIG was not restored"
      fi
      exit 1
    fi
    log "Config validation passed"

    # Validate rules
    if [[ "$SKIP_RULES" == false ]]; then
      local rule_file
      for rule_file in "$RULES_DIR"/openshift-*.yml; do
        # -f also skips the literal pattern left behind by an unmatched glob.
        if [[ -f "$rule_file" ]]; then
          if ! promtool check rules "$rule_file"; then
            error "Rule validation failed: $rule_file"
            exit 1
          fi
        fi
      done
      log "Rule validation passed"
    fi
  fi

  # Reload Prometheus
  if systemctl is-active --quiet "$PROMETHEUS_SERVICE"; then
    systemctl reload "$PROMETHEUS_SERVICE" 2>/dev/null \
      || systemctl restart "$PROMETHEUS_SERVICE"
    log "Prometheus reloaded"
  else
    warn "Prometheus service is not running. Start it with: sudo systemctl start $PROMETHEUS_SERVICE"
  fi
}
#------------------------------------------------------------------------------
# Print summary
#------------------------------------------------------------------------------
#######################################
# Print the closing report: chosen method, cluster name, the files involved
# and two hints for verifying the result.
# Globals:   METHOD, CLUSTER_NAME, OPENSHIFT_URL, TOKEN_FILE,
#            PROMETHEUS_URL, PROMETHEUS_CONFIG, SKIP_RULES, RULES_DIR (read)
# Outputs:   the report on stdout
#######################################
print_summary() {
  local bar="============================================"
  local sample_query
  local -a report=(
    ""
    "$bar"
    " OpenShift Metrics Configuration Complete"
    "$bar"
    ""
    " Method: $METHOD"
    " Cluster name: $CLUSTER_NAME"
  )

  # Method-specific lines and the query suggested for verification.
  if [[ "$METHOD" == "federation" ]]; then
    report+=(" OpenShift URL: $OPENSHIFT_URL" " Token file: $TOKEN_FILE")
    sample_query="node_memory_MemAvailable_bytes{cluster=\"${CLUSTER_NAME}\"}"
  else
    report+=(" Prometheus URL: $PROMETHEUS_URL")
    sample_query="up{cluster=\"${CLUSTER_NAME}\"}"
  fi

  report+=(" Config file: $PROMETHEUS_CONFIG")
  if [[ "$SKIP_RULES" == false ]]; then
    report+=(" Rules dir: $RULES_DIR")
  fi

  report+=(
    ""
    " Verify:"
    " - Check targets: http://localhost:9090/targets"
    " - Test query: $sample_query"
    ""
  )

  # One line per array element, blank elements become empty lines.
  printf '%s\n' "${report[@]}"
}
#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------
#######################################
# Top-level driver: banner, validation, the workflow selected by METHOD,
# and (outside dry-run mode) the closing summary.
# Globals:   VERSION, DRY_RUN, METHOD (read)
#######################################
main() {
  printf '\n'
  log "configure-openshift-metrics.sh v${VERSION}"
  printf '\n'

  validate

  if "$DRY_RUN"; then
    warn "DRY RUN — no changes will be made"
    printf '\n'
  fi

  # Any other METHOD value falls through without running a workflow.
  if [[ "$METHOD" == "federation" ]]; then
    apply_federation
  elif [[ "$METHOD" == "remote-write" ]]; then
    apply_remote_write
  fi

  # The summary is printed only when changes were actually made.
  if "$DRY_RUN"; then
    :
  else
    print_summary
  fi

  log "Done."
}
# Entry point. The command line was already parsed at file scope above, so
# main is invoked without arguments.
main