a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
688 lines
23 KiB
Bash
688 lines
23 KiB
Bash
#!/bin/bash
###############################################################################
# configure-openshift-metrics.sh
#
# Configure an external Prometheus server to receive metrics from OpenShift.
# Supports federation (pull) and remote write (push) modes.
#
# Usage:
#   sudo ./configure-openshift-metrics.sh --method federation \
#     --openshift-url ROUTE --cluster-name NAME
#
#   sudo ./configure-openshift-metrics.sh --method remote-write \
#     --prometheus-url URL --cluster-name NAME
#
# Requirements:
#   - Root or sudo access on the Prometheus server
#   - oc CLI logged in with cluster-admin (unless --skip-openshift)
#   - Prometheus installed via binary (not containerized)
#
# https://mylinux.work/guides/openshift-metrics-to-external-prometheus/
###############################################################################

# Abort on unhandled errors, unset variables and failures inside pipelines.
set -euo pipefail

# Script version, shown in the start-up banner and in the --help text.
VERSION="1.0"

#------------------------------------------------------------------------------
# Defaults
#------------------------------------------------------------------------------
# Settings that have a matching command-line option.
METHOD="federation"                                 # --method
OPENSHIFT_URL=""                                    # --openshift-url
PROMETHEUS_URL=""                                   # --prometheus-url
CLUSTER_NAME="openshift"                            # --cluster-name
PROMETHEUS_CONFIG="/etc/prometheus/prometheus.yml"  # --prometheus-config
RULES_DIR="/etc/prometheus/rules"                   # --rules-dir
TOKEN_FILE="/etc/prometheus/openshift-token"        # --token-file
SKIP_OPENSHIFT=false                                # --skip-openshift
SKIP_RULES=false                                    # --skip-rules
DRY_RUN=false                                       # --dry-run

# Settings without a command-line option; edit here to change them.
PROMETHEUS_SERVICE="prometheus"     # systemd unit that is reloaded/restarted
PROMETHEUS_USER="prometheus"        # owner (user and group) of generated files
OC_NAMESPACE="openshift-monitoring" # namespace of the service account
SA_NAME="prometheus-external"       # service account used for scraping
TOKEN_DURATION="8760h"              # requested token lifetime (365 days)

#------------------------------------------------------------------------------
# Colors and logging
#------------------------------------------------------------------------------
# ANSI escape sequences, kept as literal backslash strings; the logging
# helpers are responsible for interpreting them when printing.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Print a normal progress message to stdout, prefixed with a green tag.
#   $1 - message text; printed verbatim (backslashes are not interpreted)
# The color variables are expanded with %b so their escape codes take effect.
log() { printf '%b[openshift-metrics]%b %s\n' "${GREEN-}" "${NC-}" "${1-}"; }
# Print a warning to stdout, prefixed with a yellow tag.
#   $1 - message text; printed verbatim (backslashes are not interpreted)
warn() { printf '%b[openshift-metrics]%b %s\n' "${YELLOW-}" "${NC-}" "${1-}"; }
# Print an error message to stderr, prefixed with a red tag.
# Does not exit; callers decide whether to terminate.
#   $1 - message text; printed verbatim (backslashes are not interpreted)
error() { printf '%b[openshift-metrics]%b %s\n' "${RED-}" "${NC-}" "${1-}" >&2; }
# Print an informational note (used for dry-run output) to stdout, prefixed
# with a blue tag.
#   $1 - message text; printed verbatim (backslashes are not interpreted)
info() { printf '%b[openshift-metrics]%b %s\n' "${BLUE-}" "${NC-}" "${1-}"; }

#------------------------------------------------------------------------------
# Usage
#------------------------------------------------------------------------------
# Print the help text to stdout and terminate the script with status 0.
# Reads: VERSION and $0 (both are expanded inside the here-document).
# Note: this function never returns to its caller.
usage() {
  cat <<EOF
configure-openshift-metrics.sh v${VERSION}

Configure an external Prometheus to receive OpenShift metrics.

Usage:
  sudo $0 [OPTIONS]

Options:
  --method METHOD           federation or remote-write (default: federation)
  --openshift-url URL       OpenShift Prometheus route hostname (federation)
  --prometheus-url URL      External Prometheus URL (remote-write)
  --cluster-name NAME       Label for metrics (default: openshift)
  --prometheus-config PATH  Path to prometheus.yml (default: /etc/prometheus/prometheus.yml)
  --rules-dir PATH          Directory for rule files (default: /etc/prometheus/rules)
  --token-file PATH         Bearer token file path (default: /etc/prometheus/openshift-token)
  --skip-openshift          Skip oc commands (use existing token)
  --skip-rules              Skip recording/alert rule generation
  --dry-run                 Show what would be done without making changes
  --help                    Show this help message

Federation example:
  sudo $0 --method federation \\
    --openshift-url prometheus-k8s-openshift-monitoring.apps.cluster.example.com \\
    --cluster-name production

Remote write example:
  sudo $0 --method remote-write \\
    --prometheus-url https://prometheus.example.com:9090 \\
    --cluster-name production
EOF
  exit 0
}

#------------------------------------------------------------------------------
# Parse arguments
#------------------------------------------------------------------------------
# Parse command-line options into the global settings variables.
# Arguments: the script's own argument list ("$@").
# Globals written: METHOD, OPENSHIFT_URL, PROMETHEUS_URL, CLUSTER_NAME,
#                  PROMETHEUS_CONFIG, RULES_DIR, TOKEN_FILE, SKIP_OPENSHIFT,
#                  SKIP_RULES, DRY_RUN.
# Exits 1 on an unknown option or when an option that takes a value is the
# last argument (previously that case died with an "unbound variable" error
# under `set -u`). --help/-h delegates to usage, which exits 0.
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --method | --openshift-url | --prometheus-url | --cluster-name | --prometheus-config | --rules-dir | --token-file)
        if [[ $# -lt 2 ]]; then
          error "Option $opt requires a value"
          echo "Use --help for usage."
          exit 1
        fi
        case "$opt" in
          --method)            METHOD="$2" ;;
          --openshift-url)     OPENSHIFT_URL="$2" ;;
          --prometheus-url)    PROMETHEUS_URL="$2" ;;
          --cluster-name)      CLUSTER_NAME="$2" ;;
          --prometheus-config) PROMETHEUS_CONFIG="$2" ;;
          --rules-dir)         RULES_DIR="$2" ;;
          --token-file)        TOKEN_FILE="$2" ;;
        esac
        shift 2
        ;;
      --skip-openshift) SKIP_OPENSHIFT=true; shift ;;
      --skip-rules)     SKIP_RULES=true; shift ;;
      --dry-run)        DRY_RUN=true; shift ;;
      --help | -h)      usage ;;
      *)
        error "Unknown option: $opt"
        echo "Use --help for usage."
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

#------------------------------------------------------------------------------
# Validation
#------------------------------------------------------------------------------
# Check the parsed settings and the runtime environment before any change is
# made. Each failed check prints an error and exits the script with status 1.
# Globals read: METHOD, OPENSHIFT_URL, PROMETHEUS_URL, PROMETHEUS_CONFIG,
#               SKIP_OPENSHIFT, EUID.
# A missing promtool only produces a warning, because validation is skipped
# later when it is absent.
validate() {
  case "$METHOD" in
    federation | remote-write) ;;
    *)
      error "Invalid method: $METHOD (must be federation or remote-write)"
      exit 1
      ;;
  esac

  if [[ "$METHOD" == "federation" && -z "$OPENSHIFT_URL" ]]; then
    error "--openshift-url is required for federation mode"
    exit 1
  fi

  if [[ "$METHOD" == "remote-write" && -z "$PROMETHEUS_URL" ]]; then
    error "--prometheus-url is required for remote-write mode"
    exit 1
  fi

  if [[ "$EUID" -ne 0 ]]; then
    error "This script must be run as root or with sudo"
    exit 1
  fi

  if [[ ! -f "$PROMETHEUS_CONFIG" ]]; then
    error "Prometheus config not found: $PROMETHEUS_CONFIG"
    exit 1
  fi

  if ! command -v promtool &>/dev/null; then
    warn "promtool not found — config validation will be skipped"
  fi

  # oc is only needed when the script manages the service account itself.
  if [[ "$SKIP_OPENSHIFT" == false ]] && ! command -v oc &>/dev/null; then
    error "oc CLI not found. Install it or use --skip-openshift with an existing token"
    exit 1
  fi
}

#------------------------------------------------------------------------------
# Backup existing config
#------------------------------------------------------------------------------
# Copy the current Prometheus config to a timestamped file in a "backups"
# directory next to it (prometheus.yml.YYYY-MM-DD_HHMMSS).
# Globals read: PROMETHEUS_CONFIG, DRY_RUN.
# In dry-run mode nothing is touched on disk: the backup directory is no
# longer created, only the intended action is reported.
backup_config() {
  local backup_dir timestamp backup_file
  backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"
  timestamp=$(date +%F_%H%M%S)
  backup_file="${backup_dir}/prometheus.yml.${timestamp}"

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would backup $PROMETHEUS_CONFIG to $backup_file"
    return 0
  fi

  mkdir -p "$backup_dir"
  cp "$PROMETHEUS_CONFIG" "$backup_file"
  log "Backed up config to $backup_file"
}

#------------------------------------------------------------------------------
# OpenShift: Create service account and token
#------------------------------------------------------------------------------
# Make sure a bearer token for scraping OpenShift exists in TOKEN_FILE.
#
# With --skip-openshift the existing token file is required and nothing else
# happens. Otherwise the function uses the logged-in `oc` session to create the
# service account, bind it to cluster-monitoring-view and request a token.
#
# Globals read: SKIP_OPENSHIFT, DRY_RUN, TOKEN_FILE, SA_NAME, OC_NAMESPACE,
#               TOKEN_DURATION, PROMETHEUS_USER.
# Exits 1 when the token file is missing (skip mode), when `oc` is not logged
# in, or when no token could be obtained.
setup_openshift_sa() {
  if [[ "$SKIP_OPENSHIFT" == true ]]; then
    if [[ ! -f "$TOKEN_FILE" ]]; then
      error "No token found at $TOKEN_FILE. Provide a token or remove --skip-openshift."
      exit 1
    fi
    log "Using existing token from $TOKEN_FILE"
    return 0
  fi

  log "Setting up OpenShift service account..."

  # Check oc is logged in
  if ! oc whoami &>/dev/null; then
    error "Not logged into OpenShift. Run: oc login <cluster-url>"
    exit 1
  fi

  local cluster_info
  cluster_info=$(oc whoami --show-server 2>/dev/null || echo "unknown")
  log "Connected to: $cluster_info"

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would create service account $SA_NAME in $OC_NAMESPACE"
    info "[dry-run] Would grant cluster-monitoring-view role"
    info "[dry-run] Would generate token with duration $TOKEN_DURATION"
    return 0
  fi

  # Create service account (ignore if exists)
  if oc get serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" &>/dev/null; then
    warn "Service account $SA_NAME already exists in $OC_NAMESPACE"
  else
    oc create serviceaccount "$SA_NAME" -n "$OC_NAMESPACE"
    log "Created service account: $SA_NAME"
  fi

  # Grant cluster-monitoring-view role. The binding is created under an
  # explicit name so that the existence check above it looks for the same
  # object the grant produces.
  local binding_name="${SA_NAME}-monitoring-view"
  if oc get clusterrolebinding "$binding_name" &>/dev/null; then
    warn "Role binding already exists"
  else
    oc adm policy add-cluster-role-to-user cluster-monitoring-view \
      -z "$SA_NAME" -n "$OC_NAMESPACE" \
      --rolebinding-name="$binding_name"
    log "Granted cluster-monitoring-view role"
  fi

  # Generate token
  local token
  token=$(oc create token "$SA_NAME" -n "$OC_NAMESPACE" --duration="$TOKEN_DURATION")
  if [[ -z "$token" ]]; then
    error "Failed to obtain a token for service account $SA_NAME"
    exit 1
  fi

  # Write under a restrictive umask so a newly created file is never readable
  # by others; chmod afterwards also tightens a pre-existing file.
  (
    umask 077
    printf '%s\n' "$token" > "$TOKEN_FILE"
  )
  chmod 600 "$TOKEN_FILE"
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$TOKEN_FILE"
  log "Token saved to $TOKEN_FILE (expires in $TOKEN_DURATION)"
}

#------------------------------------------------------------------------------
# Generate federation scrape config
#------------------------------------------------------------------------------
# Print the "openshift-federate" scrape job as a YAML list item on stdout.
# The output starts with a blank line and is indented as an entry of a
# two-space-indented scrape_configs list, because the caller appends it to
# the end of prometheus.yml.
# Globals read: TOKEN_FILE, OPENSHIFT_URL, CLUSTER_NAME.
# The token path and the cluster label are emitted as quoted YAML strings so
# values such as "123", "true" or names containing ": " stay plain strings.
generate_federation_config() {
  cat <<YAML

  - job_name: "openshift-federate"
    honor_labels: true
    metrics_path: /federate
    scrape_interval: 30s
    scrape_timeout: 25s
    params:
      'match[]':
        - '{job="node-exporter"}'
        - '{job="kube-state-metrics"}'
        - '{__name__=~"container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes"}'
        - '{__name__=~"etcd_server_leader_changes_seen_total|etcd_disk_wal_fsync_duration_seconds_bucket|etcd_mvcc_db_total_size_in_bytes"}'
        - '{__name__=~"apiserver_request_total|apiserver_request_duration_seconds_bucket"}'
        - '{__name__="up"}'
    scheme: https
    bearer_token_file: "${TOKEN_FILE}"
    tls_config:
      insecure_skip_verify: true
    static_configs:
      - targets:
          - "${OPENSHIFT_URL}"
        labels:
          cluster: "${CLUSTER_NAME}"
YAML
}

#------------------------------------------------------------------------------
# Generate recording rules
#------------------------------------------------------------------------------
# Print a Prometheus rule file with recording rules for the cluster on stdout.
# Globals read: CLUSTER_NAME (used as the value of the `cluster` matcher).
# The running-pod rule sums kube_pod_status_phase instead of counting its
# series: the metric exposes one series per phase with value 0 or 1, so a
# count would include every pod regardless of its actual phase.
generate_recording_rules() {
  cat <<YAML
# OpenShift recording rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_recording_rules
    interval: 30s
    rules:
      - record: openshift:node_cpu_utilization:ratio
        expr: |
          1 - avg by(instance, cluster) (
            rate(node_cpu_seconds_total{mode="idle", cluster="${CLUSTER_NAME}"}[5m])
          )

      - record: openshift:node_memory_utilization:ratio
        expr: |
          1 - (
            node_memory_MemAvailable_bytes{cluster="${CLUSTER_NAME}"}
            / node_memory_MemTotal_bytes{cluster="${CLUSTER_NAME}"}
          )

      - record: openshift:namespace_pod_count:sum
        expr: |
          sum by(namespace, cluster) (
            kube_pod_status_phase{phase="Running", cluster="${CLUSTER_NAME}"}
          )

      - record: openshift:namespace_cpu_usage:sum
        expr: |
          sum by(namespace, cluster) (
            rate(container_cpu_usage_seconds_total{cluster="${CLUSTER_NAME}", container!=""}[5m])
          )

      - record: openshift:namespace_memory_usage:sum
        expr: |
          sum by(namespace, cluster) (
            container_memory_working_set_bytes{cluster="${CLUSTER_NAME}", container!=""}
          )
YAML
}

#------------------------------------------------------------------------------
# Generate alert rules
#------------------------------------------------------------------------------
# Print a Prometheus rule file with alerting rules for the cluster on stdout.
# Globals read: CLUSTER_NAME (used as the value of the `cluster` matcher).
# Template placeholders are written as \$labels / \$value so the shell leaves
# them for Prometheus to expand.
# The crash-loop alert uses increase() over 15 minutes so that the reported
# value is the restart count its description talks about.
generate_alert_rules() {
  cat <<YAML
# OpenShift alert rules — generated by configure-openshift-metrics.sh
# Cluster: ${CLUSTER_NAME}
groups:
  - name: openshift_alerts
    rules:
      - alert: OpenShiftFederationDown
        expr: up{job="openshift-federate", cluster="${CLUSTER_NAME}"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "OpenShift federation target is down"
          description: "Cannot scrape metrics from OpenShift cluster {{ \$labels.cluster }} for 5 minutes."

      - alert: OpenShiftNodeHighCPU
        expr: openshift:node_cpu_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU on OpenShift node {{ \$labels.instance }}"
          description: "CPU usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."

      - alert: OpenShiftNodeHighMemory
        expr: openshift:node_memory_utilization:ratio > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory on OpenShift node {{ \$labels.instance }}"
          description: "Memory usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})."

      - alert: OpenShiftPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total{cluster="${CLUSTER_NAME}"}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ \$labels.namespace }}/{{ \$labels.pod }} is crash looping"
          description: "Pod has restarted {{ \$value | humanize }} times in the last 15 minutes."

      - alert: OpenShiftDeploymentReplicasMismatch
        expr: |
          kube_deployment_spec_replicas{cluster="${CLUSTER_NAME}"}
          != kube_deployment_status_ready_replicas{cluster="${CLUSTER_NAME}"}
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Deployment {{ \$labels.namespace }}/{{ \$labels.deployment }} replica mismatch"
          description: "Deployment does not have expected number of ready replicas."

      - alert: OpenShiftEtcdLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{cluster="${CLUSTER_NAME}"}[1h]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent etcd leader changes on {{ \$labels.cluster }}"
          description: "etcd leader changed {{ \$value | humanize }} times in the last hour."
YAML
}

#------------------------------------------------------------------------------
# Apply federation configuration
#------------------------------------------------------------------------------
# Configure the local Prometheus to pull metrics from OpenShift via /federate.
#
# Steps: obtain the bearer token, back up prometheus.yml, append the
# "openshift-federate" scrape job, generate rule files (unless --skip-rules)
# and finally validate and reload Prometheus.
#
# Globals read: OPENSHIFT_URL, PROMETHEUS_CONFIG, PROMETHEUS_USER, DRY_RUN,
#               SKIP_RULES.
# Returns 1 (after the token and backup steps have already run) when the job
# is already present in the config. In dry-run mode the same condition is
# reported as a warning so the preview matches what a real run would do.
# NOTE(review): the job is appended to the end of the file, which only lands
# in the right place when scrape_configs is the last top-level section.
apply_federation() {
  log "Configuring federation from $OPENSHIFT_URL..."

  # Set up OpenShift service account and token
  setup_openshift_sa

  # Backup existing config
  backup_config

  # Generate the federation scrape config
  local federation_config
  federation_config=$(generate_federation_config)

  # Check if the job already exists
  local job_exists=false
  if grep -qF 'job_name: "openshift-federate"' "$PROMETHEUS_CONFIG" 2>/dev/null; then
    job_exists=true
    warn "Federation job 'openshift-federate' already exists in $PROMETHEUS_CONFIG"
    warn "Remove the existing job first or edit it manually."
  fi

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would append to $PROMETHEUS_CONFIG:"
    printf '%s\n' "$federation_config"
  else
    if [[ "$job_exists" == true ]]; then
      return 1
    fi

    printf '%s\n' "$federation_config" >> "$PROMETHEUS_CONFIG"
    chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG"
    log "Federation scrape job added to $PROMETHEUS_CONFIG"
  fi

  # Generate rules
  if [[ "$SKIP_RULES" == false ]]; then
    generate_rules
  fi

  # Validate and reload
  validate_and_reload
}

#------------------------------------------------------------------------------
# Apply remote write configuration
#------------------------------------------------------------------------------
# Prepare the local Prometheus to receive remote-write traffic and print the
# commands that must be run on the OpenShift side.
#
# Steps: back up prometheus.yml, check the systemd unit for the remote-write
# receiver flag (only reported, never edited), generate basic-auth
# credentials, create web.yml with the bcrypt hash when htpasswd is
# available, print the `oc` commands and the cluster-monitoring-config
# ConfigMap, generate rule files (unless --skip-rules), then validate and
# reload Prometheus.
#
# Globals read: PROMETHEUS_URL, PROMETHEUS_SERVICE, PROMETHEUS_CONFIG,
#               PROMETHEUS_USER, CLUSTER_NAME, DRY_RUN, SKIP_RULES.
#
# Notes:
#   - The generated password is printed on purpose: it has to be copied into
#     the OpenShift secret.
#   - A trailing slash on PROMETHEUS_URL is dropped before /api/v1/write is
#     appended.
#   - The printed ConfigMap adds a `cluster` label with CLUSTER_NAME to every
#     sample, since the generated rules and the summary test query filter on
#     that label.
apply_remote_write() {
  log "Configuring remote write to $PROMETHEUS_URL..."

  # Backup existing config
  backup_config

  # Enable remote write receiver
  local service_file="/etc/systemd/system/${PROMETHEUS_SERVICE}.service"
  if [[ -f "$service_file" ]]; then
    if grep -q -- "web.enable-remote-write-receiver" "$service_file"; then
      log "Remote write receiver already enabled"
    elif [[ "$DRY_RUN" == true ]]; then
      info "[dry-run] Would add --web.enable-remote-write-receiver to $service_file"
    else
      warn "You need to add --web.enable-remote-write-receiver to your Prometheus service."
      warn "Edit $service_file and add the flag to ExecStart, then run:"
      warn "  sudo systemctl daemon-reload && sudo systemctl restart ${PROMETHEUS_SERVICE}"
      echo ""
    fi
  fi

  # Generate basic auth credentials
  local rw_user="openshift"
  local rw_password
  rw_password=$(openssl rand -base64 24 2>/dev/null || head -c 24 /dev/urandom | base64)

  log "Generated remote write credentials:"
  log "  Username: $rw_user"
  log "  Password: $rw_password"
  echo ""

  # Generate web.yml with basic auth
  local web_config_file
  web_config_file="$(dirname "$PROMETHEUS_CONFIG")/web.yml"

  if command -v htpasswd &>/dev/null; then
    # Feed the password on stdin (-i) so it does not show up in the process
    # list; htpasswd prints ":<hash>" for the empty user name.
    local hash
    hash=$(printf '%s\n' "$rw_password" | htpasswd -niBC 12 "" | tr -d ':\n')

    if [[ "$DRY_RUN" == true ]]; then
      info "[dry-run] Would create $web_config_file with basic_auth_users"
    elif [[ -f "$web_config_file" ]]; then
      warn "$web_config_file already exists — add this entry manually:"
      echo "  $rw_user: \"$hash\""
    else
      # Create the file with a restrictive umask so the hash is never
      # readable by other users, not even briefly.
      (
        umask 077
        cat > "$web_config_file" <<EOF
basic_auth_users:
  ${rw_user}: "${hash}"
EOF
      )
      chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$web_config_file"
      chmod 600 "$web_config_file"
      log "Created $web_config_file"
    fi
  else
    warn "htpasswd not found — install apache2-utils (Debian) or httpd-tools (RHEL)"
    warn "Then generate a hash: htpasswd -nbBC 12 '' 'PASSWORD'"
  fi

  local write_url="${PROMETHEUS_URL%/}/api/v1/write"

  # Print OpenShift-side commands
  echo ""
  log "Run the following on your OpenShift cluster:"
  echo ""
  echo "  # Create the auth secret"
  echo "  oc create secret generic remote-write-auth \\"
  echo "    -n openshift-monitoring \\"
  echo "    --from-literal=username=${rw_user} \\"
  echo "    --from-literal=password='${rw_password}'"
  echo ""
  echo "  # Apply the remote write config"
  echo "  oc apply -f - <<'OCEOF'"

  cat <<OCEOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    prometheusK8s:
      remoteWrite:
        - url: "${write_url}"
          basicAuth:
            username:
              name: remote-write-auth
              key: username
            password:
              name: remote-write-auth
              key: password
          tlsConfig:
            insecureSkipVerify: true
          writeRelabelConfigs:
            - sourceLabels: [__name__]
              regex: "node_.*|kube_.*|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_.*_bytes_total|container_fs_.*_bytes|etcd_.*|apiserver_request_total|apiserver_request_duration_seconds_bucket|up"
              action: keep
            - regex: "prometheus_replica"
              action: labeldrop
            - targetLabel: cluster
              replacement: "${CLUSTER_NAME}"
              action: replace
          queueConfig:
            maxSamplesPerSend: 5000
            batchSendDeadline: 5s
            maxShards: 10
OCEOF
  echo "OCEOF"
  echo ""

  # Generate rules
  if [[ "$SKIP_RULES" == false ]]; then
    generate_rules
  fi

  # Validate and reload
  validate_and_reload
}

#------------------------------------------------------------------------------
# Generate recording and alert rules
#------------------------------------------------------------------------------
# Write the recording and alert rule files into RULES_DIR
# (openshift-rules.yml and openshift-alerts.yml), owned by PROMETHEUS_USER.
# In dry-run mode both files are printed to stdout instead and nothing is
# written.
# Globals read: RULES_DIR, PROMETHEUS_USER, PROMETHEUS_CONFIG, DRY_RUN.
# A warning is printed when the Prometheus config does not mention RULES_DIR,
# because this script never adds a rule_files entry itself.
generate_rules() {
  local rules_file="$RULES_DIR/openshift-rules.yml"
  local alerts_file="$RULES_DIR/openshift-alerts.yml"

  log "Generating recording and alert rules..."

  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would create $rules_file"
    info "[dry-run] Would create $alerts_file"
    echo ""
    info "Recording rules:"
    generate_recording_rules
    echo ""
    info "Alert rules:"
    generate_alert_rules
    return 0
  fi

  mkdir -p "$RULES_DIR"

  generate_recording_rules > "$rules_file"
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$rules_file"
  log "Created $rules_file"

  generate_alert_rules > "$alerts_file"
  chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$alerts_file"
  log "Created $alerts_file"

  # Plain-text heuristic: a config that references the directory through a
  # relative path will also trigger this hint.
  if ! grep -qF -- "$RULES_DIR" "$PROMETHEUS_CONFIG" 2>/dev/null; then
    warn "$PROMETHEUS_CONFIG does not appear to reference $RULES_DIR — add it under rule_files so these rules are loaded"
  fi
}

#------------------------------------------------------------------------------
# Validate config and reload Prometheus
#------------------------------------------------------------------------------
# Check the Prometheus config (and generated rule files) with promtool when it
# is installed, then reload the service, falling back to a restart.
#
# Globals read: DRY_RUN, PROMETHEUS_CONFIG, RULES_DIR, SKIP_RULES,
#               PROMETHEUS_SERVICE.
# On a failed config check the newest timestamped backup is copied back over
# the config and the script exits 1. A failed rule check exits 1 as well.
# Backups are named prometheus.yml.YYYY-MM-DD_HHMMSS, so the glob's sorted
# order is chronological and the last match is the newest one; this replaces
# parsing the output of `ls -t`.
validate_and_reload() {
  if [[ "$DRY_RUN" == true ]]; then
    info "[dry-run] Would validate config and reload Prometheus"
    return 0
  fi

  # Validate with promtool
  if command -v promtool &>/dev/null; then
    log "Validating Prometheus configuration..."

    if ! promtool check config "$PROMETHEUS_CONFIG"; then
      error "Config validation failed. Restoring backup..."

      local backup_dir candidate
      local latest_backup=""
      backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"
      for candidate in "$backup_dir"/prometheus.yml.[0-9]*; do
        # An unmatched glob stays literal; the -f test filters it out.
        if [[ -f "$candidate" ]]; then
          latest_backup="$candidate"
        fi
      done

      if [[ -n "$latest_backup" ]]; then
        cp "$latest_backup" "$PROMETHEUS_CONFIG"
        log "Restored from $latest_backup"
      fi
      exit 1
    fi
    log "Config validation passed"

    # Validate rules
    if [[ "$SKIP_RULES" == false ]]; then
      local rule_file
      for rule_file in "$RULES_DIR"/openshift-*.yml; do
        if [[ -f "$rule_file" ]] && ! promtool check rules "$rule_file"; then
          error "Rule validation failed: $rule_file"
          exit 1
        fi
      done
      log "Rule validation passed"
    fi
  fi

  # Reload Prometheus
  if systemctl is-active --quiet "$PROMETHEUS_SERVICE"; then
    systemctl reload "$PROMETHEUS_SERVICE" 2>/dev/null \
      || systemctl restart "$PROMETHEUS_SERVICE"
    log "Prometheus reloaded"
  else
    warn "Prometheus service is not running. Start it with: sudo systemctl start $PROMETHEUS_SERVICE"
  fi
}

#------------------------------------------------------------------------------
# Print summary
#------------------------------------------------------------------------------
# Print a short report of what was configured, plus hints for verifying it.
# Globals read: METHOD, CLUSTER_NAME, OPENSHIFT_URL, TOKEN_FILE,
#               PROMETHEUS_URL, PROMETHEUS_CONFIG, RULES_DIR, SKIP_RULES.
# Federation runs show the OpenShift URL and token file; remote-write runs
# show the Prometheus URL instead. The rules directory is only listed when
# rules were generated.
print_summary() {
  echo ""
  echo "============================================"
  echo "  OpenShift Metrics Configuration Complete"
  echo "============================================"
  echo ""
  echo "  Method:         $METHOD"
  echo "  Cluster name:   $CLUSTER_NAME"

  if [[ "$METHOD" == "federation" ]]; then
    echo "  OpenShift URL:  $OPENSHIFT_URL"
    echo "  Token file:     $TOKEN_FILE"
  else
    echo "  Prometheus URL: $PROMETHEUS_URL"
  fi

  echo "  Config file:    $PROMETHEUS_CONFIG"

  if [[ "$SKIP_RULES" == false ]]; then
    echo "  Rules dir:      $RULES_DIR"
  fi

  echo ""
  echo "  Verify:"
  echo "    - Check targets: http://localhost:9090/targets"

  if [[ "$METHOD" == "federation" ]]; then
    echo "    - Test query:    node_memory_MemAvailable_bytes{cluster=\"${CLUSTER_NAME}\"}"
  else
    echo "    - Test query:    up{cluster=\"${CLUSTER_NAME}\"}"
  fi

  echo ""
}

#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------
# Entry point: print the banner, validate the settings, run the selected
# method and print the summary.
# Globals read: VERSION, DRY_RUN, METHOD.
# The summary is skipped in dry-run mode because nothing was changed.
# validate has already rejected any METHOD other than the two handled here.
main() {
  echo ""
  log "configure-openshift-metrics.sh v${VERSION}"
  echo ""

  validate

  if [[ "$DRY_RUN" == true ]]; then
    warn "DRY RUN — no changes will be made"
    echo ""
  fi

  case "$METHOD" in
    federation)   apply_federation ;;
    remote-write) apply_remote_write ;;
  esac

  if [[ "$DRY_RUN" != true ]]; then
    print_summary
  fi

  log "Done."
}

# Run the script. Options were already parsed at file level; the argument
# list is forwarded only by convention.
main "$@"