a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
441 lines
16 KiB
Bash
Executable File
441 lines
16 KiB
Bash
Executable File
#!/bin/bash
################################################################################
# Script Name: gpu-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for NVIDIA GPU metrics — temperature,
#              utilization, VRAM usage, power draw, fan speed, clock speeds,
#              performance state, and per-process GPU memory via nvidia-smi
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - NVIDIA GPU with drivers installed
#   - nvidia-smi available in PATH
#   - netcat (nc) for HTTP mode
#
# Usage:
#   ./gpu-exporter.sh                    # stdout
#   ./gpu-exporter.sh --http -p 9195     # HTTP server
#   ./gpu-exporter.sh --textfile         # node_exporter textfile
#
# Metrics Exported:
#   - gpu_info{gpu,name,driver_version,cuda_version} - GPU info
#   - gpu_count - Number of GPUs detected
#   - gpu_temperature_celsius{gpu} - Temperature
#   - gpu_utilization_percent{gpu} - GPU utilization
#   - gpu_memory_utilization_percent{gpu} - Memory utilization
#   - gpu_memory_used_bytes{gpu} - VRAM used
#   - gpu_memory_total_bytes{gpu} - Total VRAM
#   - gpu_memory_free_bytes{gpu} - Free VRAM
#   - gpu_power_draw_watts{gpu} - Power draw
#   - gpu_power_limit_watts{gpu} - Power limit
#   - gpu_fan_speed_percent{gpu} - Fan speed
#   - gpu_clock_speed_mhz{gpu} - GPU clock
#   - gpu_memory_clock_speed_mhz{gpu} - Memory clock
#   - gpu_pstate{gpu} - Performance state
#   - gpu_process_memory_bytes{gpu,pid,process_name} - Per-process memory
#   - gpu_exporter_duration_seconds - Script execution time
#   - gpu_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9195
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# Strict mode: abort on unhandled command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Directory used for the metrics file when --textfile is given.
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file for metrics; empty means "print to stdout".
OUTPUT_FILE=""
# Set to true by --http to serve metrics over HTTP instead of writing them.
HTTP_MODE=false
# TCP port used in --http mode (overridden by -p/--port).
HTTP_PORT=9195

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help text and terminate the script.
# Globals:   HTTP_PORT (read) - shown as the port used by --http
# Arguments: none
# Outputs:   usage text on stdout
# Returns:   never returns; exits with status 0
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Export NVIDIA GPU statistics as Prometheus metrics via nvidia-smi.

MODES:
    (default)           Print metrics to stdout
    --textfile          Write to node_exporter textfile collector
    --http              Run HTTP server on port ${HTTP_PORT:-9195}

OPTIONS:
    -p, --port          HTTP port (default: 9195)
    -o, --output        Output file path
    -h, --help          Show this help and exit

EXAMPLES:
    $0 --textfile              # Write to textfile collector
    $0 --http --port 9195      # Run HTTP server
    $0 -o /tmp/gpu.prom        # Write to custom file

EOF
    exit 0
}

# Parse command-line options into the global configuration variables.
# Globals:   TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written)
# Arguments: "$@" - the script's command-line arguments
# Outputs:   error messages on stderr
# Returns:   0 on success. Exits 1 on an unknown option, a missing option
#            value, or an invalid port. -h/--help hands over to show_usage.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/gpu.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # ${2:-} instead of $2: under `set -u` a missing value would
                # otherwise abort with a bare "unbound variable" message.
                local port_arg="${2:-}"
                if [[ -z "$port_arg" ]]; then
                    echo "ERROR: $1 requires a port number" >&2
                    exit 1
                fi
                # Digit count is capped so the arithmetic below cannot overflow;
                # 10# forces base 10 so "0080" is not read as octal.
                if [[ ! "$port_arg" =~ ^[0-9]{1,5}$ ]] \
                    || (( 10#$port_arg < 1 || 10#$port_arg > 65535 )); then
                    echo "ERROR: invalid port: $port_arg" >&2
                    exit 1
                fi
                HTTP_PORT="$port_arg"
                shift 2
                ;;
            -o|--output)
                if [[ -z "${2:-}" ]]; then
                    echo "ERROR: $1 requires a file path" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Escape special characters in Prometheus label values
# Args: $1 - string to escape
# Returns: escaped string safe for Prometheus labels (on stdout, one line)
prom_escape() {
    local val="${1-}"
    # Backslashes first, so the backslash added for quotes is not doubled.
    val="${val//\\/\\\\}"
    val="${val//\"/\\\"}"
    # A label value must stay on one line; newlines are dropped.
    val=${val//$'\n'/}
    # printf, not echo: echo would treat values such as "-n" or "-e" as
    # options and print nothing.
    printf '%s\n' "$val"
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Print the gpu_count metric family.
# Args: $1 - number of GPUs to report
_emit_gpu_count() {
    printf '# HELP gpu_count Number of NVIDIA GPUs detected\n'
    printf '# TYPE gpu_count gauge\n'
    printf 'gpu_count %s\n' "$1"
}

# Helper: emit a metric block (HELP/TYPE + one sample per GPU) for all GPUs
# Args: $1=metric_name, $2=help_text, $3=query_field,
#       $4=optional integer multiplier applied to each value (default 1)
# Rows whose GPU index is not a number are skipped, so a failed or empty
# query produces no sample line. Values that are not numeric (for example
# "[N/A]") are reported as 0.
emit_gpu_metric() {
    local metric="$1" help="$2" query="$3" scale="${4:-1}"
    local rows idx val

    echo "# HELP $metric $help"
    echo "# TYPE $metric gauge"

    # "|| true": a failing nvidia-smi must not abort the run under `set -e`.
    rows=$(nvidia-smi --query-gpu=index,"$query" \
        --format=csv,noheader,nounits 2>/dev/null) || true

    while IFS=',' read -r idx val; do
        idx=${idx//[[:space:]]/}
        val=${val//[[:space:]]/}
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        [[ "$val" =~ ^-?[0-9]+([.][0-9]+)?$ ]] || val=0
        if [[ "$scale" != "1" ]]; then
            if [[ "$val" =~ ^[0-9]+$ ]]; then
                # 10# keeps a zero-padded value from being read as octal.
                val=$(( 10#$val * scale ))
            else
                # Values are passed with -v, never spliced into the program.
                val=$(awk -v v="$val" -v s="$scale" \
                    'BEGIN { printf "%.0f", v * s }')
            fi
        fi
        echo "${metric}{gpu=\"$idx\"} $val"
    done <<< "$rows"

    echo ""
}

# Helper: emit a memory metric (MiB → bytes) for all GPUs
# Args: $1=metric_name, $2=help_text, $3=query_field
emit_gpu_mem_metric() {
    emit_gpu_metric "$1" "$2" "$3" 1048576
}

# Generate all Prometheus metrics
# Globals:  none
# Outputs:  Prometheus text format metrics on stdout
# Returns:  0. When nvidia-smi is missing, fails, or reports no GPU, only
#           "gpu_count 0" is printed.
generate_metrics() {
    local script_start script_end script_duration
    local raw rows idx
    script_start=$(date +%s)

    # Check nvidia-smi exists
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        _emit_gpu_count 0
        return 0
    fi

    # ========================================================================
    # GPU COUNT
    # ========================================================================

    local gpu_count
    raw=$(nvidia-smi --query-gpu=count \
        --format=csv,noheader,nounits 2>/dev/null) || true
    gpu_count=${raw%%$'\n'*}              # first line only
    gpu_count=${gpu_count//[[:space:]]/}
    # Anything that is not a plain number (empty output, error text) is
    # treated as "no GPU".
    [[ "$gpu_count" =~ ^[0-9]+$ ]] || gpu_count=0

    if (( 10#$gpu_count == 0 )); then
        _emit_gpu_count 0
        return 0
    fi

    _emit_gpu_count "$gpu_count"
    echo ""

    # ========================================================================
    # GPU INFO (driver + CUDA version)
    # ========================================================================

    local driver_version cuda_version
    raw=$(nvidia-smi --query-gpu=driver_version \
        --format=csv,noheader,nounits 2>/dev/null) || true
    driver_version=${raw%%$'\n'*}
    driver_version=${driver_version//[[:space:]]/}
    driver_version=${driver_version:-"unknown"}

    raw=$(nvidia-smi --query-gpu=cuda_version \
        --format=csv,noheader 2>/dev/null) || true
    cuda_version=${raw%%$'\n'*}
    cuda_version=${cuda_version//[[:space:]]/}

    # Fallback: parse from nvidia-smi header if the query did not return
    # something that looks like a version number.
    if [[ ! "$cuda_version" =~ ^[0-9]+([.][0-9]+)*$ ]]; then
        local cuda_re='CUDA Version:[[:space:]]*([0-9.]+)'
        raw=$(nvidia-smi 2>/dev/null) || true
        if [[ "$raw" =~ $cuda_re ]]; then
            cuda_version="${BASH_REMATCH[1]}"
        else
            cuda_version="unknown"
        fi
    fi

    echo "# HELP gpu_info GPU information labels"
    echo "# TYPE gpu_info gauge"

    # ========================================================================
    # GPU INFO LABELS
    # ========================================================================

    local g_name esc_driver esc_cuda
    # Same for every GPU, so escape once outside the loop.
    esc_driver=$(prom_escape "$driver_version")
    esc_cuda=$(prom_escape "$cuda_version")

    rows=$(nvidia-smi --query-gpu=index,name \
        --format=csv,noheader 2>/dev/null) || true
    while IFS=',' read -r idx g_name; do
        idx=${idx//[[:space:]]/}
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        # Trim leading/trailing whitespace; inner spaces are part of the name.
        g_name=${g_name#"${g_name%%[![:space:]]*}"}
        g_name=${g_name%"${g_name##*[![:space:]]}"}
        echo "gpu_info{gpu=\"$idx\",name=\"$(prom_escape "$g_name")\",driver_version=\"$esc_driver\",cuda_version=\"$esc_cuda\"} 1"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # OUTPUT PER-GPU METRICS (with HELP/TYPE headers)
    # ========================================================================

    emit_gpu_metric "gpu_temperature_celsius" "GPU temperature in degrees Celsius" "temperature.gpu"
    emit_gpu_metric "gpu_utilization_percent" "GPU core utilization percentage" "utilization.gpu"
    emit_gpu_metric "gpu_memory_utilization_percent" "GPU memory utilization percentage" "utilization.memory"
    emit_gpu_mem_metric "gpu_memory_used_bytes" "GPU memory used in bytes" "memory.used"
    emit_gpu_mem_metric "gpu_memory_total_bytes" "GPU total memory in bytes" "memory.total"
    emit_gpu_mem_metric "gpu_memory_free_bytes" "GPU free memory in bytes" "memory.free"
    emit_gpu_metric "gpu_power_draw_watts" "GPU power draw in watts" "power.draw"
    emit_gpu_metric "gpu_power_limit_watts" "GPU power limit in watts" "power.limit"
    emit_gpu_metric "gpu_fan_speed_percent" "GPU fan speed percentage" "fan.speed"
    emit_gpu_metric "gpu_clock_speed_mhz" "GPU graphics clock speed in MHz" "clocks.current.graphics"
    emit_gpu_metric "gpu_memory_clock_speed_mhz" "GPU memory clock speed in MHz" "clocks.current.memory"

    # Performance state needs special handling (P0 → 0, P8 → 8, etc.)
    echo "# HELP gpu_pstate GPU performance state (0=max, 12=min)"
    echo "# TYPE gpu_pstate gauge"

    local g_pstate pnum
    rows=$(nvidia-smi --query-gpu=index,pstate \
        --format=csv,noheader,nounits 2>/dev/null) || true
    while IFS=',' read -r idx g_pstate; do
        idx=${idx//[[:space:]]/}
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        g_pstate=${g_pstate//[[:space:]]/}
        # A state that does not look like "P<n>" is reported as 0.
        pnum=0
        if [[ "$g_pstate" =~ ^P([0-9]+)$ ]]; then
            pnum="${BASH_REMATCH[1]}"
        fi
        echo "gpu_pstate{gpu=\"$idx\"} $pnum"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # PER-PROCESS GPU MEMORY
    # ========================================================================

    # Build UUID-to-index mapping
    local -A uuid_to_index=()
    local g_uuid
    rows=$(nvidia-smi --query-gpu=index,uuid \
        --format=csv,noheader 2>/dev/null) || true
    while IFS=',' read -r idx g_uuid; do
        idx=${idx//[[:space:]]/}
        g_uuid=${g_uuid//[[:space:]]/}
        # An empty key is not a valid associative-array subscript.
        [[ "$idx" =~ ^[0-9]+$ && -n "$g_uuid" ]] || continue
        uuid_to_index["$g_uuid"]="$idx"
    done <<< "$rows"

    echo "# HELP gpu_process_memory_bytes Per-process GPU memory usage in bytes"
    echo "# TYPE gpu_process_memory_bytes gauge"

    local proc_line rest proc_uuid proc_pid proc_name proc_mem_mib
    local proc_gpu_idx proc_mem_bytes short_name
    rows=$(nvidia-smi \
        --query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory \
        --format=csv,noheader,nounits 2>/dev/null) || true

    while IFS= read -r proc_line; do
        [[ -n "$proc_line" ]] || continue

        # Parse: uuid, pid, process_name, used_memory_mib.
        # uuid and pid are taken from the left, memory from the right;
        # whatever is left in the middle is the name, so a name that itself
        # contains commas is kept whole.
        proc_uuid=${proc_line%%,*}
        rest=${proc_line#*,}
        proc_pid=${rest%%,*}
        rest=${rest#*,}
        proc_mem_mib=${rest##*,}
        proc_name=${rest%,*}

        proc_uuid=${proc_uuid//[[:space:]]/}
        proc_pid=${proc_pid//[[:space:]]/}
        proc_mem_mib=${proc_mem_mib//[[:space:]]/}
        proc_name=${proc_name#"${proc_name%%[![:space:]]*}"}
        proc_name=${proc_name%"${proc_name##*[![:space:]]}"}

        # Skip lines that are not process rows (e.g. error text).
        [[ -n "$proc_uuid" && "$proc_pid" =~ ^[0-9]+$ ]] || continue

        # Resolve UUID to GPU index; an unknown UUID is attributed to GPU 0.
        proc_gpu_idx="${uuid_to_index[$proc_uuid]:-0}"

        # Handle [N/A] (or any other non-numeric) memory
        [[ "$proc_mem_mib" =~ ^[0-9]+([.][0-9]+)?$ ]] || proc_mem_mib=0

        # Convert MiB to bytes
        if [[ "$proc_mem_mib" =~ ^[0-9]+$ ]]; then
            proc_mem_bytes=$(( 10#$proc_mem_mib * 1048576 ))
        else
            proc_mem_bytes=$(awk -v v="$proc_mem_mib" \
                'BEGIN { printf "%.0f", v * 1048576 }')
        fi

        # Extract short process name from full path
        short_name=${proc_name##*/}
        [[ -n "$short_name" ]] || short_name="$proc_name"

        echo "gpu_process_memory_bytes{gpu=\"$proc_gpu_idx\",pid=\"$proc_pid\",process_name=\"$(prom_escape "$short_name")\"} $proc_mem_bytes"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # EXPORTER RUNTIME
    # ========================================================================

    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    echo "# HELP gpu_exporter_duration_seconds Time to generate all metrics"
    echo "# TYPE gpu_exporter_duration_seconds gauge"
    echo "gpu_exporter_duration_seconds $script_duration"
    echo ""
    echo "# HELP gpu_exporter_last_run_timestamp Unix timestamp of last successful run"
    echo "# TYPE gpu_exporter_last_run_timestamp gauge"
    echo "gpu_exporter_last_run_timestamp $script_end"

    echo ""
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Read one HTTP request from stdin and write the full response to stdout.
# Serves metrics for "GET /metrics..." and an HTML landing page otherwise.
# Always returns 0 so one bad connection cannot take the server down.
_http_handle_request() {
    local request="" header=""

    # Request line, e.g. "GET /metrics HTTP/1.1". EOF (client went away)
    # leaves it empty, which falls through to the landing page.
    IFS= read -r request || true

    # Consume the remaining request headers up to the blank separator line
    # so the client's data is read before the response is written.
    while IFS= read -r header; do
        header=${header%$'\r'}
        [[ -n "$header" ]] || break
    done

    # Check if request is for /metrics endpoint
    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
        generate_metrics || true
    else # Serve HTML landing page for other requests
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
        cat <<EOF
<!DOCTYPE html>
<html>
<head><title>GPU Exporter v1.0</title></head>
<body>
<h1>GPU Prometheus Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<p>NVIDIA GPU metrics via nvidia-smi.</p>
</body>
</html>
EOF
    fi
    return 0
}

# Run simple HTTP server using netcat
# Serves metrics on /metrics endpoint, one connection at a time.
# Globals:  HTTP_PORT (read)
# Outputs:  status and error messages on stderr
# Returns:  does not return normally. Exits 1 when nc is missing, the FIFO
#           cannot be created, or nc fails five times in a row.
run_http_server() {
    echo "Starting GPU exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # The handler must read what nc receives from the client. A plain
    # "handler | nc" only connects handler stdout to nc stdin, so the
    # reverse direction goes through a FIFO.
    local fifo_dir fifo failures=0
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: cannot create temporary directory" >&2
        exit 1
    fi
    fifo="$fifo_dir/request"
    if ! mkfifo "$fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: cannot create FIFO $fifo" >&2
        exit 1
    fi
    # Expanded now on purpose: the local variable no longer exists when the
    # EXIT trap finally runs.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    # Infinite loop accepting HTTP requests
    while true; do
        if _http_handle_request < "$fifo" \
            | nc -l -p "$HTTP_PORT" -q 1 > "$fifo" 2>/dev/null; then
            failures=0
        else
            # Usually nc itself failing (for example the port is in use).
            # Back off instead of spinning, and give up if it never recovers.
            failures=$((failures + 1))
            if (( failures >= 5 )); then
                echo "ERROR: nc failed $failures times in a row on port $HTTP_PORT" >&2
                exit 1
            fi
            sleep 1
        fi
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode
# Globals:   HTTP_MODE, OUTPUT_FILE (read, after parse_args has set them)
# Arguments: "$@" - the script's command-line arguments
# Outputs:   metrics on stdout (default mode) or in OUTPUT_FILE; status and
#            error messages on stderr
# Returns:   0 on success; exits 1 when the metrics file cannot be produced
main() {
    parse_args "$@"

    if [[ "$HTTP_MODE" == true ]]; then
        # Run HTTP server (blocks until killed)
        run_http_server
        return
    fi

    if [[ -z "$OUTPUT_FILE" ]]; then
        # Default: output to stdout
        generate_metrics
        return
    fi

    # Textfile collector mode: write atomically using temp file
    local output_dir temp_file file_lines
    output_dir="$(dirname -- "$OUTPUT_FILE")"
    mkdir -p -- "$output_dir"

    # Create temp file in SAME directory for atomic rename (same filesystem)
    temp_file=$(mktemp "${output_dir}/.gpu_metrics.XXXXXX")

    # Remove the temp file on every exit path (errors, signals, a failing
    # chmod/mv). Expanded now on purpose: the local variable no longer exists
    # when the EXIT trap runs.
    # shellcheck disable=SC2064
    trap "rm -f -- $(printf '%q' "$temp_file")" EXIT

    # Generate metrics to temp file
    if ! generate_metrics > "$temp_file" 2>/dev/null; then
        echo "ERROR: Failed to generate metrics" >&2
        exit 1
    fi

    # Validate: file must exist, have content
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
    file_lines=${file_lines//[[:space:]]/}   # some wc builds pad the count

    if (( file_lines < 10 )); then
        echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
        exit 1
    fi

    # Set permissions before move
    chmod 644 "$temp_file"

    # Atomic rename — no gap where file is missing
    mv -f -- "$temp_file" "$OUTPUT_FILE"
    trap - EXIT

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}

# Execute main function with all script arguments
main "$@"