#!/bin/bash
################################################################################
# Script Name: gpu-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for NVIDIA GPU metrics — temperature,
#              utilization, VRAM usage, power draw, fan speed, clock speeds,
#              performance state, and per-process GPU memory via nvidia-smi
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - NVIDIA GPU with drivers installed
#   - nvidia-smi available in PATH
#   - netcat (nc) for HTTP mode
#
# Usage:
#   ./gpu-exporter.sh                    # stdout
#   ./gpu-exporter.sh --http -p 9195     # HTTP server
#   ./gpu-exporter.sh --textfile         # node_exporter textfile
#
# Metrics Exported:
#   - gpu_info{gpu,name,driver_version,cuda_version} - GPU info
#   - gpu_count                                      - Number of GPUs detected
#   - gpu_temperature_celsius{gpu}                   - Temperature
#   - gpu_utilization_percent{gpu}                   - GPU utilization
#   - gpu_memory_utilization_percent{gpu}            - Memory utilization
#   - gpu_memory_used_bytes{gpu}                     - VRAM used
#   - gpu_memory_total_bytes{gpu}                    - Total VRAM
#   - gpu_memory_free_bytes{gpu}                     - Free VRAM
#   - gpu_power_draw_watts{gpu}                      - Power draw
#   - gpu_power_limit_watts{gpu}                     - Power limit
#   - gpu_fan_speed_percent{gpu}                     - Fan speed
#   - gpu_clock_speed_mhz{gpu}                       - GPU clock
#   - gpu_memory_clock_speed_mhz{gpu}                - Memory clock
#   - gpu_pstate{gpu}                                - Performance state
#   - gpu_process_memory_bytes{gpu,pid,process_name} - Per-process memory
#   - gpu_exporter_duration_seconds                  - Script execution time
#   - gpu_exporter_last_run_timestamp                - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9195
#   Textfile directory: /var/lib/node_exporter
#
################################################################################
# NOTE(review): the copy of this script that was reviewed had every
# here-document (and the statement following it) stripped out, apparently by
# an HTML sanitizer. Sections marked "NOTE(review)" below were reconstructed
# from the header above and from the surviving parsing code; confirm them
# against the upstream original before release.

set -euo pipefail

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9195

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print command-line help to stdout.
# NOTE(review): help text reconstructed; only --http, -p and --textfile are
# confirmed by the header's Usage section. -h/--help, --port and -o/--output
# are assumptions.
show_usage() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Prometheus exporter for NVIDIA GPU metrics (via nvidia-smi).

Options:
  -h, --help          Show this help message and exit
  --http              Serve metrics over HTTP (requires netcat)
  -p, --port PORT     HTTP port (default: ${HTTP_PORT})
  --textfile          Write metrics to ${TEXTFILE_DIR}/gpu.prom
  -o, --output FILE   Write metrics to FILE (atomic rename)

With no options, metrics are printed to stdout.
EOF
}

# Parse command-line arguments into HTTP_MODE, HTTP_PORT and OUTPUT_FILE.
# Args: "$@" - the script's arguments
# Exits 1 on an unknown option, a missing option argument or an invalid port.
# NOTE(review): option set reconstructed (see show_usage); the textfile name
# "gpu.prom" is an assumption.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                exit 0
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # Explicit check: under `set -u` a bare "$2" would abort with
                # an unhelpful "unbound variable" message.
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: $1 requires a port number" >&2
                    exit 1
                fi
                if ! [[ "$2" =~ ^[0-9]+$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "ERROR: invalid port: $2" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            --textfile)
                OUTPUT_FILE="${TEXTFILE_DIR}/gpu.prom"
                shift
                ;;
            -o|--output)
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: $1 requires a file path" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Escape special characters in Prometheus label values
# Args: $1 - string to escape
# Returns: escaped string safe for Prometheus labels (backslash and double
#          quote are escaped, newlines are removed)
prom_escape() {
    local val="$1"
    val="${val//\\/\\\\}"
    val="${val//\"/\\\"}"
    val="${val//$'\n'/}"
    # printf, not echo: echo would swallow values such as "-n" or "-e".
    printf '%s\n' "$val"
}

# Strip leading and trailing whitespace.
# Args: $1 - string
# Returns: trimmed string on stdout
trim_ws() {
    local s="$1"
    s="${s#"${s%%[![:space:]]*}"}"
    s="${s%"${s##*[![:space:]]}"}"
    printf '%s\n' "$s"
}

# Normalise one nvidia-smi CSV value into a number usable as a sample value.
# Args: $1 - raw field (surrounding/embedded whitespace is removed)
# Returns: the number on stdout; "[N/A]" / "[Not Supported]" become 0 (the
#          script's historical behaviour for unavailable readings).
#          Exit status 1 (no output) for anything else that is not numeric,
#          so callers can skip the sample instead of emitting an invalid line.
gpu_numeric_value() {
    local raw="${1//[[:space:]]/}"
    case "$raw" in
        '[N/A]' | 'N/A' | '[NotSupported]') raw=0 ;;
    esac
    [[ "$raw" =~ ^-?[0-9]+([.][0-9]+)?$ ]] || return 1
    printf '%s\n' "$raw"
}

# Emit the gpu_count metric with its HELP/TYPE header.
# Args: $1 - number of GPUs
emit_gpu_count() {
    cat <<EOF
# HELP gpu_count Number of GPUs detected
# TYPE gpu_count gauge
gpu_count $1
EOF
}

# Emit one gauge (HELP/TYPE header + one sample per GPU) from nvidia-smi.
# Args: $1 - metric name
#       $2 - help text
#       $3 - nvidia-smi --query-gpu field
#       $4 - optional multiplier applied to each value (default 1); when it
#            is not 1 the result is rounded to an integer
# Lines whose GPU index or value is not numeric (blank lines, error text
# printed by nvidia-smi) are skipped.
emit_gpu_metric() {
    local metric="$1" help="$2" query="$3" scale="${4:-1}"
    local lines g_idx g_val

    echo "# HELP $metric $help"
    echo "# TYPE $metric gauge"

    # `|| true`: a failing query must not abort the exporter via `set -e`.
    lines=$(nvidia-smi --query-gpu=index,"$query" --format=csv,noheader,nounits 2>/dev/null) || true

    while IFS=',' read -r g_idx g_val; do
        g_idx="${g_idx//[[:space:]]/}"
        [[ "$g_idx" =~ ^[0-9]+$ ]] || continue
        g_val=$(gpu_numeric_value "$g_val") || continue
        if [[ "$scale" != "1" ]]; then
            # Pass data with -v instead of splicing it into the awk program.
            g_val=$(awk -v v="$g_val" -v s="$scale" 'BEGIN { printf "%.0f", v * s }')
        fi
        echo "${metric}{gpu=\"$g_idx\"} $g_val"
    done <<< "$lines"
    echo ""
}

# Emit a memory gauge, converting nvidia-smi's MiB readings to bytes.
# Args: $1 - metric name, $2 - help text, $3 - nvidia-smi --query-gpu field
emit_gpu_mem_metric() {
    emit_gpu_metric "$1" "$2" "$3" 1048576
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Generate all Prometheus metrics
# Returns: Prometheus text format metrics on stdout. When nvidia-smi is
#          missing or reports no GPUs, only gpu_count 0 is emitted.
generate_metrics() {
    local script_start
    script_start=$(date +%s)

    # Check nvidia-smi exists
    # NOTE(review): output of the two early-exit paths reconstructed.
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        emit_gpu_count 0
        return 0
    fi

    # GPU count (nvidia-smi prints one line per GPU, keep the first)
    local gpu_count
    gpu_count=$(nvidia-smi --query-gpu=count --format=csv,noheader 2>/dev/null | head -1) || true
    gpu_count="${gpu_count//[[:space:]]/}"
    # Error text or empty output counts as "no GPUs".
    [[ "$gpu_count" =~ ^[0-9]+$ ]] || gpu_count=0

    if [ "$gpu_count" -eq 0 ]; then
        emit_gpu_count 0
        return 0
    fi

    # Driver / CUDA versions for the gpu_info labels
    local driver_version cuda_version smi_banner
    local cuda_re='CUDA Version:[[:space:]]*([0-9.]+)'
    driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1) || true
    driver_version="${driver_version//[[:space:]]/}"
    driver_version=${driver_version:-"unknown"}

    cuda_version=$(nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null | head -1) || true
    cuda_version="${cuda_version//[[:space:]]/}"
    # Fallback: parse from nvidia-smi header if the query does not return a
    # version number (empty, "[N/A]", or an error message).
    if ! [[ "$cuda_version" =~ ^[0-9]+([.][0-9]+)*$ ]]; then
        smi_banner=$(nvidia-smi 2>/dev/null) || true
        if [[ "$smi_banner" =~ $cuda_re ]]; then
            cuda_version="${BASH_REMATCH[1]}"
        else
            cuda_version="unknown"
        fi
    fi

    emit_gpu_count "$gpu_count"
    # NOTE(review): HELP text reconstructed.
    cat <<EOF

# HELP gpu_info GPU information
# TYPE gpu_info gauge
EOF

    local info_lines info_line g_idx g_name
    info_lines=$(nvidia-smi --query-gpu=index,name --format=csv,noheader 2>/dev/null) || true
    while IFS= read -r info_line; do
        [[ -n "$info_line" ]] || continue
        g_idx="${info_line%%,*}"
        g_idx="${g_idx//[[:space:]]/}"
        [[ "$g_idx" =~ ^[0-9]+$ ]] || continue
        # Everything after the first comma is the name (it may contain commas).
        g_name=$(trim_ws "${info_line#*,}")
        echo "gpu_info{gpu=\"$g_idx\",name=\"$(prom_escape "$g_name")\",driver_version=\"$(prom_escape "$driver_version")\",cuda_version=\"$(prom_escape "$cuda_version")\"} 1"
    done <<< "$info_lines"
    echo ""

    # ========================================================================
    # OUTPUT PER-GPU METRICS (with HELP/TYPE headers)
    # ========================================================================

    emit_gpu_metric "gpu_temperature_celsius" "GPU temperature in degrees Celsius" "temperature.gpu"
    emit_gpu_metric "gpu_utilization_percent" "GPU core utilization percentage" "utilization.gpu"
    emit_gpu_metric "gpu_memory_utilization_percent" "GPU memory utilization percentage" "utilization.memory"
    emit_gpu_mem_metric "gpu_memory_used_bytes" "GPU memory used in bytes" "memory.used"
    emit_gpu_mem_metric "gpu_memory_total_bytes" "GPU total memory in bytes" "memory.total"
    emit_gpu_mem_metric "gpu_memory_free_bytes" "GPU free memory in bytes" "memory.free"
    emit_gpu_metric "gpu_power_draw_watts" "GPU power draw in watts" "power.draw"
    emit_gpu_metric "gpu_power_limit_watts" "GPU power limit in watts" "power.limit"
    emit_gpu_metric "gpu_fan_speed_percent" "GPU fan speed percentage" "fan.speed"
    emit_gpu_metric "gpu_clock_speed_mhz" "GPU graphics clock speed in MHz" "clocks.current.graphics"
    emit_gpu_metric "gpu_memory_clock_speed_mhz" "GPU memory clock speed in MHz" "clocks.current.memory"

    # Performance state needs special handling (P0 → 0, P8 → 8, etc.)
    echo "# HELP gpu_pstate GPU performance state (0=max, 12=min)"
    echo "# TYPE gpu_pstate gauge"
    local pstate_lines g_pstate pnum
    pstate_lines=$(nvidia-smi --query-gpu=index,pstate --format=csv,noheader,nounits 2>/dev/null) || true
    while IFS=',' read -r g_idx g_pstate; do
        g_idx="${g_idx//[[:space:]]/}"
        [[ "$g_idx" =~ ^[0-9]+$ ]] || continue
        g_pstate="${g_pstate//[[:space:]]/}"
        # Anything that is not "P<number>" is reported as 0.
        pnum=0
        if [[ "$g_pstate" =~ ^P([0-9]+)$ ]]; then
            pnum="${BASH_REMATCH[1]}"
        fi
        echo "gpu_pstate{gpu=\"$g_idx\"} $pnum"
    done <<< "$pstate_lines"
    echo ""

    # ========================================================================
    # PER-PROCESS GPU MEMORY
    # ========================================================================

    # Build UUID-to-index mapping
    local -A uuid_to_index=()
    local uuid_lines g_uuid
    uuid_lines=$(nvidia-smi --query-gpu=index,uuid --format=csv,noheader 2>/dev/null) || true
    while IFS=',' read -r g_idx g_uuid; do
        g_idx="${g_idx//[[:space:]]/}"
        g_uuid="${g_uuid//[[:space:]]/}"
        [[ "$g_idx" =~ ^[0-9]+$ && -n "$g_uuid" ]] || continue
        uuid_to_index["$g_uuid"]="$g_idx"
    done <<< "$uuid_lines"

    # NOTE(review): HELP text and the --query-compute-apps field list are
    # reconstructed from the "uuid, pid, process_name, used_memory_mib"
    # parsing below.
    cat <<EOF
# HELP gpu_process_memory_bytes GPU memory used per process in bytes
# TYPE gpu_process_memory_bytes gauge
EOF

    local process_lines proc_line rest
    local proc_uuid proc_pid proc_name proc_mem_mib proc_gpu_idx proc_mem_bytes short_name
    process_lines=$(nvidia-smi --query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory --format=csv,noheader,nounits 2>/dev/null) || true
    while IFS= read -r proc_line; do
        [[ -n "$proc_line" ]] || continue

        # Parse: uuid, pid, process_name, used_memory_mib
        # The first two and the last field are peeled off so that a process
        # name containing commas stays intact.
        proc_uuid="${proc_line%%,*}"
        proc_uuid="${proc_uuid//[[:space:]]/}"
        rest="${proc_line#*,}"
        proc_pid="${rest%%,*}"
        proc_pid="${proc_pid//[[:space:]]/}"
        rest="${rest#*,}"
        proc_mem_mib="${rest##*,}"
        proc_name=$(trim_ws "${rest%,*}")

        # Skip anything that is not a well-formed process row.
        [[ -n "$proc_uuid" && "$proc_pid" =~ ^[0-9]+$ ]] || continue

        # Resolve UUID to GPU index (unknown UUIDs fall back to 0)
        proc_gpu_idx="${uuid_to_index[$proc_uuid]:-0}"

        # Handle [N/A] memory, then convert MiB to bytes
        proc_mem_mib=$(gpu_numeric_value "$proc_mem_mib") || continue
        proc_mem_bytes=$(awk -v v="$proc_mem_mib" 'BEGIN { printf "%.0f", v * 1048576 }')

        # Extract short process name from full path
        short_name="${proc_name##*/}"

        echo "gpu_process_memory_bytes{gpu=\"$proc_gpu_idx\",pid=\"$proc_pid\",process_name=\"$(prom_escape "$short_name")\"} $proc_mem_bytes"
    done <<< "$process_lines"
    echo ""

    # ========================================================================
    # EXPORTER RUNTIME
    # ========================================================================

    local script_end script_duration
    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    # NOTE(review): HELP text reconstructed from the header's metric list.
    cat <<EOF
# HELP gpu_exporter_duration_seconds Script execution time in seconds
# TYPE gpu_exporter_duration_seconds gauge
gpu_exporter_duration_seconds $script_duration
# HELP gpu_exporter_last_run_timestamp Unix timestamp of the last run
# TYPE gpu_exporter_last_run_timestamp gauge
gpu_exporter_last_run_timestamp $script_end
EOF
}

# ============================================================================
# HTTP SERVER
# ============================================================================

# Answer one HTTP request.
# Reads the request line from stdin and writes the full response to stdout:
# the metrics for "GET /metrics...", an HTML landing page for anything else.
# Returns without output if no request line could be read.
handle_http_request() {
    local request=""
    read -r request || [[ -n "$request" ]] || return 0

    # Check if request is for /metrics endpoint
    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
        generate_metrics
    else
        # Serve HTML landing page for other requests
        # NOTE(review): markup reconstructed around the surviving page text.
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
        cat <<EOF
<html>
<head><title>GPU Exporter v1.0</title></head>
<body>
<h1>GPU Prometheus Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<p>NVIDIA GPU metrics via nvidia-smi.</p>
</body>
</html>
EOF
    fi
}

# Serve metrics over HTTP with netcat until the script is killed.
# Uses HTTP_PORT. Exits 1 if nc is not installed or the FIFO cannot be made.
#
# nc's stdout (the client's request) is piped into handle_http_request and
# the handler's stdout is fed back to nc's stdin through a FIFO. A plain
# `{ read ...; } | nc -l` would read the request line from the script's own
# stdin instead of from the client.
run_http_server() {
    # NOTE(review): startup message wording reconstructed.
    echo "Starting GPU exporter on port $HTTP_PORT" >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    local fifo_dir fifo
    fifo_dir=$(mktemp -d) || { echo "ERROR: mktemp failed" >&2; exit 1; }
    fifo="${fifo_dir}/response"
    mkfifo "$fifo" || { rm -rf -- "$fifo_dir"; echo "ERROR: mkfifo failed" >&2; exit 1; }
    # Path is expanded now on purpose so the trap does not depend on the
    # local variable still being in scope at exit time.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT
    trap 'exit 0' INT TERM

    # Infinite loop accepting HTTP requests (one connection per iteration)
    while true; do
        # A failed connection must not take the server down (set -e +
        # pipefail would otherwise exit); pause briefly to avoid spinning
        # when nc cannot bind the port.
        # shellcheck disable=SC2094
        nc -l -p "$HTTP_PORT" -q 1 < "$fifo" 2>/dev/null \
            | handle_http_request > "$fifo" \
            || { echo "WARN: HTTP request handling failed; retrying in 1s" >&2; sleep 1; }
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode
# Args: "$@" - the script's arguments
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        # Run HTTP server (blocks until killed)
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        # Textfile collector mode: write atomically using temp file
        local output_dir
        output_dir="$(dirname "$OUTPUT_FILE")"
        mkdir -p "$output_dir"

        # Create temp file in SAME directory for atomic rename (same filesystem)
        local temp_file
        temp_file=$(mktemp "${output_dir}/.gpu_metrics.XXXXXX")
        # Remove the temp file on any exit path, including interruption.
        # After a successful rename the path no longer exists and rm -f is a
        # no-op.
        # shellcheck disable=SC2064
        trap "rm -f -- $(printf '%q' "$temp_file")" EXIT

        # Generate metrics to temp file
        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: file must exist, have content
        local file_lines
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        if [ "$file_lines" -lt 10 ]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # Set permissions before move
        chmod 644 "$temp_file"

        # Atomic rename — no gap where file is missing
        mv -f "$temp_file" "$OUTPUT_FILE"

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}

# Execute main function with all script arguments
main "$@"