linux-scripts/gpu-exporter.sh

#!/bin/bash
################################################################################
# Script Name: gpu-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for NVIDIA GPU metrics — temperature,
#              utilization, VRAM usage, power draw, fan speed, clock speeds,
#              performance state, and per-process GPU memory via nvidia-smi
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - NVIDIA GPU with drivers installed
#   - nvidia-smi available in PATH
#   - netcat (nc) for HTTP mode
#
# Usage:
#   ./gpu-exporter.sh                     # stdout
#   ./gpu-exporter.sh --http -p 9195      # HTTP server
#   ./gpu-exporter.sh --textfile          # node_exporter textfile
#
# Metrics Exported:
#   - gpu_info{gpu,name,driver_version,cuda_version} - GPU info
#   - gpu_count - Number of GPUs detected
#   - gpu_temperature_celsius{gpu} - Temperature
#   - gpu_utilization_percent{gpu} - GPU utilization
#   - gpu_memory_utilization_percent{gpu} - Memory utilization
#   - gpu_memory_used_bytes{gpu} - VRAM used
#   - gpu_memory_total_bytes{gpu} - Total VRAM
#   - gpu_memory_free_bytes{gpu} - Free VRAM
#   - gpu_power_draw_watts{gpu} - Power draw
#   - gpu_power_limit_watts{gpu} - Power limit
#   - gpu_fan_speed_percent{gpu} - Fan speed
#   - gpu_clock_speed_mhz{gpu} - GPU clock
#   - gpu_memory_clock_speed_mhz{gpu} - Memory clock
#   - gpu_pstate{gpu} - Performance state
#   - gpu_process_memory_bytes{gpu,pid,process_name} - Per-process memory
#   - gpu_exporter_duration_seconds - Script execution time
#   - gpu_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9195
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# Strict mode: exit on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Directory holding gpu.prom when --textfile is given
TEXTFILE_DIR="/var/lib/node_exporter"
# Destination file for metrics (set by --textfile or -o); empty means stdout
OUTPUT_FILE=""
# Set to true by --http to serve metrics over HTTP instead of writing them once
HTTP_MODE=false
# Port used in HTTP mode (overridden by -p/--port)
HTTP_PORT=9195

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help text to stdout and terminate the script.
# Globals: HTTP_PORT (read) - shown as the port used by --http
# Exits:   always 0
show_usage() {
    local prog="$0"
    local port="$HTTP_PORT"

    # One argument per output line; '%s\n' prints each verbatim.
    printf '%s\n' \
        "Usage: ${prog} [OPTIONS]" \
        "" \
        "Export NVIDIA GPU statistics as Prometheus metrics via nvidia-smi." \
        "" \
        "MODES:" \
        "    --textfile      Write to node_exporter textfile collector" \
        "    --http          Run HTTP server on port ${port}" \
        "" \
        "OPTIONS:" \
        "    -p, --port        HTTP port (default: 9195)" \
        "    -o, --output      Output file path" \
        "" \
        "EXAMPLES:" \
        "    ${prog} --textfile                          # Write to textfile collector" \
        "    ${prog} --http --port 9195                  # Run HTTP server" \
        "    ${prog} -o /tmp/gpu.prom                    # Write to custom file" \
        ""
    exit 0
}

# Parse command-line options into the configuration globals.
# Globals:   TEXTFILE_DIR (read); OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help (-h/--help); 1 on an unknown option, a
#            missing option value, or a port outside 1-65535
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/gpu.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # Check the value exists before touching $2: under `set -u`
                # a trailing "-p" would otherwise die with "unbound variable".
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: $1 requires a port number" >&2
                    exit 1
                fi
                if ! [[ "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "ERROR: invalid port '$2' (expected 1-65535)" >&2
                    exit 1
                fi
                # 10# forces base 10 so a leading zero is not read as octal
                HTTP_PORT=$((10#$2))
                shift 2
                ;;
            -o|--output)
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "ERROR: $1 requires a file path" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Escape a string for use inside a Prometheus label value.
# Backslashes and double quotes are backslash-escaped; newlines are removed.
# Args:    $1 - string to escape (a missing argument is treated as empty)
# Outputs: the escaped string followed by a newline on stdout
prom_escape() {
    local val="${1-}"
    # Backslashes first, so the ones added for quotes are not doubled again
    val="${val//\\/\\\\}"
    val="${val//\"/\\\"}"
    val="${val//$'\n'/}"
    # printf rather than echo: echo would treat values such as "-n" or "-e"
    # as options and print nothing.
    printf '%s\n' "$val"
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Run nvidia-smi with the given arguments and print its stdout.
# stderr is discarded and the status is always 0, so a failing query cannot
# abort the script under `set -e`/`pipefail`; callers validate every line
# they consume instead of trusting the exit status.
_gpu_query() {
    nvidia-smi "$@" 2>/dev/null || true
}

# Emit one sample line per GPU for a single nvidia-smi query field.
# Args: $1=metric_name, $2=query_field, $3=multiplier (optional)
# Lines without a numeric GPU index or a numeric value are skipped, so an
# empty or failed query produces no sample instead of a malformed one.
# The placeholders "[N/A]" and "[Not Supported]" are reported as 0.
_gpu_emit_samples() {
    local metric="$1" query="$2" scale="${3:-}"
    local rows idx val
    rows=$(_gpu_query --query-gpu=index,"$query" --format=csv,noheader,nounits)
    [[ -n "$rows" ]] || return 0

    while IFS=, read -r idx val; do
        idx="${idx//[[:space:]]/}"
        val="${val//[[:space:]]/}"
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        case "$val" in
            "[N/A]"|"[NotSupported]") val=0 ;;
        esac
        [[ "$val" =~ ^-?[0-9]+([.][0-9]+)?$ ]] || continue
        if [[ -n "$scale" ]]; then
            # Pass values with -v so they are never parsed as awk program text
            val=$(awk -v v="$val" -v s="$scale" 'BEGIN { printf "%.0f", v * s }')
        fi
        printf '%s{gpu="%s"} %s\n' "$metric" "$idx" "$val"
    done <<< "$rows"
}

# Emit a gauge block (HELP/TYPE + samples + blank line) for all GPUs
# Args: $1=metric_name, $2=help_text, $3=query_field
emit_gpu_metric() {
    local metric="$1" help="$2" query="$3"
    echo "# HELP $metric $help"
    echo "# TYPE $metric gauge"
    _gpu_emit_samples "$metric" "$query"
    echo ""
}

# Emit a memory gauge block for all GPUs, converting MiB to bytes
# Args: $1=metric_name, $2=help_text, $3=query_field
emit_gpu_mem_metric() {
    local metric="$1" help="$2" query="$3"
    echo "# HELP $metric $help"
    echo "# TYPE $metric gauge"
    _gpu_emit_samples "$metric" "$query" 1048576
    echo ""
}

# Generate all Prometheus metrics
# Outputs: Prometheus text format metrics on stdout
# Returns: 0. When nvidia-smi is missing or reports no GPUs, only
#          "gpu_count 0" (with its HELP/TYPE lines) is written.
generate_metrics() {
    local script_start
    script_start=$(date +%s)
    local nl=$'\n'
    local rows idx

    # Check nvidia-smi exists
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        cat <<EOF
# HELP gpu_count Number of NVIDIA GPUs detected
# TYPE gpu_count gauge
gpu_count 0
EOF
        return 0
    fi

    # ========================================================================
    # GPU COUNT
    # ========================================================================

    # The count query prints one line per GPU; only the first is needed.
    local gpu_count
    gpu_count=$(_gpu_query --query-gpu=count --format=csv,noheader,nounits)
    gpu_count="${gpu_count%%"$nl"*}"
    gpu_count="${gpu_count//[[:space:]]/}"
    if [[ "$gpu_count" =~ ^[0-9]+$ ]]; then
        gpu_count=$((10#$gpu_count))
    else
        # Empty output or an error message: report no GPUs
        gpu_count=0
    fi

    cat <<EOF
# HELP gpu_count Number of NVIDIA GPUs detected
# TYPE gpu_count gauge
gpu_count $gpu_count
EOF

    if (( gpu_count == 0 )); then
        return 0
    fi

    echo ""

    # ========================================================================
    # GPU INFO (driver + CUDA version)
    # ========================================================================

    local driver_version cuda_version
    driver_version=$(_gpu_query --query-gpu=driver_version --format=csv,noheader,nounits)
    driver_version="${driver_version%%"$nl"*}"
    driver_version="${driver_version//[[:space:]]/}"
    [[ "$driver_version" =~ ^[0-9]+([.][0-9]+)*$ ]] || driver_version="unknown"

    cuda_version=$(_gpu_query --query-gpu=cuda_version --format=csv,noheader)
    cuda_version="${cuda_version%%"$nl"*}"
    cuda_version="${cuda_version//[[:space:]]/}"

    # Fallback: parse the nvidia-smi banner when the query does not return a
    # version number (empty, "[N/A]", or an error message)
    if ! [[ "$cuda_version" =~ ^[0-9]+([.][0-9]+)*$ ]]; then
        local smi_banner
        local cuda_re='CUDA Version:[[:space:]]*([0-9][0-9.]*)'
        smi_banner=$(_gpu_query)
        if [[ "$smi_banner" =~ $cuda_re ]]; then
            cuda_version="${BASH_REMATCH[1]}"
        else
            cuda_version="unknown"
        fi
    fi

    cat <<EOF
# HELP gpu_info GPU information labels
# TYPE gpu_info gauge
EOF

    # ========================================================================
    # GPU INFO LABELS
    # ========================================================================

    local gpu_name esc_driver esc_cuda
    esc_driver=$(prom_escape "$driver_version")
    esc_cuda=$(prom_escape "$cuda_version")

    rows=$(_gpu_query --query-gpu=index,name --format=csv,noheader)
    while IFS=, read -r idx gpu_name; do
        idx="${idx//[[:space:]]/}"
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        # Trim leading/trailing whitespace but keep spaces inside the name
        gpu_name="${gpu_name#"${gpu_name%%[![:space:]]*}"}"
        gpu_name="${gpu_name%"${gpu_name##*[![:space:]]}"}"
        printf 'gpu_info{gpu="%s",name="%s",driver_version="%s",cuda_version="%s"} 1\n' \
            "$idx" "$(prom_escape "$gpu_name")" "$esc_driver" "$esc_cuda"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # OUTPUT PER-GPU METRICS (with HELP/TYPE headers)
    # ========================================================================

    emit_gpu_metric "gpu_temperature_celsius" "GPU temperature in degrees Celsius" "temperature.gpu"
    emit_gpu_metric "gpu_utilization_percent" "GPU core utilization percentage" "utilization.gpu"
    emit_gpu_metric "gpu_memory_utilization_percent" "GPU memory utilization percentage" "utilization.memory"
    emit_gpu_mem_metric "gpu_memory_used_bytes" "GPU memory used in bytes" "memory.used"
    emit_gpu_mem_metric "gpu_memory_total_bytes" "GPU total memory in bytes" "memory.total"
    emit_gpu_mem_metric "gpu_memory_free_bytes" "GPU free memory in bytes" "memory.free"
    emit_gpu_metric "gpu_power_draw_watts" "GPU power draw in watts" "power.draw"
    emit_gpu_metric "gpu_power_limit_watts" "GPU power limit in watts" "power.limit"
    emit_gpu_metric "gpu_fan_speed_percent" "GPU fan speed percentage" "fan.speed"
    emit_gpu_metric "gpu_clock_speed_mhz" "GPU graphics clock speed in MHz" "clocks.current.graphics"
    emit_gpu_metric "gpu_memory_clock_speed_mhz" "GPU memory clock speed in MHz" "clocks.current.memory"

    # Performance state needs special handling (P0 → 0, P8 → 8, etc.)
    echo "# HELP gpu_pstate GPU performance state (0=max, 12=min)"
    echo "# TYPE gpu_pstate gauge"
    local pstate pnum
    rows=$(_gpu_query --query-gpu=index,pstate --format=csv,noheader,nounits)
    while IFS=, read -r idx pstate; do
        idx="${idx//[[:space:]]/}"
        pstate="${pstate//[[:space:]]/}"
        [[ "$idx" =~ ^[0-9]+$ ]] || continue
        # Anything that is not "P<number>" is reported as 0
        pnum=0
        if [[ "$pstate" =~ ^P([0-9]+)$ ]]; then
            pnum="${BASH_REMATCH[1]}"
        fi
        printf 'gpu_pstate{gpu="%s"} %s\n' "$idx" "$pnum"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # PER-PROCESS GPU MEMORY
    # ========================================================================

    # Build UUID-to-index mapping
    local -A uuid_to_index=()
    local gpu_uuid
    rows=$(_gpu_query --query-gpu=index,uuid --format=csv,noheader)
    while IFS=, read -r idx gpu_uuid; do
        idx="${idx//[[:space:]]/}"
        gpu_uuid="${gpu_uuid//[[:space:]]/}"
        # An empty key would be a "bad array subscript" error
        [[ "$idx" =~ ^[0-9]+$ && -n "$gpu_uuid" ]] || continue
        uuid_to_index["$gpu_uuid"]="$idx"
    done <<< "$rows"

    cat <<EOF
# HELP gpu_process_memory_bytes Per-process GPU memory usage in bytes
# TYPE gpu_process_memory_bytes gauge
EOF

    local proc_line rest proc_uuid proc_pid proc_name proc_mem_mib
    local proc_gpu_idx proc_mem_bytes short_name
    rows=$(_gpu_query --query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory --format=csv,noheader,nounits)
    while IFS= read -r proc_line; do
        # Expect: uuid, pid, process_name, used_memory_mib (at least 3 commas)
        [[ "$proc_line" == *,*,*,* ]] || continue

        # uuid and pid are the first two fields, memory is the last one;
        # everything in between is the name, which may itself contain commas.
        proc_uuid="${proc_line%%,*}"
        rest="${proc_line#*,}"
        proc_pid="${rest%%,*}"
        rest="${rest#*,}"
        proc_mem_mib="${rest##*,}"
        proc_name="${rest%,*}"

        proc_uuid="${proc_uuid//[[:space:]]/}"
        proc_pid="${proc_pid//[[:space:]]/}"
        proc_mem_mib="${proc_mem_mib//[[:space:]]/}"
        proc_name="${proc_name#"${proc_name%%[![:space:]]*}"}"
        proc_name="${proc_name%"${proc_name##*[![:space:]]}"}"

        [[ "$proc_pid" =~ ^[0-9]+$ ]] || continue

        # Handle placeholder memory values; skip anything else non-numeric
        case "$proc_mem_mib" in
            "[N/A]"|"[NotSupported]") proc_mem_mib=0 ;;
        esac
        [[ "$proc_mem_mib" =~ ^[0-9]+([.][0-9]+)?$ ]] || continue

        # Resolve UUID to GPU index (unknown UUIDs are reported as GPU 0)
        proc_gpu_idx=0
        if [[ -n "$proc_uuid" ]]; then
            proc_gpu_idx="${uuid_to_index[$proc_uuid]:-0}"
        fi

        # Convert MiB to bytes
        proc_mem_bytes=$(awk -v v="$proc_mem_mib" 'BEGIN { printf "%.0f", v * 1048576 }')

        # Extract short process name from full path
        short_name="${proc_name%/}"
        short_name="${short_name##*/}"

        printf 'gpu_process_memory_bytes{gpu="%s",pid="%s",process_name="%s"} %s\n' \
            "$proc_gpu_idx" "$proc_pid" "$(prom_escape "$short_name")" "$proc_mem_bytes"
    done <<< "$rows"

    echo ""

    # ========================================================================
    # EXPORTER RUNTIME
    # ========================================================================

    local script_end script_duration
    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    cat <<EOF
# HELP gpu_exporter_duration_seconds Time to generate all metrics
# TYPE gpu_exporter_duration_seconds gauge
gpu_exporter_duration_seconds $script_duration

# HELP gpu_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE gpu_exporter_last_run_timestamp gauge
gpu_exporter_last_run_timestamp $script_end
EOF

    echo ""
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Answer one HTTP request: read the request from stdin, write the response
# to stdout. "GET /metrics..." gets the metrics; anything else gets the
# HTML landing page.
_gpu_http_respond() {
    local request="" header=""

    # Request line; an empty/closed connection is treated as a non-metrics request
    IFS= read -r -t 5 request || true

    # Consume the remaining request headers (up to the blank line) so the
    # peer is not still writing when this handler closes its input. The
    # timeout keeps a stalled client from blocking the server forever.
    while IFS= read -r -t 2 header; do
        header="${header%$'\r'}"
        [[ -n "$header" ]] || break
    done

    # Check if request is for /metrics endpoint
    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
        generate_metrics || true
    else  # Serve HTML landing page for other requests
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
        cat <<EOF
<!DOCTYPE html>
<html>
<head><title>GPU Exporter v1.0</title></head>
<body>
<h1>GPU Prometheus Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<p>NVIDIA GPU metrics via nvidia-smi.</p>
</body>
</html>
EOF
    fi
}

# Run simple HTTP server using netcat
# Serves metrics on /metrics endpoint, one connection at a time.
# Globals: HTTP_PORT (read)
# Exits:   1 when nc is missing or the request FIFO cannot be created;
#          otherwise loops until the script is killed.
run_http_server() {
    echo "Starting GPU exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # nc prints the client's request on its stdout and sends its stdin to the
    # client. A FIFO feeds nc's stdout back into the handler so the handler
    # reads the actual request rather than the script's own stdin.
    local fifo_dir request_fifo
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Failed to create temporary directory for HTTP mode" >&2
        exit 1
    fi
    request_fifo="$fifo_dir/request.fifo"
    if ! mkfifo "$request_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Failed to create request FIFO" >&2
        exit 1
    fi
    # Path is expanded (and shell-quoted) now: the local is gone at exit time
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    # Infinite loop accepting HTTP requests
    while true; do
        # A failed round (port busy, client hung up early) must not kill the
        # server under set -e/pipefail; pause briefly to avoid a busy loop.
        _gpu_http_respond < "$request_fifo" \
            | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null > "$request_fifo" \
            || sleep 1
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode
# Globals:   HTTP_MODE, OUTPUT_FILE (read, after parse_args has set them)
# Arguments: the script's command-line arguments
# Modes:     HTTP_MODE=true     -> run the HTTP server
#            OUTPUT_FILE set    -> write metrics to that file via temp file + rename
#            otherwise          -> print metrics to stdout
# Exits:     1 in file mode when metrics cannot be generated, are shorter
#            than 10 lines, or cannot be installed; the previous output file
#            is left untouched in those cases.
main() {
    parse_args "$@"

    if [[ "$HTTP_MODE" == true ]]; then
        # Run HTTP server (blocks until killed)
        run_http_server
        return
    fi

    if [[ -z "$OUTPUT_FILE" ]]; then
        # Default: output to stdout
        generate_metrics
        return
    fi

    # Textfile collector mode: write atomically using temp file
    local output_dir temp_file file_lines
    output_dir="$(dirname -- "$OUTPUT_FILE")"
    if ! mkdir -p -- "$output_dir"; then
        echo "ERROR: Cannot create output directory $output_dir" >&2
        exit 1
    fi

    # Create temp file in SAME directory for atomic rename (same filesystem)
    if ! temp_file=$(mktemp "${output_dir}/.gpu_metrics.XXXXXX"); then
        echo "ERROR: Cannot create temporary file in $output_dir" >&2
        exit 1
    fi

    # Remove the temp file on every exit path until it has been renamed.
    # The path is expanded (and shell-quoted) now because the local variable
    # may no longer exist when the trap runs.
    trap "rm -f -- $(printf '%q' "$temp_file")" EXIT

    # Generate metrics to temp file
    if ! generate_metrics > "$temp_file" 2>/dev/null; then
        echo "ERROR: Failed to generate metrics" >&2
        exit 1
    fi

    # Validate: file must exist, have content
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
    file_lines="${file_lines//[[:space:]]/}"

    if [ "${file_lines:-0}" -lt 10 ]; then
        echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
        exit 1
    fi

    # Set permissions before move, then rename atomically — no gap where
    # the file is missing
    if ! chmod 644 -- "$temp_file" || ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
        echo "ERROR: Failed to install metrics file $OUTPUT_FILE" >&2
        exit 1
    fi

    # The temp file has become the output file; nothing left to clean up
    trap - EXIT

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}

# Script entry point: pass all command-line arguments to main unchanged
main "$@"