#!/bin/bash
################################################################################
# Script Name: journal-error-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for journalctl error/critical/warning
#              messages per systemd unit. Exports per-priority message counts,
#              per-unit breakdown, top offending units, error rates, and
#              journal disk usage metrics.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - journalctl (systemd journal)
#   - netcat (nc) for HTTP mode
#   - Standard Unix tools (awk, sed, sort, uniq, grep, wc, head, mktemp)
#
# Performance:
#   Per-priority totals are cached per priority+period combination, so
#   journalctl runs once per unique combo rather than once per metric that
#   uses it. The top-unit ranking is likewise cached after its first 24h query.
#   Typical run time: a few seconds even on busy systems.
#
# Usage:
#   # Output to stdout
#   ./journal-error-exporter.sh
#
#   # HTTP server mode
#   ./journal-error-exporter.sh --http -p 9201
#
#   # Textfile collector mode
#   ./journal-error-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - journal_error_up - Exporter status (1=up, 0=down)
#     - journal_error_exporter_info{version} - Exporter version
#
#   Message Counts:
#     - journal_error_messages_total{priority,period} - Messages per priority per period
#
#   Per-Unit Breakdown (24h, top 20):
#     - journal_error_unit_messages{unit,priority} - Per-unit message count by priority
#
#   Top Offenders:
#     - journal_error_top_unit_count{unit} - Top 20 units by err+crit+alert+emerg (24h)
#
#   Rates:
#     - journal_error_rate_per_hour{priority} - Average messages per hour (24h)
#
#   Journal Health:
#     - journal_error_journal_disk_usage_bytes - Journal disk usage
#     - journal_error_exporter_duration_seconds - Script execution time
#     - journal_error_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9201
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Directory used by --textfile for the generated .prom file
TEXTFILE_DIR='/var/lib/node_exporter'
# Destination file; left empty, metrics are printed to stdout
OUTPUT_FILE=''
# Set to "true" by --http
HTTP_MODE=false
# Listening port for --http (overridden by -p/--port)
HTTP_PORT=9201
# Scratch directory for cached journal results; filled in by setup_cache
CACHE_DIR=''

# journald priority number -> label used in the "priority" metric label
declare -A PRIORITY_LABELS=(
    [0]='emerg'
    [1]='alert'
    [2]='crit'
    [3]='err'
    [4]='warning'
)

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the command-line help text to stdout and terminate the script.
# Globals: HTTP_PORT (read) - interpolated into the --http description
# Outputs: usage text on stdout; $0 is expanded as the program name
# Exits:   always, with status 0
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Export journalctl error/warning statistics as Prometheus metrics (v1.0).

MODES:
    --textfile      Write to node_exporter textfile collector
    --http          Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port      HTTP port (default: 9201)
    -o, --output    Output file path

EXAMPLES:
    $0 --textfile                    # Write to textfile collector
    $0 --http --port 9201            # Run HTTP server
    $0 -o /tmp/journal_errors.prom   # Write to custom file

METRICS:
    - Per-priority message counts (emerg/alert/crit/err/warning)
    - Per-unit breakdown of messages by priority (24h, top 20)
    - Top offending units by error+ count (24h)
    - Average messages per hour rates by priority (24h)
    - Journal disk usage

EOF
    exit 0
}

# Parse command-line options into the global configuration variables.
# Globals:   OUTPUT_FILE, HTTP_MODE, HTTP_PORT (written); TEXTFILE_DIR (read)
# Arguments: the script's command-line arguments
# Exits:     via show_usage for -h/--help; with status 1 on an unknown
#            option, a missing option value, or an invalid port number
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/journal_errors.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # "shift 2" with only one argument left fails WITHOUT shifting,
                # which would make this loop spin forever; reject it up front.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                # 10# forces base 10 so a leading zero is not read as octal.
                if ! [[ "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    echo "Invalid port: $2" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            -o|--output)
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Create a private temp directory for caching journal query results.
# The directory is created under $TMPDIR when set, otherwise under /tmp.
# Globals: CACHE_DIR (written) - path of the new directory, or empty on failure
# Returns: 0 on success, 1 if the directory could not be created
setup_cache() {
    local dir
    # Reset first so a failed run can never leave a stale or half-set path
    # behind for the cache helpers to write into.
    CACHE_DIR=""
    if ! dir=$(mktemp -d "${TMPDIR:-/tmp}/journal-error-exporter.XXXXXX" 2>/dev/null); then
        echo "ERROR: could not create cache directory under ${TMPDIR:-/tmp}" >&2
        return 1
    fi
    CACHE_DIR="$dir"
}

# Remove the cache directory, if one was created, and forget its path.
# Globals: CACHE_DIR (read, then reset to empty)
cleanup_cache() {
    if [[ -n "$CACHE_DIR" ]]; then
        rm -rf "$CACHE_DIR"
    fi
    CACHE_DIR=""
}

# Count journal output lines for one priority since a given point in time.
# The result is cached per priority+period inside CACHE_DIR, so journalctl is
# invoked once per unique combination. When CACHE_DIR is unset or not a
# directory the count is still computed and printed, just not cached.
# Globals: CACHE_DIR (read)
# Args:    $1 - priority number (0-4)
#          $2 - journalctl --since expression, e.g. "1 hour ago"
# Outputs: the line count on stdout
get_priority_count() {
    local priority_num="$1"
    local period="$2"
    local cache_file=""
    local count

    if [ -n "$CACHE_DIR" ] && [ -d "$CACHE_DIR" ]; then
        cache_file="$CACHE_DIR/priority_${priority_num}_${period// /_}"
        # -s: an empty file (e.g. an interrupted write) is treated as a miss
        if [ -s "$cache_file" ]; then
            cat "$cache_file"
            return
        fi
    fi

    count=$(journalctl --priority="${priority_num}..${priority_num}" --since "$period" --no-pager -q 2>/dev/null | wc -l)
    # Some wc implementations pad the number with blanks
    count="${count//[[:space:]]/}"
    count="${count:-0}"

    # Caching is best effort; the value is printed from the variable so a
    # failed cache write cannot turn the result into empty output.
    if [ -n "$cache_file" ]; then
        echo "$count" > "$cache_file" 2>/dev/null
    fi
    echo "$count"
}

# Rank identifiers by number of priority 0..3 journal lines in the last 24h.
# The identifier is the 5th whitespace-separated field of journalctl's short
# output, with any "[pid]..." suffix and the first ":" removed. Lines that
# yield no identifier (fewer than five fields, such as continuation lines) are
# ignored so they cannot take up a slot in the ranking.
# The full ranking is cached in CACHE_DIR when that directory is usable.
# Globals: CACHE_DIR (read)
# Args:    $1 - maximum number of lines to print (default: 20)
# Outputs: "count identifier" lines (uniq -c format), highest count first
get_top_units() {
    local limit="${1:-20}"
    local cache_file=""
    local ranked

    if [ -n "$CACHE_DIR" ] && [ -d "$CACHE_DIR" ]; then
        cache_file="$CACHE_DIR/top_units"
        if [ -f "$cache_file" ]; then
            head -n "$limit" "$cache_file"
            return
        fi
    fi

    ranked=$(journalctl --priority=0..3 --since "24 hours ago" --output=short --no-pager -q 2>/dev/null | \
        awk '{
            ident = $5
            sub(/\[.*/, "", ident)
            sub(/:/, "", ident)
            if (ident != "") print ident
        }' | \
        sort | uniq -c | sort -rn)

    # Best-effort cache write; an empty ranking is stored as an empty file.
    if [ -n "$cache_file" ]; then
        if [ -n "$ranked" ]; then
            printf '%s\n' "$ranked"
        fi > "$cache_file" 2>/dev/null
    fi

    if [ -n "$ranked" ]; then
        printf '%s\n' "$ranked" | head -n "$limit"
    fi
}

# Print one identifier per journal line for a single priority over the last
# 24h, using the same extraction as the top-unit ranking (5th field of the
# short output, "[pid]..." suffix and first ":" removed).
# Args: $1 - priority number (0-4)
_journal_idents_for_priority() {
    journalctl --priority="${1}..${1}" --since "24 hours ago" --no-pager -q 2>/dev/null | \
        awk '{print $5}' | sed 's/\[.*//; s/://'
}

# Count the last-24h journal lines of one identifier at one priority.
# The identifier list for each priority is extracted once and cached in
# CACHE_DIR, so journalctl runs once per priority instead of once per
# unit+priority pair. Without a usable CACHE_DIR the count is computed directly.
# The unit is matched as a fixed, whole-line string: regex metacharacters such
# as "." are literal, and names containing "/" or starting with "-" are safe.
# Globals: CACHE_DIR (read)
# Args:    $1 - unit/identifier name, $2 - priority number (0-4)
# Outputs: the number of matching lines on stdout (0 for an empty name)
get_unit_priority_count() {
    local unit="$1"
    local priority_num="$2"
    local idents_file
    local count

    if [ -z "$unit" ]; then
        echo "0"
        return
    fi

    if [ -n "$CACHE_DIR" ] && [ -d "$CACHE_DIR" ]; then
        idents_file="$CACHE_DIR/idents_p${priority_num}"
        if [ ! -f "$idents_file" ]; then
            _journal_idents_for_priority "$priority_num" > "$idents_file" 2>/dev/null
        fi
        # grep -c prints 0 (and exits 1) when nothing matches
        count=$(grep -cxF -- "$unit" "$idents_file" 2>/dev/null)
    else
        count=$(_journal_idents_for_priority "$priority_num" | grep -cxF -- "$unit")
    fi

    echo "${count:-0}"
}

# Report journal disk usage in bytes.
# Parses the first "<number><suffix>" token of `journalctl --disk-usage`, e.g.
#   "Archived and active journals take up 123.4M in the file system."
# Suffixes B, K, M, G, T, P and E are understood as powers of 1024. Only the
# first suffix letter is used, so a token written as "123.4MB" is also accepted.
# Outputs: the size in bytes on stdout, or 0 when no size could be parsed
get_journal_disk_usage() {
    local output size_str number suffix exponent

    output=$(journalctl --disk-usage 2>/dev/null)
    size_str=$(printf '%s\n' "$output" | grep -oE '[0-9]+(\.[0-9]+)?[BKMGTPE]' | head -n 1)

    if [ -z "$size_str" ]; then
        echo "0"
        return
    fi

    # The regex guarantees exactly one trailing suffix letter
    number="${size_str%?}"
    suffix="${size_str#"$number"}"

    case "$suffix" in
        B) exponent=0 ;;
        K) exponent=1 ;;
        M) exponent=2 ;;
        G) exponent=3 ;;
        T) exponent=4 ;;
        P) exponent=5 ;;
        E) exponent=6 ;;
        *) echo "0"; return ;;
    esac

    # Values are passed with -v rather than spliced into the awk program text
    awk -v n="$number" -v e="$exponent" 'BEGIN { printf "%.0f\n", n * (1024 ^ e) }'
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Escape a string for use inside a double-quoted Prometheus label value:
# backslash, double quote and newline become \\, \" and \n.
# Args:    $1 - raw label value
# Outputs: the escaped value on stdout (no trailing newline)
_journal_error_escape_label() {
    local value="$1"
    local bs='\' dq='"' nl=$'\n'
    # Quoted expansions keep both the pattern and the replacement literal
    value=${value//"$bs"/"$bs$bs"}
    value=${value//"$dq"/"$bs$dq"}
    value=${value//"$nl"/"${bs}n"}
    printf '%s' "$value"
}

# Generate all Prometheus metrics in text exposition format.
# Globals: PRIORITY_LABELS (read); CACHE_DIR (set via setup_cache, cleared via
#          cleanup_cache)
# Outputs: metrics on stdout
# Returns: 0 normally (including when journalctl is missing, in which case
#          only "journal_error_up 0" is printed); 1 when the cache directory
#          cannot be created ("journal_error_up 0" is printed as well)
# Side effects: installs an EXIT trap that calls cleanup_cache
generate_metrics() {
    local script_start
    script_start=$(date +%s)

    local priority_num priority_label period_label label since val
    local top_units_list count unit unit_label total rate
    local disk_usage script_end script_duration

    # Verify journalctl is available
    if ! command -v journalctl >/dev/null 2>&1; then
        cat <<EOF
# HELP journal_error_up Journal error exporter status
# TYPE journal_error_up gauge
journal_error_up 0
EOF
        return
    fi

    # Without a cache directory the helpers would have nowhere safe to write,
    # so report the exporter as down instead of continuing.
    if ! setup_cache || [ -z "$CACHE_DIR" ]; then
        cat <<EOF
# HELP journal_error_up Journal error exporter status
# TYPE journal_error_up gauge
journal_error_up 0
EOF
        return 1
    fi
    # Safety net for early termination; the normal path cleans up explicitly
    # at the end of this function.
    trap cleanup_cache EXIT

    cat <<EOF
# HELP journal_error_up Journal error exporter status
# TYPE journal_error_up gauge
journal_error_up 1

# HELP journal_error_exporter_info Journal error exporter information
# TYPE journal_error_exporter_info gauge
journal_error_exporter_info{version="1.0"} 1
EOF

    echo ""

    # ========================================================================
    # Message Counts
    # ========================================================================
    cat <<EOF
# HELP journal_error_messages_total Journal messages per priority and period
# TYPE journal_error_messages_total gauge
EOF

    for priority_num in 0 1 2 3 4; do
        priority_label="${PRIORITY_LABELS[$priority_num]}"
        # Each entry is "<period label>:<journalctl --since expression>"
        for period_label in "1h:1 hour ago" "24h:24 hours ago" "7d:7 days ago"; do
            label="${period_label%%:*}"
            since="${period_label#*:}"
            val=$(get_priority_count "$priority_num" "$since")
            echo "journal_error_messages_total{priority=\"$priority_label\",period=\"$label\"} ${val:-0}"
        done
    done

    echo ""

    # ========================================================================
    # Per-Unit Breakdown (24h, top 20 units by err+ count)
    # ========================================================================
    cat <<EOF
# HELP journal_error_unit_messages Per-unit message count by priority (24h, top 20 units)
# TYPE journal_error_unit_messages gauge
EOF

    top_units_list=$(get_top_units 20)

    if [ -n "$top_units_list" ]; then
        while read -r count unit; do
            [ -z "$unit" ] && continue
            # The raw name is used for the lookup, the escaped one for output
            unit_label=$(_journal_error_escape_label "$unit")
            for priority_num in 0 1 2 3 4; do
                priority_label="${PRIORITY_LABELS[$priority_num]}"
                val=$(get_unit_priority_count "$unit" "$priority_num")
                echo "journal_error_unit_messages{unit=\"$unit_label\",priority=\"$priority_label\"} ${val:-0}"
            done
        done <<< "$top_units_list"
    fi

    echo ""

    # ========================================================================
    # Top Offenders
    # ========================================================================
    cat <<EOF
# HELP journal_error_top_unit_count Top 20 units by combined err+crit+alert+emerg count (24h)
# TYPE journal_error_top_unit_count gauge
EOF

    if [ -n "$top_units_list" ]; then
        while read -r count unit; do
            [ -z "$unit" ] && continue
            unit_label=$(_journal_error_escape_label "$unit")
            echo "journal_error_top_unit_count{unit=\"$unit_label\"} ${count:-0}"
        done <<< "$top_units_list"
    fi

    echo ""

    # ========================================================================
    # Rates
    # ========================================================================
    cat <<EOF
# HELP journal_error_rate_per_hour Average messages per hour over 24h by priority
# TYPE journal_error_rate_per_hour gauge
EOF

    for priority_num in 0 1 2 3 4; do
        priority_label="${PRIORITY_LABELS[$priority_num]}"
        total=$(get_priority_count "$priority_num" "24 hours ago")
        total=${total:-0}

        # The numeric test also rejects non-numeric input (its error is muted)
        if [ "$total" -gt 0 ] 2>/dev/null; then
            rate=$(awk -v total="$total" 'BEGIN { printf "%.2f", total / 24 }' 2>/dev/null || echo "0.00")
        else
            rate="0.00"
        fi
        echo "journal_error_rate_per_hour{priority=\"$priority_label\"} $rate"
    done

    echo ""

    # ========================================================================
    # Journal Health
    # ========================================================================
    disk_usage=$(get_journal_disk_usage)

    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    cat <<EOF
# HELP journal_error_journal_disk_usage_bytes Journal disk usage in bytes
# TYPE journal_error_journal_disk_usage_bytes gauge
journal_error_journal_disk_usage_bytes ${disk_usage:-0}

# HELP journal_error_exporter_duration_seconds Time to generate all metrics
# TYPE journal_error_exporter_duration_seconds gauge
journal_error_exporter_duration_seconds $script_duration

# HELP journal_error_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE journal_error_exporter_last_run_timestamp gauge
journal_error_exporter_last_run_timestamp $script_end
EOF

    echo ""

    # Remove the cache now rather than at shell exit, so repeated calls in
    # one shell do not accumulate temp directories.
    cleanup_cache
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Run a minimal single-connection HTTP server using netcat.
# Requests whose first line starts with "GET /metrics" receive freshly
# generated metrics; any other request receives the HTML landing page.
#
# nc's stdout (the client's request) is piped into the handler, and the
# handler's response reaches nc's stdin through a FIFO. This lets the handler
# read the real request line and generate metrics only after a client has
# connected, rather than reading the script's own stdin and pre-generating
# the response before anyone connects.
# Globals: HTTP_PORT (read)
# Exits:   1 if nc is missing or the temporary FIFO cannot be created;
#          otherwise loops until the script is killed
run_http_server() {
    echo "Starting journal error exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    local fifo_dir response_fifo
    if ! fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/journal-error-exporter-http.XXXXXX"); then
        echo "ERROR: could not create temporary directory for HTTP mode" >&2
        exit 1
    fi
    response_fifo="$fifo_dir/response"
    if ! mkfifo "$response_fifo"; then
        rm -rf "$fifo_dir"
        echo "ERROR: could not create response FIFO $response_fifo" >&2
        exit 1
    fi
    # The path is expanded now (hence the double quotes) because the local
    # variable may be out of scope when the EXIT trap runs.
    # shellcheck disable=SC2064
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    local request header

    # Infinite loop: one nc listener (and one request) per iteration
    while true; do
        # cat is deliberate: opening a FIFO for reading blocks until a writer
        # appears, so a "<" redirect on nc would stop nc from ever listening.
        # shellcheck disable=SC2002
        cat "$response_fifo" | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null | {
            request=""
            IFS= read -r request

            # Consume the remaining request headers up to the blank line; the
            # timeout keeps a client that never finishes its request from
            # blocking the server indefinitely.
            if [ -n "$request" ]; then
                while IFS= read -r -t 5 header; do
                    [ -z "${header%$'\r'}" ] && break
                done
            fi

            # Always open the FIFO for writing, even with nothing to send:
            # this is what releases the blocked cat when nc produced no request
            # (for example because it could not bind the port).
            {
                if [[ "$request" =~ ^GET\ /metrics ]]; then
                    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
                    generate_metrics
                elif [ -n "$request" ]; then
                    # Serve HTML landing page for other requests
                    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
                    cat <<EOF
<!DOCTYPE html>
<html>
<head><title>Journal Error Exporter v1.0</title></head>
<body>
<h1>Journal Error Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Metric Categories</h2>
<ul>
<li>Core Status: exporter up/down, version info</li>
<li>Message Counts: per-priority counts (emerg/alert/crit/err/warning) per period</li>
<li>Per-Unit Breakdown: per-unit message counts by priority (24h, top 20)</li>
<li>Top Offenders: top 20 units by error+ count (24h)</li>
<li>Rates: average messages per hour by priority (24h)</li>
<li>Journal Health: disk usage, exporter runtime</li>
</ul>
</body>
</html>
EOF
                fi
            } > "$response_fifo"

            # Non-zero status when no request arrived, so the loop backs off
            [ -n "$request" ]
        } || sleep 1
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - parses arguments and routes to the selected output mode.
#   HTTP mode:    run the HTTP server (blocks until killed)
#   File mode:    write metrics to OUTPUT_FILE through a temp file + rename
#   Default mode: print metrics to stdout
# Globals: HTTP_MODE, OUTPUT_FILE (read, after parse_args has set them)
# Args:    the script's command-line arguments
# Exits:   1 in file mode when the output directory or temp file cannot be
#          created, metric generation fails, the result has fewer than 10
#          lines, or the final rename fails (the previous file is kept)
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        # Run HTTP server (blocks until killed)
        run_http_server
        return
    fi

    if [ -z "$OUTPUT_FILE" ]; then
        # Default: output to stdout
        generate_metrics
        return
    fi

    # Textfile collector mode: write atomically using temp file
    local output_dir temp_file file_lines
    output_dir="$(dirname "$OUTPUT_FILE")"
    if ! mkdir -p "$output_dir"; then
        echo "ERROR: Cannot create output directory $output_dir" >&2
        exit 1
    fi

    # Create temp file in SAME directory for atomic rename (same filesystem)
    if ! temp_file=$(mktemp "${output_dir}/.journal_error_metrics.XXXXXX"); then
        echo "ERROR: Cannot create temp file in $output_dir" >&2
        exit 1
    fi

    # Generate metrics to temp file
    if ! generate_metrics > "$temp_file" 2>/dev/null; then
        rm -f "$temp_file"
        echo "ERROR: Failed to generate metrics" >&2
        exit 1
    fi

    # Validate: file must have content
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

    if [ "$file_lines" -lt 10 ]; then
        rm -f "$temp_file"
        echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
        exit 1
    fi

    # Set permissions before move
    chmod 644 "$temp_file"

    # Atomic rename - no gap where file is missing. A failed rename must not
    # be reported as success, and must not leave the temp file behind.
    if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
        rm -f "$temp_file"
        echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
        exit 1
    fi

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}

# Script entry point: run main with the arguments given on the command line
main "$@"