#!/bin/bash
################################################################################
# Script Name: web-traffic-exporter.sh
# Version: 1.8
# Description: Prometheus exporter for web server access log traffic metrics.
#              Parses Nginx/Apache access logs and exports request counts,
#              status codes, bandwidth, unique visitors, top paths, referrers,
#              bot detection, and protocol distribution.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - Standard Unix tools (awk, grep, tail)
#   - netcat (nc) for HTTP mode
#   - Read access to web server access logs
#
# Usage:
#   # Output to stdout
#   ./web-traffic-exporter.sh
#
#   # HTTP server mode
#   ./web-traffic-exporter.sh --http -p 9199
#
#   # Textfile collector mode
#   ./web-traffic-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - web_traffic_up - Exporter status (1=up, 0=down)
#     - web_traffic_exporter_info{version} - Exporter version
#
#   Request Totals:
#     - web_traffic_requests - Total requests in parsed window
#     - web_traffic_requests_by_status{status} - Per HTTP status code
#     - web_traffic_requests_by_class{class} - Per status class (2xx etc)
#     - web_traffic_requests_by_method{method} - Per HTTP method
#
#   Bandwidth:
#     - web_traffic_response_bytes - Total bytes sent
#     - web_traffic_response_bytes_by_class{class} - Bytes per class
#
#   Unique Visitors:
#     - web_traffic_unique_ips - Unique source IPs
#     - web_traffic_unique_user_agents - Unique user agents
#
#   Top Paths (top 10):
#     - web_traffic_top_path_requests{path,rank} - Hits per path
#
#   Top Referrers (top 10):
#     - web_traffic_top_referrer_requests{referrer,rank} - Per referrer
#
#   Bot Detection:
#     - web_traffic_bot_requests - Bot requests
#     - web_traffic_human_requests - Non-bot requests
#     - web_traffic_bot_ratio - Bot / total ratio
#
#   Downloads:
#     - web_traffic_downloads - Total file downloads
#     - web_traffic_downloads_bytes - Bytes from downloads
#     - web_traffic_top_download_requests{file,rank} - Top downloads
#
#   Status Breakdown:
#     - web_traffic_status_2xx
#     - web_traffic_status_3xx
#     - web_traffic_status_4xx
#     - web_traffic_status_5xx
#
#   Request Rate:
#     - web_traffic_requests_per_minute - Estimated from log timestamps
#
#   Protocol:
#     - web_traffic_requests_by_protocol{protocol} - HTTP version
#
#   Hourly Patterns:
#     - web_traffic_requests_by_hour{hour} - Requests per hour of day
#
#   Derived Metrics:
#     - web_traffic_average_response_bytes - Average response size
#     - web_traffic_error_ratio - Ratio of 4xx+5xx to total requests
#
#   404 Errors:
#     - web_traffic_404_path_requests{path,rank} - Top 404 paths
#     - web_traffic_top_404_referrer_requests{referrer,rank} - Top 404 referrers
#
#   Top Paths by Bandwidth:
#     - web_traffic_top_path_response_bytes{path,rank} - Top paths by bytes
#
#   Content Type:
#     - web_traffic_page_views - Page view requests
#     - web_traffic_asset_requests - Asset requests (css/js/images)
#
#   Top Clients:
#     - web_traffic_top_client_requests{ip,rank} - Top 10 client IPs
#
#   Response Sizes:
#     - web_traffic_response_size_bucket{size} - Requests by size range
#
#   Top Bots:
#     - web_traffic_top_bot_requests{bot,rank} - Top 10 bot names
#
#   Server Status:
#     - web_traffic_server_running{server} - 1 if process found
#     - web_traffic_server_type{server} - Server info metric
#
#   Time Windows (daily/weekly/monthly):
#     - web_traffic_window_requests{window} - Requests in window
#     - web_traffic_window_bytes{window} - Bytes in window
#     - web_traffic_window_unique_ips{window} - Unique IPs in window
#     - web_traffic_window_bot_requests{window} - Bot requests in window
#     - web_traffic_window_requests_by_class{window,class} - Per class in window
#     - web_traffic_window_page_views{window} - Page views in window
#     - web_traffic_window_asset_requests{window} - Asset requests in window
#     - web_traffic_window_downloads{window} - Downloads in window
#     - web_traffic_window_downloads_bytes{window} - Download bytes in window
#     - web_traffic_window_human_requests{window} - Non-bot requests in window
#     - web_traffic_window_unique_user_agents{window} - Unique UAs in window
#
#   Exporter:
#     - web_traffic_exporter_duration_seconds - Script execution time
#     - web_traffic_exporter_last_run_timestamp - Last run timestamp
#     - web_traffic_exporter_lines_parsed - Lines parsed count
#
# Configuration:
#   Default HTTP port: 9199
#   Textfile directory: /var/lib/node_exporter
#   ACCESS_LOG: /var/log/nginx/access.log (or WEB_TRAFFIC_ACCESS_LOG env)
#   TAIL_LINES: 0 (all lines; or WEB_TRAFFIC_TAIL_LINES env)
#   MAX_ROTATED: 7 (or WEB_TRAFFIC_MAX_ROTATED env)
#   HTTP_TAIL_LINES: 5000 (or WEB_TRAFFIC_HTTP_TAIL_LINES env)
#   TRACK_UNIQUE_UA: 1 (or WEB_TRAFFIC_TRACK_UNIQUE_UA env; 0=disable)
#   REFERRER_MODE: host (or WEB_TRAFFIC_REFERRER_MODE env; host|full|off)
#   SITE_DOMAIN: (or WEB_TRAFFIC_SITE_DOMAIN env; e.g. mylinux.work)
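#     When set, the referrer signal for downloads only accepts referrers that
#     contain this domain; when unset, any non-empty referrer is accepted.
#     Requests under DOWNLOAD_PATH, download-tool user agents (wget, curl, ...)
#     and AI retrieval bots count as downloads regardless of referrer.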
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# --- Output / serving mode (changed by --textfile, -o, --http, -p) ---
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9199

# --- Log source (WEB_TRAFFIC_* environment first, flags may override) ---
ACCESS_LOG="${WEB_TRAFFIC_ACCESS_LOG:-/var/log/nginx/access.log}"
LOG_DIR="${WEB_TRAFFIC_LOG_DIR:-}"
LOG_FORMAT="${WEB_TRAFFIC_LOG_FORMAT:-combined}"
SERVER_TYPE="${WEB_TRAFFIC_SERVER_TYPE:-auto}"
# 0 means "parse every line"
TAIL_LINES="${WEB_TRAFFIC_TAIL_LINES:-0}"

# --- Download classification ---
SITE_DOMAIN="${WEB_TRAFFIC_SITE_DOMAIN:-}"
DOWNLOAD_PATH="${WEB_TRAFFIC_DOWNLOAD_PATH:-/downloads/}"

# --- Operational safety / performance knobs ---
# Cap on how many rotated files are read (keeps scrape cost bounded).
MAX_ROTATED="${WEB_TRAFFIC_MAX_ROTATED:-7}"
# Line count tailed in HTTP mode unless the user explicitly set tail-lines.
HTTP_TAIL_LINES_DEFAULT="${WEB_TRAFFIC_HTTP_TAIL_LINES:-5000}"
# Memory/cardinality control: 1 = track unique user agents, 0 = disable.
TRACK_UNIQUE_UA="${WEB_TRAFFIC_TRACK_UNIQUE_UA:-1}"
# Referrer label granularity: host | full | off
REFERRER_MODE="${WEB_TRAFFIC_REFERRER_MODE:-host}"

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

prom_escape() {
    # Make a string safe for use as a Prometheus label value.
    # Backslash, double quote and newline are replaced by their two-character
    # escapes (\\, \", \n); everything else is copied through unchanged.
    # See: https://prometheus.io/docs/instrumenting/exposition_formats/
    # Arguments: $1 - raw label value
    # Outputs:   escaped value followed by a newline on stdout
    local input="$1"
    local escaped=""
    local ch idx

    # Single pass over the input, one character at a time.
    for ((idx = 0; idx < ${#input}; idx++)); do
        ch="${input:idx:1}"
        case "$ch" in
            '\')   escaped+='\\' ;;
            '"')   escaped+='\"' ;;
            $'\n') escaped+='\n' ;;
            *)     escaped+="$ch" ;;
        esac
    done

    printf '%s\n' "$escaped"
}

show_usage() {
    # Print the help text to stdout and terminate the script with status 0.
    # The heredoc delimiter is unquoted, so $0, $HTTP_PORT, $ACCESS_LOG and
    # $TAIL_LINES are expanded to their current values when the text is printed.
    # Invoked by parse_args for -h/--help.
    cat <<EOF
Usage: $0 [OPTIONS]

Export web server access log traffic metrics as Prometheus metrics (v1.8).

MODES:
    --textfile      Write to node_exporter textfile collector
    --http          Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port          HTTP port (default: 9199)
    -o, --output        Output file path
    --access-log PATH   Path to access log (default: $ACCESS_LOG)
    --log-dir DIR       Directory of per-domain logs (e.g., /var/log/nginx/domains)
    --log-format FMT    Log format: combined or common (default: combined)
    --tail-lines NUM    Number of log lines to parse (default: $TAIL_LINES, 0=all)
    --server-type TYPE  Server type: auto, nginx, apache (default: auto)
    --site-domain DOM   Your domain (e.g. mylinux.work) — downloads only count
                        when referred from this domain (filters bot/scanner hits)
    --download-path P   URL path prefix for downloads (default: /downloads/)
                        All requests under this path count as downloads

EXAMPLES:
    $0 --textfile                          # Write to textfile collector
    $0 --http --port 9199                  # Run HTTP server
    $0 --access-log /var/log/apache2/access.log  # Use Apache log
    $0 --log-dir /var/log/nginx/domains    # Parse all domain logs (HestiaCP)
    $0 -o /tmp/web_traffic.prom            # Write to custom file

ENVIRONMENT VARIABLES:
    WEB_TRAFFIC_ACCESS_LOG     Path to access log
    WEB_TRAFFIC_LOG_DIR        Directory of per-domain logs
    WEB_TRAFFIC_LOG_FORMAT     Log format: combined or common
    WEB_TRAFFIC_TAIL_LINES     Number of log lines to parse (0=all)
    WEB_TRAFFIC_SERVER_TYPE    Server type: auto, nginx, apache
    WEB_TRAFFIC_MAX_ROTATED    Max rotated log files to read (default: 7)
    WEB_TRAFFIC_HTTP_TAIL_LINES  Default tail lines in HTTP mode (default: 5000)
    WEB_TRAFFIC_TRACK_UNIQUE_UA  Track unique user agents: 1 or 0 (default: 1)
    WEB_TRAFFIC_REFERRER_MODE  Referrer tracking: host, full, or off (default: host)
    WEB_TRAFFIC_SITE_DOMAIN    Your domain — downloads only count from this referrer
    WEB_TRAFFIC_DOWNLOAD_PATH  URL path prefix for downloads (default: /downloads/)

SECTIONS (auto-detected, skipped if unavailable):
    - Request totals by status, class, and method
    - Bandwidth totals and by status class
    - Unique visitors (IPs and user agents)
    - Top 10 requested paths
    - Top 10 external referrers
    - Bot vs human traffic detection
    - HTTP protocol version distribution
    - Request rate estimation
    - Web server process detection

EOF
    # `exit`, not `return`: asking for help ends the whole script successfully.
    exit 0
}

parse_args() {
    # Parse command-line options into the global configuration variables.
    # Arguments: "$@" - the script's command-line arguments
    # Globals written: OUTPUT_FILE, HTTP_MODE, HTTP_PORT, ACCESS_LOG, LOG_DIR,
    #                  LOG_FORMAT, TAIL_LINES, SERVER_TYPE, SITE_DOMAIN,
    #                  DOWNLOAD_PATH
    # Globals read:    TEXTFILE_DIR (base directory for --textfile)
    # Exits: 1 on an unknown option or when a value-taking option has no value;
    #        -h/--help delegates to show_usage.
    while [[ $# -gt 0 ]]; do
        # Value-taking options must be followed by a value. Without this guard,
        # `shift 2` fails when only one argument is left, nothing is shifted,
        # and the loop never terminates.
        case "$1" in
            -p|--port|-o|--output|--access-log|--log-dir|--log-format|\
            --tail-lines|--server-type|--site-domain|--download-path)
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            -h|--help) show_usage ;;
            --textfile) OUTPUT_FILE="$TEXTFILE_DIR/web_traffic.prom"; shift ;;
            --http) HTTP_MODE=true; shift ;;
            -p|--port) HTTP_PORT="$2"; shift 2 ;;
            -o|--output) OUTPUT_FILE="$2"; shift 2 ;;
            --access-log) ACCESS_LOG="$2"; shift 2 ;;
            --log-dir) LOG_DIR="$2"; shift 2 ;;
            --log-format) LOG_FORMAT="$2"; shift 2 ;;
            --tail-lines) TAIL_LINES="$2"; shift 2 ;;
            --server-type) SERVER_TYPE="$2"; shift 2 ;;
            --site-domain) SITE_DOMAIN="$2"; shift 2 ;;
            --download-path) DOWNLOAD_PATH="$2"; shift 2 ;;
            *) echo "Unknown option: $1" >&2; exit 1 ;;
        esac
    done
}

# ============================================================================
# SERVER DETECTION
# ============================================================================

detect_server_type() {
    # Print the web server type on stdout.
    # An explicit SERVER_TYPE (anything other than "auto") is echoed verbatim.
    # In "auto" mode running processes are probed by exact name, in order:
    # nginx -> "nginx", apache2 or httpd -> "apache"; no match -> "unknown".
    local candidate

    # Explicit configuration wins; no process probing.
    if [ "$SERVER_TYPE" != "auto" ]; then
        echo "$SERVER_TYPE"
        return
    fi

    for candidate in nginx apache2 httpd; do
        pgrep -x "$candidate" >/dev/null 2>&1 || continue
        case "$candidate" in
            nginx) echo "nginx" ;;
            *)     echo "apache" ;;
        esac
        return
    done

    echo "unknown"
}

# ============================================================================
# LOG STREAMING (current + rotated logs)
# ============================================================================

# Stream log content from rotated logs (oldest first), then the current log.
# Handles <log>.N (plain) and <log>.N.gz (compressed) files; when both exist
# for the same N, the compressed one is used.
# Arguments:
#   $1 - current log file path
#   $2 - max rotated files to include (default: 31); a non-numeric value
#        produces a warning on stderr and rotated logs are skipped
# Outputs: concatenated log lines on stdout; read errors are suppressed
# Returns: exit status of the final `cat` on the current log
stream_log_data() {
    local log_file="$1"
    local max_rotated="${2:-31}"
    local idx rotated

    if [[ ! "$max_rotated" =~ ^[0-9]+$ ]]; then
        echo "stream_log_data: invalid rotated-file count '$max_rotated'; skipping rotated logs" >&2
        max_rotated=0
    fi

    # Highest suffix = oldest file, so count down to emit in chronological
    # order. 10# forces base 10 so values such as "08" are not read as octal.
    for ((idx = 10#$max_rotated; idx >= 1; idx--)); do
        rotated="${log_file}.${idx}"
        if [ -f "${rotated}.gz" ]; then
            # gzip -dc rather than zcat: some zcat builds only look for ".Z".
            gzip -dc -- "${rotated}.gz" 2>/dev/null
        elif [ -f "$rotated" ]; then
            cat -- "$rotated" 2>/dev/null
        fi
    done

    # Current (newest) log goes last.
    cat -- "$log_file" 2>/dev/null
}

# ============================================================================
# LOG PARSING (single-pass awk)
# ============================================================================

# Parse access log lines and output all metrics data in a structured format.
# This uses a single awk pass for performance.
# Output format: KEY value pairs, one per line
parse_access_log() {
    local log_file="$1"
    local num_lines="$2"
    local format="$3"

    [ -f "$log_file" ] || return
    [ -r "$log_file" ] || return

    local now_epoch
    now_epoch=$(date +%s)
    local cutoff_daily=$((now_epoch - 86400))
    local cutoff_weekly=$((now_epoch - 604800))
    local cutoff_monthly=$((now_epoch - 2592000))

    # Check if rotated logs exist for this file
    local has_rotated=false
    if [ -f "${log_file}.1" ] || [ -f "${log_file}.1.gz" ]; then
        has_rotated=true
    fi

    # Stream log data into awk: use rotated logs for full monthly history,
    # or read the entire current log so daily/weekly/monthly windows differ
    if [ "$has_rotated" = true ]; then
        stream_log_data "$log_file" "$MAX_ROTATED"
    elif [ "$num_lines" -gt 0 ] 2>/dev/null; then
        tail -n "$num_lines" "$log_file" 2>/dev/null
    else
        cat "$log_file" 2>/dev/null
    fi | awk -v fmt="$format" \
        -v track_ua="$TRACK_UNIQUE_UA" \
        -v ref_mode="$REFERRER_MODE" \
        -v site_domain="$SITE_DOMAIN" \
        -v download_path="$DOWNLOAD_PATH" \
        -v cutoff_daily="$cutoff_daily" \
        -v cutoff_weekly="$cutoff_weekly" \
        -v cutoff_monthly="$cutoff_monthly" '
    BEGIN {
        total = 0
        total_bytes = 0
        bot_count = 0
        lines_parsed = 0
        first_ts = ""
        last_ts = ""

        # Month cumulative day offsets (non-leap), 1-based month indexing
        split("0,31,59,90,120,151,181,212,243,273,304,334", mdays, ",")

        # Month name to number lookup
        split("Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec", mn, ",")
        for (i = 1; i <= 12; i++) month_num[mn[i]] = i

        # Window counters
        win_requests["daily"] = 0; win_requests["weekly"] = 0; win_requests["monthly"] = 0
        win_bytes["daily"] = 0; win_bytes["weekly"] = 0; win_bytes["monthly"] = 0
        win_bots["daily"] = 0; win_bots["weekly"] = 0; win_bots["monthly"] = 0
    }
    function count_elems(a,    k, n) { n=0; for (k in a) n++; return n }

    # Track min/max epoch for stable RPM even if log lines arrive out-of-order
    function parse_log_epoch(ts,    parts, dparts, tparts, day, mon, year, hh, mm, ss, tz, sign, tzh, tzm, epoch) {
        # Format: 17/Mar/2026:10:00:00 +0000
        split(ts, parts, " ")
        split(parts[1], dparts, "/")
        day = dparts[1]+0
        mon = month_num[dparts[2]]
        # year:HH:MM:SS
        split(dparts[3], tparts, ":")
        year = tparts[1]+0; hh = tparts[2]+0; mm = tparts[3]+0; ss = tparts[4]+0
        if (mon < 1) return 0

        # Portable epoch calculation (no mktime dependency)
        # Days from year
        epoch = (year - 1970) * 365 + int((year - 1969) / 4) - int((year - 1901) / 100) + int((year - 1601) / 400)
        # Days from months
        epoch += mdays[mon] + day - 1
        # Leap day adjustment for current year
        if (mon > 2 && (year % 4 == 0 && (year % 100 != 0 || year % 400 == 0))) epoch++
        # Convert to seconds and add time
        epoch = epoch * 86400 + hh * 3600 + mm * 60 + ss

        # Apply timezone offset
        tz = parts[2]
        if (tz != "") {
            sign = (substr(tz, 1, 1) == "-") ? 1 : -1
            tzh = substr(tz, 2, 2) + 0
            tzm = substr(tz, 4, 2) + 0
            epoch += sign * (tzh * 3600 + tzm * 60)
        }
        return epoch
    }
    {
        lines_parsed++

        # Parse combined/common log format using field splitting
        # 1.2.3.4 - - [17/Mar/2026:10:00:00 +0000] "GET /path HTTP/1.1" 200 1234 "ref" "ua"
        ip = $1

        # Extract timestamp between [ and ]
        timestamp = ""
        if (match($0, /\[([^\]]+)\]/) ) {
            timestamp = substr($0, RSTART+1, RLENGTH-2)
        }

        # Extract the request line between first pair of quotes
        request_line = ""
        p1 = index($0, "\"")
        if (p1 > 0) {
            rest = substr($0, p1+1)
            p2 = index(rest, "\"")
            if (p2 > 0) {
                request_line = substr(rest, 1, p2-1)
            }
        }

        if (request_line == "") next

        # Split request line: METHOD PATH PROTOCOL
        n_req = split(request_line, req_parts, " ")
        if (n_req < 2) next
        method = req_parts[1]
        if (method != "GET" && method != "HEAD" && method != "POST" && method != "PUT" && method != "DELETE" && method != "PATCH" && method != "OPTIONS" && method != "CONNECT" && method != "TRACE") next
        path = req_parts[2]
        protocol = (n_req >= 3) ? req_parts[3] : ""

        # After the closing quote of request line, find status and bytes
        after_req = substr($0, p1 + 1 + p2)
        gsub(/^ +/, "", after_req)
        n_after = split(after_req, after_parts, " ")
        if (n_after < 2) next
        status = after_parts[1]
        bytes = after_parts[2]

        if (status !~ /^[0-9]+$/) next

        if (first_ts == "") first_ts = timestamp
        last_ts = timestamp

        if (bytes == "-") bytes = 0

        total++
        total_bytes += bytes

        # Status codes
        status_count[status]++

        # Status classes
        class_code = substr(status, 1, 1) "xx"
        class_count[class_code]++
        class_bytes[class_code] += bytes

        # Methods
        method_count[method]++

        # Unique IPs
        ips[ip] = 1

        # Paths (clean query strings for grouping)
        split(path, pathparts, "?")
        clean_path = pathparts[1]
        path_count[clean_path]++
        path_bytes[clean_path] += bytes

        # Download tracking — deferred until after bot detection (see below)
        is_download = 0
        is_downloadable = 0
        is_path_download = 0
        if (method == "GET" && substr(status, 1, 1) == "2") {
            if (download_path != "" && index(clean_path, download_path) == 1) {
                is_downloadable = 1
                is_path_download = 1
            } else if (clean_path ~ /\.(sh|ps1|py|pl|rb|json|yml|yaml|xml|csv|conf|cfg|prom|txt)$/ \
                || clean_path ~ /\.(zip|tar|gz|tgz|bz2|xz|7z|rar)$/ \
                || clean_path ~ /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods)$/ \
                || clean_path ~ /\.(deb|rpm|msi|exe|dmg|pkg|appimage|AppImage)$/ \
                || clean_path ~ /\.(iso|img|bin|run)$/) {
                is_downloadable = 1
            }
        }

        # Protocol
        if (protocol != "") {
            gsub(/^ +| +$/, "", protocol)
            if (protocol != "") proto_count[protocol]++
        }

        is_bot = 0

        # Referrer and User-Agent (combined format only)
        if (fmt == "combined") {
            ref = ""
            ua = ""
            # Split the whole line by double-quote to extract quoted fields
            n = split($0, qparts, "\"")
            # qparts[2] = request line, qparts[4] = referrer, qparts[6] = user-agent
            if (n >= 6) {
                ref = qparts[4]
                ua = qparts[6]
            } else if (n >= 4) {
                ref = qparts[4]
            }

            # Referrer counting: reduce cardinality by default (host-only).
            if (ref_mode != "off" && ref != "" && ref != "-") {
                ref_key = ref
                if (ref_mode == "host") {
                    # Extract host from http(s)://host/... (portable, no gawk capture groups)
                    ref_key = ref
                    sub(/^https?:\/\//, "", ref_key)
                    sub(/\/.*/, "", ref_key)
                }
                referrer_count[ref_key]++
                if (status == "404" && ref_key != "") {
                    error_404_referrer[ref_key]++
                }
            }

            # Unique UA tracking can be expensive; allow disabling.
            if (track_ua == 1 && ua != "" && ua != "-") {
                user_agents[ua] = 1
            }

            # Bot detection
            # NOTE: Google uses multiple crawler UA tokens beyond plain "Googlebot"
            # See: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
            if (ua ~ /(Googlebot([-/][A-Za-z0-9._]+)?|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/ ||
                ua ~ /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/ ||
                ua ~ /([Bb]ot|[Cc]rawl|[Ss]pider|[Ss]lurp|bingbot|BingPreview|YandexBot|Baiduspider|DuckDuckBot|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|MJ12bot|AhrefsBot|SemrushBot|DotBot|PetalBot)/) {
                bot_count++
                is_bot = 1
            }
        }

        # AI retrieval bots (ChatGPT-User, PerplexityBot) fetch content on
        # behalf of a real user — count their downloads as legitimate.
        is_ai_retrieval = 0
        if (is_bot && ua ~ /(ChatGPT-User|PerplexityBot)/) {
            is_ai_retrieval = 1
        }

        # Count downloads only for non-bot requests that look like real
        # users (plus AI retrieval bots).  Signals indicating a real
        # download:
        #   1. Path-based: any non-bot hit on --download-path (the path
        #      itself signals intent — nobody browses /downloads/ casually)
        #   2. AI retrieval bot (ChatGPT-User, PerplexityBot — user asked)
        #   3. Download-tool UA (wget, curl, aria2 — user copied the URL)
        #   4. Referrer from the site (user clicked a download link)
        # Extension-based downloads outside the download path still require
        # signal 2-4 to avoid counting embedded/linked assets.
        if (is_downloadable && (!is_bot || is_ai_retrieval)) {
            is_real_download = 0
            # Path-based downloads: trust all non-bot requests
            if (is_path_download) {
                is_real_download = 1
            }
            # AI retrieval bots are always real downloads
            if (!is_real_download && is_ai_retrieval) {
                is_real_download = 1
            }
            # Check for download-tool user agents
            if (!is_real_download && (ua ~ /^(Wget|curl|aria2|libcurl|Go-http-client|python-requests|HTTPie)/ ||
                ua ~ /^(ufw-threat-feeds|ufw-blocklist|iptables-threat-feeds|iptables-blocklist)/)) {
                is_real_download = 1
            }
            # Check for site referrer (browser click)
            if (!is_real_download && ref != "" && ref != "-") {
                if (site_domain != "") {
                    if (index(ref, site_domain) > 0) is_real_download = 1
                } else {
                    is_real_download = 1
                }
            }
            if (is_real_download) {
                download_total++
                download_bytes += bytes
                download_count[clean_path]++
                is_download = 1
            }
        }

        # Time-windowed stats + min/max epoch
        epoch = parse_log_epoch(timestamp)

        # Hourly traffic patterns (last 24 hours only)
        if (epoch > 0 && epoch >= cutoff_daily) {
            split(timestamp, ts_parts, ":")
            hour = ts_parts[2]+0
            hour_count[hour]++
        }
        if (epoch > 0) {
            if (min_epoch == "" || epoch < min_epoch) min_epoch = epoch
            if (max_epoch == "" || epoch > max_epoch) max_epoch = epoch
        }

        # 404 error paths
        if (status == "404") {
            error_404_count[clean_path]++
        }

        # Page views vs assets
        is_page = 0
        is_asset = 0
        if (clean_path ~ /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|ico|webp)$/) {
            asset_requests++
            is_asset = 1
        } else if (clean_path ~ /\/$/ || clean_path ~ /\.html?$/ || clean_path !~ /\.[a-zA-Z0-9]+$/) {
            page_views++
            is_page = 1
        }

        # Top client IPs (already tracking ips[ip]=1, add counter)
        ip_count[ip]++

        # Response size distribution
        if (bytes+0 <= 1024) {
            size_bucket["tiny"]++
        } else if (bytes+0 <= 10240) {
            size_bucket["small"]++
        } else if (bytes+0 <= 102400) {
            size_bucket["medium"]++
        } else if (bytes+0 <= 1048576) {
            size_bucket["large"]++
        } else {
            size_bucket["huge"]++
        }

        # Bot name extraction (when bot detected)
        if (is_bot && ua != "") {
            bot_name = ""
            # Prefer specific/official bot tokens first (better grouping)
            if (match(ua, /(Googlebot-[A-Za-z0-9._-]+|Googlebot\/[0-9.]+|Googlebot|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(bingbot|BingPreview)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(YandexBot|Baiduspider|DuckDuckBot|Slurp)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(facebookexternalhit|Twitterbot|LinkedInBot|Applebot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /([Bb]ot[a-zA-Z]*|[Cc]rawler|[Ss]pider)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            }
            if (bot_name != "") {
                bot_name_count[bot_name]++
            }
        }

        if (epoch > 0) {
            if (epoch >= cutoff_daily) {
                win_requests["daily"]++
                win_bytes["daily"] += bytes
                win_ips_daily[ip] = 1
                if (is_bot) win_bots["daily"]++
                if (!is_bot) win_human_daily++
                win_class_daily[class_code]++
                if (is_page) win_page_views_daily++
                if (is_asset) win_asset_requests_daily++
                if (is_download) win_downloads_daily++
                win_downloads_bytes_daily += (is_download ? bytes : 0)
                if (track_ua == 1 && ua != "" && ua != "-") win_uas_daily[ua] = 1
            }
            if (epoch >= cutoff_weekly) {
                win_requests["weekly"]++
                win_bytes["weekly"] += bytes
                win_ips_weekly[ip] = 1
                if (is_bot) win_bots["weekly"]++
            }
            if (epoch >= cutoff_monthly) {
                win_requests["monthly"]++
                win_bytes["monthly"] += bytes
                win_ips_monthly[ip] = 1
                if (is_bot) win_bots["monthly"]++
            }
        }
    }
    END {
        print "LINES_PARSED " lines_parsed
        print "TOTAL_REQUESTS " total
        print "TOTAL_BYTES " total_bytes
        print "BOT_REQUESTS " bot_count
        print "HUMAN_REQUESTS " (total - bot_count)
        print "UNIQUE_IPS " count_elems(ips)
        print "UNIQUE_UAS " count_elems(user_agents)
        print "FIRST_TS " first_ts
        print "LAST_TS " last_ts
        print "MIN_EPOCH " (min_epoch=="" ? 0 : min_epoch)
        print "MAX_EPOCH " (max_epoch=="" ? 0 : max_epoch)

        # Status codes
        for (s in status_count) {
            print "STATUS " s " " status_count[s]
        }

        # Status classes
        for (c in class_count) {
            print "CLASS " c " " class_count[c]
        }

        # Class bytes
        for (c in class_bytes) {
            print "CLASS_BYTES " c " " class_bytes[c]
        }

        # Methods
        for (m in method_count) {
            print "METHOD " m " " method_count[m]
        }

        # Protocols
        for (p in proto_count) {
            print "PROTOCOL " p " " proto_count[p]
        }

        # Top paths (sort by count, output top 10)
        # We use a simple selection approach
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_path = ""
            for (p in path_count) {
                if (path_count[p] > max_count) {
                    max_count = path_count[p]
                    max_path = p
                }
            }
            if (max_path != "") {
                print "TOP_PATH " i " " max_count " " max_path
                delete path_count[max_path]
            }
        }

        # Top paths by bytes
        for (i = 1; i <= 10; i++) {
            max_bytes = 0
            max_path = ""
            for (p in path_bytes) {
                if (path_bytes[p] > max_bytes) {
                    max_bytes = path_bytes[p]
                    max_path = p
                }
            }
            if (max_path != "") {
                print "TOP_PATH_BYTES " i " " max_bytes " " max_path
                delete path_bytes[max_path]
            }
        }

        # Top referrers (sort by count, output top 10)
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_ref = ""
            for (r in referrer_count) {
                if (referrer_count[r] > max_count) {
                    max_count = referrer_count[r]
                    max_ref = r
                }
            }
            if (max_ref != "") {
                print "TOP_REF " i " " max_count " " max_ref
                delete referrer_count[max_ref]
            }
        }

        # Downloads
        print "DOWNLOAD_TOTAL " download_total+0
        print "DOWNLOAD_BYTES " download_bytes+0
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_dl = ""
            for (d in download_count) {
                if (download_count[d] > max_count) {
                    max_count = download_count[d]
                    max_dl = d
                }
            }
            if (max_dl != "") {
                print "TOP_DOWNLOAD " i " " max_count " " max_dl
                delete download_count[max_dl]
            }
        }

        # Hourly distribution
        for (h = 0; h <= 23; h++) {
            printf "HOUR %02d %d\n", h, hour_count[h]+0
        }

        # Top 404 paths
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_path = ""
            for (p in error_404_count) {
                if (error_404_count[p] > max_count) {
                    max_count = error_404_count[p]
                    max_path = p
                }
            }
            if (max_path != "") {
                print "TOP_404 " i " " max_count " " max_path
                delete error_404_count[max_path]
            }
        }

        # Top 404 referrers
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_ref = ""
            for (r in error_404_referrer) {
                if (error_404_referrer[r] > max_count) {
                    max_count = error_404_referrer[r]
                    max_ref = r
                }
            }
            if (max_ref != "") {
                print "TOP_404_REF " i " " max_count " " max_ref
                delete error_404_referrer[max_ref]
            }
        }

        # Page views vs assets
        print "PAGE_VIEWS " page_views+0
        print "ASSET_REQUESTS " asset_requests+0

        # Top client IPs
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_ip = ""
            for (p in ip_count) {
                if (ip_count[p] > max_count) {
                    max_count = ip_count[p]
                    max_ip = p
                }
            }
            if (max_ip != "") {
                print "TOP_IP " i " " max_count " " max_ip
                delete ip_count[max_ip]
            }
        }

        # Response size distribution
        sizes[1] = "tiny"; sizes[2] = "small"; sizes[3] = "medium"; sizes[4] = "large"; sizes[5] = "huge"
        for (s = 1; s <= 5; s++) {
            print "SIZE_BUCKET " sizes[s] " " size_bucket[sizes[s]]+0
        }

        # Top bot names
        for (i = 1; i <= 10; i++) {
            max_count = 0
            max_bot = ""
            for (b in bot_name_count) {
                if (bot_name_count[b] > max_count) {
                    max_count = bot_name_count[b]
                    max_bot = b
                }
            }
            if (max_bot != "") {
                print "TOP_BOT " i " " max_count " " max_bot
                delete bot_name_count[max_bot]
            }
        }

        # Time-windowed summaries
        windows[1] = "daily"; windows[2] = "weekly"; windows[3] = "monthly"
        for (w = 1; w <= 3; w++) {
            wname = windows[w]
            print "WIN_REQUESTS " wname " " win_requests[wname]
            print "WIN_BYTES " wname " " win_bytes[wname]
            print "WIN_BOTS " wname " " win_bots[wname]
        }
        print "WIN_UNIQUE_IPS daily " count_elems(win_ips_daily)
        print "WIN_UNIQUE_IPS weekly " count_elems(win_ips_weekly)
        print "WIN_UNIQUE_IPS monthly " count_elems(win_ips_monthly)

        # Daily window extended metrics
        print "WIN_STATUS_CLASS daily 2xx " win_class_daily["2xx"]+0
        print "WIN_STATUS_CLASS daily 3xx " win_class_daily["3xx"]+0
        print "WIN_STATUS_CLASS daily 4xx " win_class_daily["4xx"]+0
        print "WIN_STATUS_CLASS daily 5xx " win_class_daily["5xx"]+0
        print "WIN_PAGE_VIEWS daily " win_page_views_daily+0
        print "WIN_ASSET_REQUESTS daily " win_asset_requests_daily+0
        print "WIN_DOWNLOADS daily " win_downloads_daily+0
        print "WIN_DOWNLOADS_BYTES daily " win_downloads_bytes_daily+0
        print "WIN_HUMAN_REQUESTS daily " win_human_daily+0
        print "WIN_UNIQUE_UAS daily " count_elems(win_uas_daily)
    }
    '
}

# ============================================================================
# METRICS GENERATION
# ============================================================================

# Print the "# HELP" / "# TYPE" header pair for a metric, but only the first
# time that metric name is seen. Names already announced are remembered in
# the global _emitted_help as a "|name|"-delimited list, so repeated calls
# (e.g. once per domain in --log-dir mode) do not duplicate the header.
#   $1 - metric name
#   $2 - help text
#   $3 - metric type (gauge, counter, ...)
_emitted_help=""
emit_help_type() {
    local name="$1" description="$2" kind="$3"
    local marker="|${name}|"

    # Already announced: nothing to print.
    if [[ "$_emitted_help" == *"$marker"* ]]; then
        return
    fi

    _emitted_help+="$marker"
    printf '# HELP %s %s\n# TYPE %s %s\n' "$name" "$description" "$name" "$kind"
}

# Print one gauge sample, preceded (once per metric) by its HELP/TYPE header.
#   $1 - metric name
#   $2 - help text
#   $3 - label block: "" or "{domain=\"x\"}"
#   $4 - sample value
_wt_emit_scalar() {
    emit_help_type "$1" "$2" gauge
    echo "${1}${3} ${4}"
    echo ""
}

# Print one sample per "<TAG> <label-value> <sample>" record of a parsed file.
# Nothing (not even the header) is printed when no record matches.
#   $1 - parsed file
#   $2 - grep pattern selecting the records
#   $3 - metric name
#   $4 - help text
#   $5 - label name
#   $6 - label prefix: "{" or "{domain=\"x\","
_wt_emit_labeled() {
    local parsed_file="$1" pattern="$2" metric="$3" help_text="$4" label="$5" prefix="$6"
    local records label_value sample

    records=$(grep "$pattern" "$parsed_file")
    [ -n "$records" ] || return 0

    emit_help_type "$metric" "$help_text" gauge
    while read -r _ label_value sample; do
        echo "${metric}${prefix}${label}=\"$(prom_escape "$label_value")\"} $sample"
    done <<< "$records"
    echo ""
}

# Print one sample per "<TAG> <rank> <sample> <label-value...>" record of a
# parsed file. The label value is the remainder of the line, so it may
# contain spaces. Arguments are the same as for _wt_emit_labeled.
_wt_emit_ranked() {
    local parsed_file="$1" pattern="$2" metric="$3" help_text="$4" label="$5" prefix="$6"
    local records rank sample label_value

    records=$(grep "$pattern" "$parsed_file")
    [ -n "$records" ] || return 0

    emit_help_type "$metric" "$help_text" gauge
    while read -r _ rank sample label_value; do
        echo "${metric}${prefix}${label}=\"$(prom_escape "$label_value")\",rank=\"$rank\"} $sample"
    done <<< "$records"
    echo ""
}

# Print the time-window samples for one record tag. The header is always
# printed, even when no record carries the tag. A record with a missing
# value is reported as 0 so the sample line stays syntactically valid.
#   $1 - all WIN_* records (newline separated)
#   $2 - record tag to select, e.g. WIN_REQUESTS
#   $3 - metric name
#   $4 - help text
#   $5 - label prefix: "{" or "{domain=\"x\","
#   $6 - optional: name of an extra label carried between window and value
_wt_emit_window() {
    local records="$1" tag="$2" metric="$3" help_text="$4" prefix="$5" extra_label="${6:-}"
    local key window extra sample

    emit_help_type "$metric" "$help_text" gauge
    if [ -n "$extra_label" ]; then
        while read -r key window extra sample _; do
            [ "$key" = "$tag" ] || continue
            echo "${metric}${prefix}window=\"$window\",${extra_label}=\"$extra\"} ${sample:-0}"
        done <<< "$records"
    else
        while read -r key window sample _; do
            [ "$key" = "$tag" ] || continue
            echo "${metric}${prefix}window=\"$window\"} ${sample:-0}"
        done <<< "$records"
    fi
    echo ""
}

# Write the complete metrics document to stdout.
#
# Reads the globals HTTP_MODE, HTTP_TAIL_LINES_DEFAULT, TAIL_LINES (may be
# overwritten in HTTP mode), LOG_DIR, ACCESS_LOG and LOG_FORMAT, and resets
# _emitted_help. Each selected log is run through parse_access_log; the
# resulting "KEY value..." records are turned into samples, and samples of
# all logs are finally regrouped under a single HELP/TYPE header per metric.
#
# Returns 1 if the buffer temp file cannot be created, otherwise 0.
generate_metrics() {
    _emitted_help=""
    local script_start
    script_start=$(date +%s)

    # ========================================================================
    # Exporter Status
    # ========================================================================
    cat <<EOF
# HELP web_traffic_up Exporter status (1=up)
# TYPE web_traffic_up gauge
web_traffic_up 1

# HELP web_traffic_exporter_info Exporter version information
# TYPE web_traffic_exporter_info gauge
web_traffic_exporter_info{version="1.8"} 1

EOF

    # ========================================================================
    # Server Detection
    # ========================================================================
    local detected_server
    detected_server=$(detect_server_type)
    detected_server=$(prom_escape "$detected_server")

    local nginx_running=0 apache_running=0

    if pgrep -x nginx >/dev/null 2>&1; then
        nginx_running=1
    fi
    if pgrep -x apache2 >/dev/null 2>&1 || pgrep -x httpd >/dev/null 2>&1; then
        apache_running=1
    fi

    cat <<EOF
# HELP web_traffic_server_running Whether the web server process is running (1=running)
# TYPE web_traffic_server_running gauge
web_traffic_server_running{server="nginx"} $nginx_running
web_traffic_server_running{server="apache"} $apache_running

# HELP web_traffic_server_type Detected web server type
# TYPE web_traffic_server_type gauge
web_traffic_server_type{server="$detected_server"} 1

EOF

    # ========================================================================
    # Access Log Parsing
    # ========================================================================

    # Safety: in HTTP mode, default to tailing some lines unless user chose otherwise.
    # Prevents extremely expensive scrapes on large/rotated logs.
    if [ "$HTTP_MODE" = true ]; then
        if [ "${TAIL_LINES:-0}" -eq 0 ] 2>/dev/null; then
            TAIL_LINES="$HTTP_TAIL_LINES_DEFAULT"
        fi
    fi

    # Build two parallel arrays: log file paths and their domain labels
    # (empty domain = single-log mode, no domain label on the samples).
    local log_files=()
    local log_domains=()
    local log_file domain_name

    if [ -n "$LOG_DIR" ] && [ -d "$LOG_DIR" ]; then
        for log_file in "$LOG_DIR"/*.log; do
            [ -f "$log_file" ] || continue
            # skip error logs, byte logs, and other non-access logs
            case "$log_file" in
                *.error.log|*.bytes.log|*.ssl.log) continue ;;
            esac
            log_files+=("$log_file")
            domain_name=$(basename "$log_file" .log)
            log_domains+=("$domain_name")
        done
    else
        if [ -f "$ACCESS_LOG" ]; then
            log_files+=("$ACCESS_LOG")
            log_domains+=("")
        fi
    fi

    if [ ${#log_files[@]} -eq 0 ]; then
        cat <<EOF
# HELP web_traffic_requests Total requests in parsed window
# TYPE web_traffic_requests gauge
web_traffic_requests 0

EOF
    else
        local metrics_buf
        if ! metrics_buf=$(mktemp); then
            echo "ERROR: Cannot create temporary file for metrics" >&2
            return 1
        fi

        # Scalar records are only accepted when they look like a number; this
        # replaces the former eval of parsed text, which executed whatever a
        # crafted log line managed to place in a scalar record.
        local num_re='^-?[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'

        local file_idx
        for file_idx in "${!log_files[@]}"; do
            local current_log="${log_files[$file_idx]}"
            local current_domain="${log_domains[$file_idx]}"

            # Label wrappers: lwrap is a complete label block for samples
            # without further labels, lwrap_comma is an open block that the
            # emitters complete with their own labels.
            local lwrap="" lwrap_comma="{"
            if [ -n "$current_domain" ]; then
                local current_domain_esc
                current_domain_esc=$(prom_escape "$current_domain")
                lwrap="{domain=\"$current_domain_esc\"}"
                lwrap_comma="{domain=\"$current_domain_esc\","
            fi

            # Parse once into a temp file; every section below re-reads it.
            local parsed_file
            parsed_file=$(mktemp) || continue
            parse_access_log "$current_log" "$TAIL_LINES" "$LOG_FORMAT" > "$parsed_file"

            if [ ! -s "$parsed_file" ]; then
                rm -f "$parsed_file"
                continue
            fi

            # Reset every scalar explicitly: `local` alone keeps the value of
            # the previous iteration, which would leak one domain's numbers
            # into the next whenever a record is missing.
            local total_requests="" total_bytes="" bot_requests="" human_requests=""
            local unique_ips="" unique_uas="" lines_parsed=""
            local dl_total="" dl_bytes="" page_views="" asset_reqs=""
            local min_epoch="" max_epoch=""
            local s2xx="" s3xx="" s4xx="" s5xx=""

            # Single pass over the parsed records. FIRST_TS/LAST_TS records
            # are intentionally not imported: nothing below uses them.
            local key val third
            while read -r key val third _ || [ -n "$key" ]; do
                if [ "$key" = "CLASS" ]; then
                    [[ "$third" =~ $num_re ]] || continue
                    case "$val" in
                        2xx) s2xx=$third ;;
                        3xx) s3xx=$third ;;
                        4xx) s4xx=$third ;;
                        5xx) s5xx=$third ;;
                    esac
                    continue
                fi
                [[ "$val" =~ $num_re ]] || continue
                case "$key" in
                    LINES_PARSED)   lines_parsed=$val ;;
                    TOTAL_REQUESTS) total_requests=$val ;;
                    TOTAL_BYTES)    total_bytes=$val ;;
                    BOT_REQUESTS)   bot_requests=$val ;;
                    HUMAN_REQUESTS) human_requests=$val ;;
                    UNIQUE_IPS)     unique_ips=$val ;;
                    UNIQUE_UAS)     unique_uas=$val ;;
                    DOWNLOAD_TOTAL) dl_total=$val ;;
                    DOWNLOAD_BYTES) dl_bytes=$val ;;
                    PAGE_VIEWS)     page_views=$val ;;
                    ASSET_REQUESTS) asset_reqs=$val ;;
                    MIN_EPOCH)      min_epoch=$val ;;
                    MAX_EPOCH)      max_epoch=$val ;;
                esac
            done < "$parsed_file"

            # ================================================================
            # Request Totals
            # ================================================================
            _wt_emit_scalar web_traffic_requests "Total requests in parsed window" "$lwrap" "${total_requests:-0}"
            _wt_emit_scalar web_traffic_response_bytes "Total response bytes in parsed window" "$lwrap" "${total_bytes:-0}"

            # ================================================================
            # Status Codes
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^STATUS " web_traffic_requests_by_status \
                "Requests per HTTP status code" status "$lwrap_comma"

            # ================================================================
            # Status Classes
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^CLASS [0-9]" web_traffic_requests_by_class \
                "Requests per status class" class "$lwrap_comma"

            # Per-class totals are only published when class records exist.
            if grep -q "^CLASS [0-9]" "$parsed_file"; then
                _wt_emit_scalar web_traffic_status_2xx "Total 2xx responses" "$lwrap" "${s2xx:-0}"
                _wt_emit_scalar web_traffic_status_3xx "Total 3xx responses" "$lwrap" "${s3xx:-0}"
                _wt_emit_scalar web_traffic_status_4xx "Total 4xx responses" "$lwrap" "${s4xx:-0}"
                _wt_emit_scalar web_traffic_status_5xx "Total 5xx responses" "$lwrap" "${s5xx:-0}"
            fi

            # ================================================================
            # Class Bytes
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^CLASS_BYTES " web_traffic_response_bytes_by_class \
                "Response bytes per status class" class "$lwrap_comma"

            # ================================================================
            # Methods
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^METHOD " web_traffic_requests_by_method \
                "Requests per HTTP method" method "$lwrap_comma"

            # ================================================================
            # Unique Visitors
            # ================================================================
            _wt_emit_scalar web_traffic_unique_ips "Unique source IPs in parsed window" "$lwrap" "${unique_ips:-0}"
            _wt_emit_scalar web_traffic_unique_user_agents "Unique user agents in parsed window" "$lwrap" "${unique_uas:-0}"

            # ================================================================
            # Bot Detection (only meaningful with combined format)
            # ================================================================
            if [ "$LOG_FORMAT" = "combined" ]; then
                local bot_ratio="0"
                if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
                    bot_ratio=$(awk -v bots="${bot_requests:-0}" -v total="${total_requests:-1}" \
                        'BEGIN {printf "%.4f", bots / total}')
                fi

                _wt_emit_scalar web_traffic_bot_requests "Total bot requests detected" "$lwrap" "${bot_requests:-0}"
                _wt_emit_scalar web_traffic_human_requests "Total non-bot requests" "$lwrap" "${human_requests:-0}"
                _wt_emit_scalar web_traffic_bot_ratio "Ratio of bot requests to total requests" "$lwrap" "$bot_ratio"
            fi

            # ================================================================
            # Derived Metrics
            # ================================================================
            local avg_response_bytes="0"
            if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
                avg_response_bytes=$(awk -v bytes="${total_bytes:-0}" -v total="${total_requests:-1}" \
                    'BEGIN {printf "%.0f", bytes / total}')
            fi
            _wt_emit_scalar web_traffic_average_response_bytes "Average response size in bytes" "$lwrap" "$avg_response_bytes"

            local error_ratio="0"
            if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
                error_ratio=$(awk -v e4="${s4xx:-0}" -v e5="${s5xx:-0}" -v total="${total_requests:-1}" \
                    'BEGIN {printf "%.4f", (e4 + e5) / total}')
            fi
            _wt_emit_scalar web_traffic_error_ratio "Ratio of 4xx+5xx errors to total requests" "$lwrap" "$error_ratio"

            # ================================================================
            # Top Paths (by hits, then by bandwidth)
            # ================================================================
            _wt_emit_ranked "$parsed_file" "^TOP_PATH " web_traffic_top_path_requests \
                "Top requested paths by hit count" path "$lwrap_comma"
            _wt_emit_ranked "$parsed_file" "^TOP_PATH_BYTES " web_traffic_top_path_response_bytes \
                "Top paths by response bytes" path "$lwrap_comma"

            # ================================================================
            # Top Referrers
            # ================================================================
            _wt_emit_ranked "$parsed_file" "^TOP_REF " web_traffic_top_referrer_requests \
                "Top referrers by hit count" referrer "$lwrap_comma"

            # ================================================================
            # Downloads
            # ================================================================
            _wt_emit_scalar web_traffic_downloads "Total file downloads" "$lwrap" "${dl_total:-0}"
            _wt_emit_scalar web_traffic_downloads_bytes "Total bytes from file downloads" "$lwrap" "${dl_bytes:-0}"
            _wt_emit_ranked "$parsed_file" "^TOP_DOWNLOAD " web_traffic_top_download_requests \
                "Top downloaded files by hit count" file "$lwrap_comma"

            # ================================================================
            # Hourly Traffic Patterns
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^HOUR " web_traffic_requests_by_hour \
                "Requests per hour of day" hour "$lwrap_comma"

            # ================================================================
            # 404 Error Paths and Referrers
            # ================================================================
            _wt_emit_ranked "$parsed_file" "^TOP_404 " web_traffic_404_path_requests \
                "Top paths returning 404" path "$lwrap_comma"
            _wt_emit_ranked "$parsed_file" "^TOP_404_REF " web_traffic_top_404_referrer_requests \
                "Top referrers sending traffic to 404 pages" referrer "$lwrap_comma"

            # ================================================================
            # Page Views vs Assets
            # ================================================================
            _wt_emit_scalar web_traffic_page_views "Total page view requests" "$lwrap" "${page_views:-0}"
            _wt_emit_scalar web_traffic_asset_requests "Total asset requests" "$lwrap" "${asset_reqs:-0}"

            # ================================================================
            # Top Client IPs
            # ================================================================
            _wt_emit_ranked "$parsed_file" "^TOP_IP " web_traffic_top_client_requests \
                "Top client IPs by request count" ip "$lwrap_comma"

            # ================================================================
            # Response Size Distribution
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^SIZE_BUCKET " web_traffic_response_size_bucket \
                "Requests per response size range" size "$lwrap_comma"

            # ================================================================
            # Top Bot Names
            # ================================================================
            _wt_emit_ranked "$parsed_file" "^TOP_BOT " web_traffic_top_bot_requests \
                "Top bots by request count" bot "$lwrap_comma"

            # ================================================================
            # Protocol Distribution
            # ================================================================
            _wt_emit_labeled "$parsed_file" "^PROTOCOL " web_traffic_requests_by_protocol \
                "Requests per HTTP protocol version" protocol "$lwrap_comma"

            # ================================================================
            # Request Rate Estimation
            # ================================================================
            local rpm=0
            if [ "${min_epoch:-0}" -gt 0 ] 2>/dev/null && [ "${max_epoch:-0}" -gt 0 ] 2>/dev/null; then
                local duration=$((max_epoch - min_epoch))
                if [ "$duration" -gt 0 ]; then
                    rpm=$(awk -v total="${total_requests:-0}" -v secs="$duration" \
                        'BEGIN {printf "%.2f", (total / secs) * 60}')
                fi
            fi
            _wt_emit_scalar web_traffic_requests_per_minute "Estimated requests per minute from log window" "$lwrap" "$rpm"

            # ================================================================
            # Lines Parsed
            # ================================================================
            _wt_emit_scalar web_traffic_exporter_lines_parsed "Number of log lines parsed" "$lwrap" "${lines_parsed:-0}"

            # ================================================================
            # Time-Windowed Stats (daily/weekly/monthly)
            # ================================================================
            local win_lines
            win_lines=$(grep "^WIN_" "$parsed_file")

            if [ -n "$win_lines" ]; then
                _wt_emit_window "$win_lines" WIN_REQUESTS web_traffic_window_requests \
                    "Total requests in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_BYTES web_traffic_window_bytes \
                    "Total response bytes in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_UNIQUE_IPS web_traffic_window_unique_ips \
                    "Unique source IPs in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_BOTS web_traffic_window_bot_requests \
                    "Bot requests in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_STATUS_CLASS web_traffic_window_requests_by_class \
                    "Requests per status class in time window" "$lwrap_comma" class
                _wt_emit_window "$win_lines" WIN_PAGE_VIEWS web_traffic_window_page_views \
                    "Page view requests in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_ASSET_REQUESTS web_traffic_window_asset_requests \
                    "Asset requests in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_DOWNLOADS web_traffic_window_downloads \
                    "File downloads in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_DOWNLOADS_BYTES web_traffic_window_downloads_bytes \
                    "Download bytes in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_HUMAN_REQUESTS web_traffic_window_human_requests \
                    "Non-bot requests in time window" "$lwrap_comma"
                _wt_emit_window "$win_lines" WIN_UNIQUE_UAS web_traffic_window_unique_user_agents \
                    "Unique user agents in time window" "$lwrap_comma"
            fi

            rm -f "$parsed_file"
        done > "$metrics_buf"

        # Group all samples under their HELP/TYPE headers so multi-domain
        # output is valid Prometheus exposition format
        awk '
        /^# HELP / { metric=$3; if (!(metric in help)) order[n++]=metric; help[metric]=$0; next }
        /^# TYPE / { type[$3]=$0; next }
        /^$/       { next }
        /^[a-zA-Z_]/ { match($0,/^[a-zA-Z_:][a-zA-Z0-9_:]*/); m=substr($0,RSTART,RLENGTH); samples[m]=samples[m] $0 "\n"; next }
        END { for(i=0;i<n;i++){m=order[i]; print help[m]; print type[m]; printf "%s",samples[m]; print ""} }
        ' "$metrics_buf"

        rm -f "$metrics_buf"
    fi

    # ========================================================================
    # Exporter Runtime
    # ========================================================================
    local script_end script_duration
    script_end=$(date +%s)
    script_duration=$((script_end - script_start))

    cat <<EOF
# HELP web_traffic_exporter_duration_seconds Time to generate all metrics
# TYPE web_traffic_exporter_duration_seconds gauge
web_traffic_exporter_duration_seconds $script_duration

# HELP web_traffic_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE web_traffic_exporter_last_run_timestamp gauge
web_traffic_exporter_last_run_timestamp $script_end
EOF

    echo ""
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

# Write a complete "200 OK" HTTP/1.1 response to stdout.
# Content-Length is measured in bytes (wc -c), not characters, so bodies
# containing multi-byte text are framed correctly.
#   $1 - Content-Type header value
#   $2 - response body
_wt_http_respond() {
    local content_type="$1" body="$2" length
    length=$(printf '%s' "$body" | wc -c)
    length=$((length))   # strip padding some wc implementations add
    printf 'HTTP/1.1 200 OK\r\nContent-Type: %s\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' \
        "$content_type" "$length" "$body"
}

# Serve metrics over HTTP on $HTTP_PORT using netcat, one connection at a time.
#
# "GET /metrics..." returns the output of generate_metrics; any other request
# returns a small HTML index page. Exits with status 1 if nc is missing or
# the reply FIFO cannot be created; on INT/TERM it logs a message and exits 0.
#
# netcat's stdout (the client's request) is piped into the handler, and the
# handler's stdout is fed back to netcat's stdin through a FIFO. A plain
# "handler | nc" pipe is one-directional: the handler could never see the
# request and would have to build its response before any client connected.
run_http_server() {
    echo "Starting web traffic exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # Pick the listen syntax once: GNU/traditional netcat needs -p and uses
    # -q to quit shortly after stdin reaches EOF.
    local -a listen_cmd
    if nc -h 2>&1 | grep -q 'GNU\|traditional'; then
        listen_cmd=(nc -l -p "$HTTP_PORT" -q 1)
    else
        listen_cmd=(nc -l "$HTTP_PORT")
    fi

    local fifo_dir reply_fifo cleanup_cmd
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Cannot create temporary directory for HTTP mode" >&2
        exit 1
    fi
    reply_fifo="$fifo_dir/reply"
    if ! mkfifo "$reply_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Cannot create reply FIFO for HTTP mode" >&2
        exit 1
    fi

    # The path is expanded now, so the trap does not depend on function
    # locals still being in scope when the shell exits.
    printf -v cleanup_cmd 'rm -rf -- %q' "$fifo_dir"
    trap "$cleanup_cmd" EXIT
    trap 'echo "Shutting down web traffic exporter..." >&2; exit 0' INT TERM

    while true; do
        "${listen_cmd[@]}" < "$reply_fifo" 2>/dev/null | {
            local request body
            # Only answer when a request line actually arrived.
            if read -r request || [ -n "$request" ]; then
                if [[ "$request" =~ ^GET\ /metrics ]]; then
                    body=$(generate_metrics)
                    # $(...) strips the final newline; the exposition format
                    # requires the last line to end with one.
                    _wt_http_respond "text/plain; version=0.0.4" "${body}"$'\n'
                else
                    body=$(cat <<'HTMLEOF'
<!DOCTYPE html>
<html>
<head><title>Web Traffic Exporter v1.8</title></head>
<body>
<h1>Web Traffic Exporter v1.8</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Request totals by status, class, and method</li>
<li>Bandwidth totals and by status class</li>
<li>Unique visitors (IPs and user agents)</li>
<li>Top 10 requested paths</li>
<li>Top 10 external referrers</li>
<li>Bot vs human traffic detection</li>
<li>HTTP protocol version distribution</li>
<li>Request rate estimation</li>
<li>Web server process detection</li>
</ul>
</body>
</html>
HTMLEOF
)
                    _wt_http_respond "text/html" "$body"
                fi
            fi
        } > "$reply_fifo"

        # If the listener itself failed (e.g. port already in use), pause
        # instead of spinning in a tight loop.
        [ "${PIPESTATUS[0]}" -eq 0 ] || sleep 1
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: parse the command line and run in one of three modes.
#   - HTTP_MODE=true : serve metrics over HTTP (run_http_server).
#   - OUTPUT_FILE set: write metrics to that file. The data is written to a
#     temp file in the same directory, sanity-checked, and then renamed over
#     the target, so a reader never sees a partially written file.
#   - otherwise      : print metrics to stdout.
# Arguments: the script's command-line arguments, passed to parse_args.
# Exits with status 1 when the output file cannot be produced; in that case
# the previous output file is left untouched and the temp file is removed.
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        local output_dir
        output_dir="$(dirname "$OUTPUT_FILE")"
        if ! mkdir -p "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        local temp_file
        if ! temp_file=$(mktemp "${output_dir}/.web_traffic_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temporary file in $output_dir" >&2
            exit 1
        fi

        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        local file_lines
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

        # A near-empty result means generation went wrong; keep the old file.
        if [ "$file_lines" -lt 5 ]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        chmod 644 "$temp_file"

        # Only report success once the rename really happened; otherwise
        # clean up so failed runs do not pile up hidden temp files.
        if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
            rm -f "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        generate_metrics
    fi
}

# Run the exporter, forwarding all command-line arguments unchanged.
main "$@"