#!/bin/bash
################################################################################
# Script Name: web-traffic-exporter.sh
# Version:     1.8
# Description: Prometheus exporter for web server access log traffic metrics.
#              Parses Nginx/Apache access logs and exports request counts,
#              status codes, bandwidth, unique visitors, top paths, referrers,
#              bot detection, and protocol distribution.
#
# Author:      Phil Connor
# Contact:     contact@mylinux.work
# Website:     https://mylinux.work
# License:     MIT
#
# Prerequisites:
#   - Standard Unix tools (awk, grep, tail)
#   - netcat (nc) for HTTP mode
#   - Read access to web server access logs
#
# Usage:
#   # Output to stdout
#   ./web-traffic-exporter.sh
#
#   # HTTP server mode
#   ./web-traffic-exporter.sh --http -p 9199
#
#   # Textfile collector mode
#   ./web-traffic-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - web_traffic_up                                   - Exporter status (1=up, 0=down)
#     - web_traffic_exporter_info{version}               - Exporter version
#
#   Request Totals:
#     - web_traffic_requests                             - Total requests in parsed window
#     - web_traffic_requests_by_status{status}           - Per HTTP status code
#     - web_traffic_requests_by_class{class}             - Per status class (2xx etc)
#     - web_traffic_requests_by_method{method}           - Per HTTP method
#
#   Bandwidth:
#     - web_traffic_response_bytes                       - Total bytes sent
#     - web_traffic_response_bytes_by_class{class}       - Bytes per class
#
#   Unique Visitors:
#     - web_traffic_unique_ips                           - Unique source IPs
#     - web_traffic_unique_user_agents                   - Unique user agents
#
#   Top Paths (top 10):
#     - web_traffic_top_path_requests{path,rank}         - Hits per path
#
#   Top Referrers (top 10):
#     - web_traffic_top_referrer_requests{referrer,rank} - Per referrer
#
#   Bot Detection:
#     - web_traffic_bot_requests                         - Bot requests
#     - web_traffic_human_requests                       - Non-bot requests
#     - web_traffic_bot_ratio                            - Bot / total ratio
#
#   Downloads:
#     - web_traffic_downloads                            - Total file downloads
#     - web_traffic_downloads_bytes                      - Bytes from downloads
#     - web_traffic_top_download_requests{file,rank}     - Top downloads
#
#   Status Breakdown:
#     - web_traffic_status_2xx
#     - web_traffic_status_3xx
#     - web_traffic_status_4xx
#     - web_traffic_status_5xx
#
#   Request Rate:
#     - web_traffic_requests_per_minute                  - Estimated from log timestamps
#
#   Protocol:
#     - web_traffic_requests_by_protocol{protocol}       - HTTP version
#
#   Hourly Patterns:
#     - web_traffic_requests_by_hour{hour}               - Requests per hour of day
#
#   Derived Metrics:
#     - web_traffic_average_response_bytes               - Average response size
#     - web_traffic_error_ratio                          - Ratio of 4xx+5xx to total requests
#
#   404 Errors:
#     - web_traffic_404_path_requests{path,rank}             - Top 404 paths
#     - web_traffic_top_404_referrer_requests{referrer,rank} - Top 404 referrers
#
#   Top Paths by Bandwidth:
#     - web_traffic_top_path_response_bytes{path,rank}   - Top paths by bytes
#
#   Content Type:
#     - web_traffic_page_views                           - Page view requests
#     - web_traffic_asset_requests                       - Asset requests (css/js/images)
#
#   Top Clients:
#     - web_traffic_top_client_requests{ip,rank}         - Top 10 client IPs
#
#   Response Sizes:
#     - web_traffic_response_size_bucket{size}           - Requests by size range
#
#   Top Bots:
#     - web_traffic_top_bot_requests{bot,rank}           - Top 10 bot names
#
#   Server Status:
#     - web_traffic_server_running{server}               - 1 if process found
#     - web_traffic_server_type{server}                  - Server info metric
#
#   Time Windows (daily/weekly/monthly):
#     - web_traffic_window_requests{window}                - Requests in window
#     - web_traffic_window_bytes{window}                   - Bytes in window
#     - web_traffic_window_unique_ips{window}              - Unique IPs in window
#     - web_traffic_window_bot_requests{window}            - Bot requests in window
#     - web_traffic_window_requests_by_class{window,class} - Per class in window
#     - web_traffic_window_page_views{window}              - Page views in window
#     - web_traffic_window_asset_requests{window}          - Asset requests in window
#     - web_traffic_window_downloads{window}               - Downloads in window
#     - web_traffic_window_downloads_bytes{window}         - Download bytes in window
#     - web_traffic_window_human_requests{window}          - Non-bot requests in window
#     - web_traffic_window_unique_user_agents{window}      - Unique UAs in window
#
#   Exporter:
#     - web_traffic_exporter_duration_seconds            - Script execution time
#     - web_traffic_exporter_last_run_timestamp          - Last run timestamp
#     - web_traffic_exporter_lines_parsed                - Lines parsed count
#
# Configuration:
#   Default HTTP port:  9199
#   Textfile directory: /var/lib/node_exporter
#   ACCESS_LOG:      /var/log/nginx/access.log (or WEB_TRAFFIC_ACCESS_LOG env)
#   LOG_DIR:         unset (or WEB_TRAFFIC_LOG_DIR env; directory of per-domain logs)
#   LOG_FORMAT:      combined (or WEB_TRAFFIC_LOG_FORMAT env; combined|common)
#   TAIL_LINES:      0 (all lines; or WEB_TRAFFIC_TAIL_LINES env)
#   SERVER_TYPE:     auto (or WEB_TRAFFIC_SERVER_TYPE env; auto|nginx|apache)
#   MAX_ROTATED:     7 (or WEB_TRAFFIC_MAX_ROTATED env)
#   HTTP_TAIL_LINES: 5000 (or WEB_TRAFFIC_HTTP_TAIL_LINES env)
#   TRACK_UNIQUE_UA: 1 (or WEB_TRAFFIC_TRACK_UNIQUE_UA env; 0=disable)
#   REFERRER_MODE:   host (or WEB_TRAFFIC_REFERRER_MODE env; host|full|off)
#   DOWNLOAD_PATH:   /downloads/ (or WEB_TRAFFIC_DOWNLOAD_PATH env)
#   SITE_DOMAIN:     unset (or WEB_TRAFFIC_SITE_DOMAIN env; e.g. mylinux.work)
#     When set, downloads only count if the referrer matches this domain.
#     When unset, downloads require any non-empty referrer (filters direct bot hits).
#
################################################################################
# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

# Output destinations. OUTPUT_FILE stays empty unless --textfile or -o is
# given; HTTP_MODE/HTTP_PORT are changed by --http and -p/--port.
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9199

# Operational safety / performance knobs
#   - Limit rotated files read when present (avoid huge scrape cost)
MAX_ROTATED="${WEB_TRAFFIC_MAX_ROTATED:-7}"
#   - In HTTP mode, default to tailing N lines unless user explicitly set tail-lines
HTTP_TAIL_LINES_DEFAULT="${WEB_TRAFFIC_HTTP_TAIL_LINES:-5000}"
#   - Reduce memory/cardinality cost:
#     1 = track unique user agents, 0 = disable
TRACK_UNIQUE_UA="${WEB_TRAFFIC_TRACK_UNIQUE_UA:-1}"
#   - Referrer mode: host | full | off
REFERRER_MODE="${WEB_TRAFFIC_REFERRER_MODE:-host}"

# Log source and parsing options. Each one reads its WEB_TRAFFIC_* environment
# variable first and falls back to the default shown here.
ACCESS_LOG="${WEB_TRAFFIC_ACCESS_LOG:-/var/log/nginx/access.log}"
LOG_DIR="${WEB_TRAFFIC_LOG_DIR:-}"
LOG_FORMAT="${WEB_TRAFFIC_LOG_FORMAT:-combined}"
TAIL_LINES="${WEB_TRAFFIC_TAIL_LINES:-0}"
SERVER_TYPE="${WEB_TRAFFIC_SERVER_TYPE:-auto}"
SITE_DOMAIN="${WEB_TRAFFIC_SITE_DOMAIN:-}"
DOWNLOAD_PATH="${WEB_TRAFFIC_DOWNLOAD_PATH:-/downloads/}"
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Escape a string so it can be used as a Prometheus label value.
# Backslashes, double quotes and newlines are replaced by \\, \" and \n.
# See: https://prometheus.io/docs/instrumenting/exposition_formats/
# Arguments: $1 - raw label value
# Outputs:   escaped value on stdout, followed by a newline
prom_escape() {
    local value="$1"

    # Backslashes go first so the escapes added below are not doubled.
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}

    printf '%s\n' "$value"
}
# Print the command-line help text to stdout and exit with status 0.
# The text interpolates $0 and the current values of HTTP_PORT, ACCESS_LOG
# and TAIL_LINES, so it reflects any environment overrides already applied.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Export web server access log traffic metrics as Prometheus metrics (v1.8).

MODES:
    --textfile              Write to node_exporter textfile collector
    --http                  Run HTTP server on port $HTTP_PORT

OPTIONS:
    -p, --port              HTTP port (default: 9199)
    -o, --output            Output file path
    --access-log PATH       Path to access log (default: $ACCESS_LOG)
    --log-dir DIR           Directory of per-domain logs (e.g., /var/log/nginx/domains)
    --log-format FMT        Log format: combined or common (default: combined)
    --tail-lines NUM        Number of log lines to parse (default: $TAIL_LINES, 0=all)
    --server-type TYPE      Server type: auto, nginx, apache (default: auto)
    --site-domain DOM       Your domain (e.g. mylinux.work) — downloads only count
                            when referred from this domain (filters bot/scanner hits)
    --download-path P       URL path prefix for downloads (default: /downloads/)
                            All requests under this path count as downloads

EXAMPLES:
    $0 --textfile                                  # Write to textfile collector
    $0 --http --port 9199                          # Run HTTP server
    $0 --access-log /var/log/apache2/access.log    # Use Apache log
    $0 --log-dir /var/log/nginx/domains            # Parse all domain logs (HestiaCP)
    $0 -o /tmp/web_traffic.prom                    # Write to custom file

ENVIRONMENT VARIABLES:
    WEB_TRAFFIC_ACCESS_LOG        Path to access log
    WEB_TRAFFIC_LOG_DIR           Directory of per-domain logs
    WEB_TRAFFIC_LOG_FORMAT        Log format: combined or common
    WEB_TRAFFIC_TAIL_LINES        Number of log lines to parse (0=all)
    WEB_TRAFFIC_SERVER_TYPE       Server type: auto, nginx, apache
    WEB_TRAFFIC_MAX_ROTATED       Max rotated log files to read (default: 7)
    WEB_TRAFFIC_HTTP_TAIL_LINES   Default tail lines in HTTP mode (default: 5000)
    WEB_TRAFFIC_TRACK_UNIQUE_UA   Track unique user agents: 1 or 0 (default: 1)
    WEB_TRAFFIC_REFERRER_MODE     Referrer tracking: host, full, or off (default: host)
    WEB_TRAFFIC_SITE_DOMAIN       Your domain — downloads only count from this referrer
    WEB_TRAFFIC_DOWNLOAD_PATH     URL path prefix for downloads (default: /downloads/)

SECTIONS (auto-detected, skipped if unavailable):
    - Request totals by status, class, and method
    - Bandwidth totals and by status class
    - Unique visitors (IPs and user agents)
    - Top 10 requested paths
    - Top 10 external referrers
    - Bot vs human traffic detection
    - HTTP protocol version distribution
    - Request rate estimation
    - Web server process detection

EOF
    exit 0
}
# Parse command-line options into the global configuration variables.
# Globals written: OUTPUT_FILE, HTTP_MODE, HTTP_PORT, ACCESS_LOG, LOG_DIR,
#                  LOG_FORMAT, TAIL_LINES, SERVER_TYPE, SITE_DOMAIN,
#                  DOWNLOAD_PATH
# Globals read:    TEXTFILE_DIR
# Arguments:       the script's positional parameters ("$@")
# Exits 1 on an unknown option or when an option that needs a value is the
# last argument; -h/--help calls show_usage, which exits 0.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/web_traffic.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port|-o|--output|--access-log|--log-dir|--log-format|--tail-lines|--server-type|--site-domain|--download-path)
                # "shift 2" fails without shifting when only one argument is
                # left, which would spin this loop forever; reject it instead.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires an argument" >&2
                    exit 1
                fi
                case "$1" in
                    -p|--port)       HTTP_PORT="$2" ;;
                    -o|--output)     OUTPUT_FILE="$2" ;;
                    --access-log)    ACCESS_LOG="$2" ;;
                    --log-dir)       LOG_DIR="$2" ;;
                    --log-format)    LOG_FORMAT="$2" ;;
                    --tail-lines)    TAIL_LINES="$2" ;;
                    --server-type)   SERVER_TYPE="$2" ;;
                    --site-domain)   SITE_DOMAIN="$2" ;;
                    --download-path) DOWNLOAD_PATH="$2" ;;
                esac
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}
# ============================================================================
# SERVER DETECTION
# ============================================================================

# Print the web server type on stdout.
# When SERVER_TYPE is anything other than "auto" it is printed unchanged.
# Otherwise the process table is probed with pgrep, in this order: nginx,
# apache2, httpd. The first hit wins (apache2 and httpd both report
# "apache"); "unknown" is printed when none is found.
# Globals read: SERVER_TYPE
detect_server_type() {
    local proc

    if [ "$SERVER_TYPE" != "auto" ]; then
        # printf rather than echo: echo would swallow values such as "-n".
        printf '%s\n' "$SERVER_TYPE"
        return
    fi

    for proc in nginx apache2 httpd; do
        if pgrep -x "$proc" >/dev/null 2>&1; then
            if [ "$proc" = "nginx" ]; then
                printf '%s\n' "nginx"
            else
                printf '%s\n' "apache"
            fi
            return
        fi
    done

    printf '%s\n' "unknown"
}
# ============================================================================
# LOG STREAMING (current + rotated logs)
# ============================================================================

# Stream log content from rotated logs (oldest first) then the current log.
# Handles <log>.N (plain) and <log>.N.gz (compressed) files; when both exist
# for the same N the compressed one is used. Unreadable files are skipped
# silently.
# Arguments: $1 - current log file path
#            $2 - highest rotation number to include (default: 31); a value
#                 that is not a plain non-negative integer disables rotated
#                 files and only the current log is streamed
# Outputs:   concatenated log lines on stdout
stream_log_data() {
    local log_file="$1"
    local max_rotated="${2:-31}"
    local limit=0
    local i

    if [[ "$max_rotated" =~ ^[0-9]+$ ]]; then
        # Force base 10 so values such as "08" are not read as octal.
        limit=$((10#$max_rotated))
    fi

    # Highest rotation number is the oldest file, so count downwards.
    for ((i = limit; i >= 1; i--)); do
        if [[ -f "${log_file}.${i}.gz" ]]; then
            gzip -dc -- "${log_file}.${i}.gz" 2>/dev/null
        elif [[ -f "${log_file}.${i}" ]]; then
            cat -- "${log_file}.${i}" 2>/dev/null
        fi
    done

    # Current log goes last so lines stay in chronological order.
    cat -- "$log_file" 2>/dev/null
}
# ============================================================================
# LOG PARSING (single-pass awk)
# ============================================================================

# Parse access log lines and print every aggregate as a structured record.
# A single awk pass is used for performance.
#
# Arguments: $1 - access log path (must be a readable regular file)
#            $2 - number of trailing lines to parse (0 or non-numeric = all)
#            $3 - log format; referrer/user-agent/bot handling only runs for
#                 "combined"
# Globals read: MAX_ROTATED, TRACK_UNIQUE_UA, REFERRER_MODE, SITE_DOMAIN,
#               DOWNLOAD_PATH
# Outputs:   "KEY value..." records on stdout, one per line (LINES_PARSED,
#            TOTAL_REQUESTS, STATUS, CLASS, TOP_PATH, HOUR, WIN_* and so on).
#            Top-N records have the form "TAG rank value key".
# Returns:   non-zero without output when the log is missing or unreadable.
#
# NOTE(review): when rotated logs exist the whole history is streamed and $2
# is ignored, so --tail-lines (and the HTTP-mode default) has no effect in
# that case. Left unchanged because the monthly window depends on it; confirm
# this is intended.
parse_access_log() {
    local log_file="$1"
    local num_lines="$2"
    local format="$3"

    [ -f "$log_file" ] || return
    [ -r "$log_file" ] || return

    # Cut-off timestamps for the 1-day, 7-day and 30-day windows.
    local now_epoch
    now_epoch=$(date +%s)
    local cutoff_daily=$((now_epoch - 86400))
    local cutoff_weekly=$((now_epoch - 604800))
    local cutoff_monthly=$((now_epoch - 2592000))

    # Check if rotated logs exist for this file
    local has_rotated=false
    if [ -f "${log_file}.1" ] || [ -f "${log_file}.1.gz" ]; then
        has_rotated=true
    fi

    # Stream log data into awk: use rotated logs for full monthly history,
    # or read the entire current log so daily/weekly/monthly windows differ
    if [ "$has_rotated" = true ]; then
        stream_log_data "$log_file" "$MAX_ROTATED"
    elif [ "$num_lines" -gt 0 ] 2>/dev/null; then
        tail -n "$num_lines" "$log_file" 2>/dev/null
    else
        cat "$log_file" 2>/dev/null
    fi | awk -v fmt="$format" \
        -v track_ua="$TRACK_UNIQUE_UA" \
        -v ref_mode="$REFERRER_MODE" \
        -v site_domain="$SITE_DOMAIN" \
        -v download_path="$DOWNLOAD_PATH" \
        -v cutoff_daily="$cutoff_daily" \
        -v cutoff_weekly="$cutoff_weekly" \
        -v cutoff_monthly="$cutoff_monthly" '
    BEGIN {
        total = 0
        total_bytes = 0
        bot_count = 0
        lines_parsed = 0
        first_ts = ""
        last_ts = ""

        # Month cumulative day offsets (non-leap), 1-based month indexing
        split("0,31,59,90,120,151,181,212,243,273,304,334", mdays, ",")

        # Month name to number lookup
        split("Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec", mn, ",")
        for (i = 1; i <= 12; i++) month_num[mn[i]] = i

        # Window counters
        win_requests["daily"] = 0; win_requests["weekly"] = 0; win_requests["monthly"] = 0
        win_bytes["daily"] = 0; win_bytes["weekly"] = 0; win_bytes["monthly"] = 0
        win_bots["daily"] = 0; win_bots["weekly"] = 0; win_bots["monthly"] = 0
    }

    # Number of keys in array a (k and n are locals).
    function count_elems(a,    k, n) { n = 0; for (k in a) n++; return n }

    # Print the "limit" largest entries of counts as "tag rank value key",
    # largest first. Entries are deleted from counts as they are printed.
    # Empty keys are skipped: they carry no label, and selecting one without
    # removing it would block every lower-ranked entry.
    function emit_top(tag, counts, limit,    rank, key, best, best_key, found) {
        for (rank = 1; rank <= limit; rank++) {
            found = 0
            best = 0
            for (key in counts) {
                if (key == "") continue
                if (counts[key] > best) {
                    best = counts[key]
                    best_key = key
                    found = 1
                }
            }
            if (!found) break
            # %.0f keeps large byte totals in plain digits
            printf "%s %d %.0f %s\n", tag, rank, best, best_key
            delete counts[best_key]
        }
    }

    # Convert a log timestamp to a UTC epoch; returns 0 when the month
    # abbreviation is not recognised.
    function parse_log_epoch(ts,    parts, dparts, tparts, day, mon, year, hh, mm, ss, tz, sign, tzh, tzm, epoch) {
        # Format: 17/Mar/2026:10:00:00 +0000
        split(ts, parts, " ")
        split(parts[1], dparts, "/")
        day = dparts[1] + 0
        mon = month_num[dparts[2]]
        # year:HH:MM:SS
        split(dparts[3], tparts, ":")
        year = tparts[1] + 0; hh = tparts[2] + 0; mm = tparts[3] + 0; ss = tparts[4] + 0
        if (mon < 1) return 0

        # Portable epoch calculation (no mktime dependency)
        # Days from year, including leap days of the years before it
        epoch = (year - 1970) * 365 + int((year - 1969) / 4) - int((year - 1901) / 100) + int((year - 1601) / 400)
        # Days from months
        epoch += mdays[mon] + day - 1
        # Leap day adjustment for current year
        if (mon > 2 && (year % 4 == 0 && (year % 100 != 0 || year % 400 == 0))) epoch++
        # Convert to seconds and add time
        epoch = epoch * 86400 + hh * 3600 + mm * 60 + ss

        # Apply timezone offset: a "+HHMM" zone is ahead of UTC, so subtract
        tz = parts[2]
        if (tz != "") {
            sign = (substr(tz, 1, 1) == "-") ? 1 : -1
            tzh = substr(tz, 2, 2) + 0
            tzm = substr(tz, 4, 2) + 0
            epoch += sign * (tzh * 3600 + tzm * 60)
        }
        return epoch
    }

    {
        lines_parsed++

        # Parse combined/common log format using field splitting
        # 1.2.3.4 - - [17/Mar/2026:10:00:00 +0000] "GET /path HTTP/1.1" 200 1234 "ref" "ua"
        ip = $1

        # Extract timestamp between [ and ]
        timestamp = ""
        if (match($0, /\[([^\]]+)\]/)) {
            timestamp = substr($0, RSTART + 1, RLENGTH - 2)
        }

        # Extract the request line between first pair of quotes
        request_line = ""
        p1 = index($0, "\"")
        if (p1 > 0) {
            rest = substr($0, p1 + 1)
            p2 = index(rest, "\"")
            if (p2 > 0) {
                request_line = substr(rest, 1, p2 - 1)
            }
        }

        if (request_line == "") next

        # Split request line: METHOD PATH PROTOCOL
        n_req = split(request_line, req_parts, " ")
        if (n_req < 2) next
        method = req_parts[1]
        # Lines whose method is not a known HTTP verb are dropped
        if (method != "GET" && method != "HEAD" && method != "POST" && method != "PUT" && method != "DELETE" && method != "PATCH" && method != "OPTIONS" && method != "CONNECT" && method != "TRACE") next
        path = req_parts[2]
        protocol = (n_req >= 3) ? req_parts[3] : ""

        # After the closing quote of request line, find status and bytes
        after_req = substr($0, p1 + 1 + p2)
        gsub(/^ +/, "", after_req)
        n_after = split(after_req, after_parts, " ")
        if (n_after < 2) next
        status = after_parts[1]
        bytes = after_parts[2]

        if (status !~ /^[0-9]+$/) next

        if (first_ts == "") first_ts = timestamp
        last_ts = timestamp

        if (bytes == "-") bytes = 0

        total++
        total_bytes += bytes

        # Status codes
        status_count[status]++

        # Status classes
        class_code = substr(status, 1, 1) "xx"
        class_count[class_code]++
        class_bytes[class_code] += bytes

        # Methods
        method_count[method]++

        # Unique IPs
        ips[ip] = 1

        # Paths (clean query strings for grouping)
        split(path, pathparts, "?")
        clean_path = pathparts[1]
        path_count[clean_path]++
        path_bytes[clean_path] += bytes

        # Download tracking: only candidates are flagged here, the final
        # decision is made after bot detection (see below)
        is_download = 0
        is_downloadable = 0
        is_path_download = 0
        if (method == "GET" && substr(status, 1, 1) == "2") {
            if (download_path != "" && index(clean_path, download_path) == 1) {
                is_downloadable = 1
                is_path_download = 1
            } else if (clean_path ~ /\.(sh|ps1|py|pl|rb|json|yml|yaml|xml|csv|conf|cfg|prom|txt)$/ ||
                       clean_path ~ /\.(zip|tar|gz|tgz|bz2|xz|7z|rar)$/ ||
                       clean_path ~ /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods)$/ ||
                       clean_path ~ /\.(deb|rpm|msi|exe|dmg|pkg|appimage|AppImage)$/ ||
                       clean_path ~ /\.(iso|img|bin|run)$/) {
                is_downloadable = 1
            }
        }

        # Protocol
        if (protocol != "") {
            gsub(/^ +| +$/, "", protocol)
            if (protocol != "") proto_count[protocol]++
        }

        is_bot = 0

        # Referrer and User-Agent (combined format only)
        if (fmt == "combined") {
            ref = ""
            ua = ""
            # Split the whole line by double-quote to extract quoted fields
            n = split($0, qparts, "\"")
            # qparts[2] = request line, qparts[4] = referrer, qparts[6] = user-agent
            if (n >= 6) {
                ref = qparts[4]
                ua = qparts[6]
            } else if (n >= 4) {
                ref = qparts[4]
            }

            # Referrer counting: reduce cardinality by default (host-only).
            if (ref_mode != "off" && ref != "" && ref != "-") {
                ref_key = ref
                if (ref_mode == "host") {
                    # Extract host from http(s)://host/... (portable, no gawk capture groups)
                    sub(/^https?:\/\//, "", ref_key)
                    sub(/\/.*/, "", ref_key)
                }
                referrer_count[ref_key]++
                if (status == "404" && ref_key != "") {
                    error_404_referrer[ref_key]++
                }
            }

            # Unique UA tracking can be expensive; allow disabling.
            if (track_ua == 1 && ua != "" && ua != "-") {
                user_agents[ua] = 1
            }

            # Bot detection
            # NOTE: Google uses multiple crawler UA tokens beyond plain "Googlebot"
            # See: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
            # A bare Googlebot token is enough for a yes/no match; the optional
            # suffix group was dropped because it needed an unescaped slash
            # inside a bracket expression, which is not portable across awks.
            if (ua ~ /(Googlebot|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/ ||
                ua ~ /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/ ||
                ua ~ /([Bb]ot|[Cc]rawl|[Ss]pider|[Ss]lurp|bingbot|BingPreview|YandexBot|Baiduspider|DuckDuckBot|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|MJ12bot|AhrefsBot|SemrushBot|DotBot|PetalBot)/) {
                bot_count++
                is_bot = 1
            }
        }

        # AI retrieval bots (ChatGPT-User, PerplexityBot) fetch content on
        # behalf of a real user, so their downloads count as legitimate.
        is_ai_retrieval = 0
        if (is_bot && ua ~ /(ChatGPT-User|PerplexityBot)/) {
            is_ai_retrieval = 1
        }

        # Count downloads only for non-bot requests that look like real
        # users (plus AI retrieval bots). Signals indicating a real
        # download:
        #   1. Path-based: any non-bot hit on --download-path (the path
        #      itself signals intent)
        #   2. AI retrieval bot (ChatGPT-User, PerplexityBot)
        #   3. Download-tool UA (wget, curl, aria2 and similar)
        #   4. Referrer from the site (user clicked a download link)
        # Extension-based downloads outside the download path still require
        # signal 2-4 to avoid counting embedded/linked assets.
        if (is_downloadable && (!is_bot || is_ai_retrieval)) {
            is_real_download = 0
            # Path-based downloads: trust all non-bot requests
            if (is_path_download) {
                is_real_download = 1
            }
            # AI retrieval bots are always real downloads
            if (!is_real_download && is_ai_retrieval) {
                is_real_download = 1
            }
            # Check for download-tool user agents
            if (!is_real_download && (ua ~ /^(Wget|curl|aria2|libcurl|Go-http-client|python-requests|HTTPie)/ ||
                ua ~ /^(ufw-threat-feeds|ufw-blocklist|iptables-threat-feeds|iptables-blocklist)/)) {
                is_real_download = 1
            }
            # Check for site referrer (browser click)
            if (!is_real_download && ref != "" && ref != "-") {
                if (site_domain != "") {
                    if (index(ref, site_domain) > 0) is_real_download = 1
                } else {
                    is_real_download = 1
                }
            }
            if (is_real_download) {
                download_total++
                download_bytes += bytes
                download_count[clean_path]++
                is_download = 1
            }
        }

        # Time-windowed stats + min/max epoch
        epoch = parse_log_epoch(timestamp)

        # Hourly traffic patterns (last 24 hours only)
        if (epoch > 0 && epoch >= cutoff_daily) {
            split(timestamp, ts_parts, ":")
            hour = ts_parts[2] + 0
            hour_count[hour]++
        }
        # Track min/max epoch so the rate stays stable even if log lines
        # arrive out of order
        if (epoch > 0) {
            if (min_epoch == "" || epoch < min_epoch) min_epoch = epoch
            if (max_epoch == "" || epoch > max_epoch) max_epoch = epoch
        }

        # 404 error paths
        if (status == "404") {
            error_404_count[clean_path]++
        }

        # Page views vs assets
        is_page = 0
        is_asset = 0
        if (clean_path ~ /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|ico|webp)$/) {
            asset_requests++
            is_asset = 1
        } else if (clean_path ~ /\/$/ || clean_path ~ /\.html?$/ || clean_path !~ /\.[a-zA-Z0-9]+$/) {
            page_views++
            is_page = 1
        }

        # Top client IPs (already tracking ips[ip]=1, add counter)
        ip_count[ip]++

        # Response size distribution
        if (bytes + 0 <= 1024) {
            size_bucket["tiny"]++
        } else if (bytes + 0 <= 10240) {
            size_bucket["small"]++
        } else if (bytes + 0 <= 102400) {
            size_bucket["medium"]++
        } else if (bytes + 0 <= 1048576) {
            size_bucket["large"]++
        } else {
            size_bucket["huge"]++
        }

        # Bot name extraction (when bot detected)
        if (is_bot && ua != "") {
            bot_name = ""
            # Prefer specific/official bot tokens first (better grouping)
            if (match(ua, /(Googlebot-[A-Za-z0-9._-]+|Googlebot\/[0-9.]+|Googlebot|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(bingbot|BingPreview)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(YandexBot|Baiduspider|DuckDuckBot|Slurp)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(facebookexternalhit|Twitterbot|LinkedInBot|Applebot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            } else if (match(ua, /([Bb]ot[a-zA-Z]*|[Cc]rawler|[Ss]pider)/)) {
                bot_name = substr(ua, RSTART, RLENGTH)
            }
            if (bot_name != "") {
                bot_name_count[bot_name]++
            }
        }

        # Window accounting; only the daily window carries the extended
        # per-class / page / asset / download / UA counters
        if (epoch > 0) {
            if (epoch >= cutoff_daily) {
                win_requests["daily"]++
                win_bytes["daily"] += bytes
                win_ips_daily[ip] = 1
                if (is_bot) win_bots["daily"]++
                if (!is_bot) win_human_daily++
                win_class_daily[class_code]++
                if (is_page) win_page_views_daily++
                if (is_asset) win_asset_requests_daily++
                if (is_download) win_downloads_daily++
                win_downloads_bytes_daily += (is_download ? bytes : 0)
                if (track_ua == 1 && ua != "" && ua != "-") win_uas_daily[ua] = 1
            }
            if (epoch >= cutoff_weekly) {
                win_requests["weekly"]++
                win_bytes["weekly"] += bytes
                win_ips_weekly[ip] = 1
                if (is_bot) win_bots["weekly"]++
            }
            if (epoch >= cutoff_monthly) {
                win_requests["monthly"]++
                win_bytes["monthly"] += bytes
                win_ips_monthly[ip] = 1
                if (is_bot) win_bots["monthly"]++
            }
        }
    }

    END {
        # Byte totals and epochs go through %.0f so they are always written
        # as plain digits, however large they get.
        print "LINES_PARSED " lines_parsed
        print "TOTAL_REQUESTS " total
        printf "TOTAL_BYTES %.0f\n", total_bytes
        print "BOT_REQUESTS " bot_count
        print "HUMAN_REQUESTS " (total - bot_count)
        print "UNIQUE_IPS " count_elems(ips)
        print "UNIQUE_UAS " count_elems(user_agents)
        print "FIRST_TS " first_ts
        print "LAST_TS " last_ts
        printf "MIN_EPOCH %.0f\n", (min_epoch == "" ? 0 : min_epoch)
        printf "MAX_EPOCH %.0f\n", (max_epoch == "" ? 0 : max_epoch)

        # Status codes
        for (s in status_count) {
            print "STATUS " s " " status_count[s]
        }

        # Status classes
        for (c in class_count) {
            print "CLASS " c " " class_count[c]
        }

        # Class bytes
        for (c in class_bytes) {
            printf "CLASS_BYTES %s %.0f\n", c, class_bytes[c]
        }

        # Methods
        for (m in method_count) {
            print "METHOD " m " " method_count[m]
        }

        # Protocols
        for (p in proto_count) {
            print "PROTOCOL " p " " proto_count[p]
        }

        # Top paths by hits, then by bytes, then top referrers
        emit_top("TOP_PATH", path_count, 10)
        emit_top("TOP_PATH_BYTES", path_bytes, 10)
        emit_top("TOP_REF", referrer_count, 10)

        # Downloads
        print "DOWNLOAD_TOTAL " (download_total + 0)
        printf "DOWNLOAD_BYTES %.0f\n", download_bytes + 0
        emit_top("TOP_DOWNLOAD", download_count, 10)

        # Hourly distribution
        for (h = 0; h <= 23; h++) {
            printf "HOUR %02d %d\n", h, hour_count[h] + 0
        }

        # Top 404 paths and referrers
        emit_top("TOP_404", error_404_count, 10)
        emit_top("TOP_404_REF", error_404_referrer, 10)

        # Page views vs assets
        print "PAGE_VIEWS " (page_views + 0)
        print "ASSET_REQUESTS " (asset_requests + 0)

        # Top client IPs
        emit_top("TOP_IP", ip_count, 10)

        # Response size distribution
        sizes[1] = "tiny"; sizes[2] = "small"; sizes[3] = "medium"; sizes[4] = "large"; sizes[5] = "huge"
        for (s = 1; s <= 5; s++) {
            print "SIZE_BUCKET " sizes[s] " " (size_bucket[sizes[s]] + 0)
        }

        # Top bot names
        emit_top("TOP_BOT", bot_name_count, 10)

        # Time-windowed summaries
        windows[1] = "daily"; windows[2] = "weekly"; windows[3] = "monthly"
        for (w = 1; w <= 3; w++) {
            wname = windows[w]
            print "WIN_REQUESTS " wname " " win_requests[wname]
            printf "WIN_BYTES %s %.0f\n", wname, win_bytes[wname]
            print "WIN_BOTS " wname " " win_bots[wname]
        }
        print "WIN_UNIQUE_IPS daily " count_elems(win_ips_daily)
        print "WIN_UNIQUE_IPS weekly " count_elems(win_ips_weekly)
        print "WIN_UNIQUE_IPS monthly " count_elems(win_ips_monthly)

        # Daily window extended metrics
        print "WIN_STATUS_CLASS daily 2xx " (win_class_daily["2xx"] + 0)
        print "WIN_STATUS_CLASS daily 3xx " (win_class_daily["3xx"] + 0)
        print "WIN_STATUS_CLASS daily 4xx " (win_class_daily["4xx"] + 0)
        print "WIN_STATUS_CLASS daily 5xx " (win_class_daily["5xx"] + 0)
        print "WIN_PAGE_VIEWS daily " (win_page_views_daily + 0)
        print "WIN_ASSET_REQUESTS daily " (win_asset_requests_daily + 0)
        print "WIN_DOWNLOADS daily " (win_downloads_daily + 0)
        printf "WIN_DOWNLOADS_BYTES daily %.0f\n", win_downloads_bytes_daily + 0
        print "WIN_HUMAN_REQUESTS daily " (win_human_daily + 0)
        print "WIN_UNIQUE_UAS daily " count_elems(win_uas_daily)
    }
    '
}
# ============================================================================
# METRICS GENERATION
# ============================================================================

# Registry of metric names whose header was already printed, stored as a
# string of "|name|" tokens.
_emitted_help=""

# Emit HELP/TYPE lines only once per metric name (avoids duplicates in
# --log-dir mode).
# Globals:   _emitted_help (read and appended to)
# Arguments: $1 - metric name, $2 - help text, $3 - metric type
# Outputs:   "# HELP" and "# TYPE" lines on stdout the first time a metric
#            name is seen; nothing on later calls for the same name.
emit_help_type() {
    local metric="$1" help_text="$2" mtype="$3"

    case "$_emitted_help" in
        *"|${metric}|"*) return 0 ;;
    esac
    _emitted_help="${_emitted_help}|${metric}|"

    # printf keeps the help text literal regardless of echo settings.
    printf '# HELP %s %s\n' "$metric" "$help_text"
    printf '# TYPE %s %s\n' "$metric" "$mtype"
}
# Print one gauge sample, preceded by its HELP/TYPE header on first use.
# Arguments: $1 metric, $2 help text, $3 label block ("" or "{...}"), $4 value
_wt_emit_scalar() {
  local metric=$1 help=$2 labels=$3 value=$4
  emit_help_type "$metric" "$help" gauge
  printf '%s%s %s\n' "$metric" "$labels" "$value"
}

# Turn "TAG <key> <value>" records into samples labelled LABEL="<key>".
# Emits nothing (not even HELP/TYPE) when no record matches.
# Arguments: $1 parsed file, $2 grep pattern selecting the records,
#            $3 metric, $4 help text, $5 label prefix ("{" or "{domain=...,"),
#            $6 label name
_wt_emit_keyed() {
  local parsed=$1 pattern=$2 metric=$3 help=$4 prefix=$5 label=$6
  local records _ key value
  # grep exits 1 when nothing matches; that is an expected outcome here.
  records=$(grep -- "$pattern" "$parsed") || true
  [ -n "$records" ] || return 0
  emit_help_type "$metric" "$help" gauge
  while read -r _ key value; do
    printf '%s%s%s="%s"} %s\n' \
      "$metric" "$prefix" "$label" "$(prom_escape "$key")" "$value"
  done <<< "$records"
}

# Turn "TAG <rank> <value> <name...>" records into samples labelled
# LABEL="<name>",rank="<rank>". <name> is the remainder of the line and may
# contain spaces. Emits nothing when no record matches.
# Arguments: same as _wt_emit_keyed.
_wt_emit_ranked() {
  local parsed=$1 pattern=$2 metric=$3 help=$4 prefix=$5 label=$6
  local records _ rank value name
  # grep exits 1 when nothing matches; that is an expected outcome here.
  records=$(grep -- "$pattern" "$parsed") || true
  [ -n "$records" ] || return 0
  emit_help_type "$metric" "$help" gauge
  while read -r _ rank value name; do
    printf '%s%s%s="%s",rank="%s"} %s\n' \
      "$metric" "$prefix" "$label" "$(prom_escape "$name")" "$rank" "$value"
  done <<< "$records"
}

# Emit the samples of one time-window metric from the WIN_* records.
# HELP/TYPE is always emitted, even if no record carries the tag.
# Records look like "TAG <window> <value>", except WIN_STATUS_CLASS which is
# "WIN_STATUS_CLASS <window> <class> <value>". An empty value (the parser
# prints untouched counters as an empty string) is reported as 0.
# Arguments: $1 all WIN_* records, $2 tag, $3 metric, $4 help text,
#            $5 label prefix ("{" or "{domain=...,")
_wt_emit_window() {
  local records=$1 tag=$2 metric=$3 help=$4 prefix=$5
  local rec_tag window third fourth
  emit_help_type "$metric" "$help" gauge
  while read -r rec_tag window third fourth; do
    [ "$rec_tag" = "$tag" ] || continue
    if [ "$tag" = "WIN_STATUS_CLASS" ]; then
      printf '%s%swindow="%s",class="%s"} %s\n' \
        "$metric" "$prefix" "$window" "$third" "${fourth:-0}"
    else
      printf '%s%swindow="%s"} %s\n' \
        "$metric" "$prefix" "$window" "${third:-0}"
    fi
  done <<< "$records"
}

# Generate the complete metrics document on stdout.
#
# Sections, in order: exporter status, web server detection, per-log traffic
# metrics (one pass per access log; a domain="..." label is added in
# --log-dir mode), and exporter runtime.
#
# Globals read:    HTTP_MODE, HTTP_TAIL_LINES_DEFAULT, LOG_DIR, ACCESS_LOG,
#                  LOG_FORMAT
# Globals written: _emitted_help (reset), TAIL_LINES (defaulted in HTTP mode)
# Calls:           detect_server_type, prom_escape, parse_access_log,
#                  emit_help_type
# Returns:         0 on success, 1 if a temporary file cannot be created
generate_metrics() {
  _emitted_help=""
  local script_start
  script_start=$(date +%s)

  # ========================================================================
  # Exporter Status
  # ========================================================================
  cat <<EOF
# HELP web_traffic_up Exporter status (1=up)
# TYPE web_traffic_up gauge
web_traffic_up 1

# HELP web_traffic_exporter_info Exporter version information
# TYPE web_traffic_exporter_info gauge
web_traffic_exporter_info{version="1.8"} 1

EOF

  # ========================================================================
  # Server Detection
  # ========================================================================
  local detected_server
  detected_server=$(detect_server_type)
  detected_server=$(prom_escape "$detected_server")

  local nginx_running=0 apache_running=0

  if pgrep -x nginx >/dev/null 2>&1; then
    nginx_running=1
  fi
  if pgrep -x apache2 >/dev/null 2>&1 || pgrep -x httpd >/dev/null 2>&1; then
    apache_running=1
  fi

  cat <<EOF
# HELP web_traffic_server_running Whether the web server process is running (1=running)
# TYPE web_traffic_server_running gauge
web_traffic_server_running{server="nginx"} $nginx_running
web_traffic_server_running{server="apache"} $apache_running

# HELP web_traffic_server_type Detected web server type
# TYPE web_traffic_server_type gauge
web_traffic_server_type{server="$detected_server"} 1

EOF

  # ========================================================================
  # Access Log Parsing
  # ========================================================================

  # Safety: in HTTP mode, default to tailing some lines unless the user chose
  # otherwise. Prevents extremely expensive scrapes on large/rotated logs.
  # (2>/dev/null hides the complaint of [ when TAIL_LINES is not an integer.)
  if [ "$HTTP_MODE" = true ] && [ "${TAIL_LINES:-0}" -eq 0 ] 2>/dev/null; then
    TAIL_LINES="$HTTP_TAIL_LINES_DEFAULT"
  fi

  # Parallel arrays: log_files[i] is parsed and labelled with log_domains[i]
  # (empty domain = single-log mode, no domain label).
  local log_files=()
  local log_domains=()
  local log_file domain_name

  if [ -n "$LOG_DIR" ] && [ -d "$LOG_DIR" ]; then
    for log_file in "$LOG_DIR"/*.log; do
      [ -f "$log_file" ] || continue
      # skip error logs, byte logs, and other non-access logs
      case "$log_file" in
        *.error.log|*.bytes.log|*.ssl.log) continue ;;
      esac
      log_files+=("$log_file")
      domain_name=$(basename "$log_file" .log)
      log_domains+=("$domain_name")
    done
  elif [ -f "$ACCESS_LOG" ]; then
    log_files+=("$ACCESS_LOG")
    log_domains+=("")
  fi

  if [ ${#log_files[@]} -eq 0 ]; then
    cat <<EOF
# HELP web_traffic_requests Total requests in parsed window
# TYPE web_traffic_requests gauge
web_traffic_requests 0

EOF
  else
    # metrics_buf collects the samples of every log; parsed_file holds the
    # parser output of the log currently being processed and is reused.
    local metrics_buf parsed_file
    metrics_buf=$(mktemp) || return 1
    parsed_file=$(mktemp) || { rm -f -- "$metrics_buf"; return 1; }

    local file_idx current_log current_domain current_domain_esc
    local lwrap lwrap_comma
    local key v1 v2
    local total_requests total_bytes bot_requests human_requests
    local unique_ips unique_uas lines_parsed
    local dl_total dl_bytes page_views asset_reqs min_epoch max_epoch
    local s2xx s3xx s4xx s5xx have_classes
    local bot_ratio avg_response_bytes error_ratio rpm duration win_lines

    for file_idx in "${!log_files[@]}"; do
      current_log=${log_files[$file_idx]}
      current_domain=${log_domains[$file_idx]}

      # Label wrappers: lwrap is the complete label block for samples with no
      # extra labels, lwrap_comma is the opening of a block that more labels
      # are appended to.
      lwrap=""
      lwrap_comma="{"
      if [ -n "$current_domain" ]; then
        current_domain_esc=$(prom_escape "$current_domain")
        lwrap="{domain=\"${current_domain_esc}\"}"
        lwrap_comma="{domain=\"${current_domain_esc}\","
      fi

      parse_access_log "$current_log" "$TAIL_LINES" "$LOG_FORMAT" > "$parsed_file"
      [ -s "$parsed_file" ] || continue

      # Re-declaring a local does not clear it, so reset every per-log value
      # explicitly; otherwise a key missing from this log would silently
      # inherit the previous domain's number.
      total_requests="" total_bytes="" bot_requests="" human_requests=""
      unique_ips="" unique_uas="" lines_parsed=""
      dl_total="" dl_bytes="" page_views="" asset_reqs=""
      min_epoch="" max_epoch=""
      s2xx="" s3xx="" s4xx="" s5xx="" have_classes=0

      # Pick up the scalar records with plain assignments. Parser output is
      # derived from log content and must never be passed through eval.
      while read -r key v1 v2 _; do
        case "$key" in
          LINES_PARSED)   lines_parsed=$v1 ;;
          TOTAL_REQUESTS) total_requests=$v1 ;;
          TOTAL_BYTES)    total_bytes=$v1 ;;
          BOT_REQUESTS)   bot_requests=$v1 ;;
          HUMAN_REQUESTS) human_requests=$v1 ;;
          UNIQUE_IPS)     unique_ips=$v1 ;;
          UNIQUE_UAS)     unique_uas=$v1 ;;
          DOWNLOAD_TOTAL) dl_total=$v1 ;;
          DOWNLOAD_BYTES) dl_bytes=$v1 ;;
          PAGE_VIEWS)     page_views=$v1 ;;
          ASSET_REQUESTS) asset_reqs=$v1 ;;
          MIN_EPOCH)      min_epoch=$v1 ;;
          MAX_EPOCH)      max_epoch=$v1 ;;
          CLASS)
            case "$v1" in
              [0-9]*) have_classes=1 ;;
            esac
            case "$v1" in
              2xx) s2xx=$v2 ;;
              3xx) s3xx=$v2 ;;
              4xx) s4xx=$v2 ;;
              5xx) s5xx=$v2 ;;
            esac
            ;;
        esac
      done < "$parsed_file"

      # ================================================================
      # Request Totals
      # ================================================================
      _wt_emit_scalar web_traffic_requests \
        "Total requests in parsed window" "$lwrap" "${total_requests:-0}"
      _wt_emit_scalar web_traffic_response_bytes \
        "Total response bytes in parsed window" "$lwrap" "${total_bytes:-0}"

      # ================================================================
      # Status Codes and Status Classes
      # ================================================================
      _wt_emit_keyed "$parsed_file" '^STATUS ' web_traffic_requests_by_status \
        "Requests per HTTP status code" "$lwrap_comma" status
      _wt_emit_keyed "$parsed_file" '^CLASS [0-9]' web_traffic_requests_by_class \
        "Requests per status class" "$lwrap_comma" class

      # Per-class convenience gauges exist only when class data was parsed.
      if [ "$have_classes" = 1 ]; then
        _wt_emit_scalar web_traffic_status_2xx "Total 2xx responses" "$lwrap" "${s2xx:-0}"
        _wt_emit_scalar web_traffic_status_3xx "Total 3xx responses" "$lwrap" "${s3xx:-0}"
        _wt_emit_scalar web_traffic_status_4xx "Total 4xx responses" "$lwrap" "${s4xx:-0}"
        _wt_emit_scalar web_traffic_status_5xx "Total 5xx responses" "$lwrap" "${s5xx:-0}"
      fi

      # ================================================================
      # Class Bytes and Methods
      # ================================================================
      _wt_emit_keyed "$parsed_file" '^CLASS_BYTES ' web_traffic_response_bytes_by_class \
        "Response bytes per status class" "$lwrap_comma" class
      _wt_emit_keyed "$parsed_file" '^METHOD ' web_traffic_requests_by_method \
        "Requests per HTTP method" "$lwrap_comma" method

      # ================================================================
      # Unique Visitors
      # ================================================================
      _wt_emit_scalar web_traffic_unique_ips \
        "Unique source IPs in parsed window" "$lwrap" "${unique_ips:-0}"
      _wt_emit_scalar web_traffic_unique_user_agents \
        "Unique user agents in parsed window" "$lwrap" "${unique_uas:-0}"

      # ================================================================
      # Bot Detection (only meaningful with combined format)
      # ================================================================
      if [ "$LOG_FORMAT" = "combined" ]; then
        bot_ratio="0"
        if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
          # Values travel via -v so they can never become awk program text.
          bot_ratio=$(awk -v b="${bot_requests:-0}" -v t="${total_requests:-1}" \
            'BEGIN { printf "%.4f", b / t }')
        fi
        _wt_emit_scalar web_traffic_bot_requests \
          "Total bot requests detected" "$lwrap" "${bot_requests:-0}"
        _wt_emit_scalar web_traffic_human_requests \
          "Total non-bot requests" "$lwrap" "${human_requests:-0}"
        _wt_emit_scalar web_traffic_bot_ratio \
          "Ratio of bot requests to total requests" "$lwrap" "$bot_ratio"
      fi

      # ================================================================
      # Derived Metrics
      # ================================================================
      avg_response_bytes="0"
      error_ratio="0"
      if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
        avg_response_bytes=$(awk -v b="${total_bytes:-0}" -v t="${total_requests:-1}" \
          'BEGIN { printf "%.0f", b / t }')
        error_ratio=$(awk -v c="${s4xx:-0}" -v s="${s5xx:-0}" -v t="${total_requests:-1}" \
          'BEGIN { printf "%.4f", (c + s) / t }')
      fi
      _wt_emit_scalar web_traffic_average_response_bytes \
        "Average response size in bytes" "$lwrap" "$avg_response_bytes"
      _wt_emit_scalar web_traffic_error_ratio \
        "Ratio of 4xx+5xx errors to total requests" "$lwrap" "$error_ratio"

      # ================================================================
      # Top Paths, Top Paths by Bandwidth, Top Referrers
      # ================================================================
      _wt_emit_ranked "$parsed_file" '^TOP_PATH ' web_traffic_top_path_requests \
        "Top requested paths by hit count" "$lwrap_comma" path
      _wt_emit_ranked "$parsed_file" '^TOP_PATH_BYTES ' web_traffic_top_path_response_bytes \
        "Top paths by response bytes" "$lwrap_comma" path
      _wt_emit_ranked "$parsed_file" '^TOP_REF ' web_traffic_top_referrer_requests \
        "Top referrers by hit count" "$lwrap_comma" referrer

      # ================================================================
      # Downloads
      # ================================================================
      _wt_emit_scalar web_traffic_downloads \
        "Total file downloads" "$lwrap" "${dl_total:-0}"
      _wt_emit_scalar web_traffic_downloads_bytes \
        "Total bytes from file downloads" "$lwrap" "${dl_bytes:-0}"
      _wt_emit_ranked "$parsed_file" '^TOP_DOWNLOAD ' web_traffic_top_download_requests \
        "Top downloaded files by hit count" "$lwrap_comma" file

      # ================================================================
      # Hourly Traffic Patterns
      # ================================================================
      _wt_emit_keyed "$parsed_file" '^HOUR ' web_traffic_requests_by_hour \
        "Requests per hour of day" "$lwrap_comma" hour

      # ================================================================
      # 404 Error Paths and 404 Referrers
      # ================================================================
      _wt_emit_ranked "$parsed_file" '^TOP_404 ' web_traffic_404_path_requests \
        "Top paths returning 404" "$lwrap_comma" path
      _wt_emit_ranked "$parsed_file" '^TOP_404_REF ' web_traffic_top_404_referrer_requests \
        "Top referrers sending traffic to 404 pages" "$lwrap_comma" referrer

      # ================================================================
      # Page Views vs Assets
      # ================================================================
      _wt_emit_scalar web_traffic_page_views \
        "Total page view requests" "$lwrap" "${page_views:-0}"
      _wt_emit_scalar web_traffic_asset_requests \
        "Total asset requests" "$lwrap" "${asset_reqs:-0}"

      # ================================================================
      # Top Client IPs, Response Sizes, Top Bots, Protocols
      # ================================================================
      _wt_emit_ranked "$parsed_file" '^TOP_IP ' web_traffic_top_client_requests \
        "Top client IPs by request count" "$lwrap_comma" ip
      _wt_emit_keyed "$parsed_file" '^SIZE_BUCKET ' web_traffic_response_size_bucket \
        "Requests per response size range" "$lwrap_comma" size
      _wt_emit_ranked "$parsed_file" '^TOP_BOT ' web_traffic_top_bot_requests \
        "Top bots by request count" "$lwrap_comma" bot
      _wt_emit_keyed "$parsed_file" '^PROTOCOL ' web_traffic_requests_by_protocol \
        "Requests per HTTP protocol version" "$lwrap_comma" protocol

      # ================================================================
      # Request Rate Estimation
      # ================================================================
      rpm=0
      # Both -gt tests fail for non-integers, so the arithmetic below only
      # ever sees plain integers.
      if [ "${min_epoch:-0}" -gt 0 ] 2>/dev/null &&
         [ "${max_epoch:-0}" -gt 0 ] 2>/dev/null; then
        duration=$((max_epoch - min_epoch))
        if [ "$duration" -gt 0 ]; then
          rpm=$(awk -v t="${total_requests:-0}" -v d="$duration" \
            'BEGIN { printf "%.2f", (t / d) * 60 }')
        fi
      fi
      _wt_emit_scalar web_traffic_requests_per_minute \
        "Estimated requests per minute from log window" "$lwrap" "$rpm"

      # ================================================================
      # Lines Parsed
      # ================================================================
      _wt_emit_scalar web_traffic_exporter_lines_parsed \
        "Number of log lines parsed" "$lwrap" "${lines_parsed:-0}"

      # ================================================================
      # Time-Windowed Stats (daily/weekly/monthly)
      # ================================================================
      # grep exits 1 when nothing matches; that is an expected outcome here.
      win_lines=$(grep '^WIN_' "$parsed_file") || true

      if [ -n "$win_lines" ]; then
        _wt_emit_window "$win_lines" WIN_REQUESTS web_traffic_window_requests \
          "Total requests in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_BYTES web_traffic_window_bytes \
          "Total response bytes in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_UNIQUE_IPS web_traffic_window_unique_ips \
          "Unique source IPs in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_BOTS web_traffic_window_bot_requests \
          "Bot requests in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_STATUS_CLASS web_traffic_window_requests_by_class \
          "Requests per status class in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_PAGE_VIEWS web_traffic_window_page_views \
          "Page view requests in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_ASSET_REQUESTS web_traffic_window_asset_requests \
          "Asset requests in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_DOWNLOADS web_traffic_window_downloads \
          "File downloads in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_DOWNLOADS_BYTES web_traffic_window_downloads_bytes \
          "Download bytes in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_HUMAN_REQUESTS web_traffic_window_human_requests \
          "Non-bot requests in time window" "$lwrap_comma"
        _wt_emit_window "$win_lines" WIN_UNIQUE_UAS web_traffic_window_unique_user_agents \
          "Unique user agents in time window" "$lwrap_comma"
      fi
    done > "$metrics_buf"

    # Group all samples under their HELP/TYPE headers so multi-domain
    # output is valid Prometheus exposition format. Metrics are printed in
    # the order their HELP line first appeared; blank lines are dropped and
    # one blank line is written after each metric family.
    awk '
      /^# HELP / { metric=$3; if (!(metric in help)) order[n++]=metric; help[metric]=$0; next }
      /^# TYPE / { type[$3]=$0; next }
      /^$/ { next }
      /^[a-zA-Z_]/ { match($0,/^[a-zA-Z_:][a-zA-Z0-9_:]*/); m=substr($0,RSTART,RLENGTH); samples[m]=samples[m] $0 "\n"; next }
      END { for(i=0;i<n;i++){m=order[i]; print help[m]; print type[m]; printf "%s",samples[m]; print ""} }
    ' "$metrics_buf"

    rm -f -- "$metrics_buf" "$parsed_file"
  fi

  # ========================================================================
  # Exporter Runtime
  # ========================================================================
  local script_end script_duration
  script_end=$(date +%s)
  script_duration=$((script_end - script_start))

  cat <<EOF
# HELP web_traffic_exporter_duration_seconds Time to generate all metrics
# TYPE web_traffic_exporter_duration_seconds gauge
web_traffic_exporter_duration_seconds $script_duration

# HELP web_traffic_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE web_traffic_exporter_last_run_timestamp gauge
web_traffic_exporter_last_run_timestamp $script_end
EOF

  echo ""
}
|
|
|
|
# ============================================================================
|
|
# HTTP SERVER MODE
|
|
# ============================================================================
|
|
|
|
# Write a complete "200 OK" HTTP/1.1 response to stdout.
# Arguments: $1 - Content-Type header value
#            $2 - response body
_wt_http_respond() {
  local content_type=$1 body=$2
  # Content-Length is a byte count; in the C locale ${#body} counts bytes
  # instead of (possibly multi-byte) characters.
  local LC_ALL=C
  printf 'HTTP/1.1 200 OK\r\nContent-Type: %s\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' \
    "$content_type" "${#body}" "$body"
}

# Serve metrics over HTTP with netcat, one connection at a time, until the
# process receives INT or TERM.
#
# "GET /metrics..." returns the output of generate_metrics; every other
# request returns a small HTML index page.
#
# netcat's stdout (the client's request) is piped into the handler and the
# handler's stdout (the response) is fed back to netcat's stdin through a
# FIFO. A plain "handler | nc" pipeline cannot work: the handler would read
# the script's own stdin and never see the request line.
#
# Globals: HTTP_PORT (read), _WT_HTTP_FIFO_DIR (written; removed on INT/TERM)
# Exits:   1 if nc is missing or the FIFO cannot be created
run_http_server() {
  echo "Starting web traffic exporter on port $HTTP_PORT..." >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  # Detect the netcat flavour once: GNU/traditional nc needs "-p PORT" and
  # "-q 1" (quit one second after EOF on stdin); other variants take the
  # port as a positional argument.
  local -a listen_cmd
  if nc -h 2>&1 | grep -q 'GNU\|traditional'; then
    listen_cmd=(nc -l -p "$HTTP_PORT" -q 1)
  else
    listen_cmd=(nc -l "$HTTP_PORT")
  fi

  # Deliberately not local: the signal trap below must still see it.
  _WT_HTTP_FIFO_DIR=$(mktemp -d) || {
    echo "ERROR: cannot create temporary directory for HTTP mode" >&2
    exit 1
  }
  local fifo="${_WT_HTTP_FIFO_DIR}/response"
  if ! mkfifo "$fifo"; then
    rm -rf -- "$_WT_HTTP_FIFO_DIR"
    echo "ERROR: cannot create FIFO $fifo" >&2
    exit 1
  fi

  trap 'echo "Shutting down web traffic exporter..." >&2; [ -n "$_WT_HTTP_FIFO_DIR" ] && rm -rf -- "$_WT_HTTP_FIFO_DIR"; exit 0' INT TERM

  local index_page
  index_page=$(cat <<'HTMLEOF'
<!DOCTYPE html>
<html>
<head><title>Web Traffic Exporter v1.8</title></head>
<body>
<h1>Web Traffic Exporter v1.8</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Request totals by status, class, and method</li>
<li>Bandwidth totals and by status class</li>
<li>Unique visitors (IPs and user agents)</li>
<li>Top 10 requested paths</li>
<li>Top 10 external referrers</li>
<li>Bot vs human traffic detection</li>
<li>HTTP protocol version distribution</li>
<li>Request rate estimation</li>
<li>Web server process detection</li>
</ul>
</body>
</html>
HTMLEOF
)

  while true; do
    # Each side opens its end of the FIFO in its own process, so the two
    # blocking opens release each other.
    "${listen_cmd[@]}" < "$fifo" 2>/dev/null | (
      request=""
      # Only the request line matters; a client that sends nothing leaves
      # $request empty and gets the index page.
      IFS= read -r request
      if [[ "$request" =~ ^GET\ /metrics ]]; then
        # $(...) strips the final newline, but the exposition format requires
        # the last line to end with one; put it back.
        _wt_http_respond "text/plain; version=0.0.4" "$(generate_metrics)"$'\n'
      else
        _wt_http_respond "text/html" "$index_page"
      fi
    ) > "$fifo"
  done
}
|
|
|
|
# ============================================================================
|
|
# MAIN EXECUTION
|
|
# ============================================================================
|
|
|
|
# Script entry point: parse the command line, then run in one of three modes.
#
#   HTTP mode      (HTTP_MODE=true)   - serve metrics via run_http_server
#   textfile mode  (OUTPUT_FILE set)  - write metrics atomically to OUTPUT_FILE
#   stdout mode    (otherwise)        - print metrics to stdout
#
# In textfile mode the metrics are first written to a temporary file in the
# destination directory and only renamed over OUTPUT_FILE once they look
# sane, so a failed run never replaces the previous file.
#
# Globals:   HTTP_MODE, OUTPUT_FILE (read; expected to be set by parse_args)
# Arguments: the script's command-line arguments
# Exits:     1 if the metrics file cannot be produced or installed
main() {
  parse_args "$@"

  if [ "$HTTP_MODE" = true ]; then
    run_http_server
  elif [ -n "$OUTPUT_FILE" ]; then
    local output_dir temp_file file_lines

    output_dir="$(dirname "$OUTPUT_FILE")"
    if ! mkdir -p "$output_dir"; then
      echo "ERROR: Cannot create output directory $output_dir" >&2
      exit 1
    fi

    # The temp file lives next to the target so the final mv is a rename on
    # the same filesystem.
    if ! temp_file=$(mktemp "${output_dir}/.web_traffic_metrics.XXXXXX"); then
      echo "ERROR: Cannot create temporary file in $output_dir" >&2
      exit 1
    fi

    if ! generate_metrics > "$temp_file" 2>/dev/null; then
      rm -f "$temp_file"
      echo "ERROR: Failed to generate metrics" >&2
      exit 1
    fi

    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
    # Some wc implementations pad the count with blanks; strip them so the
    # number reads cleanly in the messages below.
    file_lines=${file_lines//[[:space:]]/}

    # Anything shorter than 5 lines cannot be a complete metrics document.
    if [ "${file_lines:-0}" -lt 5 ]; then
      rm -f "$temp_file"
      echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
      exit 1
    fi

    chmod 644 "$temp_file"
    if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
      rm -f "$temp_file"
      echo "ERROR: Cannot move metrics into place at $OUTPUT_FILE" >&2
      exit 1
    fi

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
  else
    generate_metrics
  fi
}
|
|
|
|
# Run the exporter, forwarding every command-line argument unchanged.
main "$@"
|