Files
linux-scripts/web-traffic-exporter.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

1614 lines
64 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# Script Name: web-traffic-exporter.sh
# Version: 1.8
# Description: Prometheus exporter for web server access log traffic metrics.
# Parses Nginx/Apache access logs and exports request counts,
# status codes, bandwidth, unique visitors, top paths, referrers,
# bot detection, and protocol distribution.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
# - Standard Unix tools (awk, grep, tail)
# - netcat (nc) for HTTP mode
# - Read access to web server access logs
#
# Usage:
# # Output to stdout
# ./web-traffic-exporter.sh
#
# # HTTP server mode
# ./web-traffic-exporter.sh --http -p 9199
#
# # Textfile collector mode
# ./web-traffic-exporter.sh --textfile
#
# Metrics Exported:
# Core Status:
# - web_traffic_up - Exporter status (1=up, 0=down)
# - web_traffic_exporter_info{version} - Exporter version
#
# Request Totals:
# - web_traffic_requests - Total requests in parsed window
# - web_traffic_requests_by_status{status} - Per HTTP status code
# - web_traffic_requests_by_class{class} - Per status class (2xx etc)
# - web_traffic_requests_by_method{method} - Per HTTP method
#
# Bandwidth:
# - web_traffic_response_bytes - Total bytes sent
# - web_traffic_response_bytes_by_class{class} - Bytes per class
#
# Unique Visitors:
# - web_traffic_unique_ips - Unique source IPs
# - web_traffic_unique_user_agents - Unique user agents
#
# Top Paths (top 10):
# - web_traffic_top_path_requests{path,rank} - Hits per path
#
# Top Referrers (top 10):
# - web_traffic_top_referrer_requests{referrer,rank} - Per referrer
#
# Bot Detection:
# - web_traffic_bot_requests - Bot requests
# - web_traffic_human_requests - Non-bot requests
# - web_traffic_bot_ratio - Bot / total ratio
#
# Downloads:
# - web_traffic_downloads - Total file downloads
# - web_traffic_downloads_bytes - Bytes from downloads
# - web_traffic_top_download_requests{file,rank} - Top downloads
#
# Status Breakdown:
# - web_traffic_status_2xx
# - web_traffic_status_3xx
# - web_traffic_status_4xx
# - web_traffic_status_5xx
#
# Request Rate:
# - web_traffic_requests_per_minute - Estimated from log timestamps
#
# Protocol:
# - web_traffic_requests_by_protocol{protocol} - HTTP version
#
# Hourly Patterns:
# - web_traffic_requests_by_hour{hour} - Requests per hour of day
#
# Derived Metrics:
# - web_traffic_average_response_bytes - Average response size
# - web_traffic_error_ratio - Ratio of 4xx+5xx to total requests
#
# 404 Errors:
# - web_traffic_404_path_requests{path,rank} - Top 404 paths
# - web_traffic_top_404_referrer_requests{referrer,rank} - Top 404 referrers
#
# Top Paths by Bandwidth:
# - web_traffic_top_path_response_bytes{path,rank} - Top paths by bytes
#
# Content Type:
# - web_traffic_page_views - Page view requests
# - web_traffic_asset_requests - Asset requests (css/js/images)
#
# Top Clients:
# - web_traffic_top_client_requests{ip,rank} - Top 10 client IPs
#
# Response Sizes:
# - web_traffic_response_size_bucket{size} - Requests by size range
#
# Top Bots:
# - web_traffic_top_bot_requests{bot,rank} - Top 10 bot names
#
# Server Status:
# - web_traffic_server_running{server} - 1 if process found
# - web_traffic_server_type{server} - Server info metric
#
# Time Windows (daily/weekly/monthly):
# - web_traffic_window_requests{window} - Requests in window
# - web_traffic_window_bytes{window} - Bytes in window
# - web_traffic_window_unique_ips{window} - Unique IPs in window
# - web_traffic_window_bot_requests{window} - Bot requests in window
# - web_traffic_window_requests_by_class{window,class} - Per class in window
# - web_traffic_window_page_views{window} - Page views in window
# - web_traffic_window_asset_requests{window} - Asset requests in window
# - web_traffic_window_downloads{window} - Downloads in window
# - web_traffic_window_downloads_bytes{window} - Download bytes in window
# - web_traffic_window_human_requests{window} - Non-bot requests in window
# - web_traffic_window_unique_user_agents{window} - Unique UAs in window
#
# Exporter:
# - web_traffic_exporter_duration_seconds - Script execution time
# - web_traffic_exporter_last_run_timestamp - Last run timestamp
# - web_traffic_exporter_lines_parsed - Lines parsed count
#
# Configuration:
# Default HTTP port: 9199
# Textfile directory: /var/lib/node_exporter
# ACCESS_LOG: /var/log/nginx/access.log (or WEB_TRAFFIC_ACCESS_LOG env)
# TAIL_LINES: 0 (all lines; or WEB_TRAFFIC_TAIL_LINES env)
# MAX_ROTATED: 7 (or WEB_TRAFFIC_MAX_ROTATED env)
# HTTP_TAIL_LINES: 5000 (or WEB_TRAFFIC_HTTP_TAIL_LINES env)
# TRACK_UNIQUE_UA: 1 (or WEB_TRAFFIC_TRACK_UNIQUE_UA env; 0=disable)
# REFERRER_MODE: host (or WEB_TRAFFIC_REFERRER_MODE env; host|full|off)
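# SITE_DOMAIN: (or WEB_TRAFFIC_SITE_DOMAIN env; e.g. mylinux.work)
# DOWNLOAD_PATH: /downloads/ (or WEB_TRAFFIC_DOWNLOAD_PATH env)
# Successful non-bot GETs under DOWNLOAD_PATH always count as downloads.
# Files matched only by extension (outside DOWNLOAD_PATH) count when the
# client is a download tool (wget, curl, ...) or an AI retrieval bot, or
# when a referrer is present: with SITE_DOMAIN set the referrer must
# contain that domain; with SITE_DOMAIN unset any non-empty referrer counts.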
#
################################################################################
# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9199
# Operational safety / performance knobs
# - Limit rotated files read when present (avoid huge scrape cost)
MAX_ROTATED="${WEB_TRAFFIC_MAX_ROTATED:-7}"
# - In HTTP mode, default to tailing N lines unless user explicitly set tail-lines
HTTP_TAIL_LINES_DEFAULT="${WEB_TRAFFIC_HTTP_TAIL_LINES:-5000}"
# - Reduce memory/cardinality cost:
# 1 = track unique user agents, 0 = disable
TRACK_UNIQUE_UA="${WEB_TRAFFIC_TRACK_UNIQUE_UA:-1}"
# Referrer mode: host | full | off
REFERRER_MODE="${WEB_TRAFFIC_REFERRER_MODE:-host}"
ACCESS_LOG="${WEB_TRAFFIC_ACCESS_LOG:-/var/log/nginx/access.log}"
LOG_DIR="${WEB_TRAFFIC_LOG_DIR:-}"
LOG_FORMAT="${WEB_TRAFFIC_LOG_FORMAT:-combined}"
TAIL_LINES="${WEB_TRAFFIC_TAIL_LINES:-0}"
SERVER_TYPE="${WEB_TRAFFIC_SERVER_TYPE:-auto}"
SITE_DOMAIN="${WEB_TRAFFIC_SITE_DOMAIN:-}"
DOWNLOAD_PATH="${WEB_TRAFFIC_DOWNLOAD_PATH:-/downloads/}"
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
prom_escape() {
# Escape Prometheus label values: \, ", and newlines.
# See: https://prometheus.io/docs/instrumenting/exposition_formats/
local s="$1"
s=${s//\\/\\\\}
s=${s//\"/\\\"}
s=${s//$'\n'/\\n}
printf '%s\n' "$s"
}
show_usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Export web server access log traffic metrics as Prometheus metrics (v1.8).
MODES:
--textfile Write to node_exporter textfile collector
--http Run HTTP server on port $HTTP_PORT
OPTIONS:
-p, --port HTTP port (default: 9199)
-o, --output Output file path
--access-log PATH Path to access log (default: $ACCESS_LOG)
--log-dir DIR Directory of per-domain logs (e.g., /var/log/nginx/domains)
--log-format FMT Log format: combined or common (default: combined)
--tail-lines NUM Number of log lines to parse (default: $TAIL_LINES, 0=all)
--server-type TYPE Server type: auto, nginx, apache (default: auto)
--site-domain DOM Your domain (e.g. mylinux.work) — downloads only count
when referred from this domain (filters bot/scanner hits)
--download-path P URL path prefix for downloads (default: /downloads/)
All requests under this path count as downloads
EXAMPLES:
$0 --textfile # Write to textfile collector
$0 --http --port 9199 # Run HTTP server
$0 --access-log /var/log/apache2/access.log # Use Apache log
$0 --log-dir /var/log/nginx/domains # Parse all domain logs (HestiaCP)
$0 -o /tmp/web_traffic.prom # Write to custom file
ENVIRONMENT VARIABLES:
WEB_TRAFFIC_ACCESS_LOG Path to access log
WEB_TRAFFIC_LOG_DIR Directory of per-domain logs
WEB_TRAFFIC_LOG_FORMAT Log format: combined or common
WEB_TRAFFIC_TAIL_LINES Number of log lines to parse (0=all)
WEB_TRAFFIC_SERVER_TYPE Server type: auto, nginx, apache
WEB_TRAFFIC_MAX_ROTATED Max rotated log files to read (default: 7)
WEB_TRAFFIC_HTTP_TAIL_LINES Default tail lines in HTTP mode (default: 5000)
WEB_TRAFFIC_TRACK_UNIQUE_UA Track unique user agents: 1 or 0 (default: 1)
WEB_TRAFFIC_REFERRER_MODE Referrer tracking: host, full, or off (default: host)
WEB_TRAFFIC_SITE_DOMAIN Your domain — downloads only count from this referrer
WEB_TRAFFIC_DOWNLOAD_PATH URL path prefix for downloads (default: /downloads/)
SECTIONS (auto-detected, skipped if unavailable):
- Request totals by status, class, and method
- Bandwidth totals and by status class
- Unique visitors (IPs and user agents)
- Top 10 requested paths
- Top 10 external referrers
- Bot vs human traffic detection
- HTTP protocol version distribution
- Request rate estimation
- Web server process detection
EOF
exit 0
}
parse_args() {
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help) show_usage ;;
--textfile) OUTPUT_FILE="$TEXTFILE_DIR/web_traffic.prom"; shift ;;
--http) HTTP_MODE=true; shift ;;
-p|--port) HTTP_PORT="$2"; shift 2 ;;
-o|--output) OUTPUT_FILE="$2"; shift 2 ;;
--access-log) ACCESS_LOG="$2"; shift 2 ;;
--log-dir) LOG_DIR="$2"; shift 2 ;;
--log-format) LOG_FORMAT="$2"; shift 2 ;;
--tail-lines) TAIL_LINES="$2"; shift 2 ;;
--server-type) SERVER_TYPE="$2"; shift 2 ;;
--site-domain) SITE_DOMAIN="$2"; shift 2 ;;
--download-path) DOWNLOAD_PATH="$2"; shift 2 ;;
*) echo "Unknown option: $1" >&2; exit 1 ;;
esac
done
}
# ============================================================================
# SERVER DETECTION
# ============================================================================
detect_server_type() {
if [ "$SERVER_TYPE" != "auto" ]; then
echo "$SERVER_TYPE"
return
fi
if pgrep -x nginx >/dev/null 2>&1; then
echo "nginx"
elif pgrep -x apache2 >/dev/null 2>&1; then
echo "apache"
elif pgrep -x httpd >/dev/null 2>&1; then
echo "apache"
else
echo "unknown"
fi
}
# ============================================================================
# LOG STREAMING (current + rotated logs)
# ============================================================================
# Stream log content from rotated logs (oldest first) then current log.
# Handles .log.N (plain) and .log.N.gz (compressed) files.
# Args: $1 - current log file path, $2 - max rotated files to include
stream_log_data() {
local log_file="$1"
local max_rotated="${2:-31}"
# Find rotated logs: domain.log.1, domain.log.2.gz, etc.
local rotated_files=()
local i
for i in $(seq "$max_rotated" -1 1); do
if [ -f "${log_file}.${i}.gz" ]; then
rotated_files+=("gz:${log_file}.${i}.gz")
elif [ -f "${log_file}.${i}" ]; then
rotated_files+=("plain:${log_file}.${i}")
fi
done
# Output rotated logs (oldest first)
for entry in "${rotated_files[@]}"; do
local type="${entry%%:*}"
local path="${entry#*:}"
if [ "$type" = "gz" ]; then
zcat "$path" 2>/dev/null
else
cat "$path" 2>/dev/null
fi
done
# Output current log
cat "$log_file" 2>/dev/null
}
# ============================================================================
# LOG PARSING (single-pass awk)
# ============================================================================
# Parse access log lines and output all metrics data in a structured format.
# This uses a single awk pass for performance.
# Output format: KEY value pairs, one per line
parse_access_log() {
local log_file="$1"
local num_lines="$2"
local format="$3"
[ -f "$log_file" ] || return
[ -r "$log_file" ] || return
local now_epoch
now_epoch=$(date +%s)
local cutoff_daily=$((now_epoch - 86400))
local cutoff_weekly=$((now_epoch - 604800))
local cutoff_monthly=$((now_epoch - 2592000))
# Check if rotated logs exist for this file
local has_rotated=false
if [ -f "${log_file}.1" ] || [ -f "${log_file}.1.gz" ]; then
has_rotated=true
fi
# Stream log data into awk: use rotated logs for full monthly history,
# or read the entire current log so daily/weekly/monthly windows differ
if [ "$has_rotated" = true ]; then
stream_log_data "$log_file" "$MAX_ROTATED"
elif [ "$num_lines" -gt 0 ] 2>/dev/null; then
tail -n "$num_lines" "$log_file" 2>/dev/null
else
cat "$log_file" 2>/dev/null
fi | awk -v fmt="$format" \
-v track_ua="$TRACK_UNIQUE_UA" \
-v ref_mode="$REFERRER_MODE" \
-v site_domain="$SITE_DOMAIN" \
-v download_path="$DOWNLOAD_PATH" \
-v cutoff_daily="$cutoff_daily" \
-v cutoff_weekly="$cutoff_weekly" \
-v cutoff_monthly="$cutoff_monthly" '
BEGIN {
total = 0
total_bytes = 0
bot_count = 0
lines_parsed = 0
first_ts = ""
last_ts = ""
# Month cumulative day offsets (non-leap), 1-based month indexing
split("0,31,59,90,120,151,181,212,243,273,304,334", mdays, ",")
# Month name to number lookup
split("Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec", mn, ",")
for (i = 1; i <= 12; i++) month_num[mn[i]] = i
# Window counters
win_requests["daily"] = 0; win_requests["weekly"] = 0; win_requests["monthly"] = 0
win_bytes["daily"] = 0; win_bytes["weekly"] = 0; win_bytes["monthly"] = 0
win_bots["daily"] = 0; win_bots["weekly"] = 0; win_bots["monthly"] = 0
}
function count_elems(a, k, n) { n=0; for (k in a) n++; return n }
# Track min/max epoch for stable RPM even if log lines arrive out-of-order
function parse_log_epoch(ts, parts, dparts, tparts, day, mon, year, hh, mm, ss, tz, sign, tzh, tzm, epoch) {
# Format: 17/Mar/2026:10:00:00 +0000
split(ts, parts, " ")
split(parts[1], dparts, "/")
day = dparts[1]+0
mon = month_num[dparts[2]]
# year:HH:MM:SS
split(dparts[3], tparts, ":")
year = tparts[1]+0; hh = tparts[2]+0; mm = tparts[3]+0; ss = tparts[4]+0
if (mon < 1) return 0
# Portable epoch calculation (no mktime dependency)
# Days from year
epoch = (year - 1970) * 365 + int((year - 1969) / 4) - int((year - 1901) / 100) + int((year - 1601) / 400)
# Days from months
epoch += mdays[mon] + day - 1
# Leap day adjustment for current year
if (mon > 2 && (year % 4 == 0 && (year % 100 != 0 || year % 400 == 0))) epoch++
# Convert to seconds and add time
epoch = epoch * 86400 + hh * 3600 + mm * 60 + ss
# Apply timezone offset
tz = parts[2]
if (tz != "") {
sign = (substr(tz, 1, 1) == "-") ? 1 : -1
tzh = substr(tz, 2, 2) + 0
tzm = substr(tz, 4, 2) + 0
epoch += sign * (tzh * 3600 + tzm * 60)
}
return epoch
}
{
lines_parsed++
# Parse combined/common log format using field splitting
# 1.2.3.4 - - [17/Mar/2026:10:00:00 +0000] "GET /path HTTP/1.1" 200 1234 "ref" "ua"
ip = $1
# Extract timestamp between [ and ]
timestamp = ""
if (match($0, /\[([^\]]+)\]/) ) {
timestamp = substr($0, RSTART+1, RLENGTH-2)
}
# Extract the request line between first pair of quotes
request_line = ""
p1 = index($0, "\"")
if (p1 > 0) {
rest = substr($0, p1+1)
p2 = index(rest, "\"")
if (p2 > 0) {
request_line = substr(rest, 1, p2-1)
}
}
if (request_line == "") next
# Split request line: METHOD PATH PROTOCOL
n_req = split(request_line, req_parts, " ")
if (n_req < 2) next
method = req_parts[1]
if (method != "GET" && method != "HEAD" && method != "POST" && method != "PUT" && method != "DELETE" && method != "PATCH" && method != "OPTIONS" && method != "CONNECT" && method != "TRACE") next
path = req_parts[2]
protocol = (n_req >= 3) ? req_parts[3] : ""
# After the closing quote of request line, find status and bytes
after_req = substr($0, p1 + 1 + p2)
gsub(/^ +/, "", after_req)
n_after = split(after_req, after_parts, " ")
if (n_after < 2) next
status = after_parts[1]
bytes = after_parts[2]
if (status !~ /^[0-9]+$/) next
if (first_ts == "") first_ts = timestamp
last_ts = timestamp
if (bytes == "-") bytes = 0
total++
total_bytes += bytes
# Status codes
status_count[status]++
# Status classes
class_code = substr(status, 1, 1) "xx"
class_count[class_code]++
class_bytes[class_code] += bytes
# Methods
method_count[method]++
# Unique IPs
ips[ip] = 1
# Paths (clean query strings for grouping)
split(path, pathparts, "?")
clean_path = pathparts[1]
path_count[clean_path]++
path_bytes[clean_path] += bytes
# Download tracking — deferred until after bot detection (see below)
is_download = 0
is_downloadable = 0
is_path_download = 0
if (method == "GET" && substr(status, 1, 1) == "2") {
if (download_path != "" && index(clean_path, download_path) == 1) {
is_downloadable = 1
is_path_download = 1
} else if (clean_path ~ /\.(sh|ps1|py|pl|rb|json|yml|yaml|xml|csv|conf|cfg|prom|txt)$/ \
|| clean_path ~ /\.(zip|tar|gz|tgz|bz2|xz|7z|rar)$/ \
|| clean_path ~ /\.(pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods)$/ \
|| clean_path ~ /\.(deb|rpm|msi|exe|dmg|pkg|appimage|AppImage)$/ \
|| clean_path ~ /\.(iso|img|bin|run)$/) {
is_downloadable = 1
}
}
# Protocol
if (protocol != "") {
gsub(/^ +| +$/, "", protocol)
if (protocol != "") proto_count[protocol]++
}
is_bot = 0
# Referrer and User-Agent (combined format only)
if (fmt == "combined") {
ref = ""
ua = ""
# Split the whole line by double-quote to extract quoted fields
n = split($0, qparts, "\"")
# qparts[2] = request line, qparts[4] = referrer, qparts[6] = user-agent
if (n >= 6) {
ref = qparts[4]
ua = qparts[6]
} else if (n >= 4) {
ref = qparts[4]
}
# Referrer counting: reduce cardinality by default (host-only).
if (ref_mode != "off" && ref != "" && ref != "-") {
ref_key = ref
if (ref_mode == "host") {
# Extract host from http(s)://host/... (portable, no gawk capture groups)
ref_key = ref
sub(/^https?:\/\//, "", ref_key)
sub(/\/.*/, "", ref_key)
}
referrer_count[ref_key]++
if (status == "404" && ref_key != "") {
error_404_referrer[ref_key]++
}
}
# Unique UA tracking can be expensive; allow disabling.
if (track_ua == 1 && ua != "" && ua != "-") {
user_agents[ua] = 1
}
# Bot detection
# NOTE: Google uses multiple crawler UA tokens beyond plain "Googlebot"
# See: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
if (ua ~ /(Googlebot([-/][A-Za-z0-9._]+)?|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/ ||
ua ~ /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/ ||
ua ~ /([Bb]ot|[Cc]rawl|[Ss]pider|[Ss]lurp|bingbot|BingPreview|YandexBot|Baiduspider|DuckDuckBot|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|MJ12bot|AhrefsBot|SemrushBot|DotBot|PetalBot)/) {
bot_count++
is_bot = 1
}
}
# AI retrieval bots (ChatGPT-User, PerplexityBot) fetch content on
# behalf of a real user — count their downloads as legitimate.
is_ai_retrieval = 0
if (is_bot && ua ~ /(ChatGPT-User|PerplexityBot)/) {
is_ai_retrieval = 1
}
# Count downloads only for non-bot requests that look like real
# users (plus AI retrieval bots). Signals indicating a real
# download:
# 1. Path-based: any non-bot hit on --download-path (the path
# itself signals intent — nobody browses /downloads/ casually)
# 2. AI retrieval bot (ChatGPT-User, PerplexityBot — user asked)
# 3. Download-tool UA (wget, curl, aria2 — user copied the URL)
# 4. Referrer from the site (user clicked a download link)
# Extension-based downloads outside the download path still require
# signal 2-4 to avoid counting embedded/linked assets.
if (is_downloadable && (!is_bot || is_ai_retrieval)) {
is_real_download = 0
# Path-based downloads: trust all non-bot requests
if (is_path_download) {
is_real_download = 1
}
# AI retrieval bots are always real downloads
if (!is_real_download && is_ai_retrieval) {
is_real_download = 1
}
# Check for download-tool user agents
if (!is_real_download && (ua ~ /^(Wget|curl|aria2|libcurl|Go-http-client|python-requests|HTTPie)/ ||
ua ~ /^(ufw-threat-feeds|ufw-blocklist|iptables-threat-feeds|iptables-blocklist)/)) {
is_real_download = 1
}
# Check for site referrer (browser click)
if (!is_real_download && ref != "" && ref != "-") {
if (site_domain != "") {
if (index(ref, site_domain) > 0) is_real_download = 1
} else {
is_real_download = 1
}
}
if (is_real_download) {
download_total++
download_bytes += bytes
download_count[clean_path]++
is_download = 1
}
}
# Time-windowed stats + min/max epoch
epoch = parse_log_epoch(timestamp)
# Hourly traffic patterns (last 24 hours only)
if (epoch > 0 && epoch >= cutoff_daily) {
split(timestamp, ts_parts, ":")
hour = ts_parts[2]+0
hour_count[hour]++
}
if (epoch > 0) {
if (min_epoch == "" || epoch < min_epoch) min_epoch = epoch
if (max_epoch == "" || epoch > max_epoch) max_epoch = epoch
}
# 404 error paths
if (status == "404") {
error_404_count[clean_path]++
}
# Page views vs assets
is_page = 0
is_asset = 0
if (clean_path ~ /\.(css|js|png|jpg|jpeg|gif|svg|woff|woff2|ttf|ico|webp)$/) {
asset_requests++
is_asset = 1
} else if (clean_path ~ /\/$/ || clean_path ~ /\.html?$/ || clean_path !~ /\.[a-zA-Z0-9]+$/) {
page_views++
is_page = 1
}
# Top client IPs (already tracking ips[ip]=1, add counter)
ip_count[ip]++
# Response size distribution
if (bytes+0 <= 1024) {
size_bucket["tiny"]++
} else if (bytes+0 <= 10240) {
size_bucket["small"]++
} else if (bytes+0 <= 102400) {
size_bucket["medium"]++
} else if (bytes+0 <= 1048576) {
size_bucket["large"]++
} else {
size_bucket["huge"]++
}
# Bot name extraction (when bot detected)
if (is_bot && ua != "") {
bot_name = ""
# Prefer specific/official bot tokens first (better grouping)
if (match(ua, /(Googlebot-[A-Za-z0-9._-]+|Googlebot\/[0-9.]+|Googlebot|Google-Extended|AdsBot-Google|Mediapartners-Google|StoreBot-Google)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /(bingbot|BingPreview)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /(YandexBot|Baiduspider|DuckDuckBot|Slurp)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /(facebookexternalhit|Twitterbot|LinkedInBot|Applebot)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /(AhrefsBot|SemrushBot|MJ12bot|DotBot|PetalBot)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /(meta-webindexer|ChatGPT-User|OAI-SearchBot|Amzn-SearchBot|PerplexityBot)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
} else if (match(ua, /([Bb]ot[a-zA-Z]*|[Cc]rawler|[Ss]pider)/)) {
bot_name = substr(ua, RSTART, RLENGTH)
}
if (bot_name != "") {
bot_name_count[bot_name]++
}
}
if (epoch > 0) {
if (epoch >= cutoff_daily) {
win_requests["daily"]++
win_bytes["daily"] += bytes
win_ips_daily[ip] = 1
if (is_bot) win_bots["daily"]++
if (!is_bot) win_human_daily++
win_class_daily[class_code]++
if (is_page) win_page_views_daily++
if (is_asset) win_asset_requests_daily++
if (is_download) win_downloads_daily++
win_downloads_bytes_daily += (is_download ? bytes : 0)
if (track_ua == 1 && ua != "" && ua != "-") win_uas_daily[ua] = 1
}
if (epoch >= cutoff_weekly) {
win_requests["weekly"]++
win_bytes["weekly"] += bytes
win_ips_weekly[ip] = 1
if (is_bot) win_bots["weekly"]++
}
if (epoch >= cutoff_monthly) {
win_requests["monthly"]++
win_bytes["monthly"] += bytes
win_ips_monthly[ip] = 1
if (is_bot) win_bots["monthly"]++
}
}
}
END {
print "LINES_PARSED " lines_parsed
print "TOTAL_REQUESTS " total
print "TOTAL_BYTES " total_bytes
print "BOT_REQUESTS " bot_count
print "HUMAN_REQUESTS " (total - bot_count)
print "UNIQUE_IPS " count_elems(ips)
print "UNIQUE_UAS " count_elems(user_agents)
print "FIRST_TS " first_ts
print "LAST_TS " last_ts
print "MIN_EPOCH " (min_epoch=="" ? 0 : min_epoch)
print "MAX_EPOCH " (max_epoch=="" ? 0 : max_epoch)
# Status codes
for (s in status_count) {
print "STATUS " s " " status_count[s]
}
# Status classes
for (c in class_count) {
print "CLASS " c " " class_count[c]
}
# Class bytes
for (c in class_bytes) {
print "CLASS_BYTES " c " " class_bytes[c]
}
# Methods
for (m in method_count) {
print "METHOD " m " " method_count[m]
}
# Protocols
for (p in proto_count) {
print "PROTOCOL " p " " proto_count[p]
}
# Top paths (sort by count, output top 10)
# We use a simple selection approach
for (i = 1; i <= 10; i++) {
max_count = 0
max_path = ""
for (p in path_count) {
if (path_count[p] > max_count) {
max_count = path_count[p]
max_path = p
}
}
if (max_path != "") {
print "TOP_PATH " i " " max_count " " max_path
delete path_count[max_path]
}
}
# Top paths by bytes
for (i = 1; i <= 10; i++) {
max_bytes = 0
max_path = ""
for (p in path_bytes) {
if (path_bytes[p] > max_bytes) {
max_bytes = path_bytes[p]
max_path = p
}
}
if (max_path != "") {
print "TOP_PATH_BYTES " i " " max_bytes " " max_path
delete path_bytes[max_path]
}
}
# Top referrers (sort by count, output top 10)
for (i = 1; i <= 10; i++) {
max_count = 0
max_ref = ""
for (r in referrer_count) {
if (referrer_count[r] > max_count) {
max_count = referrer_count[r]
max_ref = r
}
}
if (max_ref != "") {
print "TOP_REF " i " " max_count " " max_ref
delete referrer_count[max_ref]
}
}
# Downloads
print "DOWNLOAD_TOTAL " download_total+0
print "DOWNLOAD_BYTES " download_bytes+0
for (i = 1; i <= 10; i++) {
max_count = 0
max_dl = ""
for (d in download_count) {
if (download_count[d] > max_count) {
max_count = download_count[d]
max_dl = d
}
}
if (max_dl != "") {
print "TOP_DOWNLOAD " i " " max_count " " max_dl
delete download_count[max_dl]
}
}
# Hourly distribution
for (h = 0; h <= 23; h++) {
printf "HOUR %02d %d\n", h, hour_count[h]+0
}
# Top 404 paths
for (i = 1; i <= 10; i++) {
max_count = 0
max_path = ""
for (p in error_404_count) {
if (error_404_count[p] > max_count) {
max_count = error_404_count[p]
max_path = p
}
}
if (max_path != "") {
print "TOP_404 " i " " max_count " " max_path
delete error_404_count[max_path]
}
}
# Top 404 referrers
for (i = 1; i <= 10; i++) {
max_count = 0
max_ref = ""
for (r in error_404_referrer) {
if (error_404_referrer[r] > max_count) {
max_count = error_404_referrer[r]
max_ref = r
}
}
if (max_ref != "") {
print "TOP_404_REF " i " " max_count " " max_ref
delete error_404_referrer[max_ref]
}
}
# Page views vs assets
print "PAGE_VIEWS " page_views+0
print "ASSET_REQUESTS " asset_requests+0
# Top client IPs
for (i = 1; i <= 10; i++) {
max_count = 0
max_ip = ""
for (p in ip_count) {
if (ip_count[p] > max_count) {
max_count = ip_count[p]
max_ip = p
}
}
if (max_ip != "") {
print "TOP_IP " i " " max_count " " max_ip
delete ip_count[max_ip]
}
}
# Response size distribution
sizes[1] = "tiny"; sizes[2] = "small"; sizes[3] = "medium"; sizes[4] = "large"; sizes[5] = "huge"
for (s = 1; s <= 5; s++) {
print "SIZE_BUCKET " sizes[s] " " size_bucket[sizes[s]]+0
}
# Top bot names
for (i = 1; i <= 10; i++) {
max_count = 0
max_bot = ""
for (b in bot_name_count) {
if (bot_name_count[b] > max_count) {
max_count = bot_name_count[b]
max_bot = b
}
}
if (max_bot != "") {
print "TOP_BOT " i " " max_count " " max_bot
delete bot_name_count[max_bot]
}
}
# Time-windowed summaries
windows[1] = "daily"; windows[2] = "weekly"; windows[3] = "monthly"
for (w = 1; w <= 3; w++) {
wname = windows[w]
print "WIN_REQUESTS " wname " " win_requests[wname]
print "WIN_BYTES " wname " " win_bytes[wname]
print "WIN_BOTS " wname " " win_bots[wname]
}
print "WIN_UNIQUE_IPS daily " count_elems(win_ips_daily)
print "WIN_UNIQUE_IPS weekly " count_elems(win_ips_weekly)
print "WIN_UNIQUE_IPS monthly " count_elems(win_ips_monthly)
# Daily window extended metrics
print "WIN_STATUS_CLASS daily 2xx " win_class_daily["2xx"]+0
print "WIN_STATUS_CLASS daily 3xx " win_class_daily["3xx"]+0
print "WIN_STATUS_CLASS daily 4xx " win_class_daily["4xx"]+0
print "WIN_STATUS_CLASS daily 5xx " win_class_daily["5xx"]+0
print "WIN_PAGE_VIEWS daily " win_page_views_daily+0
print "WIN_ASSET_REQUESTS daily " win_asset_requests_daily+0
print "WIN_DOWNLOADS daily " win_downloads_daily+0
print "WIN_DOWNLOADS_BYTES daily " win_downloads_bytes_daily+0
print "WIN_HUMAN_REQUESTS daily " win_human_daily+0
print "WIN_UNIQUE_UAS daily " count_elems(win_uas_daily)
}
'
}
# ============================================================================
# METRICS GENERATION
# ============================================================================
# Emit HELP/TYPE lines only once per metric name (avoids duplicates in --log-dir mode)
_emitted_help=""
emit_help_type() {
local metric="$1" help_text="$2" mtype="$3"
case "$_emitted_help" in
*"|${metric}|"*) return ;;
esac
_emitted_help="${_emitted_help}|${metric}|"
echo "# HELP $metric $help_text"
echo "# TYPE $metric $mtype"
}
generate_metrics() {
_emitted_help=""
local script_start
script_start=$(date +%s)
# ========================================================================
# Exporter Status
# ========================================================================
cat <<EOF
# HELP web_traffic_up Exporter status (1=up)
# TYPE web_traffic_up gauge
web_traffic_up 1
# HELP web_traffic_exporter_info Exporter version information
# TYPE web_traffic_exporter_info gauge
web_traffic_exporter_info{version="1.8"} 1
EOF
# ========================================================================
# Server Detection
# ========================================================================
local detected_server
detected_server=$(detect_server_type)
detected_server=$(prom_escape "$detected_server")
local nginx_running=0 apache_running=0
if pgrep -x nginx >/dev/null 2>&1; then
nginx_running=1
fi
if pgrep -x apache2 >/dev/null 2>&1 || pgrep -x httpd >/dev/null 2>&1; then
apache_running=1
fi
cat <<EOF
# HELP web_traffic_server_running Whether the web server process is running (1=running)
# TYPE web_traffic_server_running gauge
web_traffic_server_running{server="nginx"} $nginx_running
web_traffic_server_running{server="apache"} $apache_running
# HELP web_traffic_server_type Detected web server type
# TYPE web_traffic_server_type gauge
web_traffic_server_type{server="$detected_server"} 1
EOF
# ========================================================================
# Access Log Parsing
# ========================================================================
# Safety: in HTTP mode, default to tailing some lines unless user chose otherwise.
# Prevents extremely expensive scrapes on large/rotated logs.
if [ "$HTTP_MODE" = true ]; then
if [ "${TAIL_LINES:-0}" -eq 0 ] 2>/dev/null; then
TAIL_LINES="$HTTP_TAIL_LINES_DEFAULT"
fi
fi
# Build list of log files to process: (file:domain) pairs
local log_files=()
local log_domains=()
if [ -n "$LOG_DIR" ] && [ -d "$LOG_DIR" ]; then
for log_file in "$LOG_DIR"/*.log; do
[ -f "$log_file" ] || continue
# skip error logs, byte logs, and other non-access logs
case "$log_file" in
*.error.log|*.bytes.log|*.ssl.log) continue ;;
esac
log_files+=("$log_file")
local domain_name
domain_name=$(basename "$log_file" .log)
log_domains+=("$domain_name")
done
else
if [ -f "$ACCESS_LOG" ]; then
log_files+=("$ACCESS_LOG")
log_domains+=("")
fi
fi
if [ ${#log_files[@]} -eq 0 ]; then
cat <<EOF
# HELP web_traffic_requests Total requests in parsed window
# TYPE web_traffic_requests gauge
web_traffic_requests 0
EOF
else
local metrics_buf
metrics_buf=$(mktemp)
local file_idx
for file_idx in "${!log_files[@]}"; do
local current_log="${log_files[$file_idx]}"
local current_domain="${log_domains[$file_idx]}"
# Build label string: empty for single log, domain="x" for multi
local dlabel=""
local dlabel_comma=""
if [ -n "$current_domain" ]; then
local current_domain_esc
current_domain_esc=$(prom_escape "$current_domain")
dlabel="domain=\"$current_domain_esc\""
dlabel_comma="domain=\"$current_domain_esc\","
fi
# Write parsed data to temp file to avoid repeated echo|pipe forks
local parsed_file
parsed_file=$(mktemp)
parse_access_log "$current_log" "$TAIL_LINES" "$LOG_FORMAT" > "$parsed_file"
if [ ! -s "$parsed_file" ]; then
rm -f "$parsed_file"
continue
fi
# Extract all scalar values in a single awk pass
local total_requests total_bytes bot_requests human_requests
local unique_ips unique_uas lines_parsed first_ts last_ts
local dl_total dl_bytes page_views asset_reqs min_epoch max_epoch
eval "$(awk '
/^LINES_PARSED / { printf "lines_parsed=%s\n", $2 }
/^TOTAL_REQUESTS / { printf "total_requests=%s\n", $2 }
/^TOTAL_BYTES / { printf "total_bytes=%s\n", $2 }
/^BOT_REQUESTS / { printf "bot_requests=%s\n", $2 }
/^HUMAN_REQUESTS / { printf "human_requests=%s\n", $2 }
/^UNIQUE_IPS / { printf "unique_ips=%s\n", $2 }
/^UNIQUE_UAS / { printf "unique_uas=%s\n", $2 }
/^FIRST_TS / { printf "first_ts=\"%s %s\"\n", $2, $3 }
/^LAST_TS / { printf "last_ts=\"%s %s\"\n", $2, $3 }
/^DOWNLOAD_TOTAL / { printf "dl_total=%s\n", $2 }
/^DOWNLOAD_BYTES / { printf "dl_bytes=%s\n", $2 }
/^PAGE_VIEWS / { printf "page_views=%s\n", $2 }
/^ASSET_REQUESTS / { printf "asset_reqs=%s\n", $2 }
/^MIN_EPOCH / { printf "min_epoch=%s\n", $2 }
/^MAX_EPOCH / { printf "max_epoch=%s\n", $2 }
' "$parsed_file")"
# ================================================================
# Request Totals
# ================================================================
# Build label wrapper: "label" or empty for single log
local lwrap="" lwrap_comma="{"
if [ -n "$dlabel" ]; then
lwrap="{${dlabel}}"
lwrap_comma="{${dlabel_comma}"
fi
emit_help_type web_traffic_requests "Total requests in parsed window" gauge
echo "web_traffic_requests${lwrap} ${total_requests:-0}"
echo ""
emit_help_type web_traffic_response_bytes "Total response bytes in parsed window" gauge
echo "web_traffic_response_bytes${lwrap} ${total_bytes:-0}"
echo ""
# ================================================================
# Status Codes
# ================================================================
local status_lines
status_lines=$(grep "^STATUS " "$parsed_file")
if [ -n "$status_lines" ]; then
emit_help_type web_traffic_requests_by_status "Requests per HTTP status code" gauge
echo "$status_lines" | while read -r _ status count; do
esc_status=$(prom_escape "$status")
echo "web_traffic_requests_by_status${lwrap_comma}status=\"$esc_status\"} $count"
done
echo ""
fi
# ================================================================
# Status Classes
# ================================================================
local class_lines
class_lines=$(grep "^CLASS [0-9]" "$parsed_file")
if [ -n "$class_lines" ]; then
emit_help_type web_traffic_requests_by_class "Requests per status class" gauge
echo "$class_lines" | while read -r _ class count; do
esc_class=$(prom_escape "$class")
echo "web_traffic_requests_by_class${lwrap_comma}class=\"$esc_class\"} $count"
done
echo ""
local s2xx s3xx s4xx s5xx
s2xx=$(echo "$class_lines" | awk '/^CLASS 2xx / {print $3}')
s3xx=$(echo "$class_lines" | awk '/^CLASS 3xx / {print $3}')
s4xx=$(echo "$class_lines" | awk '/^CLASS 4xx / {print $3}')
s5xx=$(echo "$class_lines" | awk '/^CLASS 5xx / {print $3}')
emit_help_type web_traffic_status_2xx "Total 2xx responses" gauge
echo "web_traffic_status_2xx${lwrap} ${s2xx:-0}"
echo ""
emit_help_type web_traffic_status_3xx "Total 3xx responses" gauge
echo "web_traffic_status_3xx${lwrap} ${s3xx:-0}"
echo ""
emit_help_type web_traffic_status_4xx "Total 4xx responses" gauge
echo "web_traffic_status_4xx${lwrap} ${s4xx:-0}"
echo ""
emit_help_type web_traffic_status_5xx "Total 5xx responses" gauge
echo "web_traffic_status_5xx${lwrap} ${s5xx:-0}"
echo ""
fi
# ================================================================
# Class Bytes
# ================================================================
local class_bytes_lines
class_bytes_lines=$(grep "^CLASS_BYTES " "$parsed_file")
if [ -n "$class_bytes_lines" ]; then
emit_help_type web_traffic_response_bytes_by_class "Response bytes per status class" gauge
echo "$class_bytes_lines" | while read -r _ class bytes; do
esc_class=$(prom_escape "$class")
echo "web_traffic_response_bytes_by_class${lwrap_comma}class=\"$esc_class\"} $bytes"
done
echo ""
fi
# ================================================================
# Methods
# ================================================================
local method_lines
method_lines=$(grep "^METHOD " "$parsed_file")
if [ -n "$method_lines" ]; then
emit_help_type web_traffic_requests_by_method "Requests per HTTP method" gauge
echo "$method_lines" | while read -r _ method count; do
esc_method=$(prom_escape "$method")
echo "web_traffic_requests_by_method${lwrap_comma}method=\"$esc_method\"} $count"
done
echo ""
fi
# ================================================================
# Unique Visitors
# ================================================================
emit_help_type web_traffic_unique_ips "Unique source IPs in parsed window" gauge
echo "web_traffic_unique_ips${lwrap} ${unique_ips:-0}"
echo ""
emit_help_type web_traffic_unique_user_agents "Unique user agents in parsed window" gauge
echo "web_traffic_unique_user_agents${lwrap} ${unique_uas:-0}"
echo ""
# ================================================================
# Bot Detection (only meaningful with combined format)
# ================================================================
if [ "$LOG_FORMAT" = "combined" ]; then
local bot_ratio="0"
if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
bot_ratio=$(awk "BEGIN {printf \"%.4f\", ${bot_requests:-0} / ${total_requests:-1}}")
fi
emit_help_type web_traffic_bot_requests "Total bot requests detected" gauge
echo "web_traffic_bot_requests${lwrap} ${bot_requests:-0}"
echo ""
emit_help_type web_traffic_human_requests "Total non-bot requests" gauge
echo "web_traffic_human_requests${lwrap} ${human_requests:-0}"
echo ""
emit_help_type web_traffic_bot_ratio "Ratio of bot requests to total requests" gauge
echo "web_traffic_bot_ratio${lwrap} $bot_ratio"
echo ""
fi
# ================================================================
# Derived Metrics
# ================================================================
local avg_response_bytes="0"
if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
avg_response_bytes=$(awk "BEGIN {printf \"%.0f\", ${total_bytes:-0} / ${total_requests:-1}}")
fi
emit_help_type web_traffic_average_response_bytes "Average response size in bytes" gauge
echo "web_traffic_average_response_bytes${lwrap} $avg_response_bytes"
echo ""
local error_ratio="0"
if [ "${total_requests:-0}" -gt 0 ] 2>/dev/null; then
local s4xx_val s5xx_val
s4xx_val=$(grep "^CLASS 4xx " "$parsed_file" | awk '{print $3}')
s5xx_val=$(grep "^CLASS 5xx " "$parsed_file" | awk '{print $3}')
error_ratio=$(awk "BEGIN {printf \"%.4f\", (${s4xx_val:-0} + ${s5xx_val:-0}) / ${total_requests:-1}}")
fi
emit_help_type web_traffic_error_ratio "Ratio of 4xx+5xx errors to total requests" gauge
echo "web_traffic_error_ratio${lwrap} $error_ratio"
echo ""
# ================================================================
# Top Paths
# ================================================================
local top_path_lines
top_path_lines=$(grep "^TOP_PATH " "$parsed_file")
if [ -n "$top_path_lines" ]; then
emit_help_type web_traffic_top_path_requests "Top requested paths by hit count" gauge
echo "$top_path_lines" | while read -r _ rank count path; do
esc_path=$(prom_escape "$path")
echo "web_traffic_top_path_requests${lwrap_comma}path=\"$esc_path\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Top Paths by Bandwidth
# ================================================================
local top_path_bytes_lines
top_path_bytes_lines=$(grep "^TOP_PATH_BYTES " "$parsed_file")
if [ -n "$top_path_bytes_lines" ]; then
emit_help_type web_traffic_top_path_response_bytes "Top paths by response bytes" gauge
echo "$top_path_bytes_lines" | while read -r _ rank bytes path; do
esc_path=$(prom_escape "$path")
echo "web_traffic_top_path_response_bytes${lwrap_comma}path=\"$esc_path\",rank=\"$rank\"} $bytes"
done
echo ""
fi
# ================================================================
# Top Referrers
# ================================================================
local top_ref_lines
top_ref_lines=$(grep "^TOP_REF " "$parsed_file")
if [ -n "$top_ref_lines" ]; then
emit_help_type web_traffic_top_referrer_requests "Top referrers by hit count" gauge
echo "$top_ref_lines" | while read -r _ rank count referrer; do
esc_ref=$(prom_escape "$referrer")
echo "web_traffic_top_referrer_requests${lwrap_comma}referrer=\"$esc_ref\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Downloads
# ================================================================
emit_help_type web_traffic_downloads "Total file downloads" gauge
echo "web_traffic_downloads${lwrap} ${dl_total:-0}"
echo ""
emit_help_type web_traffic_downloads_bytes "Total bytes from file downloads" gauge
echo "web_traffic_downloads_bytes${lwrap} ${dl_bytes:-0}"
echo ""
local top_dl_lines
top_dl_lines=$(grep "^TOP_DOWNLOAD " "$parsed_file")
if [ -n "$top_dl_lines" ]; then
emit_help_type web_traffic_top_download_requests "Top downloaded files by hit count" gauge
echo "$top_dl_lines" | while read -r _ rank count filepath; do
esc_file=$(prom_escape "$filepath")
echo "web_traffic_top_download_requests${lwrap_comma}file=\"$esc_file\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Hourly Traffic Patterns
# ================================================================
local hour_lines
hour_lines=$(grep "^HOUR " "$parsed_file")
if [ -n "$hour_lines" ]; then
emit_help_type web_traffic_requests_by_hour "Requests per hour of day" gauge
echo "$hour_lines" | while read -r _ hour count; do
esc_hour=$(prom_escape "$hour")
echo "web_traffic_requests_by_hour${lwrap_comma}hour=\"$esc_hour\"} $count"
done
echo ""
fi
# ================================================================
# 404 Error Paths
# ================================================================
local top_404_lines
top_404_lines=$(grep "^TOP_404 " "$parsed_file")
if [ -n "$top_404_lines" ]; then
emit_help_type web_traffic_404_path_requests "Top paths returning 404" gauge
echo "$top_404_lines" | while read -r _ rank count path; do
esc_path=$(prom_escape "$path")
echo "web_traffic_404_path_requests${lwrap_comma}path=\"$esc_path\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Top 404 Referrers
# ================================================================
local top_404_ref_lines
top_404_ref_lines=$(grep "^TOP_404_REF " "$parsed_file")
if [ -n "$top_404_ref_lines" ]; then
emit_help_type web_traffic_top_404_referrer_requests "Top referrers sending traffic to 404 pages" gauge
echo "$top_404_ref_lines" | while read -r _ rank count referrer; do
esc_ref=$(prom_escape "$referrer")
echo "web_traffic_top_404_referrer_requests${lwrap_comma}referrer=\"$esc_ref\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Page Views vs Assets
# ================================================================
emit_help_type web_traffic_page_views "Total page view requests" gauge
echo "web_traffic_page_views${lwrap} ${page_views:-0}"
echo ""
emit_help_type web_traffic_asset_requests "Total asset requests" gauge
echo "web_traffic_asset_requests${lwrap} ${asset_reqs:-0}"
echo ""
# ================================================================
# Top Client IPs
# ================================================================
local top_ip_lines
top_ip_lines=$(grep "^TOP_IP " "$parsed_file")
if [ -n "$top_ip_lines" ]; then
emit_help_type web_traffic_top_client_requests "Top client IPs by request count" gauge
echo "$top_ip_lines" | while read -r _ rank count ip; do
esc_ip=$(prom_escape "$ip")
echo "web_traffic_top_client_requests${lwrap_comma}ip=\"$esc_ip\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Response Size Distribution
# ================================================================
local size_lines
size_lines=$(grep "^SIZE_BUCKET " "$parsed_file")
if [ -n "$size_lines" ]; then
emit_help_type web_traffic_response_size_bucket "Requests per response size range" gauge
echo "$size_lines" | while read -r _ size count; do
esc_size=$(prom_escape "$size")
echo "web_traffic_response_size_bucket${lwrap_comma}size=\"$esc_size\"} $count"
done
echo ""
fi
# ================================================================
# Top Bot Names
# ================================================================
local top_bot_lines
top_bot_lines=$(grep "^TOP_BOT " "$parsed_file")
if [ -n "$top_bot_lines" ]; then
emit_help_type web_traffic_top_bot_requests "Top bots by request count" gauge
echo "$top_bot_lines" | while read -r _ rank count bot; do
esc_bot=$(prom_escape "$bot")
echo "web_traffic_top_bot_requests${lwrap_comma}bot=\"$esc_bot\",rank=\"$rank\"} $count"
done
echo ""
fi
# ================================================================
# Protocol Distribution
# ================================================================
local proto_lines
proto_lines=$(grep "^PROTOCOL " "$parsed_file")
if [ -n "$proto_lines" ]; then
emit_help_type web_traffic_requests_by_protocol "Requests per HTTP protocol version" gauge
echo "$proto_lines" | while read -r _ proto count; do
esc_proto=$(prom_escape "$proto")
echo "web_traffic_requests_by_protocol${lwrap_comma}protocol=\"$esc_proto\"} $count"
done
echo ""
fi
# ================================================================
# Request Rate Estimation
# ================================================================
local rpm=0
if [ "${min_epoch:-0}" -gt 0 ] 2>/dev/null && [ "${max_epoch:-0}" -gt 0 ] 2>/dev/null; then
local duration=$((max_epoch - min_epoch))
if [ "$duration" -gt 0 ]; then
rpm=$(awk "BEGIN {printf \"%.2f\", (${total_requests:-0} / $duration) * 60}")
fi
fi
emit_help_type web_traffic_requests_per_minute "Estimated requests per minute from log window" gauge
echo "web_traffic_requests_per_minute${lwrap} $rpm"
echo ""
# ================================================================
# Lines Parsed
# ================================================================
emit_help_type web_traffic_exporter_lines_parsed "Number of log lines parsed" gauge
echo "web_traffic_exporter_lines_parsed${lwrap} ${lines_parsed:-0}"
echo ""
# ================================================================
# Time-Windowed Stats (daily/weekly/monthly)
# ================================================================
local win_lines
win_lines=$(grep "^WIN_" "$parsed_file")
if [ -n "$win_lines" ]; then
local wlp="{"
[ -n "$dlabel_comma" ] && wlp="{${dlabel_comma}"
emit_help_type web_traffic_window_requests "Total requests in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_REQUESTS / {print "web_traffic_window_requests" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_bytes "Total response bytes in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_BYTES / {print "web_traffic_window_bytes" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_unique_ips "Unique source IPs in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_UNIQUE_IPS / {print "web_traffic_window_unique_ips" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_bot_requests "Bot requests in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_BOTS / {print "web_traffic_window_bot_requests" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_requests_by_class "Requests per status class in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_STATUS_CLASS / {print "web_traffic_window_requests_by_class" dl "window=\"" $2 "\",class=\"" $3 "\"} " $4}'
echo ""
emit_help_type web_traffic_window_page_views "Page view requests in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_PAGE_VIEWS / {print "web_traffic_window_page_views" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_asset_requests "Asset requests in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_ASSET_REQUESTS / {print "web_traffic_window_asset_requests" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_downloads "File downloads in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_DOWNLOADS / {print "web_traffic_window_downloads" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_downloads_bytes "Download bytes in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_DOWNLOADS_BYTES / {print "web_traffic_window_downloads_bytes" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_human_requests "Non-bot requests in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_HUMAN_REQUESTS / {print "web_traffic_window_human_requests" dl "window=\"" $2 "\"} " $3}'
echo ""
emit_help_type web_traffic_window_unique_user_agents "Unique user agents in time window" gauge
echo "$win_lines" | awk -v dl="$wlp" '/^WIN_UNIQUE_UAS / {print "web_traffic_window_unique_user_agents" dl "window=\"" $2 "\"} " $3}'
echo ""
fi
rm -f "$parsed_file"
done > "$metrics_buf"
# Group all samples under their HELP/TYPE headers so multi-domain
# output is valid Prometheus exposition format
awk '
/^# HELP / { metric=$3; if (!(metric in help)) order[n++]=metric; help[metric]=$0; next }
/^# TYPE / { type[$3]=$0; next }
/^$/ { next }
/^[a-zA-Z_]/ { match($0,/^[a-zA-Z_:][a-zA-Z0-9_:]*/); m=substr($0,RSTART,RLENGTH); samples[m]=samples[m] $0 "\n"; next }
END { for(i=0;i<n;i++){m=order[i]; print help[m]; print type[m]; printf "%s",samples[m]; print ""} }
' "$metrics_buf"
rm -f "$metrics_buf"
fi
# ========================================================================
# Exporter Runtime
# ========================================================================
local script_end script_duration
script_end=$(date +%s)
script_duration=$((script_end - script_start))
cat <<EOF
# HELP web_traffic_exporter_duration_seconds Time to generate all metrics
# TYPE web_traffic_exporter_duration_seconds gauge
web_traffic_exporter_duration_seconds $script_duration
# HELP web_traffic_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE web_traffic_exporter_last_run_timestamp gauge
web_traffic_exporter_last_run_timestamp $script_end
EOF
echo ""
}
# ============================================================================
# HTTP SERVER MODE
# ============================================================================
# ----------------------------------------------------------------------------
# _http_send_response CONTENT_TYPE BODY
#
# Writes a complete "HTTP/1.1 200 OK" response (headers + BODY) to stdout.
# Content-Length is computed in bytes, not characters, so multi-byte UTF-8
# data in BODY does not produce a short length and a truncated response.
# ----------------------------------------------------------------------------
_http_send_response() {
    local content_type="$1" body="$2"
    # ${#var} counts characters in the active locale; under the C locale a
    # character is a byte, which is the unit Content-Length requires.
    local LC_ALL=C
    printf 'HTTP/1.1 200 OK\r\nContent-Type: %s\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s' \
        "$content_type" "${#body}" "$body"
}

# ----------------------------------------------------------------------------
# run_http_server
#
# Serves metrics over HTTP using netcat, one connection at a time, forever.
#   GET /metrics...  -> output of generate_metrics (text/plain; version=0.0.4)
#   anything else    -> small HTML landing page
#
# Globals:  HTTP_PORT (read)
# Exits:    1 if nc is missing or the request FIFO cannot be created;
#           0 from the INT/TERM trap.
#
# netcat's stdout (the bytes received from the client) is fed back to the
# request handler through a FIFO. Without that loop-back the handler's `read`
# would consume the script's own stdin instead of the HTTP request line, and
# the /metrics route could never be selected by a client.
# ----------------------------------------------------------------------------
run_http_server() {
    echo "Starting web traffic exporter on port $HTTP_PORT..." >&2
    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # Detect the netcat flavor once instead of on every connection:
    # GNU/traditional nc needs "-p PORT" and "-q 1" to quit after stdin EOF.
    local -a nc_cmd
    if nc -h 2>&1 | grep -q 'GNU\|traditional'; then
        nc_cmd=(nc -l -p "$HTTP_PORT" -q 1)
    else
        nc_cmd=(nc -l "$HTTP_PORT")
    fi

    # Private FIFO carrying client -> handler traffic.
    local fifo_dir request_fifo
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Cannot create temporary directory for HTTP mode" >&2
        exit 1
    fi
    request_fifo="$fifo_dir/request"
    if ! mkfifo "$request_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Cannot create request FIFO for HTTP mode" >&2
        exit 1
    fi

    trap 'rm -rf -- "$fifo_dir"; echo "Shutting down web traffic exporter..." >&2; exit 0' INT TERM

    local request body
    while true; do
        {
            # Only the request line is needed for routing; a failed read
            # (client sent nothing) leaves it empty and serves the landing page.
            request=""
            read -r request
            if [[ "$request" =~ ^GET\ /metrics ]]; then
                # stdin is detached so nothing inside generate_metrics can
                # accidentally consume the rest of the client's request.
                body=$(generate_metrics </dev/null)
                # $(...) strips trailing newlines, but the Prometheus text
                # format requires the last line to end with a line feed.
                _http_send_response "text/plain; version=0.0.4" "$body"$'\n'
            else
                body=$(cat <<'HTMLEOF'
<!DOCTYPE html>
<html>
<head><title>Web Traffic Exporter v1.8</title></head>
<body>
<h1>Web Traffic Exporter v1.8</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Sections (auto-detected)</h2>
<ul>
<li>Request totals by status, class, and method</li>
<li>Bandwidth totals and by status class</li>
<li>Unique visitors (IPs and user agents)</li>
<li>Top 10 requested paths</li>
<li>Top 10 external referrers</li>
<li>Bot vs human traffic detection</li>
<li>HTTP protocol version distribution</li>
<li>Request rate estimation</li>
<li>Web server process detection</li>
</ul>
</body>
</html>
HTMLEOF
)
                _http_send_response "text/html" "$body"
            fi
        } < "$request_fifo" | "${nc_cmd[@]}" > "$request_fifo" 2>/dev/null
    done
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# ----------------------------------------------------------------------------
# main ARGS...
#
# Parses the command line (parse_args) and runs one of three output modes:
#   HTTP_MODE=true       -> run_http_server (does not return)
#   OUTPUT_FILE non-empty -> write metrics to OUTPUT_FILE atomically
#   otherwise            -> print metrics to stdout
#
# Globals:  HTTP_MODE, OUTPUT_FILE (read; expected to be set by parse_args)
# Exits:    1 in file mode when the output directory or temp file cannot be
#           created, generation fails, the result has fewer than 5 lines, or
#           the final rename fails. The previous OUTPUT_FILE is left untouched
#           in all of those cases.
# ----------------------------------------------------------------------------
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
    elif [ -n "$OUTPUT_FILE" ]; then
        local output_dir temp_file file_lines

        output_dir="$(dirname "$OUTPUT_FILE")"
        if ! mkdir -p "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        # The temp file lives next to the target so the final mv is a rename
        # on the same filesystem and readers never see a partial file.
        if ! temp_file=$(mktemp "${output_dir}/.web_traffic_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temporary file in $output_dir" >&2
            exit 1
        fi

        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        # Some wc implementations pad the count with spaces; strip them so the
        # numeric test and the log messages see a clean integer.
        file_lines=${file_lines//[[:space:]]/}
        if [ "${file_lines:-0}" -lt 5 ]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        chmod 644 "$temp_file"
        # Do not claim success (or leak the temp file) if the rename fails.
        if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
            rm -f "$temp_file"
            echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi
        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        generate_metrics
    fi
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"