a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
611 lines
21 KiB
Bash
Executable File
611 lines
21 KiB
Bash
Executable File
#!/bin/bash
|
|
################################################################################
|
|
# Script Name: web-log-analyzer.sh
|
|
# Version: 1.01
|
|
# Description: Analyze Nginx, Apache, and Caddy access logs and produce a
|
|
# traffic summary. Reports top IPs, status codes, bandwidth by
|
|
# URI, slowest requests, user agents, suspected bots, hourly
|
|
# histogram, and IP list for geographic review.
|
|
#
|
|
# Author: Phil Connor
|
|
# Contact: contact@mylinux.work
|
|
# Website: https://mylinux.work
|
|
# License: MIT
|
|
#
|
|
# Prerequisites:
|
|
# - gawk (GNU awk: 3-argument match(), asort/asorti), GNU date (-d), sort, zcat (only for --all-logs with .gz files)
|
|
#
|
|
# Usage:
|
|
# ./web-log-analyzer.sh
|
|
# ./web-log-analyzer.sh --log /var/log/apache2/access.log
|
|
# ./web-log-analyzer.sh --log /var/log/caddy/access.log --format caddy
|
|
# ./web-log-analyzer.sh --all-logs --since 7d
|
|
# ./web-log-analyzer.sh --top 50 --since 24h
|
|
# ./web-log-analyzer.sh --section status,ips
|
|
# ./web-log-analyzer.sh --json
|
|
# ./web-log-analyzer.sh --no-color > report.txt
|
|
#
|
|
# Sections: ips, status, uris, bandwidth, slow, agents, bots, hourly, geo
|
|
################################################################################
|
|
|
|
# Treat unset variables as errors; a pipeline fails if any of its stages fails.
set -u
set -o pipefail

# ============================================================================
# DEFAULTS
# ============================================================================

# Option defaults; parse_args overwrites them from the command line.
LOG_FILE=''
TOP_N=20
JSON_MODE=false
NO_COLOR=false
SECTION=''
SINCE=''
FORMAT='auto'
ALL_LOGS=false
VERSION='1.01'
|
|
|
|
# ANSI colour sequences, kept as literal backslash escapes; they are turned
# into real escape characters by echo -e / printf at output time.
RED="\033[0;31m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
CYAN="\033[0;36m"
BOLD="\033[1m"
DIM="\033[2m"
NC="\033[0m"
|
|
|
|
# Regex alternation matched against the user-agent field to flag bots.
# The generic terms accept either case for their first letter only; the
# remaining alternatives are case-sensitive literals.
BOT_PATTERNS='[Bb]ot|[Cc]rawl|[Ss]pider|[Ss]lurp|[Ss]craper'
BOT_PATTERNS+='|Googlebot|bingbot|YandexBot|AhrefsBot|SemrushBot'
BOT_PATTERNS+='|DotBot|MJ12bot|BLEXBot|PetalBot|Bytespider'
BOT_PATTERNS+='|GPTBot|ClaudeBot|CCBot|Applebot'
BOT_PATTERNS+='|facebookexternalhit|Twitterbot|LinkedInBot'
BOT_PATTERNS+='|DataForSeoBot|Go-http-client|python-requests|curl/|wget/'
|
|
|
|
# ============================================================================
|
|
# USAGE & ARGUMENT PARSING
|
|
# ============================================================================
|
|
|
|
# Print the help text to stdout and terminate the script with status 0.
# Reads: $0 (program name as invoked), VERSION, TOP_N.
show_usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Analyze Nginx, Apache, and Caddy access logs (v${VERSION})." \
    "" \
    "OPTIONS:" \
    "--log PATH Path to access log (auto-detected if omitted)" \
    "--format FORMAT Log format: auto (default), combined, caddy" \
    "--all-logs Include rotated logs (.1, .2.gz) from same directory" \
    "--top N Number of entries per section (default: $TOP_N)" \
    "--json Output as JSON" \
    "--section NAMES Comma-separated sections to run:" \
    "ips, status, uris, bandwidth, slow, agents, bots, hourly, geo" \
    "--since RANGE Time filter — ISO date (2026-04-01) or relative (1h, 24h, 7d)" \
    "--no-color Disable colored output" \
    "-h, --help Show this help" \
    "" \
    "EXAMPLES:" \
    "$0 # Auto-detect log file" \
    "$0 --log /var/log/nginx/access.log # Nginx" \
    "$0 --log /var/log/apache2/access.log # Apache" \
    "$0 --log /var/log/caddy/access.log # Caddy (auto-detected JSON)" \
    "$0 --all-logs # Include rotated logs" \
    "$0 --all-logs --since 7d # Rotated logs, last 7 days" \
    "$0 --top 50 --since 24h # Top 50, last 24 hours" \
    "$0 --section status,ips # Specific sections only" \
    "$0 --json | jq .summary # JSON output piped to jq" \
    "$0 --no-color > report.txt # Save to file" \
    ""
  exit 0
}
|
|
|
|
# Parse command-line options into the global settings.
# Arguments: the script's argument list ("$@").
# Globals written: LOG_FILE, FORMAT, ALL_LOGS, TOP_N, JSON_MODE, NO_COLOR,
#                  SECTION, SINCE; the colour variables are emptied when
#                  --no-color is given.
# Exits 1 (message on stderr) on an unknown option, on a value option given
# without a value, or when --top is not a non-negative integer.
# -h/--help calls show_usage.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --log|--format|--top|--section|--since)
        # Without this guard a trailing value option would expand an unset
        # "$2" and abort with a bare "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: option $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --log)     LOG_FILE="$2" ;;
          --format)  FORMAT="$2" ;;
          --top)     TOP_N="$2" ;;
          --section) SECTION="$2" ;;
          --since)   SINCE="$2" ;;
        esac
        shift 2
        ;;
      --all-logs) ALL_LOGS=true; shift ;;
      --json)     JSON_MODE=true; shift ;;
      --no-color) NO_COLOR=true; shift ;;
      *) echo "Unknown option: $1" >&2; exit 1 ;;
    esac
  done

  # TOP_N is later handed to head -n and to awk as a number.
  if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then
    echo "ERROR: --top expects a non-negative integer, got: $TOP_N" >&2
    exit 1
  fi

  if [[ "$NO_COLOR" == true ]]; then
    RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC=""
  fi
}
|
|
|
|
# ============================================================================
|
|
# HELPERS
|
|
# ============================================================================
|
|
|
|
# Succeed when report section $1 should run.
# Every section is enabled while SECTION is empty; otherwise $1 must be one
# of the comma-separated names in SECTION. Matching is case-insensitive,
# whitespace in the list is ignored ("status, ips" works), and the name is
# compared as a fixed string rather than as a regular expression.
section_enabled() {
  [[ -z "$SECTION" ]] && return 0
  local wanted=",${SECTION//[[:space:]]/},"
  grep -qiF -- ",$1," <<<"$wanted"
}
|
|
|
|
# Print a section banner: blank line, rule, title ($1), rule, blank line.
# Backslash escapes in the colour variables are expanded on output.
header() {
  local rule="${CYAN}====================================================${NC}"
  printf '\n%b\n%b\n%b\n\n' "$rule" "${CYAN} ${BOLD}${1}${NC}" "$rule"
}
|
|
|
|
# Render a byte count ($1) with a binary unit: "N B", "N.N KB", "N.N MB"
# or "N.NN GB".
# The value is handed to awk with -v instead of being spliced into the awk
# program text or evaluated by shell arithmetic, so an empty or non-numeric
# argument is simply treated as 0.
format_bytes() {
  awk -v b="${1:-0}" 'BEGIN {
    b += 0
    if (b >= 1073741824)   printf "%.2f GB", b / 1073741824
    else if (b >= 1048576) printf "%.1f MB", b / 1048576
    else if (b >= 1024)    printf "%.1f KB", b / 1024
    else                   printf "%d B\n", b
  }'
}
|
|
|
|
# Print integer $1 with the locale's thousands grouping; if $1 cannot be
# formatted as an integer, print it unchanged.
# The formatted text is captured first: printing directly would emit a
# partial number before the fallback ran (e.g. "12x" came out as "1212x").
fmt_num() {
  local formatted
  if printf -v formatted "%'d" "$1" 2>/dev/null; then
    printf '%s' "$formatted"
  else
    printf '%s\n' "$1"
  fi
}
|
|
|
|
# ============================================================================
|
|
# LOG DETECTION
|
|
# ============================================================================
|
|
|
|
# Choose the access log to analyze when --log was not given.
# Leaves LOG_FILE untouched if it is already set; otherwise sets it to the
# first candidate path that is a readable regular file. Exits 1 after
# listing the checked paths on stderr when none qualifies.
detect_log_file() {
  [[ -n "$LOG_FILE" ]] && return

  # Loop variable is local so the search does not clobber a global "f".
  local f
  local candidates=(
    /var/log/nginx/access.log
    /var/log/apache2/access.log
    /var/log/httpd/access_log
    /var/log/caddy/access.log
  )

  for f in "${candidates[@]}"; do
    if [[ -f "$f" && -r "$f" ]]; then
      LOG_FILE="$f"
      return
    fi
  done

  echo "ERROR: no access log found. Checked:" >&2
  for f in "${candidates[@]}"; do
    echo " $f" >&2
  done
  echo "Specify a log file with --log PATH" >&2
  exit 1
}
|
|
|
|
# Resolve FORMAT when it is still "auto".
# A path containing "caddy" selects the caddy format outright; otherwise the
# first line of LOG_FILE is inspected (through zcat for *.gz) and a leading
# "{" selects caddy, anything else combined.
detect_format() {
  if [[ "$FORMAT" != "auto" ]]; then
    return
  fi

  if [[ "$LOG_FILE" == *caddy* ]]; then
    FORMAT="caddy"
    return
  fi

  local probe
  case "$LOG_FILE" in
    *.gz) probe=$(zcat "$LOG_FILE" 2>/dev/null | head -1) ;;
    *)    probe=$(head -1 "$LOG_FILE" 2>/dev/null || true) ;;
  esac

  case "$probe" in
    "{"*) FORMAT="caddy" ;;
    *)    FORMAT="combined" ;;
  esac
}
|
|
|
|
# ============================================================================
|
|
# ROTATED LOG GATHERING
|
|
# ============================================================================
|
|
|
|
declare -a LOG_FILES=()

# Fill LOG_FILES with the files to analyze, oldest first and LOG_FILE last.
# Without --all-logs the list is just LOG_FILE. With it, rotated siblings in
# the same directory are added:
#   - "<name>.<suffix>.gz" files whose suffix is not a plain number 1-99,
#     in glob order;
#   - numbered rotations "<name>.N" / "<name>.N.gz" for N = 99 down to 1.
# The summary and JSON output take the first and last parsed lines as the
# time range, so files must be concatenated chronologically.
# NOTE(review): assumes a higher rotation number means an older file
# (logrotate convention) - confirm for other rotation schemes.
gather_logs() {
  LOG_FILES=()

  if [[ "$ALL_LOGS" == true ]]; then
    local dir name f suffix i
    dir=$(dirname -- "$LOG_FILE")
    name=$(basename -- "$LOG_FILE")

    # Compressed files with a non-numeric suffix (for example a date).
    for f in "${dir}/${name}".*.gz; do
      [[ -f "$f" ]] || continue        # unmatched glob stays literal
      suffix=${f#"${dir}/${name}."}
      suffix=${suffix%.gz}
      # Plain numbers 1-99 are picked up by the numbered loop below.
      [[ "$suffix" =~ ^[1-9][0-9]?$ ]] && continue
      LOG_FILES+=("$f")
    done

    # Numbered rotations, highest number first.
    for (( i = 99; i >= 1; i-- )); do
      if [[ -f "${dir}/${name}.${i}.gz" ]]; then
        LOG_FILES+=("${dir}/${name}.${i}.gz")
      fi
      if [[ -f "${dir}/${name}.${i}" ]]; then
        LOG_FILES+=("${dir}/${name}.${i}")
      fi
    done
  fi

  LOG_FILES+=("$LOG_FILE")
}
|
|
|
|
# Write the contents of every file in LOG_FILES to stdout, in array order.
# *.gz entries go through zcat (its errors are discarded and never abort the
# loop); everything else goes through cat.
cat_all_logs() {
  local path
  for path in "${LOG_FILES[@]}"; do
    case "$path" in
      *.gz) zcat "$path" 2>/dev/null || true ;;
      *)    cat "$path" ;;
    esac
  done
}
|
|
|
|
# ============================================================================
|
|
# TIME FILTER
|
|
# ============================================================================
|
|
|
|
# Convert a --since value ($1) into a Unix epoch printed on stdout.
#   ""               -> 0 (no filter)
#   <N>h / <N>d / <N>m -> now minus N hours / days / minutes
#   anything else    -> parsed by `date -d` (e.g. 2026-04-01)
# Prints an error on stderr and exits 1 when the value cannot be parsed.
# Callers use command substitution, so that exit only ends the subshell and
# they must check the status themselves.
compute_since_epoch() {
  local v="$1"

  if [[ -z "$v" ]]; then
    echo 0
    return
  fi

  # Relative: 1h, 24h, 7d, 30m
  if [[ "$v" =~ ^([0-9]+)([hdm])$ ]]; then
    # Force base 10: values such as "08h" would otherwise be read as
    # (invalid) octal by shell arithmetic.
    local n=$(( 10#${BASH_REMATCH[1]} ))
    local u="${BASH_REMATCH[2]}"
    local now
    now=$(date +%s)
    case "$u" in
      h) echo $(( now - n * 3600 )) ;;
      d) echo $(( now - n * 86400 )) ;;
      m) echo $(( now - n * 60 )) ;;
    esac
    return
  fi

  # Absolute: YYYY-MM-DD or YYYY-MM-DD HH:MM:SS
  if date -d "$v" +%s >/dev/null 2>&1; then
    date -d "$v" +%s
    return
  fi

  echo "ERROR: cannot parse --since value: $v" >&2
  exit 1
}
|
|
|
|
# ============================================================================
|
|
# LOG PARSING — COMBINED FORMAT (Nginx / Apache)
|
|
# ============================================================================
|
|
|
|
# Parse combined-format (Nginx / Apache) log lines from cat_all_logs into
# TMP_DATA as tab-separated records:
#   ip \t time \t uri \t status \t bytes \t user_agent \t request_time
# Honors --since. The cutoff is compared as a YYYYMMDDHHMMSS key, because the
# log's dd/Mon/YYYY text does not sort chronologically as a string.
# Exits 1 when the --since value cannot be parsed.
# Requires GNU awk (3-argument match) and GNU date (-d @epoch).
# NOTE(review): the cutoff is rendered in this machine's time zone and the
# zone offset inside each log line is ignored - confirm logs are written in
# the same zone.
prepare_data_combined() {
  local since_epoch cutoff_key=""
  # compute_since_epoch runs in a subshell; without the explicit check a
  # parse error would silently disable the filter.
  since_epoch=$(compute_since_epoch "$SINCE") || exit 1

  if [[ "$since_epoch" -gt 0 ]]; then
    cutoff_key=$(date -d "@${since_epoch}" '+%Y%m%d%H%M%S' 2>/dev/null || true)
  fi

  cat_all_logs | awk -v cutoff="$cutoff_key" '
    BEGIN {
      count = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= count; i++) month_num[names[i]] = sprintf("%02d", i)
    }
    {
      ip = $1

      # Timestamp from [dd/Mon/YYYY:HH:MM:SS ...]; key is its sortable form.
      time = ""
      key = ""
      if (match($0, /\[([0-9]{2})\/([A-Za-z]{3})\/([0-9]{4}):([0-9]{2}):([0-9]{2}):([0-9]{2})/, ta)) {
        time = ta[1] "/" ta[2] "/" ta[3] ":" ta[4] ":" ta[5] ":" ta[6]
        key = ta[3] month_num[ta[2]] ta[1] ta[4] ta[5] ta[6]
      }
      # Concatenating "" forces a string comparison of the two keys.
      if (cutoff != "" && (key "") < (cutoff "")) next

      # Request line: "METHOD URI HTTP/x.x"
      match($0, /"([A-Z]+) ([^ ]+) HTTP\/[0-9."]+"/, ra)
      uri = ra[2]

      # Status code and body bytes
      match($0, /" ([0-9]{3}) ([0-9]+|-) /, sa)
      status = sa[1]
      bytes = sa[2]
      if (bytes == "-") bytes = 0

      # User agent: sixth piece when the line is split on double quotes
      n = split($0, parts, "\"")
      ua = (n >= 6) ? parts[6] : "-"

      # Optional request_time (float at end of line after last quote)
      rt = ""
      if (match($0, /" ([0-9]+\.[0-9]+)$/, rta)) rt = rta[1]

      print ip "\t" time "\t" uri "\t" status "\t" bytes "\t" ua "\t" rt
    }' > "$TMP_DATA"
}
|
|
|
|
# ============================================================================
|
|
# LOG PARSING — CADDY JSON FORMAT
|
|
# ============================================================================
|
|
|
|
# Parse Caddy JSON log lines from cat_all_logs into TMP_DATA, using the same
# tab-separated layout as the combined parser:
#   ip \t time \t uri \t status \t bytes \t user_agent \t duration
# Fields are pulled out with regexes on "remote_ip", "ts", "uri", "status",
# "size", "User-Agent" and "duration"; lines not starting with "{" or
# lacking remote_ip are dropped. Honors --since by comparing "ts" (epoch).
# Exits 1 when the --since value cannot be parsed.
# Requires GNU awk (3-argument match, strftime).
prepare_data_caddy() {
  local since_epoch
  # compute_since_epoch runs in a subshell; without the explicit check a
  # parse error would silently disable the filter.
  since_epoch=$(compute_since_epoch "$SINCE") || exit 1

  cat_all_logs | awk -v since_epoch="$since_epoch" '
    /^[{]/ {
      # remote_ip
      ip = ""
      if (match($0, /"remote_ip":"([^"]*)"/, m)) ip = m[1]
      # Strip a port only from "[v6]:port" or "v4:port"; a bare IPv6
      # address contains colons itself and must be kept whole.
      if (ip ~ /^\[.*\](:[0-9]+)?$/) {
        sub(/^\[/, "", ip)
        sub(/\](:[0-9]+)?$/, "", ip)
      } else if (ip ~ /^[0-9.]+:[0-9]+$/) {
        sub(/:[0-9]+$/, "", ip)
      }
      if (ip == "") next

      # timestamp (epoch float)
      ts = 0
      if (match($0, /"ts":([0-9]+\.?[0-9]*)/, m)) ts = m[1] + 0
      if (since_epoch > 0 && ts < since_epoch) next

      # Epoch to dd/Mon/YYYY:HH:MM:SS in local time, done inside awk so no
      # external date process is started per log line.
      time = strftime("%d/%b/%Y:%H:%M:%S", int(ts))

      # uri
      uri = ""
      if (match($0, /"uri":"([^"]*)"/, m)) uri = m[1]

      # status
      status = ""
      if (match($0, /"status":([0-9]+)/, m)) status = m[1]

      # size (bytes)
      bytes = 0
      if (match($0, /"size":([0-9]+)/, m)) bytes = m[1]

      # user agent (first entry of the header array)
      ua = "-"
      if (match($0, /"User-Agent":\["([^"]*)"/, m)) ua = m[1]

      # duration (seconds as float)
      rt = ""
      if (match($0, /"duration":([0-9]+\.?[0-9]*)/, m)) rt = m[1]

      print ip "\t" time "\t" uri "\t" status "\t" bytes "\t" ua "\t" rt
    }' > "$TMP_DATA"
}
|
|
|
|
# ============================================================================
|
|
# PREPARE DATA (dispatch to correct parser)
|
|
# ============================================================================
|
|
|
|
# Create the temporary working file, run the parser matching FORMAT, and
# record the number of parsed records.
# Globals written: TMP_DATA (temp file path, removed by an EXIT trap),
#                  TOTAL_LINES (record count as a plain integer).
# Exits 1 when the temp file cannot be created, FORMAT is unknown, or no
# record was produced.
prepare_data() {
  # Fail here rather than later with a misleading "No log entries" message.
  TMP_DATA=$(mktemp) || {
    echo "ERROR: could not create temporary file" >&2
    exit 1
  }
  trap 'rm -f -- "$TMP_DATA"' EXIT

  case "$FORMAT" in
    combined) prepare_data_combined ;;
    caddy) prepare_data_caddy ;;
    *) echo "ERROR: unknown format: $FORMAT" >&2; exit 1 ;;
  esac

  # Arithmetic expansion strips any padding wc puts around the number.
  TOTAL_LINES=$(( $(wc -l < "$TMP_DATA") ))
  if [[ "$TOTAL_LINES" -eq 0 ]]; then
    echo "No log entries found (check --log path or --since filter)." >&2
    exit 1
  fi
}
|
|
|
|
# ============================================================================
|
|
# REPORT SECTIONS
|
|
# ============================================================================
|
|
|
|
# Print the TOP_N client IPs ranked by request count, with the bytes sent
# to each. Reads the parsed records in TMP_DATA (field 1 = IP, 5 = bytes).
report_ips() {
  header "Top ${TOP_N} IPs by Requests"
  printf " ${BOLD}%4s %-40s %10s %12s${NC}\n" "#" "IP Address" "Requests" "Bandwidth"

  local rank=0 hits volume addr
  awk -F'\t' '
    { reqs[$1]++; sent[$1] += $5 }
    END { for (a in reqs) print reqs[a], sent[a], a }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r hits volume addr; do
        rank=$(( rank + 1 ))
        printf " %4s %-40s %10s %12s\n" \
          "$rank" "$addr" "$(fmt_num "$hits")" "$(format_bytes "$volume")"
      done
}
|
|
|
|
# Print the share of requests per status class (2xx, 3xx, ...) with its
# percentage of TOTAL_LINES, followed by the TOP_N individual status codes.
# Reads field 4 (status) of TMP_DATA.
report_status() {
  header "Status Code Breakdown"
  local total=$TOTAL_LINES

  awk -F'\t' '
    { per_class[substr($4, 1, 1) "xx"]++ }
    END { for (name in per_class) print name, per_class[name] }
  ' "$TMP_DATA" \
    | sort \
    | while read -r cls count; do
        local pct color="$NC"
        pct=$(awk -v c="$count" -v t="$total" 'BEGIN { printf "%.1f", (c / t) * 100 }')
        case "$cls" in
          2xx) color="$GREEN" ;;
          3xx) color="$CYAN" ;;
          4xx) color="$YELLOW" ;;
          5xx) color="$RED" ;;
        esac
        printf " ${color}%-6s %10s (%5s%%)${NC}\n" "$cls:" "$(fmt_num "$count")" "$pct"
      done

  echo ""
  echo " Detail:"
  awk -F'\t' '
    { per_code[$4]++ }
    END { for (code in per_code) print per_code[code], code }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r count code; do
        printf " %-8s %10s\n" "$code" "$(fmt_num "$count")"
      done
}
|
|
|
|
# Print the TOP_N request URIs ranked by hit count (URI shown truncated to
# 58 characters). Reads field 3 (URI) of TMP_DATA.
report_uris() {
  header "Top ${TOP_N} URIs by Hits"
  printf " ${BOLD}%4s %-60s %10s${NC}\n" "#" "URI" "Hits"

  local rank=0 hits path
  awk -F'\t' '
    { seen[$3]++ }
    END { for (p in seen) print seen[p], p }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r hits path; do
        rank=$(( rank + 1 ))
        printf " %4s %-60s %10s\n" "$rank" "${path:0:58}" "$(fmt_num "$hits")"
      done
}
|
|
|
|
# Print the TOP_N request URIs ranked by total bytes sent, with their hit
# counts (URI shown truncated to 53 characters).
# Reads fields 3 (URI) and 5 (bytes) of TMP_DATA.
report_bandwidth() {
  header "Top ${TOP_N} URIs by Bandwidth"
  printf " ${BOLD}%4s %-55s %12s %8s${NC}\n" "#" "URI" "Bandwidth" "Hits"

  local rank=0 volume hits path
  awk -F'\t' '
    { sent[$3] += $5; seen[$3]++ }
    END { for (p in sent) print sent[p], seen[p], p }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r volume hits path; do
        rank=$(( rank + 1 ))
        printf " %4s %-55s %12s %8s\n" \
          "$rank" "${path:0:53}" "$(format_bytes "$volume")" "$(fmt_num "$hits")"
      done
}
|
|
|
|
# Print the TOP_N requests with the largest timing value (field 7 of
# TMP_DATA). When no record carries a timing value the section prints a
# "skipped" note instead.
report_slow() {
  local timed
  timed=$(awk -F'\t' '$7 != "" { seen = 1; exit } END { print seen + 0 }' "$TMP_DATA")

  if [[ "$timed" -eq 0 ]]; then
    header "Slowest Requests"
    echo -e " ${DIM}(skipped — no request_time/duration field detected in log)${NC}"
    return
  fi

  header "Slowest ${TOP_N} Requests"
  printf " ${BOLD}%4s %8s %-6s %-50s %10s${NC}\n" "#" "Time(s)" "Status" "URI" "Bytes"

  local rank=0 secs code volume path
  awk -F'\t' '$7 != "" { print $7, $4, $5, $3 }' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r secs code volume path; do
        rank=$(( rank + 1 ))
        printf " %4s %8s %-6s %-50s %10s\n" \
          "$rank" "$secs" "$code" "${path:0:48}" "$(format_bytes "$volume")"
      done
}
|
|
|
|
# Print the TOP_N user-agent strings ranked by request count (agent shown
# truncated to 68 characters). Reads field 6 of TMP_DATA.
report_agents() {
  header "Top ${TOP_N} User Agents"
  printf " ${BOLD}%4s %-70s %10s${NC}\n" "#" "User Agent" "Requests"

  local rank=0 hits client
  awk -F'\t' '
    { seen[$6]++ }
    END { for (ua in seen) print seen[ua], ua }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r hits client; do
        rank=$(( rank + 1 ))
        printf " %4s %-70s %10s\n" "$rank" "${client:0:68}" "$(fmt_num "$hits")"
      done
}
|
|
|
|
# Print the TOP_N user agents matching BOT_PATTERNS ranked by request count
# (agent shown truncated to 63 characters), then the total bot request count
# and its percentage of TOTAL_LINES. Reads field 6 of TMP_DATA.
report_bots() {
  header "Suspected Bots"
  printf " ${BOLD}%4s %-65s %10s${NC}\n" "#" "User Agent" "Requests"

  local rank=0 hits client
  awk -F'\t' -v p="$BOT_PATTERNS" '
    $6 ~ p { flagged[$6]++ }
    END { for (ua in flagged) print flagged[ua], ua }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r hits client; do
        rank=$(( rank + 1 ))
        printf " %4s %-65s %10s\n" "$rank" "${client:0:63}" "$(fmt_num "$hits")"
      done

  local bot_total
  bot_total=$(awk -F'\t' -v p="$BOT_PATTERNS" '$6 ~ p { n++ } END { print n + 0 }' "$TMP_DATA")

  if [[ "$TOTAL_LINES" -gt 0 ]]; then
    local share
    share=$(awk -v b="$bot_total" -v t="$TOTAL_LINES" 'BEGIN { printf "%.1f", (b / t) * 100 }')
    echo ""
    echo -e " Bot traffic: ${YELLOW}$(fmt_num "$bot_total")${NC} requests (${share}% of total)"
  fi
}
|
|
|
|
# Print a 24-row histogram of requests per hour of day. Each bar is scaled
# so that the busiest hour gets 50 "#" characters. The hour is the second
# ":"-separated piece of the timestamp in field 2 of TMP_DATA.
report_hourly() {
  header "Requests per Hour"

  local peak
  peak=$(awk -F'\t' '
    { split($2, t, ":"); per_hour[t[2]]++ }
    END {
      top = 0
      for (k in per_hour) if (per_hour[k] > top) top = per_hour[k]
      print top
    }
  ' "$TMP_DATA")

  awk -F'\t' '
    { split($2, t, ":"); per_hour[t[2]]++ }
    END {
      for (i = 0; i < 24; i++) {
        hh = sprintf("%02d", i)
        c = (hh in per_hour) ? per_hour[hh] : 0
        printf "%s\t%d\n", hh, c
      }
    }
  ' "$TMP_DATA" \
    | sort -t$'\t' -k1,1 \
    | while IFS=$'\t' read -r hour count; do
        local width=0 bar
        if [[ "$peak" -gt 0 ]]; then
          width=$(( count * 50 / peak ))
        fi
        # Pad to the bar width with spaces, then turn the padding into "#".
        printf -v bar '%*s' "$width" ''
        bar=${bar// /#}
        printf " ${BOLD}%s:00${NC} %8s ${GREEN}%s${NC}\n" "$hour" "$(fmt_num "$count")" "$bar"
      done
}
|
|
|
|
# Print the number of distinct client IPs and the TOP_N IPs by request
# count, as input for an external GeoIP lookup (none is performed here).
# Reads field 1 of TMP_DATA.
report_geo() {
  header "Unique IPs (Geographic Review)"
  echo -e " ${DIM}(No GeoIP lookup — pipe IPs to geoiplookup or an API for country data)${NC}"

  local distinct
  distinct=$(awk -F'\t' '!known[$1]++ { n++ } END { print n }' "$TMP_DATA")

  echo ""
  echo " Unique IPs: $(fmt_num "$distinct")"
  echo ""
  echo " Top ${TOP_N} by request count:"

  local hits addr
  awk -F'\t' '
    { reqs[$1]++ }
    END { for (a in reqs) print reqs[a], a }
  ' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r hits addr; do
        printf " %-40s %s\n" "$addr" "$(fmt_num "$hits")"
      done
}
|
|
|
|
# Print overall totals: request count (TOTAL_LINES), distinct IPs, total
# bytes, the timestamps of the first and last records in TMP_DATA, and -
# when --all-logs found more than one file - the number of files read.
report_summary() {
  header "Summary"

  local requests=$TOTAL_LINES
  local distinct volume first_seen last_seen
  distinct=$(awk -F'\t' '!known[$1]++' "$TMP_DATA" | wc -l)
  volume=$(awk -F'\t' '{ sum += $5 } END { print sum + 0 }' "$TMP_DATA")
  first_seen=$(awk -F'\t' 'NR == 1 { print $2 }' "$TMP_DATA")
  last_seen=$(awk -F'\t' 'END { print $2 }' "$TMP_DATA")

  printf " %-22s %s\n" "Total requests:" "$(fmt_num "$requests")"
  printf " %-22s %s\n" "Unique IPs:" "$(fmt_num "$distinct")"
  printf " %-22s %s\n" "Total bandwidth:" "$(format_bytes "$volume")"
  printf " %-22s %s — %s\n" "Time range:" "$first_seen" "$last_seen"

  if [[ "$ALL_LOGS" == true && ${#LOG_FILES[@]} -gt 1 ]]; then
    printf " %-22s %s\n" "Log files analyzed:" "${#LOG_FILES[@]}"
  fi
}
|
|
|
|
# ============================================================================
|
|
# JSON OUTPUT
|
|
# ============================================================================
|
|
|
|
# Emit the whole report as one JSON object on stdout, computed in a single
# awk pass over TMP_DATA: summary totals, status classes and codes, the
# top_n IPs and URIs, requests per hour, and user agents matching
# BOT_PATTERNS.
# Strings taken from the log are passed through json_escape so backslashes
# (e.g. nginx "\x22" escapes) and double quotes cannot break the JSON.
# Requires GNU awk (asort / asorti).
json_output() {
  awk -F'\t' -v top_n="$TOP_N" -v bp="$BOT_PATTERNS" '
    # Escape a string for use inside a JSON string literal.
    function json_escape(str) {
      gsub(/\\/, "&&", str)              # double every backslash first
      gsub(/"/, "\\\"", str)             # then escape double quotes
      gsub(/[\001-\037]/, " ", str)      # raw control bytes are not valid JSON
      return str
    }
    {
      ips[$1]++; ipb[$1] += $5
      st[$4]++; cl[substr($4, 1, 1) "xx"]++
      uh[$3]++
      if ($6 ~ bp) bots[$6]++
      split($2, t, ":"); hr[t[2]]++
      tb += $5
      if (!si[$1]++) ui++
      if (NR == 1) ft = $2
      lt = $2
      tot++
    }
    END {
      printf "{\n \"summary\":{\"total_requests\":%d,\"unique_ips\":%d,\"total_bytes\":%d,\"first_timestamp\":\"%s\",\"last_timestamp\":\"%s\"},\n", tot, ui, tb, json_escape(ft), json_escape(lt)

      printf " \"status_classes\":{"; s = ""
      for (c in cl) { printf "%s\"%s\":%d", s, json_escape(c), cl[c]; s = "," }
      printf "},\n"

      printf " \"status_codes\":{"; s = ""
      for (c in st) { printf "%s\"%s\":%d", s, json_escape(c), st[c]; s = "," }
      printf "},\n"

      # Rank by zero-padded count so a plain string sort orders numerically.
      printf " \"top_ips\":["
      n = asorti(ips, si2); delete il
      for (i = 1; i <= n; i++) il[i] = sprintf("%012d\t%s\t%d", ips[si2[i]], si2[i], ipb[si2[i]])
      asort(il); s = ""; sh = 0
      for (i = n; i >= 1 && sh < top_n; i--) {
        split(il[i], f, "\t")
        printf "%s{\"ip\":\"%s\",\"requests\":%d,\"bytes\":%d}", s, json_escape(f[2]), f[1] + 0, f[3]
        s = ","; sh++
      }
      printf "],\n"

      printf " \"top_uris\":["
      n = asorti(uh, su); delete ul
      for (i = 1; i <= n; i++) ul[i] = sprintf("%012d\t%s", uh[su[i]], su[i])
      asort(ul); s = ""; sh = 0
      for (i = n; i >= 1 && sh < top_n; i--) {
        split(ul[i], f, "\t")
        printf "%s{\"uri\":\"%s\",\"hits\":%d}", s, json_escape(f[2]), f[1] + 0
        s = ","; sh++
      }
      printf "],\n"

      printf " \"requests_per_hour\":{"; s = ""
      for (h = 0; h < 24; h++) {
        hh = sprintf("%02d", h)
        c = (hh in hr) ? hr[hh] : 0
        printf "%s\"%s\":%d", s, hh, c
        s = ","
      }
      printf "},\n"

      # The count is looked up with the unmodified key; only the printed
      # copy of the agent string is escaped.
      printf " \"suspected_bots\":["; s = ""
      for (b in bots) {
        printf "%s{\"user_agent\":\"%s\",\"requests\":%d}", s, json_escape(b), bots[b]
        s = ","
      }
      printf "]\n}\n"
    }' "$TMP_DATA"
}
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
# Entry point: parse options, locate and validate the log, pick the format,
# collect rotated files, then either emit JSON or print the text report
# (enabled sections followed by the summary, which always runs).
main() {
  parse_args "$@"

  # Detect log file and format
  detect_log_file
  if [[ ! -f "$LOG_FILE" ]]; then
    echo "ERROR: log file not found: $LOG_FILE" >&2
    exit 1
  elif [[ ! -r "$LOG_FILE" ]]; then
    echo "ERROR: cannot read log file: $LOG_FILE (check permissions)" >&2
    exit 1
  fi

  detect_format
  gather_logs

  # JSON mode: no banner, single-pass output, then exit
  if [[ "$JSON_MODE" == true ]]; then
    prepare_data
    json_output
    exit 0
  fi

  # Text mode banner
  echo -e "${BOLD}Web Log Analyzer v${VERSION}${NC}"
  echo -e "Log: ${DIM}${LOG_FILE}${NC}"
  echo -e "Format: ${DIM}${FORMAT}${NC}"
  if [[ -n "$SINCE" ]]; then
    echo -e "Since: ${DIM}${SINCE}${NC}"
  fi
  if [[ "$ALL_LOGS" == true && ${#LOG_FILES[@]} -gt 1 ]]; then
    echo -e "Logs: ${DIM}${#LOG_FILES[@]} files (including rotated)${NC}"
  fi

  # Parse log into temp working file
  prepare_data

  # Each section name maps to a report_<name> function.
  local section_name
  for section_name in ips status uris bandwidth slow agents bots hourly geo; do
    if section_enabled "$section_name"; then
      "report_${section_name}"
    fi
  done

  report_summary
}

main "$@"
|