#!/bin/bash
################################################################################
# Script Name: web-log-analyzer.sh
# Version: 1.01
# Description: Analyze Nginx, Apache, and Caddy access logs and produce a
#              traffic summary. Reports top IPs, status codes, bandwidth by
#              URI, slowest requests, user agents, suspected bots, hourly
#              histogram, and IP list for geographic review.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - awk, sort, zcat (only for --all-logs with .gz files)
#     NOTE: the log parsers and the JSON writer call match() with an array
#     argument and asort()/asorti(), which are GNU awk extensions, so "awk"
#     must resolve to gawk.
#
# Usage:
#   ./web-log-analyzer.sh
#   ./web-log-analyzer.sh --log /var/log/apache2/access.log
#   ./web-log-analyzer.sh --log /var/log/caddy/access.log --format caddy
#   ./web-log-analyzer.sh --all-logs --since 7d
#   ./web-log-analyzer.sh --top 50 --since 24h
#   ./web-log-analyzer.sh --section status,ips
#   ./web-log-analyzer.sh --json
#   ./web-log-analyzer.sh --no-color > report.txt
#
# Sections: ips, status, uris, bandwidth, slow, agents, bots, hourly, geo
################################################################################

# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
# -e is deliberately not set: the report dispatch relies on `test && action`.
set -uo pipefail

# ============================================================================
# DEFAULTS
# ============================================================================

LOG_FILE=""      # --log PATH; empty means "auto-detect"
TOP_N=20         # --top N; rows shown per report section
JSON_MODE=false  # --json
NO_COLOR=false   # --no-color
SECTION=""       # --section a,b,c; empty means "all sections"
SINCE=""         # --since VALUE; empty means "no time filter"
FORMAT="auto"    # --format auto|combined|caddy
ALL_LOGS=false   # --all-logs; also read rotated logs
VERSION="1.01"

# Colors (kept as literal backslash escapes; expanded by `echo -e` / printf
# format strings at output time, and blanked by --no-color).
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
CYAN='\033[0;36m'
BOLD='\033[1m'
DIM='\033[2m'
NC='\033[0m'

# Bot user-agent patterns (case-insensitive via regex alternation)
BOT_PATTERNS="[Bb]ot|[Cc]rawl|[Ss]pider|[Ss]lurp|[Ss]craper"
BOT_PATTERNS="${BOT_PATTERNS}|Googlebot|bingbot|YandexBot|AhrefsBot|SemrushBot"
BOT_PATTERNS="${BOT_PATTERNS}|DotBot|MJ12bot|BLEXBot|PetalBot|Bytespider"
BOT_PATTERNS="${BOT_PATTERNS}|GPTBot|ClaudeBot|CCBot|Applebot"
BOT_PATTERNS="${BOT_PATTERNS}|facebookexternalhit|Twitterbot|LinkedInBot"
BOT_PATTERNS="${BOT_PATTERNS}|DataForSeoBot|Go-http-client|python-requests|curl/|wget/"

# ============================================================================
# USAGE & ARGUMENT PARSING
# ============================================================================

# Print the help text to stdout and exit 0.
# NOTE(review): the original here-doc body was lost in this copy of the file
# (only its last example line survived); the text below is rebuilt from the
# file header and the options parse_args accepts - confirm the wording.
show_usage() {
  cat <<'EOF'
Usage: web-log-analyzer.sh [OPTIONS]

Analyze Nginx, Apache, and Caddy access logs and produce a traffic summary.

Options:
  --log PATH        Access log to analyze (default: auto-detect)
  --format FORMAT   Log format: auto, combined, caddy (default: auto)
  --all-logs        Also read rotated logs next to the main log
  --top N           Rows per report section (default: 20)
  --since WHEN      Only entries newer than WHEN: 30m, 24h, 7d, or a date
  --section LIST    Comma-separated sections to print
  --json            Emit JSON instead of the text report
  --no-color        Disable ANSI colors
  -h, --help        Show this help

Sections: ips, status, uris, bandwidth, slow, agents, bots, hourly, geo

Examples:
  ./web-log-analyzer.sh
  ./web-log-analyzer.sh --log /var/log/apache2/access.log
  ./web-log-analyzer.sh --log /var/log/caddy/access.log --format caddy
  ./web-log-analyzer.sh --all-logs --since 7d
  ./web-log-analyzer.sh --top 50 --since 24h
  ./web-log-analyzer.sh --section status,ips
  ./web-log-analyzer.sh --json
  ./web-log-analyzer.sh --no-color > report.txt   # Save to file
EOF
  exit 0
}

# Parse command-line options into the global settings.
# Globals written: LOG_FILE FORMAT ALL_LOGS TOP_N JSON_MODE NO_COLOR SECTION
#                  SINCE, and the color variables (blanked by --no-color).
# Exits 1 on an unknown option, a missing option value, or a non-numeric --top.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        show_usage
        ;;
      --log|--format|--top|--section|--since)
        # Under `set -u` a bare "$2" would abort with an unhelpful
        # "unbound variable" message, so check for the value explicitly.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: option $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --log)     LOG_FILE="$2" ;;
          --format)  FORMAT="$2" ;;
          --top)     TOP_N="$2" ;;
          --section) SECTION="$2" ;;
          --since)   SINCE="$2" ;;
        esac
        shift 2
        ;;
      --all-logs) ALL_LOGS=true; shift ;;
      --json)     JSON_MODE=true; shift ;;
      --no-color) NO_COLOR=true; shift ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # TOP_N is handed to `head -n` and to awk as a number.
  if [[ ! "$TOP_N" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: --top expects a positive integer, got: $TOP_N" >&2
    exit 1
  fi

  if [[ "$NO_COLOR" == true ]]; then
    RED=""
    YELLOW=""
    GREEN=""
    CYAN=""
    BOLD=""
    DIM=""
    NC=""
  fi
}

# ============================================================================
# HELPERS
# ============================================================================

# Succeed when report section $1 should be printed: either no --section filter
# was given, or $1 appears (case-insensitively) in the comma-separated list.
section_enabled() {
  [[ -z "$SECTION" ]] && return 0
  local list=",${SECTION,,},"
  list="${list// /}"   # tolerate "status, ips"
  [[ "$list" == *",${1,,},"* ]]
}

# Print a colored section banner with title $1.
header() {
  echo ""
  echo -e "${CYAN}====================================================${NC}"
  echo -e "${CYAN} ${BOLD}${1}${NC}"
  echo -e "${CYAN}====================================================${NC}"
  echo ""
}

# Render byte count $1 as "N B", "N.N KB", "N.N MB" or "N.NN GB".
# The value is passed with -v instead of being spliced into the awk program,
# and the comparison happens in awk so a non-integer sum cannot break `[[ -ge ]]`.
format_bytes() {
  awk -v b="${1:-0}" 'BEGIN {
    b += 0
    if (b >= 1073741824)   printf "%.2f GB", b / 1073741824
    else if (b >= 1048576) printf "%.1f MB", b / 1048576
    else if (b >= 1024)    printf "%.1f KB", b / 1024
    else                   printf "%d B\n", b
  }'
}

# Print integer $1 with locale thousands separators; fall back to the raw
# value when printf rejects it.
fmt_num() {
  printf "%'d" "$1" 2>/dev/null || echo "$1"
}

# ============================================================================
# LOG DETECTION
# ============================================================================

# Set LOG_FILE to the first readable well-known access log unless --log
# already set it. Exits 1 when none of the candidates exists.
detect_log_file() {
  [[ -n "$LOG_FILE" ]] && return

  local f
  local candidates=(
    /var/log/nginx/access.log
    /var/log/apache2/access.log
    /var/log/httpd/access_log
    /var/log/caddy/access.log
  )

  for f in "${candidates[@]}"; do
    if [[ -f "$f" && -r "$f" ]]; then
      LOG_FILE="$f"
      return
    fi
  done

  echo "ERROR: no access log found. Checked:" >&2
  for f in "${candidates[@]}"; do
    echo " $f" >&2
  done
  echo "Specify a log file with --log PATH" >&2
  exit 1
}

# Resolve FORMAT=auto to "caddy" or "combined" from the log path and the
# first line of the log. An explicit --format is left untouched.
detect_format() {
  [[ "$FORMAT" != "auto" ]] && return

  # Check file path
  case "$LOG_FILE" in
    *caddy*) FORMAT="caddy"; return ;;
  esac

  # Check first line content - JSON means Caddy
  local first_line
  if [[ "$LOG_FILE" == *.gz ]]; then
    first_line=$(zcat "$LOG_FILE" 2>/dev/null | head -1)
  else
    first_line=$(head -1 "$LOG_FILE" 2>/dev/null || true)
  fi

  if [[ "$first_line" == "{"* ]]; then
    FORMAT="caddy"
  else
    FORMAT="combined"
  fi
}

# ============================================================================
# ROTATED LOG GATHERING
# ============================================================================

declare -a LOG_FILES=()

# Fill LOG_FILES with the logs to read. With --all-logs this adds the numbered
# rotations (NAME.1 .. NAME.99, plain or .gz) and any other NAME.*.gz files.
# Files are ordered oldest first and the live log last, so the first and last
# parsed lines really are the earliest and latest entries (the summary's time
# range depends on that).
gather_logs() {
  LOG_FILES=("$LOG_FILE")
  [[ "$ALL_LOGS" == true ]] || return 0

  local dir name f i suffix
  local -a numbered=() other_gz=()
  dir=$(dirname "$LOG_FILE")
  name=$(basename "$LOG_FILE")

  # Highest rotation number is the oldest file.
  for ((i = 99; i >= 1; i--)); do
    [[ -f "${dir}/${name}.${i}.gz" ]] && numbered+=("${dir}/${name}.${i}.gz")
    [[ -f "${dir}/${name}.${i}" ]] && numbered+=("${dir}/${name}.${i}")
  done

  # Remaining gzipped rotations (e.g. date-stamped), in glob order.
  for f in "${dir}/${name}".*.gz; do
    [[ -f "$f" ]] || continue   # unmatched glob stays literal
    suffix="${f#"${dir}/${name}."}"
    [[ "$suffix" =~ ^[1-9][0-9]?\.gz$ ]] && continue   # already in numbered
    other_gz+=("$f")
  done

  # ${arr[@]+...} keeps empty arrays safe under `set -u` on older bash.
  LOG_FILES=(
    ${other_gz[@]+"${other_gz[@]}"}
    ${numbered[@]+"${numbered[@]}"}
    "$LOG_FILE"
  )
}

# Write the contents of every file in LOG_FILES to stdout, decompressing
# *.gz files. A failing zcat is ignored on purpose (best effort).
cat_all_logs() {
  local lf
  for lf in "${LOG_FILES[@]}"; do
    if [[ "$lf" == *.gz ]]; then
      zcat "$lf" 2>/dev/null || true
    else
      cat -- "$lf"
    fi
  done
}

# ============================================================================
# TIME FILTER
# ============================================================================

# Print the epoch second that --since value $1 refers to.
#   ""                 -> 0 (no filter)
#   Nm / Nh / Nd       -> now minus N minutes / hours / days
#   anything `date -d` understands -> that instant
# Returns 1 (after an error on stderr) when the value cannot be parsed.
# This function is normally run inside $(...), where `exit` would only end
# the subshell - callers must check the return status themselves.
compute_since_epoch() {
  local v="$1"
  local now abs

  if [[ -z "$v" ]]; then
    echo 0
    return 0
  fi

  # Relative: 1h, 24h, 7d, 30m
  if [[ "$v" =~ ^([0-9]+)([hdm])$ ]]; then
    local n=$(( 10#${BASH_REMATCH[1]} ))   # 10# so "08h" is not read as octal
    local u="${BASH_REMATCH[2]}"
    now=$(date +%s)
    case "$u" in
      h) echo $(( now - n * 3600 )) ;;
      d) echo $(( now - n * 86400 )) ;;
      m) echo $(( now - n * 60 )) ;;
    esac
    return 0
  fi

  # Absolute: YYYY-MM-DD or YYYY-MM-DD HH:MM:SS
  if abs=$(date -d "$v" +%s 2>/dev/null); then
    echo "$abs"
    return 0
  fi

  echo "ERROR: cannot parse --since value: $v" >&2
  return 1
}

# ============================================================================
# LOG PARSING - COMBINED FORMAT (Nginx / Apache)
# ============================================================================

# Parse combined-format logs into TMP_DATA as tab-separated records:
#   ip \t time \t uri \t status \t bytes \t user_agent \t request_time
# Requires gawk (three-argument match()).
prepare_data_combined() {
  local since_epoch cutoff_key=""
  since_epoch=$(compute_since_epoch "$SINCE") || exit 1
  if [[ "$since_epoch" -gt 0 ]]; then
    # Sortable YYYYmmddHHMMSS key; comparing dd/Mon/YYYY strings directly
    # would order "01/Feb" before "31/Jan".
    cutoff_key=$(date -d "@${since_epoch}" '+%Y%m%d%H%M%S' 2>/dev/null || true)
  fi

  cat_all_logs | awk -v cutoff="$cutoff_key" '
    BEGIN {
      split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", mon_names, " ")
      for (i = 1; i <= 12; i++) mon_num[mon_names[i]] = sprintf("%02d", i)
    }
    {
      ip = $1

      # Extract timestamp from [dd/Mon/YYYY:HH:MM:SS ...]
      match($0, /\[([0-9]{2}\/[A-Za-z]{3}\/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2})/, ta)
      time = ta[1]

      if (cutoff != "") {
        # dd/Mon/YYYY:HH:MM:SS -> YYYYmmddHHMMSS; lines without a timestamp
        # produce an empty key and are dropped, as before.
        key = substr(time, 8, 4) mon_num[substr(time, 4, 3)] substr(time, 1, 2) \
              substr(time, 13, 2) substr(time, 16, 2) substr(time, 19, 2)
        if ((key "") < (cutoff "")) next
      }

      # Request line: "METHOD URI HTTP/x.x"
      match($0, /"([A-Z]+) ([^ ]+) HTTP\/[0-9."]+"/, ra)
      uri = ra[2]

      # Status code and body bytes
      match($0, /" ([0-9]{3}) ([0-9]+|-) /, sa)
      status = sa[1]
      bytes = sa[2]
      if (bytes == "-") bytes = 0

      # User agent (third quoted field after request line)
      n = split($0, parts, "\"")
      ua = (n >= 6) ? parts[6] : "-"

      # Optional request_time (float at end of line after last quote)
      rt = ""
      if (match($0, /" ([0-9]+\.[0-9]+)$/, rta)) rt = rta[1]

      print ip "\t" time "\t" uri "\t" status "\t" bytes "\t" ua "\t" rt
    }' > "$TMP_DATA"
}

# ============================================================================
# LOG PARSING - CADDY JSON FORMAT
# ============================================================================

# Parse Caddy JSON log lines into TMP_DATA using the same tab-separated
# layout as prepare_data_combined.
# Caddy JSON fields: .request.remote_ip, .ts, .request.uri, .status,
#                    .size, .request.headers["User-Agent"][0], .duration
# Requires gawk (three-argument match(), strftime()).
prepare_data_caddy() {
  local since_epoch
  since_epoch=$(compute_since_epoch "$SINCE") || exit 1

  cat_all_logs | awk -v since_epoch="$since_epoch" '
    /^\{/ {
      # remote_ip
      ip = ""
      if (match($0, /"remote_ip":"([^"]*)"/, m)) ip = m[1]
      # Strip a port only when one is really present, so bare IPv6
      # addresses (which contain colons) are kept intact.
      if (ip ~ /^\[.*\]:[0-9]+$/) {
        sub(/\]:[0-9]+$/, "", ip)
        sub(/^\[/, "", ip)
      } else if (ip ~ /^[0-9.]+:[0-9]+$/) {
        sub(/:[0-9]+$/, "", ip)
      }
      if (ip == "") next

      # timestamp (epoch float)
      ts = 0
      if (match($0, /"ts":([0-9]+\.?[0-9]*)/, m)) ts = m[1] + 0
      if (since_epoch > 0 && ts < since_epoch) next

      # Convert epoch to dd/Mon/YYYY:HH:MM:SS (local time) in-process
      # instead of forking date(1) once per log line.
      time = strftime("%d/%b/%Y:%H:%M:%S", int(ts))

      # uri
      uri = ""
      if (match($0, /"uri":"([^"]*)"/, m)) uri = m[1]

      # status
      status = ""
      if (match($0, /"status":([0-9]+)/, m)) status = m[1]

      # size (bytes)
      bytes = 0
      if (match($0, /"size":([0-9]+)/, m)) bytes = m[1]

      # user agent
      ua = "-"
      if (match($0, /"User-Agent":\["([^"]*)"/, m)) ua = m[1]

      # duration (seconds as float)
      rt = ""
      if (match($0, /"duration":([0-9]+\.?[0-9]*)/, m)) rt = m[1]

      print ip "\t" time "\t" uri "\t" status "\t" bytes "\t" ua "\t" rt
    }' > "$TMP_DATA"
}

# ============================================================================
# PREPARE DATA (dispatch to correct parser)
# ============================================================================

# Create the temp working file (removed on exit), run the parser selected by
# FORMAT, and set TOTAL_LINES. Exits 1 when nothing was parsed.
# Globals written: TMP_DATA, TOTAL_LINES
prepare_data() {
  TMP_DATA=$(mktemp) || {
    echo "ERROR: cannot create temporary file" >&2
    exit 1
  }
  trap 'rm -f -- "$TMP_DATA"' EXIT

  case "$FORMAT" in
    combined) prepare_data_combined ;;
    caddy)    prepare_data_caddy ;;
    *)
      echo "ERROR: unknown format: $FORMAT" >&2
      exit 1
      ;;
  esac

  TOTAL_LINES=$(wc -l < "$TMP_DATA")
  if [[ "$TOTAL_LINES" -eq 0 ]]; then
    echo "No log entries found (check --log path or --since filter)." >&2
    exit 1
  fi
}

# ============================================================================
# REPORT SECTIONS
# ============================================================================
# All report_* functions read the tab-separated TMP_DATA file. The color
# variables are placed in printf format strings on purpose: they hold literal
# "\033[..." text that printf must expand.

# Top IPs by request count, with their total bandwidth.
report_ips() {
  header "Top ${TOP_N} IPs by Requests"
  printf " ${BOLD}%4s %-40s %10s %12s${NC}\n" "#" "IP Address" "Requests" "Bandwidth"
  awk -F'\t' '{ip[$1]++; bw[$1]+=$5} END {for(i in ip) print ip[i],bw[i],i}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num count bytes ip; do
        printf " %4s %-40s %10s %12s\n" \
          "$num" "$ip" "$(fmt_num "$count")" "$(format_bytes "$bytes")"
      done
}

# Requests per status class (2xx, 3xx, ...) with percentages, then the most
# frequent individual status codes.
report_status() {
  header "Status Code Breakdown"
  local total=$TOTAL_LINES
  awk -F'\t' '{c[substr($4,1,1)"xx"]++} END {for(k in c) print k,c[k]}' "$TMP_DATA" \
    | sort \
    | while read -r cls count; do
        local pct color="$NC"
        pct=$(awk -v c="$count" -v t="$total" 'BEGIN {printf "%.1f", (c / t) * 100}')
        case "$cls" in
          2xx) color="$GREEN" ;;
          3xx) color="$CYAN" ;;
          4xx) color="$YELLOW" ;;
          5xx) color="$RED" ;;
        esac
        printf " ${color}%-6s %10s (%5s%%)${NC}\n" "$cls:" "$(fmt_num "$count")" "$pct"
      done

  echo ""
  echo " Detail:"
  awk -F'\t' '{d[$4]++} END {for(k in d) print d[k],k}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r count code; do
        printf " %-8s %10s\n" "$code" "$(fmt_num "$count")"
      done
}

# Most requested URIs.
report_uris() {
  header "Top ${TOP_N} URIs by Hits"
  printf " ${BOLD}%4s %-60s %10s${NC}\n" "#" "URI" "Hits"
  awk -F'\t' '{u[$3]++} END {for(k in u) print u[k],k}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num count uri; do
        printf " %4s %-60s %10s\n" "$num" "${uri:0:58}" "$(fmt_num "$count")"
      done
}

# URIs that transferred the most bytes.
report_bandwidth() {
  header "Top ${TOP_N} URIs by Bandwidth"
  printf " ${BOLD}%4s %-55s %12s %8s${NC}\n" "#" "URI" "Bandwidth" "Hits"
  awk -F'\t' '{bw[$3]+=$5; h[$3]++} END {for(k in bw) print bw[k],h[k],k}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num bytes hits uri; do
        printf " %4s %-55s %12s %8s\n" \
          "$num" "${uri:0:53}" "$(format_bytes "$bytes")" "$(fmt_num "$hits")"
      done
}

# Slowest requests; skipped when no record carries a request time.
report_slow() {
  local has_rt
  has_rt=$(awk -F'\t' '$7!=""{found=1;exit} END{print found+0}' "$TMP_DATA")
  if [[ "$has_rt" -eq 0 ]]; then
    header "Slowest Requests"
    echo -e " ${DIM}(skipped — no request_time/duration field detected in log)${NC}"
    return
  fi

  header "Slowest ${TOP_N} Requests"
  printf " ${BOLD}%4s %8s %-6s %-50s %10s${NC}\n" "#" "Time(s)" "Status" "URI" "Bytes"
  awk -F'\t' '$7!=""{print $7,$4,$5,$3}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num rt status bytes uri; do
        printf " %4s %8s %-6s %-50s %10s\n" \
          "$num" "$rt" "$status" "${uri:0:48}" "$(format_bytes "$bytes")"
      done
}

# Most frequent user-agent strings.
report_agents() {
  header "Top ${TOP_N} User Agents"
  printf " ${BOLD}%4s %-70s %10s${NC}\n" "#" "User Agent" "Requests"
  awk -F'\t' '{a[$6]++} END {for(k in a) print a[k],k}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num count agent; do
        printf " %4s %-70s %10s\n" "$num" "${agent:0:68}" "$(fmt_num "$count")"
      done
}

# User agents matching BOT_PATTERNS, plus the bot share of all requests.
report_bots() {
  header "Suspected Bots"
  printf " ${BOLD}%4s %-65s %10s${NC}\n" "#" "User Agent" "Requests"
  awk -F'\t' -v p="$BOT_PATTERNS" '$6~p{b[$6]++} END {for(k in b) print b[k],k}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | nl -w4 -s' ' \
    | while read -r num count agent; do
        printf " %4s %-65s %10s\n" "$num" "${agent:0:63}" "$(fmt_num "$count")"
      done

  local bt
  bt=$(awk -F'\t' -v p="$BOT_PATTERNS" '$6~p{c++} END{print c+0}' "$TMP_DATA")
  if [[ "$TOTAL_LINES" -gt 0 ]]; then
    local pct
    pct=$(awk -v b="$bt" -v t="$TOTAL_LINES" 'BEGIN {printf "%.1f", (b / t) * 100}')
    echo ""
    echo -e " Bot traffic: ${YELLOW}$(fmt_num "$bt")${NC} requests (${pct}% of total)"
  fi
}

# 24-row histogram of requests per hour of day; bars are scaled so the
# busiest hour is 50 characters wide.
report_hourly() {
  header "Requests per Hour"
  local max_c
  max_c=$(awk -F'\t' '{split($2,t,":");h[t[2]]++} END {m=0;for(k in h) if(h[k]>m) m=h[k];print m}' "$TMP_DATA")
  awk -F'\t' '{split($2,t,":");h[t[2]]++} END {for(i=0;i<24;i++){hh=sprintf("%02d",i);c=(hh in h)?h[hh]:0;printf "%s\t%d\n",hh,c}}' "$TMP_DATA" \
    | sort -t$'\t' -k1,1 \
    | while IFS=$'\t' read -r hour count; do
        local bl=0
        [[ "$max_c" -gt 0 ]] && bl=$(( count * 50 / max_c ))
        local bar
        bar=$(printf '%*s' "$bl" '' | tr ' ' '#')
        printf " ${BOLD}%s:00${NC} %8s ${GREEN}%s${NC}\n" "$hour" "$(fmt_num "$count")" "$bar"
      done
}

# Unique-IP count and the busiest IPs, for manual geographic lookup.
report_geo() {
  header "Unique IPs (Geographic Review)"
  echo -e " ${DIM}(No GeoIP lookup — pipe IPs to geoiplookup or an API for country data)${NC}"
  local uc
  uc=$(awk -F'\t' '!s[$1]++{c++} END{print c+0}' "$TMP_DATA")
  echo ""
  echo " Unique IPs: $(fmt_num "$uc")"
  echo ""
  echo " Top ${TOP_N} by request count:"
  awk -F'\t' '{ip[$1]++} END {for(i in ip) print ip[i],i}' "$TMP_DATA" \
    | sort -rn \
    | head -n "$TOP_N" \
    | while read -r count ip; do
        printf " %-40s %s\n" "$ip" "$(fmt_num "$count")"
      done
}

# Totals, unique IP count, bandwidth and the first/last timestamps seen.
report_summary() {
  header "Summary"
  local total=$TOTAL_LINES
  local uips tb ft lt
  uips=$(awk -F'\t' '!s[$1]++{c++} END{print c+0}' "$TMP_DATA")
  tb=$(awk -F'\t' '{s+=$5} END{print s+0}' "$TMP_DATA")
  ft=$(awk -F'\t' 'NR==1{print $2}' "$TMP_DATA")
  lt=$(awk -F'\t' 'END{print $2}' "$TMP_DATA")

  printf " %-22s %s\n" "Total requests:" "$(fmt_num "$total")"
  printf " %-22s %s\n" "Unique IPs:" "$(fmt_num "$uips")"
  printf " %-22s %s\n" "Total bandwidth:" "$(format_bytes "$tb")"
  printf " %-22s %s — %s\n" "Time range:" "$ft" "$lt"
  if [[ "$ALL_LOGS" == true && ${#LOG_FILES[@]} -gt 1 ]]; then
    printf " %-22s %s\n" "Log files analyzed:" "${#LOG_FILES[@]}"
  fi
}

# ============================================================================
# JSON OUTPUT
# ============================================================================

# Emit the whole report as one JSON object in a single pass over TMP_DATA.
# Requires gawk (asort/asorti).
# NOTE(review): this copy of the file was truncated inside the "top_ips"
# loop. "summary", "status_classes", "status_codes" and the top_ips sort
# technique are original; the per-entry field names and the "top_uris",
# "user_agents", "bots" and "hourly" keys are reconstructed from the counters
# the per-record rule collects - confirm them against any consumer.
json_output() {
  awk -F'\t' -v top_n="$TOP_N" -v bp="$BOT_PATTERNS" '
    # Escape backslash, double quote and tab for use inside a JSON string.
    function je(s,    out, i, c, len) {
      out = ""
      len = length(s)
      for (i = 1; i <= len; i++) {
        c = substr(s, i, 1)
        if (c == "\\")      out = out "\\\\"
        else if (c == "\"") out = out "\\\""
        else if (c == "\t") out = out "\\t"
        else                out = out c
      }
      return out
    }

    # Print  "name":[{...},...]  with the top_n keys of cnt by count.
    # Rows are built as zero-padded-count TAB key TAB bytes so that a plain
    # string sort orders them by count.
    function emit_top(name, label, cnt, bytes, has_bytes,    n, keys, rows, i, p, sep, shown) {
      printf " \"%s\":[", name
      n = asorti(cnt, keys)
      for (i = 1; i <= n; i++)
        rows[i] = sprintf("%012d\t%s\t%d", cnt[keys[i]], keys[i], has_bytes ? bytes[keys[i]] : 0)
      if (n > 0) asort(rows)
      sep = ""
      shown = 0
      for (i = n; i >= 1 && shown < top_n; i--) {
        split(rows[i], p, "\t")
        if (has_bytes)
          printf "%s{\"%s\":\"%s\",\"requests\":%d,\"bytes\":%d}", sep, label, je(p[2]), p[1] + 0, p[3] + 0
        else
          printf "%s{\"%s\":\"%s\",\"requests\":%d}", sep, label, je(p[2]), p[1] + 0
        sep = ","
        shown++
      }
      printf "]"
    }

    BEGIN {
      # Force these to be arrays even when they end up empty.
      bots[""] = 0; delete bots[""]
      nobytes[""] = 0; delete nobytes[""]
    }

    {
      ips[$1]++; ipb[$1] += $5
      st[$4]++; cl[substr($4, 1, 1) "xx"]++
      uh[$3]++; ub[$3] += $5
      a[$6]++
      if ($6 ~ bp) bots[$6]++
      split($2, t, ":"); hr[t[2]]++
      tb += $5
      if (!si[$1]++) ui++
      if (NR == 1) ft = $2
      lt = $2
      tot++
    }

    END {
      printf "{\n \"summary\":{\"total_requests\":%d,\"unique_ips\":%d,\"total_bytes\":%d,\"first_timestamp\":\"%s\",\"last_timestamp\":\"%s\"},\n", tot, ui, tb, ft, lt

      printf " \"status_classes\":{"
      s = ""
      for (c in cl) { printf "%s\"%s\":%d", s, c, cl[c]; s = "," }
      printf "},\n"

      printf " \"status_codes\":{"
      s = ""
      for (c in st) { printf "%s\"%s\":%d", s, c, st[c]; s = "," }
      printf "},\n"

      emit_top("top_ips", "ip", ips, ipb, 1);               printf ",\n"
      emit_top("top_uris", "uri", uh, ub, 1);               printf ",\n"
      emit_top("user_agents", "user_agent", a, nobytes, 0); printf ",\n"
      emit_top("bots", "user_agent", bots, nobytes, 0);     printf ",\n"

      printf " \"hourly\":{"
      s = ""
      for (i = 0; i < 24; i++) {
        hh = sprintf("%02d", i)
        printf "%s\"%s\":%d", s, hh, ((hh in hr) ? hr[hh] : 0)
        s = ","
      }
      printf "}\n}\n"
    }' "$TMP_DATA"
}

# ============================================================================
# MAIN
# ============================================================================

# Entry point: parse options, locate and validate the log, parse it into the
# temp file, then print either the JSON document or the selected text
# sections followed by the summary.
# NOTE(review): the start of this function (through the first error check)
# was lost in this copy; the calls to parse_args/detect_log_file and the
# "not found" message are reconstructed - confirm against the original.
main() {
  parse_args "$@"
  detect_log_file

  if [[ ! -f "$LOG_FILE" ]]; then
    echo "ERROR: log file not found: $LOG_FILE" >&2
    exit 1
  fi
  if [[ ! -r "$LOG_FILE" ]]; then
    echo "ERROR: cannot read log file: $LOG_FILE (check permissions)" >&2
    exit 1
  fi

  detect_format
  gather_logs

  # Print header
  if [[ "$JSON_MODE" != true ]]; then
    echo -e "${BOLD}Web Log Analyzer v${VERSION}${NC}"
    echo -e "Log: ${DIM}${LOG_FILE}${NC}"
    echo -e "Format: ${DIM}${FORMAT}${NC}"
    [[ -n "$SINCE" ]] && echo -e "Since: ${DIM}${SINCE}${NC}"
    if [[ "$ALL_LOGS" == true && ${#LOG_FILES[@]} -gt 1 ]]; then
      echo -e "Logs: ${DIM}${#LOG_FILES[@]} files (including rotated)${NC}"
    fi
  fi

  # Parse log into temp working file
  prepare_data

  # JSON mode - single-pass output and exit
  if [[ "$JSON_MODE" == true ]]; then
    json_output
    exit 0
  fi

  # Run selected report sections
  section_enabled "ips" && report_ips
  section_enabled "status" && report_status
  section_enabled "uris" && report_uris
  section_enabled "bandwidth" && report_bandwidth
  section_enabled "slow" && report_slow
  section_enabled "agents" && report_agents
  section_enabled "bots" && report_bots
  section_enabled "hourly" && report_hourly
  section_enabled "geo" && report_geo

  # Summary always runs
  report_summary
}

main "$@"