Files
linux-scripts/add-fail2ban-scraper-detect.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

505 lines
15 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# Script Name: add-fail2ban-scraper-detect.sh
# Version: 1.1
# Description: Adds a Fail2ban jail to detect and ban headless Chrome scrapers
# that pass JavaScript challenges but exhibit bot behavior —
# rapid 499 responses (connection abandoned mid-download),
# high-frequency 404s (probing non-existent URLs), and
# HeadlessChrome user agent strings (no real user). Complements
# add-fail2ban-image-scraper.sh which catches no-referer image
# grabs. This filter catches the next tier: bots running real
# browsers (Puppeteer/Playwright) that execute JS, accept cookies,
# and send proper referers but still behave differently from humans.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Usage:
# sudo ./add-fail2ban-scraper-detect.sh
# sudo ./add-fail2ban-scraper-detect.sh --logpath /var/log/nginx/access.log
# sudo ./add-fail2ban-scraper-detect.sh --maxretry 5
# sudo ./add-fail2ban-scraper-detect.sh --dry-run
#
################################################################################
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# ============================================================================
# DEFAULTS
# ============================================================================
# Script identity — fixed for the lifetime of the process.
VERSION="1.1"
SCRIPT_NAME="${0##*/}"
readonly VERSION SCRIPT_NAME
# Tunables; each one can be overridden by a command-line option.
LOGPATH="auto"   # "auto" means: search the well-known access log locations
BANTIME="86400"  # ban duration, seconds
MAXRETRY="3"     # matches within FINDTIME before an IP is banned
FINDTIME="300"   # counting window, seconds
IGNOREIP=""      # optional whitelist, written to the jail only when non-empty
DRY_RUN=false    # true => only print what would be done
# Colors
# Kept as literal backslash sequences; the log helpers interpret them on output.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
# Shared formatter for the log_* helpers.
# Arguments: $1 - color escape sequence, $2 - tag text, remaining - message words
# Outputs:   "<color>[TAG]<reset> message" on stdout, backslash escapes interpreted
_log_emit() {
  local color="$1" tag="$2"
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}
# Informational message (green tag, stdout).
log_info() { _log_emit "$GREEN" "INFO" "$@"; }
# Warning message (yellow tag, stdout).
log_warn() { _log_emit "$YELLOW" "WARN" "$@"; }
# Error message (red tag, stderr).
log_error() { _log_emit "$RED" "ERROR" "$@" >&2; }
# Progress marker for a major step (cyan tag, stdout).
log_step() { _log_emit "$CYAN" "STEP" "$@"; }
#######################################
# Print the help text and terminate the script.
# Globals:   SCRIPT_NAME (read)
# Arguments: $1 - optional exit status (default 0). Callers reporting a usage
#                 error can pass a non-zero value.
# Outputs:   help text on stdout when the status is 0, on stderr otherwise
# Returns:   never returns; exits with the requested status
#######################################
show_usage() {
  local exit_code="${1:-0}"
  local fd=1
  # Help requested explicitly belongs on stdout; help shown because of an
  # error belongs on stderr so it does not pollute captured output.
  if [[ "$exit_code" != "0" ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
Usage: sudo $SCRIPT_NAME [OPTIONS]
Adds a Fail2ban jail to detect headless Chrome scrapers that pass JavaScript
cookie challenges but exhibit bot behavior patterns:
- HTTP 499 responses: client closed connection before the server finished
responding. Scrapers fire off multiple requests and abandon them once they
have what they need (grab HTML/CSS, drop the images).
- Rapid 404s: probing for URLs that don't exist — guessed slugs, path
enumeration, or following stale links at scraper speed.
These patterns are normal in small numbers from real users, but at volume
they indicate automated scraping. The maxretry threshold controls sensitivity.
This complements add-fail2ban-image-scraper.sh (no-referer image grabs).
That filter catches curl/wget-style bots. This one catches the next tier:
headless Chrome bots (Puppeteer, Playwright) that execute JavaScript and
mimic real browsers.
OPTIONS:
--logpath PATH Access log path (default: auto-detect)
--bantime SECS Ban duration in seconds (default: 86400 / 24 hours)
--maxretry NUM Errors before ban (default: 3)
--findtime SECS Window for counting errors (default: 300 / 5 min)
--ignoreip RANGES Space-separated IPs/CIDRs to whitelist (quoted)
--dry-run Show what would be done without making changes
--remove Remove the filter and jail
-h, --help Show this help message
WHAT IT CATCHES:
- Headless Chrome scrapers (Puppeteer, Playwright, Selenium)
- Bots that pass JS challenges but abandon image downloads (499)
- Automated URL probing at scraper speed (rapid 404s)
- HeadlessChrome user agent (no legitimate browser sends this)
- Chinese content farm scrapers cloning sites via Baidu Cache
WHAT IT IGNORES:
- Legitimate search engine crawlers (Googlebot, Bingbot, etc.)
- Monitoring probes (UptimeRobot, Pingdom, Blackbox-Exporter)
- Single 404 from a mistyped URL (below maxretry threshold)
- Occasional 499 from a user closing their browser tab
EXAMPLES:
# Install with defaults (ban after 3 errors in 5 minutes)
sudo $SCRIPT_NAME
# More lenient: ban after 5 errors
sudo $SCRIPT_NAME --maxretry 5
# Longer window: 5 errors in 15 minutes
sudo $SCRIPT_NAME --maxretry 5 --findtime 900
# Whitelist your monitoring server
sudo $SCRIPT_NAME --ignoreip "10.0.0.5 192.168.1.0/24"
# Preview without changes
sudo $SCRIPT_NAME --dry-run
# Remove the jail
sudo $SCRIPT_NAME --remove
EOF
  exit "$exit_code"
}
# ============================================================================
# ARGUMENT PARSING
# ============================================================================
# Set to true by --remove; main then uninstalls instead of installing.
REMOVE=false
#######################################
# Parse command-line options into the global settings.
# Globals:   LOGPATH, BANTIME, MAXRETRY, FINDTIME, IGNOREIP, DRY_RUN, REMOVE
#            (written)
# Arguments: the script's command-line arguments
# Returns:   0 when every argument was understood. Exits 1 on an unknown
#            option, on a value option with no value, or on a non-integer
#            --maxretry. -h/--help prints the usage text and exits 0.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --logpath | --bantime | --maxretry | --findtime | --ignoreip)
        # Check explicitly: under `set -u` a missing value would otherwise
        # abort with an opaque "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires a value"
          exit 1
        fi
        case "$1" in
          --logpath) LOGPATH="$2" ;;
          --bantime) BANTIME="$2" ;;
          --maxretry)
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              log_error "--maxretry must be a whole number, got: $2"
              exit 1
            fi
            MAXRETRY="$2"
            ;;
          --findtime) FINDTIME="$2" ;;
          --ignoreip) IGNOREIP="$2" ;;
        esac
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --remove)
        REMOVE=true
        shift
        ;;
      -h | --help)
        show_usage
        ;;
      *)
        log_error "Unknown option: $1"
        # show_usage terminates its caller, so run it in a subshell: the help
        # is still printed (to stderr) and this error path can exit non-zero.
        (show_usage) >&2 || true
        exit 1
        ;;
    esac
  done
}
# ============================================================================
# CHECKS
# ============================================================================
#######################################
# Abort unless the script is running with root privileges.
# Returns: 0 when the effective UID is 0; otherwise logs an error and exits 1
#######################################
check_root() {
  (( EUID == 0 )) && return 0
  log_error "This script must be run as root (sudo)"
  exit 1
}
#######################################
# Confirm that Fail2ban is both installed and currently running.
# Returns: 0 when fail2ban-client is on PATH and systemd reports the fail2ban
#          unit active; otherwise logs the reason and exits 1
#######################################
check_fail2ban() {
  command -v fail2ban-client >/dev/null 2>&1 || {
    log_error "Fail2ban is not installed"
    log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
    exit 1
  }
  systemctl is-active --quiet fail2ban || {
    log_error "Fail2ban is not running"
    exit 1
  }
  log_info "Fail2ban is installed and running"
}
#######################################
# Settle on the access log (or glob of logs) the jail will watch.
# Globals:   LOGPATH (read; overwritten when it is "auto")
# Returns:   0 once LOGPATH names an existing log. Exits 1 when a
#            user-supplied path matches no file, or when auto-detection
#            finds nothing.
#######################################
detect_logpath() {
  local -a found

  # An explicit --logpath may be a glob; it only has to match a real file.
  if [[ "$LOGPATH" != "auto" ]]; then
    # shellcheck disable=SC2206
    found=( $LOGPATH )
    if (( ${#found[@]} > 0 )) && [[ -f "${found[0]}" ]]; then
      log_info "Using specified log path: $LOGPATH"
      return
    fi
    log_error "Log file not found: $LOGPATH"
    exit 1
  fi

  log_step "Auto-detecting web server access log..."

  # Candidates as "label|pattern", tried in order; the first one whose first
  # expansion is a regular file wins. Per-domain HestiaCP logs come before the
  # single-file locations.
  local entry label pattern
  for entry in \
    "HestiaCP apache|/var/log/apache2/domains/*.log" \
    "HestiaCP nginx|/var/log/nginx/domains/*.log" \
    "nginx|/var/log/nginx/access.log" \
    "apache2|/var/log/apache2/access.log" \
    "httpd|/var/log/httpd/access_log"; do
    label=${entry%%|*}
    pattern=${entry#*|}
    # shellcheck disable=SC2206
    found=( $pattern )
    if [[ -f "${found[0]:-}" ]]; then
      LOGPATH="$pattern"
      log_info "Detected ${label}: $LOGPATH"
      return
    fi
  done

  log_error "Could not auto-detect access log. Use --logpath to specify."
  exit 1
}
# ============================================================================
# REMOVE
# ============================================================================
#######################################
# Uninstall the scraper-detect jail and filter, then reload Fail2ban.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages via the log_* helpers
# Returns:   never returns; exits 0 after the removal or the dry-run preview.
#            Exiting on both paths keeps a dry run from continuing into
#            whatever the caller does next.
#######################################
do_remove() {
  local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf"
  local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf"

  log_step "Removing scraper-detect jail..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would remove $filter_file"
    log_info "[DRY RUN] Would remove $jail_file"
    log_info "[DRY RUN] Would reload fail2ban"
    exit 0
  fi

  # The jail goes first so no jail definition is left pointing at a filter
  # that has already been deleted.
  if [[ -f "$jail_file" ]]; then
    rm -f -- "$jail_file"
    log_info "Removed: $jail_file"
  else
    log_warn "Jail config not found: $jail_file"
  fi

  if [[ -f "$filter_file" ]]; then
    rm -f -- "$filter_file"
    log_info "Removed: $filter_file"
  else
    log_warn "Filter not found: $filter_file"
  fi

  fail2ban-client reload
  sleep 2
  log_info "Fail2ban reloaded — scraper-detect jail removed"
  exit 0
}
# ============================================================================
# INSTALL FILTER
# ============================================================================
#######################################
# Write the scraper-detect filter definition into Fail2ban's filter.d.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages; in dry-run mode also the filter text itself
# Returns:   0 on success
#######################################
install_filter() {
  local target="/etc/fail2ban/filter.d/scraper-detect.conf"

  log_step "Installing filter: $target"

  if ! $DRY_RUN; then
    # Keep one backup of whatever was there before it is overwritten.
    if [[ -f "$target" ]]; then
      log_warn "Filter already exists — backing up to ${target}.bak"
      cp "$target" "${target}.bak"
    fi
    generate_filter > "$target"
    log_info "Filter installed: $target"
    return
  fi

  # Dry run: show the would-be file between two blank lines.
  log_info "[DRY RUN] Would create $target"
  echo ""
  generate_filter
  echo ""
}
#######################################
# Emit the Fail2ban filter definition for the scraper-detect jail.
# Outputs: the complete filter file on stdout
# Note:    failregex is a multi-line value. The second and third patterns are
#          indented because Fail2ban's INI-style parser only treats a line as
#          a continuation of the previous value when it starts with
#          whitespace.
#######################################
generate_filter() {
  cat <<'EOF'
# Fail2ban filter to detect headless Chrome scrapers
# https://mylinux.work
#
# Catches three patterns that indicate automated scraping:
#
# 1. HTTP 499 — nginx-specific status meaning "client closed connection
# before the server responded." Scrapers fire requests then drop them
# once they've grabbed the HTML. Real users rarely trigger this.
#
# 2. HTTP 404 — not found. A single 404 is normal (mistyped URL). Many
# 404s in a short window indicate URL probing or stale scraper runs.
#
# 3. HeadlessChrome — Puppeteer/Playwright bots that don't bother hiding
# the headless user agent. No legitimate browser sends this string.
# Matched on any status code — headless Chrome is never a real user.
#
# Combined with maxretry in the jail, this catches bots that generate
# multiple errors quickly while ignoring the occasional human mistake.
# HeadlessChrome matches are instant (maxretry 1 would suffice) but
# the jail threshold still applies — a few hits trigger the ban.
[Definition]
# Match 499 (client dropped), 404 (not found), and HeadlessChrome UA
# Works with combined, common, and enriched (GeoIP) log formats
failregex = ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 499
            ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 404
            ^<HOST> .* ".*HeadlessChrome.*"
# Whitelist legitimate bots and monitoring tools
ignoreregex = ^<HOST> .* ".*(?:Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot|Slurp|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|Blackbox-Exporter|UptimeRobot|Pingdom|CensysInspect|Feedfetcher|Mediapartners-Google).*"
# Author: Phil Connor — https://mylinux.work
EOF
}
# ============================================================================
# INSTALL JAIL
# ============================================================================
#######################################
# Write the scraper-detect jail definition into Fail2ban's jail.d.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages; in dry-run mode also the jail text itself
# Returns:   0 on success
#######################################
install_jail() {
  local target="/etc/fail2ban/jail.d/scraper-detect.conf"

  log_step "Installing jail: $target"

  if ! $DRY_RUN; then
    # Keep one backup of whatever was there before it is overwritten.
    if [[ -f "$target" ]]; then
      log_warn "Jail config already exists — backing up to ${target}.bak"
      cp "$target" "${target}.bak"
    fi
    generate_jail > "$target"
    log_info "Jail config installed: $target"
    return
  fi

  # Dry run: show the would-be file between two blank lines.
  log_info "[DRY RUN] Would create $target"
  echo ""
  generate_jail
  echo ""
}
#######################################
# Emit the Fail2ban jail definition for scraper-detect.
# Globals:   LOGPATH, MAXRETRY, FINDTIME, BANTIME, IGNOREIP (read)
# Outputs:   the complete jail file on stdout; the ignoreip line is included
#            only when IGNOREIP is non-empty
#######################################
generate_jail() {
  # Unquoted delimiter: the settings below are filled in from the globals.
  cat <<EOF
# Fail2ban jail to detect headless Chrome scrapers
# https://mylinux.work
#
# Bans IPs that generate excessive 499 (connection dropped) or 404 (not
# found) responses. These patterns indicate automated scraping by headless
# Chrome bots (Puppeteer, Playwright) that pass JS challenges but behave
# differently from real users.
#
# Default: 3 errors in 5 minutes = 24 hour ban.
# Real users rarely hit 3 errors in 5 minutes. Adjust maxretry up
# if you see false positives.
[scraper-detect]
enabled = true
port = http,https
filter = scraper-detect
logpath = $LOGPATH
maxretry = $MAXRETRY
findtime = $FINDTIME
bantime = $BANTIME
EOF
  [[ -z "$IGNOREIP" ]] || printf '%s\n' "ignoreip = $IGNOREIP"
  printf '%s\n' "# Author: Phil Connor — https://mylinux.work"
}
# ============================================================================
# RELOAD AND VERIFY
# ============================================================================
#######################################
# Validate the Fail2ban configuration where possible, then reload the service.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages; on a failed configuration test, the test
#            output on stderr
# Returns:   0 after a successful reload. Exits 1 when the configuration test
#            fails or when the service is not active after the reload.
#######################################
reload_fail2ban() {
  log_step "Reloading Fail2ban..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would reload fail2ban"
    return
  fi

  # Tell "this client has no --test option" apart from "the test ran and the
  # configuration is broken". Only the first case justifies reloading blind.
  # The help text is captured into a variable rather than piped into grep so
  # that pipefail cannot turn an early-exiting grep into a false negative.
  local help_text test_output
  help_text=$(fail2ban-client -h 2>&1) || true
  if [[ "$help_text" == *"--test"* ]]; then
    if ! test_output=$(fail2ban-client --test 2>&1); then
      log_error "Fail2ban configuration test failed — not reloading"
      if [[ -n "$test_output" ]]; then
        printf '%s\n' "$test_output" >&2
      fi
      exit 1
    fi
    log_info "Fail2ban configuration test passed"
  else
    log_warn "Config test not available — reloading directly"
  fi

  fail2ban-client reload
  # Give the service a moment to settle before asking systemd about it.
  sleep 2

  if systemctl is-active --quiet fail2ban; then
    log_info "Fail2ban reloaded successfully"
  else
    log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
    exit 1
  fi
}
#######################################
# Check that the scraper-detect jail is loaded by asking Fail2ban for its
# status.
# Globals:   DRY_RUN, LOGPATH (read)
# Outputs:   the jail status as printed by fail2ban-client, plus progress
#            messages
# Returns:   0 when the jail answers; otherwise logs debugging hints and
#            exits 1
#######################################
verify_jail() {
  log_step "Verifying scraper-detect jail..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would verify jail status"
    return
  fi

  echo ""
  # Failure case first; the client's own error text is discarded in favor of
  # the hints below.
  if ! fail2ban-client status scraper-detect 2>/dev/null; then
    log_error "Jail 'scraper-detect' is not running — check: fail2ban-client status"
    log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
    exit 1
  fi

  echo ""
  log_info "Scraper-detect jail is active and monitoring $LOGPATH"
}
#######################################
# Dry-run preview: run the generated filter against the first existing log
# file and print the tail of the fail2ban-regex report.
# Globals:   DRY_RUN, LOGPATH (read)
# Outputs:   last 8 lines of the fail2ban-regex report, framed by blank lines
# Returns:   0. Does nothing outside dry-run mode or when LOGPATH matches no
#            file. A failing fail2ban-regex is reported as a warning and never
#            aborts the preview.
#######################################
test_against_logs() {
  $DRY_RUN || return 0

  # LOGPATH may be a glob; expand it deliberately and test the first match.
  # shellcheck disable=SC2206
  local matches=( $LOGPATH )
  # The ":-" default keeps an empty expansion from tripping `set -u`.
  [[ -f "${matches[0]:-}" ]] || return 0

  log_step "Testing filter against existing logs..."

  local tmp_filter
  tmp_filter=$(mktemp /tmp/scraper-detect-filter.XXXXXX)
  generate_filter > "$tmp_filter"

  echo ""
  # With errexit and pipefail, an unguarded failure here would end the script
  # before the temporary filter is deleted, so the failure is caught and
  # downgraded to a warning.
  fail2ban-regex "${matches[0]}" "$tmp_filter" 2>&1 | tail -8 \
    || log_warn "fail2ban-regex reported a problem — results above may be incomplete"
  rm -f -- "$tmp_filter"
  echo ""
}
# ============================================================================
# MAIN
# ============================================================================
#######################################
# Script entry point: parse options, run the pre-flight checks, then either
# remove the jail or install it and print a summary.
# Globals:   VERSION, REMOVE, LOGPATH, BANTIME, MAXRETRY, FINDTIME, IGNOREIP
#            (read)
# Arguments: the script's command-line arguments
# Returns:   0 on success; the helpers exit non-zero on failure
#######################################
main() {
  parse_args "$@"

  echo ""
  echo "============================================"
  echo " Fail2ban Scraper Detect v${VERSION}"
  echo " https://mylinux.work"
  echo "============================================"
  echo ""

  check_root
  check_fail2ban

  if $REMOVE; then
    do_remove
    # Removal is a terminal action: stop here even if do_remove returns
    # instead of exiting, so --remove never falls through into the install
    # steps below.
    return 0
  fi

  detect_logpath
  test_against_logs
  install_filter
  install_jail
  reload_fail2ban
  verify_jail

  # Add the hours/minutes hint only for plain whole-second values. Anything
  # else is shown verbatim, so the summary cannot abort on an arithmetic
  # error after the install has already succeeded. "10#" forces base 10 so a
  # leading zero is not read as octal.
  local ban_desc="$BANTIME"
  local find_desc="$FINDTIME"
  if [[ "$BANTIME" =~ ^[0-9]+$ ]]; then
    ban_desc="${BANTIME}s ($(( 10#$BANTIME / 3600 ))h)"
  fi
  if [[ "$FINDTIME" =~ ^[0-9]+$ ]]; then
    find_desc="${FINDTIME}s ($(( 10#$FINDTIME / 60 ))m window)"
  fi

  echo ""
  echo "============================================"
  echo " Setup Complete"
  echo "============================================"
  echo ""
  echo " Jail: scraper-detect"
  echo " Log: $LOGPATH"
  echo " Ban time: $ban_desc"
  echo " Max retry: $MAXRETRY (499/404 errors before ban)"
  echo " Find time: $find_desc"
  if [[ -n "$IGNOREIP" ]]; then
    echo " Ignore: $IGNOREIP"
  fi
  echo ""
  echo " Useful commands:"
  echo " fail2ban-client status scraper-detect"
  echo " fail2ban-client set scraper-detect unbanip <IP>"
  echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
  echo ""
}
# Entry point: pass every command-line argument through to main unchanged.
main "$@"