#!/bin/bash
################################################################################
# Script Name: add-fail2ban-scraper-detect.sh
# Version:     1.1
# Description: Adds a Fail2ban jail to detect and ban headless Chrome scrapers
#              that pass JavaScript challenges but exhibit bot behavior —
#              rapid 499 responses (connection abandoned mid-download),
#              high-frequency 404s (probing non-existent URLs), and
#              HeadlessChrome user agent strings (no real user). Complements
#              add-fail2ban-image-scraper.sh which catches no-referer image
#              grabs. This filter catches the next tier: bots running real
#              browsers (Puppeteer/Playwright) that execute JS, accept cookies,
#              and send proper referers but still behave differently from humans.
#
# Author:  Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Usage:
#   sudo ./add-fail2ban-scraper-detect.sh
#   sudo ./add-fail2ban-scraper-detect.sh --logpath /var/log/nginx/access.log
#   sudo ./add-fail2ban-scraper-detect.sh --maxretry 5
#   sudo ./add-fail2ban-scraper-detect.sh --dry-run
#
################################################################################

set -euo pipefail

# ============================================================================
# DEFAULTS
# ============================================================================

readonly VERSION="1.1"
readonly SCRIPT_NAME="${0##*/}"

LOGPATH="auto"      # "auto" triggers detect_logpath(); otherwise a path or glob
BANTIME="86400"     # seconds; used in integer arithmetic in the summary
MAXRETRY="3"
FINDTIME="300"      # seconds; used in integer arithmetic in the summary
IGNOREIP=""
DRY_RUN=false
# main() reads $REMOVE under `set -u`, so it needs a default.
REMOVE=false

# Colors
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Leveled log helpers. Only the color codes go through %b; the message is
# printed with %s so backslashes in paths are never interpreted as escapes.
log_info()  { printf '%b[INFO]%b %s\n'  "$GREEN"  "$NC" "$*"; }
log_warn()  { printf '%b[WARN]%b %s\n'  "$YELLOW" "$NC" "$*"; }
log_error() { printf '%b[ERROR]%b %s\n' "$RED"    "$NC" "$*" >&2; }
log_step()  { printf '%b[STEP]%b %s\n'  "$CYAN"   "$NC" "$*"; }

#######################################
# Print command-line help to stdout.
# NOTE(review): the original help text was lost when this file was extracted;
# this text is reconstructed from the variables and flags the rest of the
# script uses. Confirm option names against the real script.
# Globals: SCRIPT_NAME, BANTIME, MAXRETRY, FINDTIME (read)
#######################################
show_usage() {
    cat <<EOF
Usage: sudo ${SCRIPT_NAME} [OPTIONS]

Options:
  --logpath PATH      Access log path or glob (default: auto-detect)
  --bantime SECONDS   Ban duration in seconds (default: ${BANTIME})
  --maxretry COUNT    Matches before an address is banned (default: ${MAXRETRY})
  --findtime SECONDS  Window in which matches are counted (default: ${FINDTIME})
  --ignoreip "IPS"    Space-separated addresses/CIDRs that are never banned
  --dry-run           Show what would be done without changing anything
  --remove            Remove the scraper-detect jail and filter
  -h, --help          Show this help
EOF
}

#######################################
# Abort unless an option is followed by a value.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
#######################################
require_value() {
    if (( $2 < 2 )); then
        log_error "Option $1 requires a value"
        exit 1
    fi
}

#######################################
# Abort unless a value is an integer. The summary in main() does shell
# arithmetic on BANTIME and FINDTIME, so a non-numeric value would only fail
# after the jail had already been installed.
# Arguments: $1 - option name, $2 - value, $3 - "signed" to allow a leading '-'
#######################################
require_number() {
    local pattern='^[0-9]+$'
    if [[ "${3:-}" == "signed" ]]; then
        pattern='^-?[0-9]+$'
    fi
    if [[ ! "$2" =~ $pattern ]]; then
        log_error "Option $1 expects an integer, got: $2"
        exit 1
    fi
}

#######################################
# Parse command-line options into the global settings.
# NOTE(review): reconstructed — the original function was lost in extraction.
# --logpath, --maxretry and --dry-run appear in the header usage; the other
# flags are inferred from the globals the script reads. Confirm.
# Globals:   LOGPATH, BANTIME, MAXRETRY, FINDTIME, IGNOREIP, DRY_RUN, REMOVE
# Arguments: the script's command-line arguments
#######################################
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            --logpath)
                require_value "$1" "$#"
                LOGPATH="$2"
                shift 2
                ;;
            --bantime)
                require_value "$1" "$#"
                require_number "$1" "$2" signed
                BANTIME="$2"
                shift 2
                ;;
            --maxretry)
                require_value "$1" "$#"
                require_number "$1" "$2"
                MAXRETRY="$2"
                shift 2
                ;;
            --findtime)
                require_value "$1" "$#"
                require_number "$1" "$2"
                FINDTIME="$2"
                shift 2
                ;;
            --ignoreip)
                require_value "$1" "$#"
                IGNOREIP="$2"
                shift 2
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --remove)
                REMOVE=true
                shift
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                show_usage
                exit 1
                ;;
        esac
    done
}

#######################################
# Abort unless running as root.
# NOTE(review): reconstructed — original body and message lost in extraction.
#######################################
check_root() {
    if (( EUID != 0 )); then
        log_error "This script must be run as root (use sudo)"
        exit 1
    fi
}

#######################################
# Abort unless fail2ban-client is on PATH and the fail2ban service is active.
# NOTE(review): the `command -v` probe is reconstructed; only the text from
# "/dev/null; then" onward survived in the original.
#######################################
check_fail2ban() {
    if ! command -v fail2ban-client &>/dev/null; then
        log_error "Fail2ban is not installed"
        log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
        exit 1
    fi

    if ! systemctl is-active --quiet fail2ban; then
        log_error "Fail2ban is not running"
        exit 1
    fi

    log_info "Fail2ban is installed and running"
}

#######################################
# Resolve LOGPATH. A user-supplied path/glob is validated; "auto" probes a
# fixed list of well-known access-log locations in priority order.
# Globals: LOGPATH (read, written)
# Exits 1 when no usable log file is found.
#######################################
detect_logpath() {
    if [[ "$LOGPATH" != "auto" ]]; then
        # Deliberately unquoted so a glob such as /var/log/nginx/*.log expands.
        # shellcheck disable=SC2086,SC2206
        local matches=( $LOGPATH )
        if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then
            log_error "Log file not found: $LOGPATH"
            exit 1
        fi
        log_info "Using specified log path: $LOGPATH"
        return
    fi

    log_step "Auto-detecting web server access log..."

    # "label|path-or-glob", highest priority first. For the HestiaCP entries
    # the glob itself (not the expansion) is stored in LOGPATH.
    local -a candidates=(
        "HestiaCP apache|/var/log/apache2/domains/*.log"
        "HestiaCP nginx|/var/log/nginx/domains/*.log"
        "nginx|/var/log/nginx/access.log"
        "apache2|/var/log/apache2/access.log"
        "httpd|/var/log/httpd/access_log"
    )

    local entry label pattern
    local -a expanded
    for entry in "${candidates[@]}"; do
        label=${entry%%|*}
        pattern=${entry#*|}
        # Unquoted on purpose: expand the glob. An unmatched glob stays
        # literal and then fails the -f test below.
        # shellcheck disable=SC2206
        expanded=( $pattern )
        if [[ -f "${expanded[0]:-}" ]]; then
            LOGPATH="$pattern"
            log_info "Detected ${label}: $LOGPATH"
            return
        fi
    done

    log_error "Could not auto-detect access log. Use --logpath to specify."
    exit 1
}

# ============================================================================
# REMOVE
# ============================================================================

#######################################
# Remove the jail and filter files and reload fail2ban. Never returns: it
# exits 0 in both the real and the dry-run path, so the caller cannot fall
# through into the install flow.
# Globals: DRY_RUN (read)
#######################################
do_remove() {
    local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf"
    local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf"

    log_step "Removing scraper-detect jail..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would remove $filter_file"
        log_info "[DRY RUN] Would remove $jail_file"
        log_info "[DRY RUN] Would reload fail2ban"
        # Previously `return`, which let `--remove --dry-run` continue into
        # the installation steps.
        exit 0
    fi

    if [[ -f "$jail_file" ]]; then
        rm -f -- "$jail_file"
        log_info "Removed: $jail_file"
    else
        log_warn "Jail config not found: $jail_file"
    fi

    if [[ -f "$filter_file" ]]; then
        rm -f -- "$filter_file"
        log_info "Removed: $filter_file"
    else
        log_warn "Filter not found: $filter_file"
    fi

    fail2ban-client reload
    sleep 2
    log_info "Fail2ban reloaded — scraper-detect jail removed"
    exit 0
}

# ============================================================================
# INSTALL FILTER
# ============================================================================

#######################################
# Write the filter definition to filter.d, backing up any existing file to
# <file>.bak. In dry-run mode the filter is only printed.
# Globals: DRY_RUN (read)
#######################################
install_filter() {
    local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf"

    log_step "Installing filter: $filter_file"

    if $DRY_RUN; then
        log_info "[DRY RUN] Would create $filter_file"
        echo ""
        generate_filter
        echo ""
        return
    fi

    if [[ -f "$filter_file" ]]; then
        log_warn "Filter already exists — backing up to ${filter_file}.bak"
        cp -- "$filter_file" "${filter_file}.bak"
    fi

    generate_filter > "$filter_file"
    log_info "Filter installed: $filter_file"
}

#######################################
# Print the fail2ban filter definition to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally. Every
# regex carries the <HOST> tag: it is the group fail2ban uses to extract the
# address to ban, and it was missing from the extracted copy of this file.
#######################################
generate_filter() {
    cat <<'EOF'
# Fail2ban filter to detect headless Chrome scrapers
# https://mylinux.work
#
# Catches three patterns that indicate automated scraping:
#
# 1. HTTP 499 — nginx-specific status meaning "client closed connection
#    before the server responded." Scrapers fire requests then drop them
#    once they've grabbed the HTML. Real users rarely trigger this.
#
# 2. HTTP 404 — not found. A single 404 is normal (mistyped URL). Many
#    404s in a short window indicate URL probing or stale scraper runs.
#
# 3. HeadlessChrome — Puppeteer/Playwright bots that don't bother hiding
#    the headless user agent. No legitimate browser sends this string.
#    Matched on any status code — headless Chrome is never a real user.
#
# Combined with maxretry in the jail, this catches bots that generate
# multiple errors quickly while ignoring the occasional human mistake.
# HeadlessChrome matches are instant (maxretry 1 would suffice) but
# the jail threshold still applies — a few hits trigger the ban.

[Definition]

# Match 499 (client dropped), 404 (not found), and HeadlessChrome UA
# Works with combined, common, and enriched (GeoIP) log formats
failregex = ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 499
            ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 404
            ^<HOST> .* ".*HeadlessChrome.*"

# Whitelist legitimate bots and monitoring tools
ignoreregex = ^<HOST> .* ".*(?:Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot|Slurp|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|Blackbox-Exporter|UptimeRobot|Pingdom|CensysInspect|Feedfetcher|Mediapartners-Google).*"

# Author: Phil Connor — https://mylinux.work
EOF
}

# ============================================================================
# INSTALL JAIL
# ============================================================================

#######################################
# Write the jail definition to jail.d, backing up any existing file to
# <file>.bak. In dry-run mode the jail config is only printed.
# Globals: DRY_RUN (read)
#######################################
install_jail() {
    local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf"

    log_step "Installing jail: $jail_file"

    if $DRY_RUN; then
        log_info "[DRY RUN] Would create $jail_file"
        echo ""
        generate_jail
        echo ""
        return
    fi

    if [[ -f "$jail_file" ]]; then
        log_warn "Jail config already exists — backing up to ${jail_file}.bak"
        cp -- "$jail_file" "${jail_file}.bak"
    fi

    generate_jail > "$jail_file"
    log_info "Jail config installed: $jail_file"
}

#######################################
# Print the jail definition to stdout.
# NOTE(review): reconstructed — the original heredoc was lost in extraction.
# The jail name, filter name and the settings fed from globals are fixed by
# the rest of the script; `port` and the exact `ignoreip` form are
# assumptions to confirm.
# Globals: LOGPATH, MAXRETRY, FINDTIME, BANTIME, IGNOREIP (read)
#######################################
generate_jail() {
    cat <<EOF
# Fail2ban jail to ban headless Chrome scrapers
# https://mylinux.work

[scraper-detect]
enabled  = true
port     = http,https
filter   = scraper-detect
logpath  = ${LOGPATH}
maxretry = ${MAXRETRY}
findtime = ${FINDTIME}
bantime  = ${BANTIME}
EOF

    # Only emitted when the user supplied addresses. Loopback is kept on the
    # list because a jail-level ignoreip replaces the inherited default.
    if [[ -n "$IGNOREIP" ]]; then
        printf 'ignoreip = 127.0.0.1/8 ::1 %s\n' "$IGNOREIP"
    fi
}

# ============================================================================
# RELOAD & VERIFY
# ============================================================================

#######################################
# Reload fail2ban and confirm the service is still active afterwards.
# NOTE(review): everything before the "Config test not available" warning is
# reconstructed; the probe is assumed to be `fail2ban-client -t` — confirm.
# Globals: DRY_RUN (read)
#######################################
reload_fail2ban() {
    log_step "Reloading Fail2ban..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would reload fail2ban"
        return
    fi

    if ! fail2ban-client -t &>/dev/null; then
        log_warn "Config test not available — reloading directly"
    fi

    fail2ban-client reload
    sleep 2

    if systemctl is-active --quiet fail2ban; then
        log_info "Fail2ban reloaded successfully"
    else
        log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
        exit 1
    fi
}

#######################################
# Confirm the scraper-detect jail is loaded by querying its status.
# Globals: DRY_RUN, LOGPATH (read)
# Exits 1 when the jail is not running.
#######################################
verify_jail() {
    log_step "Verifying scraper-detect jail..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would verify jail status"
        return
    fi

    echo ""
    if fail2ban-client status scraper-detect 2>/dev/null; then
        echo ""
        log_info "Scraper-detect jail is active and monitoring $LOGPATH"
    else
        log_error "Jail 'scraper-detect' is not running — check: fail2ban-client status"
        log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
        exit 1
    fi
}

#######################################
# Dry-run only: preview how the filter matches the first resolved log file by
# running fail2ban-regex against a temporary copy of the filter and printing
# the last 8 lines of its report.
# Globals: DRY_RUN, LOGPATH (read)
#######################################
test_against_logs() {
    if ! $DRY_RUN; then
        return
    fi

    # Deliberately unquoted so a glob LOGPATH expands.
    # shellcheck disable=SC2086,SC2206
    local matches=( $LOGPATH )
    if [[ ! -f "${matches[0]:-}" ]]; then
        return
    fi

    log_step "Testing filter against existing logs..."

    local tmp_filter
    tmp_filter=$(mktemp /tmp/scraper-detect-filter.XXXXXX)
    generate_filter > "$tmp_filter"

    echo ""
    # Run inside `if` so a fail2ban-regex failure (pipefail) neither aborts
    # the dry run nor skips the temp-file cleanup below.
    if ! fail2ban-regex "${matches[0]}" "$tmp_filter" 2>&1 | tail -8; then
        log_warn "fail2ban-regex reported an error — preview may be incomplete"
    fi
    rm -f -- "$tmp_filter"
    echo ""
}

# ============================================================================
# MAIN
# ============================================================================

#######################################
# Entry point: parse options, run pre-flight checks, then either remove the
# jail or install filter + jail, reload, verify and print a summary.
# Arguments: the script's command-line arguments
#######################################
main() {
    parse_args "$@"

    echo ""
    echo "============================================"
    echo " Fail2ban Scraper Detect v${VERSION}"
    echo " https://mylinux.work"
    echo "============================================"
    echo ""

    check_root
    check_fail2ban

    if $REMOVE; then
        do_remove   # exits; never falls through to the install steps
    fi

    detect_logpath
    test_against_logs
    install_filter
    install_jail
    reload_fail2ban
    verify_jail

    echo ""
    echo "============================================"
    echo " Setup Complete"
    echo "============================================"
    echo ""
    echo " Jail: scraper-detect"
    echo " Log: $LOGPATH"
    echo " Ban time: ${BANTIME}s ($(( BANTIME / 3600 ))h)"
    echo " Max retry: $MAXRETRY (499/404 errors before ban)"
    echo " Find time: ${FINDTIME}s ($(( FINDTIME / 60 ))m window)"
    if [[ -n "$IGNOREIP" ]]; then
        echo " Ignore: $IGNOREIP"
    fi
    echo ""
    echo " Useful commands:"
    echo " fail2ban-client status scraper-detect"
    echo " fail2ban-client set scraper-detect unbanip <IP>"
    echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
    echo ""
}

main "$@"