linux-scripts/add-fail2ban-scraper-detect.sh

#!/bin/bash
################################################################################
# Script Name: add-fail2ban-scraper-detect.sh
# Version: 1.1
# Description: Adds a Fail2ban jail to detect and ban headless Chrome scrapers
#              that pass JavaScript challenges but exhibit bot behavior —
#              rapid 499 responses (connection abandoned mid-download),
#              high-frequency 404s (probing non-existent URLs), and
#              HeadlessChrome user agent strings (no real user). Complements
#              add-fail2ban-image-scraper.sh which catches no-referer image
#              grabs. This filter catches the next tier: bots running real
#              browsers (Puppeteer/Playwright) that execute JS, accept cookies,
#              and send proper referers but still behave differently from humans.
#
# Author:  Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Usage:
#   sudo ./add-fail2ban-scraper-detect.sh
#   sudo ./add-fail2ban-scraper-detect.sh --logpath /var/log/nginx/access.log
#   sudo ./add-fail2ban-scraper-detect.sh --maxretry 5
#   sudo ./add-fail2ban-scraper-detect.sh --dry-run
#
################################################################################

# Strict mode: abort on unhandled errors, on use of unset variables, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ============================================================================
# DEFAULTS
# ============================================================================

# Script identity; frozen so nothing later in the run can overwrite it.
VERSION="1.1"
SCRIPT_NAME="${0##*/}"
readonly VERSION SCRIPT_NAME

# Tunables. Each one can be overridden by the matching command-line option.
LOGPATH="auto"      # "auto" makes detect_logpath probe well-known locations
BANTIME="86400"     # written to the jail as bantime
MAXRETRY="3"        # written to the jail as maxretry
FINDTIME="300"      # written to the jail as findtime
IGNOREIP=""         # empty means no ignoreip line is written to the jail
DRY_RUN=false       # holds the command name true/false; executed as a test

# Colors
# Stored as literal backslash sequences; the log helpers print them with
# `echo -e`, which turns them into real ANSI escapes.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Shared emitter for the log helpers below.
#   $1 - colour escape sequence, $2 - tag text, remaining args - the message.
# Uses `echo -e`, so backslash sequences in the message are interpreted too.
_log_emit() {
    local color="$1" label="$2"
    shift 2
    echo -e "${color}[${label}]${NC} $*"
}

# Informational message on stdout.
log_info() {
    _log_emit "$GREEN" "INFO" "$@"
}

# Warning on stdout.
log_warn() {
    _log_emit "$YELLOW" "WARN" "$@"
}

# Error on stderr.
log_error() {
    _log_emit "$RED" "ERROR" "$@" >&2
}

# Progress marker on stdout.
log_step() {
    _log_emit "$CYAN" "STEP" "$@"
}

# Print the full help text to stdout, then terminate the script with status 0.
# Reads SCRIPT_NAME so the examples show the name the script was invoked as.
show_usage() {
    cat <<USAGE_TEXT
Usage: sudo ${SCRIPT_NAME} [OPTIONS]

Adds a Fail2ban jail to detect headless Chrome scrapers that pass JavaScript
cookie challenges but exhibit bot behavior patterns:

  - HTTP 499 responses: client closed connection before the server finished
    responding. Scrapers fire off multiple requests and abandon them once they
    have what they need (grab HTML/CSS, drop the images).

  - Rapid 404s: probing for URLs that don't exist — guessed slugs, path
    enumeration, or following stale links at scraper speed.

These patterns are normal in small numbers from real users, but at volume
they indicate automated scraping. The maxretry threshold controls sensitivity.

This complements add-fail2ban-image-scraper.sh (no-referer image grabs).
That filter catches curl/wget-style bots. This one catches the next tier:
headless Chrome bots (Puppeteer, Playwright) that execute JavaScript and
mimic real browsers.

OPTIONS:
    --logpath PATH      Access log path (default: auto-detect)
    --bantime SECS      Ban duration in seconds (default: 86400 / 24 hours)
    --maxretry NUM      Errors before ban (default: 3)
    --findtime SECS     Window for counting errors (default: 300 / 5 min)
    --ignoreip RANGES   Space-separated IPs/CIDRs to whitelist (quoted)
    --dry-run           Show what would be done without making changes
    --remove            Remove the filter and jail
    -h, --help          Show this help message

WHAT IT CATCHES:
    - Headless Chrome scrapers (Puppeteer, Playwright, Selenium)
    - Bots that pass JS challenges but abandon image downloads (499)
    - Automated URL probing at scraper speed (rapid 404s)
    - HeadlessChrome user agent (no legitimate browser sends this)
    - Chinese content farm scrapers cloning sites via Baidu Cache

WHAT IT IGNORES:
    - Legitimate search engine crawlers (Googlebot, Bingbot, etc.)
    - Monitoring probes (UptimeRobot, Pingdom, Blackbox-Exporter)
    - Single 404 from a mistyped URL (below maxretry threshold)
    - Occasional 499 from a user closing their browser tab

EXAMPLES:
    # Install with defaults (ban after 3 errors in 5 minutes)
    sudo ${SCRIPT_NAME}

    # More lenient: ban after 5 errors
    sudo ${SCRIPT_NAME} --maxretry 5

    # Longer window: 5 errors in 15 minutes
    sudo ${SCRIPT_NAME} --maxretry 5 --findtime 900

    # Whitelist your monitoring server
    sudo ${SCRIPT_NAME} --ignoreip "10.0.0.5 192.168.1.0/24"

    # Preview without changes
    sudo ${SCRIPT_NAME} --dry-run

    # Remove the jail
    sudo ${SCRIPT_NAME} --remove

USAGE_TEXT
    exit 0
}

# ============================================================================
# ARGUMENT PARSING
# ============================================================================

# Set to true by --remove; main then removes the jail instead of installing it.
REMOVE=false

# Parse command-line options into the global settings.
#
# Globals written: LOGPATH, BANTIME, MAXRETRY, FINDTIME, IGNOREIP, DRY_RUN,
#                  REMOVE
# Arguments:       the script's command line ("$@")
# Exits:           0 after printing help for -h/--help;
#                  1 for an unknown option, a value-taking option given
#                  without its value, or a --maxretry that is not a positive
#                  integer.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --logpath|--bantime|--maxretry|--findtime|--ignoreip)
                # Without this guard a trailing "--logpath" would die with a
                # raw "unbound variable" error under `set -u`.
                if [[ $# -lt 2 ]]; then
                    log_error "Option $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    --logpath)   LOGPATH="$2" ;;
                    --bantime)   BANTIME="$2" ;;
                    --maxretry)
                        # Reject values that would produce an unusable jail
                        # before anything is written to disk.
                        if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
                            log_error "--maxretry must be a positive integer: $2"
                            exit 1
                        fi
                        MAXRETRY="$2"
                        ;;
                    --findtime)  FINDTIME="$2" ;;
                    --ignoreip)  IGNOREIP="$2" ;;
                esac
                shift 2
                ;;
            --dry-run)   DRY_RUN=true; shift ;;
            --remove)    REMOVE=true; shift ;;
            -h|--help)   show_usage ;;
            *)
                log_error "Unknown option: $1"
                # show_usage ends with `exit 0`; run it in a subshell so the
                # help is still shown but this error path exits non-zero.
                ( show_usage ) >&2
                exit 1
                ;;
        esac
    done
}

# ============================================================================
# CHECKS
# ============================================================================

# Abort with status 1 unless the effective user is root (EUID 0).
check_root() {
    [[ "$EUID" -eq 0 ]] && return 0

    log_error "This script must be run as root (sudo)"
    exit 1
}

# Confirm that fail2ban-client is on PATH and that systemd reports the
# fail2ban unit as active. Exits 1 with an explanation if either check fails.
check_fail2ban() {
    command -v fail2ban-client &>/dev/null || {
        log_error "Fail2ban is not installed"
        log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
        exit 1
    }

    systemctl is-active --quiet fail2ban || {
        log_error "Fail2ban is not running"
        exit 1
    }

    log_info "Fail2ban is installed and running"
}

# Decide which access log the jail will watch and leave the answer in LOGPATH.
#
# With an explicit --logpath the value is only validated: it is expanded as a
# glob and the first result must be a regular file. With LOGPATH="auto" a
# fixed list of well-known locations is probed in order and the first hit
# wins. Exits 1 if nothing usable is found.
detect_logpath() {
    if [[ "$LOGPATH" != "auto" ]]; then
        # Left unquoted on purpose so a pattern such as
        # /var/log/nginx/domains/*.log is expanded by the shell.
        # shellcheck disable=SC2086
        local expanded=( $LOGPATH )
        if (( ${#expanded[@]} == 0 )) || [[ ! -f "${expanded[0]}" ]]; then
            log_error "Log file not found: $LOGPATH"
            exit 1
        fi
        log_info "Using specified log path: $LOGPATH"
        return
    fi

    log_step "Auto-detecting web server access log..."

    # "label|path-or-glob", probed top to bottom:
    # HestiaCP per-domain logs first, then the stock nginx / Debian-style
    # apache2 / RHEL-style httpd locations.
    local -a candidates=(
        "HestiaCP apache|/var/log/apache2/domains/*.log"
        "HestiaCP nginx|/var/log/nginx/domains/*.log"
        "nginx|/var/log/nginx/access.log"
        "apache2|/var/log/apache2/access.log"
        "httpd|/var/log/httpd/access_log"
    )

    local entry label pattern
    local -a hits
    for entry in "${candidates[@]}"; do
        label="${entry%%|*}"
        pattern="${entry#*|}"
        # Unquoted so globs expand; an unmatched glob stays literal and then
        # fails the -f test below.
        # shellcheck disable=SC2206
        hits=( $pattern )
        if [[ -f "${hits[0]:-}" ]]; then
            # Keep the pattern itself (not the expansion) as the log path.
            LOGPATH="$pattern"
            log_info "Detected $label: $LOGPATH"
            return
        fi
    done

    log_error "Could not auto-detect access log. Use --logpath to specify."
    exit 1
}

# ============================================================================
# REMOVE
# ============================================================================

# Remove the scraper-detect jail and filter, reload Fail2ban, and terminate.
#
# This function never returns to its caller: it exits 0 both after a real
# removal and after a dry-run preview. Returning from the dry-run branch would
# let the caller carry on into the install steps, so `--remove --dry-run`
# would preview an installation as well.
#
# Globals read: DRY_RUN
do_remove() {
    local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf"
    local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf"

    log_step "Removing scraper-detect jail..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would remove $filter_file"
        log_info "[DRY RUN] Would remove $jail_file"
        log_info "[DRY RUN] Would reload fail2ban"
        exit 0
    fi

    # The jail goes first so it no longer references the filter being deleted.
    if [[ -f "$jail_file" ]]; then
        rm -f -- "$jail_file"
        log_info "Removed: $jail_file"
    else
        log_warn "Jail config not found: $jail_file"
    fi

    if [[ -f "$filter_file" ]]; then
        rm -f -- "$filter_file"
        log_info "Removed: $filter_file"
    else
        log_warn "Filter not found: $filter_file"
    fi

    fail2ban-client reload
    sleep 2
    log_info "Fail2ban reloaded — scraper-detect jail removed"
    exit 0
}

# ============================================================================
# INSTALL FILTER
# ============================================================================

# Write the scraper-detect filter definition into Fail2ban's filter.d.
#
# In dry-run mode the would-be file content is printed instead. An existing
# filter is copied to <file>.bak before being overwritten.
#
# Globals read: DRY_RUN
install_filter() {
    local target="/etc/fail2ban/filter.d/scraper-detect.conf"

    log_step "Installing filter: $target"

    if ! $DRY_RUN; then
        if [[ -f "$target" ]]; then
            log_warn "Filter already exists — backing up to ${target}.bak"
            cp "$target" "${target}.bak"
        fi

        generate_filter > "$target"
        log_info "Filter installed: $target"
        return
    fi

    # Preview only: show the content framed by blank lines.
    log_info "[DRY RUN] Would create $target"
    echo ""
    generate_filter
    echo ""
}

generate_filter() {
    cat <<'EOF'
# Fail2ban filter to detect headless Chrome scrapers
# https://mylinux.work
#
# Catches three patterns that indicate automated scraping:
#
# 1. HTTP 499 — nginx-specific status meaning "client closed connection
#    before the server responded." Scrapers fire requests then drop them
#    once they've grabbed the HTML. Real users rarely trigger this.
#
# 2. HTTP 404 — not found. A single 404 is normal (mistyped URL). Many
#    404s in a short window indicate URL probing or stale scraper runs.
#
# 3. HeadlessChrome — Puppeteer/Playwright bots that don't bother hiding
#    the headless user agent. No legitimate browser sends this string.
#    Matched on any status code — headless Chrome is never a real user.
#
# Combined with maxretry in the jail, this catches bots that generate
# multiple errors quickly while ignoring the occasional human mistake.
# HeadlessChrome matches are instant (maxretry 1 would suffice) but
# the jail threshold still applies — a few hits trigger the ban.

[Definition]

# Match 499 (client dropped), 404 (not found), and HeadlessChrome UA
# Works with combined, common, and enriched (GeoIP) log formats
failregex = ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 499
            ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 404
            ^<HOST> .* ".*HeadlessChrome.*"

# Whitelist legitimate bots and monitoring tools
ignoreregex = ^<HOST> .* ".*(?:Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot|Slurp|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|Blackbox-Exporter|UptimeRobot|Pingdom|CensysInspect|Feedfetcher|Mediapartners-Google).*"

# Author: Phil Connor — https://mylinux.work
EOF
}

# ============================================================================
# INSTALL JAIL
# ============================================================================

# Write the scraper-detect jail definition into Fail2ban's jail.d.
#
# In dry-run mode the would-be file content is printed instead. An existing
# jail file is copied to <file>.bak before being overwritten.
#
# Globals read: DRY_RUN
install_jail() {
    local target="/etc/fail2ban/jail.d/scraper-detect.conf"

    log_step "Installing jail: $target"

    if ! $DRY_RUN; then
        if [[ -f "$target" ]]; then
            log_warn "Jail config already exists — backing up to ${target}.bak"
            cp "$target" "${target}.bak"
        fi

        generate_jail > "$target"
        log_info "Jail config installed: $target"
        return
    fi

    # Preview only: show the content framed by blank lines.
    log_info "[DRY RUN] Would create $target"
    echo ""
    generate_jail
    echo ""
}

# Emit the jail definition on stdout, filled in from the current settings.
#
# Globals read: LOGPATH, MAXRETRY, FINDTIME, BANTIME, IGNOREIP
# An ignoreip line is written only when IGNOREIP is non-empty.
generate_jail() {
    # Unquoted delimiter: the settings below are expanded into the text.
    cat <<JAIL_CONF
# Fail2ban jail to detect headless Chrome scrapers
# https://mylinux.work
#
# Bans IPs that generate excessive 499 (connection dropped) or 404 (not
# found) responses. These patterns indicate automated scraping by headless
# Chrome bots (Puppeteer, Playwright) that pass JS challenges but behave
# differently from real users.
#
# Default: 3 errors in 5 minutes = 24 hour ban.
# Real users rarely hit 3 errors in 5 minutes. Adjust maxretry up
# if you see false positives.

[scraper-detect]
enabled  = true
port     = http,https
filter   = scraper-detect
logpath  = ${LOGPATH}
maxretry = ${MAXRETRY}
findtime = ${FINDTIME}
bantime  = ${BANTIME}
JAIL_CONF

    [[ -z "$IGNOREIP" ]] || printf 'ignoreip = %s\n' "$IGNOREIP"

    # Trailer: one blank line, then the attribution comment.
    printf '\n%s\n' '# Author: Phil Connor — https://mylinux.work'
}

# ============================================================================
# RELOAD AND VERIFY
# ============================================================================

# Reload Fail2ban so the new filter and jail take effect.
#
# A configuration test (`fail2ban-client --test`) is attempted first. It is
# best-effort: a failure does not stop the reload, but its output is shown
# rather than discarded so a real configuration error is not mistaken for
# "option not supported".
#
# Globals read: DRY_RUN
# Exits:        1 if the reload command fails or the service is not active
#               afterwards.
reload_fail2ban() {
    log_step "Reloading Fail2ban..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would reload fail2ban"
        return
    fi

    local test_output
    if test_output=$(fail2ban-client --test 2>&1); then
        if [[ -n "$test_output" ]]; then
            printf '%s\n' "$test_output"
        fi
    else
        log_warn "Config test failed or is not available — reloading anyway"
        if [[ -n "$test_output" ]]; then
            printf '%s\n' "$test_output" >&2
        fi
    fi

    # Checked explicitly so the failure carries a hint instead of a bare
    # `set -e` abort.
    if ! fail2ban-client reload; then
        log_error "Fail2ban reload failed — check: journalctl -u fail2ban"
        exit 1
    fi
    # Brief pause before checking the service state.
    sleep 2

    if systemctl is-active --quiet fail2ban; then
        log_info "Fail2ban reloaded successfully"
    else
        log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
        exit 1
    fi
}

# Ask Fail2ban for the status of the scraper-detect jail and report on it.
# Exits 1 with debugging hints when the status query fails.
#
# Globals read: DRY_RUN, LOGPATH
verify_jail() {
    log_step "Verifying scraper-detect jail..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would verify jail status"
        return
    fi

    echo ""
    # The status output itself is shown; only the client's stderr is muted.
    if ! fail2ban-client status scraper-detect 2>/dev/null; then
        log_error "Jail 'scraper-detect' is not running — check: fail2ban-client status"
        log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
        exit 1
    fi

    echo ""
    log_info "Scraper-detect jail is active and monitoring $LOGPATH"
}

# Dry-run preview: run the generated filter against the first matching log
# file with fail2ban-regex and show the last 8 lines of its report.
#
# Does nothing outside dry-run mode or when LOGPATH does not expand to an
# existing regular file. The temporary filter file is removed on every path,
# including when fail2ban-regex fails; that failure is then passed back as
# the return status so the caller's `set -e` still stops the run.
#
# Globals read: DRY_RUN, LOGPATH
# Returns:      0 on success or when skipped; otherwise the failing status.
test_against_logs() {
    $DRY_RUN || return 0

    # Unquoted on purpose so a glob in LOGPATH is expanded.
    # shellcheck disable=SC2086,SC2206
    local matches=( $LOGPATH )
    # ":-" keeps an empty expansion from tripping `set -u`.
    [[ -f "${matches[0]:-}" ]] || return 0

    log_step "Testing filter against existing logs..."

    local tmp_filter
    tmp_filter=$(mktemp /tmp/scraper-detect-filter.XXXXXX)

    # Capture the status instead of letting `set -e` abort mid-function,
    # which would skip the cleanup below and leave the temp file behind.
    local status=0
    if generate_filter > "$tmp_filter"; then
        echo ""
        fail2ban-regex "${matches[0]}" "$tmp_filter" 2>&1 | tail -8 || status=$?
    else
        status=$?
    fi

    rm -f -- "$tmp_filter"

    if (( status != 0 )); then
        return "$status"
    fi
    echo ""
}

# ============================================================================
# MAIN
# ============================================================================

# Script entry point: parse options, run the preflight checks, then either
# remove the jail (--remove) or install the filter and jail, reload Fail2ban,
# verify the jail and print a summary.
#
# Arguments:    the script's command line ("$@")
# Globals read: VERSION, REMOVE, DRY_RUN, LOGPATH, BANTIME, MAXRETRY,
#               FINDTIME, IGNOREIP
main() {
    parse_args "$@"

    echo ""
    echo "============================================"
    echo "  Fail2ban Scraper Detect v${VERSION}"
    echo "  https://mylinux.work"
    echo "============================================"
    echo ""

    check_root
    check_fail2ban

    if $REMOVE; then
        do_remove
        # do_remove returns (rather than exits) after a dry-run preview; stop
        # here so a removal never falls through into the install steps.
        return 0
    fi

    detect_logpath
    test_against_logs
    install_filter
    install_jail
    reload_fail2ban
    verify_jail

    # Add the hour/minute conversions only for plain integers. Anything else
    # (for example a value like "1d") would make the arithmetic expansion
    # fail and abort the script after the install had already completed.
    # The 10# prefix stops a leading zero from being read as octal.
    local ban_summary="$BANTIME"
    local find_summary="$FINDTIME"
    if [[ "$BANTIME" =~ ^[0-9]+$ ]]; then
        ban_summary="${BANTIME}s ($(( 10#$BANTIME / 3600 ))h)"
    fi
    if [[ "$FINDTIME" =~ ^[0-9]+$ ]]; then
        find_summary="${FINDTIME}s ($(( 10#$FINDTIME / 60 ))m window)"
    fi

    # A dry run changed nothing, so do not announce a completed setup.
    local heading="Setup Complete"
    if $DRY_RUN; then
        heading="Dry Run Complete (no changes made)"
    fi

    echo ""
    echo "============================================"
    echo "  $heading"
    echo "============================================"
    echo ""
    echo "  Jail:      scraper-detect"
    echo "  Log:       $LOGPATH"
    echo "  Ban time:  $ban_summary"
    echo "  Max retry: $MAXRETRY (499/404 errors before ban)"
    echo "  Find time: $find_summary"
    if [[ -n "$IGNOREIP" ]]; then
        echo "  Ignore:    $IGNOREIP"
    fi
    echo ""
    echo "  Useful commands:"
    echo "    fail2ban-client status scraper-detect"
    echo "    fail2ban-client set scraper-detect unbanip <IP>"
    echo "    fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
    echo ""
}

# Run the script: hand every command-line argument, unmodified, to main.
main "$@"