Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
This commit is contained in:
Executable
+504
@@ -0,0 +1,504 @@
|
||||
#!/bin/bash
|
||||
################################################################################
|
||||
# Script Name: add-fail2ban-scraper-detect.sh
|
||||
# Version: 1.1
|
||||
# Description: Adds a Fail2ban jail to detect and ban headless Chrome scrapers
|
||||
# that pass JavaScript challenges but exhibit bot behavior —
|
||||
# rapid 499 responses (connection abandoned mid-download),
|
||||
# high-frequency 404s (probing non-existent URLs), and
|
||||
# HeadlessChrome user agent strings (no real user). Complements
|
||||
# add-fail2ban-image-scraper.sh which catches no-referer image
|
||||
# grabs. This filter catches the next tier: bots running real
|
||||
# browsers (Puppeteer/Playwright) that execute JS, accept cookies,
|
||||
# and send proper referers but still behave differently from humans.
|
||||
#
|
||||
# Author: Phil Connor
|
||||
# Contact: contact@mylinux.work
|
||||
# Website: https://mylinux.work
|
||||
# License: MIT
|
||||
#
|
||||
# Usage:
|
||||
# sudo ./add-fail2ban-scraper-detect.sh
|
||||
# sudo ./add-fail2ban-scraper-detect.sh --logpath /var/log/nginx/access.log
|
||||
# sudo ./add-fail2ban-scraper-detect.sh --maxretry 5
|
||||
# sudo ./add-fail2ban-scraper-detect.sh --dry-run
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
|
||||
|
||||
# ============================================================================
|
||||
# DEFAULTS
|
||||
# ============================================================================
|
||||
|
||||
# Script identity — fixed for the lifetime of the process.
declare -r VERSION="1.1"
declare -r SCRIPT_NAME="${0##*/}"   # invoked name with any directory part removed

# Tunables; parse_args overwrites these from the command line.
LOGPATH="auto"     # "auto" makes detect_logpath probe the known log locations
BANTIME="86400"    # written to the jail as bantime
MAXRETRY="3"       # written to the jail as maxretry
FINDTIME="300"     # written to the jail as findtime
IGNOREIP=""        # when non-empty, written to the jail as ignoreip
DRY_RUN=false      # true/false; executed directly as a command in conditionals

# Colors: ANSI sequences kept as literal backslash text; the log_* helpers
# render them with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'
|
||||
|
||||
# ============================================================================
|
||||
# HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# All arguments are joined into one message; escapes are rendered by echo -e.
log_info() {
  local message="$*"
  echo -e "${GREEN}[INFO]${NC} ${message}"
}
|
||||
# Print a warning to stdout, prefixed with a yellow [WARN] tag.
# All arguments are joined into one message; escapes are rendered by echo -e.
log_warn() {
  local message="$*"
  echo -e "${YELLOW}[WARN]${NC} ${message}"
}
|
||||
# Print an error to stderr, prefixed with a red [ERROR] tag.
# All arguments are joined into one message; escapes are rendered by echo -e.
log_error() {
  local message="$*"
  echo -e "${RED}[ERROR]${NC} ${message}" >&2
}
|
||||
# Announce a step on stdout, prefixed with a cyan [STEP] tag.
# All arguments are joined into one message; escapes are rendered by echo -e.
log_step() {
  local message="$*"
  echo -e "${CYAN}[STEP]${NC} ${message}"
}
|
||||
|
||||
#######################################
# Print the help text to stdout and terminate the script.
# Globals:   SCRIPT_NAME (read)
# Arguments: $1 - optional exit status (default: 0). Lets a caller that shows
#                 the help because of a usage error exit non-zero.
# Outputs:   the help text on stdout
# Returns:   never returns; exits with the requested status
#######################################
show_usage() {
  local exit_status="${1:-0}"

  cat <<EOF
Usage: sudo $SCRIPT_NAME [OPTIONS]

Adds a Fail2ban jail to detect headless Chrome scrapers that pass JavaScript
cookie challenges but exhibit bot behavior patterns:

- HTTP 499 responses: client closed connection before the server finished
responding. Scrapers fire off multiple requests and abandon them once they
have what they need (grab HTML/CSS, drop the images).

- Rapid 404s: probing for URLs that don't exist — guessed slugs, path
enumeration, or following stale links at scraper speed.

These patterns are normal in small numbers from real users, but at volume
they indicate automated scraping. The maxretry threshold controls sensitivity.

This complements add-fail2ban-image-scraper.sh (no-referer image grabs).
That filter catches curl/wget-style bots. This one catches the next tier:
headless Chrome bots (Puppeteer, Playwright) that execute JavaScript and
mimic real browsers.

OPTIONS:
--logpath PATH Access log path (default: auto-detect)
--bantime SECS Ban duration in seconds (default: 86400 / 24 hours)
--maxretry NUM Errors before ban (default: 3)
--findtime SECS Window for counting errors (default: 300 / 5 min)
--ignoreip RANGES Space-separated IPs/CIDRs to whitelist (quoted)
--dry-run Show what would be done without making changes
--remove Remove the filter and jail
-h, --help Show this help message

WHAT IT CATCHES:
- Headless Chrome scrapers (Puppeteer, Playwright, Selenium)
- Bots that pass JS challenges but abandon image downloads (499)
- Automated URL probing at scraper speed (rapid 404s)
- HeadlessChrome user agent (no legitimate browser sends this)
- Chinese content farm scrapers cloning sites via Baidu Cache

WHAT IT IGNORES:
- Legitimate search engine crawlers (Googlebot, Bingbot, etc.)
- Monitoring probes (UptimeRobot, Pingdom, Blackbox-Exporter)
- Single 404 from a mistyped URL (below maxretry threshold)
- Occasional 499 from a user closing their browser tab

EXAMPLES:
# Install with defaults (ban after 3 errors in 5 minutes)
sudo $SCRIPT_NAME

# More lenient: ban after 5 errors
sudo $SCRIPT_NAME --maxretry 5

# Longer window: 5 errors in 15 minutes
sudo $SCRIPT_NAME --maxretry 5 --findtime 900

# Whitelist your monitoring server
sudo $SCRIPT_NAME --ignoreip "10.0.0.5 192.168.1.0/24"

# Preview without changes
sudo $SCRIPT_NAME --dry-run

# Remove the jail
sudo $SCRIPT_NAME --remove

EOF

  exit "$exit_status"
}
|
||||
|
||||
# ============================================================================
|
||||
# ARGUMENT PARSING
|
||||
# ============================================================================
|
||||
|
||||
# Set to true by parse_args when --remove is given; main then calls do_remove.
REMOVE=false
|
||||
|
||||
#######################################
# Parse command-line options into the script's global settings.
# Globals:   LOGPATH, BANTIME, MAXRETRY, FINDTIME, IGNOREIP, DRY_RUN, REMOVE
#            (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   error messages and, for an unknown option, the help text on stderr
# Returns:   0 when every argument was consumed; exits 0 after -h/--help and
#            exits 1 on an unknown option, a missing option value, or a
#            non-numeric value for --bantime/--maxretry/--findtime
#######################################
parse_args() {
  local opt value

  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --logpath | --bantime | --maxretry | --findtime | --ignoreip)
        # Check explicitly: under `set -u` a bare "$2" would abort with an
        # unhelpful "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          log_error "Option $opt requires a value"
          exit 1
        fi
        value="$2"

        case "$opt" in
          --logpath) LOGPATH="$value" ;;
          --ignoreip) IGNOREIP="$value" ;;
          --bantime)
            # A negative bantime is accepted (fail2ban treats it as permanent).
            if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
              log_error "Option $opt expects a whole number of seconds, got: $value"
              exit 1
            fi
            BANTIME="$value"
            ;;
          --maxretry | --findtime)
            # These values are later used in shell arithmetic, so reject
            # anything that is not plain digits.
            if [[ ! "$value" =~ ^[0-9]+$ ]]; then
              log_error "Option $opt expects a non-negative whole number, got: $value"
              exit 1
            fi
            if [[ "$opt" == "--maxretry" ]]; then
              MAXRETRY="$value"
            else
              FINDTIME="$value"
            fi
            ;;
        esac
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --remove)
        REMOVE=true
        shift
        ;;
      -h | --help)
        show_usage
        ;;
      *)
        log_error "Unknown option: $opt"
        # show_usage terminates with status 0; run it in a subshell so this
        # usage error can still be reported with a failing exit status.
        (show_usage) >&2 || true
        exit 1
        ;;
    esac
  done
}
|
||||
|
||||
# ============================================================================
|
||||
# CHECKS
|
||||
# ============================================================================
|
||||
|
||||
# Abort with an error unless the effective user is root (EUID 0).
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi

  log_error "This script must be run as root (sudo)"
  exit 1
}
|
||||
|
||||
# Ensure Fail2ban is usable: the client must resolve as a command and systemd
# must report the fail2ban unit as active. Exits 1 with an error otherwise.
check_fail2ban() {
  command -v fail2ban-client &>/dev/null || {
    log_error "Fail2ban is not installed"
    log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
    exit 1
  }

  systemctl is-active --quiet fail2ban || {
    log_error "Fail2ban is not running"
    exit 1
  }

  log_info "Fail2ban is installed and running"
}
|
||||
|
||||
# Resolve LOGPATH. An explicit value is only validated (its first glob match
# must be a regular file). "auto" probes a fixed list of known access-log
# locations in priority order and stores the first hit in LOGPATH.
# Exits 1 when nothing usable is found.
detect_logpath() {
  local -a expanded

  if [[ "$LOGPATH" != "auto" ]]; then
    # Unquoted on purpose so wildcards in a user-supplied path are expanded.
    # shellcheck disable=SC2206
    expanded=( $LOGPATH )
    if (( ${#expanded[@]} == 0 )) || [[ ! -f "${expanded[0]}" ]]; then
      log_error "Log file not found: $LOGPATH"
      exit 1
    fi
    log_info "Using specified log path: $LOGPATH"
    return
  fi

  log_step "Auto-detecting web server access log..."

  # "label|pattern" pairs, checked top to bottom: HestiaCP per-domain logs
  # first, then stock nginx, Debian/Ubuntu apache2, RHEL/Rocky httpd.
  local -a candidates=(
    "HestiaCP apache|/var/log/apache2/domains/*.log"
    "HestiaCP nginx|/var/log/nginx/domains/*.log"
    "nginx|/var/log/nginx/access.log"
    "apache2|/var/log/apache2/access.log"
    "httpd|/var/log/httpd/access_log"
  )
  local entry label pattern

  for entry in "${candidates[@]}"; do
    label="${entry%%|*}"
    pattern="${entry#*|}"
    # An unmatched glob stays as the literal pattern, which fails the -f test.
    # shellcheck disable=SC2206
    expanded=( $pattern )
    if [[ -f "${expanded[0]:-}" ]]; then
      LOGPATH="$pattern"
      log_info "Detected ${label}: $LOGPATH"
      return
    fi
  done

  log_error "Could not auto-detect access log. Use --logpath to specify."
  exit 1
}
|
||||
|
||||
# ============================================================================
|
||||
# REMOVE
|
||||
# ============================================================================
|
||||
|
||||
#######################################
# Remove the scraper-detect filter and jail, then reload Fail2ban.
# Globals:   DRY_RUN (read)
# Arguments: none
# Outputs:   progress messages via the log_* helpers
# Returns:   never returns; always exits 0 when finished. The dry-run path
#            exits too, so a caller cannot fall through into the install
#            steps after a "--remove --dry-run" preview.
#######################################
do_remove() {
  local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf"
  local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf"

  log_step "Removing scraper-detect jail..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would remove $filter_file"
    log_info "[DRY RUN] Would remove $jail_file"
    log_info "[DRY RUN] Would reload fail2ban"
    exit 0
  fi

  # The jail file is removed before the filter it refers to.
  if [[ -f "$jail_file" ]]; then
    rm -f -- "$jail_file"
    log_info "Removed: $jail_file"
  else
    log_warn "Jail config not found: $jail_file"
  fi

  if [[ -f "$filter_file" ]]; then
    rm -f -- "$filter_file"
    log_info "Removed: $filter_file"
  else
    log_warn "Filter not found: $filter_file"
  fi

  fail2ban-client reload
  sleep 2
  log_info "Fail2ban reloaded — scraper-detect jail removed"
  exit 0
}
|
||||
|
||||
# ============================================================================
|
||||
# INSTALL FILTER
|
||||
# ============================================================================
|
||||
|
||||
# Write the output of generate_filter to the Fail2ban filter directory.
# An existing filter is first copied to <file>.bak. In dry-run mode nothing is
# written; the would-be file content is printed between two blank lines.
install_filter() {
  local target="/etc/fail2ban/filter.d/scraper-detect.conf"

  log_step "Installing filter: $target"

  if ! $DRY_RUN; then
    if [[ -f "$target" ]]; then
      log_warn "Filter already exists — backing up to ${target}.bak"
      cp "$target" "${target}.bak"
    fi
    generate_filter > "$target"
    log_info "Filter installed: $target"
    return
  fi

  log_info "[DRY RUN] Would create $target"
  echo ""
  generate_filter
  echo ""
}
|
||||
|
||||
#######################################
# Emit the Fail2ban filter definition for the scraper-detect jail.
# Arguments: none
# Outputs:   the complete filter file on stdout (literal heredoc, nothing is
#            expanded)
# Returns:   0
#######################################
generate_filter() {
  # The 2nd and 3rd failregex patterns are continuation lines of the same
  # option. They must keep their leading whitespace: a pattern starting at
  # column 0 is not read as part of the failregex value and makes the file
  # unparsable.
  cat <<'EOF'
# Fail2ban filter to detect headless Chrome scrapers
# https://mylinux.work
#
# Catches three patterns that indicate automated scraping:
#
# 1. HTTP 499 — nginx-specific status meaning "client closed connection
# before the server responded." Scrapers fire requests then drop them
# once they've grabbed the HTML. Real users rarely trigger this.
#
# 2. HTTP 404 — not found. A single 404 is normal (mistyped URL). Many
# 404s in a short window indicate URL probing or stale scraper runs.
#
# 3. HeadlessChrome — Puppeteer/Playwright bots that don't bother hiding
# the headless user agent. No legitimate browser sends this string.
# Matched on any status code — headless Chrome is never a real user.
#
# Combined with maxretry in the jail, this catches bots that generate
# multiple errors quickly while ignoring the occasional human mistake.
# HeadlessChrome matches are instant (maxretry 1 would suffice) but
# the jail threshold still applies — a few hits trigger the ban.

[Definition]

# Match 499 (client dropped), 404 (not found), and HeadlessChrome UA
# Works with combined, common, and enriched (GeoIP) log formats
failregex = ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 499
            ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" 404
            ^<HOST> .* ".*HeadlessChrome.*"

# Whitelist legitimate bots and monitoring tools
ignoreregex = ^<HOST> .* ".*(?:Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot|Slurp|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|Blackbox-Exporter|UptimeRobot|Pingdom|CensysInspect|Feedfetcher|Mediapartners-Google).*"

# Author: Phil Connor — https://mylinux.work
EOF
}
|
||||
|
||||
# ============================================================================
|
||||
# INSTALL JAIL
|
||||
# ============================================================================
|
||||
|
||||
# Write the output of generate_jail to the Fail2ban jail.d directory.
# An existing jail file is first copied to <file>.bak. In dry-run mode nothing
# is written; the would-be file content is printed between two blank lines.
install_jail() {
  local target="/etc/fail2ban/jail.d/scraper-detect.conf"

  log_step "Installing jail: $target"

  if ! $DRY_RUN; then
    if [[ -f "$target" ]]; then
      log_warn "Jail config already exists — backing up to ${target}.bak"
      cp "$target" "${target}.bak"
    fi
    generate_jail > "$target"
    log_info "Jail config installed: $target"
    return
  fi

  log_info "[DRY RUN] Would create $target"
  echo ""
  generate_jail
  echo ""
}
|
||||
|
||||
# Emit the jail definition on stdout: a fixed header and section, then the
# configurable settings read from LOGPATH, MAXRETRY, FINDTIME and BANTIME, an
# ignoreip line only when IGNOREIP is non-empty, and the author footer.
generate_jail() {
  # Static part — contains nothing to expand, so the heredoc is literal.
  cat <<'JAIL_HEADER'
# Fail2ban jail to detect headless Chrome scrapers
# https://mylinux.work
#
# Bans IPs that generate excessive 499 (connection dropped) or 404 (not
# found) responses. These patterns indicate automated scraping by headless
# Chrome bots (Puppeteer, Playwright) that pass JS challenges but behave
# differently from real users.
#
# Default: 3 errors in 5 minutes = 24 hour ban.
# Real users rarely hit 3 errors in 5 minutes. Adjust maxretry up
# if you see false positives.

[scraper-detect]
enabled = true
port = http,https
filter = scraper-detect
JAIL_HEADER

  # Configurable part, one "key = value" line per setting.
  printf 'logpath = %s\n' "$LOGPATH"
  printf 'maxretry = %s\n' "$MAXRETRY"
  printf 'findtime = %s\n' "$FINDTIME"
  printf 'bantime = %s\n' "$BANTIME"

  if [[ -n "$IGNOREIP" ]]; then
    printf 'ignoreip = %s\n' "$IGNOREIP"
  fi

  # Footer: one blank line, then the author comment.
  printf '\n%s\n' '# Author: Phil Connor — https://mylinux.work'
}
|
||||
|
||||
# ============================================================================
|
||||
# RELOAD AND VERIFY
|
||||
# ============================================================================
|
||||
|
||||
#######################################
# Validate the Fail2ban configuration (when the client supports it), reload
# Fail2ban and confirm the service is still active.
# Globals:   DRY_RUN (read)
# Arguments: none
# Outputs:   progress messages; on a failed config test, the client's output
#            on stderr
# Returns:   0 on success (or dry run); exits 1 when the config test fails,
#            the reload command fails, or the service is not active afterwards
#######################################
reload_fail2ban() {
  log_step "Reloading Fail2ban..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would reload fail2ban"
    return
  fi

  # Decide from the help text whether --test exists, so a client without the
  # option is not confused with a client that rejected the configuration.
  local help_text test_output
  help_text=$(fail2ban-client --help 2>&1 || true)

  if [[ "$help_text" == *--test* ]]; then
    if ! test_output=$(fail2ban-client --test 2>&1); then
      # Reloading a configuration that failed validation can only make things
      # worse; show why it failed and stop.
      log_error "Fail2ban configuration test failed — not reloading"
      printf '%s\n' "$test_output" >&2
      exit 1
    fi
    log_info "Fail2ban configuration test passed"
  else
    log_warn "Config test not available — reloading directly"
  fi

  if ! fail2ban-client reload; then
    log_error "fail2ban-client reload failed — check: journalctl -u fail2ban"
    exit 1
  fi
  sleep 2   # give the service a moment before checking its state

  if systemctl is-active --quiet fail2ban; then
    log_info "Fail2ban reloaded successfully"
  else
    log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
    exit 1
  fi
}
|
||||
|
||||
# Confirm the scraper-detect jail is loaded by asking fail2ban-client for its
# status (printed between blank lines). Exits 1 with debugging hints when the
# status query fails. Does nothing beyond a notice in dry-run mode.
verify_jail() {
  log_step "Verifying scraper-detect jail..."

  if $DRY_RUN; then
    log_info "[DRY RUN] Would verify jail status"
    return
  fi

  echo ""
  if ! fail2ban-client status scraper-detect 2>/dev/null; then
    log_error "Jail 'scraper-detect' is not running — check: fail2ban-client status"
    log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
    exit 1
  fi

  echo ""
  log_info "Scraper-detect jail is active and monitoring $LOGPATH"
}
|
||||
|
||||
#######################################
# Dry-run preview: run the generated filter against the first existing log
# file with fail2ban-regex and print the last 8 lines of its report.
# Globals:   DRY_RUN, LOGPATH, TMPDIR (read)
# Arguments: none
# Outputs:   the tail of the fail2ban-regex report, framed by blank lines
# Returns:   0 when skipped (not a dry run, or no log file) and after a
#            preview, even if fail2ban-regex itself reported a failure;
#            1 only when the temporary filter could not be created/written
#######################################
test_against_logs() {
  $DRY_RUN || return 0

  # Unquoted on purpose so wildcards in LOGPATH are expanded.
  # shellcheck disable=SC2206
  local -a matches=( $LOGPATH )
  [[ -f "${matches[0]:-}" ]] || return 0

  log_step "Testing filter against existing logs..."

  local tmp_filter
  tmp_filter=$(mktemp "${TMPDIR:-/tmp}/scraper-detect-filter.XXXXXX") || return 1
  if ! generate_filter > "$tmp_filter"; then
    rm -f -- "$tmp_filter"
    return 1
  fi

  echo ""
  # Run inside `if` so that, under `set -e` + pipefail, a non-zero
  # fail2ban-regex (or a missing binary) neither aborts the dry run nor
  # skips the temp-file cleanup below.
  if ! fail2ban-regex "${matches[0]}" "$tmp_filter" 2>&1 | tail -8; then
    log_warn "fail2ban-regex did not finish cleanly — the preview above may be incomplete"
  fi
  rm -f -- "$tmp_filter"
  echo ""
}
|
||||
|
||||
# ============================================================================
|
||||
# MAIN
|
||||
# ============================================================================
|
||||
|
||||
#######################################
# Entry point: parse options, run the pre-flight checks, then either remove
# the jail or install the filter and jail, reload Fail2ban and print a summary.
# Globals:   VERSION, REMOVE, DRY_RUN, LOGPATH, BANTIME, MAXRETRY, FINDTIME,
#            IGNOREIP (read)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   banner, progress messages and the final summary on stdout
# Returns:   0 on success; the helper functions exit non-zero on failure
#######################################
main() {
  parse_args "$@"

  echo ""
  echo "============================================"
  echo " Fail2ban Scraper Detect v${VERSION}"
  echo " https://mylinux.work"
  echo "============================================"
  echo ""

  check_root
  check_fail2ban

  if $REMOVE; then
    do_remove
    # do_remove exits by itself after a real removal but may return after a
    # dry run; stop here either way so removal never continues into install.
    return 0
  fi

  detect_logpath
  test_against_logs
  install_filter
  install_jail
  reload_fail2ban
  verify_jail

  # Do not claim a completed setup when nothing was changed.
  local title="Setup Complete"
  if $DRY_RUN; then
    title="Dry Run Complete — no changes made"
  fi

  # Only plain digits go through shell arithmetic (base 10 forced so a
  # leading zero is not read as octal); anything else is shown verbatim.
  local ban_display="$BANTIME"
  local find_display="$FINDTIME"
  if [[ "$BANTIME" =~ ^[0-9]+$ ]]; then
    ban_display="${BANTIME}s ($(( 10#$BANTIME / 3600 ))h)"
  fi
  if [[ "$FINDTIME" =~ ^[0-9]+$ ]]; then
    find_display="${FINDTIME}s ($(( 10#$FINDTIME / 60 ))m window)"
  fi

  echo ""
  echo "============================================"
  echo " ${title}"
  echo "============================================"
  echo ""
  echo " Jail: scraper-detect"
  echo " Log: $LOGPATH"
  echo " Ban time: ${ban_display}"
  echo " Max retry: $MAXRETRY (499/404 errors before ban)"
  echo " Find time: ${find_display}"
  if [[ -n "$IGNOREIP" ]]; then
    echo " Ignore: $IGNOREIP"
  fi
  echo ""
  echo " Useful commands:"
  echo " fail2ban-client status scraper-detect"
  echo " fail2ban-client set scraper-detect unbanip <IP>"
  echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf"
  echo ""
}
|
||||
|
||||
# Script entry point: hand every command-line argument to main unchanged.
main "$@"
|
||||
Reference in New Issue
Block a user