Sync all scripts from website downloads — 352 scripts total

Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00
parent dbd6bf0324
commit a1a17e81a1
332 changed files with 174509 additions and 1106 deletions
@@ -0,0 +1,474 @@
+#!/bin/bash
+################################################################################
+# Script Name: add-fail2ban-image-scraper.sh
+# Version: 1.1
+# Description: Adds a Fail2ban jail to block image scrapers — bots that
+#              directly request image files with no referer. Real browsers
+#              always send a referer when loading images (the page containing
+#              the <img> tag). Direct image requests with no referer are
+#              almost always scrapers harvesting images for AI training
+#              datasets or content theft.
+#
+# Author:  Phil Connor
+# Contact: contact@mylinux.work
+# Website: https://mylinux.work
+# License: MIT
+#
+# Usage:
+#   sudo ./add-fail2ban-image-scraper.sh
+#   sudo ./add-fail2ban-image-scraper.sh --logpath /var/log/nginx/access.log
+#   sudo ./add-fail2ban-image-scraper.sh --maxretry 3
+#   sudo ./add-fail2ban-image-scraper.sh --dry-run
+#
+################################################################################
+
+set -euo pipefail
+
+# ============================================================================
+# DEFAULTS
+# ============================================================================
+
+readonly VERSION="1.1"
+readonly SCRIPT_NAME="${0##*/}"
+
+LOGPATH="auto"
+BANTIME="86400"
+MAXRETRY="5"
+FINDTIME="300"
+IGNOREIP=""
+DRY_RUN=false
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+
+log_info()  { echo -e "${GREEN}[INFO]${NC} $*"; }
+log_warn()  { echo -e "${YELLOW}[WARN]${NC} $*"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
+log_step()  { echo -e "${CYAN}[STEP]${NC} $*"; }
+
+# Print the help text to stdout and terminate the script with status 0.
+# The here-document is unquoted so $SCRIPT_NAME is expanded in the output.
+# Note: the "WHAT IT IGNORES" section only lists exemptions the generated
+# filter/jail really implement — crawlers are exempt only when whitelisted.
+show_usage() {
+    cat <<EOF
+Usage: sudo $SCRIPT_NAME [OPTIONS]
+
+Adds an image scraper blocking jail to an existing Fail2ban installation.
+Catches bots that directly request image files (.png, .jpg, .webp, etc.)
+with no referer — a pattern that indicates scraping rather than normal
+browsing (real browsers always send the page URL as referer when loading
+images embedded in a page).
+
+OPTIONS:
+    --logpath PATH      Access log path (default: auto-detect)
+    --bantime SECS      Ban duration in seconds (default: 86400 / 24 hours)
+    --maxretry NUM      Image requests before ban (default: 5)
+    --findtime SECS     Window for counting requests (default: 300 / 5 min)
+    --ignoreip RANGES   Space-separated IPs/CIDRs to whitelist (quoted)
+    --ignore-search     Shorthand: whitelist Google + Bing crawlers
+    --dry-run           Show what would be done without making changes
+    --remove            Remove the filter and jail
+    -h, --help          Show this help message
+
+WHAT IT CATCHES:
+    - Direct GET requests for image files with no referer
+    - AI training dataset scrapers (img2dataset and similar)
+    - Content theft bots harvesting images
+    - Scrapers rotating through cloud IPs with spoofed user agents
+
+WHAT IT IGNORES:
+    - Images loaded normally via a web page (have referer)
+    - Whitelisted addresses (--ignoreip / --ignore-search); search engine
+      image crawlers are NOT exempt unless whitelisted this way
+    - Single direct image access (below maxretry threshold)
+    - Non-image requests (HTML, CSS, JS, etc.)
+
+EXAMPLES:
+    # Install with defaults (ban after 5 direct image requests in 5 min)
+    sudo $SCRIPT_NAME
+
+    # Stricter: ban after 2 requests in 1 hour, whitelist search engines
+    sudo $SCRIPT_NAME --maxretry 2 --findtime 3600 --ignore-search
+
+    # Custom whitelist
+    sudo $SCRIPT_NAME --ignoreip "66.249.0.0/16 40.77.0.0/16 192.168.1.0/24"
+
+    # Preview without changes
+    sudo $SCRIPT_NAME --dry-run
+
+    # Remove the jail
+    sudo $SCRIPT_NAME --remove
+
+EOF
+    exit 0
+}
+
+# ============================================================================
+# ARGUMENT PARSING
+# ============================================================================
+
+REMOVE=false
+
+parse_args() {
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --logpath)   LOGPATH="$2"; shift 2 ;;
+            --bantime)   BANTIME="$2"; shift 2 ;;
+            --maxretry)  MAXRETRY="$2"; shift 2 ;;
+            --findtime)  FINDTIME="$2"; shift 2 ;;
+            --ignoreip)  IGNOREIP="$2"; shift 2 ;;
+            --ignore-search) IGNOREIP="66.249.0.0/16 40.77.0.0/16"; shift ;;
+            --dry-run)   DRY_RUN=true; shift ;;
+            --remove)    REMOVE=true; shift ;;
+            -h|--help)   show_usage ;;
+            *)           log_error "Unknown option: $1"; show_usage ;;
+        esac
+    done
+}
+
+# ============================================================================
+# CHECKS
+# ============================================================================
+
+check_root() {
+    if [[ $EUID -ne 0 ]]; then
+        log_error "This script must be run as root (sudo)"
+        exit 1
+    fi
+}
+
+check_fail2ban() {
+    if ! command -v fail2ban-client &>/dev/null; then
+        log_error "Fail2ban is not installed"
+        log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
+        exit 1
+    fi
+
+    if ! systemctl is-active --quiet fail2ban; then
+        log_error "Fail2ban is not running"
+        exit 1
+    fi
+
+    log_info "Fail2ban is installed and running"
+}
+
+detect_logpath() {
+    if [[ "$LOGPATH" != "auto" ]]; then
+        # shellcheck disable=SC2086
+        local matches=( $LOGPATH )
+        if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then
+            log_error "Log file not found: $LOGPATH"
+            exit 1
+        fi
+        log_info "Using specified log path: $LOGPATH"
+        return
+    fi
+
+    log_step "Auto-detecting web server access log..."
+
+    # HestiaCP — apache domains
+    local hestia_apache=( /var/log/apache2/domains/*.log )
+    if [[ -f "${hestia_apache[0]:-}" ]]; then
+        LOGPATH="/var/log/apache2/domains/*.log"
+        log_info "Detected HestiaCP apache: $LOGPATH"
+        return
+    fi
+
+    # HestiaCP — nginx domains
+    local hestia_nginx=( /var/log/nginx/domains/*.log )
+    if [[ -f "${hestia_nginx[0]:-}" ]]; then
+        LOGPATH="/var/log/nginx/domains/*.log"
+        log_info "Detected HestiaCP nginx: $LOGPATH"
+        return
+    fi
+
+    # Nginx (standard)
+    if [[ -f /var/log/nginx/access.log ]]; then
+        LOGPATH="/var/log/nginx/access.log"
+        log_info "Detected nginx: $LOGPATH"
+        return
+    fi
+
+    # Apache (Debian/Ubuntu)
+    if [[ -f /var/log/apache2/access.log ]]; then
+        LOGPATH="/var/log/apache2/access.log"
+        log_info "Detected apache2: $LOGPATH"
+        return
+    fi
+
+    # Apache (RHEL/Rocky)
+    if [[ -f /var/log/httpd/access_log ]]; then
+        LOGPATH="/var/log/httpd/access_log"
+        log_info "Detected httpd: $LOGPATH"
+        return
+    fi
+
+    log_error "Could not auto-detect access log. Use --logpath to specify."
+    exit 1
+}
+
+# ============================================================================
+# REMOVE
+# ============================================================================
+
+do_remove() {
+    local filter_file="/etc/fail2ban/filter.d/image-scraper.conf"
+    local jail_file="/etc/fail2ban/jail.d/image-scraper.conf"
+
+    log_step "Removing image scraper jail..."
+
+    if $DRY_RUN; then
+        log_info "[DRY RUN] Would remove $filter_file"
+        log_info "[DRY RUN] Would remove $jail_file"
+        log_info "[DRY RUN] Would reload fail2ban"
+        return
+    fi
+
+    if [[ -f "$jail_file" ]]; then
+        rm -f "$jail_file"
+        log_info "Removed: $jail_file"
+    else
+        log_warn "Jail config not found: $jail_file"
+    fi
+
+    if [[ -f "$filter_file" ]]; then
+        rm -f "$filter_file"
+        log_info "Removed: $filter_file"
+    else
+        log_warn "Filter not found: $filter_file"
+    fi
+
+    fail2ban-client reload
+    sleep 2
+    log_info "Fail2ban reloaded — image-scraper jail removed"
+    exit 0
+}
+
+# ============================================================================
+# INSTALL FILTER
+# ============================================================================
+
+install_filter() {
+    local filter_file="/etc/fail2ban/filter.d/image-scraper.conf"
+
+    log_step "Installing filter: $filter_file"
+
+    if $DRY_RUN; then
+        log_info "[DRY RUN] Would create $filter_file"
+        echo ""
+        generate_filter
+        echo ""
+        return
+    fi
+
+    if [[ -f "$filter_file" ]]; then
+        log_warn "Filter already exists — backing up to ${filter_file}.bak"
+        cp "$filter_file" "${filter_file}.bak"
+    fi
+
+    generate_filter > "$filter_file"
+    log_info "Filter installed: $filter_file"
+}
+
+generate_filter() {
+    cat <<'EOF'
+# Fail2ban filter to block image scrapers
+# https://mylinux.work
+#
+# Catches bots that directly request image files with no referer.
+# When a real browser loads an image from a web page, it sends the page
+# URL as the referer header. Direct image requests with no referer
+# indicate scraping — typically for AI training datasets or content theft.
+#
+# Matches: GET requests for .png, .jpg, .jpeg, .gif, .webp, .svg, .avif
+# with referer logged as "-" (absent/empty).
+#
+# Does NOT match .ico (favicons are legitimately requested without referer).
+
+[Definition]
+
+# Direct image request with no referer — combined log format
+# Format: IP - - [date] "GET /path/image.png HTTP/x.x" status size "-" "user agent"
+failregex = ^<HOST> \S+ \S+ \[.*\] "GET \S+\.(?:png|jpe?g|gif|webp|svg|avif) \S+" \d+ \d+ "-" ".*"
+
+ignoreregex =
+
+# Author: Phil Connor — https://mylinux.work
+EOF
+}
+
+# ============================================================================
+# INSTALL JAIL
+# ============================================================================
+
+install_jail() {
+    local jail_file="/etc/fail2ban/jail.d/image-scraper.conf"
+
+    log_step "Installing jail: $jail_file"
+
+    if $DRY_RUN; then
+        log_info "[DRY RUN] Would create $jail_file"
+        echo ""
+        generate_jail
+        echo ""
+        return
+    fi
+
+    if [[ -f "$jail_file" ]]; then
+        log_warn "Jail config already exists — backing up to ${jail_file}.bak"
+        cp "$jail_file" "${jail_file}.bak"
+    fi
+
+    generate_jail > "$jail_file"
+    log_info "Jail config installed: $jail_file"
+}
+
+generate_jail() {
+    cat <<EOF
+# Fail2ban jail to block image scrapers
+# https://mylinux.work
+#
+# Bans IPs that directly request multiple image files with no referer.
+# Default: 5 direct image requests in 5 minutes = 24 hour ban.
+#
+# Real browsers always send a referer when loading images from a page.
+# Direct image requests without one indicate scraping for AI training
+# datasets (img2dataset, etc.) or content theft.
+
+[image-scraper]
+enabled  = true
+port     = http,https
+filter   = image-scraper
+logpath  = $LOGPATH
+maxretry = $MAXRETRY
+findtime = $FINDTIME
+bantime  = $BANTIME
+EOF
+
+    if [[ -n "$IGNOREIP" ]]; then
+        echo "ignoreip = $IGNOREIP"
+    fi
+
+    cat <<'EOF'
+
+# Author: Phil Connor — https://mylinux.work
+EOF
+}
+
+# ============================================================================
+# RELOAD AND VERIFY
+# ============================================================================
+
+reload_fail2ban() {
+    log_step "Reloading Fail2ban..."
+
+    if $DRY_RUN; then
+        log_info "[DRY RUN] Would reload fail2ban"
+        return
+    fi
+
+    if ! fail2ban-client --test 2>/dev/null; then
+        log_warn "Config test not available — reloading directly"
+    fi
+
+    fail2ban-client reload
+    sleep 2
+
+    if systemctl is-active --quiet fail2ban; then
+        log_info "Fail2ban reloaded successfully"
+    else
+        log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
+        exit 1
+    fi
+}
+
+# Confirm that the image-scraper jail is loaded by querying its status.
+# Prints the status on success; exits 1 with debugging hints otherwise.
+# Only logs a message in dry-run mode.
+verify_jail() {
+    log_step "Verifying image-scraper jail..."
+
+    if $DRY_RUN; then
+        log_info "[DRY RUN] Would verify jail status"
+        return
+    fi
+
+    printf '\n'
+    # The status output itself is shown to the user; only stderr is hidden.
+    if ! fail2ban-client status image-scraper 2>/dev/null; then
+        log_error "Jail 'image-scraper' is not running — check: fail2ban-client status"
+        log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/image-scraper.conf"
+        exit 1
+    fi
+
+    printf '\n'
+    log_info "Image scraper jail is active and monitoring $LOGPATH"
+}
+
+test_against_logs() {
+    if $DRY_RUN; then
+        # shellcheck disable=SC2086
+        local matches=( $LOGPATH )
+        if [[ -f "${matches[0]}" ]]; then
+            log_step "Testing filter against existing logs..."
+            echo ""
+            fail2ban-regex "${matches[0]}" /dev/stdin <<'FILTER' 2>&1 | tail -5
+[Definition]
+failregex = ^<HOST> \S+ \S+ \[.*\] "GET \S+\.(?:png|jpe?g|gif|webp|svg|avif) \S+" \d+ \d+ "-" ".*"
+ignoreregex =
+FILTER
+            echo ""
+        fi
+    fi
+}
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+main() {
+    parse_args "$@"
+
+    echo ""
+    echo "============================================"
+    echo "  Fail2ban Image Scraper Blocker v${VERSION}"
+    echo "  https://mylinux.work"
+    echo "============================================"
+    echo ""
+
+    check_root
+    check_fail2ban
+
+    if $REMOVE; then
+        do_remove
+    fi
+
+    detect_logpath
+    test_against_logs
+    install_filter
+    install_jail
+    reload_fail2ban
+    verify_jail
+
+    echo ""
+    echo "============================================"
+    echo "  Setup Complete"
+    echo "============================================"
+    echo ""
+    echo "  Jail:      image-scraper"
+    echo "  Log:       $LOGPATH"
+    echo "  Ban time:  ${BANTIME}s ($(( BANTIME / 3600 ))h)"
+    echo "  Max retry: $MAXRETRY (direct image requests before ban)"
+    echo "  Find time: ${FINDTIME}s ($(( FINDTIME / 60 ))m window)"
+    if [[ -n "$IGNOREIP" ]]; then
+        echo "  Ignore:    $IGNOREIP"
+    fi
+    echo ""
+    echo "  Useful commands:"
+    echo "    fail2ban-client status image-scraper"
+    echo "    fail2ban-client set image-scraper unbanip <IP>"
+    echo "    fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/image-scraper.conf"
+    echo ""
+}
+
+main "$@"