a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
396 lines
12 KiB
Bash
Executable File
396 lines
12 KiB
Bash
Executable File
#!/bin/bash
################################################################################
# Script Name: add-fail2ban-ai-bots.sh
# Version:     1.1
# Description: Adds a Fail2ban jail to block AI scrapers and unwanted bots
#              that ignore robots.txt. Installs filter + jail config and
#              reloads Fail2ban.
#
# Author:      Phil Connor
# Contact:     contact@mylinux.work
# Website:     https://mylinux.work
# License:     MIT
#
# Usage:
#   sudo ./add-fail2ban-ai-bots.sh
#   sudo ./add-fail2ban-ai-bots.sh --logpath /var/log/nginx/access.log
#   sudo ./add-fail2ban-ai-bots.sh --bantime 604800
#   sudo ./add-fail2ban-ai-bots.sh --dry-run
#
# Changelog:
#   1.1 — 2026-05-04: Removed Claude-Web, Perplexity-User, ChatGPT-User, and
#     OAI-SearchBot from blocklist. These are user-facing fetcher bots that
#     retrieve content when someone pastes a URL into an AI chat or search.
#     Blocking them prevents your content from being cited in AI answers.
#     Training crawlers (ClaudeBot, PerplexityBot, GPTBot) remain blocked.
#
################################################################################

# Strict mode: abort on failing commands, unset variables and failed
# pipeline stages.
set -euo pipefail

# ============================================================================
# DEFAULTS
# ============================================================================

# Script version, printed in the startup banner.
readonly VERSION="1.1"
# Basename of the invoked script, used in the usage text.
readonly SCRIPT_NAME="${0##*/}"

# Runtime settings; each one can be overridden by a command-line option.
LOGPATH="auto"    # access log path or glob; "auto" triggers auto-detection
BANTIME="86400"   # ban duration in seconds (also written as findtime)
MAXRETRY="1"      # matching hits before an IP is banned
DRY_RUN=false     # true = preview only, change nothing

# Colors
# Stored as backslash escape sequences; they only become terminal control
# codes when the logging helpers print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Logging helpers. Each prints a colored severity tag followed by all of its
# arguments joined by spaces.
#   - The color variables are expanded with printf's %b so their backslash
#     escape sequences become terminal control codes.
#   - The message is printed with %s, so backslashes inside it (paths, regex
#     fragments) are shown verbatim instead of being interpreted.
#   - log_error writes to stderr; the other helpers write to stdout.
log_info()  { printf '%b[INFO]%b %s\n'  "${GREEN:-}"  "${NC:-}" "$*"; }
log_warn()  { printf '%b[WARN]%b %s\n'  "${YELLOW:-}" "${NC:-}" "$*"; }
log_error() { printf '%b[ERROR]%b %s\n' "${RED:-}"    "${NC:-}" "$*" >&2; }
log_step()  { printf '%b[STEP]%b %s\n'  "${CYAN:-}"   "${NC:-}" "$*"; }

#######################################
# Print the help text and terminate the script.
# Globals:   SCRIPT_NAME (read)
# Arguments: $1 - optional exit status, default 0. With a non-zero status
#                 the text goes to stderr so error paths do not pollute
#                 stdout.
# Outputs:   usage text on stdout (status 0) or stderr (status != 0)
# Returns:   never returns; exits with the requested status
#######################################
show_usage() {
  local status="${1:-0}"
  local out_fd=1
  if [[ "$status" != "0" ]]; then
    out_fd=2
  fi

  cat >&"$out_fd" <<EOF
Usage: sudo $SCRIPT_NAME [OPTIONS]

Adds an AI bot blocking jail to an existing Fail2ban installation.

OPTIONS:
  --logpath PATH    Nginx/Apache access log path (default: auto-detect)
  --bantime SECS    Ban duration in seconds (default: 86400 / 24 hours)
  --maxretry NUM    Hits before ban (default: 1)
  --dry-run         Show what would be done without making changes
  -h, --help        Show this help message

BLOCKED BOTS:
  ABEvalBot             Unknown operator web scraper
  GPTBot                OpenAI training crawler
  CCBot                 Common Crawl (AI training datasets)
  ClaudeBot             Anthropic training crawler
  anthropic-ai          Anthropic legacy crawler
  Bytespider            ByteDance/TikTok crawler
  TikTokSpider          ByteDance/TikTok content spider
  cohere-ai             Cohere AI crawler
  meta-externalagent    Meta/Facebook content scraper
  Meta-ExternalFetcher  Meta content fetcher
  PetalBot              Huawei search/AI crawler
  Amazonbot             Amazon AI crawler
  AI2Bot                Allen Institute crawler
  Ai2Bot-Dolma          Allen Institute dataset crawler
  YouBot                You.com AI search crawler
  PerplexityBot         Perplexity AI indexing
  Diffbot               Content extraction bot
  Applebot-Extended     Apple AI training (not search)
  Google-Extended       Google AI training (not search)
  MistralBot            Mistral AI crawler
  YandexGPTBot          Yandex AI crawler
  MJ12bot               Majestic SEO crawler
  Scrapy                Scrapy-based scrapers
  DataForSeoBot         SEO data scraper
  Timpibot              AI training crawler
  img2dataset           Image dataset scraper
  HanaleiBot            Unidentified crawler (AWS, beta-stage)
  SemrushBot            Semrush SEO crawler
  AhrefsBot             Ahrefs SEO crawler
  DotBot                Moz SEO crawler
  SERanking             SE Ranking backlink/SEO crawler
  trafilatura           Web content extraction tool

EXAMPLES:
  # Auto-detect and install
  sudo $SCRIPT_NAME

  # Custom log path and ban time (7 days)
  sudo $SCRIPT_NAME --logpath /var/log/nginx/access.log --bantime 604800

  # Preview without changes
  sudo $SCRIPT_NAME --dry-run

EOF
  exit "$status"
}

#######################################
# Parse command-line options into the global settings.
# Globals:   LOGPATH, BANTIME, MAXRETRY, DRY_RUN (written)
# Arguments: the script's command-line arguments
# Outputs:   error messages (via log_error) and usage text on stderr
# Returns:   0 when all options were accepted. Exits 1 on an unknown option,
#            a missing/empty option value, or a non-numeric --bantime /
#            --maxretry. -h/--help prints the usage text and exits.
#######################################
parse_args() {
  local opt value

  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --logpath | --bantime | --maxretry)
        # ${2:-} keeps a missing value from tripping "unbound variable"
        # under set -u, so the user gets a readable message instead.
        value="${2:-}"
        if [[ -z "$value" ]]; then
          log_error "Option $opt requires a value"
          exit 1
        fi

        if [[ "$opt" == "--logpath" ]]; then
          LOGPATH="$value"
        else
          # BANTIME is later used in shell arithmetic and both values are
          # written into the jail file, so only plain digits are accepted.
          if [[ ! "$value" =~ ^[0-9]+$ ]]; then
            log_error "Option $opt expects a whole number, got: $value"
            exit 1
          fi
          # 10# forces base 10 so a leading zero is not read as octal.
          if [[ "$opt" == "--bantime" ]]; then
            BANTIME=$((10#$value))
          else
            MAXRETRY=$((10#$value))
            if ((MAXRETRY < 1)); then
              log_error "Option --maxretry must be at least 1"
              exit 1
            fi
          fi
        fi
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      -h | --help)
        show_usage
        ;;
      *)
        log_error "Unknown option: $opt"
        # show_usage terminates the shell it runs in; the subshell lets the
        # help text be printed while this script still exits with failure.
        (show_usage) >&2 || true
        exit 1
        ;;
    esac
  done
}

#######################################
# Abort unless the script runs with an effective UID of 0.
# Outputs:   an error message via log_error when not root
# Returns:   0 when running as root; otherwise exits 1
#######################################
check_root() {
  if ((EUID != 0)); then
    log_error "This script must be run as root (sudo)"
    exit 1
  fi
}

# ============================================================================
# CHECKS
# ============================================================================

#######################################
# Verify that Fail2ban is installed and its server is running.
# The running check uses systemctl when that command exists and falls back
# to "fail2ban-client ping" otherwise, so hosts without systemctl are not
# rejected outright.
# Outputs:   status/error messages via log_info / log_error
# Returns:   0 when Fail2ban is installed and running; otherwise exits 1
#######################################
check_fail2ban() {
  if ! command -v fail2ban-client &>/dev/null; then
    log_error "Fail2ban is not installed"
    log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/"
    exit 1
  fi

  local running=false
  if command -v systemctl &>/dev/null; then
    if systemctl is-active --quiet fail2ban; then
      running=true
    fi
  elif fail2ban-client ping &>/dev/null; then
    running=true
  fi

  if [[ "$running" != true ]]; then
    log_error "Fail2ban is not running"
    exit 1
  fi

  log_info "Fail2ban is installed and running"
}

#######################################
# Resolve the access log that the jail will monitor.
# With an explicit LOGPATH the value (which may be a glob) is expanded and
# its first match must be a regular file. With LOGPATH="auto" a fixed list
# of well-known locations is probed in order and the first hit wins.
# Globals:   LOGPATH (read; written when auto-detecting)
# Outputs:   progress/error messages via log_step / log_info / log_error
# Returns:   0 once LOGPATH is usable; exits 1 when nothing is found
#######################################
detect_logpath() {
  if [[ "$LOGPATH" != "auto" ]]; then
    # Support glob patterns (e.g. /var/log/apache2/domains/*.log).
    # Left unquoted on purpose so the shell expands the pattern; an
    # unmatched pattern stays literal and then fails the -f test below.
    # shellcheck disable=SC2086,SC2206
    local matches=( $LOGPATH )
    if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then
      log_error "Log file not found: $LOGPATH"
      exit 1
    fi
    log_info "Using specified log path: $LOGPATH (${#matches[@]} file(s))"
    return 0
  fi

  log_step "Auto-detecting web server access log..."

  # Probe order matters. "label|pattern" pairs:
  #   1. HestiaCP / VestaCP apache domains (checked first: full access logs
  #      with user agents)
  #   2. HestiaCP / VestaCP nginx domains (proxy logs only in nginx+apache
  #      mode)
  #   3. Nginx (standard)
  #   4. Apache (Debian/Ubuntu)
  #   5. Apache (RHEL/Rocky)
  local -a candidates=(
    "HestiaCP/VestaCP apache|/var/log/apache2/domains/*.log"
    "HestiaCP/VestaCP nginx|/var/log/nginx/domains/*.log"
    "nginx|/var/log/nginx/access.log"
    "apache2|/var/log/apache2/access.log"
    "httpd|/var/log/httpd/access_log"
  )

  local entry label pattern
  local -a found
  for entry in "${candidates[@]}"; do
    label="${entry%%|*}"
    pattern="${entry#*|}"

    # Unquoted so glob candidates expand to the files they match.
    # shellcheck disable=SC2206
    found=( $pattern )
    if [[ ! -f "${found[0]:-}" ]]; then
      continue
    fi

    # The jail receives the pattern itself, not the expanded file list.
    LOGPATH="$pattern"
    if [[ "$pattern" == *"*"* ]]; then
      log_info "Detected $label: $LOGPATH (${#found[@]} file(s))"
    else
      log_info "Detected $label: $LOGPATH"
    fi
    return 0
  done

  log_error "Could not auto-detect access log. Use --logpath to specify."
  exit 1
}

# ============================================================================
# INSTALL FILTER
# ============================================================================

#######################################
# Write the ai-bots filter definition into Fail2ban's filter.d directory.
# In dry-run mode the generated filter is only printed. An existing filter
# file is copied to <file>.bak before being overwritten.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages; in dry-run mode also the filter text
# Returns:   0 on success; a failing cp or write aborts under set -e
#######################################
install_filter() {
  local filter_file="/etc/fail2ban/filter.d/ai-bots.conf"

  log_step "Installing filter: $filter_file"

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would create $filter_file"
    echo ""
    generate_filter
    echo ""
    return 0
  fi

  if [[ -f "$filter_file" ]]; then
    log_warn "Filter already exists — backing up to ${filter_file}.bak"
    # -p keeps mode and timestamps of the file being replaced.
    cp -p -- "$filter_file" "${filter_file}.bak"
  fi

  generate_filter > "$filter_file"
  log_info "Filter installed: $filter_file"
}

#######################################
# Emit the Fail2ban filter definition for the ai-bots jail.
# The heredoc delimiter is quoted, so the regex is written out literally
# with no shell expansion.
# Outputs:   filter file contents on stdout
#######################################
generate_filter() {
  cat <<'EOF'
# Fail2ban filter to block AI scrapers and unwanted bots
# https://mylinux.work
#
# Matches common AI crawler user agents in web server access logs.
# These bots scrape content for AI model training and typically
# ignore robots.txt directives.

[Definition]

# Match AI and unwanted bot user agents in access logs.
# Expects the combined log format (quoted referer and user-agent fields).
# The response size may be a number or "-" (logged when no body was sent).
failregex = ^<HOST> \S+ \S+ \[.*\] "\S+ \S+ \S+" \d+ (?:\d+|-) "\S+" ".*(?:ABEvalBot|GPTBot|CCBot|ClaudeBot|anthropic-ai|Bytespider|TikTokSpider|cohere-ai|meta-externalagent|Meta-ExternalFetcher|PetalBot|Amazonbot|AI2Bot|Ai2Bot-Dolma|YouBot|PerplexityBot|Diffbot|Applebot-Extended|Google-Extended|MistralBot|YandexGPTBot|MJ12bot|Scrapy|DataForSeoBot|Timpibot|img2dataset|HanaleiBot|SemrushBot|AhrefsBot|DotBot|SERanking|trafilatura).*"

ignoreregex =

# Author: Phil Connor — https://mylinux.work
EOF
}

# ============================================================================
# INSTALL JAIL
# ============================================================================

#######################################
# Write the ai-bots jail configuration into Fail2ban's jail.d directory.
# In dry-run mode the generated jail is only printed. An existing jail file
# is copied to <file>.bak before being overwritten.
# Globals:   DRY_RUN (read)
# Outputs:   progress messages; in dry-run mode also the jail text
# Returns:   0 on success; a failing cp or write aborts under set -e
#######################################
install_jail() {
  local jail_file="/etc/fail2ban/jail.d/ai-bots.conf"

  log_step "Installing jail: $jail_file"

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would create $jail_file"
    echo ""
    generate_jail
    echo ""
    return 0
  fi

  if [[ -f "$jail_file" ]]; then
    log_warn "Jail config already exists — backing up to ${jail_file}.bak"
    # -p keeps mode and timestamps of the file being replaced.
    cp -p -- "$jail_file" "${jail_file}.bak"
  fi

  generate_jail > "$jail_file"
  log_info "Jail config installed: $jail_file"
}

#######################################
# Emit the Fail2ban jail definition for the ai-bots jail.
# The heredoc delimiter is unquoted, so the current settings are expanded
# into the output. findtime is deliberately set to the same value as
# bantime.
# Globals:   LOGPATH, MAXRETRY, BANTIME (read)
# Outputs:   jail file contents on stdout
#######################################
generate_jail() {
  cat <<EOF
# Fail2ban jail to block AI scrapers and unwanted bots
# https://mylinux.work
#
# Bans IPs on first request matching an AI bot user agent.
# These bots ignore robots.txt so we enforce it at the firewall level.

[ai-bots]
enabled = true
port = http,https
filter = ai-bots
# Read the log file(s) below even if the distribution's defaults select a
# journal-only backend for all jails.
backend = auto
logpath = $LOGPATH
maxretry = $MAXRETRY
bantime = $BANTIME
findtime = $BANTIME

# Author: Phil Connor — https://mylinux.work
EOF
}

# ============================================================================
# RELOAD AND VERIFY
# ============================================================================

#######################################
# Reload Fail2ban so the new filter and jail take effect.
# When fail2ban-client advertises --test in its help output, the
# configuration is tested first and a failing test aborts the script
# instead of reloading a broken configuration. When --test is not
# advertised, a warning is logged and the reload proceeds.
# Globals:   DRY_RUN (read)
# Outputs:   progress/error messages; output of fail2ban-client
# Returns:   0 when the server is running after the reload; exits 1 when
#            the config test fails or the server is down afterwards
#######################################
reload_fail2ban() {
  log_step "Reloading Fail2ban..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would reload fail2ban"
    return 0
  fi

  # Test config before reloading.
  # The help text is captured into a variable rather than piped to grep:
  # with pipefail, a grep that exits early could fail the pipeline.
  local help_text
  help_text="$(fail2ban-client --help 2>&1 || true)"
  if [[ "$help_text" == *"--test"* ]]; then
    if ! fail2ban-client --test; then
      log_error "Fail2ban configuration test failed — not reloading"
      exit 1
    fi
  else
    log_warn "Config test not available — reloading directly"
  fi

  fail2ban-client reload
  # Give the server a moment to settle before checking on it.
  sleep 2

  # Prefer systemctl; fall back to pinging the server where it is absent.
  local alive=false
  if command -v systemctl &>/dev/null; then
    if systemctl is-active --quiet fail2ban; then
      alive=true
    fi
  elif fail2ban-client ping &>/dev/null; then
    alive=true
  fi

  if [[ "$alive" == true ]]; then
    log_info "Fail2ban reloaded successfully"
  else
    log_error "Fail2ban failed to restart — check: journalctl -u fail2ban"
    exit 1
  fi
}

#######################################
# Confirm that the ai-bots jail is loaded by querying its status.
# Globals:   DRY_RUN, LOGPATH (read)
# Outputs:   the jail status from fail2ban-client plus progress/error
#            messages
# Returns:   0 when the status query succeeds; otherwise exits 1
#######################################
verify_jail() {
  log_step "Verifying ai-bots jail..."

  if [[ "$DRY_RUN" == true ]]; then
    log_info "[DRY RUN] Would verify jail status"
    return 0
  fi

  echo ""
  # stderr is discarded here; the messages below tell the user how to dig in.
  if ! fail2ban-client status ai-bots 2>/dev/null; then
    log_error "Jail 'ai-bots' is not running — check: fail2ban-client status"
    log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/ai-bots.conf"
    exit 1
  fi

  echo ""
  log_info "AI bots jail is active and monitoring $LOGPATH"
}

# ============================================================================
# MAIN
# ============================================================================

#######################################
# Entry point: parse options, run the checks, install filter and jail,
# reload Fail2ban, verify the jail and print a summary.
# Globals:   VERSION, LOGPATH, BANTIME, MAXRETRY, DRY_RUN (read)
# Arguments: the script's command-line arguments
# Outputs:   banner, progress messages and a closing summary on stdout
#######################################
main() {
  parse_args "$@"

  echo ""
  echo "============================================"
  echo " Fail2ban AI Bot Blocker v${VERSION}"
  echo " https://mylinux.work"
  echo "============================================"
  echo ""

  check_root
  check_fail2ban
  detect_logpath
  install_filter
  install_jail
  reload_fail2ban
  verify_jail

  # A dry run changes nothing, so it must not claim the setup is complete.
  local headline=" Setup Complete"
  if [[ "$DRY_RUN" == true ]]; then
    headline=" Dry Run Complete (no changes made)"
  fi

  # Only do arithmetic on BANTIME when it is plain digits; 10# forces
  # base 10 so a leading zero is not read as octal.
  local ban_summary="${BANTIME}s"
  if [[ "$BANTIME" =~ ^[0-9]+$ ]]; then
    ban_summary+=" ($((10#$BANTIME / 3600))h)"
  fi

  echo ""
  echo "============================================"
  echo "$headline"
  echo "============================================"
  echo ""
  echo " Jail: ai-bots"
  echo " Log: $LOGPATH"
  echo " Ban time: $ban_summary"
  echo " Max retry: $MAXRETRY"
  echo ""
  echo " Useful commands:"
  echo " fail2ban-client status ai-bots"
  echo " fail2ban-client set ai-bots unbanip <IP>"
  echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/ai-bots.conf"
  echo ""
}

main "$@"