# Source revision: a1a17e81a1
# Revision note: Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
# File listing: 375 lines, 11 KiB, Bash, executable file
#!/bin/bash
################################################################################
# Script Name: add-apache-bot-block.sh
# Version:     1.1
# Description: Automate AI scraper, SEO bot, vulnerability scanner, and
#              scraping framework blocking on standard Apache servers.
#              Creates mod_rewrite rules server-wide or per-directory via
#              .htaccess.
#
# Author:      Phil Connor
# Contact:     contact@mylinux.work
# Website:     https://mylinux.work
# License:     MIT
#
# Prerequisites:
#   - Apache installed (apache2 on Debian/Ubuntu or httpd on RHEL/CentOS)
#   - Root access
#   - mod_rewrite available
#
# Usage:
#   sudo ./add-apache-bot-block.sh
#   sudo ./add-apache-bot-block.sh --dry-run
#   sudo ./add-apache-bot-block.sh --remove
#   sudo ./add-apache-bot-block.sh --htaccess /var/www/html
#   sudo ./add-apache-bot-block.sh --htaccess /var/www/html --remove
#
# Changelog:
#   1.1 — 2026-05-04: Removed OAI-SearchBot from the blocklist. It is a
#         user-facing fetcher bot, not a training crawler; blocking it prevents
#         your content from being cited in AI search answers.
#
################################################################################

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# --- Configuration ---
# Option defaults; the command-line parser overwrites the first three.
DRY_RUN=false      # "true": only print what would be done
REMOVE=false       # "true": remove the bot-block rules instead of installing
HTACCESS_PATH=""   # directory whose .htaccess is managed; empty = server-wide
DISTRO=""          # debian or rhel
CONF_FILE=""       # set after distro detection
APACHE_SVC=""      # apache2 or httpd

# --- Colors ---
# ANSI colour sequences stored as literal backslash text; they only become
# control characters when printed through an escape-expanding call such as
# `echo -e`. They are constants, so protect them from accidental reassignment.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m'   # reset ("no colour")

# --- Logging helpers ---
# Each helper prints one status line on stdout: a coloured tag, then the
# message built from all arguments. Only the colour variables go through %b
# (escape expansion); the message is printed with %s so that backslashes in
# file paths or patterns are shown verbatim instead of being interpreted as
# escape sequences.
info() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*"; }
step() { printf '%b[STEP]%b %s\n' "${CYAN}" "${NC}" "$*"; }

#######################################
# Print the command-line help and terminate the script.
# Arguments: $1 - exit status (optional, default 0). With a non-zero status
#                 the help is written to stderr instead of stdout.
# Outputs:   the help text
# Returns:   never returns; exits with the requested status
#######################################
usage() {
    local exit_code="${1:-0}"
    local prog
    prog="$(basename -- "$0")"

    # Help shown because of an error is a diagnostic, so keep it off stdout.
    if [[ "$exit_code" != "0" ]]; then
        exec >&2
    fi

    cat <<EOF
Usage: sudo ${prog} [OPTIONS]

Blocks AI scrapers, SEO bots, vulnerability scanners, and scraping frameworks
on standard Apache servers using mod_rewrite rules.

Options:
--dry-run Show what would be done without making changes
--remove Remove the bot-block configuration and reload Apache
--htaccess PATH Write rules to .htaccess at PATH instead of server-wide
-h, --help Show this help

Examples:
sudo ${prog}
sudo ${prog} --dry-run
sudo ${prog} --remove
sudo ${prog} --htaccess /var/www/html
sudo ${prog} --htaccess /var/www/html --remove
EOF
    exit "$exit_code"
}

# --- Argument parsing ---
#######################################
# Parse the command-line options into the global option variables.
# Globals:   DRY_RUN, REMOVE, HTACCESS_PATH (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   error messages on stderr; help text via usage
# Returns:   0 once every argument is consumed; exits 2 on a usage error
#######################################
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --remove)
                REMOVE=true
                shift
                ;;
            --htaccess)
                # Check before touching "$2": with `set -u` active a missing
                # value would otherwise abort with a bare "unbound variable".
                if (( $# < 2 )) || [[ -z "$2" ]]; then
                    echo "Error: --htaccess requires a directory path" >&2
                    exit 2
                fi
                HTACCESS_PATH="$2"
                shift 2
                ;;
            -h|--help)
                usage
                ;;
            *)
                echo "Unknown option: $1" >&2
                # usage terminates the shell it runs in, so run it in a
                # subshell and exit non-zero from here: a bad option is an
                # error, not a successful help request.
                ( usage ) >&2 || true
                exit 2
                ;;
        esac
    done
}

parse_args "$@"

# --- Root check ---
# Stop immediately unless the effective UID is 0; the message goes to stderr
# and the script exits with status 1.
[[ $EUID -eq 0 ]] || {
    echo -e "${RED}Error: Run as root (sudo)${NC}" >&2
    exit 1
}

# --- Distro detection ---
#######################################
# Work out which Apache packaging layout this host uses and set the matching
# config path and service name.
# Globals:   DISTRO, CONF_FILE, APACHE_SVC (written); RED, NC (read)
# Arguments: none
# Outputs:   an error message on stderr when no layout is recognised
# Returns:   0 on success; exits 1 on an unsupported system
#######################################
detect_distro() {
    local family=""

    if [[ -f /etc/debian_version ]]; then
        family="debian"
    elif [[ -f /etc/redhat-release ]]; then
        family="rhel"
    # Fallback for hosts that ship neither marker file but still use one of
    # the two Apache layouts: decide by the config directory that exists.
    elif [[ -d /etc/apache2/conf-available ]]; then
        family="debian"
    elif [[ -d /etc/httpd/conf.d ]]; then
        family="rhel"
    fi

    case "$family" in
        debian)
            DISTRO="debian"
            CONF_FILE="/etc/apache2/conf-available/bot-block.conf"
            APACHE_SVC="apache2"
            ;;
        rhel)
            DISTRO="rhel"
            CONF_FILE="/etc/httpd/conf.d/bot-block.conf"
            APACHE_SVC="httpd"
            ;;
        *)
            echo -e "${RED}Error: Unsupported distribution (neither Debian/Ubuntu nor RHEL/CentOS)${NC}" >&2
            exit 1
            ;;
    esac
}

detect_distro

# --- Apache check ---
# Bail out with status 1 when no `apachectl` command can be found on PATH.
command -v apachectl &>/dev/null || {
    echo -e "${RED}Error: Apache (${APACHE_SVC}) not found${NC}" >&2
    exit 1
}

# --- Bot-block rules content ---
# Marker comments that fence the generated rules so a later run can locate and
# replace or remove exactly the text this script wrote.
MANAGED_START="# bot-block-managed-start"
MANAGED_END="# bot-block-managed-end"

#######################################
# Emit the complete managed rule block on stdout.
# The first and last output lines are the MANAGED_START / MANAGED_END markers
# (taken from the constants so writer and remover can never disagree). Between
# them: one chain of OR-ed User-Agent conditions ending in a 403 rule, then a
# separate rule for malformed srcset requests.
#
# The srcset rule tests THE_REQUEST (the raw request line) rather than
# REQUEST_URI: Apache documents that THE_REQUEST is not unescaped, unlike most
# other variables, so a literal "%20" can only be seen there.
#
# NOTE(review): Apache documents that rewrite configuration in the main server
# context is not inherited by <VirtualHost> sections. When this block is
# installed server-wide, confirm the rules take effect inside each vhost
# (e.g. via RewriteOptions InheritDown or a per-vhost include).
#
# Globals:   MANAGED_START, MANAGED_END (read)
# Arguments: none
# Outputs:   Apache configuration text on stdout
#######################################
generate_rules() {
    printf '%s\n' "$MANAGED_START"
    cat <<'RULES'
# Bot-blocking rules for Apache — generated by add-apache-bot-block.sh
# https://mylinux.work

RewriteEngine On

# AI scrapers
RewriteCond %{HTTP_USER_AGENT} ABEvalBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} GPTBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ClaudeBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} anthropic-ai [NC,OR]
RewriteCond %{HTTP_USER_AGENT} CCBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Bytespider [NC,OR]
RewriteCond %{HTTP_USER_AGENT} TikTokSpider [NC,OR]
RewriteCond %{HTTP_USER_AGENT} cohere-ai [NC,OR]
RewriteCond %{HTTP_USER_AGENT} PerplexityBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Diffbot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} MistralBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} YandexGPTBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} meta-externalagent [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Meta-ExternalFetcher [NC,OR]
RewriteCond %{HTTP_USER_AGENT} meta-webindexer [NC,OR]
RewriteCond %{HTTP_USER_AGENT} PetalBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Amazonbot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Amzn-SearchBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} AI2Bot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Timpibot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} img2dataset [NC,OR]
RewriteCond %{HTTP_USER_AGENT} YouBot [NC,OR]

# SEO scrapers
RewriteCond %{HTTP_USER_AGENT} MJ12bot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} SemrushBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} AhrefsBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} DotBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} DataForSeoBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} SERanking [NC,OR]

# Vulnerability scanners
RewriteCond %{HTTP_USER_AGENT} Nikto [NC,OR]
RewriteCond %{HTTP_USER_AGENT} sqlmap [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Nmap [NC,OR]
RewriteCond %{HTTP_USER_AGENT} masscan [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ZmEu [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Morpheus [NC,OR]

# Scraping frameworks
RewriteCond %{HTTP_USER_AGENT} Scrapy [NC,OR]
RewriteCond %{HTTP_USER_AGENT} python-requests [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Go-http-client [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Java/ [NC,OR]
RewriteCond %{HTTP_USER_AGENT} libwww-perl [NC,OR]
RewriteCond %{HTTP_USER_AGENT} trafilatura [NC]
RewriteRule .* - [F]

# Block broken srcset scrapers
RewriteCond %{THE_REQUEST} %20[0-9]+w,https?:// [NC]
RewriteRule .* - [F]
RULES
    printf '%s\n' "$MANAGED_END"
}

# =====================================================
# Helpers shared by --remove and --htaccess modes
# =====================================================

#######################################
# Copy a file to "<file>.bak.<epoch seconds>", keeping mode and ownership.
# Arguments: $1 - file to back up
#######################################
backup_file() {
    cp -p -- "$1" "$1.bak.$(date +%s)"
}

#######################################
# Delete every managed block (start marker line through end marker line) from
# a file, in place.
# Globals:   MANAGED_START, MANAGED_END (read)
# Arguments: $1 - file to edit
# Returns:   0 when the block(s) were removed; 1, leaving the file untouched,
#            when a start marker has no end marker after it
#######################################
strip_managed_block() {
    local file="$1"

    # A sed range whose end pattern never matches runs to end-of-file, which
    # would delete everything after the start marker. Verify every start
    # marker is closed before letting sed touch the file.
    if ! awk -v s="$MANAGED_START" -v e="$MANAGED_END" '
            index($0, s)            { inblock = 1 }
            inblock && index($0, e) { inblock = 0 }
            END                     { exit (inblock ? 1 : 0) }
        ' "$file"; then
        return 1
    fi

    sed -i "/${MANAGED_START}/,/${MANAGED_END}/d" -- "$file"
}

#######################################
# --remove with --htaccess: strip the managed block from PATH/.htaccess.
# Globals:   HTACCESS_PATH, DRY_RUN, MANAGED_START, RED, NC (read);
#            HTACCESS_FILE (written)
# Returns:   0 (also when there is nothing to remove); exits 1 when the
#            managed block is damaged
#######################################
remove_htaccess_rules() {
    HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess"
    step "Removing bot-block rules from ${HTACCESS_FILE}"

    if [[ ! -f "$HTACCESS_FILE" ]]; then
        warn "File not found: ${HTACCESS_FILE} — nothing to remove"
        return 0
    fi

    if ! grep -qF -- "$MANAGED_START" "$HTACCESS_FILE"; then
        warn "No managed bot-block block found in ${HTACCESS_FILE}"
        return 0
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would strip managed block from ${HTACCESS_FILE}"
        return 0
    fi

    backup_file "$HTACCESS_FILE"
    warn "Backup created"
    if ! strip_managed_block "$HTACCESS_FILE"; then
        echo -e "${RED}Error: managed block in ${HTACCESS_FILE} has no end marker — file left unchanged, fix it manually${NC}" >&2
        exit 1
    fi
    info "Managed block removed from ${HTACCESS_FILE}"
}

#######################################
# --remove without --htaccess: delete the server-wide conf and reload Apache.
# Globals:   CONF_FILE, DISTRO, APACHE_SVC, DRY_RUN, BOLD, NC (read)
# Returns:   0 (also when the conf file does not exist)
#######################################
remove_server_conf() {
    step "Removing bot-block configuration"

    if [[ ! -f "$CONF_FILE" ]]; then
        warn "Config not found: ${CONF_FILE} — nothing to remove"
        return 0
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        if [[ "$DISTRO" == "debian" ]]; then
            echo " Would run: a2disconf bot-block"
        fi
        echo " Would remove: ${CONF_FILE}"
        echo " Would reload: ${APACHE_SVC}"
    else
        if [[ "$DISTRO" == "debian" ]]; then
            if a2disconf bot-block 2>/dev/null; then
                info "Conf disabled (a2disconf)"
            else
                # Deleting the conf-available file while its conf-enabled
                # link survives would leave a dangling symlink behind.
                warn "a2disconf bot-block failed — removing the conf-enabled link directly"
                rm -f -- "${CONF_FILE/conf-available/conf-enabled}"
            fi
        fi
        rm -f -- "$CONF_FILE"
        info "Removed ${CONF_FILE}"

        step "Reloading ${APACHE_SVC}"
        systemctl reload "$APACHE_SVC"
        info "${APACHE_SVC} reloaded"
    fi

    echo ""
    echo -e "${BOLD}Bot-block configuration removed.${NC}"
}

#######################################
# --htaccess install: put the managed block at the top of PATH/.htaccess.
# The block is prepended, not appended: rules placed after an existing
# catch-all rewrite with [L] (the usual front-controller pattern) would not
# be evaluated for rewritten requests, and appending to a file without a
# trailing newline would glue the start marker onto the last directive.
# Globals:   HTACCESS_PATH, DRY_RUN, MANAGED_START, RED, BOLD, NC (read);
#            HTACCESS_FILE (written)
# Returns:   0 on success or dry-run; exits 1 on a missing directory or a
#            damaged managed block
#######################################
install_htaccess_rules() {
    local staged

    HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess"
    step "Writing bot-block rules to ${HTACCESS_FILE}"

    if [[ ! -d "$HTACCESS_PATH" ]]; then
        echo -e "${RED}Error: Directory not found: ${HTACCESS_PATH}${NC}" >&2
        exit 1
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would write managed block to ${HTACCESS_FILE}"
        echo ""
        echo -e "${BOLD}Dry-run complete — no changes made.${NC}"
        return 0
    fi

    # Back up an existing .htaccess, then drop any previous managed block.
    if [[ -f "$HTACCESS_FILE" ]]; then
        backup_file "$HTACCESS_FILE"
        if grep -qF -- "$MANAGED_START" "$HTACCESS_FILE"; then
            warn "Backup created"
            if ! strip_managed_block "$HTACCESS_FILE"; then
                echo -e "${RED}Error: managed block in ${HTACCESS_FILE} has no end marker — file left unchanged, fix it manually${NC}" >&2
                exit 1
            fi
            warn "Old managed block removed"
        else
            warn "Existing .htaccess backed up"
        fi
    fi

    # Assemble "new rules + existing content" in a scratch file, then copy the
    # content back with a redirect (not mv) so an existing .htaccess keeps its
    # owner and permissions.
    staged="$(mktemp)"
    {
        generate_rules
        if [[ -s "$HTACCESS_FILE" ]]; then
            cat -- "$HTACCESS_FILE"
        fi
    } > "$staged"
    cat -- "$staged" > "$HTACCESS_FILE"
    rm -f -- "$staged"
    info "Bot-block rules written to ${HTACCESS_FILE}"

    echo ""
    echo -e "${BOLD}Done.${NC}"
    echo ""
    echo " File: ${HTACCESS_FILE}"
    echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
    echo " Expected: 403"
}

# =====================================================
# --remove mode
# =====================================================
if [[ "$REMOVE" == "true" ]]; then
    if [[ -n "$HTACCESS_PATH" ]]; then
        remove_htaccess_rules
    else
        remove_server_conf
    fi
    exit 0
fi

# =====================================================
# --htaccess mode (install)
# =====================================================
if [[ -n "$HTACCESS_PATH" ]]; then
    install_htaccess_rules
    exit 0
fi

# =====================================================
# Server-wide install (default)
# =====================================================

# Path of the backup taken in step 2; stays empty when no conf existed before.
# Step 4 uses it to put things back if the new config does not validate.
CONF_BACKUP=""

# --- Step 1: Enable mod_rewrite (Debian) ---
if [[ "$DISTRO" == "debian" ]]; then
    step "Enabling mod_rewrite"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: a2enmod rewrite"
    elif a2enmod rewrite 2>/dev/null; then
        info "mod_rewrite enabled"
    else
        # Do not claim success; the config test in step 4 decides.
        warn "a2enmod rewrite failed — the config test below will show whether mod_rewrite is usable"
    fi
fi

# --- Step 2: Write conf file ---
step "Creating bot-block conf at ${CONF_FILE}"

if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would create: ${CONF_FILE}"
else
    if [[ -f "$CONF_FILE" ]]; then
        CONF_BACKUP="${CONF_FILE}.bak.$(date +%s)"
        cp -p -- "$CONF_FILE" "$CONF_BACKUP"
        warn "Existing config backed up"
    fi
    generate_rules > "$CONF_FILE"
    info "Config created: ${CONF_FILE}"
fi

# --- Step 3: Enable conf (Debian) ---
if [[ "$DISTRO" == "debian" ]]; then
    step "Enabling bot-block conf"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: a2enconf bot-block"
    elif a2enconf bot-block 2>/dev/null; then
        info "Conf enabled (a2enconf)"
    else
        warn "a2enconf bot-block failed — the rules are written but may not be active"
    fi
fi

# --- Step 4: Validate config ---
step "Testing Apache configuration"

if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would run: apachectl configtest"
else
    if apachectl configtest 2>&1; then
        info "Apache config valid"
    else
        echo -e "${RED}[ERROR] Apache config test failed${NC}" >&2
        # Leaving an invalid conf installed would make the next Apache
        # restart fail, so undo step 2/3 before giving up.
        if [[ -n "$CONF_BACKUP" ]]; then
            cp -p -- "$CONF_BACKUP" "$CONF_FILE"
            echo " Restored previous config from ${CONF_BACKUP}; Apache was not reloaded" >&2
        else
            if [[ "$DISTRO" == "debian" ]]; then
                # Best effort: the file is deleted next either way.
                a2disconf bot-block >/dev/null 2>&1 || rm -f -- "${CONF_FILE/conf-available/conf-enabled}"
            fi
            rm -f -- "$CONF_FILE"
            echo " Removed ${CONF_FILE}; Apache was not reloaded" >&2
        fi
        exit 1
    fi
fi

# --- Step 5: Reload Apache ---
step "Reloading ${APACHE_SVC}"

if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would run: systemctl reload ${APACHE_SVC}"
else
    systemctl reload "$APACHE_SVC"
    info "${APACHE_SVC} reloaded"
fi

# =====================================================
# Summary
# =====================================================
echo ""
if [[ "$DRY_RUN" == "true" ]]; then
    # Nothing was changed, so do not report "Done.".
    echo -e "${BOLD}Dry-run complete — no changes made.${NC}"
else
    echo -e "${BOLD}Done.${NC}"
fi
echo ""
echo " Config: ${CONF_FILE}"
echo " Distro: ${DISTRO}"
echo ""
echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
echo " Expected: 403"
echo ""
echo " Remove: sudo $(basename "$0") --remove"