#!/bin/bash
################################################################################
# Script Name:  add-apache-bot-block.sh
# Version:      1.1
# Description:  Automate AI scraper, SEO bot, vulnerability scanner, and
#               scraping framework blocking on standard Apache servers.
#               Creates mod_rewrite rules server-wide or per-directory via
#               .htaccess.
#
# Author:       Phil Connor
# Contact:      contact@mylinux.work
# Website:      https://mylinux.work
# License:      MIT
#
# Prerequisites:
#   - Apache installed (apache2 on Debian/Ubuntu or httpd on RHEL/CentOS)
#   - Root access
#   - mod_rewrite available
#
# Usage:
#   sudo ./add-apache-bot-block.sh
#   sudo ./add-apache-bot-block.sh --dry-run
#   sudo ./add-apache-bot-block.sh --remove
#   sudo ./add-apache-bot-block.sh --htaccess /var/www/html
#   sudo ./add-apache-bot-block.sh --htaccess /var/www/html --remove
#
# Changelog:
#   1.1 — 2026-05-04: Removed OAI-SearchBot from blocklist. User-facing fetcher
#       bot, not a training crawler. Blocking it prevents your content from
#       being cited in AI search answers.
#
################################################################################

set -euo pipefail

# --- Configuration ---
DRY_RUN=false      # true: only print what would be done
REMOVE=false       # true: remove rules instead of installing them
HTACCESS_PATH=""   # directory whose .htaccess is managed (empty = server-wide)
DISTRO=""          # debian or rhel
CONF_FILE=""       # set after distro detection
APACHE_SVC=""      # apache2 or httpd
CONF_BACKUP=""     # backup of a pre-existing server-wide conf, if one was made

# --- Colors ---
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m'

# --- Output helpers ---
info() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
step() { echo -e "${CYAN}[STEP]${NC} $*"; }

# Print an error message to stderr and abort with status 1.
die() {
  echo -e "${RED}Error: $*${NC}" >&2
  exit 1
}

# Print command-line help to stdout.
# NOTE(review): usage text, option parser and root check were reconstructed
# from the Usage/Prerequisites section of the file header — confirm the exact
# wording against the upstream script.
usage() {
  cat <<EOF
Usage: sudo $(basename "$0") [OPTIONS]

Options:
  --dry-run         Show what would be done without changing anything
  --remove          Remove the bot-block rules instead of installing them
  --htaccess DIR    Manage DIR/.htaccess instead of the server-wide config
  -h, --help        Show this help and exit
EOF
}

# --- Argument parsing ---
while (( $# > 0 )); do
  case "$1" in
    --dry-run)
      DRY_RUN=true
      ;;
    --remove)
      REMOVE=true
      ;;
    --htaccess)
      # Guard $2 explicitly: under 'set -u' a missing value would otherwise
      # abort with an unhelpful "unbound variable" message.
      if (( $# < 2 )) || [[ -z "$2" ]]; then
        usage >&2
        die "--htaccess requires a directory argument"
      fi
      HTACCESS_PATH=$2
      shift
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      usage >&2
      die "Unknown option: $1"
      ;;
  esac
  shift
done

# --- Root check ---
if (( EUID != 0 )); then
  die "This script must be run as root"
fi

# --- Distro detection ---
# Sets DISTRO, CONF_FILE and APACHE_SVC from the distro marker files;
# aborts on anything that is neither Debian-like nor RHEL-like.
detect_distro() {
  if [[ -f /etc/debian_version ]]; then
    DISTRO="debian"
    CONF_FILE="/etc/apache2/conf-available/bot-block.conf"
    APACHE_SVC="apache2"
  elif [[ -f /etc/redhat-release ]]; then
    DISTRO="rhel"
    CONF_FILE="/etc/httpd/conf.d/bot-block.conf"
    APACHE_SVC="httpd"
  else
    die "Unsupported distribution (neither Debian/Ubuntu nor RHEL/CentOS)"
  fi
}

detect_distro

# --- Apache check ---
if ! command -v apachectl &>/dev/null; then
  die "Apache (${APACHE_SVC}) not found"
fi

# --- Bot-block rules content ---
# These markers delimit the block this script owns inside a .htaccess file.
# They must stay identical to the first/last line emitted by generate_rules.
readonly MANAGED_START="# bot-block-managed-start"
readonly MANAGED_END="# bot-block-managed-end"

# Emit the complete managed rule block (including both markers) on stdout.
# The heredoc delimiter is quoted so %{...} is passed through literally.
generate_rules() {
  cat <<'RULES'
# bot-block-managed-start
# Bot-blocking rules for Apache — generated by add-apache-bot-block.sh
# https://mylinux.work

RewriteEngine On

# AI scrapers
RewriteCond %{HTTP_USER_AGENT} ABEvalBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} GPTBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ClaudeBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} anthropic-ai [NC,OR]
RewriteCond %{HTTP_USER_AGENT} CCBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Bytespider [NC,OR]
RewriteCond %{HTTP_USER_AGENT} TikTokSpider [NC,OR]
RewriteCond %{HTTP_USER_AGENT} cohere-ai [NC,OR]
RewriteCond %{HTTP_USER_AGENT} PerplexityBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Diffbot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} MistralBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} YandexGPTBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} meta-externalagent [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Meta-ExternalFetcher [NC,OR]
RewriteCond %{HTTP_USER_AGENT} meta-webindexer [NC,OR]
RewriteCond %{HTTP_USER_AGENT} PetalBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Amazonbot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Amzn-SearchBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} AI2Bot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Timpibot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} img2dataset [NC,OR]
RewriteCond %{HTTP_USER_AGENT} YouBot [NC,OR]

# SEO scrapers
RewriteCond %{HTTP_USER_AGENT} MJ12bot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} SemrushBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} AhrefsBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} DotBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} DataForSeoBot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} SERanking [NC,OR]

# Vulnerability scanners
RewriteCond %{HTTP_USER_AGENT} Nikto [NC,OR]
RewriteCond %{HTTP_USER_AGENT} sqlmap [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Nmap [NC,OR]
RewriteCond %{HTTP_USER_AGENT} masscan [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ZmEu [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Morpheus [NC,OR]

# Scraping frameworks
RewriteCond %{HTTP_USER_AGENT} Scrapy [NC,OR]
RewriteCond %{HTTP_USER_AGENT} python-requests [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Go-http-client [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Java/ [NC,OR]
RewriteCond %{HTTP_USER_AGENT} libwww-perl [NC,OR]
RewriteCond %{HTTP_USER_AGENT} trafilatura [NC]
RewriteRule .* - [F]

# Block broken srcset scrapers
RewriteCond %{REQUEST_URI} %20[0-9]+w,https?:// [NC]
RewriteRule .* - [F]
# bot-block-managed-end
RULES
}

# Copy a file to "<file>.bak.<epoch-seconds>" and print the backup path.
# Arguments: $1 - file to back up
backup_file() {
  local src=$1
  local dest
  dest="${src}.bak.$(date +%s)"
  cp -- "$src" "$dest"
  printf '%s\n' "$dest"
}

# Delete the managed block (markers included) from a file, in place.
# Arguments: $1 - file to edit
# Aborts without touching the file when a start marker has no matching end
# marker: a sed address range whose end never matches runs to end of file,
# which would silently delete every rule after the start marker.
strip_managed_block() {
  local file=$1
  if ! awk -v s="$MANAGED_START" -v e="$MANAGED_END" '
        index($0, s) { open = 1; next }
        index($0, e) { open = 0 }
        END { exit open }
      ' "$file"; then
    die "Unterminated managed block in ${file} ('${MANAGED_END}' missing) — refusing to edit"
  fi
  sed -i "/${MANAGED_START}/,/${MANAGED_END}/d" "$file"
}

# Undo the server-wide conf write: restore the pre-existing conf when a backup
# was taken, otherwise disable (Debian) and delete the file just created.
rollback_conf() {
  if [[ -n "$CONF_BACKUP" ]]; then
    cp -- "$CONF_BACKUP" "$CONF_FILE"
    warn "Restored previous config from ${CONF_BACKUP}"
  else
    if [[ "$DISTRO" == "debian" ]]; then
      # Best effort: the conf may never have been enabled.
      a2disconf bot-block >/dev/null 2>&1 || true
    fi
    rm -f -- "$CONF_FILE"
    warn "Removed ${CONF_FILE}"
  fi
}

# =====================================================
# --remove mode
# =====================================================
if [[ "$REMOVE" == "true" ]]; then

  # --- Remove from .htaccess ---
  if [[ -n "$HTACCESS_PATH" ]]; then
    HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess"
    step "Removing bot-block rules from ${HTACCESS_FILE}"

    if [[ ! -f "$HTACCESS_FILE" ]]; then
      warn "File not found: ${HTACCESS_FILE} — nothing to remove"
      exit 0
    fi

    if ! grep -qF -- "$MANAGED_START" "$HTACCESS_FILE"; then
      warn "No managed bot-block block found in ${HTACCESS_FILE}"
      exit 0
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
      echo "  Would strip managed block from ${HTACCESS_FILE}"
    else
      backup_file "$HTACCESS_FILE" >/dev/null
      warn "Backup created"
      strip_managed_block "$HTACCESS_FILE"
      info "Managed block removed from ${HTACCESS_FILE}"
    fi
    exit 0
  fi

  # --- Remove server-wide conf ---
  step "Removing bot-block configuration"

  if [[ ! -f "$CONF_FILE" ]]; then
    warn "Config not found: ${CONF_FILE} — nothing to remove"
    exit 0
  fi

  if [[ "$DRY_RUN" == "true" ]]; then
    if [[ "$DISTRO" == "debian" ]]; then
      echo "  Would run: a2disconf bot-block"
    fi
    echo "  Would remove: ${CONF_FILE}"
    echo "  Would reload: ${APACHE_SVC}"
  else
    if [[ "$DISTRO" == "debian" ]]; then
      # Still best effort (removal continues either way), but only claim
      # success when a2disconf actually succeeded.
      if a2disconf bot-block 2>/dev/null; then
        info "Conf disabled (a2disconf)"
      else
        warn "a2disconf bot-block failed — continuing with file removal"
      fi
    fi
    rm -f -- "$CONF_FILE"
    info "Removed ${CONF_FILE}"

    step "Reloading ${APACHE_SVC}"
    systemctl reload "$APACHE_SVC"
    info "${APACHE_SVC} reloaded"
  fi

  echo ""
  echo -e "${BOLD}Bot-block configuration removed.${NC}"
  exit 0
fi

# =====================================================
# --htaccess mode (install)
# =====================================================
if [[ -n "$HTACCESS_PATH" ]]; then
  HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess"
  step "Writing bot-block rules to ${HTACCESS_FILE}"

  if [[ ! -d "$HTACCESS_PATH" ]]; then
    die "Directory not found: ${HTACCESS_PATH}"
  fi

  if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would write managed block to ${HTACCESS_FILE}"
    echo ""
    echo -e "${BOLD}Dry-run complete — no changes made.${NC}"
    exit 0
  fi

  # Back up an existing .htaccess, then drop any previous managed block so
  # re-running the script replaces the rules instead of duplicating them.
  if [[ -f "$HTACCESS_FILE" ]]; then
    backup_file "$HTACCESS_FILE" >/dev/null
    if grep -qF -- "$MANAGED_START" "$HTACCESS_FILE"; then
      warn "Backup created"
      strip_managed_block "$HTACCESS_FILE"
      warn "Old managed block removed"
    else
      warn "Existing .htaccess backed up"
    fi

    # If the file does not end in a newline, appending would glue the start
    # marker onto the last existing directive, corrupting that directive and
    # hiding the marker from later runs. $(...) strips a trailing newline, so
    # a non-empty result means the last byte is something else.
    if [[ -s "$HTACCESS_FILE" && -n "$(tail -c 1 -- "$HTACCESS_FILE")" ]]; then
      printf '\n' >>"$HTACCESS_FILE"
    fi
  fi

  # NOTE(review): rules are appended after any existing content. If earlier
  # rewrite rules in this .htaccess end processing with [L] (typical CMS
  # front-controller rules), the bot-block rules may not be reached for those
  # requests — confirm whether the block should be placed first instead.
  generate_rules >>"$HTACCESS_FILE"
  info "Bot-block rules written to ${HTACCESS_FILE}"

  echo ""
  echo -e "${BOLD}Done.${NC}"
  echo ""
  echo "  File:     ${HTACCESS_FILE}"
  echo "  Verify:   curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
  echo "  Expected: 403"
  exit 0
fi

# =====================================================
# Server-wide install (default)
# =====================================================

# --- Step 1: Enable mod_rewrite (Debian) ---
if [[ "$DISTRO" == "debian" ]]; then
  step "Enabling mod_rewrite"
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would run: a2enmod rewrite"
  elif a2enmod rewrite 2>/dev/null; then
    info "mod_rewrite enabled"
  else
    # Not fatal here: if mod_rewrite really is unavailable, the config test
    # in step 4 fails on the Rewrite* directives and the change is rolled back.
    warn "a2enmod rewrite failed — continuing; the config test will verify mod_rewrite"
  fi
fi

# --- Step 2: Write conf file ---
# NOTE(review): per the mod_rewrite documentation, rewrite configuration in
# the main server context is not inherited by <VirtualHost> sections by
# default. On servers that serve sites from virtual hosts these rules may need
# "RewriteOptions InheritDown" (Apache 2.4.8+) or per-vhost inclusion —
# confirm against the target setup.
step "Creating bot-block conf at ${CONF_FILE}"
if [[ "$DRY_RUN" == "true" ]]; then
  echo "  Would create: ${CONF_FILE}"
else
  if [[ -f "$CONF_FILE" ]]; then
    CONF_BACKUP=$(backup_file "$CONF_FILE")
    warn "Existing config backed up"
  fi
  generate_rules >"$CONF_FILE"
  info "Config created: ${CONF_FILE}"
fi

# --- Step 3: Enable conf (Debian) ---
if [[ "$DISTRO" == "debian" ]]; then
  step "Enabling bot-block conf"
  if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would run: a2enconf bot-block"
  elif a2enconf bot-block 2>/dev/null; then
    info "Conf enabled (a2enconf)"
  else
    # Without the conf-enabled link Apache never loads the rules, so reporting
    # success from here on would be false.
    rollback_conf
    die "a2enconf bot-block failed — bot-block rules are not active"
  fi
fi

# --- Step 4: Validate config ---
step "Testing Apache configuration"
if [[ "$DRY_RUN" == "true" ]]; then
  echo "  Would run: apachectl configtest"
else
  if apachectl configtest 2>&1; then
    info "Apache config valid"
  else
    echo -e "${RED}[ERROR] Apache config test failed${NC}" >&2
    # Do not leave a conf that fails validation enabled on disk: a later
    # restart of Apache would otherwise refuse to start.
    rollback_conf
    echo "  Bot-block changes were rolled back; ${APACHE_SVC} was not reloaded" >&2
    exit 1
  fi
fi

# --- Step 5: Reload Apache ---
step "Reloading ${APACHE_SVC}"
if [[ "$DRY_RUN" == "true" ]]; then
  echo "  Would run: systemctl reload ${APACHE_SVC}"
else
  systemctl reload "$APACHE_SVC"
  info "${APACHE_SVC} reloaded"
fi

# =====================================================
# Summary
# =====================================================
echo ""
echo -e "${BOLD}Done.${NC}"
echo ""
echo "  Config:   ${CONF_FILE}"
echo "  Distro:   ${DISTRO}"
echo ""
echo "  Verify:   curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
echo "  Expected: 403"
echo ""
echo "  Remove:   sudo $(basename "$0") --remove"