#!/bin/bash
################################################################################
# Script Name: add-nginx-bot-block.sh
# Version: 1.3
# Description: Configure AI scraper and bot blocking on standard nginx servers.
#              Creates an nginx map in conf.d and injects bot-blocking rules
#              into server blocks found in sites-enabled and conf.d.
#              For HestiaCP / VestaCP / myVesta servers, use hestia-bot-block.sh instead.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - nginx installed and running
#   - Root access
#
# Usage:
#   sudo ./add-nginx-bot-block.sh
#   sudo ./add-nginx-bot-block.sh --dry-run
#   sudo ./add-nginx-bot-block.sh --conf /etc/nginx/sites-enabled/mysite.conf
#   sudo ./add-nginx-bot-block.sh --status-code 403
#   sudo ./add-nginx-bot-block.sh --remove
#
# Changelog:
#   1.3 — 2026-05-11: Added fragment-in-referer blocking (real browsers strip
#       URI fragments from the Referer header). Added request method blocking
#       (only GET/HEAD allowed — static sites never need POST/PUT/DELETE).
#       Added ospa-radar (lead-gen/business intelligence crawler) to blocklist.
#   1.2 — 2026-05-08: Added Exabot (defunct Dassault/Exalead crawler, now
#       spoofed) and Sogou (Tencent Chinese search crawler) to blocklist.
#   1.1 — 2026-05-04: Removed Claude-Web and OAI-SearchBot from blocklist.
#       These are user-facing fetcher bots, not training crawlers. Blocking
#       them prevents your content from being cited in AI answers.
#
################################################################################

set -euo pipefail

# --- Configuration ---
CONF_DIR="/etc/nginx/conf.d"
SITES_DIR="/etc/nginx/sites-enabled"
MAP_FILE="${CONF_DIR}/bot-block.conf"
DRY_RUN=false
REMOVE=false
SINGLE_CONF=""
STATUS_CODE="444"
TIMESTAMP=$(date +%s)
MARKER_START="# bot-block-managed-start"
MARKER_END="# bot-block-managed-end"
# Backups are kept outside the nginx include paths. A stock nginx.conf uses
# "include sites-enabled/*;", so a backup written next to a site file would be
# loaded as a second copy of that site and rescanned on the next run.
BACKUP_DIR="/var/backups/nginx-bot-block/${TIMESTAMP}"
# Scratch file used while rewriting a config; removed by the EXIT trap.
WORK_TMP=""

# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

info() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
step() { echo -e "${CYAN}[STEP]${NC} $*"; }

# Print an error to stderr and abort the script.
die() {
    echo -e "${RED}Error: $*${NC}" >&2
    exit 1
}

# Remove the scratch file on any exit path.
cleanup() {
    if [[ -n "$WORK_TMP" ]]; then
        rm -f -- "$WORK_TMP"
    fi
}
trap cleanup EXIT

# NOTE(review): the usage text, option parsing and root check below were
# unreadable in the reviewed copy of this file. They are reconstructed from the
# Usage section of the file header — confirm the wording against the original.
usage() {
    cat <<EOF
Usage: ${0##*/} [OPTIONS]

Configure AI scraper and bot blocking on a standard nginx server.

Options:
  --dry-run            Show what would be done without changing anything
  --conf PATH          Only process the given nginx config file
  --status-code CODE   Status returned to blocked clients (default: 444)
  --remove             Remove the bot-block map and all injected rules
  -h, --help           Show this help and exit
EOF
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --remove)
            REMOVE=true
            shift
            ;;
        --conf)
            [[ $# -ge 2 ]] || die "--conf requires a path"
            SINGLE_CONF="$2"
            shift 2
            ;;
        --status-code)
            [[ $# -ge 2 ]] || die "--status-code requires a value"
            STATUS_CODE="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo -e "${RED}Error: unknown option: $1${NC}" >&2
            usage >&2
            exit 1
            ;;
    esac
done

# The status code is interpolated into nginx config, so accept digits only.
if [[ ! "$STATUS_CODE" =~ ^[1-9][0-9]{2}$ ]]; then
    die "Invalid status code '${STATUS_CODE}' (expected a 3-digit number)"
fi

# Checked here, in the main shell: collect_configs runs inside a process
# substitution, where an 'exit' would only end the subshell.
if [[ -n "$SINGLE_CONF" && ! -f "$SINGLE_CONF" ]]; then
    die "Config file not found: ${SINGLE_CONF}"
fi

if [[ $EUID -ne 0 ]]; then
    die "This script must be run as root"
fi

if ! command -v nginx &>/dev/null; then
    echo -e "${RED}Error: nginx not found${NC}" >&2
    exit 1
fi

# =====================================================
# Collect config files to process
# =====================================================
# Prints, one per line, every candidate config that contains a "server {" line.
# Candidates are SINGLE_CONF when set, otherwise all regular files in SITES_DIR
# plus *.conf in CONF_DIR (excluding MAP_FILE). Prints nothing when no file
# qualifies, so callers reading the output with mapfile get an empty array.
collect_configs() {
    local -a candidates=()
    local f

    if [[ -n "$SINGLE_CONF" ]]; then
        if [[ -f "$SINGLE_CONF" ]]; then
            candidates+=("$SINGLE_CONF")
        fi
    else
        # Scan sites-enabled
        if [[ -d "$SITES_DIR" ]]; then
            for f in "$SITES_DIR"/*; do
                [[ -f "$f" ]] || continue
                # Skip .bak.<timestamp> copies left next to site files by
                # earlier versions of this script.
                case "$f" in
                    *.bak.[0-9]*) continue ;;
                esac
                candidates+=("$f")
            done
        fi

        # Scan conf.d (skip bot-block.conf itself)
        if [[ -d "$CONF_DIR" ]]; then
            for f in "$CONF_DIR"/*.conf; do
                [[ -f "$f" ]] || continue
                [[ "$f" == "$MAP_FILE" ]] && continue
                candidates+=("$f")
            done
        fi
    fi

    # Guard the expansion: an empty array trips 'set -u' on older bash.
    (( ${#candidates[@]} > 0 )) || return 0

    # Filter to only files containing a server block
    for f in "${candidates[@]}"; do
        if grep -Eq '^[[:space:]]*server[[:space:]]*\{' -- "$f" 2>/dev/null; then
            printf '%s\n' "$f"
        fi
    done
}

# Succeeds when any scanned config (ignoring --conf) still contains a managed
# block, i.e. still references the $is_bad_bot variable defined by the map.
managed_elsewhere() {
    local f
    while IFS= read -r f; do
        [[ -n "$f" ]] || continue
        if grep -qF -- "$MARKER_START" "$f" 2>/dev/null; then
            return 0
        fi
    done < <(SINGLE_CONF="" collect_configs)
    return 1
}

# Copy $1 (symlinks resolved) to BACKUP_DIR, mirroring its absolute path.
backup_file() {
    local src=$1
    local real dest
    real=$(readlink -f -- "$src") || die "Cannot resolve path: ${src}"
    dest="${BACKUP_DIR}${real}"
    ( umask 077; mkdir -p -- "$(dirname -- "$dest")" )
    cp -p -- "$real" "$dest"
}

# =====================================================
# REMOVE MODE
# =====================================================
if [[ "$REMOVE" == "true" ]]; then
    step "Removing bot-block configuration"

    # Strip managed blocks first: the map must outlive every rule that uses it.
    step "Scanning for injected bot-block rules"
    mapfile -t configs < <(collect_configs)

    if [[ ${#configs[@]} -eq 0 ]]; then
        warn "No server block config files found"
    else
        for conf in "${configs[@]}"; do
            if grep -qF -- "$MARKER_START" "$conf" 2>/dev/null; then
                if [[ "$DRY_RUN" == "true" ]]; then
                    echo "  Would clean: ${conf}"
                else
                    backup_file "$conf"
                    WORK_TMP=$(mktemp)
                    sed -e "/${MARKER_START}/,/${MARKER_END}/d" -- "$conf" > "$WORK_TMP"
                    # Write through the path so a sites-enabled symlink and
                    # the file's mode/owner are preserved.
                    cat -- "$WORK_TMP" > "$conf"
                    rm -f -- "$WORK_TMP"
                    WORK_TMP=""
                    info "Cleaned: ${conf}"
                fi
            fi
        done
    fi

    # Remove map file, unless another config still depends on it
    if [[ ! -f "$MAP_FILE" ]]; then
        warn "Map file not found: ${MAP_FILE} (already removed?)"
    elif [[ "$DRY_RUN" == "true" ]]; then
        echo "  Would remove: ${MAP_FILE}"
    elif managed_elsewhere; then
        warn "Keeping ${MAP_FILE}: other configs still contain bot-block rules"
    else
        rm -f -- "$MAP_FILE"
        info "Removed: ${MAP_FILE}"
    fi

    # Validate and reload
    if [[ "$DRY_RUN" == "true" ]]; then
        echo "  Would run: nginx -t"
        echo "  Would run: systemctl reload nginx"
    else
        step "Testing nginx configuration"
        if nginx -t 2>&1; then
            info "nginx config valid"
        else
            echo -e "${RED}[ERROR] nginx config test failed — restore backups from ${BACKUP_DIR}/${NC}" >&2
            exit 1
        fi

        step "Reloading nginx"
        systemctl reload nginx
        info "nginx reloaded"
    fi

    echo ""
    echo -e "${BOLD}Bot-block rules removed.${NC}"
    exit 0
fi

# =====================================================
# INSTALL MODE
# =====================================================
# Step 1: Create nginx map
# =====================================================
step "Creating bot-block map at ${MAP_FILE}"

MAP_CONTENT='# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners
# Generated by add-nginx-bot-block.sh — https://mylinux.work
map $http_user_agent $is_bad_bot {
    default 0;

    # AI scrapers
    ~*ABEvalBot 1;
    ~*GPTBot 1;
    ~*ClaudeBot 1;
    ~*anthropic-ai 1;
    ~*CCBot 1;
    ~*Bytespider 1;
    ~*TikTokSpider 1;
    ~*cohere-ai 1;
    ~*PerplexityBot 1;
    ~*Diffbot 1;
    ~*MistralBot 1;
    ~*YandexGPTBot 1;
    ~*meta-externalagent 1;
    ~*Meta-ExternalFetcher 1;
    ~*meta-webindexer 1;
    ~*PetalBot 1;
    ~*Amazonbot 1;
    ~*Amzn-SearchBot 1;
    ~*AI2Bot 1;
    ~*Timpibot 1;
    ~*img2dataset 1;
    ~*YouBot 1;
    ~*HanaleiBot 1;

    # Defunct crawlers (spoofed user agents)
    ~*Exabot 1;
    ~*Sogou 1;

    # SEO scrapers
    ~*MJ12bot 1;
    ~*SemrushBot 1;
    ~*AhrefsBot 1;
    ~*DotBot 1;
    ~*DataForSeoBot 1;
    ~*SERanking 1;

    # Vulnerability scanners
    ~*Nikto 1;
    ~*sqlmap 1;
    ~*Nmap 1;
    ~*masscan 1;
    ~*ZmEu 1;
    ~*Morpheus 1;

    # Lead-gen / business intelligence bots
    ~*ospa-radar 1;
    ~*HubSeedsBot 1;

    # AI scrapers / research bots
    ~*Aranet-SearchBot 1;
    ~*AzureAI-SearchBot 1;
    ~*MINERVA-DeepResearch 1;
    ~*NagetBot 1;
    ~*LAIABot 1;
    ~*pi-coding-agent 1;

    # Probe / monitoring bots
    ~*CMS-Checker 1;
    ~*NexoFaviconBot 1;
    ~*AwarioBot 1;
    ~*AwarioSmartBot 1;
    ~*CopyousBot 1;
    ~*SurdotlyBot 1;
    ~*trendictionbot 1;
    ~*wpbot 1;
    ~*WebFetchTool 1;
    ~*YisouSpider 1;

    # Scraping frameworks
    ~*Scrapy 1;
    ~*python-requests 1;
    ~*Go-http-client 1;
    ~*Java/ 1;
    ~*libwww-perl 1;
    ~*trafilatura 1;
    ~*node-fetch 1;

    # Outdated browsers (Chrome < 115 — almost certainly bots)
    ~*Chrome/([1-9][0-9]?|10[0-9]|11[0-4])\. 1;

    # Empty / missing user agent
    "" 1;
    "-" 1;
}'

if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would create: ${MAP_FILE}"
else
    [[ -d "$CONF_DIR" ]] || die "Directory not found: ${CONF_DIR}"
    if [[ -f "$MAP_FILE" ]]; then
        backup_file "$MAP_FILE"
        warn "Existing map backed up"
    fi
    printf '%s\n' "$MAP_CONTENT" > "$MAP_FILE"
    info "Map created: ${MAP_FILE}"
fi

# =====================================================
# Step 2: Inject bot-blocking rule into server blocks
# =====================================================
step "Scanning for server blocks to inject bot-blocking rule"

# The rules inserted before the first location of each server block. This is a
# plain multi-line string: awk receives it through the environment, because
# "-v" assignments undergo escape processing (gawk drops backslash-newline
# pairs, which would fold the whole block into one comment line).
BOT_BLOCK="    ${MARKER_START}
    if (\$is_bad_bot) {
        return ${STATUS_CODE};
    }
    # Block broken srcset scrapers
    if (\$request_uri ~* \"%20[0-9]+w,https?://\") {
        return ${STATUS_CODE};
    }
    # Block spoofed referers with fragment identifiers (real browsers strip these)
    if (\$http_referer ~* \"#\") {
        return ${STATUS_CODE};
    }
    # Block non-GET/HEAD methods (static sites never need POST/PUT/DELETE)
    if (\$request_method !~ ^(GET|HEAD)\$ ) {
        return ${STATUS_CODE};
    }
    ${MARKER_END}"

mapfile -t configs < <(collect_configs)

if [[ ${#configs[@]} -eq 0 ]]; then
    warn "No server block config files found in ${SITES_DIR} or ${CONF_DIR}"
else
    MODIFIED=0
    for conf in "${configs[@]}"; do
        # Skip if already managed
        if grep -qF -- "$MARKER_START" "$conf" 2>/dev/null; then
            warn "Already managed: ${conf} — skipping"
            continue
        fi

        if [[ "$DRY_RUN" == "true" ]]; then
            echo "  Would inject into: ${conf}"
            MODIFIED=$((MODIFIED + 1))
            continue
        fi

        # Build the new contents: after each "server {" line, print the block
        # once, immediately before the first following "location" line.
        WORK_TMP=$(mktemp)
        BOT_BLOCK="$BOT_BLOCK" awk '
            BEGIN { block = ENVIRON["BOT_BLOCK"] }
            /^[[:space:]]*server[[:space:]]*\{/ { in_server = 1; injected = 0 }
            in_server && !injected && /^[[:space:]]*location[[:space:]]/ {
                print block
                print ""
                injected = 1
            }
            { print }
        ' "$conf" > "$WORK_TMP"

        # A server block without any location gets no rules; say so instead of
        # reporting a successful injection.
        if ! grep -qF -- "$MARKER_START" "$WORK_TMP"; then
            warn "No location directive found in ${conf} — nothing injected"
            rm -f -- "$WORK_TMP"
            WORK_TMP=""
            continue
        fi

        # Backup, then write through the path so a sites-enabled symlink and
        # the file's mode/owner are preserved.
        backup_file "$conf"
        cat -- "$WORK_TMP" > "$conf"
        rm -f -- "$WORK_TMP"
        WORK_TMP=""

        info "Injected into: ${conf}"
        MODIFIED=$((MODIFIED + 1))
    done

    if [[ $MODIFIED -eq 0 ]]; then
        warn "No files modified (already managed or no location directives)"
    fi
fi

# =====================================================
# Step 3: Validate nginx config
# =====================================================
step "Testing nginx configuration"

if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would run: nginx -t"
else
    if nginx -t 2>&1; then
        info "nginx config valid"
    else
        echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2
        echo "  Restore backups from ${BACKUP_DIR}/ (paths mirror the originals)" >&2
        exit 1
    fi
fi

# =====================================================
# Step 4: Reload nginx
# =====================================================
step "Reloading nginx"

if [[ "$DRY_RUN" == "true" ]]; then
    echo "  Would run: systemctl reload nginx"
else
    systemctl reload nginx
    info "nginx reloaded"
fi

# =====================================================
# Summary
# =====================================================
echo ""
echo -e "${BOLD}Done.${NC}"
echo ""
echo "  Map:         ${MAP_FILE}"
echo "  Status code: ${STATUS_CODE}"
if [[ -n "$SINGLE_CONF" ]]; then
    echo "  Config:      ${SINGLE_CONF}"
else
    echo "  Scanned:     ${SITES_DIR}/ and ${CONF_DIR}/*.conf"
fi
if [[ -d "$BACKUP_DIR" ]]; then
    echo "  Backups:     ${BACKUP_DIR}/ (paths mirror the originals)"
fi
echo ""
echo "  To remove:   sudo $(basename "$0") --remove"
echo ""
echo "  Verify:      curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
if [[ "$STATUS_CODE" == "444" ]]; then
    echo "  Expected:    444 (connection dropped) or 000 (no response)"
else
    echo "  Expected:    ${STATUS_CODE}"
fi