#!/bin/bash
################################################################################
# Script Name: hestia-bot-block.sh
# Version: 2.5
# Description: Configure AI scraper and SEO bot blocking on HestiaCP and
#              VestaCP/myVesta servers. Creates an nginx map in conf.d, builds
#              custom nginx templates with bot-blocking rules, and optionally
#              applies them to specified domains.
#
#              Supports incremental map updates and stacking on existing
#              templates (e.g., geoip). Safe to re-run — merges new bots
#              into existing map without losing custom additions.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - HestiaCP or VestaCP/myVesta installed and running
#   - Root access
#   - nginx as proxy (default HestiaCP setup)
#
# Usage:
#   sudo ./hestia-bot-block.sh
#   sudo ./hestia-bot-block.sh --apply user domain.com
#   sudo ./hestia-bot-block.sh --apply-all user
#   sudo ./hestia-bot-block.sh --base-template default-geoip
#   sudo ./hestia-bot-block.sh --update-map-only
#   sudo ./hestia-bot-block.sh --dry-run
#
# Changelog:
#   2.5 — 2026-05-12: Added empty-referer image scrape blocking. Headless
#         bot networks (Puppeteer/Playwright on residential proxies) hit
#         cover images directly with no referer — now returns 444 for any
#         image request (.png/.jpg/.webp) with an empty Referer header.
#   2.4 — 2026-05-11: Added fragment-in-referer blocking (real browsers strip
#         URI fragments from the Referer header). Added request method blocking
#         (only GET/HEAD allowed — static sites never need POST/PUT/DELETE).
#         Added ospa-radar (lead-gen/business intelligence crawler) to blocklist.
#   2.3 — 2026-05-08: Added Exabot (defunct Dassault/Exalead crawler, now
#         spoofed) and Sogou (Tencent Chinese search crawler) to blocklist.
#   2.2 — 2026-05-04: Fixed custom entry preservation carrying forward bots
#         that were removed from the builtin list. Previously-builtin bots
#         (OAI-SearchBot, Claude-Web) are now stripped during map updates.
#   2.1 — 2026-05-04: Removed Claude-Web and OAI-SearchBot from blocklist.
#         These are user-facing fetcher bots, not training crawlers. Blocking
#         them prevents your content from being cited in AI answers.
#
################################################################################

set -euo pipefail

# --- Configuration ---
TEMPLATE_NAME="default-botblock"    # name of the generated template set
BASE_TEMPLATE="default"             # template the generated set is derived from
CONF_DIR="/etc/nginx/conf.d"
PANEL_TPL_DIR=""                    # set by detect_panel
PANEL_NAME=""                       # set by detect_panel
MAP_FILE="${CONF_DIR}/bot-block.conf"
APPLY_USER=""
APPLY_DOMAIN=""
APPLY_ALL=false
UPDATE_MAP_ONLY=false
DRY_RUN=false

# --- Colors ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Status-line helpers; each prints a colored tag followed by its arguments.
info() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
step() { echo -e "${CYAN}[STEP]${NC} $*"; }

# Locate the control panel's nginx template directory.
# Sets PANEL_TPL_DIR and PANEL_NAME; exits 1 when neither panel is present.
detect_panel() {
    if [[ -d "/usr/local/hestia/data/templates/web/nginx" ]]; then
        PANEL_TPL_DIR="/usr/local/hestia/data/templates/web/nginx"
        PANEL_NAME="HestiaCP"
    elif [[ -d "/usr/local/vesta/data/templates/web/nginx" ]]; then
        PANEL_TPL_DIR="/usr/local/vesta/data/templates/web/nginx"
        PANEL_NAME="VestaCP/myVesta"
    else
        echo -e "${RED}Error: Neither HestiaCP nor VestaCP/myVesta found${NC}" >&2
        exit 1
    fi
    info "Detected ${PANEL_NAME} (${PANEL_TPL_DIR})"
}

# Print the command-line help to stdout.
# NOTE(review): the help text, the option parser below and the wording of the
# root-check message were missing from the reviewed copy of this file. They
# are reconstructed from the "Usage" section of the file header and from the
# configuration variables above — confirm against the upstream script.
usage() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Options:
  --apply USER DOMAIN     Apply the ${TEMPLATE_NAME} template to one domain
  --apply-all USER        Apply the ${TEMPLATE_NAME} template to all domains of USER
  --base-template NAME    Derive the template from NAME (default: ${BASE_TEMPLATE})
  --update-map-only       Update the nginx map and reload; leave templates alone
  --dry-run               Show what would be done without changing anything
  -h, --help              Show this help and exit
EOF
}

# Abort with a message unless at least $1 arguments remain for option $2.
# Needed because 'set -u' would otherwise die with a bare "unbound variable".
require_args() {
    local needed="$1" option="$2" available="$3"
    if (( available < needed )); then
        echo -e "${RED}Error: ${option} requires $((needed - 1)) argument(s)${NC}" >&2
        usage >&2
        exit 1
    fi
}

# --- Argument parsing ---
while [[ $# -gt 0 ]]; do
    case "$1" in
        --apply)
            require_args 3 "$1" "$#"
            APPLY_USER="$2"
            APPLY_DOMAIN="$3"
            shift 3
            ;;
        --apply-all)
            require_args 2 "$1" "$#"
            APPLY_USER="$2"
            APPLY_ALL=true
            shift 2
            ;;
        --base-template)
            require_args 2 "$1" "$#"
            BASE_TEMPLATE="$2"
            shift 2
            ;;
        --update-map-only)
            UPDATE_MAP_ONLY=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo -e "${RED}Error: Unknown option: $1${NC}" >&2
            usage >&2
            exit 1
            ;;
    esac
done

# --- Prerequisite checks ---
if [[ "${EUID}" -ne 0 ]]; then
    echo -e "${RED}Error: This script must be run as root${NC}" >&2
    exit 1
fi

if ! command -v nginx &>/dev/null; then
    echo -e "${RED}Error: nginx not found${NC}" >&2
    exit 1
fi

# Only detect panel if we need templates
if [[ "$UPDATE_MAP_ONLY" == "false" ]]; then
    detect_panel
fi

# =====================================================
# Bot list — single source of truth
# =====================================================
# Each line: "~*pattern 1;"
# To add a new bot, add it to the appropriate section below.
# 'read -d ""' reads to end of input and therefore always returns non-zero;
# '|| true' keeps 'set -e' from treating that as a failure.
read -r -d '' BOT_LIST <<'BOTLIST' || true
    # AI scrapers
    ~*ABEvalBot 1;
    ~*GPTBot 1;
    ~*ClaudeBot 1;
    ~*anthropic-ai 1;
    ~*CCBot 1;
    ~*Bytespider 1;
    ~*TikTokSpider 1;
    ~*cohere-ai 1;
    ~*PerplexityBot 1;
    ~*Diffbot 1;
    ~*MistralBot 1;
    ~*YandexGPTBot 1;
    ~*meta-externalagent 1;
    ~*Meta-ExternalFetcher 1;
    ~*meta-webindexer 1;
    ~*PetalBot 1;
    ~*Amazonbot 1;
    ~*Amzn-SearchBot 1;
    ~*AI2Bot 1;
    ~*Timpibot 1;
    ~*img2dataset 1;
    ~*YouBot 1;
    ~*HanaleiBot 1;
    ~*Trafilatura 1;

    # Defunct crawlers (spoofed user agents)
    ~*Exabot 1;
    ~*Sogou 1;

    # SEO scrapers
    ~*MJ12bot 1;
    ~*SemrushBot 1;
    ~*AhrefsBot 1;
    ~*DotBot 1;
    ~*DataForSeoBot 1;
    ~*SERanking 1;

    # Vulnerability scanners
    ~*Nikto 1;
    ~*sqlmap 1;
    ~*Nmap 1;
    ~*masscan 1;
    ~*ZmEu 1;
    ~*Morpheus 1;

    # Lead-gen / business intelligence bots
    ~*ospa-radar 1;
    ~*HubSeedsBot 1;

    # AI scrapers / research bots
    ~*Aranet-SearchBot 1;
    ~*AzureAI-SearchBot 1;
    ~*MINERVA-DeepResearch 1;
    ~*NagetBot 1;
    ~*LAIABot 1;
    ~*pi-coding-agent 1;

    # Probe / monitoring bots
    ~*CMS-Checker 1;
    ~*NexoFaviconBot 1;
    ~*AwarioBot 1;
    ~*AwarioSmartBot 1;
    ~*CopyousBot 1;
    ~*SurdotlyBot 1;
    ~*trendictionbot 1;
    ~*wpbot 1;
    ~*WebFetchTool 1;
    ~*YisouSpider 1;

    # Scraping frameworks
    ~*Scrapy 1;
    ~*python-requests 1;
    ~*Go-http-client 1;
    ~*Java/ 1;
    ~*libwww-perl 1;
    ~*node-fetch 1;
    ~*HeadlessChrome 1;

    # Outdated browsers (Chrome < 115 — almost certainly bots)
    ~*Chrome/([1-9][0-9]?|10[0-9]|11[0-4])\. 1;

    # Empty / missing user agent
    "" 1;
    "-" 1;
BOTLIST

# =====================================================
# Step 1: Create or update nginx map
# =====================================================

# ERE matching one map key: a case-insensitive regex key (~*...) anywhere on
# the line, or a quoted literal key ("", "-", "Some UA") at the start of it.
MAP_KEY_REGEX='~\*[^[:space:]]+|^[[:space:]]*"[^"]*"'

# Read map text on stdin; print every key found, lowercased, with leading
# indentation removed, sorted and de-duplicated. Always succeeds: under
# 'pipefail' a no-match grep would otherwise fail the whole pipeline and, via
# the caller's assignment, abort the script under 'set -e'.
extract_patterns() {
    { grep -oE -- "$MAP_KEY_REGEX" || true; } \
        | sed 's/^[[:space:]]*//' \
        | tr '[:upper:]' '[:lower:]' \
        | sort -u
}

# Extract bot patterns from the built-in list (lowercase for comparison)
get_builtin_patterns() {
    extract_patterns <<< "$BOT_LIST"
}

# Extract bot patterns from an existing map file (lowercase for comparison).
# Prints nothing when the file is missing or unreadable.
get_existing_patterns() {
    local file="$1"
    [[ -r "$file" ]] || return 0
    extract_patterns < "$file"
}

# Bots previously in the builtin list that were intentionally removed.
# These must be stripped from custom entries to prevent them being preserved
# across updates. Users who want to block these can re-add them manually.
# One pattern per line, lowercase: the lookup below is a whole-line match.
REMOVED_BOTS="~*oai-searchbot
~*claude-web"

# Extract custom entries from existing map that are NOT in our built-in list
# and NOT in the removed list. Matching lines are printed verbatim.
#   $1 - path to the existing map file (nothing is printed if it is absent)
get_custom_entries() {
    local file="$1"
    [[ -f "$file" ]] || return 0

    local builtin_patterns line pattern
    builtin_patterns=$(get_builtin_patterns)

    # '|| [[ -n "$line" ]]' keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        # skip comments, blank lines, map header/footer, default line
        [[ "$line" =~ ^[[:space:]]*# ]] && continue
        [[ "$line" =~ ^[[:space:]]*$ ]] && continue
        [[ "$line" =~ ^[[:space:]]*map[[:space:]] ]] && continue
        [[ "$line" =~ ^[[:space:]]*\} ]] && continue
        # Anchored so a custom key that merely contains "default" is kept.
        [[ "$line" =~ ^[[:space:]]*default[[:space:]] ]] && continue

        # First key on the line, normalised the same way as extract_patterns.
        pattern=$(grep -oE -- "$MAP_KEY_REGEX" <<< "$line" \
            | head -n 1 \
            | sed 's/^[[:space:]]*//' \
            | tr '[:upper:]' '[:lower:]') || true
        [[ -z "$pattern" ]] && continue

        # skip if in our built-in list
        if grep -qxF -- "$pattern" <<< "$builtin_patterns"; then
            continue
        fi

        # skip if in the removed list (previously builtin, intentionally dropped)
        if grep -qxF -- "$pattern" <<< "$REMOVED_BOTS"; then
            continue
        fi

        printf '%s\n' "$line"
    done < "$file"
}

# Undo this run's change to MAP_FILE after a failed 'nginx -t': put back the
# backup taken by this run, or delete the map if this run created it.
rollback_map() {
    if [[ -n "$MAP_BACKUP" && -f "$MAP_BACKUP" ]]; then
        cp -- "$MAP_BACKUP" "$MAP_FILE"
        warn "Restored: ${MAP_BACKUP}"
    elif [[ "$MAP_EXISTED" == "false" && -f "$MAP_FILE" ]]; then
        rm -f -- "$MAP_FILE"
        warn "Removed newly created map: ${MAP_FILE}"
    fi
}

# Reload nginx, or only announce it in dry-run mode.
reload_nginx() {
    step "Reloading nginx"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: systemctl reload nginx"
    else
        systemctl reload nginx
        info "nginx reloaded"
    fi
}

# Print the closing hint on how to check that blocking works.
print_verify_hint() {
    echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
    echo " Expected: 444 (connection dropped) or 000 (no response)"
}

step "Configuring bot-block map at ${MAP_FILE}"

CUSTOM_ENTRIES=""
ADDED_NEW=0
MAP_BACKUP=""        # backup written by this run, if any (used by rollback_map)
MAP_EXISTED=false    # whether MAP_FILE was present before this run

if [[ -f "$MAP_FILE" ]]; then
    MAP_EXISTED=true

    # Detect custom entries added by the user
    CUSTOM_ENTRIES=$(get_custom_entries "$MAP_FILE")

    # Count new bots being added
    existing=$(get_existing_patterns "$MAP_FILE")
    while IFS= read -r pattern; do
        [[ -z "$pattern" ]] && continue
        if ! grep -qxF -- "$pattern" <<< "$existing"; then
            ADDED_NEW=$((ADDED_NEW + 1))
        fi
    done < <(get_builtin_patterns)

    if [[ -n "$CUSTOM_ENTRIES" ]]; then
        custom_count=$(wc -l <<< "$CUSTOM_ENTRIES")
        info "Found ${custom_count} custom bot entries — will preserve them"
    fi
fi

# Build the full map content. 'read' stripped the indentation of BOT_LIST's
# first line, hence the explicit indent in front of the expansion.
MAP_CONTENT="# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners
# Generated by hestia-bot-block.sh — https://mylinux.work
# Last updated: $(date '+%Y-%m-%d %H:%M:%S')

map \$http_user_agent \$is_bad_bot {
    default 0;

    ${BOT_LIST}"

# Append custom entries if any
if [[ -n "$CUSTOM_ENTRIES" ]]; then
    MAP_CONTENT="${MAP_CONTENT}

    # Custom entries (preserved from previous configuration)
${CUSTOM_ENTRIES}"
fi

MAP_CONTENT="${MAP_CONTENT}
}"

if [[ "$DRY_RUN" == "true" ]]; then
    if [[ "$MAP_EXISTED" == "true" ]]; then
        echo " Would update: ${MAP_FILE}"
        if [[ -n "$CUSTOM_ENTRIES" ]]; then
            echo " Would preserve: $(wc -l <<< "$CUSTOM_ENTRIES") custom entries"
        fi
        if [[ "$ADDED_NEW" -gt 0 ]]; then
            echo " Would add: ${ADDED_NEW} new bot patterns"
        fi
    else
        echo " Would create: ${MAP_FILE}"
    fi
else
    if [[ "$MAP_EXISTED" == "true" ]]; then
        # Remember the exact backup so a failed config test restores this
        # one rather than whichever backup happens to sort first.
        MAP_BACKUP="${MAP_FILE}.bak.$(date +%s)"
        cp -- "$MAP_FILE" "$MAP_BACKUP"
        if [[ "$ADDED_NEW" -gt 0 ]]; then
            info "Map updated: ${MAP_FILE} (${ADDED_NEW} new patterns added)"
        else
            info "Map updated: ${MAP_FILE} (already current)"
        fi
    else
        info "Map created: ${MAP_FILE}"
    fi
    printf '%s\n' "$MAP_CONTENT" > "$MAP_FILE"
fi

# =====================================================
# If --update-map-only, skip templates and just reload
# =====================================================
if [[ "$UPDATE_MAP_ONLY" == "true" ]]; then
    step "Testing nginx configuration"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: nginx -t"
    elif nginx -t 2>&1; then
        info "nginx config valid"
    else
        echo -e "${RED}[ERROR] nginx config test failed — restoring backup${NC}" >&2
        rollback_map
        exit 1
    fi

    reload_nginx

    echo ""
    echo -e "${BOLD}Done.${NC} Map updated — templates unchanged."
    echo " Map: ${MAP_FILE}"
    echo ""
    print_verify_hint
    exit 0
fi

# =====================================================
# Step 2: Create custom Hestia templates
# =====================================================

# nginx directives injected into each generated template. Kept in single
# quotes so the nginx variables and regex backslashes stay literal.
BOT_BLOCK_DIRECTIVE='    # Bot blocking — added by hestia-bot-block.sh
    if ($is_bad_bot) {
        return 444;
    }

    # Block broken srcset scrapers
    if ($request_uri ~* "%20[0-9]+w,https?://") {
        return 444;
    }

    # Block spoofed referers with fragment identifiers (real browsers strip these)
    if ($http_referer ~* "#") {
        return 444;
    }

    # Block non-GET/HEAD methods (static sites never need POST/PUT/DELETE)
    if ($request_method !~ ^(GET|HEAD)$ ) {
        return 444;
    }

    # Block empty-referer requests for images (headless bot image scraping)
    set $block_image_scrape 0;
    if ($uri ~* "\.(png|jpg|webp)$") {
        set $block_image_scrape 1;
    }
    if ($http_referer = "") {
        set $block_image_scrape "${block_image_scrape}1";
    }
    if ($block_image_scrape = "11") {
        return 444;
    }'

# The if block goes inside the server block, after the listen/server_name lines.
# Find the first location block and insert before it.
#
# Build one bot-blocking template from a base template.
#   $1 - source template path
#   $2 - destination template path
#   $3 - human-readable label used in messages
# A missing source is skipped with a warning. An existing destination is
# backed up first. Honors DRY_RUN.
create_template() {
    local src="$1" dst="$2" label="$3"

    if [[ ! -f "$src" ]]; then
        warn "Source template not found: ${src} — skipping ${label}"
        return 0
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would create: ${dst} (from ${src})"
        return 0
    fi

    # Base and target are the same file: the redirect below would truncate
    # the source before awk reads it, and cp would fail outright.
    if [[ "$src" -ef "$dst" ]]; then
        warn "${label} template is its own base (${dst}) — left unchanged"
        return 0
    fi

    if [[ -f "$dst" ]]; then
        cp -- "$dst" "${dst}.bak.$(date +%s)"
        warn "Existing ${label} template backed up"
    fi

    # Check if the source already has bot blocking (avoid double-injection)
    if grep -q 'is_bad_bot' "$src"; then
        cp -- "$src" "$dst"
        info "Created ${label} template: ${dst} (bot blocking already present in base)"
        return 0
    fi

    # Insert the directive before the first 'location' line. It is passed
    # through the environment because 'awk -v' interprets backslash escapes
    # and would alter the '\.' in the image regex.
    BOT_BLOCK_DIRECTIVE="$BOT_BLOCK_DIRECTIVE" awk '
        !inserted && /^[[:space:]]*location[[:space:]]/ {
            print ENVIRON["BOT_BLOCK_DIRECTIVE"]
            print ""
            inserted = 1
        }
        { print }
    ' "$src" > "$dst"

    if grep -q 'is_bad_bot' "$dst"; then
        info "Created ${label} template: ${dst}"
    else
        warn "No 'location' line found in ${src} — ${dst} was created WITHOUT bot blocking"
    fi
}

# Resolve the base template — verify it exists
BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl"
if [[ ! -f "$BASE_TPL" ]]; then
    if [[ "$BASE_TEMPLATE" != "default" ]]; then
        warn "Base template '${BASE_TEMPLATE}' not found — falling back to 'default'"
        BASE_TEMPLATE="default"
    fi
    BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl"
    if [[ ! -f "$BASE_TPL" ]]; then
        echo -e "${RED}Error: Default template not found: ${BASE_TPL}${NC}" >&2
        exit 1
    fi
fi

step "Creating custom ${PANEL_NAME} nginx templates (${TEMPLATE_NAME}) from ${BASE_TEMPLATE}"

# Proxy templates
create_template \
    "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" \
    "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.tpl" \
    "HTTP (.tpl)"

create_template \
    "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.stpl" \
    "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.stpl" \
    "SSL (.stpl)"

# php-fpm templates (if they exist for the base)
if [[ -d "${PANEL_TPL_DIR}/php-fpm" && -f "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" ]]; then
    create_template \
        "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" \
        "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.tpl" \
        "php-fpm HTTP (.tpl)"

    create_template \
        "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.stpl" \
        "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.stpl" \
        "php-fpm SSL (.stpl)"
fi

# Copy .sh hooks from the base template if they exist
for ext in tpl stpl; do
    base_sh="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.${ext}.sh"
    dst_sh="${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}.sh"
    if [[ -f "$base_sh" && ! -f "$dst_sh" ]]; then
        if [[ "$DRY_RUN" == "true" ]]; then
            echo " Would copy hook: ${dst_sh}"
        else
            cp -- "$base_sh" "$dst_sh"
            info "Copied hook: ${dst_sh}"
        fi
    fi
done

# =====================================================
# Step 3: Validate nginx config
# =====================================================
step "Testing nginx configuration"
if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would run: nginx -t"
elif nginx -t 2>&1; then
    info "nginx config valid"
else
    echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2
    # Of the files this script writes, the map in conf.d is the one it expects
    # nginx to load at this point, so put the previous map back.
    rollback_map
    echo " Restore backups from ${PANEL_TPL_DIR} and ${CONF_DIR}" >&2
    exit 1
fi

# =====================================================
# Step 4: Apply template (optional)
# =====================================================
if [[ -n "$APPLY_USER" ]]; then
    if ! command -v v-change-web-domain-proxy-tpl &>/dev/null; then
        echo -e "${RED}Error: v-change-web-domain-proxy-tpl not found${NC}" >&2
        exit 1
    fi

    if [[ "$APPLY_ALL" == "true" ]]; then
        step "Applying template to all domains for user: ${APPLY_USER}"
        # '|| domains=""': with pipefail a failing listing (e.g. unknown
        # user) would otherwise abort here without any message.
        domains=$(v-list-web-domains "$APPLY_USER" plain 2>/dev/null | awk '{print $1}') \
            || domains=""
        if [[ -z "$domains" ]]; then
            warn "No domains found for user: ${APPLY_USER}"
        else
            while IFS= read -r domain; do
                [[ -z "$domain" ]] && continue
                if [[ "$DRY_RUN" == "true" ]]; then
                    echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}"
                else
                    # stdin is the domain list; keep the panel command off it.
                    v-change-web-domain-proxy-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" < /dev/null
                    info "Applied to: ${domain}"
                fi
            done <<< "$domains"
        fi
    else
        step "Applying template to: ${APPLY_DOMAIN}"
        if [[ "$DRY_RUN" == "true" ]]; then
            echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}"
        else
            v-change-web-domain-proxy-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME"
            info "Applied to: ${APPLY_DOMAIN}"
        fi
    fi
fi

# =====================================================
# Step 5: Reload nginx
# =====================================================
reload_nginx

# =====================================================
# Summary
# =====================================================
echo ""
echo -e "${BOLD}Done.${NC}"
echo ""
echo " Map: ${MAP_FILE}"
echo " Base: ${BASE_TEMPLATE}"
echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)"
if [[ -n "$APPLY_USER" ]]; then
    if [[ "$APPLY_ALL" == "true" ]]; then
        echo " Applied: All domains for ${APPLY_USER}"
    else
        echo " Applied: ${APPLY_DOMAIN}"
    fi
else
    echo ""
    echo " To apply to a domain:"
    echo " v-change-web-domain-proxy-tpl USER DOMAIN ${TEMPLATE_NAME}"
    echo ""
    echo " To apply to all domains for a user:"
    echo " $(basename "$0") --apply-all USER"
fi
echo ""
echo " To add new bots later without touching templates:"
echo " sudo $(basename "$0") --update-map-only"
echo ""
print_verify_hint