Files
linux-scripts/hestia-bot-block.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

622 lines
20 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# Script Name: hestia-bot-block.sh
# Version: 2.5
# Description: Configure AI scraper and SEO bot blocking on HestiaCP and
# VestaCP/myVesta servers. Creates an nginx map in conf.d, builds
# custom nginx templates with bot-blocking rules, and optionally
# applies them to specified domains.
#
# Supports incremental map updates and stacking on existing
# templates (e.g., geoip). Safe to re-run — merges new bots
# into existing map without losing custom additions.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
# - HestiaCP or VestaCP/myVesta installed and running
# - Root access
# - nginx as proxy (default HestiaCP setup)
#
# Usage:
# sudo ./hestia-bot-block.sh
# sudo ./hestia-bot-block.sh --apply user domain.com
# sudo ./hestia-bot-block.sh --apply-all user
# sudo ./hestia-bot-block.sh --base-template default-geoip
# sudo ./hestia-bot-block.sh --update-map-only
# sudo ./hestia-bot-block.sh --dry-run
#
# Changelog:
# 2.5 — 2026-05-12: Added empty-referer image scrape blocking. Headless
# bot networks (Puppeteer/Playwright on residential proxies) hit
# cover images directly with no referer — now returns 444 for any
# image request (.png/.jpg/.webp) with an empty Referer header.
# 2.4 — 2026-05-11: Added fragment-in-referer blocking (real browsers strip
# URI fragments from the Referer header). Added request method blocking
# (only GET/HEAD allowed — static sites never need POST/PUT/DELETE).
# Added ospa-radar (lead-gen/business intelligence crawler) to blocklist.
# 2.3 — 2026-05-08: Added Exabot (defunct Dassault/Exalead crawler, now
# spoofed) and Sogou (Tencent Chinese search crawler) to blocklist.
# 2.2 — 2026-05-04: Fixed custom entry preservation carrying forward bots
# that were removed from the builtin list. Previously-builtin bots
# (OAI-SearchBot, Claude-Web) are now stripped during map updates.
# 2.1 — 2026-05-04: Removed Claude-Web and OAI-SearchBot from blocklist.
# These are user-facing fetcher bots, not training crawlers. Blocking
# them prevents your content from being cited in AI answers.
#
################################################################################
set -o errexit -o nounset -o pipefail

# --- Configuration ---
# Name of the template this script generates and of the template it is
# derived from. Both can be overridden on the command line.
TEMPLATE_NAME="default-botblock"
BASE_TEMPLATE="default"

# Control panel details; left empty here and filled in by detect_panel.
PANEL_TPL_DIR=""
PANEL_NAME=""

# Location of the generated nginx map.
CONF_DIR="/etc/nginx/conf.d"
MAP_FILE="${CONF_DIR}/bot-block.conf"

# Run-mode settings, changed only by the argument parser.
APPLY_USER=""
APPLY_DOMAIN=""
APPLY_ALL=false
UPDATE_MAP_ONLY=false
DRY_RUN=false
# --- Colors ---
# ANSI colour sequences, stored as literal backslash text; they only become
# real escape characters when a message is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# --- Logging helpers ---
# _log COLOR TAG MESSAGE...
# Prints "[TAG] MESSAGE" to stdout with the tag coloured. Backslash escapes
# in the message are expanded as well.
_log() {
    local color="$1" tag="$2"
    shift 2
    printf '%b\n' "${color}[${tag}]${NC} $*"
}

info() { _log "$GREEN" "OK" "$@"; }
warn() { _log "$YELLOW" "WARN" "$@"; }
step() { _log "$CYAN" "STEP" "$@"; }
# Work out which control panel is installed by probing for its nginx
# template directory, HestiaCP first and VestaCP/myVesta second.
# Sets:    PANEL_TPL_DIR, PANEL_NAME
# Exits:   with status 1 when neither directory exists
detect_panel() {
    local -a panel_names=("HestiaCP" "VestaCP/myVesta")
    local -a panel_dirs=(
        "/usr/local/hestia/data/templates/web/nginx"
        "/usr/local/vesta/data/templates/web/nginx"
    )
    local idx

    for idx in "${!panel_dirs[@]}"; do
        [[ -d "${panel_dirs[idx]}" ]] || continue
        PANEL_TPL_DIR="${panel_dirs[idx]}"
        PANEL_NAME="${panel_names[idx]}"
        info "Detected ${PANEL_NAME} (${PANEL_TPL_DIR})"
        return
    done

    echo -e "${RED}Error: Neither HestiaCP nor VestaCP/myVesta found${NC}" >&2
    exit 1
}
# Print the command-line help to stdout and terminate the script with
# status 0. The script name shown is the basename of $0.
usage() {
    local self
    # "|| true": a failing basename must not abort the help output under set -e.
    self=$(basename "$0") || true
    cat <<EOF
Usage: sudo ${self} [OPTIONS]
Creates nginx bot-blocking map and custom HestiaCP / VestaCP / myVesta nginx templates.
Options:
--template-name NAME Custom template name (default: default-botblock)
--base-template NAME Template to build on (default: default)
Use this to stack on existing templates (e.g., default-geoip)
--apply USER DOMAIN Apply template to a specific domain after creation
--apply-all USER Apply template to all domains for a user
--update-map-only Only update the bot map — do not touch templates
--dry-run Show what would be done without making changes
-h, --help Show this help
Examples:
sudo ${self}
sudo ${self} --apply admin example.com
sudo ${self} --apply-all admin
sudo ${self} --base-template default-geoip --template-name geoip-botblock
sudo ${self} --update-map-only
EOF
    exit 0
}
# --- Argument parsing ---

# _require_values OPTION COUNT REMAINING_ARGS...
# Verify that OPTION (which is also the first of REMAINING_ARGS) is followed
# by COUNT usable values. A value is rejected when it is missing, empty, or
# starts with "-" (which means the next option was swallowed by mistake).
# Exits with status 2 and a message on stderr when the check fails.
_require_values() {
    local opt="$1" needed="$2" value
    shift 2
    if (( $# <= needed )); then
        printf '%b%s%b\n' "${RED-}" \
            "Error: ${opt} requires ${needed} argument(s)" "${NC-}" >&2
        exit 2
    fi
    for value in "${@:2:needed}"; do
        if [[ -z "$value" || "$value" == -* ]]; then
            printf '%b%s%b\n' "${RED-}" \
                "Error: invalid value for ${opt}: '${value}'" "${NC-}" >&2
            exit 2
        fi
    done
}

# parse_args ARGS...
# Translate the command line into the global settings TEMPLATE_NAME,
# BASE_TEMPLATE, APPLY_USER, APPLY_DOMAIN, APPLY_ALL, UPDATE_MAP_ONLY and
# DRY_RUN. -h/--help prints the help and exits 0; an unknown option or a
# missing/invalid option value exits with status 2.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --template-name)
                _require_values "$1" 1 "$@"
                TEMPLATE_NAME="$2"
                shift 2
                ;;
            --base-template)
                _require_values "$1" 1 "$@"
                BASE_TEMPLATE="$2"
                shift 2
                ;;
            --apply)
                _require_values "$1" 2 "$@"
                APPLY_USER="$2"
                APPLY_DOMAIN="$3"
                shift 3
                ;;
            --apply-all)
                _require_values "$1" 1 "$@"
                APPLY_USER="$2"
                APPLY_ALL=true
                shift 2
                ;;
            --update-map-only)
                UPDATE_MAP_ONLY=true
                shift
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            -h|--help)
                usage
                ;;
            *)
                printf 'Unknown option: %s\n' "$1" >&2
                # usage ends with "exit 0"; run it in a subshell so the
                # script can still report the error through its exit status.
                (usage) >&2
                exit 2
                ;;
        esac
    done
}

parse_args "$@"
# --- Checks ---
# Stop early unless the script runs as root and an nginx binary is on PATH.
(( EUID == 0 )) || {
    echo -e "${RED}Error: Run as root (sudo)${NC}" >&2
    exit 1
}

command -v nginx &>/dev/null || {
    echo -e "${RED}Error: nginx not found${NC}" >&2
    exit 1
}

# Panel detection is only needed when templates will be built; a map-only
# update never touches the panel's template directory.
case "$UPDATE_MAP_ONLY" in
    false) detect_panel ;;
esac
# =====================================================
# Bot list — single source of truth
# =====================================================
# Each line: "~*pattern 1;"
# To add a new bot, add it to the appropriate section below.
#
# The quoted heredoc delimiter keeps the text literal (no shell expansion).
# `read -d ''` reads up to the end of the input and therefore returns
# non-zero; `|| true` stops that from tripping `set -e`.
# The text is embedded as-is in the generated nginx map, so the "#" section
# headings below end up in the map file as well.
read -r -d '' BOT_LIST <<'BOTLIST' || true
# AI scrapers
~*ABEvalBot 1;
~*GPTBot 1;
~*ClaudeBot 1;
~*anthropic-ai 1;
~*CCBot 1;
~*Bytespider 1;
~*TikTokSpider 1;
~*cohere-ai 1;
~*PerplexityBot 1;
~*Diffbot 1;
~*MistralBot 1;
~*YandexGPTBot 1;
~*meta-externalagent 1;
~*Meta-ExternalFetcher 1;
~*meta-webindexer 1;
~*PetalBot 1;
~*Amazonbot 1;
~*Amzn-SearchBot 1;
~*AI2Bot 1;
~*Timpibot 1;
~*img2dataset 1;
~*YouBot 1;
~*HanaleiBot 1;
~*Trafilatura 1;
# Defunct crawlers (spoofed user agents)
~*Exabot 1;
~*Sogou 1;
# SEO scrapers
~*MJ12bot 1;
~*SemrushBot 1;
~*AhrefsBot 1;
~*DotBot 1;
~*DataForSeoBot 1;
~*SERanking 1;
# Vulnerability scanners
~*Nikto 1;
~*sqlmap 1;
~*Nmap 1;
~*masscan 1;
~*ZmEu 1;
~*Morpheus 1;
# Lead-gen / business intelligence bots
~*ospa-radar 1;
~*HubSeedsBot 1;
# AI scrapers / research bots
~*Aranet-SearchBot 1;
~*AzureAI-SearchBot 1;
~*MINERVA-DeepResearch 1;
~*NagetBot 1;
~*LAIABot 1;
~*pi-coding-agent 1;
# Probe / monitoring bots
~*CMS-Checker 1;
~*NexoFaviconBot 1;
~*AwarioBot 1;
~*AwarioSmartBot 1;
~*CopyousBot 1;
~*SurdotlyBot 1;
~*trendictionbot 1;
~*wpbot 1;
~*WebFetchTool 1;
~*YisouSpider 1;
# Scraping frameworks
~*Scrapy 1;
~*python-requests 1;
~*Go-http-client 1;
~*Java/ 1;
~*libwww-perl 1;
~*node-fetch 1;
~*HeadlessChrome 1;
# Outdated browsers (Chrome < 115 — almost certainly bots)
~*Chrome/([1-9][0-9]?|10[0-9]|11[0-4])\. 1;
# Empty / missing user agent
"" 1;
"-" 1;
BOTLIST
# =====================================================
# Step 1: Create or update nginx map
# =====================================================

# Print the keys of the built-in bot list ($BOT_LIST), one per line,
# normalised for comparison: lower-cased, without leading whitespace,
# sorted and de-duplicated.
# Recognised keys are "~*regex" entries and the empty-string entry "".
# The "" entry is matched with any amount of indentation (including none),
# so the result does not depend on how BOT_LIST happens to be laid out.
# Outputs: normalised keys on stdout; always returns 0.
get_builtin_patterns() {
    # "|| true": a list without any matching key is not an error, and must
    # not fail the pipeline under `set -o pipefail`.
    printf '%s\n' "$BOT_LIST" \
        | { grep -oP '~\*\S+|^\s*""' || true; } \
        | sed 's/^[[:space:]]*//' \
        | tr '[:upper:]' '[:lower:]' \
        | sort -u
}
# Print the keys found in an existing map file, one per line, normalised for
# comparison: lower-cased, without leading whitespace, sorted and
# de-duplicated. Recognised keys are "~*regex" entries and the empty-string
# entry "".
# Arguments: $1 - path of the map file
# Outputs:   normalised keys on stdout (nothing when the file is missing,
#            unreadable, or contains no recognised key)
# Returns:   0 in all of those cases. Callers capture the output with
#            var=$(...) under `set -e`, so a "no match" status from grep must
#            not leak out through `pipefail` and terminate the script.
get_existing_patterns() {
    local file="$1"
    [[ -r "$file" ]] || return 0
    { grep -oP '~\*\S+|^\s*""' -- "$file" 2>/dev/null || true; } \
        | sed 's/^[[:space:]]*//' \
        | tr '[:upper:]' '[:lower:]' \
        | sort -u
}
# Bots previously in the builtin list that were intentionally removed.
# These must be stripped from custom entries to prevent them being preserved
# across updates. Users who want to block these can re-add them manually.
# One lower-case key per line.
REMOVED_BOTS="~*oai-searchbot
~*claude-web"

# Print, verbatim, every line of an existing map file that holds a
# user-added entry, i.e. an entry whose key is neither in the built-in list
# (get_builtin_patterns) nor in REMOVED_BOTS.
#
# Only "~*regex" keys and the empty-string key "" are recognised; lines with
# any other kind of key are not carried over. Comments, blank lines, the
# "map ... {" header, the closing "}" and the "default" entry are skipped.
# Keys are compared case-insensitively and without indentation.
#
# Arguments: $1 - path of the existing map file
# Outputs:   the preserved lines on stdout (nothing if the file is missing)
get_custom_entries() {
    local file="$1"
    [[ -f "$file" ]] || return 0

    local builtin_patterns
    # Drop indentation so the comparison does not depend on list layout.
    builtin_patterns=$(get_builtin_patterns | sed 's/^[[:space:]]*//')

    # Regexes live in variables so bash treats them as patterns, not words.
    local re_comment='^[[:space:]]*#'
    local re_blank='^[[:space:]]*$'
    local re_header='^[[:space:]]*map[[:space:]]'
    local re_footer='^[[:space:]]*[}]'
    # Anchored: only the map's own "default <value>;" entry, not a custom
    # pattern that merely contains the word "default".
    local re_default='^[[:space:]]*default[[:space:]]'
    local re_empty_key='^[[:space:]]*""'
    local re_regex_key='~\*[^[:space:]]+'

    local line pattern
    # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        [[ "$line" =~ $re_comment ]] && continue
        [[ "$line" =~ $re_blank ]] && continue
        [[ "$line" =~ $re_header ]] && continue
        [[ "$line" =~ $re_footer ]] && continue
        [[ "$line" =~ $re_default ]] && continue

        # Extract the key of this entry.
        if [[ "$line" =~ $re_empty_key ]]; then
            pattern='""'
        elif [[ "$line" =~ $re_regex_key ]]; then
            pattern="${BASH_REMATCH[0]}"
        else
            continue
        fi

        # Here-strings instead of "echo | grep -q": no writer that could be
        # killed by SIGPIPE and turn a match into a pipefail failure.
        # Skip if in our built-in list.
        grep -qixF -- "$pattern" <<< "$builtin_patterns" && continue
        # Skip if previously builtin and intentionally dropped.
        grep -qixF -- "$pattern" <<< "$REMOVED_BOTS" && continue

        printf '%s\n' "$line"
    done < "$file"
}
step "Configuring bot-block map at ${MAP_FILE}"

# CUSTOM_ENTRIES: user-added lines carried over from an existing map.
# ADDED_NEW:      number of built-in patterns the existing map lacks.
CUSTOM_ENTRIES=""
ADDED_NEW=0

if [[ -f "$MAP_FILE" ]]; then
    CUSTOM_ENTRIES=$(get_custom_entries "$MAP_FILE")

    # Count the built-in patterns that are absent from the current map.
    existing=$(get_existing_patterns "$MAP_FILE")
    while IFS= read -r pattern; do
        if [[ -n "$pattern" ]] && ! grep -qxF "$pattern" <<< "$existing"; then
            ADDED_NEW=$((ADDED_NEW + 1))
        fi
    done <<< "$(get_builtin_patterns)"

    if [[ -n "$CUSTOM_ENTRIES" ]]; then
        custom_count=$(wc -l <<< "$CUSTOM_ENTRIES")
        info "Found ${custom_count} custom bot entries — will preserve them"
    fi
fi

# Assemble the map text: header comments, map opening, default entry and the
# built-in list. Each piece is followed by a newline.
printf -v MAP_CONTENT '%s\n' \
    "# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners" \
    "# Generated by hestia-bot-block.sh — https://mylinux.work" \
    "# Last updated: $(date '+%Y-%m-%d %H:%M:%S')" \
    'map $http_user_agent $is_bad_bot {' \
    "default 0;" \
    "${BOT_LIST}"

# Preserved custom entries, if any, go after the built-in list.
if [[ -n "$CUSTOM_ENTRIES" ]]; then
    MAP_CONTENT+="# Custom entries (preserved from previous configuration)"$'\n'
    MAP_CONTENT+="${CUSTOM_ENTRIES}"$'\n'
fi

# Close the map block; the final newline is added when the file is written.
MAP_CONTENT+="}"

if [[ "$DRY_RUN" == "true" ]]; then
    if [[ ! -f "$MAP_FILE" ]]; then
        echo " Would create: ${MAP_FILE}"
    else
        echo " Would update: ${MAP_FILE}"
        if [[ -n "$CUSTOM_ENTRIES" ]]; then
            echo " Would preserve: $(wc -l <<< "$CUSTOM_ENTRIES") custom entries"
        fi
        if (( ADDED_NEW > 0 )); then
            echo " Would add: ${ADDED_NEW} new bot patterns"
        fi
    fi
else
    if [[ ! -f "$MAP_FILE" ]]; then
        info "Map created: ${MAP_FILE}"
    else
        # Keep a timestamped copy of the map that is about to be replaced.
        cp "$MAP_FILE" "${MAP_FILE}.bak.$(date +%s)"
        if (( ADDED_NEW > 0 )); then
            info "Map updated: ${MAP_FILE} (${ADDED_NEW} new patterns added)"
        else
            info "Map updated: ${MAP_FILE} (already current)"
        fi
    fi
    printf '%s\n' "$MAP_CONTENT" > "$MAP_FILE"
fi
# =====================================================
# If --update-map-only, skip templates and just reload
# =====================================================

# _latest_map_backup MAP_PATH
# Print the most recently modified "<MAP_PATH>.bak.*" file, or nothing when
# no backup exists. Always returns 0, so it is safe to call under
# `set -e` / `pipefail` even when there is no backup.
_latest_map_backup() {
    local map_path="$1" candidate newest=""
    for candidate in "${map_path}.bak."*; do
        # An unmatched glob is left as the literal pattern; skip it.
        [[ -e "$candidate" ]] || continue
        if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
            newest="$candidate"
        fi
    done
    [[ -z "$newest" ]] || printf '%s\n' "$newest"
}

if [[ "$UPDATE_MAP_ONLY" == "true" ]]; then
    step "Testing nginx configuration"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: nginx -t"
    elif nginx -t 2>&1; then
        info "nginx config valid"
    else
        echo -e "${RED}[ERROR] nginx config test failed — restoring backup${NC}" >&2
        latest_bak=$(_latest_map_backup "$MAP_FILE")
        if [[ -n "$latest_bak" ]]; then
            cp "$latest_bak" "$MAP_FILE"
            warn "Restored: ${latest_bak}"
        else
            # Happens when the map was first created by this run.
            warn "No backup of ${MAP_FILE} found — nothing restored"
        fi
        exit 1
    fi

    step "Reloading nginx"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would run: systemctl reload nginx"
    else
        systemctl reload nginx
        info "nginx reloaded"
    fi

    echo ""
    echo -e "${BOLD}Done.${NC} Map updated — templates unchanged."
    echo " Map: ${MAP_FILE}"
    echo ""
    echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com"
    echo " Expected: 444 (connection dropped) or 000 (no response)"
    exit 0
fi
# =====================================================
# Step 2: Create custom Hestia templates
# =====================================================
# nginx directives that create_template inserts ahead of the first
# "location" line of a template.
# The value is single-quoted on purpose: every $name inside it is an nginx
# variable and must reach the template unexpanded by the shell.
# $is_bad_bot is the variable defined by the map written in Step 1.
# The image rule builds $block_image_scrape by concatenating two flags and
# then tests for "11", i.e. "image URI" and "empty Referer" both hold.
BOT_BLOCK_DIRECTIVE='
# Bot blocking — added by hestia-bot-block.sh
if ($is_bad_bot) {
return 444;
}
# Block broken srcset scrapers
if ($request_uri ~* "%20[0-9]+w,https?://") {
return 444;
}
# Block spoofed referers with fragment identifiers (real browsers strip these)
if ($http_referer ~* "#") {
return 444;
}
# Block non-GET/HEAD methods (static sites never need POST/PUT/DELETE)
if ($request_method !~ ^(GET|HEAD)$ ) {
return 444;
}
# Block empty-referer requests for images (headless bot image scraping)
set $block_image_scrape 0;
if ($uri ~* "\.(png|jpg|webp)$") {
set $block_image_scrape 1;
}
if ($http_referer = "") {
set $block_image_scrape "${block_image_scrape}1";
}
if ($block_image_scrape = "11") {
return 444;
}'
# The if block goes inside the server block, after the listen/server_name lines.
# Find the first location block and insert before it.
create_template() {
local src="$1" dst="$2" label="$3"
if [[ ! -f "$src" ]]; then
warn "Source template not found: ${src} — skipping ${label}"
return
fi
if [[ "$DRY_RUN" == "true" ]]; then
echo " Would create: ${dst} (from ${src})"
return
fi
if [[ -f "$dst" ]]; then
cp "$dst" "${dst}.bak.$(date +%s)"
warn "Existing ${label} template backed up"
fi
# Check if the source already has bot blocking (avoid double-injection)
if grep -q 'is_bad_bot' "$src"; then
# Source already has bot blocking — copy as-is
cp "$src" "$dst"
info "Created ${label} template: ${dst} (bot blocking already present in base)"
else
# Insert bot-blocking directive before the first 'location' line
awk -v block="$BOT_BLOCK_DIRECTIVE" '
!inserted && /^[[:space:]]*location[[:space:]]/ {
print block
print ""
inserted = 1
}
{ print }
' "$src" > "$dst"
info "Created ${label} template: ${dst}"
fi
}
# Resolve the base template. A missing non-default base falls back to
# "default"; a missing "default" is fatal.
BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl"
if [[ ! -f "$BASE_TPL" && "$BASE_TEMPLATE" != "default" ]]; then
    warn "Base template '${BASE_TEMPLATE}' not found — falling back to 'default'"
    BASE_TEMPLATE="default"
    BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl"
fi
if [[ ! -f "$BASE_TPL" ]]; then
    echo -e "${RED}Error: Default template not found: ${BASE_TPL}${NC}" >&2
    exit 1
fi

step "Creating custom ${PANEL_NAME} nginx templates (${TEMPLATE_NAME}) from ${BASE_TEMPLATE}"

# Proxy templates: plain HTTP first, then SSL.
for ext in tpl stpl; do
    case "$ext" in
        tpl)  tpl_label="HTTP (.tpl)" ;;
        stpl) tpl_label="SSL (.stpl)" ;;
    esac
    create_template \
        "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.${ext}" \
        "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}" \
        "$tpl_label"
done

# php-fpm templates, only when the base has a php-fpm .tpl variant.
if [[ -d "${PANEL_TPL_DIR}/php-fpm" && -f "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" ]]; then
    for ext in tpl stpl; do
        case "$ext" in
            tpl)  tpl_label="php-fpm HTTP (.tpl)" ;;
            stpl) tpl_label="php-fpm SSL (.stpl)" ;;
        esac
        create_template \
            "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.${ext}" \
            "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.${ext}" \
            "$tpl_label"
    done
fi

# Copy the base template's .sh hooks, never overwriting an existing hook.
for ext in tpl stpl; do
    base_sh="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.${ext}.sh"
    dst_sh="${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}.sh"
    if [[ ! -f "$base_sh" || -f "$dst_sh" ]]; then
        continue
    fi
    cp "$base_sh" "$dst_sh"
    info "Copied hook: ${dst_sh}"
done
# =====================================================
# Step 3: Validate nginx config
# =====================================================
# Runs "nginx -t" (skipped in dry-run mode). A failed test stops the script
# with status 1 and points at the directories that hold the backups.
step "Testing nginx configuration"
if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would run: nginx -t"
elif nginx -t 2>&1; then
    info "nginx config valid"
else
    {
        echo -e "${RED}[ERROR] nginx config test failed${NC}"
        printf '%s\n' " Restore backups from ${PANEL_TPL_DIR} and ${CONF_DIR}"
    } >&2
    exit 1
fi
# =====================================================
# Step 4: Apply template (optional)
# =====================================================

# _list_user_domains USER
# Print the web domains of USER, one per line: the first column of
# "v-list-web-domains USER plain".
# Returns: the status of v-list-web-domains when it fails (nothing is
#          printed), otherwise 0 — also when the user has no domains.
_list_user_domains() {
    local user="$1" listing
    listing=$(v-list-web-domains "$user" plain 2>/dev/null) || return
    [[ -n "$listing" ]] || return 0
    awk '{print $1}' <<< "$listing"
}

if [[ -n "$APPLY_USER" ]]; then
    if ! command -v v-change-web-domain-proxy-tpl &>/dev/null; then
        echo -e "${RED}Error: v-change-web-domain-proxy-tpl not found${NC}" >&2
        exit 1
    fi

    if [[ "$APPLY_ALL" == "true" ]]; then
        step "Applying template to all domains for user: ${APPLY_USER}"
        # Handle a failed listing explicitly; otherwise pipefail + set -e
        # would end the script here without any message.
        if ! domains=$(_list_user_domains "$APPLY_USER"); then
            echo -e "${RED}Error: could not list web domains for user: ${APPLY_USER}${NC}" >&2
            exit 1
        fi
        if [[ -z "$domains" ]]; then
            warn "No domains found for user: ${APPLY_USER}"
        else
            while IFS= read -r domain; do
                # Blank lines in the listing are not domains.
                [[ -n "$domain" ]] || continue
                if [[ "$DRY_RUN" == "true" ]]; then
                    echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}"
                else
                    # stdin from /dev/null so the command cannot consume the
                    # rest of the domain list feeding this loop.
                    v-change-web-domain-proxy-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" < /dev/null
                    info "Applied to: ${domain}"
                fi
            done <<< "$domains"
        fi
    else
        step "Applying template to: ${APPLY_DOMAIN}"
        if [[ "$DRY_RUN" == "true" ]]; then
            echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}"
        else
            v-change-web-domain-proxy-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME"
            info "Applied to: ${APPLY_DOMAIN}"
        fi
    fi
fi
# =====================================================
# Step 5: Reload nginx
# =====================================================
# In dry-run mode the reload is only announced, not performed.
step "Reloading nginx"
case "$DRY_RUN" in
    true)
        echo " Would run: systemctl reload nginx"
        ;;
    *)
        systemctl reload nginx
        info "nginx reloaded"
        ;;
esac
# =====================================================
# Summary
# =====================================================
# Final report: what was written, what was applied (or how to apply it),
# and how to verify the result.
printf '\n'
echo -e "${BOLD}Done.${NC}"
printf '%s\n' \
    "" \
    " Map: ${MAP_FILE}" \
    " Base: ${BASE_TEMPLATE}" \
    " Template: ${TEMPLATE_NAME} (.tpl + .stpl)"

if [[ -z "$APPLY_USER" ]]; then
    # Nothing was applied: show the commands that would do it.
    printf '%s\n' \
        "" \
        " To apply to a domain:" \
        " v-change-web-domain-proxy-tpl <user> <domain> ${TEMPLATE_NAME}" \
        "" \
        " To apply to all domains for a user:" \
        " $(basename "$0") --apply-all <user>"
elif [[ "$APPLY_ALL" == "true" ]]; then
    printf '%s\n' " Applied: All domains for ${APPLY_USER}"
else
    printf '%s\n' " Applied: ${APPLY_DOMAIN}"
fi

printf '%s\n' \
    "" \
    " To add new bots later without touching templates:" \
    " sudo $(basename "$0") --update-map-only" \
    "" \
    " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" \
    " Expected: 444 (connection dropped) or 000 (no response)"