Files
linux-scripts/install-bot-monitor.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

558 lines
16 KiB
Bash

#!/bin/bash
################################################################################
# Script Name: install-bot-monitor.sh
# Version: 1.1
# Description: Install the bot-monitor script, known agents whitelist, and
# daily cron job. Detects unknown user agents in web server logs
# and sends alerts via ntfy and/or Prometheus textfile metrics.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Usage:
# sudo ./install-bot-monitor.sh --ntfy-url https://ntfy.example.com/bot-alerts
# sudo ./install-bot-monitor.sh --textfile
# sudo ./install-bot-monitor.sh --ntfy-url https://ntfy.example.com/bot-alerts --textfile
# sudo ./install-bot-monitor.sh --textfile --domain example.com
# sudo ./install-bot-monitor.sh --dry-run
#
################################################################################
set -euo pipefail

# --- Defaults ---
# Each value below may be replaced by the matching command-line option.

# Alert / metrics sinks
NTFY_URL=''                               # empty => no ntfy alerts
TEXTFILE_ENABLED=false
TEXTFILE_DIR='/var/lib/node_exporter'

# Log source ("auto" makes detect_log_dir probe well-known locations)
LOG_DIR='auto'
LOG_PATTERN='access.log'
DOMAIN=''

# Reporting threshold, schedule, and run mode
MIN_REQUESTS=5
CRON_HOUR=6
DRY_RUN=false

# --- Colors ---
# Kept as literal backslash sequences; they are expanded by `echo -e`
# at the point of use.
NC='\033[0m'
BOLD='\033[1m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
# --- Logging helpers ---
# Each helper prints a colored tag followed by all of its arguments on stdout.
# The tag goes through %b so the color escape sequences are expanded; the
# message goes through %s so backslashes in user-supplied text (paths, URLs)
# are printed literally instead of being interpreted as escapes.

# Report a completed action.
info() {
  printf '%b %s\n' "${GREEN}[OK]${NC}" "$*"
}

# Report a non-fatal condition the operator should notice.
warn() {
  printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$*"
}

# Announce the start of an installation step.
step() {
  printf '%b %s\n' "${CYAN}[STEP]${NC}" "$*"
}
#######################################
# Print the help text and terminate the script.
# Arguments:
#   $1 - optional exit status (default: 0)
# Outputs:
#   Help text on stdout when the exit status is 0, on stderr otherwise.
# Returns:
#   Never returns; exits with the requested status.
#######################################
usage() {
  local exit_code="${1:-0}"
  local prog
  local out_fd=1

  prog=$(basename "$0")
  # Help shown because of an error belongs on stderr.
  if [[ "$exit_code" != "0" ]]; then
    out_fd=2
  fi

  cat >&"$out_fd" <<EOF
Usage: sudo ${prog} [OPTIONS]
Installs bot-monitor: daily detection of unknown user agents with ntfy alerts
and/or Prometheus metrics via node_exporter textfile collector.
At least one of --ntfy-url or --textfile is required.
Options:
--ntfy-url URL ntfy topic URL for alerts
--textfile Write Prometheus metrics to node_exporter textfile
collector (default dir: /var/lib/node_exporter)
--textfile-dir DIR Override textfile directory (implies --textfile)
--domain DOMAIN Monitor a single domain only (HestiaCP/VestaCP)
--log-dir DIR Web server log directory (default: auto-detect)
--log-pattern PATTERN Log filename pattern (default: access.log)
--min-requests NUM Minimum requests to report an agent (default: 5)
--cron-hour HOUR Hour to run daily scan, 0-23 (default: 6)
--dry-run Show what would be done without making changes
-h, --help Show this help
Examples:
# ntfy alerts only
sudo ${prog} --ntfy-url https://ntfy.example.com/bot-alerts
# Prometheus metrics only
sudo ${prog} --textfile
# Both
sudo ${prog} --ntfy-url https://ntfy.example.com/bot-alerts --textfile
# Single domain only (HestiaCP/VestaCP)
sudo ${prog} --textfile --domain example.com
# Custom textfile directory
sudo ${prog} --textfile-dir /opt/node_exporter/textfile
# Preview
sudo ${prog} --dry-run
EOF
  exit "$exit_code"
}
# --- Argument parsing ---

#######################################
# Print an error message on stderr and abort with status 1.
# Arguments: the message words
#######################################
arg_error() {
  printf '%b%s%b\n' "${RED}" "Error: $*" "${NC}" >&2
  exit 1
}

#######################################
# Ensure the option in $1 is followed by a value.
# Arguments: the remaining command-line words, option first
# Without this check a trailing option would trip `set -u` with an
# unhelpful "unbound variable" message.
#######################################
require_value() {
  [[ $# -ge 2 ]] || arg_error "Option $1 requires a value"
}

#######################################
# Parse the command line into the global configuration variables.
# Globals (written): NTFY_URL, TEXTFILE_ENABLED, TEXTFILE_DIR, DOMAIN,
#                    LOG_DIR, LOG_PATTERN, MIN_REQUESTS, CRON_HOUR, DRY_RUN
# Arguments: the script's positional parameters
# Exits:     0 after printing help (-h/--help); 1 on an unknown option,
#            a missing value, or an invalid number.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --ntfy-url)
        require_value "$@"
        NTFY_URL="$2"
        shift 2
        ;;
      --textfile)
        TEXTFILE_ENABLED=true
        shift
        ;;
      --textfile-dir)
        require_value "$@"
        TEXTFILE_ENABLED=true
        TEXTFILE_DIR="$2"
        shift 2
        ;;
      --domain)
        require_value "$@"
        DOMAIN="$2"
        shift 2
        ;;
      --log-dir)
        require_value "$@"
        LOG_DIR="$2"
        shift 2
        ;;
      --log-pattern)
        require_value "$@"
        LOG_PATTERN="$2"
        shift 2
        ;;
      --min-requests)
        require_value "$@"
        # The value is written into the generated script, so it must be
        # a plain number.
        [[ "$2" =~ ^[0-9]+$ ]] \
          || arg_error "--min-requests must be a non-negative integer, got: $2"
        MIN_REQUESTS="$2"
        shift 2
        ;;
      --cron-hour)
        require_value "$@"
        # The value becomes the hour field of the crontab entry.
        [[ "$2" =~ ^([01]?[0-9]|2[0-3])$ ]] \
          || arg_error "--cron-hour must be an hour between 0 and 23, got: $2"
        CRON_HOUR="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      -h | --help)
        usage
        ;;
      *)
        echo "Unknown option: $1" >&2
        # usage exits on its own, so run it in a subshell to keep control
        # and finish with a failure status.
        (usage) >&2 || true
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# --- Checks ---
# Root is checked first, and unconditionally (also for --dry-run).
if (( EUID != 0 )); then
  echo -e "${RED}Error: Run as root (sudo)${NC}" >&2
  exit 1
fi

# The remaining checks only apply to a real installation; a dry run may
# proceed without an output sink or an existing textfile directory.
if [[ "$DRY_RUN" == "false" ]]; then
  # At least one output sink has to be configured.
  if [[ -z "$NTFY_URL" && "$TEXTFILE_ENABLED" == "false" ]]; then
    echo -e "${RED}Error: At least one of --ntfy-url or --textfile is required${NC}" >&2
    echo " Example: sudo $(basename "$0") --ntfy-url https://ntfy.example.com/bot-alerts"
    echo " Example: sudo $(basename "$0") --textfile"
    exit 1
  fi

  # The textfile directory is never created here; it must already exist.
  if [[ "$TEXTFILE_ENABLED" == "true" && ! -d "$TEXTFILE_DIR" ]]; then
    echo -e "${RED}Error: Textfile directory not found: ${TEXTFILE_DIR}${NC}" >&2
    echo " Create it: sudo mkdir -p ${TEXTFILE_DIR}"
    exit 1
  fi
fi
# --- Auto-detect log directory ---
#######################################
# Resolve the web server log location to scan.
# Globals:
#   LOG_DIR      (read/written) "auto" triggers probing; any other value is
#                used as given after checking that the directory exists
#   LOG_PATTERN  (written) file name or glob inside LOG_DIR
#   DOMAIN       (read) when set, narrows the pattern to "<domain>.log"
# Outputs:
#   Progress via step/info on stdout; errors on stderr.
# Exits:
#   1 when the directory (or, during auto-detection, the per-domain log)
#   does not exist, or when nothing could be detected.
#######################################
detect_log_dir() {
  local flavor panel_dir entry label log_file

  # An explicitly supplied directory short-circuits all probing.
  if [[ "$LOG_DIR" != "auto" ]]; then
    if [[ ! -d "$LOG_DIR" ]]; then
      echo -e "${RED}Error: Log directory not found: ${LOG_DIR}${NC}" >&2
      exit 1
    fi
    if [[ -z "$DOMAIN" ]]; then
      info "Using specified log directory: $LOG_DIR"
    else
      LOG_PATTERN="${DOMAIN}.log"
      info "Using specified log directory: $LOG_DIR/$LOG_PATTERN"
    fi
    return
  fi

  step "Auto-detecting web server log directory..."

  # HestiaCP / VestaCP — per-domain logs.
  # Apache is probed first: it has full access logs with user agents
  # (nginx domain logs are proxy logs in nginx+apache mode).
  for flavor in apache nginx; do
    case "$flavor" in
      apache) panel_dir="/var/log/apache2/domains" ;;
      nginx) panel_dir="/var/log/nginx/domains" ;;
    esac
    [[ -d "$panel_dir" ]] || continue

    LOG_DIR="$panel_dir"
    if [[ -z "$DOMAIN" ]]; then
      LOG_PATTERN="*.log"
      info "Detected HestiaCP/VestaCP ${flavor}: $LOG_DIR (all domains)"
      return
    fi

    LOG_PATTERN="${DOMAIN}.log"
    if [[ ! -f "${LOG_DIR}/${LOG_PATTERN}" ]]; then
      echo -e "${RED}Error: Domain log not found: ${LOG_DIR}/${LOG_PATTERN}${NC}" >&2
      exit 1
    fi
    info "Detected HestiaCP/VestaCP ${flavor}: $LOG_DIR/$LOG_PATTERN"
    return
  done

  # Single-file layouts, in priority order:
  # standard nginx, Apache (Debian/Ubuntu), Apache (RHEL/Rocky).
  for entry in \
    "nginx:/var/log/nginx/access.log" \
    "apache2:/var/log/apache2/access.log" \
    "httpd:/var/log/httpd/access_log"; do
    label="${entry%%:*}"
    log_file="${entry#*:}"
    [[ -f "$log_file" ]] || continue

    LOG_DIR="${log_file%/*}"
    LOG_PATTERN="${log_file##*/}"
    info "Detected ${label}: $LOG_DIR/$LOG_PATTERN"
    return
  done

  echo -e "${RED}Error: Could not auto-detect log directory. Use --log-dir to specify.${NC}" >&2
  exit 1
}
# Resolve LOG_DIR / LOG_PATTERN before anything is installed.
detect_log_dir

# =====================================================
# Step 1: Create directories
# =====================================================
# /etc/bot-monitor receives the whitelist; /var/lib/bot-monitor is the
# state directory used by the installed script.
step "Creating directories"
if [[ "$DRY_RUN" != "true" ]]; then
  mkdir -p /etc/bot-monitor /var/lib/bot-monitor
  info "Created /etc/bot-monitor and /var/lib/bot-monitor"
else
  printf ' Would create: %s/\n' /etc/bot-monitor /var/lib/bot-monitor
fi
# =====================================================
# Step 2: Install known agents whitelist
# =====================================================
step "Installing known agents whitelist"
AGENTS_FILE="/etc/bot-monitor/known-agents.txt"

# The whitelist is kept in a quoted heredoc so nothing inside it is expanded.
# Command substitution drops the final newline; it is added back on write.
AGENTS_CONTENT=$(
  cat <<'AGENTS_EOF'
# Known user agents — one grep pattern per line
# Agents matching these patterns are excluded from unknown bot alerts.
# Add your own patterns as needed.
#
# https://mylinux.work
# --- Search engines ---
Googlebot
Bingbot
Applebot
DuckDuckBot
YandexBot
Baiduspider
Sogou
Qwantify
Qwantbot
# --- Social media / link previews ---
facebookexternalhit
Facebot
Twitterbot
LinkedInBot
Pinterestbot
Slackbot
Discordbot
TelegramBot
WhatsApp
SkypeUriPreview
BingPreview
# --- Browsers ---
Chrome
Firefox
Safari
Edge
Opera
Vivaldi
Brave
# --- Monitoring / uptime ---
Uptime-Kuma
UptimeRobot
Pingdom
StatusCake
Better Uptime
Datadog
Site24x7
Cloudflare-Healthchecks
Fastly-Healthcheck
Blackbox-Exporter
ufw-threat-feeds
# --- Feed readers ---
Feedly
Feedbin
NewsBlur
Tiny Tiny RSS
FreshRSS
Miniflux
# --- Tools ---
curl
Wget
HTTPie
Lynx
w3m
link-check
# --- AI user-facing search (cite sources) ---
ChatGPT-User
Claude-User
DuckAssistBot
# --- OS-level networking ---
WebKit.Networking
NetworkingExtension
# --- AI scrapers (already blocked) ---
ABEvalBot
GPTBot
ClaudeBot
anthropic-ai
CCBot
Bytespider
TikTokSpider
cohere-ai
PerplexityBot
Diffbot
MistralBot
YandexGPTBot
meta-externalagent
Meta-ExternalFetcher
meta-webindexer
PetalBot
Amazonbot
Amzn-SearchBot
AI2Bot
Ai2Bot-Dolma
Timpibot
img2dataset
YouBot
HanaleiBot
Applebot-Extended
Google-Extended
# --- SEO crawlers (already blocked) ---
MJ12bot
SemrushBot
AhrefsBot
DotBot
DataForSeoBot
SERanking
# --- Scraping frameworks (already blocked) ---
Scrapy
python-requests
Go-http-client
Java/
libwww-perl
trafilatura
# --- Vulnerability scanners (already blocked) ---
Nikto
sqlmap
Nmap
masscan
ZmEu
Morpheus
AGENTS_EOF
)

if [[ "$DRY_RUN" != "true" ]]; then
  # Keep a timestamped copy of any existing whitelist before replacing it.
  if [[ -f "$AGENTS_FILE" ]]; then
    cp "$AGENTS_FILE" "${AGENTS_FILE}.bak.$(date +%s)"
    warn "Existing whitelist backed up"
  fi
  printf '%s\n' "$AGENTS_CONTENT" > "$AGENTS_FILE"
  # The reported count skips comment lines and blank lines.
  info "Installed: ${AGENTS_FILE} ($(grep -cve '^\s*#' -e '^\s*$' "$AGENTS_FILE") patterns)"
else
  echo " Would create: ${AGENTS_FILE}"
fi
# =====================================================
# Step 3: Install bot-monitor script
# =====================================================
step "Installing bot-monitor script"
SCRIPT_FILE="/usr/local/bin/bot-monitor.sh"

#######################################
# Emit the complete bot-monitor script on stdout.
# Globals (read): LOG_DIR, LOG_PATTERN, NTFY_URL, TEXTFILE_DIR,
#                 TEXTFILE_ENABLED, MIN_REQUESTS
# The configuration values are written with printf %q, so any character in
# them (&, |, quotes, $, spaces) reaches the generated script unchanged.
# The rest of the script comes from quoted heredocs and is not expanded here.
#######################################
write_bot_monitor_script() {
  cat <<'HEADEREOF'
#!/bin/bash
# /usr/local/bin/bot-monitor.sh
# Detects unknown user agents in yesterday's web server logs.
# Sends alerts via ntfy and/or exports Prometheus metrics.
#
# Author: Phil Connor <contact@mylinux.work>
# License: MIT
set -euo pipefail
# --- Configuration (written by install-bot-monitor.sh) ---
HEADEREOF
  printf 'LOG_DIR=%q\n' "$LOG_DIR"
  printf 'LOG_PATTERN=%q\n' "$LOG_PATTERN"
  printf 'NTFY_URL=%q\n' "$NTFY_URL"
  printf 'TEXTFILE_DIR=%q\n' "$TEXTFILE_DIR"
  printf 'TEXTFILE_ENABLED=%q\n' "$TEXTFILE_ENABLED"
  printf 'MIN_REQUESTS=%q\n' "$MIN_REQUESTS"
  cat <<'SCRIPTEOF'
KNOWN_AGENTS="/etc/bot-monitor/known-agents.txt"
STATE_DIR="/var/lib/bot-monitor"
# Fall back to the short host name when no FQDN can be resolved.
HOSTNAME=$(hostname -f 2>/dev/null || hostname)

# --- Setup ---
mkdir -p "$STATE_DIR"
if [ ! -r "$KNOWN_AGENTS" ]; then
  echo "bot-monitor: whitelist not readable: $KNOWN_AGENTS" >&2
  exit 1
fi
# Build grep exclusion pattern from known agents file. Comment lines and
# blank or whitespace-only lines are skipped: a whitespace-only line would
# otherwise become a pattern that matches (and hides) every agent.
EXCLUDE_PATTERN=$(grep -vE '^[[:space:]]*(#|$)' "$KNOWN_AGENTS" | paste -sd'|' || true)
if [ -z "$EXCLUDE_PATTERN" ]; then
  # Empty whitelist: use a pattern that matches none of the count lines.
  EXCLUDE_PATTERN='^$'
fi

# --- Extract unknown agents from yesterday's logs ---
# LC_ALL=C keeps the month abbreviation in English, as written in the logs.
YESTERDAY=$(LC_ALL=C date -d yesterday +%d/%b/%Y)
UNKNOWN_FILE="$STATE_DIR/unknown-$(date -d yesterday +%Y-%m-%d).txt"
# $LOG_PATTERN is deliberately unquoted so a glob such as *.log expands.
# "|| true": an empty result (grep exit status 1) is not an error.
grep -F "$YESTERDAY" "$LOG_DIR"/$LOG_PATTERN 2>/dev/null \
  | awk -F'"' '{print $6}' \
  | grep -v '^-$' \
  | grep -v '^$' \
  | sort | uniq -c | sort -rn \
  | grep -viE "$EXCLUDE_PATTERN" \
  | awk -v min="$MIN_REQUESTS" '$1 >= min' \
  > "$UNKNOWN_FILE" || true
AGENT_COUNT=$(wc -l < "$UNKNOWN_FILE")
TOTAL_REQUESTS=0
if [ "$AGENT_COUNT" -gt 0 ]; then
  TOTAL_REQUESTS=$(awk '{sum += $1} END {print sum+0}' "$UNKNOWN_FILE")
fi

# --- ntfy alert ---
if [ "$AGENT_COUNT" -gt 0 ] && [ -n "${NTFY_URL}" ]; then
  # read -r: user agents are untrusted text; keep backslashes literal.
  SUMMARY=$(head -10 "$UNKNOWN_FILE" | while read -r count agent; do
    printf " %6d %s\n" "$count" "$agent"
  done)
  # A failed alert must not stop the metrics export and cleanup below.
  if ! curl -fsSL \
    -H "Title: Unknown bots detected on $HOSTNAME" \
    -H "Priority: 3" \
    -H "Tags: spider,warning" \
    -d "Found $AGENT_COUNT unknown user agents yesterday:
$SUMMARY
Full list: $UNKNOWN_FILE" \
    "$NTFY_URL" > /dev/null 2>&1; then
    echo "bot-monitor: failed to deliver ntfy alert to $NTFY_URL" >&2
  fi
fi

# --- Prometheus textfile metrics ---
if [ "${TEXTFILE_ENABLED}" = "true" ]; then
  OUTPUT_FILE="${TEXTFILE_DIR}/bot_monitor.prom"
  # Written to a temp file in the same directory, then renamed into place.
  PROM_TMP=$(mktemp "${TEXTFILE_DIR}/.bot_monitor.XXXXXX")
  {
    echo "# HELP bot_monitor_unknown_agents_total Number of unique unknown user agents detected yesterday."
    echo "# TYPE bot_monitor_unknown_agents_total gauge"
    echo "bot_monitor_unknown_agents_total ${AGENT_COUNT}"
    echo ''
    echo "# HELP bot_monitor_unknown_requests_total Total requests from unknown user agents yesterday."
    echo "# TYPE bot_monitor_unknown_requests_total gauge"
    echo "bot_monitor_unknown_requests_total ${TOTAL_REQUESTS}"
    echo ''
    echo "# HELP bot_monitor_last_scan_timestamp_seconds Unix timestamp of last bot monitor scan."
    echo "# TYPE bot_monitor_last_scan_timestamp_seconds gauge"
    echo "bot_monitor_last_scan_timestamp_seconds $(date +%s)"
    echo ''
    if [ "$AGENT_COUNT" -gt 0 ]; then
      echo "# HELP bot_monitor_agent_requests Requests per unknown user agent yesterday."
      echo "# TYPE bot_monitor_agent_requests gauge"
      head -20 "$UNKNOWN_FILE" | while read -r count agent; do
        # Truncate first, escape second: cutting after escaping could
        # split an escape sequence and produce an invalid label value.
        safe_agent=$(printf '%s\n' "$agent" | cut -c1-128 | sed 's/\\/\\\\/g; s/"/\\"/g')
        echo "bot_monitor_agent_requests{agent=\"${safe_agent}\"} ${count}"
      done
    fi
  } > "$PROM_TMP"
  chmod 644 "$PROM_TMP"
  mv -f "$PROM_TMP" "$OUTPUT_FILE"
fi

# --- Cleanup state files older than 30 days ---
find "$STATE_DIR" -name "unknown-*.txt" -mtime +30 -delete 2>/dev/null || true
SCRIPTEOF
}

if [[ "$DRY_RUN" == "true" ]]; then
  # A dry run touches nothing on disk, not even a temporary file.
  echo " Would install: ${SCRIPT_FILE}"
  echo " Log dir: ${LOG_DIR}"
  echo " Log pattern: ${LOG_PATTERN}"
  if [[ -n "$NTFY_URL" ]]; then
    echo " ntfy URL: ${NTFY_URL}"
  fi
  if [[ "$TEXTFILE_ENABLED" == "true" ]]; then
    echo " Textfile dir: ${TEXTFILE_DIR}"
  fi
  echo " Min requests: ${MIN_REQUESTS}"
else
  if [[ -f "$SCRIPT_FILE" ]]; then
    cp "$SCRIPT_FILE" "${SCRIPT_FILE}.bak.$(date +%s)"
    warn "Existing script backed up"
  fi
  # mktemp gives an unpredictable name next to the target instead of a
  # fixed path in world-writable /tmp; being on the same filesystem lets
  # the final mv be a rename.
  SCRIPT_TMP=$(mktemp "${SCRIPT_FILE}.XXXXXX")
  trap 'rm -f -- "${SCRIPT_TMP:-}"' EXIT
  write_bot_monitor_script > "$SCRIPT_TMP"
  chmod 755 "$SCRIPT_TMP"
  mv -f "$SCRIPT_TMP" "$SCRIPT_FILE"
  trap - EXIT
  info "Installed: ${SCRIPT_FILE}"
fi
# =====================================================
# Step 4: Install cron job in root's crontab
# =====================================================
step "Installing cron job"
CRON_LINE="0 ${CRON_HOUR} * * * ${SCRIPT_FILE}"
if [[ "$DRY_RUN" == "true" ]]; then
  echo " Would add to root crontab: ${CRON_LINE}"
  echo " Schedule: daily at ${CRON_HOUR}:00"
else
  # Read the crontab once into a variable. Piping `crontab -l` straight into
  # `grep -q` under pipefail can fail spuriously when grep exits early.
  # A missing crontab is treated as empty.
  CURRENT_CRONTAB=$(crontab -l 2>/dev/null || true)

  if grep -qxF -- "$CRON_LINE" <<<"$CURRENT_CRONTAB"; then
    # The exact entry is already installed.
    warn "Cron entry already exists in root crontab — skipping"
  elif grep -qF -- "$SCRIPT_FILE" <<<"$CURRENT_CRONTAB"; then
    # An entry for the script exists but differs (e.g. an earlier
    # --cron-hour). Replace it so the schedule matches this run. Every
    # line mentioning the script path is dropped.
    {
      grep -vF -- "$SCRIPT_FILE" <<<"$CURRENT_CRONTAB" || true
      printf '%s\n' "$CRON_LINE"
    } | crontab -
    warn "Existing cron entry for ${SCRIPT_FILE} replaced: now daily at ${CRON_HOUR}:00"
  else
    {
      if [[ -n "$CURRENT_CRONTAB" ]]; then
        printf '%s\n' "$CURRENT_CRONTAB"
      fi
      printf '%s\n' "$CRON_LINE"
    } | crontab -
    info "Cron added to root crontab: daily at ${CRON_HOUR}:00"
  fi
fi
# =====================================================
# Summary
# =====================================================
# After a real installation the output is the same as before. After a
# --dry-run the heading says that nothing was changed, and the hints that
# only make sense once files exist are left out.
echo ""
if [[ "$DRY_RUN" == "true" ]]; then
  echo -e "${BOLD}Dry run complete — no changes were made.${NC}"
  echo ""
  echo " Planned configuration:"
else
  echo -e "${BOLD}Done.${NC}"
  echo ""
fi
echo " Script: ${SCRIPT_FILE}"
echo " Whitelist: ${AGENTS_FILE}"
echo " State dir: /var/lib/bot-monitor/"
echo " Cron: daily at ${CRON_HOUR}:00"
echo " Log source: ${LOG_DIR}/${LOG_PATTERN}"
if [[ -n "$NTFY_URL" ]]; then
  echo " Alerts: ${NTFY_URL}"
fi
if [[ "$TEXTFILE_ENABLED" == "true" ]]; then
  echo " Metrics: ${TEXTFILE_DIR}/bot_monitor.prom"
fi

if [[ "$DRY_RUN" != "true" ]]; then
  echo ""
  echo " Test manually:"
  echo " sudo ${SCRIPT_FILE}"
  echo ""
  echo " Check results:"
  echo " ls -la /var/lib/bot-monitor/"
  echo " cat /var/lib/bot-monitor/unknown-\$(date -d yesterday +%Y-%m-%d).txt"
  if [[ "$TEXTFILE_ENABLED" == "true" ]]; then
    echo ""
    echo " Check Prometheus metrics:"
    echo " cat ${TEXTFILE_DIR}/bot_monitor.prom"
    echo ""
    echo " PromQL examples:"
    echo " bot_monitor_unknown_agents_total # unique unknown agents"
    echo " bot_monitor_unknown_requests_total # total requests from unknowns"
    echo " bot_monitor_agent_requests # per-agent request counts"
  fi
  echo ""
  echo " Edit whitelist to reduce noise:"
  echo " nano ${AGENTS_FILE}"
fi