a1a17e81a1
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
493 lines
17 KiB
Bash
Executable File
493 lines
17 KiB
Bash
Executable File
#!/usr/bin/env bash
##########################################################################################
#### sitemap-validator.sh — Validate Hugo sitemap URLs, meta tags, and content parity ####
#### Checks each URL for HTTP 200, duplicates, missing meta tags, and compares        ####
#### sitemap entries against local Hugo content files                                 ####
####                                                                                  ####
#### Author: Phil Connor                                                              ####
#### Contact: contact@mylinux.work                                                    ####
#### License: MIT                                                                     ####
#### Version 1.1                                                                      ####
####                                                                                  ####
#### Usage:                                                                           ####
####   ./sitemap-validator.sh https://example.com/sitemap.xml                         ####
####   ./sitemap-validator.sh --file public/sitemap.xml --local-dir content           ####
####   ./sitemap-validator.sh --help                                                  ####
####                                                                                  ####
#### See --help for all options.                                                      ####
##########################################################################################

# Abort on unhandled errors, on use of unset variables, and on failures
# anywhere inside a pipeline.
set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Values written as ${NAME:-…} may be pre-set through the environment;
# command-line flags handled by parse_args override all of them.
SITEMAP_URL="${SITEMAP_URL:-}"     # remote sitemap location (positional argument)
SITEMAP_FILE=""                    # --file
LOCAL_DIR=""                       # --local-dir
PUBLIC_DIR=""                      # --public-dir (or auto-detected from --file)
CHECK_META="${CHECK_META:-true}"   # "true" enables the meta tag checks
PARALLEL="${PARALLEL:-5}"          # maximum number of concurrent URL checks
VERBOSE="${VERBOSE:-false}"        # "true" enables [DEBUG] output
COLOR="${COLOR:-auto}"             # auto | always | never
BASE_URL=""                        # --base-url

# Colour escapes start out empty and are filled in by setup_colors. Defining
# them here keeps the logging helpers usable under `set -u` even when they run
# before setup_colors (e.g. parse_args rejecting an unknown option).
RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="${0##*/}"
readonly SCRIPT_NAME
TMPDIR_WORK=""          # scratch directory; created in main, removed by cleanup
COUNT_TOTAL=0
COUNT_OK=0
COUNT_BROKEN=0
COUNT_MISSING_META=0
COUNT_DUPLICATES=0
COUNT_ORPHAN=0

# ── Colors ────────────────────────────────────────────────────────────
#######################################
# Populate the global colour variables according to the COLOR mode.
# Globals:
#   COLOR (read; treated as "auto" when unset)
#   RED GREEN YELLOW CYAN BOLD DIM RESET (written)
# Behaviour:
#   never   → every variable is the empty string
#   always  → every variable holds an ANSI escape sequence
#   other   → escape sequences only when stdout (fd 1) is a terminal
# The sequences are stored with a literal "\033"; they are meant to be
# expanded by `echo -e` or printf's %b.
# Returns: 0
#######################################
setup_colors() {
    # Start from "no colour" so there is a single place that clears the values.
    RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""

    local mode="${COLOR:-auto}"
    if [[ "$mode" == "never" ]]; then
        return 0
    fi

    if [[ "$mode" == "always" || -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    fi
    return 0
}

# ── Logging ───────────────────────────────────────────────────────────
# All helpers take the message as their arguments (joined with spaces).
# Colour variables are expanded with %b so their "\033" escapes take effect,
# while the message itself is printed with %s: backslashes inside URLs or
# file paths are shown verbatim instead of being interpreted as escapes.
# The ${VAR:-} form keeps the helpers usable under `set -u` before
# setup_colors has run.

# Informational message on stdout.
log() { printf '%b[INFO]%b %s\n' "${CYAN:-}" "${RESET:-}" "$*"; }

# Warning on stderr.
warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${RESET:-}" "$*" >&2; }

# Error on stderr (does not exit; callers decide).
err() { printf '%b[ERROR]%b %s\n' "${RED:-}" "${RESET:-}" "$*" >&2; }

# Debug message on stdout, printed only when VERBOSE is "true".
verbose() {
    if [[ "${VERBOSE:-false}" == "true" ]]; then
        printf '%b[DEBUG]%b %s\n' "${DIM:-}" "${RESET:-}" "$*"
    fi
}

# ── Helpers ───────────────────────────────────────────────────────────
#######################################
# Print a report section heading: a blank line, the indented title framed
# by "──" in bold cyan, and another blank line.
# Globals:   BOLD, CYAN, RESET (read)
# Arguments: $1 - heading text (printed verbatim, no escape processing)
# Outputs:   three lines on stdout
#######################################
section_header() {
    printf '\n  %b── %s ──%b\n\n' "${BOLD:-}${CYAN:-}" "${1:-}" "${RESET:-}"
}

#######################################
# Remove the scratch directory, if one was created.
# Globals: TMPDIR_WORK (read)
# Installed as the EXIT trap below, so it runs on every exit path.
# Does nothing when TMPDIR_WORK is unset, empty, or not a directory.
#######################################
cleanup() {
    if [[ -n "${TMPDIR_WORK:-}" && -d "$TMPDIR_WORK" ]]; then
        # "--" stops option parsing should the path ever start with a dash.
        rm -rf -- "$TMPDIR_WORK"
    fi
}
trap cleanup EXIT

#######################################
# Place the sitemap XML at the given destination path.
# Globals:
#   SITEMAP_FILE (read) - local sitemap; takes precedence when non-empty
#   SITEMAP_URL  (read) - remote sitemap, fetched with curl (30 s limit)
# Arguments: $1 - destination file
# Outputs:   error messages on stderr (via err)
# Exits:     with status 1 when the file is missing, cannot be copied, the
#            download fails, or neither source is configured
#######################################
fetch_sitemap() {
    local dest="$1"
    local curl_msg

    if [[ -n "$SITEMAP_FILE" ]]; then
        if [[ ! -f "$SITEMAP_FILE" ]]; then
            err "Sitemap file not found: $SITEMAP_FILE"
            exit 1
        fi
        if ! cp -- "$SITEMAP_FILE" "$dest"; then
            err "Failed to copy sitemap file: $SITEMAP_FILE"
            exit 1
        fi
        return 0
    fi

    if [[ -n "$SITEMAP_URL" ]]; then
        verbose "Fetching sitemap from $SITEMAP_URL"
        # The body goes to $dest via -o, so the substitution captures only
        # curl's own diagnostics (-S); they are appended to the error below
        # instead of being thrown away.
        if ! curl_msg=$(curl -fsSL --max-time 30 "$SITEMAP_URL" -o "$dest" 2>&1); then
            err "Failed to fetch sitemap: ${SITEMAP_URL}${curl_msg:+ (${curl_msg})}"
            exit 1
        fi
        return 0
    fi

    err "No sitemap URL or file specified"
    exit 1
}

#######################################
# Print the content of every <loc> element of a sitemap, one URL per line.
# Arguments: $1 - sitemap XML file
# Outputs:   URLs on stdout
# Returns:   0, also when the file contains no <loc> element
# Notes:
#   - Line breaks are flattened first so a <loc> whose value sits on its own
#     line is still found; surrounding whitespace is trimmed.
#   - The five predefined XML entities are decoded (&amp; last, so that
#     "&amp;lt;" is not decoded twice); sitemaps must entity-escape URLs.
#   - Uses only grep -E / sed -E, so no PCRE-enabled grep is required.
#   - Namespaced elements such as <image:loc> and CDATA sections are not
#     matched.
#######################################
extract_urls() {
    local sitemap_file="$1"

    tr '\r\n' '  ' < "$sitemap_file" \
        | grep -oE '<loc>[^<]*</loc>' \
        | sed -E \
            -e 's/^<loc>[[:space:]]*//' \
            -e 's/[[:space:]]*<\/loc>$//' \
            -e 's/&lt;/</g' \
            -e 's/&gt;/>/g' \
            -e 's/&quot;/"/g' \
            -e "s/&apos;/'/g" \
            -e 's/&amp;/\&/g' \
            -e '/^$/d' \
        || true   # grep exits 1 when nothing matches; that is not an error here
}

#######################################
# Report URLs that appear more than once in the URL list.
# Globals:
#   COUNT_DUPLICATES (written) - incremented once per distinct duplicated URL
#   RED, RESET (read)
# Arguments: $1 - file with one URL per line
# Outputs:   a "Duplicate URLs" section on stdout when duplicates exist;
#            otherwise only a debug message
# Returns:   0
#######################################
check_duplicates() {
    local url_file="$1"
    local dupes count url

    # uniq -cd lists each repeated line once, prefixed by its exact number of
    # occurrences. Counting with a substring grep instead would also count
    # longer URLs that merely contain the duplicated one.
    dupes=$(LC_ALL=C sort -- "$url_file" | LC_ALL=C uniq -cd)

    if [[ -z "$dupes" ]]; then
        verbose "No duplicate URLs found"
        return 0
    fi

    section_header "Duplicate URLs"
    # read strips uniq's leading padding; the first field is the count and
    # everything after it is the URL.
    while read -r count url; do
        printf '  %b✗%b %s (%s occurrences)\n' "${RED:-}" "${RESET:-}" "$url" "$count"
        COUNT_DUPLICATES=$((COUNT_DUPLICATES + 1))
    done <<< "$dupes"
    return 0
}

#######################################
# Map a sitemap URL to the file that should represent it below PUBLIC_DIR.
# Globals:   PUBLIC_DIR (read)
# Arguments: $1 - absolute URL
# Outputs:   the local path on stdout
# Mapping:
#   - "#fragment" and "?query" are dropped; they never name a file on disk.
#   - scheme and host ("http[s]://host") are removed.
#   - when the remaining path names an existing regular file below
#     PUBLIC_DIR (e.g. /feed.xml, /about.html) that file is returned;
#   - otherwise the path is treated as a directory and "<path>/index.html"
#     is returned.
#######################################
url_to_local_path() {
    local url="$1"
    local scheme_host_re='^https?://[^/]+(.*)$'
    local path

    path="${url%%#*}"
    path="${path%%\?*}"
    if [[ "$path" =~ $scheme_host_re ]]; then
        path="${BASH_REMATCH[1]}"
    fi
    path="${path%/}"

    if [[ -n "$path" && -f "${PUBLIC_DIR}${path}" ]]; then
        printf '%s\n' "${PUBLIC_DIR}${path}"
    else
        printf '%s\n' "${PUBLIC_DIR}${path}/index.html"
    fi
}

#######################################
# Check an HTML file for the basic SEO tags.
# Arguments: $1 - HTML file
# Outputs:   a comma-separated list of the missing items on stdout, drawn from
#              missing-title, missing-description, missing-canonical,
#              missing-og-title
#            (an empty line when nothing is missing)
# Matching is case-insensitive and accepts attribute values in double
# quotes, single quotes, or unquoted (as emitted by HTML minifiers), so
# name="description", name='description' and name=description all count.
#######################################
check_meta_tags() {
    local html_file="$1"
    local warnings=""
    local i
    local -a labels=(
        missing-title
        missing-description
        missing-canonical
        missing-og-title
    )
    # The trailing group requires the attribute value to end right after the
    # keyword, so e.g. name="description-extra" does not count.
    local -a patterns=(
        '<title([[:space:]>]|$)'
        "name=[\"']?description([\"'[:space:]/>]|\$)"
        "rel=[\"']?canonical([\"'[:space:]/>]|\$)"
        'og:title'
    )

    for i in "${!labels[@]}"; do
        if ! grep -Eqi -- "${patterns[i]}" "$html_file"; then
            warnings="${warnings}${labels[i]},"
        fi
    done

    printf '%s\n' "${warnings%,}"
}

#######################################
# Check one sitemap URL against the locally built site.
# Globals:   CHECK_META (read)
# Arguments: $1 - URL
#            $2 - directory that receives the result file
# Calls:     url_to_local_path, check_meta_tags
# Result file: named after the MD5 of the URL; two lines:
#              line 1  "ok" | "meta:<missing tags>" | "broken:missing"
#              line 2  the URL
# The status is computed first and the file is written with a single
# printf, so a concurrent check of the same (duplicated) URL can never
# leave a half-written result behind.
#######################################
check_url_local() {
    local url="$1"
    local results_dir="$2"
    local safe_name result_file html_file warnings
    local status="broken:missing"

    safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
    result_file="${results_dir}/${safe_name}"
    html_file=$(url_to_local_path "$url")

    if [[ -f "$html_file" ]]; then
        status="ok"
        if [[ "$CHECK_META" == "true" ]]; then
            warnings=$(check_meta_tags "$html_file")
            if [[ -n "$warnings" ]]; then
                status="meta:${warnings}"
            fi
        fi
    fi

    printf '%s\n%s\n' "$status" "$url" > "$result_file"
}

#######################################
# Check one sitemap URL over HTTP.
# Globals:   CHECK_META (read)
# Arguments: $1 - URL
#            $2 - directory that receives the result file
# Calls:     curl (follows redirects, 10 s limit), check_meta_tags
# Result file: named after the MD5 of the URL; two lines:
#              line 1  "ok" | "meta:<missing tags>" | "broken:<http code>"
#              line 2  the URL
#            A code of 000 means curl itself failed (no usable response).
# The response body is stored (as "<result file>.body") only when the meta
# check needs it and is always removed again; otherwise it is discarded
# straight away.
#######################################
check_url_remote() {
    local url="$1"
    local results_dir="$2"
    local safe_name result_file body_file body_sink http_code status warnings

    safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
    result_file="${results_dir}/${safe_name}"
    body_file="${result_file}.body"

    body_sink="/dev/null"
    if [[ "$CHECK_META" == "true" ]]; then
        body_sink="$body_file"
    fi

    # curl's stderr is silenced on purpose: a failure is reported through the
    # recorded status code rather than through interleaved messages from
    # several parallel checks.
    http_code=$(curl -sL --max-time 10 -o "$body_sink" -w '%{http_code}' "$url" 2>/dev/null) \
        || http_code="000"

    status="broken:${http_code}"
    if [[ "$http_code" == "200" ]]; then
        status="ok"
        if [[ "$CHECK_META" == "true" && -s "$body_file" ]]; then
            warnings=$(check_meta_tags "$body_file")
            if [[ -n "$warnings" ]]; then
                status="meta:${warnings}"
            fi
        fi
    fi

    rm -f -- "$body_file"
    printf '%s\n%s\n' "$status" "$url" > "$result_file"
}

#######################################
# Run the URL check for every line of the URL list, several at a time.
# Globals:
#   PUBLIC_DIR (read) - non-empty selects check_url_local, otherwise
#                       check_url_remote is used
#   PARALLEL   (read) - maximum number of checks in flight; anything that is
#                       not a positive integer is reported and treated as 1
# Arguments: $1 - file with one URL per line (empty lines are skipped)
#            $2 - results directory (created when missing)
# Returns:   0 once every started check has finished; the outcome of each
#            check is recorded in its result file, not in the exit status
#######################################
check_urls_parallel() {
    local url_file="$1"
    local results_dir="$2"
    local check_fn="check_url_remote"
    local max_jobs="$PARALLEL"
    local url
    local -a pids=()

    if [[ -n "$PUBLIC_DIR" ]]; then
        check_fn="check_url_local"
    fi

    if [[ ! "$max_jobs" =~ ^[1-9][0-9]*$ ]]; then
        warn "Invalid PARALLEL value '${max_jobs}' — running checks one at a time"
        max_jobs=1
    fi

    mkdir -p -- "$results_dir"

    while IFS= read -r url || [[ -n "$url" ]]; do
        if [[ -z "$url" ]]; then
            continue
        fi

        "$check_fn" "$url" "$results_dir" &
        pids+=("$!")

        # At the limit: block until the oldest job is done, then drop it from
        # the queue so the next URL can start.
        if (( ${#pids[@]} >= max_jobs )); then
            wait "${pids[0]}" 2>/dev/null || true
            pids=("${pids[@]:1}")
        fi
    done < "$url_file"

    # A bare wait collects every remaining child. Unlike looping over
    # "${pids[@]}", it is also safe under `set -u` on bash versions older
    # than 4.4 when the queue happens to be empty.
    wait
}

#######################################
# Print the outcome of the URL checks and update the counters.
# Globals:
#   COUNT_TOTAL, COUNT_OK, COUNT_BROKEN, COUNT_MISSING_META (written)
#   VERBOSE, RED, GREEN, YELLOW, RESET (read)
# Arguments: $1 - results directory; every regular file in it holds a status
#                 on its first line and the URL on its last line
# Outputs:   a "URL Check Results" section on stdout
#   ok             counted as OK; listed only in verbose mode
#   meta:<tags>    counted as OK and as missing-meta; always listed
#   broken:<code>  counted as broken; always listed. The code "missing"
#                  (written by the local check) is shown as "local file
#                  missing", anything else as "HTTP <code>".
# Files ending in ".body" are temporary downloads, not results, and are
# ignored. A file with any other status counts towards the total only.
#######################################
report_url_results() {
    local results_dir="$1"
    local result_file status url detail

    section_header "URL Check Results"

    for result_file in "$results_dir"/*; do
        if [[ ! -f "$result_file" || "$result_file" == *.body ]]; then
            continue
        fi

        status=$(head -n 1 "$result_file")
        url=$(tail -n 1 "$result_file")
        COUNT_TOTAL=$((COUNT_TOTAL + 1))

        case "$status" in
            ok)
                COUNT_OK=$((COUNT_OK + 1))
                if [[ "$VERBOSE" == "true" ]]; then
                    printf '  %b✓%b %s\n' "${GREEN:-}" "${RESET:-}" "$url"
                fi
                ;;
            broken:*)
                COUNT_BROKEN=$((COUNT_BROKEN + 1))
                detail="${status#broken:}"
                if [[ "$detail" == "missing" ]]; then
                    detail="local file missing"
                else
                    detail="HTTP ${detail}"
                fi
                printf '  %b✗%b %s — %s\n' "${RED:-}" "${RESET:-}" "$url" "$detail"
                ;;
            meta:*)
                COUNT_OK=$((COUNT_OK + 1))
                COUNT_MISSING_META=$((COUNT_MISSING_META + 1))
                printf '  %b⚠%b %s — %s\n' "${YELLOW:-}" "${RESET:-}" "$url" "${status#meta:}"
                ;;
        esac
    done
    return 0
}

#######################################
# List Hugo leaf bundles (directories holding an index.md) below LOCAL_DIR
# that have no matching URL in the sitemap.
# Globals:
#   LOCAL_DIR    (read)    - content directory; the check is skipped when
#                            empty or not a directory
#   BASE_URL     (read/written) - site root; derived from the first sitemap
#                            URL (scheme + host) when empty
#   COUNT_ORPHAN (written) - incremented per orphan
#   YELLOW, GREEN, RESET (read)
# Arguments: $1 - file with one sitemap URL per line
# Outputs:   an "Orphan Content" section on stdout
# Returns:   0
# Matching:
#   A bundle at <LOCAL_DIR>/<rel>/index.md is considered present when some
#   sitemap URL, ignoring one trailing slash, ends with "/<rel>". Comparing
#   whole path components avoids treating "posts/foo" as covered by
#   ".../posts/foo-bar/". A bundle directly in LOCAL_DIR is matched against
#   BASE_URL itself.
# Drafts:
#   Bundles whose first front matter block (YAML between "---" lines or TOML
#   between "+++" lines) sets draft to true are skipped.
# Limitations:
#   Only files named exactly index.md are inspected; single-file pages and
#   _index.md section pages are not. Custom slugs/permalinks are not
#   resolved, so such pages may be reported although they are published.
#######################################
check_orphan_content() {
    local url_file="$1"
    local base_re='^https?://[^/]+'
    local content_root content_file rel_path expected_url needle
    local first_url=""
    local orphan_found=false

    if [[ -z "$LOCAL_DIR" ]]; then
        verbose "No --local-dir specified, skipping orphan check"
        return 0
    fi

    if [[ ! -d "$LOCAL_DIR" ]]; then
        warn "Content directory not found: $LOCAL_DIR"
        return 0
    fi

    if [[ -z "$BASE_URL" ]]; then
        IFS= read -r first_url < "$url_file" || true
        if [[ "$first_url" =~ $base_re ]]; then
            BASE_URL="${BASH_REMATCH[0]}"
        else
            warn "Could not determine base URL — skipping orphan check"
            return 0
        fi
    fi

    section_header "Orphan Content (not in sitemap)"

    # A trailing slash on --local-dir would otherwise defeat the prefix
    # stripping below.
    content_root="${LOCAL_DIR%/}"
    if [[ -z "$content_root" ]]; then
        content_root="/"
    fi

    while IFS= read -r -d '' content_file; do
        rel_path="${content_file%/*}"
        rel_path="${rel_path#"$content_root"}"
        rel_path="${rel_path#/}"

        # Inspect only the first front matter block; exit status 0 = draft.
        if awk '
                { sub(/\r$/, "") }
                !delim { if ($0 == "---" || $0 == "+++") delim = $0; next }
                $0 == delim { exit }
                tolower($0) ~ /^[ \t]*draft[ \t]*[:=][ \t]*true/ { draft = 1; exit }
                END { exit(draft ? 0 : 1) }
            ' "$content_file" 2>/dev/null; then
            verbose "Skipping draft: $content_file"
            continue
        fi

        expected_url="${BASE_URL%/}/${rel_path:+${rel_path}/}"
        if [[ -n "$rel_path" ]]; then
            needle="/${rel_path}"
        else
            needle="${BASE_URL%/}"
        fi

        # The needle travels through the environment so awk does not apply
        # escape processing to it.
        if ! NEEDLE="$needle" awk '
                {
                    u = $0
                    sub(/\r$/, "", u)
                    sub(/\/$/, "", u)
                    n = length(ENVIRON["NEEDLE"])
                    if (length(u) >= n && substr(u, length(u) - n + 1) == ENVIRON["NEEDLE"]) {
                        hit = 1
                        exit
                    }
                }
                END { exit(hit ? 0 : 1) }
            ' "$url_file" 2>/dev/null; then
            printf '  %b⚠%b %s → expected %s\n' \
                "${YELLOW:-}" "${RESET:-}" "$content_file" "$expected_url"
            COUNT_ORPHAN=$((COUNT_ORPHAN + 1))
            orphan_found=true
        fi
    done < <(find "$content_root" -name "index.md" -print0 2>/dev/null)

    if [[ "$orphan_found" == "false" ]]; then
        printf '  %b✓%b All content files found in sitemap\n' "${GREEN:-}" "${RESET:-}"
    fi
    return 0
}

#######################################
# Print the final counters and the overall verdict.
# Globals:
#   COUNT_TOTAL, COUNT_OK, COUNT_BROKEN, COUNT_MISSING_META,
#   COUNT_DUPLICATES, COUNT_ORPHAN (read)
#   BOLD, GREEN, RED, YELLOW, RESET (read)
# Outputs:   a "Summary" section on stdout
# Returns:   1 (FAIL) when at least one URL is broken or duplicated,
#            0 (PASS) otherwise; missing meta tags and orphan content are
#            reported but do not fail the run
# Colour values are passed as %b arguments instead of being pasted into the
# printf format string, so their content can never be taken for a format
# directive.
#######################################
print_summary() {
    section_header "Summary"

    printf '  %b%-24s%b %d\n' "${BOLD:-}" "Total URLs:" "${RESET:-}" "$COUNT_TOTAL"
    printf '  %b%-24s%b %b%d%b\n' "${BOLD:-}" "OK:" "${RESET:-}" \
        "${GREEN:-}" "$COUNT_OK" "${RESET:-}"
    printf '  %b%-24s%b %b%d%b\n' "${BOLD:-}" "Broken:" "${RESET:-}" \
        "${RED:-}" "$COUNT_BROKEN" "${RESET:-}"
    printf '  %b%-24s%b %b%d%b\n' "${BOLD:-}" "Missing meta tags:" "${RESET:-}" \
        "${YELLOW:-}" "$COUNT_MISSING_META" "${RESET:-}"
    printf '  %b%-24s%b %b%d%b\n' "${BOLD:-}" "Duplicate URLs:" "${RESET:-}" \
        "${YELLOW:-}" "$COUNT_DUPLICATES" "${RESET:-}"
    printf '  %b%-24s%b %b%d%b\n' "${BOLD:-}" "Orphan content files:" "${RESET:-}" \
        "${YELLOW:-}" "$COUNT_ORPHAN" "${RESET:-}"
    printf '\n'

    if (( COUNT_BROKEN > 0 || COUNT_DUPLICATES > 0 )); then
        printf '  %bFAIL%b — issues found\n' "${RED:-}${BOLD:-}" "${RESET:-}"
        return 1
    fi

    printf '  %bPASS%b — sitemap looks good\n' "${GREEN:-}${BOLD:-}" "${RESET:-}"
    return 0
}

# ══════════════════════════════════════════════════════════════════════
# USAGE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Print the help text.
# Globals:   SCRIPT_NAME, PARALLEL (read; PARALLEL is shown as the current
#            default for --parallel)
# Outputs:   help text on stdout
#######################################
usage() {
    cat <<EOF
${SCRIPT_NAME} — Validate Hugo sitemap URLs, meta tags, and content parity

USAGE:
    ${SCRIPT_NAME} [OPTIONS] <SITEMAP_URL>
    ${SCRIPT_NAME} --file sitemap.xml [OPTIONS]

OPTIONS:
    --file FILE         Read sitemap from local file instead of URL
                        Auto-detects public dir and checks local HTML files
    --public-dir DIR    Override public directory for local HTML checks
    --local-dir DIR     Hugo content directory for orphan detection
    --base-url URL      Override base URL for orphan content matching
    --parallel N        Max concurrent URL checks (default: ${PARALLEL})
    --no-meta           Skip meta tag checks
    --verbose           Enable debug output
    --no-color          Disable colored output
    -h, --help          Show this help

ENVIRONMENT VARIABLES:
    SITEMAP_URL         Default sitemap URL if not passed as argument
    CHECK_META          Enable/disable meta checks (default: true)
    PARALLEL            Max concurrent checks (default: 5)
    VERBOSE             Enable debug output when set to "true" (default: false)
    COLOR               Color mode: auto, always, never (default: auto)

EXAMPLES:
    # Validate remote sitemap
    ./${SCRIPT_NAME} https://example.com/sitemap.xml

    # Validate local build with content cross-reference
    ./${SCRIPT_NAME} --file public/sitemap.xml --local-dir content

    # Fast check — skip meta tag validation
    ./${SCRIPT_NAME} --no-meta https://example.com/sitemap.xml

    # High concurrency
    ./${SCRIPT_NAME} --parallel 20 https://example.com/sitemap.xml
EOF
}

# ══════════════════════════════════════════════════════════════════════
# ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════

#######################################
# Translate the command line into the global settings.
# Globals (written): SITEMAP_FILE, PUBLIC_DIR, LOCAL_DIR, BASE_URL, PARALLEL,
#                    CHECK_META, VERBOSE, COLOR, SITEMAP_URL
# Globals (read):    SCRIPT_NAME
# Arguments: the script's arguments. A non-option argument is taken as the
#            sitemap URL; when several are given the last one wins.
# Exits:     0 after printing the help (--help / -h);
#            1 for an unknown option, an option given without its value, or
#              a --parallel value that is not a positive integer
# setup_colors is called before each error message because main
# initialises the colours only after parsing has finished.
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Options that consume the following argument: make sure it exists
        # before "$2" is touched (it would be unbound under `set -u`).
        case "$1" in
            --file | --public-dir | --local-dir | --base-url | --parallel)
                if [[ $# -lt 2 ]]; then
                    setup_colors
                    err "Option $1 requires a value"
                    echo "Run ${SCRIPT_NAME} --help for usage" >&2
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --file)
                SITEMAP_FILE="$2"
                shift 2
                ;;
            --public-dir)
                PUBLIC_DIR="$2"
                shift 2
                ;;
            --local-dir)
                LOCAL_DIR="$2"
                shift 2
                ;;
            --base-url)
                BASE_URL="$2"
                shift 2
                ;;
            --parallel)
                if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
                    setup_colors
                    err "--parallel expects a positive integer, got: $2"
                    echo "Run ${SCRIPT_NAME} --help for usage" >&2
                    exit 1
                fi
                PARALLEL="$2"
                shift 2
                ;;
            --no-meta)
                CHECK_META="false"
                shift
                ;;
            --verbose)
                VERBOSE="true"
                shift
                ;;
            --no-color)
                COLOR="never"
                shift
                ;;
            --help | -h)
                setup_colors
                usage
                exit 0
                ;;
            -*)
                setup_colors
                err "Unknown option: $1"
                echo "Run ${SCRIPT_NAME} --help for usage" >&2
                exit 1
                ;;
            *)
                SITEMAP_URL="$1"
                shift
                ;;
        esac
    done
    return 0
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════

#######################################
# Entry point: parse arguments, load the sitemap, run every check and print
# the summary.
# Globals (written): TMPDIR_WORK, PUBLIC_DIR, plus everything parse_args and
#                    the individual checks update
# Arguments: the script's command-line arguments
# Outputs:   the report on stdout, diagnostics on stderr
# Returns:   the status of print_summary (0 = PASS, 1 = FAIL)
# Exits:     1 when no sitemap source is given, an explicit --public-dir
#            does not exist, the scratch directory cannot be created, or
#            the sitemap lists no URL
# Mode selection:
#   With --public-dir, URLs are checked against files in that directory.
#   With --file alone, the sitemap's own directory is used for local checks
#   only when it contains a "guides" or "posts" sub-directory; otherwise the
#   URLs are fetched remotely.
#######################################
main() {
    local sitemap_file url_file results_dir url_count
    local mode="remote"

    parse_args "$@"
    setup_colors

    if [[ -z "$SITEMAP_URL" && -z "$SITEMAP_FILE" ]]; then
        err "No sitemap URL or file specified"
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
    fi

    # An explicitly requested public dir must exist; otherwise every URL
    # would just be reported as missing.
    if [[ -n "$PUBLIC_DIR" ]]; then
        if [[ ! -d "$PUBLIC_DIR" ]]; then
            err "Public directory not found: $PUBLIC_DIR"
            exit 1
        fi
        if [[ "$PUBLIC_DIR" != "/" ]]; then
            PUBLIC_DIR="${PUBLIC_DIR%/}"
        fi
    fi

    if ! TMPDIR_WORK=$(mktemp -d); then
        err "Could not create a temporary working directory"
        exit 1
    fi
    sitemap_file="${TMPDIR_WORK}/sitemap.xml"
    url_file="${TMPDIR_WORK}/urls.txt"
    results_dir="${TMPDIR_WORK}/results"

    # Auto-detect public directory from --file path
    if [[ -n "$SITEMAP_FILE" && -z "$PUBLIC_DIR" ]]; then
        # A missing directory must not abort here under `set -e`;
        # fetch_sitemap reports the missing file with a proper message.
        PUBLIC_DIR=$(cd -- "$(dirname -- "$SITEMAP_FILE")" 2>/dev/null && pwd) || PUBLIC_DIR=""
        if [[ -z "$PUBLIC_DIR" ]]; then
            verbose "Could not resolve the directory of ${SITEMAP_FILE} — falling back to remote"
        elif [[ ! -d "${PUBLIC_DIR}/guides" && ! -d "${PUBLIC_DIR}/posts" ]]; then
            verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR} but no content dirs found — falling back to remote"
            PUBLIC_DIR=""
        else
            verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR}"
        fi
    fi

    if [[ -n "$PUBLIC_DIR" ]]; then
        mode="local (${PUBLIC_DIR})"
    fi

    # fetch_sitemap prefers the file over the URL, so the banner shows the
    # same precedence.
    printf '\n%bSitemap Validator%b\n' "${BOLD:-}" "${RESET:-}"
    printf '%bSource: %s%b\n' "${DIM:-}" "${SITEMAP_FILE:-$SITEMAP_URL}" "${RESET:-}"
    printf '%bMode: %s | Parallel: %s | Meta check: %s%b\n' \
        "${DIM:-}" "$mode" "$PARALLEL" "$CHECK_META" "${RESET:-}"

    fetch_sitemap "$sitemap_file"
    extract_urls "$sitemap_file" > "$url_file"

    url_count=$(wc -l < "$url_file")
    url_count=$((url_count))   # drop the padding some wc implementations add
    if [[ "$url_count" -eq 0 ]]; then
        err "No URLs found in sitemap"
        exit 1
    fi
    log "Found ${url_count} URLs in sitemap"

    # Check for localhost
    if grep -Eqi 'localhost|127\.0\.0\.1' "$url_file"; then
        warn "Sitemap contains localhost URLs — check your baseURL"
    fi

    check_duplicates "$url_file"
    check_urls_parallel "$url_file" "$results_dir"
    report_url_results "$results_dir"
    check_orphan_content "$url_file"
    print_summary
}

# Run the validator with the script's command-line arguments.
main "$@"