Files
linux-scripts/sitemap-validator.sh
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

493 lines
17 KiB
Bash
Executable File

#!/usr/bin/env bash
#########################################################################################
#### sitemap-validator.sh — Validate Hugo sitemap URLs, meta tags, and content parity ####
#### Checks each URL for HTTP 200, duplicates, missing meta tags, and compares ####
#### sitemap entries against local Hugo content files ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version 1.1 ####
#### ####
#### Usage: ####
#### ./sitemap-validator.sh https://example.com/sitemap.xml ####
#### ./sitemap-validator.sh --file public/sitemap.xml --local-dir content ####
#### ./sitemap-validator.sh --help ####
#### ####
#### See --help for all options. ####
#########################################################################################
set -euo pipefail
# ── Defaults ──────────────────────────────────────────────────────────
# Settings written as "${NAME:-…}" can be pre-set through the environment;
# command-line flags handled in parse_args override them.
SITEMAP_URL="${SITEMAP_URL:-}"
SITEMAP_FILE=""
LOCAL_DIR=""
PUBLIC_DIR=""
CHECK_META="${CHECK_META:-true}"
PARALLEL="${PARALLEL:-5}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
BASE_URL=""
# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
TMPDIR_WORK=""
COUNT_TOTAL=0
COUNT_OK=0
COUNT_BROKEN=0
COUNT_MISSING_META=0
COUNT_DUPLICATES=0
COUNT_ORPHAN=0
# Color escape codes start out empty so the logging helpers can be called
# before setup_colors has run without tripping `set -u`.
RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
# ── Colors ────────────────────────────────────────────────────────────
# Populate the global color variables (RED, GREEN, YELLOW, CYAN, BOLD, DIM,
# RESET) according to COLOR:
#   never  -> all empty
#   always -> ANSI escape sequences
#   other  -> ANSI escape sequences only when stdout is a terminal
setup_colors() {
  local enable=false
  case "$COLOR" in
    never) ;;
    always) enable=true ;;
    *)
      if [[ -t 1 ]]; then
        enable=true
      fi
      ;;
  esac

  if [[ "$enable" == "true" ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
  fi
}
# ── Logging ───────────────────────────────────────────────────────────
# Logging helpers. Each joins its arguments with spaces and prints one line
# with a colored level tag. Color variables are expanded with an empty
# default so the helpers are safe under `set -u` even when called before
# setup_colors (e.g. from argument parsing). Only the color codes go through
# %b; the message itself is printed verbatim with %s so backslashes in URLs
# or paths are not interpreted as escape sequences.

# Informational message on stdout.
log() { printf '%b[INFO]%b %s\n' "${CYAN:-}" "${RESET:-}" "$*"; }

# Warning on stderr.
warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${RESET:-}" "$*" >&2; }

# Error on stderr. Does not exit; callers decide what to do next.
err() { printf '%b[ERROR]%b %s\n' "${RED:-}" "${RESET:-}" "$*" >&2; }

# Debug message on stdout, printed only when VERBOSE is "true".
verbose() {
  if [[ "${VERBOSE:-false}" == "true" ]]; then
    printf '%b[DEBUG]%b %s\n' "${DIM:-}" "${RESET:-}" "$*"
  fi
}
# ── Helpers ───────────────────────────────────────────────────────────
# Print a section title ($1) framed by a blank line above and below.
# The title line is rendered with escape interpretation so the BOLD, CYAN
# and RESET color codes take effect.
section_header() {
  printf '\n%b\n\n' " ${BOLD}${CYAN}── $1 ──${RESET}"
}
# Remove the temporary working directory, if one was created and still
# exists. Registered below as the EXIT trap so it runs on every exit path.
cleanup() {
  [[ -n "$TMPDIR_WORK" ]] || return 0
  [[ -d "$TMPDIR_WORK" ]] || return 0
  rm -rf "$TMPDIR_WORK"
}
trap cleanup EXIT
# Place the sitemap XML at the path given in $1.
# A local SITEMAP_FILE takes precedence and is copied; otherwise SITEMAP_URL
# is downloaded with curl (30 s limit, redirects followed, HTTP errors
# treated as failure). Exits the script with status 1 when neither source
# is configured, the file is missing, or the download fails.
fetch_sitemap() {
  local dest="$1"

  if [[ -z "$SITEMAP_FILE" && -z "$SITEMAP_URL" ]]; then
    err "No sitemap URL or file specified"
    exit 1
  fi

  if [[ -n "$SITEMAP_FILE" ]]; then
    if [[ -f "$SITEMAP_FILE" ]]; then
      cp "$SITEMAP_FILE" "$dest"
      return
    fi
    err "Sitemap file not found: $SITEMAP_FILE"
    exit 1
  fi

  verbose "Fetching sitemap from $SITEMAP_URL"
  if curl -fsSL --max-time 30 "$SITEMAP_URL" -o "$dest" 2>/dev/null; then
    return 0
  fi
  err "Failed to fetch sitemap: $SITEMAP_URL"
  exit 1
}
# Print every <loc> value found in the sitemap file ($1), one per line.
#
# Sitemap URLs are XML-escaped (e.g. "&" is written "&amp;"), so the five
# predefined XML entities are decoded before the URLs are handed to curl or
# compared with each other. Whitespace padding inside <loc>…</loc> is
# trimmed. Prints nothing (and still succeeds) when no <loc> is present.
# Requires a grep with PCRE support (-P).
extract_urls() {
  local sitemap_file="$1"
  # \K drops "<loc>" and leading blanks from the match; the optional group
  # keeps inner characters but forces the match to end on a non-blank.
  # "&amp;" is decoded last so "&amp;lt;" becomes "&lt;", not "<".
  { grep -oP '<loc>\s*\K[^<\s](?:[^<]*[^<\s])?' -- "$sitemap_file" || true; } \
    | sed \
      -e 's/&lt;/</g' \
      -e 's/&gt;/>/g' \
      -e 's/&quot;/"/g' \
      -e "s/&apos;/'/g" \
      -e 's/&amp;/\&/g'
}
# Report URLs that appear more than once in the URL list file ($1).
# Prints one line per duplicated URL with its occurrence count and adds the
# number of distinct duplicated URLs to COUNT_DUPLICATES.
check_duplicates() {
  local url_file="$1"
  local dupes url count
  dupes=$(sort -- "$url_file" | uniq -d)

  if [[ -z "$dupes" ]]; then
    verbose "No duplicate URLs found"
    return 0
  fi

  section_header "Duplicate URLs"
  while IFS= read -r url; do
    # -x restricts the count to whole-line matches; a plain substring match
    # would also count every longer URL that merely contains this one.
    count=$(grep -cxF -- "$url" "$url_file" || true)
    echo -e " ${RED}${RESET} ${url} (${count} occurrences)"
    COUNT_DUPLICATES=$((COUNT_DUPLICATES + 1))
  done <<< "$dupes"
}
# Map a sitemap URL ($1) to the index.html expected for it below PUBLIC_DIR:
# the first "http(s)://host" portion is removed, one trailing slash is
# dropped, and "/index.html" is appended. The path is printed on stdout.
url_to_local_path() {
  local url="$1"
  local origin_re='https?://[^/]+'
  local rel="$url"

  if [[ "$url" =~ $origin_re ]]; then
    # Remove the matched scheme+host literally (quoted, so no globbing).
    rel="${url/"${BASH_REMATCH[0]}"/}"
  fi
  rel="${rel%/}"

  printf '%s\n' "${PUBLIC_DIR}${rel}/index.html"
}
# Inspect an HTML file ($1) for basic SEO tags and print a comma-separated
# list of the ones that are missing (empty output when all are present):
#   missing-title, missing-description, missing-canonical, missing-og-title
#
# Attribute values are accepted double-quoted, single-quoted or unquoted,
# because minified HTML (e.g. `hugo --minify`) drops optional quotes and
# would otherwise be reported as missing description/canonical tags.
check_meta_tags() {
  local html_file="$1"
  local warnings=""
  # What may follow an attribute value: closing quote, blank, "/", ">" or EOL.
  local value_end="([\"'[:space:]/>]|\$)"

  # <title> may carry attributes, so accept "<title" followed by blank or ">".
  if ! grep -qiE -- '<title([[:space:]>]|$)' "$html_file"; then
    warnings="${warnings}missing-title,"
  fi
  if ! grep -qiE -- "name=[\"']?description${value_end}" "$html_file"; then
    warnings="${warnings}missing-description,"
  fi
  if ! grep -qiE -- "rel=[\"']?canonical${value_end}" "$html_file"; then
    warnings="${warnings}missing-canonical,"
  fi
  if ! grep -qi -- 'og:title' "$html_file"; then
    warnings="${warnings}missing-og-title,"
  fi

  # Drop the trailing comma left by the last appended entry.
  echo "${warnings%,}"
}
# Check one sitemap URL ($1) against the local build and record the outcome
# in a two-line file inside the results directory ($2), named after the MD5
# of the URL:
#   line 1: "ok", "meta:<missing tags>" or "broken:missing"
#   line 2: the URL
# Meta tags are only inspected when CHECK_META is "true".
check_url_local() {
  local url="$1"
  local results_dir="$2"

  # The here-string appends a newline, matching `echo "$url" | md5sum`.
  local digest
  digest=$(md5sum <<< "$url")
  digest="${digest%% *}"
  local out_path="${results_dir}/${digest}"

  local page
  page=$(url_to_local_path "$url")

  local status="broken:missing"
  if [[ -f "$page" ]]; then
    status="ok"
    if [[ "$CHECK_META" == "true" ]]; then
      local missing
      missing=$(check_meta_tags "$page")
      if [[ -n "$missing" ]]; then
        status="meta:${missing}"
      fi
    fi
  fi

  printf '%s\n%s\n' "$status" "$url" > "$out_path"
}
# Fetch one sitemap URL ($1) with curl (10 s limit, redirects followed) and
# record the outcome in a two-line file inside the results directory ($2),
# named after the MD5 of the URL:
#   line 1: "ok", "meta:<missing tags>" or "broken:<http code>"
#   line 2: the URL
# A curl failure is recorded as code "000". The downloaded body is kept in
# a temporary "<md5>.body" file only long enough to inspect its meta tags
# (when CHECK_META is "true" and the body is non-empty) and is then removed.
check_url_remote() {
  local url="$1"
  local results_dir="$2"

  # The here-string appends a newline, matching `echo "$url" | md5sum`.
  local digest
  digest=$(md5sum <<< "$url")
  digest="${digest%% *}"
  local out_path="${results_dir}/${digest}"
  local body_path="${out_path}.body"

  local code
  if ! code=$(curl -sL --max-time 10 -o "$body_path" -w '%{http_code}' "$url" 2>/dev/null); then
    code="000"
  fi

  local status="broken:${code}"
  if [[ "$code" == "200" ]]; then
    status="ok"
    if [[ "$CHECK_META" == "true" && -s "$body_path" ]]; then
      local missing
      missing=$(check_meta_tags "$body_path")
      if [[ -n "$missing" ]]; then
        status="meta:${missing}"
      fi
    fi
  fi

  printf '%s\n%s\n' "$status" "$url" > "$out_path"
  rm -f "$body_path"
}
# Run the per-URL check for every non-empty line of the URL list ($1),
# writing results into the results directory ($2), which is created if
# needed. URLs are checked against the local build when PUBLIC_DIR is set,
# otherwise over HTTP. Checks run as background jobs; once PARALLEL jobs
# have been started without being reaped, the oldest one is waited for
# before the next is launched. All remaining jobs are reaped before return.
check_urls_parallel() {
  local url_file="$1"
  local results_dir="$2"

  local worker
  if [[ -z "$PUBLIC_DIR" ]]; then
    worker="check_url_remote"
  else
    worker="check_url_local"
  fi

  mkdir -p "$results_dir"

  local -a jobs=()
  local line job
  while IFS= read -r line; do
    if [[ -z "$line" ]]; then
      continue
    fi
    "$worker" "$line" "$results_dir" &
    jobs+=("$!")
    if [[ "${#jobs[@]}" -ge "$PARALLEL" ]]; then
      # Reap the oldest job; its exit status is intentionally ignored
      # because each job records its own outcome in a result file.
      wait "${jobs[0]}" 2>/dev/null || true
      jobs=("${jobs[@]:1}")
    fi
  done < "$url_file"

  for job in "${jobs[@]}"; do
    wait "$job" 2>/dev/null || true
  done
}
# Summarise the per-URL result files found in the results directory ($1).
# Each file holds the status on its first line and the URL on its last:
#   ok              counted as OK; listed only when VERBOSE is "true"
#   broken:<code>   counted as broken; always listed
#   meta:<tags>     counted as OK and as missing-meta; always listed
# Updates COUNT_TOTAL, COUNT_OK, COUNT_BROKEN and COUNT_MISSING_META.
report_url_results() {
  local results_dir="$1"
  local result_file status url detail

  section_header "URL Check Results"

  for result_file in "$results_dir"/*; do
    [[ -f "$result_file" ]] || continue
    status=$(head -n 1 "$result_file")
    url=$(tail -n 1 "$result_file")
    COUNT_TOTAL=$((COUNT_TOTAL + 1))

    case "$status" in
      ok)
        COUNT_OK=$((COUNT_OK + 1))
        if [[ "$VERBOSE" == "true" ]]; then
          echo -e " ${GREEN}${RESET} ${url}"
        fi
        ;;
      broken:*)
        detail="${status#broken:}"
        COUNT_BROKEN=$((COUNT_BROKEN + 1))
        if [[ "$detail" == "missing" ]]; then
          # Local-build mode makes no HTTP request, so there is no status
          # code to show; say what actually went wrong instead.
          echo -e " ${RED}${RESET} ${url} — local file missing"
        else
          echo -e " ${RED}${RESET} ${url} — HTTP ${detail}"
        fi
        ;;
      meta:*)
        detail="${status#meta:}"
        COUNT_OK=$((COUNT_OK + 1))
        COUNT_MISSING_META=$((COUNT_MISSING_META + 1))
        # Separate the tag list from the URL so the line stays readable.
        echo -e " ${YELLOW}${RESET} ${url} — ${detail}"
        ;;
    esac
  done
}
# Report Hugo page bundles (directories containing index.md) below LOCAL_DIR
# whose URL does not appear in the sitemap URL list ($1). Each orphan is
# printed with the URL it was expected at and counted in COUNT_ORPHAN.
#
# - Skipped entirely when LOCAL_DIR is unset or not a directory, or when no
#   BASE_URL was given and none can be derived from the first sitemap URL.
# - Bundles whose front matter (YAML "---" or TOML "+++" fenced block at the
#   top of the file) sets draft to true are ignored.
# - A bundle counts as listed when some sitemap URL contains its path as
#   complete segments ("/<path>/" or "/<path>.html"), compared
#   case-insensitively. This tolerates a base path or language prefix while
#   no longer letting "posts/foo" be satisfied by "posts/foo-bar/".
# - BASE_URL is used for the auto-detection fallback and for the "expected"
#   URL shown to the user; for the content-root bundle it is also what is
#   matched (exact line).
check_orphan_content() {
  local url_file="$1"

  if [[ -z "$LOCAL_DIR" ]]; then
    verbose "No --local-dir specified, skipping orphan check"
    return 0
  fi
  if [[ ! -d "$LOCAL_DIR" ]]; then
    warn "Content directory not found: $LOCAL_DIR"
    return 0
  fi
  if [[ -z "$BASE_URL" ]]; then
    BASE_URL=$(head -n 1 "$url_file" | grep -oP 'https?://[^/]+' || true)
    if [[ -z "$BASE_URL" ]]; then
      warn "Could not determine base URL — skipping orphan check"
      return 0
    fi
  fi

  section_header "Orphan Content (not in sitemap)"

  # Normalise trailing slashes once so prefix stripping and URL building
  # behave the same for "content" and "content/", "…com" and "…com/".
  local content_root="${LOCAL_DIR%/}"
  local base="${BASE_URL%/}"
  local found_orphan=false
  local content_file rel_path expected_url

  while IFS= read -r -d '' content_file; do
    # Directory of the bundle, relative to the content root.
    rel_path="${content_file%/*}"
    rel_path="${rel_path#"$content_root"}"
    rel_path="${rel_path#/}"

    if [[ -n "$rel_path" ]]; then
      expected_url="${base}/${rel_path}/"
    else
      expected_url="${base}/"
    fi

    # Detect drafts with a single awk pass over the leading front matter
    # block. awk stops reading on its own, so no `| head` is needed and no
    # SIGPIPE can abort the script under `set -o pipefail`.
    if awk '
      NR == 1 {
        if ($0 !~ /^(---|[+][+][+])[ \t\r]*$/) exit
        fence = substr($0, 1, 3)
        next
      }
      substr($0, 1, 3) == fence && $0 ~ /^...[ \t\r]*$/ { exit }
      tolower($0) ~ /^[ \t]*draft[ \t]*[:=][ \t]*true/ { draft = 1; exit }
      END { exit(draft ? 0 : 1) }
    ' "$content_file" 2>/dev/null; then
      verbose "Skipping draft: $content_file"
      continue
    fi

    if [[ -n "$rel_path" ]]; then
      if grep -qiF -e "/${rel_path}/" -e "/${rel_path}.html" -- "$url_file" 2>/dev/null; then
        continue
      fi
    elif grep -qixF -- "$expected_url" "$url_file" 2>/dev/null; then
      # Bundle at the content root: an empty path would match every URL,
      # so require the site root URL itself to be listed.
      continue
    fi

    echo -e " ${YELLOW}${RESET} ${content_file} → expected ${expected_url}"
    COUNT_ORPHAN=$((COUNT_ORPHAN + 1))
    found_orphan=true
  done < <(find "$LOCAL_DIR" -type f -name "index.md" -print0 2>/dev/null)

  if [[ "$found_orphan" == "false" ]]; then
    echo -e " ${GREEN}${RESET} All content files found in sitemap"
  fi
}
# Print the final counters as an aligned table followed by a verdict line.
# Returns 1 (FAIL) when any broken or duplicate URL was counted, otherwise
# 0 (PASS). Missing meta tags and orphan content do not affect the verdict.
print_summary() {
  section_header "Summary"

  printf ' %b%-24s%b %d\n' "$BOLD" "Total URLs:" "$RESET" "$COUNT_TOTAL"

  # One row per counter: label, color for the value, value.
  local -a labels=("OK:" "Broken:" "Missing meta tags:" "Duplicate URLs:" "Orphan content files:")
  local -a tints=("$GREEN" "$RED" "$YELLOW" "$YELLOW" "$YELLOW")
  local -a values=("$COUNT_OK" "$COUNT_BROKEN" "$COUNT_MISSING_META" "$COUNT_DUPLICATES" "$COUNT_ORPHAN")
  local i
  for i in "${!labels[@]}"; do
    printf ' %b%-24s%b %b%d%b\n' \
      "$BOLD" "${labels[i]}" "$RESET" "${tints[i]}" "${values[i]}" "$RESET"
  done

  echo ""
  if [[ "$COUNT_BROKEN" -le 0 && "$COUNT_DUPLICATES" -le 0 ]]; then
    echo -e " ${GREEN}${BOLD}PASS${RESET} — sitemap looks good"
    return 0
  fi
  echo -e " ${RED}${BOLD}FAIL${RESET} — issues found"
  return 1
}
# ══════════════════════════════════════════════════════════════════════
# USAGE
# ══════════════════════════════════════════════════════════════════════
# Print the command-line help text to stdout.
# The heredoc delimiter is unquoted, so ${SCRIPT_NAME} and ${PARALLEL} are
# expanded at print time; the "--parallel" default shown is therefore the
# value PARALLEL holds when usage is called.
usage() {
cat <<EOF
${SCRIPT_NAME} — Validate Hugo sitemap URLs, meta tags, and content parity
USAGE:
${SCRIPT_NAME} [OPTIONS] <SITEMAP_URL>
${SCRIPT_NAME} --file sitemap.xml [OPTIONS]
OPTIONS:
--file FILE Read sitemap from local file instead of URL
Auto-detects public dir and checks local HTML files
--public-dir DIR Override public directory for local HTML checks
--local-dir DIR Hugo content directory for orphan detection
--base-url URL Override base URL for orphan content matching
--parallel N Max concurrent URL checks (default: ${PARALLEL})
--no-meta Skip meta tag checks
--verbose Enable debug output
--no-color Disable colored output
--help Show this help
ENVIRONMENT VARIABLES:
SITEMAP_URL Default sitemap URL if not passed as argument
CHECK_META Enable/disable meta checks (default: true)
PARALLEL Max concurrent checks (default: 5)
COLOR Color mode: auto, always, never (default: auto)
EXAMPLES:
# Validate remote sitemap
./${SCRIPT_NAME} https://example.com/sitemap.xml
# Validate local build with content cross-reference
./${SCRIPT_NAME} --file public/sitemap.xml --local-dir content
# Fast check — skip meta tag validation
./${SCRIPT_NAME} --no-meta https://example.com/sitemap.xml
# High concurrency
./${SCRIPT_NAME} --parallel 20 https://example.com/sitemap.xml
EOF
}
# ══════════════════════════════════════════════════════════════════════
# ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════
# Parse command-line arguments into the global settings (SITEMAP_FILE,
# PUBLIC_DIR, LOCAL_DIR, BASE_URL, PARALLEL, CHECK_META, VERBOSE, COLOR,
# SITEMAP_URL). A bare argument is taken as the sitemap URL.
#
# Exits 0 after printing help for --help/-h. Exits 1 with an error message
# for an unknown option, an option missing its value, or a --parallel value
# that is not a non-negative integer.
#
# setup_colors is called before every error report because parsing runs
# before the caller has initialised the color variables; without it the
# error helper would abort on an unset variable under `set -u`.
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --file | --public-dir | --local-dir | --base-url | --parallel)
        # Check for the value explicitly: referencing a missing $2 under
        # `set -u` would kill the script with an unhelpful message.
        if [[ $# -lt 2 ]]; then
          setup_colors
          err "Option ${opt} requires a value"
          echo "Run ${SCRIPT_NAME} --help for usage" >&2
          exit 1
        fi
        case "$opt" in
          --file) SITEMAP_FILE="$2" ;;
          --public-dir) PUBLIC_DIR="$2" ;;
          --local-dir) LOCAL_DIR="$2" ;;
          --base-url) BASE_URL="$2" ;;
          --parallel)
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              setup_colors
              err "--parallel expects a non-negative integer, got: $2"
              echo "Run ${SCRIPT_NAME} --help for usage" >&2
              exit 1
            fi
            # Force base 10 so values such as "08" are not read as octal.
            PARALLEL=$((10#$2))
            ;;
        esac
        shift 2
        ;;
      --no-meta)
        CHECK_META="false"
        shift
        ;;
      --verbose)
        VERBOSE="true"
        shift
        ;;
      --no-color)
        COLOR="never"
        shift
        ;;
      --help | -h)
        setup_colors
        usage
        exit 0
        ;;
      -*)
        setup_colors
        err "Unknown option: $opt"
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
        ;;
      *)
        SITEMAP_URL="$opt"
        shift
        ;;
    esac
  done
}
# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
# Program entry point: parse arguments, load the sitemap, run every check
# and print the summary. Exits 1 when no sitemap source is configured or
# the sitemap contains no URLs; otherwise returns print_summary's status.
main() {
  parse_args "$@"
  setup_colors

  if [[ -z "$SITEMAP_URL" && -z "$SITEMAP_FILE" ]]; then
    err "No sitemap URL or file specified"
    echo "Run ${SCRIPT_NAME} --help for usage" >&2
    exit 1
  fi

  TMPDIR_WORK=$(mktemp -d)
  local sitemap_file="${TMPDIR_WORK}/sitemap.xml"
  local url_file="${TMPDIR_WORK}/urls.txt"
  local results_dir="${TMPDIR_WORK}/results"

  # Auto-detect public directory from --file path
  if [[ -n "$SITEMAP_FILE" && -z "$PUBLIC_DIR" ]]; then
    # If the directory cannot be entered, leave PUBLIC_DIR empty instead of
    # aborting here; fetch_sitemap then reports the missing file properly.
    PUBLIC_DIR=$(cd "$(dirname "$SITEMAP_FILE")" 2>/dev/null && pwd) || PUBLIC_DIR=""
    if [[ -z "$PUBLIC_DIR" ]]; then
      verbose "Could not resolve directory of ${SITEMAP_FILE} — skipping public dir auto-detection"
    elif [[ ! -d "${PUBLIC_DIR}/guides" && ! -d "${PUBLIC_DIR}/posts" ]]; then
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR} but no content dirs found — falling back to remote"
      PUBLIC_DIR=""
    else
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR}"
    fi
  fi

  local mode="remote"
  if [[ -n "$PUBLIC_DIR" ]]; then
    mode="local (${PUBLIC_DIR})"
  fi

  # fetch_sitemap reads the file when both a file and a URL are configured
  # (e.g. SITEMAP_URL exported plus --file), so report the file as source.
  local source_label="${SITEMAP_FILE:-$SITEMAP_URL}"

  echo ""
  echo -e "${BOLD}Sitemap Validator${RESET}"
  echo -e "${DIM}Source: ${source_label}${RESET}"
  echo -e "${DIM}Mode: ${mode} | Parallel: ${PARALLEL} | Meta check: ${CHECK_META}${RESET}"

  fetch_sitemap "$sitemap_file"
  extract_urls "$sitemap_file" > "$url_file"

  local url_count
  url_count=$(wc -l < "$url_file")
  if [[ "$url_count" -eq 0 ]]; then
    err "No URLs found in sitemap"
    exit 1
  fi
  log "Found ${url_count} URLs in sitemap"

  # Check for localhost
  if grep -qi 'localhost\|127\.0\.0\.1' "$url_file"; then
    warn "Sitemap contains localhost URLs — check your baseURL"
  fi

  check_duplicates "$url_file"
  check_urls_parallel "$url_file" "$results_dir"
  report_url_results "$results_dir"
  check_orphan_content "$url_file"
  print_summary
}
# Run the validator, forwarding all command-line arguments unchanged.
main "$@"