#!/usr/bin/env bash
#########################################################################################
#### sitemap-validator.sh — Validate Hugo sitemap URLs, meta tags, and content parity ####
#### Checks each URL for HTTP 200, duplicates, missing meta tags, and compares        ####
#### sitemap entries against local Hugo content files                                 ####
####                                                                                  ####
#### Author: Phil Connor                                                              ####
#### Contact: contact@mylinux.work                                                    ####
#### License: MIT                                                                     ####
#### Version 1.1                                                                      ####
####                                                                                  ####
#### Usage:                                                                           ####
####   ./sitemap-validator.sh https://example.com/sitemap.xml                         ####
####   ./sitemap-validator.sh --file public/sitemap.xml --local-dir content           ####
####   ./sitemap-validator.sh --help                                                  ####
####                                                                                  ####
#### See --help for all options.                                                      ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Values marked ${VAR:-...} may be pre-seeded from the environment.
SITEMAP_URL="${SITEMAP_URL:-}"
SITEMAP_FILE=""
LOCAL_DIR=""
PUBLIC_DIR=""
CHECK_META="${CHECK_META:-true}"
PARALLEL="${PARALLEL:-5}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
BASE_URL=""

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename -- "$0")"
readonly SCRIPT_NAME
TMPDIR_WORK=""
COUNT_TOTAL=0
COUNT_OK=0
COUNT_BROKEN=0
COUNT_MISSING_META=0
COUNT_DUPLICATES=0
COUNT_ORPHAN=0

# ── Colors ────────────────────────────────────────────────────────────
# Start with colours disabled so log/warn/err can be called before
# setup_colors has run without tripping `set -u` on an unbound variable.
RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""

#######################################
# Populate the colour variables according to COLOR.
# Globals:   COLOR (read); RED GREEN YELLOW CYAN BOLD DIM RESET (written)
#   never  -> all empty
#   always -> escape sequences
#   other  -> escape sequences only when stdout is a terminal
#######################################
setup_colors() {
  local use_color="false"

  case "$COLOR" in
    never) use_color="false" ;;
    always) use_color="true" ;;
    *)
      if [[ -t 1 ]]; then
        use_color="true"
      fi
      ;;
  esac

  if [[ "$use_color" == "true" ]]; then
    # Stored as literal backslash sequences; expanded by printf %b / echo -e.
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# %b expands the colour escapes; the message itself goes through %s so
# backslashes in caller-supplied text (URLs, paths) are printed verbatim.

# Informational message on stdout.
log() { printf '%b[INFO]%b %s\n' "$CYAN" "$RESET" "$*"; }

# Warning on stderr.
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$RESET" "$*" >&2; }

# Error on stderr (does not exit; callers decide).
err() { printf '%b[ERROR]%b %s\n' "$RED" "$RESET" "$*" >&2; }
# Debug message on stdout, emitted only when VERBOSE is "true".
# The message is printed verbatim (no backslash-escape interpretation).
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf '%b[DEBUG]%b %s\n' "${DIM:-}" "${RESET:-}" "$*"
  fi
}

# ── Helpers ───────────────────────────────────────────────────────────

# Print a titled section divider surrounded by blank lines.
# Arguments: $1 - section title
section_header() {
  echo ""
  echo -e " ${BOLD}${CYAN}── $1 ──${RESET}"
  echo ""
}

# Remove the scratch directory created by main(), if any.
cleanup() {
  if [[ -n "$TMPDIR_WORK" && -d "$TMPDIR_WORK" ]]; then
    rm -rf -- "$TMPDIR_WORK"
  fi
}
trap cleanup EXIT

# Place the sitemap at $1, copying SITEMAP_FILE when set, otherwise
# downloading SITEMAP_URL. Exits 1 when neither source is usable.
# Arguments: $1 - destination path
fetch_sitemap() {
  local dest="$1"

  if [[ -n "$SITEMAP_FILE" ]]; then
    if [[ ! -f "$SITEMAP_FILE" ]]; then
      err "Sitemap file not found: $SITEMAP_FILE"
      exit 1
    fi
    cp -- "$SITEMAP_FILE" "$dest"
  elif [[ -n "$SITEMAP_URL" ]]; then
    verbose "Fetching sitemap from $SITEMAP_URL"
    if ! curl -fsSL --max-time 30 "$SITEMAP_URL" -o "$dest" 2>/dev/null; then
      err "Failed to fetch sitemap: $SITEMAP_URL"
      exit 1
    fi
  else
    err "No sitemap URL or file specified"
    exit 1
  fi
}

# Print the text of every <loc> element in the sitemap, one per line.
# Arguments: $1 - sitemap XML file
extract_urls() {
  local sitemap_file="$1"

  # -o emits one match per <loc>, so several entries on one line are all
  # found. sed drops the tag and trims surrounding whitespace / CR.
  # grep exits 1 on "no match"; main() reports the empty result itself.
  grep -oE '<loc>[^<]+' "$sitemap_file" \
    | sed -e 's/^<loc>[[:space:]]*//' -e 's/[[:space:]]*$//' \
    || true
}

# Report URLs that appear more than once and bump COUNT_DUPLICATES per URL.
# Arguments: $1 - file with one URL per line
check_duplicates() {
  local url_file="$1"
  local dupes url count

  dupes=$(sort "$url_file" | uniq -d)
  if [[ -z "$dupes" ]]; then
    verbose "No duplicate URLs found"
    return
  fi

  section_header "Duplicate URLs"
  while IFS= read -r url; do
    # -x: whole-line match, so /a/ is not also counted for /a/b/.
    count=$(grep -cxF -- "$url" "$url_file" || true)
    echo -e " ${RED}✗${RESET} ${url} (${count} occurrences)"
    COUNT_DUPLICATES=$((COUNT_DUPLICATES + 1))
  done <<< "$dupes"
}

# Map a sitemap URL to the index.html expected under PUBLIC_DIR.
# Arguments: $1 - URL
# Outputs:   the local path on stdout
url_to_local_path() {
  local url="$1"
  local path

  # Drop scheme and host, then any trailing slash.
  path=$(printf '%s\n' "$url" | sed -E 's|https?://[^/]+||')
  path="${path%/}"
  printf '%s\n' "${PUBLIC_DIR}${path}/index.html"
}

# List which of the expected head tags are absent from an HTML file.
# Arguments: $1 - HTML file
# Outputs:   comma-separated warning names on stdout (empty when none)
check_meta_tags() {
  local html_file="$1"
  local warnings=""

  if ! grep -qi '<title[[:space:]>]' "$html_file"; then
    warnings="${warnings}missing-title,"
  fi
  if ! grep -qi 'name="description"' "$html_file"; then
    warnings="${warnings}missing-description,"
  fi
  if ! grep -qi 'rel="canonical"' "$html_file"; then
    warnings="${warnings}missing-canonical,"
  fi
  if ! grep -qi 'og:title' "$html_file"; then
    warnings="${warnings}missing-og-title,"
  fi

  printf '%s\n' "${warnings%,}"
}

# Check one URL against the local build in PUBLIC_DIR.
# Writes <results_dir>/<md5 of url>: first line is the status
# ("ok", "meta:<list>" or "broken:missing"), last line is the URL.
# Arguments: $1 - URL, $2 - results directory
check_url_local() {
  local url="$1"
  local results_dir="$2"
  local safe_name result_file html_file warnings

  safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
  result_file="${results_dir}/${safe_name}"
  html_file=$(url_to_local_path "$url")

  if [[ -f "$html_file" ]]; then
    echo "ok" > "$result_file"
    if [[ "$CHECK_META" == "true" ]]; then
      warnings=$(check_meta_tags "$html_file")
      if [[ -n "$warnings" ]]; then
        echo "meta:${warnings}" > "$result_file"
      fi
    fi
  else
    echo "broken:missing" > "$result_file"
  fi
  printf '%s\n' "$url" >> "$result_file"
}

# Check one URL over HTTP (following redirects, 10 s limit).
# Writes the same result-file format as check_url_local, with
# "broken:<http code>" for anything other than 200 ("000" if curl failed).
# Arguments: $1 - URL, $2 - results directory
check_url_remote() {
  local url="$1"
  local results_dir="$2"
  local safe_name result_file body_file http_code warnings

  safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
  result_file="${results_dir}/${safe_name}"
  body_file="${results_dir}/${safe_name}.body"

  http_code=$(curl -sL --max-time 10 -o "$body_file" -w '%{http_code}' "$url" 2>/dev/null) \
    || http_code="000"

  if [[ "$http_code" == "200" ]]; then
    echo "ok" > "$result_file"
    if [[ "$CHECK_META" == "true" && -s "$body_file" ]]; then
      warnings=$(check_meta_tags "$body_file")
      if [[ -n "$warnings" ]]; then
        echo "meta:${warnings}" > "$result_file"
      fi
    fi
  else
    echo "broken:${http_code}" > "$result_file"
  fi
  printf '%s\n' "$url" >> "$result_file"

  # The body is only needed for the meta check; remove it so the report
  # loop sees result files only.
  rm -f -- "$body_file"
}

# Run the per-URL check for every line of the URL file, keeping at most
# PARALLEL checks in flight. Uses the local checker when PUBLIC_DIR is set.
# Arguments: $1 - file with one URL per line, $2 - results directory
check_urls_parallel() {
  local url_file="$1"
  local results_dir="$2"
  local check_fn="check_url_remote"
  local url
  local -a pids=()
  local running=0

  if [[ -n "$PUBLIC_DIR" ]]; then
    check_fn="check_url_local"
  fi
  mkdir -p "$results_dir"

  while IFS= read -r url; do
    [[ -z "$url" ]] && continue
    "$check_fn" "$url" "$results_dir" &
    pids+=("$!")
    running=$((running + 1))
    if [[ "$running" -ge "$PARALLEL" ]]; then
      # Free a slot by waiting for the oldest job.
      wait "${pids[0]}" 2>/dev/null || true
      pids=("${pids[@]:1}")
      running=$((running - 1))
    fi
  done < "$url_file"

  # Barrier for whatever is still running. A bare `wait` avoids expanding
  # a possibly-empty array, which is an error under `set -u` on bash < 4.4.
  wait
}

# Print per-URL outcomes from the result files and update the counters.
# Arguments: $1 - results directory
report_url_results() {
  local results_dir="$1"
  local result_file status url code tags

  section_header "URL Check Results"

  for result_file in "$results_dir"/*; do
    [[ -f "$result_file" ]] || continue
    status=$(head -1 "$result_file")
    url=$(tail -1 "$result_file")
    COUNT_TOTAL=$((COUNT_TOTAL + 1))

    case "$status" in
      ok)
        COUNT_OK=$((COUNT_OK + 1))
        if [[ "$VERBOSE" == "true" ]]; then
          echo -e " ${GREEN}✓${RESET} ${url}"
        fi
        ;;
      broken:*)
        code="${status#broken:}"
        COUNT_BROKEN=$((COUNT_BROKEN + 1))
        echo -e " ${RED}✗${RESET} ${url} — HTTP ${code}"
        ;;
      meta:*)
        # Reachable page with missing tags: counted as OK and as a warning.
        tags="${status#meta:}"
        COUNT_OK=$((COUNT_OK + 1))
        COUNT_MISSING_META=$((COUNT_MISSING_META + 1))
        echo -e " ${YELLOW}⚠${RESET} ${url} — ${tags}"
        ;;
    esac
  done
}

# Report index.md files under LOCAL_DIR whose directory path does not
# appear in the sitemap URL list. Skips files whose front matter says
# draft: true. Updates COUNT_ORPHAN; may set BASE_URL from the first URL.
# Arguments: $1 - file with one URL per line
check_orphan_content() {
  local url_file="$1"
  local content_file dir_name rel_path expected_url front_matter
  local found_orphan=false

  if [[ -z "$LOCAL_DIR" ]]; then
    verbose "No --local-dir specified, skipping orphan check"
    return
  fi
  if [[ ! -d "$LOCAL_DIR" ]]; then
    warn "Content directory not found: $LOCAL_DIR"
    return
  fi
  if [[ -z "$BASE_URL" ]]; then
    BASE_URL=$(head -1 "$url_file" | grep -oE 'https?://[^/]+' || true)
    if [[ -z "$BASE_URL" ]]; then
      warn "Could not determine base URL — skipping orphan check"
      return
    fi
  fi

  section_header "Orphan Content (not in sitemap)"

  # find matches the exact name index.md, so _index.md never reaches
  # this loop.
  while IFS= read -r -d '' content_file; do
    dir_name=$(dirname "$content_file")
    rel_path="${dir_name#"$LOCAL_DIR"}"
    rel_path="${rel_path#/}"

    # Collapse doubled slashes, then restore the "//" after the scheme.
    expected_url="${BASE_URL}/${rel_path}/"
    expected_url="${expected_url//\/\///}"
    expected_url="${expected_url/:\//://}"

    # Lines inside ----delimited ranges (delimiters included), at most 20.
    # Done in a single awk so there is no pipe to an early-exiting reader:
    # a SIGPIPE there would abort the whole script under pipefail + set -e.
    front_matter=$(awk '
      /^---$/ { print; inside = !inside; if (++n >= 20) exit; next }
      inside  { print; if (++n >= 20) exit }
    ' "$content_file" 2>/dev/null) || front_matter=""

    if printf '%s\n' "$front_matter" | grep -qi 'draft: *true'; then
      verbose "Skipping draft: $content_file"
      continue
    fi

    if ! grep -qF -- "$rel_path" "$url_file" 2>/dev/null; then
      echo -e " ${YELLOW}⚠${RESET} ${content_file} → expected ${expected_url}"
      COUNT_ORPHAN=$((COUNT_ORPHAN + 1))
      found_orphan=true
    fi
  done < <(find "$LOCAL_DIR" -name "index.md" -print0 2>/dev/null)

  if [[ "$found_orphan" == "false" ]]; then
    echo -e " ${GREEN}✓${RESET} All content files found in sitemap"
  fi
}

# Print the counters and the overall verdict.
# Returns: 1 when any URL is broken or duplicated, 0 otherwise.
print_summary() {
  section_header "Summary"

  printf ' %b%-24s%b %d\n' "$BOLD" "Total URLs:" "$RESET" "$COUNT_TOTAL"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "OK:" "$RESET" "$GREEN" "$COUNT_OK" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Broken:" "$RESET" "$RED" "$COUNT_BROKEN" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Missing meta tags:" "$RESET" "$YELLOW" "$COUNT_MISSING_META" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Duplicate URLs:" "$RESET" "$YELLOW" "$COUNT_DUPLICATES" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Orphan content files:" "$RESET" "$YELLOW" "$COUNT_ORPHAN" "$RESET"
  echo ""

  if [[ "$COUNT_BROKEN" -gt 0 || "$COUNT_DUPLICATES" -gt 0 ]]; then
    echo -e " ${RED}${BOLD}FAIL${RESET} — issues found"
    return 1
  fi
  echo -e " ${GREEN}${BOLD}PASS${RESET} — sitemap looks good"
  return 0
}

# ══════════════════════════════════════════════════════════════════════
# USAGE
# ══════════════════════════════════════════════════════════════════════

# Print the help text to stdout.
usage() {
  cat <<EOF
${SCRIPT_NAME} — Validate Hugo sitemap URLs, meta tags, and content parity

USAGE:
  ${SCRIPT_NAME} [OPTIONS] <SITEMAP_URL>
  ${SCRIPT_NAME} --file sitemap.xml [OPTIONS]

OPTIONS:
  --file FILE        Read sitemap from local file instead of URL
                     Auto-detects public dir and checks local HTML files
  --public-dir DIR   Override public directory for local HTML checks
  --local-dir DIR    Hugo content directory for orphan detection
  --base-url URL     Override base URL for orphan content matching
  --parallel N       Max concurrent URL checks (default: ${PARALLEL})
  --no-meta          Skip meta tag checks
  --verbose          Enable debug output
  --no-color         Disable colored output
  --help             Show this help

ENVIRONMENT VARIABLES:
  SITEMAP_URL        Default sitemap URL if not passed as argument
  CHECK_META         Enable/disable meta checks (default: true)
  PARALLEL           Max concurrent checks (default: 5)
  COLOR              Color mode: auto, always, never (default: auto)

EXAMPLES:
  # Validate remote sitemap
  ./${SCRIPT_NAME} https://example.com/sitemap.xml

  # Validate local build with content cross-reference
  ./${SCRIPT_NAME} --file public/sitemap.xml --local-dir content

  # Fast check — skip meta tag validation
  ./${SCRIPT_NAME} --no-meta https://example.com/sitemap.xml

  # High concurrency
  ./${SCRIPT_NAME} --parallel 20 https://example.com/sitemap.xml
EOF
}

# ══════════════════════════════════════════════════════════════════════
# ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════

# Exit with a usage error unless the option in $1 is followed by a value.
# Arguments: the remaining command line, option first
require_value() {
  if [[ $# -lt 2 ]]; then
    err "Option $1 requires a value"
    echo "Run ${SCRIPT_NAME} --help for usage" >&2
    exit 1
  fi
}

# Parse the command line into the option globals.
# A bare (non-dash) argument is taken as the sitemap URL.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --file)
        require_value "$@"
        SITEMAP_FILE="$2"
        shift 2
        ;;
      --public-dir)
        require_value "$@"
        PUBLIC_DIR="$2"
        shift 2
        ;;
      --local-dir)
        require_value "$@"
        LOCAL_DIR="$2"
        shift 2
        ;;
      --base-url)
        require_value "$@"
        BASE_URL="$2"
        shift 2
        ;;
      --parallel)
        require_value "$@"
        PARALLEL="$2"
        shift 2
        ;;
      --no-meta)
        CHECK_META="false"
        shift
        ;;
      --verbose)
        VERBOSE="true"
        shift
        ;;
      --no-color)
        COLOR="never"
        shift
        ;;
      --help | -h)
        setup_colors
        usage
        exit 0
        ;;
      -*)
        err "Unknown option: $1"
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
        ;;
      *)
        SITEMAP_URL="$1"
        shift
        ;;
    esac
  done
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════

# Entry point: parse options, load the sitemap, run all checks, print the
# summary. The exit status is that of print_summary.
main() {
  # err() expands the colour variables, so they must exist before
  # parse_args can report a bad option; run again afterwards to honour
  # --no-color.
  setup_colors
  parse_args "$@"
  setup_colors

  if [[ -z "$SITEMAP_URL" && -z "$SITEMAP_FILE" ]]; then
    err "No sitemap URL or file specified"
    echo "Run ${SCRIPT_NAME} --help for usage" >&2
    exit 1
  fi

  # PARALLEL feeds an arithmetic comparison; reject non-numbers up front.
  if [[ ! "$PARALLEL" =~ ^(0|[1-9][0-9]*)$ ]]; then
    err "Invalid --parallel value: ${PARALLEL} (expected a non-negative integer)"
    exit 1
  fi

  # Fail with a clear message before the directory auto-detection below
  # tries to cd next to a file that does not exist.
  if [[ -n "$SITEMAP_FILE" && ! -f "$SITEMAP_FILE" ]]; then
    err "Sitemap file not found: $SITEMAP_FILE"
    exit 1
  fi

  TMPDIR_WORK=$(mktemp -d)
  local sitemap_file="${TMPDIR_WORK}/sitemap.xml"
  local url_file="${TMPDIR_WORK}/urls.txt"
  local results_dir="${TMPDIR_WORK}/results"

  # Auto-detect public directory from --file path
  if [[ -n "$SITEMAP_FILE" && -z "$PUBLIC_DIR" ]]; then
    PUBLIC_DIR=$(cd "$(dirname -- "$SITEMAP_FILE")" > /dev/null && pwd)
    if [[ ! -d "${PUBLIC_DIR}/guides" && ! -d "${PUBLIC_DIR}/posts" ]]; then
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR} but no content dirs found — falling back to remote"
      PUBLIC_DIR=""
    else
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR}"
    fi
  fi

  local mode="remote"
  if [[ -n "$PUBLIC_DIR" ]]; then
    mode="local (${PUBLIC_DIR})"
  fi

  echo ""
  echo -e "${BOLD}Sitemap Validator${RESET}"
  echo -e "${DIM}Source: ${SITEMAP_URL:-$SITEMAP_FILE}${RESET}"
  echo -e "${DIM}Mode: ${mode} | Parallel: ${PARALLEL} | Meta check: ${CHECK_META}${RESET}"

  fetch_sitemap "$sitemap_file"
  extract_urls "$sitemap_file" > "$url_file"

  local url_count
  url_count=$(wc -l < "$url_file")
  if [[ "$url_count" -eq 0 ]]; then
    err "No URLs found in sitemap"
    exit 1
  fi
  log "Found ${url_count} URLs in sitemap"

  # Check for localhost
  if grep -qiE 'localhost|127\.0\.0\.1' "$url_file"; then
    warn "Sitemap contains localhost URLs — check your baseURL"
  fi

  check_duplicates "$url_file"
  check_urls_parallel "$url_file" "$results_dir"
  report_url_results "$results_dir"
  check_orphan_content "$url_file"
  print_summary
}

main "$@"