Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist, same-site referer bypass, Blackbox-Exporter allowed bot, and all new exporters, cheat sheets, and automation scripts.
This commit is contained in:
Executable
+492
@@ -0,0 +1,492 @@
|
||||
#!/usr/bin/env bash

#########################################################################################
#### sitemap-validator.sh — Validate Hugo sitemap URLs, meta tags, and content parity ####
#### Checks each URL for HTTP 200, duplicates, missing meta tags, and compares ####
#### sitemap entries against local Hugo content files ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version 1.1 ####
#### ####
#### Usage: ####
#### ./sitemap-validator.sh https://example.com/sitemap.xml ####
#### ./sitemap-validator.sh --file public/sitemap.xml --local-dir content ####
#### ./sitemap-validator.sh --help ####
#### ####
#### See --help for all options. ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# SITEMAP_URL, CHECK_META, PARALLEL, VERBOSE and COLOR take their initial
# value from the environment when it is set; the remaining settings start
# empty and are filled in later by parse_args / main.
SITEMAP_URL="${SITEMAP_URL:-}"
SITEMAP_FILE=""
LOCAL_DIR=""
PUBLIC_DIR=""
CHECK_META="${CHECK_META:-true}"
PARALLEL="${PARALLEL:-5}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
BASE_URL=""

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename -- "$0")"
readonly SCRIPT_NAME
TMPDIR_WORK=""        # scratch directory; created in main, removed by cleanup
COUNT_TOTAL=0         # result files seen by report_url_results
COUNT_OK=0            # URLs that resolved (including those with meta warnings)
COUNT_BROKEN=0        # URLs that did not resolve
COUNT_MISSING_META=0  # resolved URLs lacking at least one expected tag
COUNT_DUPLICATES=0    # distinct URLs listed more than once
COUNT_ORPHAN=0        # content files with no matching sitemap entry

# ── Colors ────────────────────────────────────────────────────────────
#######################################
# Populate the color globals according to COLOR.
# Globals:   COLOR (read); RED GREEN YELLOW CYAN BOLD DIM RESET (written)
# Outputs:   none
# Notes:     The values are stored as literal backslash sequences
#            (e.g. \033[0;31m); they are turned into real escapes at print
#            time by echo -e / printf %b.
#######################################
setup_colors() {
  local use_color=false

  # "never" always wins; "always" forces color; anything else (auto) enables
  # color only when stdout is a terminal.
  if [[ "$COLOR" != "never" ]]; then
    if [[ "$COLOR" == "always" || -t 1 ]]; then
      use_color=true
    fi
  fi

  if [[ "$use_color" == "true" ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Each helper prints "[LEVEL] message". Color codes go through %b so their
# backslash sequences are expanded; the message itself goes through %s so
# backslashes inside URLs or paths are printed verbatim.
# log and verbose write to stdout; warn and err write to stderr.
log() { printf '%b[INFO]%b %s\n' "$CYAN" "$RESET" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$RESET" "$*" >&2; }
err() { printf '%b[ERROR]%b %s\n' "$RED" "$RESET" "$*" >&2; }
# Prints only when VERBOSE is "true". Written as an if-statement so the
# function returns 0 when nothing is printed (safe under set -e).
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf '%b[DEBUG]%b %s\n' "$DIM" "$RESET" "$*"
  fi
}

# ── Helpers ───────────────────────────────────────────────────────────
#######################################
# Print a section title surrounded by blank lines.
# Globals:   BOLD CYAN RESET (read)
# Arguments: $1 - title text (printed verbatim)
# Outputs:   three lines on stdout: blank, title, blank
#######################################
section_header() {
  printf '\n %b%b── %s ──%b\n\n' "$BOLD" "$CYAN" "$1" "$RESET"
}

#######################################
# Remove the scratch directory, if one was created.
# Globals:   TMPDIR_WORK (read)
# Notes:     Does nothing while TMPDIR_WORK is empty or not a directory, so
#            it is safe to run before main has called mktemp.
#######################################
cleanup() {
  if [[ -n "${TMPDIR_WORK:-}" && -d "$TMPDIR_WORK" ]]; then
    rm -rf -- "$TMPDIR_WORK"
  fi
}
# Run cleanup on every exit path of the script.
trap cleanup EXIT

#######################################
# Place the sitemap at the given destination path.
# Globals:   SITEMAP_FILE, SITEMAP_URL (read)
# Arguments: $1 - destination file path
# Outputs:   error messages on stderr
# Exits:     1 when the local file is missing, the download fails, or
#            neither a file nor a URL is configured
# Notes:     SITEMAP_FILE takes precedence over SITEMAP_URL when both are set.
#######################################
fetch_sitemap() {
  local dest="$1"

  if [[ -n "$SITEMAP_FILE" ]]; then
    if [[ ! -f "$SITEMAP_FILE" ]]; then
      err "Sitemap file not found: $SITEMAP_FILE"
      exit 1
    fi
    cp -- "$SITEMAP_FILE" "$dest"
    return
  fi

  if [[ -z "$SITEMAP_URL" ]]; then
    err "No sitemap URL or file specified"
    exit 1
  fi

  verbose "Fetching sitemap from $SITEMAP_URL"
  # -sS: no progress meter, but curl's own diagnostic stays visible so the
  # user can see why the fetch failed. "--" keeps a URL that starts with "-"
  # from being parsed as an option.
  if ! curl -fsSL --max-time 30 -o "$dest" -- "$SITEMAP_URL"; then
    err "Failed to fetch sitemap: $SITEMAP_URL"
    exit 1
  fi
}

#######################################
# Print every <loc> value found in a sitemap file, one per line.
# Arguments: $1 - path to the sitemap XML
# Outputs:   URLs on stdout; nothing when the file has no <loc> elements
# Returns:   0 even when no URL is found (the caller checks the count)
# Notes:     Whitespace padding inside <loc> is trimmed and the five
#            predefined XML entities are decoded, so "&amp;" in a query
#            string becomes "&". Matching is line based: a <loc> whose value
#            sits on a separate line from the opening tag is not picked up.
#######################################
extract_urls() {
  local sitemap_file="$1"

  # grep exits 1 when nothing matches; that is not an error here, and it must
  # not trip set -e / pipefail. "&amp;" is decoded last so an escaped
  # ampersand such as "&amp;lt;" is not decoded twice.
  { grep -oP '<loc>\s*\K[^<]+' -- "$sitemap_file" || true; } \
    | sed -e 's/[[:space:]]*$//' \
      -e 's/&lt;/</g' \
      -e 's/&gt;/>/g' \
      -e 's/&quot;/"/g' \
      -e "s/&apos;/'/g" \
      -e 's/&amp;/\&/g'
}

#######################################
# Report URLs that appear more than once in the URL list.
# Globals:   COUNT_DUPLICATES (incremented once per duplicated URL),
#            RED RESET (read)
# Arguments: $1 - file with one URL per line
# Outputs:   a "Duplicate URLs" section on stdout when duplicates exist
# Notes:     Occurrences are counted on whole lines (uniq -c), so a URL that
#            is merely a prefix of another one is not counted as a repeat.
#######################################
check_duplicates() {
  local url_file="$1"
  local dupes count url

  # uniq -cd prints "<count> <line>" for every line that occurs 2+ times.
  dupes=$(sort -- "$url_file" | uniq -cd)

  if [[ -z "$dupes" ]]; then
    verbose "No duplicate URLs found"
    return 0
  fi

  section_header "Duplicate URLs"
  while read -r count url; do
    printf ' %b✗%b %s (%s occurrences)\n' "$RED" "$RESET" "$url" "$count"
    COUNT_DUPLICATES=$((COUNT_DUPLICATES + 1))
  done <<< "$dupes"
}

#######################################
# Map a sitemap URL to the HTML file expected under PUBLIC_DIR.
# Globals:   PUBLIC_DIR (read)
# Arguments: $1 - URL
# Outputs:   file path on stdout
# Notes:     - scheme and host are dropped; query string and fragment are
#              ignored because they do not exist on disk
#            - a path ending in .html/.htm maps to that file itself
#            - any other path is treated as a directory holding index.html
#######################################
url_to_local_path() {
  local url="$1"
  local path="$url"
  local origin_re='^https?://[^/]+(.*)$'

  if [[ "$url" =~ $origin_re ]]; then
    path="${BASH_REMATCH[1]}"
  fi
  path="${path%%[?#]*}"

  case "$path" in
    *.html | *.htm)
      printf '%s\n' "${PUBLIC_DIR}${path}"
      ;;
    *)
      printf '%s\n' "${PUBLIC_DIR}${path%/}/index.html"
      ;;
  esac
}

#######################################
# List the expected head tags that an HTML file does not contain.
# Arguments: $1 - path to the HTML file
# Outputs:   comma-separated list on stdout drawn from missing-title,
#            missing-description, missing-canonical, missing-og-title;
#            an empty line when nothing is missing
# Notes:     Checks are case-insensitive text matches, not an HTML parse.
#            Attribute values may be double-quoted, single-quoted or bare,
#            and <title> may carry attributes.
#######################################
check_meta_tags() {
  local html_file="$1"
  local warnings=""

  grep -qiE '<title([[:space:]>]|$)' -- "$html_file" \
    || warnings="${warnings}missing-title,"
  grep -qiE "name=[\"']?description" -- "$html_file" \
    || warnings="${warnings}missing-description,"
  grep -qiE "rel=[\"']?canonical" -- "$html_file" \
    || warnings="${warnings}missing-canonical,"
  grep -qiF 'og:title' -- "$html_file" \
    || warnings="${warnings}missing-og-title,"

  # Drop the trailing comma left by the last append.
  printf '%s\n' "${warnings%,}"
}

#######################################
# Check one sitemap URL against the local build output.
# Globals:   CHECK_META (read)
# Arguments: $1 - URL
#            $2 - results directory (must already exist)
# Outputs:   a two-line result file named after the MD5 of the URL:
#              line 1: "ok", "meta:<missing tags>" or "broken:missing"
#              line 2: the URL
# Notes:     The status is decided first and the file written in a single
#            step, so the result file never holds a provisional "ok".
#######################################
check_url_local() {
  local url="$1"
  local results_dir="$2"
  local safe_name result_file html_file warnings
  local status="ok"

  # Hash the URL to get a filesystem-safe, fixed-length file name.
  safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
  result_file="${results_dir}/${safe_name}"
  html_file=$(url_to_local_path "$url")

  if [[ ! -f "$html_file" ]]; then
    status="broken:missing"
  elif [[ "$CHECK_META" == "true" ]]; then
    warnings=$(check_meta_tags "$html_file")
    if [[ -n "$warnings" ]]; then
      status="meta:${warnings}"
    fi
  fi

  printf '%s\n%s\n' "$status" "$url" > "$result_file"
}

#######################################
# Check one sitemap URL over HTTP.
# Globals:   CHECK_META (read)
# Arguments: $1 - URL
#            $2 - results directory (must already exist)
# Outputs:   a two-line result file named after the MD5 of the URL:
#              line 1: "ok", "meta:<missing tags>" or "broken:<http code>"
#              line 2: the URL
# Notes:     Redirects are followed (-L); the recorded code is the one curl
#            reports for the final response. A curl failure (timeout, DNS,
#            refused connection) is recorded as code 000. The downloaded body
#            is kept only long enough to scan it for meta tags.
#######################################
check_url_remote() {
  local url="$1"
  local results_dir="$2"
  local safe_name result_file body_file http_code warnings
  local status

  # Hash the URL to get a filesystem-safe, fixed-length file name.
  safe_name=$(printf '%s\n' "$url" | md5sum | cut -d' ' -f1)
  result_file="${results_dir}/${safe_name}"
  body_file="${result_file}.body"

  # The URL comes from the sitemap, i.e. from outside the script: "--" stops
  # a value starting with "-" from being read as a curl option.
  http_code=$(curl -sL --max-time 10 -o "$body_file" -w '%{http_code}' -- "$url" 2>/dev/null) \
    || http_code="000"

  if [[ "$http_code" != "200" ]]; then
    status="broken:${http_code}"
  else
    status="ok"
    # Skip the tag scan for empty bodies.
    if [[ "$CHECK_META" == "true" && -s "$body_file" ]]; then
      warnings=$(check_meta_tags "$body_file")
      if [[ -n "$warnings" ]]; then
        status="meta:${warnings}"
      fi
    fi
  fi

  printf '%s\n%s\n' "$status" "$url" > "$result_file"
  rm -f -- "$body_file"
}

#######################################
# Run the per-URL check for every line of the URL list, keeping at most
# PARALLEL checks in flight.
# Globals:   PUBLIC_DIR (read; non-empty selects check_url_local, otherwise
#            check_url_remote), PARALLEL (read)
# Arguments: $1 - file with one URL per line (empty lines are skipped)
#            $2 - results directory (created if missing)
# Outputs:   one result file per URL, written by the check function
# Notes:     - a non-numeric PARALLEL produces a warning and falls back to 1;
#              0 is treated as 1
#            - when the limit is reached the oldest job is awaited, so a slow
#              URL at the head of the queue can briefly hold back new starts
#######################################
check_urls_parallel() {
  local url_file="$1"
  local results_dir="$2"
  local check_fn="check_url_remote"
  local max_jobs="$PARALLEL"

  if [[ -n "$PUBLIC_DIR" ]]; then
    check_fn="check_url_local"
  fi

  if [[ "$max_jobs" =~ ^[0-9]+$ ]]; then
    max_jobs=$((10#$max_jobs)) # force base 10 so "08" is not read as octal
  else
    warn "Invalid PARALLEL value '${PARALLEL}' — running checks one at a time"
    max_jobs=1
  fi
  if ((max_jobs < 1)); then
    max_jobs=1
  fi

  mkdir -p -- "$results_dir"

  local -a pids=()
  local running=0
  local url

  while IFS= read -r url; do
    if [[ -z "$url" ]]; then
      continue
    fi

    "$check_fn" "$url" "$results_dir" &
    pids+=("$!")
    running=$((running + 1))

    if ((running >= max_jobs)); then
      # A failing check must not abort the run; its outcome is in its
      # result file.
      wait "${pids[0]}" 2> /dev/null || true
      pids=("${pids[@]:1}")
      running=$((running - 1))
    fi
  done < "$url_file"

  # Barrier for whatever is still running. A bare wait avoids expanding a
  # possibly empty array, which is an error under set -u on older Bash.
  wait
}

#######################################
# Print the outcome of every URL check and update the counters.
# Globals:   COUNT_TOTAL COUNT_OK COUNT_BROKEN COUNT_MISSING_META (written),
#            VERBOSE RED GREEN YELLOW RESET (read)
# Arguments: $1 - results directory; each file holds the status on its first
#                 line and the URL on its last line
# Outputs:   one line per broken URL or meta warning on stdout; passing URLs
#            are listed only when VERBOSE is "true"
# Notes:     A URL with meta warnings counts as OK and as missing-meta.
#            "broken:missing" (written by the local check) is reported as a
#            missing file rather than as an HTTP status.
#######################################
report_url_results() {
  local results_dir="$1"
  local result_file status url detail

  section_header "URL Check Results"

  for result_file in "$results_dir"/*; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -f "$result_file" ]] || continue

    status=$(head -n 1 -- "$result_file")
    url=$(tail -n 1 -- "$result_file")
    COUNT_TOTAL=$((COUNT_TOTAL + 1))

    case "$status" in
      ok)
        COUNT_OK=$((COUNT_OK + 1))
        if [[ "$VERBOSE" == "true" ]]; then
          printf ' %b✓%b %s\n' "$GREEN" "$RESET" "$url"
        fi
        ;;
      broken:*)
        detail="${status#broken:}"
        COUNT_BROKEN=$((COUNT_BROKEN + 1))
        if [[ "$detail" == "missing" ]]; then
          printf ' %b✗%b %s — local file missing\n' "$RED" "$RESET" "$url"
        else
          printf ' %b✗%b %s — HTTP %s\n' "$RED" "$RESET" "$url" "$detail"
        fi
        ;;
      meta:*)
        detail="${status#meta:}"
        COUNT_OK=$((COUNT_OK + 1))
        COUNT_MISSING_META=$((COUNT_MISSING_META + 1))
        printf ' %b⚠%b %s — %s\n' "$YELLOW" "$RESET" "$url" "$detail"
        ;;
    esac
  done
}

#######################################
# Report page bundles under LOCAL_DIR that have no entry in the sitemap.
# Globals:   LOCAL_DIR (read), BASE_URL (read; derived from the first URL in
#            the list and stored when empty), COUNT_ORPHAN (written),
#            YELLOW GREEN RESET (read)
# Arguments: $1 - file with one sitemap URL per line
# Outputs:   an "Orphan Content" section on stdout
# Returns:   0; the check is skipped (with a message) when LOCAL_DIR is unset
#            or missing, or when no base URL can be determined
# Notes:     - only files named exactly index.md are examined; find's -name
#              test never matches _index.md, so section pages are not checked
#            - a bundle is considered listed when some sitemap URL ends with
#              "/<path relative to LOCAL_DIR>/" (a missing trailing slash on
#              the URL is tolerated); matching is on whole path segments, so
#              "posts/a" is not satisfied by ".../posts/ab/"
#            - drafts are skipped; "draft: true" (YAML, --- fences) and
#              "draft = true" (TOML, +++ fences) are recognized, and only
#              inside the front-matter block at the top of the file
#            - the comparison is case-sensitive and does not apply any URL
#              rewriting (slugs, permalinks) the site may configure
#######################################
check_orphan_content() {
  local url_file="$1"

  if [[ -z "$LOCAL_DIR" ]]; then
    verbose "No --local-dir specified, skipping orphan check"
    return 0
  fi

  if [[ ! -d "$LOCAL_DIR" ]]; then
    warn "Content directory not found: $LOCAL_DIR"
    return 0
  fi

  if [[ -z "$BASE_URL" ]]; then
    local first_url=""
    local origin_re='^https?://[^/]+'
    IFS= read -r first_url < "$url_file" || true
    if [[ "$first_url" =~ $origin_re ]]; then
      BASE_URL="${BASH_REMATCH[0]}"
    else
      warn "Could not determine base URL — skipping orphan check"
      return 0
    fi
  fi

  section_header "Orphan Content (not in sitemap)"

  local base="${BASE_URL%/}"
  local content_root="${LOCAL_DIR%/}"
  local found_orphan=false
  local content_file rel_path expected_url needle

  while IFS= read -r -d '' content_file; do
    # Directory of the bundle, relative to the content root.
    rel_path="${content_file%/*}"
    rel_path="${rel_path#"$content_root"}"
    rel_path="${rel_path#/}"

    if [[ -n "$rel_path" ]]; then
      expected_url="${base}/${rel_path}/"
      needle="/${rel_path}/"
    else
      # index.md directly in the content root is the home page.
      expected_url="${base}/"
      needle="${base}/"
    fi

    # Exit status 0 means the front matter marks the page as a draft. Reading
    # stops at the closing fence, so "draft: true" in the body is ignored.
    if awk '
      NR == 1 {
        sub(/\r$/, "")
        if ($0 != "---" && $0 != "+++") exit
        fence = $0
        next
      }
      { sub(/\r$/, "") }
      $0 == fence { exit }
      tolower($0) ~ /^[ \t]*draft[ \t]*[:=][ \t]*true([ \t]|$)/ { draft = 1; exit }
      END { exit(draft ? 0 : 1) }
    ' "$content_file" 2> /dev/null; then
      verbose "Skipping draft: $content_file"
      continue
    fi

    # Suffix match on whole path segments. The needle is handed over through
    # the environment so awk does not interpret backslashes in it.
    if ! NEEDLE="$needle" awk '
      BEGIN { needle = ENVIRON["NEEDLE"]; nlen = length(needle) }
      {
        line = $0
        sub(/\r$/, "", line)
        if (substr(line, length(line)) != "/") line = line "/"
        llen = length(line)
        if (llen >= nlen && substr(line, llen - nlen + 1) == needle) { hit = 1; exit }
      }
      END { exit(hit ? 0 : 1) }
    ' "$url_file" 2> /dev/null; then
      printf ' %b⚠%b %s → expected %s\n' "$YELLOW" "$RESET" "$content_file" "$expected_url"
      COUNT_ORPHAN=$((COUNT_ORPHAN + 1))
      found_orphan=true
    fi
  done < <(find "$LOCAL_DIR" -type f -name "index.md" -print0 2> /dev/null | LC_ALL=C sort -z)

  if [[ "$found_orphan" == "false" ]]; then
    printf ' %b✓%b All content files found in sitemap\n' "$GREEN" "$RESET"
  fi
  return 0
}

#######################################
# Print the final counters and the overall verdict.
# Globals:   COUNT_TOTAL COUNT_OK COUNT_BROKEN COUNT_MISSING_META
#            COUNT_DUPLICATES COUNT_ORPHAN (read),
#            BOLD RED GREEN YELLOW RESET (read)
# Outputs:   summary table and a PASS/FAIL line on stdout
# Returns:   1 when any URL is broken or duplicated, otherwise 0.
#            Missing meta tags and orphan files are reported but do not fail
#            the run.
# Notes:     Color codes are passed as %b arguments rather than embedded in
#            the printf format string.
#######################################
print_summary() {
  section_header "Summary"

  printf ' %b%-24s%b %d\n' "$BOLD" "Total URLs:" "$RESET" "$COUNT_TOTAL"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "OK:" "$RESET" "$GREEN" "$COUNT_OK" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Broken:" "$RESET" "$RED" "$COUNT_BROKEN" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Missing meta tags:" "$RESET" "$YELLOW" "$COUNT_MISSING_META" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Duplicate URLs:" "$RESET" "$YELLOW" "$COUNT_DUPLICATES" "$RESET"
  printf ' %b%-24s%b %b%d%b\n' "$BOLD" "Orphan content files:" "$RESET" "$YELLOW" "$COUNT_ORPHAN" "$RESET"
  printf '\n'

  if ((COUNT_BROKEN > 0 || COUNT_DUPLICATES > 0)); then
    printf ' %b%bFAIL%b — issues found\n' "$RED" "$BOLD" "$RESET"
    return 1
  fi

  printf ' %b%bPASS%b — sitemap looks good\n' "$GREEN" "$BOLD" "$RESET"
  return 0
}

# ══════════════════════════════════════════════════════════════════════
# USAGE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Print the help text.
# Globals:   SCRIPT_NAME, PARALLEL (read; expanded inside the here-document)
# Outputs:   help text on stdout
#######################################
usage() {
  cat << EOF
${SCRIPT_NAME} — Validate Hugo sitemap URLs, meta tags, and content parity

USAGE:
  ${SCRIPT_NAME} [OPTIONS] <SITEMAP_URL>
  ${SCRIPT_NAME} --file sitemap.xml [OPTIONS]

OPTIONS:
  --file FILE        Read sitemap from local file instead of URL
                     Auto-detects public dir and checks local HTML files
  --public-dir DIR   Override public directory for local HTML checks
  --local-dir DIR    Hugo content directory for orphan detection
  --base-url URL     Override base URL for orphan content matching
  --parallel N       Max concurrent URL checks (default: ${PARALLEL})
  --no-meta          Skip meta tag checks
  --verbose          Enable debug output
  --no-color         Disable colored output
  --help             Show this help

ENVIRONMENT VARIABLES:
  SITEMAP_URL        Default sitemap URL if not passed as argument
  CHECK_META         Enable/disable meta checks (default: true)
  PARALLEL           Max concurrent checks (default: 5)
  VERBOSE            Enable debug output when set to true (default: false)
  COLOR              Color mode: auto, always, never (default: auto)

EXAMPLES:
  # Validate remote sitemap
  ./${SCRIPT_NAME} https://example.com/sitemap.xml

  # Validate local build with content cross-reference
  ./${SCRIPT_NAME} --file public/sitemap.xml --local-dir content

  # Fast check — skip meta tag validation
  ./${SCRIPT_NAME} --no-meta https://example.com/sitemap.xml

  # High concurrency
  ./${SCRIPT_NAME} --parallel 20 https://example.com/sitemap.xml
EOF
}

# ══════════════════════════════════════════════════════════════════════
# ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════

#######################################
# Parse command-line arguments into the configuration globals.
# Globals:   SITEMAP_FILE PUBLIC_DIR LOCAL_DIR BASE_URL PARALLEL CHECK_META
#            VERBOSE COLOR SITEMAP_URL (written), SCRIPT_NAME (read)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for --help; error messages on stderr
# Exits:     0 after --help; 1 on an unknown option, on an option that is
#            missing its value, or on a non-numeric --parallel value
# Notes:     A bare (non-option) argument is taken as the sitemap URL; when
#            several are given, the last one wins.
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --file | --public-dir | --local-dir | --base-url | --parallel)
        # Without this check a value option given last would hit an
        # "unbound variable" error on $2 under set -u.
        if (($# < 2)); then
          err "Option $1 requires an argument"
          echo "Run ${SCRIPT_NAME} --help for usage" >&2
          exit 1
        fi
        case "$1" in
          --file) SITEMAP_FILE="$2" ;;
          --public-dir) PUBLIC_DIR="$2" ;;
          --local-dir) LOCAL_DIR="$2" ;;
          --base-url) BASE_URL="$2" ;;
          --parallel)
            if ! [[ "$2" =~ ^[0-9]+$ ]]; then
              err "--parallel expects a non-negative integer, got: $2"
              echo "Run ${SCRIPT_NAME} --help for usage" >&2
              exit 1
            fi
            PARALLEL="$2"
            ;;
        esac
        shift 2
        ;;
      --no-meta)
        CHECK_META="false"
        shift
        ;;
      --verbose)
        VERBOSE="true"
        shift
        ;;
      --no-color)
        COLOR="never"
        shift
        ;;
      --help | -h)
        setup_colors
        usage
        exit 0
        ;;
      -*)
        err "Unknown option: $1"
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
        ;;
      *)
        SITEMAP_URL="$1"
        shift
        ;;
    esac
  done
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════

#######################################
# Script entry point: parse options, load the sitemap, run all checks and
# print the summary.
# Globals:   SITEMAP_URL SITEMAP_FILE PUBLIC_DIR PARALLEL CHECK_META (read),
#            PUBLIC_DIR TMPDIR_WORK (written), color globals (read)
# Arguments: the script's command-line arguments
# Outputs:   banner, per-check sections and summary on stdout; errors and
#            warnings on stderr
# Returns:   the status of print_summary (1 when broken or duplicate URLs
#            were found)
# Exits:     1 when no sitemap source is given, the sitemap file is missing,
#            or the sitemap contains no URLs
#######################################
main() {
  parse_args "$@"
  setup_colors

  if [[ -z "$SITEMAP_URL" && -z "$SITEMAP_FILE" ]]; then
    err "No sitemap URL or file specified"
    echo "Run ${SCRIPT_NAME} --help for usage" >&2
    exit 1
  fi

  # Fail early with a readable message; otherwise the cd used for directory
  # auto-detection below would abort the script on a missing path.
  if [[ -n "$SITEMAP_FILE" && ! -f "$SITEMAP_FILE" ]]; then
    err "Sitemap file not found: $SITEMAP_FILE"
    exit 1
  fi

  TMPDIR_WORK=$(mktemp -d)
  local sitemap_file="${TMPDIR_WORK}/sitemap.xml"
  local url_file="${TMPDIR_WORK}/urls.txt"
  local results_dir="${TMPDIR_WORK}/results"

  # Auto-detect public directory from --file path. The sitemap's directory is
  # accepted only when it contains a guides/ or posts/ subdirectory;
  # otherwise URLs are checked over HTTP.
  if [[ -n "$SITEMAP_FILE" && -z "$PUBLIC_DIR" ]]; then
    PUBLIC_DIR=$(cd -- "$(dirname -- "$SITEMAP_FILE")" && pwd)
    if [[ -d "${PUBLIC_DIR}/guides" || -d "${PUBLIC_DIR}/posts" ]]; then
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR}"
    else
      verbose "Auto-detected PUBLIC_DIR=${PUBLIC_DIR} but no content dirs found — falling back to remote"
      PUBLIC_DIR=""
    fi
  fi

  local mode="remote"
  if [[ -n "$PUBLIC_DIR" ]]; then
    mode="local (${PUBLIC_DIR})"
  fi

  # fetch_sitemap prefers the file over the URL, so the banner does the same.
  printf '\n'
  printf '%bSitemap Validator%b\n' "$BOLD" "$RESET"
  printf '%bSource: %s%b\n' "$DIM" "${SITEMAP_FILE:-$SITEMAP_URL}" "$RESET"
  printf '%bMode: %s | Parallel: %s | Meta check: %s%b\n' \
    "$DIM" "$mode" "$PARALLEL" "$CHECK_META" "$RESET"

  fetch_sitemap "$sitemap_file"
  extract_urls "$sitemap_file" > "$url_file"

  local url_count
  url_count=$(wc -l < "$url_file")
  url_count="${url_count//[[:space:]]/}" # some wc implementations pad the number
  if [[ "$url_count" -eq 0 ]]; then
    err "No URLs found in sitemap"
    exit 1
  fi
  log "Found ${url_count} URLs in sitemap"

  # Check for localhost
  if grep -qiE 'localhost|127\.0\.0\.1' -- "$url_file"; then
    warn "Sitemap contains localhost URLs — check your baseURL"
  fi

  check_duplicates "$url_file"
  check_urls_parallel "$url_file" "$results_dir"
  report_url_results "$results_dir"
  check_orphan_content "$url_file"
  # Must stay last: its return status becomes the script's exit status.
  print_summary
}

# Entry point: forward all command-line arguments to main.
main "$@"
|
||||
Reference in New Issue
Block a user