#!/bin/bash
################################################################################
# Script Name: textfile-health-exporter.sh
# Version: 1.0
# Description: Prometheus exporter that monitors node_exporter textfile
#              collector .prom files — detects stale files, parse errors,
#              file sizes, and missing cron jobs. A meta-exporter that
#              watches the health of your other exporters.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - awk, stat, find, wc, date (GNU coreutils / findutils)
#   - netcat (nc or ncat) for HTTP mode
#
# Usage:
#   ./textfile-health-exporter.sh              # stdout
#   ./textfile-health-exporter.sh --textfile   # node_exporter textfile
#   ./textfile-health-exporter.sh --http       # HTTP server on port 9202
#
# Metrics Exported:
#   Per-File:
#     - textfile_health_file_age_seconds{file}  - Seconds since last modified
#     - textfile_health_file_size_bytes{file}   - File size in bytes
#     - textfile_health_file_lines{file}        - Line count
#     - textfile_health_parse_ok{file}          - 1 if valid, 0 if errors
#
#   Summary:
#     - textfile_health_files_total             - Total .prom files found
#     - textfile_health_stale_files_total       - Files older than threshold
#     - textfile_health_parse_errors_total      - Files with parse errors
#     - textfile_health_total_size_bytes        - Total size of all files
#
#   Exporter:
#     - textfile_health_up                      - Exporter status (1=up)
#     - textfile_health_stale_threshold_seconds - Configured stale threshold
#     - textfile_health_duration_seconds        - Script execution time
#     - textfile_health_last_run_timestamp      - Last run timestamp
#
# Configuration:
#   Default HTTP port:  9202
#   Textfile directory: /var/lib/node_exporter
#   Stale threshold:    600 seconds (10 minutes)
#
################################################################################

set -uo pipefail

# ============================================================================
# CONFIGURATION
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9202
STALE_THRESHOLD=600
OWN_OUTPUT_FILE="textfile_health.prom"

# ============================================================================
# USAGE
# ============================================================================

# NOTE(review): the help text and the option parser were unreadable in the
# reviewed copy of this file. Everything in show_usage/parse_args is
# reconstructed from the header comment (--textfile, --http) and from the
# configuration variables above. Confirm the option names against the
# deployed version before relying on anything beyond --textfile and --http.

#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (optional, default 0)
# Outputs:   help text on stdout (callers may redirect it)
#######################################
show_usage() {
  local status="${1:-0}"
  cat <<EOF
Usage: ${0##*/} [OPTIONS]

Monitors the health of node_exporter textfile collector .prom files.

Options:
  -t, --textfile           Write metrics to <dir>/${OWN_OUTPUT_FILE}
  -o, --output FILE        Write metrics to FILE
      --http               Serve metrics over HTTP
  -p, --port PORT          HTTP port (default: ${HTTP_PORT})
  -d, --dir DIR            Textfile directory (default: ${TEXTFILE_DIR})
  -s, --stale SECONDS      Stale threshold (default: ${STALE_THRESHOLD})
  -h, --help               Show this help

With no options the metrics are printed to stdout.
EOF
  exit "$status"
}

#######################################
# Abort unless the option currently being parsed is followed by a value.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
#######################################
require_option_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "# ERROR: Option $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse command-line options into the configuration globals.
# Globals:   TEXTFILE_DIR, OUTPUT_FILE, HTTP_MODE, HTTP_PORT,
#            STALE_THRESHOLD (written); OWN_OUTPUT_FILE (read)
# Arguments: the script's command line
#######################################
parse_args() {
  local want_textfile=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -t | --textfile)
        want_textfile=true
        ;;
      -o | --output)
        require_option_value "$1" "$#"
        OUTPUT_FILE="$2"
        shift
        ;;
      --http)
        HTTP_MODE=true
        ;;
      -p | --port)
        require_option_value "$1" "$#"
        HTTP_PORT="$2"
        shift
        ;;
      -d | --dir)
        require_option_value "$1" "$#"
        TEXTFILE_DIR="$2"
        shift
        ;;
      -s | --stale | --stale-threshold)
        require_option_value "$1" "$#"
        STALE_THRESHOLD="$2"
        shift
        ;;
      -h | --help)
        show_usage
        ;;
      *)
        echo "Unknown option: $1" >&2
        show_usage 1 >&2
        ;;
    esac
    shift
  done

  # Resolve the textfile path only after all options are read, so that
  # "--textfile --dir X" and "--dir X --textfile" behave the same.
  if [[ "$want_textfile" == "true" && -z "$OUTPUT_FILE" ]]; then
    OUTPUT_FILE="${TEXTFILE_DIR}/${OWN_OUTPUT_FILE}"
  fi
}

# ============================================================================
# PREFLIGHT
# ============================================================================

#######################################
# Verify required commands, the textfile directory and numeric settings.
# Globals:   TEXTFILE_DIR, HTTP_PORT (read); STALE_THRESHOLD (normalised)
# Outputs:   error messages on stderr; exits 1 on any failed check
#######################################
preflight() {
  local missing=()
  local cmd
  for cmd in awk stat find wc date; do
    command -v "$cmd" &>/dev/null || missing+=("$cmd")
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    echo "# ERROR: Missing required commands: ${missing[*]}" >&2
    exit 1
  fi

  if [[ ! -d "$TEXTFILE_DIR" ]]; then
    echo "# ERROR: Textfile directory does not exist: ${TEXTFILE_DIR}" >&2
    exit 1
  fi

  # These values are used in shell arithmetic later on; reject anything that
  # is not a plain decimal integer before it gets there.
  if [[ ! "$STALE_THRESHOLD" =~ ^[0-9]+$ ]]; then
    echo "# ERROR: Stale threshold must be a non-negative integer: ${STALE_THRESHOLD}" >&2
    exit 1
  fi
  # Force base 10 so a value such as "0600" is not read as octal.
  STALE_THRESHOLD=$(( 10#$STALE_THRESHOLD ))

  if [[ ! "$HTTP_PORT" =~ ^[0-9]+$ ]] ||
    (( 10#$HTTP_PORT < 1 || 10#$HTTP_PORT > 65535 )); then
    echo "# ERROR: HTTP port must be an integer between 1 and 65535: ${HTTP_PORT}" >&2
    exit 1
  fi
}

# ============================================================================
# PARSE VALIDATION
# ============================================================================

#######################################
# Lightweight syntax check of a Prometheus text-format file.
# Blank lines and comment lines are accepted. Every other line must be
#   name[{labels}] <value-start>
# where the value starts with a (signed) digit, a leading-dot decimal,
# Inf or NaN. Quoted label values may contain "}" and backslash escapes.
# Only the start of the value is checked; this is not a full parser.
# Arguments: $1 - file to check
# Returns:   0 if every line is acceptable, non-zero otherwise (including
#            when the file cannot be read)
#######################################
validate_prom_file() {
  local file="$1"

  awk '
    /^[[:space:]]*$/ { next }
    /^#/ { next }
    /^[a-zA-Z_:][a-zA-Z0-9_:]*(\{([^"}]|"([^"\\]|\\.)*")*\})?[[:space:]]+([+-]?\.?[0-9]|[+-]?[Ii][Nn][Ff]|[Nn][Aa][Nn])/ { next }
    { errors++ }
    END { exit (errors > 0 ? 1 : 0) }
  ' "$file" 2>/dev/null
}

# ============================================================================
# METRICS COLLECTION
# ============================================================================

#######################################
# Escape a string for use as a Prometheus label value
# (backslash, double quote and newline).
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout, without a trailing newline
#######################################
escape_label_value() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

#######################################
# Print the HELP and TYPE lines for a gauge.
# Arguments: $1 - metric name, $2 - help text
#######################################
emit_gauge_header() {
  printf '# HELP %s %s\n# TYPE %s gauge\n' "$1" "$2" "$1"
}

#######################################
# Scan TEXTFILE_DIR and print all metrics in Prometheus text format.
# Globals:   TEXTFILE_DIR, OUTPUT_FILE, OWN_OUTPUT_FILE, STALE_THRESHOLD (read)
# Outputs:   metrics on stdout
#######################################
collect_metrics() {
  local start_ns now
  start_ns=$(date +%s%N)
  now=$(date +%s)

  emit_gauge_header textfile_health_up "Exporter status (1=up)"
  printf 'textfile_health_up 1\n\n'

  emit_gauge_header textfile_health_stale_threshold_seconds "Configured stale threshold"
  printf 'textfile_health_stale_threshold_seconds %s\n\n' "$STALE_THRESHOLD"

  # --- Resolve own output filename for exclusion ---
  local exclude_basename
  if [[ -n "$OUTPUT_FILE" ]]; then
    exclude_basename="${OUTPUT_FILE##*/}"
  else
    exclude_basename="$OWN_OUTPUT_FILE"
  fi

  # --- Find all .prom files, excluding own output ---
  local prom_files=()
  local f
  while IFS= read -r -d '' f; do
    [[ "${f##*/}" == "$exclude_basename" ]] && continue
    prom_files+=("$f")
  done < <(find "$TEXTFILE_DIR" -maxdepth 1 -name '*.prom' -type f -print0 2>/dev/null)

  local files_total=${#prom_files[@]}
  local stale_total=0
  local parse_errors_total=0
  local total_size=0

  # --- Inspect every file once ---
  # An empty entry means the value could not be read (e.g. the file vanished
  # mid-run); the matching sample is then omitted.
  local -a labels=() ages=() sizes=() line_counts=() parse_flags=()
  local i path stat_out mtime size age count

  for (( i = 0; i < files_total; i++ )); do
    path="${prom_files[i]}"
    labels[i]=$(escape_label_value "${path##*/}")

    ages[i]=""
    sizes[i]=""
    if stat_out=$(stat -c '%Y %s' -- "$path" 2>/dev/null); then
      read -r mtime size <<<"$stat_out"
      age=$(( now - mtime ))
      ages[i]="$age"
      sizes[i]="$size"
      total_size=$(( total_size + size ))
      if [[ $age -ge $STALE_THRESHOLD ]]; then
        stale_total=$(( stale_total + 1 ))
      fi
    fi

    line_counts[i]=""
    if { count=$(wc -l <"$path"); } 2>/dev/null; then
      line_counts[i]="${count//[[:space:]]/}"
    fi

    if validate_prom_file "$path"; then
      parse_flags[i]=1
    else
      parse_flags[i]=0
      parse_errors_total=$(( parse_errors_total + 1 ))
    fi
  done

  # --- Per-file metrics ---
  if [[ $files_total -gt 0 ]]; then
    emit_gauge_header textfile_health_file_age_seconds "Seconds since file was last modified"
    for (( i = 0; i < files_total; i++ )); do
      [[ -n "${ages[i]}" ]] || continue
      printf 'textfile_health_file_age_seconds{file="%s"} %s\n' "${labels[i]}" "${ages[i]}"
    done
    printf '\n'

    emit_gauge_header textfile_health_file_size_bytes "File size in bytes"
    for (( i = 0; i < files_total; i++ )); do
      [[ -n "${sizes[i]}" ]] || continue
      printf 'textfile_health_file_size_bytes{file="%s"} %s\n' "${labels[i]}" "${sizes[i]}"
    done
    printf '\n'

    emit_gauge_header textfile_health_file_lines "Line count per file"
    for (( i = 0; i < files_total; i++ )); do
      [[ -n "${line_counts[i]}" ]] || continue
      printf 'textfile_health_file_lines{file="%s"} %s\n' "${labels[i]}" "${line_counts[i]}"
    done
    printf '\n'

    emit_gauge_header textfile_health_parse_ok "Parse validation (1=valid, 0=errors)"
    for (( i = 0; i < files_total; i++ )); do
      printf 'textfile_health_parse_ok{file="%s"} %s\n' "${labels[i]}" "${parse_flags[i]}"
    done
    printf '\n'
  fi

  # --- Summary metrics ---
  emit_gauge_header textfile_health_files_total "Total .prom files found"
  printf 'textfile_health_files_total %s\n\n' "$files_total"

  emit_gauge_header textfile_health_stale_files_total "Files older than stale threshold"
  printf 'textfile_health_stale_files_total %s\n\n' "$stale_total"

  emit_gauge_header textfile_health_parse_errors_total "Files with parse errors"
  printf 'textfile_health_parse_errors_total %s\n\n' "$parse_errors_total"

  emit_gauge_header textfile_health_total_size_bytes "Total size of all .prom files"
  printf 'textfile_health_total_size_bytes %s\n\n' "$total_size"

  # --- Exporter metadata ---
  # Computed with shell arithmetic so no external calculator is needed.
  # Falls back to 0 when date does not support %N (non-numeric output).
  local end_ns elapsed_ns duration="0"
  end_ns=$(date +%s%N)
  if [[ "$start_ns" =~ ^[0-9]+$ && "$end_ns" =~ ^[0-9]+$ ]]; then
    elapsed_ns=$(( end_ns - start_ns ))
    if (( elapsed_ns < 0 )); then
      elapsed_ns=0
    fi
    printf -v duration '%d.%03d' \
      $(( elapsed_ns / 1000000000 )) \
      $(( (elapsed_ns % 1000000000) / 1000000 ))
  fi

  emit_gauge_header textfile_health_duration_seconds "Script execution time"
  printf 'textfile_health_duration_seconds %s\n\n' "$duration"

  emit_gauge_header textfile_health_last_run_timestamp "Last successful run (unix timestamp)"
  printf 'textfile_health_last_run_timestamp %s\n' "$now"
}

# ============================================================================
# OUTPUT HANDLING
# ============================================================================

#######################################
# Collect metrics once and write them to stdout or, when OUTPUT_FILE is set,
# to that file via a temp file in the same directory followed by mv.
# Globals:   OUTPUT_FILE (read)
# Outputs:   metrics on stdout, or a status line on stderr in file mode
# Returns:   exits 1 if the file cannot be written or the result is
#            implausibly short (previous file is kept)
#######################################
output_metrics() {
  local metrics
  metrics=$(collect_metrics)

  if [[ -z "$OUTPUT_FILE" ]]; then
    printf '%s\n' "$metrics"
    return 0
  fi

  local output_dir temp_file file_lines
  output_dir=$(dirname -- "$OUTPUT_FILE")

  if ! mkdir -p -- "$output_dir"; then
    echo "# ERROR: Cannot create output directory: ${output_dir}" >&2
    exit 1
  fi

  # The temp name does not end in .prom, so neither node_exporter's textfile
  # collector nor this script's own scan will pick it up.
  if ! temp_file=$(mktemp "${output_dir}/.textfile_health.XXXXXX"); then
    echo "# ERROR: Cannot create temporary file in ${output_dir}" >&2
    exit 1
  fi

  if ! printf '%s\n' "$metrics" >"$temp_file"; then
    rm -f -- "$temp_file"
    echo "# ERROR: Cannot write metrics to ${temp_file}" >&2
    exit 1
  fi

  file_lines=$(wc -l <"$temp_file" 2>/dev/null || echo 0)
  file_lines="${file_lines//[[:space:]]/}"
  if [[ "$file_lines" -lt 5 ]]; then
    rm -f -- "$temp_file"
    echo "# ERROR: Metrics file too small (${file_lines} lines), keeping previous" >&2
    exit 1
  fi

  chmod 644 "$temp_file"
  if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
    rm -f -- "$temp_file"
    echo "# ERROR: Cannot move metrics into place: ${OUTPUT_FILE}" >&2
    exit 1
  fi
  echo "# Metrics written to ${OUTPUT_FILE} (${file_lines} lines)" >&2
}

#######################################
# Write a complete HTTP/1.1 response carrying the metrics body.
# The body is emitted with %s so backslashes in it are sent verbatim.
# Arguments: $1 - body length in bytes (trailing newline included)
#            $2 - metrics text (without trailing newline)
#######################################
write_http_response() {
  printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; charset=utf-8\r\nContent-Length: %s\r\nConnection: close\r\n\r\n%s\n' \
    "$1" "$2"
}

#######################################
# Serve metrics over HTTP with netcat, one connection per loop iteration.
# Metrics are rendered before the listener starts, so each response reflects
# the state at the time the previous connection finished.
# Globals:   HTTP_PORT (read)
# Returns:   never returns normally; exits 1 if no netcat is installed
#######################################
serve_http() {
  echo "# Starting HTTP server on port ${HTTP_PORT}" >&2
  echo "# Metrics endpoint: http://localhost:${HTTP_PORT}/metrics" >&2

  local nc_cmd
  if command -v ncat &>/dev/null; then
    nc_cmd="ncat"
  elif command -v nc &>/dev/null; then
    nc_cmd="nc"
  else
    echo "# ERROR: netcat (nc/ncat) required for HTTP mode" >&2
    exit 1
  fi

  local metrics body_bytes
  while true; do
    metrics=$(collect_metrics)

    # Content-Length is a byte count and must include the trailing newline.
    body_bytes=$(printf '%s\n' "$metrics" | wc -c)
    body_bytes="${body_bytes//[[:space:]]/}"

    # Try the traditional-netcat syntax first, then the form without -p/-q
    # for implementations that reject those flags.
    if ! write_http_response "$body_bytes" "$metrics" |
      "$nc_cmd" -l -p "$HTTP_PORT" -q 1 2>/dev/null; then
      if ! write_http_response "$body_bytes" "$metrics" |
        "$nc_cmd" -l "$HTTP_PORT" 2>/dev/null; then
        # Both forms failed (e.g. port already in use): back off instead of
        # spinning in a tight loop.
        sleep 1
      fi
    fi
  done
}

# ============================================================================
# MAIN
# ============================================================================

#######################################
# Entry point: parse options, run checks, then serve or write metrics.
# Arguments: the script's command line
#######################################
main() {
  parse_args "$@"
  preflight

  if [[ "$HTTP_MODE" == "true" ]]; then
    serve_http
  else
    output_metrics
  fi
}

main "$@"