#!/bin/bash
################################################################################
# Script Name: journal-error-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for journalctl error/critical/warning
#              messages per systemd unit. Exports per-priority message counts,
#              per-unit breakdown, top offending units, error rates, and
#              journal disk usage metrics.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - journalctl (systemd journal)
#   - netcat (nc) and mkfifo for HTTP mode
#   - Standard Unix tools (awk, sed, sort, uniq, grep)
#
# Performance:
#   Journal output is cached per priority+period combination — journalctl
#   is called once per unique combo, not once per metric. Per-unit counts
#   are extracted from one cached 24h query per priority.
#   Typical run time: a few seconds even on busy systems.
#
# Usage:
#   # Output to stdout
#   ./journal-error-exporter.sh
#
#   # HTTP server mode
#   ./journal-error-exporter.sh --http -p 9201
#
#   # Textfile collector mode
#   ./journal-error-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - journal_error_up - Exporter status (1=up, 0=down)
#     - journal_error_exporter_info{version} - Exporter version
#
#   Message Counts:
#     - journal_error_messages_total{priority,period} - Messages per priority per period
#
#   Per-Unit Breakdown (24h, top 20):
#     - journal_error_unit_messages{unit,priority} - Per-unit message count by priority
#
#   Top Offenders:
#     - journal_error_top_unit_count{unit} - Top 20 units by err+crit+alert+emerg (24h)
#
#   Rates:
#     - journal_error_rate_per_hour{priority} - Average messages per hour (24h)
#
#   Journal Health:
#     - journal_error_journal_disk_usage_bytes - Journal disk usage
#     - journal_error_exporter_duration_seconds - Script execution time
#     - journal_error_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9201
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9201
CACHE_DIR=""
# Directory holding the response FIFO in HTTP mode (empty when not serving)
HTTP_FIFO_DIR=""

# Priority label map
declare -A PRIORITY_LABELS
PRIORITY_LABELS=([0]="emerg" [1]="alert" [2]="crit" [3]="err" [4]="warning")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print command-line help on stdout.
# NOTE(review): the original here-document was lost when this file was
# extracted; the text below is reconstructed from the header's Usage section.
# Restore the upstream wording if it is available.
show_usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Prometheus exporter for journalctl error/critical/warning messages.

Options:
  --http              Run as an HTTP server
  -p, --port PORT     HTTP port (default: 9201)
  --textfile          Write metrics into the node_exporter textfile directory
  -o, --output FILE   Write metrics to FILE
  -h, --help          Show this help message
EOF
}

# Parse command-line arguments into the configuration globals.
# Globals:   HTTP_MODE, HTTP_PORT, OUTPUT_FILE (written), TEXTFILE_DIR (read)
# Arguments: the script's "$@"
# Exits 1 on an unknown option, a missing option value, or an invalid port.
# NOTE(review): only --http, -p and --textfile are documented in the header;
# the long/extra spellings and the textfile name are reconstructed — confirm
# against the upstream script.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --http)
        HTTP_MODE=true
        shift
        ;;
      -p|--port)
        if [[ $# -lt 2 ]]; then
          echo "ERROR: $1 requires a port number" >&2
          exit 1
        fi
        HTTP_PORT="$2"
        shift 2
        ;;
      --textfile)
        OUTPUT_FILE="${TEXTFILE_DIR}/journal_error_metrics.prom"
        shift
        ;;
      -o|--output)
        if [[ $# -lt 2 ]]; then
          echo "ERROR: $1 requires a file path" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      -h|--help)
        show_usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # 10# forces base 10 so a leading zero is not parsed as octal
  if [[ ! "$HTTP_PORT" =~ ^[0-9]+$ ]] \
    || (( 10#$HTTP_PORT < 1 || 10#$HTTP_PORT > 65535 )); then
    echo "ERROR: invalid port: $HTTP_PORT" >&2
    exit 1
  fi
}

# Set up a temp directory for caching journal output.
# Called once at the start of generate_metrics.
# Globals: CACHE_DIR (written; left empty when the directory cannot be made)
# Returns: 0 when the cache directory exists, non-zero otherwise
setup_cache() {
  CACHE_DIR=$(mktemp -d "${TMPDIR:-/tmp}/journal-error-exporter.XXXXXX" 2>/dev/null) \
    || CACHE_DIR=""
  _cache_enabled
}

# Clean up cached journal data.
# Globals: CACHE_DIR (read, then reset to empty)
cleanup_cache() {
  if [[ -n "$CACHE_DIR" ]]; then
    rm -rf -- "$CACHE_DIR"
  fi
  CACHE_DIR=""
}

# True when a usable cache directory is configured. Without this guard an
# empty CACHE_DIR would turn "$CACHE_DIR/name" into a path under "/".
_cache_enabled() {
  [[ -n "$CACHE_DIR" && -d "$CACHE_DIR" ]]
}

# Reduce syslog-format journal lines on stdin to the unit/process name:
# field 5 with any "[pid]..." tail and the first ":" removed.
_extract_unit_names() {
  awk '{print $5}' | sed 's/\[.*//; s/://'
}

# Escape a string for use as a Prometheus label value (backslash and quote).
_escape_label() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  printf '%s' "$value"
}

# Get message count for a specific priority and time period (cached).
# journalctl is called once per unique priority+period combo.
# Args: $1 - priority number (0-4)
#       $2 - period ("1 hour ago", "24 hours ago", "7 days ago")
# Returns: Number of messages on stdout
get_priority_count() {
  local priority_num="$1"
  local period="$2"
  local cache_file=""

  if _cache_enabled; then
    # Replace anything that is not filename-safe, not only spaces
    cache_file="$CACHE_DIR/priority_${priority_num}_${period//[^A-Za-z0-9]/_}"
    if [[ -f "$cache_file" ]]; then
      cat "$cache_file"
      return
    fi
  fi

  local count
  count=$(journalctl --priority="${priority_num}..${priority_num}" \
    --since "$period" --no-pager -q 2>/dev/null | wc -l)
  count=${count//[[:space:]]/}
  count=${count:-0}

  if [[ -n "$cache_file" ]]; then
    printf '%s\n' "$count" > "$cache_file"
  fi
  printf '%s\n' "$count"
}

# Get top units by err+ (priority 0..3) message count in 24h.
# Args: $1 - limit (default: 20)
# Returns: Lines with "count unit" format, sorted by count descending
get_top_units() {
  local limit="${1:-20}"

  if ! _cache_enabled; then
    journalctl --priority=0..3 --since "24 hours ago" --output=short --no-pager -q 2>/dev/null \
      | _extract_unit_names | sort | uniq -c | sort -rn | head -n "$limit"
    return
  fi

  local cache_file="$CACHE_DIR/top_units"
  if [[ ! -f "$cache_file" ]]; then
    # The full ranking is cached; the limit is applied on every read
    journalctl --priority=0..3 --since "24 hours ago" --output=short --no-pager -q 2>/dev/null \
      | _extract_unit_names | sort | uniq -c | sort -rn > "$cache_file" 2>/dev/null
  fi
  head -n "$limit" "$cache_file"
}

# Get message count for a specific unit at a specific priority (24h).
# The journal is read once per priority; each unit is then counted from that
# cached list. The unit name never becomes part of a file name, so names
# containing "/" are safe.
# Args: $1 - unit name, $2 - priority number (0-4)
# Returns: Number of messages on stdout
get_unit_priority_count() {
  local unit="$1"
  local priority_num="$2"
  local count

  if _cache_enabled; then
    local units_file="$CACHE_DIR/units_p${priority_num}"
    if [[ ! -f "$units_file" ]]; then
      journalctl --priority="${priority_num}..${priority_num}" \
        --since "24 hours ago" --output=short --no-pager -q 2>/dev/null \
        | _extract_unit_names > "$units_file"
    fi
    # -F: literal match (unit names contain "."); --: name may start with "-"
    count=$(grep -cxF -- "$unit" "$units_file" 2>/dev/null)
  else
    count=$(journalctl --priority="${priority_num}..${priority_num}" \
      --since "24 hours ago" --output=short --no-pager -q 2>/dev/null \
      | _extract_unit_names | grep -cxF -- "$unit" 2>/dev/null)
  fi

  printf '%s\n' "${count:-0}"
}

# Get journal disk usage in bytes.
# Parses journalctl --disk-usage output.
# Returns: Disk usage in bytes on stdout ("0" when it cannot be parsed)
get_journal_disk_usage() {
  local output
  output=$(journalctl --disk-usage 2>/dev/null)

  # Output format: "Archived and active journals take up 123.4M in the file system."
  # or similar with G/K/B suffixes
  local size_str
  size_str=$(printf '%s\n' "$output" | grep -oE '[0-9]+(\.[0-9]+)?[KMGTPB]+' | head -n 1)

  if [[ -z "$size_str" ]]; then
    echo "0"
    return
  fi

  local number suffix multiplier
  number="${size_str%%[KMGTPB]*}"
  suffix="${size_str#"$number"}"

  # Only the first suffix letter decides the scale, so "M" and "MB" agree
  case "$suffix" in
    B*) multiplier=1 ;;
    K*) multiplier=$((1024)) ;;
    M*) multiplier=$((1024 ** 2)) ;;
    G*) multiplier=$((1024 ** 3)) ;;
    T*) multiplier=$((1024 ** 4)) ;;
    P*) multiplier=$((1024 ** 5)) ;;
    *)
      echo "0"
      return
      ;;
  esac

  awk -v n="$number" -v m="$multiplier" 'BEGIN { printf "%.0f", n * m }'
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Generate all Prometheus metrics.
# Returns: Prometheus text format metrics on stdout
# NOTE(review): large parts of this function were lost in extraction. The
# journalctl check, the per-hour rate computation, the disk usage lookup and
# the duration arithmetic survive from the original; HELP/TYPE text, the
# "period" label values (1h/24h/7d) and the loop structure are reconstructed
# from the metric list in the file header — confirm against upstream.
generate_metrics() {
  local script_start
  script_start=$(date +%s)

  # Verify journalctl is available
  if ! command -v journalctl >/dev/null 2>&1; then
    cat <<EOF
# HELP journal_error_up Exporter status (1=up, 0=down)
# TYPE journal_error_up gauge
journal_error_up 0
EOF
    return 0
  fi

  # A failed setup is tolerated: the getters then query without caching
  setup_cache

  # ========================================================================
  # Core Status
  # ========================================================================
  cat <<EOF
# HELP journal_error_up Exporter status (1=up, 0=down)
# TYPE journal_error_up gauge
journal_error_up 1
# HELP journal_error_exporter_info Exporter version
# TYPE journal_error_exporter_info gauge
journal_error_exporter_info{version="1.0"} 1

EOF

  local priority_num priority_label

  # ========================================================================
  # Message Counts
  # ========================================================================
  local -a periods=("1 hour ago" "24 hours ago" "7 days ago")
  local -a period_labels=("1h" "24h" "7d")
  local idx count

  echo "# HELP journal_error_messages_total Messages per priority per period"
  echo "# TYPE journal_error_messages_total gauge"
  for priority_num in 0 1 2 3 4; do
    priority_label="${PRIORITY_LABELS[$priority_num]}"
    for idx in "${!periods[@]}"; do
      count=$(get_priority_count "$priority_num" "${periods[$idx]}")
      echo "journal_error_messages_total{priority=\"$priority_label\",period=\"${period_labels[$idx]}\"} ${count:-0}"
    done
  done
  echo ""

  # ========================================================================
  # Per-Unit Breakdown (24h, top 20)
  # ========================================================================
  local top_units unit_count unit_name unit_label
  top_units=$(get_top_units 20)

  echo "# HELP journal_error_unit_messages Per-unit message count by priority (24h)"
  echo "# TYPE journal_error_unit_messages gauge"
  while read -r unit_count unit_name; do
    # Lines with fewer than five fields yield an empty unit name
    [[ -z "$unit_name" ]] && continue
    unit_label=$(_escape_label "$unit_name")
    for priority_num in 0 1 2 3 4; do
      priority_label="${PRIORITY_LABELS[$priority_num]}"
      count=$(get_unit_priority_count "$unit_name" "$priority_num")
      echo "journal_error_unit_messages{unit=\"$unit_label\",priority=\"$priority_label\"} ${count:-0}"
    done
  done <<< "$top_units"
  echo ""

  # ========================================================================
  # Top Offenders
  # ========================================================================
  echo "# HELP journal_error_top_unit_count Top 20 units by err+crit+alert+emerg (24h)"
  echo "# TYPE journal_error_top_unit_count gauge"
  while read -r unit_count unit_name; do
    [[ -z "$unit_name" ]] && continue
    unit_label=$(_escape_label "$unit_name")
    echo "journal_error_top_unit_count{unit=\"$unit_label\"} $unit_count"
  done <<< "$top_units"
  echo ""

  # ========================================================================
  # Rates
  # ========================================================================
  local total rate
  echo "# HELP journal_error_rate_per_hour Average messages per hour (24h)"
  echo "# TYPE journal_error_rate_per_hour gauge"
  for priority_num in 0 1 2 3 4; do
    priority_label="${PRIORITY_LABELS[$priority_num]}"
    total=$(get_priority_count "$priority_num" "24 hours ago")
    if [[ "$total" =~ ^[0-9]+$ ]] && (( 10#$total > 0 )); then
      rate=$(awk -v total="$total" 'BEGIN {printf "%.2f", total / 24}' 2>/dev/null || echo "0.00")
    else
      rate="0.00"
    fi
    echo "journal_error_rate_per_hour{priority=\"$priority_label\"} $rate"
  done
  echo ""

  # ========================================================================
  # Journal Health
  # ========================================================================
  local disk_usage
  disk_usage=$(get_journal_disk_usage)

  local script_end script_duration
  script_end=$(date +%s)
  script_duration=$((script_end - script_start))

  cat <<EOF
# HELP journal_error_journal_disk_usage_bytes Journal disk usage
# TYPE journal_error_journal_disk_usage_bytes gauge
journal_error_journal_disk_usage_bytes ${disk_usage:-0}
# HELP journal_error_exporter_duration_seconds Script execution time
# TYPE journal_error_exporter_duration_seconds gauge
journal_error_exporter_duration_seconds $script_duration
# HELP journal_error_exporter_last_run_timestamp Last run timestamp
# TYPE journal_error_exporter_last_run_timestamp gauge
journal_error_exporter_last_run_timestamp $script_end
EOF

  cleanup_cache
}

# ============================================================================
# HTTP SERVER
# ============================================================================

# Answer one HTTP request: the request is read from stdin, the response is
# written to stdout. Serves metrics for "GET /metrics" and an HTML landing
# page for anything else.
# NOTE(review): the landing page's markup was stripped in extraction; the
# visible text is original, the tags are reconstructed.
_handle_http_request() {
  local request header

  # Give up on clients that connect but never send a request line
  IFS= read -r -t 10 request || return 0

  # Consume the remaining request headers so nc is not left writing into a
  # pipe nobody reads while the response is being produced
  while IFS= read -r -t 2 header; do
    header=${header%$'\r'}
    [[ -z "$header" ]] && break
  done

  # Check if request is for /metrics endpoint
  if [[ "$request" =~ ^GET\ /metrics ]]; then
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
    generate_metrics
  else
    # Serve HTML landing page for other requests
    printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
    cat <<'EOF'
<html>
<head><title>Journal Error Exporter v1.0</title></head>
<body>
<h1>Journal Error Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Metric Categories</h2>
<ul>
<li>Core Status: exporter up/down, version info</li>
<li>Message Counts: per-priority counts (emerg/alert/crit/err/warning) per period</li>
<li>Per-Unit Breakdown: per-unit message counts by priority (24h, top 20)</li>
<li>Top Offenders: top 20 units by error+ count (24h)</li>
<li>Rates: average messages per hour by priority (24h)</li>
<li>Journal Health: disk usage, exporter runtime</li>
</ul>
</body>
</html>
EOF
  fi
}

# Run a minimal HTTP server on HTTP_PORT using netcat (blocks until killed).
# nc's output (the client's request) is piped into the handler and the
# handler's response reaches nc's stdin through a FIFO. Piping the handler
# straight into nc, as before, made "read" consume this script's own stdin
# instead of the request, so /metrics could never be matched.
# Globals: HTTP_PORT (read), HTTP_FIFO_DIR (written)
# NOTE(review): the startup message is reconstructed; the original was lost.
run_http_server() {
  echo "Starting Journal Error Exporter on port $HTTP_PORT" >&2

  if ! command -v nc >/dev/null 2>&1; then
    echo "ERROR: netcat (nc) required for HTTP mode" >&2
    exit 1
  fi

  HTTP_FIFO_DIR=$(mktemp -d "${TMPDIR:-/tmp}/journal-error-exporter-http.XXXXXX") || {
    echo "ERROR: cannot create temporary directory for HTTP mode" >&2
    exit 1
  }
  trap '[[ -n "$HTTP_FIFO_DIR" ]] && rm -rf -- "$HTTP_FIFO_DIR"' EXIT

  local response_fifo="$HTTP_FIFO_DIR/response"
  if ! mkfifo "$response_fifo"; then
    echo "ERROR: cannot create FIFO $response_fifo" >&2
    exit 1
  fi

  # Infinite loop accepting HTTP requests
  local -a stage_status
  while true; do
    nc -l -p "$HTTP_PORT" -q 1 < "$response_fifo" 2>/dev/null \
      | _handle_http_request > "$response_fifo"
    stage_status=("${PIPESTATUS[@]}")
    # nc failing at once (port busy, unsupported flag) must not spin the CPU
    if (( stage_status[0] != 0 )); then
      sleep 1
    fi
  done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode
main() {
  parse_args "$@"

  if [[ "$HTTP_MODE" == true ]]; then
    # Run HTTP server (blocks until killed)
    run_http_server
  elif [[ -n "$OUTPUT_FILE" ]]; then
    # Textfile collector mode: write atomically using temp file
    local output_dir
    output_dir="$(dirname "$OUTPUT_FILE")"
    if ! mkdir -p "$output_dir"; then
      echo "ERROR: Cannot create output directory $output_dir" >&2
      exit 1
    fi

    # Create temp file in SAME directory for atomic rename (same filesystem)
    local temp_file
    if ! temp_file=$(mktemp "${output_dir}/.journal_error_metrics.XXXXXX"); then
      echo "ERROR: Cannot create temporary file in $output_dir" >&2
      exit 1
    fi

    # Generate metrics to temp file
    if ! generate_metrics > "$temp_file" 2>/dev/null; then
      rm -f -- "$temp_file"
      echo "ERROR: Failed to generate metrics" >&2
      exit 1
    fi

    # Validate: file must have content
    local file_lines
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
    if [[ "$file_lines" -lt 10 ]]; then
      rm -f -- "$temp_file"
      echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
      exit 1
    fi

    # Set permissions before move
    chmod 644 "$temp_file"

    # Atomic rename - no gap where file is missing
    if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
      rm -f -- "$temp_file"
      echo "ERROR: Cannot write $OUTPUT_FILE" >&2
      exit 1
    fi
    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
  else
    # Default: output to stdout
    generate_metrics
  fi
}

# Execute main function with all script arguments
main "$@"