#!/bin/bash
################################################################################
# Script Name: ntp-drift-exporter.sh
# Version: 1.0
# Description: Prometheus exporter for NTP time synchronisation metrics.
#              Monitors chrony, ntpd, and systemd-timesyncd clock offset,
#              stratum, sync status, and drift rate. Time drift is a silent
#              killer for logs, certificates, Kerberos, and distributed systems.
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - chronyc (chrony), ntpq/ntpstat (ntpd), or timedatectl (systemd-timesyncd)
#   - netcat (nc) for HTTP mode
#   - mkfifo/mktemp for HTTP mode
#   - Standard Unix tools (awk, grep)
#
# Usage:
#   # Output to stdout
#   ./ntp-drift-exporter.sh
#
#   # HTTP server mode
#   ./ntp-drift-exporter.sh --http -p 9200
#
#   # Textfile collector mode
#   ./ntp-drift-exporter.sh --textfile
#
# Metrics Exported:
#   Core Status:
#     - ntp_drift_up - Exporter status (1=up, 0=down)
#     - ntp_drift_exporter_info{version,source} - Exporter version and NTP source
#
#   Sync Status:
#     - ntp_drift_synchronised - Clock synchronised (1=yes, 0=no)
#     - ntp_drift_stratum - Current stratum level
#     - ntp_drift_offset_seconds - Clock offset from upstream in seconds
#     - ntp_drift_offset_abs_seconds - Absolute clock offset in seconds
#
#   Chrony (if chronyc available):
#     - ntp_drift_chrony_root_delay_seconds - Root delay
#     - ntp_drift_chrony_root_dispersion_seconds - Root dispersion
#     - ntp_drift_chrony_last_offset_seconds - Last measured offset
#     - ntp_drift_chrony_rms_offset_seconds - RMS offset
#     - ntp_drift_chrony_frequency_ppm - Frequency error in ppm
#     - ntp_drift_chrony_residual_freq_ppm - Residual frequency error in ppm
#     - ntp_drift_chrony_skew_ppm - Estimated skew in ppm
#     - ntp_drift_chrony_update_interval_seconds - Mean update interval
#     - ntp_drift_chrony_leap_status - Leap status (0=normal, 1=insert, 2=delete, 3=unsync)
#     - ntp_drift_chrony_sources_total - Total configured NTP sources
#     - ntp_drift_chrony_sources_reachable - Reachable NTP sources
#     - ntp_drift_chrony_source_offset_seconds{source,mode} - Per-source offset
#
#   NTPd (if ntpq available):
#     - ntp_drift_ntpd_peers_total - Total configured peers
#     - ntp_drift_ntpd_peers_reachable - Reachable peers
#     - ntp_drift_ntpd_peer_offset_seconds{peer,type} - Per-peer offset
#     - ntp_drift_ntpd_peer_delay_seconds{peer} - Per-peer round-trip delay
#     - ntp_drift_ntpd_peer_jitter_seconds{peer} - Per-peer jitter
#     - ntp_drift_ntpd_selected_peer_offset_seconds - Selected peer offset
#
#   systemd-timesyncd (if timedatectl available):
#     - ntp_drift_timesyncd_server_info{server,address} - NTP server info
#     - ntp_drift_timesyncd_delay_seconds - Round-trip delay
#     - ntp_drift_timesyncd_jitter_seconds - Jitter
#     - ntp_drift_timesyncd_frequency_ppm - Frequency error in ppm
#     - ntp_drift_timesyncd_root_distance_seconds - Root distance
#     - ntp_drift_timesyncd_poll_interval_seconds - Current poll interval
#     - ntp_drift_timesyncd_packet_count - NTP packet count
#     - ntp_drift_timesyncd_leap_status - Leap indicator (0=normal, 1=insert, 2=delete, 3=unsync)
#
#   Alerts:
#     - ntp_drift_offset_critical - 1 if |offset| > 100ms
#     - ntp_drift_offset_warning - 1 if |offset| > 10ms
#     - ntp_drift_unsynchronised - 1 if clock is not synchronised
#
#   Exporter:
#     - ntp_drift_exporter_duration_seconds - Script execution time
#     - ntp_drift_exporter_last_run_timestamp - Last run timestamp
#
# Configuration:
#   Default HTTP port: 9200
#   Textfile directory: /var/lib/node_exporter
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9200
NTP_SOURCE=""

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print usage text to stdout and exit 0.
# NOTE(review): the original help text was lost (heredoc stripped from the
# file); this wording was rebuilt from the options parse_args accepts.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Prometheus exporter for NTP time synchronisation metrics
(chrony, ntpd, systemd-timesyncd).

Options:
  -h, --help           Show this help message
  --textfile           Write metrics to $TEXTFILE_DIR/ntp_drift.prom
  --http               Run as HTTP server
  -p, --port PORT      HTTP port (default: $HTTP_PORT)
  -o, --output FILE    Write metrics to FILE

With no options, metrics are printed to stdout.

Alerts:
  Offset thresholds (>10ms warning, >100ms critical)
EOF
    exit 0
}

# Parse command-line options into the configuration globals.
# Globals written: OUTPUT_FILE, HTTP_MODE, HTTP_PORT
# Exits 1 on an unknown option or on a missing/invalid option value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                ;;
            --textfile)
                OUTPUT_FILE="$TEXTFILE_DIR/ntp_drift.prom"
                shift
                ;;
            --http)
                HTTP_MODE=true
                shift
                ;;
            -p|--port)
                # Without this check "shift 2" fails on a trailing -p and the
                # loop never advances.
                if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
                    echo "ERROR: $1 requires a numeric port argument" >&2
                    exit 1
                fi
                HTTP_PORT="$2"
                shift 2
                ;;
            -o|--output)
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "ERROR: $1 requires a file path argument" >&2
                    exit 1
                fi
                OUTPUT_FILE="$2"
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

# Extract the value of a "Key : value" line from colon-separated tool output.
# Args: $1 - full text, $2 - regex matched against the key (text before the
#       first colon)
# Outputs: the text between the first and second colon, leading blanks removed
_colon_field() {
    printf '%s\n' "$1" |
        awk -F: -v pat="$2" '$1 ~ pat { gsub(/^[ \t]+/, "", $2); print $2 }'
}

# Print the HELP/TYPE preamble for one metric family.
# Args: $1 - metric name, $2 - help text, $3 - type (default: gauge)
_emit_header() {
    printf '# HELP %s %s\n# TYPE %s %s\n' "$1" "$2" "$1" "${3:-gauge}"
}

# Print a complete unlabelled gauge (HELP, TYPE and sample line).
# Args: $1 - metric name, $2 - help text, $3 - value
_emit_gauge() {
    _emit_header "$1" "$2" gauge
    printf '%s %s\n' "$1" "$3"
}

# Detect the NTP client in use.
# Probes chrony, then ntpd, then systemd-timesyncd; first responder wins.
# Globals written: NTP_SOURCE
# Returns: 0 if found, 1 if no NTP client available
detect_ntp_source() {
    if command -v chronyc >/dev/null 2>&1 && chronyc tracking >/dev/null 2>&1; then
        NTP_SOURCE="chrony"
        return 0
    elif command -v ntpq >/dev/null 2>&1 && ntpq -p >/dev/null 2>&1; then
        NTP_SOURCE="ntpd"
        return 0
    elif command -v timedatectl >/dev/null 2>&1 &&
        # Whole-line match: a substring match would also hit "CanNTP=yes".
        timedatectl show 2>/dev/null | grep -qx "NTP=yes"; then
        NTP_SOURCE="timesyncd"
        return 0
    fi
    return 1
}

# Get chrony tracking data.
# Outputs: raw "Key : value" lines from `chronyc tracking`
get_chrony_tracking() {
    chronyc tracking 2>/dev/null
}

# Parse a value with unit suffix from chronyc output.
# Converts to seconds (for time values) or returns the raw number.
# Args: $1 - raw value string (e.g., "+0.000012345 seconds", "0.123 ppm")
# Outputs: numeric value in base unit; a leading "+" is removed, "-" is kept
parse_chrony_value() {
    local raw="$1"
    local number unit _rest

    # First word is the number, second word the unit; trailing words such as
    # "fast of NTP time" or "slow" are ignored.
    read -r number unit _rest <<<"${raw//+/}"

    case "$unit" in
        milliseconds) awk -v n="$number" 'BEGIN { printf "%.9f", n / 1000 }' ;;
        microseconds) awk -v n="$number" 'BEGIN { printf "%.12f", n / 1000000 }' ;;
        nanoseconds)  awk -v n="$number" 'BEGIN { printf "%.15f", n / 1000000000 }' ;;
        *)            printf '%s\n' "$number" ;;
    esac
}

# Get chrony synchronisation status.
# Outputs: "synchronised stratum offset_seconds"
# NOTE(review): the "System time" line reports a magnitude plus "fast"/"slow";
# the direction is not encoded here, so offset_seconds is unsigned for chrony.
get_chrony_sync_status() {
    local tracking
    tracking=$(get_chrony_tracking)

    local ref_id stratum offset_raw sys_offset leap
    ref_id=$(_colon_field "$tracking" "Reference ID")
    stratum=$(_colon_field "$tracking" "Stratum")
    offset_raw=$(_colon_field "$tracking" "System time")
    sys_offset=$(parse_chrony_value "$offset_raw")
    leap=$(_colon_field "$tracking" "Leap status")

    # Synchronised only with a non-zero reference ID and a leap status other
    # than "Not synchronised".
    local synchronised=0
    if [[ -n "$ref_id" && "$ref_id" != *00000000* ]]; then
        if ! grep -qi "not synchronised" <<<"$leap"; then
            synchronised=1
        fi
    fi

    echo "$synchronised ${stratum:-16} ${sys_offset:-0}"
}

# Get chrony detailed tracking metrics.
# Outputs: "root_delay root_disp last_offset rms_offset freq resid_freq skew update_interval leap_code"
get_chrony_details() {
    local tracking
    tracking=$(get_chrony_tracking)

    local root_delay root_disp last_offset rms_offset
    local freq resid_freq skew interval leap_raw leap_code
    root_delay=$(parse_chrony_value "$(_colon_field "$tracking" "Root delay")")
    root_disp=$(parse_chrony_value "$(_colon_field "$tracking" "Root dispersion")")
    last_offset=$(parse_chrony_value "$(_colon_field "$tracking" "Last offset")")
    rms_offset=$(parse_chrony_value "$(_colon_field "$tracking" "RMS offset")")
    freq=$(parse_chrony_value "$(_colon_field "$tracking" "Frequency")")
    resid_freq=$(parse_chrony_value "$(_colon_field "$tracking" "Residual freq")")
    skew=$(parse_chrony_value "$(_colon_field "$tracking" "Skew")")
    interval=$(parse_chrony_value "$(_colon_field "$tracking" "Update interval")")
    leap_raw=$(_colon_field "$tracking" "Leap status")

    case "$leap_raw" in
        *"Normal"*) leap_code=0 ;;
        *"Insert"*) leap_code=1 ;;
        *"Delete"*) leap_code=2 ;;
        *)          leap_code=3 ;;
    esac

    echo "${root_delay:-0} ${root_disp:-0} ${last_offset:-0} ${rms_offset:-0} ${freq:-0} ${resid_freq:-0} ${skew:-0} ${interval:-0} ${leap_code:-3}"
}

# Get chrony source list with status.
# Outputs: lines of "source mode offset reachable"
#   mode: server/peer/ref_clock
#   reachable: 1 or 0
get_chrony_sources() {
    chronyc sources 2>/dev/null | awk '
    # Select data rows by their mode marker instead of by line number: the
    # number of header lines differs between chronyc versions.
    $1 ~ /^[=#^]/ && NF >= 8 {
        mode_char = substr($1, 1, 1)
        if (mode_char == "^") mode = "server"
        else if (mode_char == "=") mode = "peer"
        else if (mode_char == "#") mode = "ref_clock"
        else mode = "unknown"

        source = $2

        # Reachable if state is * (synced), + (combined), - (not combined)
        state_char = substr($1, 2, 1)
        reachable = 0
        if (state_char == "*" || state_char == "+" || state_char == "-") reachable = 1

        # Field 7 looks like "+123us[" or "+123us[+456us]": keep the adjusted
        # offset in front of the bracket, otherwise the unit suffix is hidden.
        offset_raw = $7
        sub(/\[.*$/, "", offset_raw)
        gsub(/[+]/, "", offset_raw)

        # Convert units: ns, us, ms, s
        if (offset_raw ~ /ns$/) {
            sub(/ns$/, "", offset_raw)
            offset = offset_raw / 1000000000
        } else if (offset_raw ~ /us$/) {
            sub(/us$/, "", offset_raw)
            offset = offset_raw / 1000000
        } else if (offset_raw ~ /ms$/) {
            sub(/ms$/, "", offset_raw)
            offset = offset_raw / 1000
        } else if (offset_raw ~ /s$/) {
            sub(/s$/, "", offset_raw)
            offset = offset_raw + 0
        } else {
            offset = offset_raw + 0
        }

        printf "%s %s %.9f %d\n", source, mode, offset, reachable
    }'
}

# Get ntpd sync status via ntpstat or ntpq.
# Outputs: "synchronised stratum offset_seconds"
# NOTE(review): the ntpstat figure parsed here comes from its
# "time correct to within N ms" line; confirm that is the value wanted as the
# offset.
get_ntpd_sync_status() {
    local synchronised=0
    local stratum=16
    local offset=0
    local offset_ms

    # Try ntpstat first; exit status 0 means the clock is synchronised
    if command -v ntpstat >/dev/null 2>&1; then
        local ntpstat_output
        if ntpstat_output=$(ntpstat 2>/dev/null); then
            synchronised=1
            stratum=$(printf '%s\n' "$ntpstat_output" |
                awk '/stratum/ { for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) { print $i; exit } }')
            offset_ms=$(printf '%s\n' "$ntpstat_output" |
                awk '/time correct/ { for (i = 1; i <= NF; i++) if ($i ~ /^[0-9.]+$/) { print $i; exit } }')
            if [[ -n "$offset_ms" ]]; then
                offset=$(awk -v ms="$offset_ms" 'BEGIN { printf "%.9f", ms / 1000 }')
            fi
        fi
    fi

    # Fall back to ntpq if ntpstat not available or failed
    if [[ "$synchronised" -eq 0 ]] && command -v ntpq >/dev/null 2>&1; then
        local selected_peer
        selected_peer=$(ntpq -p 2>/dev/null | awk '/^\*/ { print; exit }')
        if [[ -n "$selected_peer" ]]; then
            synchronised=1
            # ntpq -p columns: remote refid st t when poll reach delay offset jitter
            stratum=$(awk '{ print $3 }' <<<"$selected_peer")
            offset_ms=$(awk '{ print $9 }' <<<"$selected_peer")
            if [[ -n "$offset_ms" ]]; then
                offset=$(awk -v ms="$offset_ms" 'BEGIN { printf "%.9f", ms / 1000 }')
            fi
        fi
    fi

    echo "$synchronised ${stratum:-16} ${offset:-0}"
}

# Get ntpd peer list with status.
# Outputs: lines of "peer type offset delay jitter reachable" (times in seconds)
get_ntpd_peers() {
    command -v ntpq >/dev/null 2>&1 || return

    ntpq -p 2>/dev/null | awk '
    NR > 2 && NF >= 9 {
        # The tally code is column 1 of the raw line. A blank tally is removed
        # by field splitting, so it must be read from $0, not from $1.
        tally = substr($0, 1, 1)
        if (tally == " " || tally == "\t") {
            tally = " "
            peer = $1
        } else {
            peer = substr($1, 2)
        }
        if (peer == "") next

        # offset in ms (field 9), delay in ms (field 8), jitter in ms (field 10)
        offset_s = ($9 + 0) / 1000
        delay_s = ($8 + 0) / 1000
        jitter_s = ($10 + 0) / 1000

        # Type based on tally code
        if (tally == "*") type = "selected"
        else if (tally == "+") type = "candidate"
        else if (tally == "-") type = "outlier"
        else if (tally == "#") type = "selected_distance"
        else type = "other"

        # Reachable if tally is *, +, -, or #
        reachable = (tally == "*" || tally == "+" || tally == "-" || tally == "#") ? 1 : 0

        printf "%s %s %.9f %.9f %.9f %d\n", peer, type, offset_s, delay_s, jitter_s, reachable
    }'
}

# Parse a timedatectl timesync-status value with unit suffix to seconds.
# Args: $1 - raw value string (e.g., "+2.764ms", "95.987ms", "34min 8s")
# Outputs: unsigned value in seconds (a leading sign is dropped; callers that
#          need the sign re-apply it)
parse_timesyncd_duration() {
    local raw="$1"
    local val

    # Remove leading +/-
    raw="${raw#[+-]}"

    # Compound durations such as "34min 8s" or "1h 8min 16s": sum the parts.
    if [[ "$raw" == *min* || "$raw" =~ [0-9]h ]]; then
        awk -v s="$raw" 'BEGIN {
            total = 0
            n = split(s, tok, /[ \t]+/)
            for (i = 1; i <= n; i++) {
                if (tok[i] ~ /^[0-9.]+h$/) total += (tok[i] + 0) * 3600
                else if (tok[i] ~ /^[0-9.]+min$/) total += (tok[i] + 0) * 60
                else if (tok[i] ~ /^[0-9.]+s$/) total += (tok[i] + 0)
            }
            printf "%.9f", total
        }'
        return
    fi

    # Single-unit values. The microsecond branches must precede the plain "s"
    # branch. NOTE(review): the non-ASCII spellings are accepted on the
    # assumption that some systemd builds print them in UTF-8 locales.
    case "$raw" in
        *us)
            val="${raw%us}"
            awk -v v="$val" 'BEGIN { printf "%.12f", v / 1000000 }'
            ;;
        *μs)
            val="${raw%μs}"
            awk -v v="$val" 'BEGIN { printf "%.12f", v / 1000000 }'
            ;;
        *µs)
            val="${raw%µs}"
            awk -v v="$val" 'BEGIN { printf "%.12f", v / 1000000 }'
            ;;
        *ms)
            val="${raw%ms}"
            awk -v v="$val" 'BEGIN { printf "%.9f", v / 1000 }'
            ;;
        *s)
            printf '%s\n' "${raw%s}"
            ;;
        *)
            printf '%s\n' "$raw"
            ;;
    esac
}

# Get systemd-timesyncd sync status.
# Outputs: "synchronised stratum offset_seconds"
get_timesyncd_sync_status() {
    local synchronised=0
    local stratum=16
    local offset=0

    local synced
    synced=$(timedatectl show 2>/dev/null | awk -F= '$1 == "NTPSynchronized" { print $2 }')
    if [[ "$synced" == "yes" ]]; then
        synchronised=1
    fi

    local status
    status=$(timedatectl timesync-status 2>/dev/null)
    if [[ -n "$status" ]]; then
        stratum=$(_colon_field "$status" "Stratum")

        local offset_raw
        offset_raw=$(_colon_field "$status" "Offset")
        if [[ -n "$offset_raw" ]]; then
            # parse_timesyncd_duration drops the sign, so carry it across
            local sign=""
            if [[ "$offset_raw" == -* ]]; then
                sign="-"
            fi
            local abs_val
            abs_val=$(parse_timesyncd_duration "$offset_raw")
            offset="${sign}${abs_val}"
        fi
    fi

    echo "$synchronised ${stratum:-16} ${offset:-0}"
}

# Get systemd-timesyncd detailed metrics.
# Outputs: "delay jitter frequency root_distance poll_interval packet_count leap_code server address"
get_timesyncd_details() {
    local status
    status=$(timedatectl timesync-status 2>/dev/null)

    local delay_raw jitter_raw freq_raw rootdist_raw poll_raw packets_raw leap_raw
    delay_raw=$(_colon_field "$status" "Delay")
    jitter_raw=$(_colon_field "$status" "Jitter")
    freq_raw=$(_colon_field "$status" "Frequency")
    rootdist_raw=$(_colon_field "$status" "Root distance")
    poll_raw=$(_colon_field "$status" "Poll interval")
    packets_raw=$(_colon_field "$status" "Packet count")
    leap_raw=$(_colon_field "$status" "Leap")

    # Server info from show-timesync
    local show_output server address
    show_output=$(timedatectl show-timesync 2>/dev/null)
    server=$(printf '%s\n' "$show_output" | awk -F= '$1 == "ServerName" { print $2 }')
    address=$(printf '%s\n' "$show_output" | awk -F= '$1 == "ServerAddress" { print $2 }')

    local delay jitter root_distance poll_interval
    delay=$(parse_timesyncd_duration "$delay_raw")
    jitter=$(parse_timesyncd_duration "$jitter_raw")

    # Root distance may have "(max: 5s)" suffix - strip it
    root_distance=$(parse_timesyncd_duration "${rootdist_raw%% (*}")

    # Poll interval: format may be "34min 8s (min: 32s; max 34min 8s)" -
    # strip parenthetical
    poll_interval=$(parse_timesyncd_duration "${poll_raw%% (*}")

    # Frequency: strip "ppm" suffix, may have sign
    local frequency="${freq_raw%ppm}"

    # Leap indicator
    local leap_code
    case "$leap_raw" in
        *normal*) leap_code=0 ;;
        *insert*) leap_code=1 ;;
        *delete*) leap_code=2 ;;
        *)        leap_code=3 ;;
    esac

    # Force packet count to a number (empty input becomes 0)
    local packets
    packets=$(awk '{ print $1 + 0 }' <<<"$packets_raw")

    echo "${delay:-0} ${jitter:-0} ${frequency:-0} ${root_distance:-0} ${poll_interval:-0} ${packets:-0} ${leap_code:-3} ${server:-unknown} ${address:-unknown}"
}

# ============================================================================
# METRIC GENERATION
# ============================================================================

# Emit the chrony-specific metric families (tracking details and sources).
_emit_chrony_metrics() {
    local root_delay root_disp last_offset rms_offset freq resid_freq skew interval leap_code
    read -r root_delay root_disp last_offset rms_offset freq resid_freq skew interval leap_code \
        <<<"$(get_chrony_details)"

    _emit_gauge ntp_drift_chrony_root_delay_seconds "Root delay" "${root_delay:-0}"
    _emit_gauge ntp_drift_chrony_root_dispersion_seconds "Root dispersion" "${root_disp:-0}"
    _emit_gauge ntp_drift_chrony_last_offset_seconds "Last measured offset" "${last_offset:-0}"
    _emit_gauge ntp_drift_chrony_rms_offset_seconds "RMS offset" "${rms_offset:-0}"
    _emit_gauge ntp_drift_chrony_frequency_ppm "Frequency error in ppm" "${freq:-0}"
    _emit_gauge ntp_drift_chrony_residual_freq_ppm "Residual frequency error in ppm" "${resid_freq:-0}"
    _emit_gauge ntp_drift_chrony_skew_ppm "Estimated skew in ppm" "${skew:-0}"
    _emit_gauge ntp_drift_chrony_update_interval_seconds "Mean update interval" "${interval:-0}"
    _emit_gauge ntp_drift_chrony_leap_status \
        "Leap status (0=normal, 1=insert, 2=delete, 3=unsync)" "${leap_code:-3}"

    local sources total reachable
    sources=$(get_chrony_sources)
    total=$(printf '%s\n' "$sources" | awk 'NF == 4 { n++ } END { print n + 0 }')
    reachable=$(printf '%s\n' "$sources" | awk 'NF == 4 && $4 == 1 { n++ } END { print n + 0 }')

    _emit_gauge ntp_drift_chrony_sources_total "Total configured NTP sources" "$total"
    _emit_gauge ntp_drift_chrony_sources_reachable "Reachable NTP sources" "$reachable"

    if [[ "$total" -gt 0 ]]; then
        _emit_header ntp_drift_chrony_source_offset_seconds "Per-source offset"
        printf '%s\n' "$sources" | awk 'NF == 4 {
            printf "ntp_drift_chrony_source_offset_seconds{source=\"%s\",mode=\"%s\"} %s\n", $1, $2, $3
        }'
    fi
}

# Emit the ntpd-specific metric families (peer totals and per-peer values).
_emit_ntpd_metrics() {
    local peers total reachable selected_offset
    peers=$(get_ntpd_peers)
    total=$(printf '%s\n' "$peers" | awk 'NF == 6 { n++ } END { print n + 0 }')
    reachable=$(printf '%s\n' "$peers" | awk 'NF == 6 && $6 == 1 { n++ } END { print n + 0 }')

    _emit_gauge ntp_drift_ntpd_peers_total "Total configured peers" "$total"
    _emit_gauge ntp_drift_ntpd_peers_reachable "Reachable peers" "$reachable"

    if [[ "$total" -gt 0 ]]; then
        # One pass per family so each family's samples stay grouped together.
        _emit_header ntp_drift_ntpd_peer_offset_seconds "Per-peer offset"
        printf '%s\n' "$peers" | awk 'NF == 6 {
            printf "ntp_drift_ntpd_peer_offset_seconds{peer=\"%s\",type=\"%s\"} %s\n", $1, $2, $3
        }'
        _emit_header ntp_drift_ntpd_peer_delay_seconds "Per-peer round-trip delay"
        printf '%s\n' "$peers" | awk 'NF == 6 {
            printf "ntp_drift_ntpd_peer_delay_seconds{peer=\"%s\"} %s\n", $1, $4
        }'
        _emit_header ntp_drift_ntpd_peer_jitter_seconds "Per-peer jitter"
        printf '%s\n' "$peers" | awk 'NF == 6 {
            printf "ntp_drift_ntpd_peer_jitter_seconds{peer=\"%s\"} %s\n", $1, $5
        }'
    fi

    selected_offset=$(printf '%s\n' "$peers" | awk 'NF == 6 && $2 == "selected" { print $3; exit }')
    if [[ -n "$selected_offset" ]]; then
        _emit_gauge ntp_drift_ntpd_selected_peer_offset_seconds "Selected peer offset" "$selected_offset"
    fi
}

# Emit the systemd-timesyncd-specific metric families.
_emit_timesyncd_metrics() {
    local delay jitter frequency root_distance poll_interval packets leap_code server address
    read -r delay jitter frequency root_distance poll_interval packets leap_code server address \
        <<<"$(get_timesyncd_details)"

    _emit_header ntp_drift_timesyncd_server_info "NTP server info"
    printf 'ntp_drift_timesyncd_server_info{server="%s",address="%s"} 1\n' \
        "${server:-unknown}" "${address:-unknown}"
    _emit_gauge ntp_drift_timesyncd_delay_seconds "Round-trip delay" "${delay:-0}"
    _emit_gauge ntp_drift_timesyncd_jitter_seconds "Jitter" "${jitter:-0}"
    _emit_gauge ntp_drift_timesyncd_frequency_ppm "Frequency error in ppm" "${frequency:-0}"
    _emit_gauge ntp_drift_timesyncd_root_distance_seconds "Root distance" "${root_distance:-0}"
    _emit_gauge ntp_drift_timesyncd_poll_interval_seconds "Current poll interval" "${poll_interval:-0}"
    _emit_gauge ntp_drift_timesyncd_packet_count "NTP packet count" "${packets:-0}"
    _emit_gauge ntp_drift_timesyncd_leap_status \
        "Leap indicator (0=normal, 1=insert, 2=delete, 3=unsync)" "${leap_code:-3}"
}

# Generate all Prometheus metrics.
# Globals read: NTP_SOURCE (set by detect_ntp_source)
# Outputs: Prometheus text format metrics on stdout
# Returns: 0
# NOTE(review): most of this function's original body was lost (heredocs
# stripped from the file). Metric names and HELP text were rebuilt from the
# list in the file header; confirm against a pristine copy before release.
generate_metrics() {
    local script_start
    script_start=$(date +%s)

    # Detect NTP client; with none available report the exporter as down
    if ! detect_ntp_source; then
        _emit_gauge ntp_drift_up "Exporter status (1=up, 0=down)" 0
        _emit_header ntp_drift_exporter_info "Exporter version and NTP source"
        printf 'ntp_drift_exporter_info{version="1.0",source="none"} 1\n'
        _emit_gauge ntp_drift_synchronised "Clock synchronised (1=yes, 0=no)" 0
        _emit_gauge ntp_drift_unsynchronised "1 if clock is not synchronised" 1
        _emit_gauge ntp_drift_exporter_last_run_timestamp "Last run timestamp" "$script_start"
        return 0
    fi

    local sync_status=""
    case "$NTP_SOURCE" in
        chrony)    sync_status=$(get_chrony_sync_status) ;;
        ntpd)      sync_status=$(get_ntpd_sync_status) ;;
        timesyncd) sync_status=$(get_timesyncd_sync_status) ;;
    esac

    local synchronised stratum offset abs_offset
    read -r synchronised stratum offset <<<"$sync_status"
    synchronised="${synchronised:-0}"
    stratum="${stratum:-16}"
    offset="${offset:-0}"
    abs_offset="${offset#-}"

    _emit_gauge ntp_drift_up "Exporter status (1=up, 0=down)" 1
    _emit_header ntp_drift_exporter_info "Exporter version and NTP source"
    printf 'ntp_drift_exporter_info{version="1.0",source="%s"} 1\n' "$NTP_SOURCE"
    _emit_gauge ntp_drift_synchronised "Clock synchronised (1=yes, 0=no)" "$synchronised"
    _emit_gauge ntp_drift_stratum "Current stratum level" "$stratum"
    _emit_gauge ntp_drift_offset_seconds "Clock offset from upstream in seconds" "$offset"
    _emit_gauge ntp_drift_offset_abs_seconds "Absolute clock offset in seconds" "$abs_offset"

    case "$NTP_SOURCE" in
        chrony)    _emit_chrony_metrics ;;
        ntpd)      _emit_ntpd_metrics ;;
        timesyncd) _emit_timesyncd_metrics ;;
    esac

    # Alert flags: 100ms critical, 10ms warning
    local offset_critical offset_warning unsync_alert
    offset_critical=$(awk -v v="$abs_offset" 'BEGIN { print ((v + 0 > 0.1) ? 1 : 0) }')
    offset_warning=$(awk -v v="$abs_offset" 'BEGIN { print ((v + 0 > 0.01) ? 1 : 0) }')
    unsync_alert=$((1 - synchronised))

    _emit_gauge ntp_drift_offset_critical "1 if |offset| > 100ms" "$offset_critical"
    _emit_gauge ntp_drift_offset_warning "1 if |offset| > 10ms" "$offset_warning"
    _emit_gauge ntp_drift_unsynchronised "1 if clock is not synchronised" "$unsync_alert"

    local script_end
    script_end=$(date +%s)
    _emit_gauge ntp_drift_exporter_duration_seconds "Script execution time" "$((script_end - script_start))"
    _emit_gauge ntp_drift_exporter_last_run_timestamp "Last run timestamp" "$script_end"
    return 0
}

# ============================================================================
# HTTP SERVER
# ============================================================================

# Serve one HTTP request: reads the request from stdin, writes the response
# to stdout. "GET /metrics" returns the metrics; anything else returns the
# HTML landing page.
# NOTE(review): the landing page markup was rebuilt from the visible text of
# the original page; tags and attributes are a best-effort reconstruction.
_handle_http_request() {
    local request header
    IFS= read -r request

    # Consume the remaining request headers up to the blank line so the
    # sender is not left writing into a closed pipe.
    while IFS= read -r header; do
        header="${header%$'\r'}"
        [[ -z "$header" ]] && break
    done

    if [[ "$request" =~ ^GET\ /metrics ]]; then
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
        generate_metrics
    else
        printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
        cat <<'EOF'
<!DOCTYPE html>
<html>
<head><title>NTP Drift Exporter v1.0</title></head>
<body>
<h1>NTP Drift Exporter v1.0</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Metric Categories</h2>
<ul>
<li><strong>Core Status:</strong> exporter up/down, NTP source type</li>
<li><strong>Sync Status:</strong> synchronised, stratum, offset</li>
<li><strong>Chrony:</strong> root delay, dispersion, frequency, skew, per-source offsets</li>
<li><strong>NTPd:</strong> peer offsets, delay, jitter, selected peer</li>
<li><strong>Alerts:</strong> offset warning (&gt;10ms), critical (&gt;100ms), unsynchronised</li>
</ul>
</body>
</html>
EOF
    fi
}

# Run a minimal single-connection-at-a-time HTTP server on HTTP_PORT using nc.
# Globals read: HTTP_PORT
# Never returns; exits 1 if nc or the request FIFO is unavailable.
run_http_server() {
    echo "Starting NTP drift exporter on port $HTTP_PORT" >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # nc writes the client's request to this FIFO and the handler reads it
    # from there; a plain "{ read; } | nc" would read the script's own stdin
    # and never see the request line.
    local fifo_dir fifo
    if ! fifo_dir=$(mktemp -d) || ! mkfifo "$fifo_dir/request"; then
        echo "ERROR: Failed to create request FIFO" >&2
        exit 1
    fi
    fifo="$fifo_dir/request"
    # Expand the path now: the local is out of scope when the trap fires.
    # shellcheck disable=SC2064
    trap "rm -rf -- '$fifo_dir'" EXIT

    # Infinite loop accepting HTTP requests, one connection per iteration
    while true; do
        # If nc fails (e.g. port already in use) pause instead of spinning.
        _handle_http_request <"$fifo" |
            nc -l -p "$HTTP_PORT" -q 1 >"$fifo" 2>/dev/null || sleep 1
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Main entry point - routes to appropriate output mode.
# Globals read: HTTP_MODE, OUTPUT_FILE
main() {
    parse_args "$@"

    if [[ "$HTTP_MODE" == true ]]; then
        # Run HTTP server (blocks until killed)
        run_http_server
    elif [[ -n "$OUTPUT_FILE" ]]; then
        # Textfile collector mode: write atomically using temp file
        local output_dir
        output_dir="$(dirname "$OUTPUT_FILE")"
        mkdir -p "$output_dir"

        # Create temp file in SAME directory for atomic rename (same filesystem)
        local temp_file
        if ! temp_file=$(mktemp "${output_dir}/.ntp_drift_metrics.XXXXXX"); then
            echo "ERROR: Failed to create temp file in $output_dir" >&2
            exit 1
        fi

        # Generate metrics to temp file
        if ! generate_metrics >"$temp_file" 2>/dev/null; then
            rm -f "$temp_file"
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: file must have content
        local file_lines
        file_lines=$(wc -l <"$temp_file" 2>/dev/null || echo 0)
        if [[ "$file_lines" -lt 10 ]]; then
            rm -f "$temp_file"
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # Set permissions before move
        chmod 644 "$temp_file"

        # Atomic rename - no gap where file is missing
        mv -f "$temp_file" "$OUTPUT_FILE"

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}

# Execute main function with all script arguments
main "$@"