#!/bin/bash
################################################################################
# Script Name: systemd-boot-time-exporter.sh
# Version: 1.1
# Description: Prometheus textfile collector exporter for systemd boot timing
#              Exports boot phase durations, per-service startup times, and
#              total boot time using systemd-analyze
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Date: 2026-03-31
#
# Prerequisites:
#   - systemd-analyze command available
#   - node_exporter with textfile collector enabled
#   - /var/lib/node_exporter directory exists
#
# Usage:
#   ./systemd-boot-time-exporter.sh                    # Output to stdout
#   ./systemd-boot-time-exporter.sh --textfile         # Write to textfile collector
#   ./systemd-boot-time-exporter.sh -o /tmp/boot.prom  # Write to custom file
#   TOP_N=10 ./systemd-boot-time-exporter.sh
#   DEBUG=1 ./systemd-boot-time-exporter.sh
#
# Metrics Exported:
#   - linux_boot_time_seconds{phase}                - Boot phase durations
#   - linux_boot_total_seconds                      - Total boot time
#   - linux_boot_service_time_seconds{service}      - Per-service startup time
#   - linux_boot_timestamp                          - Unix timestamp of last boot
#   - linux_boot_services_total{state}              - Service count by state
#   - linux_boot_service_state_info{service,state}  - Per-service state
#   - linux_boot_exporter_duration_seconds          - Collection runtime
#
################################################################################

# A pipeline fails if any stage fails, not only the last one.
set -o pipefail

# ============================================================================
# CONFIGURATION
# ============================================================================

readonly VERSION="1.1"
readonly SCRIPT_NAME="${0##*/}"

# Directory used by --textfile; overridable from the environment.
readonly TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter}"

# TOP_N is handed to `head -n` to limit the blame list. A non-numeric value
# would make head fail and silently drop every per-service metric, so reject
# it here and fall back to the default. (log_error is defined later in the
# file, hence the direct printf.)
if [[ ! "${TOP_N:-20}" =~ ^[0-9]+$ ]]; then
  printf '[ERROR] %s\n' "Invalid TOP_N='${TOP_N}' (expected a non-negative integer), using 20" >&2
  TOP_N=20
fi
readonly TOP_N="${TOP_N:-20}"

# Runtime flags
OUTPUT_FILE=""     # empty => write metrics to stdout
DEBUG=${DEBUG:-}   # any non-empty value enables debug output on stderr

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
#######################################
# Print a debug message to stderr when DEBUG is non-empty.
# Globals:   DEBUG (read)
# Arguments: message words
#######################################
debug_echo() {
  if [[ -n "${DEBUG:-}" ]]; then
    printf '[DEBUG] %s\n' "$*" >&2
  fi
}

#######################################
# Print an error message to stderr.
# Arguments: message words
#######################################
log_error() {
  printf '[ERROR] %s\n' "$*" >&2
}

#######################################
# EXIT trap: remove the temporary output file, if any.
# Globals:   OUTPUT_FILE (read)
#######################################
cleanup() {
  # Without an output file there is no temp file; skipping avoids deleting an
  # unrelated ".<pid>" entry in the current directory.
  if [[ -n "${OUTPUT_FILE:-}" ]]; then
    rm -f -- "${OUTPUT_FILE}.$$" 2>/dev/null
  fi
}
trap cleanup EXIT

#######################################
# Print usage information and exit successfully.
# NOTE(review): the original help text was lost from this copy of the file;
# this text is rebuilt from the header comment and the options handled by
# main() - confirm wording against the upstream script.
# Globals:   SCRIPT_NAME, TEXTFILE_DIR (read)
#######################################
show_help() {
  cat <<EOF
Usage: ${SCRIPT_NAME:-${0##*/}} [OPTIONS]

Prometheus textfile collector exporter for systemd boot timing.

Options:
  --textfile           Write metrics to ${TEXTFILE_DIR:-/var/lib/node_exporter}/systemd_boot_time.prom
  -o, --output FILE    Write metrics to FILE
  --debug              Print debug messages to stderr
  -h, --help           Show this help and exit
  -v, --version        Show version and exit

Environment:
  TEXTFILE_DIR         Textfile collector directory (default: /var/lib/node_exporter)
  TOP_N                Number of slowest units to export (default: 20)
  DEBUG                Any non-empty value enables debug output
EOF
  exit 0
}

#######################################
# Print the script version and exit successfully.
# NOTE(review): reconstructed - the original body was lost from this copy.
# Globals:   SCRIPT_NAME, VERSION (read)
#######################################
show_version() {
  printf '%s version %s\n' "${SCRIPT_NAME:-${0##*/}}" "${VERSION:-unknown}"
  exit 0
}

#######################################
# Convert a systemd time span ("1min 2.345s", "345ms", "2.5s", "12us") to
# seconds with millisecond precision.
# NOTE(review): reconstructed - the original body was lost from this copy;
# the output format (three decimals) is an assumption, confirm it matches
# what downstream dashboards expect.
# Arguments: $1 - time span, one or more whitespace-separated tokens
# Outputs:   seconds as "<int>.<3 digits>" on stdout ("0.000" if nothing parsed)
#######################################
parse_time_value() {
  local token_re='^([0-9]+)(\.([0-9]+))?(us|μs|ms|s|min|h|d)?$'
  local -a tokens=()
  local token whole frac unit
  local -i scale=0 total_us=0 total_ms=0

  read -r -a tokens <<< "${1:-}"
  for token in "${tokens[@]}"; do
    [[ "$token" =~ $token_re ]] || continue
    whole=${BASH_REMATCH[1]}
    # Right-pad the fraction to exactly six digits so it can be scaled with
    # integer arithmetic only.
    frac="${BASH_REMATCH[3]}000000"
    frac=${frac:0:6}
    unit=${BASH_REMATCH[4]:-s} # a bare number is taken as seconds
    case "$unit" in
      us | μs) scale=1 ;;
      ms) scale=1000 ;;
      s) scale=1000000 ;;
      min) scale=60000000 ;;
      h) scale=3600000000 ;;
      d) scale=86400000000 ;;
    esac
    # 10# forces base 10 so that leading zeros are not read as octal.
    total_us=$((total_us + 10#$whole * scale + 10#$frac * scale / 1000000))
  done

  total_ms=$(((total_us + 500) / 1000))
  printf '%d.%03d\n' "$((total_ms / 1000))" "$((total_ms % 1000))"
}

#######################################
# Escape a string for use as a Prometheus label value: backslash, double
# quote and newline are backslash-escaped.
# Arguments: $1 - raw label value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
escape_label_value() {
  local value=${1:-}
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

#######################################
# Emit boot phase durations and the total boot time parsed from the output
# of `systemd-analyze`.
# NOTE(review): the first statements (up to the systemd-analyze call) were
# lost from this copy and are rebuilt from the surviving remainder.
# Outputs:   linux_boot_time_seconds{phase} and linux_boot_total_seconds
# Returns:   1 when systemd-analyze produced no output, 0 otherwise
#######################################
collect_boot_phases() {
  local analyze_output
  analyze_output=$(systemd-analyze 2>/dev/null) || true

  if [[ -z "$analyze_output" ]]; then
    log_error "systemd-analyze returned no output"
    return 1
  fi
  debug_echo "systemd-analyze output: $analyze_output"

  # Matches spans such as "2.345s", "345ms" or "1min 23.456s".
  local span_re='([0-9]+(\.[0-9]+)?(min )?[0-9]*(\.[0-9]+)?m?s?)'
  local phase phase_re val

  printf '%s\n' \
    "# HELP linux_boot_time_seconds Duration of each boot phase in seconds" \
    "# TYPE linux_boot_time_seconds gauge"

  for phase in firmware loader kernel initrd userspace; do
    val=0 # phases absent from the output are reported as 0
    phase_re="${span_re} \\(${phase}\\)"
    if [[ "$analyze_output" =~ $phase_re ]]; then
      val=$(parse_time_value "${BASH_REMATCH[1]}")
    fi
    debug_echo "${phase} phase: ${val}s"
    printf 'linux_boot_time_seconds{phase="%s"} %s\n' "$phase" "$val"
  done

  printf '\n%s\n' "# HELP linux_boot_total_seconds Total boot time in seconds"
  printf '%s\n' "# TYPE linux_boot_total_seconds gauge"

  local total_time=0
  local total_re="= ${span_re}"
  if [[ "$analyze_output" =~ $total_re ]]; then
    total_time=$(parse_time_value "${BASH_REMATCH[1]}")
  fi
  debug_echo "Total boot time: ${total_time}s"
  printf 'linux_boot_total_seconds %s\n' "$total_time"
}

#######################################
# Emit the startup time of the first TOP_N entries of `systemd-analyze blame`.
# Globals:   TOP_N (read)
# Outputs:   linux_boot_service_time_seconds{service}
# Returns:   0 (no output at all when blame prints nothing)
#######################################
collect_service_times() {
  local blame_output
  # head may close the pipe early; the resulting pipeline failure is expected.
  blame_output=$(systemd-analyze blame 2>/dev/null | head -n "${TOP_N:-20}") || true

  if [[ -z "$blame_output" ]]; then
    debug_echo "systemd-analyze blame returned no output"
    return 0
  fi

  printf '%s\n' \
    "# HELP linux_boot_service_time_seconds Time taken by each systemd service to start" \
    "# TYPE linux_boot_service_time_seconds gauge"

  local line time_str service_name seconds
  local -a fields=()
  local -i nfields=0
  while IFS= read -r line; do
    read -r -a fields <<< "$line"
    nfields=${#fields[@]}
    ((nfields >= 2)) || continue
    # The unit name is the last field; everything before it is the time
    # span, which may itself contain spaces ("1min 2.345s").
    service_name=${fields[nfields - 1]}
    time_str="${fields[*]:0:nfields-1}"
    seconds=$(parse_time_value "$time_str")
    debug_echo "Service $service_name: ${seconds}s"
    printf 'linux_boot_service_time_seconds{service="%s"} %s\n' \
      "$(escape_label_value "$service_name")" "$seconds"
  done <<< "$blame_output"
}

#######################################
# Emit the Unix timestamp of the last boot. Tries `who -b` first, then the
# btime field of /proc/stat, and reports 0 when neither yields a number.
# Outputs:   linux_boot_timestamp
#######################################
collect_boot_timestamp() {
  local boot_ts="" epoch=""

  printf '%s\n' \
    "# HELP linux_boot_timestamp Unix timestamp of last boot" \
    "# TYPE linux_boot_timestamp gauge"

  # Fields 3 and 4 of `who -b` are taken as date and time.
  boot_ts=$(who -b 2>/dev/null | awk 'NF >= 4 {print $3, $4}') || true
  if [[ -n "$boot_ts" ]]; then
    epoch=$(date -d "$boot_ts" +%s 2>/dev/null) || true
    if [[ "$epoch" =~ ^[0-9]+$ ]]; then
      debug_echo "Boot timestamp: $boot_ts (epoch: $epoch)"
    fi
  fi

  # Fall back to /proc/stat when who gave nothing OR its output could not be
  # converted, instead of reporting 0 straight away.
  if [[ ! "$epoch" =~ ^[0-9]+$ ]]; then
    epoch=$(awk '/^btime/ {print $2}' /proc/stat 2>/dev/null) || true
    if [[ "$epoch" =~ ^[0-9]+$ ]]; then
      debug_echo "Boot timestamp from /proc/stat: $epoch"
    else
      epoch=0
    fi
  fi

  printf 'linux_boot_timestamp %s\n' "$epoch"
}

#######################################
# Print the service units that `systemctl list-units` reports for one state,
# one unit name per line.
# Arguments: $1 - unit state (e.g. active, inactive, failed)
#######################################
list_service_units() {
  local wanted_state=$1 line field
  local -a fields=()
  while IFS= read -r line; do
    read -r -a fields <<< "$line"
    # The unit name is not always the first column (a marker may precede
    # it), so take the first field ending in ".service".
    for field in "${fields[@]}"; do
      if [[ "$field" == *.service ]]; then
        printf '%s\n' "$field"
        break
      fi
    done
  done < <(systemctl list-units --type=service --state="$wanted_state" --no-legend 2>/dev/null)
}

#######################################
# Emit the number of service units per state and one info series per unit.
# Each state is listed once, so counts and info series always agree.
# Outputs:   linux_boot_services_total{state},
#            linux_boot_service_state_info{service,state}
#######################################
collect_service_state_counts() {
  local state unit info_lines=""
  local -i count=0 active=0 inactive=0 failed=0

  for state in failed active inactive; do
    count=0
    while IFS= read -r unit; do
      [[ -n "$unit" ]] || continue
      count=$((count + 1))
      info_lines+="linux_boot_service_state_info{service=\"$(escape_label_value "$unit")\",state=\"${state}\"} 1"$'\n'
    done < <(list_service_units "$state")
    case "$state" in
      failed) failed=$count ;;
      active) active=$count ;;
      inactive) inactive=$count ;;
    esac
  done

  debug_echo "Service states — active: $active, inactive: $inactive, failed: $failed"

  printf '%s\n' \
    "# HELP linux_boot_services_total Count of services by activation state at boot" \
    "# TYPE linux_boot_services_total gauge"
  printf 'linux_boot_services_total{state="%s"} %s\n' \
    active "$active" inactive "$inactive" failed "$failed"

  printf '\n%s\n' "# HELP linux_boot_service_state_info Service state information"
  printf '%s\n' "# TYPE linux_boot_service_state_info gauge"
  printf '%s' "$info_lines"
}

# ============================================================================
# METRICS COLLECTION
# ============================================================================

#######################################
# Run every collector, separated by blank lines, and append the time the
# collection itself took.
# Globals:   DEBUG (read)
# Outputs:   the complete metrics text on stdout; with DEBUG set, the
#            critical-chain output is echoed to stderr for reference
#######################################
collect_metrics() {
  local start_time end_time duration
  start_time=$(date +%s%N)

  # Collectors are best-effort: a failing one must not stop the others.
  collect_boot_phases
  echo
  collect_service_times
  echo
  collect_boot_timestamp
  echo
  collect_service_state_counts

  end_time=$(date +%s%N)
  duration=$(awk -v s="$start_time" -v e="$end_time" 'BEGIN {printf "%.4f", (e - s) / 1000000000}')

  echo
  echo "# HELP linux_boot_exporter_duration_seconds Time taken to collect all metrics"
  echo "# TYPE linux_boot_exporter_duration_seconds gauge"
  echo "linux_boot_exporter_duration_seconds ${duration}"

  if [[ -n "${DEBUG:-}" ]]; then
    debug_echo "--- critical-chain output (for reference) ---"
    systemd-analyze critical-chain 2>/dev/null | while IFS= read -r line; do
      debug_echo "  $line"
    done
  fi
}
# ============================================================================
# OUTPUT
# ============================================================================

#######################################
# Collect all metrics and write them to OUTPUT_FILE, or to stdout when
# OUTPUT_FILE is empty. The file is written to "<OUTPUT_FILE>.<pid>" in the
# same directory and then renamed, so a reader never sees a partial file.
# Globals:   OUTPUT_FILE (read)
# Outputs:   metrics on stdout, or a confirmation message on stderr
# Returns:   exits 1 when the target directory is missing or a write fails
#######################################
write_metrics() {
  local metrics
  metrics=$(collect_metrics)

  if [[ -z "$OUTPUT_FILE" ]]; then
    printf '%s\n' "$metrics"
    return 0
  fi

  local output_dir
  output_dir=$(dirname -- "$OUTPUT_FILE")
  if [[ ! -d "$output_dir" ]]; then
    log_error "Directory does not exist: $output_dir"
    exit 1
  fi

  local temp_file="${OUTPUT_FILE}.$$"
  # Any failing step aborts: reporting success after a failed write would
  # leave a stale or missing metrics file unnoticed.
  if ! printf '%s\n' "$metrics" > "$temp_file" \
    || ! chmod 644 -- "$temp_file" \
    || ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
    log_error "Failed to write metrics to $OUTPUT_FILE"
    exit 1
  fi
  echo "Metrics written to $OUTPUT_FILE" >&2
}

# ============================================================================
# MAIN
# ============================================================================

#######################################
# Parse command-line options, check that systemd-analyze is available and
# write the metrics.
# Globals:   OUTPUT_FILE, DEBUG (written), TEXTFILE_DIR (read)
# Arguments: command-line arguments of the script
# Returns:   exits 1 on an unknown option, a missing option value or a
#            missing systemd-analyze
#######################################
main() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --textfile)
        OUTPUT_FILE="${TEXTFILE_DIR}/systemd_boot_time.prom"
        shift
        ;;
      -o | --output)
        # Without this check a trailing "-o" makes `shift 2` fail without
        # shifting anything, and the loop would never terminate.
        if [[ $# -lt 2 || -z "${2:-}" ]]; then
          log_error "Option $1 requires a file argument"
          echo "Use --help for usage information" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      --debug)
        DEBUG=1
        shift
        ;;
      --help | -h)
        show_help
        # Only reached if show_help returns instead of exiting; stop here
        # rather than re-processing the same argument forever.
        exit 0
        ;;
      --version | -v)
        show_version
        exit 0
        ;;
      *)
        log_error "Unknown option: $1"
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done

  if ! command -v systemd-analyze &>/dev/null; then
    log_error "systemd-analyze not found — this script requires systemd"
    exit 1
  fi

  write_metrics
}

main "$@"