#!/usr/bin/env bash
#########################################################################################
#### triage.sh — Rapid 60-second diagnostic for misbehaving Linux servers            ####
#### Top CPU/memory consumers, disk pressure, OOM kills, failed services,            ####
#### recent errors, network states, load averages, and I/O wait                      ####
#### No dependencies beyond coreutils and standard Linux tools                       ####
####                                                                                 ####
#### Author:  Phil Connor                                                            ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.00                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   ./triage.sh                                                                   ####
####   ./triage.sh --no-color                                                        ####
####   ./triage.sh --section load,disk,oom                                           ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Row limit for the "top consumers" tables. It is seeded from LINES (the
# historical override for this script) but copied into TOP_N right away,
# before any external command runs: bash itself rewrites LINES with the
# terminal height when `checkwinsize` is active, which would silently
# replace the configured limit.
TOP_N="${LINES:-10}"
SECTIONS="${SECTIONS:-all}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
ERRORS="${ERRORS:-25}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="${0##*/}"
readonly SCRIPT_NAME

# ── Colors ────────────────────────────────────────────────────────────
# Held as literal backslash sequences; they are only ever emitted through
# `echo -e` or printf's %b, which turn them into real escape codes.
RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""

#######################################
# Populate the color globals according to COLOR (never | always | auto).
# In auto mode colors are enabled only when stdout is a terminal.
# Globals: COLOR (read); RED GREEN YELLOW BLUE CYAN BOLD DIM RESET (written)
#######################################
setup_colors() {
  RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""
  if [[ "$COLOR" == "never" ]]; then
    return 0
  fi
  if [[ "$COLOR" == "always" || -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Print an informational message to stdout.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }

# Print a debug message to stdout, only when VERBOSE is "true".
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    echo -e "${DIM}[DEBUG]${RESET} $*"
  fi
}

# ── Helpers ───────────────────────────────────────────────────────────
# Succeed when $1 is a non-empty string of decimal digits.
is_uint() { [[ "${1:-}" =~ ^[0-9]+$ ]]; }

# Succeed when $1 is an unsigned decimal number such as "7" or "7.25".
is_number() { [[ "${1:-}" =~ ^[0-9]+([.][0-9]+)?$ ]]; }

#######################################
# Choose a color for an integer reading.
# Arguments: $1 value, $2 warn threshold, $3 critical threshold,
#            $4 color to use below the warn threshold (default: none)
# Outputs:   RED at/above critical, YELLOW at/above warn, otherwise $4.
#            Non-integer values yield $4 instead of an arithmetic error.
#######################################
threshold_color() {
  local value="${1:-}" warn="$2" crit="$3" base="${4:-}"
  if ! is_uint "$value"; then
    printf '%s' "$base"
  elif (( 10#$value >= crit )); then
    printf '%s' "$RED"
  elif (( 10#$value >= warn )); then
    printf '%s' "$YELLOW"
  else
    printf '%s' "$base"
  fi
}

# Print a section title ($1) surrounded by blank lines.
section_header() {
  echo ""
  echo -e " ${BOLD}${CYAN}── $1 ──${RESET}"
  echo ""
}

# Print a "label  value" row; the value ($2) is printed verbatim.
field() {
  printf ' %b%-22s%b %s\n' "$BOLD" "$1" "$RESET" "$2"
}

# Like field(), but escape sequences inside the value ($2) are interpreted,
# so the value may carry color codes.
field_color() {
  printf ' %b%-22s%b %b\n' "$BOLD" "$1" "$RESET" "$2"
}

#######################################
# Render a byte count with a binary unit (B, KiB, MiB, GiB).
# Arguments: $1 - non-negative integer number of bytes
# Outputs:   e.g. "512 B", "1.5 KiB", "1 MiB", "1.0 GiB" (no newline
#            except for the plain-bytes case, as before)
#######################################
human_bytes() {
  local bytes="${1:-0}"
  is_uint "$bytes" || bytes=0
  # Values go in through -v rather than being spliced into the awk program;
  # LC_ALL=C keeps "." as the decimal separator whatever the locale.
  if (( bytes >= 1073741824 )); then
    LC_ALL=C awk -v b="$bytes" 'BEGIN { printf "%.1f GiB", b / 1073741824 }'
  elif (( bytes >= 1048576 )); then
    LC_ALL=C awk -v b="$bytes" 'BEGIN { printf "%.0f MiB", b / 1048576 }'
  elif (( bytes >= 1024 )); then
    LC_ALL=C awk -v b="$bytes" 'BEGIN { printf "%.1f KiB", b / 1024 }'
  else
    echo "${bytes} B"
  fi
}

# Succeed when section $1 is selected: SECTIONS is "all" or its
# comma-separated list contains $1.
should_show() {
  [[ "$SECTIONS" == "all" || ",$SECTIONS," == *",$1,"* ]]
}

# ══════════════════════════════════════════════════════════════════════
# LOAD & CPU
# ══════════════════════════════════════════════════════════════════════
#######################################
# Report load averages, logical CPU count, load per CPU, I/O wait and,
# when mpstat is installed, a one-second CPU usage sample.
#######################################
show_load() {
  section_header "Load & CPU"

  local load1="" load5="" load15="" cpus=""
  if [[ -r /proc/loadavg ]]; then
    read -r load1 load5 load15 _ < /proc/loadavg || true
  else
    # Fallback: parse `uptime` once. The separator pattern accepts both
    # "1.0, 2.0, 3.0" and the space-separated form some systems print.
    local averages=""
    averages=$(uptime 2>/dev/null \
      | awk -F'load averages?: *' '{ split($2, a, /,? +/); print a[1], a[2], a[3] }') || true
    read -r load1 load5 load15 _ <<<"$averages" || true
  fi

  cpus=$(nproc 2>/dev/null || grep -c "^processor" /proc/cpuinfo 2>/dev/null || echo 1)
  # `grep -c` prints 0 *and* fails on no match, so the capture can end up as
  # "0<newline>1"; anything that is not a positive integer falls back to 1.
  if ! is_uint "$cpus" || (( 10#$cpus < 1 )); then
    cpus=1
  fi

  field "Load average (1m):" "${load1:-N/A}"
  field "Load average (5m):" "${load5:-N/A}"
  field "Load average (15m):" "${load15:-N/A}"
  field "Logical CPUs:" "$cpus"

  if is_number "$load1"; then
    local load_per_cpu
    load_per_cpu=$(LC_ALL=C awk -v l="$load1" -v c="$cpus" 'BEGIN { printf "%.2f", l / c }')
    field "Load per CPU (1m):" "$load_per_cpu"
  else
    field "Load per CPU (1m):" "N/A"
  fi

  # One mpstat sample serves both I/O wait and CPU usage; it blocks for a
  # second, so it is taken once. LC_ALL=C keeps decimals dot-separated.
  local iowait="" idle_pct=""
  if command -v mpstat &>/dev/null; then
    verbose "Using mpstat for I/O wait"
    local mp_avg=""
    mp_avg=$(LC_ALL=C mpstat 1 1 2>/dev/null | tail -n 1) || true
    # Last line layout: "Average: all %usr %nice %sys %iowait ... %idle"
    iowait=$(awk '{print $6}' <<<"$mp_avg") || true
    idle_pct=$(awk '{print $NF}' <<<"$mp_avg") || true
    is_number "$iowait" || iowait=""
    is_number "$idle_pct" || idle_pct=""
  fi

  if [[ -z "$iowait" && -r /proc/stat ]]; then
    verbose "Using /proc/stat for I/O wait"
    # First line: "cpu user nice system idle iowait ..." (counters since boot)
    local user="" nice="" sys="" idle="" io_wait=""
    read -r _ user nice sys idle io_wait _ < /proc/stat || true
    if is_uint "$user" && is_uint "$nice" && is_uint "$sys" \
      && is_uint "$idle" && is_uint "$io_wait"; then
      local total=$(( user + nice + sys + idle + io_wait ))
      if (( total > 0 )); then
        iowait=$(LC_ALL=C awk -v w="$io_wait" -v t="$total" 'BEGIN { printf "%.1f", w * 100 / t }')
      fi
    fi
  fi

  if [[ -n "$iowait" ]]; then
    local iow_color
    iow_color=$(threshold_color "${iowait%.*}" 10 20 "$GREEN")
    field_color "I/O wait:" "${iow_color}${iowait}%${RESET}"
  else
    field "I/O wait:" "N/A"
  fi

  # CPU usage snapshot (only available from the mpstat sample)
  if [[ -n "$idle_pct" ]]; then
    local cpu_used cpu_color
    cpu_used=$(LC_ALL=C awk -v i="$idle_pct" 'BEGIN { printf "%.1f", 100 - i }')
    cpu_color=$(threshold_color "${cpu_used%.*}" 75 90 "$GREEN")
    field_color "CPU usage:" "${cpu_color}${cpu_used}%${RESET}"
  fi
}

# ══════════════════════════════════════════════════════════════════════
# TOP CPU CONSUMERS
# ══════════════════════════════════════════════════════════════════════
#######################################
# Print the TOP_N processes by CPU usage; CPU% is colored at >=50 / >=80.
# Globals: TOP_N (read)
#######################################
show_cpu() {
  section_header "Top CPU Consumers"

  printf ' %b%-8s %-12s %6s %6s %-12s %s%b\n' \
    "$BOLD" "PID" "USER" "CPU%" "MEM%" "TIME" "COMMAND" "$RESET"
  printf ' %s\n' "$(printf '%.0s─' {1..58})"

  # Capture the full listing instead of piping into `head`: with pipefail,
  # ps being cut off by SIGPIPE on a host with many processes would make
  # the pipeline fail and abort the script.
  local ps_out=""
  ps_out=$(ps -eo pid,user,pcpu,pmem,time,comm --sort=-%cpu --no-headers 2>/dev/null) || true
  if [[ -z "$ps_out" ]]; then
    echo " (process list unavailable)"
    return 0
  fi

  local pid user cpu mem cputime cmd color shown=0
  while read -r pid user cpu mem cputime cmd; do
    (( shown < TOP_N )) || break
    shown=$(( shown + 1 ))

    color=$(threshold_color "${cpu%.*}" 50 80)
    if [[ -n "$color" ]]; then
      printf ' %-8s %-12s %b%6s%b %6s %-12s %s\n' \
        "$pid" "$user" "$color" "$cpu" "$RESET" "$mem" "$cputime" "$cmd"
    else
      printf ' %-8s %-12s %6s %6s %-12s %s\n' \
        "$pid" "$user" "$cpu" "$mem" "$cputime" "$cmd"
    fi
  done <<<"$ps_out"
}

# ══════════════════════════════════════════════════════════════════════
# TOP MEMORY CONSUMERS
# ══════════════════════════════════════════════════════════════════════
#######################################
# Print the TOP_N processes by resident set size.
# Globals: TOP_N (read)
#######################################
show_memory() {
  section_header "Top Memory Consumers"

  printf ' %b%-8s %-12s %10s %6s %s%b\n' \
    "$BOLD" "PID" "USER" "RSS" "MEM%" "COMMAND" "$RESET"
  printf ' %s\n' "$(printf '%.0s─' {1..58})"

  # Same reasoning as show_cpu: no early-closing reader behind ps.
  local ps_out=""
  ps_out=$(ps -eo pid,user,rss,pmem,comm --sort=-rss --no-headers 2>/dev/null) || true
  if [[ -z "$ps_out" ]]; then
    echo " (process list unavailable)"
    return 0
  fi

  local pid user rss_kb mem cmd rss_human shown=0
  while read -r pid user rss_kb mem cmd; do
    (( shown < TOP_N )) || break
    shown=$(( shown + 1 ))

    is_uint "$rss_kb" || rss_kb=0
    rss_human=$(human_bytes "$(( rss_kb * 1024 ))")   # ps reports RSS in KiB
    printf ' %-8s %-12s %10s %6s %s\n' \
      "$pid" "$user" "$rss_human" "$mem" "$cmd"
  done <<<"$ps_out"
}

# ══════════════════════════════════════════════════════════════════════
# DISK PRESSURE
# ══════════════════════════════════════════════════════════════════════
#######################################
# List filesystems at or above 80% space usage (red from 90%), then the
# inode usage of every filesystem that reports it (yellow >=75, red >=90).
# tmpfs, devtmpfs and overlay filesystems are excluded.
#######################################
show_disk() {
  section_header "Disk Pressure"

  printf ' %b%-24s %8s %8s %8s %6s%b\n' \
    "$BOLD" "FILESYSTEM" "SIZE" "USED" "AVAIL" "USE%" "$RESET"
  printf ' %s\n' "$(printf '%.0s─' {1..58})"

  # The mount point is requested as the LAST column so that `read` puts the
  # remainder of the line into it, keeping paths that contain spaces intact.
  # df exits non-zero when any single mount cannot be inspected; the rows it
  # did print are still useful, hence the `|| true`.
  local df_out=""
  df_out=$(df -h --output=size,used,avail,pcent,target \
    -x tmpfs -x devtmpfs -x overlay 2>/dev/null \
    | tail -n +2 | LC_ALL=C sort -b -k5) || true

  if [[ -z "$df_out" ]]; then
    echo " (no filesystem data available from df)"
  else
    local size used avail pct mount color flagged=0
    while read -r size used avail pct mount; do
      pct=${pct%\%}
      is_uint "$pct" || continue          # "-" or blank: nothing to compare
      (( 10#$pct >= 80 )) || continue
      flagged=$(( flagged + 1 ))
      color=$(threshold_color "$pct" 80 90)
      printf ' %-24s %8s %8s %8s %b%5s%%%b\n' \
        "$mount" "$size" "$used" "$avail" "$color" "$pct" "$RESET"
    done <<<"$df_out"

    if (( flagged == 0 )); then
      echo -e " ${GREEN}All filesystems below 80%${RESET} ✓"
    fi
  fi

  # Inode pressure for mounted filesystems
  echo ""
  local inode_out=""
  inode_out=$(df -i --output=ipcent,target \
    -x tmpfs -x devtmpfs -x overlay 2>/dev/null \
    | tail -n +2 | LC_ALL=C sort -b -k2) || true

  local ipct imount icolor
  while read -r ipct imount; do
    ipct=${ipct%\%}
    # Filesystems without inode accounting report "-"; skip them.
    is_uint "$ipct" || continue
    icolor=$(threshold_color "$ipct" 75 90 "$GREEN")
    field_color "Inode usage (${imount}):" "${icolor}${ipct}%${RESET}"
  done <<<"$inode_out"
}

# ══════════════════════════════════════════════════════════════════════
# OOM KILLS
# ══════════════════════════════════════════════════════════════════════
#######################################
# Show kernel OOM-killer activity. dmesg is consulted first; journalctl -k
# is used when dmesg yields nothing (for instance when it is restricted).
# Only one source is used: the two print the same kernel messages with
# different timestamp formats, so merging them would list events twice.
# Lines stay in the order the source printed them; the last 20 are shown.
#######################################
show_oom() {
  section_header "OOM Kills"

  local pattern='out of memory|oom-killer|killed process'
  local oom_lines=""

  if command -v dmesg &>/dev/null; then
    verbose "Checking dmesg for OOM kills"
    oom_lines=$(dmesg 2>/dev/null | grep -Ei "$pattern") || true
  fi

  if [[ -z "$oom_lines" ]] && command -v journalctl &>/dev/null; then
    verbose "Checking journalctl -k for OOM kills"
    oom_lines=$(journalctl -k --no-pager -q 2>/dev/null | grep -Ei "$pattern") || true
  fi

  if [[ -z "$oom_lines" ]]; then
    echo -e " ${GREEN}No recent OOM kills found${RESET} ✓"
    return 0
  fi

  # One kill produces several matching lines; count the "Killed process"
  # records, and fall back to the number of matching lines if none exist.
  local events=""
  events=$(grep -Eic 'killed process' <<<"$oom_lines") || true
  if ! is_uint "$events" || (( events == 0 )); then
    events=$(( $(grep -c '' <<<"$oom_lines") ))
  fi
  field_color "OOM events:" "${RED}${events}${RESET}"
  echo ""

  local line
  while IFS= read -r line; do
    printf ' %b✗%b %s\n' "$RED" "$RESET" "$line"
  done < <(tail -n 20 <<<"$oom_lines")
}

# ══════════════════════════════════════════════════════════════════════
# FAILED SERVICES
# ══════════════════════════════════════════════════════════════════════
#######################################
# List systemd units in the failed state.
#######################################
show_services() {
  section_header "Failed Services"

  if ! command -v systemctl &>/dev/null; then
    field "Status:" "systemd not available"
    return 0
  fi

  local raw="" rc=0
  raw=$(systemctl --no-legend --no-pager --state=failed 2>/dev/null) || rc=$?
  if (( rc != 0 )) && [[ -z "$raw" ]]; then
    field "Status:" "systemctl could not query units"
    return 0
  fi

  # systemctl may prefix each row with a status glyph (●, ×, *), so the
  # unit is not always the first column. Unit names always carry a type
  # suffix (".service", ".mount", ...): take the first token with a dot.
  local failed_units=""
  failed_units=$(awk 'NF { for (i = 1; i <= NF; i++) if ($i ~ /[.]/) { print $i; break } }' \
    <<<"$raw") || true

  if [[ -z "$failed_units" ]]; then
    field_color "Failed services:" "${GREEN}0${RESET} ✓"
    return 0
  fi

  local failed_count unit
  failed_count=$(( $(grep -c '' <<<"$failed_units") ))
  field_color "Failed services:" "${RED}${failed_count}${RESET}"
  echo ""
  while IFS= read -r unit; do
    printf ' %b✗%b %s\n' "$RED" "$RESET" "$unit"
  done <<<"$failed_units"
}

# ══════════════════════════════════════════════════════════════════════
# RECENT ERRORS
# ══════════════════════════════════════════════════════════════════════
#######################################
# Print the last ERRORS journal entries of priority 3 (err) or more severe.
# Globals: ERRORS (read)
#######################################
show_errors() {
  section_header "Recent Errors (last ${ERRORS})"

  if ! command -v journalctl &>/dev/null; then
    field "Status:" "journalctl not available"
    return 0
  fi

  local error_output=""
  error_output=$(journalctl --no-pager -q -p 3 -n "$ERRORS" 2>/dev/null) || true

  if [[ -z "$error_output" ]]; then
    echo -e " ${GREEN}No recent priority ≤ 3 messages${RESET} ✓"
    return 0
  fi

  verbose "Found $(( $(grep -c '' <<<"$error_output") )) error entries"
  local line
  while IFS= read -r line; do
    printf ' %s\n' "$line"
  done <<<"$error_output"
}

# ══════════════════════════════════════════════════════════════════════
# NETWORK STATES
# ══════════════════════════════════════════════════════════════════════
#######################################
# Summarize TCP sockets per state (via ss, else netstat) and warn about
# large CLOSE_WAIT (>=10, red >=50) and TIME_WAIT (>=500, red >=2000) counts.
#######################################
show_network() {
  section_header "Network States"

  local tcp_states=""
  if command -v ss &>/dev/null; then
    verbose "Using ss for TCP state summary"
    tcp_states=$(ss -tan 2>/dev/null | tail -n +2 | awk '{print $1}' \
      | sort | uniq -c | sort -rn) || true
  elif command -v netstat &>/dev/null; then
    verbose "Falling back to netstat for TCP state summary"
    tcp_states=$(netstat -tan 2>/dev/null | tail -n +3 | awk '{print $6}' \
      | sort | uniq -c | sort -rn) || true
  fi

  if [[ -z "$tcp_states" ]]; then
    field "Status:" "No TCP connection data available"
    return 0
  fi

  printf ' %b%-18s %s%b\n' "$BOLD" "STATE" "COUNT" "$RESET"
  printf ' %s\n' "$(printf '%.0s─' {1..26})"

  # The loop reads from a here-string (no pipeline subshell), so the two
  # counters it records are still set when the warnings are printed below.
  local count state color close_wait=0 time_wait=0
  while read -r count state; do
    [[ -n "$state" ]] || continue
    is_uint "$count" || continue

    color=""
    case "$state" in
      CLOSE-WAIT | CLOSE_WAIT)
        close_wait=$count
        color=$(threshold_color "$count" 10 50)
        ;;
      TIME-WAIT | TIME_WAIT)
        time_wait=$count
        color=$(threshold_color "$count" 500 2000)
        ;;
    esac

    if [[ -n "$color" ]]; then
      printf ' %-18s %b%s%b\n' "$state" "$color" "$count" "$RESET"
    else
      printf ' %-18s %s\n' "$state" "$count"
    fi
  done <<<"$tcp_states"

  # Warnings for notable states
  echo ""
  if (( close_wait >= 10 )); then
    echo -e " ${YELLOW}CLOSE_WAIT: ${close_wait} — possible application not closing connections${RESET}"
  fi
  if (( time_wait >= 500 )); then
    echo -e " ${YELLOW}TIME_WAIT: ${time_wait} — high volume; consider net.ipv4.tcp_tw_reuse${RESET}"
  fi
}

# ══════════════════════════════════════════════════════════════════════
# USAGE
# ══════════════════════════════════════════════════════════════════════
# NOTE(review): the help text and most of the option parser were lost in
# the copy of this file that was reviewed. They are rebuilt here from the
# usage examples in the header, the default variables and the surviving
# "unknown option" branch. --section, --no-color and --help are confirmed
# by the header; the remaining option names are assumptions — verify them
# against the upstream script.
usage() {
  cat <<EOF
Usage: ${SCRIPT_NAME} [OPTIONS]

Rapid diagnostic for misbehaving Linux servers.

Options:
  -s, --section LIST   Comma-separated sections to show (default: all)
                       load,cpu,memory,disk,oom,services,errors,network
  -n, --lines N        Rows in the top CPU/memory tables (default: ${TOP_N})
  -e, --errors N       Recent journal errors to show (default: ${ERRORS})
  -v, --verbose        Print debug messages
      --color          Force colored output
      --no-color       Disable colored output
  -h, --help           Show this help and exit

Environment:
  SECTIONS, LINES, ERRORS, VERBOSE and COLOR supply the defaults.
EOF
}

# ══════════════════════════════════════════════════════════════════════
# ARGUMENT PARSING
# ══════════════════════════════════════════════════════════════════════
# Exit with an error unless option $1 is followed by a non-empty value.
require_value() {
  if (( $# < 2 )) || [[ -z "$2" ]]; then
    echo "Option $1 requires a value" >&2
    echo "Run ${SCRIPT_NAME} --help for usage" >&2
    exit 1
  fi
}

# Exit with an error unless $2 (the value given for option $1) is an integer.
require_uint() {
  if ! is_uint "$2"; then
    echo "Option $1 expects a non-negative integer, got: $2" >&2
    exit 1
  fi
}

#######################################
# Apply command-line options to the configuration globals.
# Globals:   SECTIONS TOP_N ERRORS VERBOSE COLOR (written)
# Arguments: the script's command-line arguments
# Exits:     0 after --help; 1 on an unknown option or a bad value
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -s | --section | --sections)
        require_value "$@"
        SECTIONS="$2"
        shift 2
        ;;
      --section=* | --sections=*)
        SECTIONS="${1#*=}"
        shift
        ;;
      -n | --lines)
        require_value "$@"
        require_uint "$1" "$2"
        TOP_N="$2"
        shift 2
        ;;
      --lines=*)
        require_uint "--lines" "${1#*=}"
        TOP_N="${1#*=}"
        shift
        ;;
      -e | --errors)
        require_value "$@"
        require_uint "$1" "$2"
        ERRORS="$2"
        shift 2
        ;;
      --errors=*)
        require_uint "--errors" "${1#*=}"
        ERRORS="${1#*=}"
        shift
        ;;
      -v | --verbose)
        VERBOSE="true"
        shift
        ;;
      --color)
        COLOR="always"
        shift
        ;;
      --no-color)
        COLOR="never"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
        ;;
    esac
  done
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════
#######################################
# Entry point: parse options, print the banner, run the selected sections.
#######################################
main() {
  parse_args "$@"

  # Values inherited from the environment bypass parse_args' checks.
  if ! is_uint "$TOP_N"; then
    echo "Ignoring non-numeric row limit '${TOP_N}'; using 10" >&2
    TOP_N=10
  fi
  if ! is_uint "$ERRORS"; then
    echo "Ignoring non-numeric error count '${ERRORS}'; using 25" >&2
    ERRORS=25
  fi
  SECTIONS="${SECTIONS// /}"   # tolerate "load, disk"

  setup_colors

  echo ""
  echo -e "${BOLD}Triage — $(hostname -f 2>/dev/null || hostname 2>/dev/null || echo unknown)${RESET}"
  echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}"

  local name
  for name in load cpu memory disk oom services errors network; do
    if should_show "$name"; then
      # A failing section must not end the whole report: on a broken host
      # the remaining sections are exactly what the operator needs.
      "show_${name}" \
        || echo -e " ${YELLOW}(section '${name}' did not complete)${RESET}" >&2
    fi
  done

  echo ""
}

main "$@"