#!/usr/bin/env bash ######################################################################################### #### chaos-runner.sh — Inject controlled failures and verify system recovery #### #### CPU stress, memory pressure, disk fill, service kill, network faults #### #### Requires: bash 4+, root privileges #### #### #### #### Author: Phil Connor #### #### Contact: contact@mylinux.work #### #### License: MIT #### #### Version 1.01 #### #### #### #### Usage: #### #### sudo ./chaos-runner.sh --fault cpu-stress --duration 30 #### #### #### #### See --help for all options. #### ######################################################################################### set -euo pipefail # --------------------------------------------------------------------------- # Color variables — pre-initialized empty, set by setup_colors() # --------------------------------------------------------------------------- RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" setup_colors() { if [[ "${COLOR}" == "never" ]]; then return fi if [[ "${COLOR}" == "always" ]] || [[ -t 1 ]]; then RED="\033[0;31m" GREEN="\033[0;32m" YELLOW="\033[0;33m" BLUE="\033[0;34m" CYAN="\033[0;36m" BOLD="\033[1m" DIM="\033[2m" RESET="\033[0m" fi } # --------------------------------------------------------------------------- # Standard helpers # --------------------------------------------------------------------------- log() { printf "%b[+]%b %s\n" "$GREEN" "$RESET" "$*"; } warn() { printf "%b[!]%b %s\n" "$YELLOW" "$RESET" "$*" >&2; } err() { printf "%b[-]%b %s\n" "$RED" "$RESET" "$*" >&2; } verbose() { [[ "$VERBOSE" == "true" ]] && printf "%b[~]%b %s\n" "$DIM" "$RESET" "$*"; return 0; } die() { err "$*"; exit 1; } section_header() { printf "\n%b%b══ %b%s%b\n" "$CYAN" "$BOLD" "$BLUE" "$*" "$RESET" } field() { printf " %-24s %s\n" "$1" "$2" } field_color() { local label="$1" color="$2" value="$3" printf " %-24s %b%s%b\n" "$label" "$color" "$value" "$RESET" } # --------------------------------------------------------------------------- # Defaults # --------------------------------------------------------------------------- RUN_MODE="" FAULT_TYPE="" DURATION="${CHAOS_DURATION:-30}" TARGET_SERVICE="" FILL_PATH="${CHAOS_FILL_PATH:-/tmp}" FILL_SIZE="${CHAOS_FILL_SIZE:-90}" LATENCY_MS="${CHAOS_LATENCY:-200}" DROP_PERCENT="${CHAOS_DROP:-50}" NETWORK_IFACE="${CHAOS_IFACE:-eth0}" PLAN_FILE="" CONFIRM_YES=false VERBOSE="${VERBOSE:-false}" COLOR="${COLOR:-auto}" CLEANUP_PIDS=() CLEANUP_FILES=() CHAOS_ACTIVE=false # --------------------------------------------------------------------------- # State # --------------------------------------------------------------------------- readonly SCRIPT_NAME="${0##*/}" START_TIME="" # --------------------------------------------------------------------------- # Trap # --------------------------------------------------------------------------- trap cleanup_all EXIT INT TERM # --------------------------------------------------------------------------- # Safety — cleanup # --------------------------------------------------------------------------- cleanup_all() { if [[ "${CHAOS_ACTIVE}" != "true" ]]; then return fi CHAOS_ACTIVE=false warn "Running cleanup..." # Kill tracked background PIDs local pid for pid in "${CLEANUP_PIDS[@]}"; do kill "$pid" 2>/dev/null || true wait "$pid" 2>/dev/null || true done CLEANUP_PIDS=() # Remove tracked temp files local f for f in "${CLEANUP_FILES[@]}"; do if [[ -d "$f" ]] && mountpoint -q "$f" 2>/dev/null; then umount "$f" 2>/dev/null || true rmdir "$f" 2>/dev/null || true elif [[ -f "$f" ]]; then rm -f "$f" 2>/dev/null || true elif [[ -d "$f" ]]; then rmdir "$f" 2>/dev/null || true fi done CLEANUP_FILES=() # Remove tc qdiscs tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true # Restore resolv.conf from backup if [[ -f /etc/resolv.conf.chaos-backup ]]; then mv /etc/resolv.conf.chaos-backup /etc/resolv.conf 2>/dev/null || true log "Restored /etc/resolv.conf from backup" fi log "Cleanup complete" } # --------------------------------------------------------------------------- # Utilities # --------------------------------------------------------------------------- require_root() { if [[ "$(id -u)" -ne 0 ]]; then die "This operation requires root privileges. Run with sudo." fi } confirm_action() { local message="$1" if [[ "$CONFIRM_YES" == "true" ]]; then return 0 fi printf "%b[?]%b %s [y/N] " "$YELLOW" "$RESET" "$message" local answer read -r answer case "$answer" in [yY]|[yY][eE][sS]) return 0 ;; *) die "Aborted by user" ;; esac } wait_duration() { local remaining="$DURATION" while [[ "$remaining" -gt 0 ]]; do printf "\r %bTime remaining: %ds%b " "$DIM" "$remaining" "$RESET" sleep 1 ((remaining--)) || true done printf "\r%40s\r" "" } # --------------------------------------------------------------------------- # Fault: cpu-stress # --------------------------------------------------------------------------- fault_cpu_stress() { local cores cores=$(nproc) section_header "CPU Stress — saturating $cores cores for ${DURATION}s" CHAOS_ACTIVE=true local i for ((i = 0; i < cores; i++)); do while :; do :; done & CLEANUP_PIDS+=("$!") verbose "Spawned CPU worker PID $!" done log "Started $cores CPU stress workers" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: memory-pressure # --------------------------------------------------------------------------- fault_memory_pressure() { section_header "Memory Pressure — filling tmpfs for ${DURATION}s" CHAOS_ACTIVE=true local mount_dir mount_dir=$(mktemp -d /tmp/chaos-mem-XXXXXX) mount -t tmpfs -o size=256M tmpfs "$mount_dir" CLEANUP_FILES+=("$mount_dir") log "Mounted tmpfs at $mount_dir (256M)" head -c 240M /dev/urandom > "${mount_dir}/fill.dat" 2>/dev/null || true log "Filled tmpfs with ~240M of data" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: disk-fill # --------------------------------------------------------------------------- fault_disk_fill() { section_header "Disk Fill — filling ${FILL_PATH} to ${FILL_SIZE}% for ${DURATION}s" CHAOS_ACTIVE=true local current_usage target_bytes fill_file total_kb fill_file="${FILL_PATH}/chaos-fill-$(date +%s).dat" total_kb=$(df --output=size -k "$FILL_PATH" | tail -1 | tr -d ' ') current_usage=$(df --output=pcent "$FILL_PATH" | tail -1 | tr -d ' %') if [[ "$current_usage" -ge "$FILL_SIZE" ]]; then warn "Disk already at ${current_usage}% — above target ${FILL_SIZE}%" return fi target_bytes=$(( (FILL_SIZE - current_usage) * total_kb * 1024 / 100 )) local target_mb=$(( target_bytes / 1048576 )) log "Writing ${target_mb}M to $fill_file" dd if=/dev/zero of="$fill_file" bs=1M count="$target_mb" status=none 2>/dev/null || true CLEANUP_FILES+=("$fill_file") log "Disk fill complete — $(df --output=pcent "$FILL_PATH" | tail -1 | tr -d ' ') used" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: service-kill # --------------------------------------------------------------------------- fault_service_kill() { if [[ -z "$TARGET_SERVICE" ]]; then die "service-kill requires --target SERVICE_NAME" fi section_header "Service Kill — stopping ${TARGET_SERVICE} for ${DURATION}s" CHAOS_ACTIVE=true if ! systemctl is-active --quiet "$TARGET_SERVICE"; then die "Service '$TARGET_SERVICE' is not currently active" fi confirm_action "Stop service '$TARGET_SERVICE' for ${DURATION}s?" systemctl stop "$TARGET_SERVICE" log "Stopped $TARGET_SERVICE" wait_duration log "Restarting $TARGET_SERVICE..." systemctl start "$TARGET_SERVICE" log "Service $TARGET_SERVICE restarted" CHAOS_ACTIVE=false } # --------------------------------------------------------------------------- # Fault: network-latency # --------------------------------------------------------------------------- fault_network_latency() { section_header "Network Latency — ${LATENCY_MS}ms on ${NETWORK_IFACE} for ${DURATION}s" CHAOS_ACTIVE=true if ! command -v tc &>/dev/null; then die "tc (iproute2) is required for network faults" fi tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true tc qdisc add dev "$NETWORK_IFACE" root netem delay "${LATENCY_MS}ms" log "Added ${LATENCY_MS}ms latency to $NETWORK_IFACE" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: network-drop # --------------------------------------------------------------------------- fault_network_drop() { section_header "Network Drop — ${DROP_PERCENT}% loss on ${NETWORK_IFACE} for ${DURATION}s" CHAOS_ACTIVE=true if ! command -v tc &>/dev/null; then die "tc (iproute2) is required for network faults" fi tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true tc qdisc add dev "$NETWORK_IFACE" root netem loss "${DROP_PERCENT}%" log "Added ${DROP_PERCENT}% packet loss to $NETWORK_IFACE" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: dns-failure # --------------------------------------------------------------------------- fault_dns_failure() { section_header "DNS Failure — breaking DNS for ${DURATION}s" CHAOS_ACTIVE=true if [[ -f /etc/resolv.conf.chaos-backup ]]; then die "A chaos backup of resolv.conf already exists — run --cleanup first" fi cp /etc/resolv.conf /etc/resolv.conf.chaos-backup CLEANUP_FILES+=("/etc/resolv.conf.chaos-backup") printf "# Chaos: DNS intentionally broken\nnameserver 127.0.0.254\n" > /etc/resolv.conf log "Replaced /etc/resolv.conf with broken nameserver" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Fault: io-latency # --------------------------------------------------------------------------- fault_io_latency() { section_header "I/O Latency — degrading I/O for ${DURATION}s" CHAOS_ACTIVE=true local io_file io_file="${FILL_PATH}/chaos-io-$(date +%s).dat" ionice -c 2 -n 7 dd if=/dev/urandom of="$io_file" bs=4K count=0 status=none 2>/dev/null & CLEANUP_PIDS+=("$!") CLEANUP_FILES+=("$io_file") # Run continuous slow I/O in background ( while :; do ionice -c 3 dd if=/dev/zero of="$io_file" bs=4K count=256 conv=fdatasync status=none 2>/dev/null || true sync sleep 0.5 done ) & CLEANUP_PIDS+=("$!") log "Started degraded I/O worker (idle-class ionice)" wait_duration cleanup_all } # --------------------------------------------------------------------------- # Dispatch # --------------------------------------------------------------------------- do_fault() { require_root if [[ -z "$FAULT_TYPE" ]]; then die "No fault type specified. Use --fault TYPE" fi confirm_action "Inject fault '${FAULT_TYPE}' for ${DURATION}s?" START_TIME=$(date +%s) log "Starting fault injection: $FAULT_TYPE (duration: ${DURATION}s)" case "$FAULT_TYPE" in cpu-stress) fault_cpu_stress ;; memory-pressure) fault_memory_pressure ;; disk-fill) fault_disk_fill ;; service-kill) fault_service_kill ;; network-latency) fault_network_latency ;; network-drop) fault_network_drop ;; dns-failure) fault_dns_failure ;; io-latency) fault_io_latency ;; *) die "Unknown fault type: $FAULT_TYPE" ;; esac local elapsed=$(( $(date +%s) - START_TIME )) log "Fault injection complete (${elapsed}s elapsed)" } # --------------------------------------------------------------------------- # List fault types # --------------------------------------------------------------------------- do_list() { section_header "Available Fault Types" printf "\n" printf " %-20s %s\n" "FAULT TYPE" "DESCRIPTION" printf " ─────────────────────────────────────────────────────────────\n" printf " %-20s %s\n" "cpu-stress" "Saturate all CPU cores" printf " %-20s %s\n" "memory-pressure" "Fill memory via tmpfs allocation" printf " %-20s %s\n" "disk-fill" "Fill disk to threshold percentage" printf " %-20s %s\n" "service-kill" "Stop a systemd service temporarily" printf " %-20s %s\n" "network-latency" "Add network latency via tc netem" printf " %-20s %s\n" "network-drop" "Drop packets via tc netem" printf " %-20s %s\n" "dns-failure" "Break DNS resolution temporarily" printf " %-20s %s\n" "io-latency" "Degrade I/O performance via ionice" printf "\n" } # --------------------------------------------------------------------------- # Verify system health # --------------------------------------------------------------------------- do_verify() { section_header "System Health Check" local issues=0 # CPU load local load_1m load_1m=$(awk '{print $1}' /proc/loadavg) local cores cores=$(nproc) if awk "BEGIN {exit !($load_1m > $cores * 0.9)}"; then field_color "CPU load (1m):" "$RED" "${load_1m} — HIGH (cores: ${cores})" ((issues++)) || true else field_color "CPU load (1m):" "$GREEN" "${load_1m} (cores: ${cores})" fi # Memory local mem_avail_kb mem_total_kb mem_pct mem_total_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo) mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo) mem_pct=$(( (mem_total_kb - mem_avail_kb) * 100 / mem_total_kb )) if [[ "$mem_pct" -gt 90 ]]; then field_color "Memory usage:" "$RED" "${mem_pct}% — HIGH" ((issues++)) || true else field_color "Memory usage:" "$GREEN" "${mem_pct}%" fi # Disk local disk_pct disk_pct=$(df --output=pcent / | tail -1 | tr -d ' %') if [[ "$disk_pct" -gt 90 ]]; then field_color "Disk usage (/):" "$RED" "${disk_pct}% — HIGH" ((issues++)) || true else field_color "Disk usage (/):" "$GREEN" "${disk_pct}%" fi # Network connectivity if ping -c 1 -W 3 8.8.8.8 &>/dev/null; then field_color "Network (ping):" "$GREEN" "OK" else field_color "Network (ping):" "$RED" "UNREACHABLE" ((issues++)) || true fi # DNS resolution if host google.com &>/dev/null; then field_color "DNS resolution:" "$GREEN" "OK" else field_color "DNS resolution:" "$RED" "FAILING" ((issues++)) || true fi # Chaos artifacts if [[ -f /etc/resolv.conf.chaos-backup ]]; then field_color "Chaos artifacts:" "$YELLOW" "resolv.conf backup found" ((issues++)) || true else field_color "Chaos artifacts:" "$GREEN" "None" fi printf "\n" if [[ "$issues" -gt 0 ]]; then warn "Found $issues issue(s)" return 1 else log "All checks passed" return 0 fi } # --------------------------------------------------------------------------- # Plan execution # --------------------------------------------------------------------------- do_plan() { require_root if [[ -z "$PLAN_FILE" ]]; then die "No plan file specified. Use --plan FILE" fi if [[ ! -f "$PLAN_FILE" ]]; then die "Plan file not found: $PLAN_FILE" fi if ! command -v jq &>/dev/null; then die "jq is required for plan execution" fi section_header "Executing Chaos Plan: $PLAN_FILE" local plan_length plan_length=$(jq '.faults | length' "$PLAN_FILE") log "Plan contains $plan_length fault(s)" local i fault_entry f_type f_duration for ((i = 0; i < plan_length; i++)); do fault_entry=$(jq -r ".faults[$i]" "$PLAN_FILE") f_type=$(printf '%s' "$fault_entry" | jq -r '.type') f_duration=$(printf '%s' "$fault_entry" | jq -r '.duration // 30') log "Step $((i + 1))/$plan_length: $f_type (${f_duration}s)" FAULT_TYPE="$f_type" DURATION="$f_duration" # Extract optional fields local f_target f_iface f_target=$(printf '%s' "$fault_entry" | jq -r '.target // empty') f_iface=$(printf '%s' "$fault_entry" | jq -r '.iface // empty') [[ -n "$f_target" ]] && TARGET_SERVICE="$f_target" [[ -n "$f_iface" ]] && NETWORK_IFACE="$f_iface" case "$FAULT_TYPE" in cpu-stress) fault_cpu_stress ;; memory-pressure) fault_memory_pressure ;; disk-fill) fault_disk_fill ;; service-kill) fault_service_kill ;; network-latency) fault_network_latency ;; network-drop) fault_network_drop ;; dns-failure) fault_dns_failure ;; io-latency) fault_io_latency ;; *) warn "Unknown fault type in plan: $FAULT_TYPE — skipping" ;; esac if [[ "$i" -lt $((plan_length - 1)) ]]; then log "Pausing 5s before next fault..." sleep 5 fi done log "Plan execution complete" } # --------------------------------------------------------------------------- # Force cleanup # --------------------------------------------------------------------------- do_cleanup() { require_root section_header "Force Cleanup" CHAOS_ACTIVE=true cleanup_all log "Force cleanup complete" } # --------------------------------------------------------------------------- # Help # --------------------------------------------------------------------------- show_help() { cat <