Files
linux-scripts/chaos-runner.sh
T
chiefgeek a1a17e81a1 Sync all scripts from website downloads — 352 scripts total
Includes updated JS challenge scripts with Claude-User whitelist,
same-site referer bypass, Blackbox-Exporter allowed bot, and all
new exporters, cheat sheets, and automation scripts.
2026-05-25 03:31:08 +02:00

740 lines
25 KiB
Bash
Executable File

#!/usr/bin/env bash
#########################################################################################
#### chaos-runner.sh — Inject controlled failures and verify system recovery ####
#### CPU stress, memory pressure, disk fill, service kill, network faults ####
#### Requires: bash 4+, root privileges ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version 1.01 ####
#### ####
#### Usage: ####
#### sudo ./chaos-runner.sh --fault cpu-stress --duration 30 ####
#### ####
#### See --help for all options. ####
#########################################################################################
# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# ---------------------------------------------------------------------------
# Color variables — pre-initialized empty, set by setup_colors()
# ---------------------------------------------------------------------------
# Start with every colour empty so output is plain until setup_colors()
# decides otherwise (and so the helpers are safe under `set -u`).
RED="" GREEN="" YELLOW="" BLUE=""
CYAN="" BOLD="" DIM="" RESET=""
# Fill the colour variables with ANSI escape sequences.
# Globals:
#   COLOR (read)  - "never": keep colours empty; "always": force colours;
#                   anything else: colours only when stdout is a terminal.
#   RED GREEN YELLOW BLUE CYAN BOLD DIM RESET (written)
setup_colors() {
  case "${COLOR}" in
    never) return 0 ;;
    always) ;;
    *) [[ -t 1 ]] || return 0 ;;
  esac

  RED="\033[0;31m"
  GREEN="\033[0;32m"
  YELLOW="\033[0;33m"
  BLUE="\033[0;34m"
  CYAN="\033[0;36m"
  BOLD="\033[1m"
  DIM="\033[2m"
  RESET="\033[0m"
}
# ---------------------------------------------------------------------------
# Standard helpers
# ---------------------------------------------------------------------------
# log MESSAGE...     - informational line on stdout, prefixed with "[+]".
log() {
  printf '%b[+]%b %s\n' "${GREEN}" "${RESET}" "$*"
}

# warn MESSAGE...    - warning line on stderr, prefixed with "[!]".
warn() {
  printf '%b[!]%b %s\n' "${YELLOW}" "${RESET}" "$*" >&2
}

# err MESSAGE...     - error line on stderr, prefixed with "[-]".
err() {
  printf '%b[-]%b %s\n' "${RED}" "${RESET}" "$*" >&2
}

# verbose MESSAGE... - detail line on stdout, printed only when VERBOSE is
#                      "true"; always returns 0 so it is safe under `set -e`.
verbose() {
  if [[ "${VERBOSE}" == "true" ]]; then
    printf '%b[~]%b %s\n' "${DIM}" "${RESET}" "$*"
  fi
  return 0
}

# die MESSAGE...     - report the error via err() and exit with status 1.
die() {
  err "$*"
  exit 1
}
# section_header TITLE...
# Print a blank line followed by a highlighted "══ TITLE" banner on stdout.
section_header() {
  local title="$*"
  printf '\n%b%b══ %b%s%b\n' "${CYAN}" "${BOLD}" "${BLUE}" "${title}" "${RESET}"
}
# field LABEL VALUE
# Print one report row: LABEL left-aligned in a 24-column cell, then VALUE.
field() {
  local label="$1" value="$2"
  printf ' %-24s %s\n' "${label}" "${value}"
}
# field_color LABEL COLOR VALUE
# Same layout as field(), but VALUE is wrapped in COLOR ... RESET.
field_color() {
  local name="$1"
  local tint="$2"
  local text="$3"
  printf ' %-24s %b%s%b\n' "${name}" "${tint}" "${text}" "${RESET}"
}
# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
# Mode chosen on the command line (fault, list, verify, plan, cleanup, help);
# stays empty until parse_args sets it.
RUN_MODE=""
# Fault to inject; set by --fault, or per step while a plan is executed.
FAULT_TYPE=""
# Seconds a fault stays active (env CHAOS_DURATION, default 30; --duration overrides).
DURATION="${CHAOS_DURATION:-30}"
# systemd unit stopped by the service-kill fault (--target).
TARGET_SERVICE=""
# Directory in which disk-fill and io-latency create their data files.
FILL_PATH="${CHAOS_FILL_PATH:-/tmp}"
# Usage percentage that disk-fill aims for.
FILL_SIZE="${CHAOS_FILL_SIZE:-90}"
# Delay in milliseconds applied by network-latency.
LATENCY_MS="${CHAOS_LATENCY:-200}"
# Packet-loss percentage applied by network-drop.
DROP_PERCENT="${CHAOS_DROP:-50}"
# Interface the tc/netem faults (and cleanup) operate on.
NETWORK_IFACE="${CHAOS_IFACE:-eth0}"
# JSON plan file given with --plan.
PLAN_FILE=""
# "true" once --yes was given; confirm_action then skips its prompt.
CONFIRM_YES=false
# "true" enables verbose() output (env VERBOSE or --verbose).
VERBOSE="${VERBOSE:-false}"
# Colour policy read by setup_colors: "never", "always", anything else = auto.
COLOR="${COLOR:-auto}"
# Background PIDs that cleanup_all kills and reaps.
CLEANUP_PIDS=()
# Files, directories and tmpfs mount points that cleanup_all removes.
CLEANUP_FILES=()
# Guard flag: cleanup_all does nothing unless this is "true".
CHAOS_ACTIVE=false
# ---------------------------------------------------------------------------
# State
# ---------------------------------------------------------------------------
# Basename of the invoked script, interpolated into the help text.
readonly SCRIPT_NAME="${0##*/}"
# Epoch seconds at which do_fault started the current injection.
START_TIME=""
# ---------------------------------------------------------------------------
# Trap
# ---------------------------------------------------------------------------
# cleanup_all only returns; it never exits. Registering it directly for INT and
# TERM therefore made the script *resume* after the handler ran (in plan mode
# it would even go on to inject the next fault). Signals are now converted into
# an exit with the conventional 128+signal status, and the EXIT trap performs
# the cleanup exactly once on every exit path.
trap cleanup_all EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# ---------------------------------------------------------------------------
# Safety — cleanup
# ---------------------------------------------------------------------------
# Undo every side effect of the active fault. Runs from the EXIT trap, at the
# end of each fault function, and from --cleanup.
# Globals:
#   CHAOS_ACTIVE  (read/written) - nothing happens unless "true"; reset to
#                                  false first so repeated calls are no-ops.
#   CLEANUP_PIDS  (read/cleared) - background jobs to kill and reap.
#   CLEANUP_FILES (read/cleared) - files, directories, tmpfs mounts to remove.
#   NETWORK_IFACE (read)         - interface whose netem qdisc is removed.
# Returns: 0; every step is best-effort so one failure cannot block the rest.
cleanup_all() {
  if [[ "${CHAOS_ACTIVE}" != "true" ]]; then
    return 0
  fi
  CHAOS_ACTIVE=false
  warn "Running cleanup..."

  # Kill tracked background PIDs. The ${arr[@]+...} form keeps an empty array
  # from tripping `set -u` on bash releases older than 4.4.
  local pid
  for pid in ${CLEANUP_PIDS[@]+"${CLEANUP_PIDS[@]}"}; do
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done
  CLEANUP_PIDS=()

  # Restore resolv.conf BEFORE removing tracked files: the backup may itself
  # be listed in CLEANUP_FILES, and deleting it first would make the restore
  # impossible and leave DNS broken.
  local backup=/etc/resolv.conf.chaos-backup
  if [[ -e "$backup" || -L "$backup" ]]; then
    if mv -f -- "$backup" /etc/resolv.conf 2>/dev/null; then
      log "Restored /etc/resolv.conf from backup"
    else
      warn "Could not restore /etc/resolv.conf — backup kept at ${backup}"
    fi
  fi

  # Remove tracked temp files, directories and tmpfs mounts.
  local f
  for f in ${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}; do
    # Never delete a backup that could not be restored above.
    [[ "$f" == "$backup" ]] && continue
    if [[ -d "$f" ]] && mountpoint -q "$f" 2>/dev/null; then
      umount "$f" 2>/dev/null || true
      rmdir "$f" 2>/dev/null || true
    elif [[ -f "$f" ]]; then
      rm -f -- "$f" 2>/dev/null || true
    elif [[ -d "$f" ]]; then
      rmdir "$f" 2>/dev/null || true
    fi
  done
  CLEANUP_FILES=()

  # Remove the root qdisc only when it is the netem qdisc this tool installs;
  # deleting unconditionally would also wipe an administrator's own traffic
  # shaping after unrelated faults such as cpu-stress.
  if command -v tc &>/dev/null \
    && tc qdisc show dev "$NETWORK_IFACE" 2>/dev/null \
      | grep -Eq '^qdisc netem [^ ]+ (dev [^ ]+ )?root'; then
    tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true
  fi

  log "Cleanup complete"
}
# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------
# Abort via die() unless the effective user id reported by `id -u` is 0.
require_root() {
  [[ "$(id -u)" -eq 0 ]] \
    || die "This operation requires root privileges. Run with sudo."
}
# confirm_action MESSAGE
# Ask the operator to confirm a destructive step.
# Globals: CONFIRM_YES (read) - "true" skips the prompt entirely.
# Returns: 0 when confirmed (y/yes, case-insensitive); otherwise exits via die.
confirm_action() {
  local message="$1"
  if [[ "$CONFIRM_YES" == "true" ]]; then
    return 0
  fi

  printf "%b[?]%b %s [y/N] " "$YELLOW" "$RESET" "$message"
  local answer=""
  # `read` fails on EOF (closed or non-interactive stdin). Under `set -e` a
  # bare `read` would end the script with no explanation, so handle it here.
  # A final line without a trailing newline still counts as an answer.
  if ! read -r answer && [[ -z "$answer" ]]; then
    printf '\n'
    die "No confirmation received (stdin closed) — use --yes for non-interactive runs"
  fi

  case "$answer" in
    [yY] | [yY][eE][sS]) return 0 ;;
    *) die "Aborted by user" ;;
  esac
}
# Block for DURATION seconds, redrawing a one-line countdown once per second,
# then blank the countdown line.
# Globals: DURATION (read), DIM, RESET (read)
wait_duration() {
  local secs_left="${DURATION}"
  while [[ "${secs_left}" -gt 0 ]]; do
    printf "\r %bTime remaining: %ds%b " "${DIM}" "${secs_left}" "${RESET}"
    sleep 1
    secs_left=$(( secs_left - 1 ))
  done
  # Overwrite whatever is left of the countdown with spaces.
  printf "\r%40s\r" ""
}
# ---------------------------------------------------------------------------
# Fault: cpu-stress
# ---------------------------------------------------------------------------
# Fault: cpu-stress.
# Start one busy-loop background job per core reported by nproc, keep them
# running for DURATION seconds, then let cleanup_all kill them.
# Globals: DURATION (read); CHAOS_ACTIVE, CLEANUP_PIDS (written)
fault_cpu_stress() {
  local worker_total
  worker_total=$(nproc)
  section_header "CPU Stress — saturating $worker_total cores for ${DURATION}s"
  CHAOS_ACTIVE=true

  local spawned=0
  while (( spawned < worker_total )); do
    # Endless no-op loop in a background subshell = one fully busy core.
    while true; do true; done &
    CLEANUP_PIDS+=("$!")
    verbose "Spawned CPU worker PID $!"
    spawned=$(( spawned + 1 ))
  done

  log "Started $worker_total CPU stress workers"
  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: memory-pressure
# ---------------------------------------------------------------------------
# Fault: memory-pressure.
# Mount a 256M tmpfs on a fresh temporary directory, write ~240M of random
# data into it, hold it for DURATION seconds, then let cleanup_all unmount
# and remove it.
# Globals: DURATION (read); CHAOS_ACTIVE, CLEANUP_FILES (written)
fault_memory_pressure() {
  section_header "Memory Pressure — filling tmpfs for ${DURATION}s"
  CHAOS_ACTIVE=true

  local mount_dir
  mount_dir=$(mktemp -d /tmp/chaos-mem-XXXXXX) \
    || die "Could not create a mount directory under /tmp"
  # Register the directory BEFORE mounting: if the mount fails (or the script
  # is interrupted right here) cleanup_all still removes the empty directory
  # instead of leaking it.
  CLEANUP_FILES+=("$mount_dir")

  mount -t tmpfs -o size=256M tmpfs "$mount_dir" \
    || die "Failed to mount tmpfs at $mount_dir"
  log "Mounted tmpfs at $mount_dir (256M)"

  # Best effort: a short write still produces memory pressure, so a failure
  # here must not abort the run.
  head -c 240M /dev/urandom > "${mount_dir}/fill.dat" 2>/dev/null || true
  log "Filled tmpfs with ~240M of data"

  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: disk-fill
# ---------------------------------------------------------------------------
# Fault: disk-fill.
# Write zeroes into a scratch file under FILL_PATH until the filesystem is
# roughly FILL_SIZE percent full, hold for DURATION seconds, then clean up.
# Globals: FILL_PATH, FILL_SIZE, DURATION (read);
#          CHAOS_ACTIVE, CLEANUP_FILES (written)
# Returns: 0 without touching anything when usage is already at/above target.
fault_disk_fill() {
  section_header "Disk Fill — filling ${FILL_PATH} to ${FILL_SIZE}% for ${DURATION}s"

  [[ "$FILL_SIZE" =~ ^[0-9]+$ ]] \
    || die "Fill target must be a whole percentage (got '${FILL_SIZE}')"
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  local target_pct=$(( 10#$FILL_SIZE ))
  (( target_pct <= 100 )) || die "Fill target cannot exceed 100% (got ${target_pct}%)"

  # Work from used+avail — the same base df uses for its Use% column — rather
  # than the raw filesystem size, which also counts root-reserved blocks.
  local used_kb="" avail_kb=""
  if ! read -r used_kb avail_kb \
      < <(df --output=used,avail -k -- "$FILL_PATH" 2>/dev/null | tail -n 1) \
    || [[ ! "$used_kb" =~ ^[0-9]+$ || ! "$avail_kb" =~ ^[0-9]+$ ]]; then
    die "Cannot determine disk usage for ${FILL_PATH}"
  fi

  local usable_kb=$(( used_kb + avail_kb ))
  local current_usage=100
  if (( usable_kb > 0 )); then
    # Rounded up, like df does.
    current_usage=$(( (used_kb * 100 + usable_kb - 1) / usable_kb ))
  fi
  local fill_mb=$(( (usable_kb * target_pct / 100 - used_kb) / 1024 ))

  if (( current_usage >= target_pct || fill_mb <= 0 )); then
    # Nothing was changed, so CHAOS_ACTIVE is deliberately left untouched.
    warn "Disk already at ${current_usage}% — above target ${FILL_SIZE}%"
    return 0
  fi

  CHAOS_ACTIVE=true
  # mktemp gives an unpredictable name and creates the file safely; the file
  # is tracked BEFORE dd starts so an interrupted write is still removed.
  local fill_file
  fill_file=$(mktemp "${FILL_PATH%/}/chaos-fill-XXXXXX") \
    || die "Cannot create fill file in ${FILL_PATH}"
  CLEANUP_FILES+=("$fill_file")

  log "Writing ${fill_mb}M to $fill_file"
  # Best effort: running out of space early still achieves the fault.
  dd if=/dev/zero of="$fill_file" bs=1M count="$fill_mb" status=none 2>/dev/null \
    || warn "dd stopped before writing the full ${fill_mb}M"
  log "Disk fill complete — $(df --output=pcent -- "$FILL_PATH" | tail -n 1 | tr -d ' ') used"

  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: service-kill
# ---------------------------------------------------------------------------
# Fault: service-kill.
# Stop TARGET_SERVICE for DURATION seconds and start it again afterwards.
# The outage runs in a subshell whose EXIT trap starts the service again, so
# the restart happens on every way out of the outage: normal completion,
# Ctrl-C, SIGTERM, or a failing command.
# Globals: TARGET_SERVICE, DURATION (read); CHAOS_ACTIVE (written)
# Returns: 0 on a completed outage; the subshell's non-zero status otherwise.
fault_service_kill() {
  if [[ -z "$TARGET_SERVICE" ]]; then
    die "service-kill requires --target SERVICE_NAME"
  fi
  section_header "Service Kill — stopping ${TARGET_SERVICE} for ${DURATION}s"
  # Precondition checks come before anything is changed, so a refusal here
  # does not trigger cleanup of things this fault never touched.
  if ! systemctl is-active --quiet "$TARGET_SERVICE"; then
    die "Service '$TARGET_SERVICE' is not currently active"
  fi
  confirm_action "Stop service '$TARGET_SERVICE' for ${DURATION}s?"

  local rc=0
  (
    # Runs from the subshell's EXIT trap.
    restart_service() {
      log "Restarting $TARGET_SERVICE..."
      if systemctl start "$TARGET_SERVICE"; then
        log "Service $TARGET_SERVICE restarted"
      else
        err "Failed to restart $TARGET_SERVICE — start it manually"
      fi
    }
    trap restart_service EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM

    # Explicit checks: errexit is not in effect inside a `( ... ) || ...` list.
    systemctl stop "$TARGET_SERVICE" \
      || { err "Failed to stop $TARGET_SERVICE"; exit 1; }
    log "Stopped $TARGET_SERVICE"
    wait_duration
  ) || rc=$?

  CHAOS_ACTIVE=false
  if (( rc != 0 )); then
    err "service-kill for $TARGET_SERVICE ended abnormally (status ${rc})"
    return "$rc"
  fi
}
# ---------------------------------------------------------------------------
# Fault: network-latency
# ---------------------------------------------------------------------------
# Fault: network-latency.
# Install a netem root qdisc on NETWORK_IFACE that delays traffic by
# LATENCY_MS milliseconds, hold it for DURATION seconds, then clean up.
# NOTE: any root qdisc already present on the interface is replaced and is
# not reinstated afterwards.
# Globals: NETWORK_IFACE, LATENCY_MS, DURATION (read); CHAOS_ACTIVE (written)
fault_network_latency() {
  section_header "Network Latency — ${LATENCY_MS}ms on ${NETWORK_IFACE} for ${DURATION}s"

  # Check preconditions before arming cleanup or touching the interface.
  if ! command -v tc &>/dev/null; then
    die "tc (iproute2) is required for network faults"
  fi
  # The default interface name (eth0) does not exist on many hosts; fail with
  # a clear message instead of a raw tc error.
  if [[ ! -e "/sys/class/net/${NETWORK_IFACE}" ]]; then
    die "Network interface '${NETWORK_IFACE}' not found — choose one with --iface"
  fi

  CHAOS_ACTIVE=true
  tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true
  tc qdisc add dev "$NETWORK_IFACE" root netem delay "${LATENCY_MS}ms" \
    || die "Failed to apply ${LATENCY_MS}ms delay on ${NETWORK_IFACE}"
  log "Added ${LATENCY_MS}ms latency to $NETWORK_IFACE"

  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: network-drop
# ---------------------------------------------------------------------------
# Fault: network-drop.
# Install a netem root qdisc on NETWORK_IFACE that drops DROP_PERCENT percent
# of packets, hold it for DURATION seconds, then clean up.
# NOTE: any root qdisc already present on the interface is replaced and is
# not reinstated afterwards.
# Globals: NETWORK_IFACE, DROP_PERCENT, DURATION (read); CHAOS_ACTIVE (written)
fault_network_drop() {
  section_header "Network Drop — ${DROP_PERCENT}% loss on ${NETWORK_IFACE} for ${DURATION}s"

  # Check preconditions before arming cleanup or touching the interface.
  if ! command -v tc &>/dev/null; then
    die "tc (iproute2) is required for network faults"
  fi
  # The default interface name (eth0) does not exist on many hosts; fail with
  # a clear message instead of a raw tc error.
  if [[ ! -e "/sys/class/net/${NETWORK_IFACE}" ]]; then
    die "Network interface '${NETWORK_IFACE}' not found — choose one with --iface"
  fi

  CHAOS_ACTIVE=true
  tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true
  tc qdisc add dev "$NETWORK_IFACE" root netem loss "${DROP_PERCENT}%" \
    || die "Failed to apply ${DROP_PERCENT}% loss on ${NETWORK_IFACE}"
  log "Added ${DROP_PERCENT}% packet loss to $NETWORK_IFACE"

  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: dns-failure
# ---------------------------------------------------------------------------
# Fault: dns-failure.
# Back up /etc/resolv.conf, point the resolver at an unused loopback address
# for DURATION seconds, then let cleanup_all move the backup back into place.
# Globals: DURATION (read); CHAOS_ACTIVE, CLEANUP_FILES (written)
fault_dns_failure() {
  section_header "DNS Failure — breaking DNS for ${DURATION}s"
  local resolv=/etc/resolv.conf
  local backup=/etc/resolv.conf.chaos-backup

  # Refuse before arming cleanup, so an older backup is left exactly as found.
  if [[ -e "$backup" || -L "$backup" ]]; then
    die "A chaos backup of resolv.conf already exists — run --cleanup first"
  fi
  [[ -e "$resolv" ]] || die "$resolv not found — nothing to back up"

  CHAOS_ACTIVE=true
  # -a keeps mode/ownership and copies a symlink as a symlink, so the restore
  # brings back exactly what was there.
  cp -a -- "$resolv" "$backup" || die "Failed to back up $resolv"
  # The backup is deliberately NOT added to CLEANUP_FILES: cleanup_all deletes
  # tracked files, and deleting the backup makes the restore impossible.

  if [[ -L "$resolv" ]]; then
    # Replace the link itself with a regular file instead of writing through
    # it, so the link target (possibly managed by another service) is untouched.
    local staged
    staged=$(mktemp "${resolv}.chaos-XXXXXX") || die "Cannot stage replacement for $resolv"
    CLEANUP_FILES+=("$staged")
    printf "# Chaos: DNS intentionally broken\nnameserver 127.0.0.254\n" > "$staged"
    chmod 0644 "$staged"
    mv -f -- "$staged" "$resolv"
  else
    printf "# Chaos: DNS intentionally broken\nnameserver 127.0.0.254\n" > "$resolv"
  fi
  log "Replaced /etc/resolv.conf with broken nameserver"

  wait_duration
  cleanup_all
}
# ---------------------------------------------------------------------------
# Fault: io-latency
# ---------------------------------------------------------------------------
fault_io_latency() {
section_header "I/O Latency — degrading I/O for ${DURATION}s"
CHAOS_ACTIVE=true
local io_file
io_file="${FILL_PATH}/chaos-io-$(date +%s).dat"
ionice -c 2 -n 7 dd if=/dev/urandom of="$io_file" bs=4K count=0 status=none 2>/dev/null &
CLEANUP_PIDS+=("$!")
CLEANUP_FILES+=("$io_file")
# Run continuous slow I/O in background
(
while :; do
ionice -c 3 dd if=/dev/zero of="$io_file" bs=4K count=256 conv=fdatasync status=none 2>/dev/null || true
sync
sleep 0.5
done
) &
CLEANUP_PIDS+=("$!")
log "Started degraded I/O worker (idle-class ionice)"
wait_duration
cleanup_all
}
# ---------------------------------------------------------------------------
# Dispatch
# ---------------------------------------------------------------------------
# --fault mode: confirm with the operator, run the handler that matches
# FAULT_TYPE, and report how long the injection took.
# Globals: FAULT_TYPE, DURATION (read); START_TIME (written)
do_fault() {
  require_root
  [[ -n "$FAULT_TYPE" ]] || die "No fault type specified. Use --fault TYPE"
  confirm_action "Inject fault '${FAULT_TYPE}' for ${DURATION}s?"

  START_TIME=$(date +%s)
  log "Starting fault injection: $FAULT_TYPE (duration: ${DURATION}s)"

  # Resolve the handler first, then invoke it once below.
  local handler=""
  case "$FAULT_TYPE" in
    cpu-stress)      handler=fault_cpu_stress ;;
    memory-pressure) handler=fault_memory_pressure ;;
    disk-fill)       handler=fault_disk_fill ;;
    service-kill)    handler=fault_service_kill ;;
    network-latency) handler=fault_network_latency ;;
    network-drop)    handler=fault_network_drop ;;
    dns-failure)     handler=fault_dns_failure ;;
    io-latency)      handler=fault_io_latency ;;
    *) die "Unknown fault type: $FAULT_TYPE" ;;
  esac
  "$handler"

  local finished_at
  finished_at=$(date +%s)
  log "Fault injection complete ($(( finished_at - START_TIME ))s elapsed)"
}
# ---------------------------------------------------------------------------
# List fault types
# ---------------------------------------------------------------------------
# --list mode: print a two-column table of fault types and descriptions.
do_list() {
  section_header "Available Fault Types"

  # Alternating name/description pairs; printf re-applies its format to each
  # pair of arguments, producing one row per fault.
  local -a catalog=(
    "cpu-stress"      "Saturate all CPU cores"
    "memory-pressure" "Fill memory via tmpfs allocation"
    "disk-fill"       "Fill disk to threshold percentage"
    "service-kill"    "Stop a systemd service temporarily"
    "network-latency" "Add network latency via tc netem"
    "network-drop"    "Drop packets via tc netem"
    "dns-failure"     "Break DNS resolution temporarily"
    "io-latency"      "Degrade I/O performance via ionice"
  )

  printf "\n"
  printf " %-20s %s\n" "FAULT TYPE" "DESCRIPTION"
  printf " ─────────────────────────────────────────────────────────────\n"
  printf " %-20s %s\n" "${catalog[@]}"
  printf "\n"
}
# ---------------------------------------------------------------------------
# Verify system health
# ---------------------------------------------------------------------------
# --verify mode: run a set of health checks and print one row per check.
# Checks: 1-minute load vs. core count, memory usage, root filesystem usage,
# ICMP reachability of 8.8.8.8, DNS resolution of google.com, and leftover
# chaos artifacts (resolv.conf backup, netem qdisc on NETWORK_IFACE).
# Globals: NETWORK_IFACE, RED, GREEN, YELLOW (read)
# Returns: 0 when every check passes, 1 when at least one issue was found.
do_verify() {
  section_header "System Health Check"
  local issues=0

  # CPU load — values are handed to awk with -v instead of being spliced into
  # the program text.
  local load_1m cores
  load_1m=$(awk '{print $1}' /proc/loadavg)
  cores=$(nproc)
  if awk -v load="$load_1m" -v cores="$cores" 'BEGIN { exit !(load > cores * 0.9) }'; then
    field_color "CPU load (1m):" "$RED" "${load_1m} — HIGH (cores: ${cores})"
    issues=$(( issues + 1 ))
  else
    field_color "CPU load (1m):" "$GREEN" "${load_1m} (cores: ${cores})"
  fi

  # Memory — guard against a missing MemAvailable line, which would otherwise
  # turn the percentage computation into an arithmetic error.
  local mem_avail_kb mem_total_kb mem_pct
  mem_total_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
  mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
  if [[ "$mem_total_kb" =~ ^[0-9]+$ && "$mem_avail_kb" =~ ^[0-9]+$ ]] \
    && (( mem_total_kb > 0 )); then
    mem_pct=$(( (mem_total_kb - mem_avail_kb) * 100 / mem_total_kb ))
    if [[ "$mem_pct" -gt 90 ]]; then
      field_color "Memory usage:" "$RED" "${mem_pct}% — HIGH"
      issues=$(( issues + 1 ))
    else
      field_color "Memory usage:" "$GREEN" "${mem_pct}%"
    fi
  else
    field_color "Memory usage:" "$YELLOW" "unknown (MemTotal/MemAvailable not reported)"
  fi

  # Disk
  local disk_pct
  disk_pct=$(df --output=pcent / | tail -1 | tr -d ' %')
  if [[ "$disk_pct" -gt 90 ]]; then
    field_color "Disk usage (/):" "$RED" "${disk_pct}% — HIGH"
    issues=$(( issues + 1 ))
  else
    field_color "Disk usage (/):" "$GREEN" "${disk_pct}%"
  fi

  # Network connectivity
  if ping -c 1 -W 3 8.8.8.8 &>/dev/null; then
    field_color "Network (ping):" "$GREEN" "OK"
  else
    field_color "Network (ping):" "$RED" "UNREACHABLE"
    issues=$(( issues + 1 ))
  fi

  # DNS resolution — prefer `host`; fall back to `getent hosts` when it is not
  # installed, so a missing tool is not reported as a DNS outage.
  local dns_ok=false
  if command -v host &>/dev/null; then
    if host google.com &>/dev/null; then dns_ok=true; fi
  elif command -v getent &>/dev/null; then
    if getent hosts google.com &>/dev/null; then dns_ok=true; fi
  fi
  if [[ "$dns_ok" == "true" ]]; then
    field_color "DNS resolution:" "$GREEN" "OK"
  else
    field_color "DNS resolution:" "$RED" "FAILING"
    issues=$(( issues + 1 ))
  fi

  # Chaos artifacts — anything an interrupted run may have left behind.
  local -a artifacts=()
  if [[ -f /etc/resolv.conf.chaos-backup ]]; then
    artifacts+=("resolv.conf backup found")
  fi
  if command -v tc &>/dev/null \
    && tc qdisc show dev "$NETWORK_IFACE" 2>/dev/null | grep -q 'netem'; then
    artifacts+=("netem qdisc on ${NETWORK_IFACE}")
  fi
  if (( ${#artifacts[@]} > 0 )); then
    local joined
    printf -v joined '%s, ' "${artifacts[@]}"
    field_color "Chaos artifacts:" "$YELLOW" "${joined%, }"
    issues=$(( issues + ${#artifacts[@]} ))
  else
    field_color "Chaos artifacts:" "$GREEN" "None"
  fi

  printf "\n"
  if [[ "$issues" -gt 0 ]]; then
    warn "Found $issues issue(s)"
    return 1
  else
    log "All checks passed"
    return 0
  fi
}
# ---------------------------------------------------------------------------
# Plan execution
# ---------------------------------------------------------------------------
# --plan mode: run every entry of the "faults" array in PLAN_FILE in order,
# pausing 5 seconds between steps.
# Each entry supplies "type", optionally "duration" (default 30), "target"
# and "iface". Optional fields apply to that step only; a step that omits
# them uses the values that were in effect when the plan started.
# Globals: PLAN_FILE (read);
#          FAULT_TYPE, DURATION, TARGET_SERVICE, NETWORK_IFACE (written)
do_plan() {
  require_root
  [[ -n "$PLAN_FILE" ]] || die "No plan file specified. Use --plan FILE"
  [[ -f "$PLAN_FILE" ]] || die "Plan file not found: $PLAN_FILE"
  command -v jq &>/dev/null || die "jq is required for plan execution"

  section_header "Executing Chaos Plan: $PLAN_FILE"
  local plan_length
  plan_length=$(jq '.faults | length' "$PLAN_FILE") \
    || die "Could not read a \"faults\" list from plan file: $PLAN_FILE"
  log "Plan contains $plan_length fault(s)"

  # Same safety prompt as single-fault mode; --yes skips it.
  confirm_action "Execute all ${plan_length} fault(s) from '${PLAN_FILE}'?"

  # Remember the starting values so one step's target/iface cannot leak into
  # later steps that do not set their own.
  local base_target="$TARGET_SERVICE" base_iface="$NETWORK_IFACE"
  local i fault_entry f_type f_duration f_target f_iface
  for ((i = 0; i < plan_length; i++)); do
    fault_entry=$(jq -c ".faults[$i]" "$PLAN_FILE")
    f_type=$(jq -r '.type' <<<"$fault_entry")
    f_duration=$(jq -r '.duration // 30' <<<"$fault_entry")
    f_target=$(jq -r '.target // empty' <<<"$fault_entry")
    f_iface=$(jq -r '.iface // empty' <<<"$fault_entry")

    [[ "$f_duration" =~ ^[0-9]+$ ]] \
      || die "Step $((i + 1)): duration must be a whole number of seconds (got '${f_duration}')"

    log "Step $((i + 1))/$plan_length: $f_type (${f_duration}s)"
    FAULT_TYPE="$f_type"
    DURATION="$f_duration"
    TARGET_SERVICE="${f_target:-$base_target}"
    NETWORK_IFACE="${f_iface:-$base_iface}"

    case "$FAULT_TYPE" in
      cpu-stress)      fault_cpu_stress ;;
      memory-pressure) fault_memory_pressure ;;
      disk-fill)       fault_disk_fill ;;
      service-kill)    fault_service_kill ;;
      network-latency) fault_network_latency ;;
      network-drop)    fault_network_drop ;;
      dns-failure)     fault_dns_failure ;;
      io-latency)      fault_io_latency ;;
      *) warn "Unknown fault type in plan: $FAULT_TYPE — skipping" ;;
    esac

    if (( i < plan_length - 1 )); then
      log "Pausing 5s before next fault..."
      sleep 5
    fi
  done
  log "Plan execution complete"
}
# ---------------------------------------------------------------------------
# Force cleanup
# ---------------------------------------------------------------------------
# --cleanup mode: remove whatever an earlier, interrupted run left behind.
# A fresh process starts with empty tracking arrays, so leftovers are first
# rediscovered by name (tmpfs mount directories under /tmp, fill and I/O
# scratch files under FILL_PATH) and registered; cleanup_all then removes
# them together with any netem qdisc and resolv.conf backup.
# Globals: FILL_PATH (read); CLEANUP_FILES, CHAOS_ACTIVE (written)
do_cleanup() {
  require_root
  section_header "Force Cleanup"

  local candidate
  for candidate in \
    /tmp/chaos-mem-* \
    "${FILL_PATH%/}"/chaos-fill-* \
    "${FILL_PATH%/}"/chaos-io-*; do
    # An unmatched glob stays literal (-e fails). Symlinks are skipped so a
    # planted link can never redirect the unmount/removal elsewhere.
    [[ -e "$candidate" && ! -L "$candidate" ]] || continue
    CLEANUP_FILES+=("$candidate")
  done

  # cleanup_all is a no-op unless the flag is set, so arm it explicitly.
  CHAOS_ACTIVE=true
  cleanup_all
  log "Force cleanup complete"
}
# ---------------------------------------------------------------------------
# Help
# ---------------------------------------------------------------------------
# Print the full usage text (modes, fault types, options, environment
# variables, plan-file format, examples) on stdout.
# The here-document delimiter is unquoted, so ${SCRIPT_NAME} is expanded and
# the text always shows the name the script was invoked under.
show_help() {
cat <<EOF
${SCRIPT_NAME} — Controlled failure injection and recovery verification
USAGE
sudo ./${SCRIPT_NAME} --fault TYPE [OPTIONS]
sudo ./${SCRIPT_NAME} --list
sudo ./${SCRIPT_NAME} --verify
sudo ./${SCRIPT_NAME} --plan FILE [--yes]
sudo ./${SCRIPT_NAME} --cleanup
MODES
--fault TYPE Inject a specific fault for --duration seconds
--list List all available fault types
--verify Run system health checks
--plan FILE Execute a JSON plan file of sequential faults
--cleanup Force cleanup of any leftover chaos artifacts
FAULT TYPES
cpu-stress Saturate all CPU cores
memory-pressure Fill memory via tmpfs allocation
disk-fill Fill disk to threshold percentage
service-kill Stop a systemd service (requires --target)
network-latency Add network latency via tc netem
network-drop Drop packets via tc netem
dns-failure Break DNS resolution temporarily
io-latency Degrade I/O performance
OPTIONS
--duration SEC Fault duration in seconds (default: 30)
--target SERVICE Target service for service-kill
--fill-path PATH Path for disk-fill (default: /tmp)
--fill-size PCT Disk fill target percent (default: 90)
--latency MS Latency in ms for network-latency (default: 200)
--drop PCT Drop percent for network-drop (default: 50)
--iface IFACE Network interface (default: eth0)
--yes Skip confirmation prompts
--verbose Enable verbose output
--no-color Disable color output
--help Show this help message
ENVIRONMENT VARIABLES
CHAOS_DURATION Default duration (seconds)
CHAOS_FILL_PATH Default fill path
CHAOS_FILL_SIZE Default fill size (percent)
CHAOS_LATENCY Default network latency (ms)
CHAOS_DROP Default packet drop (percent)
CHAOS_IFACE Default network interface
VERBOSE Set to 'true' for verbose output
COLOR Set to 'never' to disable colors
PLAN FILE FORMAT (JSON)
{
"faults": [
{ "type": "cpu-stress", "duration": 15 },
{ "type": "network-latency", "duration": 20 },
{ "type": "service-kill", "duration": 10, "target": "nginx" }
]
}
EXAMPLES
sudo ./${SCRIPT_NAME} --fault cpu-stress --duration 30
sudo ./${SCRIPT_NAME} --fault service-kill --target nginx --duration 60
sudo ./${SCRIPT_NAME} --fault network-latency --latency 500 --iface ens33
sudo ./${SCRIPT_NAME} --verify
sudo ./${SCRIPT_NAME} --plan chaos-plan.json --yes
EOF
}
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# parse_args ARG...
# Translate command-line options into the global settings, then — for the
# modes that inject faults — validate every numeric setting, whether it came
# from an option or from an environment default. Bad values are rejected here
# with a clear message instead of failing later inside arithmetic or tc.
# Globals (written): RUN_MODE FAULT_TYPE PLAN_FILE DURATION TARGET_SERVICE
#                    FILL_PATH FILL_SIZE LATENCY_MS DROP_PERCENT NETWORK_IFACE
#                    CONFIRM_YES VERBOSE COLOR
# Exits via die() on an unknown option, a missing value, or an invalid number.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --fault)
        RUN_MODE="fault"
        FAULT_TYPE="${2:-}"
        [[ -n "$FAULT_TYPE" ]] || die "--fault requires a TYPE argument"
        shift 2
        ;;
      --list)
        RUN_MODE="list"
        shift
        ;;
      --verify)
        RUN_MODE="verify"
        shift
        ;;
      --plan)
        RUN_MODE="plan"
        PLAN_FILE="${2:-}"
        [[ -n "$PLAN_FILE" ]] || die "--plan requires a FILE argument"
        shift 2
        ;;
      --cleanup)
        RUN_MODE="cleanup"
        shift
        ;;
      --duration)
        DURATION="${2:-}"
        [[ -n "$DURATION" ]] || die "--duration requires a value"
        shift 2
        ;;
      --target)
        TARGET_SERVICE="${2:-}"
        [[ -n "$TARGET_SERVICE" ]] || die "--target requires a SERVICE name"
        shift 2
        ;;
      --fill-path)
        FILL_PATH="${2:-}"
        [[ -n "$FILL_PATH" ]] || die "--fill-path requires a PATH"
        shift 2
        ;;
      --fill-size)
        FILL_SIZE="${2:-}"
        [[ -n "$FILL_SIZE" ]] || die "--fill-size requires a percentage"
        shift 2
        ;;
      --latency)
        LATENCY_MS="${2:-}"
        [[ -n "$LATENCY_MS" ]] || die "--latency requires a value in ms"
        shift 2
        ;;
      --drop)
        DROP_PERCENT="${2:-}"
        [[ -n "$DROP_PERCENT" ]] || die "--drop requires a percentage"
        shift 2
        ;;
      --iface)
        NETWORK_IFACE="${2:-}"
        [[ -n "$NETWORK_IFACE" ]] || die "--iface requires an interface name"
        shift 2
        ;;
      --yes)
        CONFIRM_YES=true
        shift
        ;;
      --verbose)
        VERBOSE="true"
        shift
        ;;
      --no-color)
        COLOR="never"
        shift
        ;;
      --help | -h)
        RUN_MODE="help"
        shift
        ;;
      *)
        die "Unknown option: $1 (see --help)"
        ;;
    esac
  done

  # Only the injecting modes consume the numeric settings; --help, --list,
  # --verify and --cleanup must keep working even with a bad environment.
  case "$RUN_MODE" in
    fault | plan) ;;
    *) return 0 ;;
  esac

  local re_int='^[0-9]+$'
  local re_num='^[0-9]+([.][0-9]+)?$'

  if [[ ! "$DURATION" =~ $re_int ]]; then
    die "Invalid duration '${DURATION}' — expected a whole number of seconds"
  fi
  # 10# forces base 10 so values such as "08" are not rejected as bad octal
  # when they later reach arithmetic contexts.
  DURATION=$(( 10#$DURATION ))

  if [[ ! "$FILL_SIZE" =~ $re_int ]] || (( 10#$FILL_SIZE > 100 )); then
    die "Invalid fill size '${FILL_SIZE}' — expected a whole percentage from 0 to 100"
  fi
  FILL_SIZE=$(( 10#$FILL_SIZE ))

  if [[ ! "$LATENCY_MS" =~ $re_num ]]; then
    die "Invalid latency '${LATENCY_MS}' — expected a non-negative number of milliseconds"
  fi

  if [[ ! "$DROP_PERCENT" =~ $re_num ]] \
    || ! awk -v p="$DROP_PERCENT" 'BEGIN { exit !(p >= 0 && p <= 100) }'; then
    die "Invalid drop percentage '${DROP_PERCENT}' — expected a number from 0 to 100"
  fi
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Entry point: parse the command line, set up colours, then run the handler
# for the selected mode. With no mode given, show the help text and fail.
main() {
  parse_args "$@"
  setup_colors

  local action=""
  case "$RUN_MODE" in
    fault | list | verify | plan | cleanup)
      action="do_${RUN_MODE}"
      ;;
    help)
      action="show_help"
      ;;
    "")
      show_help
      die "No mode specified — use --fault, --list, --verify, --plan, or --cleanup"
      ;;
    *)
      die "Unknown mode: $RUN_MODE"
      ;;
  esac
  "$action"
}
# Run the script: hand every command-line argument, unmodified, to main().
main "$@"