#!/usr/bin/env bash
#####################################################################################
#### kubernetes-smoke-tests.sh — Verify Kubernetes cluster health                ####
#### Checks API server, etcd, CoreDNS, scheduling, services, PVC, RBAC, certs.   ####
####                                                                             ####
#### Author:  Phil Connor                                                        ####
#### Contact: contact@mylinux.work                                               ####
#### License: MIT                                                                ####
#### Version: 1.0                                                                ####
####                                                                             ####
#### Usage: ./kubernetes-smoke-tests.sh                                          ####
####        KUBECONFIG=/path/to/kubeconfig ./kubernetes-smoke-tests.sh           ####
####                                                                             ####
#### See --help for all options.                                                 ####
#####################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting can be overridden from the environment (see show_help).
TEST_NAMESPACE="${TEST_NAMESPACE:-smoke-test-$$}"
TEST_IMAGE="${TEST_IMAGE:-busybox:latest}"
SKIP_PVC="${SKIP_PVC:-false}"
SKIP_NETPOL="${SKIP_NETPOL:-false}"
SKIP_SCHEDULING="${SKIP_SCHEDULING:-false}"
POD_TIMEOUT="${POD_TIMEOUT:-60}"
STORAGE_CLASS="${STORAGE_CLASS:-}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
CERT_MIN_DAYS="${CERT_MIN_DAYS:-30}"

# ── State ─────────────────────────────────────────────────────────────
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()                # TAP result lines, in execution order
START_TIME=$(date +%s)
NS_READY="false"          # namespace creation has been attempted
NS_CREATED="false"        # namespace was created by THIS run (safe to delete)

# ── Colors ────────────────────────────────────────────────────────────

#######################################
# Populate the color variables according to $COLOR.
# Colors are enabled for COLOR=always, or for COLOR=auto when stdout is a
# terminal; otherwise every variable is set to the empty string.
# Globals: COLOR (read); RED GREEN YELLOW BLUE BOLD RESET (written)
#######################################
setup_colors() {
  RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
  if [[ "$COLOR" == "never" ]]; then
    return 0
  fi
  if [[ "$COLOR" == "always" || -t 1 ]]; then
    # Real ESC characters (ANSI-C quoting) so callers can print with
    # printf '%s' and never escape-interpret the message text.
    RED=$'\033[0;31m'
    GREEN=$'\033[0;32m'
    YELLOW=$'\033[0;33m'
    BLUE=$'\033[0;34m'
    BOLD=$'\033[1m'
    RESET=$'\033[0m'
  fi
}

# ── Helpers ───────────────────────────────────────────────────────────

#######################################
# Print a diagnostic line to stderr when VERBOSE=true.
# Arguments: message words
#######################################
log() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf '%s# %s%s\n' "$BLUE" "$*" "$RESET" >&2
  fi
}

#######################################
# Print one human-readable result line to stderr (text format only).
# Arguments: $1 - color, $2 - status word, $3 - message
#######################################
report() {
  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    printf '%s %s%s %s\n' "$1" "$2" "$RESET" "$3" >&2
  fi
}

#######################################
# Record a passing test.
# Arguments: $1 - test description
#######################################
pass() {
  TOTAL=$((TOTAL + 1))
  PASS=$((PASS + 1))
  RESULTS+=("ok $TOTAL - $1")
  report "$GREEN" "PASS" "$1"
}

#######################################
# Record a failing test.
# Arguments: $1 - test description
#######################################
fail() {
  TOTAL=$((TOTAL + 1))
  FAIL=$((FAIL + 1))
  RESULTS+=("not ok $TOTAL - $1")
  report "$RED" "FAIL" "$1"
}

#######################################
# Record a skipped test (TAP "ok ... # SKIP reason").
# Arguments: $1 - test description, $2 - reason
#######################################
skip() {
  TOTAL=$((TOTAL + 1))
  SKIP=$((SKIP + 1))
  RESULTS+=("ok $TOTAL - $1 # SKIP $2")
  report "$YELLOW" "SKIP" "$1 ($2)"
}

#######################################
# Print one JSONPath field of a namespaced object; prints nothing on error.
# Arguments: $1 - kind, $2 - name, $3 - namespace, $4 - JSONPath template
#######################################
kube_field() {
  # Errors (object not created yet, API hiccup) are expected while polling.
  kubectl get "$1" "$2" -n "$3" -o "jsonpath=$4" 2>/dev/null || true
}

#######################################
# Count the pods matched by the given `kubectl get pods` arguments.
# Prints 0 when kubectl fails, so callers report a failed test instead of
# the script aborting under `set -e -o pipefail`.
# Arguments: extra arguments for `kubectl get pods`
#######################################
count_pods() {
  local listing
  listing=$(kubectl get pods "$@" --no-headers 2>/dev/null || true)
  awk 'NF { n++ } END { print n + 0 }' <<<"$listing"
}

#######################################
# Apply the manifest read from stdin to the test namespace.
# Returns: kubectl's exit status
#######################################
apply_manifest() {
  kubectl apply -n "$TEST_NAMESPACE" -f - &>/dev/null
}

#######################################
# Create the test namespace once per run.
# NS_CREATED is set only when the creation succeeded, so a namespace that
# already existed (e.g. a user-supplied TEST_NAMESPACE) is never deleted
# by cleanup. If creation fails for another reason the tests that follow
# fail on their own.
# Globals: TEST_NAMESPACE (read); NS_READY, NS_CREATED (written)
#######################################
ensure_namespace() {
  if [[ "$NS_READY" == "true" ]]; then
    return 0
  fi
  NS_READY="true"
  if kubectl create namespace "$TEST_NAMESPACE" &>/dev/null; then
    NS_CREATED="true"
  fi
}

#######################################
# Poll until a pod reports phase Running.
# Arguments: $1 - pod name, $2 - namespace, $3 - timeout in seconds
# Returns:   0 when Running; 1 on phase Failed/Error or on timeout
#######################################
wait_for_pod() {
  local name="$1" ns="$2" timeout="$3"
  local deadline=$((SECONDS + timeout))
  local phase
  while ((SECONDS < deadline)); do
    phase=$(kube_field pod "$name" "$ns" '{.status.phase}')
    case "$phase" in
      Running) return 0 ;;
      Failed | Error) return 1 ;;
    esac
    sleep 2
  done
  return 1
}

#######################################
# Poll until a PersistentVolumeClaim reports phase Bound.
# Arguments: $1 - PVC name, $2 - namespace, $3 - timeout in seconds
# Returns:   0 when Bound; 1 on timeout
#######################################
wait_for_pvc() {
  local name="$1" ns="$2" timeout="$3"
  local deadline=$((SECONDS + timeout))
  local phase
  while ((SECONDS < deadline)); do
    phase=$(kube_field pvc "$name" "$ns" '{.status.phase}')
    if [[ "$phase" == "Bound" ]]; then
      return 0
    fi
    sleep 2
  done
  return 1
}

#######################################
# Poll until an Endpoints object lists at least one ready address.
# Arguments: $1 - endpoints name, $2 - namespace, $3 - timeout in seconds
# Returns:   0 when an address is present; 1 on timeout
#######################################
wait_for_endpoint() {
  local name="$1" ns="$2" timeout="$3"
  local deadline=$((SECONDS + timeout))
  local addrs
  while ((SECONDS < deadline)); do
    addrs=$(kube_field endpoints "$name" "$ns" '{.subsets[0].addresses[0].ip}')
    if [[ -n "$addrs" ]]; then
      return 0
    fi
    sleep 2
  done
  return 1
}

# ── Cleanup ───────────────────────────────────────────────────────────

#######################################
# EXIT trap: delete the test namespace, but only if this run created it.
#######################################
cleanup() {
  if [[ "$NS_CREATED" == "true" ]]; then
    log "Cleaning up namespace $TEST_NAMESPACE"
    # Best effort: do not block exit or change the exit status.
    kubectl delete namespace "$TEST_NAMESPACE" --ignore-not-found --wait=false &>/dev/null || true
  fi
}
trap cleanup EXIT

# ── Help ──────────────────────────────────────────────────────────────

#######################################
# Print usage information to stdout and exit 0.
#######################################
show_help() {
  cat <<'HELP'
kubernetes-smoke-tests.sh — Verify Kubernetes cluster health

Environment variables:
  KUBECONFIG        Path to kubeconfig (default: kubectl default)
  TEST_NAMESPACE    Namespace for test resources (default: smoke-test-<pid>)
  TEST_IMAGE        Image for test pods (default: busybox:latest)
  SKIP_PVC          Skip PVC test (default: false)
  SKIP_NETPOL       Skip network policy test (default: false)
  SKIP_SCHEDULING   Skip pod scheduling test (default: false)
  POD_TIMEOUT       Seconds to wait for pods (default: 60)
  STORAGE_CLASS     StorageClass for PVC test (default: cluster default)
  CERT_MIN_DAYS     Minimum remaining API server cert validity in days (default: 30)
  OUTPUT_FORMAT     Output format: text, tap (default: text)
  COLOR             Color output: auto, always, never (default: auto)
  VERBOSE           Verbose logging: true/false (default: false)

Exit codes:
  0  All tests passed
  1  One or more tests failed
  2  Script error
HELP
  exit 0
}

# ── Tests ─────────────────────────────────────────────────────────────

# Passes when `kubectl cluster-info` succeeds.
test_api_server() {
  log "Checking API server"
  if kubectl cluster-info &>/dev/null; then
    pass "API server reachable"
  else
    fail "API server unreachable"
  fi
}

# Passes when at least one node exists and every node line shows " Ready ".
test_nodes_ready() {
  log "Checking node readiness"
  local listing total not_ready
  # `|| true`: a kubectl failure must surface as "no nodes found", not abort
  # the script. Counting with awk (always exits 0) avoids the grep-exits-1
  # trap under pipefail when every node is Ready.
  listing=$(kubectl get nodes --no-headers 2>/dev/null || true)
  total=$(awk 'NF { n++ } END { print n + 0 }' <<<"$listing")
  not_ready=$(awk 'NF && !/ Ready / { n++ } END { print n + 0 }' <<<"$listing")

  if ((not_ready == 0 && total > 0)); then
    pass "all $total nodes Ready"
  elif ((total == 0)); then
    fail "no nodes found"
  else
    fail "$not_ready of $total nodes not Ready"
  fi
}

# Passes when each control plane component has at least one Running pod in
# kube-system, matched by the `component` label or, failing that, by pod
# name prefix.
test_control_plane() {
  log "Checking control plane pods"
  local components=("kube-apiserver" "kube-controller-manager" "kube-scheduler" "etcd")
  local missing=()
  local comp count
  for comp in "${components[@]}"; do
    count=$(count_pods -n kube-system -l "component=$comp" --field-selector=status.phase=Running)
    if ((count == 0)); then
      # Try by name prefix for managed clusters
      count=$(
        kubectl get pods -n kube-system --no-headers 2>/dev/null \
          | awk -v prefix="$comp" 'index($1, prefix) == 1 && /Running/ { n++ } END { print n + 0 }' \
          || true
      )
    fi
    if ((count == 0)); then
      missing+=("$comp")
    fi
  done

  if ((${#missing[@]} == 0)); then
    pass "control plane pods healthy"
  else
    fail "control plane pods missing: ${missing[*]}"
  fi
}

# Passes when at least one pod labelled k8s-app=kube-dns is Running.
test_coredns_running() {
  log "Checking CoreDNS"
  local running
  running=$(count_pods -n kube-system -l k8s-app=kube-dns --field-selector=status.phase=Running)
  if ((running > 0)); then
    pass "CoreDNS running ($running pods)"
  else
    fail "CoreDNS not running"
  fi
}

# Starts a pod and resolves kubernetes.default.svc.cluster.local from inside it.
test_dns_resolution() {
  if [[ "$SKIP_SCHEDULING" == "true" ]]; then
    skip "DNS resolution" "SKIP_SCHEDULING=true"
    return 0
  fi
  log "Testing DNS resolution inside cluster"
  ensure_namespace

  local pod="smoke-dns-$$"
  # Best effort: a failed `run` is reported by wait_for_pod timing out.
  kubectl run "$pod" \
    --namespace="$TEST_NAMESPACE" \
    --image="$TEST_IMAGE" \
    --restart=Never \
    --command -- sleep 300 &>/dev/null || true

  if wait_for_pod "$pod" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then
    if kubectl exec "$pod" -n "$TEST_NAMESPACE" -- \
      nslookup kubernetes.default.svc.cluster.local &>/dev/null; then
      pass "DNS resolution working (kubernetes.default)"
    else
      fail "DNS resolution failed inside pod"
    fi
  else
    fail "DNS test pod did not reach Running state"
  fi

  kubectl delete pod "$pod" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Passes when a short-lived pod reaches Running within POD_TIMEOUT.
test_pod_scheduling() {
  if [[ "$SKIP_SCHEDULING" == "true" ]]; then
    skip "pod scheduling" "SKIP_SCHEDULING=true"
    return 0
  fi
  log "Testing pod scheduling"
  ensure_namespace

  local pod="smoke-sched-$$"
  # Best effort: a failed `run` is reported by wait_for_pod timing out.
  kubectl run "$pod" \
    --namespace="$TEST_NAMESPACE" \
    --image="$TEST_IMAGE" \
    --restart=Never \
    --command -- sleep 10 &>/dev/null || true

  if wait_for_pod "$pod" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then
    pass "pod scheduling working"
  else
    fail "pod did not reach Running within ${POD_TIMEOUT}s"
  fi

  kubectl delete pod "$pod" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Creates a deployment, exposes it, and passes when the Endpoints object
# gets an address within POD_TIMEOUT.
test_service_endpoint() {
  if [[ "$SKIP_SCHEDULING" == "true" ]]; then
    skip "service endpoint" "SKIP_SCHEDULING=true"
    return 0
  fi
  log "Testing service endpoint creation"
  ensure_namespace

  local name="smoke-svc-$$"
  # Best effort: creation failures are reported by wait_for_endpoint timing out.
  kubectl create deployment "$name" \
    --namespace="$TEST_NAMESPACE" \
    --image="$TEST_IMAGE" \
    -- sleep 300 &>/dev/null || true
  kubectl expose deployment "$name" \
    --namespace="$TEST_NAMESPACE" \
    --port=80 --target-port=80 &>/dev/null || true

  if wait_for_endpoint "$name" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then
    pass "service endpoint has addresses"
  else
    fail "service endpoint has no addresses"
  fi

  kubectl delete deployment "$name" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
  kubectl delete service "$name" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Creates a 1Gi ReadWriteOnce claim and passes when it becomes Bound.
# NOTE(review): no pod consumes the claim, so a StorageClass that binds only
# on first consumer would presumably never reach Bound here — confirm, and
# use SKIP_PVC=true or STORAGE_CLASS on such clusters.
test_pvc_provisioning() {
  if [[ "$SKIP_PVC" == "true" ]]; then
    skip "PVC provisioning" "SKIP_PVC=true"
    return 0
  fi
  log "Testing PVC provisioning"
  ensure_namespace

  local pvc="smoke-pvc-$$"
  local sc_spec=""
  if [[ -n "$STORAGE_CLASS" ]]; then
    sc_spec="storageClassName: $STORAGE_CLASS"
  fi

  # An empty sc_spec leaves a whitespace-only line, which YAML ignores.
  if ! apply_manifest <<PVC_EOF; then
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $pvc
spec:
  accessModes: [ReadWriteOnce]
  ${sc_spec}
  resources:
    requests:
      storage: 1Gi
PVC_EOF
    fail "PVC did not reach Bound state"
  elif wait_for_pvc "$pvc" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then
    pass "PVC provisioned and Bound"
  else
    fail "PVC did not reach Bound state"
  fi

  kubectl delete pvc "$pvc" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Creates a ServiceAccount, a Role allowing get/list on pods and a
# RoleBinding, then passes when `kubectl auth can-i` confirms the
# ServiceAccount may get pods.
test_rbac() {
  log "Testing RBAC"
  ensure_namespace

  local sa="smoke-sa-$$" role="smoke-role-$$" binding="smoke-rb-$$"
  local created="true"

  kubectl create serviceaccount "$sa" -n "$TEST_NAMESPACE" &>/dev/null || created="false"

  apply_manifest <<ROLE_EOF || created="false"
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: $role
rules:
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "list"]
ROLE_EOF

  apply_manifest <<RB_EOF || created="false"
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: $binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: $role
subjects:
- kind: ServiceAccount
  name: $sa
  namespace: $TEST_NAMESPACE
RB_EOF

  if [[ "$created" != "true" ]]; then
    fail "RBAC objects could not be created"
  elif kubectl auth can-i get pods \
    --namespace="$TEST_NAMESPACE" \
    --as="system:serviceaccount:${TEST_NAMESPACE}:${sa}" &>/dev/null; then
    pass "RBAC working (ServiceAccount can get pods)"
  else
    fail "RBAC check failed"
  fi

  # Explicit cleanup so nothing is left behind in a pre-existing namespace.
  kubectl delete rolebinding "$binding" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
  kubectl delete role "$role" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
  kubectl delete serviceaccount "$sa" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Starts a target pod, applies a deny-all ingress NetworkPolicy and passes
# when the policy object can be read back. This checks that the API accepts
# the policy; it does not send traffic to verify enforcement.
test_network_policy() {
  if [[ "$SKIP_NETPOL" == "true" ]]; then
    skip "network policy" "SKIP_NETPOL=true"
    return 0
  fi
  if [[ "$SKIP_SCHEDULING" == "true" ]]; then
    skip "network policy" "SKIP_SCHEDULING=true"
    return 0
  fi
  log "Testing network policy enforcement"
  ensure_namespace

  local pod="smoke-netpol-target-$$" policy="smoke-deny-all-$$"

  # Best effort: a failed `run` is reported by wait_for_pod timing out.
  kubectl run "$pod" \
    --namespace="$TEST_NAMESPACE" \
    --image="$TEST_IMAGE" \
    --restart=Never \
    --command -- sleep 300 &>/dev/null || true

  if wait_for_pod "$pod" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then
    # Apply deny-all ingress policy; a failed apply is caught by the
    # read-back check below.
    apply_manifest <<NP_EOF || true
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: $policy
spec:
  podSelector: {}
  policyTypes: [Ingress]
NP_EOF

    if kubectl get networkpolicy "$policy" -n "$TEST_NAMESPACE" &>/dev/null; then
      pass "network policy applied (deny-all ingress)"
    else
      fail "network policy not applied"
    fi
  else
    fail "network policy test pod did not start"
  fi

  kubectl delete pod "$pod" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
  kubectl delete networkpolicy "$policy" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true
}

# Reads the API server certificate with openssl and passes when at least
# CERT_MIN_DAYS days of validity remain. Skips when the server address,
# openssl, the certificate or its end date cannot be obtained.
test_cert_expiry() {
  log "Checking API server certificate expiry"
  local api_server
  api_server=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null || echo "")
  if [[ -z "$api_server" ]]; then
    skip "certificate expiry" "cannot determine API server address"
    return 0
  fi

  # Reduce the URL to its authority part (drop scheme and any path), then
  # split off a trailing numeric port; default to 443 when none is given.
  local authority host port
  authority=${api_server#*://}
  authority=${authority%%/*}
  if [[ "$authority" =~ ^(.+):([0-9]+)$ ]]; then
    host=${BASH_REMATCH[1]}
    port=${BASH_REMATCH[2]}
  else
    host=$authority
    port=443
  fi

  if ! command -v openssl &>/dev/null; then
    skip "certificate expiry" "openssl not installed"
    return 0
  fi

  local end_date
  # `|| true`: a failed handshake must lead to the skip below, not abort the
  # script through pipefail.
  end_date=$(
    openssl s_client -connect "${host}:${port}" -servername "$host" </dev/null 2>/dev/null \
      | openssl x509 -noout -enddate 2>/dev/null \
      | cut -d= -f2 \
      || true
  )
  if [[ -z "$end_date" ]]; then
    skip "certificate expiry" "could not read certificate"
    return 0
  fi

  local exp_epoch now_epoch days_left
  # GNU date first, then the BSD/macOS form.
  exp_epoch=$(date -d "$end_date" +%s 2>/dev/null \
    || date -jf "%b %d %T %Y %Z" "$end_date" +%s 2>/dev/null \
    || echo 0)
  now_epoch=$(date +%s)
  if [[ "$exp_epoch" -eq 0 ]]; then
    skip "certificate expiry" "could not parse date"
    return 0
  fi

  days_left=$(((exp_epoch - now_epoch) / 86400))
  if ((days_left >= CERT_MIN_DAYS)); then
    pass "API server cert valid ($days_left days remaining)"
  else
    fail "API server cert expiring in $days_left days (min $CERT_MIN_DAYS)"
  fi
}

# Passes when no node reports MemoryPressure, DiskPressure or PIDPressure
# with status True.
test_resource_pressure() {
  log "Checking node resource pressure"
  # One line per node: "<name>\t<Type>=<Status> <Type>=<Status> ..."
  local template='{range .items[*]}{.metadata.name}{"\t"}{range .status.conditions[*]}{.type}={.status}{" "}{end}{"\n"}{end}'
  local node_conditions
  if ! node_conditions=$(kubectl get nodes -o "jsonpath=$template" 2>/dev/null); then
    fail "could not read node conditions"
    return 0
  fi

  local conditions=("MemoryPressure" "DiskPressure" "PIDPressure")
  local pressure_nodes=()
  local cond affected
  for cond in "${conditions[@]}"; do
    # Space-separated names of the nodes where this condition is True.
    affected=$(awk -F'\t' -v want="${cond}=True" '
      {
        n = split($2, kv, " ")
        for (i = 1; i <= n; i++) {
          if (kv[i] == want) {
            printf "%s%s", sep, $1
            sep = " "
          }
        }
      }' <<<"$node_conditions")
    if [[ -n "$affected" ]]; then
      pressure_nodes+=("$cond: $affected")
    fi
  done

  if ((${#pressure_nodes[@]} == 0)); then
    pass "no node resource pressure"
  else
    fail "node pressure detected: ${pressure_nodes[*]}"
  fi
}

# ── Main ──────────────────────────────────────────────────────────────

#######################################
# Entry point: handle --help, run the preflight check, execute every test
# and print the summary.
# Arguments: script arguments (only -h / --help is recognised)
# Returns:   0 when no test failed, 1 otherwise; exits 2 without kubectl
#######################################
main() {
  if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
    show_help
  fi

  # ── Preflight ──
  setup_colors

  if ! command -v kubectl &>/dev/null; then
    echo "ERROR: kubectl not found" >&2
    exit 2
  fi

  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    printf '%sKubernetes Smoke Tests%s\n' "$BOLD" "$RESET" >&2
    printf 'Namespace: %s%s%s\n' "$BLUE" "$TEST_NAMESPACE" "$RESET" >&2
    printf '\n' >&2
  fi

  # ── Run Tests ──
  test_api_server
  test_nodes_ready
  test_control_plane
  test_coredns_running
  test_dns_resolution
  test_pod_scheduling
  test_service_endpoint
  test_pvc_provisioning
  test_rbac
  test_network_policy
  test_cert_expiry
  test_resource_pressure

  # ── Output ──
  local end_time duration r
  end_time=$(date +%s)
  duration=$((end_time - START_TIME))

  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "TAP version 13"
    echo "1..$TOTAL"
    for r in "${RESULTS[@]}"; do
      printf '%s\n' "$r"
    done
  fi

  echo ""
  if [[ "$OUTPUT_FORMAT" == "text" ]]; then
    printf '%sResults:%s pass=%s fail=%s skip=%s total=%s (%ss)\n' \
      "$BOLD" "$RESET" "$PASS" "$FAIL" "$SKIP" "$TOTAL" "$duration"
  else
    echo "# pass=$PASS fail=$FAIL skip=$SKIP total=$TOTAL duration=${duration}s"
  fi

  if ((FAIL > 0)); then
    return 1
  fi
  return 0
}

main "$@"