#!/usr/bin/env bash
#########################################################################################
#### rds-snapshot-manager.sh — Create, manage, audit, and prune AWS RDS snapshots    ####
#### Supports automated creation, cross-region copy, retention, and orphan detection ####
#### Requires: bash 4+, aws-cli v2, jq                                               ####
####                                                                                 ####
#### Author: Phil Connor                                                             ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.01                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   export AWS_PROFILE="production"                                               ####
####   ./rds-snapshot-manager.sh --snapshot                                          ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting below can be pre-set in the environment.
AWS_REGION="${AWS_REGION:-}"
DB_IDENTIFIER="${DB_IDENTIFIER:-}"
DB_TAG_KEY="${DB_TAG_KEY:-}"
DB_TAG_VALUE="${DB_TAG_VALUE:-}"
RETENTION_DAYS="${RETENTION_DAYS:-30}"
COPY_TO_REGION="${COPY_TO_REGION:-}"
DRY_RUN="${DRY_RUN:-true}"
NO_WAIT="${NO_WAIT:-false}"
RESTORE_INSTANCE_CLASS="${RESTORE_INSTANCE_CLASS:-}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
TARGET_SNAPSHOT=""
START_TIME=""
WARNINGS=0

# Color variables start out empty so the logging helpers are safe under
# `set -u` even if one of them runs before setup_colors.
RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""

# ── Colors ────────────────────────────────────────────────────────────
#######################################
# Populate the color variables according to COLOR.
# Globals:
#   COLOR (read): "never" disables colors, "always" forces them, any other
#     value enables them only when stdout is a terminal.
#   RED GREEN YELLOW BLUE BOLD DIM RESET (written): escape sequences stored
#     as literal backslash strings; they must be printed with `echo -e` or
#     printf's %b to take effect.
#######################################
setup_colors() {
  RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""
  if [[ "$COLOR" == "never" ]]; then
    return 0
  fi
  if [[ "$COLOR" == "always" || -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# printf '%b' expands backslash escapes the same way `echo -e` does, so the
# color variables (and colors embedded in a message by callers) still work,
# but a message that happens to start with "-n"/"-e" is no longer eaten as
# an echo option.

# Informational message on stdout.
log() { printf '%b\n' "${BLUE}[INFO]${RESET} $*"; }

# Warning on stderr; also bumps the global WARNINGS counter.
warn() {
  printf '%b\n' "${YELLOW}[WARN]${RESET} $*" >&2
  WARNINGS=$((WARNINGS + 1))
}

# Error message on stderr (does not exit).
err() { printf '%b\n' "${RED}[ERROR]${RESET} $*" >&2; }

# Debug message, printed only when VERBOSE is "true".
# It goes to stderr: aws_cmd calls this from inside command substitutions,
# and a debug line on stdout would be captured together with the AWS output
# and corrupt it.
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf '%b\n' "${DIM}[DEBUG]${RESET} $*" >&2
  fi
}

# Print an error and terminate the script with status 1.
die() {
  err "$*"
  exit 1
}

# ── AWS CLI wrapper ───────────────────────────────────────────────────
#######################################
# Run the AWS CLI, appending --region when AWS_REGION is set.
# Globals:   AWS_REGION (read)
# Arguments: the aws sub-command and its options
# Outputs:   whatever aws prints; a debug trace on stderr in verbose mode
# Returns:   exit status of aws
#######################################
aws_cmd() {
  local args=("$@")
  if [[ -n "$AWS_REGION" ]]; then
    args+=(--region "$AWS_REGION")
  fi
  verbose "aws ${args[*]}"
  aws "${args[@]}"
}

# ── Dependency check ──────────────────────────────────────────────────
#######################################
# Verify that aws and jq are installed, that credentials work, and that a
# region is known; exits through die() otherwise.
# Globals: AWS_REGION (read; filled from `aws configure get region` if empty)
#######################################
check_deps() {
  local cmd
  for cmd in aws jq; do
    command -v "$cmd" &>/dev/null || die "${cmd} is required but not installed"
  done

  aws sts get-caller-identity &>/dev/null \
    || die "AWS credentials not configured or expired"

  if [[ -z "$AWS_REGION" ]]; then
    AWS_REGION=$(aws configure get region 2>/dev/null || echo "")
    [[ -n "$AWS_REGION" ]] \
      || die "AWS_REGION is required (set via env var or aws configure)"
  fi

  verbose "Using region: ${AWS_REGION}"
}

# ── Get DB instance list ──────────────────────────────────────────────
#######################################
# Check whether a DB instance carries the tag selected by DB_TAG_KEY /
# DB_TAG_VALUE.
# Globals:   DB_TAG_KEY, DB_TAG_VALUE (read). An empty DB_TAG_VALUE (or "*")
#            accepts any value for the key.
# Arguments: $1 - ARN of the DB instance
# Returns:   0 when the tag matches; 1 when it does not or when the tags
#            could not be listed (the instance is then skipped, best effort)
#######################################
_db_has_tag() {
  local arn="$1"
  local tags key value

  # One "Key<TAB>Value" line per tag. Comparing in bash keeps the
  # user-supplied key/value out of any query or filter program.
  tags=$(aws_cmd rds list-tags-for-resource \
    --resource-name "$arn" \
    --query 'TagList[*].[Key,Value]' \
    --output text 2>/dev/null) || return 1

  while IFS=$'\t' read -r key value; do
    [[ "$key" == "$DB_TAG_KEY" ]] || continue
    if [[ -z "$DB_TAG_VALUE" || "$DB_TAG_VALUE" == "*" \
      || "$value" == "$DB_TAG_VALUE" ]]; then
      return 0
    fi
  done <<< "$tags"
  return 1
}

#######################################
# Print the identifiers of the DB instances to operate on, one per line.
# Globals:
#   DB_IDENTIFIER (read): when set it is printed as-is and AWS is not queried
#   DB_TAG_KEY, DB_TAG_VALUE (read): optional tag filter
# Outputs:  instance identifiers on stdout
# Returns:  exits via die() when the instance listing fails
#######################################
get_db_instances() {
  if [[ -n "$DB_IDENTIFIER" ]]; then
    printf '%s\n' "$DB_IDENTIFIER"
    return 0
  fi

  # Identifier and ARN are fetched together, so the tag filter needs no
  # additional describe call per instance.
  local listing
  listing=$(aws_cmd rds describe-db-instances \
    --query 'DBInstances[*].[DBInstanceIdentifier,DBInstanceArn]' \
    --output text 2>/dev/null) || die "Failed to list DB instances"

  local db_id arn
  while IFS=$'\t' read -r db_id arn; do
    # Text output renders an empty result as a blank line or "None".
    [[ -n "$db_id" && "$db_id" != "None" ]] || continue
    if [[ -z "$DB_TAG_KEY" ]] || _db_has_tag "$arn"; then
      printf '%s\n' "$db_id"
    fi
  done <<< "$listing"
}

# ── Get account ID ───────────────────────────────────────────────────
#######################################
# Print the AWS account ID of the active credentials.
# Outputs: the Account field of `aws sts get-caller-identity` on stdout
# Returns: exit status of the aws call
#######################################
get_account_id() {
  aws sts get-caller-identity --query 'Account' --output text
}

# ── Shared helpers ───────────────────────────────────────────────────

#######################################
# Run an AWS call (through aws_cmd) for its side effect only.
# stdout is discarded; on return AWS_ERROR holds the last line the call
# wrote to stderr so the caller can say why it failed.
# Globals:   AWS_ERROR (written)
# Arguments: same as aws_cmd
# Returns:   exit status of the aws call
#######################################
_aws_quiet() {
  local captured status=0
  captured=$(aws_cmd "$@" 2>&1 >/dev/null) || status=$?
  AWS_ERROR="${captured##*$'\n'}"
  return "$status"
}

#######################################
# Convert a snapshot timestamp to epoch seconds.
# Tries GNU `date -d` first, then BSD/macOS `date -j -f`.
# NOTE(review): the BSD branch reads the first 19 characters as UTC; this
# assumes the CLI prints timestamps in UTC — confirm if
# cli_timestamp_format is customised.
# Arguments: $1 - timestamp such as 2024-01-15T10:30:00.123000+00:00
# Outputs:   epoch seconds on stdout
# Returns:   non-zero when neither date flavour can parse the value
#######################################
_iso_to_epoch() {
  local ts="$1"
  date -d "$ts" +%s 2>/dev/null \
    || date -j -u -f '%Y-%m-%dT%H:%M:%S' "${ts:0:19}" +%s 2>/dev/null
}

#######################################
# Ensure RETENTION_DAYS is a non-negative integer and strip leading zeros
# (a value like "08" would otherwise be rejected as invalid octal in
# arithmetic comparisons).
# Globals: RETENTION_DAYS (read, rewritten in canonical decimal form)
#######################################
_normalize_retention_days() {
  [[ "$RETENTION_DAYS" =~ ^[0-9]+$ ]] \
    || die "RETENTION_DAYS must be a non-negative integer (got '${RETENTION_DAYS}')"
  RETENTION_DAYS=$((10#${RETENTION_DAYS}))
}

# ══════════════════════════════════════════════════════════════════════
# SNAPSHOT MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Create a tagged manual snapshot of every selected DB instance and, unless
# NO_WAIT is "true", wait for exactly the snapshots that were created.
# Globals: NO_WAIT (read), AWS_ERROR (read after failures)
# Outputs: one result line per instance and a summary on stdout
#######################################
do_snapshot() {
  log "Creating RDS snapshots..."

  local instances
  instances=$(get_db_instances)
  if [[ -z "$instances" ]]; then
    warn "No DB instances found matching criteria"
    return 0
  fi

  local -a db_ids
  mapfile -t db_ids <<< "$instances"
  log "Found ${#db_ids[@]} DB instance(s) to snapshot"

  local now
  now=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  local -a created_ids=()
  local failed=0
  local db_id snap_id
  for db_id in "${db_ids[@]}"; do
    [[ -n "$db_id" ]] || continue
    verbose "Snapshotting ${db_id}..."
    snap_id="rds-snap-${db_id}-$(date +%Y%m%d-%H%M%S)"

    if _aws_quiet rds create-db-snapshot \
      --db-instance-identifier "$db_id" \
      --db-snapshot-identifier "$snap_id" \
      --tags "Key=CreatedBy,Value=rds-snapshot-manager" \
      "Key=CreatedAt,Value=${now}" \
      "Key=DBIdentifier,Value=${db_id}" \
      --output text; then
      printf '  %b✓%b %s → %s\n' "$GREEN" "$RESET" "$db_id" "$snap_id"
      # Remember the exact identifier so the wait below targets this
      # snapshot rather than "whatever is newest for the instance".
      created_ids+=("$snap_id")
    else
      printf '  %b✗%b %s — snapshot creation failed%s\n' \
        "$RED" "$RESET" "$db_id" "${AWS_ERROR:+: ${AWS_ERROR}}"
      failed=$((failed + 1))
    fi
  done

  # Wait for completion
  if [[ "$NO_WAIT" != "true" && "${#created_ids[@]}" -gt 0 ]]; then
    log "Waiting for snapshot(s) to complete (this may take several minutes)..."
    for snap_id in "${created_ids[@]}"; do
      if aws_cmd rds wait db-snapshot-available \
        --db-snapshot-identifier "$snap_id" 2>/dev/null; then
        verbose "${snap_id} completed"
      else
        warn "${snap_id} did not complete within timeout"
      fi
    done
  fi

  echo ""
  log "Snapshots created: ${#created_ids[@]}, failed: ${failed}"
}

# ══════════════════════════════════════════════════════════════════════
# PRUNE MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Delete manual snapshots tagged CreatedBy=rds-snapshot-manager that are
# older than RETENTION_DAYS. With DRY_RUN="true" nothing is deleted.
# Globals: RETENTION_DAYS, DRY_RUN (read), AWS_ERROR (read after failures)
# Outputs: one line per expired snapshot and a summary on stdout
#######################################
do_prune() {
  _normalize_retention_days

  local now_epoch cutoff_epoch
  now_epoch=$(date +%s)
  # Plain arithmetic works the same with GNU and BSD date.
  cutoff_epoch=$((now_epoch - RETENTION_DAYS * 86400))

  local cutoff_date
  cutoff_date=$(date -u -d "@${cutoff_epoch}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || cutoff_date=$(date -u -r "${cutoff_epoch}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || cutoff_date="unknown"

  log "Pruning snapshots older than ${RETENTION_DAYS} days (before ${cutoff_date})"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "${YELLOW}DRY RUN${RESET} — no snapshots will be deleted. Use --force to delete."
  fi

  # One tab-separated row per manual snapshot. The ARN comes from the API
  # itself, so no account lookup or hard-coded "arn:aws:" partition is needed.
  local rows
  rows=$(aws_cmd rds describe-db-snapshots \
    --snapshot-type manual \
    --query 'DBSnapshots[*].[DBSnapshotIdentifier,DBSnapshotArn,DBInstanceIdentifier,AllocatedStorage,SnapshotCreateTime]' \
    --output text 2>/dev/null) || die "Failed to list manual snapshots"

  local deleted=0 skipped=0 total=0 expired=0
  local snap_id snap_arn db_id size snap_time is_managed snap_epoch age_days
  # The loop reads from fd 3 in the current shell (not a pipeline subshell),
  # so the counters survive it and commands in the body cannot eat the rows.
  while IFS=$'\t' read -r -u 3 snap_id snap_arn db_id size snap_time; do
    [[ -n "$snap_id" && "$snap_id" != "None" ]] || continue

    # Check if managed by us (check tags)
    # shellcheck disable=SC2016
    is_managed=$(aws_cmd rds list-tags-for-resource \
      --resource-name "$snap_arn" \
      --query 'TagList[?Key==`CreatedBy` && Value==`rds-snapshot-manager`].Key' \
      --output text 2>/dev/null || echo "")
    if [[ -z "$is_managed" || "$is_managed" == "None" ]]; then
      verbose "Skipping ${snap_id} — not managed by rds-snapshot-manager"
      continue
    fi
    total=$((total + 1))

    if ! snap_epoch=$(_iso_to_epoch "$snap_time"); then
      warn "Could not parse creation time of ${snap_id} ('${snap_time}') — skipping"
      continue
    fi

    if ((snap_epoch >= cutoff_epoch)); then
      skipped=$((skipped + 1))
      verbose "Keeping ${snap_id} — within retention"
      continue
    fi

    expired=$((expired + 1))
    age_days=$(((now_epoch - snap_epoch) / 86400))
    if [[ "$DRY_RUN" == "true" ]]; then
      printf '  %b⊘%b %s (%s, %sG, %sd) — would delete\n' \
        "$YELLOW" "$RESET" "$snap_id" "$db_id" "$size" "$age_days"
    elif _aws_quiet rds delete-db-snapshot --db-snapshot-identifier "$snap_id"; then
      printf '  %b✗%b %s (%s, %sG, %sd) — deleted\n' \
        "$RED" "$RESET" "$snap_id" "$db_id" "$size" "$age_days"
      deleted=$((deleted + 1))
    else
      warn "Failed to delete ${snap_id}${AWS_ERROR:+: ${AWS_ERROR}}"
    fi
  done 3<<< "$rows"

  echo ""
  log "Managed snapshots: ${total}, past retention: ${expired}"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "Dry run complete. Use --force to actually delete snapshots."
  else
    log "Deleted: ${deleted}, kept: ${skipped}"
  fi
}

# ══════════════════════════════════════════════════════════════════════
# COPY-REGION MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Copy the newest available manual snapshot of every selected DB instance
# to COPY_TO_REGION as "dr-<snapshot id>".
# NOTE(review): encrypted snapshots normally also need a KMS key of the
# target region (--kms-key-id); this script has no setting for one, so
# such copies fail and the AWS error is printed.
# Globals: COPY_TO_REGION, AWS_REGION (read)
# Outputs: one result line per instance and a summary on stdout
#######################################
do_copy_region() {
  if [[ -z "$COPY_TO_REGION" ]]; then
    die "No target region specified. Use --copy-region REGION"
  fi

  log "Copying latest snapshots to ${COPY_TO_REGION}..."

  local instances
  instances=$(get_db_instances)
  if [[ -z "$instances" ]]; then
    warn "No DB instances found"
    return 0
  fi

  local copied=0 failed=0
  local db_id latest latest_snap source_arn target_snap copy_err
  local -a copy_args
  while IFS= read -r -u 3 db_id; do
    [[ -n "$db_id" ]] || continue

    # JSON output on purpose: with --output text the CLI evaluates --query
    # once per result page, so "[-1]" would be per page, not overall.
    # Snapshots still being created are filtered out; they cannot be copied.
    # shellcheck disable=SC2016
    latest=$(aws_cmd rds describe-db-snapshots \
      --db-instance-identifier "$db_id" \
      --snapshot-type manual \
      --query 'sort_by(DBSnapshots[?Status==`available`], &SnapshotCreateTime) | [-1].[DBSnapshotIdentifier,DBSnapshotArn]' \
      --output json 2>/dev/null \
      | jq -r 'if . == null then empty else @tsv end' 2>/dev/null) || latest=""

    latest_snap="" source_arn=""
    IFS=$'\t' read -r latest_snap source_arn <<< "$latest" || true
    if [[ -z "$latest_snap" || -z "$source_arn" ]]; then
      warn "No available snapshots found for ${db_id}"
      continue
    fi

    target_snap="dr-${latest_snap}"
    verbose "Copying ${latest_snap} → ${COPY_TO_REGION}"

    # Plain `aws`, not aws_cmd: the request must go to the TARGET region,
    # and aws_cmd would append the source region.
    copy_args=(
      rds copy-db-snapshot
      --source-db-snapshot-identifier "$source_arn"
      --target-db-snapshot-identifier "$target_snap"
      --region "$COPY_TO_REGION"
      --copy-tags
      --output text
    )
    if [[ -n "$AWS_REGION" && "$AWS_REGION" != "$COPY_TO_REGION" ]]; then
      copy_args+=(--source-region "$AWS_REGION")
    fi

    if copy_err=$(aws "${copy_args[@]}" 2>&1 >/dev/null); then
      printf '  %b✓%b %s → %s (%s)\n' \
        "$GREEN" "$RESET" "$latest_snap" "$COPY_TO_REGION" "$target_snap"
      copied=$((copied + 1))
    else
      copy_err="${copy_err##*$'\n'}"
      printf '  %b✗%b %s — copy failed%s\n' \
        "$RED" "$RESET" "$latest_snap" "${copy_err:+: ${copy_err}}"
      failed=$((failed + 1))
    fi
  done 3<<< "$instances"

  echo ""
  log "Copied: ${copied}, failed: ${failed}"
}

# ══════════════════════════════════════════════════════════════════════
# AUDIT MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Report all manual snapshots (flagging those whose DB instance no longer
# exists as "orphan"), estimate storage cost, and list DB instances whose
# newest manual snapshot is missing or older than RETENTION_DAYS.
# Globals:
#   AWS_REGION, RETENTION_DAYS (read)
#   SNAPSHOT_COST_PER_GB (read, optional): price per GiB-month used for the
#     estimate; defaults to the 0.05 this script has always used
# Outputs: the report on stdout
#######################################
do_audit() {
  _normalize_retention_days
  log "Auditing RDS snapshots in ${AWS_REGION}..."

  # Get all manual snapshots
  local snap_rows
  snap_rows=$(aws_cmd rds describe-db-snapshots \
    --snapshot-type manual \
    --query 'DBSnapshots[*].[DBSnapshotIdentifier,DBInstanceIdentifier,AllocatedStorage,SnapshotCreateTime]' \
    --output text 2>/dev/null) || die "Failed to list manual snapshots"

  # Get list of current DB instances
  local instance_rows
  instance_rows=$(aws_cmd rds describe-db-instances \
    --query 'DBInstances[*].[DBInstanceIdentifier,AllocatedStorage]' \
    --output text 2>/dev/null) || die "Failed to list DB instances"

  local -A live=() has_snapshot=() latest_epoch=()
  local -a instance_ids=()
  local db_id db_size
  while IFS=$'\t' read -r db_id db_size; do
    [[ -n "$db_id" && "$db_id" != "None" ]] || continue
    live["$db_id"]="$db_size"
    instance_ids+=("$db_id")
  done <<< "$instance_rows"

  # Single pass over the snapshots: builds the inventory lines, the storage
  # total, and the newest snapshot time per instance, so the "unprotected"
  # section below needs no further API calls.
  local now_epoch
  now_epoch=$(date +%s)
  local total=0 total_gb=0
  local -a inventory=()
  local snap_id size snap_time snap_epoch age label
  while IFS=$'\t' read -r snap_id db_id size snap_time; do
    [[ -n "$snap_id" && "$snap_id" != "None" ]] || continue
    db_id="${db_id:-None}"
    total=$((total + 1))
    has_snapshot["$db_id"]=1
    if [[ "$size" =~ ^[0-9]+$ ]]; then
      total_gb=$((total_gb + size))
    fi

    if snap_epoch=$(_iso_to_epoch "$snap_time"); then
      age=$(((now_epoch - snap_epoch) / 86400))
      if ((snap_epoch > ${latest_epoch["$db_id"]:-0})); then
        latest_epoch["$db_id"]="$snap_epoch"
      fi
    else
      age="?"
    fi

    label="manual"
    if [[ -z "${live["$db_id"]+x}" ]]; then
      label="orphan"
    fi

    inventory+=("$(printf '  %-30s %-20s %6s G %5sd %s' \
      "${snap_id:0:30}" "${db_id:0:20}" "$size" "$age" "$label")")
  done <<< "$snap_rows"

  if ((total == 0)); then
    log "No manual snapshots found"
  else
    # Display snapshot inventory
    echo ""
    printf '  %b%-30s %-20s %8s %6s %s%b\n' \
      "$BOLD" "SNAPSHOT" "DB INSTANCE" "SIZE" "AGE" "STATUS" "$RESET"
    printf '  %s\n' "$(printf '%.0s─' {1..80})"
    printf '%s\n' "${inventory[@]}"

    # Cost estimate
    local est_cost
    est_cost=$(awk -v gb="$total_gb" -v rate="${SNAPSHOT_COST_PER_GB:-0.05}" \
      'BEGIN { printf "%.2f", gb * rate }')

    echo ""
    printf '  %bSummary%b\n' "$BOLD" "$RESET"
    echo "  Total snapshots:   ${total}"
    echo "  Total storage:     ${total_gb} GiB"
    echo "  Est. monthly cost: \$${est_cost}"
  fi

  # Unprotected instances — reported even when there are no snapshots at
  # all, because then every instance is unprotected.
  echo ""
  printf '  %bDB Instances Without Recent Snapshots (>%sd)%b\n' \
    "$BOLD" "$RETENTION_DAYS" "$RESET"

  local flagged=0 age_days
  if ((${#instance_ids[@]} > 0)); then
    for db_id in "${instance_ids[@]}"; do
      db_size="${live["$db_id"]}"
      if [[ -z "${has_snapshot["$db_id"]+x}" ]]; then
        printf '  %b✗%b %s (%s GiB) — no snapshots\n' \
          "$RED" "$RESET" "$db_id" "$db_size"
        flagged=$((flagged + 1))
      elif [[ -z "${latest_epoch["$db_id"]+x}" ]]; then
        printf '  %b!%b %s (%s GiB) — snapshot age unknown\n' \
          "$YELLOW" "$RESET" "$db_id" "$db_size"
        flagged=$((flagged + 1))
      else
        age_days=$(((now_epoch - ${latest_epoch["$db_id"]}) / 86400))
        if ((age_days > RETENTION_DAYS)); then
          printf '  %b!%b %s (%s GiB) — last snapshot %sd ago\n' \
            "$YELLOW" "$RESET" "$db_id" "$db_size" "$age_days"
          flagged=$((flagged + 1))
        fi
      fi
    done
  fi
  if ((flagged == 0)); then
    printf '  %b✓%b none\n' "$GREEN" "$RESET"
  fi

  echo ""
  log "Audit complete"
}

# ══════════════════════════════════════════════════════════════════════
# RESTORE MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# Restore TARGET_SNAPSHOT into a new DB instance named
# "restored-<source db>-<timestamp>" and, unless NO_WAIT is "true", wait
# for the instance to become available.
# Globals: TARGET_SNAPSHOT, RESTORE_INSTANCE_CLASS, NO_WAIT (read)
# Outputs: progress on stdout; exits via die() on failure
#######################################
do_restore() {
  if [[ -z "$TARGET_SNAPSHOT" ]]; then
    die "No snapshot specified for restore"
  fi

  log "Restoring from snapshot ${TARGET_SNAPSHOT}..."

  # Get snapshot info
  local snap_info
  snap_info=$(aws_cmd rds describe-db-snapshots \
    --db-snapshot-identifier "$TARGET_SNAPSHOT" \
    --query 'DBSnapshots[0].{DB:DBInstanceIdentifier,Size:AllocatedStorage,Engine:Engine,EngineVer:EngineVersion}' \
    --output json 2>/dev/null) || die "Snapshot not found: ${TARGET_SNAPSHOT}"
  if [[ -z "$snap_info" || "$snap_info" == "null" ]]; then
    die "Snapshot not found: ${TARGET_SNAPSHOT}"
  fi

  # One jq call; null/empty fields become "unknown" so that tab-splitting
  # cannot shift columns.
  local source_db engine engine_ver snap_size
  IFS=$'\t' read -r source_db engine engine_ver snap_size < <(
    jq -r '[.DB, .Engine, .EngineVer, .Size]
           | map(if . == null or . == "" then "unknown" else tostring end)
           | @tsv' <<< "$snap_info"
  ) || die "Could not read details of snapshot ${TARGET_SNAPSHOT}"

  # RDS limits instance identifiers to 63 characters and forbids a trailing
  # or doubled hyphen. The timestamp plus its separator takes 16 characters,
  # so the prefix is cut to 47 and any hyphen left at the cut is dropped.
  local stamp prefix new_db_id
  stamp=$(date +%Y%m%d-%H%M%S)
  prefix="restored-${source_db}"
  prefix="${prefix:0:47}"
  while [[ "$prefix" == *- ]]; do
    prefix="${prefix%-}"
  done
  new_db_id="${prefix}-${stamp}"

  local restore_args=(
    rds restore-db-instance-from-db-snapshot
    --db-instance-identifier "$new_db_id"
    --db-snapshot-identifier "$TARGET_SNAPSHOT"
    --tags "Key=CreatedBy,Value=rds-snapshot-manager"
    "Key=RestoredFrom,Value=${TARGET_SNAPSHOT}"
    "Key=SourceDB,Value=${source_db}"
  )
  if [[ -n "$RESTORE_INSTANCE_CLASS" ]]; then
    restore_args+=(--db-instance-class "$RESTORE_INSTANCE_CLASS")
  fi

  if ! _aws_quiet "${restore_args[@]}" --output text; then
    die "Failed to restore from snapshot${AWS_ERROR:+: ${AWS_ERROR}}"
  fi

  printf '  %b✓%b Restoring to %s\n' "$GREEN" "$RESET" "$new_db_id"
  echo "    Source snapshot: ${TARGET_SNAPSHOT}"
  echo "    Engine: ${engine} ${engine_ver}"
  echo "    Size: ${snap_size} GiB"

  if [[ "$NO_WAIT" != "true" ]]; then
    log "Waiting for instance to become available (this may take several minutes)..."
    if aws_cmd rds wait db-instance-available \
      --db-instance-identifier "$new_db_id" 2>/dev/null; then
      printf '  %b✓%b Instance %s is available\n' "$GREEN" "$RESET" "$new_db_id"
    else
      warn "Instance did not become available within timeout"
    fi
  fi
}

# ══════════════════════════════════════════════════════════════════════
# LIST MODE
# ══════════════════════════════════════════════════════════════════════

#######################################
# List manual snapshots, newest first, optionally limited to DB_IDENTIFIER.
# Globals: DB_IDENTIFIER, OUTPUT_FORMAT (read)
# Outputs: a table on stdout, or the raw JSON array when OUTPUT_FORMAT is
#          "json" (an empty result is then printed as [] rather than a log
#          line, so the output stays parseable)
#######################################
do_list() {
  local query_args=(rds describe-db-snapshots --snapshot-type manual)
  if [[ -n "$DB_IDENTIFIER" ]]; then
    query_args+=(--db-instance-identifier "$DB_IDENTIFIER")
  fi

  local snapshots_json
  snapshots_json=$(aws_cmd "${query_args[@]}" \
    --query 'sort_by(DBSnapshots, &SnapshotCreateTime) | reverse(@) | [*].{Id:DBSnapshotIdentifier,DB:DBInstanceIdentifier,Size:AllocatedStorage,Status:Status,Time:SnapshotCreateTime,Engine:Engine}' \
    --output json 2>/dev/null) || die "Failed to list snapshots"

  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    jq '.' <<< "$snapshots_json"
    return 0
  fi

  local total
  total=$(jq 'length' <<< "$snapshots_json")
  if [[ "$total" -eq 0 ]]; then
    log "No snapshots found"
    return 0
  fi

  echo ""
  printf '  %b%-30s %-18s %6s %-10s %-22s %s%b\n' \
    "$BOLD" "SNAPSHOT" "DB INSTANCE" "SIZE" "STATUS" "CREATED" "ENGINE" "$RESET"
  printf '  %s\n' "$(printf '%.0s─' {1..100})"

  # A single jq pass emits one tab-separated row per snapshot; null/empty
  # fields become "-" so that tab-splitting cannot shift columns.
  local snap_id db_id size status snap_time engine
  while IFS=$'\t' read -r snap_id db_id size status snap_time engine; do
    printf '  %-30s %-18s %4s G %-10s %-22s %s\n' \
      "${snap_id:0:30}" "${db_id:0:18}" "$size" "$status" \
      "${snap_time:0:19}" "$engine"
  done < <(
    jq -r '.[]
           | [.Id, .DB, .Size, .Status, .Time, .Engine]
           | map(if . == null or . == "" then "-" else tostring end)
           | @tsv' <<< "$snapshots_json"
  )

  echo ""
  log "Total: ${total} snapshot(s)"
}

# ══════════════════════════════════════════════════════════════════════
# HELP
# ══════════════════════════════════════════════════════════════════════

# NOTE(review): the original help text and the first part of main() (argument
# parsing and the start of the banner) were lost when this file was extracted.
# What follows is a minimal reconstruction. --snapshot, --force, --copy-region
# REGION and --help are named elsewhere in the script; --prune, --audit,
# --restore and --list are inferred from the mode names main() dispatches on.
# Any further flags of the original CLI are NOT restored — compare with the
# upstream script before relying on this section.

#######################################
# Print usage information on stdout.
#######################################
show_help() {
  cat <<EOF
${SCRIPT_NAME} — Create, manage, audit, and prune AWS RDS snapshots

Usage:
  export AWS_PROFILE="production"
  ${SCRIPT_NAME} --snapshot

Modes:
  --snapshot              Create a snapshot of each selected DB instance
  --prune                 Delete managed snapshots older than RETENTION_DAYS
                          (dry run unless --force is given)
  --copy-region REGION    Copy the latest snapshot of each instance to REGION
  --audit                 Report snapshots, orphans and unprotected instances
  --restore SNAPSHOT      Restore SNAPSHOT into a new DB instance
  --list                  List manual snapshots

Options:
  --force                 Actually delete snapshots when pruning
  --help                  Show this help

Settings read from the environment:
  AWS_REGION, DB_IDENTIFIER, DB_TAG_KEY, DB_TAG_VALUE,
  RETENTION_DAYS (default 30), COPY_TO_REGION, DRY_RUN (default true),
  NO_WAIT (default false), RESTORE_INSTANCE_CLASS,
  OUTPUT_FORMAT (text|json), VERBOSE (default false),
  COLOR (auto|always|never)
EOF
}

# ══════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════

#######################################
# Parse the command line, print a banner, check dependencies and run the
# selected mode.
# Globals:   RUN_MODE, TARGET_SNAPSHOT, COPY_TO_REGION, DRY_RUN, START_TIME
#            (written); OUTPUT_FORMAT, AWS_REGION (read)
# Arguments: the script's command-line arguments
#######################################
main() {
  # Colors first, so die() is usable while arguments are still being parsed.
  setup_colors

  # NOTE(review): reconstructed argument parsing — see the note above show_help.
  while (($# > 0)); do
    case "$1" in
      --snapshot) RUN_MODE="snapshot" ;;
      --prune) RUN_MODE="prune" ;;
      --audit) RUN_MODE="audit" ;;
      --list) RUN_MODE="list" ;;
      --copy-region)
        (($# >= 2)) || die "--copy-region requires a REGION argument"
        COPY_TO_REGION="$2"
        RUN_MODE="copy-region"
        shift
        ;;
      --restore)
        (($# >= 2)) || die "--restore requires a snapshot identifier"
        TARGET_SNAPSHOT="$2"
        RUN_MODE="restore"
        shift
        ;;
      --force) DRY_RUN="false" ;;
      -h | --help)
        show_help
        exit 0
        ;;
      *) die "Unknown option: $1 (see --help)" ;;
    esac
    shift
  done

  if [[ -z "$RUN_MODE" ]]; then
    show_help
    exit 1
  fi

  START_TIME=$(date +%s)

  # With JSON output the banner and the closing line go to stderr so that
  # stdout carries nothing but the JSON document.
  local info_fd=1
  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    info_fd=2
  fi

  {
    # NOTE(review): only the tail of this first banner line survived
    # extraction; that it showed the region is an assumption.
    echo "Region: ${AWS_REGION:-$(aws configure get region 2>/dev/null || echo 'default')}"
    echo "Mode: ${RUN_MODE}"
    echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
  } >&"$info_fd"

  check_deps

  case "$RUN_MODE" in
    snapshot) do_snapshot ;;
    prune) do_prune ;;
    copy-region) do_copy_region ;;
    audit) do_audit ;;
    restore) do_restore ;;
    list) do_list ;;
    *) die "Unknown mode: ${RUN_MODE}" ;;
  esac

  local end_time
  end_time=$(date +%s)
  local duration=$((end_time - START_TIME))
  log "Completed in ${duration}s" >&"$info_fd"
}

main "$@"