diff --git a/PdnsInstall.sh b/PdnsInstall.sh index 1a239f7..bece42c 100644 --- a/PdnsInstall.sh +++ b/PdnsInstall.sh @@ -2,7 +2,7 @@ set -euo pipefail ###################################################################################### -#### Version 3.0 #### +#### Version 3.1 #### #### For questions or comments contact@mylinux.work #### #### Author : Phil Connor #### #### #### @@ -36,8 +36,8 @@ HTTP=nginx # <-- Choose apache or nginx --> The apa ########################## ip4=$(ip -o -4 route get 8.8.8.8 | awk '{print $7; exit}') host=$(hostname -f) -OS=$(grep PRETTY_NAME /etc/os-release | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]') -OSVER=$(grep VERSION_ID /etc/os-release | sed 's/VERSION_ID=//g' | tr -d '="' | awk -F. '{print $1}') +OS=$({ grep PRETTY_NAME /etc/os-release || true; } | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]') +OSVER=$({ grep VERSION_ID /etc/os-release || true; } | sed 's/VERSION_ID=//g' | tr -d '="' | awk -F. '{print $1}') # OS family: debian (ubuntu, debian), rhel (centos, red, oracle, rocky, alma), fedora, suse (opensuse) OS_FAMILY="" diff --git a/ad-health-exporter.ps1 b/ad-health-exporter.ps1 new file mode 100644 index 0000000..b024710 --- /dev/null +++ b/ad-health-exporter.ps1 @@ -0,0 +1,602 @@ +<# +.SYNOPSIS + Active Directory Health Prometheus Metrics Exporter +.DESCRIPTION + Prometheus exporter for Active Directory health. Monitors replication, + FSMO roles, account hygiene, dcdiag tests, DNS SRV records, SYSVOL/NETLOGON + accessibility, and domain controller metadata. Exports metrics as + Prometheus-compatible text format for windows_exporter textfile collector. 
+.PARAMETER Mode + Output mode: 'stdout' (default), 'textfile', or 'http' +.PARAMETER Port + HTTP port for http mode (default: 9198) +.PARAMETER TextfileDir + Directory for textfile collector output (default: C:\ProgramData\node_exporter) +.PARAMETER OutputFile + Custom output file path +.PARAMETER InstallScheduledTask + Switch to create a scheduled task for auto-start on system boot +.PARAMETER TaskIntervalMinutes + Interval in minutes for the scheduled task (default: 5) +.NOTES + Author: Phil Connor + Contact: contact@mylinux.work + Website: https://mylinux.work + License: MIT + Version: 1.0 + + Metrics Exported: + Core Status: + - windows_ad_up + - windows_ad_exporter_info{version} + + Replication: + - windows_ad_replication_failure_total{partner} + - windows_ad_replication_last_success_timestamp{partner} + - windows_ad_replication_pending_objects{partner} + + FSMO Roles: + - windows_ad_fsmo_role_holder{role} + + Account Health: + - windows_ad_account_lockout_total + - windows_ad_account_disabled_total + - windows_ad_account_expired_total + - windows_ad_account_password_expired_total + - windows_ad_account_inactive_total + + Computer Health: + - windows_ad_computer_stale_total + + Group Health: + - windows_ad_group_empty_total + + DCDiag: + - windows_ad_dcdiag_test_result{test} + + DNS and Shares: + - windows_ad_dns_srv_record_status + - windows_ad_sysvol_accessible + - windows_ad_netlogon_accessible + + Domain Controller Info: + - windows_ad_domain_controller_info{domain,site,gc} + + Exporter: + - windows_ad_exporter_duration_seconds + - windows_ad_exporter_last_run_timestamp +#> + +param( + [ValidateSet('stdout', 'textfile', 'http')] + [string]$Mode = 'stdout', + + [int]$Port = 9198, + + [string]$TextfileDir = 'C:\ProgramData\node_exporter', + + [string]$OutputFile, + + [switch]$InstallScheduledTask, + + [int]$TaskIntervalMinutes = 5 +) + +# Create a scheduled task to run this script every $TaskIntervalMinutes minutes +# The task will run as SYSTEM and will be 
# started one minute after registration and re-run at that interval for 365 days.
if ($InstallScheduledTask) {
    $taskName = "ADHealthExporter"
    $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue

    if (-not $existingTask) {
        # Fail fast on a bad interval before any task objects are built.
        if ($TaskIntervalMinutes -le 0) {
            throw "TaskIntervalMinutes must be a positive integer"
        }

        # NOTE(review): only "-Mode textfile" is passed to the task; a custom
        # -TextfileDir / -OutputFile given at install time is not forwarded -- confirm intent.
        $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`" -Mode textfile"

        $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365)
        $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest

        try {
            Write-Host "Creating scheduled task: $taskName"
            # -ErrorAction Stop routes registration failures into the catch below.
            # Out-Null keeps the returned task object off the output stream, which
            # carries the metrics text when the script continues in stdout mode.
            Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports Active Directory health metrics for Prometheus every $TaskIntervalMinutes minutes" -ErrorAction Stop | Out-Null

            $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue
            if (-not $createdTask) {
                throw "Failed to verify scheduled task creation"
            }
            Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green
        } catch {
            Write-Error "Failed to create auto-start task: $($_.Exception.Message)"
            throw
        }
    } else {
        Write-Host "Scheduled task '$taskName' already exists, skipping creation"
    }
}

# From here on, non-terminating errors are suppressed; collectors opt back in
# per call with -ErrorAction Stop and report through Write-Warning.
$ErrorActionPreference = 'SilentlyContinue'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Converts a DateTime to whole seconds since 1970-01-01T00:00:00Z.
# Returns a 64-bit integer so values past January 2038 do not overflow.
function ConvertTo-UnixSeconds {
    param([datetime]$Date)
    $epoch = [datetime]::new(1970, 1, 1, 0, 0, 0, [System.DateTimeKind]::Utc)
    [long][math]::Floor(($Date.ToUniversalTime() - $epoch).TotalSeconds)
}

# Returns the current time as Unix seconds (UTC).
# Computed from the UTC clock directly instead of parsing "Get-Date -UFormat %s",
# whose text output depends on the PowerShell version and current culture.
function Get-UnixTimestamp {
    ConvertTo-UnixSeconds -Date ([datetime]::UtcNow)
}

# Rounds a numeric metric value to $Decimals decimal places (default 2).
function Format-MetricValue {
    param([double]$Value, [int]$Decimals = 2)
    [math]::Round($Value, $Decimals)
}

# Escapes a string for use inside a Prometheus label value:
#   backslash -> \\    double quote -> \"    line feed -> \n
# Literal String.Replace is used on purpose: with -replace the replacement text
# is not regex-escaped, so doubled backslashes there come out doubled again.
# Backslashes are handled first so later escapes are not escaped twice.
function Sanitize-LabelValue {
    param([string]$Value)
    $Value.Replace('\', '\\').Replace('"', '\"').Replace("`n", '\n')
}

# ============================================================================
# REPLICATION METRICS
# ============================================================================

# Emits per-partner replication gauges (failures, last success, pending objects)
# as Prometheus text. On failure a warning is written and only the HELP/TYPE
# header of the failure metric is returned.
function Get-ReplicationMetrics {
    $sb = [System.Text.StringBuilder]::new()

    try {
        $partners = Get-ADReplicationPartnerMetadata -Target * -ErrorAction Stop

        # Derive every per-partner value once, before any output is appended, so
        # an error here cannot leave a half-written metric family in the buffer.
        # NOTE(review): -Target * may yield the same partner name more than once
        # (several servers/partitions), which would emit duplicate series -- confirm.
        $rows = @(foreach ($p in $partners) {
            $failures = if ($p.ConsecutiveReplicationFailures) { $p.ConsecutiveReplicationFailures } else { 0 }

            $lastSuccess = 0
            if ($p.LastReplicationSuccess) {
                $lastSuccess = ConvertTo-UnixSeconds -Date $p.LastReplicationSuccess
            }

            # NOTE(review): presumably InboundNeighbors is not populated on these
            # objects, in which case pending is always 0 -- verify against the cmdlet.
            $pending = 0
            if ($p.InboundNeighbors) {
                $pending = ($p.InboundNeighbors | Measure-Object -Property EstimatedChanges -Sum).Sum
            }
            if (-not $pending) { $pending = 0 }

            [pscustomobject]@{
                # Reduce "CN=NTDS Settings,CN=<server>,..." to "<server>".
                Partner     = Sanitize-LabelValue ($p.Partner -replace '^CN=NTDS Settings,CN=', '' -replace ',.*$', '')
                Failures    = $failures
                LastSuccess = $lastSuccess
                Pending     = $pending
            }
        })

        [void]$sb.AppendLine('# HELP windows_ad_replication_failure_total Replication failures per partner')
        [void]$sb.AppendLine('# TYPE windows_ad_replication_failure_total gauge')
        foreach ($r in $rows) {
            [void]$sb.AppendLine("windows_ad_replication_failure_total{partner=`"$($r.Partner)`"} $($r.Failures)")
        }
        [void]$sb.AppendLine('')

        [void]$sb.AppendLine('# HELP windows_ad_replication_last_success_timestamp Last successful replication per partner (unix timestamp)')
        [void]$sb.AppendLine('# TYPE windows_ad_replication_last_success_timestamp gauge')
        foreach ($r in $rows) {
            [void]$sb.AppendLine("windows_ad_replication_last_success_timestamp{partner=`"$($r.Partner)`"} $($r.LastSuccess)")
        }
        [void]$sb.AppendLine('')

        [void]$sb.AppendLine('# HELP windows_ad_replication_pending_objects Pending replication objects per partner')
        [void]$sb.AppendLine('# TYPE windows_ad_replication_pending_objects gauge')
        foreach ($r in $rows) {
            [void]$sb.AppendLine("windows_ad_replication_pending_objects{partner=`"$($r.Partner)`"} $($r.Pending)")
        }
        [void]$sb.AppendLine('')
    }
    catch {
        Write-Warning "Failed to collect replication metrics: $_"
        [void]$sb.AppendLine('# HELP windows_ad_replication_failure_total Replication failures per partner')
        [void]$sb.AppendLine('# TYPE windows_ad_replication_failure_total gauge')
        [void]$sb.AppendLine('')
    }

    $sb.ToString()
}

# ============================================================================
# FSMO ROLE METRICS
# ============================================================================

# Emits windows_ad_fsmo_role_holder{role} = 1 when this machine holds the role,
# otherwise 0. On failure a warning is written and only HELP/TYPE is returned.
function Get-FsmoRoleMetrics {
    $sb = [System.Text.StringBuilder]::new()

    try {
        $domain = Get-ADDomain -ErrorAction Stop
        $forest = Get-ADForest -ErrorAction Stop

        # Match the holder's name against this computer name, either bare or
        # followed by a dot. The name is regex-escaped so it is matched literally.
        $localDcPattern = '^' + [regex]::Escape($env:COMPUTERNAME) + '(\.|$)'

        # Ordered so the series are emitted in the same order on every run.
        $fsmoRoles = [ordered]@{
            'PDCEmulator'          = $domain.PDCEmulator
            'RIDMaster'            = $domain.RIDMaster
            'InfrastructureMaster' = $domain.InfrastructureMaster
            'SchemaMaster'         = $forest.SchemaMaster
            'DomainNamingMaster'   = $forest.DomainNamingMaster
        }

        [void]$sb.AppendLine('# HELP windows_ad_fsmo_role_holder FSMO role holder (1 if this DC holds the role)')
        [void]$sb.AppendLine('# TYPE windows_ad_fsmo_role_holder gauge')
        foreach ($role in $fsmoRoles.GetEnumerator()) {
            $holdsRole = if ($role.Value -match $localDcPattern) { 1 } else { 0 }
            [void]$sb.AppendLine("windows_ad_fsmo_role_holder{role=`"$($role.Key)`"} $holdsRole")
        }
        [void]$sb.AppendLine('')
    }
    catch {
        Write-Warning "Failed to collect FSMO role metrics: $_"
        [void]$sb.AppendLine('# HELP windows_ad_fsmo_role_holder FSMO role holder (1 if this DC holds the role)')
        [void]$sb.AppendLine('# TYPE windows_ad_fsmo_role_holder gauge')
        [void]$sb.AppendLine('')
    }

    $sb.ToString()
}

#
# ============================================================================
# ACCOUNT HEALTH METRICS
# ============================================================================

# Emits one gauge per account-hygiene query (locked out, disabled, expired,
# password expired, inactive for 90 days). Queries run in order; if one throws,
# a warning is written and the gauges gathered so far are still returned.
function Get-AccountHealthMetrics {
    $buffer = [System.Text.StringBuilder]::new()

    # One entry per gauge, in emission order. Query is invoked lazily in the loop.
    $gauges = @(
        @{
            Name  = 'windows_ad_account_lockout_total'
            Help  = 'Number of locked out accounts'
            Query = { Search-ADAccount -LockedOut -ErrorAction Stop }
        }
        @{
            Name  = 'windows_ad_account_disabled_total'
            Help  = 'Number of disabled accounts'
            Query = { Search-ADAccount -AccountDisabled -ErrorAction Stop }
        }
        @{
            Name  = 'windows_ad_account_expired_total'
            Help  = 'Number of expired accounts'
            Query = { Search-ADAccount -AccountExpired -ErrorAction Stop }
        }
        @{
            Name  = 'windows_ad_account_password_expired_total'
            Help  = 'Accounts with expired passwords'
            Query = { Search-ADAccount -PasswordExpired -ErrorAction Stop }
        }
        @{
            # No logon in 90 days, user accounts only.
            Name  = 'windows_ad_account_inactive_total'
            Help  = 'Accounts inactive for more than 90 days'
            Query = { Search-ADAccount -AccountInactive -TimeSpan (New-TimeSpan -Days 90) -UsersOnly -ErrorAction Stop }
        }
    )

    try {
        foreach ($gauge in $gauges) {
            # @() forces an array so .Count is right for zero or one result.
            $total = @(& $gauge.Query).Count
            $null = $buffer.AppendLine("# HELP $($gauge.Name) $($gauge.Help)")
            $null = $buffer.AppendLine("# TYPE $($gauge.Name) gauge")
            $null = $buffer.AppendLine("$($gauge.Name) $total")
            $null = $buffer.AppendLine('')
        }
    }
    catch {
        Write-Warning "Failed to collect account health metrics: $_"
    }

    $buffer.ToString()
}

# ============================================================================
# COMPUTER HEALTH METRICS
# ============================================================================

# Emits windows_ad_computer_stale_total: computer accounts that
# Search-ADAccount reports as inactive over a 90-day window.
# Returns an empty string (after a warning) when the query fails.
function Get-ComputerHealthMetrics {
    $buffer = [System.Text.StringBuilder]::new()

    try {
        $staleWindow = New-TimeSpan -Days 90
        $staleHits = @(Search-ADAccount -AccountInactive -TimeSpan $staleWindow -ComputersOnly -ErrorAction Stop)
        $staleTotal = $staleHits.Count

        $null = $buffer.AppendLine('# HELP windows_ad_computer_stale_total Computers not logged in for more than 90 days')
        $null = $buffer.AppendLine('# TYPE windows_ad_computer_stale_total gauge')
        $null = $buffer.AppendLine("windows_ad_computer_stale_total $staleTotal")
        $null = $buffer.AppendLine('')
    }
    catch {
        Write-Warning "Failed to collect computer health metrics: $_"
    }

    $buffer.ToString()
}

# ============================================================================
# GROUP HEALTH METRICS
# ============================================================================

# Emits windows_ad_group_empty_total: security groups whose Members
# collection is empty. Returns an empty string (after a warning) on failure.
function Get-GroupHealthMetrics {
    $buffer = [System.Text.StringBuilder]::new()

    try {
        $securityGroups = Get-ADGroup -Filter { GroupCategory -eq 'Security' } -Properties Members -ErrorAction Stop
        $memberless = @($securityGroups | Where-Object { $_.Members.Count -eq 0 })
        $emptyTotal = $memberless.Count

        $null = $buffer.AppendLine('# HELP windows_ad_group_empty_total Empty security groups')
        $null = $buffer.AppendLine('# TYPE windows_ad_group_empty_total gauge')
        $null = $buffer.AppendLine("windows_ad_group_empty_total $emptyTotal")
        $null = $buffer.AppendLine('')
    }
    catch {
        Write-Warning "Failed to collect group health metrics: $_"
    }

    $buffer.ToString()
}

# ============================================================================
#
DCDIAG METRICS +# ============================================================================ + +function Get-DcdiagMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $tests = @('Connectivity', 'Replications', 'DNS', 'Services', 'Advertising', 'FrsEvent', 'KccEvent') + + [void]$sb.AppendLine('# HELP windows_ad_dcdiag_test_result DCDiag test result (1=pass, 0=fail)') + [void]$sb.AppendLine('# TYPE windows_ad_dcdiag_test_result gauge') + + foreach ($test in $tests) { + try { + $output = dcdiag /test:$test 2>&1 | Out-String + $passed = if ($output -match "passed test $test") { 1 } else { 0 } + [void]$sb.AppendLine("windows_ad_dcdiag_test_result{test=`"$test`"} $passed") + } + catch { + [void]$sb.AppendLine("windows_ad_dcdiag_test_result{test=`"$test`"} 0") + } + } + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect dcdiag metrics: $_" + [void]$sb.AppendLine('# HELP windows_ad_dcdiag_test_result DCDiag test result (1=pass, 0=fail)') + [void]$sb.AppendLine('# TYPE windows_ad_dcdiag_test_result gauge') + [void]$sb.AppendLine('') + } + + $sb.ToString() +} + +# ============================================================================ +# DNS AND SHARE ACCESSIBILITY METRICS +# ============================================================================ + +function Get-DnsAndShareMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + # DNS SRV record check + $domain = (Get-ADDomain -ErrorAction Stop).DNSRoot + $srvOk = 0 + try { + $srvResult = Resolve-DnsName -Name "_ldap._tcp.dc._msdcs.$domain" -Type SRV -ErrorAction Stop + if ($srvResult) { $srvOk = 1 } + } catch {} + + [void]$sb.AppendLine('# HELP windows_ad_dns_srv_record_status DNS SRV record health (1=OK, 0=missing)') + [void]$sb.AppendLine('# TYPE windows_ad_dns_srv_record_status gauge') + [void]$sb.AppendLine("windows_ad_dns_srv_record_status $srvOk") + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to check DNS SRV records: $_" + 
[void]$sb.AppendLine('# HELP windows_ad_dns_srv_record_status DNS SRV record health (1=OK, 0=missing)') + [void]$sb.AppendLine('# TYPE windows_ad_dns_srv_record_status gauge') + [void]$sb.AppendLine("windows_ad_dns_srv_record_status 0") + [void]$sb.AppendLine('') + } + + # SYSVOL accessibility + try { + $dcName = $env:COMPUTERNAME + $sysvolOk = if (Test-Path "\\$dcName\SYSVOL") { 1 } else { 0 } + } + catch { + $sysvolOk = 0 + } + [void]$sb.AppendLine('# HELP windows_ad_sysvol_accessible SYSVOL share accessibility (1=OK, 0=fail)') + [void]$sb.AppendLine('# TYPE windows_ad_sysvol_accessible gauge') + [void]$sb.AppendLine("windows_ad_sysvol_accessible $sysvolOk") + [void]$sb.AppendLine('') + + # NETLOGON accessibility + try { + $netlogonOk = if (Test-Path "\\$dcName\NETLOGON") { 1 } else { 0 } + } + catch { + $netlogonOk = 0 + } + [void]$sb.AppendLine('# HELP windows_ad_netlogon_accessible NETLOGON share accessibility (1=OK, 0=fail)') + [void]$sb.AppendLine('# TYPE windows_ad_netlogon_accessible gauge') + [void]$sb.AppendLine("windows_ad_netlogon_accessible $netlogonOk") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# DOMAIN CONTROLLER INFO +# ============================================================================ + +function Get-DomainControllerInfoMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $dc = Get-ADDomainController -ErrorAction Stop + $domainName = Sanitize-LabelValue $dc.Domain + $siteName = Sanitize-LabelValue $dc.Site + $isGC = if ($dc.IsGlobalCatalog) { "true" } else { "false" } + + [void]$sb.AppendLine('# HELP windows_ad_domain_controller_info Domain controller metadata') + [void]$sb.AppendLine('# TYPE windows_ad_domain_controller_info gauge') + [void]$sb.AppendLine("windows_ad_domain_controller_info{domain=`"$domainName`",site=`"$siteName`",gc=`"$isGC`"} 1") + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect domain 
controller info: $_" + [void]$sb.AppendLine('# HELP windows_ad_domain_controller_info Domain controller metadata') + [void]$sb.AppendLine('# TYPE windows_ad_domain_controller_info gauge') + [void]$sb.AppendLine('') + } + + $sb.ToString() +} + +# ============================================================================ +# COLLECT ALL METRICS +# ============================================================================ + +function Get-AllMetrics { + $scriptStart = Get-Date + $sb = [System.Text.StringBuilder]::new() + + # Exporter up + [void]$sb.AppendLine('# HELP windows_ad_up Exporter status (1=up, 0=down)') + [void]$sb.AppendLine('# TYPE windows_ad_up gauge') + [void]$sb.AppendLine('windows_ad_up 1') + [void]$sb.AppendLine('') + + # Exporter info + [void]$sb.AppendLine('# HELP windows_ad_exporter_info Exporter version information') + [void]$sb.AppendLine('# TYPE windows_ad_exporter_info gauge') + [void]$sb.AppendLine('windows_ad_exporter_info{version="1.0"} 1') + [void]$sb.AppendLine('') + + # Collect all sections + [void]$sb.Append((Get-ReplicationMetrics)) + [void]$sb.Append((Get-FsmoRoleMetrics)) + [void]$sb.Append((Get-AccountHealthMetrics)) + [void]$sb.Append((Get-ComputerHealthMetrics)) + [void]$sb.Append((Get-GroupHealthMetrics)) + [void]$sb.Append((Get-DcdiagMetrics)) + [void]$sb.Append((Get-DnsAndShareMetrics)) + [void]$sb.Append((Get-DomainControllerInfoMetrics)) + + # Exporter runtime + $scriptEnd = Get-Date + $duration = Format-MetricValue ($scriptEnd - $scriptStart).TotalSeconds + $timestamp = Get-UnixTimestamp + + [void]$sb.AppendLine('# HELP windows_ad_exporter_duration_seconds Time to generate all metrics') + [void]$sb.AppendLine('# TYPE windows_ad_exporter_duration_seconds gauge') + [void]$sb.AppendLine("windows_ad_exporter_duration_seconds $duration") + [void]$sb.AppendLine('') + [void]$sb.AppendLine('# HELP windows_ad_exporter_last_run_timestamp Unix timestamp of last successful run') + [void]$sb.AppendLine('# TYPE 
windows_ad_exporter_last_run_timestamp gauge') + [void]$sb.AppendLine("windows_ad_exporter_last_run_timestamp $timestamp") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# HTTP SERVER MODE +# ============================================================================ + +function Start-HttpServer { + param([int]$ListenPort) + + $prefix = "http://+:$ListenPort/" + $listener = [System.Net.HttpListener]::new() + $listener.Prefixes.Add($prefix) + + try { + $listener.Start() + Write-Host "Starting Active Directory health exporter on port $ListenPort..." -ForegroundColor Green + Write-Host "Metrics available at http://localhost:$ListenPort/metrics" + + while ($listener.IsListening) { + $context = $listener.GetContext() + $request = $context.Request + $response = $context.Response + + if ($request.Url.AbsolutePath -eq '/metrics') { + $metrics = Get-AllMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($metrics) + $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8' + } + else { + $html = @" + + +AD Health Exporter v1.0 + +

Active Directory Health Exporter v1.0

+

Metrics

+

Sections

+ + + +"@ + $buffer = [System.Text.Encoding]::UTF8.GetBytes($html) + $response.ContentType = 'text/html; charset=utf-8' + } + + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.OutputStream.Close() + } + } + catch { + Write-Error "HTTP server error: $_" + Write-Error "If access denied, run: netsh http add urlacl url=http://+:$ListenPort/ user=Everyone" + } + finally { + if ($listener.IsListening) { + $listener.Stop() + } + } +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +switch ($Mode) { + 'http' { + Start-HttpServer -ListenPort $Port + } + 'textfile' { + if (-not $OutputFile) { + $OutputFile = Join-Path $TextfileDir 'windows_ad_health.prom' + } + + $outputDir = Split-Path $OutputFile -Parent + if (-not (Test-Path $outputDir)) { + New-Item -Path $outputDir -ItemType Directory -Force | Out-Null + } + + $tempFile = Join-Path $outputDir ".windows_ad_health_metrics.$PID.tmp" + + try { + $metrics = Get-AllMetrics + $metrics | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + + $lineCount = ($metrics -split "`n").Count + if ($lineCount -lt 10) { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Metrics file too small ($lineCount lines), keeping previous" + exit 1 + } + + Move-Item -Path $tempFile -Destination $OutputFile -Force + Write-Host "Metrics written to $OutputFile ($lineCount lines)" -ForegroundColor Green + } + catch { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Failed to generate metrics: $_" + exit 1 + } + } + default { + Get-AllMetrics | Write-Output + } +} diff --git a/add-apache-bot-block.sh b/add-apache-bot-block.sh new file mode 100755 index 0000000..5686164 --- /dev/null +++ b/add-apache-bot-block.sh @@ -0,0 +1,374 @@ +#!/bin/bash 
+################################################################################ +# Script Name: add-apache-bot-block.sh +# Version: 1.1 +# Description: Automate AI scraper, SEO bot, vulnerability scanner, and +# scraping framework blocking on standard Apache servers. +# Creates mod_rewrite rules server-wide or per-directory via +# .htaccess. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Apache installed (apache2 on Debian/Ubuntu or httpd on RHEL/CentOS) +# - Root access +# - mod_rewrite available +# +# Usage: +# sudo ./add-apache-bot-block.sh +# sudo ./add-apache-bot-block.sh --dry-run +# sudo ./add-apache-bot-block.sh --remove +# sudo ./add-apache-bot-block.sh --htaccess /var/www/html +# sudo ./add-apache-bot-block.sh --htaccess /var/www/html --remove +# +# Changelog: +# 1.1 — 2026-05-04: Removed OAI-SearchBot from blocklist. User-facing fetcher +# bot, not a training crawler. Blocking it prevents your content from +# being cited in AI search answers. 
+# +################################################################################ + +set -euo pipefail + +# --- Configuration --- +DRY_RUN=false +REMOVE=false +HTACCESS_PATH="" +DISTRO="" # debian or rhel +CONF_FILE="" # set after distro detection +APACHE_SVC="" # apache2 or httpd + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +usage() { + cat <&2 + exit 1 +fi + +# --- Distro detection --- +detect_distro() { + if [[ -f /etc/debian_version ]]; then + DISTRO="debian" + CONF_FILE="/etc/apache2/conf-available/bot-block.conf" + APACHE_SVC="apache2" + elif [[ -f /etc/redhat-release ]]; then + DISTRO="rhel" + CONF_FILE="/etc/httpd/conf.d/bot-block.conf" + APACHE_SVC="httpd" + else + echo -e "${RED}Error: Unsupported distribution (neither Debian/Ubuntu nor RHEL/CentOS)${NC}" >&2 + exit 1 + fi +} + +detect_distro + +# --- Apache check --- +if ! 
command -v apachectl &>/dev/null; then + echo -e "${RED}Error: Apache (${APACHE_SVC}) not found${NC}" >&2 + exit 1 +fi + +# --- Bot-block rules content --- +MANAGED_START="# bot-block-managed-start" +MANAGED_END="# bot-block-managed-end" + +generate_rules() { + cat <<'RULES' +# bot-block-managed-start +# Bot-blocking rules for Apache — generated by add-apache-bot-block.sh +# https://mylinux.work + +RewriteEngine On + +# AI scrapers +RewriteCond %{HTTP_USER_AGENT} ABEvalBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} GPTBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} ClaudeBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} anthropic-ai [NC,OR] +RewriteCond %{HTTP_USER_AGENT} CCBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Bytespider [NC,OR] +RewriteCond %{HTTP_USER_AGENT} TikTokSpider [NC,OR] +RewriteCond %{HTTP_USER_AGENT} cohere-ai [NC,OR] +RewriteCond %{HTTP_USER_AGENT} PerplexityBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Diffbot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} MistralBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} YandexGPTBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} meta-externalagent [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Meta-ExternalFetcher [NC,OR] +RewriteCond %{HTTP_USER_AGENT} meta-webindexer [NC,OR] +RewriteCond %{HTTP_USER_AGENT} PetalBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Amazonbot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Amzn-SearchBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} AI2Bot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Timpibot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} img2dataset [NC,OR] +RewriteCond %{HTTP_USER_AGENT} YouBot [NC,OR] + +# SEO scrapers +RewriteCond %{HTTP_USER_AGENT} MJ12bot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} SemrushBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} AhrefsBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} DotBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} DataForSeoBot [NC,OR] +RewriteCond %{HTTP_USER_AGENT} SERanking [NC,OR] + +# Vulnerability scanners +RewriteCond %{HTTP_USER_AGENT} Nikto [NC,OR] +RewriteCond %{HTTP_USER_AGENT} sqlmap 
[NC,OR] +RewriteCond %{HTTP_USER_AGENT} Nmap [NC,OR] +RewriteCond %{HTTP_USER_AGENT} masscan [NC,OR] +RewriteCond %{HTTP_USER_AGENT} ZmEu [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Morpheus [NC,OR] + +# Scraping frameworks +RewriteCond %{HTTP_USER_AGENT} Scrapy [NC,OR] +RewriteCond %{HTTP_USER_AGENT} python-requests [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Go-http-client [NC,OR] +RewriteCond %{HTTP_USER_AGENT} Java/ [NC,OR] +RewriteCond %{HTTP_USER_AGENT} libwww-perl [NC,OR] +RewriteCond %{HTTP_USER_AGENT} trafilatura [NC] +RewriteRule .* - [F] + +# Block broken srcset scrapers +RewriteCond %{REQUEST_URI} %20[0-9]+w,https?:// [NC] +RewriteRule .* - [F] +# bot-block-managed-end +RULES +} + +# ===================================================== +# --remove mode +# ===================================================== +if [[ "$REMOVE" == "true" ]]; then + + # --- Remove from .htaccess --- + if [[ -n "$HTACCESS_PATH" ]]; then + HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess" + step "Removing bot-block rules from ${HTACCESS_FILE}" + + if [[ ! -f "$HTACCESS_FILE" ]]; then + warn "File not found: ${HTACCESS_FILE} — nothing to remove" + exit 0 + fi + + if ! grep -q "$MANAGED_START" "$HTACCESS_FILE"; then + warn "No managed bot-block block found in ${HTACCESS_FILE}" + exit 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would strip managed block from ${HTACCESS_FILE}" + else + cp "$HTACCESS_FILE" "${HTACCESS_FILE}.bak.$(date +%s)" + warn "Backup created" + sed -i "/${MANAGED_START}/,/${MANAGED_END}/d" "$HTACCESS_FILE" + info "Managed block removed from ${HTACCESS_FILE}" + fi + exit 0 + fi + + # --- Remove server-wide conf --- + step "Removing bot-block configuration" + + if [[ ! 
-f "$CONF_FILE" ]]; then + warn "Config not found: ${CONF_FILE} — nothing to remove" + exit 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + if [[ "$DISTRO" == "debian" ]]; then + echo " Would run: a2disconf bot-block" + fi + echo " Would remove: ${CONF_FILE}" + echo " Would reload: ${APACHE_SVC}" + else + if [[ "$DISTRO" == "debian" ]]; then + a2disconf bot-block 2>/dev/null || true + info "Conf disabled (a2disconf)" + fi + rm -f "$CONF_FILE" + info "Removed ${CONF_FILE}" + + step "Reloading ${APACHE_SVC}" + systemctl reload "$APACHE_SVC" + info "${APACHE_SVC} reloaded" + fi + + echo "" + echo -e "${BOLD}Bot-block configuration removed.${NC}" + exit 0 +fi + +# ===================================================== +# --htaccess mode (install) +# ===================================================== +if [[ -n "$HTACCESS_PATH" ]]; then + HTACCESS_FILE="${HTACCESS_PATH%/}/.htaccess" + step "Writing bot-block rules to ${HTACCESS_FILE}" + + if [[ ! -d "$HTACCESS_PATH" ]]; then + echo -e "${RED}Error: Directory not found: ${HTACCESS_PATH}${NC}" >&2 + exit 1 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would write managed block to ${HTACCESS_FILE}" + echo "" + echo -e "${BOLD}Dry-run complete — no changes made.${NC}" + exit 0 + fi + + # Back up existing .htaccess if it exists + if [[ -f "$HTACCESS_FILE" ]]; then + # Remove old managed block if present + if grep -q "$MANAGED_START" "$HTACCESS_FILE"; then + cp "$HTACCESS_FILE" "${HTACCESS_FILE}.bak.$(date +%s)" + warn "Backup created" + sed -i "/${MANAGED_START}/,/${MANAGED_END}/d" "$HTACCESS_FILE" + warn "Old managed block removed" + else + cp "$HTACCESS_FILE" "${HTACCESS_FILE}.bak.$(date +%s)" + warn "Existing .htaccess backed up" + fi + fi + + # Append rules + generate_rules >> "$HTACCESS_FILE" + info "Bot-block rules written to ${HTACCESS_FILE}" + + echo "" + echo -e "${BOLD}Done.${NC}" + echo "" + echo " File: ${HTACCESS_FILE}" + echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' 
https://yourdomain.com" + echo " Expected: 403" + exit 0 +fi + +# ===================================================== +# Server-wide install (default) +# ===================================================== + +# --- Step 1: Enable mod_rewrite (Debian) --- +if [[ "$DISTRO" == "debian" ]]; then + step "Enabling mod_rewrite" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: a2enmod rewrite" + else + a2enmod rewrite 2>/dev/null || true + info "mod_rewrite enabled" + fi +fi + +# --- Step 2: Write conf file --- +step "Creating bot-block conf at ${CONF_FILE}" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CONF_FILE}" +else + if [[ -f "$CONF_FILE" ]]; then + cp "$CONF_FILE" "${CONF_FILE}.bak.$(date +%s)" + warn "Existing config backed up" + fi + generate_rules > "$CONF_FILE" + info "Config created: ${CONF_FILE}" +fi + +# --- Step 3: Enable conf (Debian) --- +if [[ "$DISTRO" == "debian" ]]; then + step "Enabling bot-block conf" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: a2enconf bot-block" + else + a2enconf bot-block 2>/dev/null || true + info "Conf enabled (a2enconf)" + fi +fi + +# --- Step 4: Validate config --- +step "Testing Apache configuration" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: apachectl configtest" +else + if apachectl configtest 2>&1; then + info "Apache config valid" + else + echo -e "${RED}[ERROR] Apache config test failed${NC}" >&2 + echo " Restore backup from ${CONF_FILE}.bak.* and reload" >&2 + exit 1 + fi +fi + +# --- Step 5: Reload Apache --- +step "Reloading ${APACHE_SVC}" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload ${APACHE_SVC}" +else + systemctl reload "$APACHE_SVC" + info "${APACHE_SVC} reloaded" +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Config: ${CONF_FILE}" +echo " Distro: ${DISTRO}" +echo "" +echo " Verify: curl 
-A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 403" +echo "" +echo " Remove: sudo $(basename "$0") --remove" diff --git a/add-fail2ban-ai-bots.sh b/add-fail2ban-ai-bots.sh new file mode 100755 index 0000000..2ed0e67 --- /dev/null +++ b/add-fail2ban-ai-bots.sh @@ -0,0 +1,395 @@ +#!/bin/bash +################################################################################ +# Script Name: add-fail2ban-ai-bots.sh +# Version: 1.1 +# Description: Adds a Fail2ban jail to block AI scrapers and unwanted bots +# that ignore robots.txt. Installs filter + jail config and +# reloads Fail2ban. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./add-fail2ban-ai-bots.sh +# sudo ./add-fail2ban-ai-bots.sh --logpath /var/log/nginx/access.log +# sudo ./add-fail2ban-ai-bots.sh --bantime 604800 +# sudo ./add-fail2ban-ai-bots.sh --dry-run +# +# Changelog: +# 1.1 — 2026-05-04: Removed Claude-Web, Perplexity-User, ChatGPT-User, and +# OAI-SearchBot from blocklist. These are user-facing fetcher bots that +# retrieve content when someone pastes a URL into an AI chat or search. +# Blocking them prevents your content from being cited in AI answers. +# Training crawlers (ClaudeBot, PerplexityBot, GPTBot) remain blocked. 
+# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.1" +readonly SCRIPT_NAME="${0##*/}" + +LOGPATH="auto" +BANTIME="86400" +MAXRETRY="1" +DRY_RUN=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null; then + log_error "Fail2ban is not installed" + log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/" + exit 1 + fi + + if ! systemctl is-active --quiet fail2ban; then + log_error "Fail2ban is not running" + exit 1 + fi + + log_info "Fail2ban is installed and running" +} + +detect_logpath() { + if [[ "$LOGPATH" != "auto" ]]; then + # Support glob patterns (e.g. /var/log/apache2/domains/*.log) + # shellcheck disable=SC2086,SC2206 + local matches=( $LOGPATH ) + if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then + log_error "Log file not found: $LOGPATH" + exit 1 + fi + log_info "Using specified log path: $LOGPATH (${#matches[@]} file(s))" + return + fi + + log_step "Auto-detecting web server access log..." 
+ + # HestiaCP / VestaCP — apache domains (check first: has full access logs with user agents) + local hestia_apache=( /var/log/apache2/domains/*.log ) + if [[ -f "${hestia_apache[0]:-}" ]]; then + LOGPATH="/var/log/apache2/domains/*.log" + log_info "Detected HestiaCP/VestaCP apache: $LOGPATH (${#hestia_apache[@]} file(s))" + return + fi + + # HestiaCP / VestaCP — nginx domains (proxy logs only in nginx+apache mode) + local hestia_nginx=( /var/log/nginx/domains/*.log ) + if [[ -f "${hestia_nginx[0]:-}" ]]; then + LOGPATH="/var/log/nginx/domains/*.log" + log_info "Detected HestiaCP/VestaCP nginx: $LOGPATH (${#hestia_nginx[@]} file(s))" + return + fi + + # Nginx (standard) + if [[ -f /var/log/nginx/access.log ]]; then + LOGPATH="/var/log/nginx/access.log" + log_info "Detected nginx: $LOGPATH" + return + fi + + # Apache (Debian/Ubuntu) + if [[ -f /var/log/apache2/access.log ]]; then + LOGPATH="/var/log/apache2/access.log" + log_info "Detected apache2: $LOGPATH" + return + fi + + # Apache (RHEL/Rocky) + if [[ -f /var/log/httpd/access_log ]]; then + LOGPATH="/var/log/httpd/access_log" + log_info "Detected httpd: $LOGPATH" + return + fi + + log_error "Could not auto-detect access log. Use --logpath to specify." 
+ exit 1 +} + +# ============================================================================ +# INSTALL FILTER +# ============================================================================ + +install_filter() { + local filter_file="/etc/fail2ban/filter.d/ai-bots.conf" + + log_step "Installing filter: $filter_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $filter_file" + echo "" + generate_filter + echo "" + return + fi + + if [[ -f "$filter_file" ]]; then + log_warn "Filter already exists — backing up to ${filter_file}.bak" + cp "$filter_file" "${filter_file}.bak" + fi + + generate_filter > "$filter_file" + log_info "Filter installed: $filter_file" +} + +generate_filter() { + cat <<'EOF' +# Fail2ban filter to block AI scrapers and unwanted bots +# https://mylinux.work +# +# Matches common AI crawler user agents in web server access logs. +# These bots scrape content for AI model training and typically +# ignore robots.txt directives. + +[Definition] + +# Match AI and unwanted bot user agents in access logs +# Supports both combined and common log formats +failregex = ^ \S+ \S+ \[.*\] "\S+ \S+ \S+" \d+ \d+ "\S+" ".*(?:ABEvalBot|GPTBot|CCBot|ClaudeBot|anthropic-ai|Bytespider|TikTokSpider|cohere-ai|meta-externalagent|Meta-ExternalFetcher|PetalBot|Amazonbot|AI2Bot|Ai2Bot-Dolma|YouBot|PerplexityBot|Diffbot|Applebot-Extended|Google-Extended|MistralBot|YandexGPTBot|MJ12bot|Scrapy|DataForSeoBot|Timpibot|img2dataset|HanaleiBot|SemrushBot|AhrefsBot|DotBot|SERanking|trafilatura).*" + +ignoreregex = + +# Author: Phil Connor — https://mylinux.work +EOF +} + +# ============================================================================ +# INSTALL JAIL +# ============================================================================ + +install_jail() { + local jail_file="/etc/fail2ban/jail.d/ai-bots.conf" + + log_step "Installing jail: $jail_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $jail_file" + echo "" + generate_jail + echo "" + return + 
fi + + if [[ -f "$jail_file" ]]; then + log_warn "Jail config already exists — backing up to ${jail_file}.bak" + cp "$jail_file" "${jail_file}.bak" + fi + + generate_jail > "$jail_file" + log_info "Jail config installed: $jail_file" +} + +generate_jail() { + cat </dev/null; then + log_warn "Config test not available — reloading directly" + fi + + fail2ban-client reload + sleep 2 + + if systemctl is-active --quiet fail2ban; then + log_info "Fail2ban reloaded successfully" + else + log_error "Fail2ban failed to restart — check: journalctl -u fail2ban" + exit 1 + fi +} + +verify_jail() { + log_step "Verifying ai-bots jail..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would verify jail status" + return + fi + + echo "" + if fail2ban-client status ai-bots 2>/dev/null; then + echo "" + log_info "AI bots jail is active and monitoring $LOGPATH" + else + log_error "Jail 'ai-bots' is not running — check: fail2ban-client status" + log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/ai-bots.conf" + exit 1 + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban AI Bot Blocker v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + check_fail2ban + detect_logpath + install_filter + install_jail + reload_fail2ban + verify_jail + + echo "" + echo "============================================" + echo " Setup Complete" + echo "============================================" + echo "" + echo " Jail: ai-bots" + echo " Log: $LOGPATH" + echo " Ban time: ${BANTIME}s ($(( BANTIME / 3600 ))h)" + echo " Max retry: $MAXRETRY" + echo "" + echo " Useful commands:" + echo " fail2ban-client status ai-bots" + echo " fail2ban-client set ai-bots unbanip " + echo " fail2ban-regex 
$LOGPATH /etc/fail2ban/filter.d/ai-bots.conf" + echo "" +} + +main "$@" diff --git a/add-fail2ban-head-crawler.sh b/add-fail2ban-head-crawler.sh new file mode 100755 index 0000000..5a87b39 --- /dev/null +++ b/add-fail2ban-head-crawler.sh @@ -0,0 +1,456 @@ +#!/bin/bash +################################################################################ +# Script Name: add-fail2ban-head-crawler.sh +# Version: 1.0 +# Description: Adds a Fail2ban jail to block HEAD-only crawlers — bots that +# systematically send HEAD requests with no referer to probe or +# index your site while spoofing real browser user agents. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./add-fail2ban-head-crawler.sh +# sudo ./add-fail2ban-head-crawler.sh --logpath /var/log/nginx/access.log +# sudo ./add-fail2ban-head-crawler.sh --maxretry 10 +# sudo ./add-fail2ban-head-crawler.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" + +LOGPATH="auto" +BANTIME="86400" +MAXRETRY="5" +FINDTIME="300" +DRY_RUN=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null; then + log_error "Fail2ban is not installed" + log_error "Install it first: 
https://mylinux.work/guides/fail2ban-setup/" + exit 1 + fi + + if ! systemctl is-active --quiet fail2ban; then + log_error "Fail2ban is not running" + exit 1 + fi + + log_info "Fail2ban is installed and running" +} + +detect_logpath() { + if [[ "$LOGPATH" != "auto" ]]; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then + log_error "Log file not found: $LOGPATH" + exit 1 + fi + log_info "Using specified log path: $LOGPATH" + return + fi + + log_step "Auto-detecting web server access log..." + + # HestiaCP — apache domains + local hestia_apache=( /var/log/apache2/domains/*.log ) + if [[ -f "${hestia_apache[0]:-}" ]]; then + LOGPATH="/var/log/apache2/domains/*.log" + log_info "Detected HestiaCP apache: $LOGPATH" + return + fi + + # HestiaCP — nginx domains + local hestia_nginx=( /var/log/nginx/domains/*.log ) + if [[ -f "${hestia_nginx[0]:-}" ]]; then + LOGPATH="/var/log/nginx/domains/*.log" + log_info "Detected HestiaCP nginx: $LOGPATH" + return + fi + + # Nginx (standard) + if [[ -f /var/log/nginx/access.log ]]; then + LOGPATH="/var/log/nginx/access.log" + log_info "Detected nginx: $LOGPATH" + return + fi + + # Apache (Debian/Ubuntu) + if [[ -f /var/log/apache2/access.log ]]; then + LOGPATH="/var/log/apache2/access.log" + log_info "Detected apache2: $LOGPATH" + return + fi + + # Apache (RHEL/Rocky) + if [[ -f /var/log/httpd/access_log ]]; then + LOGPATH="/var/log/httpd/access_log" + log_info "Detected httpd: $LOGPATH" + return + fi + + log_error "Could not auto-detect access log. Use --logpath to specify." + exit 1 +} + +# ============================================================================ +# REMOVE +# ============================================================================ + +do_remove() { + local filter_file="/etc/fail2ban/filter.d/head-crawler.conf" + local jail_file="/etc/fail2ban/jail.d/head-crawler.conf" + + log_step "Removing HEAD crawler jail..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would remove $filter_file" + log_info "[DRY RUN] Would remove $jail_file" + log_info "[DRY RUN] Would reload fail2ban" + return + fi + + if [[ -f "$jail_file" ]]; then + rm -f "$jail_file" + log_info "Removed: $jail_file" + else + log_warn "Jail config not found: $jail_file" + fi + + if [[ -f "$filter_file" ]]; then + rm -f "$filter_file" + log_info "Removed: $filter_file" + else + log_warn "Filter not found: $filter_file" + fi + + fail2ban-client reload + sleep 2 + log_info "Fail2ban reloaded — head-crawler jail removed" + exit 0 +} + +# ============================================================================ +# INSTALL FILTER +# ============================================================================ + +install_filter() { + local filter_file="/etc/fail2ban/filter.d/head-crawler.conf" + + log_step "Installing filter: $filter_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $filter_file" + echo "" + generate_filter + echo "" + return + fi + + if [[ -f "$filter_file" ]]; then + log_warn "Filter already exists — backing up to ${filter_file}.bak" + cp "$filter_file" "${filter_file}.bak" + fi + + generate_filter > "$filter_file" + log_info "Filter installed: $filter_file" +} + +generate_filter() { + cat <<'EOF' +# Fail2ban filter to block HEAD-only crawlers +# https://mylinux.work +# +# Catches bots that send HEAD requests with no referer. These are typically +# scrapers, SEO tools, or reconnaissance bots that spoof real browser user +# agents and rotate through cloud IPs to avoid detection. +# +# The filter matches: +# - HTTP HEAD method +# - No referer (logged as "-") +# - Any user agent (spoofed or otherwise) +# +# Combined with a low maxretry (default: 5 in 5 min), this catches +# systematic crawlers while ignoring occasional legitimate HEAD requests +# (browser prefetch, monitoring probes). 
+ +[Definition] + +# HEAD request with no referer — combined log format +# Format: IP - - [date] "HEAD /path HTTP/x.x" status size "-" "user agent" +failregex = ^ \S+ \S+ \[.*\] "HEAD \S+ \S+" \d+ \d+ "-" ".*" + +ignoreregex = + +# Author: Phil Connor — https://mylinux.work +EOF +} + +# ============================================================================ +# INSTALL JAIL +# ============================================================================ + +install_jail() { + local jail_file="/etc/fail2ban/jail.d/head-crawler.conf" + + log_step "Installing jail: $jail_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $jail_file" + echo "" + generate_jail + echo "" + return + fi + + if [[ -f "$jail_file" ]]; then + log_warn "Jail config already exists — backing up to ${jail_file}.bak" + cp "$jail_file" "${jail_file}.bak" + fi + + generate_jail > "$jail_file" + log_info "Jail config installed: $jail_file" +} + +generate_jail() { + cat </dev/null; then + log_warn "Config test not available — reloading directly" + fi + + fail2ban-client reload + sleep 2 + + if systemctl is-active --quiet fail2ban; then + log_info "Fail2ban reloaded successfully" + else + log_error "Fail2ban failed to restart — check: journalctl -u fail2ban" + exit 1 + fi +} + +verify_jail() { + log_step "Verifying head-crawler jail..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would verify jail status" + return + fi + + echo "" + if fail2ban-client status head-crawler 2>/dev/null; then + echo "" + log_info "HEAD crawler jail is active and monitoring $LOGPATH" + else + log_error "Jail 'head-crawler' is not running — check: fail2ban-client status" + log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/head-crawler.conf" + exit 1 + fi +} + +test_against_logs() { + if $DRY_RUN; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ -f "${matches[0]}" ]]; then + log_step "Testing filter against existing logs..." 
+ echo "" + fail2ban-regex "${matches[0]}" /dev/stdin <<'FILTER' 2>&1 | tail -5 +[Definition] +failregex = ^ \S+ \S+ \[.*\] "HEAD \S+ \S+" \d+ \d+ "-" ".*" +ignoreregex = +FILTER + echo "" + fi + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban HEAD Crawler Blocker v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + check_fail2ban + + if $REMOVE; then + do_remove + fi + + detect_logpath + test_against_logs + install_filter + install_jail + reload_fail2ban + verify_jail + + echo "" + echo "============================================" + echo " Setup Complete" + echo "============================================" + echo "" + echo " Jail: head-crawler" + echo " Log: $LOGPATH" + echo " Ban time: ${BANTIME}s ($(( BANTIME / 3600 ))h)" + echo " Max retry: $MAXRETRY (HEAD requests before ban)" + echo " Find time: ${FINDTIME}s ($(( FINDTIME / 60 ))m window)" + echo "" + echo " Useful commands:" + echo " fail2ban-client status head-crawler" + echo " fail2ban-client set head-crawler unbanip " + echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/head-crawler.conf" + echo "" +} + +main "$@" diff --git a/add-fail2ban-image-scraper.sh b/add-fail2ban-image-scraper.sh new file mode 100755 index 0000000..d3c5c98 --- /dev/null +++ b/add-fail2ban-image-scraper.sh @@ -0,0 +1,474 @@ +#!/bin/bash +################################################################################ +# Script Name: add-fail2ban-image-scraper.sh +# Version: 1.0 +# Description: Adds a Fail2ban jail to block image scrapers — bots that +# directly request image files with no referer. Real browsers +# always send a referer when loading images (the page containing +# the tag). 
Direct image requests with no referer are +# almost always scrapers harvesting images for AI training +# datasets or content theft. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./add-fail2ban-image-scraper.sh +# sudo ./add-fail2ban-image-scraper.sh --logpath /var/log/nginx/access.log +# sudo ./add-fail2ban-image-scraper.sh --maxretry 3 +# sudo ./add-fail2ban-image-scraper.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.1" +readonly SCRIPT_NAME="${0##*/}" + +LOGPATH="auto" +BANTIME="86400" +MAXRETRY="5" +FINDTIME="300" +IGNOREIP="" +DRY_RUN=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null; then + log_error "Fail2ban is not installed" + log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/" + exit 1 + fi + + if ! systemctl is-active --quiet fail2ban; then + log_error "Fail2ban is not running" + exit 1 + fi + + log_info "Fail2ban is installed and running" +} + +detect_logpath() { + if [[ "$LOGPATH" != "auto" ]]; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ ${#matches[@]} -eq 0 || ! 
-f "${matches[0]}" ]]; then + log_error "Log file not found: $LOGPATH" + exit 1 + fi + log_info "Using specified log path: $LOGPATH" + return + fi + + log_step "Auto-detecting web server access log..." + + # HestiaCP — apache domains + local hestia_apache=( /var/log/apache2/domains/*.log ) + if [[ -f "${hestia_apache[0]:-}" ]]; then + LOGPATH="/var/log/apache2/domains/*.log" + log_info "Detected HestiaCP apache: $LOGPATH" + return + fi + + # HestiaCP — nginx domains + local hestia_nginx=( /var/log/nginx/domains/*.log ) + if [[ -f "${hestia_nginx[0]:-}" ]]; then + LOGPATH="/var/log/nginx/domains/*.log" + log_info "Detected HestiaCP nginx: $LOGPATH" + return + fi + + # Nginx (standard) + if [[ -f /var/log/nginx/access.log ]]; then + LOGPATH="/var/log/nginx/access.log" + log_info "Detected nginx: $LOGPATH" + return + fi + + # Apache (Debian/Ubuntu) + if [[ -f /var/log/apache2/access.log ]]; then + LOGPATH="/var/log/apache2/access.log" + log_info "Detected apache2: $LOGPATH" + return + fi + + # Apache (RHEL/Rocky) + if [[ -f /var/log/httpd/access_log ]]; then + LOGPATH="/var/log/httpd/access_log" + log_info "Detected httpd: $LOGPATH" + return + fi + + log_error "Could not auto-detect access log. Use --logpath to specify." + exit 1 +} + +# ============================================================================ +# REMOVE +# ============================================================================ + +do_remove() { + local filter_file="/etc/fail2ban/filter.d/image-scraper.conf" + local jail_file="/etc/fail2ban/jail.d/image-scraper.conf" + + log_step "Removing image scraper jail..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would remove $filter_file" + log_info "[DRY RUN] Would remove $jail_file" + log_info "[DRY RUN] Would reload fail2ban" + return + fi + + if [[ -f "$jail_file" ]]; then + rm -f "$jail_file" + log_info "Removed: $jail_file" + else + log_warn "Jail config not found: $jail_file" + fi + + if [[ -f "$filter_file" ]]; then + rm -f "$filter_file" + log_info "Removed: $filter_file" + else + log_warn "Filter not found: $filter_file" + fi + + fail2ban-client reload + sleep 2 + log_info "Fail2ban reloaded — image-scraper jail removed" + exit 0 +} + +# ============================================================================ +# INSTALL FILTER +# ============================================================================ + +install_filter() { + local filter_file="/etc/fail2ban/filter.d/image-scraper.conf" + + log_step "Installing filter: $filter_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $filter_file" + echo "" + generate_filter + echo "" + return + fi + + if [[ -f "$filter_file" ]]; then + log_warn "Filter already exists — backing up to ${filter_file}.bak" + cp "$filter_file" "${filter_file}.bak" + fi + + generate_filter > "$filter_file" + log_info "Filter installed: $filter_file" +} + +generate_filter() { + cat <<'EOF' +# Fail2ban filter to block image scrapers +# https://mylinux.work +# +# Catches bots that directly request image files with no referer. +# When a real browser loads an image from a web page, it sends the page +# URL as the referer header. Direct image requests with no referer +# indicate scraping — typically for AI training datasets or content theft. +# +# Matches: GET requests for .png, .jpg, .jpeg, .gif, .webp, .svg, .avif +# with referer logged as "-" (absent/empty). +# +# Does NOT match .ico (favicons are legitimately requested without referer). 
+ +[Definition] + +# Direct image request with no referer — combined log format +# Format: IP - - [date] "GET /path/image.png HTTP/x.x" status size "-" "user agent" +failregex = ^ \S+ \S+ \[.*\] "GET \S+\.(?:png|jpe?g|gif|webp|svg|avif) \S+" \d+ \d+ "-" ".*" + +ignoreregex = + +# Author: Phil Connor — https://mylinux.work +EOF +} + +# ============================================================================ +# INSTALL JAIL +# ============================================================================ + +install_jail() { + local jail_file="/etc/fail2ban/jail.d/image-scraper.conf" + + log_step "Installing jail: $jail_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $jail_file" + echo "" + generate_jail + echo "" + return + fi + + if [[ -f "$jail_file" ]]; then + log_warn "Jail config already exists — backing up to ${jail_file}.bak" + cp "$jail_file" "${jail_file}.bak" + fi + + generate_jail > "$jail_file" + log_info "Jail config installed: $jail_file" +} + +generate_jail() { + cat </dev/null; then + log_warn "Config test not available — reloading directly" + fi + + fail2ban-client reload + sleep 2 + + if systemctl is-active --quiet fail2ban; then + log_info "Fail2ban reloaded successfully" + else + log_error "Fail2ban failed to restart — check: journalctl -u fail2ban" + exit 1 + fi +} + +verify_jail() { + log_step "Verifying image-scraper jail..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would verify jail status" + return + fi + + echo "" + if fail2ban-client status image-scraper 2>/dev/null; then + echo "" + log_info "Image scraper jail is active and monitoring $LOGPATH" + else + log_error "Jail 'image-scraper' is not running — check: fail2ban-client status" + log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/image-scraper.conf" + exit 1 + fi +} + +test_against_logs() { + if $DRY_RUN; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ -f "${matches[0]}" ]]; then + log_step "Testing filter against existing logs..." + echo "" + fail2ban-regex "${matches[0]}" /dev/stdin <<'FILTER' 2>&1 | tail -5 +[Definition] +failregex = ^ \S+ \S+ \[.*\] "GET \S+\.(?:png|jpe?g|gif|webp|svg|avif) \S+" \d+ \d+ "-" ".*" +ignoreregex = +FILTER + echo "" + fi + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban Image Scraper Blocker v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + check_fail2ban + + if $REMOVE; then + do_remove + fi + + detect_logpath + test_against_logs + install_filter + install_jail + reload_fail2ban + verify_jail + + echo "" + echo "============================================" + echo " Setup Complete" + echo "============================================" + echo "" + echo " Jail: image-scraper" + echo " Log: $LOGPATH" + echo " Ban time: ${BANTIME}s ($(( BANTIME / 3600 ))h)" + echo " Max retry: $MAXRETRY (direct image requests before ban)" + echo " Find time: ${FINDTIME}s ($(( FINDTIME / 60 ))m window)" + if [[ -n "$IGNOREIP" ]]; then + echo " Ignore: $IGNOREIP" + fi + echo "" + echo " Useful commands:" + echo " fail2ban-client status image-scraper" + echo " 
fail2ban-client set image-scraper unbanip " + echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/image-scraper.conf" + echo "" +} + +main "$@" diff --git a/add-fail2ban-nginx-hardening.sh b/add-fail2ban-nginx-hardening.sh new file mode 100755 index 0000000..4ba8d54 --- /dev/null +++ b/add-fail2ban-nginx-hardening.sh @@ -0,0 +1,558 @@ +#!/bin/bash +################################################################################ +# Script Name: add-fail2ban-nginx-hardening.sh +# Version: 1.0 +# Description: Adds custom Fail2ban jails to block vulnerability scanners, +# script probes, and path enumeration attacks on nginx servers. +# Installs filters + jails and reloads Fail2ban. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./add-fail2ban-nginx-hardening.sh +# sudo ./add-fail2ban-nginx-hardening.sh --logpath /var/log/nginx/access.log +# sudo ./add-fail2ban-nginx-hardening.sh --bantime 604800 +# sudo ./add-fail2ban-nginx-hardening.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" + +ACCESS_LOGPATH="auto" +ERROR_LOGPATH="auto" +BANTIME="86400" +DRY_RUN=false +SKIP_JAILS="" +ALLOW_EXTENSIONS="" +ALLOW_PATHS="" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} 
$*"; } + +show_usage() { + cat </dev/null; then + log_error "Fail2ban is not installed" + log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/" + exit 1 + fi + + if ! systemctl is-active --quiet fail2ban; then + log_error "Fail2ban is not running" + exit 1 + fi + + log_info "Fail2ban is installed and running" +} + +detect_logpaths() { + # Access log + if [[ "$ACCESS_LOGPATH" == "auto" ]]; then + if [[ -f /var/log/nginx/access.log ]]; then + ACCESS_LOGPATH="/var/log/nginx/access.log" + else + log_error "Could not find nginx access log. Use --logpath to specify." + exit 1 + fi + elif [[ ! -f "$ACCESS_LOGPATH" ]]; then + log_error "Access log not found: $ACCESS_LOGPATH" + exit 1 + fi + log_info "Access log: $ACCESS_LOGPATH" + + # Error log + if [[ "$ERROR_LOGPATH" == "auto" ]]; then + if [[ -f /var/log/nginx/error.log ]]; then + ERROR_LOGPATH="/var/log/nginx/error.log" + else + ERROR_LOGPATH="$ACCESS_LOGPATH" + log_warn "Error log not found — using access log for all jails" + fi + fi + log_info "Error log: $ERROR_LOGPATH" +} + +should_skip() { + local jail="$1" + [[ ",$SKIP_JAILS," == *",$jail,"* ]] +} + +# Build extension regex excluding allowed extensions +build_extension_regex() { + local all_exts="php|asp|aspx|jsp|cgi|exe|pl" + if [[ -n "$ALLOW_EXTENSIONS" ]]; then + local result="" + IFS='|' read -ra EXT_ARRAY <<< "${all_exts}" + for ext in "${EXT_ARRAY[@]}"; do + if [[ ",$ALLOW_EXTENSIONS," != *",$ext,"* ]]; then + [[ -n "$result" ]] && result="${result}|" + result="${result}${ext}" + fi + done + if [[ -z "$result" ]]; then + log_warn "All extensions whitelisted — skipping nginx-noscript" + return 1 + fi + echo "$result" + else + echo "$all_exts" + fi + return 0 +} + +# Build path ignore regex from allowed paths +build_path_ignoreregex() { + if [[ -z "$ALLOW_PATHS" ]]; then + echo "" + return + fi + local ignore_parts="" + IFS=',' read -ra PATH_ARRAY <<< "$ALLOW_PATHS" + for p in "${PATH_ARRAY[@]}"; do + p=$(echo "$p" | xargs | sed 's|^/||') 
+ [[ -n "$ignore_parts" ]] && ignore_parts="${ignore_parts}|" + ignore_parts="${ignore_parts}/${p}" + done + echo "ignoreregex = ^ \\S+ \\S+ \\[.*\\] \"\\S+ [^\"]*($ignore_parts)[^\"]*HTTP" +} + +# ============================================================================ +# HELPER — write file with backup +# ============================================================================ + +write_config() { + local file="$1" + local label="$2" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $file" + echo "" + cat + echo "" + return + fi + + if [[ -f "$file" ]]; then + log_warn "$label already exists — backing up to ${file}.bak" + cp "$file" "${file}.bak" + fi + + cat > "$file" + log_info "$label installed: $file" +} + +# ============================================================================ +# JAIL 1: nginx-noscript +# ============================================================================ + +install_noscript() { + if should_skip "noscript"; then + log_info "Skipping nginx-noscript (--skip)" + return + fi + + local ext_regex + ext_regex=$(build_extension_regex) || return 0 + + log_step "Installing nginx-noscript filter and jail..." + if [[ -n "$ALLOW_EXTENSIONS" ]]; then + log_info "Whitelisted extensions: $ALLOW_EXTENSIONS" + fi + log_info "Blocking extensions: $ext_regex" + + # Filter + generate_noscript_filter "$ext_regex" | write_config \ + /etc/fail2ban/filter.d/nginx-noscript.conf \ + "nginx-noscript filter" + + # Jail + generate_noscript_jail | write_config \ + /etc/fail2ban/jail.d/nginx-noscript.conf \ + "nginx-noscript jail" +} + +generate_noscript_filter() { + local ext_regex="$1" + cat < \S+ \S+ \[.*\] "(GET|POST|HEAD|PUT|DELETE|OPTIONS) [^"]*\.($ext_regex)(\?[^\"]*)? 
HTTP[^"]*" (400|403|404|444) + +ignoreregex = + +# Author: Phil Connor — https://mylinux.work +EOF +} + +generate_noscript_jail() { + cat < \S+ \S+ \[.*\] "(GET|POST|HEAD) [^"]*(/\.env|/\.git|/\.svn|/\.hg|/\.htaccess|/\.htpasswd|/\.aws|/\.docker|/\.ssh|/\.kube|/\.config|/wp-admin|/wp-login|/wp-config|/wp-content/uploads|/wp-includes|/xmlrpc\.php|/administrator|/admin/config|/phpmyadmin|/pma|/myadmin|/dbadmin|/mysql|/phpinfo|/info\.php|/server-status|/server-info|/cgi-bin|/shell|/cmd|/command|/console|/config\.json|/config\.yml|/config\.yaml|/config\.xml|/database\.yml|/backup|/dump|/db\.sql|/\.sql|/api/v1/debug|/debug|/trace|/actuator|/swagger|/graphql|/solr|/elasticsearch|/_cat|/_cluster)[^"]*HTTP[^"]*" (400|403|404|444) + +$ignore_line + +# Author: Phil Connor — https://mylinux.work +EOF +} + +generate_pathscan_jail() { + cat < \S+ \S+ \[.*\] "\S+ \S+ \S+" (400|401|403|404|405|444) + +ignoreregex = + +# Author: Phil Connor — https://mylinux.work +EOF +} + +generate_4xx_jail() { + cat </dev/null; then + log_info " ✓ $full_jail is active" + else + log_error " ✗ $full_jail failed to start" + all_ok=false + fi + done + echo "" + + if ! 
$all_ok; then + log_error "Some jails failed — debug with:" + log_error " fail2ban-client status" + log_error " fail2ban-regex $ACCESS_LOGPATH /etc/fail2ban/filter.d/.conf" + exit 1 + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban Nginx Hardening v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + check_fail2ban + detect_logpaths + + install_noscript + install_pathscan + install_4xx_flood + + reload_fail2ban + verify_jails + + local installed=0 + should_skip "noscript" || installed=$((installed + 1)) + should_skip "pathscan" || installed=$((installed + 1)) + should_skip "4xx-flood" || installed=$((installed + 1)) + + echo "============================================" + echo " Setup Complete — ${installed} Jail(s) Installed" + echo "============================================" + echo "" + echo " Jails:" + should_skip "noscript" || echo " nginx-noscript Ban after 2 script requests (.php, .asp, etc.)" + should_skip "pathscan" || echo " nginx-pathscan Ban on first sensitive path probe (.env, .git, wp-admin)" + should_skip "4xx-flood" || echo " nginx-4xx-flood Ban after 20 errors in 5 minutes" + echo "" + echo " Log: $ACCESS_LOGPATH" + echo " Ban time: ${BANTIME}s ($(( BANTIME / 3600 ))h)" + echo "" + echo " Useful commands:" + echo " fail2ban-client status nginx-noscript" + echo " fail2ban-client status nginx-pathscan" + echo " fail2ban-client status nginx-4xx-flood" + echo " fail2ban-client set unbanip " + echo " fail2ban-regex $ACCESS_LOGPATH /etc/fail2ban/filter.d/nginx-pathscan.conf" + echo "" +} + +main "$@" diff --git a/add-fail2ban-scraper-detect.sh b/add-fail2ban-scraper-detect.sh new file mode 100755 index 0000000..0c5d99e --- /dev/null +++ 
b/add-fail2ban-scraper-detect.sh @@ -0,0 +1,504 @@ +#!/bin/bash +################################################################################ +# Script Name: add-fail2ban-scraper-detect.sh +# Version: 1.1 +# Description: Adds a Fail2ban jail to detect and ban headless Chrome scrapers +# that pass JavaScript challenges but exhibit bot behavior — +# rapid 499 responses (connection abandoned mid-download), +# high-frequency 404s (probing non-existent URLs), and +# HeadlessChrome user agent strings (no real user). Complements +# add-fail2ban-image-scraper.sh which catches no-referer image +# grabs. This filter catches the next tier: bots running real +# browsers (Puppeteer/Playwright) that execute JS, accept cookies, +# and send proper referers but still behave differently from humans. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./add-fail2ban-scraper-detect.sh +# sudo ./add-fail2ban-scraper-detect.sh --logpath /var/log/nginx/access.log +# sudo ./add-fail2ban-scraper-detect.sh --maxretry 5 +# sudo ./add-fail2ban-scraper-detect.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.1" +readonly SCRIPT_NAME="${0##*/}" + +LOGPATH="auto" +BANTIME="86400" +MAXRETRY="3" +FINDTIME="300" +IGNOREIP="" +DRY_RUN=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e 
"${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null; then + log_error "Fail2ban is not installed" + log_error "Install it first: https://mylinux.work/guides/fail2ban-setup/" + exit 1 + fi + + if ! systemctl is-active --quiet fail2ban; then + log_error "Fail2ban is not running" + exit 1 + fi + + log_info "Fail2ban is installed and running" +} + +detect_logpath() { + if [[ "$LOGPATH" != "auto" ]]; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ ${#matches[@]} -eq 0 || ! -f "${matches[0]}" ]]; then + log_error "Log file not found: $LOGPATH" + exit 1 + fi + log_info "Using specified log path: $LOGPATH" + return + fi + + log_step "Auto-detecting web server access log..." + + # HestiaCP — apache domains + local hestia_apache=( /var/log/apache2/domains/*.log ) + if [[ -f "${hestia_apache[0]:-}" ]]; then + LOGPATH="/var/log/apache2/domains/*.log" + log_info "Detected HestiaCP apache: $LOGPATH" + return + fi + + # HestiaCP — nginx domains + local hestia_nginx=( /var/log/nginx/domains/*.log ) + if [[ -f "${hestia_nginx[0]:-}" ]]; then + LOGPATH="/var/log/nginx/domains/*.log" + log_info "Detected HestiaCP nginx: $LOGPATH" + return + fi + + # Nginx (standard) + if [[ -f /var/log/nginx/access.log ]]; then + LOGPATH="/var/log/nginx/access.log" + log_info "Detected nginx: $LOGPATH" + return + fi + + # Apache (Debian/Ubuntu) + if [[ -f /var/log/apache2/access.log ]]; then + LOGPATH="/var/log/apache2/access.log" + log_info "Detected apache2: $LOGPATH" + return + fi + + # Apache (RHEL/Rocky) + if [[ -f /var/log/httpd/access_log ]]; then + LOGPATH="/var/log/httpd/access_log" + log_info "Detected httpd: $LOGPATH" + return + fi + + log_error "Could not auto-detect access log. Use --logpath to specify." 
+ exit 1 +} + +# ============================================================================ +# REMOVE +# ============================================================================ + +do_remove() { + local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf" + local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf" + + log_step "Removing scraper-detect jail..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would remove $filter_file" + log_info "[DRY RUN] Would remove $jail_file" + log_info "[DRY RUN] Would reload fail2ban" + return + fi + + if [[ -f "$jail_file" ]]; then + rm -f "$jail_file" + log_info "Removed: $jail_file" + else + log_warn "Jail config not found: $jail_file" + fi + + if [[ -f "$filter_file" ]]; then + rm -f "$filter_file" + log_info "Removed: $filter_file" + else + log_warn "Filter not found: $filter_file" + fi + + fail2ban-client reload + sleep 2 + log_info "Fail2ban reloaded — scraper-detect jail removed" + exit 0 +} + +# ============================================================================ +# INSTALL FILTER +# ============================================================================ + +install_filter() { + local filter_file="/etc/fail2ban/filter.d/scraper-detect.conf" + + log_step "Installing filter: $filter_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $filter_file" + echo "" + generate_filter + echo "" + return + fi + + if [[ -f "$filter_file" ]]; then + log_warn "Filter already exists — backing up to ${filter_file}.bak" + cp "$filter_file" "${filter_file}.bak" + fi + + generate_filter > "$filter_file" + log_info "Filter installed: $filter_file" +} + +generate_filter() { + cat <<'EOF' +# Fail2ban filter to detect headless Chrome scrapers +# https://mylinux.work +# +# Catches three patterns that indicate automated scraping: +# +# 1. HTTP 499 — nginx-specific status meaning "client closed connection +# before the server responded." Scrapers fire requests then drop them +# once they've grabbed the HTML. 
Real users rarely trigger this. +# +# 2. HTTP 404 — not found. A single 404 is normal (mistyped URL). Many +# 404s in a short window indicate URL probing or stale scraper runs. +# +# 3. HeadlessChrome — Puppeteer/Playwright bots that don't bother hiding +# the headless user agent. No legitimate browser sends this string. +# Matched on any status code — headless Chrome is never a real user. +# +# Combined with maxretry in the jail, this catches bots that generate +# multiple errors quickly while ignoring the occasional human mistake. +# HeadlessChrome matches are instant (maxretry 1 would suffice) but +# the jail threshold still applies — a few hits trigger the ban. + +[Definition] + +# Match 499 (client dropped), 404 (not found), and HeadlessChrome UA +# Works with combined, common, and enriched (GeoIP) log formats +failregex = ^ \S+ \S+ \[.*\] "\S+ \S+ \S+" 499 + ^ \S+ \S+ \[.*\] "\S+ \S+ \S+" 404 + ^ .* ".*HeadlessChrome.*" + +# Whitelist legitimate bots and monitoring tools +ignoreregex = ^ .* ".*(?:Googlebot|bingbot|Baiduspider|YandexBot|DuckDuckBot|Slurp|facebookexternalhit|Twitterbot|LinkedInBot|Applebot|Blackbox-Exporter|UptimeRobot|Pingdom|CensysInspect|Feedfetcher|Mediapartners-Google).*" + +# Author: Phil Connor — https://mylinux.work +EOF +} + +# ============================================================================ +# INSTALL JAIL +# ============================================================================ + +install_jail() { + local jail_file="/etc/fail2ban/jail.d/scraper-detect.conf" + + log_step "Installing jail: $jail_file" + + if $DRY_RUN; then + log_info "[DRY RUN] Would create $jail_file" + echo "" + generate_jail + echo "" + return + fi + + if [[ -f "$jail_file" ]]; then + log_warn "Jail config already exists — backing up to ${jail_file}.bak" + cp "$jail_file" "${jail_file}.bak" + fi + + generate_jail > "$jail_file" + log_info "Jail config installed: $jail_file" +} + +generate_jail() { + cat </dev/null; then + log_warn "Config test not 
available — reloading directly" + fi + + fail2ban-client reload + sleep 2 + + if systemctl is-active --quiet fail2ban; then + log_info "Fail2ban reloaded successfully" + else + log_error "Fail2ban failed to restart — check: journalctl -u fail2ban" + exit 1 + fi +} + +verify_jail() { + log_step "Verifying scraper-detect jail..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would verify jail status" + return + fi + + echo "" + if fail2ban-client status scraper-detect 2>/dev/null; then + echo "" + log_info "Scraper-detect jail is active and monitoring $LOGPATH" + else + log_error "Jail 'scraper-detect' is not running — check: fail2ban-client status" + log_error "Debug with: fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf" + exit 1 + fi +} + +test_against_logs() { + if $DRY_RUN; then + # shellcheck disable=SC2086 + local matches=( $LOGPATH ) + if [[ -f "${matches[0]}" ]]; then + log_step "Testing filter against existing logs..." + local tmp_filter + tmp_filter=$(mktemp /tmp/scraper-detect-filter.XXXXXX) + generate_filter > "$tmp_filter" + echo "" + fail2ban-regex "${matches[0]}" "$tmp_filter" 2>&1 | tail -8 + rm -f "$tmp_filter" + echo "" + fi + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban Scraper Detect v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + check_fail2ban + + if $REMOVE; then + do_remove + fi + + detect_logpath + test_against_logs + install_filter + install_jail + reload_fail2ban + verify_jail + + echo "" + echo "============================================" + echo " Setup Complete" + echo "============================================" + echo "" + echo " Jail: scraper-detect" + echo " Log: $LOGPATH" + echo " Ban time: 
${BANTIME}s ($(( BANTIME / 3600 ))h)" + echo " Max retry: $MAXRETRY (499/404 errors before ban)" + echo " Find time: ${FINDTIME}s ($(( FINDTIME / 60 ))m window)" + if [[ -n "$IGNOREIP" ]]; then + echo " Ignore: $IGNOREIP" + fi + echo "" + echo " Useful commands:" + echo " fail2ban-client status scraper-detect" + echo " fail2ban-client set scraper-detect unbanip " + echo " fail2ban-regex $LOGPATH /etc/fail2ban/filter.d/scraper-detect.conf" + echo "" +} + +main "$@" diff --git a/add-nginx-block-head.sh b/add-nginx-block-head.sh new file mode 100755 index 0000000..074e9e0 --- /dev/null +++ b/add-nginx-block-head.sh @@ -0,0 +1,257 @@ +#!/usr/bin/env bash + +######################################################################################### +#### add-nginx-block-head.sh — Block HEAD requests in Nginx (HestiaCP compatible) #### +#### Adds a 444 drop rule for HEAD method crawlers/scrapers. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### sudo ./add-nginx-block-head.sh #### +#### sudo ./add-nginx-block-head.sh --dry-run #### +#### sudo ./add-nginx-block-head.sh --remove #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +DRY_RUN=false +REMOVE=false +SNIPPET_NAME="nginx.conf_block_head" + +# ── Colors ──────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + RESET='\033[0m' +else + RED="" GREEN="" YELLOW="" BOLD="" RESET="" +fi + +log() { echo -e "${GREEN}[OK]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +info() { echo -e "${BOLD}[INFO]${RESET} $*"; } + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat </dev/null; then + err "Nginx not found" + exit 1 +fi + +if ! command -v v-list-users &>/dev/null; then + err "HestiaCP not found (v-list-users missing)" + exit 1 +fi + +# ── Snippet content ────────────────────────────────────────────────── +SNIPPET_CONTENT='# Block HEAD request crawlers/scrapers +# Added by add-nginx-block-head.sh +# Returns 444 (drop connection) — no response sent to bot +if ($request_method = HEAD) { + return 444; +}' + +# ── Find all HestiaCP domains ──────────────────────────────────────── +get_all_domain_dirs() { + local users + users=$(v-list-users plain 2>/dev/null | cut -f1) + + for user in $users; do + local user_conf="/home/${user}/conf/web" + [[ -d "$user_conf" ]] || continue + + # Find domain directories by looking for nginx.conf files + for nginx_conf in "${user_conf}"/*/nginx.conf; do + [[ -f "$nginx_conf" ]] || continue + dirname "$nginx_conf" + done + done +} + +# ── Remove mode ─────────────────────────────────────────────────────── +if [[ "$REMOVE" == "true" ]]; then + removed=0 + + while IFS= read -r domain_dir; do + snippet="${domain_dir}/${SNIPPET_NAME}" + ssl_snippet="${domain_dir}/nginx.ssl.conf_block_head" + + for f in "$snippet" 
"$ssl_snippet"; do + if [[ -f "$f" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + info "Would remove: ${f}" + else + rm -f "$f" + log "Removed ${f}" + fi + ((removed++)) || true + fi + done + done < <(get_all_domain_dirs) + + if [[ $removed -eq 0 ]]; then + info "No block-head snippets found — nothing to remove" + exit 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + info "Would test and reload Nginx" + exit 0 + fi + + if nginx -t 2>/dev/null; then + systemctl reload nginx + log "Nginx reloaded — HEAD requests are now allowed" + else + err "Nginx config test failed after removal — check your config" + exit 1 + fi + exit 0 +fi + +# ── Install mode ────────────────────────────────────────────────────── +domain_dirs=() +while IFS= read -r dir; do + domain_dirs+=("$dir") +done < <(get_all_domain_dirs) + +if [[ ${#domain_dirs[@]} -eq 0 ]]; then + err "No HestiaCP web domains found" + exit 1 +fi + +info "Found ${#domain_dirs[@]} domain config(s)" +echo "" + +created=0 +skipped=0 +created_files=() + +for domain_dir in "${domain_dirs[@]}"; do + domain_name=$(basename "$domain_dir") + + # Add snippet for both HTTP and HTTPS server blocks + for conf_type in "" ".ssl"; do + if [[ -n "$conf_type" ]]; then + snippet="${domain_dir}/nginx${conf_type}.conf_block_head" + else + snippet="${domain_dir}/${SNIPPET_NAME}" + fi + + # Check the main config exists for this type + main_conf="${domain_dir}/nginx${conf_type}.conf" + [[ -f "$main_conf" ]] || continue + + if [[ -f "$snippet" ]]; then + info "Already exists: ${snippet}" + ((skipped++)) || true + continue + fi + + if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: ${snippet}" + ((created++)) || true + else + echo "$SNIPPET_CONTENT" > "$snippet" + created_files+=("$snippet") + log "Created ${snippet}" + ((created++)) || true + fi + done +done + +echo "" + +if [[ $created -eq 0 && $skipped -gt 0 ]]; then + info "HEAD requests are already blocked on all domains" + exit 0 +fi + +if [[ "$DRY_RUN" == "true" ]]; then + echo "" 
+ echo "$SNIPPET_CONTENT" + echo "" + info "Would create ${created} snippet(s) (${skipped} already exist)" + info "Would test Nginx config and reload" + exit 0 +fi + +# Test Nginx config +info "Testing Nginx configuration..." +if nginx -t 2>&1; then + echo "" + log "Config test passed" + systemctl reload nginx + log "Nginx reloaded — HEAD requests blocked on ${#domain_dirs[@]} domain(s) (444 drop)" +else + echo "" + err "Config test FAILED — rolling back all changes" + for f in "${created_files[@]}"; do + rm -f "$f" + err "Removed ${f}" + done + err "Nginx was NOT reloaded — your site is unaffected" + exit 1 +fi + +echo "" +info "Verify with: curl -I https://your-site.com" +info "Expected: curl returns empty reply (connection dropped)" +info "To undo: $(basename "$0") --remove" diff --git a/add-nginx-bot-block.sh b/add-nginx-bot-block.sh new file mode 100755 index 0000000..7fc8ab0 --- /dev/null +++ b/add-nginx-bot-block.sh @@ -0,0 +1,443 @@ +#!/bin/bash +################################################################################ +# Script Name: add-nginx-bot-block.sh +# Version: 1.3 +# Description: Configure AI scraper and bot blocking on standard nginx servers. +# Creates an nginx map in conf.d and injects bot-blocking rules +# into server blocks found in sites-enabled and conf.d. +# For HestiaCP / VestaCP / myVesta servers, use hestia-bot-block.sh instead. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - nginx installed and running +# - Root access +# +# Usage: +# sudo ./add-nginx-bot-block.sh +# sudo ./add-nginx-bot-block.sh --dry-run +# sudo ./add-nginx-bot-block.sh --conf /etc/nginx/sites-enabled/mysite.conf +# sudo ./add-nginx-bot-block.sh --status-code 403 +# sudo ./add-nginx-bot-block.sh --remove +# +# Changelog: +# 1.3 — 2026-05-11: Added fragment-in-referer blocking (real browsers strip +# URI fragments from the Referer header). 
Added request method blocking +# (only GET/HEAD allowed — static sites never need POST/PUT/DELETE). +# Added ospa-radar (lead-gen/business intelligence crawler) to blocklist. +# 1.2 — 2026-05-08: Added Exabot (defunct Dassault/Exalead crawler, now +# spoofed) and Sogou (Tencent Chinese search crawler) to blocklist. +# 1.1 — 2026-05-04: Removed Claude-Web and OAI-SearchBot from blocklist. +# These are user-facing fetcher bots, not training crawlers. Blocking +# them prevents your content from being cited in AI answers. +# +################################################################################ + +set -euo pipefail + +# --- Configuration --- +CONF_DIR="/etc/nginx/conf.d" +SITES_DIR="/etc/nginx/sites-enabled" +MAP_FILE="${CONF_DIR}/bot-block.conf" +DRY_RUN=false +REMOVE=false +SINGLE_CONF="" +STATUS_CODE="444" +TIMESTAMP=$(date +%s) +MARKER_START="# bot-block-managed-start" +MARKER_END="# bot-block-managed-end" + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +usage() { + cat <&2 + exit 1 +fi + +if ! command -v nginx &>/dev/null; then + echo -e "${RED}Error: nginx not found${NC}" >&2 + exit 1 +fi + +# ===================================================== +# Collect config files to process +# ===================================================== +collect_configs() { + local configs=() + + if [[ -n "$SINGLE_CONF" ]]; then + if [[ ! 
-f "$SINGLE_CONF" ]]; then + echo -e "${RED}Error: Config file not found: ${SINGLE_CONF}${NC}" >&2 + exit 1 + fi + configs+=("$SINGLE_CONF") + else + # Scan sites-enabled + if [[ -d "$SITES_DIR" ]]; then + for f in "$SITES_DIR"/*; do + [[ -f "$f" ]] && configs+=("$f") + done + fi + # Scan conf.d (skip bot-block.conf itself) + if [[ -d "$CONF_DIR" ]]; then + for f in "$CONF_DIR"/*.conf; do + [[ -f "$f" ]] || continue + [[ "$f" == "$MAP_FILE" ]] && continue + configs+=("$f") + done + fi + fi + + # Filter to only files containing a server block + local server_configs=() + for f in "${configs[@]}"; do + if grep -qP '^\s*server\s*\{' "$f" 2>/dev/null; then + server_configs+=("$f") + fi + done + + printf '%s\n' "${server_configs[@]}" +} + +# ===================================================== +# REMOVE MODE +# ===================================================== +if [[ "$REMOVE" == "true" ]]; then + step "Removing bot-block configuration" + + # Remove map file + if [[ -f "$MAP_FILE" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would remove: ${MAP_FILE}" + else + rm -f "$MAP_FILE" + info "Removed: ${MAP_FILE}" + fi + else + warn "Map file not found: ${MAP_FILE} (already removed?)" + fi + + # Strip managed blocks from config files + step "Scanning for injected bot-block rules" + + mapfile -t configs < <(collect_configs) + + if [[ ${#configs[@]} -eq 0 ]]; then + warn "No server block config files found" + else + for conf in "${configs[@]}"; do + if grep -q "$MARKER_START" "$conf" 2>/dev/null; then + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would clean: ${conf}" + else + cp "$conf" "${conf}.bak.${TIMESTAMP}" + sed -i "/${MARKER_START}/,/${MARKER_END}/d" "$conf" + info "Cleaned: ${conf}" + fi + fi + done + fi + + # Validate and reload + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" + echo " Would run: systemctl reload nginx" + else + step "Testing nginx configuration" + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e 
"${RED}[ERROR] nginx config test failed — restore .bak files${NC}" >&2 + exit 1 + fi + + step "Reloading nginx" + systemctl reload nginx + info "nginx reloaded" + fi + + echo "" + echo -e "${BOLD}Bot-block rules removed.${NC}" + exit 0 +fi + +# ===================================================== +# INSTALL MODE +# ===================================================== + +# Step 1: Create nginx map +# ===================================================== +step "Creating bot-block map at ${MAP_FILE}" + +MAP_CONTENT='# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners +# Generated by add-nginx-bot-block.sh — https://mylinux.work + +map $http_user_agent $is_bad_bot { + default 0; + + # AI scrapers + ~*ABEvalBot 1; + ~*GPTBot 1; + ~*ClaudeBot 1; + ~*anthropic-ai 1; + ~*CCBot 1; + ~*Bytespider 1; + ~*TikTokSpider 1; + ~*cohere-ai 1; + ~*PerplexityBot 1; + ~*Diffbot 1; + ~*MistralBot 1; + ~*YandexGPTBot 1; + ~*meta-externalagent 1; + ~*Meta-ExternalFetcher 1; + ~*meta-webindexer 1; + ~*PetalBot 1; + ~*Amazonbot 1; + ~*Amzn-SearchBot 1; + ~*AI2Bot 1; + ~*Timpibot 1; + ~*img2dataset 1; + ~*YouBot 1; + ~*HanaleiBot 1; + + # Defunct crawlers (spoofed user agents) + ~*Exabot 1; + ~*Sogou 1; + + # SEO scrapers + ~*MJ12bot 1; + ~*SemrushBot 1; + ~*AhrefsBot 1; + ~*DotBot 1; + ~*DataForSeoBot 1; + ~*SERanking 1; + + # Vulnerability scanners + ~*Nikto 1; + ~*sqlmap 1; + ~*Nmap 1; + ~*masscan 1; + ~*ZmEu 1; + ~*Morpheus 1; + + # Lead-gen / business intelligence bots + ~*ospa-radar 1; + ~*HubSeedsBot 1; + + # AI scrapers / research bots + ~*Aranet-SearchBot 1; + ~*AzureAI-SearchBot 1; + ~*MINERVA-DeepResearch 1; + ~*NagetBot 1; + ~*LAIABot 1; + ~*pi-coding-agent 1; + + # Probe / monitoring bots + ~*CMS-Checker 1; + ~*NexoFaviconBot 1; + ~*AwarioBot 1; + ~*AwarioSmartBot 1; + ~*CopyousBot 1; + ~*SurdotlyBot 1; + ~*trendictionbot 1; + ~*wpbot 1; + ~*WebFetchTool 1; + ~*YisouSpider 1; + + # Scraping frameworks + ~*Scrapy 1; + ~*python-requests 1; + 
~*Go-http-client 1; + ~*Java/ 1; + ~*libwww-perl 1; + ~*trafilatura 1; + ~*node-fetch 1; + + # Outdated browsers (Chrome < 115 — almost certainly bots) + ~*Chrome/([1-9][0-9]?|10[0-9]|11[0-4])\. 1; + + # Empty / missing user agent + "" 1; + "-" 1; +}' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${MAP_FILE}" +else + if [[ -f "$MAP_FILE" ]]; then + cp "$MAP_FILE" "${MAP_FILE}.bak.${TIMESTAMP}" + warn "Existing map backed up" + fi + echo "$MAP_CONTENT" > "$MAP_FILE" + info "Map created: ${MAP_FILE}" +fi + +# ===================================================== +# Step 2: Inject bot-blocking rule into server blocks +# ===================================================== +step "Scanning for server blocks to inject bot-blocking rule" + +mapfile -t configs < <(collect_configs) + +if [[ ${#configs[@]} -eq 0 ]]; then + warn "No server block config files found in ${SITES_DIR} or ${CONF_DIR}" +else + MODIFIED=0 + for conf in "${configs[@]}"; do + # Skip if already managed + if grep -q "$MARKER_START" "$conf" 2>/dev/null; then + warn "Already managed: ${conf} — skipping" + continue + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would inject into: ${conf}" + MODIFIED=$((MODIFIED + 1)) + continue + fi + + # Backup + cp "$conf" "${conf}.bak.${TIMESTAMP}" + + # Inject the if block before the first location directive inside each server block + BOT_BLOCK="\\ + ${MARKER_START}\\ + if (\$is_bad_bot) {\\ + return ${STATUS_CODE};\\ + }\\ + # Block broken srcset scrapers\\ + if (\$request_uri ~* \"%20[0-9]+w,https?://\") {\\ + return ${STATUS_CODE};\\ + }\\ + # Block spoofed referers with fragment identifiers (real browsers strip these)\\ + if (\$http_referer ~* \"#\") {\\ + return ${STATUS_CODE};\\ + }\\ + # Block non-GET/HEAD methods (static sites never need POST/PUT/DELETE)\\ + if (\$request_method !~ ^(GET|HEAD)\$ ) {\\ + return ${STATUS_CODE};\\ + }\\ + ${MARKER_END}" + + awk -v block="$BOT_BLOCK" ' + /^\s*server\s*\{/ { in_server = 1; injected = 0 } + 
in_server && !injected && /^\s*location\s/ { + print block + print "" + injected = 1 + } + /^\s*\}/ && in_server { + # Track brace depth to know when server block ends + } + { print } + ' "$conf" > "${conf}.tmp" + mv "${conf}.tmp" "$conf" + + info "Injected into: ${conf}" + MODIFIED=$((MODIFIED + 1)) + done + + if [[ $MODIFIED -eq 0 ]]; then + warn "No files modified (all already managed)" + fi +fi + +# ===================================================== +# Step 3: Validate nginx config +# ===================================================== +step "Testing nginx configuration" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" +else + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2 + echo " Restore backups (.bak.${TIMESTAMP}) from ${SITES_DIR} and ${CONF_DIR}" >&2 + exit 1 + fi +fi + +# ===================================================== +# Step 4: Reload nginx +# ===================================================== +step "Reloading nginx" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" +else + systemctl reload nginx + info "nginx reloaded" +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Map: ${MAP_FILE}" +echo " Status code: ${STATUS_CODE}" +if [[ -n "$SINGLE_CONF" ]]; then + echo " Config: ${SINGLE_CONF}" +else + echo " Scanned: ${SITES_DIR}/ and ${CONF_DIR}/*.conf" +fi +echo "" +echo " To remove: sudo $(basename "$0") --remove" +echo "" +echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 444 (connection dropped) or 000 (no response)" diff --git a/add-nginx-js-challenge.sh b/add-nginx-js-challenge.sh new file mode 100644 index 0000000..d00fd97 --- /dev/null +++ b/add-nginx-js-challenge.sh @@ -0,0 +1,582 @@ +#!/bin/bash 
+################################################################################ +# Script Name: add-nginx-js-challenge.sh +# Version: 3.1 +# Description: Adds a lightweight JavaScript cookie challenge to nginx. +# Bots that don't execute JavaScript are silently dropped. +# Legitimate search engine crawlers are whitelisted by user agent. +# Headless Chrome bots from suspect GeoIP regions with no external +# referrer are tarpitted (served at 50 bytes/sec). +# Works alongside bot-block.conf (run add-nginx-bot-block.sh first). +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - nginx installed and running +# - Root access +# +# Usage: +# sudo ./add-nginx-js-challenge.sh +# sudo ./add-nginx-js-challenge.sh --dry-run +# sudo ./add-nginx-js-challenge.sh --remove +# +# How it works: +# 1. Whitelisted bot UAs (Googlebot, Bingbot, etc.) bypass the check entirely +# 2. All other visitors must have a cookie with a randomized name and token +# 3. First-time visitors get a brief redirect to a challenge page that sets +# the cookie via JS and bounces them back — takes < 100ms +# 4. Bots that don't run JS never get the cookie and get 444'd +# 5. Cookie name and token are randomized per installation — re-running the +# script rotates them, immediately invalidating old pre-set cookies +# +# Changelog: +# 3.1 — 2026-05-21: Challenge endpoint rate limiting. Headless Chrome bot farms +# were passing the JS challenge on every request by spawning fresh browser +# instances without persistent cookies. Added limit_req_zone on the +# challenge endpoint: 3 requests allowed (burst), then 1/min sustained. +# Excess requests get 444. Added --challenge-burst and --challenge-rate. +# Fixed geoip2 variable name ($geoip2_country_code to match standard +# geoip2.conf). Conditional geoip2 block — only added if no existing +# mmdb is loaded elsewhere in nginx config. 
Challenge JS now treats +# same-domain referrers as "direct" for tarpit purposes. +# 3.0 — 2026-05-20: Referrer tracking through challenge redirect. Original +# HTTP Referer is passed as &ref= param in the 302 redirect. Challenge +# JS stores it in a _bc_ref cookie. Tarpit map: visitors from suspect +# GeoIP countries (CN by default) with no external referrer are served +# at 50 bytes/sec via limit_rate, draining headless Chrome resources. +# Requires ngx_http_geoip2_module for GeoIP-based tarpitting. +# Added --tarpit-countries option (default: CN). +# Added --tarpit-rate option (default: 50 bytes/sec). +# 2.0 — 2026-05-19: Randomized cookie name and token per installation. +# Cookie name is now a random 2-character suffix (e.g. _v7, _xq). +# Cookie value is now a 32-char hex token instead of static "verified". +# Values persist in /etc/nginx/js-challenge.env for future reference. +# Re-running rotates credentials and invalidates old bot bypass cookies. +# Added no-cache headers on challenge page to prevent stale HTML after +# rotation. Fixed challenge page Secure flag to be conditional on HTTPS. +# Fixed challenge location — removed incorrect 'internal' directive. 
+# 1.0 — 2026-05-11: Initial release +# +################################################################################ + +set -euo pipefail + +# --- Configuration --- +CONF_DIR="/etc/nginx/conf.d" +CHALLENGE_MAP="${CONF_DIR}/js-challenge.conf" +CHALLENGE_DIR="/var/www/js-challenge" +CHALLENGE_HTML="${CHALLENGE_DIR}/challenge.html" +STATE_FILE="/etc/nginx/js-challenge.env" +CHALLENGE_PATH="/_bc" +DRY_RUN=false +REMOVE=false +COOKIE_MAX_AGE=86400 # 24 hours +TARPIT_COUNTRIES="${TARPIT_COUNTRIES:-CN}" # GeoIP country codes to tarpit (space-separated) +TARPIT_RATE="${TARPIT_RATE:-50}" # bytes/sec for tarpitted responses +CHALLENGE_RATE="${CHALLENGE_RATE:-1}" # sustained challenge requests per minute per IP +CHALLENGE_BURST="${CHALLENGE_BURST:-3}" # initial burst of challenge requests allowed +TIMESTAMP=$(date +%s) + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +info() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +usage() { + cat <&2 + exit 1 +fi + +# ===================================================== +# Generate or load cookie credentials +# ===================================================== + +generate_credentials() { + COOKIE_NAME="_$(openssl rand -hex 1)" + COOKIE_VALUE="$(openssl rand -hex 16)" +} + +save_credentials() { + if [[ "$DRY_RUN" != "true" ]]; then + cat > "$STATE_FILE" <&1; then + systemctl reload nginx + info "nginx reloaded" + else + echo -e "${RED}[ERROR] nginx config test failed after removal${NC}" >&2 + exit 1 + fi + fi + + echo "" + echo -e "${BOLD}JS challenge removed.${NC}" + echo "" + echo " Note: You may also need to remove the js-challenge location blocks" + echo " from your server block configs (look for 'js-challenge-managed')." 
+ exit 0 +fi + +# ===================================================== +# Step 1: Create the challenge HTML page +# ===================================================== +step "Creating challenge page at ${CHALLENGE_HTML}" + +CHALLENGE_CONTENT=' + +Verifying + + + + + +' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_DIR}/" + echo " Would create: ${CHALLENGE_HTML}" +else + mkdir -p "$CHALLENGE_DIR" + echo "$CHALLENGE_CONTENT" > "$CHALLENGE_HTML" + info "Challenge page created: ${CHALLENGE_HTML}" +fi + +# Save credentials +save_credentials + +# ===================================================== +# Step 2: Create nginx map config +# ===================================================== +step "Creating JS challenge map at ${CHALLENGE_MAP}" + +# Build the cookie variable name for nginx (e.g. _v7 → $cookie__v7) +NGINX_COOKIE_VAR="\$cookie_${COOKIE_NAME}" + +# Check if a geoip2 block already loads an mmdb anywhere in nginx config. +# If so, $geoip2_country_code should already be defined — don't duplicate. +GEOIP2_BLOCK="" +if ! grep -r 'geoip2.*\.mmdb' /etc/nginx/ \ + --include='*.conf' --exclude='js-challenge.conf' --exclude='*.bak.*' \ + -q 2>/dev/null; then + GEOIP2_BLOCK=' +# ── GeoIP2: country lookup for tarpit decisions ────────────────────── +# Uses the City database (superset of Country). Adjust path if needed. 
+geoip2 /usr/share/GeoIP/GeoLite2-City.mmdb { + $geoip2_country_code country iso_code; +} +' + step "No existing geoip2 country_code config found — adding to map config" +fi + + # Collect server_name values from nginx configs to build same-site referer map + local REFERER_ENTRIES="" + local _jsc_domain_seen=() + for _conf in /etc/nginx/conf.d/*.conf /etc/nginx/sites-enabled/*; do + [[ -f "$_conf" ]] || continue + while read -r _sn; do + for _d in $_sn; do + [[ "$_d" == "server_name" || "$_d" == ";" || "$_d" == "_" || "$_d" =~ ^[0-9] ]] && continue + _d="${_d%;}" + [[ " ${_jsc_domain_seen[*]:-} " == *" $_d "* ]] && continue + _jsc_domain_seen+=("$_d") + local _d_escaped="${_d//./\\.}" + REFERER_ENTRIES+=" ~^1:https?://${_d_escaped} 1;\n" + done + done < <(grep -oP '^\s*server_name\s+\K[^;]+;?' "$_conf" 2>/dev/null) + done + + if [[ -z "$REFERER_ENTRIES" ]]; then + warn "No server_name values found — same-site image bypass will not work" + warn "Images behind the challenge may cause redirect loops for browsers" + fi + +MAP_CONTENT='# JS cookie challenge — allowed bots and cookie check +# Generated by add-nginx-js-challenge.sh — https://mylinux.work +# Cookie: '"${COOKIE_NAME}"' Token: '"${COOKIE_VALUE:0:8}"'... +# Generated: '"$(date -Iseconds)"' + +# ── Rate limit: challenge endpoint ─────────────────────────────────── +# Real users hit the challenge once and keep the cookie. Headless bot farms +# spawn fresh browsers per request, hitting the challenge every time. +# Rate: '"${CHALLENGE_RATE}"'r/m with burst of '"${CHALLENGE_BURST}"' — excess gets 444. 
+limit_req_zone $binary_remote_addr zone=jschallenge:10m rate='"${CHALLENGE_RATE}"'r/m; + +# Bots that legitimately identify themselves and should bypass the JS check +map $http_user_agent $is_allowed_bot { + default 0; + + # Search engines + ~*Googlebot 1; + ~*bingbot 1; + ~*Slurp 1; + ~*DuckDuckBot 1; + ~*DuckAssistBot 1; + ~*Baiduspider 1; + ~*YandexBot 1; + ~*YandexFavicons 1; + ~*Applebot 1; + ~*Qwantbot 1; + ~*Qwantify 1; + ~*Bravebot 1; + ~*kagi-fetcher 1; + ~*Kagibot 1; + ~*Yahoo! 1; + ~*Yeti 1; + + # Social media / link previews + ~*facebookexternalhit 1; + ~*Facebot 1; + ~*Twitterbot 1; + ~*LinkedInBot 1; + ~*Slackbot 1; + ~*Slack-ImgProxy 1; + ~*Discordbot 1; + ~*TelegramBot 1; + ~*WhatsApp 1; + ~*redditbot 1; + ~*ArenaUnfurlBot 1; + + # Feed readers + ~*Feedly 1; + ~*Miniflux 1; + ~*FreshRSS 1; + ~*NewsBlur 1; + ~*Tiny\ Tiny\ RSS 1; + ~*Inoreader 1; + ~*NetNewsWire 1; + + # Monitoring / uptime + ~*UptimeRobot 1; + ~*Pingdom 1; + ~*StatusCake 1; + ~*Blackbox-Exporter 1; + + # AI answer bots (user-facing, not training crawlers) + ~*OAI-SearchBot 1; + ~*ChatGPT-User 1; + ~*Claude-Web 1; + ~*Claude-User 1; + ~*MistralAI-User 1; + + # Archive / research + ~*archive\.org_bot 1; + + # Apple Safari prefetch + ~*safarifetcherd 1; + + # Link checkers / validators + ~*W3C_Validator 1; + ~*W3C-checklink 1; + ~*LinkChecker 1; + ~*link-check 1; + + # Decentralized search + ~*yacybot 1; + + # Add your own allowed bots below +} + +# Validate the challenge cookie — exact token match +map '"${NGINX_COOKIE_VAR}"' $js_cookie_valid { + default 0; + "'"${COOKIE_VALUE}"'" 1; +} + +# Detect requests to the challenge page and download paths (prevent redirect loops) +map $uri $is_challenge_uri { + default 0; + "'"${CHALLENGE_PATH}"'" 1; + ~^/downloads/ 1; + ~*\.(css|js|woff2?)$ 1; + ~*favicon 1; + ~*apple-touch-icon 1; +} + +# Detect image sub-resource requests with same-site referer (browser loads) +# These bypass the challenge because: (a) images cannot execute JS challenges, 
+# and (b) the same-site referer proves the browser loaded a page from this domain. +# Direct image requests from scrapers (no referer or external referer) still get challenged. +map $uri $is_image_request { + default 0; + ~*\.(png|jpe?g|gif|svg|webp|ico|avif)$ 1; +} +map "$is_image_request:$http_referer" $is_samesite_image { + default 0; +'"${REFERER_ENTRIES}"'} + +# Combined check: need challenge if not allowed bot, no valid cookie, and not the challenge page +map "$is_allowed_bot:$js_cookie_valid:$is_challenge_uri:$is_samesite_image" $needs_js_challenge { + default 1; + "1:0:0:0" 0; + "1:0:0:1" 0; + "1:0:1:0" 0; + "1:0:1:1" 0; + "1:1:0:0" 0; + "1:1:0:1" 0; + "1:1:1:0" 0; + "1:1:1:1" 0; + "0:1:0:0" 0; + "0:1:0:1" 0; + "0:1:1:0" 0; + "0:1:1:1" 0; + "0:0:1:0" 0; + "0:0:1:1" 0; + "0:0:0:1" 0; +} +'"${GEOIP2_BLOCK}"' +# ── Tarpit: headless Chrome bots from suspect regions ───────────────── +# Visitors from tarpit countries with no external referrer (passed through +# the challenge redirect as the _bc_ref cookie) are served at a crawl. +# This drains headless Chrome resources (~200-500 MB RAM per instance) +# without giving the bot a clear "blocked" signal to adapt to. +# +# The _bc_ref cookie is set by the challenge page JS from the &ref= param. +# It contains the original HTTP Referer before the 302 redirect destroyed it. +# "direct" = no external referrer (typed URL or bot). Cookie expires in 120s. 
+ +# Check if visitor is from a tarpit country (requires geoip2 module) +map $geoip2_country_code $is_tarpit_country { + default 0; +'"$(for cc in $TARPIT_COUNTRIES; do echo " \"${cc}\" 1;"; done)"' +} + +# Tarpit only if: tarpit country + no external referrer + passed JS challenge +map "$is_tarpit_country:$cookie__bc_ref" $tarpit_client { + default 0; + "1:direct" 1; + "1:" 1; +} + +# Serve the challenge page +server { + listen 127.0.0.1:18444; + server_name _; + root /var/www/js-challenge; + + location / { + add_header Cache-Control "no-store, no-cache, must-revalidate" always; + add_header Pragma "no-cache" always; + try_files /challenge.html =404; + } +}' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_MAP}" +else + if [[ -f "$CHALLENGE_MAP" ]]; then + cp "$CHALLENGE_MAP" "${CHALLENGE_MAP}.bak.${TIMESTAMP}" + warn "Existing config backed up" + fi + echo "$MAP_CONTENT" > "$CHALLENGE_MAP" + info "Map config created: ${CHALLENGE_MAP}" +fi + +# ===================================================== +# Step 3: Show injection instructions +# ===================================================== +step "Server block configuration" + +echo "" +echo " Add the following inside each server block (after your bot-block rules):" +echo "" +echo -e "${CYAN} # js-challenge-managed-start" +echo " location = ${CHALLENGE_PATH} {" +echo " limit_req zone=jschallenge burst=${CHALLENGE_BURST} nodelay;" +echo " limit_req_status 444;" +echo " proxy_pass http://127.0.0.1:18444/;" +echo " }" +echo "" +echo " # JS cookie challenge — redirect non-JS visitors" +echo " if (\$needs_js_challenge) {" +echo " return 302 ${CHALLENGE_PATH}?r=\$request_uri&ref=\$http_referer;" +echo " }" +echo "" +echo " # Tarpit headless Chrome bots from suspect GeoIP regions" +echo " if (\$tarpit_client) {" +echo " set \$limit_rate ${TARPIT_RATE};" +echo " }" +echo -e " # js-challenge-managed-end${NC}" +echo "" +echo " Or re-run add-nginx-bot-block.sh to have it injected automatically" 
+echo " (if supported in your version)." +echo "" + +# ===================================================== +# Step 4: Validate nginx config +# ===================================================== +step "Testing nginx configuration" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" +else + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2 + echo " Restore backup: ${CHALLENGE_MAP}.bak.${TIMESTAMP}" >&2 + exit 1 + fi +fi + +# ===================================================== +# Step 5: Reload nginx +# ===================================================== +step "Reloading nginx" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" +else + systemctl reload nginx + info "nginx reloaded" +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Challenge map: ${CHALLENGE_MAP}" +echo " Challenge page: ${CHALLENGE_HTML}" +echo " State file: ${STATE_FILE}" +echo " Cookie name: ${COOKIE_NAME}" +echo " Cookie token: ${COOKIE_VALUE:0:8}... 
(32 hex chars)" +echo " Cookie TTL: ${COOKIE_MAX_AGE}s" +echo " Tarpit countries: ${TARPIT_COUNTRIES}" +echo " Tarpit rate: ${TARPIT_RATE} bytes/sec" +echo " Challenge rate: ${CHALLENGE_RATE}r/m (burst: ${CHALLENGE_BURST})" +echo "" +echo " To rotate credentials (invalidate bot-cached cookies):" +echo " sudo $(basename "$0")" +echo "" +echo " To remove: sudo $(basename "$0") --remove" +echo "" +echo " Test (bot without cookie gets redirected to challenge):" +echo " curl -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 302" +echo "" +echo " Test (browser completes challenge — 302 → 200):" +echo " Open https://yourdomain.com in a browser" +echo " Expected: brief redirect then page loads normally" +echo "" +echo " Test (old static bypass no longer works):" +echo " curl -b '_bc=verified' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 302 (not 200 — old cookie is invalid)" +echo "" +echo " Test (rate limit on challenge endpoint):" +echo " for i in 1 2 3 4 5; do curl -o /dev/null -s -w \"\$i: %{http_code}\n\" https://yourdomain.com${CHALLENGE_PATH}; done" +echo " Expected: first 3 return 200, then 444 (rate limited)" +echo "" +echo " Test (allowed bot bypasses challenge):" +echo " curl -A 'Googlebot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 200" diff --git a/add-prometheus-tls.sh b/add-prometheus-tls.sh new file mode 100644 index 0000000..1ed1d95 --- /dev/null +++ b/add-prometheus-tls.sh @@ -0,0 +1,1234 @@ +#!/bin/bash +################################################################################ +# Script Name: add-prometheus-tls.sh +# Version: 1.01 +# Description: Add TLS encryption to Prometheus and node_exporter +# Auto-detects whether this is the Prometheus server (generates +# a CA + server cert) or a target node (configures node_exporter +# with a provided or generated cert signed by the Prometheus CA). 
#
# Author:       Phil Connor
# Contact:      contact@mylinux.work
# Website:      https://mylinux.work
# License:      MIT
#
# Role Detection:
#   - "server" — Prometheus is installed: generates CA, server cert,
#                configures prometheus.yml for TLS scraping, and optionally
#                configures the local node_exporter too.
#   - "node"   — Only node_exporter found: configures node_exporter with
#                TLS using a cert signed by the Prometheus CA (CA cert must
#                be provided or copied from the server).
#
# Usage:
#   sudo ./add-prometheus-tls.sh                          # Auto-detect role
#   sudo ./add-prometheus-tls.sh --role server            # Force server mode
#   sudo ./add-prometheus-tls.sh --role node              # Force node mode
#   sudo ./add-prometheus-tls.sh --role node --ca-cert /path/to/ca.crt --ca-key /path/to/ca.key
#   sudo ./add-prometheus-tls.sh --deploy host1,host2     # Push TLS to remote nodes
#   sudo ./add-prometheus-tls.sh --deploy-file hosts.txt  # Push TLS to nodes from file
#   sudo ./add-prometheus-tls.sh --status                 # Show TLS status
#   sudo ./add-prometheus-tls.sh --remove                 # Remove TLS config
#
################################################################################

set -euo pipefail

# Printed by the setup and status banners. Must match the "Version:" field
# of the file header, which reads 1.01.
SCRIPT_VERSION="1.01"

# Paths
PROM_DIR="/etc/prometheus"
PROM_TLS_DIR="${PROM_DIR}/tls"                     # CA plus Prometheus server cert/key
NODE_EXPORTER_DIR="/etc/node_exporter"
NODE_EXPORTER_TLS_DIR="${NODE_EXPORTER_DIR}/tls"   # node_exporter cert/key and CA copy
BACKUP_DIR="/var/backups/prometheus-tls"           # timestamped copies made by backup_file

# CA defaults
CA_DAYS=3650     # CA certificate validity, in days
CERT_DAYS=825    # leaf certificate validity, in days
KEY_BITS=4096    # RSA key size passed to openssl genrsa

# Runtime
ROLE=""                # "server" or "node"
CA_CERT=""             # path to existing CA cert (node mode)
CA_KEY=""              # path to existing CA key (node mode)
PROM_USER="prometheus"
NODE_USER="node_exporter"
HOSTNAME_FQDN=""
DEPLOY_TARGETS=""      # comma-separated hosts for --deploy
DEPLOY_FILE=""         # file containing hosts for --deploy-file
SSH_USER="root"        # SSH user for deploy
SSH_KEY=""             # optional SSH key path
DRY_RUN=false
DEBUG=${DEBUG:-}       # any non-empty value enables debug_echo output

# ============================================================================
# HELPER FUNCTIONS
#
============================================================================ + +show_usage() { + cat <&2 + exit 1 +} + +warn() { + echo "WARNING: $1" >&2 +} + +info() { + echo "[INFO] $1" +} + +debug_echo() { + if [[ -n "$DEBUG" ]]; then + echo "[DEBUG] $*" >&2 + fi +} + +backup_file() { + local file="$1" + if [[ ! -f "$file" ]]; then + return 0 + fi + local timestamp + timestamp=$(date +%F_%H%M%S) + local backup_path="${BACKUP_DIR}/${timestamp}" + mkdir -p "$backup_path" + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would backup $file -> ${backup_path}/$(basename "$file")" + else + cp -a "$file" "${backup_path}/" + info "Backed up $file -> ${backup_path}/$(basename "$file")" + fi +} + +# ============================================================================ +# ROLE DETECTION +# ============================================================================ + +detect_role() { + local has_prometheus=false + local has_node_exporter=false + + if systemctl list-unit-files prometheus.service &>/dev/null && \ + systemctl cat prometheus.service &>/dev/null; then + has_prometheus=true + fi + + if systemctl list-unit-files node_exporter.service &>/dev/null && \ + systemctl cat node_exporter.service &>/dev/null; then + has_node_exporter=true + fi + + if [[ "$has_prometheus" == true ]]; then + ROLE="server" + info "Detected role: server (Prometheus installed)" + elif [[ "$has_node_exporter" == true ]]; then + ROLE="node" + info "Detected role: node (node_exporter only)" + else + die "Neither Prometheus nor node_exporter detected. Install them first." 
+ fi +} + +detect_hostname() { + if [[ -n "$HOSTNAME_FQDN" ]]; then + return 0 + fi + + HOSTNAME_FQDN=$(hostname -f 2>/dev/null || hostname) + info "Using hostname: ${HOSTNAME_FQDN}" +} + +# ============================================================================ +# CERTIFICATE GENERATION +# ============================================================================ + +generate_ca() { + local ca_dir="${PROM_TLS_DIR}" + local ca_cert="${ca_dir}/ca.crt" + local ca_key="${ca_dir}/ca.key" + + if [[ -f "$ca_cert" && -f "$ca_key" ]]; then + echo "" + echo " CA certificate already exists at ${ca_cert}" + read -r -p " Regenerate CA? (will invalidate all existing certs) [y/N]: " confirm + if [[ ! "$confirm" =~ ^[Yy]$ ]]; then + info "Keeping existing CA" + CA_CERT="$ca_cert" + CA_KEY="$ca_key" + return 0 + fi + backup_file "$ca_cert" + backup_file "$ca_key" + fi + + info "Generating Certificate Authority..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would generate CA cert at ${ca_cert}" + CA_CERT="$ca_cert" + CA_KEY="$ca_key" + return 0 + fi + + mkdir -p "$ca_dir" + + openssl genrsa -out "$ca_key" "$KEY_BITS" 2>/dev/null + openssl req -x509 -new -nodes \ + -key "$ca_key" \ + -sha256 \ + -days "$CA_DAYS" \ + -out "$ca_cert" \ + -subj "/CN=Prometheus CA/O=Prometheus/OU=Monitoring" \ + 2>/dev/null + + chmod 644 "$ca_cert" + chmod 600 "$ca_key" + + CA_CERT="$ca_cert" + CA_KEY="$ca_key" + + info "CA certificate created: ${ca_cert}" + info "CA key created: ${ca_key} (keep this safe!)" +} + +generate_cert() { + local name="$1" # e.g., "prometheus" or "node_exporter" + local cert_dir="$2" # where to put the cert + local owner="$3" # file owner user + + local cert_file="${cert_dir}/${name}.crt" + local key_file="${cert_dir}/${name}.key" + + if [[ -f "$cert_file" && -f "$key_file" ]]; then + echo "" + echo " Certificate for ${name} already exists." + read -r -p " Regenerate? [y/N]: " confirm + if [[ ! 
"$confirm" =~ ^[Yy]$ ]]; then + info "Keeping existing ${name} certificate" + return 0 + fi + backup_file "$cert_file" + backup_file "$key_file" + fi + + info "Generating certificate for ${name}..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would generate cert at ${cert_file}" + return 0 + fi + + mkdir -p "$cert_dir" + + # Create CSR config with SANs + local csr_conf + csr_conf=$(mktemp) + cat > "$csr_conf" </dev/null | awk '{print $1}' || echo "127.0.0.1") +CSREOF + + # Create ext config for signing + local ext_conf + ext_conf=$(mktemp) + cat > "$ext_conf" </dev/null | awk '{print $1}' || echo "127.0.0.1") +EXTEOF + + # Generate key + openssl genrsa -out "$key_file" "$KEY_BITS" 2>/dev/null + + # Generate CSR + local csr_file + csr_file=$(mktemp) + openssl req -new \ + -key "$key_file" \ + -out "$csr_file" \ + -config "$csr_conf" \ + 2>/dev/null + + # Sign with CA + openssl x509 -req \ + -in "$csr_file" \ + -CA "$CA_CERT" \ + -CAkey "$CA_KEY" \ + -CAcreateserial \ + -out "$cert_file" \ + -days "$CERT_DAYS" \ + -sha256 \ + -extfile "$ext_conf" \ + 2>/dev/null + + # Set ownership + chmod 644 "$cert_file" + chmod 600 "$key_file" + if id "$owner" &>/dev/null; then + chown "${owner}:${owner}" "$cert_file" "$key_file" + fi + + # Cleanup temp files + rm -f "$csr_conf" "$ext_conf" "$csr_file" + + info "Certificate created: ${cert_file}" + info "Key created: ${key_file}" +} + +# ============================================================================ +# PROMETHEUS SERVER CONFIGURATION +# ============================================================================ + +configure_prometheus_tls() { + local web_config="${PROM_DIR}/web.yml" + + if [[ -f "$web_config" ]] && grep -q "tls_server_config" "$web_config" 2>/dev/null; then + echo "" + echo " Prometheus web.yml already has TLS config." + read -r -p " Overwrite? [y/N]: " confirm + if [[ ! 
"$confirm" =~ ^[Yy]$ ]]; then + info "Keeping existing Prometheus TLS config" + return 0 + fi + backup_file "$web_config" + fi + + info "Configuring Prometheus TLS (web.yml)..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would create ${web_config}" + return 0 + fi + + cat > "$web_config" </dev/null; then + chown "${PROM_USER}:${PROM_USER}" "$web_config" + fi + chmod 644 "$web_config" + + # Ensure --web.config.file is in the systemd unit + update_prometheus_service + + info "Prometheus web.yml created: ${web_config}" +} + +update_prometheus_service() { + local service_file + service_file=$(systemctl show -p FragmentPath prometheus.service 2>/dev/null | cut -d= -f2) + + if [[ -z "$service_file" || ! -f "$service_file" ]]; then + warn "Could not find prometheus.service unit file" + warn "Manually add '--web.config.file=${PROM_DIR}/web.yml' to Prometheus startup" + return 0 + fi + + if grep -q "web.config.file" "$service_file" 2>/dev/null; then + debug_echo "Prometheus service already has --web.config.file flag" + return 0 + fi + + info "Updating Prometheus systemd service to use web.yml..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would add --web.config.file to ${service_file}" + return 0 + fi + + backup_file "$service_file" + + # Add --web.config.file to the ExecStart line + if grep -qE '^ExecStart=.*prometheus' "$service_file"; then + sed -i '/^ExecStart=.*prometheus/ s|$| \\\n --web.config.file='"${PROM_DIR}"'/web.yml|' "$service_file" + systemctl daemon-reload + info "Added --web.config.file to Prometheus service" + else + warn "Could not auto-patch service file. Add manually:" + warn " --web.config.file=${PROM_DIR}/web.yml" + fi +} + +update_prometheus_scrape_configs() { + local prom_config="${PROM_DIR}/prometheus.yml" + + if [[ ! -f "$prom_config" ]]; then + warn "prometheus.yml not found at ${prom_config} — skipping scrape config update" + return 0 + fi + + info "Updating prometheus.yml scrape configs for TLS..." 
+ if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would update scrape configs in ${prom_config}" + return 0 + fi + + backup_file "$prom_config" + + # Check if tls_config already exists for node targets + if grep -q "tls_config" "$prom_config" 2>/dev/null; then + info "prometheus.yml already contains tls_config entries" + echo " Review ${prom_config} to ensure all scrape jobs use TLS." + return 0 + fi + + # Create a TLS snippet file that can be included + local tls_snippet="${PROM_DIR}/tls_scrape.yml" + cat > "$tls_snippet" </dev/null; then + chown "${PROM_USER}:${PROM_USER}" "$tls_snippet" + fi + + # Auto-patch: update scheme and add tls_config to existing jobs + # Update scheme: http -> https for node jobs + local tmpfile + tmpfile=$(mktemp) + local in_job=false + local job_patched=false + + while IFS= read -r line; do + echo "$line" >> "$tmpfile" + + # Detect job_name lines + if [[ "$line" =~ ^[[:space:]]*-[[:space:]]*job_name: ]]; then + in_job=true + job_patched=false + fi + + # If we're in a job block and find scheme: http (not https), patch it + if [[ "$in_job" == true && "$job_patched" == false ]]; then + if [[ "$line" =~ ^[[:space:]]*scheme:[[:space:]]*http[[:space:]]*$ ]]; then + # Replace this line with https + tls_config + sed -i '$ s|scheme: http|scheme: https|' "$tmpfile" + # Determine indentation + local indent + indent=$(echo "$line" | sed 's/\(^[[:space:]]*\).*/\1/') + echo "${indent}tls_config:" >> "$tmpfile" + echo "${indent} ca_file: ${PROM_TLS_DIR}/ca.crt" >> "$tmpfile" + job_patched=true + fi + fi + done < "$prom_config" + + # If no scheme: lines were found, add a note + if ! grep -q "scheme: https" "$tmpfile" 2>/dev/null; then + info "No 'scheme: http' lines found to auto-patch." + info "Reference TLS snippet created at: ${tls_snippet}" + info "Manually update your scrape jobs to use scheme: https with tls_config." 
+ rm -f "$tmpfile" + return 0 + fi + + cp "$tmpfile" "$prom_config" + rm -f "$tmpfile" + + if id "$PROM_USER" &>/dev/null; then + chown "${PROM_USER}:${PROM_USER}" "$prom_config" + fi + + info "Updated scrape configs in ${prom_config}" + info "TLS reference snippet saved to: ${tls_snippet}" +} + +# ============================================================================ +# NODE EXPORTER CONFIGURATION +# ============================================================================ + +configure_node_exporter_tls() { + local tls_dir="$NODE_EXPORTER_TLS_DIR" + local web_config="${NODE_EXPORTER_DIR}/web.yml" + + mkdir -p "$tls_dir" "$NODE_EXPORTER_DIR" + + # Generate cert for this node + generate_cert "node_exporter" "$tls_dir" "$NODE_USER" + + # Copy CA cert to node_exporter dir for reference + if [[ "$DRY_RUN" != true && -f "$CA_CERT" ]]; then + cp -a "$CA_CERT" "${tls_dir}/ca.crt" + if id "$NODE_USER" &>/dev/null; then + chown "${NODE_USER}:${NODE_USER}" "${tls_dir}/ca.crt" + fi + fi + + if [[ -f "$web_config" ]] && grep -q "tls_server_config" "$web_config" 2>/dev/null; then + echo "" + echo " node_exporter web.yml already has TLS config." + read -r -p " Overwrite? [y/N]: " confirm + if [[ ! "$confirm" =~ ^[Yy]$ ]]; then + info "Keeping existing node_exporter TLS config" + update_node_exporter_service + return 0 + fi + backup_file "$web_config" + fi + + info "Configuring node_exporter TLS (web.yml)..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would create ${web_config}" + return 0 + fi + + cat > "$web_config" </dev/null; then + chown "${NODE_USER}:${NODE_USER}" "$web_config" + fi + chmod 644 "$web_config" + + update_node_exporter_service + + info "node_exporter web.yml created: ${web_config}" +} + +update_node_exporter_service() { + local service_file + service_file=$(systemctl show -p FragmentPath node_exporter.service 2>/dev/null | cut -d= -f2) + + if [[ -z "$service_file" || ! 
-f "$service_file" ]]; then + warn "Could not find node_exporter.service unit file" + warn "Manually add '--web.config.file=${NODE_EXPORTER_DIR}/web.yml' to node_exporter startup" + return 0 + fi + + if grep -q "web.config.file" "$service_file" 2>/dev/null; then + debug_echo "node_exporter service already has --web.config.file flag" + return 0 + fi + + info "Updating node_exporter systemd service to use web.yml..." + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would add --web.config.file to ${service_file}" + return 0 + fi + + backup_file "$service_file" + + if grep -qE '^ExecStart=.*node_exporter' "$service_file"; then + sed -i '/^ExecStart=.*node_exporter/ s|$| \\\n --web.config.file='"${NODE_EXPORTER_DIR}"'/web.yml|' "$service_file" + systemctl daemon-reload + info "Added --web.config.file to node_exporter service" + else + warn "Could not auto-patch service file. Add manually:" + warn " --web.config.file=${NODE_EXPORTER_DIR}/web.yml" + fi +} + +# ============================================================================ +# STATUS +# ============================================================================ + +show_status() { + echo "" + echo "==========================================" + echo "Prometheus TLS Status (v${SCRIPT_VERSION})" + echo "==========================================" + echo "" + + # Check Prometheus + echo "--- Prometheus Server ---" + if systemctl cat prometheus.service &>/dev/null 2>&1; then + local prom_status="installed" + systemctl is-active --quiet prometheus 2>/dev/null && prom_status="running" + + echo " Service: ${prom_status}" + + if [[ -f "${PROM_DIR}/web.yml" ]] && grep -q "tls_server_config" "${PROM_DIR}/web.yml" 2>/dev/null; then + echo " TLS: ✓ enabled (web.yml)" + else + echo " TLS: ✗ not configured" + fi + + if [[ -f "${PROM_TLS_DIR}/ca.crt" ]]; then + local ca_expiry + ca_expiry=$(openssl x509 -enddate -noout -in "${PROM_TLS_DIR}/ca.crt" 2>/dev/null | cut -d= -f2) + echo " CA cert: ✓ present (expires: 
${ca_expiry})" + else + echo " CA cert: ✗ not found" + fi + + if [[ -f "${PROM_TLS_DIR}/prometheus.crt" ]]; then + local prom_expiry + prom_expiry=$(openssl x509 -enddate -noout -in "${PROM_TLS_DIR}/prometheus.crt" 2>/dev/null | cut -d= -f2) + echo " Server cert: ✓ present (expires: ${prom_expiry})" + else + echo " Server cert: ✗ not found" + fi + + # Verify Prometheus is actually serving HTTPS + if curl -sk --max-time 3 "https://localhost:9090/-/healthy" &>/dev/null; then + echo " HTTPS: ✓ responding on https://localhost:9090" + elif curl -s --max-time 3 "http://localhost:9090/-/healthy" &>/dev/null; then + echo " HTTPS: ✗ still serving plain HTTP" + else + echo " HTTPS: ? could not connect" + fi + else + echo " Not installed" + fi + + echo "" + + # Check node_exporter + echo "--- node_exporter ---" + if systemctl cat node_exporter.service &>/dev/null 2>&1; then + local node_status="installed" + systemctl is-active --quiet node_exporter 2>/dev/null && node_status="running" + + echo " Service: ${node_status}" + + if [[ -f "${NODE_EXPORTER_DIR}/web.yml" ]] && grep -q "tls_server_config" "${NODE_EXPORTER_DIR}/web.yml" 2>/dev/null; then + echo " TLS: ✓ enabled (web.yml)" + else + echo " TLS: ✗ not configured" + fi + + if [[ -f "${NODE_EXPORTER_TLS_DIR}/node_exporter.crt" ]]; then + local node_expiry + node_expiry=$(openssl x509 -enddate -noout -in "${NODE_EXPORTER_TLS_DIR}/node_exporter.crt" 2>/dev/null | cut -d= -f2) + echo " Cert: ✓ present (expires: ${node_expiry})" + else + echo " Cert: ✗ not found" + fi + + # Verify node_exporter is actually serving HTTPS + if curl -sk --max-time 3 "https://localhost:9100/metrics" &>/dev/null; then + echo " HTTPS: ✓ responding on https://localhost:9100" + elif curl -s --max-time 3 "http://localhost:9100/metrics" &>/dev/null; then + echo " HTTPS: ✗ still serving plain HTTP" + else + echo " HTTPS: ? 
could not connect"
        fi
    else
        echo " Not installed"
    fi

    echo ""
}

# ============================================================================
# REMOVE
# ============================================================================

#######################################
# Delete the --web.config.file flag from a systemd unit file, in place.
#
# Handles both layouts the flag can have:
#   - on its own continuation line (what this script's installer writes):
#     the line is dropped and, when it was the last line of the command,
#     the continuation backslash on the preceding line is removed too, so
#     the following directive is not folded into ExecStart;
#   - inline with other arguments: only the flag and its value are removed,
#     the rest of the line is kept.
#
# Arguments: $1 - path to the unit file
# Returns:   0 on success, non-zero if the file could not be rewritten
#######################################
_strip_web_config_flag() {
    local unit_file="$1"
    local tmp
    local rc=0

    tmp=$(mktemp) || return 1

    # Output goes back through cat (not mv) so the unit file keeps its
    # inode, ownership and permissions.
    awk '
        # Print the held line. Its trailing continuation backslash is
        # stripped when the line after it has been removed and nothing
        # continues the command any more.
        function release(keep_backslash) {
            if (!holding) return
            if (!keep_backslash) sub(/[ \t]*\\[ \t]*$/, "", held)
            print held
            holding = 0
        }
        # Flag on its own line: drop the whole line.
        /^[ \t]*--web\.config\.file[= \t][^ \t\\]*[ \t]*\\?[ \t]*$/ {
            release($0 ~ /\\[ \t]*$/)
            next
        }
        {
            # Flag inline with other arguments: drop only the flag.
            gsub(/[ \t]+--web\.config\.file[= \t][^ \t\\]*/, "")
            release(1)
            held = $0
            holding = 1
        }
        END { release(1) }
    ' "$unit_file" > "$tmp" && cat "$tmp" > "$unit_file" || rc=$?

    rm -f "$tmp"
    return "$rc"
}

#######################################
# Remove the TLS web config of one service and restart it.
#
# Does nothing when the web config file does not exist. Every file is
# passed to backup_file before it is touched. In dry-run mode nothing is
# changed and the messages say so.
#
# Globals:   DRY_RUN (read)
# Arguments: $1 - display name used in messages (e.g. "Prometheus")
#            $2 - systemd unit name without ".service"
#            $3 - path to the service's web.yml
# Returns:   0
#######################################
_remove_service_tls() {
    local label="$1"
    local unit="$2"
    local web_config="$3"
    local service_file

    [[ -f "$web_config" ]] || return 0

    backup_file "$web_config"
    if [[ "$DRY_RUN" == true ]]; then
        info "[DRY RUN] Would remove ${label} web.yml"
    else
        rm -f "$web_config"
        info "Removed ${label} web.yml"
    fi

    # "|| service_file=..." keeps a failing systemctl from aborting the
    # script under pipefail; -f2- keeps paths that contain "=" intact.
    service_file=$(systemctl show -p FragmentPath "${unit}.service" 2>/dev/null | cut -d= -f2-) \
        || service_file=""

    if [[ -n "$service_file" && -f "$service_file" ]] && grep -q "web.config.file" "$service_file"; then
        backup_file "$service_file"
        if [[ "$DRY_RUN" == true ]]; then
            info "[DRY RUN] Would remove --web.config.file from ${unit}.service"
        elif _strip_web_config_flag "$service_file"; then
            systemctl daemon-reload
            info "Removed --web.config.file from ${unit}.service"
        else
            warn "Could not edit ${service_file} — remove --web.config.file manually"
        fi
    fi

    if [[ "$DRY_RUN" != true ]]; then
        systemctl restart "$unit" 2>/dev/null || warn "Could not restart ${label}"
    fi
}

#######################################
# Remove the TLS configuration of Prometheus and node_exporter.
#
# Deletes each service's web.yml, removes the --web.config.file flag from
# its systemd unit and restarts it. Certificates and keys are left in
# place. prometheus.yml is not modified; a warning is printed when it
# still contains tls_config entries.
#
# Globals:   PROM_DIR, NODE_EXPORTER_DIR, PROM_TLS_DIR,
#            NODE_EXPORTER_TLS_DIR, BACKUP_DIR, DRY_RUN (all read)
# Arguments: none
#######################################
do_remove() {
    echo ""
    echo "=========================================="
    echo "Remove Prometheus TLS Configuration"
    echo "=========================================="
    echo ""

    # Remove Prometheus TLS
    _remove_service_tls "Prometheus" "prometheus" "${PROM_DIR}/web.yml"

    # Remove node_exporter TLS
    _remove_service_tls "node_exporter" "node_exporter" "${NODE_EXPORTER_DIR}/web.yml"

    # Setup rewrites scrape jobs to "scheme: https" plus tls_config; those
    # jobs fail once the exporters serve plain HTTP again.
    if grep -qs "tls_config" "${PROM_DIR}/prometheus.yml"; then
        warn "${PROM_DIR}/prometheus.yml still contains tls_config entries."
        warn "Revert those scrape jobs to plain HTTP or restore the file from ${BACKUP_DIR}."
    fi

    echo ""
    if [[ "$DRY_RUN" == true ]]; then
        info "[DRY RUN] No changes were made."
    else
        info "TLS configuration removed. Backups saved to: ${BACKUP_DIR}"
    fi
    info "Note: Certificate files in ${PROM_TLS_DIR} and ${NODE_EXPORTER_TLS_DIR} were NOT deleted."
    info "Remove them manually if no longer needed."
}

# ============================================================================
# SERVER SETUP
# ============================================================================

setup_server() {
    echo ""
    echo "=========================================="
    echo "Prometheus Server TLS Setup"
    echo "Version: ${SCRIPT_VERSION}"
    echo "=========================================="
    echo ""

    detect_hostname
    mkdir -p "$PROM_TLS_DIR" "$BACKUP_DIR"

    # Step 1: Generate CA
    echo ""
    echo "=== Step 1: Certificate Authority ==="
    generate_ca

    # Step 2: Generate Prometheus server cert
    echo ""
    echo "=== Step 2: Prometheus Server Certificate ==="
    generate_cert "prometheus" "$PROM_TLS_DIR" "$PROM_USER"

    # Step 3: Configure Prometheus web.yml
    echo ""
    echo "=== Step 3: Prometheus TLS Configuration ==="
    configure_prometheus_tls

    # Step 4: Update scrape configs
    echo ""
    echo "=== Step 4: Scrape Configuration ==="
    update_prometheus_scrape_configs

    # Step 5: Optionally configure local node_exporter
    if systemctl cat node_exporter.service &>/dev/null 2>&1; then
        echo ""
        echo "=== Step 5: Local node_exporter ==="
        echo " node_exporter detected on this server."
        read -r -p " Configure TLS for local node_exporter too? [Y/n]: " configure_node
        if [[ !
#######################################
# Configure TLS for node_exporter on this host.
# Resolves the CA certificate/key (from --ca-cert/--ca-key, an earlier copy
# in NODE_EXPORTER_TLS_DIR, a local Prometheus install, or an interactive
# prompt), installs them into NODE_EXPORTER_TLS_DIR, generates the node
# certificate and restarts node_exporter.
# Globals:   NODE_EXPORTER_TLS_DIR, PROM_TLS_DIR, BACKUP_DIR, DRY_RUN,
#            SCRIPT_VERSION, HOSTNAME_FQDN (read); CA_CERT, CA_KEY (read/written)
# Outputs:   progress and a prometheus.yml snippet on stdout
# Returns:   exits via die() when the CA certificate or key cannot be found
#######################################
setup_node() {
  echo ""
  echo "=========================================="
  echo "Node Exporter TLS Setup"
  echo "Version: ${SCRIPT_VERSION}"
  echo "=========================================="
  echo ""

  detect_hostname
  mkdir -p "$NODE_EXPORTER_TLS_DIR" "$BACKUP_DIR"

  local node_ca_cert="${NODE_EXPORTER_TLS_DIR}/ca.crt"
  local node_ca_key="${NODE_EXPORTER_TLS_DIR}/ca.key"

  # Locate the CA pair when it was not supplied on the command line.
  if [[ -z "$CA_CERT" || -z "$CA_KEY" ]]; then
    if [[ -f "$node_ca_cert" && -f "$node_ca_key" ]]; then
      # Left behind by a previous run, or copied over from the server.
      CA_CERT="$node_ca_cert"
      CA_KEY="$node_ca_key"
      info "Found existing CA files in ${NODE_EXPORTER_TLS_DIR}"
    elif [[ -f "${PROM_TLS_DIR}/ca.crt" && -f "${PROM_TLS_DIR}/ca.key" ]]; then
      CA_CERT="${PROM_TLS_DIR}/ca.crt"
      CA_KEY="${PROM_TLS_DIR}/ca.key"
      info "Found existing CA files in ${PROM_TLS_DIR}"
    else
      echo " No CA certificate found. You need the CA cert and key from your"
      echo " Prometheus server to sign this node's certificate."
      echo ""
      echo " Copy them from the Prometheus server:"
      echo " scp prometheus-server:${PROM_TLS_DIR}/ca.crt /tmp/"
      echo " scp prometheus-server:${PROM_TLS_DIR}/ca.key /tmp/"
      echo ""
      read -r -p " Path to CA certificate: " CA_CERT
      read -r -p " Path to CA key: " CA_KEY
    fi
  fi

  # One validation point for every source of the paths.
  if [[ ! -f "$CA_CERT" ]]; then
    die "CA certificate not found: ${CA_CERT}"
  fi
  if [[ ! -f "$CA_KEY" ]]; then
    die "CA key not found: ${CA_KEY}"
  fi

  # Install the CA files into the node_exporter TLS dir. Skip the copy when
  # source and destination are already the same file: cp refuses to copy a
  # file onto itself and returns a non-zero status.
  if [[ "$DRY_RUN" != true ]]; then
    if [[ ! "$CA_CERT" -ef "$node_ca_cert" ]]; then
      cp -a -- "$CA_CERT" "$node_ca_cert"
    fi
    if [[ ! "$CA_KEY" -ef "$node_ca_key" ]]; then
      cp -a -- "$CA_KEY" "$node_ca_key"
    fi
    chmod 644 "$node_ca_cert"
    chmod 600 "$node_ca_key"
  fi

  # Generate cert and configure
  echo ""
  echo "=== Generating node_exporter Certificate ==="
  configure_node_exporter_tls

  # Restart
  echo ""
  echo "=== Restarting node_exporter ==="
  if [[ "$DRY_RUN" != true ]]; then
    # Evaluated as a condition so a failed restart still reaches the warning
    # even when the script runs with errexit.
    if systemctl restart node_exporter && systemctl is-active --quiet node_exporter; then
      info "node_exporter restarted successfully"
    else
      warn "node_exporter failed to start — check: journalctl -u node_exporter"
    fi
  else
    info "[DRY RUN] Would restart node_exporter"
  fi

  # Summary
  echo ""
  echo "=========================================="
  echo "node_exporter TLS Setup Complete!"
  echo "=========================================="
  echo ""
  echo "Certificate: ${NODE_EXPORTER_TLS_DIR}/node_exporter.crt"
  echo "Key: ${NODE_EXPORTER_TLS_DIR}/node_exporter.key"
  echo "Backups: ${BACKUP_DIR}"
  echo ""
  echo "Add this node to your Prometheus server's prometheus.yml:"
  echo ""
  echo " - job_name: 'node'"
  echo " scheme: https"
  echo " tls_config:"
  echo " ca_file: ${PROM_TLS_DIR}/ca.crt"
  echo " static_configs:"
  echo " - targets: ['${HOSTNAME_FQDN}:9100']"
  echo ""
  echo "To verify: curl -s --cacert ${NODE_EXPORTER_TLS_DIR}/ca.crt https://localhost:9100/metrics | head"
  echo ""
}
local scp_opts="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" + if [[ -n "$SSH_KEY" ]]; then + scp_opts+=" -i ${SSH_KEY}" + fi + echo "scp ${scp_opts}" +} + +deploy_to_nodes() { + local hosts=() + + # Build host list from --deploy and/or --deploy-file + if [[ -n "$DEPLOY_TARGETS" ]]; then + IFS=',' read -ra target_hosts <<< "$DEPLOY_TARGETS" + hosts+=("${target_hosts[@]}") + fi + + if [[ -n "$DEPLOY_FILE" ]]; then + if [[ ! -f "$DEPLOY_FILE" ]]; then + die "Deploy file not found: ${DEPLOY_FILE}" + fi + while IFS= read -r line; do + # Skip blank lines and comments + line=$(echo "$line" | sed 's/#.*//' | xargs) + [[ -z "$line" ]] && continue + hosts+=("$line") + done < "$DEPLOY_FILE" + fi + + if [[ ${#hosts[@]} -eq 0 ]]; then + die "No target hosts specified" + fi + + # Verify CA exists (must run server setup first) + if [[ ! -f "${PROM_TLS_DIR}/ca.crt" || ! -f "${PROM_TLS_DIR}/ca.key" ]]; then + die "CA not found at ${PROM_TLS_DIR}/. Run server setup first: $0 --role server" + fi + + local script_path + script_path=$(readlink -f "$0") + local ssh_cmd scp_cmd + ssh_cmd=$(build_ssh_cmd) + scp_cmd=$(build_scp_cmd) + + echo "" + echo "==========================================" + echo "Deploy TLS to Remote Nodes" + echo "==========================================" + echo "" + echo " CA: ${PROM_TLS_DIR}/ca.crt" + echo " SSH user: ${SSH_USER}" + echo " Targets: ${hosts[*]}" + echo "" + + local succeeded=0 + local failed=0 + local failed_hosts=() + + for host in "${hosts[@]}"; do + echo "--- ${host} ---" + + if [[ "$DRY_RUN" == true ]]; then + info "[DRY RUN] Would deploy TLS to ${host}" + ((succeeded++)) || true + continue + fi + + # Test SSH connectivity + if ! 
$ssh_cmd "${SSH_USER}@${host}" "echo ok" &>/dev/null; then + warn "Cannot connect to ${host} — skipping" + ((failed++)) || true + failed_hosts+=("$host") + echo "" + continue + fi + + # Create temp dir on remote + local remote_tmp + remote_tmp=$($ssh_cmd "${SSH_USER}@${host}" "mktemp -d /tmp/prom-tls-XXXXXX") + + # Copy CA cert, CA key, and this script + $scp_cmd "${PROM_TLS_DIR}/ca.crt" "${PROM_TLS_DIR}/ca.key" "$script_path" \ + "${SSH_USER}@${host}:${remote_tmp}/" 2>/dev/null + + if [[ $? -ne 0 ]]; then + warn "Failed to copy files to ${host} — skipping" + ((failed++)) || true + failed_hosts+=("$host") + echo "" + continue + fi + + # Run the script in node mode on the remote host + info "Running node setup on ${host}..." + if $ssh_cmd "${SSH_USER}@${host}" \ + "chmod +x ${remote_tmp}/$(basename "$script_path") && \ + ${remote_tmp}/$(basename "$script_path") \ + --role node \ + --ca-cert ${remote_tmp}/ca.crt \ + --ca-key ${remote_tmp}/ca.key"; then + info "${host}: TLS configured successfully" + ((succeeded++)) || true + else + warn "${host}: Setup failed — check logs on that host" + ((failed++)) || true + failed_hosts+=("$host") + fi + + # Cleanup temp files on remote + $ssh_cmd "${SSH_USER}@${host}" "rm -rf ${remote_tmp}" 2>/dev/null + + echo "" + done + + # Summary + echo "==========================================" + echo "Deploy Complete" + echo "==========================================" + echo "" + echo " Succeeded: ${succeeded}" + echo " Failed: ${failed}" + + if [[ ${#failed_hosts[@]} -gt 0 ]]; then + echo " Failed hosts: ${failed_hosts[*]}" + fi + + # Print prometheus.yml snippet for all successful hosts + echo "" + echo "Add these targets to your prometheus.yml:" + echo "" + echo " - job_name: 'node'" + echo " scheme: https" + echo " tls_config:" + echo " ca_file: ${PROM_TLS_DIR}/ca.crt" + echo " static_configs:" + echo -n " - targets: [" + local first=true + for host in "${hosts[@]}"; do + # Skip failed hosts + local is_failed=false + for fh in 
#######################################
# Parse command-line options into the script's global settings.
# --status and --remove are recorded while parsing and executed only after
# every argument has been read, so modifiers such as --dry-run apply no
# matter where they appear on the command line. If both are given, the
# first one wins.
# Globals:   ROLE, CA_CERT, CA_KEY, HOSTNAME_FQDN, DEPLOY_TARGETS,
#            DEPLOY_FILE, SSH_USER, SSH_KEY, DRY_RUN (written)
# Arguments: "$@" - the script's command-line arguments
# Returns:   0 after parsing; exits 0 after --status, --remove or --help;
#            calls die() on an unknown option, a missing option value or an
#            invalid role
#######################################
parse_arguments() {
  local action=""

  while [[ $# -gt 0 ]]; do
    # Options that take a value must be followed by one; otherwise
    # `shift 2` fails without shifting and the loop would never end.
    case "$1" in
      --role | --ca-cert | --ca-key | --hostname | --deploy | --deploy-file | --ssh-user | --ssh-key)
        if [[ $# -lt 2 ]]; then
          die "Option $1 requires a value. Use --help for usage."
        fi
        ;;
    esac

    case "$1" in
      --role)
        ROLE="$2"
        if [[ "$ROLE" != "server" && "$ROLE" != "node" ]]; then
          die "Invalid role: ${ROLE}. Must be 'server' or 'node'"
        fi
        shift 2
        ;;
      --ca-cert)
        CA_CERT="$2"
        shift 2
        ;;
      --ca-key)
        CA_KEY="$2"
        shift 2
        ;;
      --hostname)
        HOSTNAME_FQDN="$2"
        shift 2
        ;;
      --deploy)
        DEPLOY_TARGETS="$2"
        shift 2
        ;;
      --deploy-file)
        DEPLOY_FILE="$2"
        shift 2
        ;;
      --ssh-user)
        SSH_USER="$2"
        shift 2
        ;;
      --ssh-key)
        SSH_KEY="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --status)
        action="${action:-status}"
        shift
        ;;
      --remove)
        action="${action:-remove}"
        shift
        ;;
      -h | --help)
        show_usage
        # Reached only if show_usage returns instead of exiting.
        exit 0
        ;;
      *)
        die "Unknown option: $1. Use --help for usage."
        ;;
    esac
  done

  case "$action" in
    status)
      show_status
      exit 0
      ;;
    remove)
      do_remove
      exit 0
      ;;
  esac
}
+ fi + + # Auto-detect role if not specified + if [[ -z "$ROLE" ]]; then + detect_role + fi + + case "$ROLE" in + server) setup_server ;; + node) setup_node ;; + esac +} + +main "$@" diff --git a/alb-health-reporter.sh b/alb-health-reporter.sh new file mode 100755 index 0000000..c63701f --- /dev/null +++ b/alb-health-reporter.sh @@ -0,0 +1,574 @@ +#!/usr/bin/env bash + +######################################################################################### +#### alb-health-reporter.sh — Check AWS ALB/NLB target group health and alert #### +#### Reports unhealthy targets, CloudWatch metrics, and sends SNS alerts #### +#### Requires: bash 4+, aws-cli v2, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./alb-health-reporter.sh --check #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +AWS_REGION="${AWS_REGION:-}" +ALB_NAME="${ALB_NAME:-}" +TARGET_GROUP="${TARGET_GROUP:-}" +SNS_TOPIC_ARN="${SNS_TOPIC_ARN:-}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +RUN_MODE="" +START_TIME="" +UNHEALTHY_COUNT=0 +HEALTHY_COUNT=0 +DRAINING_COUNT=0 +TOTAL_TARGETS=0 + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" 
#######################################
# Run the AWS CLI, appending --region when AWS_REGION is set.
# Globals:   AWS_REGION (read)
# Arguments: "$@" - aws sub-command and its options
# Outputs:   the aws command's stdout, untouched; the debug trace goes to
#            stderr
# Returns:   exit status of the aws command
#######################################
aws_cmd() {
  local -a args=("$@")
  if [[ -n "${AWS_REGION:-}" ]]; then
    args+=(--region "$AWS_REGION")
  fi
  # Callers capture this function's stdout as JSON/text, so the debug trace
  # must not share that stream.
  verbose "aws ${args[*]}" >&2
  aws "${args[@]}"
}
#######################################
# Print the health of every target in every target group of every load
# balancer, followed by a summary.
# The loops read from process substitutions rather than pipelines, so the
# counters are updated in the current shell and survive to the summary and
# the return code.
# Globals:   TOTAL_TARGETS, HEALTHY_COUNT, UNHEALTHY_COUNT, DRAINING_COUNT
#            (reset, then written); colour variables (read)
# Outputs:   report on stdout
# Returns:   2 if any target is unhealthy, 1 if any is draining, else 0
#######################################
do_check() {
  log "Checking target group health..."

  # Start from zero so a second call does not accumulate onto the first.
  TOTAL_TARGETS=0
  HEALTHY_COUNT=0
  UNHEALTHY_COUNT=0
  DRAINING_COUNT=0

  local lbs_json lb_count
  lbs_json=$(get_load_balancers)
  lb_count=$(jq 'length' <<<"$lbs_json")

  if [[ "$lb_count" -eq 0 ]]; then
    log "No load balancers found"
    return
  fi

  log "Found ${lb_count} load balancer(s)"

  local lb lb_arn lb_name lb_type lb_state tgs_json tg_count
  local tg tg_arn tg_name tg_proto tg_port health_json target_count
  local target tid tport tstate treason icon color

  while IFS= read -r lb; do
    lb_arn=$(jq -r '.ARN' <<<"$lb")
    lb_name=$(jq -r '.Name' <<<"$lb")
    lb_type=$(jq -r '.Type' <<<"$lb")
    lb_state=$(jq -r '.State' <<<"$lb")

    echo ""
    echo -e " ${BOLD}${lb_name}${RESET} (${lb_type}, ${lb_state})"

    tgs_json=$(get_target_groups "$lb_arn")
    tg_count=$(jq 'length' <<<"$tgs_json")

    if [[ "$tg_count" -eq 0 ]]; then
      echo " No target groups"
      continue
    fi

    while IFS= read -r tg; do
      tg_arn=$(jq -r '.ARN' <<<"$tg")
      tg_name=$(jq -r '.Name' <<<"$tg")
      tg_proto=$(jq -r '.Protocol' <<<"$tg")
      tg_port=$(jq -r '.Port' <<<"$tg")

      echo ""
      echo -e " ${BOLD}Target Group: ${tg_name}${RESET} (${tg_proto}:${tg_port})"

      health_json=$(aws_cmd elbv2 describe-target-health \
        --target-group-arn "$tg_arn" \
        --query 'TargetHealthDescriptions[*].{Id:Target.Id,Port:Target.Port,State:TargetHealth.State,Reason:TargetHealth.Reason,Desc:TargetHealth.Description}' \
        --output json 2>/dev/null)

      target_count=$(jq 'length' <<<"$health_json")
      TOTAL_TARGETS=$((TOTAL_TARGETS + target_count))

      if [[ "$target_count" -eq 0 ]]; then
        echo -e " ${YELLOW}No registered targets${RESET}"
        continue
      fi

      # Colour variables are expanded into the printf format on purpose so
      # printf interprets their escape sequences.
      printf " ${BOLD}%-22s %-8s %-12s %s${RESET}\n" "TARGET" "PORT" "STATE" "REASON"
      printf " %s\n" "$(printf '%.0s─' {1..60})"

      while IFS= read -r target; do
        tid=$(jq -r '.Id' <<<"$target")
        tport=$(jq -r '.Port' <<<"$target")
        tstate=$(jq -r '.State' <<<"$target")
        treason=$(jq -r '.Reason // "-"' <<<"$target")

        case "$tstate" in
          healthy)
            icon="${GREEN}✓${RESET}"
            color="$GREEN"
            HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
            ;;
          unhealthy)
            icon="${RED}✗${RESET}"
            color="$RED"
            UNHEALTHY_COUNT=$((UNHEALTHY_COUNT + 1))
            ;;
          draining)
            icon="${YELLOW}⊘${RESET}"
            color="$YELLOW"
            DRAINING_COUNT=$((DRAINING_COUNT + 1))
            ;;
          *)
            icon="${DIM}?${RESET}"
            color="$DIM"
            ;;
        esac

        printf " ${icon} %-20s %-8s ${color}%-12s${RESET} %s\n" "$tid" "$tport" "$tstate" "$treason"
      done < <(jq -c '.[]' <<<"$health_json")
    done < <(jq -c '.[]' <<<"$tgs_json")
  done < <(jq -c '.[]' <<<"$lbs_json")

  echo ""
  echo -e " ${BOLD}Summary${RESET}"
  echo " Total targets: ${TOTAL_TARGETS}"
  echo -e " Healthy: ${GREEN}${HEALTHY_COUNT}${RESET}"
  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    echo -e " Unhealthy: ${RED}${UNHEALTHY_COUNT}${RESET}"
  else
    echo " Unhealthy: 0"
  fi
  if [[ "$DRAINING_COUNT" -gt 0 ]]; then
    echo -e " Draining: ${YELLOW}${DRAINING_COUNT}${RESET}"
  else
    echo " Draining: 0"
  fi
  log "Completed in $(elapsed)"

  if [[ "$UNHEALTHY_COUNT" -gt 0 ]]; then
    return 2
  elif [[ "$DRAINING_COUNT" -gt 0 ]]; then
    return 1
  fi
  return 0
}
#######################################
# Fetch one CloudWatch statistic for a load balancer.
# Arguments: $1 namespace, $2 metric name, $3 statistic (Sum|Average),
#            $4 LoadBalancer dimension value, $5 start time, $6 end time
# Outputs:   the first datapoint's value as text, as printed by the AWS CLI
# Returns:   exit status of the AWS call
#######################################
_alb_metric_stat() {
  local namespace="$1" metric="$2" statistic="$3" lb_dim="$4" start="$5" end="$6"
  aws_cmd cloudwatch get-metric-statistics \
    --namespace "$namespace" \
    --metric-name "$metric" \
    --dimensions "Name=LoadBalancer,Value=${lb_dim}" \
    --start-time "$start" \
    --end-time "$end" \
    --period 3600 \
    --statistics "$statistic" \
    --query "Datapoints[0].${statistic}" \
    --output text 2>/dev/null
}

#######################################
# Print request count, target 5XX count and average response time over the
# last hour for every load balancer.
# Globals:   colour variables (read)
# Outputs:   report on stdout
#######################################
do_metrics() {
  log "Fetching CloudWatch metrics (last 1 hour)..."

  local lbs_json
  lbs_json=$(get_load_balancers)

  # Try GNU date syntax first, then BSD; fall back to an empty window.
  local now one_hour_ago
  now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  one_hour_ago=$(date -u -d "-1 hour" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || one_hour_ago=$(date -u -v-1H +%Y-%m-%dT%H:%M:%SZ 2>/dev/null) \
    || one_hour_ago="$now"

  local lb lb_arn lb_name lb_type lb_suffix namespace
  local req_count err_5xx resp_time resp_ms

  while IFS= read -r lb; do
    lb_arn=$(jq -r '.ARN' <<<"$lb")
    lb_name=$(jq -r '.Name' <<<"$lb")
    lb_type=$(jq -r '.Type' <<<"$lb")

    # The CloudWatch dimension is the ARN part after "loadbalancer/".
    lb_suffix=${lb_arn##*loadbalancer/}

    echo ""
    echo -e " ${BOLD}${lb_name}${RESET}"

    namespace="AWS/ApplicationELB"
    if [[ "$lb_type" == "network" ]]; then
      namespace="AWS/NetworkELB"
    fi

    # Request count
    req_count=$(_alb_metric_stat "$namespace" "RequestCount" Sum \
      "$lb_suffix" "$one_hour_ago" "$now") || req_count="N/A"
    if [[ "$req_count" == "None" ]]; then
      req_count="0"
    fi
    echo " Request count (1h): ${req_count}"

    # 5xx errors
    err_5xx=$(_alb_metric_stat "$namespace" "HTTPCode_Target_5XX_Count" Sum \
      "$lb_suffix" "$one_hour_ago" "$now") || err_5xx="N/A"
    if [[ "$err_5xx" == "None" ]]; then
      err_5xx="0"
    fi
    if [[ "$err_5xx" != "0" && "$err_5xx" != "N/A" ]]; then
      echo -e " 5XX errors (1h): ${RED}${err_5xx}${RESET}"
    else
      echo " 5XX errors (1h): ${err_5xx}"
    fi

    # Response time
    resp_time=$(_alb_metric_stat "$namespace" "TargetResponseTime" Average \
      "$lb_suffix" "$one_hour_ago" "$now") || resp_time="N/A"
    if [[ "$resp_time" == "None" ]]; then
      resp_time="N/A"
    fi
    if [[ "$resp_time" != "N/A" ]]; then
      # Pass the value as data (-v), never as part of the awk program text.
      resp_ms=$(awk -v secs="$resp_time" 'BEGIN { printf "%.1f", secs * 1000 }' 2>/dev/null \
        || echo "$resp_time")
      echo " Avg response time: ${resp_ms}ms"
    else
      echo " Avg response time: N/A"
    fi
  done < <(jq -c '.[]' <<<"$lbs_json")

  echo ""
  log "Metrics collected in $(elapsed)"
}
+ + if aws_cmd sns publish \ + --topic-arn "$SNS_TOPIC_ARN" \ + --subject "${subject:0:100}" \ + --message "$message" \ + --output text &>/dev/null; then + echo -e " ${GREEN}✓${RESET} SNS alert sent to ${SNS_TOPIC_ARN}" + else + warn "Failed to send SNS alert" + fi + elif [[ "$check_exit" -eq 2 && -z "$SNS_TOPIC_ARN" ]]; then + warn "Unhealthy targets found but no --sns-topic specified" + fi + + exit "$check_exit" +} + +# ══════════════════════════════════════════════════════════════════════ +# PROMETHEUS OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_prometheus() { + # Run check silently to collect counts + do_check > /dev/null 2>&1 || true + + local ts + ts=$(date +%s) + cat </dev/null || echo 'default')}" + echo "Mode: ${RUN_MODE}" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + check_deps + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + print_prometheus + return + fi + + case "$RUN_MODE" in + check) do_check ;; + list) do_list ;; + metrics) do_metrics ;; + alert) do_alert ;; + esac +} + +main "$@" diff --git a/alertmanager-exporter.sh b/alertmanager-exporter.sh new file mode 100644 index 0000000..b3b49d1 --- /dev/null +++ b/alertmanager-exporter.sh @@ -0,0 +1,683 @@ +#!/bin/bash +################################################################################ +# Script Name: alertmanager-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Alertmanager operational overview. +# Queries the Alertmanager API for active alerts, silences, +# cluster health, and config status. Complements the built-in +# /metrics endpoint with higher-level operational metrics. 
+# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - curl +# - jq +# - Alertmanager running and accessible +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# ./alertmanager-exporter.sh +# +# # HTTP server mode +# ./alertmanager-exporter.sh --http -p 9094 +# +# # Textfile collector mode +# ./alertmanager-exporter.sh --textfile +# +# Metrics Exported: +# - alertmanager_overview_up - Exporter status (1=up, 0=down) +# - alertmanager_overview_info - Alertmanager version info +# - alertmanager_overview_alerts_active_total - Total active alerts +# - alertmanager_overview_alerts_by_state - Alerts by state +# - alertmanager_overview_alerts_by_severity - Alerts by severity +# - alertmanager_overview_alerts_by_receiver - Alerts by receiver +# - alertmanager_overview_alert_groups_total - Alert group count +# - alertmanager_overview_silences_active - Active silences +# - alertmanager_overview_silences_pending - Pending silences +# - alertmanager_overview_silences_expired - Expired silences +# - alertmanager_overview_silence_coverage_ratio - Silence coverage +# - alertmanager_overview_cluster_peers - Peer count +# - alertmanager_overview_cluster_peer_healthy - Per-peer health +# - alertmanager_overview_config_hash - Config hash for drift detection +# - alertmanager_overview_uptime_seconds - Uptime +# - alertmanager_overview_last_config_reload_timestamp - Last reload +# - alertmanager_overview_exporter_duration_seconds - Script duration +# - alertmanager_overview_exporter_last_run_timestamp - Last run time +# +# Configuration: +# Default HTTP port: 9094 +# Textfile directory: /var/lib/node_exporter +# Alertmanager URL: http://localhost:9093 +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# 
# Get alert counts by state
# Populates global variables: ALERTS_JSON, ALERTS_TOTAL, ALERTS_ACTIVE,
#   ALERTS_SUPPRESSED, ALERTS_UNPROCESSED
# Returns: 0 on success; 1 when the API response is empty or is not a JSON
#   array (globals are then left at zero / "[]")
collect_alerts() {
  # Safe defaults first, so every failure path leaves numeric values behind.
  ALERTS_TOTAL=0
  ALERTS_ACTIVE=0
  ALERTS_SUPPRESSED=0
  ALERTS_UNPROCESSED=0
  ALERTS_JSON="[]"

  local payload counts
  payload=$(am_api "/api/v2/alerts")
  if [ -z "$payload" ]; then
    return 1
  fi

  # One jq pass validates the shape and yields "total active suppressed unprocessed".
  counts=$(jq -r '
    def in_state(s): map(select(.status.state == s)) | length;
    if type == "array"
    then "\(length) \(in_state("active")) \(in_state("suppressed")) \(in_state("unprocessed"))"
    else error("expected a JSON array")
    end
  ' <<<"$payload" 2>/dev/null) || return 1

  ALERTS_JSON="$payload"
  read -r ALERTS_TOTAL ALERTS_ACTIVE ALERTS_SUPPRESSED ALERTS_UNPROCESSED <<<"$counts"
}
# Calculate silence coverage ratio
# Globals: ALERTS_TOTAL, ALERTS_SUPPRESSED (read)
# Outputs: suppressed / total with four decimals, or "0" when there are no
#   alerts or either counter is unset or not a non-negative integer
calculate_silence_coverage() {
  local total="${ALERTS_TOTAL:-0}"
  local suppressed="${ALERTS_SUPPRESSED:-0}"

  # Guard before doing arithmetic: the counters come from an API scrape and
  # may be missing or malformed after a failed collection.
  case "$total" in
    '' | *[!0-9]*) echo "0"; return 0 ;;
  esac
  case "$suppressed" in
    '' | *[!0-9]*) echo "0"; return 0 ;;
  esac

  if [ "$total" -gt 0 ]; then
    # Values are passed with -v so they are treated as data, not awk code.
    awk -v s="$suppressed" -v t="$total" 'BEGIN { printf "%.4f", s / t }'
  else
    echo "0"
  fi
}
status +collect_cluster_status() { + local status_json + status_json=$(am_api "/api/v2/status") + + if [ -z "$status_json" ]; then + CLUSTER_PEERS=0 + CLUSTER_STATUS="unknown" + AM_VERSION="unknown" + AM_UPTIME_SECONDS=0 + CONFIG_HASH="0" + LAST_RELOAD=0 + return 1 + fi + + AM_VERSION=$(echo "$status_json" | jq -r '.versionInfo.version // "unknown"') + + # Cluster info + # shellcheck disable=SC2034 # reserved for future use + CLUSTER_STATUS=$(echo "$status_json" | jq -r '.cluster.status // "disabled"') + CLUSTER_PEERS=$(echo "$status_json" | jq '.cluster.peers // [] | length') + + # Peer details (for per-peer health metrics) + CLUSTER_PEERS_JSON=$(echo "$status_json" | jq '.cluster.peers // []') + + # Uptime from start time + local start_time + start_time=$(echo "$status_json" | jq -r '.uptime // empty' 2>/dev/null) + if [ -n "$start_time" ]; then + local start_epoch now_epoch + start_epoch=$(date -d "$start_time" +%s 2>/dev/null || echo 0) + now_epoch=$(date +%s) + if [ "$start_epoch" -gt 0 ]; then + AM_UPTIME_SECONDS=$((now_epoch - start_epoch)) + else + AM_UPTIME_SECONDS=0 + fi + else + AM_UPTIME_SECONDS=0 + fi + + # Config hash — hash the config JSON for drift detection + local config_json + config_json=$(echo "$status_json" | jq -r '.config.original // ""') + if [ -n "$config_json" ]; then + CONFIG_HASH=$(echo "$config_json" | sha256sum | awk '{print $1}' | head -c 16) + else + CONFIG_HASH="0" + fi + + # Last config reload — not directly available from /api/v2/status + # We'll pull this from the built-in /metrics if reachable + local reload_ts + reload_ts=$(curl -sf "${AM_URL}/metrics" 2>/dev/null | \ + grep "^alertmanager_config_last_reload_success_timestamp_seconds" | \ + awk '{print $2}' | head -1) + LAST_RELOAD=${reload_ts:-0} +} + +# Output per-peer health metrics +output_peer_metrics() { + if [ "$CLUSTER_PEERS" -eq 0 ] || [ -z "$CLUSTER_PEERS_JSON" ]; then + return + fi + + echo "$CLUSTER_PEERS_JSON" | jq -r ' + .[] | + 
"alertmanager_overview_cluster_peer_healthy{peer=\"\(.address // "unknown")\"} 1" + ' 2>/dev/null +} + +# Get notification metrics from built-in /metrics endpoint +collect_notification_metrics() { + local metrics_raw + metrics_raw=$(curl -sf "${AM_URL}/metrics" 2>/dev/null) + + if [ -z "$metrics_raw" ]; then + return 1 + fi + + NOTIFICATION_METRICS="$metrics_raw" +} + +# Output notification rate per receiver (from built-in metrics) +output_notification_rates() { + if [ -z "$NOTIFICATION_METRICS" ]; then + return + fi + + echo "$NOTIFICATION_METRICS" | \ + grep "^alertmanager_notifications_total{" | \ + sed 's/alertmanager_notifications_total/alertmanager_overview_notification_rate/' 2>/dev/null +} + +# Output notification failures per receiver (from built-in metrics) +output_notification_failures() { + if [ -z "$NOTIFICATION_METRICS" ]; then + return + fi + + echo "$NOTIFICATION_METRICS" | \ + grep "^alertmanager_notifications_failed_total{" | \ + sed 's/alertmanager_notifications_failed_total/alertmanager_overview_notification_failures/' 2>/dev/null +} + +# Output notification latency per receiver (from built-in metrics) +output_notification_latency() { + if [ -z "$NOTIFICATION_METRICS" ]; then + return + fi + + # Use the _sum and _count to compute average latency per integration + echo "$NOTIFICATION_METRICS" | \ + grep "^alertmanager_notification_latency_seconds_sum{" | \ + sed 's/alertmanager_notification_latency_seconds_sum/alertmanager_overview_notification_latency_seconds/' 2>/dev/null +} + +# ============================================================================ +# METRIC OUTPUT +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check if Alertmanager is reachable + local am_up=1 + if ! am_api "/api/v2/status" >/dev/null 2>&1; then + am_up=0 + fi + + cat <&2 + echo "Alertmanager URL: $AM_URL" >&2 + + if ! 
command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Alertmanager Overview Exporter + +

Alertmanager Overview Exporter v1.0

+

Alertmanager URL: $AM_URL

+

Metrics

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if ! check_requirements; then + exit 1 + fi + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.alertmanager_overview.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 5 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/alertmanager-silence-manager.sh b/alertmanager-silence-manager.sh new file mode 100644 index 0000000..d238c34 --- /dev/null +++ b/alertmanager-silence-manager.sh @@ -0,0 +1,848 @@ +#!/bin/bash +################################################################################ +# Script Name: alertmanager-silence-manager.sh +# Version: 1.0 +# Description: CLI tool for managing Prometheus Alertmanager silences. +# Create, bulk-create, extend, expire, list, audit, and export +# silences via the Alertmanager API v2. Supports dry-run mode, +# pattern-based operations, and YAML bulk silence files. 
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
#
# Prerequisites:
#   - curl
#   - jq
#   - Alertmanager running and accessible
#
# Usage:
#   # Create a single silence
#   ./alertmanager-silence-manager.sh create --matcher 'alertname=HighCPU' --duration 2h --comment "Maintenance"
#
#   # Bulk create from YAML
#   ./alertmanager-silence-manager.sh bulk-create --file maintenance.yaml
#
#   # List active silences
#   ./alertmanager-silence-manager.sh list --state active
#
#   # Extend a silence
#   ./alertmanager-silence-manager.sh extend --id abc12345 --duration 1h
#
#   # Expire a silence
#   ./alertmanager-silence-manager.sh expire --id abc12345
#
#   # Export active silences to YAML
#   ./alertmanager-silence-manager.sh export --output silences.yaml
#
#   # Audit silences
#   ./alertmanager-silence-manager.sh audit
#
# Configuration:
#   ALERTMANAGER_URL        Alertmanager base URL (default: http://localhost:9093)
#   SILENCE_AUTHOR          Author name for silences (default: current user)
#   SILENCE_COMMENT_PREFIX  Prefix for all silence comments (default: none)
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================

AM_URL="${ALERTMANAGER_URL:-http://localhost:9093}"
AUTHOR="${SILENCE_AUTHOR:-$(whoami)}"
COMMENT_PREFIX="${SILENCE_COMMENT_PREFIX:-}"
DRY_RUN=false

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# jq helper definitions shared by list/extend/export/audit.
#   epoch            - RFC 3339 string -> epoch seconds. The fractional part is
#                      stripped first because fromdateiso8601 only accepts
#                      "%Y-%m-%dT%H:%M:%SZ".
#   silence_seconds  - total length of a silence object (endsAt - startsAt).
#   human_duration   - seconds -> "Nd" / "Nh" / "Nm" / "Ns" (floored).
#   matcher_text     - renders a matcher with its real operator (=, !=, =~, !~).
#   row              - joins an array into one line separated by U+001F so the
#                      shell can split it without collapsing empty fields.
JQ_HELPERS='
def epoch: sub("\\.[0-9]+"; "") | fromdateiso8601;
def silence_seconds: (.endsAt | epoch) - (.startsAt | epoch);
def human_duration:
    if . >= 86400 then "\(. / 86400 | floor)d"
    elif . >= 3600 then "\(. / 3600 | floor)h"
    elif . >= 60 then "\(. / 60 | floor)m"
    else "\(.)s"
    end;
def matcher_op:
    if .isRegex == true
    then (if .isEqual == false then "!~" else "=~" end)
    else (if .isEqual == false then "!=" else "=" end)
    end;
def matcher_text: "\(.name)\(matcher_op)\(.value)";
def row: map((. // "") | tostring | gsub("[\n\r\t\u001f]"; " ")) | join("\u001f");
'
readonly JQ_HELPERS

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Print the help text and exit.
# Args: $1 - exit status (default: 0)
# NOTE(review): the heredoc opener and the "Usage:" line were reconstructed from
# a damaged copy of this file; confirm the exact wording against the original.
show_usage() {
    local exit_code="${1:-0}"
    cat <<EOF
Usage: $0 <command> [OPTIONS]

Manage Prometheus Alertmanager silences via the API.

COMMANDS:
    create          Create a single silence
    bulk-create     Create silences from a YAML file
    list            List silences in table format
    extend          Extend a silence by duration
    expire          Expire silences by ID or pattern
    export          Export active silences to YAML
    audit           Show detailed silence audit info

CREATE OPTIONS:
    --matcher STR   Label matcher (e.g., 'alertname=HighCPU'), repeatable
    --duration STR  Duration (e.g., 2h, 30m, 1d)
    --comment STR   Silence comment/reason
    --dry-run       Preview without creating

BULK-CREATE OPTIONS:
    --file PATH     Path to YAML silence definitions
    --dry-run       Preview without creating

LIST OPTIONS:
    --state STR     Filter: active, pending, expired, all (default: active)

EXTEND OPTIONS:
    --id ID         Silence ID to extend
    --duration STR  Additional duration (e.g., 1h)
    --dry-run       Preview without extending

EXPIRE OPTIONS:
    --id ID         Silence ID to expire
    --match STR     Pattern match on comment (e.g., 'comment=~maintenance.*')
    --dry-run       Preview without expiring

EXPORT OPTIONS:
    --output PATH   Output file (default: stdout)

ENVIRONMENT VARIABLES:
    ALERTMANAGER_URL        Base URL (default: http://localhost:9093)
    SILENCE_AUTHOR          Author name (default: current user)
    SILENCE_COMMENT_PREFIX  Prefix for comments

EXAMPLES:
    $0 create --matcher 'alertname=HighCPU' --matcher 'instance=web-01' --duration 2h --comment "Maintenance"
    $0 bulk-create --file maintenance-window.yaml --dry-run
    $0 list --state active
    $0 extend --id 4a2f8c3e --duration 1h
    $0 expire --match 'comment=~maintenance.*'
    $0 export --output silences-backup.yaml
    $0 audit

EOF
    exit "$exit_code"
}

log_info()    { echo -e "${GREEN}[INFO]${NC} $*"; }
log_warn()    { echo -e "${YELLOW}[WARN]${NC} $*"; }
log_error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }
log_dry_run() { echo -e "${YELLOW}[DRY RUN]${NC} $*"; }

# Verify that the external tools this script shells out to are installed.
# Returns: 0 when curl and jq are both present, 1 otherwise.
check_requirements() {
    local missing=0 tool

    for tool in curl jq; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            log_error "$tool not found"
            missing=1
        fi
    done

    return "$missing"
}

# Abort the script unless GET /api/v2/status answers with HTTP 200.
check_connectivity() {
    local status
    status=$(curl -sf --connect-timeout 5 --max-time 10 -o /dev/null -w "%{http_code}" "${AM_URL}/api/v2/status" 2>/dev/null)

    if [ "$status" != "200" ]; then
        log_error "Cannot reach Alertmanager at ${AM_URL} (HTTP $status)"
        exit 1
    fi
}

# Abort with an error when a value-taking option is the last argument.
# Without this check "shift 2" fails without shifting and the option loop
# never terminates.
# Args: $1 - option name, $2 - number of arguments left (including the option)
require_value() {
    if [ "$2" -lt 2 ]; then
        log_error "Option $1 requires a value"
        exit 1
    fi
}

# Query Alertmanager API
# Args: $1 - method, $2 - endpoint, $3 - data (optional JSON body)
# Outputs: response body on stdout
# Returns: curl's exit status (non-zero on transport or HTTP >= 400 errors)
am_api() {
    local method="$1" endpoint="$2" data="${3:-}"
    local -a curl_args=(-sf --connect-timeout 5 --max-time 15 -X "$method")

    if [ -n "$data" ]; then
        curl_args+=(-H "Content-Type: application/json" -d "$data")
    fi

    curl "${curl_args[@]}" "${AM_URL}${endpoint}" 2>/dev/null
}

# Parse duration string to seconds
# Supports: 30s, 5m, 2h, 1d (a non-negative integer followed by one unit)
# Outputs: number of seconds on stdout
# Returns: 1 (with an error on stderr) when the input is malformed
parse_duration() {
    local input="${1:-}"
    local num unit

    if [[ ! "$input" =~ ^([0-9]+)([smhd])$ ]]; then
        log_error "Invalid duration: '${input}' (use <number><unit>, unit one of s/m/h/d)"
        return 1
    fi

    # Force base 10 so values such as "08" are not read as invalid octal.
    num=$((10#${BASH_REMATCH[1]}))
    unit="${BASH_REMATCH[2]}"

    case "$unit" in
        s) echo "$num" ;;
        m) echo $((num * 60)) ;;
        h) echo $((num * 3600)) ;;
        d) echo $((num * 86400)) ;;
    esac
}

# Get ISO 8601 timestamp for now + offset seconds
# Args: $1 - offset in seconds (default: 0)
# Tries GNU date syntax first, then the BSD "-v" form.
iso_timestamp() {
    local offset="${1:-0}"
    date -u -d "+${offset} seconds" "+%Y-%m-%dT%H:%M:%S.000Z" 2>/dev/null || \
        date -u -v "+${offset}S" "+%Y-%m-%dT%H:%M:%S.000Z" 2>/dev/null
}

# Parse a matcher string like 'alertname=HighCPU' or 'instance=~web-0[1-3]'
# The operator is the first of =, !=, =~, !~ found after the label name, so
# operator-like text inside the value (e.g. 'expr=a!=b') is left untouched.
# Outputs: JSON object {name, value, isRegex, isEqual}
# Returns: 1 when no operator or no label name is present
parse_matcher() {
    local input="${1:-}"
    local name rest value is_regex is_equal

    name="${input%%[=!]*}"
    rest="${input:${#name}}"

    case "$rest" in
        "=~"*) value="${rest:2}"; is_regex="true";  is_equal="true" ;;
        "!~"*) value="${rest:2}"; is_regex="true";  is_equal="false" ;;
        "!="*) value="${rest:2}"; is_regex="false"; is_equal="false" ;;
        "="*)  value="${rest:1}"; is_regex="false"; is_equal="true" ;;
        *)
            log_error "Invalid matcher format: $input"
            return 1 ;;
    esac

    if [ -z "$name" ]; then
        log_error "Invalid matcher format: $input"
        return 1
    fi

    jq -n --arg n "$name" --arg v "$value" \
        --argjson r "$is_regex" --argjson e "$is_equal" \
        '{name: $n, value: $v, isRegex: $r, isEqual: $e}'
}

# Truncate string to max length
# Args: $1 - string, $2 - maximum length (default: 30)
truncate() {
    local str="$1" max="${2:-30}"
    if [ ${#str} -gt "$max" ]; then
        echo "${str:0:$((max-2))}.."
    else
        echo "$str"
    fi
}

# Build the JSON body for a new silence starting now.
# Args: $1 - matchers JSON array, $2 - duration in seconds, $3 - comment
# Globals: AUTHOR, COMMENT_PREFIX (read)
# Outputs: postable silence JSON on stdout
build_silence_payload() {
    local matchers_json="$1" duration_secs="$2" comment="$3"

    jq -n \
        --argjson matchers "$matchers_json" \
        --arg startsAt "$(iso_timestamp 0)" \
        --arg endsAt "$(iso_timestamp "$duration_secs")" \
        --arg createdBy "$AUTHOR" \
        --arg comment "${COMMENT_PREFIX}${comment}" \
        '{matchers: $matchers, startsAt: $startsAt, endsAt: $endsAt, createdBy: $createdBy, comment: $comment}'
}

# Extract the scalar from a "key: value" line of the bulk YAML format:
# drops everything through the first colon, trims whitespace and removes one
# pair of surrounding single or double quotes.
# Args: $1 - trimmed YAML line
yaml_scalar() {
    local value="${1#*:}"

    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    if [ ${#value} -ge 2 ]; then
        case "$value" in
            \"*\"|\'*\') value="${value:1:$((${#value} - 2))}" ;;
        esac
    fi

    printf '%s\n' "$value"
}

# Normalise a YAML boolean to the JSON literals true/false.
# Args: $1 - raw value, $2 - value to use when $1 is not a recognised boolean
yaml_bool() {
    case "$1" in
        true|True|TRUE)    echo "true" ;;
        false|False|FALSE) echo "false" ;;
        *)
            [ -n "$1" ] && log_warn "Unrecognised boolean '$1', using $2" >&2
            echo "$2" ;;
    esac
}

# ============================================================================
# CREATE COMMAND
# ============================================================================

# Create one silence from --matcher/--duration/--comment options.
cmd_create() {
    local matchers_json="[]"
    local duration=""
    local comment=""
    local matcher

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --matcher)
                require_value "$1" "$#"
                matcher=$(parse_matcher "$2") || exit 1
                matchers_json=$(jq --argjson m "$matcher" '. + [$m]' <<< "$matchers_json")
                shift 2 ;;
            --duration) require_value "$1" "$#"; duration="$2"; shift 2 ;;
            --comment)  require_value "$1" "$#"; comment="$2"; shift 2 ;;
            --dry-run)  DRY_RUN=true; shift ;;
            *) log_error "Unknown option for create: $1"; exit 1 ;;
        esac
    done

    if [ "$(jq 'length' <<< "$matchers_json")" -eq 0 ]; then
        log_error "At least one --matcher is required"
        exit 1
    fi

    if [ -z "$duration" ]; then
        log_error "--duration is required"
        exit 1
    fi

    if [ -z "$comment" ]; then
        log_error "--comment is required"
        exit 1
    fi

    local duration_secs
    duration_secs=$(parse_duration "$duration") || exit 1

    local payload
    payload=$(build_silence_payload "$matchers_json" "$duration_secs" "$comment")

    if [ "$DRY_RUN" = true ]; then
        log_dry_run "Would create silence:"
        echo "$payload" | jq .
        return
    fi

    local response
    if response=$(am_api POST "/api/v2/silences" "$payload") && [ -n "$response" ]; then
        local sid
        sid=$(echo "$response" | jq -r '.silenceID // empty')
        if [ -n "$sid" ]; then
            log_info "Silence created: ${sid}"
        else
            log_info "Silence created"
        fi
    else
        log_error "Failed to create silence"
        exit 1
    fi
}

# ============================================================================
# BULK-CREATE COMMAND
# ============================================================================

# Create silences from a simple YAML file. This is a line-oriented reader for
# the layout written by the export command, not a general YAML parser:
#
#   silences:
#     - matchers:
#         - name: alertname
#           value: HighCPU
#           isRegex: false
#           isEqual: true        # optional, defaults to true
#       duration: 2h
#       comment: "Maintenance"
#
# Returns: 0 when every block was created, 1 when at least one failed.
cmd_bulk_create() {
    local file=""

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --file)    require_value "$1" "$#"; file="$2"; shift 2 ;;
            --dry-run) DRY_RUN=true; shift ;;
            *) log_error "Unknown option for bulk-create: $1"; exit 1 ;;
        esac
    done

    if [ -z "$file" ] || [ ! -f "$file" ]; then
        log_error "Valid --file is required"
        exit 1
    fi

    # Parser state. The two flush_* helpers below read and reset these
    # through bash's dynamic scoping.
    local success=0 failed=0
    local in_silence=false
    local in_matchers=false
    local matchers_json="[]"
    local duration="" comment=""
    local current_matcher_name="" current_matcher_value=""
    local current_matcher_regex="false" current_matcher_equal="true"

    # Submit (or preview) the silence block collected so far.
    flush_silence() {
        if [ -z "$duration" ] || [ "$(jq 'length' <<< "$matchers_json")" -eq 0 ]; then
            log_warn "Skipping silence block without duration or matchers${comment:+ (${comment})}"
            failed=$((failed + 1))
            return
        fi

        local duration_secs payload
        duration_secs=$(parse_duration "$duration") || { failed=$((failed + 1)); return; }
        payload=$(build_silence_payload "$matchers_json" "$duration_secs" "$comment")

        if [ "$DRY_RUN" = true ]; then
            log_dry_run "Would create silence:"
            echo "$payload" | jq .
            echo ""
            success=$((success + 1))
            return
        fi

        local response
        if response=$(am_api POST "/api/v2/silences" "$payload") && [ -n "$response" ]; then
            local sid
            sid=$(echo "$response" | jq -r '.silenceID // empty')
            log_info "Silence created: ${sid:-ok} (${comment})"
            success=$((success + 1))
        else
            log_error "Failed to create silence: ${comment}"
            failed=$((failed + 1))
        fi
    }

    # Append the matcher collected so far to matchers_json and reset it.
    flush_matcher() {
        if [ -n "$current_matcher_name" ]; then
            local m
            m=$(jq -n --arg n "$current_matcher_name" --arg v "$current_matcher_value" \
                --argjson r "$current_matcher_regex" --argjson e "$current_matcher_equal" \
                '{name: $n, value: $v, isRegex: $r, isEqual: $e}')
            matchers_json=$(jq --argjson m "$m" '. + [$m]' <<< "$matchers_json")
            current_matcher_name=""
            current_matcher_value=""
            current_matcher_regex="false"
            current_matcher_equal="true"
        fi
    }

    local line trimmed
    while IFS= read -r line || [ -n "$line" ]; do
        # Strip leading/trailing whitespace for comparison
        trimmed="${line#"${line%%[![:space:]]*}"}"
        trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"

        # Skip comments and empty lines
        [[ "$trimmed" =~ ^# ]] && continue
        [ -z "$trimmed" ] && continue

        case "$trimmed" in
            "- matchers:")
                # New silence block: finish the previous one first
                flush_matcher
                if [ "$in_silence" = true ]; then
                    flush_silence
                fi
                in_silence=true
                in_matchers=true
                matchers_json="[]"
                duration=""
                comment=""
                ;;
            "- name:"*)
                if [ "$in_matchers" = true ]; then
                    flush_matcher
                    current_matcher_name=$(yaml_scalar "$trimmed")
                fi
                ;;
            "value:"*)
                if [ "$in_matchers" = true ]; then
                    current_matcher_value=$(yaml_scalar "$trimmed")
                fi
                ;;
            "isRegex:"*)
                if [ "$in_matchers" = true ]; then
                    current_matcher_regex=$(yaml_bool "$(yaml_scalar "$trimmed")" "false")
                fi
                ;;
            "isEqual:"*)
                if [ "$in_matchers" = true ]; then
                    current_matcher_equal=$(yaml_bool "$(yaml_scalar "$trimmed")" "true")
                fi
                ;;
            "duration:"*)
                # The matcher list ends where the block-level keys begin
                flush_matcher
                in_matchers=false
                duration=$(yaml_scalar "$trimmed")
                ;;
            "comment:"*)
                comment=$(yaml_scalar "$trimmed")
                ;;
        esac
    done < "$file"

    # Flush last silence
    flush_matcher
    if [ "$in_silence" = true ]; then
        flush_silence
    fi

    echo ""
    log_info "Bulk create complete: ${success} succeeded, ${failed} failed"
    [ "$failed" -eq 0 ]
}

# ============================================================================
# LIST COMMAND
# ============================================================================

# Print silences in a table, filtered by --state (default: active).
cmd_list() {
    local state="active"

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --state) require_value "$1" "$#"; state="$2"; shift 2 ;;
            *) log_error "Unknown option for list: $1"; exit 1 ;;
        esac
    done

    case "$state" in
        active|pending|expired|all) ;;
        *) log_error "Invalid --state: $state (use active, pending, expired, all)"; exit 1 ;;
    esac

    local silences_json
    silences_json=$(am_api GET "/api/v2/silences")

    if [ -z "$silences_json" ]; then
        log_error "Failed to fetch silences"
        exit 1
    fi

    # Filter by state
    local filtered
    if [ "$state" = "all" ]; then
        filtered="$silences_json"
    else
        filtered=$(jq --arg s "$state" '[.[] | select(.status.state == $s)]' <<< "$silences_json")
    fi

    local count
    count=$(jq 'length' <<< "$filtered")

    if [ "$count" -eq 0 ]; then
        log_info "No ${state} silences found"
        return
    fi

    printf '%b%-10s %-10s %-12s %-30s %-20s %-20s %s%b\n' \
        "$BLUE" "ID" "STATE" "AUTHOR" "MATCHERS" "STARTS" "ENDS" "COMMENT" "$NC"
    printf '%.0s-' {1..120}
    echo ""

    # Fields are separated by U+001F (not tab) so empty columns keep their place.
    local id st author matchers starts ends comment
    while IFS=$'\x1f' read -r id st author matchers starts ends comment; do
        printf "%-10s %-10s %-12s %-30s %-20s %-20s %s\n" \
            "$id" "$st" "$(truncate "$author" 12)" "$(truncate "$matchers" 30)" \
            "$starts" "$ends" "$(truncate "$comment" 40)"
    done < <(jq -r "${JQ_HELPERS}"'
        .[] | [
            .id[0:8],
            .status.state,
            .createdBy,
            ([.matchers[] | matcher_text] | join(", ")),
            .startsAt[0:19],
            .endsAt[0:19],
            .comment
        ] | row' <<< "$filtered")

    echo ""
    log_info "${count} silence(s) found"
}

# ============================================================================
# EXTEND COMMAND
# ============================================================================

# Push the end of an active silence out by --duration.
# The new end is the silence's current endsAt plus the duration, and the
# silence is re-posted with its ID so Alertmanager updates it rather than
# creating a second, overlapping silence.
cmd_extend() {
    local silence_id="" duration=""

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --id)       require_value "$1" "$#"; silence_id="$2"; shift 2 ;;
            --duration) require_value "$1" "$#"; duration="$2"; shift 2 ;;
            --dry-run)  DRY_RUN=true; shift ;;
            *) log_error "Unknown option for extend: $1"; exit 1 ;;
        esac
    done

    if [ -z "$silence_id" ]; then
        log_error "--id is required"
        exit 1
    fi

    if [ -z "$duration" ]; then
        log_error "--duration is required"
        exit 1
    fi

    local duration_secs
    duration_secs=$(parse_duration "$duration") || exit 1

    local silences_json
    silences_json=$(am_api GET "/api/v2/silences")

    if [ -z "$silences_json" ]; then
        log_error "Failed to fetch silences"
        exit 1
    fi

    # Find the silence (match by prefix)
    local silence
    silence=$(jq --arg id "$silence_id" \
        '[.[] | select(.id | startswith($id)) | select(.status.state == "active")] | first // empty' \
        <<< "$silences_json")

    if [ -z "$silence" ] || [ "$silence" = "null" ]; then
        log_error "Active silence not found matching ID: ${silence_id}"
        exit 1
    fi

    local full_id
    full_id=$(jq -r '.id' <<< "$silence")

    # Same silence, later endsAt. Read-only fields are dropped; .id is kept.
    local payload
    if ! payload=$(jq --argjson secs "$duration_secs" "${JQ_HELPERS}"'
            (((.endsAt | epoch) + $secs) | todateiso8601) as $new_end
            | del(.status, .updatedAt)
            | .endsAt = $new_end' <<< "$silence") || [ -z "$payload" ]; then
        log_error "Could not compute new end time for silence ${full_id}"
        exit 1
    fi

    local new_ends_at
    new_ends_at=$(jq -r '.endsAt' <<< "$payload")

    if [ "$DRY_RUN" = true ]; then
        log_dry_run "Would extend silence ${full_id} to ${new_ends_at}:"
        echo "$payload" | jq .
        return
    fi

    local response
    if response=$(am_api POST "/api/v2/silences" "$payload") && [ -n "$response" ]; then
        local new_id
        new_id=$(echo "$response" | jq -r '.silenceID // empty')
        log_info "Silence extended: ${new_id:-ok} (new end: ${new_ends_at})"
    else
        log_error "Failed to extend silence"
        exit 1
    fi
}

# ============================================================================
# EXPIRE COMMAND
# ============================================================================

# Expire active silences selected by ID prefix (--id) and/or by a regex on the
# comment or on a matcher's value (--match 'field=~regex').
# Returns: 0 when every selected silence was expired, 1 otherwise.
cmd_expire() {
    local silence_id="" match_pattern=""

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --id)      require_value "$1" "$#"; silence_id="$2"; shift 2 ;;
            --match)   require_value "$1" "$#"; match_pattern="$2"; shift 2 ;;
            --dry-run) DRY_RUN=true; shift ;;
            *) log_error "Unknown option for expire: $1"; exit 1 ;;
        esac
    done

    if [ -z "$silence_id" ] && [ -z "$match_pattern" ]; then
        log_error "Either --id or --match is required"
        exit 1
    fi

    local silences_json
    silences_json=$(am_api GET "/api/v2/silences")

    if [ -z "$silences_json" ]; then
        log_error "Failed to fetch silences"
        exit 1
    fi

    local candidates=""

    if [ -n "$silence_id" ]; then
        # Match by ID prefix
        candidates=$(jq -r --arg id "$silence_id" \
            '.[] | select(.id | startswith($id)) | select(.status.state == "active") | .id' \
            <<< "$silences_json")
    fi

    if [ -n "$match_pattern" ]; then
        # Parse pattern: 'comment=~regex'
        if [[ "$match_pattern" != *"=~"* ]]; then
            log_error "Match pattern must use =~ syntax (e.g., 'comment=~maintenance.*')"
            exit 1
        fi

        local field="${match_pattern%%=~*}"
        local pattern="${match_pattern#*=~}"
        local matched

        # "// \"\"" keeps test() from failing on silences without a comment/value.
        if [ "$field" = "comment" ]; then
            matched=$(jq -r --arg p "$pattern" \
                '.[] | select(.status.state == "active") | select((.comment // "") | test($p)) | .id' \
                <<< "$silences_json" 2>/dev/null)
        else
            matched=$(jq -r --arg f "$field" --arg p "$pattern" \
                '.[] | select(.status.state == "active")
                     | select(any(.matchers[]?; .name == $f and ((.value // "") | test($p))))
                     | .id' \
                <<< "$silences_json" 2>/dev/null)
        fi || {
            log_error "Invalid match pattern: ${pattern}"
            exit 1
        }

        candidates+=$'\n'"$matched"
    fi

    # De-duplicate while keeping order (a silence can match both --id and --match).
    local -a ids_to_expire=()
    local id
    while IFS= read -r id; do
        [ -n "$id" ] && ids_to_expire+=("$id")
    done < <(printf '%s\n' "$candidates" | awk 'NF && !seen[$0]++')

    if [ ${#ids_to_expire[@]} -eq 0 ]; then
        log_warn "No matching active silences found"
        return
    fi

    local success=0 failed=0

    for id in "${ids_to_expire[@]}"; do
        if [ "$DRY_RUN" = true ]; then
            log_dry_run "Would expire silence: ${id}"
            success=$((success + 1))
            continue
        fi

        if curl -sf --connect-timeout 5 --max-time 10 \
            -X DELETE "${AM_URL}/api/v2/silence/${id}" >/dev/null 2>&1; then
            log_info "Expired silence: ${id}"
            success=$((success + 1))
        else
            log_error "Failed to expire silence: ${id}"
            failed=$((failed + 1))
        fi
    done

    echo ""
    log_info "Expire complete: ${success} succeeded, ${failed} failed"
    [ "$failed" -eq 0 ]
}

# ============================================================================
# EXPORT COMMAND
# ============================================================================

# Write active silences in the YAML layout that bulk-create reads.
# The exported duration is each silence's full length (endsAt - startsAt),
# floored to its largest whole unit.
cmd_export() {
    local output=""

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --output) require_value "$1" "$#"; output="$2"; shift 2 ;;
            *) log_error "Unknown option for export: $1"; exit 1 ;;
        esac
    done

    local silences_json
    silences_json=$(am_api GET "/api/v2/silences")

    if [ -z "$silences_json" ]; then
        log_error "Failed to fetch silences"
        exit 1
    fi

    local active
    active=$(jq '[.[] | select(.status.state == "active")]' <<< "$silences_json")
    local count
    count=$(jq 'length' <<< "$active")

    if [ "$count" -eq 0 ]; then
        log_warn "No active silences to export"
        return
    fi

    local yaml_output
    if ! yaml_output=$(jq -r "${JQ_HELPERS}"'
            "silences:",
            (.[] |
                "  - matchers:",
                (.matchers[] |
                    "      - name: \(.name)",
                    "        value: \(.value)",
                    "        isRegex: \(.isRegex)",
                    "        isEqual: \(.isEqual != false)"
                ),
                "    duration: \(silence_seconds | human_duration)",
                "    comment: \"\(.comment)\""
            )' <<< "$active"); then
        log_error "Failed to render silences as YAML"
        exit 1
    fi

    if [ -n "$output" ]; then
        echo "$yaml_output" > "$output"
        log_info "Exported ${count} silence(s) to ${output}"
    else
        echo "$yaml_output"
    fi
}

# ============================================================================
# AUDIT COMMAND
# ============================================================================

# Print a table of active silences with author, time window and total length.
cmd_audit() {
    local silences_json
    silences_json=$(am_api GET "/api/v2/silences")

    if [ -z "$silences_json" ]; then
        log_error "Failed to fetch silences"
        exit 1
    fi

    local active
    active=$(jq '[.[] | select(.status.state == "active")]' <<< "$silences_json")
    local count
    count=$(jq 'length' <<< "$active")

    if [ "$count" -eq 0 ]; then
        log_info "No active silences"
        return
    fi

    printf '%b%-10s %-12s %-20s %-20s %-10s %-28s %s%b\n' \
        "$BLUE" "ID" "AUTHOR" "CREATED" "EXPIRES" "DURATION" "MATCHERS" "COMMENT" "$NC"
    printf '%.0s-' {1..130}
    echo ""

    # Fields are separated by U+001F (not tab) so empty columns keep their place.
    local id author created expires duration matchers comment
    while IFS=$'\x1f' read -r id author created expires duration matchers comment; do
        printf "%-10s %-12s %-20s %-20s %-10s %-28s %s\n" \
            "$id" "$(truncate "$author" 12)" "$created" "$expires" "$duration" \
            "$(truncate "$matchers" 28)" "$(truncate "$comment" 40)"
    done < <(jq -r "${JQ_HELPERS}"'
        .[] | [
            .id[0:8],
            .createdBy,
            .startsAt[0:19],
            .endsAt[0:19],
            (silence_seconds | human_duration),
            ([.matchers[] | matcher_text] | join(", ")),
            .comment
        ] | row' <<< "$active")

    echo ""
    log_info "${count} active silence(s)"
}

# ============================================================================
# MAIN
# ============================================================================

# Entry point: validate the command word, check prerequisites and
# connectivity, then dispatch to the matching cmd_* function.
main() {
    if [ $# -eq 0 ]; then
        show_usage
    fi

    local command="$1"
    shift

    case "$command" in
        -h|--help) show_usage ;;
        create|bulk-create|list|extend|expire|export|audit) ;;
        *) log_error "Unknown command: $command"; echo ""; show_usage 1 ;;
    esac

    check_requirements || exit 1

    # All commands except --help require connectivity
    check_connectivity

    case "$command" in
        create)      cmd_create "$@" ;;
        bulk-create) cmd_bulk_create "$@" ;;
        list)        cmd_list "$@" ;;
        extend)      cmd_extend "$@" ;;
        expire)      cmd_expire "$@" ;;
        export)      cmd_export "$@" ;;
        audit)       cmd_audit "$@" ;;
    esac
}

main "$@"
diff --git a/alloy-config-generator.sh b/alloy-config-generator.sh
new file mode 100755
index 0000000..dd1d5bc
--- /dev/null
+++ b/alloy-config-generator.sh
@@ -0,0 +1,576 @@
+#!/usr/bin/env bash
+#
+# Alloy Config Generator
+#
+# Interactive script that generates a Grafana Alloy configuration
+# file based on your environment. Asks what backends you use, what
+# signals to collect, and what services to monitor, then outputs
+# a working config.alloy ready to deploy.
+# +# Usage: +# ./alloy-config-generator.sh +# ./alloy-config-generator.sh -o /etc/alloy/config.alloy +# ./alloy-config-generator.sh --non-interactive --metrics --logs --prometheus-url http://mimir:9009 +# +# Parameters: +# -o, --output FILE Write config to file (default: stdout) +# --non-interactive Skip prompts, use flags and defaults +# --metrics Enable host metrics collection +# --logs Enable log collection +# --traces Enable OTLP trace collection +# --journald Enable journald log collection +# --docker Enable Docker container log collection +# --nginx Enable nginx log collection +# --prometheus-url URL Prometheus/Mimir remote_write URL +# --loki-url URL Loki push URL +# --tempo-url URL Tempo OTLP endpoint (host:port) +# --hostname NAME Hostname label for metrics and logs +# --scrape-targets LIST Comma-separated host:port targets to scrape +# --help Show usage +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" + +# Defaults +OUTPUT="" +NON_INTERACTIVE=false +ENABLE_METRICS=false +ENABLE_LOGS=false +ENABLE_TRACES=false +ENABLE_JOURNALD=false +ENABLE_DOCKER=false +ENABLE_NGINX=false +PROMETHEUS_URL="" +LOKI_URL="" +TEMPO_URL="" +HOSTNAME="" +SCRAPE_TARGETS="" + +# --- Functions --- + +usage() { + cat </dev/null || echo "server") + ask "Hostname label" "$detected_hostname" HOSTNAME + echo "" + + # --- Backends --- + echo "== Backends ==" + echo "" + + ask "Prometheus/Mimir remote_write URL (leave empty to skip metrics)" "http://prometheus:9090/api/v1/write" PROMETHEUS_URL + if [[ -n "$PROMETHEUS_URL" ]]; then + ENABLE_METRICS=true + fi + + ask "Loki push URL (leave empty to skip logs)" "http://loki:3100/loki/api/v1/push" LOKI_URL + if [[ -n "$LOKI_URL" ]]; then + ENABLE_LOGS=true + fi + + ask "Tempo OTLP endpoint host:port (leave empty to skip traces)" "" TEMPO_URL + if [[ -n 
"$TEMPO_URL" ]]; then + ENABLE_TRACES=true + fi + + echo "" + + # --- Metrics options --- + if [[ "$ENABLE_METRICS" == true ]]; then + echo "== Metrics ==" + echo "" + + local extra_targets="" + ask "Additional Prometheus scrape targets (comma-separated host:port, or empty)" "" extra_targets + if [[ -n "$extra_targets" ]]; then + SCRAPE_TARGETS="$extra_targets" + fi + echo "" + fi + + # --- Log options --- + if [[ "$ENABLE_LOGS" == true ]]; then + echo "== Logs ==" + echo "" + ask_yn "Collect journald logs?" "y" ENABLE_JOURNALD + ask_yn "Collect Docker container logs?" "n" ENABLE_DOCKER + ask_yn "Collect nginx logs?" "n" ENABLE_NGINX + echo "" + fi +} + +# --- Config generation functions --- + +generate_header() { + cat <\\\\S+) (?P\\\\S+) (?P\\\\S+)\" (?P\\\\d+) (?P\\\\d+)" + } + + stage.labels { + values = { + method = "", + status = "", + } + } +} + +EOF +} + +generate_logs_write() { + if [[ "$ENABLE_LOGS" != true ]]; then + return + fi + + cat <&2; usage ;; + esac + done + + # Set hostname default + if [[ -z "$HOSTNAME" ]]; then + HOSTNAME=$(hostname -s 2>/dev/null || echo "server") + fi + + # Set backend URL defaults for non-interactive mode + if [[ "$NON_INTERACTIVE" == true ]]; then + [[ "$ENABLE_METRICS" == true && -z "$PROMETHEUS_URL" ]] && PROMETHEUS_URL="http://prometheus:9090/api/v1/write" + [[ "$ENABLE_LOGS" == true && -z "$LOKI_URL" ]] && LOKI_URL="http://loki:3100/loki/api/v1/push" + [[ "$ENABLE_TRACES" == true && -z "$TEMPO_URL" ]] && TEMPO_URL="tempo:4317" + fi + + # Interactive mode + if [[ "$NON_INTERACTIVE" != true ]]; then + interactive_setup + fi + + # Check at least one signal is enabled + if [[ "$ENABLE_METRICS" != true && "$ENABLE_LOGS" != true && "$ENABLE_TRACES" != true ]]; then + echo "ERROR: No signals enabled. 
Enable at least one of: metrics, logs, traces" >&2 + exit 1 + fi + + # Generate config + if [[ -n "$OUTPUT" ]]; then + generate_config > "$OUTPUT" + echo "" + echo "Config written to: $OUTPUT" + echo "" + echo "Signals enabled:" + [[ "$ENABLE_METRICS" == true ]] && echo " ✓ Metrics → $PROMETHEUS_URL" + [[ "$ENABLE_LOGS" == true ]] && echo " ✓ Logs → $LOKI_URL" + [[ "$ENABLE_TRACES" == true ]] && echo " ✓ Traces → $TEMPO_URL" + echo "" + echo "Next steps:" + echo " 1. Review the config: cat $OUTPUT" + echo " 2. Validate syntax: alloy fmt $OUTPUT" + echo " 3. Test it: alloy run $OUTPUT" + echo " 4. Deploy: sudo cp $OUTPUT /etc/alloy/config.alloy && sudo systemctl restart alloy" + else + generate_config + fi +} + +main "$@" diff --git a/ami-lifecycle-manager.sh b/ami-lifecycle-manager.sh new file mode 100644 index 0000000..a6a95db --- /dev/null +++ b/ami-lifecycle-manager.sh @@ -0,0 +1,670 @@ +#!/usr/bin/env bash + +######################################################################################### +#### ami-lifecycle-manager.sh — AWS AMI lifecycle management #### +#### Create, tag, retain, and deregister AMIs with orphan snapshot cleanup #### +#### Requires: bash 4+, aws-cli v2, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./ami-lifecycle-manager.sh --create --instance-id i-1234567890abcdef0 #### +#### ./ami-lifecycle-manager.sh --enforce --retention-days 30 #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +AWS_REGION="${AWS_REGION:-}" +AWS_PROFILE_OPT="${AWS_PROFILE:-}" +AMI_RETENTION_DAYS="${AMI_RETENTION_DAYS:-30}" +AMI_NAME_PREFIX="${AMI_NAME_PREFIX:-ami-lifecycle}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +DRY_RUN="false" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +RUN_MODE="" +INSTANCE_ID="" +RETENTION_DAYS="$AMI_RETENTION_DAYS" +START_TIME="" +MANAGER_TAG="ami-lifecycle-manager" + +# ── Colors ──────────────────────────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "auto" && ! 
-t 1 ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" + return + fi + RED="\033[0;31m" + GREEN="\033[0;32m" + YELLOW="\033[0;33m" + BLUE="\033[0;34m" + BOLD="\033[1m" + DIM="\033[2m" + RESET="\033[0m" +} + +# ── Logging ─────────────────────────────────────────────────────────── +log_info() { printf "${GREEN}[INFO]${RESET} %s\n" "$*"; } +log_warn() { printf "${YELLOW}[WARN]${RESET} %s\n" "$*" >&2; } +log_error() { printf "${RED}[ERROR]${RESET} %s\n" "$*" >&2; } +log_verbose() { [[ "$VERBOSE" == "true" ]] && printf "${DIM}[DEBUG] %s${RESET}\n" "$*"; } + +# ── Helpers ─────────────────────────────────────────────────────────── +die() { log_error "$@"; exit 1; } + +today_utc() { date -u +%Y-%m-%d; } + +epoch_from_date() { + local d="$1" + if date --version >/dev/null 2>&1; then + date -d "$d" +%s + else + date -j -f "%Y-%m-%d" "$d" +%s + fi +} + +days_since() { + local created="$1" + local now + now=$(date -u +%s) + local then + then=$(epoch_from_date "$created") + echo $(( (now - then) / 86400 )) +} + +date_offset_days() { + local base="$1" offset="$2" + if date --version >/dev/null 2>&1; then + date -d "${base} +${offset} days" +%Y-%m-%d + else + date -j -v+"${offset}d" -f "%Y-%m-%d" "$base" +%Y-%m-%d + fi +} + +# ── AWS CLI wrapper ─────────────────────────────────────────────────── +aws_cmd() { + local args=("$@") + [[ -n "$AWS_REGION" ]] && args+=(--region "$AWS_REGION") + [[ -n "$AWS_PROFILE_OPT" ]] && args+=(--profile "$AWS_PROFILE_OPT") + log_verbose "aws ${args[*]}" + aws "${args[@]}" +} + +# ── Dependency check ───────────────────────────────────────────────── +check_deps() { + local missing=() + command -v aws >/dev/null 2>&1 || missing+=("aws-cli") + command -v jq >/dev/null 2>&1 || missing+=("jq") + if (( ${#missing[@]} > 0 )); then + die "Missing required tools: ${missing[*]}" + fi + + local bash_major="${BASH_VERSINFO[0]}" + if (( bash_major < 4 )); then + die "Requires bash 4+, found ${BASH_VERSION}" + fi + + # Verify AWS 
credentials + if ! aws_cmd sts get-caller-identity --output text >/dev/null 2>&1; then + die "AWS credentials not configured or expired" + fi + + # Determine region + if [[ -z "$AWS_REGION" ]]; then + AWS_REGION=$(aws configure get region 2>/dev/null || echo "") + if [[ -z "$AWS_REGION" ]]; then + die "AWS_REGION is required (set via env var, --region, or aws configure)" + fi + fi +} + +# ── Header ──────────────────────────────────────────────────────────── +print_header() { + local account_id + account_id=$(aws_cmd sts get-caller-identity --query Account --output text 2>/dev/null || echo "unknown") + + echo "AMI Lifecycle Manager" + echo "Account: $account_id" + echo "Region: $AWS_REGION" + echo "Mode: $RUN_MODE" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" +} + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat < 0 )); do + case "$1" in + --create) + RUN_MODE="create"; shift ;; + --enforce) + RUN_MODE="enforce"; shift ;; + --clean-snapshots) + RUN_MODE="clean-snapshots"; shift ;; + --inventory) + RUN_MODE="inventory"; shift ;; + --instance-id) + [[ $# -lt 2 ]] && die "--instance-id requires a value" + INSTANCE_ID="$2"; shift 2 ;; + --retention-days) + [[ $# -lt 2 ]] && die "--retention-days requires a value" + RETENTION_DAYS="$2"; shift 2 ;; + --dry-run) + DRY_RUN="true"; shift ;; + --format) + [[ $# -lt 2 ]] && die "--format requires a value" + OUTPUT_FORMAT="$2"; shift 2 ;; + --profile) + [[ $# -lt 2 ]] && die "--profile requires a value" + AWS_PROFILE_OPT="$2"; shift 2 ;; + --region) + [[ $# -lt 2 ]] && die "--region requires a value" + AWS_REGION="$2"; shift 2 ;; + --verbose) + VERBOSE="true"; shift ;; + --no-color) + COLOR="never"; shift ;; + --help|-h) + usage ;; + *) + die "Unknown option: $1 (see --help)" ;; + esac + done + + if [[ -z "$RUN_MODE" ]]; then + log_error "No mode specified" + echo "" + usage + fi + + if [[ "$RUN_MODE" == "create" && -z "$INSTANCE_ID" ]]; then + die "--create requires 
--instance-id" + fi + + case "$OUTPUT_FORMAT" in + text|csv|json) ;; + *) die "Invalid --format: $OUTPUT_FORMAT (expected text, csv, json)" ;; + esac + + if ! [[ "$RETENTION_DAYS" =~ ^[0-9]+$ ]]; then + die "--retention-days must be a positive integer" + fi +} + +# ── Get instance name ───────────────────────────────────────────────── +get_instance_name() { + local iid="$1" + aws_cmd ec2 describe-instances \ + --instance-ids "$iid" \ + --query 'Reservations[0].Instances[0].Tags[?Key==`Name`].Value | [0]' \ + --output text 2>/dev/null || echo "N/A" +} + +# ── Create AMI ──────────────────────────────────────────────────────── +create_ami() { + local instance_id="$INSTANCE_ID" + local today + today="$(today_utc)" + + log_info "Creating AMI from instance ${instance_id}..." + + # Get instance name + local instance_name + instance_name=$(get_instance_name "$instance_id") + if [[ "$instance_name" == "None" || -z "$instance_name" ]]; then + instance_name="unnamed" + fi + log_info "Instance name: ${instance_name}" + + # Build AMI name + local ami_name="${AMI_NAME_PREFIX}-${instance_name}-${today}" + local ami_description="AMI created by ${MANAGER_TAG} from ${instance_id} (${instance_name}) on ${today}" + + # Calculate expiry date + local expires + expires=$(date_offset_days "$today" "$RETENTION_DAYS") + + # Create the AMI (no-reboot to avoid downtime) + local ami_id + ami_id=$(aws_cmd ec2 create-image \ + --instance-id "$instance_id" \ + --name "$ami_name" \ + --description "$ami_description" \ + --no-reboot \ + --query 'ImageId' \ + --output text 2>/dev/null) || die "Failed to create AMI from ${instance_id}" + + log_info "AMI created: ${ami_id}" + log_info "Name: ${ami_name}" + + # Tag the AMI + aws_cmd ec2 create-tags \ + --resources "$ami_id" \ + --tags \ + "Key=Name,Value=${ami_name}" \ + "Key=managed-by,Value=${MANAGER_TAG}" \ + "Key=source-instance,Value=${instance_id}" \ + "Key=source-name,Value=${instance_name}" \ + "Key=created-date,Value=${today}" \ + 
"Key=retention-days,Value=${RETENTION_DAYS}" \ + "Key=expires,Value=${expires}" \ + >/dev/null 2>&1 || log_warn "Failed to tag AMI ${ami_id}" + + log_info "Tags applied:" + printf " %-16s = %s\n" "managed-by" "$MANAGER_TAG" + printf " %-16s = %s\n" "source-instance" "$instance_id" + printf " %-16s = %s\n" "source-name" "$instance_name" + printf " %-16s = %s\n" "created-date" "$today" + printf " %-16s = %s\n" "retention-days" "$RETENTION_DAYS" + printf " %-16s = %s\n" "expires" "$expires" + + # Wait briefly for snapshots to appear, then tag them too + log_verbose "Waiting for AMI snapshots to register..." + local retries=0 + local snap_ids="" + while (( retries < 12 )); do + snap_ids=$(aws_cmd ec2 describe-images \ + --image-ids "$ami_id" \ + --query 'Images[0].BlockDeviceMappings[*].Ebs.SnapshotId' \ + --output text 2>/dev/null || echo "") + if [[ -n "$snap_ids" && "$snap_ids" != "None" ]]; then + break + fi + sleep 5 + ((retries++)) || true + done + + if [[ -n "$snap_ids" && "$snap_ids" != "None" ]]; then + for snap_id in $snap_ids; do + aws_cmd ec2 create-tags \ + --resources "$snap_id" \ + --tags \ + "Key=managed-by,Value=${MANAGER_TAG}" \ + "Key=source-ami,Value=${ami_id}" \ + "Key=source-instance,Value=${instance_id}" \ + "Key=created-date,Value=${today}" \ + >/dev/null 2>&1 || log_warn "Failed to tag snapshot ${snap_id}" + log_verbose "Tagged snapshot ${snap_id}" + done + fi +} + +# ── Get managed AMIs ────────────────────────────────────────────────── +get_managed_amis() { + local account_id + account_id=$(aws_cmd sts get-caller-identity --query Account --output text 2>/dev/null) + + aws_cmd ec2 describe-images \ + --owners "$account_id" \ + --filters "Name=tag:managed-by,Values=${MANAGER_TAG}" \ + --query 'Images[*]' \ + --output json 2>/dev/null || echo "[]" +} + +# ── Enforce retention ───────────────────────────────────────────────── +enforce_retention() { + log_info "Enforcing retention policy (${RETENTION_DAYS} days)..." 
+ + if [[ "$DRY_RUN" == "true" ]]; then + log_info "DRY RUN — no AMIs will be deregistered" + fi + + local amis_json + amis_json=$(get_managed_amis) + + local total + total=$(echo "$amis_json" | jq 'length') + log_info "Found ${total} managed AMI(s)" + + if (( total == 0 )); then + log_info "No managed AMIs found — nothing to do" + return + fi + + local today + today=$(today_utc) + local active=0 expired=0 deregistered=0 + + # Print table header for text output + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + printf " %-24s %-42s %-6s %-11s %s\n" "AMI" "NAME" "AGE" "RETENTION" "STATUS" + echo " ──────────────────────────────────────────────────────────────────────────────────────" + fi + + local csv_lines=() + local json_items=() + + while IFS=$'\t' read -r ami_id ami_name created_date retention_tag; do + [[ -z "$ami_id" || "$ami_id" == "null" ]] && continue + + # Use tag retention or default + local ret="${retention_tag}" + if [[ -z "$ret" || "$ret" == "null" || "$ret" == "None" ]]; then + ret="$RETENTION_DAYS" + fi + + local age=0 + if [[ -n "$created_date" && "$created_date" != "null" && "$created_date" != "None" ]]; then + age=$(days_since "$created_date") + fi + + local status="active" + if (( age > ret )); then + status="expired" + ((expired++)) || true + else + ((active++)) || true + fi + + case "$OUTPUT_FORMAT" in + text) + local status_icon="✓ active" + if [[ "$status" == "expired" ]]; then + status_icon="✗ expired" + fi + printf " %-24s %-42s %3dd %3dd %s\n" \ + "$ami_id" "$ami_name" "$age" "$ret" "$status_icon" + ;; + csv) + csv_lines+=("\"${ami_id}\",\"${ami_name}\",${age},${ret},\"${status}\"") + ;; + json) + json_items+=("{\"ami_id\":\"${ami_id}\",\"name\":\"${ami_name}\",\"age_days\":${age},\"retention_days\":${ret},\"status\":\"${status}\"}") + ;; + esac + + # Deregister expired AMIs + if [[ "$status" == "expired" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would deregister ${ami_id} (${age}d old, retention ${ret}d)" 
+ else + log_info "Deregistering ${ami_id} (${age}d old, retention ${ret}d)..." + if aws_cmd ec2 deregister-image --image-id "$ami_id" >/dev/null 2>&1; then + ((deregistered++)) || true + else + log_warn "Failed to deregister ${ami_id}" + fi + fi + fi + done < <(echo "$amis_json" | jq -r '.[] | [.ImageId, (.Tags // [] | map(select(.Key == "Name")) | .[0].Value // "N/A"), (.Tags // [] | map(select(.Key == "created-date")) | .[0].Value // ""), (.Tags // [] | map(select(.Key == "retention-days")) | .[0].Value // "")] | @tsv') + + echo "" + + case "$OUTPUT_FORMAT" in + text) + echo "Summary" + printf " Total managed AMIs: %d\n" "$total" + printf " Active: %d\n" "$active" + printf " Expired: %d\n" "$expired" + if [[ "$DRY_RUN" == "true" ]]; then + printf " Would deregister: %d\n" "$expired" + else + printf " Deregistered: %d\n" "$deregistered" + fi + ;; + csv) + echo "ami_id,name,age_days,retention_days,status" + for line in "${csv_lines[@]}"; do + echo "$line" + done + ;; + json) + local joined + joined=$(printf ",%s" "${json_items[@]}") + joined="${joined:1}" + printf '{"mode":"enforce","retention_days":%d,"dry_run":%s,"total":%d,"active":%d,"expired":%d,"items":[%s]}\n' \ + "$RETENTION_DAYS" "$DRY_RUN" "$total" "$active" "$expired" "$joined" + ;; + esac +} + +# ── Clean orphan snapshots ──────────────────────────────────────────── +clean_orphan_snapshots() { + log_info "Searching for orphaned AMI snapshots..." 
+ + if [[ "$DRY_RUN" == "true" ]]; then + log_info "DRY RUN — no snapshots will be deleted" + fi + + local account_id + account_id=$(aws_cmd sts get-caller-identity --query Account --output text 2>/dev/null) + + # Get all snapshots tagged as managed by us + local snaps_json + snaps_json=$(aws_cmd ec2 describe-snapshots \ + --owner-ids "$account_id" \ + --filters "Name=tag:managed-by,Values=${MANAGER_TAG}" \ + --query 'Snapshots[*]' \ + --output json 2>/dev/null) || die "Failed to describe snapshots" + + local total_snaps + total_snaps=$(echo "$snaps_json" | jq 'length') + log_info "Found ${total_snaps} managed snapshot(s)" + + if (( total_snaps == 0 )); then + log_info "No managed snapshots found — nothing to do" + return + fi + + # Get all currently registered AMI IDs + local registered_amis + registered_amis=$(aws_cmd ec2 describe-images \ + --owners "$account_id" \ + --query 'Images[*].ImageId' \ + --output text 2>/dev/null) || die "Failed to describe images" + + local orphan_count=0 + local deleted_count=0 + local total_size=0 + + while IFS=$'\t' read -r snap_id source_ami snap_size; do + [[ -z "$snap_id" || "$snap_id" == "null" ]] && continue + + # Check if the source AMI still exists + local is_orphan="false" + if [[ -z "$source_ami" || "$source_ami" == "null" || "$source_ami" == "None" ]]; then + is_orphan="true" + elif ! echo "$registered_amis" | grep -qw "$source_ami" 2>/dev/null; then + is_orphan="true" + fi + + if [[ "$is_orphan" == "true" ]]; then + ((orphan_count++)) || true + local size_gb="${snap_size:-0}" + ((total_size += size_gb)) || true + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would delete orphan snapshot ${snap_id} (${size_gb} GiB, source AMI: ${source_ami:-unknown})" + else + log_info "Deleting orphan snapshot ${snap_id} (${size_gb} GiB)..." 
+ if aws_cmd ec2 delete-snapshot --snapshot-id "$snap_id" >/dev/null 2>&1; then + ((deleted_count++)) || true + else + log_warn "Failed to delete snapshot ${snap_id}" + fi + fi + fi + done < <(echo "$snaps_json" | jq -r '.[] | [.SnapshotId, (.Tags // [] | map(select(.Key == "source-ami")) | .[0].Value // ""), (.VolumeSize // 0 | tostring)] | @tsv') + + echo "" + echo "Summary" + printf " Total managed snapshots: %d\n" "$total_snaps" + printf " Orphaned: %d\n" "$orphan_count" + if [[ "$DRY_RUN" == "true" ]]; then + printf " Would delete: %d\n" "$orphan_count" + else + printf " Deleted: %d\n" "$deleted_count" + fi + printf " Storage reclaimed: %d GiB\n" "$total_size" +} + +# ── Inventory report ────────────────────────────────────────────────── +inventory_report() { + log_info "Generating AMI inventory report..." + + local amis_json + amis_json=$(get_managed_amis) + + local total + total=$(echo "$amis_json" | jq 'length') + log_info "Found ${total} managed AMI(s)" + + if (( total == 0 )); then + log_info "No managed AMIs found" + return + fi + + local account_id + account_id=$(aws_cmd sts get-caller-identity --query Account --output text 2>/dev/null) + + case "$OUTPUT_FORMAT" in + text) + echo "" + printf " %-24s %-30s %-12s %-6s %-11s %s\n" \ + "AMI" "SOURCE INSTANCE" "CREATED" "AGE" "RETENTION" "SNAPSHOTS" + echo " ────────────────────────────────────────────────────────────────────────────────────────────────" + ;; + csv) + echo "ami_id,name,source_instance,source_name,created_date,age_days,retention_days,expires,snapshot_count" + ;; + esac + + local json_items=() + + while IFS=$'\t' read -r ami_id ami_name source_instance source_name created_date retention_tag expires_tag snap_count; do + [[ -z "$ami_id" || "$ami_id" == "null" ]] && continue + + # Defaults for missing tags + [[ "$source_instance" == "null" || -z "$source_instance" ]] && source_instance="N/A" + [[ "$source_name" == "null" || -z "$source_name" ]] && source_name="" + [[ "$created_date" == "null" || 
-z "$created_date" ]] && created_date="unknown" + [[ "$retention_tag" == "null" || -z "$retention_tag" ]] && retention_tag="$RETENTION_DAYS" + [[ "$expires_tag" == "null" || -z "$expires_tag" ]] && expires_tag="N/A" + [[ "$snap_count" == "null" || -z "$snap_count" ]] && snap_count="0" + + local age=0 + if [[ "$created_date" != "unknown" ]]; then + age=$(days_since "$created_date") + fi + + local instance_display="$source_instance" + if [[ -n "$source_name" && "$source_name" != "N/A" ]]; then + instance_display="${source_instance} (${source_name})" + fi + + case "$OUTPUT_FORMAT" in + text) + printf " %-24s %-30s %-12s %3dd %3dd %s\n" \ + "$ami_id" "$instance_display" "$created_date" "$age" "$retention_tag" "$snap_count" + ;; + csv) + echo "\"${ami_id}\",\"${ami_name}\",\"${source_instance}\",\"${source_name}\",\"${created_date}\",${age},${retention_tag},\"${expires_tag}\",${snap_count}" + ;; + json) + json_items+=("{\"ami_id\":\"${ami_id}\",\"name\":\"${ami_name}\",\"source_instance\":\"${source_instance}\",\"source_name\":\"${source_name}\",\"created_date\":\"${created_date}\",\"age_days\":${age},\"retention_days\":${retention_tag},\"expires\":\"${expires_tag}\",\"snapshot_count\":${snap_count}}") + ;; + esac + done < <(echo "$amis_json" | jq -r '.[] | [ + .ImageId, + (.Tags // [] | map(select(.Key == "Name")) | .[0].Value // "N/A"), + (.Tags // [] | map(select(.Key == "source-instance")) | .[0].Value // ""), + (.Tags // [] | map(select(.Key == "source-name")) | .[0].Value // ""), + (.Tags // [] | map(select(.Key == "created-date")) | .[0].Value // ""), + (.Tags // [] | map(select(.Key == "retention-days")) | .[0].Value // ""), + (.Tags // [] | map(select(.Key == "expires")) | .[0].Value // ""), + (.BlockDeviceMappings // [] | map(select(.Ebs.SnapshotId)) | length | tostring) + ] | @tsv') + + if [[ "$OUTPUT_FORMAT" == "json" ]]; then + local joined + joined=$(printf ",%s" "${json_items[@]}") + joined="${joined:1}" + printf 
'{"mode":"inventory","total":%d,"items":[%s]}\n' "$total" "$joined" + fi + + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + printf " Total: %d managed AMI(s)\n" "$total" + fi +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + parse_args "$@" + setup_colors + check_deps + + START_TIME=$(date +%s) + + print_header + + case "$RUN_MODE" in + create) + create_ami + ;; + enforce) + enforce_retention + ;; + clean-snapshots) + clean_orphan_snapshots + ;; + inventory) + inventory_report + ;; + *) + die "Unknown mode: $RUN_MODE" + ;; + esac + + local elapsed=$(( $(date +%s) - START_TIME )) + log_info "Completed in ${elapsed}s" +} + +main "$@" diff --git a/apache-metrics-exporter.sh b/apache-metrics-exporter.sh new file mode 100644 index 0000000..db049ea --- /dev/null +++ b/apache-metrics-exporter.sh @@ -0,0 +1,1308 @@ +#!/bin/bash +############################################################# +#### Apache Metrics Exporter for Prometheus #### +#### Comprehensive Apache monitoring via mod_status, #### +#### logs, SSL, process, and config metrics #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.01 #### +#### #### +#### Usage: ./apache-metrics-exporter.sh [OPTIONS] #### +############################################################# +# +# Metrics collected: +# - mod_status: accesses, bytes, req/sec, busy/idle workers, scoreboard +# - Process: worker count, memory usage, CPU usage, open files +# - Access logs: requests by status code, response times, bytes transferred +# - SSL: certificate expiry days for configured domains +# - Config: MPM type, MaxRequestWorkers, KeepAliveTimeout +# - Upstream: proxy/balancer status (if configured) +# +# Requirements: +# - Apache with mod_status enabled (ExtendedStatus On) +# - socat (for HTTP server) +# - curl (for server-status fetching) +# +set -euo pipefail + +######################### +### Auto-detect Apache ### 
+######################### + +APACHE_BIN="" +APACHECTL="" +APACHE_PROC="" + +detect_apache_flavor() { + if command -v apache2 &>/dev/null; then + APACHE_BIN="apache2" + APACHECTL="apache2ctl" + APACHE_PROC="apache2" + elif command -v httpd &>/dev/null; then + APACHE_BIN="httpd" + APACHECTL="httpd" + APACHE_PROC="httpd" + else + APACHE_BIN="" + APACHECTL="" + APACHE_PROC="" + fi +} + +detect_apache_flavor + +######################### +### Configuration ### +######################### + +LISTEN_PORT="${APACHE_EXPORTER_PORT:-9117}" +STATUS_URL="${APACHE_STATUS_URL:-http://127.0.0.1/server-status?auto}" +SSL_CHECK_DOMAINS="${SSL_CHECK_DOMAINS:-}" # Comma-separated list of domains to check SSL +SCRAPE_INTERVAL="${SCRAPE_INTERVAL:-15}" + +# Auto-detect paths based on distro +if [[ -d /etc/apache2 ]]; then + ACCESS_LOG="${APACHE_ACCESS_LOG:-/var/log/apache2/access.log}" + ERROR_LOG="${APACHE_ERROR_LOG:-/var/log/apache2/error.log}" + APACHE_CONF="${APACHE_CONF:-/etc/apache2/apache2.conf}" + SITES_DIR="${APACHE_SITES_DIR:-/etc/apache2/sites-enabled}" + CONF_D_DIR="${APACHE_CONF_D:-/etc/apache2/conf-enabled}" +elif [[ -d /etc/httpd ]]; then + ACCESS_LOG="${APACHE_ACCESS_LOG:-/var/log/httpd/access_log}" + ERROR_LOG="${APACHE_ERROR_LOG:-/var/log/httpd/error_log}" + APACHE_CONF="${APACHE_CONF:-/etc/httpd/conf/httpd.conf}" + SITES_DIR="${APACHE_SITES_DIR:-/etc/httpd/conf.d}" + CONF_D_DIR="${APACHE_CONF_D:-/etc/httpd/conf.d}" +else + ACCESS_LOG="${APACHE_ACCESS_LOG:-/var/log/apache2/access.log}" + ERROR_LOG="${APACHE_ERROR_LOG:-/var/log/apache2/error.log}" + APACHE_CONF="${APACHE_CONF:-/etc/apache2/apache2.conf}" + SITES_DIR="${APACHE_SITES_DIR:-/etc/apache2/sites-enabled}" + CONF_D_DIR="${APACHE_CONF_D:-/etc/apache2/conf-enabled}" +fi + +# Log parsing settings +LOG_TAIL_LINES="${LOG_TAIL_LINES:-10000}" # Number of lines to parse from access log +LOG_PARSE_INTERVAL="${LOG_PARSE_INTERVAL:-60}" # How often to parse logs (seconds) + +# State files for log metrics 
+STATE_DIR="/tmp/apache-metrics" +LAST_LOG_PARSE=0 + +# Output mode +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false + +######################### +### Logging ### +######################### + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +######################### +### Parse Arguments ### +######################### + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --textfile) + OUTPUT_FILE="$TEXTFILE_DIR/apache.prom" + shift + ;; + --http) + HTTP_MODE=true + shift + ;; + --output|-o) + OUTPUT_FILE="$2" + shift 2 + ;; + --port) + LISTEN_PORT="$2" + shift 2 + ;; + --status-url) + STATUS_URL="$2" + shift 2 + ;; + --access-log) + ACCESS_LOG="$2" + shift 2 + ;; + --error-log) + ERROR_LOG="$2" + shift 2 + ;; + --apache-conf) + APACHE_CONF="$2" + shift 2 + ;; + --ssl-domains) + SSL_CHECK_DOMAINS="$2" + shift 2 + ;; + --help) + cat </dev/null; then + echo "apt" + elif command -v dnf &>/dev/null; then + echo "dnf" + elif command -v yum &>/dev/null; then + echo "yum" + elif command -v zypper &>/dev/null; then + echo "zypper" + elif command -v pacman &>/dev/null; then + echo "pacman" + elif command -v apk &>/dev/null; then + echo "apk" + else + echo "" + fi +} + +install_package() { + local pkg="$1" + local pkgmgr + pkgmgr=$(detect_package_manager) + + log "Installing $pkg..." + + case "$pkgmgr" in + apt) + apt-get update -qq && apt-get install -y -qq "$pkg" + ;; + dnf) + dnf install -y -q "$pkg" + ;; + yum) + yum install -y -q "$pkg" + ;; + zypper) + zypper install -y -q "$pkg" + ;; + pacman) + pacman -S --noconfirm "$pkg" + ;; + apk) + apk add --quiet "$pkg" + ;; + *) + log "ERROR: Unknown package manager. Please install $pkg manually." + return 1 + ;; + esac +} + +setup() { + mkdir -p "$STATE_DIR" + + # Check for required tools and install if missing + if ! command -v socat &>/dev/null; then + log "socat not found, attempting to install..." + if [[ $EUID -eq 0 ]]; then + if ! 
install_package socat; then + log "ERROR: Failed to install socat" + exit 1 + fi + log "socat installed successfully" + else + log "ERROR: socat is required. Run as root to auto-install, or install manually:" + log " Debian/Ubuntu: apt install socat" + log " RHEL/CentOS: yum install socat" + log " Fedora: dnf install socat" + log " Alpine: apk add socat" + exit 1 + fi + fi + + if ! command -v curl &>/dev/null; then + log "curl not found, attempting to install..." + if [[ $EUID -eq 0 ]]; then + if ! install_package curl; then + log "ERROR: Failed to install curl" + exit 1 + fi + log "curl installed successfully" + else + log "ERROR: curl is required. Run as root to auto-install, or install manually." + exit 1 + fi + fi + + # Check if Apache is running + if [[ -n "$APACHE_PROC" ]]; then + if ! pgrep -x "$APACHE_PROC" &>/dev/null && ! pidof "$APACHE_PROC" &>/dev/null; then + log "WARNING: $APACHE_PROC process not found - process metrics will show apache_process_running=0" + fi + else + log "WARNING: Apache binary not found (neither apache2 nor httpd)" + fi + + # Check if server-status is accessible + check_server_status +} + +check_server_status() { + log "Checking server-status at $STATUS_URL..." 
+ + local response http_code + response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$STATUS_URL" 2>/dev/null) + + if [[ "$response" == "200" ]]; then + # Verify it's actually mod_status output + local content + content=$(curl -s --max-time 5 "$STATUS_URL" 2>/dev/null) + if echo "$content" | grep -q "Total Accesses"; then + log "✓ mod_status is working correctly" + return 0 + else + log "WARNING: $STATUS_URL returned 200 but doesn't look like mod_status output" + log " Expected 'Total Accesses' in response (ensure ExtendedStatus On)" + show_server_status_help + return 1 + fi + elif [[ "$response" == "000" ]]; then + log "WARNING: Cannot connect to $STATUS_URL (connection refused/timeout)" + log " server-status metrics will show apache_up=0" + show_server_status_help + return 1 + elif [[ "$response" == "403" ]]; then + log "WARNING: Access denied to $STATUS_URL (HTTP 403)" + log " Check 'Require' directives in server-status location block" + show_server_status_help + return 1 + elif [[ "$response" == "404" ]]; then + log "WARNING: server-status endpoint not found at $STATUS_URL (HTTP 404)" + log " mod_status may not be enabled" + show_server_status_help + return 1 + else + log "WARNING: Unexpected response from $STATUS_URL (HTTP $response)" + show_server_status_help + return 1 + fi +} + +show_server_status_help() { + log "" + log "To enable mod_status, configure Apache as follows:" + log "" + log " Debian/Ubuntu:" + log " sudo a2enmod status" + log " # Edit /etc/apache2/mods-enabled/status.conf:" + log " ExtendedStatus On" + log " " + log " SetHandler server-status" + log " Require local" + log " " + log "" + log " RHEL/CentOS/Rocky:" + log " # Add to /etc/httpd/conf.d/status.conf:" + log " ExtendedStatus On" + log " " + log " SetHandler server-status" + log " Require local" + log " " + log "" + log "Then reload: apachectl configtest && systemctl reload apache2 (or httpd)" + log "" + log "Or specify a different URL with: --status-url " + log "" +} + 
+######################### +### Server Status Metrics ### +######################### + +collect_server_status() { + local status_output + + echo "# HELP apache_up Whether Apache mod_status is reachable" + echo "# TYPE apache_up gauge" + + if ! status_output=$(curl -s --max-time 5 "$STATUS_URL" 2>/dev/null); then + echo "apache_up 0" + return + fi + + # Verify we got valid mod_status output + if ! echo "$status_output" | grep -q "Total Accesses"; then + echo "apache_up 0" + return + fi + + echo "apache_up 1" + + # Parse mod_status ?auto output + # Format: + # Total Accesses: 12345 + # Total kBytes: 67890 + # CPULoad: .0123456 + # Uptime: 86400 + # ReqPerSec: .142857 + # BytesPerSec: 804.571 + # BytesPerReq: 5632 + # BusyWorkers: 3 + # IdleWorkers: 7 + # Scoreboard: __W_K....._R.. + + local total_accesses total_kbytes cpu_load uptime req_per_sec bytes_per_sec bytes_per_req + local busy_workers idle_workers scoreboard + + total_accesses=$(echo "$status_output" | grep '^Total Accesses:' | awk '{print $3}') || total_accesses=0 + total_kbytes=$(echo "$status_output" | grep '^Total kBytes:' | awk '{print $3}') || total_kbytes=0 + cpu_load=$(echo "$status_output" | grep '^CPULoad:' | awk '{print $2}') || cpu_load=0 + uptime=$(echo "$status_output" | grep '^Uptime:' | awk '{print $2}') || uptime=0 + req_per_sec=$(echo "$status_output" | grep '^ReqPerSec:' | awk '{print $2}') || req_per_sec=0 + bytes_per_sec=$(echo "$status_output" | grep '^BytesPerSec:' | awk '{print $2}') || bytes_per_sec=0 + bytes_per_req=$(echo "$status_output" | grep '^BytesPerReq:' | awk '{print $2}') || bytes_per_req=0 + busy_workers=$(echo "$status_output" | grep '^BusyWorkers:' | awk '{print $2}') || busy_workers=0 + idle_workers=$(echo "$status_output" | grep '^IdleWorkers:' | awk '{print $2}') || idle_workers=0 + scoreboard=$(echo "$status_output" | grep '^Scoreboard:' | awk '{print $2}') || scoreboard="" + + # Convert kBytes to bytes + local total_bytes + total_bytes=$(echo "$total_kbytes * 1024" 
| bc 2>/dev/null || echo "$((total_kbytes * 1024))") + + cat </dev/null || pidof "$APACHE_PROC" 2>/dev/null | awk '{print $1}' || echo "") + + if [[ -z "$apache_master_pid" ]]; then + echo "# HELP apache_process_running Whether Apache process is running" + echo "# TYPE apache_process_running gauge" + echo "apache_process_running 0" + return + fi + + echo "# HELP apache_process_running Whether Apache process is running" + echo "# TYPE apache_process_running gauge" + echo "apache_process_running 1" + + # Get all Apache PIDs + apache_pids=$(pgrep -x "$APACHE_PROC" 2>/dev/null || pidof "$APACHE_PROC" 2>/dev/null || echo "") + + # Count workers (total processes minus master) + worker_count=$(echo "$apache_pids" | wc -w) + if [[ $worker_count -gt 0 ]]; then + worker_count=$((worker_count - 1)) # Subtract master + fi + + echo "# HELP apache_workers_count Number of Apache worker processes" + echo "# TYPE apache_workers_count gauge" + echo "apache_workers_count $worker_count" + + # Calculate total memory usage (RSS in bytes) + total_memory=0 + total_cpu=0 + total_fds=0 + total_threads=0 + + for pid in $apache_pids; do + if [[ -d "/proc/$pid" ]]; then + # Memory (RSS in KB from /proc/pid/status, convert to bytes) + local rss + rss=$(grep -m1 'VmRSS:' "/proc/$pid/status" 2>/dev/null | awk '{print $2}' || echo "0") + total_memory=$((total_memory + rss * 1024)) + + # CPU time (from /proc/pid/stat - utime + stime in jiffies) + local stat_line utime stime + if stat_line=$(cat "/proc/$pid/stat" 2>/dev/null); then + utime=$(echo "$stat_line" | awk '{print $14}') + stime=$(echo "$stat_line" | awk '{print $15}') + total_cpu=$((total_cpu + utime + stime)) + fi + + # Open file descriptors + local fds + fds=$(ls -1 "/proc/$pid/fd" 2>/dev/null | wc -l || echo "0") + total_fds=$((total_fds + fds)) + + # Threads + local threads + threads=$(grep -c '^Threads:' "/proc/$pid/status" 2>/dev/null || true) + if [[ "$threads" -eq 0 ]]; then + threads=$(grep 'Threads:' "/proc/$pid/status" 
2>/dev/null | awk '{print $2}' || echo "1") + fi + total_threads=$((total_threads + threads)) + fi + done + + # Convert CPU jiffies to seconds (assuming 100 Hz) + local cpu_seconds + cpu_seconds=$(echo "scale=2; $total_cpu / 100" | bc 2>/dev/null || echo "$total_cpu") + + cat </dev/null || echo "0") + # starttime is in jiffies since boot + start_seconds=$(awk "BEGIN {printf \"%.0f\", $(cat /proc/uptime | awk '{print $1}') - ($starttime / 100)}") + local now_epoch + now_epoch=$(date +%s) + local process_start=$((now_epoch - start_seconds)) + echo "apache_process_start_time_seconds $process_start" + else + echo "apache_process_start_time_seconds 0" + fi + + # Get max open files limit + if [[ -f "/proc/$apache_master_pid/limits" ]]; then + local max_fds + max_fds=$(grep 'Max open files' "/proc/$apache_master_pid/limits" 2>/dev/null | awk '{print $4}' || echo "0") + echo "" + echo "# HELP apache_process_max_fds Maximum number of open file descriptors" + echo "# TYPE apache_process_max_fds gauge" + echo "apache_process_max_fds $max_fds" + fi +} + +######################### +### Config Metrics ### +######################### + +collect_config_metrics() { + if [[ ! 
-f "$APACHE_CONF" ]]; then + echo "# Apache config not found at $APACHE_CONF" + return + fi + + local max_request_workers keepalive_timeout keepalive_enabled + local mpm_type + + # Parse MaxRequestWorkers (or MaxClients for older Apache) + max_request_workers=$(grep -rihE '^\s*MaxRequestWorkers' "$APACHE_CONF" "$CONF_D_DIR" "$SITES_DIR" 2>/dev/null | head -1 | awk '{print $2}' || echo "") + if [[ -z "$max_request_workers" ]]; then + max_request_workers=$(grep -rihE '^\s*MaxClients' "$APACHE_CONF" "$CONF_D_DIR" "$SITES_DIR" 2>/dev/null | head -1 | awk '{print $2}' || echo "0") + fi + max_request_workers="${max_request_workers:-0}" + + # Parse KeepAliveTimeout + keepalive_timeout=$(grep -rihE '^\s*KeepAliveTimeout' "$APACHE_CONF" "$CONF_D_DIR" 2>/dev/null | head -1 | awk '{print $2}' || echo "0") + keepalive_timeout="${keepalive_timeout:-0}" + + # Check KeepAlive on/off + keepalive_enabled=$(grep -rihE '^\s*KeepAlive\s' "$APACHE_CONF" "$CONF_D_DIR" 2>/dev/null | head -1 | awk '{print tolower($2)}' || echo "on") + if [[ "$keepalive_enabled" == "on" ]]; then + keepalive_enabled=1 + else + keepalive_enabled=0 + fi + + # Detect MPM type + mpm_type="unknown" + if [[ -n "$APACHECTL" ]]; then + local modules_list + modules_list=$($APACHECTL -M 2>/dev/null || echo "") + if echo "$modules_list" | grep -q 'mpm_event_module'; then + mpm_type="event" + elif echo "$modules_list" | grep -q 'mpm_worker_module'; then + mpm_type="worker" + elif echo "$modules_list" | grep -q 'mpm_prefork_module'; then + mpm_type="prefork" + fi + fi + + cat </dev/null | wc -l) + elif [[ -d "$CONF_D_DIR" ]]; then + vhost_count=$(find "$CONF_D_DIR" -name "*.conf" -type f 2>/dev/null | wc -l) + fi + + echo "" + echo "# HELP apache_config_vhosts_total Number of configured virtual hosts" + echo "# TYPE apache_config_vhosts_total gauge" + echo "apache_config_vhosts_total $vhost_count" + + # Parse ServerLimit if available + local server_limit + server_limit=$(grep -rihE '^\s*ServerLimit' "$APACHE_CONF" 
"$CONF_D_DIR" 2>/dev/null | head -1 | awk '{print $2}' || echo "0") + if [[ "$server_limit" != "0" ]] && [[ -n "$server_limit" ]]; then + echo "" + echo "# HELP apache_config_server_limit ServerLimit setting" + echo "# TYPE apache_config_server_limit gauge" + echo "apache_config_server_limit $server_limit" + fi + + # Parse Timeout + local timeout_val + timeout_val=$(grep -rihE '^\s*Timeout\s' "$APACHE_CONF" "$CONF_D_DIR" 2>/dev/null | head -1 | awk '{print $2}' || echo "0") + if [[ "$timeout_val" != "0" ]] && [[ -n "$timeout_val" ]]; then + echo "" + echo "# HELP apache_config_timeout Timeout setting in seconds" + echo "# TYPE apache_config_timeout gauge" + echo "apache_config_timeout $timeout_val" + fi +} + +######################### +### Access Log Metrics ### +######################### + +collect_access_log_metrics() { + if [[ ! -f "$ACCESS_LOG" ]] || [[ ! -r "$ACCESS_LOG" ]]; then + echo "# Access log not readable at $ACCESS_LOG" + return + fi + + local now + now=$(date +%s) + + # Only parse logs every LOG_PARSE_INTERVAL seconds + if [[ -f "$STATE_DIR/last_parse" ]]; then + LAST_LOG_PARSE=$(cat "$STATE_DIR/last_parse") + fi + + if [[ $((now - LAST_LOG_PARSE)) -lt $LOG_PARSE_INTERVAL ]] && [[ -f "$STATE_DIR/log_metrics" ]]; then + cat "$STATE_DIR/log_metrics" + return + fi + + echo "$now" > "$STATE_DIR/last_parse" + + # Parse access log for status codes and other metrics + # Assuming combined log format: $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" + + local log_data + log_data=$(tail -n "$LOG_TAIL_LINES" "$ACCESS_LOG" 2>/dev/null || echo "") + + if [[ -z "$log_data" ]]; then + echo "# No log data available" + return + fi + + local metrics_output="" + + # Count by status code + local status_counts + status_counts=$(echo "$log_data" | awk '{print $9}' | { grep -E '^[0-9]{3}$' || true; } | sort | uniq -c | sort -rn) + + metrics_output+="# HELP apache_http_requests_by_status_total HTTP requests by 
status code (from last $LOG_TAIL_LINES log lines) +# TYPE apache_http_requests_by_status_total gauge +" + + # Initialize counters for status code groups + local count_1xx=0 count_2xx=0 count_3xx=0 count_4xx=0 count_5xx=0 + + while read -r count status; do + if [[ -n "$status" ]] && [[ -n "$count" ]]; then + metrics_output+="apache_http_requests_by_status_total{status=\"$status\"} $count +" + # Aggregate by category + case "${status:0:1}" in + 1) count_1xx=$((count_1xx + count)) ;; + 2) count_2xx=$((count_2xx + count)) ;; + 3) count_3xx=$((count_3xx + count)) ;; + 4) count_4xx=$((count_4xx + count)) ;; + 5) count_5xx=$((count_5xx + count)) ;; + esac + fi + done <<< "$status_counts" + + metrics_output+=" +# HELP apache_http_requests_by_status_class_total HTTP requests by status class +# TYPE apache_http_requests_by_status_class_total gauge +apache_http_requests_by_status_class_total{class=\"1xx\"} $count_1xx +apache_http_requests_by_status_class_total{class=\"2xx\"} $count_2xx +apache_http_requests_by_status_class_total{class=\"3xx\"} $count_3xx +apache_http_requests_by_status_class_total{class=\"4xx\"} $count_4xx +apache_http_requests_by_status_class_total{class=\"5xx\"} $count_5xx +" + + # Calculate total bytes sent + local total_bytes + total_bytes=$(echo "$log_data" | awk '{sum += $10} END {print sum+0}') + + metrics_output+=" +# HELP apache_http_response_bytes_total Total bytes sent in responses (from last $LOG_TAIL_LINES log lines) +# TYPE apache_http_response_bytes_total gauge +apache_http_response_bytes_total $total_bytes +" + + # Count requests by method + local method_counts + method_counts=$(echo "$log_data" | awk -F'"' '{print $2}' | awk '{print $1}' | { grep -E '^(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)$' || true; } | sort | uniq -c) + + metrics_output+=" +# HELP apache_http_requests_by_method_total HTTP requests by method (from last $LOG_TAIL_LINES log lines) +# TYPE apache_http_requests_by_method_total gauge +" + + while read -r count method; do + if 
[[ -n "$method" ]] && [[ -n "$count" ]]; then + metrics_output+="apache_http_requests_by_method_total{method=\"$method\"} $count +" + fi + done <<< "$method_counts" + + # Count unique IPs + local unique_ips + unique_ips=$(echo "$log_data" | awk '{print $1}' | sort -u | wc -l) + + metrics_output+=" +# HELP apache_http_unique_clients Unique client IPs (from last $LOG_TAIL_LINES log lines) +# TYPE apache_http_unique_clients gauge +apache_http_unique_clients $unique_ips +" + + # Top URIs (for potential abuse detection) + local top_uris + top_uris=$(echo "$log_data" | awk -F'"' '{print $2}' | awk '{print $2}' | { grep -v '^-$' || true; } | sort | uniq -c | sort -rn | head -5) + + metrics_output+=" +# HELP apache_http_top_uri_requests_total Top requested URIs (from last $LOG_TAIL_LINES log lines) +# TYPE apache_http_top_uri_requests_total gauge +" + + local rank=1 + while read -r count uri; do + if [[ -n "$uri" ]] && [[ -n "$count" ]]; then + # Truncate URI and escape quotes + uri="${uri:0:100}" + uri="${uri//\"/\\\"}" + metrics_output+="apache_http_top_uri_requests_total{uri=\"$uri\",rank=\"$rank\"} $count +" + rank=$((rank + 1)) + fi + done <<< "$top_uris" + + # Count requests in time windows + local recent_requests + recent_requests=$(echo "$log_data" | wc -l) + + metrics_output+=" +# HELP apache_http_requests_in_sample Total requests in sample window +# TYPE apache_http_requests_in_sample gauge +apache_http_requests_in_sample $recent_requests +" + + # Save metrics for caching + echo "$metrics_output" > "$STATE_DIR/log_metrics" + echo "$metrics_output" +} + +######################### +### Error Log Metrics ### +######################### + +collect_error_log_metrics() { + if [[ ! -f "$ERROR_LOG" ]] || [[ ! 
-r "$ERROR_LOG" ]]; then + echo "# Error log not readable at $ERROR_LOG" + return + fi + + # Count errors by level from last 1000 lines + local log_data + log_data=$(tail -n 1000 "$ERROR_LOG" 2>/dev/null || echo "") + + if [[ -z "$log_data" ]]; then + return + fi + + local emerg_count alert_count crit_count error_count warn_count notice_count info_count + + emerg_count=$(echo "$log_data" | grep -c '\[emerg\]' 2>/dev/null) || emerg_count=0 + alert_count=$(echo "$log_data" | grep -c '\[alert\]' 2>/dev/null) || alert_count=0 + crit_count=$(echo "$log_data" | grep -c '\[crit\]' 2>/dev/null) || crit_count=0 + error_count=$(echo "$log_data" | grep -c '\[error\]' 2>/dev/null) || error_count=0 + warn_count=$(echo "$log_data" | grep -c '\[warn\]' 2>/dev/null) || warn_count=0 + notice_count=$(echo "$log_data" | grep -c '\[notice\]' 2>/dev/null) || notice_count=0 + info_count=$(echo "$log_data" | grep -c '\[info\]' 2>/dev/null) || info_count=0 + + cat </dev/null || echo "0") + log_mtime=$(stat -c %Y "$ERROR_LOG" 2>/dev/null || echo "0") + now=$(date +%s) + log_age=$((now - log_mtime)) + + cat </dev/null | grep -v '#' | awk '{print $2}' | tr -d '"' | sort -u || echo "") + + if [[ -z "$cert_files" ]]; then + echo "# No SSL certificates found in Apache config" + return + fi + + echo "# HELP apache_ssl_certificate_expiry_days Days until SSL certificate expires" + echo "# TYPE apache_ssl_certificate_expiry_days gauge" + echo "# HELP apache_ssl_certificate_expiry_timestamp Unix timestamp when certificate expires" + echo "# TYPE apache_ssl_certificate_expiry_timestamp gauge" + + while read -r cert_file; do + if [[ -f "$cert_file" ]]; then + local expiry_date expiry_epoch now_epoch days_left cn + + expiry_date=$(openssl x509 -enddate -noout -in "$cert_file" 2>/dev/null | cut -d= -f2 || echo "") + if [[ -n "$expiry_date" ]]; then + expiry_epoch=$(date -d "$expiry_date" +%s 2>/dev/null || echo "0") + now_epoch=$(date +%s) + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + + # Get 
CN from certificate + cn=$(openssl x509 -subject -noout -in "$cert_file" 2>/dev/null | grep -oP 'CN\s*=\s*\K[^,/]+' || basename "$cert_file") + cn="${cn// /_}" + + echo "apache_ssl_certificate_expiry_days{certificate=\"$cn\",file=\"$cert_file\"} $days_left" + echo "apache_ssl_certificate_expiry_timestamp{certificate=\"$cn\",file=\"$cert_file\"} $expiry_epoch" + fi + fi + done <<< "$cert_files" + return + fi + + # Check specified domains via network + echo "# HELP apache_ssl_certificate_expiry_days Days until SSL certificate expires" + echo "# TYPE apache_ssl_certificate_expiry_days gauge" + echo "# HELP apache_ssl_certificate_expiry_timestamp Unix timestamp when certificate expires" + echo "# TYPE apache_ssl_certificate_expiry_timestamp gauge" + + IFS=',' read -ra domain_array <<< "$domains" + for domain in "${domain_array[@]}"; do + domain=$(echo "$domain" | tr -d ' ') + if [[ -n "$domain" ]]; then + local expiry_date expiry_epoch now_epoch days_left + + expiry_date=$(echo | openssl s_client -servername "$domain" -connect "$domain:443" 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || echo "") + + if [[ -n "$expiry_date" ]]; then + expiry_epoch=$(date -d "$expiry_date" +%s 2>/dev/null || echo "0") + now_epoch=$(date +%s) + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + + echo "apache_ssl_certificate_expiry_days{domain=\"$domain\"} $days_left" + echo "apache_ssl_certificate_expiry_timestamp{domain=\"$domain\"} $expiry_epoch" + else + echo "apache_ssl_certificate_expiry_days{domain=\"$domain\"} -1" + fi + fi + done +} + +######################### +### Proxy/Upstream Metrics ### +######################### + +collect_upstream_metrics() { + # Check for proxy/balancer configurations + local proxy_passes + proxy_passes=$(grep -rh 'ProxyPass\s' "$SITES_DIR" "$CONF_D_DIR" "$APACHE_CONF" 2>/dev/null | grep -v '#' | grep -v 'ProxyPassReverse' | awk '{print $2}' | sort -u || echo "") + + local balancers + balancers=$(grep -rhoE 
'balancer://[a-zA-Z0-9_-]+' "$SITES_DIR" "$CONF_D_DIR" "$APACHE_CONF" 2>/dev/null | sort -u || echo "") + + if [[ -z "$proxy_passes" ]] && [[ -z "$balancers" ]]; then + return + fi + + local proxy_count=0 + if [[ -n "$proxy_passes" ]]; then + proxy_count=$(echo "$proxy_passes" | wc -l) + fi + + local balancer_count=0 + if [[ -n "$balancers" ]]; then + balancer_count=$(echo "$balancers" | wc -l) + fi + + cat </dev/null | grep -c 'BalancerMember' 2>/dev/null) || member_count=0 + echo "apache_balancer_members_total{balancer=\"$name\"} $member_count" + fi + done <<< "$balancers" + fi +} + +######################### +### Version Metrics ### +######################### + +collect_version_metrics() { + local version="unknown" + + if [[ -n "$APACHE_BIN" ]]; then + version=$($APACHE_BIN -v 2>&1 | grep -oP 'Apache/\K[0-9.]+' || echo "unknown") + fi + + echo "# HELP apache_version_info Apache version information" + echo "# TYPE apache_version_info gauge" + echo "apache_version_info{version=\"$version\"} 1" + + # Check loaded modules + if [[ -n "$APACHECTL" ]]; then + local modules_output + modules_output=$($APACHECTL -M 2>/dev/null || echo "") + + local has_ssl has_proxy has_proxy_http has_proxy_balancer has_rewrite + local has_headers has_deflate has_expires has_status has_http2 + + has_ssl=$(echo "$modules_output" | grep -q 'ssl_module' && echo "1" || echo "0") + has_proxy=$(echo "$modules_output" | grep -q 'proxy_module' && echo "1" || echo "0") + has_proxy_http=$(echo "$modules_output" | grep -q 'proxy_http_module' && echo "1" || echo "0") + has_proxy_balancer=$(echo "$modules_output" | grep -q 'proxy_balancer_module' && echo "1" || echo "0") + has_rewrite=$(echo "$modules_output" | grep -q 'rewrite_module' && echo "1" || echo "0") + has_headers=$(echo "$modules_output" | grep -q 'headers_module' && echo "1" || echo "0") + has_deflate=$(echo "$modules_output" | grep -q 'deflate_module' && echo "1" || echo "0") + has_expires=$(echo "$modules_output" | grep -q 
'expires_module' && echo "1" || echo "0") + has_status=$(echo "$modules_output" | grep -q 'status_module' && echo "1" || echo "0") + has_http2=$(echo "$modules_output" | grep -q 'http2_module' && echo "1" || echo "0") + + cat </dev/null || echo "0") + ulimit_n=$(ulimit -n 2>/dev/null || echo "0") + + cat </dev/null | awk '{print $1}' || echo "0") + + echo "" + echo "# HELP apache_system_open_files Current system-wide open files" + echo "# TYPE apache_system_open_files gauge" + echo "apache_system_open_files $open_files" +} + +######################### +### Collect All Metrics ### +######################### + +collect_all_metrics() { + local hostname + hostname=$(hostname -f 2>/dev/null || hostname) + + cat </dev/null || { + log "Server error, restarting in 5 seconds..." + sleep 5 + } + done +} + +######################### +### Output ### +######################### + +write_output() { + local metrics + metrics=$(collect_all_metrics) + + if [[ -n "$OUTPUT_FILE" ]]; then + local tmp_file="${OUTPUT_FILE}.$$" + echo "$metrics" > "$tmp_file" + mv "$tmp_file" "$OUTPUT_FILE" + else + echo "$metrics" + fi +} + +######################### +### Main ### +######################### + +main() { + if [[ "${1:-}" == "--handle-request" ]]; then + handle_request + exit 0 + fi + + parse_args "$@" + setup + + if [[ "$HTTP_MODE" == true ]]; then + start_server + elif [[ -n "$OUTPUT_FILE" ]]; then + write_output + else + collect_all_metrics + fi +} + +main "$@" diff --git a/apache-security-auditor.sh b/apache-security-auditor.sh new file mode 100644 index 0000000..cc07e52 --- /dev/null +++ b/apache-security-auditor.sh @@ -0,0 +1,658 @@ +#!/usr/bin/env bash + +######################################################################################### +#### apache-security-auditor.sh — Audit Apache httpd configuration for security issues#### +#### Checks server info, TLS, headers, directories, modules, and file permissions #### +#### Requires: bash 4+, root access #### +#### #### +#### 
Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### sudo ./apache-security-auditor.sh --full #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +# ── Severity counters ──────────────────────────────────────────────── +TOTAL_CRIT=0 +TOTAL_WARN=0 +TOTAL_INFO=0 +TOTAL_OK=0 + +flag_crit() { ((TOTAL_CRIT++)) || true; } +flag_warn() { ((TOTAL_WARN++)) || true; } +flag_info() { ((TOTAL_INFO++)) || true; } +flag_ok() { ((TOTAL_OK++)) || true; } + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +APACHE_CONF="" +APACHECTL="" +APACHE_CONF_DIR="" +APACHE_RUN_USER="" +PLATFORM="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── Platform detection ─────────────────────────────────────────────── +detect_platform() { + if [[ -n "$APACHE_CONF" ]]; then + if command -v apache2ctl &>/dev/null; then + 
APACHECTL="apache2ctl" + APACHE_CONF_DIR="$(dirname "$APACHE_CONF")" + APACHE_RUN_USER="www-data" + PLATFORM="Debian/Ubuntu (apache2)" + elif command -v httpd &>/dev/null; then + APACHECTL="httpd" + APACHE_CONF_DIR="$(dirname "$APACHE_CONF")" + APACHE_RUN_USER="apache" + PLATFORM="RHEL/CentOS (httpd)" + else + die "Cannot find apache2ctl or httpd" + fi + return + fi + + if command -v apache2ctl &>/dev/null && [[ -f /etc/apache2/apache2.conf ]]; then + APACHECTL="apache2ctl" + APACHE_CONF="/etc/apache2/apache2.conf" + APACHE_CONF_DIR="/etc/apache2" + APACHE_RUN_USER="www-data" + PLATFORM="Debian/Ubuntu (apache2)" + elif command -v httpd &>/dev/null && [[ -f /etc/httpd/conf/httpd.conf ]]; then + APACHECTL="httpd" + APACHE_CONF="/etc/httpd/conf/httpd.conf" + APACHE_CONF_DIR="/etc/httpd" + APACHE_RUN_USER="apache" + PLATFORM="RHEL/CentOS (httpd)" + else + die "Cannot detect Apache installation — use --config to specify config path" + fi + + verbose "Platform: ${PLATFORM}" + verbose "Config: ${APACHE_CONF}" + verbose "Config dir: ${APACHE_CONF_DIR}" +} + +# ── Get all config files ───────────────────────────────────────────── +get_config_files() { + local files=() + + if [[ -f "$APACHE_CONF" ]]; then + files+=("$APACHE_CONF") + fi + + local included + included=$($APACHECTL -t -D DUMP_INCLUDES 2>/dev/null | grep -oP '\(\*\) \K.*|^ *\K/.*' || true) + if [[ -n "$included" ]]; then + while IFS= read -r f; do + [[ -f "$f" ]] && files+=("$f") + done <<< "$included" + fi + + for d in "${APACHE_CONF_DIR}/sites-enabled" "${APACHE_CONF_DIR}/conf-enabled" \ + "${APACHE_CONF_DIR}/conf.d" "${APACHE_CONF_DIR}/conf.modules.d"; do + if [[ -d "$d" ]]; then + while IFS= read -r f; do + files+=("$f") + done < <(find "$d" -name '*.conf' -type f 2>/dev/null) + fi + done + + printf '%s\n' "${files[@]}" | sort -u +} + +# ── Search across all config files ─────────────────────────────────── +search_config() { + local pattern="$1" + local config_files + config_files=$(get_config_files) + + 
while IFS= read -r f; do + [[ -z "$f" ]] && continue + grep -iP "$pattern" "$f" 2>/dev/null || true + done <<< "$config_files" +} + +# ── Table header ───────────────────────────────────────────────────── +print_table_header() { + printf " %-32s %-14s %s\n" "CHECK" "STATUS" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..65})" +} + +# ── Table row ──────────────────────────────────────────────────────── +print_row() { + local check="$1" status="$2" severity="$3" + local color="" + case "$severity" in + CRITICAL) color="$RED"; flag_crit ;; + WARN) color="$YELLOW"; flag_warn ;; + INFO) color="$CYAN"; flag_info ;; + OK) color="$GREEN"; flag_ok ;; + esac + printf " %-32s %-14s %b%s%b\n" "$check" "$status" "$color" "$severity" "$RESET" +} + +# ══════════════════════════════════════════════════════════════════════ +# SERVER INFO AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_server_info() { + log "Auditing server information exposure..." + echo "" + print_table_header + + # ServerTokens + local tokens + tokens=$(search_config '^\s*ServerTokens' | tail -1 | awk '{print $2}') + if [[ -z "$tokens" ]]; then + print_row "ServerTokens" "Full (default)" "CRITICAL" + elif [[ "${tokens,,}" == "prod" || "${tokens,,}" == "productonly" ]]; then + print_row "ServerTokens" "Prod" "OK" + elif [[ "${tokens,,}" == "major" ]]; then + print_row "ServerTokens" "Major" "WARN" + else + print_row "ServerTokens" "$tokens" "CRITICAL" + fi + + # ServerSignature + local sig + sig=$(search_config '^\s*ServerSignature' | tail -1 | awk '{print $2}') + if [[ -z "$sig" ]]; then + print_row "ServerSignature" "On (default)" "CRITICAL" + elif [[ "${sig,,}" == "off" ]]; then + print_row "ServerSignature" "Off" "OK" + else + print_row "ServerSignature" "$sig" "CRITICAL" + fi + + # TraceEnable + local trace + trace=$(search_config '^\s*TraceEnable' | tail -1 | awk '{print $2}') + if [[ -z "$trace" ]]; then + print_row "TraceEnable" "On (default)" "WARN" + elif [[ 
"${trace,,}" == "off" ]]; then + print_row "TraceEnable" "Off" "OK" + else + print_row "TraceEnable" "$trace" "WARN" + fi + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# TLS AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_tls() { + log "Auditing TLS configuration..." + echo "" + print_table_header + + # Check if mod_ssl is loaded + if ! $APACHECTL -M 2>/dev/null | grep -q ssl_module; then + print_row "mod_ssl" "not loaded" "INFO" + echo "" + return + fi + print_row "mod_ssl" "loaded" "OK" + + # SSLProtocol + local proto + proto=$(search_config '^\s*SSLProtocol' | tail -1) + if [[ -z "$proto" ]]; then + print_row "SSLProtocol" "not set (default)" "WARN" + else + if echo "$proto" | grep -iqP '(\+SSLv3|\+TLSv1\.0|\+TLSv1[^.]|[^-]TLSv1[^.23])'; then + print_row "SSLProtocol (legacy)" "enabled" "CRITICAL" + elif echo "$proto" | grep -iqP '(\+TLSv1\.1|[^-]TLSv1\.1)'; then + print_row "SSLProtocol (TLSv1.1)" "enabled" "CRITICAL" + else + print_row "SSLProtocol" "modern only" "OK" + fi + fi + + # SSLCipherSuite + local ciphers + ciphers=$(search_config '^\s*SSLCipherSuite' | tail -1) + if [[ -z "$ciphers" ]]; then + print_row "SSLCipherSuite" "not set" "WARN" + else + print_row "SSLCipherSuite" "configured" "OK" + fi + + # SSLHonorCipherOrder + local honor + honor=$(search_config '^\s*SSLHonorCipherOrder' | tail -1 | awk '{print $2}') + if [[ -z "$honor" ]]; then + print_row "SSLHonorCipherOrder" "not set" "WARN" + elif [[ "${honor,,}" == "on" ]]; then + print_row "SSLHonorCipherOrder" "on" "OK" + else + print_row "SSLHonorCipherOrder" "$honor" "WARN" + fi + + # HSTS + local hsts + hsts=$(search_config 'Strict-Transport-Security') + if [[ -z "$hsts" ]]; then + print_row "HSTS Header" "missing" "WARN" + else + print_row "HSTS Header" "set" "OK" + fi + + # OCSP Stapling + local ocsp + ocsp=$(search_config '^\s*SSLUseStapling' | tail -1 | awk '{print $2}') + if [[ -z "$ocsp" ]]; then + 
print_row "OCSP Stapling" "not configured" "WARN" + elif [[ "${ocsp,,}" == "on" ]]; then + print_row "OCSP Stapling" "on" "OK" + else + print_row "OCSP Stapling" "$ocsp" "WARN" + fi + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SECURITY HEADERS AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_headers() { + log "Auditing security headers..." + echo "" + print_table_header + + # Check if mod_headers is loaded + if ! $APACHECTL -M 2>/dev/null | grep -q headers_module; then + print_row "mod_headers" "not loaded" "WARN" + echo "" + return + fi + + local headers=( + "X-Content-Type-Options" + "X-Frame-Options" + "Content-Security-Policy" + "Referrer-Policy" + "Permissions-Policy" + ) + + for header in "${headers[@]}"; do + local found + found=$(search_config "Header.*set.*${header}") + if [[ -n "$found" ]]; then + print_row "$header" "set" "OK" + else + print_row "$header" "missing" "WARN" + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# DIRECTORY AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_directories() { + log "Auditing directory and file restrictions..." 
+ echo "" + print_table_header + + # Options Indexes (enabled = bad) + local indexes_on + indexes_on=$(search_config '^\s*Options\b.*\bIndexes\b' | grep -v '\-Indexes' || true) + if [[ -n "$indexes_on" ]]; then + print_row "Options Indexes" "enabled" "WARN" + else + print_row "Options Indexes" "disabled" "OK" + fi + + # AllowOverride All + local override_all + override_all=$(search_config '^\s*AllowOverride\s+All' || true) + if [[ -n "$override_all" ]]; then + print_row "AllowOverride" "All (permissive)" "WARN" + else + print_row "AllowOverride" "restricted" "OK" + fi + + # Sensitive file protection (.git, .env, .htpasswd) + local sensitive_protection + sensitive_protection=$(search_config '(FilesMatch|Files|Directory).*(\\.git|\\.env|\\.htpasswd)' || true) + if [[ -n "$sensitive_protection" ]]; then + print_row "Sensitive file blocking" "configured" "OK" + else + print_row "Sensitive file blocking" "not configured" "CRITICAL" + fi + + # Check for root directory restriction + local root_deny + root_deny=$(search_config '^\s*Require\s+all\s+denied' || true) + if [[ -n "$root_deny" ]]; then + print_row "Root directory denied" "yes" "OK" + else + print_row "Root directory denied" "not found" "WARN" + fi + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# MODULES AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_modules() { + log "Auditing modules..." 
+ echo "" + print_table_header + + local loaded_modules + loaded_modules=$($APACHECTL -M 2>/dev/null || true) + + # mod_security + if echo "$loaded_modules" | grep -q "security2_module"; then + print_row "mod_security" "loaded" "OK" + else + print_row "mod_security" "not loaded" "INFO" + fi + + # mod_status + if echo "$loaded_modules" | grep -q "status_module"; then + local status_restricted + status_restricted=$(search_config '(/dev/null) + if [[ "$conf_perms" -le 644 ]]; then + print_row "Config ($APACHE_CONF)" "$conf_perms" "OK" + else + print_row "Config ($APACHE_CONF)" "$conf_perms" "WARN" + fi + fi + + # .htpasswd files + local htpasswd_files + htpasswd_files=$(find "$APACHE_CONF_DIR" /var/www -name '.htpasswd' -type f 2>/dev/null || true) + if [[ -n "$htpasswd_files" ]]; then + while IFS= read -r f; do + local perms + perms=$(stat -c '%a' "$f" 2>/dev/null) + if [[ "$perms" -le 640 ]]; then + print_row ".htpasswd ($f)" "$perms" "OK" + else + print_row ".htpasswd ($f)" "$perms" "WARN" + fi + done <<< "$htpasswd_files" + else + verbose "No .htpasswd files found" + fi + + # Document root world-writable check + local docroots + docroots=$(search_config '^\s*DocumentRoot' | awk '{print $2}' | tr -d '"' | sort -u) + if [[ -n "$docroots" ]]; then + while IFS= read -r dr; do + [[ -z "$dr" || ! 
-d "$dr" ]] && continue + local dr_perms + dr_perms=$(stat -c '%a' "$dr" 2>/dev/null) + if [[ "${dr_perms: -1}" -ge 6 ]]; then + print_row "Docroot ($dr)" "${dr_perms} (world-writable)" "CRITICAL" + else + print_row "Docroot ($dr)" "$dr_perms" "OK" + fi + done <<< "$docroots" + fi + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + local elapsed + elapsed=$(( $(date +%s) - START_TIME )) + + echo "" + echo " ══════════════════════════════════════════" + echo " Apache Security Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET" + printf " %-20s %b%d%b\n" "INFO:" "$CYAN" "$TOTAL_INFO" "$RESET" + printf " %-20s %b%d%b\n" "OK:" "$GREEN" "$TOTAL_OK" "$RESET" + echo " ──────────────────────────────────────────" + printf " Completed in %ds\n" "$elapsed" + echo "" + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)" + echo "" + echo " Top recommendations:" + echo " • Set ServerTokens Prod and ServerSignature Off" + echo " • Disable SSLv3, TLSv1, and TLSv1.1" + echo " • Restrict mod_status to localhost with Require ip 127.0.0.1" + echo " • Block access to .git, .env, and .htpasswd files" + echo " • Fix world-writable document root permissions" + echo "" + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)" + echo "" + echo " Suggestions:" + echo " • Add security headers (CSP, X-Frame-Options, HSTS)" + echo " • Enable OCSP stapling for TLS" + echo " • Disable mod_info and mod_autoindex in production" + echo " • Set TraceEnable Off" + echo "" + else + echo -e " ${GREEN}All checks passed${RESET}" + echo "" + fi +} + +# 
══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2 + exit 1 + fi + + RUN_MODE="${modes[*]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + detect_platform + + START_TIME=$(date +%s) + + echo "" + echo -e "${BOLD}Apache Security Auditor${RESET}" + echo -e "Host: $(hostname)" + echo -e "Config: ${APACHE_CONF}" + echo -e "Platform: ${PLATFORM}" + echo -e "Mode: ${RUN_MODE}" + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + for mode in $RUN_MODE; do + case "$mode" in + server-info) audit_server_info ;; + tls) audit_tls ;; + headers) audit_headers ;; + directories) audit_directories ;; + modules) audit_modules ;; + permissions) audit_permissions ;; + esac + done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + exit 2 + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/apt-updates-exporter.sh b/apt-updates-exporter.sh new file mode 100644 index 0000000..9b5c081 --- /dev/null +++ b/apt-updates-exporter.sh @@ -0,0 +1,405 @@ +#!/bin/bash + +############################################################# +#### APT Package Updates Exporter for Prometheus #### +#### Expose pending apt updates as Prometheus metrics #### +#### for Debian and Ubuntu servers #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.7 #### +#### #### +#### Usage: ./apt-updates-exporter.sh #### +############################################################# + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +parse_args "$@" + +# Configuration variables with default values +AUTO_UPDATE_ENABLED="${AUTO_UPDATE_ENABLED:-false}" # Enable automatic package updates 
+AUTO_REMOVE_ENABLED="${AUTO_REMOVE_ENABLED:-false}" # Enable automatic removal of orphaned packages +APT_GET_CMD="${APT_GET_CMD:-/usr/bin/apt-get}" # Path to apt-get command +AWK_CMD="${AWK_CMD:-/usr/bin/awk}" # Path to awk command +CRON_INTERVAL="${CRON_INTERVAL:-0 0 * * *}" # Cron schedule (daily at midnight) +GREP_CMD="${GREP_CMD:-/usr/bin/grep}" # Path to grep command +METRICS_DIR="${METRICS_DIR:-/var/lib/node_exporter}" # Directory for Prometheus metrics files +SORT_CMD="${SORT_CMD:-/usr/bin/sort}" # Path to sort command +UNIQ_CMD="${UNIQ_CMD:-/usr/bin/uniq}" # Path to uniq command + +# File paths for tracking update state +UPDATES_TIMESTAMP_FILE="$METRICS_DIR/updates_detected" # Tracks when updates were first detected +WAIT_PERIOD_ENABLED="${WAIT_PERIOD_ENABLED:-true}" # Enable waiting period before auto-updates +UPDATED_PACKAGES_FILE="$METRICS_DIR/updated_packages" # List of packages updated in last run +AUTO_REMOVE_FILE="$METRICS_DIR/auto_remove_packages" # List of packages removed in last auto-removal +WAIT_PERIOD_SECONDS=$((3 * 24 * 60 * 60)) # Wait period: 3 days in seconds + +# Safety check: prevent concurrent execution that could cause lock conflicts +if pidof apt apt-get >/dev/null || fuser /var/lib/dpkg/lock /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; then + echo "node_upgrades_pending{origin=\"error\",arch=\"unknown\"} -1" + echo "node_upgradelist{pkgname=\"error\", update_version=\"\", current_version=\"\", origin=\"\"} -1" + echo "node_auto_updates{status=\"error\"} -1" + exit 1 +fi + +#### Setup: Ensure metrics directory exists with proper permissions #### +if [[ ! -d "$METRICS_DIR" ]]; then + # Create metrics directory + mkdir -p "$METRICS_DIR" || { + echo "Failed to create $METRICS_DIR" + exit 1 + } + # Set ownership to prometheus user (try different formats for compatibility) + chown prometheus:prometheus "$METRICS_DIR" 2>/dev/null || chown prometheus. 
"$METRICS_DIR" || { + echo "Failed to set ownership of $METRICS_DIR" + exit 1 + } + # Set appropriate permissions for metrics directory + chmod 755 "$METRICS_DIR" || { + echo "Failed to set permissions on $METRICS_DIR" + exit 1 + } +fi + +#### Setup: Ensure cron job exists for automated execution #### +if ! crontab -l | grep -q "updates.sh"; then + # Add cron job to run this script automatically + echo -e "$(crontab -u root -l)\n$CRON_INTERVAL /usr/local/bin/updates.sh > $METRICS_DIR/updates.prom 2>&1" | crontab -u root - + # Verify the cron job was added successfully + crontab -l | grep -q "updates.sh" || { + echo "Failed to add cron job" + exit 1 + } +fi + +#### Setup: Ensure logrotate configuration exists for state files #### +LOGROTATE_CONFIG="/etc/logrotate.d/node-exporter-metrics" +if [[ ! -f "$LOGROTATE_CONFIG" ]]; then + # Create logrotate configuration for monthly rotation of state files + cat >"$LOGROTATE_CONFIG" </dev/null || true + endscript +} +EOF + # Verify the logrotate configuration is valid + logrotate -d "$LOGROTATE_CONFIG" >/dev/null 2>&1 || { + echo "Failed to create valid logrotate configuration" + rm -f "$LOGROTATE_CONFIG" + exit 1 + } +fi + +#### Function: Count pending package upgrades grouped by origin and architecture #### +get_upgrades() { + # Test apt-get upgrade command and exit on failure + if ! 
$APT_GET_CMD -qq --just-print upgrade >/dev/null 2>&1; then + echo "node_upgrades_pending{origin=\"error\",arch=\"unknown\"} -1" + return 1 + fi + + # Parse apt-get output to extract package info to create Prometheus metrics + $APT_GET_CMD -qq --just-print upgrade | + $AWK_CMD -F '[()]' '/^Inst/ { + sub("^[^ ]+ ", "", $2) # Remove package name from origin field + gsub(" ","",$2) # Remove spaces from origin + sub(/\[|\]/, " ", $2) # Replace brackets with space + print $2 + }' | + $SORT_CMD | # Sort the output + $UNIQ_CMD -c | # Count unique entries + $AWK_CMD '{ + gsub(/\\\\/, "\\\\", $2) # Escape backslashes for Prometheus labels + gsub(/\\/, "\\\\", $2) + gsub(/"/, "\\\"", $2) # Escape quotes for Prometheus labels + gsub(/\[|\]/, "", $3) # Remove brackets from architecture + printf "node_upgrades_pending{origin=\"%s\",arch=\"%s\"} %d\n", $2, $3, $1 + }' +} + +#### Function: Handle automatic package updates with optional wait period #### +handle_auto_updates() { + # Skip if auto-updates are disabled + [[ "$AUTO_UPDATE_ENABLED" != "true" ]] && return + + local should_update=false + + # Check if we should wait before updating (prevents immediate updates on detection) + if [[ "$WAIT_PERIOD_ENABLED" == "true" ]]; then + local current_time detected_time + current_time=$(date +%s) + detected_time=$(cat "$UPDATES_TIMESTAMP_FILE" 2>/dev/null || echo "0") + # Only update if wait period has elapsed + ((current_time - detected_time >= WAIT_PERIOD_SECONDS)) && should_update=true + else + # Update immediately if wait period is disabled + should_update=true + fi + + if [[ "$should_update" == "true" ]]; then + perform_auto_update + # Clear timestamp file after updating (reset wait period) + [[ "$WAIT_PERIOD_ENABLED" == "true" ]] && rm -f "$UPDATES_TIMESTAMP_FILE" + fi +} + +#### Function: Execute automatic package updates and record metrics #### +perform_auto_update() { + # Output Prometheus metric headers + echo '# HELP node_auto_updates Number of packages auto-updated.' 
+ echo '# TYPE node_auto_updates gauge' + + local update_output update_count + # Run apt update and upgrade non-interactively with timeout, capture output + update_output=$(timeout 300 bash -c "DEBIAN_FRONTEND=noninteractive $APT_GET_CMD update >/dev/null 2>&1 && DEBIAN_FRONTEND=noninteractive $APT_GET_CMD -y -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' upgrade 2>&1" || echo "TIMEOUT_ERROR") + # Check for timeout or other errors + if [[ "$update_output" == *"TIMEOUT_ERROR"* ]]; then + echo "node_auto_updates{status=\"timeout\"} -1" + return 1 + fi + + # Count number of packages that were actually unpacked/updated + update_count=$(echo "$update_output" | grep -c "^Unpacking ") + + echo "node_auto_updates{status=\"success\"} $update_count" + # Save list of updated packages with versions for reporting + echo "$update_output" | grep "^Unpacking " | awk '{gsub(/[()]/, "", $3); print $2 " " $3}' >"$UPDATED_PACKAGES_FILE" +} + +#### Function: Generate detailed list of individual packages available for upgrade #### +get_upgrade_list() { + # Test apt-get upgrade command and handle failures + if ! 
$APT_GET_CMD --just-print upgrade >/dev/null 2>&1; then + echo 'node_upgradelist{pkgname="error", update_version="", current_version="", origin=""} -1' + return 1 + fi + + # Parse each package installation line to extract detailed package information + $APT_GET_CMD --just-print upgrade | + $GREP_CMD Inst | # Filter for installation lines + $AWK_CMD '{ + gsub(/\(|\)/, "", $4) # Remove parentheses from version + gsub(/:/, ".", $4) # Replace colons with dots in version + gsub(/\[|\]/, "", $3) # Remove brackets from current version + gsub(/:/, " ", $5) # Replace colons with spaces in origin + + # Escape special characters for Prometheus label values + gsub(/\\/, "\\\\", $2) # Escape backslashes in package name + gsub(/"/, "\\\"", $2) # Escape quotes in package name + gsub(/\\/, "\\\\", $4) # Escape backslashes in version + gsub(/"/, "\\\"", $4) # Escape quotes in version + + # Output Prometheus metric with package details + printf "node_upgradelist{pkgname=\"%s\",update_version=\"%s\", current_version=\"%s\", origin=\"%s\"} 1\n", $2, $4, $3, $5 + }' +} + +#### Function: Get list of packages that can be automatically removed (orphaned) #### +get_auto_remove_list() { + # Test autoremove command with dry-run to see what would be removed + if ! 
$APT_GET_CMD --dry-run autoremove >/dev/null 2>&1; then + echo 'node_autoremove_packages{pkgname="error"} -1' + return 1 + fi + + # Parse dry-run output to find packages that would be removed + $APT_GET_CMD --dry-run autoremove 2>/dev/null | + $GREP_CMD "Remv" | # Filter for removal lines + $AWK_CMD '{ + # Escape special characters for Prometheus labels + gsub(/\\/, "\\\\", $2) # Escape backslashes in package name + gsub(/"/, "\\\"", $2) # Escape quotes in package name + printf "node_autoremove_packages{pkgname=\"%s\"} 1\n", $2 + }' +} + +#### Function: Handle automatic removal of orphaned packages #### +handle_auto_remove() { + # Skip if auto-remove is disabled + [[ "$AUTO_REMOVE_ENABLED" != "true" ]] && return + + perform_auto_remove +} + +#### Function: Execute automatic package removal and record metrics #### +perform_auto_remove() { + # Output Prometheus metric headers + echo '# HELP node_auto_remove Number of packages auto-removed.' + echo '# TYPE node_auto_remove gauge' + + local remove_output remove_count + # Run autoremove non-interactively and capture output + remove_output=$(DEBIAN_FRONTEND=noninteractive $APT_GET_CMD -y autoremove 2>&1) + # Count packages that were actually removed + remove_count=$(echo "$remove_output" | grep -c "^Removing ") + + echo "node_auto_remove{status=\"success\"} $remove_count" + # Save list of removed packages for reporting + echo "$remove_output" | grep "^Removing " | awk '{print $2}' >"$AUTO_REMOVE_FILE" +} + +#### Generate all Prometheus metrics #### +generate_metrics() { + #### Upgrade list metrics #### + upgradelist=$(get_upgrade_list) + echo '# HELP node_upgradelist List of packages for upgrade' + echo '# TYPE node_upgradelist gauge' + if [[ -n "${upgradelist}" ]]; then + echo "${upgradelist}" + else + echo 'node_upgradelist{pkgname="", update_version="", current_version="", origin=""} 0' + fi + + #### Pending upgrades metrics and auto-updates #### + pending_upgrades=$(get_upgrades) + echo '# HELP node_upgrades_pending 
Apt package pending updates by origin.' + echo '# TYPE node_upgrades_pending gauge' + + if [[ -n "$pending_upgrades" ]]; then + printf "%s\n" "$pending_upgrades" + + if [[ ! -f "$UPDATES_TIMESTAMP_FILE" ]]; then + date +%s >"$UPDATES_TIMESTAMP_FILE" + fi + + handle_auto_updates + else + echo 'node_upgrades_pending{origin="", arch=""} 0' + echo '# HELP node_auto_updates Number of packages auto-updated.' + echo '# TYPE node_auto_updates gauge' + echo 'node_auto_updates{status="success"} 0' + rm -f "$UPDATES_TIMESTAMP_FILE" + fi + + #### Auto-removable packages metrics #### + autoremovelist=$(get_auto_remove_list) + echo '# HELP node_autoremove_packages List of packages available for auto-removal' + echo '# TYPE node_autoremove_packages gauge' + if [[ -n "${autoremovelist}" ]]; then + echo "${autoremovelist}" + handle_auto_remove + else + echo 'node_autoremove_packages{pkgname=""} 0' + fi + + #### Packages updated in the last run #### + if [[ -f "$UPDATED_PACKAGES_FILE" ]]; then + echo '# HELP node_updated_packages List of packages updated in last update' + echo '# TYPE node_updated_packages gauge' + while IFS=' ' read -r package version; do + echo "node_updated_packages{package=\"$package\",version=\"$version\"} 1" + done <"$UPDATED_PACKAGES_FILE" + fi + + #### Packages removed in the last auto-removal #### + if [[ -f "$AUTO_REMOVE_FILE" ]]; then + echo '# HELP node_removed_packages List of packages removed in last auto-removal' + echo '# TYPE node_removed_packages gauge' + while IFS= read -r package; do + echo "node_removed_packages{package=\"$package\"} 1" + done <"$AUTO_REMOVE_FILE" + fi + + #### Reboot required check #### + echo '# HELP node_reboot_required Node reboot is required for software updates.' 
+ echo '# TYPE node_reboot_required gauge' + if [[ -f '/run/reboot-required' ]]; then + echo 'node_reboot_required 1' + else + echo 'node_reboot_required 0' + fi +} + +#### Main execution #### +if [[ -n "$OUTPUT_FILE" ]]; then + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + temp_file=$(mktemp "${output_dir}/.apt_updates_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + if [[ "$file_lines" -lt 5 ]]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 +else + generate_metrics +fi diff --git a/artifactory-exporter.sh b/artifactory-exporter.sh new file mode 100755 index 0000000..79011fe --- /dev/null +++ b/artifactory-exporter.sh @@ -0,0 +1,538 @@ +#!/usr/bin/env bash +# +# Artifactory Prometheus Metrics Exporter +# +# Prometheus textfile collector exporter for JFrog Artifactory. +# Uses the Artifactory REST API to collect storage per repo, artifact +# counts, HTTP request stats, GC metrics, DB connections, JVM heap, +# and system health. +# +# Usage: +# ARTIFACTORY_URL="https://artifactory.example.com" ARTIFACTORY_TOKEN="cmVmd..." ./artifactory-exporter.sh +# ARTIFACTORY_URL="https://artifactory.example.com" ARTIFACTORY_TOKEN="cmVmd..." ./artifactory-exporter.sh --textfile +# ARTIFACTORY_URL="https://artifactory.example.com" ARTIFACTORY_TOKEN="cmVmd..." 
./artifactory-exporter.sh --install +# +# Parameters: +# --textfile Write to textfile collector directory +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# ARTIFACTORY_URL Artifactory base URL (required) +# ARTIFACTORY_TOKEN API token or access token (required) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT API request timeout in seconds (default: 10) +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Metrics Exported: +# Core: +# - artifactory_up +# - artifactory_exporter_info{version} +# - artifactory_health_status +# +# Storage (per-repo): +# - artifactory_repo_used_bytes{repo,type} +# - artifactory_repo_artifact_count{repo,type} +# - artifactory_repo_folder_count{repo,type} +# +# Storage (totals): +# - artifactory_storage_total_bytes +# - artifactory_storage_used_bytes +# - artifactory_storage_free_bytes +# - artifactory_storage_binaries_count +# - artifactory_storage_binaries_total_bytes +# - artifactory_storage_optimization_percent +# +# JVM: +# - artifactory_jvm_heap_used_bytes +# - artifactory_jvm_heap_max_bytes +# - artifactory_jvm_heap_free_bytes +# - artifactory_jvm_nonheap_used_bytes +# +# Database: +# - artifactory_db_pool_active +# - artifactory_db_pool_idle +# - artifactory_db_pool_max +# +# HTTP: +# - artifactory_http_requests_total{status} +# +# Garbage Collection: +# - artifactory_gc_duration_seconds +# - artifactory_gc_freed_bytes +# - artifactory_gc_last_run_timestamp +# +# Exporter: +# - artifactory_exporter_duration_seconds +# - artifactory_exporter_last_run_timestamp + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" +ARTIFACTORY_URL="${ARTIFACTORY_URL:-}" +ARTIFACTORY_TOKEN="${ARTIFACTORY_TOKEN:-}" +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" 
+CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +TEXTFILE_MODE=false +OUTPUT="" +START_TIME="" + +# --- Functions --- + +usage() { + cat </dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "ERROR: Missing required commands: ${missing[*]}" >&2 + echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 + exit 1 + fi +} + +validate_config() { + if [[ -z "$ARTIFACTORY_URL" ]]; then + echo "ERROR: ARTIFACTORY_URL environment variable is required" >&2 + exit 1 + fi + if [[ -z "$ARTIFACTORY_TOKEN" ]]; then + echo "ERROR: ARTIFACTORY_TOKEN environment variable is required" >&2 + exit 1 + fi + # Strip trailing slash + ARTIFACTORY_URL="${ARTIFACTORY_URL%/}" +} + +api_get() { + local endpoint="$1" + curl -sf --max-time "$CURL_TIMEOUT" \ + -H "Authorization: Bearer ${ARTIFACTORY_TOKEN}" \ + "${ARTIFACTORY_URL}${endpoint}" 2>/dev/null || echo "" +} + +add_metric() { + local name="$1" + local type="$2" + local help="$3" + local value="$4" + local labels="${5:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name}{${labels}} ${value} +" + else + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name} ${value} +" + fi +} + +add_metric_value() { + local name="$1" + local value="$2" + local labels="${3:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="${name}{${labels}} ${value} +" + else + OUTPUT+="${name} ${value} +" + fi +} + +# Convert Artifactory human-readable size strings to bytes. +# Artifactory returns storage sizes as "1.23 GB", "456.78 MB", etc. 
+parse_size_to_bytes() { + local size_str="$1" + + if [[ -z "$size_str" || "$size_str" == "null" ]]; then + echo "0" + return + fi + + local number unit + number=$(echo "$size_str" | grep -oP '[\d.]+' | head -1) + unit=$(echo "$size_str" | grep -oP '[A-Za-z]+' | head -1) + + if [[ -z "$number" ]]; then + echo "0" + return + fi + + case "${unit^^}" in + BYTES|B) + echo "$number" | awk '{printf "%.0f", $1}' ;; + KB) + echo "$number" | awk '{printf "%.0f", $1 * 1024}' ;; + MB) + echo "$number" | awk '{printf "%.0f", $1 * 1048576}' ;; + GB) + echo "$number" | awk '{printf "%.0f", $1 * 1073741824}' ;; + TB) + echo "$number" | awk '{printf "%.0f", $1 * 1099511627776}' ;; + *) + echo "$number" | awk '{printf "%.0f", $1}' ;; + esac +} + +# Parse percentage string like "85.43%" to a float. +parse_percent() { + local pct_str="$1" + + if [[ -z "$pct_str" || "$pct_str" == "null" ]]; then + echo "0" + return + fi + + echo "$pct_str" | grep -oP '[\d.]+' | head -1 || echo "0" +} + +collect_health() { + # Simple ping check + local ping_result + ping_result=$(api_get "/api/system/ping") + + if [[ -z "$ping_result" || "$ping_result" != "OK" ]]; then + add_metric "artifactory_up" "gauge" "Artifactory reachability (1=up, 0=down)" "0" + return 1 + fi + + add_metric "artifactory_up" "gauge" "Artifactory reachability (1=up, 0=down)" "1" + + # Detailed health check via router API + local health_json + health_json=$(api_get "/router/api/v1/system/health") + + if [[ -n "$health_json" ]]; then + local node_state + node_state=$(echo "$health_json" | jq -r '.node_state // .services[0].state // empty' 2>/dev/null) + + if [[ "$node_state" == "HEALTHY" ]]; then + add_metric "artifactory_health_status" "gauge" "System health (1=healthy, 0=unhealthy)" "1" + else + add_metric "artifactory_health_status" "gauge" "System health (1=healthy, 0=unhealthy)" "0" + fi + else + # Ping succeeded so system is at least partially healthy + add_metric "artifactory_health_status" "gauge" "System health (1=healthy, 
0=unhealthy)" "1" + fi + + return 0 +} + +collect_storage() { + local storage_json + storage_json=$(api_get "/api/storageinfo") + + if [[ -z "$storage_json" ]]; then + return + fi + + # --- Total storage summary --- + local total_space used_space free_space + total_space=$(echo "$storage_json" | jq -r '.fileStoreSummary.totalSpace // empty' 2>/dev/null) + used_space=$(echo "$storage_json" | jq -r '.fileStoreSummary.usedSpace // empty' 2>/dev/null) + free_space=$(echo "$storage_json" | jq -r '.fileStoreSummary.freeSpace // empty' 2>/dev/null) + + [[ -n "$total_space" ]] && add_metric "artifactory_storage_total_bytes" "gauge" "Total file store capacity in bytes" "$(parse_size_to_bytes "$total_space")" + [[ -n "$used_space" ]] && add_metric "artifactory_storage_used_bytes" "gauge" "Used file store space in bytes" "$(parse_size_to_bytes "$used_space")" + [[ -n "$free_space" ]] && add_metric "artifactory_storage_free_bytes" "gauge" "Free file store space in bytes" "$(parse_size_to_bytes "$free_space")" + + # --- Binaries summary --- + local binaries_count binaries_size optimization + binaries_count=$(echo "$storage_json" | jq -r '.binariesSummary.binariesCount // empty' 2>/dev/null) + binaries_size=$(echo "$storage_json" | jq -r '.binariesSummary.binariesSize // empty' 2>/dev/null) + optimization=$(echo "$storage_json" | jq -r '.binariesSummary.optimization // empty' 2>/dev/null) + + if [[ -n "$binaries_count" ]]; then + local clean_count + clean_count=$(echo "$binaries_count" | tr -d ',') + add_metric "artifactory_storage_binaries_count" "gauge" "Total number of binaries stored" "$clean_count" + fi + [[ -n "$binaries_size" ]] && add_metric "artifactory_storage_binaries_total_bytes" "gauge" "Total size of binaries in bytes" "$(parse_size_to_bytes "$binaries_size")" + [[ -n "$optimization" ]] && add_metric "artifactory_storage_optimization_percent" "gauge" "Storage optimization percentage" "$(parse_percent "$optimization")" + + # --- Per-repository metrics --- + local 
repo_count + repo_count=$(echo "$storage_json" | jq -r '.repositoriesSummaryList | length // 0' 2>/dev/null) + + if [[ "$repo_count" -gt 0 ]]; then + # Extract repo data as tab-separated lines: key, type, usedSpace, filesCount, foldersCount + local repo_lines + repo_lines=$(echo "$storage_json" | jq -r ' + .repositoriesSummaryList[] + | select(.repoKey != "TOTAL") + | [.repoKey, (.repoType // "UNKNOWN"), (.usedSpace // "0 bytes"), (.filesCount // 0), (.foldersCount // 0)] + | @tsv + ' 2>/dev/null) + + if [[ -n "$repo_lines" ]]; then + OUTPUT+="# HELP artifactory_repo_used_bytes Repository used space in bytes +# TYPE artifactory_repo_used_bytes gauge +" + while IFS=$'\t' read -r repo_key repo_type repo_used files_count folders_count; do + local repo_bytes + repo_bytes=$(parse_size_to_bytes "$repo_used") + add_metric_value "artifactory_repo_used_bytes" "$repo_bytes" "repo=\"${repo_key}\",type=\"${repo_type}\"" + done <<< "$repo_lines" + + OUTPUT+="# HELP artifactory_repo_artifact_count Number of artifacts in repository +# TYPE artifactory_repo_artifact_count gauge +" + while IFS=$'\t' read -r repo_key repo_type repo_used files_count folders_count; do + add_metric_value "artifactory_repo_artifact_count" "$files_count" "repo=\"${repo_key}\",type=\"${repo_type}\"" + done <<< "$repo_lines" + + OUTPUT+="# HELP artifactory_repo_folder_count Number of folders in repository +# TYPE artifactory_repo_folder_count gauge +" + while IFS=$'\t' read -r repo_key repo_type repo_used files_count folders_count; do + add_metric_value "artifactory_repo_folder_count" "$folders_count" "repo=\"${repo_key}\",type=\"${repo_type}\"" + done <<< "$repo_lines" + fi + fi +} + +collect_system_info() { + # Try the open metrics endpoint first (Artifactory 7.x+) + local metrics_text + metrics_text=$(api_get "/api/v1/system/metrics") + + if [[ -n "$metrics_text" ]]; then + # Parse JVM heap from open metrics format + local heap_used heap_max heap_free nonheap_used + heap_used=$(echo "$metrics_text" | 
grep -m1 'jvm_memory_used_bytes.*area="heap"' | grep -oP '[\d.]+$' || true) + heap_max=$(echo "$metrics_text" | grep -m1 'jvm_memory_max_bytes.*area="heap"' | grep -oP '[\d.]+$' || true) + heap_free=$(echo "$metrics_text" | grep -m1 'jvm_memory_committed_bytes.*area="heap"' | grep -oP '[\d.]+$' || true) + nonheap_used=$(echo "$metrics_text" | grep -m1 'jvm_memory_used_bytes.*area="nonheap"' | grep -oP '[\d.]+$' || true) + + [[ -n "$heap_used" ]] && add_metric "artifactory_jvm_heap_used_bytes" "gauge" "JVM heap memory used" "${heap_used%.*}" + [[ -n "$heap_max" ]] && add_metric "artifactory_jvm_heap_max_bytes" "gauge" "JVM heap memory maximum" "${heap_max%.*}" + if [[ -n "$heap_free" && -n "$heap_used" ]]; then + local free_calc + free_calc=$(echo "$heap_free $heap_used" | awk '{printf "%.0f", $1 - $2}') + add_metric "artifactory_jvm_heap_free_bytes" "gauge" "JVM heap memory free" "$free_calc" + fi + [[ -n "$nonheap_used" ]] && add_metric "artifactory_jvm_nonheap_used_bytes" "gauge" "JVM non-heap memory used" "${nonheap_used%.*}" + + # Parse DB pool from open metrics + local db_active db_idle db_max + db_active=$(echo "$metrics_text" | grep -m1 'db_pool_active_connections' | grep -oP '[\d.]+$' || true) + db_idle=$(echo "$metrics_text" | grep -m1 'db_pool_idle_connections' | grep -oP '[\d.]+$' || true) + db_max=$(echo "$metrics_text" | grep -m1 'db_pool_max_connections' | grep -oP '[\d.]+$' || true) + + [[ -n "$db_active" ]] && add_metric "artifactory_db_pool_active" "gauge" "Active database connections" "${db_active%.*}" + [[ -n "$db_idle" ]] && add_metric "artifactory_db_pool_idle" "gauge" "Idle database connections" "${db_idle%.*}" + [[ -n "$db_max" ]] && add_metric "artifactory_db_pool_max" "gauge" "Maximum database connections" "${db_max%.*}" + + return + fi + + # Fallback: use system info endpoint (older Artifactory) + local info_json + info_json=$(api_get "/api/system/info") + + if [[ -z "$info_json" ]]; then + return + fi + + local heap_used_str heap_max_str 
heap_free_str + heap_used_str=$(echo "$info_json" | jq -r '.["jvm.heap.used"] // empty' 2>/dev/null) + heap_max_str=$(echo "$info_json" | jq -r '.["jvm.heap.max"] // empty' 2>/dev/null) + heap_free_str=$(echo "$info_json" | jq -r '.["jvm.heap.free"] // empty' 2>/dev/null) + + [[ -n "$heap_used_str" ]] && add_metric "artifactory_jvm_heap_used_bytes" "gauge" "JVM heap memory used" "$(parse_size_to_bytes "$heap_used_str")" + [[ -n "$heap_max_str" ]] && add_metric "artifactory_jvm_heap_max_bytes" "gauge" "JVM heap memory maximum" "$(parse_size_to_bytes "$heap_max_str")" + [[ -n "$heap_free_str" ]] && add_metric "artifactory_jvm_heap_free_bytes" "gauge" "JVM heap memory free" "$(parse_size_to_bytes "$heap_free_str")" + + local db_active db_max + db_active=$(echo "$info_json" | jq -r '.["db.pool.active"] // empty' 2>/dev/null) + db_max=$(echo "$info_json" | jq -r '.["db.pool.max"] // empty' 2>/dev/null) + + [[ -n "$db_active" ]] && add_metric "artifactory_db_pool_active" "gauge" "Active database connections" "$db_active" + [[ -n "$db_max" ]] && add_metric "artifactory_db_pool_max" "gauge" "Maximum database connections" "$db_max" +} + +collect_http_stats() { + # Try open metrics endpoint for HTTP stats (Artifactory 7.x+) + local metrics_text + metrics_text=$(api_get "/api/v1/system/metrics") + + if [[ -n "$metrics_text" ]]; then + local http_2xx http_3xx http_4xx http_5xx + http_2xx=$(echo "$metrics_text" | grep -m1 'http_response_total.*status="2xx"' | grep -oP '[\d.]+$' || true) + http_3xx=$(echo "$metrics_text" | grep -m1 'http_response_total.*status="3xx"' | grep -oP '[\d.]+$' || true) + http_4xx=$(echo "$metrics_text" | grep -m1 'http_response_total.*status="4xx"' | grep -oP '[\d.]+$' || true) + http_5xx=$(echo "$metrics_text" | grep -m1 'http_response_total.*status="5xx"' | grep -oP '[\d.]+$' || true) + + OUTPUT+="# HELP artifactory_http_requests_total Total HTTP requests by status class +# TYPE artifactory_http_requests_total counter +" + [[ -n "$http_2xx" ]] && 
add_metric_value "artifactory_http_requests_total" "${http_2xx%.*}" 'status="2xx"' + [[ -n "$http_3xx" ]] && add_metric_value "artifactory_http_requests_total" "${http_3xx%.*}" 'status="3xx"' + [[ -n "$http_4xx" ]] && add_metric_value "artifactory_http_requests_total" "${http_4xx%.*}" 'status="4xx"' + [[ -n "$http_5xx" ]] && add_metric_value "artifactory_http_requests_total" "${http_5xx%.*}" 'status="5xx"' + fi +} + +collect_gc_info() { + local gc_json + gc_json=$(api_get "/api/system/storage/gc") + + if [[ -z "$gc_json" ]]; then + return + fi + + # Duration in milliseconds + local gc_duration_ms + gc_duration_ms=$(echo "$gc_json" | jq -r '.gcDurationMillis // empty' 2>/dev/null) + + if [[ -n "$gc_duration_ms" ]]; then + local gc_duration_secs + gc_duration_secs=$(echo "$gc_duration_ms" | awk '{printf "%.3f", $1 / 1000}') + add_metric "artifactory_gc_duration_seconds" "gauge" "Duration of last garbage collection in seconds" "$gc_duration_secs" + fi + + # Freed space + local gc_freed_size + gc_freed_size=$(echo "$gc_json" | jq -r '.freedSpace // empty' 2>/dev/null) + + if [[ -n "$gc_freed_size" ]]; then + local gc_freed_bytes + gc_freed_bytes=$(parse_size_to_bytes "$gc_freed_size") + add_metric "artifactory_gc_freed_bytes" "gauge" "Bytes freed by last garbage collection" "$gc_freed_bytes" + fi + + # Last run timestamp + local gc_time + gc_time=$(echo "$gc_json" | jq -r '.gcTime // empty' 2>/dev/null) + + if [[ -n "$gc_time" ]]; then + # Try to convert ISO timestamp to epoch + local gc_epoch + gc_epoch=$(date -d "$gc_time" +%s 2>/dev/null || echo "") + if [[ -n "$gc_epoch" ]]; then + add_metric "artifactory_gc_last_run_timestamp" "gauge" "Unix timestamp of last garbage collection" "$gc_epoch" + fi + fi +} + +write_output() { + if [[ "$TEXTFILE_MODE" == true ]]; then + local output_file="${TEXTFILE_DIR}/artifactory.prom" + local temp_file="${output_file}.$$" + + mkdir -p "$TEXTFILE_DIR" + echo "$OUTPUT" > "$temp_file" + mv "$temp_file" "$output_file" + else + echo 
"$OUTPUT" + fi +} + +install_cron() { + if [[ $EUID -ne 0 ]]; then + echo "ERROR: --install requires root" >&2 + exit 1 + fi + + local script_path + script_path=$(readlink -f "$0") + + cat > /etc/cron.d/artifactory-exporter </dev/null +EOF + + chmod 644 /etc/cron.d/artifactory-exporter + echo "Installed cron job: /etc/cron.d/artifactory-exporter" + echo "Metrics will be written to: ${TEXTFILE_DIR}/artifactory.prom" +} + +# --- Main --- + +main() { + # Parse arguments + for arg in "$@"; do + case "$arg" in + --textfile) TEXTFILE_MODE=true ;; + --install) + check_dependencies + validate_config + install_cron + exit 0 + ;; + --help|-h) usage ;; + *) echo "Unknown option: $arg" >&2; usage ;; + esac + done + + check_dependencies + validate_config + + START_TIME=$(date +%s%N) + + # Exporter info + add_metric "artifactory_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" + + # Collect metrics + if collect_health; then + collect_storage + collect_system_info + collect_http_stats + collect_gc_info + fi + + # Exporter performance + local end_time duration + end_time=$(date +%s%N) + duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") + add_metric "artifactory_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration" + add_metric "artifactory_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" + + write_output +} + +main "$@" diff --git a/audit-log-analyzer.sh b/audit-log-analyzer.sh new file mode 100644 index 0000000..6d0b17c --- /dev/null +++ b/audit-log-analyzer.sh @@ -0,0 +1,575 @@ +#!/bin/bash + +############################################################# +#### Audit Log Analyzer Script for SELinux and AppArmor #### +#### Parses denial logs and suggests fix commands #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### To use this script 
chmod it to 755 ####
#### or simply type bash ####
#############################################################

# ── Colors ────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# ── Defaults ──────────────────────────────────────────────
MODE="recent"       # "recent" (last hour) or "all"
OUTPUT_FILE=""      # when non-empty, output_line also appends here
QUIET=0             # 1 = suppress raw denial lines
TOTAL_DENIALS=0     # accumulated across SELinux and AppArmor
UNIQUE_TYPES=0      # accumulated across SELinux and AppArmor
SUGGESTED_FIXES=0   # incremented once per suggest_*_fix call

# ── Functions ─────────────────────────────────────────────

#######################################
# Print the help text and exit 0.
#######################################
usage() {
  echo -e "${BOLD}Audit Log Analyzer — SELinux & AppArmor${NC}"
  echo ""
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo " --help Show this help message"
  echo " --recent Analyze denials from the last hour only (default)"
  echo " --all Analyze all denials in the log"
  echo " --output FILE Save suggested fixes to FILE"
  echo " --quiet Show suggestions only, suppress raw denial lines"
  echo ""
  echo "Examples:"
  echo " sudo bash $0 --recent"
  echo " sudo bash $0 --all --output fixes.txt"
  echo " sudo bash $0 --quiet --output /tmp/fixes.txt"
  exit 0
}

#######################################
# Abort with exit status 1 unless running as root.
#######################################
check_root() {
  if [[ $EUID -ne 0 ]]; then
    echo -e "${RED}Error: This script must be run as root.${NC}"
    echo "Please run with: sudo bash $0"
    exit 1
  fi
}

#######################################
# Detect which MAC systems are active.
# Globals: SELINUX_ACTIVE, APPARMOR_ACTIVE, SELINUX_STATUS (written)
# Exits 1 when neither SELinux nor AppArmor is active.
#######################################
detect_mac_system() {
  SELINUX_ACTIVE=0
  APPARMOR_ACTIVE=0

  # Check SELinux
  if command -v getenforce &>/dev/null; then
    SELINUX_STATUS=$(getenforce 2>/dev/null) || SELINUX_STATUS=""
    if [[ "$SELINUX_STATUS" == "Enforcing" || "$SELINUX_STATUS" == "Permissive" ]]; then
      SELINUX_ACTIVE=1
    fi
  fi

  # Check AppArmor: trust aa-status when present, else fall back to the
  # kernel module directory.
  if command -v aa-status &>/dev/null; then
    if aa-status &>/dev/null; then
      APPARMOR_ACTIVE=1
    fi
  elif [[ -d /sys/module/apparmor ]]; then
    APPARMOR_ACTIVE=1
  fi

  if [[ $SELINUX_ACTIVE -eq 0 && $APPARMOR_ACTIVE -eq 0 ]]; then
    echo -e "${YELLOW}Warning: Neither SELinux nor AppArmor appears to be active on this system.${NC}"
    exit 1
  fi
}

#######################################
# Print one line to stdout and, when OUTPUT_FILE is set, append a copy
# with ANSI colour sequences stripped.
# Arguments: $1 - text (may contain \033 escapes; expanded by echo -e)
#######################################
output_line() {
  local line="$1"
  echo -e "$line"
  if [[ -n "$OUTPUT_FILE" ]]; then
    echo -e "$line" | sed 's/\x1b\[[0-9;]*m//g' >> "$OUTPUT_FILE"
  fi
}

#######################################
# Print the first PCRE match of a pattern within a string.
# Always returns 0 so a missing field cannot abort callers that run
# under `set -e` / `pipefail`.
# Arguments: $1 - text to search, $2 - grep -P pattern (use \K to trim)
# Outputs:   the match, or nothing when the field is absent
#######################################
extract_field() {
  grep -oP -- "$2" <<< "$1" | head -n 1 || true
}

# ── SELinux Analysis ──────────────────────────────────────

#######################################
# Pretty-print one AVC denial and its suggested fix.
# Arguments: $1 - raw audit log line containing type=AVC
#######################################
parse_selinux_denial() {
  local line="$1"
  local scontext tcontext tclass perm comm name path

  scontext=$(extract_field "$line" 'scontext=\K[^ ]+')
  tcontext=$(extract_field "$line" 'tcontext=\K[^ ]+')
  tclass=$(extract_field "$line" 'tclass=\K[^ ]+')
  perm=$(extract_field "$line" '\{ \K[^}]+')
  perm="${perm% }" # the pattern captures the blank before the closing brace
  comm=$(extract_field "$line" 'comm="\K[^"]+')
  name=$(extract_field "$line" 'name="\K[^"]+')
  path=$(extract_field "$line" 'path="\K[^"]+')

  if [[ $QUIET -eq 0 ]]; then
    output_line "${RED}DENIAL:${NC} $line"
  fi

  output_line "${CYAN} Source context:${NC} $scontext"
  output_line "${CYAN} Target context:${NC} $tcontext"
  output_line "${CYAN} Class:${NC} $tclass"
  output_line "${CYAN} Permission:${NC} $perm"
  if [[ -n "$comm" ]]; then
    output_line "${CYAN} Command:${NC} $comm"
  fi
  if [[ -n "$path" ]]; then
    output_line "${CYAN} Path:${NC} $path"
  fi

  suggest_selinux_fix "$scontext" "$tcontext" "$tclass" "$perm" "$path" "$name"
  output_line ""
}

#######################################
# Suggest a remediation for one SELinux denial.
# Globals:   SUGGESTED_FIXES (incremented)
# Arguments: $1 scontext, $2 tcontext, $3 tclass, $4 perm, $5 path,
#            $6 name (accepted for interface compatibility; unused)
#######################################
suggest_selinux_fix() {
  local scontext="$1" tcontext="$2" tclass="$3" perm="$4" path="$5" name="$6"

  # Plain arithmetic assignment: ((var++)) returns status 1 when var is 0.
  SUGGESTED_FIXES=$((SUGGESTED_FIXES + 1))

  # Port binding denials
  if [[ "$tclass" == "tcp_socket" || "$tclass" == "udp_socket" ]]; then
    local stype proto
    stype=$(cut -d: -f3 <<< "$scontext")
    proto="${tclass%_socket}" # tcp_socket -> tcp, udp_socket -> udp
    # NOTE(review): -t is filled with the *source domain* type, as before;
    # semanage expects a port type (e.g. http_port_t) — confirm the intended
    # type with `semanage port -l` before applying.
    output_line "${GREEN} Suggested fix (port rule):${NC}"
    output_line "${GREEN} semanage port -a -t ${stype} -p ${proto} <PORT>${NC}"
    output_line "${YELLOW} (Replace <PORT> with the actual port)${NC}"
    return
  fi

  # File access denials
  if [[ -n "$path" && ("$tclass" == "file" || "$tclass" == "dir" || "$tclass" == "lnk_file") ]]; then
    local ttype
    ttype=$(cut -d: -f3 <<< "$tcontext")
    output_line "${GREEN} Suggested fix (file context):${NC}"
    output_line "${GREEN} semanage fcontext -a -t ${ttype} \"${path}\"${NC}"
    output_line "${GREEN} restorecon -Rv \"${path}\"${NC}"

    # Also check for boolean solutions
    suggest_selinux_boolean "$scontext" "$tcontext" "$tclass" "$perm"
    return
  fi

  # General boolean suggestion
  suggest_selinux_boolean "$scontext" "$tcontext" "$tclass" "$perm"
}

#######################################
# List up to five SELinux booleans whose name contains the source type
# (minus its _t suffix), then point at the custom-policy fallback.
# Arguments: $1 scontext, $2 tcontext, $3 tclass, $4 perm
#######################################
suggest_selinux_boolean() {
  local scontext="$1" tcontext="$2" tclass="$3" perm="$4"
  local stype needle
  stype=$(cut -d: -f3 <<< "$scontext")
  needle="${stype%%_t}"

  # An empty needle would match every boolean and suggest five arbitrary
  # ones, so only search when a source type was actually parsed.
  if [[ -n "$needle" ]] && command -v getsebool &>/dev/null; then
    local booleans
    booleans=$(getsebool -a 2>/dev/null | grep -F -i -- "$needle" | head -5 || true)
    if [[ -n "$booleans" ]]; then
      output_line "${GREEN} Possibly relevant booleans:${NC}"
      local bool_line bool_name
      while IFS= read -r bool_line; do
        bool_name="${bool_line%% *}"
        output_line "${GREEN} setsebool -P ${bool_name} on${NC}"
      done <<< "$booleans"
    fi
  fi

  output_line "${YELLOW} If no boolean applies, consider generating a custom policy module (see below).${NC}"
}

#######################################
# Classify an AVC line into a display category.
# Arguments: $1 - raw audit log line
# Outputs:   file_access | network | port_binding | process | other
#######################################
categorize_selinux_denial() {
  local line="$1"
  local tclass ttype
  tclass=$(extract_field "$line" 'tclass=\K[^ ]+')
  ttype=$(extract_field "$line" 'tcontext=\K[^ ]+' | cut -d: -f3)

  # A *_port_t label lives in the target context type, not in tclass, so it
  # has to be tested separately and before the socket classes below.
  if [[ "$ttype" == *_port_t ]]; then
    echo "port_binding"
    return
  fi

  case "$tclass" in
    file | dir | lnk_file | fifo_file | sock_file)
      echo "file_access"
      ;;
    tcp_socket | udp_socket | rawip_socket | netlink_socket)
      echo "network"
      ;;
    process | process2)
      echo "process"
      ;;
    *)
      echo "other"
      ;;
  esac
}

#######################################
# Collect AVC denials, print them grouped by category with suggestions,
# then show audit2allow output when that tool is installed.
# Globals: MODE (read); TOTAL_DENIALS, UNIQUE_TYPES (added to)
#######################################
analyze_selinux() {
  local audit_log=/var/log/audit/audit.log

  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line "${BOLD} SELinux Audit Log Analysis${NC}"
  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line ""

  local selinux_status
  selinux_status=$(getenforce 2>/dev/null) || selinux_status=""
  output_line "${CYAN}SELinux status:${NC} $selinux_status"
  output_line ""

  # Gather denials
  local denials=""

  if [[ "$MODE" == "recent" ]]; then
    if command -v ausearch &>/dev/null; then
      denials=$(ausearch -m avc -ts recent 2>/dev/null | grep "type=AVC" || true)
    fi
    # Fallback to log file
    if [[ -z "$denials" && -f "$audit_log" ]]; then
      local one_hour_ago
      one_hour_ago=$(date -d '1 hour ago' '+%s' 2>/dev/null) || one_hour_ago=""
      if [[ -n "$one_hour_ago" ]]; then
        # Two-argument match() with RSTART/RLENGTH is POSIX awk; the
        # three-argument array form is a gawk extension.
        denials=$(awk -v cutoff="$one_hour_ago" '
          /type=AVC/ {
            if (match($0, /msg=audit\([0-9]+/)) {
              # skip the 10 characters of "msg=audit(" to reach the epoch
              ts = substr($0, RSTART + 10, RLENGTH - 10) + 0
              if (ts >= cutoff + 0) print
            }
          }
        ' "$audit_log")
      fi
    fi
  else
    if [[ -f "$audit_log" ]]; then
      denials=$(grep "type=AVC" "$audit_log" || true)
    fi
  fi

  if [[ -z "$denials" ]]; then
    output_line "${GREEN}No AVC denials found.${NC}"
    output_line ""
    return
  fi

  # Group denials by category
  local -A categories=()
  local denial_count=0
  local line category

  while IFS= read -r line; do
    [[ -z "$line" ]] && continue
    denial_count=$((denial_count + 1))
    category=$(categorize_selinux_denial "$line")
    categories["$category"]+="$line"$'\n'
  done <<< "$denials"

  # Count unique object classes
  local unique
  unique=$(grep -oP 'tclass=\K[^ ]+' <<< "$denials" | sort -u | wc -l)

  # Add rather than assign so an AppArmor pass on the same host does not
  # overwrite these figures in the summary.
  TOTAL_DENIALS=$((TOTAL_DENIALS + denial_count))
  UNIQUE_TYPES=$((UNIQUE_TYPES + unique))

  # Display grouped results
  local label denial_line
  for category in "file_access" "network" "port_binding" "process" "other"; do
    [[ -n "${categories[$category]:-}" ]] || continue

    case "$category" in
      file_access) label="File Access Denials" ;;
      network) label="Network Denials" ;;
      port_binding) label="Port Binding Denials" ;;
      process) label="Process Denials" ;;
      other) label="Other Denials" ;;
    esac

    output_line "${BOLD}── ${label} ──────────────────────────────────${NC}"
    output_line ""

    while IFS= read -r denial_line; do
      [[ -z "$denial_line" ]] && continue
      parse_selinux_denial "$denial_line"
    done <<< "${categories[$category]}"
  done

  # Generate policy module suggestion with audit2allow
  if command -v audit2allow &>/dev/null; then
    output_line "${BOLD}── Policy Module Suggestion ──────────────────────${NC}"
    output_line ""
    local policy=""
    if [[ "$MODE" == "recent" ]]; then
      policy=$(ausearch -m avc -ts recent 2>/dev/null | audit2allow 2>/dev/null || true)
    elif [[ -r "$audit_log" ]]; then
      policy=$(audit2allow < "$audit_log" 2>/dev/null || true)
    fi

    if [[ -n "$policy" ]]; then
      output_line "${GREEN}audit2allow suggests the following policy:${NC}"
      output_line "$policy"
      output_line ""
      output_line "${YELLOW}To create and install a custom module:${NC}"
      output_line "${GREEN} ausearch -m avc -ts recent | audit2allow -M my_custom_policy${NC}"
      output_line "${GREEN} semodule -i my_custom_policy.pp${NC}"
    else
      output_line "${CYAN}No policy suggestions generated by audit2allow.${NC}"
    fi
    output_line ""
  else
    output_line "${YELLOW}Note: Install audit2allow (policycoreutils-python-utils) for automatic policy generation.${NC}"
    output_line ""
  fi
}

# ── AppArmor Analysis ────────────────────────────────────

#######################################
# Pick where AppArmor denials are read from.
# Outputs: syslog | kern.log | journalctl | none
#######################################
find_apparmor_log_source() {
  if [[ -f /var/log/syslog ]]; then
    echo "syslog"
  elif [[ -f /var/log/kern.log ]]; then
    echo "kern.log"
  elif command -v journalctl &>/dev/null; then
    echo "journalctl"
  else
    echo "none"
  fi
}

#######################################
# Pretty-print one AppArmor denial and its suggested fix.
# Arguments: $1 - raw log line containing an apparmor DENIED record
#######################################
parse_apparmor_denial() {
  local line="$1"
  local profile operation denied_mask path info

  # Only profile="..." names a profile; apparmor="..." holds the verdict
  # (e.g. DENIED) and must not be used as a fallback profile name.
  profile=$(extract_field "$line" 'profile="\K[^"]+')
  operation=$(extract_field "$line" 'operation="\K[^"]+')
  # Prefer what was actually denied; fall back to what was requested.
  denied_mask=$(extract_field "$line" 'denied_mask="\K[^"]+')
  if [[ -z "$denied_mask" ]]; then
    denied_mask=$(extract_field "$line" 'requested_mask="\K[^"]+')
  fi
  path=$(extract_field "$line" 'name="\K[^"]+')
  info=$(extract_field "$line" 'info="\K[^"]+')

  if [[ $QUIET -eq 0 ]]; then
    output_line "${RED}DENIAL:${NC} $line"
  fi

  if [[ -n "$profile" ]]; then output_line "${CYAN} Profile:${NC} $profile"; fi
  if [[ -n "$operation" ]]; then output_line "${CYAN} Operation:${NC} $operation"; fi
  if [[ -n "$path" ]]; then output_line "${CYAN} Path:${NC} $path"; fi
  if [[ -n "$denied_mask" ]]; then output_line "${CYAN} Denied mask:${NC} $denied_mask"; fi
  if [[ -n "$info" ]]; then output_line "${CYAN} Info:${NC} $info"; fi

  suggest_apparmor_fix "$profile" "$operation" "$path" "$denied_mask"
  output_line ""
}

#######################################
# Suggest a profile rule and locate the profile file for one denial.
# Globals:   SUGGESTED_FIXES (incremented)
# Arguments: $1 profile, $2 operation, $3 path, $4 denied mask
#######################################
suggest_apparmor_fix() {
  local profile="$1" operation="$2" path="$3" denied_mask="$4"

  SUGGESTED_FIXES=$((SUGGESTED_FIXES + 1))

  # Execute permissions get the "inherit" qualifier; every other mask is
  # used as logged.
  local perm_str
  case "$denied_mask" in
    x) perm_str="ix" ;;
    rx) perm_str="rix" ;;
    rwx) perm_str="rwix" ;;
    *) perm_str="$denied_mask" ;;
  esac

  if [[ -n "$path" && -n "$perm_str" ]]; then
    output_line "${GREEN} Suggested rule to add to profile:${NC}"
    output_line "${GREEN} ${path} ${perm_str},${NC}"
  fi

  # Show the profile file path
  if [[ -n "$profile" ]]; then
    # Default guess: profile name with slashes turned into dots.
    local dotted="/etc/apparmor.d/${profile//\//.}"
    local profile_file="$dotted"
    if [[ -f "/etc/apparmor.d/$profile" ]]; then
      profile_file="/etc/apparmor.d/$profile"
    elif [[ ! -f "$dotted" ]]; then
      # Search profile sources for a matching declaration (fixed string, so
      # dots and slashes in the name are not treated as regex).
      local found
      found=$(grep -rlF -- "profile $profile" /etc/apparmor.d/ 2>/dev/null | head -1 || true)
      if [[ -n "$found" ]]; then
        profile_file="$found"
      fi
    fi
    output_line "${CYAN} Profile file:${NC} $profile_file"
  fi

  output_line "${YELLOW} Or run interactively:${NC}"
  output_line "${GREEN} aa-logprof${NC}"
}

#######################################
# Collect AppArmor denials, print them with suggestions, and list
# aa-complain commands for every profile seen.
# Globals: MODE (read); TOTAL_DENIALS, UNIQUE_TYPES (added to)
#######################################
analyze_apparmor() {
  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line "${BOLD} AppArmor Audit Log Analysis${NC}"
  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line ""

  # Show AppArmor status. aa-status prints the counts directly with
  # --profiled / --enforced; grepping its report counts text lines instead.
  if command -v aa-status &>/dev/null; then
    local enforced loaded
    loaded=$(aa-status --profiled 2>/dev/null) || loaded=""
    enforced=$(aa-status --enforced 2>/dev/null) || enforced=""
    output_line "${CYAN}AppArmor profiles loaded:${NC} ${loaded:-unknown}"
    output_line "${CYAN}Profiles in enforce mode:${NC} ${enforced:-unknown}"
    output_line ""
  fi

  # Find log source
  local log_source
  log_source=$(find_apparmor_log_source)

  if [[ "$log_source" == "none" ]]; then
    output_line "${RED}Error: Cannot find AppArmor log source.${NC}"
    output_line "${YELLOW}Checked: /var/log/syslog, /var/log/kern.log, journalctl${NC}"
    return
  fi

  # Gather denials
  local denials=""

  if [[ "$log_source" == "journalctl" ]]; then
    if [[ "$MODE" == "recent" ]]; then
      denials=$(journalctl --since "1 hour ago" --no-pager 2>/dev/null | grep -i "apparmor.*DENIED" || true)
    else
      denials=$(journalctl --no-pager 2>/dev/null | grep -i "apparmor.*DENIED" || true)
    fi
  else
    local log_file="/var/log/${log_source}"

    if [[ "$MODE" == "recent" ]]; then
      local cutoff
      cutoff=$(date -d '1 hour ago' '+%s' 2>/dev/null) || cutoff=""
      if [[ -n "$cutoff" ]]; then
        # Kernel audit records carry audit(EPOCH.millis:serial); drop the
        # ones older than the cutoff. Lines without that stamp cannot be
        # dated and are kept.
        denials=$(awk -v cutoff="$cutoff" '
          /apparmor.*DENIED/ || /apparmor.*denied/ {
            if (match($0, /audit\([0-9]+/)) {
              # skip the 6 characters of "audit(" to reach the epoch
              ts = substr($0, RSTART + 6, RLENGTH - 6) + 0
              if (ts < cutoff + 0) next
            }
            print
          }
        ' "$log_file")
      else
        # Fallback: last 100 denial lines
        denials=$(grep -i "apparmor.*DENIED" "$log_file" | tail -100 || true)
      fi
    else
      denials=$(grep -i "apparmor.*DENIED" "$log_file" || true)
    fi
  fi

  if [[ -z "$denials" ]]; then
    output_line "${GREEN}No AppArmor denials found.${NC}"
    output_line ""
    return
  fi

  local denial_count=0
  local -A seen_profiles=()
  local line p

  output_line "${BOLD}── AppArmor Denials ─────────────────────────────${NC}"
  output_line ""

  while IFS= read -r line; do
    [[ -z "$line" ]] && continue
    denial_count=$((denial_count + 1))
    parse_apparmor_denial "$line"

    p=$(extract_field "$line" 'profile="\K[^"]+')
    if [[ -n "$p" ]]; then
      seen_profiles["$p"]=1
    fi
  done <<< "$denials"

  # Add rather than assign so a preceding SELinux pass is not overwritten.
  TOTAL_DENIALS=$((TOTAL_DENIALS + denial_count))
  UNIQUE_TYPES=$((UNIQUE_TYPES + ${#seen_profiles[@]}))

  # Suggest aa-logprof for interactive fixing
  output_line "${BOLD}── Interactive Fix Suggestion ────────────────────${NC}"
  output_line ""
  output_line "${YELLOW}For interactive profile updates, run:${NC}"
  output_line "${GREEN} aa-logprof${NC}"
  output_line ""
  output_line "${YELLOW}To set a profile to complain mode for testing:${NC}"
  local prof
  for prof in "${!seen_profiles[@]}"; do
    output_line "${GREEN} aa-complain $prof${NC}"
  done
  output_line ""
}

# ── Summary ───────────────────────────────────────────────

#######################################
# Print the run totals and, when set, where suggestions were saved.
# Globals: TOTAL_DENIALS, UNIQUE_TYPES, SUGGESTED_FIXES, OUTPUT_FILE (read)
#######################################
print_summary() {
  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line "${BOLD} Summary${NC}"
  output_line "${BOLD}═══════════════════════════════════════════════════${NC}"
  output_line ""
  output_line " Total denials found: ${BOLD}${TOTAL_DENIALS}${NC}"
  output_line " Unique denial types: ${BOLD}${UNIQUE_TYPES}${NC}"
  output_line " Suggested fixes: ${BOLD}${SUGGESTED_FIXES}${NC}"
  output_line ""

  if [[ -n "$OUTPUT_FILE" ]]; then
    output_line "${GREEN}Suggestions saved to: ${OUTPUT_FILE}${NC}"
    output_line ""
  fi
}

# ── Parse Arguments ───────────────────────────────────────

while [[ $# -gt 0 ]]; do
  case "$1" in
    --help | -h)
      usage
      ;;
    --recent)
      MODE="recent"
      shift
      ;;
    --all)
      MODE="all"
      shift
      ;;
    --output)
      # ${2:-} keeps a trailing "--output" from tripping `set -u`.
      if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
        echo -e "${RED}Error: --output requires a filename argument.${NC}"
        exit 1
      fi
      OUTPUT_FILE="$2"
      shift 2
      ;;
    --quiet | -q)
      QUIET=1
      shift
      ;;
    *)
      echo -e "${RED}Unknown option: $1${NC}"
      echo "Use --help for usage information."
      exit 1
      ;;
  esac
done

# ── Main ──────────────────────────────────────────────────

check_root

# Clear output file if specified
if [[ -n "$OUTPUT_FILE" ]]; then
  true > "$OUTPUT_FILE"
fi

echo -e "${BOLD}Audit Log Analyzer v1.00${NC}"
echo -e "${CYAN}Mode: ${MODE}${NC}"
echo ""

detect_mac_system

if [[ $SELINUX_ACTIVE -eq 1 ]]; then
  analyze_selinux
fi

if [[ $APPARMOR_ACTIVE -eq 1 ]]; then
  analyze_apparmor
fi

print_summary
diff --git a/aws-ami-finder.sh b/aws-ami-finder.sh new file mode 100755 index 0000000..6ac6e3e --- /dev/null +++ b/aws-ami-finder.sh @@ -0,0 +1,395 @@ +#!/usr/bin/env bash + +######################################################################################### +#### aws-ami-finder.sh — Find the latest AWS AMI for a given OS type #### +#### Queries ec2 describe-images with pre-defined OS profiles. #### +#### Requires: bash, aws CLI #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./aws-ami-finder.sh --os amazon2023 #### +#### ./aws-ami-finder.sh --os ubuntu2204 #### +#### ./aws-ami-finder.sh --list #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Colors ──────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + RESET='\033[0m' +else + RED="" GREEN="" YELLOW="" BOLD="" RESET="" +fi + +log() { echo -e "${GREEN}[OK]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } + +# ── Defaults ────────────────────────────────────────────────────────── +OS_TYPE="" +MAX_RESULTS=10 +REGION="" +SHOW_INSTANCES=false +INSTANCE_STATE="running" +RUNNING_FILTER="" +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat </dev/null; then + err "AWS CLI not found. Install it first." + exit 1 +fi + +# ── Running instances mode (SSM-based) ──────────────────────────────── +if [[ "$SHOW_INSTANCES" == "true" ]]; then + REGION_ARGS=() + [[ -n "$REGION" ]] && REGION_ARGS=(--region "$REGION") + + active_region="${REGION:-$(aws configure get region 2>/dev/null || echo 'not set')}" + + filter_label="all" + grep_pattern="" + + case "$RUNNING_FILTER" in + amazon2) grep_pattern=$'(Amazon Linux\t2$|Linux/UNIX)'; filter_label="Amazon Linux 2 + Linux/UNIX" ;; + amazon2023) grep_pattern=$'Amazon Linux\t2023'; filter_label="Amazon Linux 2023" ;; + rhel) grep_pattern="Red Hat Enterprise Linux"; filter_label="RHEL" ;; + windows) grep_pattern="Windows"; filter_label="Windows" ;; + "") ;; + *) err "Unknown filter: $RUNNING_FILTER (use: amazon2, amazon2023, rhel, windows)"; exit 1 ;; + esac + + state_label="${INSTANCE_STATE//,/ + }" + log "Querying ${filter_label} instances (${state_label}) in ${active_region}..." 
+ + # Get instances (InstanceId, Name, Owner tag, State, PlatformDetails) + ec2_data=$(aws ec2 describe-instances \ + ${REGION_ARGS[@]+"${REGION_ARGS[@]}"} \ + --filters "Name=instance-state-name,Values=${INSTANCE_STATE}" \ + --query 'Reservations[].Instances[].[InstanceId, Tags[?Key==`Name`].Value | [0], Tags[?Key==`Owner`].Value | [0], State.Name, PlatformDetails]' \ + --output text 2>&1) || { + err "EC2 query failed:" + echo "$ec2_data" >&2 + exit 1 + } + + if [[ -z "$ec2_data" ]]; then + warn "No running instances found" + exit 0 + fi + + # Build lookup: InstanceId → Name, Owner, State, PlatformDetails + declare -A NAMES + declare -A OWNERS + declare -A STATES + declare -A PLATFORMS + while IFS=$'\t' read -r iid name owner state plat; do + NAMES["$iid"]="$name" + OWNERS["$iid"]="${owner:-}" + STATES["$iid"]="$state" + PLATFORMS["$iid"]="$plat" + done <<< "$ec2_data" + + # Get SSM data for OS identification (best-effort) + declare -A SSM_PLATFORM + declare -A SSM_VERSION + ssm_data=$(aws ssm describe-instance-information \ + ${REGION_ARGS[@]+"${REGION_ARGS[@]}"} \ + --query 'InstanceInformationList[].[InstanceId, PlatformName, PlatformVersion]' \ + --output text 2>/dev/null) || ssm_data="" + + if [[ -n "$ssm_data" ]]; then + while IFS=$'\t' read -r iid platform version; do + SSM_PLATFORM["$iid"]="$platform" + SSM_VERSION["$iid"]="$version" + done <<< "$ssm_data" + fi + + # Build display lines, merging EC2 + SSM data + display_lines=() + for iid in "${!NAMES[@]}"; do + name="${NAMES[$iid]}" + owner="${OWNERS[$iid]:--}" + [[ "$owner" == "None" || -z "$owner" ]] && owner="-" + # Strip session suffix (e.g. 
"jallen-session-12345" → "jallen") + owner="${owner%%-session-*}" + if [[ -n "${SSM_PLATFORM[$iid]:-}" ]]; then + platform="${SSM_PLATFORM[$iid]}" + version="${SSM_VERSION[$iid]}" + else + platform="${PLATFORMS[$iid]}" + version="(no SSM)" + fi + state="${STATES[$iid]}" + display_lines+=("${name}"$'\t'"${iid}"$'\t'"${owner}"$'\t'"${state}"$'\t'"${platform}"$'\t'"${version}") + done + + # Apply filter if set + if [[ -n "$grep_pattern" ]]; then + filtered=() + for line in "${display_lines[@]}"; do + if echo "$line" | grep -qP "$grep_pattern"; then + filtered+=("$line") + fi + done + display_lines=("${filtered[@]+"${filtered[@]}"}") + if [[ ${#display_lines[@]} -eq 0 ]]; then + warn "No ${filter_label} instances found" + exit 0 + fi + fi + + # Sort by name + sorted=$(printf '%s\n' "${display_lines[@]}" | sort -t$'\t' -k1) + + # Calculate dynamic column widths from data + col_name=4; col_iid=11; col_owner=5; col_state=5; col_plat=8; col_ver=7 + while IFS=$'\t' read -r name iid owner state platform version; do + (( ${#name} > col_name )) && col_name=${#name} + (( ${#iid} > col_iid )) && col_iid=${#iid} + (( ${#owner} > col_owner )) && col_owner=${#owner} + (( ${#state} > col_state )) && col_state=${#state} + (( ${#platform}> col_plat )) && col_plat=${#platform} + (( ${#version} > col_ver )) && col_ver=${#version} + done <<< "$sorted" + + # Add padding + col_name=$((col_name + 2)) + col_iid=$((col_iid + 2)) + col_owner=$((col_owner + 2)) + col_state=$((col_state + 2)) + col_plat=$((col_plat + 2)) + + fmt=" %-${col_name}s %-${col_iid}s %-${col_owner}s %-${col_state}s %-${col_plat}s %s\n" + + # Build separator lines matching column widths + sep_name=$(printf '%*s' "$col_name" '' | tr ' ' '-') + sep_iid=$(printf '%*s' "$col_iid" '' | tr ' ' '-') + sep_owner=$(printf '%*s' "$col_owner" '' | tr ' ' '-') + sep_state=$(printf '%*s' "$col_state" '' | tr ' ' '-') + sep_plat=$(printf '%*s' "$col_plat" '' | tr ' ' '-') + sep_ver=$(printf '%*s' "$col_ver" '' | tr ' ' '-') + + # 
Display + echo "" + printf " ${BOLD}%-${col_name}s %-${col_iid}s %-${col_owner}s %-${col_state}s %-${col_plat}s %s${RESET}\n" "Name" "Instance ID" "Owner" "State" "Platform" "Version" + printf "$fmt" "$sep_name" "$sep_iid" "$sep_owner" "$sep_state" "$sep_plat" "$sep_ver" + while IFS=$'\t' read -r name iid owner state platform version; do + # Colorize state and pad manually (escape codes break printf %-Ns) + state_pad=$(( col_state - ${#state} )) + pad_str=$(printf '%*s' "$state_pad" '') + case "$state" in + running) printf -v state_str '%b' "${GREEN}${state}${RESET}${pad_str}" ;; + stopped) printf -v state_str '%b' "${RED}${state}${RESET}${pad_str}" ;; + *) printf -v state_str "%-${col_state}s" "$state" ;; + esac + printf " %-${col_name}s %-${col_iid}s %-${col_owner}s %s%-${col_plat}s %s\n" \ + "$name" "$iid" "$owner" "$state_str" "$platform" "$version" + done <<< "$sorted" + echo "" + + ssm_count=0 + for iid in "${!NAMES[@]}"; do + [[ -n "${SSM_PLATFORM[$iid]:-}" ]] && ((ssm_count++)) || true + done + log "${#display_lines[@]} instance(s) shown (${ssm_count} identified via SSM)" + exit 0 +fi + +if [[ -z "$OS_TYPE" ]]; then + err "No OS type specified. Use --os TYPE or --running" + exit 1 +fi + +# ── Query ───────────────────────────────────────────────────────────── +REGION_ARGS=() +[[ -n "$REGION" ]] && REGION_ARGS=(--region "$REGION") + +active_region="${REGION:-$(aws configure get region 2>/dev/null || echo 'not set')}" +govcloud=false +is_govcloud "$active_region" && govcloud=true + +set_os_profile "$OS_TYPE" "$govcloud" + +$govcloud && log "GovCloud detected — using GovCloud owner IDs" +log "Querying AMIs for ${OS_TYPE} in ${active_region} (owner: ${OWNER})..." 
+echo "" + +output=$(aws ec2 describe-images \ + ${REGION_ARGS[@]+"${REGION_ARGS[@]}"} \ + --owners "$OWNER" \ + --filters "Name=name,Values=${NAME_FILTER}" "Name=state,Values=available" \ + --query "reverse(sort_by(Images, &CreationDate))[:${MAX_RESULTS}].[ImageId, Name, Description, CreationDate]" \ + --output table 2>&1 | sed "s/DescribeImages/Available AMIs/") || { + err "AWS CLI failed:" + echo "$output" >&2 + exit 1 +} + +if [[ -z "$output" ]]; then + warn "No AMIs found for ${OS_TYPE}" + warn "Check your AWS region (current: $(aws configure get region 2>/dev/null || echo 'not set'))" + exit 1 +fi + +echo "$output" diff --git a/aws-cost-reporter.sh b/aws-cost-reporter.sh new file mode 100755 index 0000000..5edc818 --- /dev/null +++ b/aws-cost-reporter.sh @@ -0,0 +1,601 @@ +#!/usr/bin/env bash + +######################################################################################### +#### aws-cost-reporter.sh — Daily AWS cost breakdown by service, account, or tag #### +#### Supports email (SES), Slack webhooks, CSV/JSON export, period comparison #### +#### Requires: bash 4+, aws-cli v2, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### export AWS_PROFILE="billing" #### +#### ./aws-cost-reporter.sh --daily #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +AWS_REGION="${AWS_REGION:-us-east-1}" +GROUP_BY="${GROUP_BY:-SERVICE}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +SES_FROM_ADDRESS="${SES_FROM_ADDRESS:-}" +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" +COST_TAG_KEY="${COST_TAG_KEY:-}" +COST_TAG_VALUE="${COST_TAG_VALUE:-}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +RUN_MODE="" +CUSTOM_START="" +CUSTOM_END="" +EMAIL_TO="" +SLACK_URL="" +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "auto" && ! 
-t 1 ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" + return + fi + RED="\033[0;31m" + GREEN="\033[0;32m" + YELLOW="\033[0;33m" + # shellcheck disable=SC2034 # BLUE reserved for future use / caller scripts + BLUE="\033[0;34m" + # shellcheck disable=SC2034 # BOLD reserved for future use / caller scripts + BOLD="\033[1m" + DIM="\033[2m" + RESET="\033[0m" +} + +# ── Logging ─────────────────────────────────────────────────────────── +log_info() { printf "${GREEN}[INFO]${RESET} %s\n" "$*"; } +log_warn() { printf "${YELLOW}[WARN]${RESET} %s\n" "$*" >&2; } +log_error() { printf "${RED}[ERROR]${RESET} %s\n" "$*" >&2; } +log_debug() { [[ "$VERBOSE" == "true" ]] && printf "${DIM}[DEBUG] %s${RESET}\n" "$*"; } + +# ── Helpers ─────────────────────────────────────────────────────────── +die() { log_error "$@"; exit 1; } + +check_deps() { + local missing=() + command -v aws >/dev/null 2>&1 || missing+=("aws-cli") + command -v jq >/dev/null 2>&1 || missing+=("jq") + command -v curl >/dev/null 2>&1 || missing+=("curl") + if (( ${#missing[@]} > 0 )); then + die "Missing required tools: ${missing[*]}" + fi + + local bash_major="${BASH_VERSINFO[0]}" + if (( bash_major < 4 )); then + die "Requires bash 4+, found ${BASH_VERSION}" + fi +} + +validate_date() { + local d="$1" + if [[ ! 
"$d" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then + die "Invalid date format: $d (expected YYYY-MM-DD)" + fi +} + +# ── Date math (portable) ───────────────────────────────────────────── +date_offset() { + # Usage: date_offset YYYY-MM-DD -N → date N days before + local base="$1" offset="$2" + if date --version >/dev/null 2>&1; then + # GNU date + date -d "${base} ${offset} days" +%Y-%m-%d + else + # macOS date + date -j -v"${offset}d" -f "%Y-%m-%d" "$base" +%Y-%m-%d + fi +} + +today_utc() { date -u +%Y-%m-%d; } + +first_of_month() { + local d="$1" + echo "${d:0:8}01" +} + +first_of_prev_month() { + local d="$1" + local year="${d:0:4}" + local month="${d:5:2}" + month=$((10#$month - 1)) + if (( month == 0 )); then + month=12 + year=$((year - 1)) + fi + printf "%04d-%02d-01" "$year" "$month" +} + +days_between() { + local s="$1" e="$2" + local ss se + if date --version >/dev/null 2>&1; then + ss=$(date -d "$s" +%s) + se=$(date -d "$e" +%s) + else + ss=$(date -j -f "%Y-%m-%d" "$s" +%s) + se=$(date -j -f "%Y-%m-%d" "$e" +%s) + fi + echo $(( (se - ss) / 86400 )) +} + +# ── Compute date ranges ────────────────────────────────────────────── +compute_ranges() { + local today + today="$(today_utc)" + + case "$RUN_MODE" in + daily) + PERIOD_START="$(date_offset "$today" -1)" + PERIOD_END="$today" + PREV_START="$(date_offset "$today" -2)" + PREV_END="$(date_offset "$today" -1)" + ;; + weekly) + PERIOD_START="$(date_offset "$today" -7)" + PERIOD_END="$today" + PREV_START="$(date_offset "$today" -14)" + PREV_END="$(date_offset "$today" -7)" + ;; + monthly) + PERIOD_START="$(first_of_month "$today")" + PERIOD_END="$today" + local prev_first + prev_first="$(first_of_prev_month "$today")" + PREV_START="$prev_first" + PREV_END="$PERIOD_START" + ;; + custom) + PERIOD_START="$CUSTOM_START" + PERIOD_END="$CUSTOM_END" + local span + span="$(days_between "$CUSTOM_START" "$CUSTOM_END")" + PREV_START="$(date_offset "$CUSTOM_START" "-$span")" + PREV_END="$CUSTOM_START" + ;; + *) + die "Unknown 
mode: $RUN_MODE" + ;; + esac + + log_debug "Current period: $PERIOD_START → $PERIOD_END" + log_debug "Previous period: $PREV_START → $PREV_END" +} + +# ── Build Cost Explorer request ─────────────────────────────────────── +build_ce_filter() { + local filter="" + if [[ -n "$COST_TAG_KEY" && -n "$COST_TAG_VALUE" ]]; then + filter=$(cat </dev/null +} + +# ── Parse cost data ────────────────────────────────────────────────── +parse_costs() { + local raw="$1" + echo "$raw" | jq -r ' + [.ResultsByTime[].Groups[] | + { + key: .Keys[0], + amount: (.Metrics.BlendedCost.Amount | tonumber) + } + ] | + group_by(.key) | + map({ + key: .[0].key, + total: (map(.amount) | add) + }) | + sort_by(-.total) | + .[] | + "\(.key)\t\(.total)" + ' 2>/dev/null || echo "" +} + +# ── Format helpers ──────────────────────────────────────────────────── +fmt_currency() { + printf "$%.2f" "$1" +} + +fmt_delta() { + local curr="$1" prev="$2" + if (( $(echo "$prev == 0" | bc -l) )); then + echo "N/A" + return + fi + local pct + pct=$(echo "scale=1; (($curr - $prev) / $prev) * 100" | bc -l) + local sign="" + if (( $(echo "$pct > 0" | bc -l) )); then + sign="+" + fi + echo "${sign}${pct}%" +} + +print_header() { + local account_id + account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "unknown") + + echo "AWS Cost Reporter" + echo "Account: $account_id" + echo "Region: $AWS_REGION" + echo "Mode: $RUN_MODE" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + + if [[ "$RUN_MODE" == "custom" ]]; then + echo "Period: $PERIOD_START → $PERIOD_END" + fi + echo "" +} + +# ── Text table output ──────────────────────────────────────────────── +output_text_table() { + local -n curr_data=$1 + local -n prev_data=$2 + local label="SERVICE" + case "$GROUP_BY" in + LINKED_ACCOUNT) label="ACCOUNT" ;; + TAG) label="TAG" ;; + esac + local divider="──────────────────────────────────────────────────────────────────────" + printf " %-38s %-12s %-12s %s\n" "$label" "COST" "PREV" "DELTA" + 
printf " %s\n" "$divider" + local total_curr=0 total_prev=0 + for key in "${!curr_data[@]}"; do + local cost="${curr_data[$key]}" prev_cost="${prev_data[$key]:-0}" + printf " %-38s %-12s %-12s %s\n" \ + "$key" "$(fmt_currency "$cost")" "$(fmt_currency "$prev_cost")" "$(fmt_delta "$cost" "$prev_cost")" + total_curr=$(echo "$total_curr + $cost" | bc -l) + total_prev=$(echo "$total_prev + $prev_cost" | bc -l) + done + printf " %s\n" "$divider" + printf " %-38s %-12s %-12s %s\n" \ + "TOTAL" "$(fmt_currency "$total_curr")" "$(fmt_currency "$total_prev")" "$(fmt_delta "$total_curr" "$total_prev")" +} + +# ── CSV output ──────────────────────────────────────────────────────── +output_csv() { + local -n curr_data=$1 + local -n prev_data=$2 + local label="service" + case "$GROUP_BY" in + LINKED_ACCOUNT) label="account" ;; + TAG) label="tag" ;; + esac + echo "${label},cost,previous_cost,delta_pct" + for key in "${!curr_data[@]}"; do + local cost="${curr_data[$key]}" prev_cost="${prev_data[$key]:-0}" pct="0" + if (( $(echo "$prev_cost != 0" | bc -l) )); then + pct=$(echo "scale=2; (($cost - $prev_cost) / $prev_cost) * 100" | bc -l) + fi + echo "\"$key\",$cost,$prev_cost,$pct" + done +} + +# ── JSON output ─────────────────────────────────────────────────────── +output_json() { + local -n curr_data=$1 + local -n prev_data=$2 + local label="service" + case "$GROUP_BY" in + LINKED_ACCOUNT) label="account" ;; + TAG) label="tag" ;; + esac + local items=() + for key in "${!curr_data[@]}"; do + items+=("{\"${label}\": \"${key}\", \"cost\": ${curr_data[$key]}, \"previous_cost\": ${prev_data[$key]:-0}}") + done + local joined + joined=$(printf ",%s" "${items[@]}") + joined="${joined:1}" + printf '{"mode":"%s","period_start":"%s","period_end":"%s","previous_start":"%s","previous_end":"%s","group_by":"%s","items":[%s]}\n' \ + "$RUN_MODE" "$PERIOD_START" "$PERIOD_END" "$PREV_START" "$PREV_END" "$GROUP_BY" "$joined" +} + +# ── Render report 
───────────────────────────────────────────────────── +render_report() { + local curr_raw="$1" prev_raw="$2" + + # Parse into associative arrays + declare -A curr_costs + declare -A prev_costs + + while IFS=$'\t' read -r key amount; do + [[ -z "$key" ]] && continue + curr_costs["$key"]="$amount" + done <<< "$(parse_costs "$curr_raw")" + + while IFS=$'\t' read -r key amount; do + [[ -z "$key" ]] && continue + prev_costs["$key"]="$amount" + done <<< "$(parse_costs "$prev_raw")" + + # Ensure previous-only keys appear in current with 0 + for key in "${!prev_costs[@]}"; do + if [[ -z "${curr_costs[$key]+x}" ]]; then + curr_costs["$key"]="0" + fi + done + + case "$OUTPUT_FORMAT" in + text) + print_header + local title="Cost Breakdown — ${PERIOD_START} → ${PERIOD_END}" + echo "$title" + output_text_table curr_costs prev_costs + echo "" + ;; + csv) + output_csv curr_costs prev_costs + ;; + json) + output_json curr_costs prev_costs + ;; + *) + die "Unknown format: $OUTPUT_FORMAT" + ;; + esac +} + +# ── Email via SES ───────────────────────────────────────────────────── +send_email() { + local report="$1" recipient="$2" + + if [[ -z "$SES_FROM_ADDRESS" ]]; then + die "--email requires SES_FROM_ADDRESS to be set" + fi + + local subject + subject="AWS Cost Report — ${RUN_MODE} — $(today_utc)" + + log_info "Sending report to $recipient via SES..." 
# Post the report to a Slack incoming webhook as a code-fenced message.
# Arguments: $1 - report text
#            $2 - webhook URL
# Outputs:   progress and errors via log_info / log_error
# Returns:   0 when Slack answers HTTP 200; 1 when the payload cannot be
#            built, the request itself fails, or any other status comes back.
send_slack() {
  local report="$1" webhook="$2"

  log_info "Posting report to Slack..."

  # Truncate for Slack message limits
  local max_len=3000
  local body="$report"
  if (( ${#body} > max_len )); then
    body="${body:0:$max_len}

... (truncated — full report exceeds Slack message limit)"
  fi

  local payload
  payload=$(jq -n --arg text "\`\`\`${body}\`\`\`" '{ text: $text }') || {
    log_error "Failed to build Slack payload"
    return 1
  }

  # Capture curl's exit status explicitly: with `set -e` a failed transfer
  # (DNS, TLS, timeout) would otherwise abort the script inside the
  # assignment, and -s means curl itself prints nothing about it.
  local http_code curl_rc=0
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time 30 \
    -X POST \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$webhook") || curl_rc=$?

  if (( curl_rc != 0 )); then
    log_error "Slack webhook request failed (curl exit code $curl_rc)"
    return 1
  fi

  if [[ "$http_code" != "200" ]]; then
    log_error "Slack webhook returned HTTP $http_code"
    return 1
  fi

  log_info "Slack message posted"
}
# Entry point of the cost report: parse options, verify prerequisites,
# query both periods, print the report and deliver it where requested.
# Globals:   START_TIME (written); RUN_MODE, GROUP_BY, PERIOD_START,
#            PERIOD_END, PREV_START, PREV_END, EMAIL_TO, SLACK_URL (read)
# Arguments: the script's command-line arguments
main() {
  parse_args "$@"
  setup_colors
  check_deps

  START_TIME=$(date +%s)

  # Fail early when the AWS CLI cannot authenticate.
  log_debug "Validating AWS credentials..."
  if ! aws sts get-caller-identity --output text >/dev/null 2>&1; then
    die "AWS credentials not configured or expired"
  fi

  compute_ranges

  log_info "Querying Cost Explorer ($RUN_MODE, group by $GROUP_BY)..."

  local curr_raw prev_raw
  curr_raw="$(query_costs "$PERIOD_START" "$PERIOD_END")"
  prev_raw="$(query_costs "$PREV_START" "$PREV_END")"

  [[ -n "$curr_raw" ]] || die "No cost data returned for $PERIOD_START → $PERIOD_END"

  local report
  report="$(render_report "$curr_raw" "$prev_raw")"

  # The report always goes to stdout; email and Slack are additional.
  echo "$report"

  if [[ -n "$EMAIL_TO" ]]; then
    send_email "$report" "$EMAIL_TO"
  fi

  if [[ -n "$SLACK_URL" ]]; then
    send_slack "$report" "$SLACK_URL"
  fi

  local elapsed
  elapsed=$(( $(date +%s) - START_TIME ))
  log_info "Completed in ${elapsed}s"
}
# Record a passing check as a TAP "ok" line.
# Globals:   PASS, TOTAL (incremented), RESULTS (appended)
# Arguments: $1 - test name
#            $2 - optional detail, appended in parentheses
record_pass() {
  local name="$1" detail="${2:-}"

  PASS=$((PASS + 1))
  TOTAL=$((TOTAL + 1))

  local line="ok ${TOTAL} - ${name}"
  if [[ -n "$detail" ]]; then
    line+=" (${detail})"
  fi

  RESULTS+=("$line")
  verbose "PASS: ${name} ${detail}"
}
# Verify that the external tools this script shells out to are on PATH.
# Outputs:   an error listing the missing tools on failure (via err)
# Returns:   exits the script with status 1 when anything is missing
check_dependencies() {
  local -a missing=()
  local tool

  for tool in aws jq; do
    if command -v "$tool" >/dev/null 2>&1; then
      continue
    fi
    # Report the AWS CLI under its package name rather than its binary name.
    case "$tool" in
      aws) missing+=("aws-cli") ;;
      *) missing+=("$tool") ;;
    esac
  done

  if (( ${#missing[@]} > 0 )); then
    err "Missing required tools: ${missing[*]}"
    err "Install aws-cli v2 and jq before running this script."
    exit 1
  fi

  verbose "Dependencies satisfied: aws-cli, jq"
}
# Check that a VPC exists, has at least one subnet, and has an internet
# gateway attached.
# Globals:   VPC_ID (read) - when empty, the default VPC is looked up first,
#            then the first VPC returned by describe-vpcs.
# Outputs:   results via record_pass / record_fail
test_vpc() {
  verbose "Testing VPC configuration..."

  local vpc_id="$VPC_ID"
  if [[ -z "$vpc_id" ]]; then
    vpc_id=$(aws ec2 describe-vpcs \
      --filters "Name=isDefault,Values=true" \
      --query 'Vpcs[0].VpcId' \
      --output text 2>/dev/null) || true

    # No default VPC: fall back to whichever VPC is listed first.
    case "$vpc_id" in
      "" | None)
        vpc_id=$(aws ec2 describe-vpcs \
          --query 'Vpcs[0].VpcId' \
          --output text 2>/dev/null) || true
        ;;
    esac
  fi

  case "$vpc_id" in
    "" | None)
      record_fail "VPC exists" "No VPC found"
      return
      ;;
  esac

  record_pass "VPC exists" "${vpc_id}"

  # Subnets
  local subnet_count
  if ! subnet_count=$(aws ec2 describe-subnets \
    --filters "Name=vpc-id,Values=${vpc_id}" \
    --query 'length(Subnets)' \
    --output text 2>/dev/null); then
    subnet_count=0
  fi

  if [[ "$subnet_count" -gt 0 ]]; then
    record_pass "VPC has subnets" "${subnet_count}"
  else
    record_fail "VPC has subnets" "0 subnets found"
  fi

  # Internet gateway
  local igw
  if ! igw=$(aws ec2 describe-internet-gateways \
    --filters "Name=attachment.vpc-id,Values=${vpc_id}" \
    --query 'InternetGateways[0].InternetGatewayId' \
    --output text 2>/dev/null); then
    igw="None"
  fi

  if [[ -z "$igw" || "$igw" == "None" ]]; then
    record_fail "Internet gateway attached to VPC" "None found"
  else
    record_pass "Internet gateway attached to VPC" "${igw}"
  fi
}
# Flag security groups that expose any of the ports in SG_CHECK_PORTS to the
# whole internet (0.0.0.0/0 or ::/0).
# Globals:   SG_CHECK_PORTS (read) - comma-separated list of port numbers
# Outputs:   one record_pass / record_fail per port
test_security_groups() {
  verbose "Testing security groups..."

  local sgs
  sgs=$(aws ec2 describe-security-groups \
    --query 'SecurityGroups[].{GroupId:GroupId,GroupName:GroupName,IpPermissions:IpPermissions}' \
    --output json 2>/dev/null) || {
    record_fail "Security group audit" "API call failed"
    return
  }

  local -a ports=()
  local port open_sgs
  IFS=',' read -ra ports <<< "$SG_CHECK_PORTS"

  for port in "${ports[@]}"; do
    port=${port//[[:space:]]/}
    # Tolerate stray commas ("22,,3389," etc.).
    [[ -n "$port" ]] || continue

    # A non-numeric entry would make jq reject --argjson; report it instead
    # of letting the error be mistaken for "nothing is open".
    if [[ ! "$port" =~ ^[0-9]+$ ]]; then
      record_fail "Security group port check" "invalid port '${port}' in SG_CHECK_PORTS"
      continue
    fi
    port=$((10#$port))

    # any(...) is used for both address families on purpose: with the form
    # `.IpRanges[]?.CidrIp == "…" or .Ipv6Ranges[]?.CidrIpv6 == "…"` an empty
    # IpRanges list makes the whole expression produce no value, so rules
    # that are open over IPv6 only are never matched.
    if ! open_sgs=$(jq -r --argjson port "$port" '
      [ .[]
        | select(any(.IpPermissions[]?;
            ( any(.IpRanges[]?; .CidrIp == "0.0.0.0/0")
              or any(.Ipv6Ranges[]?; .CidrIpv6 == "::/0") )
            and
            ( .IpProtocol == "-1"
              or ( .FromPort != null and .ToPort != null
                   and .FromPort <= $port and .ToPort >= $port ) )
          ))
        | .GroupId
      ] | unique | join(", ")
    ' <<< "$sgs" 2>/dev/null); then
      record_fail "Security group audit on port ${port}" "could not evaluate rules"
      continue
    fi

    if [[ -z "$open_sgs" ]]; then
      record_pass "No security groups with 0.0.0.0/0 on port ${port}"
    else
      record_fail "Security group allows 0.0.0.0/0 on port ${port}" "${open_sgs}"
    fi
  done
}
+ + local findings + findings=$(aws securityhub get-findings \ + --filters '{ + "WorkflowStatus": [{"Value":"NEW","Comparison":"EQUALS"}], + "RecordState": [{"Value":"ACTIVE","Comparison":"EQUALS"}], + "SeverityLabel": [{"Value":"CRITICAL","Comparison":"EQUALS"},{"Value":"HIGH","Comparison":"EQUALS"}] + }' \ + --max-items 100 \ + --query 'length(Findings)' \ + --output text 2>/dev/null) || { + record_skip "Security Hub findings" "Security Hub not enabled or no access" + return + } + + local critical high + critical=$(aws securityhub get-findings \ + --filters '{ + "WorkflowStatus": [{"Value":"NEW","Comparison":"EQUALS"}], + "RecordState": [{"Value":"ACTIVE","Comparison":"EQUALS"}], + "SeverityLabel": [{"Value":"CRITICAL","Comparison":"EQUALS"}] + }' \ + --query 'length(Findings)' \ + --output text 2>/dev/null) || critical=0 + + high=$(aws securityhub get-findings \ + --filters '{ + "WorkflowStatus": [{"Value":"NEW","Comparison":"EQUALS"}], + "RecordState": [{"Value":"ACTIVE","Comparison":"EQUALS"}], + "SeverityLabel": [{"Value":"HIGH","Comparison":"EQUALS"}] + }' \ + --query 'length(Findings)' \ + --output text 2>/dev/null) || high=0 + + if [[ "$critical" -eq 0 ]] && [[ "$high" -eq 0 ]]; then + record_pass "Security Hub findings" "0 critical, 0 high" + else + record_fail "Security Hub findings" "${critical} critical, ${high} high" + fi +} + +test_iam_permissions() { + if [[ -z "$REQUIRED_PERMISSIONS" ]] || [[ -z "$CALLER_ARN" ]]; then + record_skip "IAM permission simulation" "REQUIRED_PERMISSIONS not set or no ARN" + return + fi + + verbose "Testing IAM permissions..." 
# Compare month-to-date spend (BlendedCost) with COST_THRESHOLD.
# Globals:   COST_THRESHOLD (read) - limit in USD; integers and decimals
#            are both accepted. The check is skipped when it is empty.
# Outputs:   one record_pass / record_fail / record_skip
test_cost() {
  if [[ -z "$COST_THRESHOLD" ]]; then
    record_skip "Cost check" "COST_THRESHOLD not set"
    return
  fi

  # Reject garbage up front so it is not reported as "spend exceeds".
  if [[ ! "$COST_THRESHOLD" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    record_fail "Cost check" "COST_THRESHOLD is not a number: ${COST_THRESHOLD}"
    return
  fi

  verbose "Testing monthly cost..."

  # Derive the month start from a single date call so both values always
  # belong to the same day.
  local today month_start
  today=$(date -u +%Y-%m-%d)
  month_start="${today%-*}-01"

  # Cost Explorer requires Start < End (End is exclusive). On the first day
  # of the month there is no completed day to query yet.
  if [[ "$today" == "$month_start" ]]; then
    record_skip "Cost check" "no completed days in the current month yet"
    return
  fi

  local cost_json
  cost_json=$(aws ce get-cost-and-usage \
    --time-period "Start=${month_start},End=${today}" \
    --granularity MONTHLY \
    --metrics BlendedCost \
    --output json 2>/dev/null) || {
    record_skip "Cost check" "Cost Explorer API failed (may need to be enabled)"
    return
  }

  local amount
  amount=$(jq -r '.ResultsByTime[0].Total.BlendedCost.Amount // "0"' \
    <<< "$cost_json" 2>/dev/null) || amount=""
  if [[ -z "$amount" ]]; then
    record_skip "Cost check" "unexpected Cost Explorer response"
    return
  fi

  # Compare as decimals; bash integer tests cannot handle "5000.50".
  if awk -v amount="$amount" -v limit="$COST_THRESHOLD" \
    'BEGIN { exit !((amount + 0) < (limit + 0)) }'; then
    record_pass "Current month spend" "\$${amount} below threshold \$${COST_THRESHOLD}"
  else
    record_fail "Current month spend" "\$${amount} exceeds threshold \$${COST_THRESHOLD}"
  fi
}
# Entry point of the smoke tests: run every check in order, print the
# results as TAP or JSON, and exit non-zero when any check failed.
# Globals:   START_TIME (written); AWS_REGION, OUTPUT_FORMAT, FAIL (read)
# Arguments: $1 - optional --help / -h
main() {
  case "${1:-}" in
    --help | -h) show_help ;;
  esac

  setup_colors
  check_dependencies

  START_TIME=$(date +%s)

  log "AWS Smoke Tests — Region: ${AWS_REGION}"
  log "────────────────────────────────────────"

  # Checks run in this order; test_iam_permissions relies on the caller ARN
  # that test_credentials stores.
  local check
  for check in \
    test_credentials \
    test_s3 \
    test_ec2 \
    test_vpc \
    test_route53 \
    test_security_groups \
    test_cloudwatch_alarms \
    test_security_hub \
    test_iam_permissions \
    test_cost; do
    "$check"
  done

  echo ""
  case "$OUTPUT_FORMAT" in
    json) print_json ;;
    *) print_tap ;;
  esac

  if (( FAIL > 0 )); then
    exit 1
  fi
  exit 0
}
# Populate the colour variables with ANSI escape sequences.
# Globals:   COLOR (read; unset counts as "auto"); RED, GREEN, YELLOW, BLUE,
#            CYAN, BOLD, DIM, RESET (written)
# "never" leaves them untouched, "always" forces colours, anything else
# enables colours only when stdout is a terminal.
setup_colors() {
  local mode="${COLOR:-auto}"

  case "$mode" in
    never)
      return 0
      ;;
    always) ;;
    *)
      [[ -t 1 ]] || return 0
      ;;
  esac

  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  CYAN='\033[0;36m'
  BOLD='\033[1m'
  DIM='\033[2m'
  RESET='\033[0m'
}
# Print the number of whole days elapsed since a timestamp.
# Arguments: $1 - date string understood by `date -d`
# Outputs:   "never"   when $1 is empty, "null" or "None"
#            "unknown" when the date cannot be parsed (or parses to epoch 0)
#            otherwise the day count as an integer
days_since() {
  local stamp="$1"

  case "$stamp" in
    "" | null | None)
      echo "never"
      return
      ;;
  esac

  local past now
  # A parse failure is mapped to 0, which is reported as "unknown" below.
  past=$(date -d "${stamp}" +%s 2>/dev/null || echo 0)
  now=$(date +%s)

  if [[ "$past" -eq 0 ]]; then
    echo "unknown"
    return
  fi

  echo $(( (now - past) / 86400 ))
}
# Report MFA registration per user and flag gaps: CRITICAL for members of
# an admin directory role without MFA, WARN for other users without MFA.
# Globals:   RED, GREEN, YELLOW, RESET (read); TOTAL_* counters (updated
#            through flag_crit / flag_warn / flag_ok)
# Outputs:   a table on stdout, one row per registration record
audit_mfa() {
  log "Auditing MFA registration status..."
  echo ""

  printf " %-36s %-14s %-10s %s\n" \
    "UPN" "MFA_STATUS" "IS_ADMIN" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..80})"

  local reg_json
  # shellcheck disable=SC2016
  reg_json=$(az rest --method GET \
    --url 'https://graph.microsoft.com/v1.0/reports/credentialUserRegistrationDetails?$top=999' \
    2>/dev/null || echo '{"value":[]}')

  # IDs of every directory role whose display name mentions Admin(istrator).
  local admin_role_ids
  admin_role_ids=$(az rest --method GET \
    --url 'https://graph.microsoft.com/v1.0/directoryRoles' \
    2>/dev/null | jq -r '.value[]? | select(.displayName | test("Admin|Administrator"; "i")) | .id' 2>/dev/null || true)

  # Newline-separated UPNs of all members of those roles.
  local admin_members="" role_id members
  while IFS= read -r role_id; do
    [[ -z "$role_id" ]] && continue
    members=$(az rest --method GET \
      --url "https://graph.microsoft.com/v1.0/directoryRoles/${role_id}/members" \
      2>/dev/null | jq -r '.value[]?.userPrincipalName // empty' 2>/dev/null || true)
    admin_members="${admin_members}${members}"$'\n'
  done <<< "$admin_role_ids"

  local mfa_registered upn is_admin mfa_display severity color
  # The loop is fed by process substitution, not a pipe: a piped `while`
  # runs in a subshell and the flag_* counter updates would never reach
  # print_summary. The MFA flag comes first in each row so that an empty
  # UPN cannot shift the columns.
  while IFS=$'\t' read -r mfa_registered upn; do
    [[ -n "$mfa_registered" ]] || continue

    # Exact, case-insensitive, fixed-string match: a UPN is not a regex.
    is_admin="No"
    if [[ -n "$upn" ]] && grep -Fxqi -- "$upn" <<< "$admin_members"; then
      is_admin="Yes"
    fi

    if [[ "$mfa_registered" == "true" ]]; then
      mfa_display="Registered"
      severity="OK"
      color="$GREEN"
      flag_ok
    elif [[ "$is_admin" == "Yes" ]]; then
      mfa_display="Not registered"
      severity="CRITICAL"
      color="$RED"
      flag_crit
    else
      mfa_display="Not registered"
      severity="WARN"
      color="$YELLOW"
      flag_warn
    fi

    printf " %-36s %-14s %-10s %b%s%b\n" \
      "${upn:0:34}" "$mfa_display" "$is_admin" "$color" "$severity" "$RESET"
  done < <(jq -r '.value[]?
      | [((.isMfaRegistered // false) | tostring), (.userPrincipalName // "unknown")]
      | @tsv' <<< "$reg_json" 2>/dev/null)

  echo ""
}
// empty' 2>/dev/null | while IFS= read -r assignment; do + local upn role scope + upn=$(echo "$assignment" | jq -r '.principalName // "unknown"' 2>/dev/null) + role=$(echo "$assignment" | jq -r '.roleDefinitionName // "unknown"' 2>/dev/null) + scope=$(echo "$assignment" | jq -r '.scope // "/"' 2>/dev/null) + + [[ -z "$upn" || "$upn" == "unknown" ]] && continue + + local scope_display + scope_display="${scope##*/}" + [[ -z "$scope_display" ]] && scope_display="$scope" + + printf " %-36s %-28s %-20s %b%s%b\n" \ + "${upn:0:34}" "${role:0:26}" "${scope_display:0:18}" \ + "$CYAN" "INFO" "$RESET" + flag_info + done + + local ga_json + ga_json=$(az rest --method GET \ + --url 'https://graph.microsoft.com/v1.0/directoryRoles' \ + 2>/dev/null | jq -c '.value[]? | select(.displayName == "Global Administrator")' 2>/dev/null || echo '') + + if [[ -n "$ga_json" ]]; then + local ga_role_id + ga_role_id=$(echo "$ga_json" | jq -r '.id' 2>/dev/null) + + local ga_members + ga_members=$(az rest --method GET \ + --url "https://graph.microsoft.com/v1.0/directoryRoles/${ga_role_id}/members" \ + 2>/dev/null || echo '{"value":[]}') + + local ga_count + ga_count=$(echo "$ga_members" | jq '[.value[]?] | length' 2>/dev/null || echo 0) + + echo "$ga_members" | jq -c '.value[]? 
# Classify one service-principal credential and print a row unless it is
# still valid for 30 days or more.
# Arguments: $1 - application name
#            $2 - credential type shown in the table (Password / Certificate)
#            $3 - credential noun used in the debug message (password / key)
#            $4 - endDateTime of the credential, or "null"
_audit_sp_credential() {
  local app_name="$1" cred_type="$2" cred_noun="$3" end_date="$4"
  local remaining status
  local expiry="${end_date:0:10}" severity="WARN" color="$YELLOW"

  remaining=$(days_until "$end_date")

  if [[ "$remaining" == "unknown" ]]; then
    status="Unknown"
    expiry="Unknown"
    severity="INFO"
    color="$CYAN"
  elif [[ "$remaining" -lt 0 ]]; then
    status="Expired"
  elif [[ "$remaining" -lt 30 ]]; then
    status="Expiring (${remaining}d)"
  else
    verbose "SP ${app_name}: ${cred_noun} credential valid (${remaining}d remaining)"
    flag_ok
    return 0
  fi

  printf " %-30s %-14s %-20s %-14s %b%s%b\n" \
    "${app_name:0:28}" "$cred_type" "$expiry" "$status" \
    "$color" "$severity" "$RESET"

  if [[ "$severity" == "INFO" ]]; then
    flag_info
  else
    flag_warn
  fi
}

# Audit password and certificate credentials of every service principal and
# report those that are expired, expire within 30 days, or have no usable
# expiry date.
# Globals:   YELLOW, CYAN, RESET (read); TOTAL_* counters (updated through
#            flag_warn / flag_info / flag_ok)
# Outputs:   a table on stdout
audit_service_principals() {
  log "Auditing service principal credentials..."
  echo ""

  printf " %-30s %-14s %-20s %-14s %s\n" \
    "APP_NAME" "CRED_TYPE" "EXPIRY" "STATUS" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..100})"

  local sp_json
  sp_json=$(az ad sp list --all --query "[].{appDisplayName:appDisplayName,appId:appId,keyCredentials:keyCredentials,passwordCredentials:passwordCredentials}" 2>/dev/null || echo '[]')

  local sp app_name app_id end_date
  # All loops read from process substitution, not a pipe: a piped `while`
  # runs in a subshell, which would discard the flag_* counter updates
  # before print_summary reads them.
  while IFS= read -r sp; do
    app_name=$(jq -r '.appDisplayName // "unnamed"' <<< "$sp" 2>/dev/null)
    app_id=$(jq -r '.appId // "unknown"' <<< "$sp" 2>/dev/null)

    # Fall back to the application ID when there is no display name.
    if [[ "$app_name" == "unnamed" || -z "$app_name" ]]; then
      app_name="$app_id"
    fi

    while IFS= read -r end_date; do
      _audit_sp_credential "$app_name" "Password" "password" "$end_date"
    done < <(jq -r '.passwordCredentials[]? | (.endDateTime // "null")' <<< "$sp" 2>/dev/null)

    while IFS= read -r end_date; do
      _audit_sp_credential "$app_name" "Certificate" "key" "$end_date"
    done < <(jq -r '.keyCredentials[]? | (.endDateTime // "null")' <<< "$sp" 2>/dev/null)
  done < <(jq -c '.[]? // empty' <<< "$sp_json" 2>/dev/null)

  echo ""
}
// empty' 2>/dev/null | while IFS= read -r guest; do + local upn created last_activity + upn=$(echo "$guest" | jq -r '.userPrincipalName // "unknown"' 2>/dev/null) + created=$(echo "$guest" | jq -r '.createdDateTime // "null"' 2>/dev/null) + last_activity=$(echo "$guest" | jq -r '.signInActivity.lastSignInDateTime // "null"' 2>/dev/null) + + local created_display last_display + if [[ "$created" == "null" || -z "$created" ]]; then + created_display="Unknown" + else + created_display="${created:0:10}" + fi + + if [[ "$last_activity" == "null" || -z "$last_activity" ]]; then + last_display="Never" + else + last_display="${last_activity:0:10}" + fi + + local idle_days + idle_days=$(days_since "$last_activity") + + if [[ "$idle_days" == "never" ]]; then + printf " %-36s %-20s %-20s %b%s%b\n" \ + "${upn:0:34}" "$created_display" "$last_display" \ + "$YELLOW" "WARN" "$RESET" + flag_warn + elif [[ "$idle_days" == "unknown" ]]; then + verbose "Guest ${upn}: unable to determine activity" + flag_info + elif [[ "$idle_days" -gt "$STALE_DAYS" ]]; then + printf " %-36s %-20s %-20s %b%s%b\n" \ + "${upn:0:34}" "$created_display" "$last_display" \ + "$YELLOW" "WARN" "$RESET" + flag_warn + else + verbose "Guest ${upn}: active (${idle_days}d idle)" + flag_ok + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + local elapsed + elapsed=$(( $(date +%s) - START_TIME )) + + echo "" + echo " ══════════════════════════════════════════" + echo " Azure Entra ID Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET" + printf " %-20s %b%d%b\n" "INFO:" "$CYAN" "$TOTAL_INFO" "$RESET" + printf " %-20s %b%d%b\n" "OK:" "$GREEN" "$TOTAL_OK" "$RESET" + echo " 
──────────────────────────────────────────" + printf " Completed in %ds\n" "$elapsed" + echo "" + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)" + echo "" + echo " Top recommendations:" + echo " • Enable MFA for all admin accounts immediately" + echo " • Review and remove stale user accounts" + echo " • Rotate expired service principal credentials" + echo " • Reduce the number of Global Administrators to ≤5" + echo " • Remove inactive guest accounts" + echo "" + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)" + echo "" + echo " Suggestions:" + echo " • Disable or remove stale user accounts" + echo " • Enforce MFA registration for all users" + echo " • Renew expiring service principal credentials" + echo " • Clean up inactive guest accounts" + echo "" + else + echo -e " ${GREEN}All checks passed${RESET}" + echo "" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat < N days + --mfa Check MFA registration status + --admins Audit privileged role assignments + --service-principals Audit service principal credential expiry + --guests Find stale guest accounts + +${BOLD}OPTIONS${RESET} + --stale-days N Override stale threshold in days (default: 90) + --verbose Debug output + --no-color Disable colored output + --help Show this help message + +${BOLD}ENVIRONMENT VARIABLES${RESET} + STALE_DAYS Days before a user is considered stale (default: 90) + VERBOSE Enable verbose output (true/false) + COLOR Color mode: auto, always, never + +${BOLD}PREREQUISITES${RESET} + • Azure CLI authenticated: az login + • Microsoft Graph API permissions for sign-in activity and reports + • Reader role or higher for role assignment queries + +${BOLD}EXAMPLES${RESET} + # Full audit + ${SCRIPT_NAME} --full + + # Check stale 
# Parse command-line options for the Entra ID auditor.
#
# Arguments:       the script's positional parameters ("$@")
# Globals written: RUN_MODE (space-separated list of audit modes),
#                  STALE_DAYS, VERBOSE, COLOR
# Calls:           die, err, setup_colors, show_help
# Exits:           0 after --help; 1 (directly or through die) on an
#                  unknown option, a bad --stale-days value, or when no
#                  audit mode was selected.
#
# --stale-days is restricted to plain decimal digits because the value is
# later compared with `[[ ... -gt "$STALE_DAYS" ]]`, which evaluates its
# operands as arithmetic expressions; arbitrary text must never reach it.
parse_args() {
    local -a modes=() unique=()
    local -A seen=()
    local mode

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --full)
                modes=(stale-users mfa admins service-principals guests)
                shift ;;
            --stale-users|--mfa|--admins|--service-principals|--guests)
                # The mode name is the flag without its leading dashes.
                modes+=("${1#--}"); shift ;;
            --stale-days)
                [[ $# -ge 2 ]] || die "--stale-days requires a value"
                [[ "$2" =~ ^[0-9]+$ ]] \
                    || die "--stale-days must be a non-negative integer, got: $2"
                # 10# forces base 10 so "08" is not rejected as bad octal.
                STALE_DAYS=$((10#$2)); shift 2 ;;
            --verbose)
                VERBOSE="true"; shift ;;
            --no-color)
                COLOR="never"; shift ;;
            --help|-h)
                setup_colors; show_help; exit 0 ;;
            *)
                die "Unknown option: $1 (see --help)" ;;
        esac
    done

    if [[ ${#modes[@]} -eq 0 ]]; then
        err "No audit mode specified"
        echo "Run ${SCRIPT_NAME} --help for usage" >&2
        exit 1
    fi

    # Drop repeats (e.g. "--full --mfa") while keeping first-seen order, so
    # no audit runs twice and inflates the severity counters.
    for mode in "${modes[@]}"; do
        if [[ -z "${seen[$mode]:-}" ]]; then
            seen[$mode]=1
            unique+=("$mode")
        fi
    done

    RUN_MODE="${unique[*]}"
}
+ done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + exit 2 + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/azure-blob-manager.sh b/azure-blob-manager.sh new file mode 100644 index 0000000..e5cd21d --- /dev/null +++ b/azure-blob-manager.sh @@ -0,0 +1,597 @@ +#!/usr/bin/env bash + +######################################################################################### +#### azure-blob-manager.sh — Manage Azure Blob Storage containers, lifecycle, and #### +#### access auditing via az CLI. Upload, sync, tier, and audit blob storage #### +#### Requires: bash 4+, az CLI, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./azure-blob-manager.sh --list #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} 
%b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Severity counters (for audit mode) ─────────────────────────────── +TOTAL_CRIT=0 +TOTAL_WARN=0 +TOTAL_INFO=0 +TOTAL_OK=0 + +flag_crit() { ((TOTAL_CRIT++)) || true; } +flag_warn() { ((TOTAL_WARN++)) || true; } +flag_info() { ((TOTAL_INFO++)) || true; } +flag_ok() { ((TOTAL_OK++)) || true; } + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +STORAGE_ACCOUNT="" +CONTAINER_NAME="" +RESOURCE_GROUP="" +OUTPUT_FORMAT="${ABM_FORMAT:-text}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +MAX_AGE="${ABM_MAX_AGE:-90}" +TIER_TARGET="" +SOURCE_PATH="" +SUBSCRIPTION="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── Dependency and credential checks ──────────────────────────────── +check_deps() { + command -v az &>/dev/null || die "az CLI is required (install: https://aka.ms/InstallAzureCLIDeb)" + command -v jq &>/dev/null || die "jq is required" +} + +check_credentials() { + local acct + acct=$(az account show --output json 2>&1) || die "Azure credentials not configured — run 'az login'" + + local sub_name + sub_name=$(echo "$acct" | jq -r '.name') + log "Subscription: ${sub_name}" + + if [[ -n "$SUBSCRIPTION" ]]; then + az account set --subscription "$SUBSCRIPTION" 2>/dev/null \ + || die "Cannot switch to subscription: ${SUBSCRIPTION}" + fi +} + +# ── Azure CLI wrapper ──────────────────────────────────────────────── +az_cmd() { + local args=("$@") + [[ -n "$SUBSCRIPTION" ]] && args+=(--subscription "$SUBSCRIPTION") + verbose "az ${args[*]}" + az "${args[@]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST +# ══════════════════════════════════════════════════════════════════════ +do_list() { + if [[ -n "$STORAGE_ACCOUNT" && -n "$CONTAINER_NAME" ]]; then + list_blobs + 
elif [[ -n "$STORAGE_ACCOUNT" ]]; then + list_containers + else + list_accounts + fi +} + +list_accounts() { + section_header "Storage Accounts" + + local accounts + local args=(storage account list --output json) + [[ -n "$RESOURCE_GROUP" ]] && args+=(--resource-group "$RESOURCE_GROUP") + accounts=$(az_cmd "${args[@]}" 2>/dev/null) + + local count + count=$(echo "$accounts" | jq 'length') + + printf " %-28s %-16s %-12s %-12s %s\n" \ + "ACCOUNT" "RESOURCE_GROUP" "KIND" "REPLICATION" "LOCATION" + printf " %s\n" "$(printf '%.0s─' {1..90})" + + echo "$accounts" | jq -c '.[]' | while IFS= read -r acct; do + local name rg kind repl location + name=$(echo "$acct" | jq -r '.name') + rg=$(echo "$acct" | jq -r '.resourceGroup') + kind=$(echo "$acct" | jq -r '.kind') + repl=$(echo "$acct" | jq -r '.sku.name') + location=$(echo "$acct" | jq -r '.location') + + printf " %-28s %-16s %-12s %-12s %s\n" \ + "${name:0:27}" "${rg:0:15}" "${kind:0:11}" "${repl:0:11}" "$location" + done + + echo "" + field "Total accounts:" "$count" +} + +list_containers() { + section_header "Containers in ${STORAGE_ACCOUNT}" + + local containers + containers=$(az_cmd storage container list \ + --account-name "$STORAGE_ACCOUNT" --auth-mode login \ + --output json 2>/dev/null) || die "Failed to list containers — check permissions" + + local count + count=$(echo "$containers" | jq 'length') + + printf " %-32s %-16s %-12s %s\n" \ + "CONTAINER" "PUBLIC_ACCESS" "LEASE_STATE" "LAST_MODIFIED" + printf " %s\n" "$(printf '%.0s─' {1..80})" + + echo "$containers" | jq -c '.[]' | while IFS= read -r ctr; do + local name public_access lease_state last_mod + name=$(echo "$ctr" | jq -r '.name') + public_access=$(echo "$ctr" | jq -r '.properties.publicAccess // "none"') + lease_state=$(echo "$ctr" | jq -r '.properties.leaseState // "available"') + last_mod=$(echo "$ctr" | jq -r '.properties.lastModified // ""' | cut -dT -f1) + + printf " %-32s %-16s %-12s %s\n" \ + "${name:0:31}" "$public_access" "$lease_state" 
# Security audit of storage accounts and their containers.
#
# Globals read:  RESOURCE_GROUP (optional filter), RED, YELLOW, GREEN, RESET
# Calls:         section_header, az_cmd, die, log, warn, print_summary,
#                flag_crit, flag_warn, flag_info, flag_ok
# Outputs:       an account table, then one row per container whose public
#                access level is anything other than "none", then the
#                summary.
#
# Per account, the first matching rule sets the severity:
#   public blob access allowed -> CRITICAL
#   HTTPS-only not enforced    -> WARN
#   network default is "Allow" -> WARN
#   otherwise                  -> OK
#
# Both loops read from process substitution rather than a pipeline so the
# flag_* calls run in this shell and print_summary sees the real counters.
do_audit() {
    section_header "Storage Security Audit"

    local accounts
    local args=(storage account list --output json)
    if [[ -n "$RESOURCE_GROUP" ]]; then
        args+=(--resource-group "$RESOURCE_GROUP")
    fi
    accounts=$(az_cmd "${args[@]}" 2>/dev/null) \
        || die "Failed to list storage accounts — check permissions"

    printf " %-28s %-16s %-14s %-14s %s\n" \
        "ACCOUNT" "HTTPS_ONLY" "PUBLIC_BLOB" "NETWORK_RULES" "SEVERITY"
    printf " %s\n" "$(printf '%.0s─' {1..95})"

    local -a account_names=()
    local name https_only public_access net_default severity color

    # jq's `//` treats an explicit false like a missing value, so
    # `.enableHttpsTrafficOnly // true` would report HTTPS-disabled accounts
    # as compliant. Only a genuinely absent (null) value gets the default.
    while IFS=$'\x1f' read -r name https_only public_access net_default; do
        account_names+=("$name")
        severity="OK"
        color="$GREEN"

        if [[ "$public_access" == "true" ]]; then
            severity="CRITICAL"; color="$RED"; flag_crit
        elif [[ "$https_only" != "true" ]]; then
            severity="WARN"; color="$YELLOW"; flag_warn
        elif [[ "$net_default" == "Allow" ]]; then
            severity="WARN"; color="$YELLOW"; flag_warn
        else
            flag_ok
        fi

        printf " %-28s %-16s %-14s %-14s %b%s%b\n" \
            "${name:0:27}" "$https_only" "$public_access" "$net_default" \
            "$color" "$severity" "$RESET"
    done < <(
        jq -r '.[]
            | [ .name,
                (.enableHttpsTrafficOnly | if . == null then true else . end),
                (.allowBlobPublicAccess // false),
                (.networkRuleSet.defaultAction // "Allow") ]
            | map(tostring) | join("\u001f")' <<<"$accounts"
    )

    echo ""

    # Check individual containers for public access
    log "Checking container-level public access..."
    echo ""

    local acct_name containers ctr_name ctr_access
    # The ${arr[@]+...} form keeps `set -u` quiet on an empty array in
    # bash releases older than 4.4.
    for acct_name in ${account_names[@]+"${account_names[@]}"}; do
        if ! containers=$(az_cmd storage container list \
            --account-name "$acct_name" --auth-mode login \
            --output json 2>/dev/null); then
            # An account that cannot be inspected is not a clean account.
            warn "Cannot list containers in ${acct_name} — container check skipped"
            flag_info
            continue
        fi

        while IFS=$'\x1f' read -r ctr_name ctr_access; do
            if [[ "$ctr_access" != "none" && "$ctr_access" != "null" ]]; then
                printf " %-28s %-28s %-14s %b%s%b\n" \
                    "${acct_name:0:27}" "${ctr_name:0:27}" "$ctr_access" \
                    "$RED" "CRITICAL" "$RESET"
                flag_crit
            fi
        done < <(
            jq -r '.[]
                | [ .name, (.properties.publicAccess // "none") ]
                | map(tostring) | join("\u001f")' <<<"$containers"
        )
    done

    print_summary
}
# Move every blob in a container to the requested access tier.
#
# Globals read:  STORAGE_ACCOUNT, CONTAINER_NAME, TIER_TARGET, GREEN, RED,
#                RESET
# Calls:         die, section_header, field, field_color, az_cmd
# Outputs:       one line per blob (✓ or ✗), then the totals.
# Returns:       0 when every blob was re-tiered, 1 when at least one
#                set-tier call failed. Missing options or a failed listing
#                exit through die.
#
# Blob names are collected into an array first so the counting loop runs
# in this shell. Inside a `jq | while` pipeline the counters would be
# updated in a subshell and the totals would always read 0.
do_tier() {
    [[ -n "$STORAGE_ACCOUNT" ]] || die "--tier requires --account"
    [[ -n "$CONTAINER_NAME" ]] || die "--tier requires --container"
    [[ -n "$TIER_TARGET" ]] || die "--tier requires --set-tier TIER"

    section_header "Changing Blob Tier"
    field "Account:" "$STORAGE_ACCOUNT"
    field "Container:" "$CONTAINER_NAME"
    field "Target tier:" "$TIER_TARGET"
    echo ""

    local blobs
    blobs=$(az_cmd storage blob list \
        --account-name "$STORAGE_ACCOUNT" --container-name "$CONTAINER_NAME" \
        --auth-mode login --output json 2>/dev/null) \
        || die "Failed to list blobs"

    local -a blob_names=()
    mapfile -t blob_names < <(jq -r '.[].name' <<<"$blobs")

    local changed=0 errors=0 blob_name
    # The ${arr[@]+...} form keeps `set -u` quiet on an empty array in
    # bash releases older than 4.4.
    for blob_name in ${blob_names[@]+"${blob_names[@]}"}; do
        if az_cmd storage blob set-tier \
            --account-name "$STORAGE_ACCOUNT" --container-name "$CONTAINER_NAME" \
            --name "$blob_name" --tier "$TIER_TARGET" \
            --auth-mode login 2>/dev/null; then
            # %s keeps backslashes in blob names literal (echo -e would not).
            printf ' %b✓%b %s → %s\n' "$GREEN" "$RESET" "$blob_name" "$TIER_TARGET"
            changed=$((changed + 1))
        else
            printf ' %b✗%b %s — failed\n' "$RED" "$RESET" "$blob_name"
            errors=$((errors + 1))
        fi
    done

    echo ""
    field "Changed:" "$changed"

    # An explicit if/return replaces the trailing `[[ ... ]] && ...`, which
    # made the function return 1 precisely when nothing had failed.
    if (( errors > 0 )); then
        field_color "Errors:" "${RED}${errors}${RESET}"
        return 1
    fi
    return 0
}
"$RESOURCE_GROUP" ]] && die "--lifecycle requires --resource-group" + + section_header "Lifecycle Management Policy" + field "Account:" "$STORAGE_ACCOUNT" + echo "" + + local policy + policy=$(az_cmd storage account management-policy show \ + --account-name "$STORAGE_ACCOUNT" --resource-group "$RESOURCE_GROUP" \ + --output json 2>/dev/null) + + if [[ -z "$policy" || "$policy" == "null" ]]; then + log "No lifecycle policy configured" + else + echo "$policy" | jq '.policy.rules[] | {name: .name, type: .type, definition: .definition}' + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# STATS +# ══════════════════════════════════════════════════════════════════════ +do_stats() { + section_header "Storage Statistics" + + local accounts + local args=(storage account list --output json) + [[ -n "$RESOURCE_GROUP" ]] && args+=(--resource-group "$RESOURCE_GROUP") + accounts=$(az_cmd "${args[@]}" 2>/dev/null) + + local total_accounts + total_accounts=$(echo "$accounts" | jq 'length') + + printf " %-28s %-16s %-12s %s\n" \ + "ACCOUNT" "LOCATION" "KIND" "REPLICATION" + printf " %s\n" "$(printf '%.0s─' {1..75})" + + echo "$accounts" | jq -c '.[]' | while IFS= read -r acct; do + local name location kind repl + name=$(echo "$acct" | jq -r '.name') + location=$(echo "$acct" | jq -r '.location') + kind=$(echo "$acct" | jq -r '.kind') + repl=$(echo "$acct" | jq -r '.sku.name') + + printf " %-28s %-16s %-12s %s\n" \ + "${name:0:27}" "$location" "${kind:0:11}" "$repl" + done + + echo "" + field "Total accounts:" "$total_accounts" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + echo "" + echo " ══════════════════════════════════════════" + echo " Storage Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" 
# Print a debug line when VERBOSE is "true".
#
# Globals read: VERBOSE, DIM, RESET
# Arguments:    message words, joined with spaces
# Outputs:      "[DEBUG] <message>" on stderr
# Returns:      always 0
#
# Two properties matter to callers:
#   * The status is 0 even when nothing is printed. A bare
#     `[[ ... ]] && printf` returns 1 with VERBOSE off, which ends the
#     script at the first call site if errexit is enabled.
#   * Output goes to stderr. query_costs calls this function and its
#     stdout is captured as JSON by main, so debug text on stdout would
#     corrupt the captured response.
log_debug() {
    if [[ "$VERBOSE" == "true" ]]; then
        printf '%b[DEBUG] %s%b\n' "$DIM" "$*" "$RESET" >&2
    fi
    return 0
}
# Number of whole days from one calendar date to another.
#
# Arguments: $1 - start date, YYYY-MM-DD
#            $2 - end date,   YYYY-MM-DD
# Outputs:   the signed day count on stdout (negative when $2 precedes $1)
#
# Both dates are converted at 00:00:00 UTC. Using local midnight makes a
# span that crosses a daylight-saving change 23 or 25 hours per day short
# or long, and the integer division then drops a day. The BSD branch also
# pins the time of day explicitly, because `date -j -f` fills fields that
# are missing from the format with the current clock time.
days_between() {
    local s="$1" e="$2"
    local start_epoch end_epoch

    if date --version >/dev/null 2>&1; then
        # GNU date
        start_epoch=$(date -u -d "$s" +%s)
        end_epoch=$(date -u -d "$e" +%s)
    else
        # macOS / BSD date
        start_epoch=$(date -u -j -f "%Y-%m-%d %H:%M:%S" "$s 00:00:00" +%s)
        end_epoch=$(date -u -j -f "%Y-%m-%d %H:%M:%S" "$e 00:00:00" +%s)
    fi

    echo $(( (end_epoch - start_epoch) / 86400 ))
}
mode: $RUN_MODE" + ;; + esac + + log_debug "Current period: $PERIOD_START → $PERIOD_END" + log_debug "Previous period: $PREV_START → $PREV_END" +} + +# ── Resolve subscription ───────────────────────────────────────────── +resolve_subscription() { + if [[ -n "$SUBSCRIPTION" ]]; then + log_debug "Using subscription: $SUBSCRIPTION" + return + fi + SUBSCRIPTION=$(az account show --query 'id' -o tsv 2>/dev/null) \ + || die "Cannot determine subscription. Use --subscription or az account set." + log_debug "Resolved subscription: $SUBSCRIPTION" +} + +# ── Build Cost Management query payload ────────────────────────────── +build_query_payload() { + local start="$1" end="$2" + local grouping_name grouping_type + + case "$GROUP_BY" in + SERVICE) + grouping_name="ServiceName" + grouping_type="Dimension" + ;; + RESOURCE_GROUP) + grouping_name="ResourceGroupName" + grouping_type="Dimension" + ;; + TAG) + if [[ -z "$COST_TAG_KEY" ]]; then + die "--group-by TAG requires --tag KEY=VALUE" + fi + grouping_name="$COST_TAG_KEY" + grouping_type="TagKey" + ;; + *) + die "Invalid --group-by value: $GROUP_BY (expected SERVICE, RESOURCE_GROUP, or TAG)" + ;; + esac + + local filter_block="{}" + if [[ -n "$COST_TAG_KEY" && -n "$COST_TAG_VALUE" && "$GROUP_BY" != "TAG" ]]; then + filter_block=$(jq -n \ + --arg key "$COST_TAG_KEY" \ + --arg val "$COST_TAG_VALUE" \ + '{ + "Tags": { + "Name": $key, + "Operator": "In", + "Values": [$val] + } + }') + fi + + local payload + if [[ "$filter_block" == "{}" ]]; then + payload=$(jq -n \ + --arg start "$start" \ + --arg end "$end" \ + --arg gname "$grouping_name" \ + --arg gtype "$grouping_type" \ + '{ + "type": "ActualCost", + "dataSet": { + "granularity": "None", + "aggregation": { + "totalCost": { + "name": "Cost", + "function": "Sum" + } + }, + "grouping": [ + { + "type": $gtype, + "name": $gname + } + ] + }, + "timeframe": "Custom", + "timePeriod": { + "from": $start, + "to": $end + } + }') + else + payload=$(jq -n \ + --arg start "$start" \ + --arg 
# Format the relative change between two amounts as a signed percentage.
#
# Arguments: $1 - current amount
#            $2 - previous amount
# Outputs:   "N/A" when the previous amount is zero; otherwise the change
#            with one decimal place and a "+" prefix for increases,
#            e.g. "+5.0%", "-12.5%", "0.0%".
#
# The calculation is done in awk at full floating-point precision and is
# rounded once, at the end. With bc and `scale=1` the division itself was
# truncated to one decimal before the multiplication by 100, so every
# change smaller than 10% came out as 0. awk also accepts exponent
# notation (1.2e-05), which bc rejects.
fmt_delta() {
    local curr="$1" prev="$2"

    # LC_ALL=C keeps "." as the decimal separator regardless of locale.
    LC_ALL=C awk -v curr="$curr" -v prev="$prev" 'BEGIN {
        if (prev + 0 == 0) {
            print "N/A"
            exit
        }
        pct = sprintf("%.1f", ((curr - prev) / prev) * 100)
        if (pct + 0 == 0) {
            pct = "0.0"    # never print "-0.0"
        }
        printf "%s%s%%\n", (pct + 0 > 0 ? "+" : ""), pct
    }'
}
# Emit the cost comparison as CSV.
#
# Arguments:    $1 - NAME of an associative array: key -> current cost
#               $2 - NAME of an associative array: key -> previous cost
#               (namerefs: the caller's arrays must not themselves be
#               called curr_data or prev_data)
# Globals read: GROUP_BY (selects the name of the first column)
# Outputs:      a header row, then one row per key of the current array,
#               in byte-wise key order so repeated runs are comparable.
#
# delta_pct has two decimals and is "0" when the previous cost is zero.
# It is computed in awk and rounded once at the end; bc with `scale=2`
# truncated the quotient before the multiplication by 100 and could only
# produce whole percentages. Double quotes inside a key are doubled, as
# CSV quoting requires.
output_csv() {
    local -n curr_data=$1
    local -n prev_data=$2

    local label="service"
    case "$GROUP_BY" in
        RESOURCE_GROUP) label="resource_group" ;;
        TAG)            label="tag" ;;
    esac

    echo "${label},cost,previous_cost,delta_pct"

    # Nothing to list; also avoids iterating over an empty key below.
    if (( ${#curr_data[@]} == 0 )); then
        return 0
    fi

    local -a keys=()
    mapfile -t keys < <(printf '%s\n' "${!curr_data[@]}" | LC_ALL=C sort)

    local key cost prev_cost pct
    for key in "${keys[@]}"; do
        cost="${curr_data[$key]}"
        prev_cost="${prev_data[$key]:-0}"
        pct=$(LC_ALL=C awk -v c="$cost" -v p="$prev_cost" 'BEGIN {
            if (p + 0 == 0) { print "0"; exit }
            printf "%.2f\n", ((c - p) / p) * 100
        }')
        printf '"%s",%s,%s,%s\n' "${key//\"/\"\"}" "$cost" "$prev_cost" "$pct"
    done
}
label="resource_group" ;; + TAG) label="tag" ;; + esac + local items=() + for key in "${!curr_data[@]}"; do + items+=("{\"${label}\": \"${key}\", \"cost\": ${curr_data[$key]}, \"previous_cost\": ${prev_data[$key]:-0}}") + done + local joined + joined=$(printf ",%s" "${items[@]}") + joined="${joined:1}" + printf '{"mode":"%s","period_start":"%s","period_end":"%s","previous_start":"%s","previous_end":"%s","group_by":"%s","items":[%s]}\n' \ + "$RUN_MODE" "$PERIOD_START" "$PERIOD_END" "$PREV_START" "$PREV_END" "$GROUP_BY" "$joined" +} + +# ── Render report ───────────────────────────────────────────────────── +render_report() { + local curr_raw="$1" prev_raw="$2" + + # Parse into associative arrays + declare -A curr_costs + declare -A prev_costs + + while IFS=$'\t' read -r key amount; do + [[ -z "$key" ]] && continue + curr_costs["$key"]="$amount" + done <<< "$(parse_costs "$curr_raw")" + + while IFS=$'\t' read -r key amount; do + [[ -z "$key" ]] && continue + prev_costs["$key"]="$amount" + done <<< "$(parse_costs "$prev_raw")" + + # Ensure previous-only keys appear in current with 0 + for key in "${!prev_costs[@]}"; do + if [[ -z "${curr_costs[$key]+x}" ]]; then + curr_costs["$key"]="0" + fi + done + + case "$OUTPUT_FORMAT" in + text) + print_header + local title="Cost Breakdown — ${PERIOD_START} → ${PERIOD_END}" + echo "$title" + output_text_table curr_costs prev_costs + echo "" + ;; + csv) + output_csv curr_costs prev_costs + ;; + json) + output_json curr_costs prev_costs + ;; + *) + die "Unknown format: $OUTPUT_FORMAT" + ;; + esac +} + +# ── Slack webhook ───────────────────────────────────────────────────── +send_slack() { + local report="$1" webhook="$2" + + log_info "Posting report to Slack..." + + # Truncate for Slack message limits + local max_len=3000 + local body="$report" + if (( ${#body} > max_len )); then + body="${body:0:$max_len} + +... 
(truncated — full report exceeds Slack message limit)" + fi + + local payload + payload=$(jq -n --arg text "\`\`\`${body}\`\`\`" '{ text: $text }') + + local http_code + http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d "$payload" \ + "$webhook") + + if [[ "$http_code" != "200" ]]; then + log_error "Slack webhook returned HTTP $http_code" + return 1 + fi + + log_info "Slack message posted" +} + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat < 0 )); do + case "$1" in + --daily|--weekly|--monthly) + RUN_MODE="${1#--}"; shift ;; + --custom) + RUN_MODE="custom" + [[ $# -lt 3 ]] && die "--custom requires START and END dates" + CUSTOM_START="$2"; CUSTOM_END="$3" + validate_date "$CUSTOM_START"; validate_date "$CUSTOM_END" + shift 3 ;; + --group-by) + [[ $# -lt 2 ]] && die "--group-by requires a value" + GROUP_BY="$2"; shift 2 ;; + --tag) + [[ $# -lt 2 ]] && die "--tag requires KEY=VALUE" + [[ "$2" != *"="* ]] && die "--tag value must be KEY=VALUE" + COST_TAG_KEY="${2%%=*}"; COST_TAG_VALUE="${2#*=}"; shift 2 ;; + --subscription) + [[ $# -lt 2 ]] && die "--subscription requires a value" + SUBSCRIPTION="$2"; shift 2 ;; + --format) + [[ $# -lt 2 ]] && die "--format requires a value" + OUTPUT_FORMAT="$2"; shift 2 ;; + --slack) + [[ $# -lt 2 ]] && die "--slack requires a webhook URL" + SLACK_URL="$2"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help|-h) usage ;; + *) die "Unknown option: $1 (see --help)" ;; + esac + done + + if [[ -z "$RUN_MODE" ]]; then log_error "No mode specified"; echo ""; usage; exit 1; fi + [[ -z "$SLACK_URL" && -n "$SLACK_WEBHOOK_URL" ]] && SLACK_URL="$SLACK_WEBHOOK_URL" + + case "$GROUP_BY" in + SERVICE|RESOURCE_GROUP|TAG) ;; + *) die "Invalid --group-by: $GROUP_BY" ;; + esac + case "$OUTPUT_FORMAT" in + text|csv|json) ;; + *) die "Invalid --format: $OUTPUT_FORMAT" ;; + esac +} + +# ── Main 
────────────────────────────────────────────────────────────── +main() { + parse_args "$@" + setup_colors + check_deps + + START_TIME=$(date +%s) + + # Validate Azure credentials + log_debug "Validating Azure credentials..." + az account show >/dev/null 2>&1 \ + || die "Azure credentials not configured — run 'az login' first" + + resolve_subscription + compute_ranges + + log_info "Querying Cost Management ($RUN_MODE, group by $GROUP_BY)..." + + local curr_raw prev_raw + curr_raw="$(query_costs "$PERIOD_START" "$PERIOD_END")" + prev_raw="$(query_costs "$PREV_START" "$PREV_END")" + + if [[ -z "$curr_raw" ]]; then + die "No cost data returned for $PERIOD_START → $PERIOD_END" + fi + + local report + report="$(render_report "$curr_raw" "$prev_raw")" + + # Output to stdout + echo "$report" + + # Slack delivery + if [[ -n "$SLACK_URL" ]]; then + send_slack "$report" "$SLACK_URL" + fi + + local elapsed=$(( $(date +%s) - START_TIME )) + log_info "Completed in ${elapsed}s" +} + +main "$@" diff --git a/azure-snapshot-manager.sh b/azure-snapshot-manager.sh new file mode 100644 index 0000000..c836fd3 --- /dev/null +++ b/azure-snapshot-manager.sh @@ -0,0 +1,726 @@ +#!/usr/bin/env bash + +######################################################################################### +#### azure-snapshot-manager.sh — Create, rotate, list, audit, and restore Azure #### +#### managed disk snapshots via az CLI. Automated retention and fleet-wide ops #### +#### Requires: bash 4+, az CLI, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./azure-snapshot-manager.sh --snapshot --all #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +ALSO_ROTATE="false" +VM_NAME="" +RESOURCE_GROUP="" +TARGET_ALL="false" +SNAPSHOT_ID="" +KEEP="${ASM_KEEP:-3}" +PREFIX="${ASM_PREFIX:-auto}" +MAX_AGE="${ASM_MAX_AGE:-7}" +OUTPUT_FORMAT="${ASM_FORMAT:-text}" +DRY_RUN="true" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +SUBSCRIPTION="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +SNAP_CREATED=0 +SNAP_DELETED=0 +SNAP_ERRORS=0 + +# ── Dependency and credential checks ──────────────────────────────── +check_deps() { + command -v az &>/dev/null || die "az CLI is required (install: 
# ══════════════════════════════════════════════════════════════════════
# SNAPSHOT
# ══════════════════════════════════════════════════════════════════════
# Snapshot the OS disk of one VM (VM_NAME) or of every VM returned by
# get_all_vms (TARGET_ALL=true). Each snapshot is named
# "<PREFIX>-<vm>-<timestamp>" and tagged with managed-by / source-vm.
# Updates SNAP_CREATED and SNAP_ERRORS, prints a per-VM result line plus a
# summary, and calls do_rotate afterwards when ALSO_ROTATE is "true".
do_snapshot() {
  local vm_json
  vm_json=$(get_all_vms)

  local vms
  if [[ "$TARGET_ALL" == "true" ]]; then
    vms="$vm_json"
  elif [[ -n "$VM_NAME" ]]; then
    vms=$(echo "$vm_json" | jq --arg n "$VM_NAME" '[.[] | select(.name == $n)]')
  else
    die "Specify --vm NAME or --all"
  fi

  local count
  count=$(echo "$vms" | jq 'length')
  [[ "$count" -eq 0 ]] && die "No VMs found"

  local target_label="$VM_NAME"
  [[ "$TARGET_ALL" == "true" ]] && target_label="all (${count} VMs)"

  section_header "Creating Snapshots"
  field "Target:" "$target_label"
  field "Prefix:" "$PREFIX"
  echo ""

  # The loop must run in the current shell: as the tail of a pipeline it would
  # execute in a subshell and every SNAP_CREATED / SNAP_ERRORS increment would
  # be discarded before the summary below. The VM list is fed on FD 3 so that
  # commands inside the loop cannot swallow it through stdin.
  local vm name rg disk_id snap_name
  while IFS= read -r -u 3 vm; do
    name=$(echo "$vm" | jq -r '.name')
    rg=$(echo "$vm" | jq -r '.resourceGroup')
    # A failed lookup is counted as an error for this VM instead of letting
    # errexit terminate the whole run.
    disk_id=$(get_vm_os_disk_id "$name" "$rg") || disk_id=""
    snap_name="${PREFIX}-${name}-$(date +%Y%m%d-%H%M%S)"

    if [[ -z "$disk_id" ]]; then
      echo -e " ${RED}✗${RESET} ${name} (${rg}) no OS disk found"
      ((SNAP_ERRORS++)) || true
      continue
    fi

    verbose "Snapshotting ${name} disk ${disk_id}"

    if az_cmd snapshot create \
      --resource-group "$rg" \
      --name "$snap_name" \
      --source "$disk_id" \
      --tags "managed-by=${SCRIPT_NAME}" "source-vm=${name}" \
      --output none 2>/dev/null; then
      echo -e " ${GREEN}✓${RESET} ${name} (${rg}) ${snap_name}"
      ((SNAP_CREATED++)) || true
    else
      echo -e " ${RED}✗${RESET} ${name} (${rg}) failed"
      ((SNAP_ERRORS++)) || true
    fi

    sleep 1
  done 3< <(echo "$vms" | jq -c '.[]')

  echo ""
  field_color "Created:" "${GREEN}${SNAP_CREATED}${RESET}"
  if [[ "$SNAP_ERRORS" -gt 0 ]]; then
    field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}"
  fi

  if [[ "$ALSO_ROTATE" == "true" ]]; then
    do_rotate
  fi
}
# ══════════════════════════════════════════════════════════════════════
# LIST
# ══════════════════════════════════════════════════════════════════════
# Print one table row per snapshot returned by list_snapshots: name, resource
# group, size in GB, age in whole days and the "source-vm" tag ("manual" when
# the tag is absent), followed by the total number of snapshots.
do_list() {
  section_header "All Snapshots"

  local all_snaps
  all_snaps=$(list_snapshots)
  local total
  total=$(jq 'length' <<< "$all_snaps")

  if [[ "$total" -eq 0 ]]; then
    log "No snapshots found"
    return
  fi

  printf " %-36s %-16s %-8s %-12s %s\n" \
    "NAME" "RESOURCE_GROUP" "SIZE_GB" "AGE" "SOURCE_VM"
  printf " %s\n" "$(printf '%.0s─' {1..90})"

  local now_epoch
  now_epoch=$(date +%s)

  jq -c '.[]' <<< "$all_snaps" | while IFS= read -r entry; do
    local snap_name snap_rg gigabytes created_at origin age_label
    snap_name=$(jq -r '.name' <<< "$entry")
    snap_rg=$(jq -r '.resourceGroup' <<< "$entry")
    gigabytes=$(jq -r '.diskSizeGb // 0' <<< "$entry")
    created_at=$(jq -r '.timeCreated // ""' <<< "$entry")
    origin=$(jq -r '.tags["source-vm"] // "manual"' <<< "$entry")

    # Age stays "unknown" unless the creation time exists and date can parse it
    age_label="unknown"
    if [[ -n "$created_at" ]]; then
      local created_epoch
      created_epoch=$(date -d "$created_at" +%s 2>/dev/null || echo 0)
      if [[ "$created_epoch" -gt 0 ]]; then
        age_label="$(( (now_epoch - created_epoch) / 86400 ))d"
      fi
    fi

    # Columns are clipped so long names do not break the table layout
    printf " %-36s %-16s %-8s %-12s %s\n" \
      "${snap_name:0:35}" "${snap_rg:0:15}" "$gigabytes" "$age_label" "${origin:0:20}"
  done

  echo ""
  field "Total snapshots:" "$total"
}
# ══════════════════════════════════════════════════════════════════════
# RESTORE
# ══════════════════════════════════════════════════════════════════════
# Replace a VM's OS disk with a new managed disk created from a snapshot.
# Requires VM_NAME, SNAPSHOT_ID and RESOURCE_GROUP. Without FORCE=true it only
# prints a warning and returns. Sequence: create disk → resolve its id →
# deallocate VM → swap OS disk → start VM. Any failing step ends the script
# through die.
do_restore() {
  [[ -z "$VM_NAME" ]] && die "--restore requires --vm NAME"
  [[ -z "$SNAPSHOT_ID" ]] && die "--restore requires --snapshot-id NAME"
  [[ -z "$RESOURCE_GROUP" ]] && die "--restore requires --resource-group RG"

  section_header "Restore from Snapshot"
  field "VM:" "$VM_NAME"
  field "Snapshot:" "$SNAPSHOT_ID"
  field "Resource Group:" "$RESOURCE_GROUP"
  echo ""

  if [[ "$FORCE" != "true" ]]; then
    warn "This will replace the VM's OS disk. Use --force to confirm."
    return
  fi

  log "Creating disk from snapshot..."
  local disk_name
  disk_name="restored-${VM_NAME}-$(date +%Y%m%d-%H%M%S)"
  local snap_id
  snap_id=$(az_cmd snapshot show --name "$SNAPSHOT_ID" --resource-group "$RESOURCE_GROUP" \
    --query 'id' --output tsv 2>/dev/null) || die "Snapshot not found: ${SNAPSHOT_ID}"
  # The query can succeed and still print nothing; treat that as "not found"
  [[ -n "$snap_id" ]] || die "Snapshot not found: ${SNAPSHOT_ID}"

  if az_cmd disk create \
    --resource-group "$RESOURCE_GROUP" \
    --name "$disk_name" \
    --source "$snap_id" \
    --output none 2>/dev/null; then
    echo -e " ${GREEN}✓${RESET} Disk created: ${disk_name}"
  else
    die "Failed to create disk from snapshot"
  fi

  # Resolve the new disk's id while the VM is still running, so a failed
  # lookup cannot leave the VM stopped with nothing to attach.
  local new_disk_id
  new_disk_id=$(az_cmd disk show --name "$disk_name" --resource-group "$RESOURCE_GROUP" \
    --query 'id' --output tsv 2>/dev/null) || new_disk_id=""
  [[ -n "$new_disk_id" ]] || die "Could not resolve ID of new disk: ${disk_name}"

  log "Deallocating VM..."
  az_cmd vm deallocate --name "$VM_NAME" --resource-group "$RESOURCE_GROUP" \
    --output none 2>/dev/null || die "Failed to deallocate VM"

  log "Swapping OS disk..."
  if az_cmd vm update --name "$VM_NAME" --resource-group "$RESOURCE_GROUP" \
    --os-disk "$new_disk_id" --output none 2>/dev/null; then
    echo -e " ${GREEN}✓${RESET} OS disk swapped"
  else
    die "Failed to swap OS disk — VM ${VM_NAME} is left deallocated"
  fi

  log "Starting VM..."
  # Report success only when the start command actually succeeded
  if az_cmd vm start --name "$VM_NAME" --resource-group "$RESOURCE_GROUP" \
    --output none 2>/dev/null; then
    echo -e " ${GREEN}✓${RESET} VM started"
  else
    die "Failed to start VM: ${VM_NAME}"
  fi
}
══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording ───────────────────────────────────────────── +record_pass() { + local name="$1" detail="${2:-}" + ((PASS++)) || true; ((TOTAL++)) || true + RESULTS+=("PASS|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name}" + else echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_fail() { + local name="$1" detail="${2:-}" + ((FAIL++)) || true; ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_skip() { + local name="$1" reason="${2:-}" + ((SKIP++)) || true; ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"; fi +} + +# ── Helpers ─────────────────────────────────────────────────────────── +has_cmd() { command -v "$1" >/dev/null 2>&1; } +restore_ok() { find "${RESTORE_TMP}" -type f | grep -q .; } +require_tool() { if ! has_cmd "$1"; then record_skip "$2" "$1 not installed"; return 1; fi; } + +# ── Cleanup ─────────────────────────────────────────────────────────── +# shellcheck disable=SC2317 +cleanup() { [[ -n "${RESTORE_TMP}" && -d "${RESTORE_TMP}" ]] && rm -rf "${RESTORE_TMP}"; } +trap cleanup EXIT + +# ══════════════════════════════════════════════════════════════════════ +# TEST SUITES +# ══════════════════════════════════════════════════════════════════════ + +# ── 1. 
Repository Health ───────────────────────────────────────────── +test_repo_health() { + echo "" + echo -e "${BOLD}Repository Health${RESET}" + # 1a. Mount check (if configured) + if [[ -n "${MOUNT_CHECK}" ]]; then + if mountpoint -q "${MOUNT_CHECK}" 2>/dev/null; then + record_pass "Mount check" "${MOUNT_CHECK} is mounted" + else + record_fail "Mount check" "${MOUNT_CHECK} is not mounted" + warn "Skipping remaining tests — mount not available" + return + fi + fi + # 1b. Backup exists + case "${BACKUP_TYPE}" in + restic) + require_tool restic "Repository exists" || return + if restic cat config >/dev/null 2>&1; then record_pass "Repository exists" + else record_fail "Repository exists" "not accessible"; fi ;; + borg) + require_tool borg "Repository exists" || return + if borg info 2>/dev/null | grep -q "Repository ID"; then record_pass "Repository exists" + else record_fail "Repository exists" "not accessible"; fi ;; + directory|rsnapshot) + if [[ -d "${BACKUP_DIR}" ]]; then record_pass "Backup directory exists" + else record_fail "Backup directory exists" "${BACKUP_DIR} not found"; fi ;; + esac + # 1c. 
# ── 2. Backup Status ─────────────────────────────────────────────────
# Records the "Recent backup" result for the configured BACKUP_TYPE and then
# runs test_size and test_snapshot_count.
# The newest backup time comes from `restic snapshots --latest 1`, the last
# line of `borg list`, or the newest entry directly under BACKUP_DIR. The test
# passes when that time is at most MAX_AGE_HOURS old. A timestamp that cannot
# be extracted or converted is recorded as a failure rather than silently
# producing no result.
test_backup_status() {
  echo ""
  echo -e "${BOLD}Backup Status${RESET}"
  # 2a. Recent backup
  local last_ts="" raw_time="" problem="" max_age_s=$((MAX_AGE_HOURS * 3600))
  case "${BACKUP_TYPE}" in
    restic)
      require_tool restic "Recent backup" || { test_size; test_snapshot_count; return; }
      local latest
      latest=$(restic snapshots --json --latest 1 2>/dev/null) || true
      if [[ -z "${latest}" || "${latest}" == "[]" || "${latest}" == "null" ]]; then
        problem="no snapshots found"
      else
        # `|| true`: a non-matching grep must not abort a pipefail/errexit shell
        raw_time=$(echo "${latest}" | grep -oP '"time"\s*:\s*"\K[^"]+' | head -1) || true
        if [[ -z "${raw_time}" ]]; then problem="could not parse snapshot time"; fi
      fi ;;
    borg)
      require_tool borg "Recent backup" || { test_size; test_snapshot_count; return; }
      raw_time=$(borg list --format '{time}{NL}' 2>/dev/null | tail -1) || true
      if [[ -z "${raw_time}" ]]; then problem="no archives found"; fi ;;
    directory|rsnapshot)
      local newest
      # Prefer sub-directories; fall back to any entry directly under BACKUP_DIR
      newest=$(find "${BACKUP_DIR}" -maxdepth 1 -mindepth 1 -type d -printf '%T@\n' 2>/dev/null | sort -rn | head -1) || true
      if [[ -z "${newest}" ]]; then
        newest=$(find "${BACKUP_DIR}" -maxdepth 1 -mindepth 1 -printf '%T@\n' 2>/dev/null | sort -rn | head -1) || true
      fi
      if [[ -z "${newest}" ]]; then problem="no backups found in ${BACKUP_DIR}"
      else last_ts="${newest%%.*}"; fi ;;
  esac

  # restic/borg deliver a textual time that still has to become an epoch
  if [[ -z "${problem}" && -n "${raw_time}" ]]; then
    last_ts=$(date -d "${raw_time}" +%s 2>/dev/null) || last_ts=""
    if [[ -z "${last_ts}" ]]; then problem="could not parse backup time '${raw_time}'"; fi
  fi

  if [[ -n "${problem}" ]]; then
    record_fail "Recent backup" "${problem}"
  elif [[ -n "${last_ts}" ]]; then
    local now_ts age_s age_h
    now_ts=$(date +%s); age_s=$((now_ts - last_ts)); age_h=$((age_s / 3600))
    if [[ ${age_s} -le ${max_age_s} ]]; then record_pass "Recent backup" "${age_h}h ago (max ${MAX_AGE_HOURS}h)"
    else record_fail "Recent backup" "${age_h}h ago (max ${MAX_AGE_HOURS}h)"; fi
  fi
  test_size
  test_snapshot_count
}
2>/dev/null) || true + total_bytes=$(echo "${stats}" | grep -oP '"total_size"\s*:\s*\K[0-9]+' | head -1) || true + [[ -n "${total_bytes}" ]] && size_mb=$((total_bytes / 1048576)) ;; + borg) + require_tool borg "Backup size" || return + local size_str num unit + size_str=$(borg info 2>/dev/null | grep -i "all archives" | grep -oP '[0-9.]+\s*(TB|GB|MB|kB)' | head -1) || true + if [[ -n "${size_str}" ]]; then + num=$(echo "${size_str}" | grep -oP '[0-9.]+'); unit=$(echo "${size_str}" | grep -oP '[A-Za-z]+') + case "${unit}" in + TB) size_mb=$(echo "${num} * 1048576" | bc 2>/dev/null | cut -d. -f1) || size_mb=999999 ;; + GB) size_mb=$(echo "${num} * 1024" | bc 2>/dev/null | cut -d. -f1) || size_mb=999999 ;; + MB) size_mb=$(echo "${num}" | cut -d. -f1) ;; + kB) size_mb=0 ;; + esac + fi ;; + directory|rsnapshot) + size_mb=$(du -sm "${BACKUP_DIR}" 2>/dev/null | awk '{print $1}') || size_mb=0 ;; + esac + if [[ ${size_mb} -ge ${MIN_SIZE_MB} ]]; then record_pass "Backup size" "${size_mb} MB (min ${MIN_SIZE_MB} MB)" + else record_fail "Backup size" "${size_mb} MB < ${MIN_SIZE_MB} MB"; fi +} + +test_snapshot_count() { + local count=0 + case "${BACKUP_TYPE}" in + restic) + require_tool restic "Snapshot count" || return + count=$(restic snapshots --json 2>/dev/null | grep -c '"time"') || count=0 ;; + borg) + require_tool borg "Snapshot count" || return + count=$(borg list 2>/dev/null | wc -l) || count=0 ;; + directory) + count=$(find "${BACKUP_DIR}" -maxdepth 1 -mindepth 1 | wc -l) || count=0 ;; + rsnapshot) + count=$(find "${BACKUP_DIR}" -maxdepth 1 -mindepth 1 -type d | wc -l) || count=0 ;; + esac + if [[ ${count} -ge ${MIN_SNAPSHOTS} ]]; then record_pass "Snapshot count" "${count} (min ${MIN_SNAPSHOTS})" + else record_fail "Snapshot count" "${count} < ${MIN_SNAPSHOTS}"; fi +} + +# ── 3. Integrity ───────────────────────────────────────────────────── +test_integrity_suite() { + echo "" + echo -e "${BOLD}Integrity${RESET}" + # 3a. 
Integrity check + if [[ "${SKIP_INTEGRITY}" == "true" ]]; then + record_skip "Integrity check" "SKIP_INTEGRITY=true" + else + case "${BACKUP_TYPE}" in + restic) + require_tool restic "Integrity check" || return + if restic check 2>/dev/null; then record_pass "Integrity check" + else record_fail "Integrity check" "restic check failed"; fi ;; + borg) + require_tool borg "Integrity check" || return + if borg check 2>/dev/null; then record_pass "Integrity check" + else record_fail "Integrity check" "borg check failed"; fi ;; + directory|rsnapshot) + record_skip "Integrity check" "not applicable for ${BACKUP_TYPE}" ;; + esac + fi + # 3b. Lock check + case "${BACKUP_TYPE}" in + restic) + require_tool restic "Lock check" || return + local lock_output + lock_output=$(restic list locks 2>/dev/null) || true + if [[ -z "${lock_output}" ]]; then record_pass "Lock check" "no stale locks" + else record_fail "Lock check" "$(echo "${lock_output}" | wc -l) lock(s) found"; fi ;; + borg) + require_tool borg "Lock check" || return + if borg info 2>&1 | grep -qi "lock"; then record_fail "Lock check" "repository appears locked" + else record_pass "Lock check" "no stale locks"; fi ;; + directory|rsnapshot) + local lc + lc=$(find "${BACKUP_DIR}" -maxdepth 1 \( -name "*.lock" -o -name ".lock" \) 2>/dev/null | wc -l) || lc=0 + if [[ ${lc} -eq 0 ]]; then record_pass "Lock check" "no stale locks" + else record_fail "Lock check" "${lc} lock file(s) in ${BACKUP_DIR}"; fi ;; + esac +} + +# ── 4. 
Recovery ────────────────────────────────────────────────────── +test_recovery() { + echo "" + echo -e "${BOLD}Recovery${RESET}" + if [[ "${SKIP_RESTORE}" == "true" ]]; then record_skip "Test restore" "SKIP_RESTORE=true"; return; fi + RESTORE_TMP=$(mktemp -d /tmp/backup-smoke-test-XXXXXX) + case "${BACKUP_TYPE}" in + restic) + require_tool restic "Test restore" || return + restic_restore "${RESTORE_TEST_FILE}" ;; + borg) + require_tool borg "Test restore" || return + borg_restore "${RESTORE_TEST_FILE}" ;; + directory) + dir_restore "${RESTORE_TEST_FILE:+${BACKUP_DIR}/${RESTORE_TEST_FILE}}" ;; + rsnapshot) + dir_restore "${RESTORE_TEST_FILE:+${BACKUP_DIR}/${RESTORE_TEST_FILE}}" ;; + esac +} + +restic_restore() { + local target="${1:-}" + if [[ -z "${target}" ]]; then + target=$(restic ls latest 2>/dev/null | head -1) || true + [[ -z "${target}" ]] && { record_skip "Test restore" "no files in latest snapshot"; return; } + fi + if restic restore latest --target "${RESTORE_TMP}" --include "${target}" 2>/dev/null && restore_ok; then + record_pass "Test restore" "file restored successfully" + else record_fail "Test restore" "restic restore failed"; fi +} + +borg_restore() { + local archive target + archive=$(borg list --format '{archive}{NL}' 2>/dev/null | tail -1) || true + [[ -z "${archive}" ]] && { record_skip "Test restore" "no archives found"; return; } + target="${1:-}" + if [[ -z "${target}" ]]; then + target=$(borg list "::${archive}" --format '{path}{NL}' 2>/dev/null | grep -v '/$' | head -1) || true + [[ -z "${target}" ]] && { record_skip "Test restore" "no files in latest archive"; return; } + fi + if (cd "${RESTORE_TMP}" && borg extract "::${archive}" "${target}" 2>/dev/null) && restore_ok; then + record_pass "Test restore" "file restored successfully" + else record_fail "Test restore" "borg extract failed"; fi +} + +dir_restore() { + local src_file="${1:-}" + [[ -z "${src_file}" ]] && src_file=$(find "${BACKUP_DIR}" -type f 2>/dev/null | head -1) + [[ -z 
"${src_file}" || ! -f "${src_file}" ]] && { record_skip "Test restore" "no files in backup directory"; return; } + local dest_file + dest_file="${RESTORE_TMP}/$(basename "${src_file}")" + if cp "${src_file}" "${dest_file}" 2>/dev/null && [[ -f "${dest_file}" ]]; then + record_pass "Test restore" "file copied successfully" + else record_fail "Test restore" "copy failed"; fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${BACKUP_TYPE} ${BACKUP_REPO:-${BACKUP_DIR}}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + if [[ $FAIL -eq 0 ]]; then echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"; fi +} + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +write_junit() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + for result in "${RESULTS[@]}"; do + local status name detail + IFS='|' read -r status name detail <<< "$result" + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo "$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + echo " " >> "$JUNIT_FILE" + case "$status" in + PASS) [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" ;; + FAIL) echo " FAILED: ${name} — ${detail}" >> "$JUNIT_FILE" ;; + SKIP) echo " " >> "$JUNIT_FILE" ;; + esac + echo " " >> "$JUNIT_FILE" + done + echo " " >> "$JUNIT_FILE" + echo "" >> 
# Render a byte count with binary units (B, KiB, MiB, GiB), one decimal place
# for anything >= 1 KiB.
# Arguments: $1 - non-negative integer byte count
# Outputs:   "<n> B" (with newline) or "<x.y> <unit>" (no trailing newline)
# A value that is not a plain run of digits is echoed back as "<value> B"
# instead of being fed to arithmetic evaluation or awk.
human_bytes() {
    local bytes="$1"

    if ! [[ "$bytes" =~ ^[0-9]+$ ]]; then
        echo "${bytes} B"
        return
    fi
    # Force base 10 so a leading zero is not read as octal.
    bytes=$((10#$bytes))

    local divisor unit
    if (( bytes >= 1073741824 )); then
        divisor=1073741824 unit="GiB"
    elif (( bytes >= 1048576 )); then
        divisor=1048576 unit="MiB"
    elif (( bytes >= 1024 )); then
        divisor=1024 unit="KiB"
    else
        echo "${bytes} B"
        return
    fi

    # Values are passed with -v rather than spliced into the awk program text.
    awk -v b="$bytes" -v d="$divisor" -v u="$unit" \
        'BEGIN { printf "%.1f %s", b / d, u }'
}

# Render a duration in seconds as "<d>d <h>h", "<h>h <m>m" or "<m>m",
# using the two most significant units.
# Arguments: $1 - duration in seconds (integer)
human_age() {
    local seconds="$1"
    local days=$((seconds / 86400))
    local hours=$(( (seconds % 86400) / 3600 ))
    local mins=$(( (seconds % 3600) / 60 ))

    if [[ "$days" -gt 0 ]]; then
        echo "${days}d ${hours}h"
    elif [[ "$hours" -gt 0 ]]; then
        echo "${hours}h ${mins}m"
    else
        echo "${mins}m"
    fi
}

# Convert an age string (24h, 7d, 2w) to seconds.
# Arguments: $1 - <digits> followed by an optional unit h, d or w
#                 (either case); a bare number is treated as hours.
# Outputs:   the number of seconds on stdout
# Returns:   1 (with a message on stderr) when the string is malformed.
parse_age_to_seconds() {
    local age="$1"

    if ! [[ "$age" =~ ^([0-9]+)([hdwHDW]?)$ ]]; then
        echo "Invalid age '${age}' (expected <number>[h|d|w])" >&2
        return 1
    fi

    # 10# keeps values such as "08" from being rejected as invalid octal.
    local num=$((10#${BASH_REMATCH[1]}))
    local unit="${BASH_REMATCH[2],,}"

    case "$unit" in
        d) echo $((num * 86400)) ;;
        w) echo $((num * 604800)) ;;
        *) echo $((num * 3600)) ;;   # "h" or no unit
    esac
}

# Convert a size string (1, 1K, 1M, 1G) to bytes.
# Arguments: $1 - <digits> with an optional K, M or G multiplier (either
#                 case, powers of 1024) and an optional trailing "B"/"iB".
# Outputs:   the number of bytes on stdout
# Returns:   1 (with a message on stderr) when the string is malformed.
parse_size_to_bytes() {
    local size="$1"

    if ! [[ "$size" =~ ^([0-9]+)([KkMmGg]?)([Ii]?[Bb])?$ ]]; then
        echo "Invalid size '${size}' (expected <number>[K|M|G])" >&2
        return 1
    fi

    local num=$((10#${BASH_REMATCH[1]}))

    case "${BASH_REMATCH[2]^^}" in
        K) echo $((num * 1024)) ;;
        M) echo $((num * 1048576)) ;;
        G) echo $((num * 1073741824)) ;;
        *) echo "$num" ;;
    esac
}
# Queue one path for verification.
# Arguments: $1 - file path or glob
#            $2 - max age (defaults to BACKUP_MAX_AGE)
#            $3 - min size (defaults to BACKUP_MIN_SIZE)
# Globals:   appends "path|max_age|min_size" to ENTRIES.
add_entry() {
    local path="$1"
    local max_age="${2:-$BACKUP_MAX_AGE}"
    local min_size="${3:-$BACKUP_MIN_SIZE}"
    ENTRIES+=("${path}|${max_age}|${min_size}")
}

# Trim surrounding whitespace from $1 and store the result in the variable
# named by $2. Returns 1 (leaving the variable untouched) when the line is
# blank or a "#" comment, so callers can write `_entry_line ... || continue`.
_entry_line() {
    local text="$1"
    text="${text#"${text%%[![:space:]]*}"}"
    text="${text%"${text##*[![:space:]]}"}"
    [[ -n "$text" && "$text" != \#* ]] || return 1
    printf -v "$2" '%s' "$text"
}

# Load entries from a config file whose lines are "<path> [max_age]"
# (whitespace separated); blank lines and "#" comments are ignored.
# Arguments: $1 - config file path
# Exits with status 1 when the file does not exist.
load_config_file() {
    local file="$1"
    if [[ ! -f "$file" ]]; then
        err "Config file not found: $file"
        exit 1
    fi

    local raw line path max_age
    # `|| [[ -n "$raw" ]]` keeps a final line that lacks a trailing newline.
    while IFS= read -r raw || [[ -n "$raw" ]]; do
        _entry_line "$raw" line || continue
        # First column is the path, second the optional max age; any
        # further columns are ignored.
        IFS=$' \t' read -r path max_age _ <<< "$line"
        add_entry "$path" "${max_age:-$BACKUP_MAX_AGE}"
    done < "$file"
}

# Load one path per line from a file (the whole trimmed line is the path);
# blank lines and "#" comments are ignored.
# Arguments: $1 - file path
# Exits with status 1 when the file does not exist.
load_paths_from_file() {
    local file="$1"
    if [[ ! -f "$file" ]]; then
        err "File not found: $file"
        exit 1
    fi

    local raw line
    while IFS= read -r raw || [[ -n "$raw" ]]; do
        _entry_line "$raw" line || continue
        add_entry "$line"
    done < "$file"
}

# Load one path per line from standard input, with the same filtering as
# load_paths_from_file.
load_paths_from_stdin() {
    local raw line
    while IFS= read -r raw || [[ -n "$raw" ]]; do
        _entry_line "$raw" line || continue
        add_entry "$line"
    done
}
[[ -t 0 ]]; then + load_paths_from_stdin + fi + + if [[ ${#ENTRIES[@]} -eq 0 ]]; then + err "No backup paths specified" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 + fi + + echo "" + echo -e "${BOLD}Backup Verification — $(hostname -f 2>/dev/null || hostname)${RESET}" + echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}" + echo -e "${DIM}Defaults: max-age=${BACKUP_MAX_AGE}, min-size=${BACKUP_MIN_SIZE}${RESET}" + + section_header "Backup Status" + + printf " ${BOLD}%-50s %10s %10s %s${RESET}\n" "FILE" "SIZE" "AGE" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..85})" + + for entry in "${ENTRIES[@]}"; do + local path max_age min_size + path=$(echo "$entry" | cut -d'|' -f1) + max_age=$(echo "$entry" | cut -d'|' -f2) + min_size=$(echo "$entry" | cut -d'|' -f3) + + # Check if path contains glob characters + if [[ "$path" == *\** || "$path" == *\?* || "$path" == *\[* ]]; then + verify_glob "$path" "$max_age" "$min_size" + else + verify_file "$path" "$max_age" "$min_size" + fi + done + + section_header "Summary" + field "Total checked:" "$COUNT_TOTAL" + field_color "OK:" "${GREEN}${COUNT_OK}${RESET}" + if [[ "$COUNT_WARNING" -gt 0 ]]; then + field_color "Warnings:" "${YELLOW}${COUNT_WARNING}${RESET}" + else + field "Warnings:" "$COUNT_WARNING" + fi + if [[ "$COUNT_CRITICAL" -gt 0 ]]; then + field_color "Critical:" "${RED}${COUNT_CRITICAL}${RESET}" + else + field "Critical:" "$COUNT_CRITICAL" + fi + + echo "" + + # Exit with error code if any critical issues + if [[ "$COUNT_CRITICAL" -gt 0 ]]; then + return 2 + elif [[ "$COUNT_WARNING" -gt 0 ]]; then + return 1 + fi +} + +main "$@" diff --git a/bastion-hardener.sh b/bastion-hardener.sh new file mode 100755 index 0000000..d7909e3 --- /dev/null +++ b/bastion-hardener.sh @@ -0,0 +1,614 @@ +#!/usr/bin/env bash + +######################################################################################### +#### bastion-hardener.sh — Harden SSH bastion/jump hosts with audit and rollback #### +#### Disables 
password auth, restricts ciphers, sets idle timeout, fail2ban config #### +#### Requires: bash 4+, root privileges #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### sudo ./bastion-hardener.sh --audit #### +#### #### +#### See --help for all options. #### +######################################################################################### +# v1.01 changes: +# - Fixed: ((0++)) returns 1 under set -e; added || true guards +# - Fixed: grep in pipeline crashes under set -euo pipefail when no matches found. Added || true guard +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +SSHD_CONFIG="${SSHD_CONFIG:-/etc/ssh/sshd_config}" +BACKUP_ROOT="${BACKUP_ROOT:-/etc/ssh}" +ALLOW_USERS="${ALLOW_USERS:-}" +ALLOW_GROUPS="${ALLOW_GROUPS:-}" +IDLE_TIMEOUT="${IDLE_TIMEOUT:-300}" +MAX_AUTH_TRIES="${MAX_AUTH_TRIES:-3}" +MAX_SESSIONS="${MAX_SESSIONS:-2}" +SESSION_LOG_DIR="${SESSION_LOG_DIR:-/var/log/bastion-sessions}" +FAIL2BAN_BANTIME="${FAIL2BAN_BANTIME:-3600}" +FAIL2BAN_MAXRETRY="${FAIL2BAN_MAXRETRY:-3}" +DRY_RUN="${DRY_RUN:-false}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +ENABLE_SESSION_LOGGING="false" +CONFIGURE_FAIL2BAN="false" +ROLLBACK_DIR="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +PASS_COUNT=0 +FAIL_COUNT=0 +WARN_COUNT=0 +CHANGES=0 + +# ── Hardening settings ─────────────────────────────────────────────── +readonly RECOMMENDED_CIPHERS="chacha20-poly1305@openssh.com,aes256-gcm@openssh.com,aes128-gcm@openssh.com" +readonly RECOMMENDED_MACS="hmac-sha2-512-etm@openssh.com,hmac-sha2-256-etm@openssh.com" +readonly RECOMMENDED_KEX="curve25519-sha256,curve25519-sha256@libssh.org" + +# ── Colors 
# Initialise the global colour variables used by the logging helpers.
# COLOR=never disables colour, COLOR=always forces it, and any other value
# enables it only when stdout is a terminal. When colour is disabled every
# variable is set to the empty string.
setup_colors() {
    local enabled="no"
    case "$COLOR" in
        never)  enabled="no" ;;
        always) enabled="yes" ;;
        *)      if [[ -t 1 ]]; then enabled="yes"; fi ;;
    esac

    if [[ "$enabled" == "no" ]]; then
        # shellcheck disable=SC2034
        RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""
        return
    fi

    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    # shellcheck disable=SC2034
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
}
# Print the effective global value of an sshd_config keyword, or "(not set)".
# Arguments: $1 - keyword (matched case-insensitively)
# Globals:   SSHD_CONFIG (read)
# sshd uses the FIRST value it obtains for a keyword, so the first matching
# line is reported. Scanning stops at the first "Match" line because
# everything after it is conditional, not global.
# Only the first argument of the directive is printed.
get_sshd_setting() {
    local key="$1"
    local val
    val=$(awk -v want="${key,,}" '
        tolower($1) == "match"          { exit }
        tolower($1) == want && NF >= 2  { print $2; exit }
    ' "$SSHD_CONFIG" 2>/dev/null) || val=""

    if [[ -z "$val" ]]; then
        echo "(not set)"
    else
        echo "$val"
    fi
}

# Set "<key> <value>" in an sshd_config-style file.
# Arguments: $1 - keyword, $2 - value, $3 - file to edit in place
# Strategy, in order:
#   1. an active line for the keyword exists  -> rewrite it;
#   2. only a commented-out line exists       -> replace it with the setting;
#   3. otherwise add a new line, placed before the first "Match" block if
#      there is one (a line appended after "Match" would become part of that
#      conditional block), else at the end of the file.
# NOTE(review): step 1 also rewrites occurrences inside Match blocks, as the
# original did — confirm that overriding per-Match values is intended.
# Requires GNU sed (-i and the case-insensitive "i" flag).
set_sshd_config() {
    local key="$1"
    local value="$2"
    local file="$3"

    # Escape characters that are special in a sed replacement ("\", "&") and
    # the "|" used as the s-command delimiter.
    local repl
    repl=$(printf '%s' "${key} ${value}" | sed 's/[\\&|]/\\&/g')

    if grep -qi "^[[:space:]]*${key}[[:space:]]" "$file" 2>/dev/null; then
        sed -i "s|^[[:space:]]*${key}[[:space:]].*|${repl}|i" "$file"
    elif grep -qi "^[[:space:]]*#[[:space:]]*${key}[[:space:]]" "$file" 2>/dev/null; then
        sed -i "s|^[[:space:]]*#[[:space:]]*${key}[[:space:]].*|${repl}|i" "$file"
    else
        local match_line
        match_line=$({ grep -n -i -m1 '^[[:space:]]*Match[[:space:]]' "$file" 2>/dev/null || true; } | cut -d: -f1)

        if [[ -n "$match_line" ]]; then
            # Rebuild the file with the new line just above the Match block,
            # then write it back through the existing inode so ownership and
            # mode are preserved.
            local staged
            staged=$(mktemp)
            if {
                head -n "$((match_line - 1))" "$file"
                printf '%s %s\n' "$key" "$value"
                tail -n "+${match_line}" "$file"
            } > "$staged"; then
                cat "$staged" > "$file"
                rm -f "$staged"
            else
                rm -f "$staged"
                return 1
            fi
        else
            # Terminate an unterminated last line first, otherwise the new
            # directive would be glued onto it.
            if [[ -s "$file" && -n "$(tail -c1 "$file")" ]]; then
                echo >> "$file"
            fi
            printf '%s %s\n' "$key" "$value" >> "$file"
        fi
    fi
    verbose "Set ${key} = ${value}"
}
+ echo "" + echo -e " ${BOLD}SSH Configuration Audit${RESET}" + printf " ${BOLD}%-34s %-16s %-16s %s${RESET}\n" "SETTING" "CURRENT" "RECOMMENDED" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + # Core auth settings + check_setting "PermitRootLogin" "$(get_sshd_setting PermitRootLogin)" "no" + check_setting "PasswordAuthentication" "$(get_sshd_setting PasswordAuthentication)" "no" + check_setting "ChallengeResponseAuthentication" "$(get_sshd_setting ChallengeResponseAuthentication)" "no" + check_setting "PubkeyAuthentication" "$(get_sshd_setting PubkeyAuthentication)" "yes" + + # Limits + check_setting "MaxAuthTries" "$(get_sshd_setting MaxAuthTries)" "$MAX_AUTH_TRIES" + check_setting "MaxSessions" "$(get_sshd_setting MaxSessions)" "$MAX_SESSIONS" + + # Timeouts + check_setting "ClientAliveInterval" "$(get_sshd_setting ClientAliveInterval)" "$IDLE_TIMEOUT" + + local cac_current + cac_current=$(get_sshd_setting ClientAliveCountMax) + if [[ "$cac_current" == "3" ]]; then + check_setting "ClientAliveCountMax" "$cac_current" "2" "true" + else + check_setting "ClientAliveCountMax" "$cac_current" "2" + fi + + # Forwarding + check_setting "X11Forwarding" "$(get_sshd_setting X11Forwarding)" "no" + check_setting "AllowTcpForwarding" "$(get_sshd_setting AllowTcpForwarding)" "no" + check_setting "AllowAgentForwarding" "$(get_sshd_setting AllowAgentForwarding)" "no" + check_setting "PermitTunnel" "$(get_sshd_setting PermitTunnel)" "no" + + # Crypto + local ciphers_current + ciphers_current=$(get_sshd_setting Ciphers) + if [[ "$ciphers_current" == "(not set)" ]]; then + check_setting "Ciphers" "(default)" "(restricted)" + elif [[ "$ciphers_current" == "$RECOMMENDED_CIPHERS" ]]; then + check_setting "Ciphers" "(restricted)" "(restricted)" + else + check_setting "Ciphers" "(custom)" "(restricted)" + fi + + local macs_current + macs_current=$(get_sshd_setting MACs) + if [[ "$macs_current" == "(not set)" ]]; then + check_setting "MACs" "(default)" "(restricted)" + elif [[ 
"$macs_current" == "$RECOMMENDED_MACS" ]]; then + check_setting "MACs" "(restricted)" "(restricted)" + else + check_setting "MACs" "(custom)" "(restricted)" + fi + + local kex_current + kex_current=$(get_sshd_setting KexAlgorithms) + if [[ "$kex_current" == "(not set)" ]]; then + check_setting "KexAlgorithms" "(default)" "(restricted)" + elif [[ "$kex_current" == "$RECOMMENDED_KEX" ]]; then + check_setting "KexAlgorithms" "(restricted)" "(restricted)" + else + check_setting "KexAlgorithms" "(custom)" "(restricted)" + fi + + # Logging and misc + check_setting "LogLevel" "$(get_sshd_setting LogLevel)" "VERBOSE" + check_setting "LoginGraceTime" "$(get_sshd_setting LoginGraceTime)" "30" + + # AllowUsers / AllowGroups (warn if not set) + local au_current ag_current + au_current=$(get_sshd_setting AllowUsers) + ag_current=$(get_sshd_setting AllowGroups) + if [[ "$au_current" == "(not set)" ]]; then + check_setting "AllowUsers" "(not set)" "(recommended)" "true" + else + check_setting "AllowUsers" "(configured)" "(recommended)" + fi + if [[ "$ag_current" == "(not set)" ]]; then + check_setting "AllowGroups" "(not set)" "(recommended)" "true" + else + check_setting "AllowGroups" "(configured)" "(recommended)" + fi + + # Summary + local total_checks=$((PASS_COUNT + FAIL_COUNT + WARN_COUNT)) + local score=0 + if [[ "$total_checks" -gt 0 ]]; then + score=$(( PASS_COUNT * 100 / total_checks )) + fi + + echo "" + echo -e " ${BOLD}Summary${RESET}" + echo " Total checks: ${total_checks}" + echo -e " Passed: ${GREEN}${PASS_COUNT}${RESET}" + echo -e " Failed: ${RED}${FAIL_COUNT}${RESET}" + echo -e " Warnings: ${YELLOW}${WARN_COUNT}${RESET}" + echo " Score: ${score} / 100" + + # Extra warnings + echo "" + if ! 
command -v fail2ban-client &>/dev/null; then + warn "Fail2ban not installed — brute-force protection unavailable" + fi + if [[ "$au_current" == "(not set)" && "$ag_current" == "(not set)" ]]; then + warn "No AllowUsers/AllowGroups configured — all users can SSH in" + fi + + log "Run with --apply to harden this host" + log "Completed in $(elapsed)" +} + +# ══════════════════════════════════════════════════════════════════════ +# APPLY MODE +# ══════════════════════════════════════════════════════════════════════ + +do_apply() { + require_root + + if [[ ! -f "$SSHD_CONFIG" ]]; then + die "sshd_config not found at ${SSHD_CONFIG}" + fi + + # Create backup + local backup_dir + backup_dir="${BACKUP_ROOT}/bastion-hardener-backup-$(date +%Y%m%d-%H%M%S)" + log "Backing up ${SSHD_CONFIG} → ${backup_dir}/sshd_config" + mkdir -p "$backup_dir" + cp -p "$SSHD_CONFIG" "${backup_dir}/sshd_config" + [[ -f /etc/ssh/banner.txt ]] && cp -p /etc/ssh/banner.txt "${backup_dir}/banner.txt" + + if [[ "$DRY_RUN" == "true" ]]; then + log "${YELLOW}DRY RUN${RESET} — previewing changes (no files will be modified)" + local tmp_config + tmp_config=$(mktemp) + cp "$SSHD_CONFIG" "$tmp_config" + apply_settings "$tmp_config" + echo "" + log "Diff preview:" + diff "$SSHD_CONFIG" "$tmp_config" || true + rm -f "$tmp_config" + log "Run without --dry-run to apply changes" + return + fi + + log "Applying SSH hardening..." + apply_settings "$SSHD_CONFIG" + + # Create banner + if [[ ! -f /etc/ssh/banner.txt ]]; then + cat > /etc/ssh/banner.txt <<'BANNER' +*************************************************************************** +* AUTHORIZED ACCESS ONLY * +* * +* This system is restricted to authorized users. All activities are * +* monitored and logged. Unauthorized access is prohibited and subject * +* to prosecution under applicable law. * +* * +* By proceeding, you acknowledge that you have read and agree to the * +* organization's acceptable use policies. 
* +*************************************************************************** +BANNER + log "Created warning banner at /etc/ssh/banner.txt" + fi + + # Session logging directory + if [[ "$ENABLE_SESSION_LOGGING" == "true" ]]; then + mkdir -p "$SESSION_LOG_DIR" + chmod 700 "$SESSION_LOG_DIR" + log "Session log directory: ${SESSION_LOG_DIR}" + fi + + # Fail2ban configuration + if [[ "$CONFIGURE_FAIL2BAN" == "true" ]]; then + configure_fail2ban + fi + + # Validate config + log "Validating sshd configuration..." + if sshd -t -f "$SSHD_CONFIG" 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} sshd -t passed" + else + err "sshd -t validation failed — restoring backup" + cp -p "${backup_dir}/sshd_config" "$SSHD_CONFIG" + die "Config validation failed. Original config restored." + fi + + # Restart sshd + log "Restarting sshd..." + if systemctl restart sshd 2>/dev/null || systemctl restart ssh 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} sshd restarted successfully" + else + warn "Could not restart sshd — restart manually" + fi + + # Write audit report + local report_file + report_file="/var/log/bastion-hardener-$(date +%Y%m%d-%H%M%S).log" + { + echo "Bastion Hardener — Apply Report" + echo "Time: $(date -u '+%Y-%m-%dT%H:%M:%SZ')" + echo "Host: $(hostname -f 2>/dev/null || hostname)" + echo "Changes: ${CHANGES}" + echo "Backup: ${backup_dir}" + } > "$report_file" 2>/dev/null || true + log "Writing audit report → ${report_file}" + + log "Changes applied: ${CHANGES}, skipped: 0" + log "Backup directory: ${backup_dir}" + log "To rollback: ./${SCRIPT_NAME} --rollback" + log "Completed in $(elapsed)" +} + +apply_settings() { + local config_file="$1" + + local settings=( + "PermitRootLogin no" + "PasswordAuthentication no" + "ChallengeResponseAuthentication no" + "KbdInteractiveAuthentication no" + "PubkeyAuthentication yes" + "MaxAuthTries ${MAX_AUTH_TRIES}" + "MaxSessions ${MAX_SESSIONS}" + "ClientAliveInterval ${IDLE_TIMEOUT}" + "ClientAliveCountMax 2" + "X11Forwarding no" + 
"AllowTcpForwarding no" + "AllowAgentForwarding no" + "PermitTunnel no" + "Ciphers ${RECOMMENDED_CIPHERS}" + "MACs ${RECOMMENDED_MACS}" + "KexAlgorithms ${RECOMMENDED_KEX}" + "LoginGraceTime 30" + "LogLevel VERBOSE" + "Banner /etc/ssh/banner.txt" + ) + + for setting in "${settings[@]}"; do + local key value + key="${setting%% *}" + value="${setting#* }" + set_sshd_config "$key" "$value" "$config_file" + echo -e " ${GREEN}✓${RESET} ${key} → ${value}" + ((CHANGES++)) || true + done + + # AllowUsers + if [[ -n "$ALLOW_USERS" ]]; then + local users_val="${ALLOW_USERS//,/ }" + set_sshd_config "AllowUsers" "$users_val" "$config_file" + echo -e " ${GREEN}✓${RESET} AllowUsers → ${users_val}" + ((CHANGES++)) || true + fi + + # AllowGroups + if [[ -n "$ALLOW_GROUPS" ]]; then + local groups_val="${ALLOW_GROUPS//,/ }" + set_sshd_config "AllowGroups" "$groups_val" "$config_file" + echo -e " ${GREEN}✓${RESET} AllowGroups → ${groups_val}" + ((CHANGES++)) || true + fi +} + +configure_fail2ban() { + if ! command -v fail2ban-client &>/dev/null; then + warn "fail2ban not installed — skipping jail configuration" + return + fi + + local jail_file="/etc/fail2ban/jail.d/bastion-ssh.conf" + log "Configuring fail2ban SSH jail → ${jail_file}" + + cat > "$jail_file" </dev/null; then + echo -e " ${GREEN}✓${RESET} fail2ban SSH jail configured and restarted" + else + warn "Could not restart fail2ban" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# ROLLBACK MODE +# ══════════════════════════════════════════════════════════════════════ + +do_rollback() { + require_root + + local target_dir="$ROLLBACK_DIR" + + if [[ -z "$target_dir" ]]; then + # Find most recent backup + target_dir=$(find "$BACKUP_ROOT" -maxdepth 1 -type d -name "bastion-hardener-backup-*" 2>/dev/null | sort -r | head -1) + if [[ -z "$target_dir" ]]; then + die "No backup directories found in ${BACKUP_ROOT}" + fi + fi + + if [[ ! 
-d "$target_dir" ]]; then + die "Backup directory not found: ${target_dir}" + fi + + log "Restoring from ${target_dir}..." + + if [[ -f "${target_dir}/sshd_config" ]]; then + cp -p "${target_dir}/sshd_config" "$SSHD_CONFIG" + echo -e " ${GREEN}✓${RESET} Restored sshd_config" + else + die "No sshd_config found in backup directory" + fi + + if [[ -f "${target_dir}/banner.txt" ]]; then + cp -p "${target_dir}/banner.txt" /etc/ssh/banner.txt + echo -e " ${GREEN}✓${RESET} Restored banner.txt" + fi + + # Validate + log "Validating restored configuration..." + if sshd -t -f "$SSHD_CONFIG" 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} sshd -t passed" + else + die "Restored config failed validation" + fi + + # Restart + log "Restarting sshd..." + if systemctl restart sshd 2>/dev/null || systemctl restart ssh 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} sshd restarted successfully" + else + warn "Could not restart sshd — restart manually" + fi + + log "Rollback complete from ${target_dir}" + log "Completed in $(elapsed)" +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ + +show_help() { + cat </dev/null || hostname)" + echo "Mode: ${RUN_MODE}" + echo "Time: $(date -u '+%Y-%m-%dT%H:%M:%SZ')" + echo "" + + case "$RUN_MODE" in + audit) do_audit ;; + apply) do_apply ;; + rollback) do_rollback ;; + esac +} + +main "$@" diff --git a/borg-backup-exporter.sh b/borg-backup-exporter.sh new file mode 100755 index 0000000..70c1720 --- /dev/null +++ b/borg-backup-exporter.sh @@ -0,0 +1,273 @@ +#!/bin/bash +################################################################################ +# Script Name: borg-backup-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Borg backups — last archive time, +# backup age, repo size, archive counts, and deduplication metrics +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# 
# Verify that the external tools the exporter depends on are available.
# borg produces the data and jq parses every JSON response, so a missing jq
# must be reported as "down" rather than producing silently empty metrics.
# Outputs: one "# ERROR: <tool> not found" line on stderr per missing tool
# Returns: 0 when both tools are found, 1 otherwise
check_borg() {
    local tool missing=0
    for tool in borg jq; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            echo "# ERROR: $tool not found" >&2
            missing=1
        fi
    done
    return "$missing"
}
info_json=$(borg info --json "$repo" 2>/dev/null) || [ -z "$info_json" ]; then + echo "# WARNING: could not read repo $repo" >&2 + echo "borg_backup_up 0" + return + fi + + local location + location=$(echo "$info_json" | jq -r '.repository.location // empty') + echo "borg_backup_repo_info{repo=\"$repo_label\",location=\"$location\"} 1" + + # Repo size metrics from cache/stats + local total_size total_csize unique_csize + total_size=$(echo "$info_json" | jq '.cache.stats.total_size // 0') + total_csize=$(echo "$info_json" | jq '.cache.stats.total_csize // 0') + unique_csize=$(echo "$info_json" | jq '.cache.stats.unique_csize // 0') + echo "borg_backup_repo_total_size_bytes{repo=\"$repo_label\"} ${total_size:-0}" + echo "borg_backup_repo_total_csize_bytes{repo=\"$repo_label\"} ${total_csize:-0}" + echo "borg_backup_repo_unique_csize_bytes{repo=\"$repo_label\"} ${unique_csize:-0}" + + # List archives + local list_json + list_json=$(borg list --json "$repo" 2>/dev/null) + local archive_count + archive_count=$(echo "$list_json" | jq '.archives | length') + echo "borg_backup_archive_count{repo=\"$repo_label\"} ${archive_count:-0}" + + # Last archive metrics + local last_archive + last_archive=$(echo "$list_json" | jq -r '.archives | sort_by(.start) | last | .start // empty' 2>/dev/null) + if [ -n "$last_archive" ]; then + local last_unix + last_unix=$(date -d "$last_archive" +%s 2>/dev/null || echo 0) + local now + now=$(date +%s) + local age=$((now - last_unix)) + echo "borg_backup_last_archive_timestamp{repo=\"$repo_label\"} $last_unix" + echo "borg_backup_last_archive_age_seconds{repo=\"$repo_label\"} $age" + else + echo "borg_backup_last_archive_timestamp{repo=\"$repo_label\"} 0" + echo "borg_backup_last_archive_age_seconds{repo=\"$repo_label\"} 0" + fi + + # Last archive detailed info + local last_archive_name + last_archive_name=$(echo "$list_json" | jq -r '.archives | sort_by(.start) | last | .archive // empty' 2>/dev/null) + if [ -n "$last_archive_name" ]; then + 
local archive_info_json + archive_info_json=$(borg info --json "$repo::$last_archive_name" 2>/dev/null) + if [ -n "$archive_info_json" ]; then + local original_size dedup_size duration + original_size=$(echo "$archive_info_json" | jq '.archives[0].stats.original_size // 0') + dedup_size=$(echo "$archive_info_json" | jq '.archives[0].stats.deduplicated_size // 0') + duration=$(echo "$archive_info_json" | jq '.archives[0].duration // empty') + echo "borg_backup_last_archive_original_size_bytes{repo=\"$repo_label\"} ${original_size:-0}" + echo "borg_backup_last_archive_deduplicated_size_bytes{repo=\"$repo_label\"} ${dedup_size:-0}" + if [ -n "$duration" ] && [ "$duration" != "null" ]; then + echo "borg_backup_last_archive_duration_seconds{repo=\"$repo_label\"} $duration" + fi + fi + fi + + # Borg check age (from log file if available) + local check_log="/var/log/borg-check.log" + if [ -f "$check_log" ]; then + local check_mtime + check_mtime=$(stat -c %Y "$check_log" 2>/dev/null) + if [ -n "$check_mtime" ]; then + local now + now=$(date +%s) + local check_age=$((now - check_mtime)) + echo "borg_backup_repo_check_age_seconds{repo=\"$repo_label\"} $check_age" + fi + fi +} + +generate_metrics() { + local script_start + script_start=$(date +%s) + + if ! 
check_borg; then + echo "# HELP borg_backup_up Exporter status (1=up, 0=down)" + echo "# TYPE borg_backup_up gauge" + echo "borg_backup_up 0" + return + fi + + echo "# HELP borg_backup_up Exporter status (1=up, 0=down)" + echo "# TYPE borg_backup_up gauge" + echo "borg_backup_up 1" + echo "# HELP borg_backup_exporter_info Exporter version info" + echo "# TYPE borg_backup_exporter_info gauge" + echo "borg_backup_exporter_info{version=\"$EXPORTER_VERSION\"} 1" + + # Collect all per-repo metric lines, then output grouped by metric name + local all_output + all_output="" + for repo in "${REPOS[@]}"; do + all_output+="$(generate_repo_metrics "$repo")"$'\n' + done + + # Output each metric type with HELP/TYPE immediately before its values + local -a metric_names=( + "borg_backup_repo_info|Repository info" + "borg_backup_repo_total_size_bytes|Total deduplicated size in bytes" + "borg_backup_repo_total_csize_bytes|Total compressed size in bytes" + "borg_backup_repo_unique_csize_bytes|Unique compressed size (actual disk usage) in bytes" + "borg_backup_archive_count|Total number of archives" + "borg_backup_last_archive_timestamp|Unix timestamp of most recent archive" + "borg_backup_last_archive_age_seconds|Seconds since last archive" + "borg_backup_last_archive_original_size_bytes|Original size of last archive" + "borg_backup_last_archive_deduplicated_size_bytes|Deduplicated size of last archive" + "borg_backup_last_archive_duration_seconds|Duration of last archive" + "borg_backup_repo_check_age_seconds|Seconds since last borg check" + ) + + for entry in "${metric_names[@]}"; do + local mname="${entry%%|*}" + local mdesc="${entry#*|}" + local lines + lines=$(echo "$all_output" | grep "^${mname}[{[:space:]]" || true) + if [ -n "$lines" ]; then + echo "# HELP ${mname} ${mdesc}" + echo "# TYPE ${mname} gauge" + echo "$lines" + fi + done + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + echo "# HELP 
borg_backup_exporter_duration_seconds Script execution time" + echo "# TYPE borg_backup_exporter_duration_seconds gauge" + echo "borg_backup_exporter_duration_seconds $script_duration" + echo "# HELP borg_backup_exporter_last_run_timestamp Last successful run" + echo "# TYPE borg_backup_exporter_last_run_timestamp gauge" + echo "borg_backup_exporter_last_run_timestamp $script_end" +} + +run_http_server() { + echo "# Starting borg backup exporter on port $HTTP_PORT..." >&2 + if ! command -v nc >/dev/null 2>&1; then + echo "# ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + echo "Borg Backup Exporter

Borg Backup Prometheus Exporter

Metrics

" + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +main() { + parse_args "$@" + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + local temp_file + temp_file=$(mktemp "${output_dir}/.borg_metrics.XXXXXX") + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "# ERROR: Failed to generate metrics" >&2 + exit 1 + fi + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "# Metrics written to $OUTPUT_FILE" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/caprover-backup.sh b/caprover-backup.sh new file mode 100644 index 0000000..0c9ee6e --- /dev/null +++ b/caprover-backup.sh @@ -0,0 +1,398 @@ +#!/usr/bin/env bash +# caprover-backup.sh — Comprehensive CapRover backup script +# Author: Phil Connor +# License: MIT +# Version: 1.11 +# +# Backs up /captain config, Docker volumes (captain-- prefixed), +# and app definitions via CapRover API. +# Supports local, NFS, and S3 (via aws cli or rclone) destinations. +# +# Migration mode (--migrate) stops all CapRover app containers before +# backing up volumes, ensuring database-consistent snapshots. Produces +# a single migration tarball for transfer to a new server. 
# Print a timestamped message on stdout and append it to LOG_FILE.
# Arguments: $1 - message text
# Globals:   LOG_FILE (read)
# The file append is best-effort. The script runs under `set -euo pipefail`
# and logs before the log directory is created, so a failed write must not
# abort the backup.
log() {
    local msg
    msg="[$(date '+%Y-%m-%d %H:%M:%S')] $1"
    printf '%s\n' "$msg"
    # Deliberately ignore write failures (missing directory, permissions).
    { printf '%s\n' "$msg" >> "$LOG_FILE"; } 2>/dev/null || true
}

# Log a message prefixed with "ERROR: " through log().
# Arguments: $1 - message text
log_error() {
    log "ERROR: $1"
}
--------------------------------------------------------------------------- +preflight() { + if [ "$(id -u)" -ne 0 ]; then + log_error "Run as root." + exit 1 + fi + + if ! command -v docker &>/dev/null; then + log_error "Docker not found." + exit 1 + fi + + if [ "$BACKUP_DEST" = "s3" ] && ! command -v aws &>/dev/null; then + log_error "aws CLI not found. Install it or use BACKUP_DEST=rclone." + exit 1 + fi + + if [ "$BACKUP_DEST" = "rclone" ] && ! command -v rclone &>/dev/null; then + log_error "rclone not found." + exit 1 + fi + + mkdir -p "$BACKUP_DIR" + mkdir -p "$(dirname "$LOG_FILE")" +} + +# --------------------------------------------------------------------------- +# Backup /captain directory +# --------------------------------------------------------------------------- +backup_captain_config() { + log "Backing up /captain directory..." + local dest="${BACKUP_DIR}/captain-config-${DATE}.tar.gz" + + if [ ! -d /captain ]; then + log_error "/captain directory not found. Is CapRover installed?" + return 1 + fi + + tar czf "$dest" -C / captain + log "Captain config saved: $dest ($(du -sh "$dest" | cut -f1))" +} + +# --------------------------------------------------------------------------- +# Backup Docker volumes (captain-- prefixed) +# --------------------------------------------------------------------------- +backup_volumes() { + log "Backing up Docker volumes..." + local volumes + volumes=$(docker volume ls -q | grep "^captain--" || true) + + if [ -z "$volumes" ]; then + log "No captain-- volumes found. Skipping." + return 0 + fi + + for vol in $volumes; do + local app_name="${vol#captain--}" + local dest="${BACKUP_DIR}/vol-${app_name}-${DATE}.tar.gz" + + log " Backing up volume: $vol" + docker run --rm \ + -v "${vol}:/source:ro" \ + -v "${BACKUP_DIR}:/backup" \ + alpine tar czf "/backup/vol-${app_name}-${DATE}.tar.gz" -C /source . 
+ + log " Volume $vol saved: $dest ($(du -sh "$dest" | cut -f1))" + done +} + +# --------------------------------------------------------------------------- +# Export app definitions via CapRover API +# --------------------------------------------------------------------------- +export_app_definitions() { + if [ -z "$CAPROVER_PASSWORD" ]; then + log "CAPROVER_PASSWORD not set. Skipping API export." + return 0 + fi + + log "Exporting app definitions via CapRover API..." + + # Get auth token + local token + token=$(curl -s -X POST "${CAPROVER_URL}/api/v2/login" \ + -H "Content-Type: application/json" \ + -H "x-namespace: captain" \ + -d "{\"password\":\"${CAPROVER_PASSWORD}\"}" \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['data']['token'])" 2>/dev/null) || true + + if [ -z "$token" ]; then + log_error "Failed to authenticate with CapRover API." + return 1 + fi + + # Export app definitions + local dest="${BACKUP_DIR}/app-definitions-${DATE}.json" + local http_code + http_code=$(curl -s -o "$dest" -w "%{http_code}" \ + "${CAPROVER_URL}/api/v2/user/apps/appDefinitions" \ + -H "Content-Type: application/json" \ + -H "x-namespace: captain" \ + -H "x-captain-auth: ${token}") + + if [ "$http_code" = "200" ]; then + log "App definitions saved: $dest" + else + log_error "API returned HTTP $http_code. App definitions export failed." + rm -f "$dest" + return 1 + fi +} + +# --------------------------------------------------------------------------- +# Upload to remote destination +# --------------------------------------------------------------------------- +upload_remote() { + case "$BACKUP_DEST" in + local) + log "Backup stored locally at $BACKUP_DIR" + ;; + nfs) + log "Copying backups to NFS mount: $NFS_MOUNT" + if ! mountpoint -q "$NFS_MOUNT" 2>/dev/null; then + log_error "$NFS_MOUNT is not mounted." + return 1 + fi + mkdir -p "${NFS_MOUNT}/caprover" + cp "${BACKUP_DIR}"/*-"${DATE}"* "${NFS_MOUNT}/caprover/" + log "Copied to NFS." 
+ ;; + s3) + log "Uploading backups to S3: s3://${S3_BUCKET}/${S3_PREFIX}/" + for f in "${BACKUP_DIR}"/*-"${DATE}"*; do + aws s3 cp "$f" "s3://${S3_BUCKET}/${S3_PREFIX}/$(basename "$f")" --quiet + done + log "S3 upload complete." + ;; + rclone) + log "Uploading backups via rclone to: $RCLONE_REMOTE" + for f in "${BACKUP_DIR}"/*-"${DATE}"*; do + rclone copy "$f" "$RCLONE_REMOTE" --quiet + done + log "rclone upload complete." + ;; + *) + log_error "Unknown BACKUP_DEST: $BACKUP_DEST" + return 1 + ;; + esac +} + +# --------------------------------------------------------------------------- +# Retention — delete local backups older than RETENTION_DAYS +# --------------------------------------------------------------------------- +apply_retention() { + log "Applying retention policy: deleting backups older than ${RETENTION_DAYS} days..." + local count + count=$(find "$BACKUP_DIR" -name "*.tar.gz" -o -name "*.json" | \ + xargs -I{} find {} -mtime +"$RETENTION_DAYS" 2>/dev/null | wc -l) + + find "$BACKUP_DIR" \( -name "*.tar.gz" -o -name "*.json" \) \ + -mtime +"$RETENTION_DAYS" -delete + + log "Removed $count old backup file(s)." +} + +# --------------------------------------------------------------------------- +# Migration — stop all CapRover app containers +# --------------------------------------------------------------------------- +stop_captain_containers() { + log "Stopping all CapRover app containers..." 
+ local containers + containers=$(docker ps -q --filter "label=com.docker.swarm.service.name" \ + --filter "name=srv-captain--" 2>/dev/null || true) + + if [ -z "$containers" ]; then + # fallback: stop services via Docker Swarm + local services + services=$(docker service ls -q --filter "name=srv-captain--" 2>/dev/null || true) + if [ -n "$services" ]; then + local count=0 + for svc in $services; do + local svc_name + svc_name=$(docker service inspect --format '{{.Spec.Name}}' "$svc") + log " Scaling down: $svc_name" + docker service scale "$svc_name=0" --detach 2>/dev/null + ((count++)) || true + done + log "Scaled down $count service(s). Waiting 10s for graceful shutdown..." + sleep 10 + else + log "No CapRover app services found." + fi + else + local count + count=$(echo "$containers" | wc -w) + docker stop $containers + log "Stopped $count container(s). Waiting 5s..." + sleep 5 + fi +} + +# --------------------------------------------------------------------------- +# Migration — record service state for restore on new server +# --------------------------------------------------------------------------- +save_service_state() { + log "Saving Docker service state..." + local dest="${BACKUP_DIR}/service-state-${DATE}.json" + docker service ls --format '{{json .}}' > "$dest" + log "Service state saved: $dest" + + # Save docker info for reference (Swarm tokens, node info) + local info_dest="${BACKUP_DIR}/docker-info-${DATE}.txt" + docker info > "$info_dest" 2>&1 + docker node ls >> "$info_dest" 2>/dev/null || true + log "Docker info saved: $info_dest" +} + +# --------------------------------------------------------------------------- +# Migration — package everything into a single tarball +# --------------------------------------------------------------------------- +create_migration_bundle() { + local bundle="${BACKUP_DIR}/caprover-migration-${DATE}.tar.gz" + log "Creating migration bundle..." 
+ + # Collect all files from this run + tar czf "$bundle" -C "$BACKUP_DIR" \ + $(ls -1 "$BACKUP_DIR" | grep "$DATE" | grep -v "caprover-migration") + + local size + size=$(du -sh "$bundle" | cut -f1) + log "Migration bundle ready: $bundle ($size)" + log "" + log "=========================================" + log " MIGRATION INSTRUCTIONS" + log "=========================================" + log "1. Copy bundle to new server:" + log " scp $bundle root@new-server:/backups/" + log "" + log "2. On the new server, install CapRover:" + log " docker run -p 80:80 -p 443:443 -p 3000:3000 \\" + log " -e ACCEPTED_TERMS=true -v /captain:/captain \\" + log " caprover/caprover-edge" + log "" + log "3. Extract the bundle:" + log " mkdir -p /backups/restore && cd /backups/restore" + log " tar xzf caprover-migration-${DATE}.tar.gz" + log "" + log "4. Stop CapRover on new server:" + log " docker service rm captain-captain --force" + log "" + log "5. Restore /captain config:" + log " tar xzf captain-config-${DATE}.tar.gz -C /" + log "" + log "6. Restore Docker volumes:" + log " for f in vol-*-${DATE}.tar.gz; do" + log ' vol="captain--${f#vol-}"' + log ' vol="${vol%-'"${DATE}"'.tar.gz}"' + log " docker volume create \"\$vol\"" + log " docker run --rm -v \"\${vol}:/dest\" -v \"\$(pwd):/backup:ro\" \\" + log " alpine sh -c \"tar xzf /backup/\$f -C /dest\"" + log " done" + log "" + log "7. Start CapRover and re-deploy apps:" + log " docker run -p 80:80 -p 443:443 -p 3000:3000 \\" + log " -e ACCEPTED_TERMS=true -v /captain:/captain \\" + log " caprover/caprover-edge" + log "" + log "8. Update DNS to point to the new server IP." 
+ log "=========================================" +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +main() { + if $MIGRATE; then + log "=========================================" + log "CapRover MIGRATION backup — ${DATE}" + log "Mode: full migration (containers will be stopped)" + log "=========================================" + + preflight + local errors=0 + + export_app_definitions || { ((errors++)) || true; } + save_service_state || { ((errors++)) || true; } + stop_captain_containers + backup_captain_config || { ((errors++)) || true; } + backup_volumes || { ((errors++)) || true; } + create_migration_bundle + + if [ "$errors" -gt 0 ]; then + log "Migration backup completed with $errors error(s)." + exit 1 + else + log "Migration backup completed successfully." + log "Containers remain stopped. This server is ready to decommission." + fi + else + log "=========================================" + log "CapRover backup started — ${DATE}" + log "Destination: ${BACKUP_DEST}" + log "=========================================" + + preflight + local errors=0 + + backup_captain_config || { ((errors++)) || true; } + backup_volumes || { ((errors++)) || true; } + export_app_definitions || { ((errors++)) || true; } + upload_remote || { ((errors++)) || true; } + apply_retention + + if [ "$errors" -gt 0 ]; then + log "Backup completed with $errors error(s)." + exit 1 + else + log "Backup completed successfully." 
+ fi + fi +} + +main "$@" diff --git a/caprover-exporter.sh b/caprover-exporter.sh new file mode 100644 index 0000000..fa19b32 --- /dev/null +++ b/caprover-exporter.sh @@ -0,0 +1,427 @@ +#!/bin/bash +################################################################################ +# Script Name: caprover-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for CapRover PaaS providing operational +# metrics via the CapRover API — app deployment status, container +# health, resource usage, and platform metrics +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - CapRover installed and running +# - CapRover API accessible (default: http://localhost:3000) +# - curl for API calls +# - jq for JSON parsing +# - netcat (nc) for HTTP mode +# +# Usage: +# ./caprover-exporter.sh # Output to stdout +# ./caprover-exporter.sh --http -p 9196 # HTTP server mode +# ./caprover-exporter.sh --textfile # Textfile collector mode +# ./caprover-exporter.sh --password secret # Custom password +# +# Metrics Exported: +# - caprover_up - API reachability (1=up, 0=down) +# - caprover_info{version} - CapRover version info +# - caprover_apps_total - Total app count +# - caprover_apps_running - Running app count +# - caprover_apps_stopped - Stopped app count +# - caprover_app_running{app} - Per-app running status (1/0) +# - caprover_app_instance_count{app} - Per-app replica count +# - caprover_app_has_ssl{app} - Per-app SSL status (1/0) +# - caprover_app_force_ssl{app} - Per-app force SSL status (1/0) +# - caprover_nodes_total - Swarm node count +# - caprover_volumes_total - Docker volume count +# - caprover_disk_used_bytes - Disk usage in bytes +# - caprover_disk_total_bytes - Total disk in bytes +# - caprover_exporter_duration_seconds - Script execution time +# - caprover_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9196 +# Default CapRover URL: 
http://localhost:3000 +# Default password: captain42 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9196 +CAPROVER_URL="http://localhost:3000" +CAPROVER_PASSWORD="captain42" +AUTH_TOKEN="" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +check_prerequisites() { + if ! command -v curl >/dev/null 2>&1; then + echo "ERROR: curl not found" >&2; return 1 + fi + if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq not found (required for JSON parsing)" >&2; return 1 + fi + return 0 +} + +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +authenticate() { + if [ -n "$AUTH_TOKEN" ]; then return 0; fi + + local response + response=$(curl -s -X POST \ + -H "Content-Type: application/json" \ + -H "x-namespace: captain" \ + -d "{\"password\":\"${CAPROVER_PASSWORD}\"}" \ + "${CAPROVER_URL}/api/v2/login" 2>/dev/null) || return 1 + + local status + status=$(echo "$response" | jq -r '.status // 0' 2>/dev/null) + if [ "$status" != "100" ]; then + echo "ERROR: Failed to authenticate with CapRover API" >&2; return 1 + fi + + AUTH_TOKEN=$(echo "$response" | jq -r '.data.token // empty' 2>/dev/null) + if [ -z "$AUTH_TOKEN" ]; then + echo "ERROR: No auth token received from CapRover API" >&2; return 1 + fi + return 0 +} + +api_call() { + local endpoint="$1" + curl -s -X POST \ + -H "Content-Type: application/json" \ + -H "x-namespace: captain" \ + 
-H "x-captain-auth: ${AUTH_TOKEN}" \ + -d "{}" \ + "${CAPROVER_URL}${endpoint}" 2>/dev/null +} + +api_get() { + local endpoint="$1" + curl -s -X GET \ + -H "Content-Type: application/json" \ + -H "x-namespace: captain" \ + -H "x-captain-auth: ${AUTH_TOKEN}" \ + "${CAPROVER_URL}${endpoint}" 2>/dev/null +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + if ! check_prerequisites; then + echo "# HELP caprover_up CapRover API reachability (1=up, 0=down)" + echo "# TYPE caprover_up gauge" + echo "caprover_up 0" + return + fi + + AUTH_TOKEN="" + if ! authenticate; then + echo "# HELP caprover_up CapRover API reachability (1=up, 0=down)" + echo "# TYPE caprover_up gauge" + echo "caprover_up 0" + return + fi + + cat </dev/null) + if [ "$info_status" = "100" ]; then + caprover_version=$(echo "$system_info" | jq -r '.data.caproverVersion // "unknown"' 2>/dev/null) + node_count=$(echo "$system_info" | jq -r '.data.swarmNodesCount // 0' 2>/dev/null) + node_count=${node_count:-0} + disk_used=$(echo "$system_info" | jq -r '.data.diskUsedInMb // 0' 2>/dev/null) + disk_total=$(echo "$system_info" | jq -r '.data.diskTotalInMb // 0' 2>/dev/null) + disk_used=${disk_used:-0}; disk_total=${disk_total:-0} + [ "$disk_used" != "0" ] && disk_used=$((disk_used * 1024 * 1024)) + [ "$disk_total" != "0" ] && disk_total=$((disk_total * 1024 * 1024)) + volume_count=$(echo "$system_info" | jq -r '.data.dockerVolumesCount // 0' 2>/dev/null) + volume_count=${volume_count:-0} + fi + fi + + cat </dev/null) + + if [ "$apps_status" = "100" ]; then + total_apps=$(echo "$apps_response" | jq '.data.appDefinitions | length // 0' 2>/dev/null) + total_apps=${total_apps:-0} + running_apps=$(echo "$apps_response" | jq '[.data.appDefinitions[] | select(.deployedVersion != null and .deployedVersion != 0)] 
| length' 2>/dev/null) + stopped_apps=$(echo "$apps_response" | jq '[.data.appDefinitions[] | select(.deployedVersion == null or .deployedVersion == 0)] | length' 2>/dev/null) + running_apps=${running_apps:-0}; stopped_apps=${stopped_apps:-0} + + cat </dev/null | while read -r name val; do + [ -z "$name" ] && continue + echo "caprover_app_running{app=\"$(prom_escape "$name")\"} $val" + done + echo "" + + # Per-app instance count + echo "# HELP caprover_app_instance_count Number of replicas per app" + echo "# TYPE caprover_app_instance_count gauge" + echo "$apps_response" | jq -r '.data.appDefinitions[] | "\(.appName) \(.instanceCount // 1)"' 2>/dev/null | while read -r name val; do + [ -z "$name" ] && continue + echo "caprover_app_instance_count{app=\"$(prom_escape "$name")\"} $val" + done + echo "" + + # Per-app SSL status + echo "# HELP caprover_app_has_ssl SSL enabled per app (1=yes, 0=no)" + echo "# TYPE caprover_app_has_ssl gauge" + echo "$apps_response" | jq -r '.data.appDefinitions[] | "\(.appName) \(if .hasDefaultSubDomainSsl == true then 1 else 0 end)"' 2>/dev/null | while read -r name val; do + [ -z "$name" ] && continue + echo "caprover_app_has_ssl{app=\"$(prom_escape "$name")\"} $val" + done + echo "" + + # Per-app force SSL status + echo "# HELP caprover_app_force_ssl Force SSL per app (1=yes, 0=no)" + echo "# TYPE caprover_app_force_ssl gauge" + echo "$apps_response" | jq -r '.data.appDefinitions[] | "\(.appName) \(if .forceSsl == true then 1 else 0 end)"' 2>/dev/null | while read -r name val; do + [ -z "$name" ] && continue + echo "caprover_app_force_ssl{app=\"$(prom_escape "$name")\"} $val" + done + else + echo "# HELP caprover_apps_total Total number of deployed apps" + echo "# TYPE caprover_apps_total gauge" + echo "caprover_apps_total 0" + fi + else + echo "# HELP caprover_apps_total Total number of deployed apps" + echo "# TYPE caprover_apps_total gauge" + echo "caprover_apps_total 0" + fi + + echo "" + + # 
======================================================================== + # EXPORTER RUNTIME + # ======================================================================== + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +CapRover Exporter v1.0 + +

CapRover Prometheus Exporter v1.0

+

Metrics

+

Operational metrics from the CapRover API.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.caprover_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/caprover-smoke-tests.sh b/caprover-smoke-tests.sh new file mode 100755 index 0000000..36464c6 --- /dev/null +++ b/caprover-smoke-tests.sh @@ -0,0 +1,518 @@ +#!/bin/bash +################################################################################ +# Script Name: caprover-smoke-tests.sh +# Version: 1.01 +# Description: Smoke test suite for CapRover PaaS — validates API health, +# app deployment lifecycle, SSL certificates, Docker Swarm status, +# and resource usage +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - bash 4+ +# - curl +# - jq +# - openssl (for SSL checks) +# +# Usage: +# export CAPROVER_URL="https://captain.apps.example.com" +# export CAPROVER_PASSWORD="your-password" +# ./caprover-smoke-tests.sh +# ./caprover-smoke-tests.sh --skip-app --skip-ssl +# ./caprover-smoke-tests.sh --format tap 
+# ./caprover-smoke-tests.sh --format junit --junit-file results.xml +# +################################################################################ + +set -euo pipefail + +# --- Defaults --- +CAPROVER_URL="${CAPROVER_URL:-}" +CAPROVER_PASSWORD="${CAPROVER_PASSWORD:-}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +CURL_INSECURE="${CURL_INSECURE:-false}" +SKIP_APP="${SKIP_APP_LIFECYCLE:-false}" +SKIP_SSL="${SKIP_SSL:-false}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}" +VERBOSE=false +USE_COLOR=true +AUTH_TOKEN="" +TEST_APP_NAME="" +PASSED=0 +FAILED=0 +SKIPPED=0 +START_TIME="" +CURL_OPTS=() +JUNIT_RESULTS=() +TAP_RESULTS=() +TEST_NUM=0 + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +usage() { + cat <&2 + exit 1 +fi +if [[ -z "$CAPROVER_PASSWORD" ]]; then + echo "Error: CAPROVER_PASSWORD is required" >&2 + exit 1 +fi + +# Strip trailing slash +CAPROVER_URL="${CAPROVER_URL%/}" + +# --- Helpers --- +debug() { + if [[ "$VERBOSE" == "true" ]]; then + echo -e " ${CYAN}[debug]${NC} $*" >&2 + fi +} + +api_call() { + local method="$1" endpoint="$2" + shift 2 + local url="${CAPROVER_URL}${endpoint}" + debug "curl -s -X $method $url" + curl -s -X "$method" \ + --connect-timeout "$CURL_TIMEOUT" \ + --max-time "$((CURL_TIMEOUT * 3))" \ + -H "Content-Type: application/json" \ + -H "x-captain-auth: ${AUTH_TOKEN}" \ + "${CURL_OPTS[@]}" \ + "$url" "$@" +} + +pass() { + local suite="$1" msg="$2" + ((TEST_NUM++)) || true + ((PASSED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("ok $TEST_NUM - [$suite] $msg") ;; + junit) JUNIT_RESULTS+=("") ;; + *) echo -e " ${GREEN}✓${NC} $msg" ;; + esac +} + +fail() { + local suite="$1" msg="$2" detail="${3:-}" + ((TEST_NUM++)) || true + ((FAILED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("not ok $TEST_NUM - [$suite] $msg") ;; + junit) JUNIT_RESULTS+=("$detail") ;; + *) echo -e " ${RED}✗${NC} 
$msg${detail:+ — $detail}" ;; + esac +} + +skip() { + local suite="$1" msg="$2" + ((TEST_NUM++)) || true + ((SKIPPED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("ok $TEST_NUM - [$suite] $msg # SKIP") ;; + junit) JUNIT_RESULTS+=("") ;; + *) echo -e " ${YELLOW}⊘${NC} $msg — skipped" ;; + esac +} + +suite_header() { + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "\n${BOLD}$1${NC}" + fi +} + +# --- Cleanup --- +cleanup() { + if [[ -n "$TEST_APP_NAME" && -n "$AUTH_TOKEN" ]]; then + debug "Cleaning up test app: $TEST_APP_NAME" + api_call POST "/api/v2/user/apps/appDefinitions/delete" \ + -d "{\"appName\":\"$TEST_APP_NAME\"}" >/dev/null 2>&1 || true + TEST_APP_NAME="" + fi +} +trap cleanup EXIT INT TERM + +# --- Header --- +START_TIME=$(date +%s) +if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}CapRover Smoke Tests${NC}" + echo "Target: $CAPROVER_URL" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" +fi + +# ===================================================== +# Suite 1: Connectivity +# ===================================================== +suite_header "Connectivity" + +http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout "$CURL_TIMEOUT" \ + "${CURL_OPTS[@]}" \ + "$CAPROVER_URL/" 2>/dev/null || echo "000") + +if [[ "$http_code" =~ ^(200|302)$ ]]; then + pass "Connectivity" "Dashboard reachable — HTTP $http_code" +else + fail "Connectivity" "Dashboard unreachable" "HTTP $http_code" +fi + +api_code=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout "$CURL_TIMEOUT" \ + "${CURL_OPTS[@]}" \ + "$CAPROVER_URL/api/v2/user/system/info" 2>/dev/null || echo "000") + +if [[ "$api_code" != "000" ]]; then + pass "Connectivity" "API endpoint responding — HTTP $api_code" +else + fail "Connectivity" "API endpoint not responding" +fi + +# ===================================================== +# Suite 2: API +# ===================================================== +suite_header "API" + +login_response=$(curl -s -X POST \ + 
--connect-timeout "$CURL_TIMEOUT" \ + --max-time "$((CURL_TIMEOUT * 3))" \ + -H "Content-Type: application/json" \ + "${CURL_OPTS[@]}" \ + "$CAPROVER_URL/api/v2/login" \ + -d "{\"password\":\"$CAPROVER_PASSWORD\"}" 2>/dev/null || echo "{}") + +debug "Login response: $login_response" + +AUTH_TOKEN=$(echo "$login_response" | jq -r '.data.token // empty' 2>/dev/null || true) + +if [[ -n "$AUTH_TOKEN" ]]; then + pass "API" "API login — authenticated successfully" +else + fail "API" "API login failed" "Could not obtain auth token" + # Cannot continue without auth + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "\n${RED}Cannot continue without authentication. Aborting.${NC}" + fi + exit 1 +fi + +# App definitions +app_response=$(api_call GET "/api/v2/user/apps/appDefinitions" 2>/dev/null || echo "{}") +app_count=$(echo "$app_response" | jq -r '.data.appDefinitions | length // 0' 2>/dev/null || echo "0") +status_code=$(echo "$app_response" | jq -r '.status // 0' 2>/dev/null || echo "0") + +if [[ "$status_code" == "100" ]]; then + pass "API" "App definitions — $app_count apps found" +else + fail "API" "App definitions" "Unexpected status: $status_code" +fi + +# Version +version_info=$(api_call GET "/api/v2/user/system/versioninfo" 2>/dev/null || echo "{}") +cr_version=$(echo "$version_info" | jq -r '.data.currentVersion // "unknown"' 2>/dev/null || echo "unknown") + +if [[ "$cr_version" != "unknown" ]]; then + pass "API" "CapRover version — $cr_version" +else + fail "API" "CapRover version" "Could not retrieve version" +fi + +# System info +sys_response=$(api_call GET "/api/v2/user/system/info" 2>/dev/null || echo "{}") +sys_status=$(echo "$sys_response" | jq -r '.status // 0' 2>/dev/null || echo "0") + +if [[ "$sys_status" == "100" ]]; then + pass "API" "System info — retrieved successfully" +else + fail "API" "System info" "Unexpected status: $sys_status" +fi + +# ===================================================== +# Suite 3: App Lifecycle +# 
===================================================== +if [[ "$SKIP_APP" == "true" ]]; then + suite_header "App Lifecycle" + skip "App Lifecycle" "Create test app" + skip "App Lifecycle" "Deploy image" + skip "App Lifecycle" "App responding" + skip "App Lifecycle" "Delete test app" +else + suite_header "App Lifecycle" + + TEST_APP_NAME="smoke-test-$(date +%s)" + debug "Test app name: $TEST_APP_NAME" + + # Create app + create_response=$(api_call POST "/api/v2/user/apps/appDefinitions/register" \ + -d "{\"appName\":\"$TEST_APP_NAME\",\"hasPersistentData\":false}" 2>/dev/null || echo "{}") + create_status=$(echo "$create_response" | jq -r '.status // 0' 2>/dev/null || echo "0") + + if [[ "$create_status" == "100" ]]; then + pass "App Lifecycle" "Create test app — $TEST_APP_NAME" + else + fail "App Lifecycle" "Create test app" "$(echo "$create_response" | jq -r '.description // "unknown error"' 2>/dev/null)" + skip "App Lifecycle" "Deploy image" + skip "App Lifecycle" "App responding" + skip "App Lifecycle" "Delete test app" + TEST_APP_NAME="" + SKIP_APP=true + fi + + if [[ "$SKIP_APP" != "true" ]]; then + # Deploy image + deploy_response=$(api_call POST "/api/v2/user/apps/appData/$TEST_APP_NAME" \ + -d "{\"captainDefinitionContent\":\"{\\\"schemaVersion\\\":2,\\\"imageName\\\":\\\"nginxdemos/hello\\\"}\"}" 2>/dev/null || echo "{}") + deploy_status=$(echo "$deploy_response" | jq -r '.status // 0' 2>/dev/null || echo "0") + + if [[ "$deploy_status" == "100" ]]; then + pass "App Lifecycle" "Deploy image — nginxdemos/hello deployed" + else + fail "App Lifecycle" "Deploy image" "$(echo "$deploy_response" | jq -r '.description // "deploy failed"' 2>/dev/null)" + fi + + # Wait for app to be running (up to 60 seconds) + app_ready=false + for i in $(seq 1 12); do + sleep 5 + debug "Waiting for app to start... 
attempt $i/12" + check=$(api_call GET "/api/v2/user/apps/appDefinitions" 2>/dev/null || echo "{}") + is_running=$(echo "$check" | jq -r ".data.appDefinitions[] | select(.appName==\"$TEST_APP_NAME\") | .isAppBuilding" 2>/dev/null || echo "true") + if [[ "$is_running" == "false" ]]; then + app_ready=true + break + fi + done + + # Extract root domain from CapRover URL to build app URL + root_domain=$(echo "$CAPROVER_URL" | sed -E 's|https?://captain\.||') + app_url="http://${TEST_APP_NAME}.${root_domain}" + debug "App URL: $app_url" + + if [[ "$app_ready" == "true" ]]; then + # Give nginx a moment to reconfigure + sleep 3 + app_http=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout "$CURL_TIMEOUT" \ + "${CURL_OPTS[@]}" \ + "$app_url" 2>/dev/null || echo "000") + + if [[ "$app_http" == "200" ]]; then + pass "App Lifecycle" "App responding — HTTP 200 at ${TEST_APP_NAME}.${root_domain}" + else + fail "App Lifecycle" "App responding" "HTTP $app_http at $app_url" + fi + else + fail "App Lifecycle" "App responding" "Timed out waiting for app to start" + fi + + # Delete test app + delete_response=$(api_call POST "/api/v2/user/apps/appDefinitions/delete" \ + -d "{\"appName\":\"$TEST_APP_NAME\"}" 2>/dev/null || echo "{}") + delete_status=$(echo "$delete_response" | jq -r '.status // 0' 2>/dev/null || echo "0") + + if [[ "$delete_status" == "100" ]]; then + pass "App Lifecycle" "Delete test app — cleaned up" + TEST_APP_NAME="" + else + fail "App Lifecycle" "Delete test app" "Manual cleanup may be required" + fi + fi +fi + +# ===================================================== +# Suite 4: SSL +# ===================================================== +if [[ "$SKIP_SSL" == "true" ]]; then + suite_header "SSL" + skip "SSL" "TLS certificate valid" + skip "SSL" "Certificate chain complete" +else + suite_header "SSL" + + # Extract hostname from URL + cr_host=$(echo "$CAPROVER_URL" | sed -E 's|https?://||;s|/.*||;s|:.*||') + cr_port=$(echo "$CAPROVER_URL" | grep -oP 
':\K[0-9]+' || echo "443") + + if [[ "$CAPROVER_URL" == https://* ]]; then + cert_output=$(echo | openssl s_client -servername "$cr_host" -connect "${cr_host}:${cr_port}" 2>/dev/null || true) + cert_enddate=$(echo "$cert_output" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || true) + + if [[ -n "$cert_enddate" ]]; then + expiry_epoch=$(date -d "$cert_enddate" +%s 2>/dev/null || echo "0") + now_epoch=$(date +%s) + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + + if [[ "$days_left" -gt 0 ]]; then + pass "SSL" "TLS certificate valid — $days_left days remaining" + else + fail "SSL" "TLS certificate expired" "$days_left days past expiry" + fi + else + fail "SSL" "TLS certificate valid" "Could not read certificate" + fi + + # Check chain + verify_result=$(echo | openssl s_client -servername "$cr_host" -connect "${cr_host}:${cr_port}" 2>&1 | grep "Verify return code" || true) + if echo "$verify_result" | grep -q "0 (ok)"; then + pass "SSL" "Certificate chain complete" + else + fail "SSL" "Certificate chain complete" "$verify_result" + fi + else + skip "SSL" "TLS certificate valid" + skip "SSL" "Certificate chain complete" + fi +fi + +# ===================================================== +# Suite 5: Docker Swarm +# ===================================================== +suite_header "Docker Swarm" + +node_count=$(echo "$sys_response" | jq -r '.data.swarmNodesCount // "unknown"' 2>/dev/null || echo "unknown") + +if [[ "$node_count" != "unknown" && "$node_count" -gt 0 ]] 2>/dev/null; then + pass "Docker Swarm" "Swarm active — $node_count node(s)" +else + fail "Docker Swarm" "Swarm status" "Could not determine node count" +fi + +# Count running services from app definitions +running_count=$(echo "$app_response" | jq '[.data.appDefinitions[] | select(.isAppBuilding == false)] | length' 2>/dev/null || echo "0") +total_count=$(echo "$app_response" | jq '.data.appDefinitions | length' 2>/dev/null || echo "0") +# Add 3 for captain-captain, captain-nginx, 
captain-certbot +service_count=$((running_count + 3)) + +pass "Docker Swarm" "Services running — $service_count services ($total_count apps + 3 system)" + +# ===================================================== +# Suite 6: Resources +# ===================================================== +suite_header "Resources" + +disk_used=$(echo "$sys_response" | jq -r '.data.diskUsedPercentage // "unknown"' 2>/dev/null || echo "unknown") + +if [[ "$disk_used" != "unknown" ]]; then + pass "Resources" "Disk usage — ${disk_used}%" +else + fail "Resources" "Disk usage" "Could not retrieve disk info" +fi + +volume_count=$(echo "$sys_response" | jq -r '.data.dockerVolumes | length // "unknown"' 2>/dev/null || echo "unknown") +if [[ "$volume_count" != "unknown" ]]; then + pass "Resources" "Docker volumes — $volume_count volumes" +else + # Volumes may not be in system info, skip gracefully + skip "Resources" "Docker volumes" +fi + +image_count=$(echo "$sys_response" | jq -r '.data.dockerImages | length // "unknown"' 2>/dev/null || echo "unknown") +if [[ "$image_count" != "unknown" ]]; then + pass "Resources" "Docker images — $image_count images" +else + skip "Resources" "Docker images" +fi + +# ===================================================== +# Summary +# ===================================================== +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +case "$OUTPUT_FORMAT" in + tap) + echo "TAP version 13" + echo "1..$TEST_NUM" + for line in "${TAP_RESULTS[@]}"; do + echo "$line" + done + echo "# passed: $PASSED" + echo "# failed: $FAILED" + echo "# skipped: $SKIPPED" + echo "# duration: ${DURATION}s" + ;; + junit) + { + echo '' + echo "" + echo " " + echo " " + echo " " + for result in "${JUNIT_RESULTS[@]}"; do + echo " $result" + done + echo "" + } > "$JUNIT_FILE" + echo "JUnit results written to $JUNIT_FILE" + ;; + *) + echo "" + echo "────────────────────────────────────────" + echo -e "Summary ${BOLD}$CAPROVER_URL${NC}" + echo -e " ${GREEN}$PASSED 
passed${NC} ${RED}$FAILED failed${NC} ${YELLOW}$SKIPPED skipped${NC} (${DURATION}s)" + echo "────────────────────────────────────────" + if [[ "$FAILED" -eq 0 ]]; then + echo -e "${GREEN}All tests passed.${NC}" + else + echo -e "${RED}Some tests failed.${NC}" + fi + ;; +esac + +exit $((FAILED > 0 ? 1 : 0)) diff --git a/certificate-smoke-tests.sh b/certificate-smoke-tests.sh new file mode 100755 index 0000000..9d02c0e --- /dev/null +++ b/certificate-smoke-tests.sh @@ -0,0 +1,650 @@ +#!/usr/bin/env bash + +##################################################################################### +#### certificate-smoke-tests.sh — Verify TLS certificates are healthy #### +#### Checks expiry, chain, OCSP, TLS version, ciphers, SAN, on-disk certs. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: TARGETS="example.com:443" ./certificate-smoke-tests.sh #### +#### CERT_FILES="/etc/ssl/certs/app.pem" ./certificate-smoke-tests.sh #### +#### #### +#### See --help for all options. 
#### +##################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +TARGETS="${TARGETS:-}" +WARN_DAYS="${WARN_DAYS:-30}" +CRITICAL_DAYS="${CRITICAL_DAYS:-7}" +CERT_FILES="${CERT_FILES:-}" +CERT_FILE="${CERT_FILE:-}" +KEY_FILE="${KEY_FILE:-}" +CHECK_OCSP="${CHECK_OCSP:-true}" +CHECK_TLS_VERSION="${CHECK_TLS_VERSION:-true}" +CHECK_HSTS="${CHECK_HSTS:-true}" +REJECT_SELF_SIGNED="${REJECT_SELF_SIGNED:-false}" +SKIP_CHAIN="${SKIP_CHAIN:-false}" +SKIP_OCSP="${SKIP_OCSP:-false}" +SKIP_TLS_VERSION="${SKIP_TLS_VERSION:-false}" +CONNECT_TIMEOUT="${CONNECT_TIMEOUT:-10}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +COLOR="${COLOR:-auto}" +VERBOSE="${VERBOSE:-false}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0; FAIL=0; SKIP=0; TOTAL=0 +RESULTS=() +START_TIME="" +CERT_TMP="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording ───────────────────────────────────────────── +record_pass() { + local name="$1" detail="${2:-}" + ((PASS++)) || true; ((TOTAL++)) || true + RESULTS+=("PASS|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name}${detail:+ (${detail})}" + else echo 
-e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_fail() { + local name="$1" detail="${2:-}" + ((FAIL++)) || true; ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_skip() { + local name="$1" reason="${2:-}" + ((SKIP++)) || true; ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"; fi +} + +# ── Helpers ─────────────────────────────────────────────────────────── +has_cmd() { command -v "$1" >/dev/null 2>&1; } + +section() { + if [[ "$OUTPUT_FORMAT" != "tap" ]]; then echo ""; echo -e "${BOLD}$1${RESET}"; fi +} + +# ── Cleanup ─────────────────────────────────────────────────────────── +# shellcheck disable=SC2317 +cleanup() { + [[ -n "${CERT_TMP}" && -d "${CERT_TMP}" ]] && rm -rf "${CERT_TMP}" +} +trap cleanup EXIT + +# ══════════════════════════════════════════════════════════════════════ +# HELPER FUNCTIONS +# ══════════════════════════════════════════════════════════════════════ + +# Fetch certificate from a remote host:port, store in temp file +# Returns path to PEM file on stdout, empty on failure +fetch_cert() { + local host="$1" port="$2" pem_file + pem_file="${CERT_TMP}/${host}_${port}.pem" + verbose "Fetching certificate from ${host}:${port}" + if echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" \ + -servername "${host}" \ + -showcerts /dev/null \ + | openssl x509 -outform PEM > "${pem_file}" 2>/dev/null; then + if [[ -s "${pem_file}" ]]; then + echo "${pem_file}" + return 0 + fi + fi + return 1 +} + +# Fetch full chain from remote host:port +fetch_chain() { + local host="$1" port="$2" chain_file + 
chain_file="${CERT_TMP}/${host}_${port}_chain.pem" + echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" \ + -servername "${host}" \ + -showcerts "${CERT_TMP}/s_client_err.txt" \ + > "${chain_file}" 2>/dev/null || true + if [[ -s "${chain_file}" ]]; then + echo "${chain_file}" + fi +} + +# Get days until certificate expires +# Args: path to PEM file +# Returns: integer days (negative = already expired) +cert_days_remaining() { + local pem_file="$1" + local end_date epoch_end epoch_now + end_date=$(openssl x509 -in "${pem_file}" -noout -enddate 2>/dev/null | sed 's/notAfter=//') || return 1 + epoch_end=$(date -d "${end_date}" +%s 2>/dev/null) || return 1 + epoch_now=$(date +%s) + echo $(( (epoch_end - epoch_now) / 86400 )) +} + +# ══════════════════════════════════════════════════════════════════════ +# TEST FUNCTIONS +# ══════════════════════════════════════════════════════════════════════ + +# ── Certificate Expiry ─────────────────────────────────────────────── +test_cert_expiry() { + local host="$1" port="$2" pem_file + pem_file=$(fetch_cert "$host" "$port") || { + record_fail "Certificate expiry (${host}:${port})" "could not connect" + return + } + + local days + days=$(cert_days_remaining "$pem_file") || { + record_fail "Certificate expiry (${host}:${port})" "could not parse expiry date" + return + } + + if [[ $days -lt 0 ]]; then + record_fail "Certificate expiry (${host}:${port})" "EXPIRED ${days#-} days ago" + elif [[ $days -lt $CRITICAL_DAYS ]]; then + record_fail "Certificate expiry (${host}:${port})" "expires in ${days}d (critical < ${CRITICAL_DAYS}d)" + elif [[ $days -lt $WARN_DAYS ]]; then + record_pass "Certificate expiry (${host}:${port})" "expires in ${days}d (warning < ${WARN_DAYS}d)" + else + record_pass "Certificate expiry (${host}:${port})" "expires in ${days}d" + fi +} + +# ── Subject / SAN Match ───────────────────────────────────────────── +test_san_match() { + local host="$1" port="$2" pem_file + 
pem_file="${CERT_TMP}/${host}_${port}.pem" + [[ ! -s "$pem_file" ]] && { record_skip "SAN match (${host}:${port})" "no certificate fetched"; return; } + + local san_output cn_output matched=false + san_output=$(openssl x509 -in "${pem_file}" -noout -ext subjectAltName 2>/dev/null) || true + cn_output=$(openssl x509 -in "${pem_file}" -noout -subject 2>/dev/null | grep -oP 'CN\s*=\s*\K[^/,]+') || true + + if echo "$san_output" | grep -qi "DNS:${host}"; then + matched=true + elif echo "$san_output" | grep -qi "DNS:\*.$(echo "$host" | sed 's/^[^.]*\.//')"; then + matched=true + elif [[ "${cn_output}" == "${host}" ]]; then + matched=true + fi + + if $matched; then + record_pass "SAN match (${host}:${port})" "hostname matches certificate" + else + record_fail "SAN match (${host}:${port})" "hostname not in CN or SAN" + fi +} + +# ── Chain Validation ───────────────────────────────────────────────── +test_chain_valid() { + local host="$1" port="$2" + if [[ "$SKIP_CHAIN" == "true" ]]; then + record_skip "Chain valid (${host}:${port})" "SKIP_CHAIN=true" + return + fi + + local verify_output + verify_output=$(echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" \ + -servername "${host}" \ + -verify_return_error &1) || true + + if echo "$verify_output" | grep -q "Verify return code: 0"; then + record_pass "Chain valid (${host}:${port})" "full chain verified" + elif echo "$verify_output" | grep -q "Verify return code: 18\|Verify return code: 19"; then + if [[ "$REJECT_SELF_SIGNED" == "true" ]]; then + record_fail "Chain valid (${host}:${port})" "self-signed certificate" + else + record_pass "Chain valid (${host}:${port})" "self-signed (allowed)" + fi + else + local code + code=$(echo "$verify_output" | grep -oP 'Verify return code: \K[0-9]+' | head -1) || code="unknown" + record_fail "Chain valid (${host}:${port})" "verify failed (code ${code})" + fi +} + +# ── Self-signed Detection ──────────────────────────────────────────── +test_self_signed() 
{ + local host="$1" port="$2" pem_file + pem_file="${CERT_TMP}/${host}_${port}.pem" + [[ ! -s "$pem_file" ]] && { record_skip "Self-signed check (${host}:${port})" "no certificate fetched"; return; } + + local issuer subject + issuer=$(openssl x509 -in "${pem_file}" -noout -issuer 2>/dev/null) || true + subject=$(openssl x509 -in "${pem_file}" -noout -subject 2>/dev/null) || true + + if [[ "$issuer" == "$subject" ]]; then + if [[ "$REJECT_SELF_SIGNED" == "true" ]]; then + record_fail "Self-signed check (${host}:${port})" "certificate is self-signed" + else + record_pass "Self-signed check (${host}:${port})" "self-signed (allowed)" + fi + else + record_pass "Self-signed check (${host}:${port})" "CA-signed" + fi +} + +# ── OCSP Stapling ──────────────────────────────────────────────────── +test_ocsp_stapling() { + local host="$1" port="$2" + if [[ "$SKIP_OCSP" == "true" ]]; then + record_skip "OCSP stapling (${host}:${port})" "SKIP_OCSP=true" + return + fi + + local ocsp_output + ocsp_output=$(echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" \ + -servername "${host}" \ + -status &1) || true + + if echo "$ocsp_output" | grep -q "OCSP Response Status: successful"; then + record_pass "OCSP stapling (${host}:${port})" "stapled response present" + elif echo "$ocsp_output" | grep -q "OCSP response: no response sent"; then + record_pass "OCSP stapling (${host}:${port})" "not configured (optional)" + else + record_pass "OCSP stapling (${host}:${port})" "status unknown (non-critical)" + fi +} + +# ── OCSP Responder Reachable ───────────────────────────────────────── +test_ocsp_responder() { + local host="$1" port="$2" pem_file + if [[ "$SKIP_OCSP" == "true" ]]; then + record_skip "OCSP responder (${host}:${port})" "SKIP_OCSP=true" + return + fi + + pem_file="${CERT_TMP}/${host}_${port}.pem" + [[ ! 
-s "$pem_file" ]] && { record_skip "OCSP responder (${host}:${port})" "no certificate fetched"; return; } + + local ocsp_uri + ocsp_uri=$(openssl x509 -in "${pem_file}" -noout -ocsp_uri 2>/dev/null) || true + + if [[ -z "$ocsp_uri" ]]; then + record_skip "OCSP responder (${host}:${port})" "no OCSP URI in certificate" + return + fi + + verbose "OCSP URI: ${ocsp_uri}" + local ocsp_host + ocsp_host=$(echo "$ocsp_uri" | sed 's|https\?://||' | cut -d/ -f1) + + if has_cmd curl; then + if curl -sf --max-time 5 -o /dev/null "${ocsp_uri}" 2>/dev/null; then + record_pass "OCSP responder (${host}:${port})" "${ocsp_host} reachable" + else + record_fail "OCSP responder (${host}:${port})" "${ocsp_host} unreachable" + fi + elif ping -c1 -W3 "$ocsp_host" >/dev/null 2>&1; then + record_pass "OCSP responder (${host}:${port})" "${ocsp_host} reachable (ping)" + else + record_fail "OCSP responder (${host}:${port})" "${ocsp_host} unreachable" + fi +} + +# ── TLS Version Check ──────────────────────────────────────────────── +test_tls_version() { + local host="$1" port="$2" + if [[ "$SKIP_TLS_VERSION" == "true" ]]; then + record_skip "TLS version (${host}:${port})" "SKIP_TLS_VERSION=true" + return + fi + + # Check TLS 1.2 supported + local tls12_ok=false tls13_ok=false + if echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" -servername "${host}" \ + -tls1_2 &1 | grep -q "Protocol.*TLSv1.2"; then + tls12_ok=true + fi + + # Check TLS 1.3 supported + if echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" -servername "${host}" \ + -tls1_3 &1 | grep -q "Protocol.*TLSv1.3"; then + tls13_ok=true + fi + + if $tls13_ok; then + record_pass "TLS version (${host}:${port})" "TLS 1.3 supported" + elif $tls12_ok; then + record_pass "TLS version (${host}:${port})" "TLS 1.2 supported" + else + record_fail "TLS version (${host}:${port})" "neither TLS 1.2 nor 1.3 supported" + fi + + # Check TLS 1.0 rejected + local tls10_output + 
tls10_output=$(echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" -servername "${host}" \ + -tls1 &1) || true + + if echo "$tls10_output" | grep -q "Protocol.*TLSv1$\|Protocol.*TLSv1.0"; then + record_fail "TLS 1.0 rejected (${host}:${port})" "TLS 1.0 still accepted" + else + record_pass "TLS 1.0 rejected (${host}:${port})" "correctly refused" + fi +} + +# ── Cipher Strength ────────────────────────────────────────────────── +test_cipher_strength() { + local host="$1" port="$2" + + local cipher_output negotiated + cipher_output=$(echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" -servername "${host}" \ + &1) || true + + negotiated=$(echo "$cipher_output" | grep -oP 'Cipher\s+:\s+\K\S+' | head -1) || true + + if [[ -z "$negotiated" ]]; then + record_skip "Cipher strength (${host}:${port})" "could not determine cipher" + return + fi + + local weak_ciphers="RC4|DES|3DES|NULL|EXPORT|MD5|anon" + if echo "$negotiated" | grep -qiE "$weak_ciphers"; then + record_fail "Cipher strength (${host}:${port})" "weak cipher: ${negotiated}" + else + record_pass "Cipher strength (${host}:${port})" "${negotiated}" + fi +} + +# ── HSTS Header ────────────────────────────────────────────────────── +test_hsts() { + local host="$1" port="$2" + if [[ "$CHECK_HSTS" != "true" ]]; then + record_skip "HSTS header (${host}:${port})" "CHECK_HSTS=false" + return + fi + if ! 
has_cmd curl; then + record_skip "HSTS header (${host}:${port})" "curl not installed" + return + fi + if [[ "$port" != "443" ]]; then + record_skip "HSTS header (${host}:${port})" "not HTTPS port" + return + fi + + local headers + headers=$(curl -sI --max-time 5 -k "https://${host}/" 2>/dev/null) || true + + if echo "$headers" | grep -qi "Strict-Transport-Security"; then + local max_age + max_age=$(echo "$headers" | grep -oi 'max-age=[0-9]*' | head -1 | cut -d= -f2) || true + record_pass "HSTS header (${host}:${port})" "max-age=${max_age:-unknown}" + else + record_fail "HSTS header (${host}:${port})" "header not present" + fi +} + +# ── Certificate SCT ───────────────────────────────────────────────── +test_sct() { + local host="$1" port="$2" + + local sct_output + sct_output=$(echo | timeout "${CONNECT_TIMEOUT}" openssl s_client \ + -connect "${host}:${port}" -servername "${host}" \ + -ct &1) || true + + if echo "$sct_output" | grep -qi "SCT validation status\|Signed Certificate Timestamp"; then + record_pass "Certificate transparency (${host}:${port})" "SCT present" + elif echo "$sct_output" | grep -qi "unknown option\|unrecognized option"; then + record_skip "Certificate transparency (${host}:${port})" "openssl does not support -ct" + else + record_pass "Certificate transparency (${host}:${port})" "SCT status unknown (non-critical)" + fi +} + +# ── On-disk Certificate File Expiry ────────────────────────────────── +test_cert_file_expiry() { + local cert_path="$1" + local filename + filename=$(basename "$cert_path") + + if [[ ! 
-f "$cert_path" ]]; then + record_fail "File expiry (${filename})" "file not found: ${cert_path}" + return + fi + + local days + days=$(cert_days_remaining "$cert_path") || { + record_fail "File expiry (${filename})" "could not parse certificate" + return + } + + if [[ $days -lt 0 ]]; then + record_fail "File expiry (${filename})" "EXPIRED ${days#-} days ago" + elif [[ $days -lt $CRITICAL_DAYS ]]; then + record_fail "File expiry (${filename})" "expires in ${days}d (critical < ${CRITICAL_DAYS}d)" + elif [[ $days -lt $WARN_DAYS ]]; then + record_pass "File expiry (${filename})" "expires in ${days}d (warning < ${WARN_DAYS}d)" + else + record_pass "File expiry (${filename})" "expires in ${days}d" + fi +} + +# ── Key / Cert Match ──────────────────────────────────────────────── +test_key_cert_match() { + if [[ -z "$CERT_FILE" || -z "$KEY_FILE" ]]; then + record_skip "Key/cert match" "CERT_FILE or KEY_FILE not set" + return + fi + if [[ ! -f "$CERT_FILE" ]]; then + record_fail "Key/cert match" "cert file not found: ${CERT_FILE}" + return + fi + if [[ ! 
-f "$KEY_FILE" ]]; then + record_fail "Key/cert match" "key file not found: ${KEY_FILE}" + return + fi + + local cert_mod key_mod + cert_mod=$(openssl x509 -in "${CERT_FILE}" -noout -modulus 2>/dev/null | md5sum | awk '{print $1}') || true + key_mod=$(openssl rsa -in "${KEY_FILE}" -noout -modulus 2>/dev/null | md5sum | awk '{print $1}') || { + key_mod=$(openssl ec -in "${KEY_FILE}" -noout -text 2>/dev/null | md5sum | awk '{print $1}') || true + } + + if [[ -n "$cert_mod" && "$cert_mod" == "$key_mod" ]]; then + record_pass "Key/cert match" "modulus matches" + elif [[ -z "$cert_mod" || -z "$key_mod" ]]; then + record_skip "Key/cert match" "could not extract modulus" + else + record_fail "Key/cert match" "cert and key do not match" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_tap_header() { echo "TAP version 13"; } + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +print_summary() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} Certificate Smoke Tests" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + if [[ $FAIL -eq 0 ]]; then echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"; fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } + +# ── Helpers 
─────────────────────────────────────────────────────────── +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +detect_package_manager() { + if command -v dpkg-query &>/dev/null; then + echo "dpkg" + elif command -v rpm &>/dev/null; then + echo "rpm" + else + echo "unknown" + fi +} + +get_package_list() { + local pm + pm=$(detect_package_manager) + case "$pm" in + dpkg) + dpkg-query -W -f='${Package} ${Version}\n' 2>/dev/null | sort + ;; + rpm) + rpm -qa --queryformat '%{NAME} %{VERSION}-%{RELEASE}\n' 2>/dev/null | sort + ;; + *) + err "No supported package manager found (need dpkg or rpm)" + exit 1 + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# SNAPSHOT +# ══════════════════════════════════════════════════════════════════════ + +do_snapshot() { + if [[ ! -d "$SNAPSHOT_DIR" ]]; then + mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || { + err "Cannot create snapshot directory: ${SNAPSHOT_DIR}" + exit 1 + } + fi + + local timestamp + timestamp=$(date +%Y%m%d-%H%M%S) + local host + host=$(hostname -s 2>/dev/null || hostname) + local pm + pm=$(detect_package_manager) + local snapshot_file="${SNAPSHOT_DIR}/${host}-${pm}-${timestamp}.txt" + + verbose "Package manager: ${pm}" + verbose "Snapshot file: ${snapshot_file}" + + get_package_list > "$snapshot_file" + + local pkg_count + pkg_count=$(wc -l < "$snapshot_file") + + log "Snapshot saved: ${snapshot_file}" + field "Packages:" "$pkg_count" + field "Package manager:" "$pm" + field "File:" "$snapshot_file" +} + +# ══════════════════════════════════════════════════════════════════════ +# DIFF +# ══════════════════════════════════════════════════════════════════════ + +do_diff() { + local file_a="$1" + local file_b="$2" + + if [[ ! -f "$file_a" ]]; then + err "File not found: ${file_a}" + exit 1 + fi + if [[ ! 
-f "$file_b" ]]; then + err "File not found: ${file_b}" + exit 1 + fi + + echo "" + echo -e "${BOLD}Package Diff${RESET}" + field "Before:" "$file_a" + field "After:" "$file_b" + + local added=0 removed=0 upgraded=0 downgraded=0 + + # Build associative arrays + local tmp_added tmp_removed tmp_changed + tmp_added=$(mktemp) + tmp_removed=$(mktemp) + tmp_changed=$(mktemp) + trap 'rm -f "$tmp_added" "$tmp_removed" "$tmp_changed"' EXIT + + # Find added packages (in B but not A) + while IFS=' ' read -r pkg ver; do + if ! grep -q "^${pkg} " "$file_a"; then + echo "${pkg} ${ver}" >> "$tmp_added" + fi + done < "$file_b" + + # Find removed packages (in A but not B) + while IFS=' ' read -r pkg ver; do + if ! grep -q "^${pkg} " "$file_b"; then + echo "${pkg} ${ver}" >> "$tmp_removed" + fi + done < "$file_a" + + # Find changed packages + while IFS=' ' read -r pkg ver_b; do + local ver_a + ver_a=$(grep "^${pkg} " "$file_a" 2>/dev/null | head -1 | cut -d' ' -f2-) + if [[ -n "$ver_a" && "$ver_a" != "$ver_b" ]]; then + echo "${pkg} ${ver_a} ${ver_b}" >> "$tmp_changed" + fi + done < "$file_b" + + # Display additions + if [[ -s "$tmp_added" ]]; then + section_header "Added Packages" + while IFS=' ' read -r pkg ver; do + printf " ${CYAN}+${RESET} %-40s %s\n" "$pkg" "$ver" + added=$((added + 1)) + done < "$tmp_added" + fi + + # Display removals + if [[ -s "$tmp_removed" ]]; then + section_header "Removed Packages" + while IFS=' ' read -r pkg ver; do + printf " ${RED}-${RESET} %-40s %s\n" "$pkg" "$ver" + removed=$((removed + 1)) + done < "$tmp_removed" + fi + + # Display upgrades and downgrades + if [[ -s "$tmp_changed" ]]; then + local has_upgrades=false + local has_downgrades=false + + # First pass: categorize + while IFS=' ' read -r pkg ver_a ver_b; do + if dpkg --compare-versions "$ver_b" gt "$ver_a" 2>/dev/null; then + has_upgrades=true + elif dpkg --compare-versions "$ver_b" lt "$ver_a" 2>/dev/null; then + has_downgrades=true + else + # Fallback: string comparison + if [[ "$ver_b" 
> "$ver_a" ]]; then + has_upgrades=true + else + has_downgrades=true + fi + fi + done < "$tmp_changed" + + if [[ "$has_upgrades" == "true" ]]; then + section_header "Upgraded Packages" + while IFS=' ' read -r pkg ver_a ver_b; do + local is_upgrade=false + if dpkg --compare-versions "$ver_b" gt "$ver_a" 2>/dev/null; then + is_upgrade=true + elif ! dpkg --compare-versions "$ver_b" lt "$ver_a" 2>/dev/null && [[ "$ver_b" > "$ver_a" ]]; then + is_upgrade=true + fi + if [[ "$is_upgrade" == "true" ]]; then + printf " ${GREEN}↑${RESET} %-35s %s → %s\n" "$pkg" "$ver_a" "$ver_b" + upgraded=$((upgraded + 1)) + fi + done < "$tmp_changed" + fi + + if [[ "$has_downgrades" == "true" ]]; then + section_header "Downgraded Packages" + while IFS=' ' read -r pkg ver_a ver_b; do + local is_downgrade=false + if dpkg --compare-versions "$ver_b" lt "$ver_a" 2>/dev/null; then + is_downgrade=true + elif ! dpkg --compare-versions "$ver_b" gt "$ver_a" 2>/dev/null && [[ "$ver_b" < "$ver_a" ]]; then + is_downgrade=true + fi + if [[ "$is_downgrade" == "true" ]]; then + printf " ${YELLOW}↓${RESET} %-35s %s → %s\n" "$pkg" "$ver_a" "$ver_b" + downgraded=$((downgraded + 1)) + fi + done < "$tmp_changed" + fi + fi + + # Summary + local total=$((added + removed + upgraded + downgraded)) + + echo "" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + echo -e " ${BOLD}Change Summary${RESET}" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + field "Total changes:" "$total" + if [[ "$added" -gt 0 ]]; then + printf " ${BOLD}%-22s${RESET} ${CYAN}%s${RESET}\n" "Additions:" "$added" + else + field "Additions:" "0" + fi + if [[ "$removed" -gt 0 ]]; then + printf " ${BOLD}%-22s${RESET} ${RED}%s${RESET}\n" "Removals:" "$removed" + else + field "Removals:" "0" + fi + if [[ "$upgraded" -gt 0 ]]; then + printf " ${BOLD}%-22s${RESET} ${GREEN}%s${RESET}\n" "Upgrades:" "$upgraded" + else + field "Upgrades:" "0" + fi + if [[ "$downgraded" -gt 0 ]]; then + printf " 
${BOLD}%-22s${RESET} ${YELLOW}%s${RESET}\n" "Downgrades:" "$downgraded" + else + field "Downgrades:" "0" + fi + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# REMOTE COMPARE +# ══════════════════════════════════════════════════════════════════════ + +do_remote() { + local remote="$1" + + if ! command -v ssh &>/dev/null; then + err "ssh is required for remote comparison" + exit 1 + fi + + log "Fetching local package list..." + local local_file + local_file=$(mktemp) + + log "Fetching remote package list from ${remote}..." + local remote_file + remote_file=$(mktemp) + trap 'rm -f "$local_file" "$remote_file"' EXIT + + get_package_list > "$local_file" + + local pm + pm=$(detect_package_manager) + case "$pm" in + dpkg) + ssh "$remote" "dpkg-query -W -f='\${Package} \${Version}\n' 2>/dev/null | sort" > "$remote_file" || { + err "Failed to fetch package list from ${remote}" + exit 1 + } + ;; + rpm) + ssh "$remote" "rpm -qa --queryformat '%{NAME} %{VERSION}-%{RELEASE}\n' 2>/dev/null | sort" > "$remote_file" || { + err "Failed to fetch package list from ${remote}" + exit 1 + } + ;; + esac + + log "Comparing local vs ${remote}..." 
+ do_diff "$local_file" "$remote_file" +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat < + ${SCRIPT_NAME} --remote + +MODES: + --snapshot Save current package list to a timestamped file + --diff FILE1 FILE2 Compare two snapshot files + --remote HOST Compare local packages with a remote host via SSH + +OPTIONS: + --snapshot-dir DIR Directory for snapshots (default: ${SNAPSHOT_DIR}) + --verbose Enable debug output + --no-color Disable colored output + --help Show this help + +ENVIRONMENT VARIABLES: + SNAPSHOT_DIR Snapshot directory (default: /var/backups/pkg-snapshots) + COLOR Color mode: auto, always, never (default: auto) + +EXAMPLES: + # Take a snapshot before upgrade + ./changelog-diff.sh --snapshot + + # Upgrade packages, take another snapshot, then diff + ./changelog-diff.sh --snapshot + sudo apt upgrade -y + ./changelog-diff.sh --snapshot + ./changelog-diff.sh --diff /var/backups/pkg-snapshots/host-*.txt + + # Compare with a remote server + ./changelog-diff.sh --remote admin@prod-server + + # Custom snapshot directory + ./changelog-diff.sh --snapshot --snapshot-dir /tmp/snapshots +EOF +} + +# ══════════════════════════════════════════════════════════════════════ +# ARGUMENT PARSING +# ══════════════════════════════════════════════════════════════════════ + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --snapshot) + MODE="snapshot"; shift ;; + --diff) + MODE="diff" + if [[ $# -lt 3 ]]; then + err "--diff requires two file arguments" + exit 1 + fi + DIFF_FILE_A="$2" + DIFF_FILE_B="$3" + shift 3 ;; + --remote) + MODE="remote" + if [[ $# -lt 2 ]]; then + err "--remote requires a host argument" + exit 1 + fi + REMOTE_HOST="$2" + shift 2 ;; + --snapshot-dir) + SNAPSHOT_DIR="$2"; shift 2 ;; + --verbose) + VERBOSE="true"; shift ;; + --no-color) + COLOR="never"; shift ;; + --help|-h) + setup_colors + usage + exit 0 ;; + *) + 
err "Unknown option: $1" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 ;; + esac + done + + if [[ -z "$MODE" ]]; then + err "No mode specified. Use --snapshot, --diff, or --remote" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + echo "" + echo -e "${BOLD}Package Changelog Diff — $(hostname -f 2>/dev/null || hostname)${RESET}" + echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}" + + case "$MODE" in + snapshot) + do_snapshot + ;; + diff) + do_diff "$DIFF_FILE_A" "$DIFF_FILE_B" + ;; + remote) + do_remote "$REMOTE_HOST" + ;; + esac +} + +main "$@" diff --git a/chaos-runner.sh b/chaos-runner.sh new file mode 100755 index 0000000..2755fe3 --- /dev/null +++ b/chaos-runner.sh @@ -0,0 +1,739 @@ +#!/usr/bin/env bash +######################################################################################### +#### chaos-runner.sh — Inject controlled failures and verify system recovery #### +#### CPU stress, memory pressure, disk fill, service kill, network faults #### +#### Requires: bash 4+, root privileges #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### sudo ./chaos-runner.sh --fault cpu-stress --duration 30 #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Color variables — pre-initialized empty, set by setup_colors() +# --------------------------------------------------------------------------- +RED="" +GREEN="" +YELLOW="" +BLUE="" +CYAN="" +BOLD="" +DIM="" +RESET="" + +setup_colors() { + if [[ "${COLOR}" == "never" ]]; then + return + fi + if [[ "${COLOR}" == "always" ]] || [[ -t 1 ]]; then + RED="\033[0;31m" + GREEN="\033[0;32m" + YELLOW="\033[0;33m" + BLUE="\033[0;34m" + CYAN="\033[0;36m" + BOLD="\033[1m" + DIM="\033[2m" + RESET="\033[0m" + fi +} + +# --------------------------------------------------------------------------- +# Standard helpers +# --------------------------------------------------------------------------- +log() { printf "%b[+]%b %s\n" "$GREEN" "$RESET" "$*"; } +warn() { printf "%b[!]%b %s\n" "$YELLOW" "$RESET" "$*" >&2; } +err() { printf "%b[-]%b %s\n" "$RED" "$RESET" "$*" >&2; } +verbose() { [[ "$VERBOSE" == "true" ]] && printf "%b[~]%b %s\n" "$DIM" "$RESET" "$*"; return 0; } +die() { err "$*"; exit 1; } + +section_header() { + printf "\n%b%b══ %b%s%b\n" "$CYAN" "$BOLD" "$BLUE" "$*" "$RESET" +} + +field() { + printf " %-24s %s\n" "$1" "$2" +} + +field_color() { + local label="$1" color="$2" value="$3" + printf " %-24s %b%s%b\n" "$label" "$color" "$value" "$RESET" +} + +# --------------------------------------------------------------------------- +# Defaults +# --------------------------------------------------------------------------- +RUN_MODE="" +FAULT_TYPE="" +DURATION="${CHAOS_DURATION:-30}" +TARGET_SERVICE="" +FILL_PATH="${CHAOS_FILL_PATH:-/tmp}" +FILL_SIZE="${CHAOS_FILL_SIZE:-90}" +LATENCY_MS="${CHAOS_LATENCY:-200}" +DROP_PERCENT="${CHAOS_DROP:-50}" +NETWORK_IFACE="${CHAOS_IFACE:-eth0}" +PLAN_FILE="" +CONFIRM_YES=false +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +CLEANUP_PIDS=() 
+CLEANUP_FILES=() +CHAOS_ACTIVE=false + +# --------------------------------------------------------------------------- +# State +# --------------------------------------------------------------------------- +readonly SCRIPT_NAME="${0##*/}" +START_TIME="" + +# --------------------------------------------------------------------------- +# Trap +# --------------------------------------------------------------------------- +trap cleanup_all EXIT INT TERM + +# --------------------------------------------------------------------------- +# Safety — cleanup +# --------------------------------------------------------------------------- +cleanup_all() { + if [[ "${CHAOS_ACTIVE}" != "true" ]]; then + return + fi + CHAOS_ACTIVE=false + warn "Running cleanup..." + + # Kill tracked background PIDs + local pid + for pid in "${CLEANUP_PIDS[@]}"; do + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + done + CLEANUP_PIDS=() + + # Remove tracked temp files + local f + for f in "${CLEANUP_FILES[@]}"; do + if [[ -d "$f" ]] && mountpoint -q "$f" 2>/dev/null; then + umount "$f" 2>/dev/null || true + rmdir "$f" 2>/dev/null || true + elif [[ -f "$f" ]]; then + rm -f "$f" 2>/dev/null || true + elif [[ -d "$f" ]]; then + rmdir "$f" 2>/dev/null || true + fi + done + CLEANUP_FILES=() + + # Remove tc qdiscs + tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true + + # Restore resolv.conf from backup + if [[ -f /etc/resolv.conf.chaos-backup ]]; then + mv /etc/resolv.conf.chaos-backup /etc/resolv.conf 2>/dev/null || true + log "Restored /etc/resolv.conf from backup" + fi + + log "Cleanup complete" +} + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- +require_root() { + if [[ "$(id -u)" -ne 0 ]]; then + die "This operation requires root privileges. Run with sudo." 
+ fi +} + +confirm_action() { + local message="$1" + if [[ "$CONFIRM_YES" == "true" ]]; then + return 0 + fi + printf "%b[?]%b %s [y/N] " "$YELLOW" "$RESET" "$message" + local answer + read -r answer + case "$answer" in + [yY]|[yY][eE][sS]) return 0 ;; + *) die "Aborted by user" ;; + esac +} + +wait_duration() { + local remaining="$DURATION" + while [[ "$remaining" -gt 0 ]]; do + printf "\r %bTime remaining: %ds%b " "$DIM" "$remaining" "$RESET" + sleep 1 + ((remaining--)) || true + done + printf "\r%40s\r" "" +} + +# --------------------------------------------------------------------------- +# Fault: cpu-stress +# --------------------------------------------------------------------------- +fault_cpu_stress() { + local cores + cores=$(nproc) + section_header "CPU Stress — saturating $cores cores for ${DURATION}s" + CHAOS_ACTIVE=true + + local i + for ((i = 0; i < cores; i++)); do + while :; do :; done & + CLEANUP_PIDS+=("$!") + verbose "Spawned CPU worker PID $!" + done + + log "Started $cores CPU stress workers" + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: memory-pressure +# --------------------------------------------------------------------------- +fault_memory_pressure() { + section_header "Memory Pressure — filling tmpfs for ${DURATION}s" + CHAOS_ACTIVE=true + + local mount_dir + mount_dir=$(mktemp -d /tmp/chaos-mem-XXXXXX) + mount -t tmpfs -o size=256M tmpfs "$mount_dir" + CLEANUP_FILES+=("$mount_dir") + + log "Mounted tmpfs at $mount_dir (256M)" + head -c 240M /dev/urandom > "${mount_dir}/fill.dat" 2>/dev/null || true + log "Filled tmpfs with ~240M of data" + + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: disk-fill +# --------------------------------------------------------------------------- +fault_disk_fill() { + section_header "Disk Fill — filling ${FILL_PATH} to ${FILL_SIZE}% for ${DURATION}s" + 
CHAOS_ACTIVE=true + + local current_usage target_bytes fill_file total_kb + fill_file="${FILL_PATH}/chaos-fill-$(date +%s).dat" + + total_kb=$(df --output=size -k "$FILL_PATH" | tail -1 | tr -d ' ') + current_usage=$(df --output=pcent "$FILL_PATH" | tail -1 | tr -d ' %') + + if [[ "$current_usage" -ge "$FILL_SIZE" ]]; then + warn "Disk already at ${current_usage}% — above target ${FILL_SIZE}%" + return + fi + + target_bytes=$(( (FILL_SIZE - current_usage) * total_kb * 1024 / 100 )) + local target_mb=$(( target_bytes / 1048576 )) + + log "Writing ${target_mb}M to $fill_file" + dd if=/dev/zero of="$fill_file" bs=1M count="$target_mb" status=none 2>/dev/null || true + CLEANUP_FILES+=("$fill_file") + + log "Disk fill complete — $(df --output=pcent "$FILL_PATH" | tail -1 | tr -d ' ') used" + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: service-kill +# --------------------------------------------------------------------------- +fault_service_kill() { + if [[ -z "$TARGET_SERVICE" ]]; then + die "service-kill requires --target SERVICE_NAME" + fi + section_header "Service Kill — stopping ${TARGET_SERVICE} for ${DURATION}s" + CHAOS_ACTIVE=true + + if ! systemctl is-active --quiet "$TARGET_SERVICE"; then + die "Service '$TARGET_SERVICE' is not currently active" + fi + + confirm_action "Stop service '$TARGET_SERVICE' for ${DURATION}s?" + + systemctl stop "$TARGET_SERVICE" + log "Stopped $TARGET_SERVICE" + + wait_duration + + log "Restarting $TARGET_SERVICE..." + systemctl start "$TARGET_SERVICE" + log "Service $TARGET_SERVICE restarted" + CHAOS_ACTIVE=false +} + +# --------------------------------------------------------------------------- +# Fault: network-latency +# --------------------------------------------------------------------------- +fault_network_latency() { + section_header "Network Latency — ${LATENCY_MS}ms on ${NETWORK_IFACE} for ${DURATION}s" + CHAOS_ACTIVE=true + + if ! 
command -v tc &>/dev/null; then + die "tc (iproute2) is required for network faults" + fi + + tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true + tc qdisc add dev "$NETWORK_IFACE" root netem delay "${LATENCY_MS}ms" + log "Added ${LATENCY_MS}ms latency to $NETWORK_IFACE" + + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: network-drop +# --------------------------------------------------------------------------- +fault_network_drop() { + section_header "Network Drop — ${DROP_PERCENT}% loss on ${NETWORK_IFACE} for ${DURATION}s" + CHAOS_ACTIVE=true + + if ! command -v tc &>/dev/null; then + die "tc (iproute2) is required for network faults" + fi + + tc qdisc del dev "$NETWORK_IFACE" root 2>/dev/null || true + tc qdisc add dev "$NETWORK_IFACE" root netem loss "${DROP_PERCENT}%" + log "Added ${DROP_PERCENT}% packet loss to $NETWORK_IFACE" + + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: dns-failure +# --------------------------------------------------------------------------- +fault_dns_failure() { + section_header "DNS Failure — breaking DNS for ${DURATION}s" + CHAOS_ACTIVE=true + + if [[ -f /etc/resolv.conf.chaos-backup ]]; then + die "A chaos backup of resolv.conf already exists — run --cleanup first" + fi + + cp /etc/resolv.conf /etc/resolv.conf.chaos-backup + CLEANUP_FILES+=("/etc/resolv.conf.chaos-backup") + + printf "# Chaos: DNS intentionally broken\nnameserver 127.0.0.254\n" > /etc/resolv.conf + log "Replaced /etc/resolv.conf with broken nameserver" + + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Fault: io-latency +# --------------------------------------------------------------------------- +fault_io_latency() { + section_header "I/O Latency — degrading I/O for ${DURATION}s" + CHAOS_ACTIVE=true + + local io_file + 
io_file="${FILL_PATH}/chaos-io-$(date +%s).dat" + + ionice -c 2 -n 7 dd if=/dev/urandom of="$io_file" bs=4K count=0 status=none 2>/dev/null & + CLEANUP_PIDS+=("$!") + CLEANUP_FILES+=("$io_file") + + # Run continuous slow I/O in background + ( + while :; do + ionice -c 3 dd if=/dev/zero of="$io_file" bs=4K count=256 conv=fdatasync status=none 2>/dev/null || true + sync + sleep 0.5 + done + ) & + CLEANUP_PIDS+=("$!") + + log "Started degraded I/O worker (idle-class ionice)" + wait_duration + cleanup_all +} + +# --------------------------------------------------------------------------- +# Dispatch +# --------------------------------------------------------------------------- +do_fault() { + require_root + if [[ -z "$FAULT_TYPE" ]]; then + die "No fault type specified. Use --fault TYPE" + fi + + confirm_action "Inject fault '${FAULT_TYPE}' for ${DURATION}s?" + + START_TIME=$(date +%s) + log "Starting fault injection: $FAULT_TYPE (duration: ${DURATION}s)" + + case "$FAULT_TYPE" in + cpu-stress) fault_cpu_stress ;; + memory-pressure) fault_memory_pressure ;; + disk-fill) fault_disk_fill ;; + service-kill) fault_service_kill ;; + network-latency) fault_network_latency ;; + network-drop) fault_network_drop ;; + dns-failure) fault_dns_failure ;; + io-latency) fault_io_latency ;; + *) die "Unknown fault type: $FAULT_TYPE" ;; + esac + + local elapsed=$(( $(date +%s) - START_TIME )) + log "Fault injection complete (${elapsed}s elapsed)" +} + +# --------------------------------------------------------------------------- +# List fault types +# --------------------------------------------------------------------------- +do_list() { + section_header "Available Fault Types" + printf "\n" + printf " %-20s %s\n" "FAULT TYPE" "DESCRIPTION" + printf " ─────────────────────────────────────────────────────────────\n" + printf " %-20s %s\n" "cpu-stress" "Saturate all CPU cores" + printf " %-20s %s\n" "memory-pressure" "Fill memory via tmpfs allocation" + printf " %-20s %s\n" "disk-fill" 
"Fill disk to threshold percentage" + printf " %-20s %s\n" "service-kill" "Stop a systemd service temporarily" + printf " %-20s %s\n" "network-latency" "Add network latency via tc netem" + printf " %-20s %s\n" "network-drop" "Drop packets via tc netem" + printf " %-20s %s\n" "dns-failure" "Break DNS resolution temporarily" + printf " %-20s %s\n" "io-latency" "Degrade I/O performance via ionice" + printf "\n" +} + +# --------------------------------------------------------------------------- +# Verify system health +# --------------------------------------------------------------------------- +do_verify() { + section_header "System Health Check" + local issues=0 + + # CPU load + local load_1m + load_1m=$(awk '{print $1}' /proc/loadavg) + local cores + cores=$(nproc) + if awk "BEGIN {exit !($load_1m > $cores * 0.9)}"; then + field_color "CPU load (1m):" "$RED" "${load_1m} — HIGH (cores: ${cores})" + ((issues++)) || true + else + field_color "CPU load (1m):" "$GREEN" "${load_1m} (cores: ${cores})" + fi + + # Memory + local mem_avail_kb mem_total_kb mem_pct + mem_total_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo) + mem_avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo) + mem_pct=$(( (mem_total_kb - mem_avail_kb) * 100 / mem_total_kb )) + if [[ "$mem_pct" -gt 90 ]]; then + field_color "Memory usage:" "$RED" "${mem_pct}% — HIGH" + ((issues++)) || true + else + field_color "Memory usage:" "$GREEN" "${mem_pct}%" + fi + + # Disk + local disk_pct + disk_pct=$(df --output=pcent / | tail -1 | tr -d ' %') + if [[ "$disk_pct" -gt 90 ]]; then + field_color "Disk usage (/):" "$RED" "${disk_pct}% — HIGH" + ((issues++)) || true + else + field_color "Disk usage (/):" "$GREEN" "${disk_pct}%" + fi + + # Network connectivity + if ping -c 1 -W 3 8.8.8.8 &>/dev/null; then + field_color "Network (ping):" "$GREEN" "OK" + else + field_color "Network (ping):" "$RED" "UNREACHABLE" + ((issues++)) || true + fi + + # DNS resolution + if host google.com &>/dev/null; then + field_color 
"DNS resolution:" "$GREEN" "OK" + else + field_color "DNS resolution:" "$RED" "FAILING" + ((issues++)) || true + fi + + # Chaos artifacts + if [[ -f /etc/resolv.conf.chaos-backup ]]; then + field_color "Chaos artifacts:" "$YELLOW" "resolv.conf backup found" + ((issues++)) || true + else + field_color "Chaos artifacts:" "$GREEN" "None" + fi + + printf "\n" + if [[ "$issues" -gt 0 ]]; then + warn "Found $issues issue(s)" + return 1 + else + log "All checks passed" + return 0 + fi +} + +# --------------------------------------------------------------------------- +# Plan execution +# --------------------------------------------------------------------------- +do_plan() { + require_root + if [[ -z "$PLAN_FILE" ]]; then + die "No plan file specified. Use --plan FILE" + fi + if [[ ! -f "$PLAN_FILE" ]]; then + die "Plan file not found: $PLAN_FILE" + fi + if ! command -v jq &>/dev/null; then + die "jq is required for plan execution" + fi + + section_header "Executing Chaos Plan: $PLAN_FILE" + + local plan_length + plan_length=$(jq '.faults | length' "$PLAN_FILE") + log "Plan contains $plan_length fault(s)" + + local i fault_entry f_type f_duration + for ((i = 0; i < plan_length; i++)); do + fault_entry=$(jq -r ".faults[$i]" "$PLAN_FILE") + f_type=$(printf '%s' "$fault_entry" | jq -r '.type') + f_duration=$(printf '%s' "$fault_entry" | jq -r '.duration // 30') + + log "Step $((i + 1))/$plan_length: $f_type (${f_duration}s)" + + FAULT_TYPE="$f_type" + DURATION="$f_duration" + + # Extract optional fields + local f_target f_iface + f_target=$(printf '%s' "$fault_entry" | jq -r '.target // empty') + f_iface=$(printf '%s' "$fault_entry" | jq -r '.iface // empty') + + [[ -n "$f_target" ]] && TARGET_SERVICE="$f_target" + [[ -n "$f_iface" ]] && NETWORK_IFACE="$f_iface" + + case "$FAULT_TYPE" in + cpu-stress) fault_cpu_stress ;; + memory-pressure) fault_memory_pressure ;; + disk-fill) fault_disk_fill ;; + service-kill) fault_service_kill ;; + network-latency) fault_network_latency 
;; + network-drop) fault_network_drop ;; + dns-failure) fault_dns_failure ;; + io-latency) fault_io_latency ;; + *) warn "Unknown fault type in plan: $FAULT_TYPE — skipping" ;; + esac + + if [[ "$i" -lt $((plan_length - 1)) ]]; then + log "Pausing 5s before next fault..." + sleep 5 + fi + done + + log "Plan execution complete" +} + +# --------------------------------------------------------------------------- +# Force cleanup +# --------------------------------------------------------------------------- +do_cleanup() { + require_root + section_header "Force Cleanup" + CHAOS_ACTIVE=true + cleanup_all + log "Force cleanup complete" +} + +# --------------------------------------------------------------------------- +# Help +# --------------------------------------------------------------------------- +show_help() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; ((ERRORS++)) || true; } +debug() { [[ "$VERBOSE" == "true" ]] && echo -e "${CYAN}[DEBUG]${RESET} $*"; } +step() { echo -e "\n${BOLD}${BLUE}── $* ──${RESET}"; } + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat << EOF +${BOLD}$SCRIPT_NAME${RESET} — Bootstrap a new server with chezmoi dotfiles + +${BOLD}USAGE${RESET} + $SCRIPT_NAME --repo [OPTIONS] + +${BOLD}REQUIRED${RESET} + --repo Git repository URL (HTTPS or SSH) + +${BOLD}OPTIONS${RESET} + --force Apply changes (default: dry-run) + --install-dir Chezmoi install directory (default: /usr/local/bin) + --install-age Also install age for encrypted files + --age-key Path to age key file (default: ~/.config/chezmoi/key.txt) + --packages Comma-separated packages to install first + --chezmoi-args Extra arguments to pass to chezmoi init + --verbose Show debug output + --no-color Disable colored output + --help Show this help + +${BOLD}EXAMPLES${RESET} + # Dry run — see what would happen + $SCRIPT_NAME --repo https://github.com/user/dotfiles.git + + # Apply dotfiles from a private repo + $SCRIPT_NAME --repo 
git@github.com:user/dotfiles.git --force + + # Install age + pre-install packages + apply + $SCRIPT_NAME --repo git@github.com:user/dotfiles.git \\ + --install-age --packages vim,tmux,htop --force + + # Custom install dir + verbose + $SCRIPT_NAME --repo https://github.com/user/dotfiles.git \\ + --install-dir \$HOME/.local/bin --verbose --force + +${BOLD}ENVIRONMENT VARIABLES${RESET} + DRY_RUN Set to false to apply (same as --force) + VERBOSE Set to true for debug output + COLOR auto | always | never + AGE_KEY_PATH Path to age identity file +EOF +} + +# ── Argument Parsing ────────────────────────────────────────────────── +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --repo) REPO_URL="$2"; shift 2 ;; + --force) DRY_RUN="false"; shift ;; + --install-dir) INSTALL_DIR="$2"; shift 2 ;; + --install-age) INSTALL_AGE="true"; shift ;; + --age-key) AGE_KEY_PATH="$2"; shift 2 ;; + --packages) PRE_PACKAGES="$2"; shift 2 ;; + --chezmoi-args) CHEZMOI_ARGS="$2"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help|-h) usage; exit 0 ;; + *) err "Unknown option: $1"; usage; exit 1 ;; + esac + done + + if [[ -z "$REPO_URL" ]]; then + err "Missing required --repo argument" + usage + exit 1 + fi +} + +# ── System Detection ────────────────────────────────────────────────── +detect_system() { + step "Detecting system" + + OS="$(uname -s)" + ARCH="$(uname -m)" + HOSTNAME_SHORT="$(hostname -s)" + + if [[ -f /etc/os-release ]]; then + # shellcheck disable=SC1091 + . 
/etc/os-release + DISTRO="${ID:-unknown}" + DISTRO_VERSION="${VERSION_ID:-unknown}" + else + DISTRO="unknown" + DISTRO_VERSION="unknown" + fi + + # Detect package manager + if command -v apt > /dev/null 2>&1; then + PKG_MGR="apt" + elif command -v dnf > /dev/null 2>&1; then + PKG_MGR="dnf" + elif command -v yum > /dev/null 2>&1; then + PKG_MGR="yum" + elif command -v pacman > /dev/null 2>&1; then + PKG_MGR="pacman" + else + PKG_MGR="unknown" + fi + + log "OS: $OS ($ARCH)" + log "Distro: $DISTRO $DISTRO_VERSION" + log "Package manager: $PKG_MGR" + log "Hostname: $HOSTNAME_SHORT" +} + +# ── Package Installation ───────────────────────────────────────────── +install_packages() { + local packages="$1" + if [[ -z "$packages" ]]; then + return 0 + fi + + step "Installing prerequisite packages" + + # Convert comma-separated to space-separated + local pkg_list + pkg_list="${packages//,/ }" + + log "Packages: $pkg_list" + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Would install: $pkg_list" + return 0 + fi + + case "$PKG_MGR" in + apt) + sudo apt update -qq + # shellcheck disable=SC2086 + sudo apt install -y -qq $pkg_list + ;; + dnf|yum) + # shellcheck disable=SC2086 + sudo "$PKG_MGR" install -y -q $pkg_list + ;; + pacman) + # shellcheck disable=SC2086 + sudo pacman -S --noconfirm $pkg_list + ;; + *) + warn "Unknown package manager — install manually: $pkg_list" + ;; + esac + + log "Packages installed" +} + +# ── Install chezmoi ─────────────────────────────────────────────────── +install_chezmoi() { + step "Installing chezmoi" + + if command -v chezmoi > /dev/null 2>&1; then + local current_version + current_version="$(chezmoi --version | awk '{print $3}')" + log "chezmoi already installed: $current_version" + return 0 + fi + + log "Install directory: $INSTALL_DIR" + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Would install chezmoi to $INSTALL_DIR" + return 0 + fi + + mkdir -p "$INSTALL_DIR" + sh -c "$(curl -fsLS get.chezmoi.io)" -- -b "$INSTALL_DIR" 
+ + if command -v chezmoi > /dev/null 2>&1; then + log "chezmoi installed: $(chezmoi --version | awk '{print $3}')" + else + # Might not be in PATH yet + if [[ -x "$INSTALL_DIR/chezmoi" ]]; then + export PATH="$INSTALL_DIR:$PATH" + log "chezmoi installed: $(chezmoi --version | awk '{print $3}')" + log "Added $INSTALL_DIR to PATH" + else + err "chezmoi installation failed" + return 1 + fi + fi +} + +# ── Install age ─────────────────────────────────────────────────────── +install_age() { + if [[ "$INSTALL_AGE" != "true" ]]; then + return 0 + fi + + step "Installing age" + + if command -v age > /dev/null 2>&1; then + log "age already installed: $(age --version)" + return 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Would install age" + return 0 + fi + + case "$PKG_MGR" in + apt) sudo apt install -y -qq age ;; + dnf) sudo dnf install -y -q age ;; + *) + # Fallback: install from GitHub + local age_version + age_version="$(curl -s https://api.github.com/repos/FiloSottile/age/releases/latest | grep tag_name | cut -d'"' -f4)" + curl -fsSL "https://github.com/FiloSottile/age/releases/download/${age_version}/age-${age_version}-linux-amd64.tar.gz" | \ + sudo tar -xz -C /usr/local/bin/ --strip-components=1 age/age age/age-keygen + ;; + esac + + log "age installed: $(age --version)" +} + +# ── Age Key Setup ───────────────────────────────────────────────────── +setup_age_key() { + if [[ "$INSTALL_AGE" != "true" ]]; then + return 0 + fi + + step "Checking age key" + + if [[ -f "$AGE_KEY_PATH" ]]; then + log "Age key exists: $AGE_KEY_PATH" + return 0 + fi + + warn "No age key found at $AGE_KEY_PATH" + warn "If your dotfiles use encrypted files, create a key:" + warn " age-keygen -o $AGE_KEY_PATH" + warn "Or copy your existing key from another machine" +} + +# ── Initialize chezmoi ──────────────────────────────────────────────── +init_chezmoi() { + step "Initializing chezmoi" + + log "Repository: $REPO_URL" + + if [[ -d "$HOME/.local/share/chezmoi" ]]; then + 
warn "chezmoi source directory already exists" + warn "Use 'chezmoi update' to pull latest changes" + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Would run: chezmoi update" + else + log "Running chezmoi update..." + chezmoi update -v + fi + return 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Would run: chezmoi init --apply $REPO_URL $CHEZMOI_ARGS" + return 0 + fi + + # shellcheck disable=SC2086 + chezmoi init --apply -v "$REPO_URL" $CHEZMOI_ARGS + log "chezmoi initialized and applied" +} + +# ── Verify ──────────────────────────────────────────────────────────── +verify() { + step "Verification" + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY RUN] Skipping verification" + return 0 + fi + + if ! command -v chezmoi > /dev/null 2>&1; then + err "chezmoi not found in PATH" + return 1 + fi + + local managed_count + managed_count="$(chezmoi managed | wc -l)" + log "Managed files: $managed_count" + + # List managed files + if [[ "$VERBOSE" == "true" ]]; then + debug "Managed files:" + chezmoi managed | while read -r f; do + debug " $f" + done + fi + + # Check for issues + local status_count + status_count="$(chezmoi status | wc -l)" + if [[ "$status_count" -gt 0 ]]; then + warn "$status_count files differ from source:" + chezmoi status + else + log "All managed files match source" + fi + + # Run chezmoi doctor + debug "Running chezmoi doctor..." 
+ if [[ "$VERBOSE" == "true" ]]; then + chezmoi doctor || true + fi +} + +# ── Summary ─────────────────────────────────────────────────────────── +summary() { + step "Summary" + + echo "" + echo -e " ${BOLD}Hostname:${RESET} $HOSTNAME_SHORT" + echo -e " ${BOLD}Distro:${RESET} $DISTRO $DISTRO_VERSION" + echo -e " ${BOLD}Repository:${RESET} $REPO_URL" + echo -e " ${BOLD}chezmoi:${RESET} $(command -v chezmoi 2>/dev/null || echo 'not installed')" + + if command -v age > /dev/null 2>&1; then + echo -e " ${BOLD}age:${RESET} $(age --version)" + fi + + if [[ -d "$HOME/.local/share/chezmoi" ]]; then + echo -e " ${BOLD}Source dir:${RESET} $HOME/.local/share/chezmoi" + echo -e " ${BOLD}Managed:${RESET} $(chezmoi managed 2>/dev/null | wc -l) files" + fi + + echo "" + + if [[ "$DRY_RUN" == "true" ]]; then + echo -e " ${YELLOW}${BOLD}DRY RUN${RESET} — no changes were made." + echo -e " Run with ${BOLD}--force${RESET} to apply." + elif [[ "$ERRORS" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Completed with $ERRORS error(s)${RESET}" + else + echo -e " ${GREEN}${BOLD}Bootstrap complete${RESET}" + fi + + echo "" +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + parse_args "$@" + setup_colors + + echo "" + echo -e "${BOLD}$SCRIPT_NAME${RESET} — chezmoi dotfile bootstrap" + echo "" + + if [[ "$DRY_RUN" == "true" ]]; then + log "Running in ${YELLOW}DRY RUN${RESET} mode (use --force to apply)" + fi + + detect_system + install_packages "$PRE_PACKAGES" + install_chezmoi + install_age + setup_age_key + init_chezmoi + verify + summary + + [[ "$ERRORS" -gt 0 ]] && exit 1 + exit 0 +} + +main "$@" diff --git a/cisa-kev-monitor.sh b/cisa-kev-monitor.sh new file mode 100644 index 0000000..dd70212 --- /dev/null +++ b/cisa-kev-monitor.sh @@ -0,0 +1,575 @@ +#!/usr/bin/env bash + +########################################################################################## +#### cisa-kev-monitor.sh — Monitor CISA Known Exploited Vulnerabilities catalog #### +#### 
Polls the KEV JSON feed, detects new entries, alerts via email/Slack/Telegram #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./cisa-kev-monitor.sh #### +#### ./cisa-kev-monitor.sh --filter linux,kernel #### +#### ./cisa-kev-monitor.sh --telegram --filter linux #### +#### #### +#### See --help for all options. #### +########################################################################################## + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +KEV_URL="https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json" +STATE_DIR="${KEV_STATE_DIR:-${HOME:-/tmp}/.cisa-kev-monitor}" +STATE_FILE="$STATE_DIR/known-cves.txt" +FILTER_KEYWORDS="${KEV_FILTER:-}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# Notification channels +SMTP_TO="${KEV_SMTP_TO:-}" +SMTP_FROM="${KEV_SMTP_FROM:-cisa-kev-monitor@$(hostname -f 2>/dev/null || echo localhost)}" +SLACK_WEBHOOK="${KEV_SLACK_WEBHOOK:-}" +TELEGRAM_BOT_TOKEN="${KEV_TELEGRAM_BOT_TOKEN:-}" +TELEGRAM_CHAT_ID="${KEV_TELEGRAM_CHAT_ID:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +NEW_CVES=() +NEW_COUNT=0 +TOTAL_COUNT=0 +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" +setup_colors() { + if [[ "$COLOR" == "never" ]]; then return; fi + if [[ "$COLOR" == "auto" && ! 
-t 1 ]]; then return; fi + RED="\033[0;31m" + GREEN="\033[0;32m" + YELLOW="\033[0;33m" + BLUE="\033[0;34m" + BOLD="\033[1m" + DIM="\033[2m" + RESET="\033[0m" +} + +# ── Helpers ─────────────────────────────────────────────────────────── +die() { printf "%b\n" "${RED}[ERROR]${RESET} $*" >&2; exit 1; } +log_info() { printf "%b\n" "${GREEN}[INFO]${RESET} $*"; } +log_warn() { printf "%b\n" "${YELLOW}[WARN]${RESET} $*"; } +log_verbose() { [[ "$VERBOSE" == "true" ]] && printf "%b\n" "${DIM}[DEBUG]${RESET} $*" || true; } + +usage() { + cat <&1 | logger -t kev-monitor +EOF + exit 0 +} + +# ── Dependency Check ────────────────────────────────────────────────── +check_deps() { + local missing=() + for cmd in curl jq; do + if ! command -v "$cmd" &>/dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + die "Missing required commands: ${missing[*]}" + fi +} + +# ── Argument Parsing ────────────────────────────────────────────────── +DRY_RUN="false" +LIST_MODE="false" +LIST_NEW_DAYS="" +STATS_MODE="false" +RESET_MODE="false" +NOTIFY_EMAIL="false" +NOTIFY_SLACK="false" +NOTIFY_TELEGRAM="false" + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --filter) FILTER_KEYWORDS="${2:?--filter requires keywords}"; shift 2 ;; + --email) SMTP_TO="${2:?--email requires an address}"; NOTIFY_EMAIL="true"; shift 2 ;; + --slack) SLACK_WEBHOOK="${2:?--slack requires a webhook URL}"; NOTIFY_SLACK="true"; shift 2 ;; + --telegram) NOTIFY_TELEGRAM="true"; shift ;; + --list) LIST_MODE="true"; shift ;; + --list-new) LIST_NEW_DAYS="${2:?--list-new requires days}"; shift 2 ;; + --stats) STATS_MODE="true"; shift ;; + --state-dir) STATE_DIR="${2:?--state-dir requires a path}"; STATE_FILE="$STATE_DIR/known-cves.txt"; shift 2 ;; + --reset) RESET_MODE="true"; shift ;; + --dry-run) DRY_RUN="true"; shift ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help) usage ;; + *) die "Unknown option: $1" ;; + esac + done + + if [[ 
"$NOTIFY_TELEGRAM" == "true" ]]; then + [[ -z "$TELEGRAM_BOT_TOKEN" ]] && die "KEV_TELEGRAM_BOT_TOKEN not set" + [[ -z "$TELEGRAM_CHAT_ID" ]] && die "KEV_TELEGRAM_CHAT_ID not set" + fi +} + +# ── Fetch KEV Feed ──────────────────────────────────────────────────── +fetch_kev() { + log_verbose "Fetching KEV catalog from CISA..." + local tmpfile + tmpfile=$(mktemp) + + if ! curl -sS --max-time 30 --retry 2 -o "$tmpfile" "$KEV_URL" 2>/dev/null; then + rm -f "$tmpfile" + die "Failed to fetch KEV catalog from $KEV_URL" + fi + + # Validate JSON + if ! jq empty "$tmpfile" 2>/dev/null; then + rm -f "$tmpfile" + die "Invalid JSON received from KEV feed" + fi + + echo "$tmpfile" +} + +# ── Filter Entries ──────────────────────────────────────────────────── +filter_entries() { + local json_file="$1" + + if [[ -z "$FILTER_KEYWORDS" ]]; then + jq -r '.vulnerabilities[]' "$json_file" + return + fi + + # Build jq filter from comma-separated keywords + local jq_filter="" + IFS=',' read -ra keywords <<< "$FILTER_KEYWORDS" + for kw in "${keywords[@]}"; do + kw=$(echo "$kw" | xargs) # trim whitespace + kw_lower=$(echo "$kw" | tr '[:upper:]' '[:lower:]') + if [[ -n "$jq_filter" ]]; then + jq_filter="$jq_filter or" + fi + jq_filter="$jq_filter ((.vendorProject // \"\" | ascii_downcase | contains(\"$kw_lower\")) or (.product // \"\" | ascii_downcase | contains(\"$kw_lower\")) or (.shortDescription // \"\" | ascii_downcase | contains(\"$kw_lower\")) or (.vulnerabilityName // \"\" | ascii_downcase | contains(\"$kw_lower\")))" + done + + jq -r ".vulnerabilities[] | select($jq_filter)" "$json_file" +} + +# ── Initialize State ───────────────────────────────────────────────── +init_state() { + mkdir -p "$STATE_DIR" + + if [[ "$RESET_MODE" == "true" && -f "$STATE_FILE" ]]; then + rm -f "$STATE_FILE" + log_info "State file reset" + fi + + if [[ ! 
-f "$STATE_FILE" ]]; then + log_info "First run — initializing state file" + return 1 + fi + return 0 +} + +# ── Format CVE for Display ──────────────────────────────────────────── +format_cve_text() { + local cve="$1" + local cve_id vendor product name date_added desc due_date ransomware + + cve_id=$(echo "$cve" | jq -r '.cveID') + vendor=$(echo "$cve" | jq -r '.vendorProject') + product=$(echo "$cve" | jq -r '.product') + name=$(echo "$cve" | jq -r '.vulnerabilityName') + date_added=$(echo "$cve" | jq -r '.dateAdded') + desc=$(echo "$cve" | jq -r '.shortDescription') + due_date=$(echo "$cve" | jq -r '.dueDate') + ransomware=$(echo "$cve" | jq -r '.knownRansomwareCampaignUse') + + printf "%b%s%b — %s\n" "$BOLD" "$cve_id" "$RESET" "$name" + printf " Vendor: %s / %s\n" "$vendor" "$product" + printf " Added: %s\n" "$date_added" + printf " Due: %s\n" "$due_date" + printf " Ransomware: %s\n" "$ransomware" + printf " %s\n" "$desc" + printf " NVD: https://nvd.nist.gov/vuln/detail/%s\n" "$cve_id" + echo "" +} + +# ── Format CVE for Notifications ────────────────────────────────────── +format_cve_plain() { + local cve="$1" + local cve_id vendor product name date_added desc + + cve_id=$(echo "$cve" | jq -r '.cveID') + vendor=$(echo "$cve" | jq -r '.vendorProject') + product=$(echo "$cve" | jq -r '.product') + name=$(echo "$cve" | jq -r '.vulnerabilityName') + date_added=$(echo "$cve" | jq -r '.dateAdded') + desc=$(echo "$cve" | jq -r '.shortDescription') + + echo "$cve_id — $name" + echo "Vendor: $vendor / $product" + echo "Added: $date_added" + echo "$desc" + echo "https://nvd.nist.gov/vuln/detail/$cve_id" + echo "" +} + +format_cve_telegram() { + local cve="$1" + local cve_id vendor product name date_added desc ransomware + + cve_id=$(echo "$cve" | jq -r '.cveID') + vendor=$(echo "$cve" | jq -r '.vendorProject') + product=$(echo "$cve" | jq -r '.product') + name=$(echo "$cve" | jq -r '.vulnerabilityName') + date_added=$(echo "$cve" | jq -r '.dateAdded') + desc=$(echo 
# Send a plain-text notification email through msmtp or sendmail.
#
# Globals:   SMTP_FROM, SMTP_TO (read)
# Arguments: $1 - subject line
#            $2 - message body
# Returns:   0 always; a missing mailer or a failed delivery is logged as a
#            warning so the remaining notification channels still run.
send_email() {
    local subject="$1"
    local body="$2"
    local mailer

    # msmtp is preferred when both are installed.
    if command -v msmtp &>/dev/null; then
        mailer="msmtp"
    elif command -v sendmail &>/dev/null; then
        mailer="sendmail"
    else
        log_warn "No sendmail or msmtp found — skipping email"
        return
    fi

    # Keep the subject on a single header line.
    subject=${subject//[$'\r\n']/ }

    # With -t the recipients are taken from the To: header. The address is
    # deliberately NOT repeated on the command line: depending on the MTA,
    # addresses given alongside -t are removed from the recipient list.
    if ! {
        printf 'From: %s\n' "$SMTP_FROM"
        printf 'To: %s\n' "$SMTP_TO"
        printf 'Subject: %s\n' "$subject"
        printf 'Content-Type: text/plain; charset=utf-8\n'
        printf '\n'
        printf '%s\n' "$body"
    } | "$mailer" -t; then
        log_warn "Email delivery via $mailer failed"
        return 0
    fi

    log_verbose "Email sent to $SMTP_TO"
}
# Send the collected new CVEs to every enabled notification channel.
#
# Globals:   NEW_CVES (read) - array of KEV entries as JSON strings
#            FILTER_KEYWORDS, DRY_RUN, NOTIFY_EMAIL, NOTIFY_SLACK,
#            NOTIFY_TELEGRAM, SMTP_TO, TELEGRAM_CHAT_ID (read)
# Outputs:   in dry-run mode, the list of channels that would be used
# Returns:   0 when every enabled channel was attempted; otherwise the
#            status of the failing send_* call.
notify() {
    local count=${#NEW_CVES[@]}
    local filter_label=""
    local cve_json plain_body tg_body

    (( count > 0 )) || return 0

    if [[ -n "$FILTER_KEYWORDS" ]]; then
        filter_label=" (filter: $FILTER_KEYWORDS)"
    fi

    # Plain `if` statements are used throughout instead of `[[ ... ]] && cmd`:
    # with the short form, a disabled channel on the last line made the whole
    # function return 1, which aborts a caller running under `set -e`.
    if [[ "$DRY_RUN" == "true" ]]; then
        log_warn "DRY-RUN — would send notifications to:"
        if [[ "$NOTIFY_EMAIL" == "true" ]]; then
            echo " Email: $SMTP_TO"
        fi
        if [[ "$NOTIFY_SLACK" == "true" ]]; then
            echo " Slack: (webhook configured)"
        fi
        if [[ "$NOTIFY_TELEGRAM" == "true" ]]; then
            echo " Telegram: chat $TELEGRAM_CHAT_ID"
        fi
        return 0
    fi

    # Email and Slack share the plain-text body.
    if [[ "$NOTIFY_EMAIL" == "true" || "$NOTIFY_SLACK" == "true" ]]; then
        plain_body="CISA KEV Monitor — $count new CVE(s) detected${filter_label}"$'\n\n'
        for cve_json in "${NEW_CVES[@]}"; do
            plain_body+=$(format_cve_plain "$cve_json")$'\n'
        done
    fi

    if [[ "$NOTIFY_EMAIL" == "true" ]]; then
        send_email "CISA KEV: $count new CVE(s)${filter_label}" "$plain_body"
    fi
    if [[ "$NOTIFY_SLACK" == "true" ]]; then
        send_slack "$plain_body"
    fi

    if [[ "$NOTIFY_TELEGRAM" == "true" ]]; then
        tg_body="🚨 CISA KEV — ${count} new CVE(s)${filter_label}"$'\n\n'
        for cve_json in "${NEW_CVES[@]}"; do
            tg_body+=$(format_cve_telegram "$cve_json")$'\n'
        done
        send_telegram "$tg_body"
    fi

    return 0
}
# Mode: list the (optionally keyword-filtered) KEV entries added during the
# last N days.
#
# Globals:   BOLD, RESET (read)
# Arguments: $1 - number of days to look back
# Outputs:   a heading, each matching entry via format_cve_text, and a count
run_list_new() {
    local days="$1"
    local json_file cutoff entry
    local count=0

    json_file=$(fetch_kev)

    # GNU date first, BSD date as the fallback.
    cutoff=$(date -u -d "$days days ago" '+%Y-%m-%d' 2>/dev/null \
        || date -u -v-"${days}d" '+%Y-%m-%d' 2>/dev/null) || {
        rm -f "$json_file"
        die "Cannot compute cutoff date for '$days' days"
    }

    echo ""
    printf "%bCISA KEV — entries added in the last %s days%b\n\n" "$BOLD" "$days" "$RESET"

    # The date cut is applied inside jq, the same `.dateAdded >= cutoff`
    # comparison run_stats uses, instead of forking one jq per catalog entry
    # just to read dateAdded.
    while IFS= read -r entry; do
        if [[ -z "$entry" ]]; then
            continue
        fi
        format_cve_text "$entry"
        count=$((count + 1))
    done < <(filter_entries "$json_file" \
        | jq -c --arg cutoff "$cutoff" 'select((.dateAdded // "") >= $cutoff)')

    log_info "$count entries found"
    rm -f "$json_file"
}
# Mode: monitor. Compare the current KEV catalog with the CVE IDs stored in
# STATE_FILE, print and collect the new entries that pass the keyword
# filter, update the state file, and notify.
#
# Globals:   STATE_FILE, FILTER_KEYWORDS, NOTIFY_EMAIL, NOTIFY_SLACK,
#            NOTIFY_TELEGRAM (read)
#            TOTAL_COUNT, NEW_COUNT (written), NEW_CVES (appended to)
run_monitor() {
    local json_file current_cves new_ids cve_json init_count total_new

    json_file=$(fetch_kev)

    TOTAL_COUNT=$(jq '.vulnerabilities | length' "$json_file")

    # First run: record every known CVE ID and report nothing as new.
    if ! init_state; then
        jq -r '.vulnerabilities[].cveID' "$json_file" | sort > "$STATE_FILE"
        init_count=$(wc -l < "$STATE_FILE")
        log_info "Initialized with $init_count CVEs. Future runs will detect new entries."
        rm -f "$json_file"
        return
    fi

    # Current CVE IDs, sorted the same way as the state file so comm works.
    current_cves=$(mktemp)
    jq -r '.vulnerabilities[].cveID' "$json_file" | sort > "$current_cves"

    # IDs present now but absent from the state file.
    new_ids=$(comm -13 "$STATE_FILE" "$current_cves")

    if [[ -z "$new_ids" ]]; then
        log_info "No new KEV entries (catalog: $TOTAL_COUNT CVEs)"
        rm -f "$current_cves" "$json_file"
        return
    fi

    # Keyword filtering is delegated to filter_entries so monitor mode
    # matches exactly what the list and stats modes match, rather than
    # running a regex grep over the entire JSON record. A single jq pass
    # then keeps only the entries whose ID is new.
    while IFS= read -r cve_json; do
        if [[ -z "$cve_json" ]]; then
            continue
        fi
        NEW_CVES+=("$cve_json")
        format_cve_text "$cve_json"
    done < <(filter_entries "$json_file" \
        | jq -c --arg ids "$new_ids" '
            ($ids | split("\n")) as $new
            | select(.cveID as $id | any($new[]; . == $id))')

    NEW_COUNT=${#NEW_CVES[@]}

    # Update state file with all current CVEs
    mv "$current_cves" "$STATE_FILE"

    if [[ $NEW_COUNT -eq 0 ]]; then
        total_new=$(echo "$new_ids" | wc -w)
        log_info "No new entries matching filter (${total_new} new total, $TOTAL_COUNT in catalog)"
        rm -f "$json_file"
        return
    fi

    log_info "$NEW_COUNT new KEV entry/entries matching filter"

    if [[ "$NOTIFY_EMAIL" == "true" || "$NOTIFY_SLACK" == "true" || "$NOTIFY_TELEGRAM" == "true" ]]; then
        # notify's exit status is not allowed to end the script here: the
        # state file is already updated and the temp file below still has
        # to be removed.
        notify || true
    fi

    rm -f "$json_file"
}
+# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - curl +# - ClickHouse Prometheus endpoint enabled (port 9363) +# +# Usage: +# ./clickhouse-exporter.sh +# ./clickhouse-exporter.sh --textfile +# ./clickhouse-exporter.sh --http +# CLICKHOUSE_URL="http://ch-node:9363" ./clickhouse-exporter.sh --textfile +# +# Parameters: +# --textfile Write to textfile collector directory +# --http Run as HTTP server +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# CLICKHOUSE_URL ClickHouse Prometheus endpoint (default: http://localhost:9363) +# METRICS_PATH Metrics path (default: /metrics) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT Request timeout in seconds (default: 10) +# +# Metrics Exported (Tier 2 — ~30 key metrics, original ClickHouse names): +# +# Gauges (ClickHouseMetrics_*): +# - ClickHouseMetrics_Query +# - ClickHouseMetrics_Merge +# - ClickHouseMetrics_MemoryTracking +# - ClickHouseMetrics_TCPConnection +# - ClickHouseMetrics_HTTPConnection +# - ClickHouseMetrics_OpenFileForRead +# - ClickHouseMetrics_OpenFileForWrite +# - ClickHouseMetrics_ReplicasMaxQueueSize +# - ClickHouseMetrics_BackgroundMergesAndMutationsPoolTask +# - ClickHouseMetrics_DelayedInserts +# +# Counters (ClickHouseProfileEvents_*): +# - ClickHouseProfileEvents_Query +# - ClickHouseProfileEvents_SelectQuery +# - ClickHouseProfileEvents_InsertQuery +# - ClickHouseProfileEvents_FailedQuery +# - ClickHouseProfileEvents_InsertedRows +# - ClickHouseProfileEvents_InsertedBytes +# - ClickHouseProfileEvents_MergedRows +# - ClickHouseProfileEvents_ReadCompressedBytes +# - ClickHouseProfileEvents_CompressedReadBufferBytes +# - ClickHouseProfileEvents_ReplicatedPartFetches +# - ClickHouseProfileEvents_ReplicatedPartFailedFetches +# - ClickHouseProfileEvents_DiskReadElapsedMicroseconds +# - 
ClickHouseProfileEvents_DiskWriteElapsedMicroseconds +# - ClickHouseProfileEvents_NetworkSendBytes +# - ClickHouseProfileEvents_NetworkReceiveBytes +# - ClickHouseProfileEvents_ZooKeeperTransactions +# - ClickHouseProfileEvents_DNSError +# +# Async Metrics (ClickHouseAsyncMetrics_*): +# - ClickHouseAsyncMetrics_Uptime +# - ClickHouseAsyncMetrics_MaxPartCountForPartition +# - ClickHouseAsyncMetrics_MemoryResident +# - ClickHouseAsyncMetrics_ReplicasMaxAbsoluteDelay +# +# Exporter: +# - clickhouse_exporter_up +# - clickhouse_exporter_duration_seconds +# - clickhouse_exporter_last_run_timestamp +# +################################################################################ + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" +CLICKHOUSE_URL="${CLICKHOUSE_URL:-http://localhost:9363}" +METRICS_PATH="${METRICS_PATH:-/metrics}" +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +TEXTFILE_MODE=false +HTTP_MODE=false +HTTP_PORT=9201 +OUTPUT="" +START_TIME="" + +# Tier 2 metric filter — grep pattern (one metric name per line) +readonly METRIC_FILTER='ClickHouseMetrics_Query[[:space:]] +ClickHouseMetrics_Merge[[:space:]] +ClickHouseMetrics_MemoryTracking[[:space:]] +ClickHouseMetrics_TCPConnection[[:space:]] +ClickHouseMetrics_HTTPConnection[[:space:]] +ClickHouseMetrics_OpenFileForRead[[:space:]] +ClickHouseMetrics_OpenFileForWrite[[:space:]] +ClickHouseMetrics_ReplicasMaxQueueSize[[:space:]] +ClickHouseMetrics_BackgroundMergesAndMutationsPoolTask[[:space:]] +ClickHouseMetrics_DelayedInserts[[:space:]] +ClickHouseProfileEvents_Query[[:space:]] +ClickHouseProfileEvents_SelectQuery[[:space:]] +ClickHouseProfileEvents_InsertQuery[[:space:]] +ClickHouseProfileEvents_FailedQuery[[:space:]] +ClickHouseProfileEvents_InsertedRows[[:space:]] +ClickHouseProfileEvents_InsertedBytes[[:space:]] +ClickHouseProfileEvents_MergedRows[[:space:]] 
# Fetch the ClickHouse Prometheus endpoint and append the filtered subset of
# metrics to OUTPUT.
#
# Globals:   CLICKHOUSE_URL, METRICS_PATH, CURL_TIMEOUT, METRIC_FILTER (read)
#            OUTPUT (appended to)
# Returns:   0 when the endpoint answered (even if no metric matched),
#            1 when it could not be reached. In the failure case
#            `clickhouse_exporter_up 0` has already been appended, so a
#            caller running under `set -e` must not let the non-zero status
#            end the script before OUTPUT is written.
collect_metrics() {
    local raw samples line metric_name help_line type_line
    local seen_metrics=""

    raw=$(curl -sf --max-time "$CURL_TIMEOUT" \
        "${CLICKHOUSE_URL}${METRICS_PATH}" 2>/dev/null) || {
        OUTPUT+="# HELP clickhouse_exporter_up ClickHouse Prometheus endpoint reachability (1=up, 0=down)
# TYPE clickhouse_exporter_up gauge
clickhouse_exporter_up 0
"
        return 1
    }

    OUTPUT+="# HELP clickhouse_exporter_up ClickHouse Prometheus endpoint reachability (1=up, 0=down)
# TYPE clickhouse_exporter_up gauge
clickhouse_exporter_up 1
"

    # Only sample lines are filtered. The filter patterns are unanchored, so
    # without dropping comment lines first they would also match the
    # "# HELP name ..." / "# TYPE name ..." lines, which would then be
    # emitted a second time by the per-metric lookup below.
    samples=$(grep -v '^#' <<< "$raw" | grep -E "$METRIC_FILTER" || true)

    if [[ -z "$samples" ]]; then
        return 0
    fi

    while IFS= read -r line; do
        # Metric name = everything before the first whitespace or "{".
        metric_name=${line%%[[:space:]]*}
        metric_name=${metric_name%%\{*}

        # Emit HELP/TYPE once, just before the first sample of a metric.
        if [[ "$seen_metrics" != *"|${metric_name}|"* ]]; then
            help_line=$(grep -m 1 "^# HELP ${metric_name} " <<< "$raw" || true)
            type_line=$(grep -m 1 "^# TYPE ${metric_name} " <<< "$raw" || true)
            if [[ -n "$help_line" ]]; then
                OUTPUT+="${help_line}
"
            fi
            if [[ -n "$type_line" ]]; then
                OUTPUT+="${type_line}
"
            fi
            seen_metrics+="|${metric_name}|"
        fi

        OUTPUT+="${line}
"
    done <<< "$samples"

    return 0
}
# Entry point of clickhouse-exporter.sh: parse the command line, then either
# serve metrics over HTTP or collect once and write the result.
#
# Globals:   TEXTFILE_MODE, HTTP_MODE, HTTP_PORT, START_TIME, OUTPUT (written)
# Arguments: the script's command-line arguments
main() {
    # A while/shift loop is required here: inside `for arg in "$@"` a `shift`
    # does not advance the loop, so "$1" was never the value following
    # --port.
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --textfile) TEXTFILE_MODE=true ;;
            --http) HTTP_MODE=true ;;
            -p|--port)
                # Without a value the default port is kept.
                if [[ $# -ge 2 ]]; then
                    if [[ ! "$2" =~ ^[0-9]+$ ]]; then
                        echo "# ERROR: --port requires a numeric value" >&2
                        exit 1
                    fi
                    HTTP_PORT="$2"
                    shift
                fi
                ;;
            --install)
                check_dependencies
                install_cron
                exit 0
                ;;
            --help|-h) usage ;;
            *) ;;
        esac
        shift
    done

    check_dependencies

    if [[ "$HTTP_MODE" == true ]]; then
        serve_http
        exit 0
    fi

    START_TIME=$(date +%s%N)

    # collect_metrics returns non-zero when ClickHouse is unreachable, after
    # appending `clickhouse_exporter_up 0`. Under `set -e` a bare call would
    # end the script before that sample is written, so the status is
    # deliberately ignored.
    collect_metrics || true

    local end_time elapsed_ns
    local duration="0"
    end_time=$(date +%s%N)
    # Integer arithmetic in the shell; no dependency on bc. Falls back to 0
    # when `date` did not produce plain nanosecond integers.
    if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
        elapsed_ns=$(( end_time - START_TIME ))
        printf -v duration '%d.%02d' \
            $(( elapsed_ns / 1000000000 )) \
            $(( (elapsed_ns % 1000000000) / 10000000 ))
    fi

    OUTPUT+="# HELP clickhouse_exporter_duration_seconds Time to collect and filter metrics
# TYPE clickhouse_exporter_duration_seconds gauge
clickhouse_exporter_duration_seconds ${duration}
# HELP clickhouse_exporter_last_run_timestamp Unix timestamp of last successful run
# TYPE clickhouse_exporter_last_run_timestamp gauge
clickhouse_exporter_last_run_timestamp $(date +%s)
"

    write_output
}
# Set the color variables used by the logging and layout helpers.
#
# Globals:   COLOR (read) - "never", "always", or anything else for
#            auto-detection based on whether stdout is a terminal
#            RED, GREEN, YELLOW, CYAN, BOLD, DIM, RESET (written)
setup_colors() {
    local want_color=false

    if [[ "$COLOR" != "never" ]]; then
        if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
            want_color=true
        fi
    fi

    # Start from "no color" and only fill in escape sequences when wanted.
    RED=""
    GREEN=""
    YELLOW=""
    CYAN=""
    BOLD=""
    DIM=""
    RESET=""

    if [[ "$want_color" == true ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    fi
}
# Test whether a path is covered by one of the user-supplied excludes.
#
# Globals:   EXCLUDE_PATHS (read) - array of excluded paths
# Arguments: $1 - path to test
# Returns:   0 if the path equals an excluded path or lies beneath one,
#            1 otherwise
is_excluded() {
    local path="$1"
    local exc

    # The ${arr[@]+...} form expands to nothing for an empty array without
    # tripping `set -u` on bash versions older than 4.4.
    for exc in ${EXCLUDE_PATHS[@]+"${EXCLUDE_PATHS[@]}"}; do
        # An empty entry would otherwise match every absolute path.
        if [[ -z "$exc" ]]; then
            continue
        fi
        # Accept "/etc/" as a spelling of "/etc"; "/" itself is left alone.
        if [[ "$exc" != "/" ]]; then
            exc="${exc%/}"
        fi
        if [[ "$path" == "$exc" || "$path" == "${exc%/}"/* ]]; then
            return 0
        fi
    done
    return 1
}
# Report the installed-package inventory for dpkg and/or rpm and, on live
# runs, save it under $STAGING_DIR/packages.
#
# Globals:   DRY_RUN, STAGING_DIR (read)
# Outputs:   a section header, one count field per detected package
#            manager, and progress messages
collect_package_list() {
    local have_dpkg=false
    local have_rpm=false
    local dpkg_count rpm_count

    section_header "Package List"

    if command -v dpkg &>/dev/null; then
        have_dpkg=true
    fi
    if command -v rpm &>/dev/null; then
        have_rpm=true
    fi

    if [[ "$have_dpkg" == true ]]; then
        dpkg_count=$(dpkg -l 2>/dev/null | grep -c "^ii" || true)
        field "dpkg packages:" "$dpkg_count"

        case "$DRY_RUN" in
            false)
                mkdir -p "$STAGING_DIR/packages"
                # Best effort: a failing dpkg query must not abort the backup.
                dpkg --get-selections > "$STAGING_DIR/packages/dpkg-selections.txt" 2>/dev/null || true
                dpkg -l > "$STAGING_DIR/packages/dpkg-list.txt" 2>/dev/null || true
                log "Collected dpkg package list"
                ;;
            *)
                log "[DRY-RUN] Would collect dpkg package list"
                ;;
        esac
    fi

    if [[ "$have_rpm" == true ]]; then
        rpm_count=$(rpm -qa 2>/dev/null | wc -l || echo "0")
        field "rpm packages:" "$rpm_count"

        case "$DRY_RUN" in
            false)
                mkdir -p "$STAGING_DIR/packages"
                # Best effort, as for dpkg above.
                rpm -qa --qf '%{NAME}-%{VERSION}-%{RELEASE}.%{ARCH}\n' > "$STAGING_DIR/packages/rpm-list.txt" 2>/dev/null || true
                log "Collected rpm package list"
                ;;
            *)
                log "[DRY-RUN] Would collect rpm package list"
                ;;
        esac
    fi

    if [[ "$have_dpkg" == false && "$have_rpm" == false ]]; then
        log "No package manager detected (dpkg/rpm)"
    fi
}
command -v systemctl &>/dev/null; then + log "systemd not available" + return + fi + + local enabled_count + enabled_count=$(systemctl list-unit-files --state=enabled --no-legend 2>/dev/null | wc -l) + field "Enabled units:" "$enabled_count" + + local custom_count=0 + for unit_dir in /etc/systemd/system /etc/systemd/user; do + if [[ -d "$unit_dir" ]]; then + local dir_count + dir_count=$(find "$unit_dir" -maxdepth 1 -name "*.service" -o -name "*.timer" 2>/dev/null | wc -l) + custom_count=$((custom_count + dir_count)) + fi + done + field "Custom unit files:" "$custom_count" + + if [[ "$DRY_RUN" == "false" ]]; then + mkdir -p "$STAGING_DIR/systemd" + systemctl list-unit-files --no-legend > "$STAGING_DIR/systemd/unit-files.txt" 2>/dev/null || true + + for unit_dir in /etc/systemd/system /etc/systemd/user; do + if [[ -d "$unit_dir" ]]; then + cp -a "$unit_dir" "$STAGING_DIR/systemd/" 2>/dev/null || true + fi + done + log "Collected systemd units" + else + log "[DRY-RUN] Would collect systemd units" + fi +} + +collect_firewall_rules() { + section_header "Firewall Rules" + + local fw_found=false + + if command -v iptables &>/dev/null; then + fw_found=true + local ipt_rules + ipt_rules=$(iptables -S 2>/dev/null | wc -l || echo "0") + field "iptables rules:" "$ipt_rules" + + if [[ "$DRY_RUN" == "false" ]]; then + mkdir -p "$STAGING_DIR/firewall" + iptables-save > "$STAGING_DIR/firewall/iptables.rules" 2>/dev/null || warn "Could not save iptables rules" + log "Collected iptables rules" + fi + fi + + if command -v nft &>/dev/null; then + fw_found=true + local nft_tables + nft_tables=$(nft list tables 2>/dev/null | wc -l || echo "0") + field "nftables tables:" "$nft_tables" + + if [[ "$DRY_RUN" == "false" ]]; then + mkdir -p "$STAGING_DIR/firewall" + nft list ruleset > "$STAGING_DIR/firewall/nftables.rules" 2>/dev/null || warn "Could not save nftables rules" + log "Collected nftables rules" + fi + fi + + if [[ "$fw_found" == "false" ]]; then + log "No firewall tools detected 
# Pack the staging directory into a timestamped tarball in BACKUP_DIR, or,
# in dry-run mode, print an estimate of the uncompressed size instead.
#
# Globals:   BACKUP_DIR, DRY_RUN, STAGING_DIR, INCLUDE_PATHS (read)
#            YELLOW, GREEN, RESET (read)
# Outputs:   progress and result fields on stdout
# Exits:     1 if the output directory or the archive cannot be created, or
#            if the archive lists no entries
create_tarball() {
    local timestamp hostname_val tarball_name tarball_path
    local inc_path path_size est_size staging_size file_count tarball_size

    timestamp=$(date '+%Y%m%d-%H%M%S')
    hostname_val=$(hostname -s 2>/dev/null || hostname)
    tarball_name="config-backup-${hostname_val}-${timestamp}.tar.gz"
    tarball_path="${BACKUP_DIR}/${tarball_name}"

    section_header "Creating Backup"

    field "Output directory:" "$BACKUP_DIR"
    field "Tarball:" "$tarball_name"

    if [[ "$DRY_RUN" == "true" ]]; then
        # Estimate total size. awk always prints exactly one number, so a
        # du that fails part-way (e.g. unreadable subdirectories) cannot
        # leave a multi-line value behind.
        est_size=0

        if [[ -d /etc ]] && ! is_excluded "/etc"; then
            path_size=$(du -sb /etc 2>/dev/null | awk '{ n = $1 } END { print n + 0 }') || true
            est_size=$((est_size + path_size))
        fi

        # ${arr[@]+...} keeps an empty INCLUDE_PATHS from tripping `set -u`
        # on bash versions older than 4.4.
        for inc_path in ${INCLUDE_PATHS[@]+"${INCLUDE_PATHS[@]}"}; do
            if [[ -e "$inc_path" ]]; then
                path_size=$(du -sb "$inc_path" 2>/dev/null | awk '{ n = $1 } END { print n + 0 }') || true
                est_size=$((est_size + path_size))
            fi
        done

        field_color "Estimated size:" "${YELLOW}~$(human_bytes "$est_size") (uncompressed)${RESET}"
        echo ""
        echo -e " ${YELLOW}Dry-run mode — no backup created${RESET}"
        echo -e " Run with --force to create the backup"
        return
    fi

    # Create output directory
    mkdir -p "$BACKUP_DIR" || { err "Cannot create ${BACKUP_DIR}"; exit 1; }

    staging_size=$(du -sb "$STAGING_DIR" 2>/dev/null | awk '{ n = $1 } END { print n + 0 }') || true
    field "Staging size:" "$(human_bytes "$staging_size")"

    # The archive holds a copy of /etc, so it is created readable by its
    # owner only. The umask change is confined to the subshell.
    if ! ( umask 077 && tar -czf "$tarball_path" -C "$STAGING_DIR" . 2>/dev/null ); then
        rm -f -- "$tarball_path"
        err "Failed to create tarball"
        exit 1
    fi

    # Validate tarball
    log "Validating tarball..."
    file_count=$(tar -tzf "$tarball_path" 2>/dev/null | wc -l)

    if [[ "$file_count" -eq 0 ]]; then
        err "Tarball validation failed — archive appears empty"
        exit 1
    fi

    tarball_size=$(stat -c%s "$tarball_path" 2>/dev/null || echo "0")

    field_color "Status:" "${GREEN}Success${RESET}"
    field "Archive size:" "$(human_bytes "$tarball_size")"
    field "Files archived:" "$file_count"
    field "Location:" "$tarball_path"
}
]]; then + echo -e "Safety: ${YELLOW}dry-run (use --force to create backup)${RESET}" + else + echo -e "Safety: ${RED}LIVE — backup will be created${RESET}" + fi + + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + + # Create staging directory for live runs + if [[ "$DRY_RUN" == "false" ]]; then + STAGING_DIR=$(mktemp -d "/tmp/config-backup-XXXXXX") + trap cleanup_staging EXIT + verbose "Staging directory: $STAGING_DIR" + fi + + collect_etc + collect_crontabs + collect_package_list + collect_systemd_units + collect_firewall_rules + collect_custom_includes + create_tarball + + echo "" +} + +main "$@" diff --git a/configure-miab-metrics.sh b/configure-miab-metrics.sh new file mode 100755 index 0000000..b77d2db --- /dev/null +++ b/configure-miab-metrics.sh @@ -0,0 +1,633 @@ +#!/bin/bash +# +# configure-miab-metrics.sh - Enable extended metrics logging on Mail-in-a-Box +# +# Enables SpamAssassin rules logging and/or TLS cipher logging for +# postfix-metrics.sh to collect detailed metrics. +# + +set -euo pipefail + +SCRIPT_NAME=$(basename "$0") +VERSION="1.0.0" + +# Defaults +DRY_RUN=false +VERBOSE=false +ENABLE_SPAMASSASSIN=false +ENABLE_TLS=false +BACKUP=true +FORCE=false +STATUS_ONLY=false + +# Colors (disabled if not a terminal) +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + BLUE='\033[0;34m' + NC='\033[0m' +else + RED='' GREEN='' YELLOW='' BLUE='' NC='' +fi + +usage() { + cat <&2 +} + +log_dry() { + echo -e "${YELLOW}[DRY-RUN]${NC} $*" +} + +log_verbose() { + if $VERBOSE; then + echo -e "${BLUE}[VERBOSE]${NC} $*" + fi +} + +run_cmd() { + local desc="$1" + shift + if $DRY_RUN; then + log_dry "$desc: $*" + return 0 + fi + log_verbose "Running: $*" + if ! 
"$@"; then + log_error "Failed: $desc" + return 1 + fi + return 0 +} + +write_file() { + local file="$1" + local content="$2" + local desc="${3:-$file}" + + if $DRY_RUN; then + log_dry "Would create $desc:" + echo "$content" | sed 's/^/ /' + return 0 + fi + + if [[ -f "$file" ]] && $BACKUP; then + local backup="${file}.bak.$(date +%Y%m%d%H%M%S)" + log_verbose "Backing up $file to $backup" + cp "$file" "$backup" + fi + + echo "$content" > "$file" + log_ok "Created $desc" +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "This script must be run as root (use sudo)" + exit 1 + fi +} + +check_miab() { + if [[ ! -d /home/user-data ]] && [[ ! -f /etc/mailinabox.conf ]]; then + log_warn "This doesn't appear to be a Mail-in-a-Box installation" + if ! $FORCE; then + read -rp "Continue anyway? [y/N] " response + if [[ ! "$response" =~ ^[Yy] ]]; then + exit 1 + fi + fi + fi +} + +confirm_action() { + local msg="$1" + if $FORCE || $DRY_RUN; then + return 0 + fi + read -rp "$msg [y/N] " response + [[ "$response" =~ ^[Yy] ]] +} + +# ============================================================================ +# SpamAssassin Configuration +# ============================================================================ + +SPAMPD_DEFAULTS="/etc/default/spampd" + +check_spampd() { + if ! systemctl list-unit-files spampd.service &>/dev/null; then + log_error "spampd service not found" + return 1 + fi + return 0 +} + +is_spampd_debug_enabled() { + # Check /etc/default/spampd for --debug in ADDOPTS + if [[ -f "$SPAMPD_DEFAULTS" ]]; then + if grep -qE '^ADDOPTS\s*=.*--debug' "$SPAMPD_DEFAULTS" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +is_rsyslog_spamassassin_configured() { + [[ -f /etc/rsyslog.d/50-spamassassin.conf ]] +} + +is_logrotate_configured() { + [[ -f /etc/logrotate.d/spamassassin ]] +} + +configure_spamassassin() { + log_info "Configuring SpamAssassin rules logging..." + + if ! check_spampd; then + return 1 + fi + + local changes_made=false + + # 1. 
Enable debug mode via /etc/default/spampd + if is_spampd_debug_enabled; then + log_ok "spampd debug mode already enabled" + else + log_info "Enabling spampd debug mode..." + + if [[ ! -f "$SPAMPD_DEFAULTS" ]]; then + log_error "$SPAMPD_DEFAULTS not found" + return 1 + fi + + # Get current ADDOPTS value + local current_addopts="" + if grep -qE '^ADDOPTS\s*=' "$SPAMPD_DEFAULTS" 2>/dev/null; then + current_addopts=$(grep -E '^ADDOPTS\s*=' "$SPAMPD_DEFAULTS" | tail -1 | sed 's/^ADDOPTS\s*=\s*//' | tr -d '"'"'") + fi + + # Build new ADDOPTS with --debug appended + local new_addopts + if [[ -n "$current_addopts" ]]; then + new_addopts="${current_addopts} --debug" + else + new_addopts="--debug" + fi + + if $DRY_RUN; then + log_dry "Would update ADDOPTS in $SPAMPD_DEFAULTS:" + if [[ -n "$current_addopts" ]]; then + echo " Current: ADDOPTS=\"$current_addopts\"" + fi + echo " New: ADDOPTS=\"$new_addopts\"" + else + if $BACKUP; then + local backup="${SPAMPD_DEFAULTS}.bak.$(date +%Y%m%d%H%M%S)" + cp "$SPAMPD_DEFAULTS" "$backup" + log_verbose "Backed up to $backup" + fi + + if grep -qE '^ADDOPTS\s*=' "$SPAMPD_DEFAULTS"; then + # Update existing ADDOPTS line + sed -i "s|^ADDOPTS\s*=.*|ADDOPTS=\"$new_addopts\"|" "$SPAMPD_DEFAULTS" + log_ok "Updated ADDOPTS to include --debug" + else + # Add new ADDOPTS line + cat >> "$SPAMPD_DEFAULTS" </dev/null && [[ "$smtp_level" -ge 1 ]] 2>/dev/null +} + +configure_tls() { + log_info "Configuring TLS cipher logging..." + + if [[ ! -f "$POSTFIX_MAIN_CF" ]]; then + log_error "Postfix main.cf not found at $POSTFIX_MAIN_CF" + return 1 + fi + + if is_tls_logging_enabled; then + log_ok "TLS logging already enabled in Postfix" + return 0 + fi + + log_info "Adding TLS log level settings to $POSTFIX_MAIN_CF..." 
+ + if $DRY_RUN; then + log_dry "Would add to $POSTFIX_MAIN_CF:" + echo " smtpd_tls_loglevel = 1" + echo " smtp_tls_loglevel = 1" + else + if $BACKUP; then + local backup="${POSTFIX_MAIN_CF}.bak.$(date +%Y%m%d%H%M%S)" + cp "$POSTFIX_MAIN_CF" "$backup" + log_verbose "Backed up to $backup" + fi + + # Remove any existing settings first (to avoid duplicates) + sed -i '/^smtpd_tls_loglevel\s*=/d' "$POSTFIX_MAIN_CF" + sed -i '/^smtp_tls_loglevel\s*=/d' "$POSTFIX_MAIN_CF" + + # Add new settings + cat >> "$POSTFIX_MAIN_CF" </dev/null | cut -f1) + echo -e "${GREEN}EXISTS${NC} ($size)" + else + echo -e "${YELLOW}MISSING${NC}" + fi + + echo -n "SpamAssassin logrotate: " + if is_logrotate_configured; then + echo -e "${GREEN}CONFIGURED${NC}" + else + echo -e "${YELLOW}NOT CONFIGURED${NC}" + fi + + echo -n "Postfix TLS logging: " + if is_tls_logging_enabled; then + echo -e "${GREEN}ENABLED${NC}" + else + echo -e "${YELLOW}DISABLED${NC}" + fi + + # Show service status + echo "" + echo -n "spampd service: " + if systemctl is-active --quiet spampd 2>/dev/null; then + echo -e "${GREEN}RUNNING${NC}" + else + echo -e "${YELLOW}STOPPED${NC}" + fi + + echo -n "postfix service: " + if systemctl is-active --quiet postfix 2>/dev/null; then + echo -e "${GREEN}RUNNING${NC}" + else + echo -e "${YELLOW}STOPPED${NC}" + fi + + echo "" +} + +# ============================================================================ +# Main +# ============================================================================ + +main() { + # Parse arguments + while [[ $# -gt 0 ]]; do + case "$1" in + -s|--spamassassin) + ENABLE_SPAMASSASSIN=true + shift + ;; + -t|--tls) + ENABLE_TLS=true + shift + ;; + -a|--all) + ENABLE_SPAMASSASSIN=true + ENABLE_TLS=true + shift + ;; + -n|--dry-run) + DRY_RUN=true + shift + ;; + -f|--force) + FORCE=true + shift + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + --no-backup) + BACKUP=false + shift + ;; + --status) + STATUS_ONLY=true + shift + ;; + -h|--help) + usage + ;; + 
--version) + version + ;; + *) + log_error "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac + done + + # Default to both if neither specified + if ! $ENABLE_SPAMASSASSIN && ! $ENABLE_TLS; then + ENABLE_SPAMASSASSIN=true + ENABLE_TLS=true + fi + + # Check root (skip for dry-run or status-only) + if ! $DRY_RUN && ! $STATUS_ONLY; then + check_root + fi + + check_miab + + # Show current status + show_status + + # Exit if status-only mode + if $STATUS_ONLY; then + exit 0 + fi + + # Build action summary + local actions="" + if $ENABLE_SPAMASSASSIN; then + actions+="SpamAssassin rules logging" + fi + if $ENABLE_TLS; then + [[ -n "$actions" ]] && actions+=", " + actions+="TLS cipher logging" + fi + + if $DRY_RUN; then + log_info "DRY RUN - showing changes that would be made for: $actions" + echo "" + else + if ! confirm_action "Enable $actions?"; then + log_info "Aborted" + exit 0 + fi + echo "" + fi + + local exit_code=0 + + if $ENABLE_SPAMASSASSIN; then + if ! configure_spamassassin; then + exit_code=1 + fi + echo "" + fi + + if $ENABLE_TLS; then + if ! configure_tls; then + exit_code=1 + fi + echo "" + fi + + if [[ $exit_code -eq 0 ]]; then + if $DRY_RUN; then + log_info "Dry run complete - no changes made" + else + log_ok "Configuration complete!" + echo "" + echo "Metrics should now be available after some mail traffic." + echo "Run your postfix-metrics.sh script to verify." + fi + else + log_error "Some configurations failed" + fi + + exit $exit_code +} + +main "$@" diff --git a/configure-openshift-metrics.sh b/configure-openshift-metrics.sh new file mode 100644 index 0000000..888a9fb --- /dev/null +++ b/configure-openshift-metrics.sh @@ -0,0 +1,687 @@ +#!/bin/bash +############################################################################### +# configure-openshift-metrics.sh +# +# Configure an external Prometheus server to receive metrics from OpenShift. +# Supports federation (pull) and remote write (push) modes. 
+# +# Usage: +# sudo ./configure-openshift-metrics.sh --method federation \ +# --openshift-url ROUTE --cluster-name NAME +# +# sudo ./configure-openshift-metrics.sh --method remote-write \ +# --prometheus-url URL --cluster-name NAME +# +# Requirements: +# - Root or sudo access on the Prometheus server +# - oc CLI logged in with cluster-admin (unless --skip-openshift) +# - Prometheus installed via binary (not containerized) +# +# https://mylinux.work/guides/openshift-metrics-to-external-prometheus/ +############################################################################### + +set -euo pipefail + +VERSION="1.0" + +#------------------------------------------------------------------------------ +# Defaults +#------------------------------------------------------------------------------ +METHOD="federation" +OPENSHIFT_URL="" +PROMETHEUS_URL="" +CLUSTER_NAME="openshift" +PROMETHEUS_CONFIG="/etc/prometheus/prometheus.yml" +PROMETHEUS_SERVICE="prometheus" +RULES_DIR="/etc/prometheus/rules" +TOKEN_FILE="/etc/prometheus/openshift-token" +PROMETHEUS_USER="prometheus" +SKIP_OPENSHIFT=false +SKIP_RULES=false +DRY_RUN=false +OC_NAMESPACE="openshift-monitoring" +SA_NAME="prometheus-external" +TOKEN_DURATION="8760h" + +#------------------------------------------------------------------------------ +# Colors and logging +#------------------------------------------------------------------------------ +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log() { echo -e "${GREEN}[openshift-metrics]${NC} $1"; } +warn() { echo -e "${YELLOW}[openshift-metrics]${NC} $1"; } +error() { echo -e "${RED}[openshift-metrics]${NC} $1" >&2; } +info() { echo -e "${BLUE}[openshift-metrics]${NC} $1"; } + +#------------------------------------------------------------------------------ +# Usage +#------------------------------------------------------------------------------ +usage() { + cat </dev/null; then + warn "promtool not found — config validation 
#------------------------------------------------------------------------------
# Backup existing config
#
# Copies $PROMETHEUS_CONFIG to <config dir>/backups/prometheus.yml.<timestamp>.
#
# Globals:  PROMETHEUS_CONFIG (read), DRY_RUN (read)
# Outputs:  progress via info/log, failures via error
# Returns:  0 on success or in dry-run mode, 1 if the config file is missing
#           or the backup could not be written
#------------------------------------------------------------------------------
backup_config() {
    local backup_dir
    backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"

    local timestamp
    timestamp=$(date +%F_%H%M%S)
    local backup_file="${backup_dir}/prometheus.yml.${timestamp}"

    # A dry run must not touch the filesystem at all, so report and leave
    # before the backups directory is created.
    if $DRY_RUN; then
        info "[dry-run] Would backup $PROMETHEUS_CONFIG to $backup_file"
        return 0
    fi

    if [[ ! -f "$PROMETHEUS_CONFIG" ]]; then
        error "Cannot back up $PROMETHEUS_CONFIG: file not found"
        return 1
    fi

    if ! mkdir -p "$backup_dir"; then
        error "Cannot create backup directory $backup_dir"
        return 1
    fi

    if ! cp -- "$PROMETHEUS_CONFIG" "$backup_file"; then
        error "Failed to copy $PROMETHEUS_CONFIG to $backup_file"
        return 1
    fi
    log "Backed up config to $backup_file"
}
Run: oc login " + exit 1 + fi + + local cluster_info + cluster_info=$(oc whoami --show-server 2>/dev/null || echo "unknown") + log "Connected to: $cluster_info" + + if $DRY_RUN; then + info "[dry-run] Would create service account $SA_NAME in $OC_NAMESPACE" + info "[dry-run] Would grant cluster-monitoring-view role" + info "[dry-run] Would generate token with duration $TOKEN_DURATION" + return + fi + + # Create service account (ignore if exists) + if oc get serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" &>/dev/null; then + warn "Service account $SA_NAME already exists in $OC_NAMESPACE" + else + oc create serviceaccount "$SA_NAME" -n "$OC_NAMESPACE" + log "Created service account: $SA_NAME" + fi + + # Grant cluster-monitoring-view role + if oc get clusterrolebinding "${SA_NAME}-monitoring-view" &>/dev/null 2>&1; then + warn "Role binding already exists" + else + oc adm policy add-cluster-role-to-user cluster-monitoring-view \ + -z "$SA_NAME" -n "$OC_NAMESPACE" + log "Granted cluster-monitoring-view role" + fi + + # Generate token + local token + token=$(oc create token "$SA_NAME" -n "$OC_NAMESPACE" --duration="$TOKEN_DURATION") + + echo "$token" > "$TOKEN_FILE" + chmod 600 "$TOKEN_FILE" + chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$TOKEN_FILE" + log "Token saved to $TOKEN_FILE (expires in $TOKEN_DURATION)" +} + +#------------------------------------------------------------------------------ +# Generate federation scrape config +#------------------------------------------------------------------------------ +generate_federation_config() { + cat < 0.9 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU on OpenShift node {{ \$labels.instance }}" + description: "CPU usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})." 
+ + - alert: OpenShiftNodeHighMemory + expr: openshift:node_memory_utilization:ratio > 0.9 + for: 10m + labels: + severity: warning + annotations: + summary: "High memory on OpenShift node {{ \$labels.instance }}" + description: "Memory usage above 90% for 10 minutes (current: {{ \$value | humanizePercentage }})." + + - alert: OpenShiftPodCrashLooping + expr: rate(kube_pod_container_status_restarts_total{cluster="${CLUSTER_NAME}"}[15m]) * 60 * 5 > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ \$labels.namespace }}/{{ \$labels.pod }} is crash looping" + description: "Pod has restarted {{ \$value | humanize }} times in the last 15 minutes." + + - alert: OpenShiftDeploymentReplicasMismatch + expr: | + kube_deployment_spec_replicas{cluster="${CLUSTER_NAME}"} + != kube_deployment_status_ready_replicas{cluster="${CLUSTER_NAME}"} + for: 10m + labels: + severity: warning + annotations: + summary: "Deployment {{ \$labels.namespace }}/{{ \$labels.deployment }} replica mismatch" + description: "Deployment does not have expected number of ready replicas." + + - alert: OpenShiftEtcdLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{cluster="${CLUSTER_NAME}"}[1h]) > 3 + for: 5m + labels: + severity: warning + annotations: + summary: "Frequent etcd leader changes on {{ \$labels.cluster }}" + description: "etcd leader changed {{ \$value | humanize }} times in the last hour." +YAML +} + +#------------------------------------------------------------------------------ +# Apply federation configuration +#------------------------------------------------------------------------------ +apply_federation() { + log "Configuring federation from $OPENSHIFT_URL..." 
+ + # Set up OpenShift service account and token + setup_openshift_sa + + # Backup existing config + backup_config + + # Generate and append federation scrape config + local federation_config + federation_config=$(generate_federation_config) + + if $DRY_RUN; then + info "[dry-run] Would append to $PROMETHEUS_CONFIG:" + echo "$federation_config" + else + # Check if the job already exists + if grep -q 'job_name: "openshift-federate"' "$PROMETHEUS_CONFIG" 2>/dev/null; then + warn "Federation job 'openshift-federate' already exists in $PROMETHEUS_CONFIG" + warn "Remove the existing job first or edit it manually." + return 1 + fi + + echo "$federation_config" >> "$PROMETHEUS_CONFIG" + chown "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG" + log "Federation scrape job added to $PROMETHEUS_CONFIG" + fi + + # Generate rules + if [[ "$SKIP_RULES" == false ]]; then + generate_rules + fi + + # Validate and reload + validate_and_reload +} + +#------------------------------------------------------------------------------ +# Apply remote write configuration +#------------------------------------------------------------------------------ +apply_remote_write() { + log "Configuring remote write to $PROMETHEUS_URL..." + + # Backup existing config + backup_config + + # Enable remote write receiver + local service_file="/etc/systemd/system/${PROMETHEUS_SERVICE}.service" + if [[ -f "$service_file" ]]; then + if grep -q "web.enable-remote-write-receiver" "$service_file"; then + log "Remote write receiver already enabled" + else + if $DRY_RUN; then + info "[dry-run] Would add --web.enable-remote-write-receiver to $service_file" + else + warn "You need to add --web.enable-remote-write-receiver to your Prometheus service." 
#------------------------------------------------------------------------------
# Validate config and reload Prometheus
#
# When promtool is available, checks $PROMETHEUS_CONFIG and (unless
# SKIP_RULES is true) every $RULES_DIR/openshift-*.yml file. If the main
# config is rejected, the most recently modified file in
# <config dir>/backups/prometheus.yml.* is copied back over it and the script
# exits with status 1. A rejected rule file also exits with status 1.
# Afterwards the service is reloaded (falling back to a restart) if it is
# currently active.
#
# Globals:  DRY_RUN, PROMETHEUS_CONFIG, PROMETHEUS_SERVICE, RULES_DIR,
#           SKIP_RULES (all read)
# Returns:  0 on success; calls exit 1 on validation failure
#------------------------------------------------------------------------------
validate_and_reload() {
    if $DRY_RUN; then
        info "[dry-run] Would validate config and reload Prometheus"
        return 0
    fi

    # Validate with promtool
    if command -v promtool &>/dev/null; then
        log "Validating Prometheus configuration..."

        if ! promtool check config "$PROMETHEUS_CONFIG"; then
            error "Config validation failed. Restoring backup..."
            local backup_dir
            backup_dir="$(dirname "$PROMETHEUS_CONFIG")/backups"

            # Pick the newest backup by modification time with a glob instead
            # of parsing `ls -t`: an empty directory no longer produces a
            # failing pipeline (fatal under `set -o pipefail`) and unusual
            # file names cannot be split.
            local candidate latest_backup=""
            for candidate in "$backup_dir"/prometheus.yml.*; do
                [[ -f "$candidate" ]] || continue
                if [[ -z "$latest_backup" || "$candidate" -nt "$latest_backup" ]]; then
                    latest_backup=$candidate
                fi
            done

            if [[ -n "$latest_backup" ]]; then
                if cp -- "$latest_backup" "$PROMETHEUS_CONFIG"; then
                    log "Restored from $latest_backup"
                else
                    error "Could not restore $PROMETHEUS_CONFIG from $latest_backup"
                fi
            else
                warn "No backup found in $backup_dir; $PROMETHEUS_CONFIG left as is"
            fi
            exit 1
        fi
        log "Config validation passed"

        # Validate rules
        if [[ "$SKIP_RULES" == false ]]; then
            local rule_file
            for rule_file in "$RULES_DIR"/openshift-*.yml; do
                # An unmatched glob stays literal; the -f test skips it.
                [[ -f "$rule_file" ]] || continue
                if ! promtool check rules "$rule_file"; then
                    error "Rule validation failed: $rule_file"
                    exit 1
                fi
            done
            log "Rule validation passed"
        fi
    fi

    # Reload Prometheus
    if systemctl is-active --quiet "$PROMETHEUS_SERVICE"; then
        systemctl reload "$PROMETHEUS_SERVICE" 2>/dev/null || \
            systemctl restart "$PROMETHEUS_SERVICE"
        log "Prometheus reloaded"
    else
        warn "Prometheus service is not running. Start it with: sudo systemctl start $PROMETHEUS_SERVICE"
    fi
}
+ + echo "" +} + +#------------------------------------------------------------------------------ +# Main +#------------------------------------------------------------------------------ +main() { + echo "" + log "configure-openshift-metrics.sh v${VERSION}" + echo "" + + validate + + if $DRY_RUN; then + warn "DRY RUN — no changes will be made" + echo "" + fi + + case "$METHOD" in + federation) apply_federation ;; + remote-write) apply_remote_write ;; + esac + + if ! $DRY_RUN; then + print_summary + fi + + log "Done." +} + +main diff --git a/consul-exporter.sh b/consul-exporter.sh new file mode 100644 index 0000000..4954db8 --- /dev/null +++ b/consul-exporter.sh @@ -0,0 +1,358 @@ +#!/usr/bin/env bash +# +# Consul Prometheus Metrics Exporter +# +# Prometheus textfile collector exporter for Consul. +# Uses the Consul HTTP API to collect cluster health, Raft consensus, +# service catalog, health check states, KV store entry counts, +# and node membership. +# +# Usage: +# ./consul-exporter.sh +# ./consul-exporter.sh --textfile +# CONSUL_TOKEN="xxx" ./consul-exporter.sh --textfile +# ./consul-exporter.sh --install +# +# Parameters: +# --textfile Write to textfile collector directory +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# CONSUL_URL Consul HTTP API base URL (default: http://127.0.0.1:8500) +# CONSUL_TOKEN ACL token (optional, required if ACLs are enabled) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT API request timeout in seconds (default: 10) +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Metrics Exported: +# Core: +# - consul_up +# - consul_exporter_info{version} +# - consul_peers_total +# - consul_leader +# +# Catalog: +# - consul_services_total +# - consul_nodes_total +# +# Health: +# - consul_health_checks_passing +# - consul_health_checks_warning +# - 
consul_health_checks_critical +# +# KV: +# - consul_kv_entries_total +# +# Raft: +# - consul_raft_commit_time_seconds +# - consul_raft_last_contact_seconds +# +# Exporter: +# - consul_exporter_duration_seconds +# - consul_exporter_last_run_timestamp + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" +CONSUL_URL="${CONSUL_URL:-http://127.0.0.1:8500}" +CONSUL_TOKEN="${CONSUL_TOKEN:-}" +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +TEXTFILE_MODE=false +OUTPUT="" +START_TIME="" + +# --- Functions --- + +usage() { + cat </dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "ERROR: Missing required commands: ${missing[*]}" >&2 + echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 + exit 1 + fi +} + +validate_config() { + # Strip trailing slash + CONSUL_URL="${CONSUL_URL%/}" +} + +api_get() { + local endpoint="$1" + local curl_args=(-sf --max-time "$CURL_TIMEOUT") + + if [[ -n "$CONSUL_TOKEN" ]]; then + curl_args+=(-H "X-Consul-Token: ${CONSUL_TOKEN}") + fi + + curl "${curl_args[@]}" "${CONSUL_URL}${endpoint}" 2>/dev/null || echo "" +} + +add_metric() { + local name="$1" + local type="$2" + local help="$3" + local value="$4" + local labels="${5:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name}{${labels}} ${value} +" + else + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name} ${value} +" + fi +} + +add_metric_value() { + local name="$1" + local value="$2" + local labels="${3:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="${name}{${labels}} ${value} +" + else + OUTPUT+="${name} ${value} +" + fi +} + +collect_health() { + local members_json + members_json=$(api_get "/v1/agent/members") + + if [[ -z "$members_json" ]]; then + add_metric "consul_up" "gauge" "Consul reachability (1=up, 0=down)" "0" + return 1 + fi + + 
#######################################
# Convert a single-unit duration string to seconds.
# Accepts a plain number (treated as milliseconds, as the original code did)
# or a number followed by ns, us, µs, ms or s. Anything else (for example
# "never" or a compound value such as "1m2s") yields "0".
# Arguments: $1 - duration string
# Outputs:   seconds with six decimals on stdout
#######################################
_consul_duration_to_seconds() {
    local raw="$1" factor
    case "$raw" in
        *ns) raw=${raw%ns}; factor=0.000000001 ;;
        *µs) raw=${raw%µs}; factor=0.000001 ;;
        *us) raw=${raw%us}; factor=0.000001 ;;
        *ms) raw=${raw%ms}; factor=0.001 ;;
        *s)  raw=${raw%s};  factor=1 ;;
        *)   factor=0.001 ;;
    esac

    if [[ "$raw" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        # LC_ALL=C keeps the decimal separator a dot regardless of locale.
        LC_ALL=C awk -v v="$raw" -v f="$factor" 'BEGIN { printf "%.6f\n", v * f }'
    else
        echo "0"
    fi
}

#######################################
# Collect Raft metrics: peer count, whether this agent is the leader, and the
# commit_time / last_contact values found under .Stats.raft of /v1/agent/self.
#
# Every jq call is guarded so a malformed response degrades to the default
# value instead of terminating the exporter under `set -euo pipefail`.
# NOTE(review): the raft stats are presumably Go duration strings such as
# "43.5ms" or "1.5s" — confirm against the Consul version in use.
#
# Globals:  OUTPUT (appended to through add_metric)
# Returns:  0 always; emits nothing when the Raft configuration is unavailable
#######################################
collect_raft() {
    local raft_json
    raft_json=$(api_get "/v1/operator/raft/configuration")

    if [[ -z "$raft_json" ]]; then
        return 0
    fi

    # Peer count
    local peer_count
    peer_count=$(jq '.Servers | length' <<<"$raft_json" 2>/dev/null) || peer_count=""
    add_metric "consul_peers_total" "gauge" "Number of Raft peers in the cluster" "${peer_count:-0}"

    # Leader detection — compare this agent's node name with the leader's
    local self_json
    self_json=$(api_get "/v1/agent/self")

    if [[ -z "$self_json" ]]; then
        return 0
    fi

    local self_name self_leader_name is_leader=0
    self_name=$(jq -r '.Config.NodeName // empty' <<<"$self_json" 2>/dev/null) || self_name=""
    self_leader_name=$(jq -r '.Servers[] | select(.Leader == true) | .Node' <<<"$raft_json" 2>/dev/null) \
        || self_leader_name=""

    if [[ -n "$self_name" && "$self_name" == "$self_leader_name" ]]; then
        is_leader=1
    fi
    add_metric "consul_leader" "gauge" "Whether this node is the cluster leader (1=leader, 0=follower)" "$is_leader"

    # Raft stats from /v1/agent/self
    local raft_commit_time raft_last_contact
    raft_commit_time=$(jq -r '.Stats.raft.commit_time // empty' <<<"$self_json" 2>/dev/null) \
        || raft_commit_time=""
    raft_last_contact=$(jq -r '.Stats.raft.last_contact // empty' <<<"$self_json" 2>/dev/null) \
        || raft_last_contact=""

    if [[ -n "$raft_commit_time" ]]; then
        add_metric "consul_raft_commit_time_seconds" "gauge" "Raft commit time in seconds" \
            "$(_consul_duration_to_seconds "$raft_commit_time")"
    fi

    if [[ -n "$raft_last_contact" ]]; then
        add_metric "consul_raft_last_contact_seconds" "gauge" "Time since last Raft leader contact in seconds" \
            "$(_consul_duration_to_seconds "$raft_last_contact")"
    fi

    return 0
}
#######################################
# Emit the accumulated metrics.
# In textfile mode the payload is written to a temporary file next to the
# target and renamed into place, so a reader never sees a half-written
# consul.prom. Otherwise the payload goes to stdout.
#
# Globals:  OUTPUT, TEXTFILE_MODE, TEXTFILE_DIR (all read)
# Returns:  0 on success, 1 if the textfile could not be written (the
#           temporary file is removed in that case)
#######################################
write_output() {
    # printf avoids echo's option/escape handling and the extra blank line
    # echo appended to the already newline-terminated payload.
    local payload="$OUTPUT"
    if [[ -n "$payload" && "$payload" != *$'\n' ]]; then
        payload+=$'\n'
    fi

    if [[ "$TEXTFILE_MODE" != true ]]; then
        printf '%s' "$payload"
        return 0
    fi

    local output_file="${TEXTFILE_DIR}/consul.prom"
    local temp_file="${output_file}.$$"

    if ! mkdir -p "$TEXTFILE_DIR"; then
        echo "ERROR: cannot create ${TEXTFILE_DIR}" >&2
        return 1
    fi

    if ! printf '%s' "$payload" > "$temp_file" || ! mv -f -- "$temp_file" "$output_file"; then
        # Do not leave a stale partial file behind in the collector directory.
        rm -f -- "$temp_file"
        echo "ERROR: failed to write ${output_file}" >&2
        return 1
    fi
}
contabo-backup-auditor.sh — Audit snapshot ages and backup coverage for #### +#### Contabo VPS/VDS instances via the REST API #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./contabo-backup-auditor.sh --audit #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +INSTANCE_ID="" +TAG_ID="" +OUTPUT_FORMAT="${CBA_FORMAT:-table}" +MAX_AGE_HOURS="${CBA_MAX_AGE:-48}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── 
# Abort (via die) unless all four Contabo credential variables are non-empty.
# Globals: CONTABO_CLIENT_ID, CONTABO_CLIENT_SECRET, CONTABO_API_USER,
#          CONTABO_API_PASS (read)
# Returns: 0 when every credential is set
check_credentials() {
    local name
    for name in CONTABO_CLIENT_ID CONTABO_CLIENT_SECRET \
        CONTABO_API_USER CONTABO_API_PASS; do
        [[ -n "${!name:-}" ]] || die "${name} not set"
    done
    # Explicit success: a trailing "[[ -z ... ]] && die" would make the
    # function return 1 when everything is set, and under `set -e` that
    # silently terminates the caller.
    return 0
}

# Abort (via die) unless curl and jq are available on PATH.
check_deps() {
    command -v curl &>/dev/null || die "curl is required"
    command -v jq &>/dev/null || die "jq is required"
}

# ── Pagination helper ────────────────────────────────────────────────
# Fetch every page of a list endpoint and print the concatenated array.
# Arguments: $1 - endpoint path (may already contain a query string)
#            $2 - key of the response object that holds the page array
# Outputs:   one JSON array on stdout ("[]" when there is no data)
# Returns:   0 on success, 1 when contabo_api fails or pages cannot be merged
fetch_all_contabo() {
    local endpoint="$1" key="$2"
    local page=1 size=100 all_data="[]"
    local sep="?" resp page_data page_count

    if [[ "$endpoint" == *"?"* ]]; then
        sep="&"
    fi

    while true; do
        resp=$(contabo_api GET "${endpoint}${sep}page=${page}&size=${size}") || return 1

        # Anything that is not an array under $key (error objects, unparsable
        # bodies, empty responses) counts as an empty page.
        page_data=$(printf '%s\n' "$resp" \
            | jq ".${key} // [] | if type == \"array\" then . else [] end" 2>/dev/null) \
            || page_data="[]"
        [[ -n "$page_data" ]] || page_data="[]"

        page_count=$(printf '%s\n' "$page_data" | jq 'length' 2>/dev/null) || page_count=0
        (( page_count == 0 )) && break

        # printf '%s' passes the JSON through untouched; `echo -e` would
        # rewrite backslash escapes inside JSON strings (\\, \n, \t ...)
        # and corrupt or invalidate the data.
        all_data=$(printf '%s\n%s\n' "$all_data" "$page_data" | jq -s 'add') || return 1

        # A short page is the last page.
        (( page_count < size )) && break
        page=$(( page + 1 ))
    done

    printf '%s\n' "$all_data"
}

# ── Age helpers ──────────────────────────────────────────────────────
# Convert a timestamp string to epoch seconds using `date -d`.
# Prints 0 when the input is empty or cannot be parsed.
iso_to_epoch() {
    # GNU date parses an empty string as "today 00:00", which would make a
    # missing timestamp look recent; report it as unparsable instead.
    if [[ -z "${1:-}" ]]; then
        echo 0
        return 0
    fi
    date -d "$1" +%s 2>/dev/null || echo 0
}

# Print the whole number of hours between epoch $1 and now.
age_hours() {
    local created_epoch="$1"
    local now
    now=$(date +%s)
    echo $(( (now - created_epoch) / 3600 ))
}

# Render an hour count as "<h>h" below one day, otherwise "<d>d <h>h".
format_age() {
    local hours="$1"
    if (( hours < 24 )); then
        echo "${hours}h"
    else
        echo "$(( hours / 24 ))d $(( hours % 24 ))h"
    fi
}
| .tagId == ($tid | tonumber))]' 2>/dev/null) + instance_count=$(echo "$instances" | jq 'length' 2>/dev/null || echo 0) + [[ "$instance_count" -eq 0 ]] && die "No instances found with tag ${TAG_ID}" + fi + + # Filter by instance ID + if [[ -n "$INSTANCE_ID" ]]; then + instances=$(echo "$instances" | jq --arg iid "$INSTANCE_ID" \ + '[.[] | select(.instanceId == ($iid | tonumber))]' 2>/dev/null) + instance_count=$(echo "$instances" | jq 'length' 2>/dev/null || echo 0) + [[ "$instance_count" -eq 0 ]] && die "Instance not found: ${INSTANCE_ID}" + fi + + local snapshots + snapshots=$(fetch_all_contabo "/compute/snapshots" "data") + + local warnings=0 + local no_backup=0 + local stale=0 + local healthy=0 + local results="" + + while IFS=$'\t' read -r iid iname istatus; do + [[ -z "$iid" ]] && continue + + # Find most recent snapshot for this instance + local latest_snap + latest_snap=$(echo "$snapshots" | jq -r \ + --arg iid "$iid" \ + '[.[] | select(.instanceId == ($iid | tonumber))] | sort_by(.createdDate) | last | .createdDate // empty' \ + 2>/dev/null) + + local age_h="—" + local status_flag="none" + if [[ -n "$latest_snap" ]]; then + local nepoch + nepoch=$(iso_to_epoch "$latest_snap") + age_h=$(age_hours "$nepoch") + if [[ "$age_h" -le "$MAX_AGE_HOURS" ]]; then + status_flag="ok" + ((healthy++)) || true + else + status_flag="stale" + ((stale++)) || true + ((warnings++)) || true + fi + else + ((no_backup++)) || true + ((warnings++)) || true + fi + + # Count snapshots for this instance + local snap_count + snap_count=$(echo "$snapshots" | jq --arg iid "$iid" \ + '[.[] | select(.instanceId == ($iid | tonumber))] | length' 2>/dev/null || echo 0) + + results="${results}${iid}\t${iname}\t${istatus}\t${snap_count}\t${age_h}\t${status_flag}\n" + done < <(echo "$instances" | jq -r \ + '.[] | "\(.instanceId)\t\(.name // .displayName // "unknown")\t\(.status // "—")"' \ + 2>/dev/null) + + case "$OUTPUT_FORMAT" in + json) + jq -n \ + --argjson instances "$instance_count" \ + 
--argjson healthy "$healthy" \ + --argjson stale "$stale" \ + --argjson no_backup "$no_backup" \ + --argjson warnings "$warnings" \ + --argjson max_age "$MAX_AGE_HOURS" \ + '{instances: $instances, healthy: $healthy, stale: $stale, no_backup: $no_backup, warnings: $warnings, max_age_hours: $max_age}' + ;; + prometheus) + cat </dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No snapshots found" + + case "$OUTPUT_FORMAT" in + json) + echo "$snapshots" | jq '[.[] | { + id: (.snapshotId // .id), name: .name, + instance_id: .instanceId, created: .createdDate + }]' + ;; + prometheus) + local stale_count=0 + while IFS=$'\t' read -r sid screated; do + [[ -z "$sid" ]] && continue + local cepoch + cepoch=$(iso_to_epoch "$screated") + local ah + ah=$(age_hours "$cepoch") + [[ "$ah" -gt "$MAX_AGE_HOURS" ]] && ((stale_count++)) || true + done < <(echo "$snapshots" | jq -r '.[] | "\(.snapshotId // .id)\t\(.createdDate // "")"' 2>/dev/null) + + cat </dev/null \ + | while IFS=$'\t' read -r sid sname siid screated; do + local cepoch ah age_display age_color + cepoch=$(iso_to_epoch "$screated") + ah=$(age_hours "$cepoch") + age_display=$(format_age "$ah") + age_color="$GREEN" + [[ "$ah" -gt "$MAX_AGE_HOURS" ]] && age_color="$YELLOW" + + printf " %-38s %-18s %-10s %-20s " \ + "${sid:0:36}" "${sname:0:16}" "$siid" "${screated:0:19}" + echo -e "${age_color}${age_display}${RESET}" + done + + echo "" + field "Snapshots:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat < /var/lib/node_exporter/textfile/contabo_backup.prom 2>/dev/null + +${BOLD}EXIT CODES${RESET} + 0 Success + 1 Runtime error +EOF +} + +# ══════════════════════════════════════════════════════════════════════ +# PARSE ARGS +# ══════════════════════════════════════════════════════════════════════ +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --audit) 
# Entry point: parse options, verify prerequisites, run the selected mode.
# Globals:   RUN_MODE (read; defaults to "audit" when empty),
#            OUTPUT_FORMAT (read), START_TIME (written)
# Arguments: the script's command-line arguments
# Outputs:   whatever the selected mode prints, plus a "Duration" footer
#            for human-readable output only
main() {
    parse_args "$@"
    setup_colors

    if [[ -z "$RUN_MODE" ]]; then
        RUN_MODE="audit"
    fi

    check_deps
    check_credentials

    START_TIME=$(date +%s)

    case "$RUN_MODE" in
        audit)     do_audit ;;
        snapshots) do_snapshots ;;
        *)         die "Unknown mode: ${RUN_MODE}" ;;
    esac

    # The footer is plain text. Appending it after JSON (or Prometheus)
    # output would make stdout unparseable for the consumer, so it is only
    # printed for the table-style formats.
    case "$OUTPUT_FORMAT" in
        json|prometheus) ;;
        *)
            echo ""
            field "Duration:" "$(elapsed)"
            ;;
    esac
}
# Populate the global colour variables with ANSI escape sequences.
# COLOR=never  leaves them untouched (empty by default)
# COLOR=always always enables them
# otherwise    they are enabled only when stdout is a terminal
setup_colors() {
    local mode="${COLOR:-auto}"

    case "$mode" in
        never)  return ;;
        always) ;;
        *)      [[ -t 1 ]] || return 0 ;;
    esac

    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
}
"client_id=${CONTABO_CLIENT_ID}" \ + -d "client_secret=${CONTABO_CLIENT_SECRET}" \ + --data-urlencode "username=${CONTABO_API_USER}" \ + --data-urlencode "password=${CONTABO_API_PASS}" \ + -d "grant_type=password" \ + "https://auth.contabo.com/auth/realms/contabo/protocol/openid-connect/token") + local token + token=$(echo "$resp" | jq -r '.access_token // empty' 2>/dev/null) + if [[ -z "$token" ]]; then + die "Failed to obtain access token — check credentials" + fi + echo "$token" +} + +contabo_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/ccm_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer $(contabo_token)" \ + -H "Content-Type: application/json" \ + -H "x-request-id: $(cat /proc/sys/kernel/random/uuid 2>/dev/null || date +%s%N)" \ + "https://api.contabo.com/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + cat /tmp/ccm_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$CONTABO_CLIENT_ID" ]] && die "CONTABO_CLIENT_ID not set" + [[ -z "$CONTABO_CLIENT_SECRET" ]] && die "CONTABO_CLIENT_SECRET not set" + [[ -z "$CONTABO_API_USER" ]] && die "CONTABO_API_USER not set" + [[ -z "$CONTABO_API_PASS" ]] && die "CONTABO_API_PASS not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Pagination helper ──────────────────────────────────────────────── +fetch_all_contabo() { + local endpoint="$1" key="$2" + local page=1 size=100 all_data="[]" + while true; do + local sep="?" 
+ [[ "$endpoint" == *"?"* ]] && sep="&" + local resp + resp=$(contabo_api GET "${endpoint}${sep}page=${page}&size=${size}") + local page_data + page_data=$(echo "$resp" | jq ".${key} // []" 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < size )) && break + ((page++)) || true + done + echo "$all_data" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +do_summary() { + local instances + instances=$(fetch_all_contabo "/compute/instances" "data") + local instance_count + instance_count=$(echo "$instances" | jq 'length' 2>/dev/null || echo 0) + local running_count + running_count=$(echo "$instances" | jq '[.[] | select(.status == "running")] | length' 2>/dev/null || echo 0) + + local snapshots + snapshots=$(fetch_all_contabo "/compute/snapshots" "data") + local snapshot_count + snapshot_count=$(echo "$snapshots" | jq 'length' 2>/dev/null || echo 0) + + local object_storage + object_storage=$(fetch_all_contabo "/object-storages" "data") + local storage_count + storage_count=$(echo "$object_storage" | jq 'length' 2>/dev/null || echo 0) + local storage_tb + storage_tb=$(echo "$object_storage" | jq '[.[].totalPurchasedSpaceTB // 0] | add // 0' 2>/dev/null || echo 0) + local storage_used_bytes + storage_used_bytes=$(echo "$object_storage" | jq '[.[].usedSpaceBytes // 0] | add // 0' 2>/dev/null || echo 0) + local storage_used_gb + storage_used_gb=$(awk "BEGIN {printf \"%.1f\", ${storage_used_bytes} / 1073741824}") + + # Estimate costs from instance product IDs + # Contabo uses fixed monthly pricing per product tier + local instance_cost="0.00" + while IFS=$'\t' read -r pid pname status; do + [[ -z "$pid" ]] && continue + # Extract monthly cost from product info if available + local 
cost_per_month="0" + # Contabo productId maps to fixed monthly rates + # These are approximations — actual billing comes from the Contabo panel + case "$pid" in + V1) cost_per_month="4.99" ;; + V2) cost_per_month="5.99" ;; + V4) cost_per_month="8.99" ;; + V8) cost_per_month="13.99" ;; + V16) cost_per_month="19.99" ;; + V24) cost_per_month="24.99" ;; + V30) cost_per_month="29.99" ;; + V45) cost_per_month="39.99" ;; + V60) cost_per_month="49.99" ;; + *) cost_per_month="0" ;; + esac + instance_cost=$(awk "BEGIN {printf \"%.2f\", ${instance_cost} + ${cost_per_month}}") + done < <(echo "$instances" | jq -r \ + '.[] | "\(.productId // "—")\t\(.name // .displayName // "unknown")\t\(.status // "—")"' \ + 2>/dev/null) + + local total_cost="$instance_cost" + + # Alert check + local alert_triggered="false" + if [[ "$ALERT_THRESHOLD" != "0" ]]; then + local over + over=$(awk "BEGIN {print (${total_cost} > ${ALERT_THRESHOLD}) ? 1 : 0}") + [[ "$over" == "1" ]] && alert_triggered="true" + fi + + case "$OUTPUT_FORMAT" in + json) + jq -n \ + --argjson instances "$instance_count" \ + --argjson running "$running_count" \ + --argjson snapshots "$snapshot_count" \ + --argjson object_storage "$storage_count" \ + --arg storage_tb "$storage_tb" \ + --arg storage_used_gb "$storage_used_gb" \ + --arg instance_cost "$instance_cost" \ + --arg total_cost "$total_cost" \ + --arg alert_threshold "$ALERT_THRESHOLD" \ + --argjson alert_triggered "$alert_triggered" \ + '{ + instances: $instances, running: $running, + snapshots: $snapshots, + object_storage: $object_storage, + storage_purchased_tb: ($storage_tb | tonumber), + storage_used_gb: ($storage_used_gb | tonumber), + monthly_estimate: { + instances: $instance_cost, total: $total_cost + }, + alert: { threshold: $alert_threshold, triggered: $alert_triggered } + }' + ;; + prometheus) + cat </dev/null || echo 0) + [[ "$instance_count" -eq 0 ]] && die "No instances found" + + case "$OUTPUT_FORMAT" in + json) + echo "$instances" | jq '[.[] | { + 
id: .instanceId, name: (.name // .displayName), + status: .status, product: .productId, + region: .region, ip: .ipConfig.v4.ip + }]' + ;; + *) + section_header "Instance Cost Breakdown" + + printf " ${BOLD}%-10s %-20s %-8s %-10s %-10s %10s${RESET}\n" \ + "ID" "NAME" "PRODUCT" "STATUS" "REGION" "MONTHLY €" + printf " %s\n" "$(printf '%.0s─' {1..72})" + + while IFS=$'\t' read -r iid iname pid status region; do + [[ -z "$iid" ]] && continue + local cost_per_month="0.00" + case "$pid" in + V1) cost_per_month="4.99" ;; + V2) cost_per_month="5.99" ;; + V4) cost_per_month="8.99" ;; + V8) cost_per_month="13.99" ;; + V16) cost_per_month="19.99" ;; + V24) cost_per_month="24.99" ;; + V30) cost_per_month="29.99" ;; + V45) cost_per_month="39.99" ;; + V60) cost_per_month="49.99" ;; + *) cost_per_month="—" ;; + esac + + local status_color="$GREEN" + case "$status" in + running) status_color="$GREEN" ;; + stopped) status_color="$YELLOW" ;; + *) status_color="$RED" ;; + esac + + printf " %-10s %-20s %-8s " "$iid" "${iname:0:18}" "$pid" + echo -ne "${status_color}" + printf "%-10s" "$status" + echo -ne "${RESET}" + printf " %-10s %10s\n" "${region:0:8}" "$cost_per_month" + done < <(echo "$instances" | jq -r \ + '.[] | "\(.instanceId)\t\(.name // .displayName // "unknown")\t\(.productId // "—")\t\(.status // "—")\t\(.region // "—")"' \ + 2>/dev/null) + + echo "" + field "Instances:" "$instance_count" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# RESOURCES +# ══════════════════════════════════════════════════════════════════════ +do_resources() { + # Snapshots + local snapshots + snapshots=$(fetch_all_contabo "/compute/snapshots" "data") + local snap_count + snap_count=$(echo "$snapshots" | jq 'length' 2>/dev/null || echo 0) + + case "$OUTPUT_FORMAT" in + json) + local storage + storage=$(fetch_all_contabo "/object-storages" "data") + jq -n \ + --argjson snapshots "$snapshots" \ + --argjson object_storage "$storage" \ + '{snapshots: 
$snapshots, object_storage: $object_storage}' + ;; + *) + if [[ "$snap_count" -gt 0 ]]; then + section_header "Snapshots" + printf " ${BOLD}%-38s %-18s %-20s${RESET}\n" \ + "SNAPSHOT_ID" "NAME" "CREATED" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + echo "$snapshots" | jq -r \ + '.[] | "\(.snapshotId // .id // "—")\t\(.name // "—")\t\(.createdDate // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r sid sname screated; do + printf " %-38s %-18s %-20s\n" \ + "${sid:0:36}" "${sname:0:16}" "${screated:0:19}" + done + echo "" + field "Snapshots:" "$snap_count" + fi + + # Object Storage + local storage + storage=$(fetch_all_contabo "/object-storages" "data") + local storage_count + storage_count=$(echo "$storage" | jq 'length' 2>/dev/null || echo 0) + + if [[ "$storage_count" -gt 0 ]]; then + section_header "Object Storage" + printf " ${BOLD}%-38s %-10s %-12s %-12s${RESET}\n" \ + "STORAGE_ID" "REGION" "SIZE (TB)" "USED (GB)" + printf " %s\n" "$(printf '%.0s─' {1..74})" + + echo "$storage" | jq -r \ + '.[] | "\(.objectStorageId // .id // "—")\t\(.region // "—")\t\(.totalPurchasedSpaceTB // 0)\t\(.usedSpaceBytes // 0)"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r oid oregion osize oused; do + local used_gb + used_gb=$(awk "BEGIN {printf \"%.1f\", ${oused} / 1073741824}") + printf " %-38s %-10s %-12s %-12s\n" \ + "${oid:0:36}" "${oregion:0:8}" "$osize" "$used_gb" + done + echo "" + field "Object Storage:" "$storage_count" + fi + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat < /var/lib/node_exporter/textfile/contabo_cost.prom 2>/dev/null + +${BOLD}NOTES${RESET} + Contabo uses fixed monthly pricing per product tier. + Cost estimates are based on productId mapping — verify against your invoice. + Snapshots and object storage are typically included in Contabo plans. 
# Translate command-line options into the script's global settings.
# Globals:   RUN_MODE, OUTPUT_FORMAT, ALERT_THRESHOLD, VERBOSE, COLOR (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing help for --help/-h; via die on an unknown
#            option; via the ${2:?} check when an option value is missing
parse_args() {
    while (( $# )); do
        case "$1" in
            --summary|--breakdown|--resources)
                # The mode name is the flag without its leading dashes.
                RUN_MODE="${1#--}"
                shift
                ;;
            --format)
                OUTPUT_FORMAT="${2:?--format requires a value}"
                shift 2
                ;;
            --alert)
                ALERT_THRESHOLD="${2:?--alert requires a threshold}"
                shift 2
                ;;
            --verbose)
                VERBOSE="true"
                shift
                ;;
            --no-color)
                COLOR="never"
                shift
                ;;
            --help|-h)
                setup_colors
                show_help
                exit 0
                ;;
            *)
                die "Unknown option: $1 (see --help)"
                ;;
        esac
    done
}
./contabo-dns-manager.sh --zones #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +ZONE_NAME="" +RECORD_ID="" +RECORD_TYPE="" +RECORD_NAME="" +RECORD_CONTENT="" +RECORD_TTL="3600" +RECORD_PRIO="" +CSV_FILE="" +OUTPUT_FORMAT="${CDM_FORMAT:-table}" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +CONTABO_CLIENT_ID="${CONTABO_CLIENT_ID:-}" +CONTABO_CLIENT_SECRET="${CONTABO_CLIENT_SECRET:-}" +CONTABO_API_USER="${CONTABO_API_USER:-}" +CONTABO_API_PASS="${CONTABO_API_PASS:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" 
# ── API helpers ──────────────────────────────────────────────────────
# Request an OAuth access token with the password grant.
# Globals: CONTABO_CLIENT_ID, CONTABO_CLIENT_SECRET, CONTABO_API_USER,
#          CONTABO_API_PASS (read)
# Outputs: the access token on stdout
# Exits:   via die when no token is returned. When called inside $(...)
#          this only ends the substitution, so callers must check its status.
contabo_token() {
    local resp token
    # Every credential is URL-encoded: sent raw with -d, a value containing
    # '&', '+', '=' or '%' would corrupt the form body.
    resp=$(curl -s \
        --data-urlencode "client_id=${CONTABO_CLIENT_ID}" \
        --data-urlencode "client_secret=${CONTABO_CLIENT_SECRET}" \
        --data-urlencode "username=${CONTABO_API_USER}" \
        --data-urlencode "password=${CONTABO_API_PASS}" \
        -d "grant_type=password" \
        "https://auth.contabo.com/auth/realms/contabo/protocol/openid-connect/token") || resp=""
    token=$(printf '%s' "$resp" | jq -r '.access_token // empty' 2>/dev/null) || token=""
    if [[ -z "$token" ]]; then
        die "Failed to obtain access token — check credentials"
    fi
    printf '%s\n' "$token"
}

# Perform an authenticated request against https://api.contabo.com/v1.
# Arguments: $1 - HTTP method
#            $2 - endpoint path (appended to the base URL)
#            $@ - remaining arguments are passed to curl unchanged
# Outputs:   the response body on stdout for any status other than 429
# Returns:   0 once a non-429 response was received; 1 when no token could
#            be obtained, curl itself failed, or the request was still rate
#            limited after 3 attempts
contabo_api() {
    local method="$1" endpoint="$2"
    shift 2
    local attempt=0 max_attempts=3
    local token body_file http_code request_id delay

    # Obtain the token first and check it. Nested as "$(contabo_token)" in
    # the curl argument list, a failure would go unnoticed and the request
    # would be sent with an empty bearer token.
    token=$(contabo_token) || return 1

    # Private temp file instead of a fixed /tmp path: concurrent runs cannot
    # overwrite each other's response and nothing stale can be returned.
    body_file=$(mktemp "${TMPDIR:-/tmp}/cdm_resp.XXXXXX") || {
        err "Cannot create temporary file for API response"
        return 1
    }

    while (( attempt < max_attempts )); do
        request_id=$(cat /proc/sys/kernel/random/uuid 2>/dev/null || date +%s%N)

        if ! http_code=$(curl -s -o "$body_file" -w "%{http_code}" \
            -X "$method" \
            -H "Authorization: Bearer ${token}" \
            -H "Content-Type: application/json" \
            -H "x-request-id: ${request_id}" \
            "https://api.contabo.com/v1${endpoint}" "$@"); then
            rm -f -- "$body_file"
            err "API request failed (curl error): ${method} ${endpoint}"
            return 1
        fi

        # stdout of this function is the response body, so the trace line
        # is forced onto stderr.
        verbose "API ${method} ${endpoint} → HTTP ${http_code}" >&2

        if [[ "$http_code" == "429" ]]; then
            attempt=$(( attempt + 1 ))
            # No point in waiting when no further attempt will be made.
            (( attempt >= max_attempts )) && break
            delay=$(( attempt * 5 ))
            warn "Rate limited — retrying in ${delay}s (attempt ${attempt}/${max_attempts})"
            sleep "$delay"
            continue
        fi

        cat -- "$body_file"
        rm -f -- "$body_file"
        return 0
    done

    rm -f -- "$body_file"
    err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}"
    return 1
}

# Abort (via die) unless all four Contabo credential variables are non-empty.
# Returns: 0 when every credential is set
check_credentials() {
    local name
    for name in CONTABO_CLIENT_ID CONTABO_CLIENT_SECRET \
        CONTABO_API_USER CONTABO_API_PASS; do
        [[ -n "${!name:-}" ]] || die "${name} not set"
    done
    # Explicit success: ending on "[[ -z ... ]] && die" would return 1 when
    # everything is set, which terminates the caller under `set -e`.
    return 0
}

# Abort (via die) unless curl and jq are available on PATH.
check_deps() {
    command -v curl &>/dev/null || die "curl is required"
    command -v jq &>/dev/null || die "jq is required"
}
══════════════════════════════════════════════════════════════════════ +# ZONES +# ══════════════════════════════════════════════════════════════════════ +do_zones() { + local page=1 size=100 all_data="[]" + + while true; do + local resp + resp=$(contabo_api GET "/dns/zones?page=${page}&size=${size}") + local page_data + page_data=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < size )) && break + ((page++)) || true + done + + local total + total=$(echo "$all_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No zones found" + + case "$OUTPUT_FORMAT" in + json) + echo "$all_data" | jq '.' + ;; + prometheus) + cat </dev/null \ + | while IFS=$'\t' read -r name status zid rcount; do + printf " %-25s %-10s %-36s %-8s\n" \ + "${name:0:23}" "$status" "${zid:0:34}" "$rcount" + done + + echo "" + field "Zones:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# RECORDS +# ══════════════════════════════════════════════════════════════════════ +do_records() { + [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN" + + local resp + resp=$(contabo_api GET "/dns/zones/${ZONE_NAME}/records") + local records + records=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local total + total=$(echo "$records" | jq 'length' 2>/dev/null || echo 0) + + case "$OUTPUT_FORMAT" in + json) + echo "$records" | jq '.' 
+ ;; + prometheus) + cat </dev/null \ + | while IFS=$'\t' read -r rid rtype rname rcontent rttl rprio; do + printf " %-36s %-6s %-10s %-26s %-6s %-5s\n" \ + "${rid:0:34}" "$rtype" "${rname:0:8}" "${rcontent:0:24}" "$rttl" "$rprio" + done + + echo "" + field "Records:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# ADD +# ══════════════════════════════════════════════════════════════════════ +do_add() { + [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN" + [[ -z "$RECORD_TYPE" ]] && die "Specify --type TYPE" + [[ -z "$RECORD_NAME" ]] && die "Specify --name NAME" + [[ -z "$RECORD_CONTENT" ]] && die "Specify --content CONTENT" + + local payload + payload=$(jq -n \ + --arg type "$RECORD_TYPE" \ + --arg name "$RECORD_NAME" \ + --arg content "$RECORD_CONTENT" \ + --argjson ttl "$RECORD_TTL" \ + '{type: $type, name: $name, content: $content, ttl: $ttl}') + + if [[ -n "$RECORD_PRIO" ]]; then + payload=$(echo "$payload" | jq --argjson prio "$RECORD_PRIO" '. 
+ {prio: $prio}') + fi + + local resp + resp=$(contabo_api POST "/dns/zones/${ZONE_NAME}/records" -d "$payload") + + local rid + rid=$(echo "$resp" | jq -r '.data[0].recordId // .data[0].id // empty' 2>/dev/null) + + if [[ -n "$rid" ]]; then + echo -e " ${GREEN}✓${RESET} Record created: ${RECORD_TYPE} ${RECORD_NAME} → ${RECORD_CONTENT} (ID: ${rid})" + ((ACTION_OK++)) || true + else + local errmsg + errmsg=$(echo "$resp" | jq -r '.message // "unknown error"' 2>/dev/null) + echo -e " ${RED}✗${RESET} Failed to create record: ${errmsg}" + ((ACTION_FAIL++)) || true + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# UPDATE +# ══════════════════════════════════════════════════════════════════════ +do_update() { + [[ -z "$RECORD_ID" ]] && die "Specify --record-id ID" + [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN" + [[ -z "$RECORD_TYPE" ]] && die "Specify --type TYPE" + [[ -z "$RECORD_NAME" ]] && die "Specify --name NAME" + [[ -z "$RECORD_CONTENT" ]] && die "Specify --content CONTENT" + + local payload + payload=$(jq -n \ + --arg type "$RECORD_TYPE" \ + --arg name "$RECORD_NAME" \ + --arg content "$RECORD_CONTENT" \ + --argjson ttl "$RECORD_TTL" \ + '{type: $type, name: $name, content: $content, ttl: $ttl}') + + if [[ -n "$RECORD_PRIO" ]]; then + payload=$(echo "$payload" | jq --argjson prio "$RECORD_PRIO" '. 
# Delete one DNS record of a zone.
# Globals:   RECORD_ID, ZONE_NAME, FORCE (read); ACTION_OK / ACTION_FAIL
#            (incremented)
# Outputs:   one status line on stdout
# Exits:     via die when --record-id, --zone or --force is missing
do_delete() {
    [[ -n "$RECORD_ID" ]] || die "Specify --record-id ID"
    [[ -n "$ZONE_NAME" ]] || die "Specify --zone DOMAIN"
    [[ "$FORCE" == "true" ]] || die "Delete is destructive — use --force to confirm"

    local resp errmsg=""
    if ! resp=$(contabo_api DELETE "/dns/zones/${ZONE_NAME}/records/${RECORD_ID}"); then
        errmsg="API request failed"
    elif [[ -n "${resp//[[:space:]]/}" ]]; then
        # contabo_api returns the body for any non-429 status, so an error
        # reply arrives here as well. Like do_add/do_update, treat a body
        # carrying ".message" as the API's error report.
        # NOTE(review): assumes a successful DELETE has no ".message" field —
        # confirm against the Contabo API.
        errmsg=$(printf '%s' "$resp" \
            | jq -r 'if type == "object" then (.message // empty) else empty end' 2>/dev/null) \
            || errmsg="unexpected response"
    fi

    if [[ -z "$errmsg" ]]; then
        echo -e " ${GREEN}✓${RESET} Record deleted: ${RECORD_ID}"
        ACTION_OK=$(( ACTION_OK + 1 ))
    else
        echo -e " ${RED}✗${RESET} Failed to delete record: ${errmsg}"
        ACTION_FAIL=$(( ACTION_FAIL + 1 ))
    fi
}
-f "$CSV_FILE" ]] && die "CSV file not found: ${CSV_FILE}" + + section_header "Bulk Add — ${ZONE_NAME}" + + local line_num=0 + while IFS=',' read -r rtype rname rcontent rttl rprio; do + ((line_num++)) || true + [[ -z "$rtype" || "$rtype" =~ ^# ]] && continue + rtype=$(echo "$rtype" | xargs) + rname=$(echo "$rname" | xargs) + rcontent=$(echo "$rcontent" | xargs) + rttl=$(echo "${rttl:-3600}" | xargs) + rprio=$(echo "${rprio:-}" | xargs) + + local payload + payload=$(jq -n \ + --arg type "$rtype" \ + --arg name "$rname" \ + --arg content "$rcontent" \ + --argjson ttl "$rttl" \ + '{type: $type, name: $name, content: $content, ttl: $ttl}') + + if [[ -n "$rprio" ]]; then + payload=$(echo "$payload" | jq --argjson prio "$rprio" '. + {prio: $prio}') + fi + + local resp + resp=$(contabo_api POST "/dns/zones/${ZONE_NAME}/records" -d "$payload") + + local rid + rid=$(echo "$resp" | jq -r '.data[0].recordId // .data[0].id // empty' 2>/dev/null) + + if [[ -n "$rid" ]]; then + echo -e " ${GREEN}✓${RESET} ${rtype} ${rname} → ${rcontent} (line ${line_num})" + ((ACTION_OK++)) || true + else + echo -e " ${RED}✗${RESET} ${rtype} ${rname} → ${rcontent} (line ${line_num})" + ((ACTION_FAIL++)) || true + fi + + sleep 0.5 + done < "$CSV_FILE" + + echo "" + field_color "Succeeded:" "${GREEN}${ACTION_OK}${RESET}" + if [[ "$ACTION_FAIL" -gt 0 ]]; then + field_color "Failed:" "${RED}${ACTION_FAIL}${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# AUDIT +# ══════════════════════════════════════════════════════════════════════ +do_audit() { + [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN" + + local resp + resp=$(contabo_api GET "/dns/zones/${ZONE_NAME}/records") + local records + records=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local total + total=$(echo "$records" | jq 'length' 2>/dev/null || echo 0) + + local warnings=0 + + if [[ "$OUTPUT_FORMAT" != "prometheus" ]]; then + section_header "DNS Audit — ${ZONE_NAME}" + field 
"Records:" "$total" + echo "" + fi + + # Check SOA + local soa_count + soa_count=$(echo "$records" | jq '[.[] | select(.type == "SOA")] | length' 2>/dev/null || echo 0) + if [[ "$soa_count" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} No SOA record found" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} SOA record present" + fi + + # Check NS + local ns_count + ns_count=$(echo "$records" | jq '[.[] | select(.type == "NS")] | length' 2>/dev/null || echo 0) + if [[ "$ns_count" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${RED}✗${RESET} No NS records found" + elif [[ "$ns_count" -lt 2 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} Only ${ns_count} NS record(s) — recommend at least 2" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} ${ns_count} NS records" + fi + + # Check common types + for rtype in A AAAA MX TXT; do + local rcount + rcount=$(echo "$records" | jq --arg t "$rtype" '[.[] | select(.type == $t)] | length' 2>/dev/null || echo 0) + if [[ "$rcount" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} No ${rtype} records found" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} ${rcount} ${rtype} record(s)" + fi + done + + # Check low TTLs + local low_ttl + low_ttl=$(echo "$records" | jq '[.[] | select(.ttl < 300 and .ttl > 0)] | length' 2>/dev/null || echo 0) + if [[ "$low_ttl" -gt 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} ${low_ttl} record(s) with TTL < 300s" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} All TTLs ≥ 300s" + fi + + # Check wildcards + local wildcard + wildcard=$(echo "$records" | jq '[.[] | select(.name | startswith("*"))] | 
length' 2>/dev/null || echo 0) + if [[ "$wildcard" -gt 0 ]]; then + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${CYAN}ℹ${RESET} ${wildcard} wildcard record(s)" + fi + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +# ── Severity counters ──────────────────────────────────────────────── +TOTAL_CRIT=0 +TOTAL_WARN=0 +TOTAL_INFO=0 +TOTAL_OK=0 + +flag_crit() { ((TOTAL_CRIT++)) || true; } +flag_warn() { ((TOTAL_WARN++)) || true; } +flag_info() { ((TOTAL_INFO++)) || true; } +flag_ok() { ((TOTAL_OK++)) || true; } + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +DANGEROUS_PORTS="${DANGEROUS_PORTS:-22,3389,3306,5432,1433,6379,27017,9200,8080,8443}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +CONTABO_CLIENT_ID="${CONTABO_CLIENT_ID:-}" +CONTABO_CLIENT_SECRET="${CONTABO_CLIENT_SECRET:-}" +CONTABO_API_USER="${CONTABO_API_USER:-}" +CONTABO_API_PASS="${CONTABO_API_PASS:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── API helpers ────────────────────────────────────────────────────── +contabo_token() { + local resp + resp=$(curl -s -d "client_id=${CONTABO_CLIENT_ID}" \ + -d "client_secret=${CONTABO_CLIENT_SECRET}" \ + --data-urlencode "username=${CONTABO_API_USER}" \ + --data-urlencode "password=${CONTABO_API_PASS}" \ + -d "grant_type=password" \ + "https://auth.contabo.com/auth/realms/contabo/protocol/openid-connect/token") + local token + token=$(echo "$resp" | jq -r '.access_token // empty' 2>/dev/null) + if [[ -z "$token" ]]; then + die "Failed to obtain access token — check credentials" + fi + echo "$token" +} + +contabo_api() { + local method="$1" 
endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/cfa_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer $(contabo_token)" \ + -H "Content-Type: application/json" \ + -H "x-request-id: $(cat /proc/sys/kernel/random/uuid 2>/dev/null || date +%s%N)" \ + "https://api.contabo.com/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + cat /tmp/cfa_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$CONTABO_CLIENT_ID" ]] && die "CONTABO_CLIENT_ID not set" + [[ -z "$CONTABO_CLIENT_SECRET" ]] && die "CONTABO_CLIENT_SECRET not set" + [[ -z "$CONTABO_API_USER" ]] && die "CONTABO_API_USER not set" + [[ -z "$CONTABO_API_PASS" ]] && die "CONTABO_API_PASS not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Instance helpers ───────────────────────────────────────────────── +get_all_instances() { + local page=1 size=100 result="[]" + while true; do + local resp + resp=$(contabo_api GET "/compute/instances?page=${page}&size=${size}") + local page_data + page_data=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local count + count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$count" -eq 0 ]] && break + result=$(echo "$result" "$page_data" | jq -s '.[0] + .[1]') + (( count < size )) && break + ((page++)) || true + done + echo "$result" +} + +# ── Firewall helpers ───────────────────────────────────────────────── +get_all_firewalls() { + local page=1 size=100 result="[]" + while true; do + local resp + 
resp=$(contabo_api GET "/firewalls?page=${page}&size=${size}") + local page_data + page_data=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local count + count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$count" -eq 0 ]] && break + result=$(echo "$result" "$page_data" | jq -s '.[0] + .[1]') + (( count < size )) && break + ((page++)) || true + done + echo "$result" +} + +# ── Port-to-service mapping ───────────────────────────────────────── +port_to_service() { + local port="$1" + case "$port" in + 22) echo "SSH" ;; + 80) echo "HTTP" ;; + 443) echo "HTTPS" ;; + 3306) echo "MySQL" ;; + 5432) echo "PostgreSQL" ;; + 1433) echo "MSSQL" ;; + 3389) echo "RDP" ;; + 6379) echo "Redis" ;; + 27017) echo "MongoDB" ;; + 9200) echo "Elasticsearch" ;; + 8080) echo "HTTP-Alt" ;; + 8443) echo "HTTPS-Alt" ;; + 53) echo "DNS" ;; + 25) echo "SMTP" ;; + 5900) echo "VNC" ;; + 11211) echo "Memcached" ;; + 2379) echo "etcd" ;; + 9090) echo "Prometheus" ;; + *) echo "" ;; + esac +} + +# ── Check if port is in dangerous list ─────────────────────────────── +is_dangerous_port() { + local port="$1" + local IFS=',' + for dp in $DANGEROUS_PORTS; do + if [[ "$port" == "$dp" ]]; then + return 0 + fi + done + return 1 +} + +# ══════════════════════════════════════════════════════════════════════ +# OPEN PORTS AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_open_ports() { + log "Auditing firewall rules for dangerous open ports..." 
+ log "Dangerous ports: ${DANGEROUS_PORTS}" + echo "" + + printf " %-10s %-22s %-8s %-8s %-18s %-12s %s\n" \ + "FW_ID" "FW_NAME" "PORT" "PROTO" "SOURCE" "SERVICE" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..95})" + + local fw_json + fw_json=$(get_all_firewalls) + + echo "$fw_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r fw; do + local fw_id fw_name + fw_id=$(echo "$fw" | jq -r '.firewallId' 2>/dev/null) + fw_name=$(echo "$fw" | jq -r '.displayName // .name // "unnamed"' 2>/dev/null) + + echo "$fw" | jq -c '.rules[]? // empty' 2>/dev/null | while IFS= read -r rule; do + local action protocol port_str source_cidr + action=$(echo "$rule" | jq -r '.action // "accept"' 2>/dev/null) + protocol=$(echo "$rule" | jq -r '.protocol // "tcp"' 2>/dev/null) + port_str=$(echo "$rule" | jq -r '.port // .destPorts // .destinationPorts // ""' 2>/dev/null) + source_cidr=$(echo "$rule" | jq -r '.srcCidr // .source // .ipRange // "0.0.0.0/0"' 2>/dev/null) + + [[ "$action" != "accept" && "$action" != "allow" ]] && continue + [[ "$source_cidr" != "0.0.0.0/0" && "$source_cidr" != "::/0" ]] && continue + + if [[ -z "$port_str" || "$port_str" == "null" ]]; then + local IFS=',' + for dp in $DANGEROUS_PORTS; do + local svc + svc=$(port_to_service "$dp") + printf " %-10s %-22s %-8s %-8s %-18s %-12s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "$dp" "$protocol" \ + "$source_cidr" "${svc:-unknown}" "$RED" "CRITICAL" "$RESET" + flag_crit + done + continue + fi + + local IFS=',' + for port_entry in $port_str; do + local single_port="$port_entry" + if [[ "$port_entry" == *-* ]]; then + local range_start range_end + range_start="${port_entry%-*}" + range_end="${port_entry#*-}" + local IFS=',' + for dp in $DANGEROUS_PORTS; do + if [[ "$dp" -ge "$range_start" && "$dp" -le "$range_end" ]]; then + local svc + svc=$(port_to_service "$dp") + printf " %-10s %-22s %-8s %-8s %-18s %-12s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "$dp" "$protocol" \ + "$source_cidr" "${svc:-unknown}" "$RED" "CRITICAL" 
"$RESET" + flag_crit + fi + done + continue + fi + + if is_dangerous_port "$single_port"; then + local svc + svc=$(port_to_service "$single_port") + printf " %-10s %-22s %-8s %-8s %-18s %-12s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "$single_port" "$protocol" \ + "$source_cidr" "${svc:-unknown}" "$RED" "CRITICAL" "$RESET" + flag_crit + elif [[ "$single_port" == "80" || "$single_port" == "443" ]]; then + local svc + svc=$(port_to_service "$single_port") + printf " %-10s %-22s %-8s %-8s %-18s %-12s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "$single_port" "$protocol" \ + "$source_cidr" "${svc:-$single_port}" "$CYAN" "INFO" "$RESET" + flag_info + fi + done + done + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# UNPROTECTED INSTANCES +# ══════════════════════════════════════════════════════════════════════ +audit_unprotected() { + log "Checking for instances without firewalls..." + echo "" + + printf " %-10s %-22s %-16s %-10s %s\n" \ + "INST_ID" "NAME" "IP" "STATUS" "FIREWALL" + printf " %s\n" "$(printf '%.0s─' {1..75})" + + local instances + instances=$(get_all_instances) + + local fw_json + fw_json=$(get_all_firewalls) + + local assigned_instances + assigned_instances=$(echo "$fw_json" | jq -r \ + '[.[].assignedInstances // [] | .[]] | unique | .[]' 2>/dev/null || true) + + echo "$instances" | jq -c '.[]' 2>/dev/null | while IFS= read -r inst; do + local iid iname ip status + iid=$(echo "$inst" | jq -r '.instanceId' 2>/dev/null) + iname=$(echo "$inst" | jq -r '.name // .displayName // "unknown"' 2>/dev/null) + ip=$(echo "$inst" | jq -r '.ipConfig.v4.ip // "N/A"' 2>/dev/null) + status=$(echo "$inst" | jq -r '.status // "unknown"' 2>/dev/null) + + local has_fw="false" + if echo "$assigned_instances" | grep -q "^${iid}$" 2>/dev/null; then + has_fw="true" + fi + + if [[ "$has_fw" == "false" ]]; then + printf " %-10s %-22s %-16s %-10s %b%s%b\n" \ + "$iid" "${iname:0:20}" "$ip" "$status" \ + "$RED" "NONE — UNPROTECTED" 
"$RESET" + flag_crit + else + printf " %-10s %-22s %-16s %-10s %b%s%b\n" \ + "$iid" "${iname:0:20}" "$ip" "$status" \ + "$GREEN" "✓ Protected" "$RESET" + flag_ok + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# PERMISSIVE RULES AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_permissive() { + log "Auditing overly permissive firewall rules..." + echo "" + + printf " %-10s %-22s %-10s %-8s %-18s %-14s %s\n" \ + "FW_ID" "FW_NAME" "PORTS" "PROTO" "SOURCE" "ISSUE" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..100})" + + local fw_json + fw_json=$(get_all_firewalls) + + echo "$fw_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r fw; do + local fw_id fw_name + fw_id=$(echo "$fw" | jq -r '.firewallId' 2>/dev/null) + fw_name=$(echo "$fw" | jq -r '.displayName // .name // "unnamed"' 2>/dev/null) + + echo "$fw" | jq -c '.rules[]? // empty' 2>/dev/null | while IFS= read -r rule; do + local action protocol port_str source_cidr + action=$(echo "$rule" | jq -r '.action // "accept"' 2>/dev/null) + protocol=$(echo "$rule" | jq -r '.protocol // "tcp"' 2>/dev/null) + port_str=$(echo "$rule" | jq -r '.port // .destPorts // .destinationPorts // ""' 2>/dev/null) + source_cidr=$(echo "$rule" | jq -r '.srcCidr // .source // .ipRange // ""' 2>/dev/null) + + [[ "$action" != "accept" && "$action" != "allow" ]] && continue + + if [[ -z "$port_str" || "$port_str" == "null" ]] && [[ "$source_cidr" == "0.0.0.0/0" || "$source_cidr" == "::/0" ]]; then + printf " %-10s %-22s %-10s %-8s %-18s %-14s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "ALL" "$protocol" \ + "$source_cidr" "all-ports" "$RED" "CRITICAL" "$RESET" + flag_crit + continue + fi + + if [[ "$protocol" == "all" || "$protocol" == "-1" ]] && [[ "$source_cidr" == "0.0.0.0/0" || "$source_cidr" == "::/0" ]]; then + printf " %-10s %-22s %-10s %-8s %-18s %-14s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "${port_str:-ALL}" "all" \ + "$source_cidr" 
"all-protocols" "$RED" "CRITICAL" "$RESET" + flag_crit + continue + fi + + if [[ -n "$source_cidr" && "$source_cidr" != "null" ]]; then + if [[ "$source_cidr" == *"/8" || "$source_cidr" == *"/16" ]]; then + printf " %-10s %-22s %-10s %-8s %-18s %-14s %b%s%b\n" \ + "$fw_id" "${fw_name:0:20}" "${port_str:-ALL}" "$protocol" \ + "${source_cidr:0:16}" "wide-cidr" "$YELLOW" "WARN" "$RESET" + flag_warn + fi + fi + done + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# UNUSED FIREWALLS +# ══════════════════════════════════════════════════════════════════════ +audit_unused() { + log "Checking for unused firewalls..." + echo "" + + printf " %-10s %-28s %-8s %s\n" \ + "FW_ID" "FW_NAME" "RULES" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..60})" + + local fw_json + fw_json=$(get_all_firewalls) + + echo "$fw_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r fw; do + local fw_id fw_name rule_count assigned_count + fw_id=$(echo "$fw" | jq -r '.firewallId' 2>/dev/null) + fw_name=$(echo "$fw" | jq -r '.displayName // .name // "unnamed"' 2>/dev/null) + rule_count=$(echo "$fw" | jq '[.rules[]?] | length' 2>/dev/null || echo 0) + assigned_count=$(echo "$fw" | jq '[.assignedInstances // [] | .[]] | length' 2>/dev/null || echo 0) + + if [[ "$assigned_count" -eq 0 ]]; then + printf " %-10s %-28s %-8s %b%s%b\n" \ + "$fw_id" "${fw_name:0:26}" "$rule_count" \ + "$YELLOW" "UNUSED" "$RESET" + flag_warn + else + verbose "Firewall ${fw_id} (${fw_name}): assigned to ${assigned_count} instance(s)" + flag_ok + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST ALL RULES +# ══════════════════════════════════════════════════════════════════════ +list_rules() { + log "Listing all firewall rules..." 
+ echo "" + + printf " %-10s %-20s %-8s %-8s %-12s %-18s %s\n" \ + "FW_ID" "FW_NAME" "ACTION" "PROTO" "PORTS" "SOURCE" "SERVICE" + printf " %s\n" "$(printf '%.0s─' {1..90})" + + local fw_json + fw_json=$(get_all_firewalls) + + echo "$fw_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r fw; do + local fw_id fw_name + fw_id=$(echo "$fw" | jq -r '.firewallId' 2>/dev/null) + fw_name=$(echo "$fw" | jq -r '.displayName // .name // "unnamed"' 2>/dev/null) + + echo "$fw" | jq -c '.rules[]? // empty' 2>/dev/null | while IFS= read -r rule; do + local action protocol port_str source_cidr + action=$(echo "$rule" | jq -r '.action // "accept"' 2>/dev/null) + protocol=$(echo "$rule" | jq -r '.protocol // "tcp"' 2>/dev/null) + port_str=$(echo "$rule" | jq -r '.port // .destPorts // .destinationPorts // "all"' 2>/dev/null) + source_cidr=$(echo "$rule" | jq -r '.srcCidr // .source // .ipRange // "any"' 2>/dev/null) + + [[ "$port_str" == "null" ]] && port_str="all" + [[ "$source_cidr" == "null" ]] && source_cidr="any" + + local svc="" + if [[ "$port_str" =~ ^[0-9]+$ ]]; then + svc=$(port_to_service "$port_str") + fi + + local action_color="$GREEN" + [[ "$action" == "drop" || "$action" == "deny" || "$action" == "reject" ]] && action_color="$RED" + + printf " %-10s %-20s %b%-8s%b %-8s %-12s %-18s %s\n" \ + "$fw_id" "${fw_name:0:18}" "$action_color" "$action" "$RESET" \ + "$protocol" "${port_str:0:10}" "${source_cidr:0:16}" "${svc}" + done + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + local elapsed + elapsed=$(( $(date +%s) - START_TIME )) + + echo "" + echo " ══════════════════════════════════════════" + echo " Firewall Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET" + 
printf " %-20s %b%d%b\n" "INFO:" "$CYAN" "$TOTAL_INFO" "$RESET" + printf " %-20s %b%d%b\n" "OK:" "$GREEN" "$TOTAL_OK" "$RESET" + echo " ──────────────────────────────────────────" + printf " Completed in %ds\n" "$elapsed" + echo "" + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)" + echo "" + echo " Top recommendations:" + echo " • Assign firewalls to all unprotected instances" + echo " • Close 0.0.0.0/0 rules on SSH (22), RDP (3389), and database ports" + echo " • Replace all-port allow rules with specific port lists" + echo " • Remove unused firewalls to reduce configuration sprawl" + echo "" + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)" + echo "" + echo " Suggestions:" + echo " • Review wide CIDR rules and narrow where possible" + echo " • Delete unused firewalls" + echo " • Restrict outbound where applicable" + echo "" + else + echo -e " ${GREEN}All checks passed${RESET}" + echo "" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2 + exit 1 + fi + + RUN_MODE="${modes[*]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + check_deps + check_credentials + + START_TIME=$(date +%s) + + echo "" + echo -e "${BOLD}Contabo Firewall Auditor${RESET}" + echo -e "Mode: ${RUN_MODE}" + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + for mode in $RUN_MODE; do + case "$mode" in + open-ports) audit_open_ports ;; + unprotected) audit_unprotected ;; + permissive) audit_permissive ;; + unused) audit_unused ;; + rules) list_rules ;; + esac + done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + exit 2 + elif [[ "$TOTAL_WARN" -gt 
0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/contabo-fleet-manager.sh b/contabo-fleet-manager.sh new file mode 100755 index 0000000..9993b9b --- /dev/null +++ b/contabo-fleet-manager.sh @@ -0,0 +1,608 @@ +#!/usr/bin/env bash + +######################################################################################### +#### contabo-fleet-manager.sh — Inventory, health checks, and bulk operations for #### +#### Contabo VPS/VDS instances via the REST API. Fleet-wide visibility and control #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./contabo-fleet-manager.sh --inventory --all #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + 
echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +INSTANCE_ID="" +TARGET_ALL="false" +TAG_ID="" +TAG_SUB_MODE="" +OUTPUT_FORMAT="${CFM_FORMAT:-text}" +PING_CHECK="false" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +CONTABO_CLIENT_ID="${CONTABO_CLIENT_ID:-}" +CONTABO_CLIENT_SECRET="${CONTABO_CLIENT_SECRET:-}" +CONTABO_API_USER="${CONTABO_API_USER:-}" +CONTABO_API_PASS="${CONTABO_API_PASS:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +ACTION_OK=0 +ACTION_FAIL=0 + +# ── API helpers ────────────────────────────────────────────────────── +contabo_token() { + local resp + resp=$(curl -s -d "client_id=${CONTABO_CLIENT_ID}" \ + -d "client_secret=${CONTABO_CLIENT_SECRET}" \ + --data-urlencode "username=${CONTABO_API_USER}" \ + --data-urlencode "password=${CONTABO_API_PASS}" \ + -d "grant_type=password" \ + "https://auth.contabo.com/auth/realms/contabo/protocol/openid-connect/token") + local token + token=$(echo "$resp" | jq -r '.access_token // empty' 2>/dev/null) + if [[ -z "$token" ]]; then + die "Failed to obtain access token — check credentials" + fi + echo "$token" +} + +contabo_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/cfm_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer $(contabo_token)" \ + -H "Content-Type: application/json" \ + -H "x-request-id: $(cat /proc/sys/kernel/random/uuid 2>/dev/null || date +%s%N)" \ + "https://api.contabo.com/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate 
limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + cat /tmp/cfm_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$CONTABO_CLIENT_ID" ]] && die "CONTABO_CLIENT_ID not set" + [[ -z "$CONTABO_CLIENT_SECRET" ]] && die "CONTABO_CLIENT_SECRET not set" + [[ -z "$CONTABO_API_USER" ]] && die "CONTABO_API_USER not set" + [[ -z "$CONTABO_API_PASS" ]] && die "CONTABO_API_PASS not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Instance helpers ───────────────────────────────────────────────── +get_all_instance_ids() { + local page=1 size=100 ids="" + while true; do + local resp + resp=$(contabo_api GET "/compute/instances?page=${page}&size=${size}") + local page_ids + page_ids=$(echo "$resp" | jq -r '.data[].instanceId' 2>/dev/null) + [[ -z "$page_ids" ]] && break + ids="${ids}${ids:+$'\n'}${page_ids}" + local count + count=$(echo "$page_ids" | wc -l) + (( count < size )) && break + ((page++)) || true + done + echo "$ids" +} + +get_instance_name() { + local iid="$1" + contabo_api GET "/compute/instances/${iid}" \ + | jq -r '.data[0].name // .data[0].displayName // "unknown"' 2>/dev/null +} + +get_instance_ids() { + if [[ "$TARGET_ALL" == "true" ]]; then + get_all_instance_ids + elif [[ -n "$INSTANCE_ID" ]]; then + echo "$INSTANCE_ID" + elif [[ -n "$TAG_ID" ]]; then + get_instances_by_tag "$TAG_ID" + else + die "Specify --instance ID, --all, or --tag TAG_ID" + fi +} + +get_instances_by_tag() { + local tid="$1" + local page=1 size=100 ids="" + while true; do + local resp + resp=$(contabo_api GET "/compute/instances?page=${page}&size=${size}") + local page_ids + page_ids=$(echo "$resp" | jq -r --arg tid "$tid" \ + '.data[] | select(.tags[]? 
| .tagId == ($tid | tonumber)) | .instanceId' 2>/dev/null) + [[ -z "$page_ids" ]] && break + ids="${ids}${ids:+$'\n'}${page_ids}" + local count + count=$(echo "$page_ids" | wc -l) + (( count < size )) && break + ((page++)) || true + done + echo "$ids" +} + +# ══════════════════════════════════════════════════════════════════════ +# INVENTORY +# ══════════════════════════════════════════════════════════════════════ +do_inventory() { + local page=1 size=100 all_data="[]" + + while true; do + local resp + resp=$(contabo_api GET "/compute/instances?page=${page}&size=${size}") + local page_data + page_data=$(echo "$resp" | jq '.data // []' 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < size )) && break + ((page++)) || true + done + + local total + total=$(echo "$all_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No instances found" + + # Filter by tag if specified + if [[ -n "$TAG_ID" ]]; then + all_data=$(echo "$all_data" | jq --arg tid "$TAG_ID" \ + '[.[] | select(.tags[]? | .tagId == ($tid | tonumber))]' 2>/dev/null) + total=$(echo "$all_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No instances found with tag ${TAG_ID}" + fi + + case "$OUTPUT_FORMAT" in + json) + echo "$all_data" | jq '.' 
+ ;; + ansible) + echo "[contabo]" + echo "$all_data" | jq -r \ + '.[] | (.ipConfig.v4.ip // "unknown") + " # " + (.name // .displayName // "unknown") + " id=" + (.instanceId | tostring)' \ + 2>/dev/null + ;; + *) + section_header "Fleet Inventory" + + printf " ${BOLD}%-13s %-20s %-11s %-16s %-8s %-8s${RESET}\n" \ + "INSTANCE_ID" "NAME" "STATUS" "IP" "REGION" "PRODUCT" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + echo "$all_data" | jq -r \ + '.[] | "\(.instanceId)\t\(.name // .displayName // "unknown")\t\(.status // "unknown")\t\(.ipConfig.v4.ip // "—")\t\(.region // "—")\t\(.productId // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r iid name status ip region product; do + printf " %-13s %-20s %-11s %-16s %-8s %-8s\n" \ + "$iid" "${name:0:18}" "$status" "$ip" "${region:0:6}" "$product" + done + + echo "" + field "Total:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HEALTH +# ══════════════════════════════════════════════════════════════════════ +do_health() { + local ids + ids=$(get_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + local running=0 stopped=0 errored=0 total_instances=0 + local results="" + + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + ((total_instances++)) || true + + local resp + resp=$(contabo_api GET "/compute/instances/${iid}") + local name status ip + name=$(echo "$resp" | jq -r '.data[0].name // .data[0].displayName // "unknown"' 2>/dev/null) + status=$(echo "$resp" | jq -r '.data[0].status // "unknown"' 2>/dev/null) + ip=$(echo "$resp" | jq -r '.data[0].ipConfig.v4.ip // ""' 2>/dev/null) + + local ping_result="—" + if [[ "$PING_CHECK" == "true" && -n "$ip" ]]; then + if ping -c 1 -W 3 "$ip" &>/dev/null; then + ping_result="reachable" + else + ping_result="unreachable" + fi + fi + + case "$status" in + running) ((running++)) || true ;; + stopped) ((stopped++)) || true ;; + *) ((errored++)) || true ;; + esac + + 
results="${results}${iid}\t${name}\t${status}\t${ip}\t${ping_result}\n" + done <<< "$ids" + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat < /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} ${iname} (${iid}) ${action} sent" + ((ACTION_OK++)) || true + else + echo -e " ${RED}✗${RESET} ${iname} (${iid}) ${action} failed" + ((ACTION_FAIL++)) || true + fi + + sleep 1 + done <<< "$ids" + + echo "" + field_color "Succeeded:" "${GREEN}${ACTION_OK}${RESET}" + if [[ "$ACTION_FAIL" -gt 0 ]]; then + field_color "Failed:" "${RED}${ACTION_FAIL}${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# TAGS +# ══════════════════════════════════════════════════════════════════════ +do_tags() { + if [[ "$TAG_SUB_MODE" == "list" ]]; then + local resp + resp=$(contabo_api GET "/tags?page=1&size=100") + + if [[ "$OUTPUT_FORMAT" == "json" ]]; then + echo "$resp" | jq '.data // []' + return + fi + + section_header "Tags" + + printf " ${BOLD}%-10s %-30s %-10s${RESET}\n" "TAG_ID" "NAME" "COLOR" + printf " %s\n" "$(printf '%.0s─' {1..52})" + + echo "$resp" | jq -r '.data[] | "\(.tagId)\t\(.name)\t\(.color // "—")"' 2>/dev/null \ + | while IFS=$'\t' read -r tid tname tcolor; do + printf " %-10s %-30s %-10s\n" "$tid" "${tname:0:28}" "$tcolor" + done + elif [[ "$TAG_SUB_MODE" == "filter" ]]; then + [[ -z "$TAG_ID" ]] && die "Specify --filter TAG_ID" + INSTANCE_ID="" + TARGET_ALL="false" + do_inventory + else + die "Specify --list or --filter TAG_ID with --tags" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " 
${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +ALSO_ROTATE="false" +INSTANCE_ID="" +TARGET_ALL="false" +SNAPSHOT_ID="" +KEEP="${CSM_KEEP:-3}" +PREFIX="${CSM_PREFIX:-auto}" +MAX_AGE="${CSM_MAX_AGE:-7}" +OUTPUT_FORMAT="${CSM_FORMAT:-text}" +DRY_RUN="true" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +CONTABO_CLIENT_ID="${CONTABO_CLIENT_ID:-}" +CONTABO_CLIENT_SECRET="${CONTABO_CLIENT_SECRET:-}" +CONTABO_API_USER="${CONTABO_API_USER:-}" +CONTABO_API_PASS="${CONTABO_API_PASS:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +SNAP_CREATED=0 +SNAP_DELETED=0 +SNAP_ERRORS=0 + +# ── API helpers ────────────────────────────────────────────────────── +contabo_token() { + local resp + resp=$(curl -s -d "client_id=${CONTABO_CLIENT_ID}" \ + -d "client_secret=${CONTABO_CLIENT_SECRET}" \ + --data-urlencode "username=${CONTABO_API_USER}" \ + --data-urlencode "password=${CONTABO_API_PASS}" \ + -d "grant_type=password" \ + "https://auth.contabo.com/auth/realms/contabo/protocol/openid-connect/token") + local token + token=$(echo "$resp" | jq -r '.access_token // empty' 2>/dev/null) + if [[ -z "$token" ]]; then + die "Failed to obtain access token — check credentials" + fi + echo "$token" +} + +contabo_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/csm_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer $(contabo_token)" \ + -H "Content-Type: application/json" \ + -H "x-request-id: $(cat 
/proc/sys/kernel/random/uuid 2>/dev/null || date +%s%N)" \ + "https://api.contabo.com/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + cat /tmp/csm_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$CONTABO_CLIENT_ID" ]] && die "CONTABO_CLIENT_ID not set" + [[ -z "$CONTABO_CLIENT_SECRET" ]] && die "CONTABO_CLIENT_SECRET not set" + [[ -z "$CONTABO_API_USER" ]] && die "CONTABO_API_USER not set" + [[ -z "$CONTABO_API_PASS" ]] && die "CONTABO_API_PASS not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Instance helpers ───────────────────────────────────────────────── +get_all_instance_ids() { + local page=1 size=100 ids="" + while true; do + local resp + resp=$(contabo_api GET "/compute/instances?page=${page}&size=${size}") + local page_ids + page_ids=$(echo "$resp" | jq -r '.data[].instanceId' 2>/dev/null) + [[ -z "$page_ids" ]] && break + ids="${ids}${ids:+$'\n'}${page_ids}" + local count + count=$(echo "$page_ids" | wc -l) + (( count < size )) && break + ((page++)) || true + done + echo "$ids" +} + +get_instance_name() { + local iid="$1" + contabo_api GET "/compute/instances/${iid}" \ + | jq -r '.data[0].name // .data[0].displayName // "unknown"' 2>/dev/null +} + +get_instance_ids() { + if [[ "$TARGET_ALL" == "true" ]]; then + get_all_instance_ids + elif [[ -n "$INSTANCE_ID" ]]; then + echo "$INSTANCE_ID" + else + die "Specify --instance ID or --all" + fi +} + +# ── Snapshot helpers ───────────────────────────────────────────────── +get_snapshots() { + local iid="$1" + contabo_api GET 
"/compute/instances/${iid}/snapshots" \ + | jq -r '.data // []' 2>/dev/null +} + +# ══════════════════════════════════════════════════════════════════════ +# SNAPSHOT +# ══════════════════════════════════════════════════════════════════════ +do_snapshot() { + local ids + ids=$(get_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + local count + count=$(echo "$ids" | grep -c . || true) + local target_label="instance ${INSTANCE_ID}" + [[ "$TARGET_ALL" == "true" ]] && target_label="all (${count} instances)" + + section_header "Creating Snapshots" + field "Target:" "$target_label" + field "Prefix:" "$PREFIX" + echo "" + + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + local snap_name + snap_name="${PREFIX}-$(date +%Y%m%d-%H%M%S)" + local iname + iname=$(get_instance_name "$iid") + + verbose "Snapshotting ${iname} (${iid}) as ${snap_name}" + + if contabo_api POST "/compute/instances/${iid}/snapshots" \ + -d "{\"name\": \"${snap_name}\", \"description\": \"Managed by ${SCRIPT_NAME}\"}" > /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} ${iname} (${iid}) ${snap_name}" + ((SNAP_CREATED++)) || true + else + echo -e " ${RED}✗${RESET} ${iname} (${iid}) failed" + ((SNAP_ERRORS++)) || true + fi + + # Brief pause to avoid rate limiting on large fleets + sleep 1 + done <<< "$ids" + + echo "" + field_color "Created:" "${GREEN}${SNAP_CREATED}${RESET}" + if [[ "$SNAP_ERRORS" -gt 0 ]]; then + field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}" + fi + + if [[ "$ALSO_ROTATE" == "true" ]]; then + do_rotate + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# ROTATE +# ══════════════════════════════════════════════════════════════════════ +do_rotate() { + local ids + ids=$(get_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + section_header "Rotating Snapshots" + field "Keep:" "$KEEP per instance" + field "Prefix:" "$PREFIX" + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + field_color "Mode:" 
"${YELLOW}DRY-RUN${RESET} (use --force to delete)" + else + field_color "Mode:" "${RED}LIVE${RESET}" + fi + echo "" + + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + local iname + iname=$(get_instance_name "$iid") + local snaps + snaps=$(get_snapshots "$iid") + + # Filter to managed snapshots (matching prefix), sort by date descending + local managed + managed=$(echo "$snaps" | jq -r \ + --arg prefix "$PREFIX" \ + '[.[] | select(.name | startswith($prefix))] | sort_by(.createdDate) | reverse' \ + 2>/dev/null) + + local total + total=$(echo "$managed" | jq 'length' 2>/dev/null || echo 0) + + if (( total <= KEEP )); then + verbose "${iname}: ${total} managed snapshots, keeping all" + continue + fi + + local to_delete + to_delete=$(echo "$managed" | jq -r ".[$KEEP:][] | .snapshotId" 2>/dev/null) + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + echo -e " ${YELLOW}⊘${RESET} Would delete: ${iname} (${iid}) → ${sid}" + else + if contabo_api DELETE "/compute/instances/${iid}/snapshots/${sid}" > /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} Deleted: ${iname} (${iid}) → ${sid}" + ((SNAP_DELETED++)) || true + else + echo -e " ${RED}✗${RESET} Failed: ${iname} (${iid}) → ${sid}" + ((SNAP_ERRORS++)) || true + fi + sleep 1 + fi + done <<< "$to_delete" + done <<< "$ids" + + echo "" + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + log "Dry-run complete — use --force to execute" + else + field_color "Deleted:" "${GREEN}${SNAP_DELETED}${RESET}" + if [[ "$SNAP_ERRORS" -gt 0 ]]; then + field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}" + fi + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST +# ══════════════════════════════════════════════════════════════════════ +do_list() { + local ids + ids=$(get_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + section_header "Snapshots" + + printf " ${BOLD}%-8s %-18s %-28s 
%-22s${RESET}\n" "INST" "SNAPSHOT ID" "NAME" "CREATED" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + local total=0 + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + local snaps + snaps=$(get_snapshots "$iid") + + echo "$snaps" | jq -r --arg iid "$iid" \ + '.[] | "\($iid)\t\(.snapshotId)\t\(.name)\t\(.createdDate)"' 2>/dev/null \ + | while IFS=$'\t' read -r inst sid name created; do + printf " %-8s %-18s %-28s %-22s\n" "$inst" "$sid" "${name:0:26}" "${created:0:20}" + ((total++)) 2>/dev/null || true + done + done <<< "$ids" +} + +# ══════════════════════════════════════════════════════════════════════ +# AUDIT +# ══════════════════════════════════════════════════════════════════════ +do_audit() { + local ids + ids=$(get_all_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + section_header "Snapshot Audit" + + printf " ${BOLD}%-20s %-20s %6s %6s %-12s${RESET}\n" \ + "INSTANCE" "LATEST SNAPSHOT" "AGE" "COUNT" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..68})" + + local protected=0 stale=0 unprotected=0 total_instances=0 + local now + now=$(date +%s) + + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + ((total_instances++)) || true + + local iname + iname=$(get_instance_name "$iid") + local snaps + snaps=$(get_snapshots "$iid") + local snap_count + snap_count=$(echo "$snaps" | jq 'length' 2>/dev/null || echo 0) + + if [[ "$snap_count" -eq 0 ]]; then + printf " %-20s %-20s %6s %6s " "${iname:0:18}" "(none)" "—" "0" + echo -e "${RED}✗ Unprotected${RESET}" + ((unprotected++)) || true + continue + fi + + local latest + latest=$(echo "$snaps" | jq -r \ + '[.[] | select(.name)] | sort_by(.createdDate) | last' 2>/dev/null) + local latest_name latest_date + latest_name=$(echo "$latest" | jq -r '.name // "unknown"' 2>/dev/null) + latest_date=$(echo "$latest" | jq -r '.createdDate // ""' 2>/dev/null) + + local age_days="?" 
+ if [[ -n "$latest_date" ]]; then + local snap_epoch + snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + age_days=$(( (now - snap_epoch) / 86400 )) + fi + fi + + local status_str status_color + if [[ "$age_days" != "?" ]] && (( age_days > MAX_AGE )); then + status_str="⚠ Stale" + status_color="$YELLOW" + ((stale++)) || true + else + status_str="✓ OK" + status_color="$GREEN" + ((protected++)) || true + fi + + printf " %-20s %-20s %5sd %6s " \ + "${iname:0:18}" "${latest_name:0:18}" "$age_days" "$snap_count" + echo -e "${status_color}${status_str}${RESET}" + done <<< "$ids" + + echo "" + field "Instances:" "$total_instances" + field_color "Protected:" "${GREEN}${protected}${RESET}" + if [[ "$stale" -gt 0 ]]; then + field_color "Stale (>${MAX_AGE}d):" "${YELLOW}${stale}${RESET}" + fi + if [[ "$unprotected" -gt 0 ]]; then + field_color "Unprotected:" "${RED}${unprotected}${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# RESTORE +# ══════════════════════════════════════════════════════════════════════ +do_restore() { + [[ -z "$INSTANCE_ID" ]] && die "Specify --instance ID" + [[ -z "$SNAPSHOT_ID" ]] && die "Specify --snapshot-id ID" + + local iname + iname=$(get_instance_name "$INSTANCE_ID") + + section_header "Restore Snapshot" + field "Instance:" "${iname} (${INSTANCE_ID})" + field "Snapshot:" "$SNAPSHOT_ID" + echo "" + + if [[ "$FORCE" != "true" ]]; then + echo -e " ${RED}WARNING: This will revert the instance to the snapshot state.${RESET}" + echo -e " ${RED}All changes since the snapshot will be lost.${RESET}" + echo "" + read -r -p " Type 'yes' to confirm: " confirm + if [[ "$confirm" != "yes" ]]; then + log "Restore cancelled" + return + fi + fi + + if contabo_api POST "/compute/instances/${INSTANCE_ID}/snapshots/${SNAPSHOT_ID}" \ + -d '{}' > /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} Restore initiated — instance will revert to ${SNAPSHOT_ID}" + log "Monitor 
instance status — revert may take several minutes" + else + echo -e " ${RED}✗${RESET} Restore failed" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# STATUS +# ══════════════════════════════════════════════════════════════════════ +do_status() { + local ids + ids=$(get_all_instance_ids) + [[ -z "$ids" ]] && die "No instances found" + + local total_instances=0 total_snaps=0 + local protected=0 stale=0 unprotected=0 + local now + now=$(date +%s) + + while IFS= read -r iid; do + [[ -z "$iid" ]] && continue + ((total_instances++)) || true + + local snaps + snaps=$(get_snapshots "$iid") + local snap_count + snap_count=$(echo "$snaps" | jq 'length' 2>/dev/null || echo 0) + total_snaps=$(( total_snaps + snap_count )) + + if [[ "$snap_count" -eq 0 ]]; then + ((unprotected++)) || true + continue + fi + + local latest_date + latest_date=$(echo "$snaps" | jq -r \ + '[.[] | select(.createdDate)] | sort_by(.createdDate) | last | .createdDate // ""' \ + 2>/dev/null) + + if [[ -n "$latest_date" ]]; then + local snap_epoch + snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + local age_days=$(( (now - snap_epoch) / 86400 )) + if (( age_days > MAX_AGE )); then + ((stale++)) || true + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + done <<< "$ids" + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat <${MAX_AGE}d):" "${YELLOW}${stale}${RESET}" + else + field_color "Stale (>${MAX_AGE}d):" "${GREEN}0${RESET}" + fi + if [[ "$unprotected" -gt 0 ]]; then + field_color "Unprotected:" "${RED}${unprotected}${RESET}" + else + field_color "Unprotected:" "${GREEN}0${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2 + exit "$exit_code" +} + +trap 'handle_error $? 
$LINENO' ERR + +debug_echo() { + if [[ -n "$DEBUG" ]]; then + echo "[DEBUG] $*" >&2 + fi +} + +show_help() { + cat << EOF +Usage: $SCRIPT_NAME [OPTIONS] + +Container health metrics collector for Prometheus node_exporter textfile directory. + +Collects per-container health check status, image age, restart counts, exit codes, +and running state via docker inspect and writes them as Prometheus metrics. + +OPTIONS: + --once Run collection once and exit (default) + --daemon Run continuously at COLLECTION_INTERVAL + --help, -h Show this help message + +ENVIRONMENT VARIABLES: + NODE_DIR Node exporter textfile directory (default: $DEFAULT_NODE_DIR) + COLLECTION_INTERVAL Seconds between collections in daemon mode (default: $DEFAULT_COLLECTION_INTERVAL) + DEBUG Enable debug output + +EXAMPLES: + $SCRIPT_NAME --once + $SCRIPT_NAME --daemon + COLLECTION_INTERVAL=30 $SCRIPT_NAME --daemon + +OUTPUT: + Writes metrics to \$NODE_DIR/textfile_collector/container_health.prom + +EOF + exit 0 +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --once) RUN_MODE="once"; shift ;; + --daemon) RUN_MODE="daemon"; shift ;; + --help|-h) show_help ;; + *) echo "Unknown option: $1" >&2; show_help ;; + esac +done + +# Validate configuration +validate_config() { + if ! command -v docker &>/dev/null; then + echo "Error: docker is not installed or not in PATH" >&2 + exit 1 + fi + + local textfile_dir="${NODE_DIR}/textfile_collector" + if [[ ! -d "$textfile_dir" ]]; then + echo "Error: Textfile collector directory not found: $textfile_dir" >&2 + echo "Create it: sudo mkdir -p $textfile_dir" >&2 + exit 1 + fi +} + +# Collect metrics for all containers +collect_all() { + local output_dir="${NODE_DIR}/textfile_collector" + local output_file="${output_dir}/container_health.prom" + local temp_file + temp_file=$(mktemp "${output_file}.XXXXXX") + + local start_time + start_time=$(date +%s%N) + local success=1 + + debug_echo "Starting collection..." 
+ + { + local containers + containers=$(docker ps -a --format '{{.Names}}') + + if [[ -z "$containers" ]]; then + debug_echo "No containers found" + fi + + # Per-container metrics headers + echo "# HELP container_health_status Health check status of the container (1 for current status)." + echo "# TYPE container_health_status gauge" + echo "# HELP container_image_age_seconds Age of the container image in seconds." + echo "# TYPE container_image_age_seconds gauge" + echo "# HELP container_restart_count Number of container restarts." + echo "# TYPE container_restart_count gauge" + echo "# HELP container_exit_code Exit code of the container." + echo "# TYPE container_exit_code gauge" + echo "# HELP container_running Whether the container is running (1=running, 0=stopped)." + echo "# TYPE container_running gauge" + + local now + now=$(date +%s) + + while IFS= read -r container_name; do + [[ -z "$container_name" ]] && continue + + debug_echo "Inspecting container: $container_name" + + # Extract all fields in a single docker inspect call + local inspect_data + inspect_data=$(docker inspect --format \ + '{{.Config.Image}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}|{{.Created}}|{{.RestartCount}}|{{.State.ExitCode}}|{{.State.Running}}' \ + "$container_name" 2>/dev/null) || { + debug_echo "Failed to inspect container: $container_name" + success=0 + continue + } + + local image health_status created restart_count exit_code running_raw + IFS='|' read -r image health_status created restart_count exit_code running_raw <<< "$inspect_data" + + # Calculate image age in seconds + local created_epoch image_age + created_epoch=$(date -d "$created" +%s 2>/dev/null) || created_epoch=0 + image_age=$((now - created_epoch)) + + # Convert running boolean to 0/1 + local running=0 + if [[ "$running_raw" == "true" ]]; then + running=1 + fi + + # Health status — emit a 1 for the current status, 0 for others + for status in healthy unhealthy starting none; do + if [[ 
"$health_status" == "$status" ]]; then + echo "container_health_status{name=\"${container_name}\",image=\"${image}\",status=\"${status}\"} 1" + else + echo "container_health_status{name=\"${container_name}\",image=\"${image}\",status=\"${status}\"} 0" + fi + done + + echo "container_image_age_seconds{name=\"${container_name}\",image=\"${image}\"} ${image_age}" + echo "container_restart_count{name=\"${container_name}\",image=\"${image}\"} ${restart_count}" + echo "container_exit_code{name=\"${container_name}\",image=\"${image}\"} ${exit_code}" + echo "container_running{name=\"${container_name}\",image=\"${image}\"} ${running}" + + done <<< "$containers" + + # Exporter metadata + local end_time duration + end_time=$(date +%s%N) + duration=$(awk "BEGIN {printf \"%.4f\", ($end_time - $start_time) / 1000000000}") + + echo "" + echo "# HELP container_health_exporter_duration_seconds Time taken to collect metrics." + echo "# TYPE container_health_exporter_duration_seconds gauge" + echo "container_health_exporter_duration_seconds ${duration}" + echo "" + echo "# HELP container_health_exporter_last_run_timestamp Unix timestamp of last collection." + echo "# TYPE container_health_exporter_last_run_timestamp gauge" + echo "container_health_exporter_last_run_timestamp $(date +%s)" + echo "" + echo "# HELP container_health_exporter_success Whether the last collection succeeded (1=success, 0=failure)." + echo "# TYPE container_health_exporter_success gauge" + echo "container_health_exporter_success ${success}" + + } > "$temp_file" 2>/dev/null + + mv "$temp_file" "$output_file" + + debug_echo "Collection complete. 
Wrote to $output_file" +} + +# Main +main() { + validate_config + + case "$RUN_MODE" in + once) + collect_all + ;; + daemon) + echo "$SCRIPT_NAME running in daemon mode (interval: ${COLLECTION_INTERVAL}s)" + while true; do + collect_all + sleep "$COLLECTION_INTERVAL" + done + ;; + esac +} + +main diff --git a/container-update-checker.sh b/container-update-checker.sh new file mode 100755 index 0000000..74d1b5a --- /dev/null +++ b/container-update-checker.sh @@ -0,0 +1,410 @@ +#!/usr/bin/env bash + +######################################################################################### +#### container-update-checker.sh — Check Docker/Podman containers for image updates #### +#### Compares local image digests against remote registry digests #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./container-update-checker.sh #### +#### ./container-update-checker.sh --docker --filter nginx #### +#### ./container-update-checker.sh --json --quiet #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +RUNTIME="${CONTAINER_RUNTIME:-auto}" +TIMEOUT="${REGISTRY_TIMEOUT:-10}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +JSON_OUTPUT="false" +QUIET="false" +FILTER="" +LABEL="" +TEXTFILE_DIR="/var/lib/node_exporter" +PROM_FILE="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +COUNT_CURRENT=0 +COUNT_UPDATE=0 +COUNT_ERROR=0 +COUNT_TOTAL=0 +JSON_ITEMS="" +PROM_LINES="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[0;33m' + BOLD='\033[1m' DIM='\033[2m' RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BOLD="" DIM="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*" >&2; fi; } + +# ── Runtime Detection ───────────────────────────────────────────────── +detect_runtime() { + if [[ "$RUNTIME" == "docker" || "$RUNTIME" == "podman" ]]; then + if ! 
command -v "$RUNTIME" &>/dev/null; then + err "${RUNTIME^} not found"; exit 2 + fi + return + fi + if command -v docker &>/dev/null && docker info &>/dev/null; then + RUNTIME="docker" + elif command -v podman &>/dev/null; then + RUNTIME="podman" + else + err "Neither Docker nor Podman found"; exit 2 + fi + verbose "Auto-detected runtime: ${RUNTIME}" +} + +# ── Auth Helper ─────────────────────────────────────────────────────── +get_auth_header() { + local registry="$1" config_file="" + if [[ "$RUNTIME" == "podman" ]]; then + config_file="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}/containers/auth.json" + [[ -f "$config_file" ]] || config_file="${HOME}/.config/containers/auth.json" + fi + [[ -f "${config_file:-}" ]] || config_file="${HOME}/.docker/config.json" + [[ -f "$config_file" ]] || return 0 + local auth + auth=$(grep -A1 "\"${registry}\"" "$config_file" 2>/dev/null \ + | grep '"auth"' | head -1 | sed 's/.*"auth"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') || true + if [[ -n "$auth" ]]; then + echo "Authorization: Basic ${auth}" + fi +} + +# ── Parse Image Reference ──────────────────────────────────────────── +parse_image_ref() { + local image="$1" registry="" path="" tag="" + local without_tag="${image%%@*}" + if [[ "$without_tag" == *:* && "${without_tag##*:}" != */* ]]; then + tag="${without_tag##*:}" + without_tag="${without_tag%:*}" + fi + [[ -z "$tag" ]] && tag="latest" + if [[ "$without_tag" == *"."*"/"* ]] || [[ "$without_tag" == *":"*"/"* ]] || [[ "$without_tag" == "localhost/"* ]]; then + registry="${without_tag%%/*}" + path="${without_tag#*/}" + else + registry="docker.io" + [[ "$without_tag" == *"/"* ]] && path="$without_tag" || path="library/${without_tag}" + fi + echo "${registry}" "${path}" "${tag}" +} + +# ── Get Local Digest ───────────────────────────────────────────────── +get_local_digest() { + local image="$1" digest + digest=$($RUNTIME image inspect "$image" --format '{{index .RepoDigests 0}}' 2>/dev/null) || true + if [[ -n "$digest" && 
"$digest" == *"@"* ]]; then + echo "${digest##*@}"; return + fi + digest=$($RUNTIME image inspect "$image" --format '{{.Id}}' 2>/dev/null) || true + echo "${digest:-}" +} + +# ── Extract JSON Value (pure bash, no python/jq) ───────────────────── +json_value() { + local key="$1" + sed -n "s/.*\"${key}\"[[:space:]]*:[[:space:]]*\"\([^\"]*\)\".*/\1/p" | head -1 +} + +# ── Get Remote Digest via Skopeo ────────────────────────────────────── +get_remote_digest_skopeo() { + local registry="$1" path="$2" tag="$3" + local digest + digest=$(timeout "$TIMEOUT" skopeo inspect --no-tags "docker://${registry}/${path}:${tag}" 2>/dev/null \ + | json_value "Digest") || true + echo "${digest:-}" +} + +# ── Get Remote Digest via Curl ──────────────────────────────────────── +get_remote_digest_curl() { + local registry="$1" path="$2" tag="$3" + local token="" digest="" + if [[ "$registry" == "docker.io" || "$registry" == "registry-1.docker.io" ]]; then + token=$(curl -sf --max-time "$TIMEOUT" \ + "https://auth.docker.io/token?service=registry.docker.io&scope=repository:${path}:pull" \ + | json_value "token") || true + [[ -z "$token" ]] && return + digest=$(curl -sf --max-time "$TIMEOUT" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + -H "Accept: application/vnd.oci.image.index.v1+json" \ + -H "Authorization: Bearer ${token}" \ + "https://registry-1.docker.io/v2/${path}/manifests/${tag}" \ + -o /dev/null -D - 2>/dev/null \ + | grep -i "docker-content-digest" | tr -d '\r' | awk '{print $2}') || true + else + local auth_hdr auth_args=() + auth_hdr=$(get_auth_header "$registry") + [[ -n "$auth_hdr" ]] && auth_args=(-H "$auth_hdr") + digest=$(curl -sf --max-time "$TIMEOUT" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + -H "Accept: application/vnd.oci.image.index.v1+json" \ + "${auth_args[@]+"${auth_args[@]}"}" \ + "https://${registry}/v2/${path}/manifests/${tag}" \ + -o /dev/null -D - 2>/dev/null \ + | grep -i "docker-content-digest" | 
tr -d '\r' | awk '{print $2}') || true + fi + echo "${digest:-}" +} +# ── Get Remote Digest (skopeo then curl fallback) ───────────────────── +get_remote_digest() { + local registry="$1" path="$2" tag="$3" digest="" + if command -v skopeo &>/dev/null; then + verbose "Trying skopeo for ${registry}/${path}:${tag}" + digest=$(get_remote_digest_skopeo "$registry" "$path" "$tag") + fi + if [[ -z "$digest" ]]; then + verbose "Trying curl fallback for ${registry}/${path}:${tag}" + digest=$(get_remote_digest_curl "$registry" "$path" "$tag") + fi + echo "${digest:-}" +} + +# ── Check Single Container ──────────────────────────────────────────── +check_container() { + local name="$1" image="$2" + local status="" local_digest="" remote_digest="" registry path tag + read -r registry path tag <<< "$(parse_image_ref "$image")" + verbose "Container=${name} image=${image} registry=${registry} path=${path} tag=${tag}" + local_digest=$(get_local_digest "$image") + verbose "Local digest: ${local_digest:-none}" + if [[ -z "$local_digest" ]]; then + status="error" + else + remote_digest=$(get_remote_digest "$registry" "$path" "$tag") + verbose "Remote digest: ${remote_digest:-none}" + if [[ -z "$remote_digest" ]]; then + status="error" + elif [[ "$local_digest" == "$remote_digest" ]]; then + status="current" + else + status="update" + fi + fi + COUNT_TOTAL=$((COUNT_TOTAL + 1)) + case "$status" in + current) COUNT_CURRENT=$((COUNT_CURRENT + 1)) ;; + update) COUNT_UPDATE=$((COUNT_UPDATE + 1)) ;; + error) COUNT_ERROR=$((COUNT_ERROR + 1)) ;; + esac + if [[ -n "$PROM_FILE" ]]; then + local val=1; [[ "$status" == "update" ]] && val=0 + PROM_LINES+="container_image_up_to_date{name=\"${name}\",image=\"${image}\"} ${val}"$'\n' + fi + [[ "$QUIET" == "true" && "$status" != "update" ]] && return + if [[ "$JSON_OUTPUT" == "true" ]]; then + local item + item=$(printf '{"container":"%s","image":"%s","status":"%s"}' "$name" "$image" "$status") + [[ -n "$JSON_ITEMS" ]] && 
JSON_ITEMS="${JSON_ITEMS},${item}" || JSON_ITEMS="${item}" + else + local color symbol + case "$status" in + current) color="$GREEN"; symbol="up-to-date" ;; + update) color="$YELLOW"; symbol="update available" ;; + error) color="$RED"; symbol="check failed" ;; + *) color=""; symbol="?" ;; + esac + printf " %-30s %-40s %b%s%b\n" "$name" "$image" "$color" "$symbol" "$RESET" + fi +} + +# ── List Containers ─────────────────────────────────────────────────── +list_containers() { + local filter_args=() + [[ -n "$LABEL" ]] && filter_args+=(--filter "label=${LABEL}") + $RUNTIME ps --format '{{.Names}}\t{{.Image}}' "${filter_args[@]}" 2>/dev/null +} + +# ── Write Prometheus Metrics ────────────────────────────────────────── +write_prom_metrics() { + local file="$1" + local output_dir + output_dir="$(dirname "$file")" + mkdir -p "$output_dir" + local tmp + tmp=$(mktemp "${output_dir}/.container_updates.XXXXXX") + { + echo "# HELP container_image_up_to_date Whether the container image is up to date (1=yes, 0=no)" + echo "# TYPE container_image_up_to_date gauge" + printf '%s' "$PROM_LINES" + echo "# HELP container_update_check_timestamp Unix timestamp of last update check" + echo "# TYPE container_update_check_timestamp gauge" + echo "container_update_check_timestamp $(date +%s)" + echo "# HELP container_update_check_total Total containers checked" + echo "# TYPE container_update_check_total gauge" + echo "container_update_check_total ${COUNT_TOTAL}" + echo "# HELP container_update_available_total Containers with updates available" + echo "# TYPE container_update_available_total gauge" + echo "container_update_available_total ${COUNT_UPDATE}" + } > "$tmp" + chmod 644 "$tmp" + mv -f "$tmp" "$file" + verbose "Metrics written to ${file}" +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + exit 2 ;; + *) + err "Unexpected argument: $1" + echo "Run 
${SCRIPT_NAME} --help for usage" >&2 + exit 2 ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + detect_runtime + + local containers=() + while IFS=$'\t' read -r name image; do + [[ -z "$name" ]] && continue + [[ -n "$FILTER" && "$name" != *"${FILTER}"* ]] && continue + containers+=("${name} ${image}") + done < <(list_containers) + + if [[ ${#containers[@]} -eq 0 ]]; then + if [[ "$JSON_OUTPUT" == "true" ]]; then + echo '{"results":[],"summary":{"total":0,"current":0,"update_available":0,"errors":0}}' + else + warn "No running containers found" + fi + exit 0 + fi + + verbose "Found ${#containers[@]} containers to check" + + if [[ "$JSON_OUTPUT" != "true" ]]; then + echo "" + echo -e "${BOLD}Container Update Checker${RESET}" + echo -e "${DIM}Runtime: ${RUNTIME} | Timeout: ${TIMEOUT}s${RESET}" + echo "" + printf " ${BOLD}%-30s %-40s %s${RESET}\n" "CONTAINER" "IMAGE" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..82})" + fi + + for entry in "${containers[@]}"; do + check_container "${entry%% *}" "${entry#* }" + done + + if [[ "$JSON_OUTPUT" == "true" ]]; then + printf '{"results":[%s],"summary":{"total":%d,"current":%d,"update_available":%d,"errors":%d}}\n' \ + "$JSON_ITEMS" "$COUNT_TOTAL" "$COUNT_CURRENT" "$COUNT_UPDATE" "$COUNT_ERROR" + else + echo "" + echo -e " ${BOLD}Summary${RESET}" + printf " %-20s %d\n" "Total checked:" "$COUNT_TOTAL" + printf " %-20s %b%d%b\n" "Up-to-date:" "$GREEN" "$COUNT_CURRENT" "$RESET" + printf " %-20s %b%d%b\n" "Update available:" "$YELLOW" "$COUNT_UPDATE" "$RESET" + printf " %-20s %b%d%b\n" "Errors:" "$RED" "$COUNT_ERROR" "$RESET" + echo "" + fi + + [[ -n "$PROM_FILE" ]] && write_prom_metrics "$PROM_FILE" + + if [[ "$COUNT_ERROR" -gt 0 ]]; then exit 2 + elif [[ "$COUNT_UPDATE" -gt 0 ]]; then exit 1 + fi + exit 0 +} + +main "$@" diff --git a/coolify-exporter.sh 
b/coolify-exporter.sh new file mode 100644 index 0000000..2f1e503 --- /dev/null +++ b/coolify-exporter.sh @@ -0,0 +1,505 @@ +#!/bin/bash +################################################################################ +# Script Name: coolify-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Coolify PaaS providing operational +# metrics via the Coolify API — application status, deployment +# counts, database health, server info, SSL certificate expiry, +# and API health +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Coolify instance running with API enabled +# - Coolify API token (generate in Settings → API Tokens) +# - curl for API calls +# - jq for JSON parsing +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# ./coolify-exporter.sh +# +# # HTTP server mode +# ./coolify-exporter.sh --http -p 9196 +# +# # Textfile collector mode +# ./coolify-exporter.sh --textfile +# +# # Custom API token and URL +# ./coolify-exporter.sh --api-url http://coolify.local:8000 --api-token mytoken +# +# Metrics Exported: +# - coolify_up - API reachability (1=up, 0=down) +# - coolify_info{version} - Coolify version info +# - coolify_applications_total - Total application count +# - coolify_applications_by_status{status} - Applications by status +# - coolify_deployments_total - Total deployments +# - coolify_deployments_running - Currently running deployments +# - coolify_deployments_failed_total - Total failed deployments +# - coolify_databases_total - Total managed databases +# - coolify_databases_running - Running databases +# - coolify_servers_total - Total servers managed +# - coolify_servers_reachable - Reachable servers +# - coolify_services_total - Total services +# - coolify_services_running - Running services +# - coolify_exporter_duration_seconds - Script execution time +# - coolify_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# 
Default HTTP port: 9196 +# Default API URL: http://localhost:8000 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9196 +API_URL="http://localhost:8000" +API_TOKEN="" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Check prerequisites +# Returns: 0 if OK, 1 if error +check_prerequisites() { + if ! command -v curl >/dev/null 2>&1; then + echo "ERROR: curl not found" >&2 + return 1 + fi + + if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq not found (required for JSON parsing)" >&2 + return 1 + fi + + if [ -z "$API_TOKEN" ]; then + echo "ERROR: --api-token is required" >&2 + return 1 + fi + + return 0 +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# Make an authenticated API call +# Args: $1 - API endpoint path (e.g., /api/v1/applications) +# Returns: JSON response on stdout +api_call() { + local endpoint="$1" + curl -s -X GET \ + -H "Authorization: Bearer ${API_TOKEN}" \ + -H "Accept: application/json" \ + "${API_URL}${endpoint}" 2>/dev/null +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: 
Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check prerequisites + if ! check_prerequisites; then + cat </dev/null) + + if [ -z "$version_response" ]; then + cat </dev/null) + + if [ -z "$coolify_version" ] || [ "$coolify_version" = "null" ]; then + cat </dev/null) + running_apps=$(echo "$apps_response" | jq '[.[] | select(.status == "running")] | length' 2>/dev/null) + stopped_apps=$(echo "$apps_response" | jq '[.[] | select(.status == "stopped" or .status == "exited")] | length' 2>/dev/null) + exited_apps=$(echo "$apps_response" | jq '[.[] | select(.status == "restarting" or .status == "degraded")] | length' 2>/dev/null) + total_apps=${total_apps:-0} + running_apps=${running_apps:-0} + stopped_apps=${stopped_apps:-0} + exited_apps=${exited_apps:-0} + fi + + cat </dev/null) + running_deployments=$(echo "$deployments_response" | jq '[.[] | select(.status == "in_progress")] | length' 2>/dev/null) + failed_deployments=$(echo "$deployments_response" | jq '[.[] | select(.status == "failed" or .status == "error")] | length' 2>/dev/null) + queued_deployments=$(echo "$deployments_response" | jq '[.[] | select(.status == "queued")] | length' 2>/dev/null) + total_deployments=${total_deployments:-0} + running_deployments=${running_deployments:-0} + failed_deployments=${failed_deployments:-0} + queued_deployments=${queued_deployments:-0} + fi + + cat </dev/null) + running_databases=$(echo "$databases_response" | jq '[.[] | select(.status == "running")] | length' 2>/dev/null) + total_databases=${total_databases:-0} + running_databases=${running_databases:-0} + fi + + cat </dev/null) + reachable_servers=$(echo "$servers_response" | jq '[.[] | select(.settings.is_reachable == true)] | length' 2>/dev/null) + total_servers=${total_servers:-0} + reachable_servers=${reachable_servers:-0} + fi + + cat </dev/null) + running_services=$(echo "$services_response" | jq '[.[] | select(.status == "running")] | length' 
2>/dev/null) + total_services=${total_services:-0} + running_services=${running_services:-0} + fi + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Coolify Exporter v1.0 + +

Coolify Prometheus Exporter v1.0

+

Metrics

+

Operational metrics from the Coolify API.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.coolify_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/create_swap.sh b/create_swap.sh index c704114..ffc561d 100644 --- a/create_swap.sh +++ b/create_swap.sh @@ -5,10 +5,13 @@ #### #### #### Author: Phil Connor #### #### Contact: pconnor@ara.com #### -#### Version 3.50.20250729 #### +#### Version 3.51.20250729 #### #### #### #### Created 06/01/2023 #### ############################################## +# v3.51 changes: +# - Fixed: grep in pipeline crashes under set -euo 
pipefail when no matches found. Added || true guard +############################################## # Exit on any error, undefined variables, and pipe failures set -euo pipefail @@ -68,7 +71,7 @@ detect_os() { get_memory_gb() { local mem_kb # Extract memory from /proc/meminfo (in KB) - mem_kb=$(grep MemTotal /proc/meminfo | awk '{print $2}') + mem_kb=$({ grep MemTotal /proc/meminfo || true; } | awk '{print $2}') if [[ -z "$mem_kb" || "$mem_kb" -eq 0 ]]; then error "Unable to determine system memory" diff --git a/cron-doctor.sh b/cron-doctor.sh new file mode 100644 index 0000000..ff1846b --- /dev/null +++ b/cron-doctor.sh @@ -0,0 +1,522 @@ +#!/usr/bin/env bash + +######################################################################################### +#### cron-doctor.sh — Diagnose common cron and systemd timer problems #### +#### Checks PATH, missing binaries, unescaped %, output redirection, permissions, #### +#### overlap risk, and failed timer services #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.0 #### +#### #### +#### Usage: #### +#### ./cron-doctor.sh #### +#### ./cron-doctor.sh --user admin #### +#### ./cron-doctor.sh --fix-suggestions #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +TARGET_USER="" +FIX_SUGGESTIONS=false +CRON_ONLY=false +TIMERS_ONLY=false + +# ── Counters ────────────────────────────────────────────────────────── +WARN_COUNT=0 +FAIL_COUNT=0 +INFO_COUNT=0 +FAIL_MESSAGES=() +WARN_MESSAGES=() + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[0;33m' + CYAN='\033[0;36m'; BOLD='\033[1m'; DIM='\033[2m'; RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e " ${GREEN}[OK]${RESET} $*"; } +warn() { echo -e " ${YELLOW}[WARN]${RESET} $*"; WARN_MESSAGES+=("$*"); (( WARN_COUNT++ )) || true; } +fail() { echo -e " ${RED}[FAIL]${RESET} $*"; FAIL_MESSAGES+=("$*"); (( FAIL_COUNT++ )) || true; } +info() { echo -e " ${CYAN}[INFO]${RESET} $*"; (( INFO_COUNT++ )) || true; } +suggest() { [[ "$FIX_SUGGESTIONS" == "true" ]] && echo -e " ${DIM}→ $*${RESET}"; return 0; } +verbose() { [[ "$VERBOSE" == "true" ]] && echo -e " ${DIM}[DEBUG]${RESET} $*"; return 0; } + +section() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat <&2; usage ;; + esac +done + +setup_colors # re-init in case --no-color was passed + +# ── Detect crontab directory ────────────────────────────────────────── +detect_cron_spool() { + if [[ -d /var/spool/cron/crontabs ]]; then + echo 
"/var/spool/cron/crontabs" # Debian/Ubuntu + elif [[ -d /var/spool/cron ]]; then + echo "/var/spool/cron" # RHEL/Rocky + else + echo "" + fi +} + +CRON_SPOOL="$(detect_cron_spool)" + +# ── Get list of crontab files to check ──────────────────────────────── +get_crontab_files() { + local files=() + + if [[ -n "$CRON_SPOOL" ]]; then + if [[ -n "$TARGET_USER" ]]; then + [[ -f "$CRON_SPOOL/$TARGET_USER" ]] && files+=("$CRON_SPOOL/$TARGET_USER") + else + for f in "$CRON_SPOOL"/*; do + [[ -f "$f" ]] && files+=("$f") + done + fi + fi + + printf '%s\n' "${files[@]}" 2>/dev/null || true +} + +# ── Parse cron entries from a file ──────────────────────────────────── +# Outputs: schedule|command (skips comments, blanks, variables) +parse_cron_entries() { + local file="$1" has_user_field="${2:-false}" + + while IFS= read -r line; do + # skip comments and blank lines + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ "$line" =~ ^[[:space:]]*$ ]] && continue + # skip variable assignments (MAILTO=, PATH=, SHELL=, etc.) 
+ [[ "$line" =~ ^[[:space:]]*[A-Za-z_]+= ]] && continue + + if [[ "$has_user_field" == "true" ]]; then + # system crontab: min hour dom mon dow user command + echo "$line" | awk '{ + if ($1 ~ /^@/) { sched=$1; user=$2; cmd=""; for(i=3;i<=NF;i++) cmd=cmd" "$i } + else { sched=$1" "$2" "$3" "$4" "$5; user=$6; cmd=""; for(i=7;i<=NF;i++) cmd=cmd" "$i } + gsub(/^[[:space:]]+/, "", cmd) + print sched"|"cmd + }' + else + # user crontab: min hour dom mon dow command + echo "$line" | awk '{ + if ($1 ~ /^@/) { sched=$1; cmd=""; for(i=2;i<=NF;i++) cmd=cmd" "$i } + else { sched=$1" "$2" "$3" "$4" "$5; cmd=""; for(i=6;i<=NF;i++) cmd=cmd" "$i } + gsub(/^[[:space:]]+/, "", cmd) + print sched"|"cmd + }' + fi + done < "$file" +} + +# ── Check: crontab environment (PATH) ───────────────────────────────── +check_cron_environment() { + local file="$1" label="$2" + local has_path=false has_mailto=false + + while IFS= read -r line; do + [[ "$line" =~ ^[[:space:]]*PATH= ]] && has_path=true + [[ "$line" =~ ^[[:space:]]*MAILTO= ]] && has_mailto=true + done < "$file" + + if [[ "$has_path" == "false" ]]; then + warn "${label}: no PATH set — cron uses /usr/bin:/bin only" + suggest "Add to top of crontab: PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + else + verbose "${label}: PATH is set" + fi + + if [[ "$has_mailto" == "false" ]]; then + local has_unredirected=false + while IFS='|' read -r _sched cmd; do + [[ -z "$cmd" ]] && continue + if ! 
echo "$cmd" | grep -qE '>\s*/|>\s*&|2>&1|>/dev/null'; then + has_unredirected=true + break + fi + done < <(parse_cron_entries "$file" false) + + if [[ "$has_unredirected" == "true" ]]; then + warn "${label}: no MAILTO and some jobs lack output redirection — output may be lost" + suggest "Add MAILTO=admin@example.com or redirect: command >> /var/log/job.log 2>&1" + fi + fi +} + +# ── Check: missing binaries ─────────────────────────────────────────── +check_missing_binaries() { + local file="$1" label="$2" has_user="${3:-false}" + + while IFS='|' read -r _sched cmd; do + [[ -z "$cmd" ]] && continue + + # extract the first word (binary) — handle cd/env/sudo/flock prefixes + local binary + binary=$(echo "$cmd" | sed -E ' + s#^(cd [^ ;]+[; ]+(&&[[:space:]]*)?)## + s#^(sudo (-u [^ ]+ )?)## + s#^(env (-i )?([A-Za-z_]+=[^ ]+ )*)## + s#^(/usr/bin/flock [^ ]+ )## + s#^(/bin/sh -c |/bin/bash -c )## + ' | awk '{print $1}') + + # strip trailing shell metacharacters (;, &&, ||, |) + binary="${binary%%[;&|]*}" + + [[ -z "$binary" ]] && continue + # skip shell builtins + [[ "$binary" =~ ^(test|true|false|echo|cd|source|\[|\[\[)$ ]] && continue + + # if it's an absolute path, check directly + if [[ "$binary" == /* ]]; then + if [[ ! -f "$binary" ]]; then + fail "${label}: binary not found: ${binary}" + suggest "Check path: which $(basename "$binary")" + elif [[ ! -x "$binary" ]]; then + fail "${label}: not executable: ${binary}" + suggest "chmod +x ${binary}" + fi + else + # relative binary — check if it exists in cron's default PATH + if ! 
command -v "$binary" &>/dev/null; then + verbose "${label}: can't verify relative command: ${binary}" + fi + fi + done < <(parse_cron_entries "$file" "$has_user") +} + +# ── Check: unescaped percent signs ──────────────────────────────────── +check_percent_signs() { + local file="$1" label="$2" + local lineno=0 + + while IFS= read -r line; do + (( lineno++ )) || true + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ "$line" =~ ^[[:space:]]*$ ]] && continue + [[ "$line" =~ ^[[:space:]]*[A-Za-z_]+= ]] && continue + + # check for % not preceded by \ (unescaped) + if echo "$line" | grep -qP '(?/dev/null) || return + owner=$(stat -c '%U' "$file" 2>/dev/null) || return + + # system files (/etc/crontab, /etc/cron.d/*) are expected to be 644 root-owned + # user crontabs are expected to be 600 owned by the user + if [[ "$file" == /etc/* ]]; then + if [[ "$owner" != "root" ]]; then + warn "${label}: owned by ${owner}, expected root" + fi + else + local expected_user + expected_user=$(basename "$file") + if [[ "$perms" != "600" ]]; then + warn "${label}: permissions are ${perms}, expected 600" + suggest "chmod 600 ${file}" + fi + if [[ "$owner" != "$expected_user" && "$owner" != "root" ]]; then + fail "${label}: owned by ${owner}, expected ${expected_user} or root" + fi + fi +} + +# ── Check: missing trailing newline ─────────────────────────────────── +check_trailing_newline() { + local file="$1" label="$2" + + if [[ ! -r "$file" ]] || [[ ! 
-s "$file" ]]; then + return + fi + + # check if file ends with newline + if [[ "$(tail -c 1 "$file" | xxd -p)" != "0a" ]]; then + fail "${label}: no trailing newline — last cron entry will not run" + suggest "echo '' >> ${file}" + fi +} + +# ── Check: overlap risk ────────────────────────────────────────────── +check_overlap_risk() { + local file="$1" label="$2" has_user="${3:-false}" + + while IFS='|' read -r sched cmd; do + [[ -z "$cmd" ]] && continue + + # check for frequent schedules (every minute or every 5 min) + local is_frequent=false + if echo "$sched" | grep -qE '^\*[[:space:]]|^\*/[1-5][[:space:]]'; then + is_frequent=true + fi + + if [[ "$is_frequent" == "true" ]]; then + # check if command uses flock or lockfile + if ! echo "$cmd" | grep -qiE 'flock|lockfile|lock'; then + warn "${label}: frequent job (${sched%% *}) without locking: $(echo "$cmd" | cut -c1-60)" + suggest "Wrap with flock: /usr/bin/flock -n /var/lock/myjob.lock $cmd" + fi + fi + done < <(parse_cron_entries "$file" "$has_user") +} + +# ── Check: cron.allow / cron.deny ───────────────────────────────────── +check_cron_access() { + section "Cron Access Control" + + if [[ -f /etc/cron.allow ]]; then + info "/etc/cron.allow exists — only listed users can use cron" + if [[ -n "$TARGET_USER" ]]; then + if grep -qxF "$TARGET_USER" /etc/cron.allow 2>/dev/null; then + log "${TARGET_USER} is in cron.allow" + else + fail "${TARGET_USER} is NOT in cron.allow — cron jobs will not run" + suggest "echo '${TARGET_USER}' >> /etc/cron.allow" + fi + fi + elif [[ -f /etc/cron.deny ]]; then + info "/etc/cron.deny exists — listed users are blocked" + if [[ -n "$TARGET_USER" ]]; then + if grep -qxF "$TARGET_USER" /etc/cron.deny 2>/dev/null; then + fail "${TARGET_USER} is in cron.deny — cron jobs will not run" + suggest "Remove ${TARGET_USER} from /etc/cron.deny" + else + log "${TARGET_USER} is not in cron.deny" + fi + fi + else + verbose "No cron.allow or cron.deny found" + fi +} + +# ── Check: systemd timers 
───────────────────────────────────────────── +check_systemd_timers() { + section "Systemd Timers" + + if ! command -v systemctl &>/dev/null; then + info "systemctl not found — skipping timer checks" + return + fi + + # failed timer-triggered services + local failed + failed=$(systemctl list-units --type=service --state=failed --no-pager --plain 2>/dev/null | \ + awk '{print $1}' | grep -v '^$' | grep -v '^UNIT' || true) + + if [[ -n "$failed" ]]; then + while IFS= read -r svc; do + # check if this service has a matching timer + local timer="${svc%.service}.timer" + if systemctl list-unit-files "$timer" &>/dev/null 2>&1; then + fail "Timer-triggered service failed: ${svc}" + suggest "journalctl -u ${svc} -b --no-pager | tail -20" + fi + done <<< "$failed" + else + log "No failed timer-triggered services" + fi + + # timers enabled but not active + while IFS= read -r line; do + local timer_name state + timer_name=$(echo "$line" | awk '{print $1}') + state=$(echo "$line" | awk '{print $3}') + + [[ -z "$timer_name" ]] && continue + [[ "$timer_name" != *.timer ]] && continue + + if [[ "$state" != "active" ]]; then + warn "Timer ${timer_name} is loaded but not active (state: ${state})" + suggest "systemctl start ${timer_name}" + fi + done < <(systemctl list-units --type=timer --all --no-pager --plain 2>/dev/null || true) + + # timers without Persistent=true + while IFS= read -r timer_name; do + [[ -z "$timer_name" ]] && continue + [[ "$timer_name" != *.timer ]] && continue + local persistent + persistent=$(systemctl show "$timer_name" -p Persistent 2>/dev/null | cut -d= -f2) + if [[ "$persistent" == "no" ]]; then + local has_calendar + has_calendar=$(systemctl show "$timer_name" -p TimersCalendar 2>/dev/null) + if [[ -n "$has_calendar" && "$has_calendar" != "TimersCalendar=" ]]; then + warn "${timer_name}: Persistent=false — missed runs during downtime won't catch up" + suggest "Add Persistent=true to [Timer] section: systemctl edit ${timer_name}" + fi + fi + done < 
<(systemctl list-units --type=timer --state=active --no-pager --plain 2>/dev/null | awk '{print $1}') +} + +# ── Run cron checks on a single file ───────────────────────────────── +check_crontab_file() { + local file="$1" label="$2" has_user="${3:-false}" + + verbose "Checking: ${file}" + check_crontab_permissions "$file" "$label" + check_trailing_newline "$file" "$label" + check_cron_environment "$file" "$label" + check_percent_signs "$file" "$label" + check_missing_binaries "$file" "$label" "$has_user" + check_overlap_risk "$file" "$label" "$has_user" +} + +# ══════════════════════════════════════════════════════════════════════ +# Main +# ══════════════════════════════════════════════════════════════════════ + +echo "" +echo -e " ${BOLD}Cron Doctor${RESET} — diagnosing scheduled task issues" +echo -e " ${DIM}$(date '+%Y-%m-%d %H:%M:%S')${RESET}" + +# ── Cron checks ─────────────────────────────────────────────────────── +if [[ "$TIMERS_ONLY" == "false" ]]; then + + check_cron_access + + # User crontabs + section "User Crontabs" + + crontab_files=$(get_crontab_files) + if [[ -z "$crontab_files" ]]; then + if [[ -n "$TARGET_USER" ]]; then + info "No crontab found for user: ${TARGET_USER}" + else + info "No user crontabs found in ${CRON_SPOOL:-/var/spool/cron}" + fi + else + while IFS= read -r file; do + [[ -z "$file" ]] && continue + user=$(basename "$file") + check_crontab_file "$file" "crontab(${user})" false + done <<< "$crontab_files" + fi + + # System crontab + if [[ -f /etc/crontab ]]; then + section "System Crontab (/etc/crontab)" + check_crontab_file "/etc/crontab" "/etc/crontab" true + fi + + # /etc/cron.d drop-ins + if [[ -d /etc/cron.d ]]; then + section "Drop-ins (/etc/cron.d)" + found_drop_ins=false + for f in /etc/cron.d/*; do + [[ ! 
-f "$f" ]] && continue + # skip dpkg/ucf leftovers + [[ "$f" =~ \.(dpkg-|ucf-) ]] && continue + found_drop_ins=true + check_crontab_file "$f" "cron.d/$(basename "$f")" true + done + if [[ "$found_drop_ins" == "false" ]]; then + info "No drop-in files in /etc/cron.d" + fi + fi +fi + +# ── Systemd timer checks ───────────────────────────────────────────── +if [[ "$CRON_ONLY" == "false" ]]; then + check_systemd_timers +fi + +# ── Summary ─────────────────────────────────────────────────────────── +echo "" +echo -e " ${BOLD}── Summary ──${RESET}" +echo "" +TOTAL=$(( FAIL_COUNT + WARN_COUNT )) +if [[ $TOTAL -eq 0 ]]; then + echo -e " ${GREEN}✓ No issues found${RESET}" +else + if [[ $FAIL_COUNT -gt 0 ]]; then + echo -e " ${RED}${FAIL_COUNT} failure(s):${RESET}" + for msg in "${FAIL_MESSAGES[@]}"; do + echo -e " ${RED}•${RESET} ${msg}" + done + fi + if [[ $WARN_COUNT -gt 0 ]]; then + echo -e " ${YELLOW}${WARN_COUNT} warning(s)${RESET}" + fi +fi +echo "" + +if [[ $FAIL_COUNT -gt 0 ]]; then + exit 2 +elif [[ $WARN_COUNT -gt 0 ]]; then + exit 1 +else + exit 0 +fi diff --git a/cron-job-exporter.sh b/cron-job-exporter.sh new file mode 100644 index 0000000..5f84bc2 --- /dev/null +++ b/cron-job-exporter.sh @@ -0,0 +1,316 @@ +#!/bin/bash +############################################################# +#### Cron Job Monitoring Exporter for Prometheus #### +#### Tracks whether scheduled cron jobs ran successfully, #### +#### their exit codes, duration, and staleness #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./cron-job-exporter.sh [OPTIONS] #### +############################################################# +# +# Monitors cron job execution by wrapping cron commands. +# Two modes of operation: +# 1. Wrapper mode: wrap a cron command to record metrics +# 2. 
Collector mode: scan state files and write .prom output +# +# Metrics exported: +# - cron_job_exit_code (last exit code) +# - cron_job_duration_seconds (last execution time) +# - cron_job_last_run_timestamp (unix timestamp of last run) +# - cron_job_success (1 if last run exited 0, else 0) +# - cron_job_runs_total (total number of runs) +# +# Requirements: +# - Bash 4.0+ +# - node_exporter with textfile collector enabled +# +set -euo pipefail + +######################### +### Configuration ### +######################### + +NODE_DIR="${NODE_DIR:-/var/lib/node_exporter}" +STATE_DIR="${STATE_DIR:-/var/lib/cron-job-exporter}" +PROM_FILE="${NODE_DIR}/cron_jobs.prom" +STALE_THRESHOLD="${STALE_THRESHOLD:-86400}" # 24 hours +DEBUG="${DEBUG:-}" + +######################### +### Logging ### +######################### + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" >&2 +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +log_debug() { + [[ -n "$DEBUG" ]] && echo "[DEBUG] $1" >&2 +} + +######################### +### Parse Arguments ### +######################### + +show_help() { + cat < -- + Collector mode: $0 --collect + +WRAPPER MODE (use in crontab): + Wraps a cron command, records exit code, duration, and timestamp + to a state file. Run --collect separately to generate .prom output. + + Example crontab: + * * * * * /opt/cron-job-exporter.sh --wrap --name backup_db -- /opt/backup-db.sh + 0 * * * * /opt/cron-job-exporter.sh --wrap --name log_cleanup -- /opt/cleanup-logs.sh + +COLLECTOR MODE (run on schedule or as oneshot): + Reads all state files and writes a single .prom file for node_exporter. 
+ + Example crontab: + * * * * * /opt/cron-job-exporter.sh --collect + +OPTIONS: + --wrap Wrapper mode: run a command and record metrics + --collect Collector mode: generate .prom from state files + --name NAME Job name for wrapper mode (required with --wrap) + --stale-threshold SEC Seconds before a job is considered stale (default: 86400) + --state-dir DIR State file directory (default: /var/lib/cron-job-exporter) + --help Show this help + +EOF + exit 0 +} + +MODE="" +JOB_NAME="" +JOB_CMD=() + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --wrap) MODE="wrap"; shift ;; + --collect) MODE="collect"; shift ;; + --name) JOB_NAME="$2"; shift 2 ;; + --stale-threshold) STALE_THRESHOLD="$2"; shift 2 ;; + --state-dir) STATE_DIR="$2"; shift 2 ;; + --help) show_help ;; + --) shift; JOB_CMD=("$@"); break ;; + *) log_error "Unknown option: $1"; exit 1 ;; + esac + done + + if [[ -z "$MODE" ]]; then + log_error "Must specify --wrap or --collect" + echo "Run '$0 --help' for usage." + exit 1 + fi + + if [[ "$MODE" == "wrap" ]]; then + if [[ -z "$JOB_NAME" ]]; then + log_error "--name is required in wrapper mode" + exit 1 + fi + if [[ ${#JOB_CMD[@]} -eq 0 ]]; then + log_error "No command specified after --" + exit 1 + fi + fi +} + +######################### +### Sanitize ### +######################### + +sanitize_name() { + local name="$1" + name="${name,,}" + name="${name// /_}" + name=$(echo "$name" | sed 's/[^a-z0-9_]/_/g') + name=$(echo "$name" | sed 's/__*/_/g; s/^_//; s/_$//') + echo "$name" +} + +######################### +### Wrapper Mode ### +######################### + +run_wrapper() { + mkdir -p "$STATE_DIR" + + local safe_name + safe_name=$(sanitize_name "$JOB_NAME") + local state_file="${STATE_DIR}/${safe_name}.state" + + log_debug "Wrapping command: ${JOB_CMD[*]}" + log_debug "Job name: $safe_name" + + local start_time end_time duration exit_code + start_time=$(date +%s%N) + + # Run the command, capturing exit code + set +e + "${JOB_CMD[@]}" + 
exit_code=$? + set -e + + end_time=$(date +%s%N) + duration=$(echo "scale=3; ($end_time - $start_time) / 1000000000" | bc 2>/dev/null || echo "0") + + # Read current run count + local runs=0 + if [[ -f "$state_file" ]]; then + runs=$(grep '^runs=' "$state_file" 2>/dev/null | cut -d= -f2 || echo "0") + fi + runs=$((runs + 1)) + + # Write state file atomically + local tmpfile + tmpfile=$(mktemp "${state_file}.XXXXXX") + + cat > "$tmpfile" < STALE_THRESHOLD )); then + stale=1 + fi + + metrics+="cron_job_exit_code{job=\"${name}\"} ${exit_code} +" + metrics+="cron_job_duration_seconds{job=\"${name}\"} ${duration} +" + metrics+="cron_job_last_run_timestamp{job=\"${name}\"} ${timestamp} +" + metrics+="cron_job_success{job=\"${name}\"} ${success} +" + metrics+="cron_job_runs_total{job=\"${name}\"} ${runs} +" + metrics+="cron_job_stale{job=\"${name}\"} ${stale} +" + + log_debug "Collected: $name (exit=$exit_code, stale=$stale)" + done + + if [[ $found -eq 0 ]]; then + log_debug "No state files found in $STATE_DIR" + fi + + # Collector metadata + metrics+=" +# HELP cron_job_collector_last_run_timestamp Unix timestamp of last collector run +# TYPE cron_job_collector_last_run_timestamp gauge +cron_job_collector_last_run_timestamp $now +" + + # Atomic write + local tmpfile + tmpfile=$(mktemp "${PROM_FILE}.XXXXXX") + echo "$metrics" > "$tmpfile" + mv "$tmpfile" "$PROM_FILE" + + log_info "Metrics written to $PROM_FILE ($found jobs)" +} + +######################### +### Main ### +######################### + +main() { + parse_args "$@" + + case "$MODE" in + wrap) run_wrapper ;; + collect) run_collector ;; + esac +} + +main "$@" diff --git a/cron-lister.sh b/cron-lister.sh new file mode 100644 index 0000000..46bdcc3 --- /dev/null +++ b/cron-lister.sh @@ -0,0 +1,433 @@ +#!/usr/bin/env bash + +######################################################################################### +#### cron-lister.sh — List all cron jobs across users, system cron, and timers #### +#### Scans user 
crontabs, /etc/cron.*, systemd timers, and anacron #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./cron-lister.sh #### +#### ./cron-lister.sh --format raw #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +FORMAT="${FORMAT:-table}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +COUNT_USER_CRONTAB=0 +COUNT_SYSTEM_CRONTAB=0 +COUNT_CRON_D=0 +COUNT_CRON_DIRS=0 +COUNT_SYSTEMD_TIMER=0 +COUNT_ANACRON=0 + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + MAGENTA='\033[0;35m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + else + GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" DIM="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${DIM}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } + +# ── Helpers ─────────────────────────────────────────────────────────── +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +print_table_header() { + printf " ${BOLD}%-18s %-14s %-22s %s${RESET}\n" "SOURCE" "USER/UNIT" "SCHEDULE" "COMMAND" + printf " %s\n" "$(printf '%.0s─' {1..80})" +} + +print_job() { + local source="$1" 
+ local user="$2" + local schedule="$3" + local command="$4" + + # Truncate long commands + if [[ ${#command} -gt 60 ]]; then + command="${command:0:57}..." + fi + + if [[ "$FORMAT" == "raw" ]]; then + printf "%s\t%s\t%s\t%s\n" "$source" "$user" "$schedule" "$command" + return + fi + + local color + case "$source" in + user-crontab) color="$GREEN" ;; + /etc/crontab) color="$BLUE" ;; + /etc/cron.d/*) color="$CYAN" ;; + cron.hourly|cron.daily|cron.weekly|cron.monthly) color="$MAGENTA" ;; + systemd-timer) color="$YELLOW" ;; + anacron) color="$DIM" ;; + *) color="" ;; + esac + + printf " %b%-18s%b %-14s %-22s %s\n" "$color" "$source" "$RESET" "$user" "$schedule" "$command" +} + +# ══════════════════════════════════════════════════════════════════════ +# USER CRONTABS +# ══════════════════════════════════════════════════════════════════════ + +scan_user_crontabs() { + section_header "User Crontabs" + + local crontab_dir="/var/spool/cron/crontabs" + local found=false + + if [[ -d "$crontab_dir" ]] && [[ -r "$crontab_dir" ]]; then + while IFS= read -r crontab_file; do + [[ -z "$crontab_file" ]] && continue + found=true + local username + username=$(basename "$crontab_file") + verbose "Reading crontab for user: $username" + + while IFS= read -r line; do + # Skip comments and empty lines + [[ -z "$line" || "$line" == "#"* || "$line" == "SHELL="* || "$line" == "PATH="* || "$line" == "MAILTO="* ]] && continue + + local schedule cmd + schedule=$(echo "$line" | awk '{print $1, $2, $3, $4, $5}') + cmd=$(echo "$line" | awk '{for(i=6;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + + print_job "user-crontab" "$username" "$schedule" "$cmd" + COUNT_USER_CRONTAB=$((COUNT_USER_CRONTAB + 1)) + done < "$crontab_file" + done < <(find "$crontab_dir" -type f 2>/dev/null) + fi + + if [[ "$found" == "false" ]]; then + verbose "No user crontabs found in $crontab_dir" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# SYSTEM CRONTAB +# 
══════════════════════════════════════════════════════════════════════ + +# Print every job in /etc/crontab (system format: schedule, user, command) +# and count each one in COUNT_SYSTEM_CRONTAB. Lines that yield no command +# (e.g. other variable assignments) are skipped. +# Globals: COUNT_SYSTEM_CRONTAB (written) +scan_system_crontab() { + section_header "/etc/crontab" + + if [[ ! -f /etc/crontab ]]; then + verbose "/etc/crontab not found" + return + fi + + while IFS= read -r line; do + [[ -z "$line" || "$line" == "#"* || "$line" == "SHELL="* || "$line" == "PATH="* || "$line" == "MAILTO="* || "$line" == "HOME="* ]] && continue + + local schedule user cmd + if [[ "$line" == "@"* ]]; then + # Nickname schedule (@reboot, @daily, ...): one schedule field, then + # the user, then the command. + schedule=$(echo "$line" | awk '{print $1}') + user=$(echo "$line" | awk '{print $2}') + cmd=$(echo "$line" | awk '{for(i=3;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + else + # Classic entry: five time fields, then the user, then the command. + schedule=$(echo "$line" | awk '{print $1, $2, $3, $4, $5}') + user=$(echo "$line" | awk '{print $6}') + cmd=$(echo "$line" | awk '{for(i=7;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + fi + + if [[ -n "$cmd" ]]; then + print_job "/etc/crontab" "$user" "$schedule" "$cmd" + COUNT_SYSTEM_CRONTAB=$((COUNT_SYSTEM_CRONTAB + 1)) + fi + done < /etc/crontab +} + +# ══════════════════════════════════════════════════════════════════════ +# /etc/cron.d +# ══════════════════════════════════════════════════════════════════════ + +scan_cron_d() { + section_header "/etc/cron.d" + + if [[ ! -d /etc/cron.d ]]; then + verbose "/etc/cron.d not found" + return + fi + + while IFS= read -r cron_file; do + [[ -z "$cron_file" ]] && continue + local filename + filename=$(basename "$cron_file") + + # Skip dpkg and package manager files + [[ "$filename" == *.dpkg-* || "$filename" == *.ucf-* || "$filename" == "." || "$filename" == ".."
]] && continue + + verbose "Reading /etc/cron.d/$filename" + + while IFS= read -r line; do + [[ -z "$line" || "$line" == "#"* || "$line" == "SHELL="* || "$line" == "PATH="* || "$line" == "MAILTO="* || "$line" == "HOME="* ]] && continue + + local schedule user cmd + if [[ "$line" == "@"* ]]; then + # Nickname schedule (@reboot, @daily, ...): one schedule field, then + # the user, then the command. + schedule=$(echo "$line" | awk '{print $1}') + user=$(echo "$line" | awk '{print $2}') + cmd=$(echo "$line" | awk '{for(i=3;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + else + # Classic entry: five time fields, then the user, then the command. + schedule=$(echo "$line" | awk '{print $1, $2, $3, $4, $5}') + user=$(echo "$line" | awk '{print $6}') + cmd=$(echo "$line" | awk '{for(i=7;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + fi + + # Lines that yield no command (e.g. other variable assignments) are skipped + if [[ -n "$cmd" ]]; then + print_job "/etc/cron.d/$filename" "$user" "$schedule" "$cmd" + COUNT_CRON_D=$((COUNT_CRON_D + 1)) + fi + done < "$cron_file" + done < <(find /etc/cron.d -maxdepth 1 -type f 2>/dev/null) +} + +# ══════════════════════════════════════════════════════════════════════ +# CRON DIRECTORIES +# ══════════════════════════════════════════════════════════════════════ + +scan_cron_dirs() { + section_header "Cron Directories" + + local period + for period in hourly daily weekly monthly; do + local dir="/etc/cron.${period}" + if [[ ! -d "$dir" ]]; then + continue + fi + + while IFS= read -r script; do + [[ -z "$script" ]] && continue + local script_name + script_name=$(basename "$script") + + # Skip non-executable and package manager leftovers + [[ "$script_name" == *.dpkg-* || "$script_name" == *.ucf-* || "$script_name" == "." || "$script_name" == ".." ]] && continue + + if [[ -x "$script" ]]; then + print_job "cron.${period}" "root" "$period" "$script_name" + COUNT_CRON_DIRS=$((COUNT_CRON_DIRS + 1)) + else + verbose "Skipping non-executable: $script" + fi + done < <(find "$dir" -maxdepth 1 -type f 2>/dev/null) + done +} + +# ══════════════════════════════════════════════════════════════════════ +# SYSTEMD TIMERS +# ══════════════════════════════════════════════════════════════════════ + +scan_systemd_timers() { + section_header "Systemd Timers" + + if !
command -v systemctl &>/dev/null; then + verbose "systemd not available" + return + fi + + # The loop is fed by process substitution instead of a pipe so that it runs + # in the current shell: increments of COUNT_SYSTEMD_TIMER stay visible to + # print_summary, and a failing 'systemctl list-timers' no longer makes the + # function return non-zero under 'set -euo pipefail'. + while IFS= read -r line; do + [[ -z "$line" ]] && continue + + local unit_name schedule_info + # Timer unit is the second-to-last field, schedule is NEXT + LEFT + unit_name=$(echo "$line" | awk '{print $(NF-1)}') + schedule_info=$(echo "$line" | awk '{print $1, $2, $3}') + + if [[ -n "$unit_name" && "$unit_name" != "UNIT" ]]; then + # Get the trigger schedule from the timer unit + local on_calendar + on_calendar=$(systemctl show "$unit_name" --property=TimersCalendar 2>/dev/null | sed 's/TimersCalendar=//' | head -1) + + # No calendar trigger: fall back to the monotonic trigger + if [[ -z "$on_calendar" ]]; then + on_calendar=$(systemctl show "$unit_name" --property=TimersMonotonic 2>/dev/null | sed 's/TimersMonotonic=//' | head -1) + fi + + # Last resort: the first three columns of the list-timers row + on_calendar="${on_calendar:-$schedule_info}" + + # Truncate schedule if too long + if [[ ${#on_calendar} -gt 20 ]]; then + on_calendar="${on_calendar:0:17}..." + fi + + # The last list-timers column is reported as the command + print_job "systemd-timer" "$unit_name" "$on_calendar" "$(echo "$line" | awk '{print $NF}')" + COUNT_SYSTEMD_TIMER=$((COUNT_SYSTEMD_TIMER + 1)) + fi + done < <(systemctl list-timers --all --no-legend --no-pager 2>/dev/null) +} + +# ══════════════════════════════════════════════════════════════════════ +# ANACRON +# ══════════════════════════════════════════════════════════════════════ + +scan_anacron() { + section_header "Anacron" + + if [[ !
-f /etc/anacrontab ]]; then + verbose "/etc/anacrontab not found" + return + fi + + while IFS= read -r line; do + [[ -z "$line" || "$line" == "#"* || "$line" == "SHELL="* || "$line" == "PATH="* || "$line" == "MAILTO="* || "$line" == "HOME="* || "$line" == "START_HOURS_RANGE="* || "$line" == "RANDOM_DELAY="* ]] && continue + + local period delay ident cmd + period=$(echo "$line" | awk '{print $1}') + delay=$(echo "$line" | awk '{print $2}') + ident=$(echo "$line" | awk '{print $3}') + cmd=$(echo "$line" | awk '{for(i=4;i<=NF;i++) printf "%s ", $i; print ""}' | sed 's/ *$//') + + if [[ -n "$cmd" ]]; then + print_job "anacron" "$ident" "every ${period}d +${delay}m" "$cmd" + COUNT_ANACRON=$((COUNT_ANACRON + 1)) + fi + done < /etc/anacrontab +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + local total=$((COUNT_USER_CRONTAB + COUNT_SYSTEM_CRONTAB + COUNT_CRON_D + COUNT_CRON_DIRS + COUNT_SYSTEMD_TIMER + COUNT_ANACRON)) + + echo "" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + echo -e " ${BOLD}Summary${RESET}" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + + printf " %-22s %d\n" "User crontabs:" "$COUNT_USER_CRONTAB" + printf " %-22s %d\n" "/etc/crontab:" "$COUNT_SYSTEM_CRONTAB" + printf " %-22s %d\n" "/etc/cron.d:" "$COUNT_CRON_D" + printf " %-22s %d\n" "cron.{h,d,w,m}:" "$COUNT_CRON_DIRS" + printf " %-22s %d\n" "Systemd timers:" "$COUNT_SYSTEMD_TIMER" + printf " %-22s %d\n" "Anacron:" "$COUNT_ANACRON" + printf " %s\n" "$(printf '%.0s─' {1..30})" + printf " ${BOLD}%-22s %d${RESET}\n" "Total:" "$total" + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 ;; + esac + done + + if [[ 
"$FORMAT" != "table" && "$FORMAT" != "raw" ]]; then + echo "Invalid format: $FORMAT (must be 'table' or 'raw')" >&2 + exit 1 + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + if [[ "$FORMAT" != "raw" ]]; then + echo "" + echo -e "${BOLD}Cron Job Lister — $(hostname -f 2>/dev/null || hostname)${RESET}" + echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}" + fi + + if [[ "$FORMAT" == "table" ]]; then + echo "" + print_table_header + fi + + scan_user_crontabs + scan_system_crontab + scan_cron_d + scan_cron_dirs + scan_systemd_timers + scan_anacron + + if [[ "$FORMAT" != "raw" ]]; then + print_summary + fi +} + +main "$@" diff --git a/crowdsec-decisions-exporter.sh b/crowdsec-decisions-exporter.sh new file mode 100755 index 0000000..83035ff --- /dev/null +++ b/crowdsec-decisions-exporter.sh @@ -0,0 +1,518 @@ +#!/bin/bash +################################################################################ +# Script Name: crowdsec-decisions-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for CrowdSec active decisions — detailed +# metrics on bans, captchas, scopes, origins, countries, and +# decision lifecycle timestamps +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Note: This exporter focuses exclusively on CrowdSec active decisions +# (bans/captchas). For general CrowdSec operational metrics (alerts, +# bouncers, machines, hub items), see crowdsec-exporter.sh. 
+# +# Prerequisites: +# - CrowdSec installed and running +# - cscli command available +# - jq for JSON parsing +# - Root/sudo access +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# sudo ./crowdsec-decisions-exporter.sh +# +# # HTTP server mode +# sudo ./crowdsec-decisions-exporter.sh --http -p 9202 +# +# # Textfile collector mode +# sudo ./crowdsec-decisions-exporter.sh --textfile +# +# Metrics Exported: +# - crowdsec_decisions_up - Exporter status (1=up, 0=down) +# - crowdsec_decisions_exporter_info{version} - Exporter version info +# - crowdsec_decisions_active_total - Total active decisions +# - crowdsec_decisions_active_by_type{type} - Active decisions by type +# - crowdsec_decisions_active_by_scope{scope} - Active decisions by scope +# - crowdsec_decisions_active_by_origin{origin} - Active decisions by origin +# - crowdsec_decisions_active_by_scenario{scenario} - Active decisions per scenario +# - crowdsec_decisions_active_by_country{country} - Active decisions per country (top 20) +# - crowdsec_decisions_oldest_timestamp - Oldest active decision timestamp +# - crowdsec_decisions_newest_timestamp - Newest active decision timestamp +# - crowdsec_decisions_expiring_1h - Decisions expiring within 1 hour +# - crowdsec_decisions_local_api_up - LAPI reachability (1/0) +# - crowdsec_decisions_exporter_duration_seconds - Script execution time +# - crowdsec_decisions_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9202 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9202 + +# ============================================================================ +# 
HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Check if CrowdSec is installed and responding +# Returns: 0 if OK, 1 if error +check_crowdsec() { + if ! command -v cscli >/dev/null 2>&1; then + echo "ERROR: cscli command not found" >&2 + return 1 + fi + + if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq not found (required for JSON parsing)" >&2 + return 1 + fi + + return 0 +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# Check LAPI health +# Returns: 1 if healthy, 0 if not +get_lapi_status() { + if cscli lapi status >/dev/null 2>&1; then + echo "1" + else + echo "0" + fi +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check CrowdSec status first + if ! 
check_crowdsec; then + cat </dev/null) + + # Handle "null" or empty output from cscli (means no active decisions) + local total_decisions=0 + if [ -n "$decisions_json" ] && [ "$decisions_json" != "null" ]; then + total_decisions=$(echo "$decisions_json" | jq 'length' 2>/dev/null) + total_decisions=${total_decisions:-0} + fi + + # ======================================================================== + # ACTIVE DECISIONS TOTAL + # ======================================================================== + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.type) | .[] | + "\(.[0].type) \(length)" + ' 2>/dev/null | while read -r dtype count; do + [ -z "$dtype" ] && continue + echo "crowdsec_decisions_active_by_type{type=\"$(prom_escape "$dtype")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # DECISIONS BY SCOPE (ip, range, country) + # ======================================================================== + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.scope) | .[] | + "\(.[0].scope) \(length)" + ' 2>/dev/null | while read -r scope count; do + [ -z "$scope" ] && continue + echo "crowdsec_decisions_active_by_scope{scope=\"$(prom_escape "$scope")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # DECISIONS BY ORIGIN (cscli, crowdsec, CAPI) + # ======================================================================== + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.origin) | .[] | + "\(.[0].origin) \(length)" + ' 2>/dev/null | while read -r origin count; do + [ -z "$origin" ] && continue + echo "crowdsec_decisions_active_by_origin{origin=\"$(prom_escape "$origin")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # DECISIONS BY SCENARIO (top scenarios) + # 
======================================================================== + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.scenario) | map({scenario: .[0].scenario, count: length}) | + sort_by(-.count) | .[] | + "\(.scenario) \(.count)" + ' 2>/dev/null | while read -r scenario count; do + [ -z "$scenario" ] && continue + echo "crowdsec_decisions_active_by_scenario{scenario=\"$(prom_escape "$scenario")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # DECISIONS BY COUNTRY (top 20) + # ======================================================================== + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + [.[] | select(.scope == "Country" or .scope == "country")] | + if length > 0 then + group_by(.value) | map({country: .[0].value, count: length}) | + sort_by(-.count) | .[0:20] | .[] | + "\(.country) \(.count)" + else + empty + end + ' 2>/dev/null | while read -r country count; do + [ -z "$country" ] && continue + echo "crowdsec_decisions_active_by_country{country=\"$(prom_escape "$country")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # DECISION TIMESTAMPS (oldest and newest) + # ======================================================================== + + cat </dev/null; then + local oldest_ts + oldest_ts=$(echo "$decisions_json" | jq -r '[.[].created_at] | sort | first // empty' 2>/dev/null) + if [ -n "$oldest_ts" ]; then + local oldest_unix + oldest_unix=$(date -d "$oldest_ts" +%s 2>/dev/null || echo "0") + echo "crowdsec_decisions_oldest_timestamp $oldest_unix" + else + echo "crowdsec_decisions_oldest_timestamp 0" + fi + else + echo "crowdsec_decisions_oldest_timestamp 0" + fi + + echo "" + + cat </dev/null; then + local newest_ts + newest_ts=$(echo "$decisions_json" | jq -r '[.[].created_at] | sort | last // empty' 2>/dev/null) + if [ -n "$newest_ts" ]; then + local newest_unix + 
newest_unix=$(date -d "$newest_ts" +%s 2>/dev/null || echo "0") + echo "crowdsec_decisions_newest_timestamp $newest_unix" + else + echo "crowdsec_decisions_newest_timestamp 0" + fi + else + echo "crowdsec_decisions_newest_timestamp 0" + fi + + echo "" + + # ======================================================================== + # DECISIONS EXPIRING WITHIN 1 HOUR + # ======================================================================== + + cat </dev/null; then + local now_epoch cutoff_epoch expiring_count + now_epoch=$(date +%s) + cutoff_epoch=$((now_epoch + 3600)) + expiring_count=$(echo "$decisions_json" | jq --arg now "$now_epoch" --arg cutoff "$cutoff_epoch" ' + [.[] | select(.until != null) | + (.until | sub("\\.[0-9]+.*$"; "Z") | fromdateiso8601) as $exp | + select($exp > ($now | tonumber) and $exp <= ($cutoff | tonumber)) + ] | length + ' 2>/dev/null) + echo "crowdsec_decisions_expiring_1h ${expiring_count:-0}" + else + echo "crowdsec_decisions_expiring_1h 0" + fi + + echo "" + + # ======================================================================== + # LAPI HEALTH + # ======================================================================== + + local lapi_status + lapi_status=$(get_lapi_status) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +CrowdSec Decisions Exporter v1.0 + +

CrowdSec Decisions Prometheus Exporter v1.0

+

Metrics

+

Active decision metrics from cscli decisions list.

+

For general CrowdSec operational metrics, see crowdsec-exporter.sh.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.crowdsec_decisions_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/crowdsec-exporter.sh b/crowdsec-exporter.sh new file mode 100755 index 0000000..4ea2bc2 --- /dev/null +++ b/crowdsec-exporter.sh @@ -0,0 +1,647 @@ +#!/bin/bash +################################################################################ +# Script Name: crowdsec-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for CrowdSec providing supplementary +# operational metrics from cscli commands 
— active decisions, +# alerts, bouncers, machines, hub items, and threat analysis +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Note: CrowdSec has a built-in Prometheus endpoint at port 6060 for internal +# metrics (bucket counts, parser hits, etc.). This exporter provides +# SUPPLEMENTARY operational metrics from cscli commands. +# +# Prerequisites: +# - CrowdSec installed and running +# - cscli command available +# - jq for JSON parsing +# - Root/sudo access +# - netcat (nc) for HTTP mode +# - curl for --grab-local mode +# +# Usage: +# # Output to stdout +# sudo ./crowdsec-exporter.sh +# +# # HTTP server mode +# sudo ./crowdsec-exporter.sh --http -p 9192 +# +# # Textfile collector mode +# sudo ./crowdsec-exporter.sh --textfile +# +# Metrics Exported: +# - crowdsec_up - Exporter status (1=up, 0=down) +# - crowdsec_info{version,exporter_version} - CrowdSec version info +# - crowdsec_decisions_active - Total active decisions +# - crowdsec_decisions_active_by_type{type} - Active decisions by type +# - crowdsec_decisions_active_by_origin{origin} - Active decisions by origin +# - crowdsec_decisions_active_by_scenario{scenario} - Active decisions by scenario +# - crowdsec_alerts_total - Total alerts +# - crowdsec_alerts_per_period{period} - Alerts in 1h/24h +# - crowdsec_top_attacker_decisions{ip} - Top 5 IPs by decision count +# - crowdsec_top_scenario_alerts{scenario} - Top 5 scenarios by alert count +# - crowdsec_bouncer_up{name} - Per-bouncer registered status +# - crowdsec_bouncer_last_pull_timestamp{name} - Per-bouncer last pull time +# - crowdsec_machine_up{name} - Machine registration status +# - crowdsec_lapi_up - LAPI health status +# - crowdsec_hub_items{type} - Installed hub items per type +# - crowdsec_exporter_duration_seconds - Script execution time +# - crowdsec_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9192 +# Textfile directory: 
/var/lib/node_exporter +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9192 +GRAB_LOCAL=false +LOCAL_PORT=6060 + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Check if CrowdSec is installed and responding +# Returns: 0 if OK, 1 if error +check_crowdsec() { + if ! command -v cscli >/dev/null 2>&1; then + echo "ERROR: cscli command not found" >&2 + return 1 + fi + + if ! command -v jq >/dev/null 2>&1; then + echo "ERROR: jq not found (required for JSON parsing)" >&2 + return 1 + fi + + # Verify LAPI is responding + if ! 
cscli lapi status >/dev/null 2>&1; then + echo "ERROR: CrowdSec LAPI not responding" >&2 + return 1 + fi + + return 0 +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# Get CrowdSec version string +# Returns: version string (e.g., "1.5.4") +get_crowdsec_version() { + cscli version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 +} + +# Get active decisions as JSON +# Returns: JSON array of active decisions, or "null" on error +get_decisions_json() { + cscli decisions list -o json 2>/dev/null +} + +# Get alerts as JSON +# Args: $1 - optional --since parameter (e.g., "1h") +# Returns: JSON array of alerts, or "null" on error +get_alerts_json() { + local since="$1" + if [ -n "$since" ]; then + cscli alerts list --since "$since" -o json 2>/dev/null + else + cscli alerts list -o json 2>/dev/null + fi +} + +# Get bouncers as JSON +# Returns: JSON array of bouncers +get_bouncers_json() { + cscli bouncers list -o json 2>/dev/null +} + +# Get machines as JSON +# Returns: JSON array of machines +get_machines_json() { + cscli machines list -o json 2>/dev/null +} + +# Check LAPI health +# Returns: 1 if healthy, 0 if not +get_lapi_status() { + if cscli lapi status >/dev/null 2>&1; then + echo "1" + else + echo "0" + fi +} + +# Get hub items as JSON +# Returns: JSON output from cscli hub list +get_hub_json() { + cscli hub list -o json 2>/dev/null +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check CrowdSec status first + if ! 
check_crowdsec; then + cat </dev/null) + total_decisions=${total_decisions:-0} + fi + + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.type) | .[] | + "\(.[0].type) \(length)" + ' 2>/dev/null | while read -r dtype count; do + [ -z "$dtype" ] && continue + echo "crowdsec_decisions_active_by_type{type=\"$(prom_escape "$dtype")\"} $count" + done + fi + + echo "" + + # Decisions by origin (crowdsec, cscli, CAPI) + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.origin) | .[] | + "\(.[0].origin) \(length)" + ' 2>/dev/null | while read -r origin count; do + [ -z "$origin" ] && continue + echo "crowdsec_decisions_active_by_origin{origin=\"$(prom_escape "$origin")\"} $count" + done + fi + + echo "" + + # Decisions by scenario + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.scenario) | .[] | + "\(.[0].scenario) \(length)" + ' 2>/dev/null | while read -r scenario count; do + [ -z "$scenario" ] && continue + echo "crowdsec_decisions_active_by_scenario{scenario=\"$(prom_escape "$scenario")\"} $count" + done + fi + + echo "" + + # Top 5 attackers by decision count + cat </dev/null; then + echo "$decisions_json" | jq -r ' + group_by(.value) | map({ip: .[0].value, count: length}) | + sort_by(-.count) | .[0:5] | .[] | + "\(.ip) \(.count)" + ' 2>/dev/null | while read -r ip count; do + [ -z "$ip" ] && continue + echo "crowdsec_top_attacker_decisions{ip=\"$(prom_escape "$ip")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # ALERTS METRICS + # ======================================================================== + + local alerts_json + alerts_json=$(get_alerts_json) + + local total_alerts=0 + if [ -n "$alerts_json" ] && [ "$alerts_json" != "null" ]; then + total_alerts=$(echo "$alerts_json" | jq 'length' 2>/dev/null) + total_alerts=${total_alerts:-0} + fi + + cat </dev/null) + alerts_1h=${alerts_1h:-0} + fi + + local alerts_24h=0 + if [ -n 
"$alerts_24h_json" ] && [ "$alerts_24h_json" != "null" ]; then + alerts_24h=$(echo "$alerts_24h_json" | jq 'length' 2>/dev/null) + alerts_24h=${alerts_24h:-0} + fi + + cat </dev/null; then + echo "$alerts_json" | jq -r ' + group_by(.scenario) | map({scenario: .[0].scenario, count: length}) | + sort_by(-.count) | .[0:5] | .[] | + "\(.scenario) \(.count)" + ' 2>/dev/null | while read -r scenario count; do + [ -z "$scenario" ] && continue + echo "crowdsec_top_scenario_alerts{scenario=\"$(prom_escape "$scenario")\"} $count" + done + fi + + echo "" + + # ======================================================================== + # BOUNCER METRICS + # ======================================================================== + + local bouncers_json + bouncers_json=$(get_bouncers_json) + + cat </dev/null | while read -r name status; do + [ -z "$name" ] && continue + echo "crowdsec_bouncer_up{name=\"$(prom_escape "$name")\"} $status" + done + fi + + echo "" + + cat </dev/null | while read -r name last_pull; do + [ -z "$name" ] && continue + # Convert ISO timestamp to Unix epoch + local ts + ts=$(date -d "$last_pull" +%s 2>/dev/null || echo "0") + echo "crowdsec_bouncer_last_pull_timestamp{name=\"$(prom_escape "$name")\"} $ts" + done + fi + + echo "" + + # ======================================================================== + # MACHINE METRICS + # ======================================================================== + + local machines_json + machines_json=$(get_machines_json) + + cat </dev/null | while read -r name status; do + [ -z "$name" ] && continue + echo "crowdsec_machine_up{name=\"$(prom_escape "$name")\"} $status" + done + fi + + echo "" + + # ======================================================================== + # LAPI HEALTH + # ======================================================================== + + local lapi_status + lapi_status=$(get_lapi_status) + + cat </dev/null) + parsers=$(echo "$hub_json" | jq '[.parsers // [] | .[] | select(.installed == 
true)] | length' 2>/dev/null) + scenarios=$(echo "$hub_json" | jq '[.scenarios // [] | .[] | select(.installed == true)] | length' 2>/dev/null) + postoverflows=$(echo "$hub_json" | jq '[.postoverflows // [] | .[] | select(.installed == true)] | length' 2>/dev/null) + + echo "crowdsec_hub_items{type=\"collections\"} ${collections:-0}" + echo "crowdsec_hub_items{type=\"parsers\"} ${parsers:-0}" + echo "crowdsec_hub_items{type=\"scenarios\"} ${scenarios:-0}" + echo "crowdsec_hub_items{type=\"postoverflows\"} ${postoverflows:-0}" + else + echo "crowdsec_hub_items{type=\"collections\"} 0" + echo "crowdsec_hub_items{type=\"parsers\"} 0" + echo "crowdsec_hub_items{type=\"scenarios\"} 0" + echo "crowdsec_hub_items{type=\"postoverflows\"} 0" + fi + + echo "" + + # ======================================================================== + # BUILT-IN METRICS (optional, via --grab-local) + # ======================================================================== + + if [ "$GRAB_LOCAL" = true ]; then + local builtin_metrics + builtin_metrics=$(curl -sf --max-time 5 "http://localhost:${LOCAL_PORT}/metrics" 2>/dev/null) + + if [ -n "$builtin_metrics" ]; then + echo "# ================================================================" + echo "# CrowdSec built-in metrics from localhost:${LOCAL_PORT}" + echo "# ================================================================" + echo "$builtin_metrics" + echo "" + else + echo "# WARNING: Failed to fetch built-in metrics from localhost:${LOCAL_PORT}" + echo "" + fi + fi + + # ======================================================================== + # EXPORTER RUNTIME + # ======================================================================== + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! 
command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +CrowdSec Exporter v1.0 + +

CrowdSec Prometheus Exporter v1.0

+

Metrics

+

Supplementary operational metrics from cscli commands.

+

For internal CrowdSec metrics (buckets, parsers), see port 6060.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.crowdsec_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/crowdsec-install.sh b/crowdsec-install.sh new file mode 100644 index 0000000..502ee7e --- /dev/null +++ b/crowdsec-install.sh @@ -0,0 +1,444 @@ +#!/bin/bash +################################################################################ +# Script Name: crowdsec-install.sh +# Version: 1.0 +# Description: Automated CrowdSec installation with firewall bouncer, +# collection selection, allowlists, and Prometheus 
integration +# on Debian/Ubuntu and RHEL/Rocky/AlmaLinux +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./crowdsec-install.sh +# sudo ./crowdsec-install.sh --collections "sshd,nginx" +# sudo ./crowdsec-install.sh --allowlist "10.0.0.0/8" --prometheus +# sudo ./crowdsec-install.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +COLLECTIONS="" +ALLOWLIST="" +BOUNCER_TYPE="iptables" +PROMETHEUS=false +ENROLL_KEY="" +NO_BOUNCER=false +DRY_RUN=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null || systemctl is-active --quiet ssh 2>/dev/null; then + detected+=("crowdsecurity/sshd") + log_info " Detected: SSH" + fi + + # Nginx + if systemctl is-active --quiet nginx 2>/dev/null; then + detected+=("crowdsecurity/nginx") + log_info " Detected: Nginx" + fi + + # Apache + if systemctl is-active --quiet apache2 2>/dev/null || systemctl is-active --quiet httpd 2>/dev/null; then + detected+=("crowdsecurity/apache2") + log_info " Detected: Apache" + fi + + # Postfix + if systemctl is-active --quiet postfix 2>/dev/null; then + detected+=("crowdsecurity/postfix") + log_info " Detected: Postfix" + fi + + # Dovecot + if systemctl is-active --quiet dovecot 2>/dev/null; then + 
detected+=("crowdsecurity/dovecot") + log_info " Detected: Dovecot" + fi + + # MySQL/MariaDB + if systemctl is-active --quiet mysql 2>/dev/null || systemctl is-active --quiet mariadb 2>/dev/null; then + detected+=("crowdsecurity/mysql") + log_info " Detected: MySQL/MariaDB" + fi + + # PostgreSQL + if systemctl is-active --quiet postgresql 2>/dev/null; then + detected+=("crowdsecurity/pgsql") + log_info " Detected: PostgreSQL" + fi + + DETECTED_COLLECTIONS="${detected[*]}" +} + +# ============================================================================ +# INSTALLATION +# ============================================================================ + +add_repo_debian() { + log_step "Adding CrowdSec repository (Debian/Ubuntu)..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would add CrowdSec apt repository" + return + fi + + apt-get update -qq + apt-get install -y -qq curl gnupg apt-transport-https >/dev/null 2>&1 + + curl -s https://packagecloud.io/install/repositories/crowdsec/crowdsec/script.deb.sh | bash >/dev/null 2>&1 +} + +add_repo_rhel() { + log_step "Adding CrowdSec repository (RHEL/Rocky)..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would add CrowdSec yum repository" + return + fi + + curl -s https://packagecloud.io/install/repositories/crowdsec/crowdsec/script.rpm.sh | bash >/dev/null 2>&1 +} + +install_agent() { + log_step "Installing CrowdSec agent..." 
+ + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would install crowdsec package" + return + fi + + case "$OS_FAMILY" in + debian) apt-get install -y -qq crowdsec >/dev/null 2>&1 ;; + rhel) dnf install -y -q crowdsec >/dev/null 2>&1 ;; + esac + + systemctl enable --now crowdsec >/dev/null 2>&1 + log_info "CrowdSec agent installed and running" +} + +install_bouncer() { + if [ "$NO_BOUNCER" = true ]; then + log_info "Skipping bouncer installation (--no-bouncer)" + return + fi + + local pkg="crowdsec-firewall-bouncer-${BOUNCER_TYPE}" + log_step "Installing bouncer: $pkg..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would install $pkg" + return + fi + + case "$OS_FAMILY" in + debian) apt-get install -y -qq "$pkg" >/dev/null 2>&1 ;; + rhel) dnf install -y -q "$pkg" >/dev/null 2>&1 ;; + esac + + systemctl enable --now "$pkg" >/dev/null 2>&1 + log_info "Bouncer installed: $pkg" +} + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +install_collections() { + local collections_to_install="" + + if [ -n "$COLLECTIONS" ]; then + # User-specified collections + IFS=',' read -ra cols <<< "$COLLECTIONS" + for col in "${cols[@]}"; do + col=$(echo "$col" | xargs) + # Add crowdsecurity/ prefix if not present + if [[ "$col" != */* ]]; then + col="crowdsecurity/$col" + fi + collections_to_install="$collections_to_install $col" + done + else + # Auto-detected collections + collections_to_install="$DETECTED_COLLECTIONS" + fi + + log_step "Installing collections: $collections_to_install" + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would install: $collections_to_install" + return + fi + + for col in $collections_to_install; do + if cscli collections install "$col" >/dev/null 2>&1; then + log_info " Installed: $col" + else + log_warn " Failed to install: $col" + fi + done +} + +configure_allowlist() { + if [ -z "$ALLOWLIST" ]; 
then + return + fi + + log_step "Configuring allowlist..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would whitelist: $ALLOWLIST" + return + fi + + IFS=',' read -ra ips <<< "$ALLOWLIST" + for ip in "${ips[@]}"; do + ip=$(echo "$ip" | xargs) + if cscli decisions add --ip "$ip" --type whitelist --duration 87600h >/dev/null 2>&1; then + log_info " Whitelisted: $ip" + else + log_warn " Failed to whitelist: $ip" + fi + done +} + +configure_prometheus() { + if [ "$PROMETHEUS" != true ]; then + return + fi + + log_step "Enabling Prometheus metrics on :6060..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would enable Prometheus metrics" + return + fi + + local config="/etc/crowdsec/config.yaml" + if [ -f "$config" ]; then + # Prometheus is enabled by default in CrowdSec, verify + if grep -q "prometheus:" "$config"; then + log_info "Prometheus metrics already configured" + fi + fi + + log_info "Prometheus metrics available at http://localhost:6060/metrics" +} + +enroll_console() { + if [ -z "$ENROLL_KEY" ]; then + return + fi + + log_step "Enrolling with CrowdSec console..." + + if [ "$DRY_RUN" = true ]; then + log_info "[DRY RUN] Would enroll with key: $ENROLL_KEY" + return + fi + + if cscli console enroll "$ENROLL_KEY" >/dev/null 2>&1; then + log_info "Enrolled with CrowdSec console" + else + log_warn "Console enrollment failed — verify enrollment key" + fi +} + +# ============================================================================ +# VERIFICATION +# ============================================================================ + +verify_installation() { + log_step "Verifying installation..." 
+ + echo "" + + # CrowdSec agent + if systemctl is-active --quiet crowdsec 2>/dev/null; then + log_info "✓ CrowdSec agent: running" + else + log_error "✗ CrowdSec agent: not running" + fi + + # Bouncer + if [ "$NO_BOUNCER" != true ]; then + local bouncer_svc="crowdsec-firewall-bouncer-${BOUNCER_TYPE}" + if systemctl is-active --quiet "$bouncer_svc" 2>/dev/null; then + log_info "✓ Firewall bouncer: running" + else + log_error "✗ Firewall bouncer: not running" + fi + fi + + # Collections + log_info "Installed collections:" + cscli collections list 2>/dev/null | grep -E "enabled|installed" || true + + # Bouncers + log_info "Registered bouncers:" + cscli bouncers list 2>/dev/null || true + + echo "" + log_info "Installation complete" + echo "" + log_info "Useful commands:" + echo " cscli decisions list — view active decisions" + echo " cscli alerts list — view recent alerts" + echo " cscli metrics — view metrics summary" + echo " cscli hub list — view installed hub items" +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + check_root + detect_os + + echo "" + log_info "=== CrowdSec Installation Script v1.0 ===" + echo "" + + if [ "$DRY_RUN" = true ]; then + log_warn "DRY RUN MODE — no changes will be made" + echo "" + fi + + detect_services + + case "$OS_FAMILY" in + debian) add_repo_debian ;; + rhel) add_repo_rhel ;; + esac + + install_agent + install_collections + configure_allowlist + install_bouncer + configure_prometheus + enroll_console + + if [ "$DRY_RUN" != true ]; then + verify_installation + fi +} + +main "$@" diff --git a/database-backup-exporter.sh b/database-backup-exporter.sh new file mode 100644 index 0000000..686b6ef --- /dev/null +++ b/database-backup-exporter.sh @@ -0,0 +1,313 @@ +#!/bin/bash +############################################################# +#### Database Backup Exporter for Prometheus 
#### +#### Monitor MySQL and PostgreSQL backup freshness, #### +#### size, and status via node_exporter textfile collector #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./database-backup-exporter.sh [OPTIONS] #### +############################################################# + +set -euo pipefail + +# ----------------------------- +# Defaults +# ----------------------------- +BACKUP_DIR="/opt/backups" +MAX_AGE=86400 +PROM_FILE="/var/lib/node_exporter/database_backups.prom" +INTERVAL=300 +RUN_ONCE=false + +# ----------------------------- +# Color codes +# ----------------------------- +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' + +# ----------------------------- +# Logging +# ----------------------------- +log_info() { + echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2 +} + +# ----------------------------- +# Usage +# ----------------------------- +usage() { + cat <_YYYYMMDD[HHMMSS]. 
+ Examples: myapp_20260309.sql.gz orders_20260308120000.pgdump + +EOF + exit 0 +} + +# ----------------------------- +# Parse arguments +# ----------------------------- +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --backup-dir) + BACKUP_DIR="$2" + shift 2 + ;; + --max-age) + MAX_AGE="$2" + shift 2 + ;; + --prom-file) + PROM_FILE="$2" + shift 2 + ;; + --interval) + INTERVAL="$2" + shift 2 + ;; + --once) + RUN_ONCE=true + shift + ;; + --help) + usage + ;; + *) + log_error "Unknown option: $1" + usage + ;; + esac + done +} + +# ----------------------------- +# Detect backup type from ext +# ----------------------------- +detect_type() { + local filename="$1" + case "$filename" in + *.pgdump|*.dump) + echo "postgres" + ;; + *.sql|*.sql.gz) + echo "mysql" + ;; + *) + echo "unknown" + ;; + esac +} + +# ----------------------------- +# Extract database name +# ----------------------------- +extract_dbname() { + local filename + filename="$(basename "$1")" + # Strip all known extensions + filename="${filename%.gz}" + filename="${filename%.sql}" + filename="${filename%.dump}" + filename="${filename%.pgdump}" + # Expect pattern: dbname_YYYYMMDD... — grab everything before the date segment + echo "$filename" | sed -E 's/_[0-9]{8,14}$//' +} + +# ----------------------------- +# Collect and write metrics +# ----------------------------- +collect_metrics() { + local backup_dir="$1" + local max_age="$2" + local now + now="$(date +%s)" + + if [[ ! 
-d "$backup_dir" ]]; then + log_error "Backup directory does not exist: $backup_dir" + return 1 + fi + + # Associative arrays keyed by "dbname|type" + declare -A latest_ts + declare -A latest_size + declare -A file_count + + # Scan for backup files + local found=0 + while IFS= read -r -d '' file; do + local base + base="$(basename "$file")" + local btype + btype="$(detect_type "$base")" + [[ "$btype" == "unknown" ]] && continue + + local dbname + dbname="$(extract_dbname "$file")" + [[ -z "$dbname" ]] && continue + + local key="${dbname}|${btype}" + local mtime + mtime="$(stat -c '%Y' "$file" 2>/dev/null)" || continue + local fsize + fsize="$(stat -c '%s' "$file" 2>/dev/null)" || continue + + # Track count + file_count[$key]=$(( ${file_count[$key]:-0} + 1 )) + + # Track most recent + if [[ -z "${latest_ts[$key]:-}" ]] || (( mtime > latest_ts[$key] )); then + latest_ts[$key]="$mtime" + latest_size[$key]="$fsize" + fi + + found=$((found + 1)) + done < <(find "$backup_dir" -type f \( -name '*.sql' -o -name '*.sql.gz' -o -name '*.dump' -o -name '*.pgdump' \) -print0 2>/dev/null) + + log_info "Found $found backup file(s) in $backup_dir" + + # Build output + local output="" + + output+="# HELP db_backup_last_timestamp Unix timestamp of most recent backup.\n" + output+="# TYPE db_backup_last_timestamp gauge\n" + for key in "${!latest_ts[@]}"; do + local dbname="${key%%|*}" + local btype="${key##*|}" + output+="db_backup_last_timestamp{database=\"${dbname}\",type=\"${btype}\"} ${latest_ts[$key]}\n" + done + + output+="# HELP db_backup_age_seconds Seconds since most recent backup.\n" + output+="# TYPE db_backup_age_seconds gauge\n" + for key in "${!latest_ts[@]}"; do + local dbname="${key%%|*}" + local btype="${key##*|}" + local age=$(( now - latest_ts[$key] )) + output+="db_backup_age_seconds{database=\"${dbname}\",type=\"${btype}\"} ${age}\n" + done + + output+="# HELP db_backup_size_bytes Size of most recent backup file in bytes.\n" + output+="# TYPE db_backup_size_bytes 
gauge\n" + for key in "${!latest_size[@]}"; do + local dbname="${key%%|*}" + local btype="${key##*|}" + output+="db_backup_size_bytes{database=\"${dbname}\",type=\"${btype}\"} ${latest_size[$key]}\n" + done + + output+="# HELP db_backup_count Number of backup files found.\n" + output+="# TYPE db_backup_count gauge\n" + for key in "${!file_count[@]}"; do + local dbname="${key%%|*}" + local btype="${key##*|}" + output+="db_backup_count{database=\"${dbname}\",type=\"${btype}\"} ${file_count[$key]}\n" + done + + output+="# HELP db_backup_fresh 1 if backup is within max_age, 0 if stale.\n" + output+="# TYPE db_backup_fresh gauge\n" + for key in "${!latest_ts[@]}"; do + local dbname="${key%%|*}" + local btype="${key##*|}" + local age=$(( now - latest_ts[$key] )) + local fresh=1 + if (( age > max_age )); then + fresh=0 + log_warn "Stale backup: database=${dbname} type=${btype} age=${age}s exceeds max_age=${max_age}s" + fi + output+="db_backup_fresh{database=\"${dbname}\",type=\"${btype}\"} ${fresh}\n" + done + + output+="# HELP db_backup_exporter_last_run Timestamp of last exporter run.\n" + output+="# TYPE db_backup_exporter_last_run gauge\n" + output+="db_backup_exporter_last_run ${now}\n" + + echo "$output" +} + +# ----------------------------- +# Write metrics atomically +# ----------------------------- +write_metrics() { + local content="$1" + local prom_file="$2" + + local prom_dir + prom_dir="$(dirname "$prom_file")" + + if [[ ! 
-d "$prom_dir" ]]; then + log_error "Prom directory does not exist: $prom_dir" + return 1 + fi + + local tmp_file + tmp_file="$(mktemp "${prom_dir}/.database_backups.prom.XXXXXX")" + + echo -e "$content" > "$tmp_file" + mv "$tmp_file" "$prom_file" + + log_info "Metrics written to $prom_file" +} + +# ----------------------------- +# Main +# ----------------------------- +main() { + parse_args "$@" + + log_info "Database Backup Exporter starting" + log_info "Backup directory: $BACKUP_DIR" + log_info "Max backup age: ${MAX_AGE}s" + log_info "Prom file: $PROM_FILE" + + while true; do + local metrics + metrics="$(collect_metrics "$BACKUP_DIR" "$MAX_AGE")" || true + + if [[ -n "$metrics" ]]; then + write_metrics "$metrics" "$PROM_FILE" + fi + + if [[ "$RUN_ONCE" == true ]]; then + log_info "Single run complete, exiting" + break + fi + + log_info "Sleeping ${INTERVAL}s until next collection" + sleep "$INTERVAL" + done +} + +main "$@" diff --git a/database-smoke-tests.sh b/database-smoke-tests.sh new file mode 100644 index 0000000..080c980 --- /dev/null +++ b/database-smoke-tests.sh @@ -0,0 +1,573 @@ +#!/usr/bin/env bash + +##################################################################################### +#### database-smoke-tests.sh — Verify database health #### +#### Checks connectivity, auth, replication, backup age, bloat, connections. #### +#### Supports: PostgreSQL, MySQL/MariaDB, Redis #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: DB_TYPE=postgresql DB_HOST=localhost ./database-smoke-tests.sh #### +#### DB_TYPE=redis REDIS_HOST=localhost ./database-smoke-tests.sh #### +#### #### +#### See --help for all options. 
#### +##################################################################################### + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Help +# --------------------------------------------------------------------------- +show_help() { + cat <<'EOF' +database-smoke-tests.sh — Database health smoke testing + +ENVIRONMENT VARIABLES: + DB_TYPE (required) postgresql | mysql | redis + DB_HOST Database host (default: localhost) + DB_PORT Database port (default: auto — 5432/3306/6379) + DB_USER Database user (default: postgres | root | "") + DB_PASS Database password (default: "") + DB_NAME Database name (default: postgres | mysql) + + REDIS_HOST Redis host (falls back to DB_HOST) + REDIS_PORT Redis port (falls back to DB_PORT) + REDIS_AUTH Redis auth (falls back to DB_PASS) + + MAX_REPLICATION_LAG_S Max replication lag in seconds (default: 30) + MAX_BACKUP_AGE_H Max backup / last-save age in hours (default: 26) + MAX_CONNECTIONS_PCT Connection usage threshold % (default: 80) + SKIP_REPLICATION Skip replication checks (default: false) + SKIP_BACKUP_AGE Skip backup-age checks (default: false) + + OUTPUT_FORMAT text | tap (default: text) + COLOR auto | always | never (default: auto) + VERBOSE true | false (default: false) + +EXAMPLES: + DB_TYPE=postgresql DB_HOST=db1 DB_PASS=secret ./database-smoke-tests.sh + DB_TYPE=mysql DB_HOST=db2 DB_USER=app DB_NAME=mydb ./database-smoke-tests.sh + DB_TYPE=redis REDIS_HOST=cache1 REDIS_AUTH=pass ./database-smoke-tests.sh +EOF + exit 0 +} + +[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && show_help + +# --------------------------------------------------------------------------- +# Environment defaults +# --------------------------------------------------------------------------- +DB_TYPE="${DB_TYPE:-}" +DB_HOST="${DB_HOST:-localhost}" +DB_PORT="${DB_PORT:-}" +DB_USER="${DB_USER:-}" +DB_PASS="${DB_PASS:-}" +DB_NAME="${DB_NAME:-}" + +REDIS_HOST="${REDIS_HOST:-$DB_HOST}" 
# Redis-specific settings fall back to the generic DB_* values when unset.
REDIS_PORT="${REDIS_PORT:-${DB_PORT:-6379}}"
REDIS_AUTH="${REDIS_AUTH:-${DB_PASS:-}}"

# Thresholds and feature switches (all overridable from the environment).
MAX_REPLICATION_LAG_S="${MAX_REPLICATION_LAG_S:-30}"
MAX_BACKUP_AGE_H="${MAX_BACKUP_AGE_H:-26}"
MAX_CONNECTIONS_PCT="${MAX_CONNECTIONS_PCT:-80}"
SKIP_REPLICATION="${SKIP_REPLICATION:-false}"
SKIP_BACKUP_AGE="${SKIP_BACKUP_AGE:-false}"

# Presentation settings.
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
COLOR="${COLOR:-auto}"
VERBOSE="${VERBOSE:-false}"

# ---------------------------------------------------------------------------
# Apply per-engine defaults after DB_TYPE is known
#
# Globals:
#   DB_TYPE (read)
#   DB_PORT, DB_USER, DB_NAME, REDIS_PORT (filled in only when empty/unset)
#   MAX_REPLICATION_LAG_S, MAX_BACKUP_AGE_H, MAX_CONNECTIONS_PCT (validated)
# Exits:
#   1 when DB_TYPE is not postgresql/mysql/redis, or when a numeric threshold
#   is not a non-negative integer.
# ---------------------------------------------------------------------------
apply_defaults() {
    case "$DB_TYPE" in
        postgresql)
            DB_PORT="${DB_PORT:-5432}"
            DB_USER="${DB_USER:-postgres}"
            DB_NAME="${DB_NAME:-postgres}"
            ;;
        mysql)
            DB_PORT="${DB_PORT:-3306}"
            DB_USER="${DB_USER:-root}"
            DB_NAME="${DB_NAME:-mysql}"
            ;;
        redis)
            REDIS_PORT="${REDIS_PORT:-6379}"
            ;;
        *)
            echo "ERROR: DB_TYPE must be one of: postgresql, mysql, redis" >&2
            exit 1
            ;;
    esac

    # The thresholds are compared numerically and spliced into generated
    # check scripts, so reject anything that is not a plain integer up front
    # instead of failing later with an obscure error.
    local name value
    for name in MAX_REPLICATION_LAG_S MAX_BACKUP_AGE_H MAX_CONNECTIONS_PCT; do
        value="${!name}"
        if [[ ! "$value" =~ ^[0-9]+$ ]]; then
            echo "ERROR: $name must be a non-negative integer (got '$value')" >&2
            exit 1
        fi
    done
}

# ---------------------------------------------------------------------------
# Colour setup
#
# Sets RED, GREEN, YELLOW, BLUE, BOLD and RESET to ANSI escape sequences when
# colour is enabled, or to empty strings otherwise.
#   COLOR=always -> colour on
#   COLOR=auto   -> colour on only when stdout is a terminal
#   anything else (including "never") -> colour off
# ---------------------------------------------------------------------------
setup_colors() {
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
    local use_color=false
    case "$COLOR" in
        always)
            use_color=true
            ;;
        auto)
            if [[ -t 1 ]]; then
                use_color=true
            fi
            ;;
        *)
            use_color=false
            ;;
    esac
    # Compare as a string rather than executing the variable as a command.
    if [[ "$use_color" == true ]]; then
        RED=$'\033[0;31m'
        GREEN=$'\033[0;32m'
        YELLOW=$'\033[1;33m'
        BLUE=$'\033[0;34m'
        BOLD=$'\033[1m'
        RESET=$'\033[0m'
    fi
}

# ---------------------------------------------------------------------------
# Counters
# ---------------------------------------------------------------------------
PASS_COUNT=0
FAIL_COUNT=0
SKIP_COUNT=0
TEST_NUM=0

# ---------------------------------------------------------------------------
# run_test "description" command...
# ---------------------------------------------------------------------------
# run_test DESC COMMAND [ARGS...]
#
# Runs COMMAND with stdout and stderr captured, then records and prints the
# outcome based on its exit status:
#   0      -> PASS
#   2      -> SKIP (captured output, if any, is used as the reason)
#   other  -> FAIL (captured output, if any, is printed as a diagnostic)
#
# Globals:
#   TEST_NUM, PASS_COUNT, FAIL_COUNT, SKIP_COUNT (updated)
#   OUTPUT_FORMAT, VERBOSE, RED, GREEN, YELLOW, BLUE, RESET (read)
# Outputs:
#   One result line (plus optional diagnostics) on stdout, in TAP form when
#   OUTPUT_FORMAT=tap and in human-readable form otherwise.
# ---------------------------------------------------------------------------
run_test() {
    local desc="$1"; shift
    TEST_NUM=$((TEST_NUM + 1))

    local nl=$'\n'
    local output rc=0
    # Capture the status instead of letting it propagate: a failing check is
    # a result to report, not a reason to abort the run.
    output=$("$@" 2>&1) || rc=$?

    if [[ $rc -eq 0 ]]; then
        PASS_COUNT=$((PASS_COUNT + 1))
        if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
            echo "ok $TEST_NUM - $desc"
        else
            echo " ${GREEN}PASS${RESET} $desc"
        fi
    elif [[ $rc -eq 2 ]]; then
        SKIP_COUNT=$((SKIP_COUNT + 1))
        if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
            # A TAP directive lives on the result line itself, so fold a
            # multi-line reason into a single line.
            local reason="${output:-skipped}"
            echo "ok $TEST_NUM - $desc # SKIP ${reason//$nl/ }"
        else
            echo " ${YELLOW}SKIP${RESET} $desc — ${output:-skipped}"
        fi
    else
        FAIL_COUNT=$((FAIL_COUNT + 1))
        if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
            echo "not ok $TEST_NUM - $desc"
            if [[ -n "$output" ]]; then
                # Every diagnostic line must start with '#', not just the
                # first one, or multi-line output corrupts the TAP stream.
                echo "# ${output//$nl/$nl# }"
            fi
        else
            echo " ${RED}FAIL${RESET} $desc"
            if [[ -n "$output" ]]; then
                echo " $output"
            fi
        fi
    fi

    # In verbose mode also show what a passing command printed.
    if [[ "$VERBOSE" == "true" && -n "$output" && $rc -eq 0 ]]; then
        echo " ${BLUE}→${RESET} $output"
    fi
}

# ---------------------------------------------------------------------------
# skip_test DESC [REASON]
#
# Records a skipped test without running anything. REASON defaults to
# "skipped".
#
# Globals:
#   TEST_NUM, SKIP_COUNT (updated); OUTPUT_FORMAT, YELLOW, RESET (read)
# ---------------------------------------------------------------------------
skip_test() {
    local desc="$1" reason="${2:-skipped}"
    TEST_NUM=$((TEST_NUM + 1))
    SKIP_COUNT=$((SKIP_COUNT + 1))
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "ok $TEST_NUM - $desc # SKIP $reason"
    else
        echo " ${YELLOW}SKIP${RESET} $desc — $reason"
    fi
}

# ---------------------------------------------------------------------------
# check_port HOST PORT [TIMEOUT]
#
# Succeeds (returns 0) when a TCP connection to HOST:PORT can be opened.
# TIMEOUT is in seconds and defaults to 5.
#
# Strategy, in order of preference:
#   1. nc -z, when nc is installed
#   2. Bash's /dev/tcp redirection wrapped in timeout(1), when available
#   3. Bash's /dev/tcp redirection alone (no time limit of its own)
# ---------------------------------------------------------------------------
check_port() {
    local host="$1" port="$2" wait_s="${3:-5}"
    if command -v nc &>/dev/null; then
        nc -z -w "$wait_s" "$host" "$port" 2>/dev/null
    elif command -v timeout &>/dev/null; then
        # /dev/tcp exists only inside Bash redirections, never on disk, so
        # probe for timeout(1) rather than testing for the path. Host and
        # port go in as positional arguments so they are never parsed as
        # shell code by the child bash.
        timeout "$wait_s" bash -c 'echo >"/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null
    else
        (echo >"/dev/tcp/${host}/${port}") 2>/dev/null
    fi
}
--------------------------------------------------------------------------- +# Helper: build psql / mysql invocations +# --------------------------------------------------------------------------- +run_psql() { + PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" \ + -d "${1:-$DB_NAME}" -t -A -c "$2" 2>&1 +} + +run_mysql() { + MYSQL_PWD="$DB_PASS" mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USER" \ + -D "${1:-$DB_NAME}" -N -s -e "$2" 2>&1 +} + +run_redis() { + local auth_args=() + [[ -n "$REDIS_AUTH" ]] && auth_args=(-a "$REDIS_AUTH" --no-auth-warning) + redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" "${auth_args[@]}" "$@" 2>&1 +} + +# =========================================================================== +# PostgreSQL tests +# =========================================================================== +run_postgresql_tests() { + echo "${BOLD}PostgreSQL smoke tests — ${DB_HOST}:${DB_PORT}${RESET}" + echo "" + + # 1. TCP connectivity + run_test "TCP connectivity to ${DB_HOST}:${DB_PORT}" \ + check_port "$DB_HOST" "$DB_PORT" + + # 2. Authentication + run_test "Authentication as ${DB_USER}" \ + bash -c 'PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c "SELECT 1" >/dev/null' + + # 3. Version + run_test "Server version" \ + bash -c ' + ver=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c "SHOW server_version" 2>&1) + echo "PostgreSQL $ver" + ' + + # 4. Database accessible + run_test "Database '${DB_NAME}' accessible" \ + bash -c 'PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c "SELECT current_database()" >/dev/null' + + # 5. 
Replication lag + if [[ "$SKIP_REPLICATION" == "true" ]]; then + skip_test "Replication lag" "SKIP_REPLICATION=true" + else + run_test "Replication lag < ${MAX_REPLICATION_LAG_S}s" \ + bash -c ' + is_replica=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c "SELECT pg_is_in_recovery()" 2>&1) + if [[ "$is_replica" == "t" ]]; then + lag=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c \ + "SELECT COALESCE(EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::int, 0)" 2>&1) + if [[ "$lag" -gt '"$MAX_REPLICATION_LAG_S"' ]]; then + echo "lag=${lag}s exceeds ${'"$MAX_REPLICATION_LAG_S"'}s"; exit 1 + fi + echo "replica lag=${lag}s" + else + echo "not a replica"; exit 2 + fi + ' + fi + + # 6. Connection count + run_test "Connection usage < ${MAX_CONNECTIONS_PCT}%" \ + bash -c ' + read -r used max_c <<< $(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c \ + "SELECT sum(numbackends), (SELECT setting::int FROM pg_settings WHERE name='"'"'max_connections'"'"') FROM pg_stat_database" 2>&1 | tr "|" " ") + pct=$((used * 100 / max_c)) + if [[ $pct -ge '"$MAX_CONNECTIONS_PCT"' ]]; then + echo "${used}/${max_c} (${pct}%)"; exit 1 + fi + echo "${used}/${max_c} (${pct}%)" + ' + + # 7. Long-running queries + run_test "No queries running > 300s" \ + bash -c ' + count=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c \ + "SELECT count(*) FROM pg_stat_activity WHERE state='"'"'active'"'"' AND now()-query_start > interval '"'"'300 seconds'"'"' AND pid <> pg_backend_pid()" 2>&1) + if [[ "$count" -gt 0 ]]; then + echo "${count} long-running queries found"; exit 1 + fi + echo "none" + ' + + # 8. 
Table bloat + run_test "Table bloat (dead tuple ratio < 20%)" \ + bash -c ' + worst=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c \ + "SELECT schemaname||'"'"'.'"'"'||relname||'"'"' '"'"'||round(100.0*n_dead_tup/(n_live_tup+n_dead_tup+1),1)||'"'"'%'"'"' FROM pg_stat_user_tables WHERE n_live_tup+n_dead_tup>1000 AND 100.0*n_dead_tup/(n_live_tup+n_dead_tup+1)>20 ORDER BY n_dead_tup DESC LIMIT 3" 2>&1) + if [[ -n "$worst" ]]; then + echo "bloated: $worst"; exit 1 + fi + echo "ok" + ' + + # 9. Disk usage + run_test "Disk usage for '${DB_NAME}'" \ + bash -c ' + size=$(PGPASSWORD="'"$DB_PASS"'" psql -h "'"$DB_HOST"'" -p "'"$DB_PORT"'" -U "'"$DB_USER"'" -d "'"$DB_NAME"'" -t -A -c \ + "SELECT pg_size_pretty(pg_database_size(current_database()))" 2>&1) + echo "$size" + ' +} + +# =========================================================================== +# MySQL / MariaDB tests +# =========================================================================== +run_mysql_tests() { + echo "${BOLD}MySQL smoke tests — ${DB_HOST}:${DB_PORT}${RESET}" + echo "" + + # 1. TCP connectivity + run_test "TCP connectivity to ${DB_HOST}:${DB_PORT}" \ + check_port "$DB_HOST" "$DB_PORT" + + # 2. Authentication + run_test "Authentication as ${DB_USER}" \ + bash -c 'MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -e "SELECT 1" >/dev/null' + + # 3. Version + run_test "Server version" \ + bash -c ' + ver=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SELECT version()" 2>&1) + echo "MySQL $ver" + ' + + # 4. Database accessible + run_test "Database '${DB_NAME}' accessible" \ + bash -c 'MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -D "'"$DB_NAME"'" -e "SELECT 1" >/dev/null' + + # 5. 
Replication lag + if [[ "$SKIP_REPLICATION" == "true" ]]; then + skip_test "Replication lag" "SKIP_REPLICATION=true" + else + run_test "Replication lag < ${MAX_REPLICATION_LAG_S}s" \ + bash -c ' + status=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SHOW REPLICA STATUS\G" 2>&1) + if [[ -z "$status" ]]; then + status=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SHOW SLAVE STATUS\G" 2>&1) + fi + if [[ -z "$status" ]]; then + echo "not a replica"; exit 2 + fi + lag=$(echo "$status" | grep -i "Seconds_Behind" | awk "{print \$NF}") + if [[ "$lag" == "NULL" || -z "$lag" ]]; then + echo "replication not running (lag=NULL)"; exit 1 + fi + if [[ "$lag" -gt '"$MAX_REPLICATION_LAG_S"' ]]; then + echo "lag=${lag}s exceeds '"$MAX_REPLICATION_LAG_S"'s"; exit 1 + fi + echo "replica lag=${lag}s" + ' + fi + + # 6. Connection count + run_test "Connection usage < ${MAX_CONNECTIONS_PCT}%" \ + bash -c ' + used=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SELECT count(*) FROM information_schema.processlist" 2>&1) + max_c=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SHOW VARIABLES LIKE '"'"'max_connections'"'"'" 2>&1 | awk "{print \$2}") + pct=$((used * 100 / max_c)) + if [[ $pct -ge '"$MAX_CONNECTIONS_PCT"' ]]; then + echo "${used}/${max_c} (${pct}%)"; exit 1 + fi + echo "${used}/${max_c} (${pct}%)" + ' + + # 7. Slow query log + run_test "Slow query log enabled" \ + bash -c ' + val=$(MYSQL_PWD="'"$DB_PASS"'" mysql -h "'"$DB_HOST"'" -P "'"$DB_PORT"'" -u "'"$DB_USER"'" -N -s -e "SHOW VARIABLES LIKE '"'"'slow_query_log'"'"'" 2>&1 | awk "{print \$2}") + if [[ "$val" != "ON" ]]; then + echo "slow_query_log=$val"; exit 1 + fi + echo "enabled" + ' + + # 8. 
# ===========================================================================
# Redis tests
# ===========================================================================

#######################################
# Run the Redis smoke-test suite against REDIS_HOST:REDIS_PORT.
#
# Every check is a small inline script handed to run_test via `bash -c`.
# The redis-cli command line (host, port, optional auth) is built once as an
# array and forwarded to each inline script as positional parameters, so
# inside a script "$@" is the ready-to-use redis-cli prefix. Nothing is
# spliced into the script text, which means values containing quotes, `$` or
# backticks reach redis-cli unchanged.
#
# Globals (read): BOLD, RESET, REDIS_HOST, REDIS_PORT, REDIS_AUTH,
#                 SKIP_REPLICATION, SKIP_BACKUP_AGE, MAX_BACKUP_AGE_H
# Calls:          run_test, skip_test, check_port (defined elsewhere)
# Inline-script exit codes: 0 = ok, 1 = check failed, 2 = nothing to check.
#   How run_test reports 1 vs 2 is decided by run_test itself.
# NOTE(review): assumes run_test executes every argument after the test name
#   verbatim as a command - the check_port call relies on the same thing.
#######################################
run_redis_tests() {
  echo "${BOLD}Redis smoke tests — ${REDIS_HOST}:${REDIS_PORT}${RESET}"
  echo ""

  # Auth flags are appended only when a password is configured. Formatting an
  # empty array with `printf "%q "` yields a literal empty-string argument,
  # which redis-cli would treat as the command name, so an array is used.
  local -a cli=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")
  if [[ -n "$REDIS_AUTH" ]]; then
    cli+=(-a "$REDIS_AUTH" --no-auth-warning)
  fi

  # 1. TCP connectivity
  run_test "TCP connectivity to ${REDIS_HOST}:${REDIS_PORT}" \
    check_port "$REDIS_HOST" "$REDIS_PORT"

  # 2. PING/PONG
  run_test "PING/PONG" \
    bash -c '
      reply=$("$@" PING 2>&1)
      if [[ "$reply" != "PONG" ]]; then
        echo "got: $reply"; exit 1
      fi
      echo "PONG"
    ' redis-smoke "${cli[@]}"

  # 3. Server info
  run_test "Server info" \
    bash -c '
      info=$("$@" INFO server 2>&1)
      ver=$(echo "$info" | grep "^redis_version:" | cut -d: -f2 | tr -d "\r")
      up=$(echo "$info" | grep "^uptime_in_days:" | cut -d: -f2 | tr -d "\r")
      echo "v${ver}, uptime ${up}d"
    ' redis-smoke "${cli[@]}"

  # 4. Memory usage (maxmemory of 0 means no limit is configured)
  run_test "Memory usage vs maxmemory" \
    bash -c '
      info=$("$@" INFO memory 2>&1)
      used=$(echo "$info" | grep "^used_memory_human:" | cut -d: -f2 | tr -d "\r")
      max_raw=$(echo "$info" | grep "^maxmemory:" | cut -d: -f2 | tr -d "\r")
      max_h=$(echo "$info" | grep "^maxmemory_human:" | cut -d: -f2 | tr -d "\r")
      if [[ "$max_raw" == "0" ]]; then
        echo "used=${used}, maxmemory=unlimited"; exit 0
      fi
      echo "used=${used}, max=${max_h}"
    ' redis-smoke "${cli[@]}"

  # 5. Connected clients
  run_test "Connected clients" \
    bash -c '
      info=$("$@" INFO clients 2>&1)
      count=$(echo "$info" | grep "^connected_clients:" | cut -d: -f2 | tr -d "\r")
      echo "${count} clients"
    ' redis-smoke "${cli[@]}"

  # 6. Replication status
  if [[ "$SKIP_REPLICATION" == "true" ]]; then
    skip_test "Replication status" "SKIP_REPLICATION=true"
  else
    run_test "Replication status" \
      bash -c '
        info=$("$@" INFO replication 2>&1)
        role=$(echo "$info" | grep "^role:" | cut -d: -f2 | tr -d "\r")
        if [[ "$role" == "master" ]]; then
          slaves=$(echo "$info" | grep "^connected_slaves:" | cut -d: -f2 | tr -d "\r")
          echo "role=master, replicas=${slaves}"
        elif [[ "$role" == "slave" ]]; then
          link=$(echo "$info" | grep "^master_link_status:" | cut -d: -f2 | tr -d "\r")
          if [[ "$link" != "up" ]]; then
            echo "replica link $link"; exit 1
          fi
          echo "role=replica, link=up"
        else
          echo "role=$role"
        fi
      ' redis-smoke "${cli[@]}"
  fi

  # 7. Last save time (the age limit travels as $1, the redis-cli prefix after it)
  if [[ "$SKIP_BACKUP_AGE" == "true" ]]; then
    skip_test "Last RDB/AOF save" "SKIP_BACKUP_AGE=true"
  else
    run_test "Last RDB save < ${MAX_BACKUP_AGE_H}h" \
      bash -c '
        max_age_h=$1
        shift
        info=$("$@" INFO persistence 2>&1)
        last_save=$(echo "$info" | grep "^rdb_last_save_time:" | cut -d: -f2 | tr -d "\r")
        if [[ -z "$last_save" || "$last_save" == "0" ]]; then
          echo "no RDB save recorded"; exit 2
        fi
        now=$(date +%s)
        age_h=$(( (now - last_save) / 3600 ))
        if [[ $age_h -gt $max_age_h ]]; then
          echo "last save ${age_h}h ago (max ${max_age_h}h)"; exit 1
        fi
        echo "last save ${age_h}h ago"
      ' redis-smoke "$MAX_BACKUP_AGE_H" "${cli[@]}"
  fi

  # 8. Keyspace: sum keys=N over every dbN line. A bash regex replaces
  #    grep -oP (PCRE is not available in every grep) and a line without a
  #    keys= field is simply counted with zero keys.
  run_test "Keyspace info" \
    bash -c '
      info=$("$@" INFO keyspace 2>&1)
      dbs=$(echo "$info" | grep "^db[0-9]" || true)
      if [[ -z "$dbs" ]]; then
        echo "no databases with keys"; exit 2
      fi
      total=0
      db_count=0
      while IFS= read -r line; do
        db_count=$((db_count + 1))
        if [[ "$line" =~ keys=([0-9]+) ]]; then
          total=$((total + BASH_REMATCH[1]))
        fi
      done <<< "$dbs"
      echo "${total} keys across ${db_count} database(s)"
    ' redis-smoke "${cli[@]}"

  # 9. Eviction policy (CONFIG GET prints the name, then the value; keep the value)
  run_test "Eviction policy" \
    bash -c '
      policy=$("$@" CONFIG GET maxmemory-policy 2>&1 | tail -1)
      echo "policy=$policy"
    ' redis-smoke "${cli[@]}"
}

# ===========================================================================
# Summary
# ===========================================================================

#######################################
# Print the pass/fail/skip totals for the run.
#
# Globals (read): PASS_COUNT, FAIL_COUNT, SKIP_COUNT, OUTPUT_FORMAT,
#                 BOLD, RED, GREEN, YELLOW, RESET
# Outputs: with OUTPUT_FORMAT=tap, a TAP plan line followed by "# pass/fail/
#          skip" comment lines; otherwise a framed table and a final
#          RESULT: PASS/FAIL line (FAIL as soon as one test failed).
#######################################
print_summary() {
  local total=$((PASS_COUNT + FAIL_COUNT + SKIP_COUNT))
  echo ""
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "1..$total"
    echo "# pass $PASS_COUNT"
    echo "# fail $FAIL_COUNT"
    echo "# skip $SKIP_COUNT"
  else
    echo "${BOLD}───────────────────────────────────────${RESET}"
    echo "  ${GREEN}PASS${RESET}  $PASS_COUNT"
    echo "  ${RED}FAIL${RESET}  $FAIL_COUNT"
    echo "  ${YELLOW}SKIP${RESET}  $SKIP_COUNT"
    echo "  Total $total"
    echo "${BOLD}───────────────────────────────────────${RESET}"
    if [[ $FAIL_COUNT -gt 0 ]]; then
      echo "  ${RED}${BOLD}RESULT: FAIL${RESET}"
    else
      echo "  ${GREEN}${BOLD}RESULT: PASS${RESET}"
    fi
  fi
}
>&2 + exit 1 + fi + + apply_defaults + setup_colors + + case "$DB_TYPE" in + postgresql) run_postgresql_tests ;; + mysql) run_mysql_tests ;; + redis) run_redis_tests ;; + *) + echo "ERROR: Unsupported DB_TYPE '${DB_TYPE}'" >&2 + exit 1 + ;; + esac + + print_summary + + [[ $FAIL_COUNT -gt 0 ]] && exit 1 + exit 0 +} + +main "$@" diff --git a/deploy-exporter.sh b/deploy-exporter.sh new file mode 100755 index 0000000..cbe721b --- /dev/null +++ b/deploy-exporter.sh @@ -0,0 +1,588 @@ +#!/bin/bash +################################################################################ +# Script Name: deploy-exporter.sh +# Version: 1.0 +# Description: Deployment tool for Prometheus exporters from mylinux.work. +# Downloads, installs, configures cron jobs, validates output, +# and manages lifecycle (install, update, remove, status) for +# any exporter script hosted at mylinux.work/downloads/. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - wget or curl +# - root access (for /usr/local/bin/ and /etc/cron.d/) +# +# Usage: +# deploy-exporter.sh list # list available +# deploy-exporter.sh install process-metrics-exporter # install one +# deploy-exporter.sh install process-metrics-exporter --cron "*/3 * * * *" +# deploy-exporter.sh install process-metrics-exporter journal-error-exporter +# deploy-exporter.sh status # check installed +# deploy-exporter.sh remove process-metrics-exporter # remove one +# deploy-exporter.sh update # update all +# +################################################################################ + +set -uo pipefail + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +BASE_URL="https://mylinux.work/downloads" +INSTALL_DIR="/usr/local/bin" +CRON_DIR="/etc/cron.d" +TEXTFILE_DIR="/var/lib/node_exporter" + +# 
============================================================================ +# AVAILABLE EXPORTERS +# ============================================================================ + +declare -A EXPORTER_DESC=( + [alertmanager-exporter]="Alertmanager notification and silence metrics" + [apache-metrics-exporter]="Apache HTTP server performance metrics" + [apt-updates-exporter]="Pending apt package updates" + [artifactory-exporter]="JFrog Artifactory repository metrics" + [backup-status-exporter]="Backup job status and age metrics" + [borg-backup-exporter]="Borg backup repository and archive metrics" + [caprover-exporter]="CapRover app deployment metrics" + [clickhouse-exporter]="ClickHouse query, memory, merge, and replication metrics" + [consul-exporter]="HashiCorp Consul service health metrics" + [container-health-exporter]="Docker container health and resource metrics" + [coolify-exporter]="Coolify deployment platform metrics" + [dokku-exporter]="Dokku deployment platform metrics" + [dokploy-exporter]="Dokploy deployment platform metrics" + [cron-job-exporter]="Cron job execution status and timing" + [crowdsec-decisions-exporter]="CrowdSec active decisions and ban metrics" + [crowdsec-exporter]="CrowdSec intrusion detection metrics" + [database-backup-exporter]="Database backup status and size metrics" + [dhcp-lease-exporter]="DHCP lease allocation metrics" + [directory-size-exporter]="Directory size and file count metrics" + [disk-io-exporter]="Disk I/O throughput and latency metrics" + [docker-swarm-exporter]="Docker Swarm node and service metrics" + [dovecot-metrics-exporter]="Dovecot mail server metrics" + [duplicati-exporter]="Duplicati backup job metrics" + [elasticsearch-exporter]="Elasticsearch cluster health and index metrics" + [fail2ban-exporter]="Fail2ban jail and ban metrics" + [freeradius-exporter]="FreeRADIUS authentication metrics" + [game-server-exporter]="Game server player count and status metrics" + [gitea-exporter]="Gitea repository and user 
metrics" + [glpi-exporter]="GLPI ITSM ticket and asset metrics" + [gitlab-metrics-exporter]="GitLab instance performance metrics" + [gitlab-migration-exporter]="GitLab migration progress metrics" + [gpu-exporter]="GPU utilization and temperature metrics" + [graylog-exporter]="Graylog log management metrics" + [headscale-metrics-exporter]="Headscale coordination server metrics" + [ip-intel-exporter]="IP intelligence from nginx access logs" + [jenkins-exporter]="Jenkins build and queue metrics" + [incus-metrics-exporter]="Incus storage pool, snapshot, and instance inventory metrics" + [journal-error-exporter]="Journalctl error and warning metrics" + [keepalived-exporter]="Keepalived VRRP failover metrics" + [login-attempt-exporter]="SSH and system login attempt metrics" + [ufw-blocklist-metrics]="UFW blocklist feed, ipset, and block count metrics" + [users-logged-in]="User login sessions, terminals, sudo, and failed login metrics" + [logrotate-check-exporter]="Logrotate configuration health metrics" + [lynis-metrics-exporter]="Lynis security audit score metrics" + [mailcow-exporter]="Mailcow mail server metrics" + [memory-pressure-exporter]="Memory pressure and swap usage metrics" + [mysql-exporter]="MySQL/MariaDB performance metrics" + [n8n-exporter]="n8n workflow automation metrics" + [network-info-exporter]="Network interface and routing metrics" + [nexus-exporter]="Sonatype Nexus Repository metrics" + [nextcloud-exporter]="Nextcloud instance health metrics" + [nfs-exporter]="NFS client mount and performance metrics" + [nfs-server-exporter]="NFS server export and connection metrics" + [nginx-metrics-exporter]="Nginx connection and request metrics" + [ntp-drift-exporter]="NTP clock drift and sync metrics" + [ollama-exporter]="Ollama LLM model and inference metrics" + [openvpn-exporter]="OpenVPN tunnel and client metrics" + [password-expiry-exporter]="System user password expiry metrics" + [pihole-exporter]="Pi-hole DNS filtering metrics" + [plex-exporter]="Plex 
media server activity metrics" + [podman-container-exporter]="Podman container health and resource metrics" + [postgresql-exporter]="PostgreSQL database performance metrics" + [postgresql-ha-exporter]="PostgreSQL HA replication metrics" + [process-metrics-exporter]="Process CPU/memory/state metrics" + [textfile-health-exporter]="Textfile collector health monitoring" + [rabbitmq-exporter]="RabbitMQ queue and connection metrics" + [redis-metrics-exporter]="Redis server performance metrics" + [redis-sentinel-exporter]="Redis Sentinel failover metrics" + [restic-backup-exporter]="Restic backup snapshot and size metrics" + [rsyslog-metrics-exporter]="Rsyslog message processing metrics" + [samba-exporter]="Samba file share and session metrics" + [seo-exporter]="SEO health and crawl metrics" + [smart-drive-exporter]="SMART disk health and temperature metrics" + [snipeit-exporter]="Snipe-IT asset management metrics" + [sonarqube-exporter]="SonarQube code quality metrics" + [squid-exporter]="Squid proxy cache and request metrics" + [storage-health-exporter]="Storage pool and volume health metrics" + [suricata-exporter]="Suricata IDS/IPS alert metrics" + [syncthing-exporter]="Syncthing folder sync and device metrics" + [systemd-boot-time-exporter]="Systemd boot and service startup timing" + [systemd-service-exporter]="Systemd service state and restart metrics" + [systemd-timer-exporter]="Systemd timer schedule and execution metrics" + [tailscale-exporter]="Tailscale node and network metrics" + [trivy-cve-auditor]="Trivy container image vulnerability metrics" + [vault-exporter]="HashiCorp Vault seal and token metrics" + [vaultwarden-exporter]="Vaultwarden password manager metrics" + [wazuh-exporter]="Wazuh SIEM alert and agent metrics" + [web-traffic-exporter]="Web traffic request and response metrics" + [webtop-selkies-exporter]="Webtop and Selkies container desktop metrics" + [wickr-io-exporter]="Wickr.io bot and message metrics" + [wickr-metrics-exporter]="Wickr messaging 
platform metrics" + [wireguard-exporter]="WireGuard tunnel and peer metrics" + [yum-updates-exporter]="Pending yum/dnf package updates" +) + +declare -A EXPORTER_CRON=( + [alertmanager-exporter]="*/5 * * * *" + [apache-metrics-exporter]="*/3 * * * *" + [apt-updates-exporter]="0 0 * * *" + [artifactory-exporter]="*/5 * * * *" + [backup-status-exporter]="*/15 * * * *" + [borg-backup-exporter]="*/15 * * * *" + [caprover-exporter]="*/5 * * * *" + [clickhouse-exporter]="*/3 * * * *" + [consul-exporter]="*/3 * * * *" + [container-health-exporter]="*/3 * * * *" + [coolify-exporter]="*/5 * * * *" + [dokku-exporter]="*/5 * * * *" + [dokploy-exporter]="*/5 * * * *" + [cron-job-exporter]="*/5 * * * *" + [crowdsec-decisions-exporter]="*/5 * * * *" + [crowdsec-exporter]="*/5 * * * *" + [database-backup-exporter]="*/15 * * * *" + [dhcp-lease-exporter]="*/5 * * * *" + [directory-size-exporter]="*/15 * * * *" + [disk-io-exporter]="*/3 * * * *" + [docker-swarm-exporter]="*/3 * * * *" + [dovecot-metrics-exporter]="*/5 * * * *" + [duplicati-exporter]="*/15 * * * *" + [elasticsearch-exporter]="*/3 * * * *" + [fail2ban-exporter]="*/5 * * * *" + [freeradius-exporter]="*/5 * * * *" + [game-server-exporter]="*/3 * * * *" + [gitea-exporter]="*/5 * * * *" + [glpi-exporter]="*/5 * * * *" + [gitlab-metrics-exporter]="*/5 * * * *" + [gitlab-migration-exporter]="*/5 * * * *" + [gpu-exporter]="*/3 * * * *" + [graylog-exporter]="*/5 * * * *" + [headscale-metrics-exporter]="*/5 * * * *" + [ip-intel-exporter]="*/5 * * * *" + [jenkins-exporter]="*/5 * * * *" + [incus-metrics-exporter]="*/5 * * * *" + [journal-error-exporter]="*/5 * * * *" + [keepalived-exporter]="*/5 * * * *" + [login-attempt-exporter]="*/5 * * * *" + [ufw-blocklist-metrics]="*/5 * * * *" + [users-logged-in]="*/3 * * * *" + [logrotate-check-exporter]="0 */6 * * *" + [lynis-metrics-exporter]="0 0 * * *" + [mailcow-exporter]="*/5 * * * *" + [memory-pressure-exporter]="*/3 * * * *" + [mysql-exporter]="*/3 * * * *" + [n8n-exporter]="*/5 
* * * *" + [network-info-exporter]="*/5 * * * *" + [nexus-exporter]="*/5 * * * *" + [nextcloud-exporter]="*/5 * * * *" + [nfs-exporter]="*/5 * * * *" + [nfs-server-exporter]="*/5 * * * *" + [nginx-metrics-exporter]="*/3 * * * *" + [ntp-drift-exporter]="*/5 * * * *" + [ollama-exporter]="*/5 * * * *" + [openvpn-exporter]="*/5 * * * *" + [password-expiry-exporter]="0 0 * * *" + [pihole-exporter]="*/5 * * * *" + [plex-exporter]="*/5 * * * *" + [podman-container-exporter]="*/3 * * * *" + [postgresql-exporter]="*/3 * * * *" + [postgresql-ha-exporter]="*/3 * * * *" + [process-metrics-exporter]="*/3 * * * *" + [textfile-health-exporter]="*/5 * * * *" + [rabbitmq-exporter]="*/5 * * * *" + [redis-metrics-exporter]="*/3 * * * *" + [redis-sentinel-exporter]="*/5 * * * *" + [restic-backup-exporter]="*/15 * * * *" + [rsyslog-metrics-exporter]="*/5 * * * *" + [samba-exporter]="*/5 * * * *" + [seo-exporter]="0 */6 * * *" + [smart-drive-exporter]="*/15 * * * *" + [snipeit-exporter]="*/5 * * * *" + [sonarqube-exporter]="*/5 * * * *" + [squid-exporter]="*/5 * * * *" + [storage-health-exporter]="*/15 * * * *" + [suricata-exporter]="*/5 * * * *" + [syncthing-exporter]="*/5 * * * *" + [systemd-boot-time-exporter]="*/15 * * * *" + [systemd-service-exporter]="*/5 * * * *" + [systemd-timer-exporter]="*/5 * * * *" + [tailscale-exporter]="*/5 * * * *" + [trivy-cve-auditor]="*/30 * * * *" + [vault-exporter]="*/5 * * * *" + [vaultwarden-exporter]="*/5 * * * *" + [wazuh-exporter]="*/5 * * * *" + [web-traffic-exporter]="*/5 * * * *" + [webtop-selkies-exporter]="*/3 * * * *" + [wickr-io-exporter]="*/5 * * * *" + [wickr-metrics-exporter]="*/5 * * * *" + [wireguard-exporter]="*/5 * * * *" + [yum-updates-exporter]="0 0 * * *" +) + +# ============================================================================ +# HELPERS +# ============================================================================ + +log() { echo "# $*"; } +warn() { echo "# WARN: $*" >&2; } +err() { echo "# ERROR: $*" >&2; } +die() 
#######################################
# Abort unless the script runs as root.
# Globals (read): EUID, INSTALL_DIR, CRON_DIR
# Calls: die (exits the script)
#######################################
check_root() {
    (( EUID == 0 )) || die "Must run as root (need write access to ${INSTALL_DIR}/ and ${CRON_DIR}/)"
}

#######################################
# Fetch a URL into a file, preferring wget and falling back to curl.
# Arguments: $1 - URL, $2 - destination path
# Returns:   exit status of the download tool; dies if neither tool exists
#######################################
download() {
    local url="$1" dest="$2"
    if command -v wget &>/dev/null; then
        wget -q -O "$dest" "$url"
    elif command -v curl &>/dev/null; then
        curl -fsSL -o "$dest" "$url"
    else
        die "Neither wget nor curl found"
    fi
}

#######################################
# Print the version from the first "# Version: X" header line of a script.
# Arguments: $1 - path to the script
# Outputs:   the version, or "unknown" when the file is unreadable, has no
#            such header, or the header carries no value. The fallback no
#            longer depends on `pipefail` being enabled by the caller.
#######################################
get_script_version() {
    local file="$1" version=""
    if [[ -r "$file" ]]; then
        version=$(awk '/^# Version:/ { print $3; exit }' "$file" 2>/dev/null) || version=""
    fi
    printf '%s\n' "${version:-unknown}"
}

#######################################
# Print the paths of installed exporter scripts, one per line, sorted and
# de-duplicated: every *-exporter.sh in INSTALL_DIR plus every catalogue
# entry (EXPORTER_DESC key) whose script exists there. The second source is
# what makes catalogue names that do not end in "-exporter" visible.
# Globals (read): INSTALL_DIR, EXPORTER_DESC
#######################################
_installed_exporter_scripts() {
    local script name
    {
        for script in "${INSTALL_DIR}"/*-exporter.sh; do
            if [[ -f "$script" ]]; then
                printf '%s\n' "$script"
            fi
        done
        for name in "${!EXPORTER_DESC[@]}"; do
            script="${INSTALL_DIR}/${name}.sh"
            if [[ -f "$script" ]]; then
                printf '%s\n' "$script"
            fi
        done
    } | sort -u
}

# ============================================================================
# LIST
# ============================================================================

#######################################
# Print the exporter catalogue sorted by name, with each entry's default cron
# schedule, description, and an "[installed]" marker when its script exists.
# Globals (read): EXPORTER_DESC, EXPORTER_CRON, INSTALL_DIR
#######################################
cmd_list() {
    local name cron desc installed
    local -a names=()

    log "Available exporters from mylinux.work (${#EXPORTER_DESC[@]} total)"
    log ""
    printf "# %-40s %-15s %s\n" "EXPORTER" "DEFAULT CRON" "DESCRIPTION"
    printf "# %-40s %-15s %s\n" "--------" "------------" "-----------"

    (( ${#EXPORTER_DESC[@]} > 0 )) || return 0

    # One key per line into an array: no reliance on word splitting.
    mapfile -t names < <(printf '%s\n' "${!EXPORTER_DESC[@]}" | sort)

    for name in "${names[@]}"; do
        cron="${EXPORTER_CRON[$name]:-*/5 * * * *}"
        desc="${EXPORTER_DESC[$name]}"
        installed=""
        if [[ -f "${INSTALL_DIR}/${name}.sh" ]]; then
            installed=" [installed]"
        fi
        printf "# %-40s %-15s %s%s\n" "$name" "$cron" "$desc" "$installed"
    done
}

# ============================================================================
# INSTALL
# ============================================================================

#######################################
# Install one or more exporters.
# Arguments: exporter names, optionally mixed with `--cron "SCHEDULE"`; the
#            schedule applies to every name on the command line.
# Calls:     check_root, install_one (defined elsewhere), die
# Returns:   0 when every install succeeded, 1 when at least one failed;
#            remaining names are still attempted after a failure.
#######################################
cmd_install() {
    check_root

    local -a names=()
    local cron_schedule="" name rc=0

    while (( $# > 0 )); do
        case "$1" in
            --cron)
                # Without this guard a trailing --cron aborts on an unbound
                # $2 under `set -u` (or never shifts and loops forever).
                (( $# >= 2 )) || die "--cron requires a schedule argument (e.g. --cron \"*/3 * * * *\")"
                cron_schedule="$2"
                shift 2
                ;;
            -*)
                die "Unknown option: $1"
                ;;
            *)
                names+=("$1")
                shift
                ;;
        esac
    done

    (( ${#names[@]} > 0 )) || die "No exporter name(s) specified"

    for name in "${names[@]}"; do
        install_one "$name" "$cron_schedule" || rc=1
    done
    return "$rc"
}

# ============================================================================
# STATUS
# ============================================================================

#######################################
# Print one row per installed exporter: name, version, whether a cron file
# exists, and the age of its .prom file.
# Globals (read): INSTALL_DIR, CRON_DIR, TEXTFILE_DIR, EXPORTER_DESC
# The .prom file is looked up as TEXTFILE_DIR/<name with '-' -> '_'>.prom.
#######################################
cmd_status() {
    local found=0
    local script name version cron_status prom_file prom_age
    local mod_time now age_sec

    log "Installed exporters in ${INSTALL_DIR}/:"
    log ""
    printf "# %-40s %-12s %-10s %s\n" "EXPORTER" "VERSION" "CRON" "LAST .prom UPDATE"
    printf "# %-40s %-12s %-10s %s\n" "--------" "-------" "----" "-----------------"

    # The list arrives on fd 3 so nothing in the loop body can consume it.
    while IFS= read -r script <&3; do
        found=1

        name=${script##*/}
        name=${name%.sh}
        version=$(get_script_version "$script")

        cron_status="none"
        if [[ -f "${CRON_DIR}/${name}" ]]; then
            cron_status="active"
        fi

        prom_file="${TEXTFILE_DIR}/${name//-/_}.prom"
        prom_age="no .prom file"

        if [[ -f "$prom_file" ]]; then
            mod_time=$(stat -c %Y "$prom_file" 2>/dev/null || echo 0)
            now=$(date +%s)
            age_sec=$(( now - mod_time ))

            if (( age_sec < 60 )); then
                prom_age="${age_sec}s ago"
            elif (( age_sec < 3600 )); then
                prom_age="$(( age_sec / 60 ))m ago"
            elif (( age_sec < 86400 )); then
                prom_age="$(( age_sec / 3600 ))h ago"
            else
                prom_age="$(( age_sec / 86400 ))d ago (STALE)"
            fi
        fi

        printf "# %-40s %-12s %-10s %s\n" "$name" "$version" "$cron_status" "$prom_age"
    done 3< <(_installed_exporter_scripts)

    if (( found == 0 )); then
        log "No exporters installed in ${INSTALL_DIR}/"
    fi
}

# ============================================================================
# REMOVE
# ============================================================================

#######################################
# Remove one or more exporters.
# Arguments: exporter names (at least one)
# Returns:   0 when every removal succeeded, 1 otherwise
#######################################
cmd_remove() {
    check_root
    (( $# > 0 )) || die "No exporter name specified"

    local name rc=0
    for name in "$@"; do
        remove_one "$name" || rc=1
    done
    return "$rc"
}

#######################################
# Remove a single exporter: its script, its cron file and its .prom file.
# Arguments: $1 - exporter name (a bare name, not a path)
# Returns:   1 when the name is invalid or the exporter is not installed
#######################################
remove_one() {
    local name="$1"

    # This runs rm -f as root; a name with a slash would escape the managed
    # directories, so it is refused before any path is built.
    if [[ -z "$name" || "$name" == */* ]]; then
        warn "Invalid exporter name '${name}'"
        return 1
    fi

    local script="${INSTALL_DIR}/${name}.sh"
    local cron_file="${CRON_DIR}/${name}"
    local prom_file="${TEXTFILE_DIR}/${name//-/_}.prom"

    if [[ ! -f "$script" ]]; then
        warn "${name} is not installed in ${INSTALL_DIR}/"
        return 1
    fi

    rm -f -- "$script"
    log "Removed ${script}"

    if [[ -f "$cron_file" ]]; then
        rm -f -- "$cron_file"
        log "Removed cron job: ${cron_file}"
    fi

    if [[ -f "$prom_file" ]]; then
        rm -f -- "$prom_file"
        log "Removed .prom file: ${prom_file}"
    fi

    log "${name} removed"
    log ""
}

# ============================================================================
# UPDATE
# ============================================================================

#######################################
# Re-download every installed exporter from BASE_URL and replace it in place.
# A failed or empty download leaves the installed script untouched.
# Globals (read): BASE_URL, INSTALL_DIR, EXPORTER_DESC
# Returns:   0 when every update succeeded (or nothing is installed),
#            1 when at least one exporter could not be updated
#######################################
cmd_update() {
    check_root

    local found=0 rc=0
    local script name old_version new_version url temp_file

    while IFS= read -r script <&3; do
        found=1

        name=${script##*/}
        name=${name%.sh}
        old_version=$(get_script_version "$script")

        log "Updating ${name} (current: v${old_version})..."

        url="${BASE_URL}/${name}.sh"
        if ! temp_file=$(mktemp "/tmp/${name}.XXXXXX"); then
            err "Failed to create a temp file for ${name}, skipping"
            rc=1
            continue
        fi

        if ! download "$url" "$temp_file"; then
            rm -f -- "$temp_file"
            err "Failed to download ${name}, skipping"
            rc=1
            continue
        fi

        if [[ ! -s "$temp_file" ]]; then
            rm -f -- "$temp_file"
            err "Downloaded file is empty for ${name}, skipping"
            rc=1
            continue
        fi

        new_version=$(get_script_version "$temp_file")

        chmod +x "$temp_file"
        if ! mv -f -- "$temp_file" "$script"; then
            rm -f -- "$temp_file"
            err "Failed to replace ${script}, skipping"
            rc=1
            continue
        fi
        chmod +x "$script"

        if [[ "$old_version" == "$new_version" ]]; then
            log "${name}: v${new_version} (unchanged)"
        else
            log "${name}: v${old_version} → v${new_version}"
        fi
    done 3< <(_installed_exporter_scripts)

    if (( found == 0 )); then
        log "No exporters installed in ${INSTALL_DIR}/"
    fi
    return "$rc"
}
+ exit 1 +fi + +# Generate DB password if not provided +if [[ -z "$DB_PASSWORD" ]]; then + DB_PASSWORD=$(openssl rand -base64 24 | tr -d '/+=' | head -c 24) +fi + +# -- Install mode -- + +info "Deploying FreshRSS..." +echo "" +info "Domain: ${DOMAIN}" +info "Port: ${PORT}" +info "Install dir: ${INSTALL_DIR}" +info "Timezone: ${TZ}" +info "Feed cron: ${CRON_MIN}" +echo "" + +# 1. Create directory +if [[ -d "$INSTALL_DIR" ]]; then + info "Directory ${INSTALL_DIR} already exists" +else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: ${INSTALL_DIR}" + else + mkdir -p "$INSTALL_DIR" + log "Created ${INSTALL_DIR}" + fi +fi + +# 2. Docker Compose file +COMPOSE_FILE="${INSTALL_DIR}/docker-compose.yml" + +if [[ -f "$COMPOSE_FILE" ]]; then + info "docker-compose.yml already exists - skipping (delete to recreate)" +else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: ${COMPOSE_FILE}" + else + cat > "$COMPOSE_FILE" </dev/null | grep -q '^freshrss$'; then + info "FreshRSS container is already running" +else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would run: docker compose up -d" + else + cd "$INSTALL_DIR" + docker compose up -d + log "Started FreshRSS containers" + fi +fi + +# 4. Nginx reverse proxy +if [[ "$SKIP_NGINX" == "true" ]]; then + info "Skipping Nginx config (--skip-nginx)" +elif ! 
command -v nginx &>/dev/null; then + warn "Nginx not installed - skipping reverse proxy config" + SKIP_NGINX=true +else + NGINX_CONF="" + # Detect config directory style + if [[ -d /etc/nginx/conf.d ]]; then + NGINX_CONF="/etc/nginx/conf.d/freshrss.conf" + elif [[ -d /etc/nginx/sites-available ]]; then + NGINX_CONF="/etc/nginx/sites-available/freshrss.conf" + else + warn "Could not detect Nginx config directory - skipping" + SKIP_NGINX=true + fi + + if [[ "$SKIP_NGINX" != "true" && -n "$NGINX_CONF" ]]; then + if [[ -f "$NGINX_CONF" ]]; then + info "Nginx config already exists at ${NGINX_CONF} - skipping" + else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: ${NGINX_CONF}" + else + if [[ "$SKIP_SSL" == "true" ]]; then + # HTTP only + cat > "$NGINX_CONF" < "$NGINX_CONF" </dev/null; then + if [[ "$DRY_RUN" == "true" ]]; then + info "Would run: certbot certonly --nginx -d ${DOMAIN}" + else + certbot certonly --nginx -d "$DOMAIN" --non-interactive --agree-tos --register-unsafely-without-email || { + warn "Certbot failed - configure SSL manually" + warn "Run: certbot certonly --nginx -d ${DOMAIN}" + } + fi + else + warn "certbot not installed - configure SSL manually" + fi +fi + +# 6. 
Reload Nginx +if [[ "$SKIP_NGINX" != "true" && "$DRY_RUN" != "true" ]]; then + if nginx -t 2>/dev/null; then + systemctl reload nginx + log "Reloaded Nginx" + else + warn "Nginx config test failed - check config manually" + fi +fi + +# -- Summary -- + +echo "" +echo -e "${BOLD}Deployment summary:${RESET}" +echo " Docker Compose: ${INSTALL_DIR}/docker-compose.yml" +echo " FreshRSS: http://127.0.0.1:${PORT}" +if [[ "$SKIP_NGINX" != "true" ]]; then + if [[ "$SKIP_SSL" == "true" ]]; then + echo " Public URL: http://${DOMAIN}" + else + echo " Public URL: https://${DOMAIN}" + fi + echo " Nginx config: ${NGINX_CONF:-/etc/nginx/conf.d/freshrss.conf}" +fi +echo " Database: PostgreSQL (freshrss-db container)" +echo " Feed updates: Every ${CRON_MIN} minutes" +echo " Data directory: ${INSTALL_DIR}/data/" +echo "" +echo -e "${BOLD}Next steps:${RESET}" +if [[ "$SKIP_SSL" == "true" ]]; then + echo " 1. Open http://${DOMAIN} and complete the setup wizard" +else + echo " 1. Open https://${DOMAIN} and complete the setup wizard" +fi +echo " 2. Database config in wizard:" +echo " Type: PostgreSQL" +echo " Host: freshrss-db" +echo " Database: freshrss" +echo " User: freshrss" +echo " Password: (saved in ${INSTALL_DIR}/docker-compose.yml)" +echo " 3. Create your admin account" +echo " 4. Add your first feed: https://mylinux.work/index.xml" +echo "" +info "Remove with: $(basename "$0") --remove" diff --git a/deploy-password-expiry-checker.ps1 b/deploy-password-expiry-checker.ps1 new file mode 100644 index 0000000..5d8e65e --- /dev/null +++ b/deploy-password-expiry-checker.ps1 @@ -0,0 +1,389 @@ +<# +.SYNOPSIS + Deploy the password expiry checker to Windows machines. +.DESCRIPTION + Downloads password-expiry-check.ps1, installs it to a configurable + directory, creates a scheduled task for recurring checks, and + optionally copies the script to NETLOGON for GPO deployment. 
+.NOTES + Author: Phil Connor + License: MIT (https://opensource.org/licenses/MIT) + Version: 1.01 +#> + +param( + [string]$InstallDir = "C:\Scripts", + [int]$WarningDays = 14, + [int]$IntervalHours = 4, + [switch]$NetlogonCopy, + [switch]$CmdPrompt, + [switch]$NoProfile, + [switch]$Remove, + [switch]$DryRun, + [Alias("h")] + [switch]$Help +) + +$ScriptUrl = "https://mylinux.work/downloads/password-expiry-check.ps1.zip" +$ScriptName = "password-expiry-check.ps1" +$TaskName = "PasswordExpiryCheck" + +# ── Colors ──────────────────────────────────────────────────────────── + +function Write-OK { param([string]$Msg) Write-Host "[OK] $Msg" -ForegroundColor Green } +function Write-Warn { param([string]$Msg) Write-Host "[WARN] $Msg" -ForegroundColor Yellow } +function Write-Err { param([string]$Msg) Write-Host "[ERROR] $Msg" -ForegroundColor Red } +function Write-Info { param([string]$Msg) Write-Host "[INFO] $Msg" -ForegroundColor Cyan } + +# ── Help ────────────────────────────────────────────────────────────── + +if ($Help) { + Write-Host @" +Usage: .\deploy-password-expiry-checker.ps1 [OPTIONS] + +Deploy password expiry notifications on Windows machines. + +Installs: + 1. password-expiry-check.ps1 to C:\Scripts\ (configurable) + 2. Scheduled task - runs every 4 hours (configurable) under logged-on user + 3. Logon-triggered task - fires on every user logon + 4. PowerShell profile hook - warning banner in every new PowerShell window + 5. Optional cmd.exe AutoRun hook - warning banner in every new cmd window + 6. 
Optional NETLOGON copy for GPO deployment + +Options: + -InstallDir PATH Installation directory (default: C:\Scripts) + -WarningDays N Warning threshold in days (default: 14) + -IntervalHours N Scheduled task interval in hours (default: 4) + -CmdPrompt Also add warning to cmd.exe via AutoRun registry key + -NoProfile Skip PowerShell profile hook (scheduled tasks only) + -NetlogonCopy Copy script to NETLOGON share for GPO deployment + -Remove Remove deployed components + -DryRun Show what would be done without making changes + -Help Show this help + +Examples: + .\deploy-password-expiry-checker.ps1 # install with defaults + .\deploy-password-expiry-checker.ps1 -CmdPrompt # also hook into cmd.exe + .\deploy-password-expiry-checker.ps1 -NoProfile # skip profile hook + .\deploy-password-expiry-checker.ps1 -WarningDays 30 # 30-day warning threshold + .\deploy-password-expiry-checker.ps1 -IntervalHours 8 # check every 8 hours + .\deploy-password-expiry-checker.ps1 -NetlogonCopy # also copy to NETLOGON + .\deploy-password-expiry-checker.ps1 -DryRun # preview changes + .\deploy-password-expiry-checker.ps1 -Remove # uninstall +"@ + exit 0 +} + +# ── Admin check ─────────────────────────────────────────────────────── + +$currentUser = [Security.Principal.WindowsIdentity]::GetCurrent() +$principal = New-Object Security.Principal.WindowsPrincipal($currentUser) +if (-not $principal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)) { + Write-Err "Must run as Administrator" + exit 1 +} + +$ScriptPath = Join-Path $InstallDir $ScriptName + +# ── Remove mode ─────────────────────────────────────────────────────── + +if ($Remove) { + Write-Info "Removing password expiry checker deployment..." 
+ Write-Host "" + + # Remove scheduled tasks + foreach ($name in @($TaskName, "${TaskName}Logon")) { + $task = Get-ScheduledTask -TaskName $name -ErrorAction SilentlyContinue + if ($task) { + if ($DryRun) { + Write-Info "Would remove scheduled task: $name" + } else { + Unregister-ScheduledTask -TaskName $name -Confirm:$false + Write-OK "Removed scheduled task: $name" + } + } else { + Write-Info "Scheduled task '$name' not found, skipping" + } + } + + # Remove script + if (Test-Path $ScriptPath) { + if ($DryRun) { + Write-Info "Would remove: $ScriptPath" + } else { + Remove-Item -Path $ScriptPath -Force + Write-OK "Removed $ScriptPath" + } + } + + # Remove PowerShell profile hook + $profileMarker = "# PasswordExpiryCheck" + $allUsersProfile = $PROFILE.AllUsersAllHosts + if ((Test-Path $allUsersProfile) -and (Select-String -Path $allUsersProfile -Pattern $profileMarker -Quiet)) { + if ($DryRun) { + Write-Info "Would remove profile hook from $allUsersProfile" + } else { + $content = Get-Content $allUsersProfile | Where-Object { $_ -notmatch $profileMarker } + if ($content) { + Set-Content -Path $allUsersProfile -Value $content + } else { + Remove-Item -Path $allUsersProfile -Force + } + Write-OK "Removed PowerShell profile hook" + } + } + + # Remove cmd.exe AutoRun + $cmdAutoRun = Get-ItemProperty -Path "HKLM:\Software\Microsoft\Command Processor" -Name "AutoRun" -ErrorAction SilentlyContinue + if ($cmdAutoRun -and $cmdAutoRun.AutoRun -match "password-expiry-check") { + if ($DryRun) { + Write-Info "Would remove cmd.exe AutoRun registry key" + } else { + $existing = $cmdAutoRun.AutoRun + # Remove our command, handle single command or chained with ampersand + $cleaned = ($existing -split '\s*&\s*' | Where-Object { $_ -notmatch 'password-expiry-check' }) -join ' & ' + if ($cleaned.Trim()) { + Set-ItemProperty -Path "HKLM:\Software\Microsoft\Command Processor" -Name "AutoRun" -Value $cleaned.Trim() + } else { + Remove-ItemProperty -Path "HKLM:\Software\Microsoft\Command 
Processor" -Name "AutoRun" -ErrorAction SilentlyContinue + } + Write-OK "Removed cmd.exe AutoRun hook" + } + } + + # Remove install dir if empty + if ((Test-Path $InstallDir) -and @(Get-ChildItem $InstallDir -Force).Count -eq 0) { + if ($DryRun) { + Write-Info "Would remove empty directory: $InstallDir" + } else { + Remove-Item -Path $InstallDir -Force + Write-OK "Removed empty directory: $InstallDir" + } + } + + Write-Host "" + if (-not $DryRun) { + Write-OK "Removal complete." + } + exit 0 +} + +# ── Install mode ────────────────────────────────────────────────────── + +Write-Info "Deploying password expiry checker..." +Write-Host "" + +# 1. Create install directory +if (-not (Test-Path $InstallDir)) { + if ($DryRun) { + Write-Info "Would create directory: $InstallDir" + } else { + New-Item -Path $InstallDir -ItemType Directory -Force | Out-Null + Write-OK "Created directory: $InstallDir" + } +} + +# 2. Download script +if (Test-Path $ScriptPath) { + Write-Info "Script already exists at $ScriptPath - downloading latest version" +} + +if ($DryRun) { + Write-Info "Would download $ScriptUrl and extract to $ScriptPath" +} else { + $zipPath = Join-Path $env:TEMP "password-expiry-check.ps1.zip" + try { + Invoke-WebRequest -Uri $ScriptUrl -OutFile $zipPath -UseBasicParsing -ErrorAction Stop + Expand-Archive -Path $zipPath -DestinationPath $InstallDir -Force + Remove-Item $zipPath -Force -ErrorAction SilentlyContinue + if (Test-Path $ScriptPath) { + Write-OK "Downloaded and extracted $ScriptPath" + } else { + Write-Err "Zip extracted but $ScriptName not found in $InstallDir" + exit 1 + } + } catch { + Write-Err "Failed to download: $($_.Exception.Message)" + exit 1 + } +} + +# 3. 
# Scheduled task - recurring interval
# Command line handed to powershell.exe by the task; -Quiet and -WarningDays
# are forwarded to the checker script itself.
$taskArgs = "-NoProfile -ExecutionPolicy Bypass -WindowStyle Hidden -File `"$ScriptPath`" -Quiet -WarningDays $WarningDays"

# An existing registration is removed first so the task is always rebuilt
# from the current parameters (dry-run only reports it).
$existingTask = Get-ScheduledTask -TaskName $TaskName -ErrorAction SilentlyContinue
if ($existingTask) {
    Write-Info "Scheduled task '$TaskName' already exists - recreating"
    if (-not $DryRun) {
        Unregister-ScheduledTask -TaskName $TaskName -Confirm:$false
    }
}

if (-not $DryRun) {
    $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument $taskArgs

    # First run is anchored at 09:00 today, then repeats every $IntervalHours
    # hours for the 365-day repetition window.
    $triggerParams = @{
        Once               = $true
        At                 = (Get-Date).Date.AddHours(9)
        RepetitionInterval = (New-TimeSpan -Hours $IntervalHours)
        RepetitionDuration = (New-TimeSpan -Days 365)
    }
    $trigger = New-ScheduledTaskTrigger @triggerParams

    $settingsParams = @{
        AllowStartIfOnBatteries    = $true
        DontStopIfGoingOnBatteries = $true
        StartWhenAvailable         = $true
        RunOnlyIfNetworkAvailable  = $false
    }
    $settings = New-ScheduledTaskSettingsSet @settingsParams

    # S-1-5-32-545 is the well-known SID of BUILTIN\Users.
    $principal = New-ScheduledTaskPrincipal -GroupId "S-1-5-32-545" -RunLevel Limited

    $registerParams = @{
        TaskName    = $TaskName
        Action      = $action
        Trigger     = $trigger
        Settings    = $settings
        Principal   = $principal
        Description = "Check password expiry every $IntervalHours hours (mylinux.work)"
    }
    Register-ScheduledTask @registerParams | Out-Null
    Write-OK "Created scheduled task: $TaskName (every ${IntervalHours}h)"
} else {
    Write-Info "Would create scheduled task: $TaskName (every ${IntervalHours}h)"
}

# 4.
# Logon trigger task
$logonTaskName = "${TaskName}Logon"

# Same recreate-on-install behaviour as the interval task.
$existingLogon = Get-ScheduledTask -TaskName $logonTaskName -ErrorAction SilentlyContinue
if ($existingLogon) {
    Write-Info "Logon task '$logonTaskName' already exists - recreating"
    if (-not $DryRun) {
        Unregister-ScheduledTask -TaskName $logonTaskName -Confirm:$false
    }
}

if (-not $DryRun) {
    $logonAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument $taskArgs

    $logonTrigger = New-ScheduledTaskTrigger -AtLogOn
    # Delay 30 seconds after logon to let the desktop load
    $logonTrigger.Delay = "PT30S"

    $logonSettingsParams = @{
        AllowStartIfOnBatteries    = $true
        DontStopIfGoingOnBatteries = $true
        StartWhenAvailable         = $true
        ExecutionTimeLimit         = (New-TimeSpan -Minutes 5)
    }
    $logonSettings = New-ScheduledTaskSettingsSet @logonSettingsParams

    # S-1-5-32-545 is the well-known SID of BUILTIN\Users.
    $logonPrincipal = New-ScheduledTaskPrincipal -GroupId "S-1-5-32-545" -RunLevel Limited

    $logonRegisterParams = @{
        TaskName    = $logonTaskName
        Action      = $logonAction
        Trigger     = $logonTrigger
        Settings    = $logonSettings
        Principal   = $logonPrincipal
        Description = "Check password expiry at logon (mylinux.work)"
    }
    Register-ScheduledTask @logonRegisterParams | Out-Null
    Write-OK "Created logon trigger task: $logonTaskName"
} else {
    Write-Info "Would create logon trigger task: $logonTaskName"
}

# 5. NETLOGON copy (optional)
if ($NetlogonCopy) {
    $logonServer = $env:LOGONSERVER
    if (-not $logonServer) {
        # Without LOGONSERVER there is no share path to build; only advise.
        Write-Warn "LOGONSERVER not set - machine may not be domain-joined"
        Write-Warn "Copy manually to \\DC\NETLOGON\$ScriptName"
    } else {
        $netlogonPath = Join-Path "$logonServer\NETLOGON" $ScriptName
        if ($DryRun) {
            Write-Info "Would copy $ScriptPath to $netlogonPath"
        } else {
            # Best effort: a failed copy is reported as a warning and the
            # deployment continues.
            try {
                Copy-Item -Path $ScriptPath -Destination $netlogonPath -Force -ErrorAction Stop
                Write-OK "Copied to $netlogonPath"
            } catch {
                Write-Warn "Could not copy to NETLOGON: $($_.Exception.Message)"
                Write-Warn "Copy manually: Copy-Item '$ScriptPath' '$netlogonPath'"
            }
        }
    }
}

# 6.
# PowerShell profile hook (default, skip with -NoProfile)
# The hook is a single line appended to the all-users profile; the trailing
# marker comment is how an existing hook line is recognised and replaced.
$profileMarker = "# PasswordExpiryCheck"
$profileLine = "& `"$ScriptPath`" -Quiet -WarningDays $WarningDays $profileMarker"

if (-not $NoProfile) {
    $allUsersProfile = $PROFILE.AllUsersAllHosts
    $profileDir = Split-Path $allUsersProfile -Parent

    # Check if hook already exists (marker is matched as literal text)
    $hookExists = (Test-Path $allUsersProfile) -and
        (Select-String -Path $allUsersProfile -Pattern $profileMarker -SimpleMatch -Quiet)

    if ($hookExists) {
        Write-Info "PowerShell profile hook already present - updating"
        if (-not $DryRun) {
            # @(...) forces an array even when zero or one line survives the
            # filter. Without it, `+=` on a single string concatenates the hook
            # onto the end of the remaining profile line and corrupts it.
            $markerPattern = [regex]::Escape($profileMarker)
            $content = @(Get-Content $allUsersProfile | Where-Object { $_ -notmatch $markerPattern })
            $content += $profileLine
            Set-Content -Path $allUsersProfile -Value $content
        }
    } else {
        if ($DryRun) {
            Write-Info "Would add profile hook to $allUsersProfile"
        } else {
            if (-not (Test-Path $profileDir)) {
                New-Item -Path $profileDir -ItemType Directory -Force | Out-Null
            }
            Add-Content -Path $allUsersProfile -Value $profileLine
        }
    }

    # Only claim success when something was actually written.
    if (-not $DryRun) {
        Write-OK "PowerShell profile hook: $allUsersProfile"
    }
} else {
    Write-Info "Skipping PowerShell profile hook (-NoProfile)"
}

# 7.
cmd.exe AutoRun hook (optional, enable with -CmdPrompt) +if ($CmdPrompt) { + $cmdCommand = '@powershell.exe -NoProfile -ExecutionPolicy Bypass -File "' + $ScriptPath + '" -Quiet -WarningDays ' + $WarningDays + $regPath = "HKLM:\Software\Microsoft\Command Processor" + + $existing = Get-ItemProperty -Path $regPath -Name "AutoRun" -ErrorAction SilentlyContinue + if ($existing -and $existing.AutoRun -match "password-expiry-check") { + Write-Info "cmd.exe AutoRun hook already present - updating" + if (-not $DryRun) { + $cleaned = ($existing.AutoRun -split '\s*&\s*' | Where-Object { $_ -notmatch 'password-expiry-check' }) -join ' & ' + if ($cleaned.Trim()) { + $newValue = $cleaned.Trim() + " & " + $cmdCommand + } else { + $newValue = $cmdCommand + } + Set-ItemProperty -Path $regPath -Name "AutoRun" -Value $newValue + } + } elseif ($existing -and $existing.AutoRun.Trim()) { + if ($DryRun) { + Write-Info "Would append to existing cmd.exe AutoRun" + } else { + $newValue = $existing.AutoRun.Trim() + " & " + $cmdCommand + Set-ItemProperty -Path $regPath -Name "AutoRun" -Value $newValue + } + } else { + if ($DryRun) { + Write-Info "Would create cmd.exe AutoRun registry key" + } else { + Set-ItemProperty -Path $regPath -Name "AutoRun" -Value $cmdCommand + } + } + Write-OK "cmd.exe AutoRun hook: $regPath" +} + +# ── Summary ─────────────────────────────────────────────────────────── + +Write-Host "" +Write-Host "Deployment summary:" -ForegroundColor White +Write-Host " Script: $ScriptPath" +Write-Host " Warning: $WarningDays days" +Write-Host " Interval task: $TaskName (every ${IntervalHours}h)" +Write-Host " Logon task: $logonTaskName (at user logon, 30s delay)" +if (-not $NoProfile) { + Write-Host " PS profile: $($PROFILE.AllUsersAllHosts) (all users)" +} +if ($CmdPrompt) { + Write-Host " cmd.exe: AutoRun registry hook (HKLM)" +} +if ($NetlogonCopy) { + Write-Host " NETLOGON: $env:LOGONSERVER\NETLOGON\$ScriptName" +} +Write-Host "" +Write-Host "Users will see warnings via:" 
-ForegroundColor White +Write-Host " MessageBox popup every $IntervalHours hours (scheduled task)" +Write-Host " MessageBox popup at logon (logon trigger task)" +Write-Host " Terminal banner in new PowerShell windows (profile hook)" +if ($CmdPrompt) { + Write-Host " Terminal banner in new cmd.exe windows (AutoRun hook)" +} +Write-Host "" +Write-Info "Test with: & '$ScriptPath' -Test" +Write-Info "Remove with: .\deploy-password-expiry-checker.ps1 -Remove" diff --git a/deploy-password-expiry-timer.sh b/deploy-password-expiry-timer.sh new file mode 100644 index 0000000..7757cae --- /dev/null +++ b/deploy-password-expiry-timer.sh @@ -0,0 +1,249 @@ +#!/usr/bin/env bash + +######################################################################################### +#### deploy-password-expiry-timer.sh — Deploy password expiry desktop notifications #### +#### Sets up systemd user timer + /etc/bashrc integration for all users. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### sudo ./deploy-password-expiry-timer.sh #### +#### sudo ./deploy-password-expiry-timer.sh --dry-run #### +#### sudo ./deploy-password-expiry-timer.sh --remove #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +DRY_RUN=false +REMOVE=false +SCRIPT_PATH="/usr/local/bin/password-expiry-check.sh" +SCRIPT_URL="https://mylinux.work/downloads/password-expiry-check.sh" + +# ── Colors ──────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + RESET='\033[0m' +else + RED="" GREEN="" YELLOW="" BOLD="" RESET="" +fi + +log() { echo -e "${GREEN}[OK]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +info() { echo -e "${BOLD}[INFO]${RESET} $*"; } + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat </dev/null && \ + log "Disabled global user timer" || info "Timer was not enabled" + fi + + # Remove systemd files + for f in /etc/systemd/user/password-expiry-check.service /etc/systemd/user/password-expiry-check.timer; do + if [[ -f "$f" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + info "Would remove: $f" + else + rm -f "$f" + log "Removed $f" + fi + fi + done + + # Remove XDG autostart + if [[ -f /etc/xdg/autostart/password-expiry-check.desktop ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + info "Would remove: /etc/xdg/autostart/password-expiry-check.desktop" + else + rm -f /etc/xdg/autostart/password-expiry-check.desktop + log "Removed XDG autostart" + fi + fi + + # Remove bashrc entry + if grep -q "$BASHRC_MARKER" /etc/bashrc 2>/dev/null; then + if [[ "$DRY_RUN" == "true" ]]; then + info "Would remove password-expiry lines from /etc/bashrc" + else + sed -i "/${BASHRC_MARKER}/d" /etc/bashrc + sed -i "/password-expiry-check/d" /etc/bashrc + log "Removed /etc/bashrc entry" + fi + fi + + echo "" + if [[ "$DRY_RUN" != "true" ]]; then + log "Removal complete. 
Script left at ${SCRIPT_PATH} (remove manually if desired)" + fi + exit 0 +fi + +# ── Install mode ────────────────────────────────────────────────────── +info "Deploying password expiry notifications..." +echo "" + +# 1. Install script +if [[ -f "$SCRIPT_PATH" ]]; then + info "Script already exists at ${SCRIPT_PATH}" +else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would download ${SCRIPT_URL} to ${SCRIPT_PATH}" + else + if command -v curl &>/dev/null; then + curl -sSL -o "$SCRIPT_PATH" "$SCRIPT_URL" + elif command -v wget &>/dev/null; then + wget -q -O "$SCRIPT_PATH" "$SCRIPT_URL" + else + err "Neither curl nor wget found — copy password-expiry-check.sh to ${SCRIPT_PATH} manually" + exit 1 + fi + chmod +x "$SCRIPT_PATH" + log "Installed ${SCRIPT_PATH}" + fi +fi + +# 2. Systemd user service +SERVICE_CONTENT="[Unit] +Description=Password Expiry Checker +After=graphical-session.target + +[Service] +Type=oneshot +ExecStart=${SCRIPT_PATH} -q +Environment=DISPLAY=:0" + +if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: /etc/systemd/user/password-expiry-check.service" +else + mkdir -p /etc/systemd/user + echo "$SERVICE_CONTENT" > /etc/systemd/user/password-expiry-check.service + log "Created /etc/systemd/user/password-expiry-check.service" +fi + +# 3. Systemd user timer — every 4 hours +TIMER_CONTENT="[Unit] +Description=Check password expiry every 4 hours + +[Timer] +OnStartupSec=60 +OnUnitActiveSec=4h +Persistent=true + +[Install] +WantedBy=timers.target" + +if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: /etc/systemd/user/password-expiry-check.timer" + info "Would run: systemctl --global enable password-expiry-check.timer" +else + echo "$TIMER_CONTENT" > /etc/systemd/user/password-expiry-check.timer + log "Created /etc/systemd/user/password-expiry-check.timer" + + systemctl --global enable password-expiry-check.timer 2>/dev/null + log "Enabled timer globally for all users" +fi + +# 4. 
XDG autostart (graphical login trigger with delay) +DESKTOP_CONTENT="[Desktop Entry] +Type=Application +Name=Password Expiry Checker +Comment=Check password expiry on login +Exec=bash -c 'sleep 10 && ${SCRIPT_PATH} -q' +Terminal=false +NoDisplay=true +X-GNOME-Autostart-enabled=true" + +if [[ "$DRY_RUN" == "true" ]]; then + info "Would create: /etc/xdg/autostart/password-expiry-check.desktop" +else + mkdir -p /etc/xdg/autostart + echo "$DESKTOP_CONTENT" > /etc/xdg/autostart/password-expiry-check.desktop + log "Created /etc/xdg/autostart/password-expiry-check.desktop" +fi + +# 5. /etc/bashrc entry (terminal warning) +if grep -q "$BASHRC_MARKER" /etc/bashrc 2>/dev/null; then + info "/etc/bashrc entry already exists" +else + if [[ "$DRY_RUN" == "true" ]]; then + info "Would add to /etc/bashrc:" + echo " ${BASHRC_LINE}" + echo " ${BASHRC_EXEC}" + else + { + echo "" + echo "$BASHRC_LINE" + echo "$BASHRC_EXEC ${BASHRC_MARKER}" + } >> /etc/bashrc + log "Added /etc/bashrc entry" + fi +fi + +echo "" +echo -e "${BOLD}Deployment summary:${RESET}" +echo " • Script: ${SCRIPT_PATH}" +echo " • Timer: /etc/systemd/user/password-expiry-check.timer (every 4h)" +echo " • XDG autostart: /etc/xdg/autostart/password-expiry-check.desktop (login + 10s delay)" +echo " • Terminal: /etc/bashrc (quiet mode — warns only when near expiry)" +echo "" +echo -e "${BOLD}Users will see warnings via:${RESET}" +echo " • Desktop popup every 4 hours (systemd timer)" +echo " • Desktop popup on graphical login (XDG autostart)" +echo " • Terminal banner on every new shell (bashrc)" +echo "" +info "Test with: ${SCRIPT_PATH} --test" +info "Remove with: $(basename "$0") --remove" diff --git a/dhcp-lease-exporter.sh b/dhcp-lease-exporter.sh new file mode 100644 index 0000000..4072830 --- /dev/null +++ b/dhcp-lease-exporter.sh @@ -0,0 +1,668 @@ +#!/bin/bash +################################################################################ +# Script Name: dhcp-lease-exporter.sh +# Version: 1.01 +# Description: 
Prometheus exporter for DHCP lease metrics — pool utilization, +# active leases per subnet, lease expirations, reservation status, +# DORA packet counts, and lease duration tracking for ISC DHCP +# (dhcpd) and ISC Kea. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# # Output to stdout +# sudo ./dhcp-lease-exporter.sh +# +# # Textfile collector mode +# sudo ./dhcp-lease-exporter.sh --textfile +# +# # HTTP server mode +# sudo ./dhcp-lease-exporter.sh --http +# +# # Custom port +# sudo ./dhcp-lease-exporter.sh --http --port 9533 +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" + +# DHCP backend — auto, dhcpd, or kea +DHCP_BACKEND="auto" + +# dhcpd paths +DHCPD_LEASES="/var/lib/dhcp/dhcpd.leases" +DHCPD_CONF="/etc/dhcp/dhcpd.conf" + +# Kea paths and API +KEA_LEASES="/var/lib/kea/kea-leases4.csv" +KEA_API="http://127.0.0.1:8000" +KEA_USE_API="true" + +# Output settings +TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector" +HTTP_PORT=9533 +LOCK_FILE="/tmp/dhcp-lease-exporter.lock" + +# Runtime +MODE="stdout" +ONCE=false +DETECTED_BACKEND="" + +# ============================================================================ +# COLORS +# ============================================================================ + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*" >&2; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*" >&2; } +log_error() { echo -e 
"${RED}[ERROR]${NC} $*" >&2; } + +show_usage() { + cat </dev/null || true) + if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then + log_error "Another instance is running (PID $pid)" + exit 1 + fi + rm -f "$LOCK_FILE" + fi + echo $$ > "$LOCK_FILE" + trap 'rm -f "$LOCK_FILE"' EXIT +} + +# ============================================================================ +# BACKEND DETECTION +# ============================================================================ + +detect_backend() { + if [ "$DHCP_BACKEND" != "auto" ]; then + DETECTED_BACKEND="$DHCP_BACKEND" + return + fi + + if systemctl is-active --quiet isc-kea-dhcp4-server 2>/dev/null || \ + systemctl is-active --quiet kea-dhcp4 2>/dev/null; then + DETECTED_BACKEND="kea" + elif systemctl is-active --quiet isc-dhcp-server 2>/dev/null || \ + systemctl is-active --quiet dhcpd 2>/dev/null; then + DETECTED_BACKEND="dhcpd" + elif [ -f "$KEA_LEASES" ]; then + DETECTED_BACKEND="kea" + elif [ -f "$DHCPD_LEASES" ]; then + DETECTED_BACKEND="dhcpd" + else + DETECTED_BACKEND="unknown" + fi +} + +# ============================================================================ +# DHCPD FUNCTIONS +# ============================================================================ + +# Parse dhcpd.conf for subnet definitions and pool ranges +parse_dhcpd_subnets() { + local conf="$DHCPD_CONF" + [ -f "$conf" ] || return + + local current_subnet="" current_name="" range_start="" range_end="" + local in_subnet=false + + while IFS= read -r line; do + # Match subnet declaration + if [[ "$line" =~ ^[[:space:]]*subnet[[:space:]]+([0-9.]+)[[:space:]]+netmask[[:space:]]+([0-9.]+) ]]; then + current_subnet="${BASH_REMATCH[1]}" + local netmask="${BASH_REMATCH[2]}" + current_name="$current_subnet" + in_subnet=true + range_start="" + range_end="" + # Calculate CIDR from netmask + local cidr + cidr=$(netmask_to_cidr "$netmask") + current_subnet="${current_subnet}/${cidr}" + fi + + # Check for comment-based name + if $in_subnet && [[ "$line" =~ 
# Parse dhcpd.conf for subnet definitions and pool ranges.
# Globals:
#   DHCPD_CONF (read) - path to dhcpd.conf
# Outputs:
#   One line per subnet block that declares a range:
#     <network>/<cidr>|<name>|<pool size>|<range start>|<range end>
#   <name> is the first comment line found inside the block, otherwise the
#   network address.
# Returns:
#   0 always; a missing config file simply produces no output.
parse_dhcpd_subnets() {
    local conf="$DHCPD_CONF"
    # A bare `return` would hand back the failed test's status (1), which
    # aborts `var=$(parse_dhcpd_subnets)` callers running under `set -e`.
    [ -f "$conf" ] || return 0

    local line netmask cidr total
    local current_subnet="" current_name="" range_start="" range_end=""
    local in_subnet=false

    # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        # Match subnet declaration
        if [[ "$line" =~ ^[[:space:]]*subnet[[:space:]]+([0-9.]+)[[:space:]]+netmask[[:space:]]+([0-9.]+) ]]; then
            current_name="${BASH_REMATCH[1]}"
            netmask="${BASH_REMATCH[2]}"
            cidr=$(netmask_to_cidr "$netmask")
            current_subnet="${current_name}/${cidr}"
            in_subnet=true
            range_start=""
            range_end=""
        fi

        [ "$in_subnet" = true ] || continue

        if [[ "$line" =~ ^[[:space:]]*#[[:space:]]*(.+) ]]; then
            # Comment-based name: only taken while the name is still the
            # default (the bare network address), i.e. the first comment wins.
            if [ "$current_name" = "${current_subnet%%/*}" ]; then
                current_name="${BASH_REMATCH[1]}"
            fi
        elif [[ "$line" =~ ^[[:space:]]*range[[:space:]]+([0-9.]+)[[:space:]]+([0-9.]+) ]]; then
            range_start="${BASH_REMATCH[1]}"
            range_end="${BASH_REMATCH[2]}"
        elif [[ "$line" =~ ^[[:space:]]*\} ]]; then
            # End of subnet block: emit it only if a range was declared.
            if [ -n "$range_start" ] && [ -n "$range_end" ]; then
                total=$(ip_range_count "$range_start" "$range_end")
                echo "${current_subnet}|${current_name}|${total}|${range_start}|${range_end}"
            fi
            in_subnet=false
        fi
    done < "$conf"
    return 0
}

# Convert a dotted-quad netmask to its CIDR prefix length.
# Arguments: $1 - netmask, e.g. 255.255.252.0
# Outputs:   prefix length on stdout; octets that are not a valid mask byte
#            contribute 0 bits.
netmask_to_cidr() {
    local netmask="$1"
    local cidr=0 octet
    # Unquoted on purpose: split the dotted quad into one word per octet.
    # shellcheck disable=SC2086
    for octet in ${netmask//./ }; do
        case "$octet" in
            255) cidr=$((cidr + 8)) ;;
            254) cidr=$((cidr + 7)) ;;
            252) cidr=$((cidr + 6)) ;;
            248) cidr=$((cidr + 5)) ;;
            240) cidr=$((cidr + 4)) ;;
            224) cidr=$((cidr + 3)) ;;
            192) cidr=$((cidr + 2)) ;;
            128) cidr=$((cidr + 1)) ;;
            *) ;;
        esac
    done
    echo "$cidr"
}

# Convert a dotted-quad IPv4 address to its 32-bit integer value.
# Arguments: $1 - IPv4 address
# Outputs:   integer on stdout
ip_to_int() {
    local a b c d
    IFS='.' read -r a b c d <<< "$1"
    # 10# forces base 10 so zero-padded octets ("010", "08") are not parsed
    # as octal by shell arithmetic; missing octets count as 0.
    echo $(( (10#${a:-0} << 24) + (10#${b:-0} << 16) + (10#${c:-0} << 8) + 10#${d:-0} ))
}

# Number of addresses in the inclusive range $1..$2 (0 if the range is reversed).
ip_range_count() {
    local start_int end_int count
    start_int=$(ip_to_int "$1")
    end_int=$(ip_to_int "$2")
    count=$(( end_int - start_int + 1 ))
    if [ "$count" -lt 0 ]; then
        count=0
    fi
    echo "$count"
}

# List active, unexpired leases from dhcpd.leases.
# Globals:
#   DHCPD_LEASES (read) - path to the lease database
# Outputs:
#   "<ip> <seconds remaining>" per lease, in no particular order. Leases that
#   end "never" are not listed because they have no remaining time.
# Returns:
#   0 when the file is missing; otherwise awk's exit status.
count_dhcpd_leases() {
    local lease_file="$DHCPD_LEASES"
    [ -f "$lease_file" ] || return 0

    local now
    now=$(date +%s)

    # dhcpd writes lease times in UTC unless db-time-format is "local" (then
    # they are "epoch <seconds>"), so mktime must not use the host time zone.
    # The file is an append-only journal: each new "lease" record for an
    # address supersedes the earlier ones, hence the reset on every record.
    TZ=UTC awk -v now="$now" '
        /^lease / {
            ip = $2
            delete remaining[ip]
            delete state[ip]
            next
        }
        $1 == "ends" {
            kind = $2
            sub(/;$/, "", kind)
            if (kind == "never") next
            if (kind == "epoch") {
                t = $3 + 0
            } else {
                stamp = $3 " " $4
                gsub(/[\/:;]/, " ", stamp)
                sub(/ +$/, "", stamp)
                t = mktime(stamp)
            }
            if (t > now) remaining[ip] = t - now
            next
        }
        $1 == "binding" && $2 == "state" {
            s = $3
            sub(/;$/, "", s)
            state[ip] = s
        }
        END {
            for (ip in remaining) {
                if (state[ip] == "active") {
                    print ip, remaining[ip]
                }
            }
        }' "$lease_file"
}

# Count static reservations (fixed-address statements) in dhcpd.conf.
# Globals: DHCPD_CONF (read)
# Outputs: the count; 0 when the config file does not exist.
count_dhcpd_reservations() {
    local conf="$DHCPD_CONF"
    if [ ! -f "$conf" ]; then
        echo 0
        return 0
    fi
    # Lines where a '#' precedes the keyword are commented out and skipped.
    # grep -c exits 1 on a zero count, which must not trip `set -e`.
    grep -c '^[^#]*fixed-address' -- "$conf" 2>/dev/null || true
}

# Count DHCP message types (DORA and friends) logged to syslog.
# Arguments:
#   $1 - optional log file; defaults to /var/log/syslog, then /var/log/messages
# Outputs:
#   discovers|offers|requests|acks|naks|declines|releases
# Returns:
#   0 always; prints nothing when no log file exists.
parse_dhcpd_dora() {
    local logfile="${1:-}"
    if [ -z "$logfile" ]; then
        logfile="/var/log/syslog"
        [ -f "$logfile" ] || logfile="/var/log/messages"
    fi
    [ -f "$logfile" ] || return 0

    # One pass over the log instead of one grep per message type.
    awk '
        /DHCPDISCOVER/ { discovers++ }
        /DHCPOFFER/    { offers++ }
        /DHCPREQUEST/  { requests++ }
        /DHCPACK/      { acks++ }
        /DHCPNAK/      { naks++ }
        /DHCPDECLINE/  { declines++ }
        /DHCPRELEASE/  { releases++ }
        END {
            printf "%d|%d|%d|%d|%d|%d|%d\n", discovers, offers, requests, acks, naks, declines, releases
        }' "$logfile" 2>/dev/null || echo "0|0|0|0|0|0|0"
}

# ============================================================================
# KEA FUNCTIONS
# ============================================================================
# Send one command to the Kea control agent for the dhcp4 service.
# Globals:   KEA_API (read) - control agent URL
# Arguments: $1 - Kea command name
# Outputs:   raw JSON response on stdout (curl diagnostics are discarded)
# Returns:   curl's exit status
kea_api_call() {
    local command="$1"
    curl -s --max-time 5 -X POST "${KEA_API}" \
        -H "Content-Type: application/json" \
        -d "{\"command\": \"${command}\", \"service\": [\"dhcp4\"]}" 2>/dev/null
}

# List active, unexpired leases from the Kea memfile lease CSV.
# Globals:
#   KEA_LEASES (read) - path to kea-leases4.csv
# Outputs:
#   "<ip> <seconds remaining>" per lease whose latest record has state 0
#   and an expiry in the future.
# Returns:
#   0 when the file is missing; otherwise awk's exit status.
parse_kea_leases_file() {
    local lease_file="$KEA_LEASES"
    # `return 0`: a bare return would propagate status 1 and abort
    # `var=$(parse_kea_leases_file)` callers under `set -e`.
    [ -f "$lease_file" ] || return 0

    local now
    now=$(date +%s)

    # Columns are located by header name. The fallbacks are the positions in
    # Kea's documented lease4 layout:
    #   address,hwaddr,client_id,valid_lifetime,expire,subnet_id,
    #   fqdn_fwd,fqdn_rev,hostname,state,...
    # The file is a journal, so only the last record per address counts.
    awk -F',' -v now="$now" '
        NR == 1 {
            expire_col = 5
            state_col = 10
            for (i = 1; i <= NF; i++) {
                if ($i == "expire") expire_col = i
                else if ($i == "state") state_col = i
            }
            next
        }
        NF >= state_col {
            ip = $1
            if (!(ip in seen)) {
                seen[ip] = 1
                order[++count] = ip
            }
            if ($state_col == 0 && $expire_col > now) {
                left[ip] = $expire_col - now
            } else {
                left[ip] = -1
            }
        }
        END {
            for (i = 1; i <= count; i++) {
                ip = order[i]
                if (left[ip] >= 0) print ip, left[ip]
            }
        }' "$lease_file"
}

# List configured subnets via the Kea API (subnet4-list).
# Outputs: "<subnet id>|<subnet prefix>" per subnet
# Returns: 1 when the API call fails or returns nothing; otherwise the
#          pipeline's status.
parse_kea_api_subnets() {
    local response
    # `|| return 1` keeps a curl failure from aborting the script under
    # `set -e` before the explicit status below can be returned.
    response=$(kea_api_call "subnet4-list") || return 1
    if [ -z "$response" ]; then
        return 1
    fi

    printf '%s\n' "$response" | python3 -c "
import sys, json
data = json.load(sys.stdin)
if data[0]['result'] == 0:
    for s in data[0].get('arguments', {}).get('subnets', []):
        sid = s.get('id', 0)
        subnet = s.get('subnet', '')
        print(f'{sid}|{subnet}')
" 2>/dev/null
}

# Fetch the raw statistic-get-all response from the Kea API.
# Outputs: JSON response on stdout
# Returns: 1 when the API call fails or returns nothing.
parse_kea_api_stats() {
    local response
    response=$(kea_api_call "statistic-get-all") || return 1
    if [ -z "$response" ]; then
        return 1
    fi
    printf '%s\n' "$response"
}

# ============================================================================
# METRIC COLLECTION
# ============================================================================

# Build the complete metrics document and print it on stdout.
# Globals:
#   DETECTED_BACKEND, VERSION, DHCPD_LEASES, KEA_LEASES (read)
# Notes:
#   The locals `metrics`, `subnet_count` and `total_active` are deliberately
#   visible to collect_dhcpd_metrics / collect_kea_metrics, which append to
#   and update them through dynamic scoping.
collect_metrics() {
    local start_time end_time duration elapsed_ns
    start_time=$(date +%s%N)
    local metrics=""

    # Exporter status; each header is kept directly above its own sample.
    metrics+="$(write_metric_header "dhcp_up" "gauge" "Exporter status (1=up, 0=down)")"$'\n'
    if [ "$DETECTED_BACKEND" = "unknown" ]; then
        metrics+="dhcp_up 0"$'\n'
        printf '%s\n' "$metrics"
        return 0
    fi
    metrics+="dhcp_up 1"$'\n'
    metrics+="$(write_metric_header "dhcp_exporter_info" "gauge" "Exporter version and backend")"$'\n'
    metrics+="dhcp_exporter_info{version=\"${VERSION}\",backend=\"${DETECTED_BACKEND}\"} 1"$'\n'

    local subnet_count=0
    local total_active=0
    local lease_file=""

    case "$DETECTED_BACKEND" in
        dhcpd)
            collect_dhcpd_metrics
            lease_file="$DHCPD_LEASES"
            ;;
        kea)
            collect_kea_metrics
            lease_file="$KEA_LEASES"
            ;;
    esac

    # Subnet count
    metrics+="$(write_metric_header "dhcp_subnets_total" "gauge" "Total number of configured subnets")"$'\n'
    metrics+="dhcp_subnets_total ${subnet_count}"$'\n'

    # Total active leases
    metrics+="$(write_metric_header "dhcp_leases_active_total" "gauge" "Total active leases across all subnets")"$'\n'
    metrics+="dhcp_leases_active_total ${total_active}"$'\n'

    # Lease file info (same metrics for either backend's lease file)
    if [ -n "$lease_file" ] && [ -f "$lease_file" ]; then
        local file_age file_size
        file_age=$(( $(date +%s) - $(stat -c %Y "$lease_file") ))
        file_size=$(stat -c %s "$lease_file")
        metrics+="$(write_metric_header "dhcp_lease_file_age_seconds" "gauge" "Seconds since the lease file was last modified")"$'\n'
        metrics+="dhcp_lease_file_age_seconds ${file_age}"$'\n'
        metrics+="$(write_metric_header "dhcp_lease_file_size_bytes" "gauge" "Size of the lease file")"$'\n'
        metrics+="dhcp_lease_file_size_bytes ${file_size}"$'\n'
    fi

    # Execution time, computed with shell arithmetic so it does not depend on
    # bc. If `date` cannot produce nanoseconds the stamps are not numeric and
    # the duration is reported as 0.
    end_time=$(date +%s%N)
    case "${start_time}${end_time}" in
        *[!0-9]*)
            duration=0
            ;;
        *)
            elapsed_ns=$(( end_time - start_time ))
            printf -v duration '%d.%02d' \
                $(( elapsed_ns / 1000000000 )) \
                $(( (elapsed_ns % 1000000000) / 10000000 ))
            ;;
    esac
    metrics+="$(write_metric_header "dhcp_exporter_duration_seconds" "gauge" "Script execution time")"$'\n'
    metrics+="dhcp_exporter_duration_seconds ${duration}"$'\n'
    metrics+="$(write_metric_header "dhcp_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run")"$'\n'
    metrics+="dhcp_exporter_last_run_timestamp $(date +%s)"$'\n'

    printf '%s\n' "$metrics"
}
# Append per-subnet pool metrics and DORA counters for ISC dhcpd.
# Globals (provided by the caller through dynamic scoping):
#   metrics      (appended) - metrics text being built
#   subnet_count (updated)  - incremented once per subnet
#   total_active (updated)  - incremented once per active lease inside a range
# Calls: parse_dhcpd_subnets, count_dhcpd_leases, count_dhcpd_reservations,
#        parse_dhcpd_dora, ip_to_int, write_metric_header
collect_dhcpd_metrics() {
    # Parse subnets, leases and reservations once. A failing helper yields
    # empty data instead of aborting the whole run under `set -e`.
    local subnet_data lease_data reserved dora
    subnet_data=$(parse_dhcpd_subnets) || subnet_data=""
    lease_data=$(count_dhcpd_leases) || lease_data=""
    reserved=$(count_dhcpd_reservations) || reserved=""
    # The reservation count is global to dhcpd.conf, so it is the same value
    # for every subnet and is computed outside the loop.
    reserved=${reserved:-0}

    metrics+="$(write_metric_header "dhcp_subnet_pool_total" "gauge" "Total addresses in the pool")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_active" "gauge" "Currently leased addresses")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_free" "gauge" "Available addresses in the pool")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_utilization" "gauge" "Pool utilization percentage (0-100)")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_reserved" "gauge" "Number of static reservations")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_leases_expiring" "gauge" "Leases expiring within threshold")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_lease_longest_seconds" "gauge" "Remaining time on the longest lease")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_lease_shortest_seconds" "gauge" "Remaining time on the shortest lease")"$'\n'

    # Convert each lease address to an integer once, rather than once per
    # subnet, so the cost is one ip_to_int fork per lease.
    local -a lease_ints=() lease_left=()
    local lease_ip remaining
    while read -r lease_ip remaining; do
        [ -n "$lease_ip" ] || continue
        lease_ints+=("$(ip_to_int "$lease_ip")")
        lease_left+=("$remaining")
    done <<< "$lease_data"

    local subnet name pool_total range_start range_end
    local start_int end_int active longest shortest
    local expiring_1h expiring_4h expiring_24h
    local free util pct labels esc_name i lip

    while IFS='|' read -r subnet name pool_total range_start range_end; do
        [ -n "$subnet" ] || continue
        subnet_count=$((subnet_count + 1))

        active=0
        longest=0
        shortest=999999999
        expiring_1h=0
        expiring_4h=0
        expiring_24h=0
        start_int=$(ip_to_int "$range_start")
        end_int=$(ip_to_int "$range_end")

        # Count active leases in this subnet range
        for (( i = 0; i < ${#lease_ints[@]}; i++ )); do
            lip=${lease_ints[i]}
            remaining=${lease_left[i]}
            if [ "$lip" -lt "$start_int" ] || [ "$lip" -gt "$end_int" ]; then
                continue
            fi
            active=$((active + 1))
            total_active=$((total_active + 1))
            if [ "$remaining" -gt "$longest" ]; then longest=$remaining; fi
            if [ "$remaining" -lt "$shortest" ]; then shortest=$remaining; fi
            if [ "$remaining" -le 3600 ]; then expiring_1h=$((expiring_1h + 1)); fi
            if [ "$remaining" -le 14400 ]; then expiring_4h=$((expiring_4h + 1)); fi
            if [ "$remaining" -le 86400 ]; then expiring_24h=$((expiring_24h + 1)); fi
        done

        free=$((pool_total - active))
        if [ "$free" -lt 0 ]; then free=0; fi

        # Percentage with two decimals, truncated, using integer arithmetic
        # only (no bc dependency).
        util=0
        if [ "$pool_total" -gt 0 ]; then
            pct=$(( active * 10000 / pool_total ))
            printf -v util '%d.%02d' $(( pct / 100 )) $(( pct % 100 ))
        fi
        if [ "$active" -eq 0 ]; then shortest=0; fi

        # The name comes from a free-text config comment: escape backslashes
        # and double quotes so the label value stays valid exposition format.
        esc_name=${name//\\/\\\\}
        esc_name=${esc_name//\"/\\\"}
        labels="subnet=\"${subnet}\",name=\"${esc_name}\""

        metrics+="dhcp_subnet_pool_total{${labels}} ${pool_total}"$'\n'
        metrics+="dhcp_subnet_pool_active{${labels}} ${active}"$'\n'
        metrics+="dhcp_subnet_pool_free{${labels}} ${free}"$'\n'
        metrics+="dhcp_subnet_pool_utilization{${labels}} ${util}"$'\n'
        metrics+="dhcp_subnet_pool_reserved{${labels}} ${reserved}"$'\n'
        metrics+="dhcp_subnet_leases_expiring{${labels},within=\"1h\"} ${expiring_1h}"$'\n'
        metrics+="dhcp_subnet_leases_expiring{${labels},within=\"4h\"} ${expiring_4h}"$'\n'
        metrics+="dhcp_subnet_leases_expiring{${labels},within=\"24h\"} ${expiring_24h}"$'\n'
        metrics+="dhcp_subnet_lease_longest_seconds{${labels}} ${longest}"$'\n'
        metrics+="dhcp_subnet_lease_shortest_seconds{${labels}} ${shortest}"$'\n'
    done <<< "$subnet_data"

    # DORA stats
    dora=$(parse_dhcpd_dora) || dora=""
    if [ -n "$dora" ]; then
        local discovers offers requests acks naks declines releases
        IFS='|' read -r discovers offers requests acks naks declines releases <<< "$dora"
        metrics+="$(write_metric_header "dhcp_discovers_total" "counter" "Total DHCPDISCOVER packets received")"$'\n'
        metrics+="dhcp_discovers_total ${discovers}"$'\n'
        metrics+="$(write_metric_header "dhcp_offers_total" "counter" "Total DHCPOFFER packets sent")"$'\n'
        metrics+="dhcp_offers_total ${offers}"$'\n'
        metrics+="$(write_metric_header "dhcp_requests_total" "counter" "Total DHCPREQUEST packets received")"$'\n'
        metrics+="dhcp_requests_total ${requests}"$'\n'
        metrics+="$(write_metric_header "dhcp_acks_total" "counter" "Total DHCPACK packets sent")"$'\n'
        metrics+="dhcp_acks_total ${acks}"$'\n'
        metrics+="$(write_metric_header "dhcp_naks_total" "counter" "Total DHCPNAK packets sent")"$'\n'
        metrics+="dhcp_naks_total ${naks}"$'\n'
        metrics+="$(write_metric_header "dhcp_declines_total" "counter" "Total DHCPDECLINE packets received")"$'\n'
        metrics+="dhcp_declines_total ${declines}"$'\n'
        metrics+="$(write_metric_header "dhcp_releases_total" "counter" "Total DHCPRELEASE packets received")"$'\n'
        metrics+="dhcp_releases_total ${releases}"$'\n'
    fi
}

# Append the per-subnet metric headers for Kea, then collect the samples from
# the control API or from the lease file.
# Globals: KEA_USE_API (read); metrics (appended, caller's variable)
collect_kea_metrics() {
    metrics+="$(write_metric_header "dhcp_subnet_pool_total" "gauge" "Total addresses in the pool")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_active" "gauge" "Currently leased addresses")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_free" "gauge" "Available addresses in the pool")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_utilization" "gauge" "Pool utilization percentage (0-100)")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_pool_reserved" "gauge" "Number of static reservations")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_leases_expiring" "gauge" "Leases expiring within threshold")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_lease_longest_seconds" "gauge" "Remaining time on the longest lease")"$'\n'
    metrics+="$(write_metric_header "dhcp_subnet_lease_shortest_seconds" "gauge" "Remaining time on the shortest lease")"$'\n'

    if [ "$KEA_USE_API" = "true" ]; then
        collect_kea_api_metrics
    else
        collect_kea_file_metrics
    fi
}

# Append pool sizes and packet counters taken from Kea's statistic-get-all.
# Falls back to collect_kea_file_metrics when the API gives no usable answer.
# Globals (caller's variables, via dynamic scoping):
#   metrics (appended), subnet_count and total_active (updated)
collect_kea_api_metrics() {
    local stats_json parsed
    # `|| stats_json=""`: a curl failure must reach the fallback below rather
    # than abort the script under `set -e`.
    stats_json=$(kea_api_call "statistic-get-all") || stats_json=""

    if [ -z "$stats_json" ]; then
        log_warn "Kea API not responding, falling back to file mode"
        collect_kea_file_metrics
        return
    fi

    # Flatten the statistics into key=value lines via python3
    parsed=$(printf '%s\n' "$stats_json" | python3 -c "
import sys, json
data = json.load(sys.stdin)
if data[0]['result'] == 0:
    args = data[0].get('arguments', {})
    for key, val in args.items():
        if val and isinstance(val, list):
            v = val[0][0] if isinstance(val[0], list) else val[0]
            print(f'{key}={v}')
" 2>/dev/null) || parsed=""

    if [ -z "$parsed" ]; then
        log_warn "Kea API response could not be parsed, falling back to file mode"
        collect_kea_file_metrics
        return
    fi

    # The loop reads from a here-string, not from a pipeline: a piped `while`
    # runs in a subshell and every `metrics+=` made there would be discarded.
    local key value sid stat
    while IFS='=' read -r key value; do
        case "$key" in
            subnet\[*)
                sid="${key#subnet[}"
                sid="${sid%%]*}"
                # Text after the first ']' - only the subnet-level statistics
                # are exported, so per-pool keys such as
                # subnet[1].pool[0].total-addresses do not create duplicate
                # series.
                stat="${key#*]}"
                case "$stat" in
                    .total-addresses)
                        subnet_count=$((subnet_count + 1))
                        metrics+="dhcp_subnet_pool_total{subnet=\"${sid}\"} ${value}"$'\n'
                        ;;
                    .assigned-addresses)
                        total_active=$((total_active + value))
                        metrics+="dhcp_subnet_pool_active{subnet=\"${sid}\"} ${value}"$'\n'
                        ;;
                esac
                ;;
            pkt4-discover-received)
                metrics+="$(write_metric_header "dhcp_discovers_total" "counter" "Total DHCPDISCOVER packets received")"$'\n'
                metrics+="dhcp_discovers_total ${value}"$'\n'
                ;;
            pkt4-offer-sent)
                metrics+="$(write_metric_header "dhcp_offers_total" "counter" "Total DHCPOFFER packets sent")"$'\n'
                metrics+="dhcp_offers_total ${value}"$'\n'
                ;;
            pkt4-request-received)
                metrics+="$(write_metric_header "dhcp_requests_total" "counter" "Total DHCPREQUEST packets received")"$'\n'
                metrics+="dhcp_requests_total ${value}"$'\n'
                ;;
            pkt4-ack-sent)
                metrics+="$(write_metric_header "dhcp_acks_total" "counter" "Total DHCPACK packets sent")"$'\n'
                metrics+="dhcp_acks_total ${value}"$'\n'
                ;;
            pkt4-nak-sent)
                metrics+="$(write_metric_header "dhcp_naks_total" "counter" "Total DHCPNAK packets sent")"$'\n'
                metrics+="dhcp_naks_total ${value}"$'\n'
                ;;
            pkt4-decline-received)
                metrics+="$(write_metric_header "dhcp_declines_total" "counter" "Total DHCPDECLINE packets received")"$'\n'
                metrics+="dhcp_declines_total ${value}"$'\n'
                ;;
            pkt4-release-received)
                metrics+="$(write_metric_header "dhcp_releases_total" "counter" "Total DHCPRELEASE packets received")"$'\n'
                metrics+="dhcp_releases_total ${value}"$'\n'
                ;;
        esac
    done <<< "$parsed"
}
now=$(date +%s) + + # Simple lease counting from CSV + while read -r lease_ip remaining; do + [ -z "$lease_ip" ] && continue + total_active=$((total_active + 1)) + done <<< "$lease_data" +} + +# ============================================================================ +# OUTPUT +# ============================================================================ + +output_metrics() { + local all_metrics + all_metrics=$(collect_metrics) + + case "$MODE" in + stdout) + echo "$all_metrics" + ;; + textfile) + mkdir -p "$TEXTFILE_DIR" + local tmp_file + tmp_file=$(mktemp "${TEXTFILE_DIR}/.dhcp-metrics.XXXXXX") + echo "$all_metrics" > "$tmp_file" + mv "$tmp_file" "${TEXTFILE_DIR}/dhcp-metrics.prom" + log_info "Wrote metrics to ${TEXTFILE_DIR}/dhcp-metrics.prom" + ;; + http) + run_http_server "$all_metrics" + ;; + esac +} + +run_http_server() { + log_info "Starting HTTP server on port ${HTTP_PORT}" + while true; do + local all_metrics + all_metrics=$(collect_metrics) + + { + echo -e "HTTP/1.1 200 OK\r" + echo -e "Content-Type: text/plain; version=0.0.4; charset=utf-8\r" + echo -e "Content-Length: ${#all_metrics}\r" + echo -e "\r" + echo "$all_metrics" + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null || \ + { + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n${all_metrics}" + } | nc -l "$HTTP_PORT" 2>/dev/null + + if $ONCE; then + break + fi + done +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + acquire_lock + detect_backend + log_info "Detected DHCP backend: ${DETECTED_BACKEND}" + output_metrics +} + +main "$@" diff --git a/directory-size-exporter.sh b/directory-size-exporter.sh index ff25c40..0b67038 100644 --- a/directory-size-exporter.sh +++ b/directory-size-exporter.sh @@ -9,7 +9,7 @@ # Author: Phil Connor # Contact: contact@mylinux.work # License: MIT -# Version: 1.0.0 +# Version: 1.0.1 set -euo pipefail @@ 
-27,28 +27,23 @@ TARGET_DIRECTORIES=() # ── Metrics Collection ────────────────────────────────────────────── log_verbose() { - [[ "$VERBOSE" == true ]] && echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 + [[ "$VERBOSE" == true ]] && echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 || true } log_info() { - [[ "$QUIET" == false ]] && echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 + [[ "$QUIET" == false ]] && echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 || true } collect_metrics() { local start_time start_time=$(date +%s%N) - echo "# HELP node_directory_size_bytes Disk space used by directory" - echo "# TYPE node_directory_size_bytes gauge" - echo "# HELP node_directory_filesystem_usage_percent Filesystem usage percentage for the directory mount point" - echo "# TYPE node_directory_filesystem_usage_percent gauge" - local success=1 + local size_lines="" pct_lines="" for directory in "${TARGET_DIRECTORIES[@]}"; do log_verbose "Running du for: $directory" - # Get directory size in bytes local du_output du_output=$(timeout "$TIMEOUT" du --block-size=1 --summarize "$directory" 2>/dev/null) || { log_info "WARNING: du failed for $directory" @@ -58,16 +53,24 @@ collect_metrics() { local size_bytes size_bytes=$(echo "$du_output" | awk '{print $1}') - echo "node_directory_size_bytes{directory=\"${directory}\"} ${size_bytes}" + size_lines+="node_directory_size_bytes{directory=\"${directory}\"} ${size_bytes}"$'\n' - # Get filesystem usage percentage for the mount point local pct pct=$(df --output=pcent "$directory" 2>/dev/null | tail -n 1 | tr -d ' %') if [[ "$pct" =~ ^[0-9]+$ ]]; then - echo "node_directory_filesystem_usage_percent{directory=\"${directory}\"} ${pct}" + pct_lines+="node_directory_filesystem_usage_percent{directory=\"${directory}\"} ${pct}"$'\n' fi done + echo "# HELP node_directory_size_bytes Disk space used by directory" + echo "# TYPE node_directory_size_bytes gauge" + printf "%s" "$size_lines" + + echo "" + echo "# HELP node_directory_filesystem_usage_percent 
Filesystem usage percentage for the directory mount point" + echo "# TYPE node_directory_filesystem_usage_percent gauge" + printf "%s" "$pct_lines" + # ── Script runtime ── local end_time runtime end_time=$(date +%s%N) @@ -78,10 +81,12 @@ collect_metrics() { echo "# TYPE ${EXPORTER_NAME}_duration_seconds gauge" echo "${EXPORTER_NAME}_duration_seconds ${runtime}" + echo "" echo "# HELP ${EXPORTER_NAME}_last_run_timestamp Last successful run" echo "# TYPE ${EXPORTER_NAME}_last_run_timestamp gauge" echo "${EXPORTER_NAME}_last_run_timestamp $(date +%s)" + echo "" echo "# HELP ${EXPORTER_NAME}_success Whether the exporter ran successfully" echo "# TYPE ${EXPORTER_NAME}_success gauge" echo "${EXPORTER_NAME}_success ${success}" @@ -191,8 +196,8 @@ while [[ $# -gt 0 ]]; do shift ;; --handle-request) - handle_request - exit 0 + OUTPUT_MODE="handle-request" + shift ;; -h|--help) show_help @@ -236,6 +241,10 @@ if [[ "$DRY_RUN" == true ]]; then fi case "$OUTPUT_MODE" in + handle-request) + handle_request + exit 0 + ;; stdout) collect_metrics ;; @@ -262,6 +271,6 @@ case "$OUTPUT_MODE" in fi echo "${EXPORTER_NAME} listening on port ${PORT}..." 
echo "Monitoring directories: ${TARGET_DIRECTORIES[*]}" - socat TCP-LISTEN:"$PORT",reuseaddr,fork EXEC:"$0 --handle-request" + socat TCP-LISTEN:"$PORT",reuseaddr,fork EXEC:"$0 --handle-request ${TARGET_DIRECTORIES[*]}" ;; esac diff --git a/disk-cleanup.sh b/disk-cleanup.sh new file mode 100644 index 0000000..b80387b --- /dev/null +++ b/disk-cleanup.sh @@ -0,0 +1,584 @@ +#!/usr/bin/env bash + +######################################################################################### +#### disk-cleanup.sh — Find and clean disk space hogs on Linux servers #### +#### Scans logs, temp files, package caches, old kernels, journal, and Docker cruft #### +#### Dry-run by default — nothing is deleted without --force #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./disk-cleanup.sh --scan #### +#### ./disk-cleanup.sh --clean --force #### +#### #### +#### See --help for all options. 
####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Each tunable can be overridden from the environment before the script runs.
LOG_AGE_DAYS="${LOG_AGE_DAYS:-30}"
TMP_AGE_DAYS="${TMP_AGE_DAYS:-7}"
JOURNAL_MAX="${JOURNAL_MAX:-500M}"
LARGE_FILE_MIN="${LARGE_FILE_MIN:-100M}"
LARGE_FILE_DIRS="${LARGE_FILE_DIRS:-/var /home /opt /tmp /srv}"
DRY_RUN="${DRY_RUN:-true}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
TOTAL_RECLAIMABLE=0   # bytes the scan_* functions report as reclaimable
TOTAL_CLEANED=0       # bytes the clean_* functions actually removed

# ── Colors ────────────────────────────────────────────────────────────
#######################################
# Set the colour escape variables used by the logging helpers.
# Globals:   COLOR (read); RED YELLOW BLUE CYAN BOLD DIM RESET (written)
# Colours are enabled for COLOR=always, or for any value other than
# "never" when stdout is a terminal; otherwise all variables are empty.
#######################################
setup_colors() {
    if [[ "$COLOR" == "always" ]] || [[ "$COLOR" != "never" && -t 1 ]]; then
        RED='\033[0;31m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    else
        RED="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log/verbose write to stdout, warn/err to stderr. setup_colors must have
# run first, because the colour variables are expanded under `set -u`.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; }

# Print a highlighted section title ($1) surrounded by blank lines.
section_header() {
    echo ""
    echo -e " ${BOLD}${CYAN}── $1 ──${RESET}"
    echo ""
}

#######################################
# Format a byte count with a binary unit (B, KiB, MiB, GiB).
# Arguments: $1 - non-negative integer number of bytes (default 0)
# Outputs:   the formatted size on stdout; the awk branches emit no
#            trailing newline, the plain-bytes branch does
#######################################
human_bytes() {
    local bytes="${1:-0}"
    # The value is handed to awk with -v instead of being spliced into the
    # program text, so it can never be parsed as awk code.
    if [[ "$bytes" -ge 1073741824 ]]; then
        awk -v b="$bytes" 'BEGIN { printf "%.1f GiB", b / 1073741824 }'
    elif [[ "$bytes" -ge 1048576 ]]; then
        awk -v b="$bytes" 'BEGIN { printf "%.1f MiB", b / 1048576 }'
    elif [[ "$bytes" -ge 1024 ]]; then
        awk -v b="$bytes" 'BEGIN { printf "%.1f KiB", b / 1024 }'
    else
        echo "${bytes} B"
    fi
}

# Add $1 bytes to the running reclaimable total.
add_reclaimable() {
    TOTAL_RECLAIMABLE=$((TOTAL_RECLAIMABLE + $1))
}

# Add $1 bytes to the running cleaned total.
add_cleaned() {
    TOTAL_CLEANED=$((TOTAL_CLEANED + $1))
}

# ══════════════════════════════════════════════════════════════════════
# OLD LOGS
# ══════════════════════════════════════════════════════════════════════

#######################################
# List rotated/compressed log files older than LOG_AGE_DAYS.
# All name patterns are OR-ed in a single find pass so a file that matches
# several of them (e.g. app.log.1 matches both *.[0-9] and *.log.*) is
# emitted exactly once.
# Globals:   LOG_AGE_DAYS (read)
# Arguments: $1 - directory to search
# Outputs:   NUL-delimited paths on stdout
# Returns:   always 0; find errors such as unreadable subdirectories are
#            ignored on purpose so a partial listing is still usable
#######################################
_find_old_logs() {
    local log_dir="$1"
    find "$log_dir" -type f \
        \( -name "*.gz" -o -name "*.xz" -o -name "*.bz2" -o -name "*.[0-9]" \
        -o -name "*.old" -o -name "*.log.*" \) \
        -mtime +"$LOG_AGE_DAYS" -print0 2>/dev/null || true
}

#######################################
# Report how much space old rotated logs occupy and record it as reclaimable.
# Globals:   LOG_AGE_DAYS, VERBOSE (read); TOTAL_RECLAIMABLE (written)
# Arguments: $1 - log directory (optional, default /var/log)
# Outputs:   section header, per-file lines when VERBOSE=true, one summary line
#######################################
scan_old_logs() {
    local log_dir="${1:-/var/log}"
    section_header "Old Log Files (> ${LOG_AGE_DAYS} days)"

    local total_size=0
    local count=0
    local file size

    # Process substitution keeps the loop in the current shell so the
    # counters survive past `done`.
    while IFS= read -r -d '' file; do
        size=$(stat -c%s "$file" 2>/dev/null || echo 0)
        if [[ "$size" -gt 0 ]]; then
            total_size=$((total_size + size))
            count=$((count + 1))
            if [[ "$VERBOSE" == "true" ]]; then
                printf " %10s %s\n" "$(human_bytes "$size")" "$file"
            fi
        fi
    done < <(_find_old_logs "$log_dir")

    printf " %-30s %s (%d files)\n" "Rotated/old logs:" "$(human_bytes "$total_size")" "$count"
    add_reclaimable "$total_size"
}

#######################################
# Delete old rotated logs and record the freed bytes.
# Globals:   DRY_RUN, LOG_AGE_DAYS (read); TOTAL_CLEANED (written)
# Arguments: $1 - log directory (optional, default /var/log)
# Outputs:   a dry-run notice, or a completion message (warnings on stderr)
# In dry-run mode nothing is removed and TOTAL_CLEANED is left untouched.
#######################################
clean_old_logs() {
    local log_dir="${1:-/var/log}"

    if [[ "$DRY_RUN" == "true" ]]; then
        log "[DRY-RUN] Would delete old log files in ${log_dir}"
        return
    fi

    local freed=0
    local file size

    # The loop must not sit on the right-hand side of a pipe: there it would
    # run in a subshell and the byte total would be lost, and under
    # `set -o pipefail` a non-zero find exit would abort the whole script.
    while IFS= read -r -d '' file; do
        size=$(stat -c%s "$file" 2>/dev/null || echo 0)
        if rm -f -- "$file" 2>/dev/null; then
            freed=$((freed + size))
        else
            warn "Could not remove ${file}"
        fi
    done < <(_find_old_logs "$log_dir")

    add_cleaned "$freed"
    log "Cleaned old log files"
}

# ══════════════════════════════════════════════════════════════════════
# JOURNAL
# ══════════════════════════════════════════════════════════════════════

+scan_journal() { + section_header "Systemd Journal" + + if ! command -v journalctl &>/dev/null; then + printf " %-30s %s\n" "Journal:" "N/A (no systemd)" + return + fi + + local journal_size + journal_size=$(journalctl --disk-usage 2>/dev/null | grep -oP '[\d.]+[GMKT]' | head -1 || echo "0") + + # Get bytes for tracking + local journal_bytes + journal_bytes=$(du -sb /var/log/journal/ 2>/dev/null | awk '{print $1}' || echo "0") + if [[ "$journal_bytes" -eq 0 ]]; then + journal_bytes=$(du -sb /run/log/journal/ 2>/dev/null | awk '{print $1}' || echo "0") + fi + + printf " %-30s %s\n" "Journal size:" "${journal_size:-Unknown}" + printf " %-30s %s\n" "Would vacuum to:" "$JOURNAL_MAX" + + # Estimate savings + local max_bytes=0 + local max_num="${JOURNAL_MAX%[GMKT]*}" + local max_unit="${JOURNAL_MAX: -1}" + case "$max_unit" in + G) max_bytes=$((max_num * 1073741824)) ;; + M) max_bytes=$((max_num * 1048576)) ;; + K) max_bytes=$((max_num * 1024)) ;; + *) max_bytes=$((max_num)) ;; + esac + + if [[ "$journal_bytes" -gt "$max_bytes" ]]; then + local savings=$((journal_bytes - max_bytes)) + add_reclaimable "$savings" + printf " %-30s %s\n" "Reclaimable:" "$(human_bytes "$savings")" + fi +} + +clean_journal() { + if ! 
command -v journalctl &>/dev/null; then + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY-RUN] Would vacuum journal to ${JOURNAL_MAX}" + return + fi + + journalctl --vacuum-size="$JOURNAL_MAX" 2>/dev/null || warn "Journal vacuum failed" + log "Vacuumed journal to ${JOURNAL_MAX}" +} + +# ══════════════════════════════════════════════════════════════════════ +# TEMP FILES +# ══════════════════════════════════════════════════════════════════════ + +scan_tmp() { + section_header "Temp Files (> ${TMP_AGE_DAYS} days)" + + local total_size=0 + local count=0 + + for dir in /tmp /var/tmp; do + if [[ -d "$dir" ]]; then + while IFS= read -r -d '' file; do + local size + size=$(stat -c%s "$file" 2>/dev/null || echo 0) + total_size=$((total_size + size)) + ((count++)) || true + done < <(find "$dir" -maxdepth 2 -type f -mtime +"$TMP_AGE_DAYS" -print0 2>/dev/null) + fi + done + + printf " %-30s %s (%d files)\n" "Old temp files:" "$(human_bytes "$total_size")" "$count" + add_reclaimable "$total_size" +} + +clean_tmp() { + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY-RUN] Would delete temp files older than ${TMP_AGE_DAYS} days" + return + fi + + for dir in /tmp /var/tmp; do + if [[ -d "$dir" ]]; then + find "$dir" -maxdepth 2 -type f -mtime +"$TMP_AGE_DAYS" -delete 2>/dev/null || true + fi + done + log "Cleaned temp files" +} + +# ══════════════════════════════════════════════════════════════════════ +# PACKAGE CACHE +# ══════════════════════════════════════════════════════════════════════ + +scan_package_cache() { + section_header "Package Cache" + + if command -v apt-get &>/dev/null; then + local apt_size + apt_size=$(du -sb /var/cache/apt/archives/ 2>/dev/null | awk '{print $1}' || echo "0") + printf " %-30s %s\n" "APT cache:" "$(human_bytes "$apt_size")" + add_reclaimable "$apt_size" + fi + + if command -v yum &>/dev/null || command -v dnf &>/dev/null; then + local yum_size + yum_size=$(du -sb /var/cache/yum/ /var/cache/dnf/ 2>/dev/null | awk '{total+=$1} END 
{print total+0}') + printf " %-30s %s\n" "YUM/DNF cache:" "$(human_bytes "$yum_size")" + add_reclaimable "$yum_size" + fi +} + +clean_package_cache() { + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY-RUN] Would clean package cache" + return + fi + + if command -v apt-get &>/dev/null; then + apt-get clean -y 2>/dev/null || warn "apt-get clean failed" + log "Cleaned APT cache" + fi + + if command -v dnf &>/dev/null; then + dnf clean all 2>/dev/null || warn "dnf clean failed" + log "Cleaned DNF cache" + elif command -v yum &>/dev/null; then + yum clean all 2>/dev/null || warn "yum clean failed" + log "Cleaned YUM cache" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OLD KERNELS +# ══════════════════════════════════════════════════════════════════════ + +scan_old_kernels() { + section_header "Old Kernels" + + local current_kernel + current_kernel=$(uname -r) + + local old_count=0 + local total_size=0 + + if command -v dpkg &>/dev/null; then + while IFS= read -r pkg; do + [[ -z "$pkg" ]] && continue + local pkg_version + pkg_version=$(echo "$pkg" | sed 's/linux-image-//' | sed 's/-generic//' | sed 's/-unsigned//') + if [[ "$current_kernel" != *"$pkg_version"* ]]; then + local size + size=$(dpkg-query -W --showformat='${Installed-Size}' "$pkg" 2>/dev/null || echo "0") + total_size=$((total_size + size * 1024)) + ((old_count++)) || true + verbose "Old kernel: ${pkg} ($(human_bytes $((size * 1024))))" + fi + done < <(dpkg --list 'linux-image-*' 2>/dev/null | grep '^ii' | awk '{print $2}' | grep -v "$current_kernel") + elif command -v rpm &>/dev/null; then + while IFS= read -r pkg; do + [[ -z "$pkg" ]] && continue + if [[ "$pkg" != *"$current_kernel"* ]]; then + local size + size=$(rpm -q --queryformat '%{SIZE}' "$pkg" 2>/dev/null || echo "0") + total_size=$((total_size + size)) + ((old_count++)) || true + verbose "Old kernel: ${pkg}" + fi + done < <(rpm -qa kernel 2>/dev/null) + fi + + printf " %-30s %s\n" "Current kernel:" 
"$current_kernel" + printf " %-30s %d ($(human_bytes "$total_size"))\n" "Old kernels:" "$old_count" + add_reclaimable "$total_size" +} + +# ══════════════════════════════════════════════════════════════════════ +# DOCKER CLEANUP +# ══════════════════════════════════════════════════════════════════════ + +scan_docker() { + if ! command -v docker &>/dev/null; then + return + fi + + if ! docker info &>/dev/null 2>&1; then + return + fi + + section_header "Docker" + + # Dangling images + local dangling_count + dangling_count=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l) + + printf " %-30s %d\n" "Dangling images:" "$dangling_count" + + # Stopped containers + local stopped_count + stopped_count=$(docker ps -f "status=exited" -q 2>/dev/null | wc -l) + printf " %-30s %d\n" "Stopped containers:" "$stopped_count" + + # Unused volumes + local vol_count + vol_count=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l) + printf " %-30s %d\n" "Unused volumes:" "$vol_count" + + # Build cache + if docker builder prune --dry-run 2>/dev/null | grep -q "Total:"; then + local build_cache + build_cache=$(docker builder prune --dry-run 2>/dev/null | grep "Total:" | awk '{print $2}') + printf " %-30s %s\n" "Build cache:" "${build_cache:-0}" + fi + + # Docker system df + echo "" + docker system df 2>/dev/null | while IFS= read -r line; do + printf " %s\n" "$line" + done +} + +clean_docker() { + if ! command -v docker &>/dev/null || ! 
docker info &>/dev/null 2>&1; then + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log "[DRY-RUN] Would prune Docker system (stopped containers, dangling images, unused networks, build cache)" + return + fi + + docker system prune -f 2>/dev/null || warn "Docker prune failed" + log "Pruned Docker system" +} + +# ══════════════════════════════════════════════════════════════════════ +# LARGE FILES +# ══════════════════════════════════════════════════════════════════════ + +scan_large_files() { + section_header "Large Files (> ${LARGE_FILE_MIN})" + + printf " ${BOLD}%-12s %s${RESET}\n" "SIZE" "FILE" + printf " %s\n" "$(printf '%.0s─' {1..70})" + + local count=0 + for dir in $LARGE_FILE_DIRS; do + [[ -d "$dir" ]] || continue + find "$dir" -xdev -type f -size +"$LARGE_FILE_MIN" -printf '%s %p\n' 2>/dev/null + done | sort -rn | head -20 | while IFS=' ' read -r size path; do + printf " %10s %s\n" "$(human_bytes "$size")" "$path" + ((count++)) || true + done + + if [[ "$count" -eq 0 ]]; then + echo " No files larger than ${LARGE_FILE_MIN} found" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + echo "" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + echo -e " ${BOLD}Disk Cleanup Summary${RESET}" + echo -e " ${BOLD}══════════════════════════════════════════${RESET}" + + # Current disk usage + local root_pct + root_pct=$(df / 2>/dev/null | tail -1 | awk '{print $5}' | tr -d '%') + printf " %-20s %s%%\n" "Root disk usage:" "${root_pct:-?}" + printf " %-20s %s\n" "Reclaimable:" "$(human_bytes "$TOTAL_RECLAIMABLE")" + + if [[ "$TOTAL_CLEANED" -gt 0 ]]; then + printf " %-20s %s\n" "Cleaned:" "$(human_bytes "$TOTAL_CLEANED")" + fi + + if [[ "$DRY_RUN" == "true" && "$RUN_MODE" == *"clean"* ]]; then + echo "" + echo -e " ${YELLOW}Dry-run mode — nothing was deleted${RESET}" + echo -e " Run with --force to 
actually clean" + fi + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + exit 1 ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + echo "" + echo -e "${BOLD}Disk Cleanup — $(hostname -f 2>/dev/null || hostname)${RESET}" + echo -e "Mode: ${RUN_MODE}" + if [[ "$RUN_MODE" == "clean" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + echo -e "Safety: ${YELLOW}dry-run (use --force to delete)${RESET}" + else + echo -e "Safety: ${RED}LIVE — files will be deleted${RESET}" + fi + fi + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + + case "$RUN_MODE" in + scan) + scan_old_logs + scan_journal + scan_tmp + scan_package_cache + scan_old_kernels + scan_docker + scan_large_files + print_summary + ;; + clean) + scan_old_logs + clean_old_logs + scan_journal + clean_journal + scan_tmp + clean_tmp + scan_package_cache + clean_package_cache + scan_docker + clean_docker + scan_large_files + print_summary + ;; + large-files) + scan_large_files + ;; + esac +} + +main "$@" diff --git a/disk-usage-reporter.sh b/disk-usage-reporter.sh new file mode 100755 index 0000000..669bc88 --- /dev/null +++ b/disk-usage-reporter.sh @@ -0,0 +1,451 @@ +#!/usr/bin/env bash + +######################################################################################### +#### disk-usage-reporter.sh — Find what's consuming disk space #### +#### Scans filesystems, ranks largest directories and files, flags old data #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./disk-usage-reporter.sh #### +#### ./disk-usage-reporter.sh --path /var #### +#### ./disk-usage-reporter.sh --top 
50 --min-size 100M #### +#### ./disk-usage-reporter.sh --json #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +SCAN_PATH="/" +TOP_N=20 +MIN_SIZE="1M" +MAX_DEPTH=3 +AGE_WARN=90 +JSON_MODE=false +NO_COLOR=false +VERSION="1.00" + +# Colors +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +SCRIPT_NAME="$(basename "$0")" + +# ============================================================================ +# USAGE & ARGUMENT PARSING +# ============================================================================ + +show_usage() { + cat <= 100M + ${SCRIPT_NAME} --path /home --age-warn 365 # Flag files older than 1 year + ${SCRIPT_NAME} --json # JSON output for scripting + +EOF + exit 0 +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) show_usage ;; + --path) SCAN_PATH="$2"; shift 2 ;; + --top) TOP_N="$2"; shift 2 ;; + --min-size) MIN_SIZE="$2"; shift 2 ;; + --max-depth) MAX_DEPTH="$2"; shift 2 ;; + --age-warn) AGE_WARN="$2"; shift 2 ;; + --json) JSON_MODE=true; shift ;; + --no-color) NO_COLOR=true; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac + done + if [[ "$NO_COLOR" == true ]]; then + RED="" YELLOW="" GREEN="" CYAN="" BOLD="" NC="" + fi +} + +# ============================================================================ +# HELPERS +# ============================================================================ + +header() { + echo "" + echo -e "${CYAN}====================================================${NC}" + echo -e "${CYAN} ${BOLD}${1}${NC}" + echo -e "${CYAN}====================================================${NC}" + echo "" +} + +format_bytes() { + local b="$1" + if [[ "$b" -ge 
1073741824 ]]; then + awk "BEGIN {printf \"%.2f GB\", $b/1073741824}" + elif [[ "$b" -ge 1048576 ]]; then + awk "BEGIN {printf \"%.1f MB\", $b/1048576}" + elif [[ "$b" -ge 1024 ]]; then + awk "BEGIN {printf \"%.1f KB\", $b/1024}" + else + echo "${b} B" + fi +} + +fmt_num() { + printf "%'d" "$1" 2>/dev/null || echo "$1" +} + +# Convert human-readable size (1M, 500K, 2G) to find -size argument +parse_min_size() { + echo "${MIN_SIZE}" +} + +# Convert human-readable size to bytes for comparison +size_to_bytes() { + local size="$1" + local num unit + num="$(echo "$size" | sed 's/[^0-9.]//g')" + unit="$(echo "$size" | sed 's/[0-9.]//g' | tr '[:lower:]' '[:upper:]')" + case "$unit" in + K) awk "BEGIN {printf \"%d\", $num * 1024}" ;; + M) awk "BEGIN {printf \"%d\", $num * 1048576}" ;; + G) awk "BEGIN {printf \"%d\", $num * 1073741824}" ;; + T) awk "BEGIN {printf \"%d\", $num * 1099511627776}" ;; + *) echo "$num" ;; + esac +} + +# ============================================================================ +# FILESYSTEM OVERVIEW +# ============================================================================ + +filesystem_overview() { + header "Filesystem Overview" + + printf " ${BOLD}%-30s %6s %6s %6s %5s %-20s${NC}\n" \ + "Filesystem" "Size" "Used" "Avail" "Use%" "Mounted on" + echo " ────────────────────────────────────────────────────────────────────────────────────" + + df -hP -x tmpfs -x devtmpfs -x squashfs 2>/dev/null | tail -n +2 | while IFS= read -r line; do + local fs size used avail pct mount + fs="$(echo "$line" | awk '{print $1}')" + size="$(echo "$line" | awk '{print $2}')" + used="$(echo "$line" | awk '{print $3}')" + avail="$(echo "$line" | awk '{print $4}')" + pct="$(echo "$line" | awk '{print $5}')" + mount="$(echo "$line" | awk '{print $6}')" + + local pct_num="${pct%\%}" + local color="" + if [[ "$pct_num" -ge 90 ]]; then + color="${RED}" + elif [[ "$pct_num" -ge 80 ]]; then + color="${YELLOW}" + else + color="${GREEN}" + fi + + printf " ${color}%-30s 
%6s %6s %6s %5s %-20s${NC}\n" \ + "$fs" "$size" "$used" "$avail" "$pct" "$mount" + done +} + +# ============================================================================ +# TOP DIRECTORIES BY SIZE +# ============================================================================ + +top_directories() { + header "Top ${TOP_N} Directories by Size" + + printf " ${BOLD}%4s %-60s %10s${NC}\n" "#" "Directory" "Size" + echo " ────────────────────────────────────────────────────────────────────────────────────" + + du -x --max-depth="${MAX_DEPTH}" "${SCAN_PATH}" 2>/dev/null \ + | sort -rn \ + | head -n "${TOP_N}" \ + | while IFS=$'\t' read -r size_kb dir; do + local num + num=$((COUNTER + 1)) + COUNTER=$num + local size_bytes=$((size_kb * 1024)) + local hsize + hsize="$(format_bytes "$size_bytes")" + + local color="${NC}" + if [[ "$size_bytes" -ge 10737418240 ]]; then + color="${RED}" + elif [[ "$size_bytes" -ge 1073741824 ]]; then + color="${YELLOW}" + fi + + printf " ${color}%4d %-60s %10s${NC}\n" "$num" "$dir" "$hsize" + done +} + +# ============================================================================ +# TOP FILES BY SIZE +# ============================================================================ + +top_files() { + header "Top ${TOP_N} Files by Size" + + printf " ${BOLD}%4s %-60s %10s${NC}\n" "#" "File" "Size" + echo " ────────────────────────────────────────────────────────────────────────────────────" + + find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -printf '%s\t%p\n' 2>/dev/null \ + | sort -rn \ + | head -n "${TOP_N}" \ + | awk -v idx=0 '{idx++; print idx"\t"$1"\t"$2}' \ + | while IFS=$'\t' read -r num size_bytes filepath; do + local hsize + hsize="$(format_bytes "$size_bytes")" + + local color="${NC}" + if [[ "$size_bytes" -ge 1073741824 ]]; then + color="${RED}" + elif [[ "$size_bytes" -ge 104857600 ]]; then + color="${YELLOW}" + fi + + printf " ${color}%4d %-60s %10s${NC}\n" "$num" "$filepath" "$hsize" + done +} + +# 
============================================================================ +# OLD LARGE FILES +# ============================================================================ + +old_large_files() { + header "Old Large Files (> ${MIN_SIZE}, older than ${AGE_WARN} days)" + + printf " ${BOLD}%4s %-50s %10s %12s${NC}\n" "#" "File" "Size" "Last Modified" + echo " ────────────────────────────────────────────────────────────────────────────────────" + + OLD_FILES_DATA="$(find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -mtime +"${AGE_WARN}" \ + -printf '%s\t%T+\t%p\n' 2>/dev/null \ + | sort -rn \ + | head -n "${TOP_N}")" + + OLD_FILE_COUNT=0 + OLD_FILE_BYTES=0 + + if [[ -z "$OLD_FILES_DATA" ]]; then + echo " No files found matching criteria." + return + fi + + echo "$OLD_FILES_DATA" | awk -v idx=0 '{idx++; print idx"\t"$0}' \ + | while IFS=$'\t' read -r num size_bytes mtime filepath; do + OLD_FILE_COUNT=$((OLD_FILE_COUNT + 1)) + OLD_FILE_BYTES=$((OLD_FILE_BYTES + size_bytes)) + + local hsize mdate + hsize="$(format_bytes "$size_bytes")" + mdate="$(echo "$mtime" | cut -d'+' -f1)" + + printf " ${YELLOW}%4d %-50s %10s %12s${NC}\n" "$num" "$filepath" "$hsize" "$mdate" + done +} + +# ============================================================================ +# SUMMARY +# ============================================================================ + +compute_summary() { + local total_scanned old_count old_bytes + + total_scanned="$(du -sx "${SCAN_PATH}" 2>/dev/null | awk '{print $1}')" + total_scanned=$((total_scanned * 1024)) + + old_bytes="$(find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -mtime +"${AGE_WARN}" \ + -printf '%s\n' 2>/dev/null | awk '{s+=$1} END {print s+0}')" + old_count="$(find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -mtime +"${AGE_WARN}" \ + 2>/dev/null | wc -l)" + + echo "$total_scanned" "$old_count" "$old_bytes" +} + +print_summary() { + header "Summary" + + local data total_scanned old_count old_bytes + 
data="$(compute_summary)" + total_scanned="$(echo "$data" | awk '{print $1}')" + old_count="$(echo "$data" | awk '{print $2}')" + old_bytes="$(echo "$data" | awk '{print $3}')" + + echo -e " ${BOLD}Scan path:${NC} ${SCAN_PATH}" + echo -e " ${BOLD}Total scanned:${NC} $(format_bytes "$total_scanned")" + echo -e " ${BOLD}Min file size:${NC} ${MIN_SIZE}" + echo -e " ${BOLD}Age threshold:${NC} ${AGE_WARN} days" + echo "" + echo -e " ${BOLD}Old large files:${NC} $(fmt_num "$old_count") files" + echo -e " ${BOLD}Reclaimable space:${NC} ${YELLOW}$(format_bytes "$old_bytes")${NC}" + echo "" + + if [[ "$old_bytes" -gt 0 ]]; then + echo -e " ${YELLOW}→ Review old files above — candidates for cleanup or archival${NC}" + else + echo -e " ${GREEN}✓ No old large files found${NC}" + fi + echo "" +} + +# ============================================================================ +# JSON OUTPUT +# ============================================================================ + +json_output() { + local total_scanned old_count old_bytes + local data + data="$(compute_summary)" + total_scanned="$(echo "$data" | awk '{print $1}')" + old_count="$(echo "$data" | awk '{print $2}')" + old_bytes="$(echo "$data" | awk '{print $3}')" + + echo "{" + echo " \"scan_path\": \"${SCAN_PATH}\"," + echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," + echo " \"min_size\": \"${MIN_SIZE}\"," + echo " \"age_warn_days\": ${AGE_WARN}," + echo " \"max_depth\": ${MAX_DEPTH}," + + # Filesystems + echo " \"filesystems\": [" + local fs_first=true + df -hP -x tmpfs -x devtmpfs -x squashfs 2>/dev/null | tail -n +2 | while IFS= read -r line; do + local fs size used avail pct mount + fs="$(echo "$line" | awk '{print $1}')" + size="$(echo "$line" | awk '{print $2}')" + used="$(echo "$line" | awk '{print $3}')" + avail="$(echo "$line" | awk '{print $4}')" + pct="$(echo "$line" | awk '{print $5}')" + mount="$(echo "$line" | awk '{print $6}')" + if [[ "$fs_first" == true ]]; then + fs_first=false + else + echo 
"," + fi + printf ' {"filesystem":"%s","size":"%s","used":"%s","avail":"%s","use_pct":"%s","mount":"%s"}' \ + "$fs" "$size" "$used" "$avail" "$pct" "$mount" + done + echo "" + echo " ]," + + # Top directories + echo " \"top_directories\": [" + local dir_first=true + du -x --max-depth="${MAX_DEPTH}" "${SCAN_PATH}" 2>/dev/null \ + | sort -rn | head -n "${TOP_N}" \ + | while IFS=$'\t' read -r size_kb dir; do + if [[ "$dir_first" == true ]]; then + dir_first=false + else + echo "," + fi + printf ' {"path":"%s","size_bytes":%d}' "$dir" "$((size_kb * 1024))" + done + echo "" + echo " ]," + + # Top files + echo " \"top_files\": [" + local file_first=true + find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -printf '%s\t%p\n' 2>/dev/null \ + | sort -rn | head -n "${TOP_N}" \ + | while IFS=$'\t' read -r size_bytes filepath; do + if [[ "$file_first" == true ]]; then + file_first=false + else + echo "," + fi + printf ' {"path":"%s","size_bytes":%d}' "$filepath" "$size_bytes" + done + echo "" + echo " ]," + + # Old files + echo " \"old_large_files\": [" + local old_first=true + find "${SCAN_PATH}" -xdev -type f -size +"$(parse_min_size)" -mtime +"${AGE_WARN}" \ + -printf '%s\t%T+\t%p\n' 2>/dev/null \ + | sort -rn | head -n "${TOP_N}" \ + | while IFS=$'\t' read -r size_bytes mtime filepath; do + local mdate + mdate="$(echo "$mtime" | cut -d'+' -f1)" + if [[ "$old_first" == true ]]; then + old_first=false + else + echo "," + fi + printf ' {"path":"%s","size_bytes":%d,"last_modified":"%s"}' "$filepath" "$size_bytes" "$mdate" + done + echo "" + echo " ]," + + # Summary + echo " \"summary\": {" + echo " \"total_scanned_bytes\": ${total_scanned}," + echo " \"old_file_count\": ${old_count}," + echo " \"reclaimable_bytes\": ${old_bytes}" + echo " }" + echo "}" +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + if [[ ! 
-d "$SCAN_PATH" ]]; then + echo -e "${RED}[ERROR]${NC} Path does not exist: ${SCAN_PATH}" >&2 + exit 1 + fi + + if [[ "$JSON_MODE" == true ]]; then + json_output + exit 0 + fi + + echo "" + echo -e "${BOLD}Disk Usage Report${NC}" + echo -e "$(date '+%Y-%m-%d %H:%M:%S %Z') — Scanning: ${SCAN_PATH}" + + COUNTER=0 + filesystem_overview + top_directories + top_files + old_large_files + print_summary +} + +main "$@" diff --git a/dns-lookup.sh b/dns-lookup.sh new file mode 100644 index 0000000..fa7188a --- /dev/null +++ b/dns-lookup.sh @@ -0,0 +1,429 @@ +#!/usr/bin/env bash + +######################################################################################### +#### dns-lookup.sh — Batch DNS lookups with record comparison across servers #### +#### Query multiple record types and compare results across DNS servers #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./dns-lookup.sh example.com google.com #### +#### ./dns-lookup.sh --type MX --servers 8.8.8.8,1.1.1.1 example.com #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +DNS_TIMEOUT="${DNS_TIMEOUT:-5}" +RECORD_TYPE="${RECORD_TYPE:-A}" +DNS_SERVERS="" +COMPARE="${COMPARE:-false}" +DOMAIN_FILE="" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +DOMAINS=() +COUNT_TOTAL=0 +COUNT_SUCCESS=0 +COUNT_FAILED=0 +COUNT_MISMATCH=0 +DIG_CMD="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${CYAN}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } + +# ── Helpers ─────────────────────────────────────────────────────────── +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +# ══════════════════════════════════════════════════════════════════════ +# DNS QUERY +# ══════════════════════════════════════════════════════════════════════ + +detect_dns_tool() { + if command -v dig &>/dev/null; then + DIG_CMD="dig" + elif command -v nslookup &>/dev/null; then + DIG_CMD="nslookup" + else + err 
"Neither dig nor nslookup found. Install dnsutils or bind-utils." + exit 1 + fi + verbose "Using DNS tool: ${DIG_CMD}" +} + +query_dig() { + local domain="$1" + local rtype="$2" + local server="${3:-}" + + local cmd_args=() + if [[ -n "$server" ]]; then + cmd_args+=("@${server}") + fi + cmd_args+=("$domain" "$rtype" "+short" "+time=${DNS_TIMEOUT}" "+tries=1") + + verbose "dig ${cmd_args[*]}" + dig "${cmd_args[@]}" 2>/dev/null || echo "" +} + +query_nslookup() { + local domain="$1" + local rtype="$2" + local server="${3:-}" + + local result + if [[ -n "$server" ]]; then + result=$(nslookup -type="$rtype" -timeout="$DNS_TIMEOUT" "$domain" "$server" 2>/dev/null) || result="" + else + result=$(nslookup -type="$rtype" -timeout="$DNS_TIMEOUT" "$domain" 2>/dev/null) || result="" + fi + + # Parse nslookup output — extract answer lines + echo "$result" | awk '/^Name:|^Address:|answer:/{found=1} found && /^[^ \t]/' | grep -v "^Server:" | grep -v "^Name:" | awk '{print $NF}' +} + +do_query() { + local domain="$1" + local rtype="$2" + local server="${3:-}" + + if [[ "$DIG_CMD" == "dig" ]]; then + query_dig "$domain" "$rtype" "$server" + else + query_nslookup "$domain" "$rtype" "$server" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# LOOKUP LOGIC +# ══════════════════════════════════════════════════════════════════════ + +lookup_single() { + local domain="$1" + local rtype="$2" + local server="${3:-system resolver}" + local server_arg="${3:-}" + + COUNT_TOTAL=$((COUNT_TOTAL + 1)) + + local result + result=$(do_query "$domain" "$rtype" "$server_arg") + + if [[ -z "$result" ]]; then + COUNT_FAILED=$((COUNT_FAILED + 1)) + printf " %b%-30s %-6s %-18s %s%b\n" "$RED" "$domain" "$rtype" "$server" "NO RECORDS" "$RESET" + return + fi + + COUNT_SUCCESS=$((COUNT_SUCCESS + 1)) + + # Get TTL if using dig + local ttl="--" + if [[ "$DIG_CMD" == "dig" && -n "$server_arg" ]]; then + ttl=$(dig "@${server_arg}" "$domain" "$rtype" +noall +answer 
+time="${DNS_TIMEOUT}" +tries=1 2>/dev/null \ + | awk '{print $2}' | head -1 || echo "--") + elif [[ "$DIG_CMD" == "dig" ]]; then + ttl=$(dig "$domain" "$rtype" +noall +answer +time="${DNS_TIMEOUT}" +tries=1 2>/dev/null \ + | awk '{print $2}' | head -1 || echo "--") + fi + + while IFS= read -r value; do + [[ -z "$value" ]] && continue + printf " %-30s %-6s %-8s %-18s %s\n" "$domain" "$rtype" "$ttl" "$server" "$value" + # Only print domain on first line + domain="" + ttl="" + done <<< "$result" +} + +lookup_compare() { + local domain="$1" + local rtype="$2" + local -a servers_arr + + IFS=',' read -ra servers_arr <<< "$DNS_SERVERS" + + if [[ ${#servers_arr[@]} -lt 2 ]]; then + warn "Compare mode requires at least 2 DNS servers (use --servers)" + return + fi + + local -a all_results=() + local first_result="" + + for server in "${servers_arr[@]}"; do + COUNT_TOTAL=$((COUNT_TOTAL + 1)) + + local result + result=$(do_query "$domain" "$rtype" "$server" | sort) + + if [[ -z "$result" ]]; then + COUNT_FAILED=$((COUNT_FAILED + 1)) + printf " %b%-30s %-6s %-18s %s%b\n" "$RED" "$domain" "$rtype" "$server" "NO RECORDS" "$RESET" + all_results+=("FAILED") + continue + fi + + COUNT_SUCCESS=$((COUNT_SUCCESS + 1)) + all_results+=("$result") + + if [[ -z "$first_result" ]]; then + first_result="$result" + fi + + while IFS= read -r value; do + [[ -z "$value" ]] && continue + printf " %-30s %-6s %-18s %s\n" "$domain" "$rtype" "$server" "$value" + domain="" + done <<< "$result" + done + + # Check for mismatches + local mismatch=false + for r in "${all_results[@]}"; do + if [[ "$r" != "$first_result" && "$r" != "FAILED" && "$first_result" != "FAILED" ]]; then + mismatch=true + break + fi + done + + if [[ "$mismatch" == "true" ]]; then + COUNT_MISMATCH=$((COUNT_MISMATCH + 1)) + printf " %b ⚠ MISMATCH across servers for %s%b\n" "$RED" "$1" "$RESET" + else + printf " %b ✓ Consistent across servers for %s%b\n" "$GREEN" "$1" "$RESET" + fi + echo "" +} + +# 
══════════════════════════════════════════════════════════════════════ +# INPUT PARSING +# ══════════════════════════════════════════════════════════════════════ + +parse_domain() { + local entry="$1" + entry=$(echo "$entry" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + [[ -z "$entry" || "$entry" == \#* ]] && return + DOMAINS+=("$entry") +} + +load_domains_from_file() { + local file="$1" + if [[ ! -f "$file" ]]; then + err "File not found: $file" + exit 1 + fi + while IFS= read -r line; do + parse_domain "$line" + done < "$file" +} + +load_domains_from_stdin() { + while IFS= read -r line; do + parse_domain "$line" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + exit 1 ;; + *) + parse_domain "$1"; shift ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + detect_dns_tool + + # Load domains from file if specified + if [[ -n "$DOMAIN_FILE" ]]; then + load_domains_from_file "$DOMAIN_FILE" + fi + + # Load from stdin if no domains yet and stdin is not a terminal + if [[ ${#DOMAINS[@]} -eq 0 ]] && ! 
[[ -t 0 ]]; then + load_domains_from_stdin + fi + + if [[ ${#DOMAINS[@]} -eq 0 ]]; then + err "No domains specified" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 + fi + + # Validate record type + case "$RECORD_TYPE" in + A|AAAA|MX|NS|TXT|CNAME|SOA|PTR) ;; + *) + err "Unsupported record type: ${RECORD_TYPE}" + exit 1 ;; + esac + + echo "" + echo -e "${BOLD}DNS Lookup — ${RECORD_TYPE} Records${RESET}" + echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}" + echo -e "${DIM}Tool: ${DIG_CMD} | Timeout: ${DNS_TIMEOUT}s${RESET}" + + section_header "Results" + + if [[ "$COMPARE" == "true" ]]; then + printf " ${BOLD}%-30s %-6s %-18s %s${RESET}\n" "DOMAIN" "TYPE" "SERVER" "VALUE" + printf " %s\n" "$(printf '%.0s─' {1..85})" + + for domain in "${DOMAINS[@]}"; do + lookup_compare "$domain" "$RECORD_TYPE" + done + else + # Determine servers to query + local -a servers_list + if [[ -n "$DNS_SERVERS" ]]; then + IFS=',' read -ra servers_list <<< "$DNS_SERVERS" + else + servers_list=("") + fi + + printf " ${BOLD}%-30s %-6s %-8s %-18s %s${RESET}\n" "DOMAIN" "TYPE" "TTL" "SERVER" "VALUE" + printf " %s\n" "$(printf '%.0s─' {1..90})" + + for domain in "${DOMAINS[@]}"; do + for server in "${servers_list[@]}"; do + lookup_single "$domain" "$RECORD_TYPE" "$server" + done + done + fi + + section_header "Summary" + field "Total lookups:" "$COUNT_TOTAL" + field_color "Successful:" "${GREEN}${COUNT_SUCCESS}${RESET}" + if [[ "$COUNT_FAILED" -gt 0 ]]; then + field_color "Failed:" "${RED}${COUNT_FAILED}${RESET}" + else + field "Failed:" "$COUNT_FAILED" + fi + if [[ "$COMPARE" == "true" ]]; then + if [[ "$COUNT_MISMATCH" -gt 0 ]]; then + field_color "Mismatches:" "${RED}${COUNT_MISMATCH}${RESET}" + else + field_color "Mismatches:" "${GREEN}0${RESET}" + fi + fi + + echo "" +} + +main "$@" diff --git a/dns-propagation-checker.sh b/dns-propagation-checker.sh new file mode 100755 index 0000000..0cc4ba1 --- /dev/null +++ b/dns-propagation-checker.sh @@ -0,0 +1,350 @@ +#!/usr/bin/env 
bash + +######################################################################################### +#### dns-propagation-checker.sh — Check DNS propagation across public resolvers #### +#### Queries Cloudflare, Google, Quad9, OpenDNS, compares results #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./dns-propagation-checker.sh example.com #### +#### ./dns-propagation-checker.sh example.com --type MX #### +#### ./dns-propagation-checker.sh example.com --watch 30 #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +DOMAIN="" +RECORD_TYPE="A" +TIMEOUT=5 +COLOR="auto" +JSON_OUTPUT="false" +WATCH_INTERVAL=0 +EXPECTED="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME + +# ── Built-in Resolvers ─────────────────────────────────────────────── +RESOLVER_NAMES=("Cloudflare" "Google" "Quad9" "OpenDNS" "Cloudflare-2" "Google-2") +RESOLVER_IPS=("1.1.1.1" "8.8.8.8" "9.9.9.9" "208.67.222.222" "1.0.0.1" "8.8.4.4") +CUSTOM_RESOLVERS=() + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BOLD="" DIM="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } + +# 
══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + exit 1 ;; + *) + if [[ -z "$DOMAIN" ]]; then + DOMAIN="$1" + else + err "Unexpected argument: $1" + exit 1 + fi + shift ;; + esac + done + + if [[ -z "$DOMAIN" ]]; then + err "Domain name is required" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 + fi + + local valid_types="A AAAA MX CNAME TXT NS SOA PTR" + if [[ ! " $valid_types " =~ " $RECORD_TYPE " ]]; then + err "Invalid record type: $RECORD_TYPE" + exit 1 + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# DNS QUERY +# ══════════════════════════════════════════════════════════════════════ + +query_resolver() { + local resolver_ip="$1" domain="$2" rtype="$3" timeout="$4" + local output ttl_output answer ttl + + output=$(dig +time="$timeout" +tries=1 +short "@${resolver_ip}" "$domain" "$rtype" 2>/dev/null) || true + ttl_output=$(dig +time="$timeout" +tries=1 +noall +answer "@${resolver_ip}" "$domain" "$rtype" 2>/dev/null) || true + answer=$(echo "$output" | tr '\n' ' ' | sed 's/ *$//') + ttl=$(echo "$ttl_output" | awk 'NR==1{print $2}') + + if [[ -z "$answer" ]]; then + echo "FAIL||"; return + fi + echo "${answer}|${ttl:-?}|" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAJORITY ANSWER +# ══════════════════════════════════════════════════════════════════════ + +find_majority() { + local -n answers_ref=$1 + local -A counts + local max_count=0 majority="" + for answer in "${answers_ref[@]}"; do + [[ "$answer" == "FAIL" ]] && continue + counts["$answer"]=$(( ${counts["$answer"]:-0} + 1 )) + if [[ ${counts["$answer"]} -gt $max_count ]]; then + max_count=${counts["$answer"]}; majority="$answer" + fi + done + echo "$majority" +} + +# ══════════════════════════════════════════════════════════════════════ +# RUN CHECK +# 
══════════════════════════════════════════════════════════════════════ + +run_check() { + local all_names=("${RESOLVER_NAMES[@]}") + local all_ips=("${RESOLVER_IPS[@]}") + + for custom in "${CUSTOM_RESOLVERS[@]}"; do + all_names+=("Custom-${custom}") + all_ips+=("$custom") + done + + local total=${#all_names[@]} + local answers=() ttls=() + + for i in $(seq 0 $(( total - 1 ))); do + local result + result=$(query_resolver "${all_ips[$i]}" "$DOMAIN" "$RECORD_TYPE" "$TIMEOUT") + answers+=("$(echo "$result" | cut -d'|' -f1)") + ttls+=("$(echo "$result" | cut -d'|' -f2)") + done + + local majority + majority=$(find_majority answers) + local compare_to="${EXPECTED:-$majority}" + local agree_count=0 statuses=() + + for i in $(seq 0 $(( total - 1 ))); do + if [[ "${answers[$i]}" == "FAIL" ]]; then + statuses+=("FAIL") + elif [[ "${answers[$i]}" == "$compare_to" ]]; then + statuses+=("MATCH"); agree_count=$((agree_count + 1)) + else + statuses+=("MISMATCH") + fi + done + + if [[ "$JSON_OUTPUT" == "true" ]]; then + print_json all_names all_ips answers ttls statuses "$agree_count" "$total" "$majority" + else + print_table all_names all_ips answers ttls statuses "$agree_count" "$total" "$majority" "$compare_to" + fi + [[ "$agree_count" -eq "$total" ]] && return 0 || return 1 +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT: TABLE +# ══════════════════════════════════════════════════════════════════════ + +print_table() { + local -n names_ref=$1 ips_ref=$2 ans_ref=$3 ttl_ref=$4 stat_ref=$5 + local agree="$6" total="$7" majority="$8" compare="$9" + + echo "" + echo -e "${BOLD}DNS Propagation Check — ${DOMAIN} (${RECORD_TYPE})${RESET}" + echo -e "${DIM}$(date -u '+%Y-%m-%d %H:%M:%S UTC')${RESET}" + echo "" + printf " ${BOLD}%-20s %-17s %-22s %-6s %s${RESET}\n" "RESOLVER" "IP" "RESULT" "TTL" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + local count=${#names_ref[@]} + for i in $(seq 0 $(( count - 1 ))); do + local color status_str 
+ case "${stat_ref[$i]}" in + MATCH) color="$GREEN"; status_str="MATCH" ;; + MISMATCH) color="$YELLOW"; status_str="MISMATCH" ;; + FAIL) color="$RED"; status_str="FAIL" ;; + esac + + local display_answer="${ans_ref[$i]}" + if [[ ${#display_answer} -gt 20 ]]; then + display_answer="${display_answer:0:17}..." + fi + + printf " %-20s %-17s %b%-22s%b %-6s %b%s%b\n" \ + "${names_ref[$i]}" \ + "${ips_ref[$i]}" \ + "$color" "$display_answer" "$RESET" \ + "${ttl_ref[$i]}" \ + "$color" "$status_str" "$RESET" + done + + echo "" + echo -e " ${BOLD}Summary${RESET}" + if [[ -n "$EXPECTED" ]]; then + printf " %-20s %s\n" "Expected answer:" "$EXPECTED" + fi + printf " %-20s %s\n" "Majority answer:" "${majority:-N/A}" + printf " %-20s %s\n" "Agree:" "${agree}/${total} resolvers" + + if [[ "$agree" -eq "$total" ]]; then + printf " %-20s " "Status:"; echo -e "${GREEN}PROPAGATION COMPLETE${RESET}" + else + printf " %-20s " "Status:"; echo -e "${YELLOW}PROPAGATION PENDING${RESET}" + fi + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT: JSON +# ══════════════════════════════════════════════════════════════════════ + +print_json() { + local -n jnames=$1 jips=$2 jans=$3 jttls=$4 jstats=$5 + local agree="$6" total="$7" majority="$8" + local count=${#jnames[@]} propagated="false" + [[ "$agree" -eq "$total" ]] && propagated="true" + + printf '{"domain":"%s","type":"%s","timestamp":"%s","results":[' \ + "$DOMAIN" "$RECORD_TYPE" "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + for i in $(seq 0 $(( count - 1 ))); do + [[ $i -gt 0 ]] && printf ',' + local escaped_answer + escaped_answer=$(echo "${jans[$i]}" | sed 's/"/\\"/g') + printf '{"resolver":"%s","ip":"%s","answer":"%s","ttl":"%s","status":"%s"}' \ + "${jnames[$i]}" "${jips[$i]}" "$escaped_answer" "${jttls[$i]}" "${jstats[$i]}" + done + local escaped_majority + escaped_majority=$(echo "$majority" | sed 's/"/\\"/g') + printf '],"summary":{"majority":"%s","agree":%d,"total":%d,"propagated":%s}}\n' \ + 
"$escaped_majority" "$agree" "$total" "$propagated" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + if ! command -v dig &>/dev/null; then + err "dig is required but not found. Install dnsutils (Debian/Ubuntu) or bind-utils (RHEL/CentOS)." + exit 1 + fi + + if [[ "$WATCH_INTERVAL" -gt 0 ]]; then + local cycle=1 + while true; do + if [[ "$JSON_OUTPUT" != "true" ]]; then + [[ $cycle -gt 1 ]] && echo -e "${DIM}────────────────────────────────────────────────${RESET}" + echo -e "${DIM}Watch cycle ${cycle} — checking every ${WATCH_INTERVAL}s (Ctrl+C to stop)${RESET}" + fi + if run_check; then + [[ "$JSON_OUTPUT" != "true" ]] && echo -e " ${GREEN}All resolvers agree. Propagation complete.${RESET}\n" + exit 0 + fi + cycle=$((cycle + 1)) + sleep "$WATCH_INTERVAL" + done + else + run_check && exit 0 || exit 1 + fi +} + +main "$@" diff --git a/dns-smoke-tests.sh b/dns-smoke-tests.sh new file mode 100644 index 0000000..7959749 --- /dev/null +++ b/dns-smoke-tests.sh @@ -0,0 +1,500 @@ +#!/usr/bin/env bash + +##################################################################################### +#### dns-smoke-tests.sh — Verify DNS infrastructure is healthy #### +#### Checks resolution, zone transfers, SOA, DNSSEC, response time, DoT. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./dns-smoke-tests.sh #### +#### DNS_SERVER=192.168.1.1 DOMAIN=example.com ./dns-smoke-tests.sh #### +#### #### +#### See --help for all options. 
#### +##################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +DNS_SERVER="${DNS_SERVER:-}" +DOMAIN="${DOMAIN:-example.com}" +REVERSE_IP="${REVERSE_IP:-}" +ZONE="${ZONE:-}" +ZONE_MASTER="${ZONE_MASTER:-}" +DNSSEC_DOMAIN="${DNSSEC_DOMAIN:-}" +DOT_SERVER="${DOT_SERVER:-}" +MAX_RESPONSE_MS="${MAX_RESPONSE_MS:-500}" +TEST_RECORDS="${TEST_RECORDS:-}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0; FAIL=0; SKIP=0; TOTAL=0 +RESULTS=() +START_TIME="" + +# ── Dig tool detection ─────────────────────────────────────────────── +DIG_CMD="" +detect_dig() { + if command -v dig >/dev/null 2>&1; then + DIG_CMD="dig" + elif command -v drill >/dev/null 2>&1; then + DIG_CMD="drill" + else + err "Neither dig nor drill found. Install dnsutils or ldns." + exit 1 + fi + verbose "Using ${DIG_CMD} for DNS queries" +} + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording ───────────────────────────────────────────── +record_pass() { + local name="$1" detail="${2:-}" + ((PASS++)) || true; ((TOTAL++)) || true + 
RESULTS+=("PASS|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name}${detail:+ (${detail})}" + else echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_fail() { + local name="$1" detail="${2:-}" + ((FAIL++)) || true; ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_skip() { + local name="$1" reason="${2:-}" + ((SKIP++)) || true; ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"; fi +} + +# ── Helpers ─────────────────────────────────────────────────────────── +has_cmd() { command -v "$1" >/dev/null 2>&1; } + +# Build dig command with optional @server +dig_cmd() { + if [[ -n "$DNS_SERVER" ]]; then + "$DIG_CMD" "@${DNS_SERVER}" "$@" + else + "$DIG_CMD" "$@" + fi +} + +# ── Output Functions ────────────────────────────────────────────────── +section_header() { + local name="$1" + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + echo -e "${BOLD}${name}${RESET}" + fi +} + +print_header() { + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + echo -e "${BOLD}DNS Smoke Tests${RESET}" + echo "Domain: ${DOMAIN}" + [[ -n "$DNS_SERVER" ]] && echo "Server: ${DNS_SERVER}" || echo "Server: (system resolver)" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + fi +} + +print_tap_header() { + echo "TAP version 13" +} + +print_summary() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${DOMAIN} ${DNS_SERVER:-(system resolver)}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} 
${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + if [[ $FAIL -eq 0 ]]; then echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"; fi +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +# ══════════════════════════════════════════════════════════════════════ +# TESTS +# ══════════════════════════════════════════════════════════════════════ + +# ── 1. Resolver Reachable ───────────────────────────────────────────── +test_resolver_reachable() { + section_header "Connectivity" + local output + output=$(dig_cmd +short +time=5 +tries=1 "${DOMAIN}" A 2>&1) || true + if [[ -n "$output" ]] && ! echo "$output" | grep -qi "timed out\|connection refused\|no servers\|SERVFAIL"; then + record_pass "Resolver reachable" "${DNS_SERVER:-(system resolver)}" + else + record_fail "Resolver reachable" "${DNS_SERVER:-(system resolver)} — ${output:-no response}" + fi +} + +# ── 2. Forward Resolution (A) ──────────────────────────────────────── +test_forward_resolution() { + section_header "Resolution" + local output + output=$(dig_cmd +short "${DOMAIN}" A 2>/dev/null) || true + if [[ -n "$output" ]]; then + local first_ip + first_ip=$(echo "$output" | head -1) + record_pass "Forward resolution (${DOMAIN} A)" "${first_ip}" + else + record_fail "Forward resolution (${DOMAIN} A)" "no A record returned" + fi +} + +# ── 3. AAAA Resolution ─────────────────────────────────────────────── +test_aaaa_resolution() { + local output + output=$(dig_cmd +short "${DOMAIN}" AAAA 2>/dev/null) || true + if [[ -n "$output" ]]; then + local first_ip + first_ip=$(echo "$output" | head -1) + record_pass "AAAA resolution (${DOMAIN})" "${first_ip}" + else + record_skip "AAAA resolution (${DOMAIN})" "no AAAA record" + fi +} + +# ── 4. 
MX Resolution ───────────────────────────────────────────────── +test_mx_resolution() { + local output + output=$(dig_cmd +short "${DOMAIN}" MX 2>/dev/null) || true + if [[ -n "$output" ]]; then + local first_mx + first_mx=$(echo "$output" | head -1) + record_pass "MX resolution (${DOMAIN})" "${first_mx}" + else + record_skip "MX resolution (${DOMAIN})" "no MX record" + fi +} + +# ── 5. Reverse Lookup ──────────────────────────────────────────────── +test_reverse_lookup() { + if [[ -z "$REVERSE_IP" ]]; then + record_skip "Reverse lookup" "REVERSE_IP not set" + return + fi + local output + output=$(dig_cmd +short -x "${REVERSE_IP}" 2>/dev/null) || true + if [[ -n "$output" ]]; then + record_pass "Reverse lookup (${REVERSE_IP})" "${output}" + else + record_fail "Reverse lookup (${REVERSE_IP})" "no PTR record returned" + fi +} + +# ── 6. Response Time ───────────────────────────────────────────────── +test_response_time() { + section_header "Performance" + local output query_time + output=$(dig_cmd "${DOMAIN}" A 2>/dev/null) || true + # dig outputs "Query time: 12 msec" or ";; Query time: 12 msec" + query_time=$(echo "$output" | grep -i "query time" | grep -oP '[0-9]+' | head -1) || true + if [[ -z "$query_time" ]]; then + # drill outputs ";; Query time: 0 msec" + query_time=$(echo "$output" | grep -i "query time" | awk '{print $4}') || true + fi + if [[ -n "$query_time" ]]; then + if [[ "$query_time" -le "$MAX_RESPONSE_MS" ]]; then + record_pass "Response time" "${query_time}ms (<= ${MAX_RESPONSE_MS}ms)" + else + record_fail "Response time" "${query_time}ms (> ${MAX_RESPONSE_MS}ms)" + fi + else + record_fail "Response time" "could not parse query time" + fi +} + +# ── 7. 
Authoritative Answer ────────────────────────────────────────── +test_authoritative_answer() { + section_header "Authority" + local output + output=$(dig_cmd "${DOMAIN}" A 2>/dev/null) || true + if echo "$output" | grep -q "flags:.*aa"; then + record_pass "Authoritative answer (${DOMAIN})" "AA flag set" + else + record_fail "Authoritative answer (${DOMAIN})" "AA flag not set — server is not authoritative" + fi +} + +# ── 8. SOA Serial ──────────────────────────────────────────────────── +test_soa_serial() { + local output serial + output=$(dig_cmd +short "${DOMAIN}" SOA 2>/dev/null) || true + if [[ -z "$output" ]]; then + record_fail "SOA serial (${DOMAIN})" "no SOA record returned" + return + fi + # SOA format: ns1.example.com. admin.example.com. 2026051201 3600 900 604800 86400 + serial=$(echo "$output" | awk '{print $3}') || true + if [[ -z "$serial" ]]; then + record_fail "SOA serial (${DOMAIN})" "could not parse serial" + elif [[ "$serial" == "0" ]]; then + record_fail "SOA serial (${DOMAIN})" "serial is 0" + else + record_pass "SOA serial (${DOMAIN})" "${serial}" + fi +} + +# ── 9. 
SOA Consistency ─────────────────────────────────────────────── +test_soa_consistency() { + if [[ -z "$ZONE_MASTER" ]]; then + record_skip "SOA consistency" "ZONE_MASTER not set" + return + fi + local serial_local serial_master + # Get serial from configured server + serial_local=$(dig_cmd +short "${DOMAIN}" SOA 2>/dev/null | awk '{print $3}') || true + # Get serial from master + serial_master=$("$DIG_CMD" "@${ZONE_MASTER}" +short "${DOMAIN}" SOA 2>/dev/null | awk '{print $3}') || true + if [[ -z "$serial_local" || -z "$serial_master" ]]; then + record_fail "SOA consistency" "could not retrieve serials (local=${serial_local:-?}, master=${serial_master:-?})" + return + fi + if [[ "$serial_local" == "$serial_master" ]]; then + record_pass "SOA consistency" "serial ${serial_local} matches across servers" + else + record_fail "SOA consistency" "serial mismatch — local=${serial_local}, master=${serial_master}" + fi +} + +# ── 10. Zone Transfer ──────────────────────────────────────────────── +test_zone_transfer() { + section_header "Zone Transfer" + if [[ -z "$ZONE" ]]; then + record_skip "Zone transfer (AXFR)" "ZONE not set" + return + fi + local output exit_code=0 + output=$(dig_cmd AXFR "${ZONE}" 2>&1) || exit_code=$? + # Check if transfer returned records + local record_count + record_count=$(echo "$output" | grep -c "^${ZONE}" 2>/dev/null) || record_count=0 + if [[ $record_count -gt 0 ]]; then + record_pass "Zone transfer (${ZONE})" "${record_count} records transferred" + elif echo "$output" | grep -qi "transfer failed\|refused\|REFUSED"; then + record_pass "Zone transfer (${ZONE})" "AXFR refused (expected on production)" + else + record_fail "Zone transfer (${ZONE})" "transfer failed — ${output:0:100}" + fi +} + +# ── 11. 
# DNSSEC Validation ────────────────────────────────────────────
# Checks that the resolver sets the AD (authenticated data) header flag when
# answering for ${DNSSEC_DOMAIN}; skipped when no signed domain is configured.
#   Globals: DNSSEC_DOMAIN (read)
#   Uses:    section_header, dig_cmd, record_pass, record_fail, record_skip
test_dnssec_validation() {
    section_header "DNSSEC"
    if [[ -z "$DNSSEC_DOMAIN" ]]; then
        record_skip "DNSSEC validation" "DNSSEC_DOMAIN not set"
        return
    fi
    local full_output
    full_output=$(dig_cmd +dnssec "${DNSSEC_DOMAIN}" A 2>/dev/null) || true
    # Match "ad" only as a whole token inside the header flag list.
    if grep -Eq 'flags:[^;]*[[:space:]]ad([[:space:];]|$)' <<<"$full_output"; then
        record_pass "DNSSEC validation (${DNSSEC_DOMAIN})" "AD flag set"
        return
    fi
    # No AD flag: a short query distinguishes "answered but unvalidated" from
    # "no answer at all" for the failure message.
    local output
    output=$(dig_cmd +dnssec +short "${DNSSEC_DOMAIN}" A 2>/dev/null) || true
    if [[ -n "$output" ]]; then
        record_fail "DNSSEC validation (${DNSSEC_DOMAIN})" "response received but AD flag not set"
    else
        record_fail "DNSSEC validation (${DNSSEC_DOMAIN})" "no response"
    fi
}

# ── 12. DNS-over-TLS ─────────────────────────────────────────────────
# Performs a TLS handshake against ${DOT_SERVER}:853 with `openssl s_client`
# and reports the server certificate CN when it can be extracted.
#   Globals: DOT_SERVER (read)
#   Uses:    section_header, has_cmd, record_pass, record_fail, record_skip
test_dot() {
    section_header "DNS-over-TLS"
    if [[ -z "$DOT_SERVER" ]]; then
        record_skip "DNS-over-TLS" "DOT_SERVER not set"
        return
    fi
    if ! has_cmd openssl; then
        record_skip "DNS-over-TLS" "openssl not installed"
        return
    fi
    local output
    # stdin is /dev/null so s_client quits as soon as the handshake is over.
    output=$(openssl s_client -connect "${DOT_SERVER}:853" \
        -servername "${DOT_SERVER}" </dev/null 2>&1) || true
    # s_client prints "CONNECTED(...)" right after the TCP connect, before any
    # TLS exchange, so that banner proves nothing about the handshake. The
    # session summary does: it names the negotiated cipher, or "(NONE)" when
    # the handshake failed.
    if grep -q 'Cipher is' <<<"$output" && ! grep -q 'Cipher is (NONE)' <<<"$output"; then
        # Prefer the CN from the "subject=" line (the server certificate);
        # fall back to the first CN seen anywhere. awk is used instead of
        # `grep -P`, which BusyBox and BSD grep do not provide.
        local cn
        cn=$(awk '
            function cn_of(line,    s) {
                if (!match(line, /CN[ \t]*=[ \t]*/)) return ""
                s = substr(line, RSTART + RLENGTH)
                sub(/[,\/].*$/, "", s)
                return s
            }
            /^[ \t]*subject=/ {
                subj = cn_of($0)
                if (subj != "") { print subj; found = 1; exit }
            }
            first == "" { first = cn_of($0) }
            END { if (!found && first != "") print first }
        ' <<<"$output") || true
        record_pass "DNS-over-TLS (${DOT_SERVER}:853)" "TLS handshake OK${cn:+ — CN=${cn}}"
    else
        record_fail "DNS-over-TLS (${DOT_SERVER}:853)" "TLS handshake failed"
    fi
}

# ── 13.
Custom Record Checks ───────────────────────────────────────── +test_custom_records() { + if [[ -z "$TEST_RECORDS" ]]; then return; fi + section_header "Custom Records" + local IFS=',' + for entry in $TEST_RECORDS; do + local name type expected + name=$(echo "$entry" | cut -d: -f1) + type=$(echo "$entry" | cut -d: -f2) + expected=$(echo "$entry" | cut -d: -f3-) + if [[ -z "$name" || -z "$type" ]]; then + record_fail "Custom record" "invalid entry: ${entry}" + continue + fi + local output + output=$(dig_cmd +short "${name}" "${type}" 2>/dev/null) || true + if [[ -z "$output" ]]; then + record_fail "Custom record (${name} ${type})" "no record returned" + elif [[ -n "$expected" ]]; then + if echo "$output" | grep -q "$expected"; then + record_pass "Custom record (${name} ${type})" "${output}" + else + record_fail "Custom record (${name} ${type})" "expected '${expected}', got '${output}'" + fi + else + record_pass "Custom record (${name} ${type})" "${output}" + fi + done +} + +# ── 14. Recursive Resolution ───────────────────────────────────────── +test_recursive_resolution() { + section_header "Recursion" + local output + output=$(dig_cmd +short "google.com" A 2>/dev/null) || true + if [[ -n "$output" ]]; then + local first_ip + first_ip=$(echo "$output" | head -1) + record_pass "Recursive resolution (google.com)" "${first_ip}" + else + record_fail "Recursive resolution (google.com)" "could not resolve external domain" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording ───────────────────────────────────────────── +record_pass() { + local name="$1" detail="${2:-}" + ((PASS++)) || true; ((TOTAL++)) || true + RESULTS+=("PASS|${name}|${detail}") + if [[ 
"$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name}${detail:+ (${detail})}" + else echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_fail() { + local name="$1" detail="${2:-}" + ((FAIL++)) || true; ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"; fi +} + +record_skip() { + local name="$1" reason="${2:-}" + ((SKIP++)) || true; ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"; fi +} + +# ── Helpers ─────────────────────────────────────────────────────────── +has_cmd() { command -v "$1" >/dev/null 2>&1; } + +remove_container() { + local name="$1" + docker rm -f "$name" >/dev/null 2>&1 || true +} + +section() { + if [[ "$OUTPUT_FORMAT" != "tap" ]]; then echo ""; echo -e "${BOLD}$1${RESET}"; fi +} + +# ── Cleanup ─────────────────────────────────────────────────────────── +# shellcheck disable=SC2317 +cleanup() { + verbose "Cleaning up test artifacts..." + remove_container "$SMOKE_CONTAINER" + remove_container "$SMOKE_PORT_CONTAINER" + remove_container "$SMOKE_DNS_CONTAINER" + remove_container "$SMOKE_VOL_CONTAINER" + remove_container "$SMOKE_NET_CONTAINER" + remove_container "$SMOKE_MEM_CONTAINER" + docker volume rm -f "$SMOKE_VOLUME" >/dev/null 2>&1 || true + docker network rm "$SMOKE_NETWORK" >/dev/null 2>&1 || true + docker rmi -f "$SMOKE_BUILD_TAG" >/dev/null 2>&1 || true +} +trap cleanup EXIT + +# ══════════════════════════════════════════════════════════════════════ +# TEST FUNCTIONS +# ══════════════════════════════════════════════════════════════════════ + +# ── 1. 
# Docker daemon running ─────────────────────────────────────────
# Passes when any available probe shows the daemon is up: systemd, the SysV
# `service` wrapper, or a successful `docker info`. The probes are tried in
# that order, so a host where systemctl exists but Docker is not a systemd
# unit (rootless, Docker Desktop, remote DOCKER_HOST) is still detected.
#   Uses: has_cmd, record_pass, record_fail
test_daemon_running() {
    local label="Docker daemon running"
    if has_cmd systemctl && systemctl is-active --quiet docker 2>/dev/null; then
        record_pass "$label" "systemctl active"
    elif has_cmd service && service docker status >/dev/null 2>&1; then
        record_pass "$label" "service running"
    elif docker info >/dev/null 2>&1; then
        record_pass "$label" "docker info ok"
    elif has_cmd systemctl; then
        record_fail "$label" "systemctl inactive"
    elif has_cmd service; then
        record_fail "$label" "service stopped"
    else
        record_fail "$label" "cannot determine status"
    fi
}

# ── 2. Docker API responsive ─────────────────────────────────────────
# Runs `docker info` (bounded to 10 s when `timeout` is installed) and reports
# the server version found in its output.
#   Uses: has_cmd, record_pass, record_fail
test_api_responsive() {
    local output rc=0
    if has_cmd timeout; then
        output=$(timeout 10 docker info 2>&1) || rc=$?
    else
        # Without `timeout` the check still runs, just unbounded; a missing
        # helper must not be reported as an unresponsive API.
        output=$(docker info 2>&1) || rc=$?
    fi
    if ((rc != 0)); then
        record_fail "Docker API responsive" "docker info timed out or failed"
        return
    fi
    local ver
    ver=$(awk 'tolower($0) ~ /server version/ { print $NF; exit }' <<<"$output") || true
    record_pass "Docker API responsive" "server ${ver:-unknown}"
}

# ── 3. Docker socket accessible ──────────────────────────────────────
# Checks that the daemon's unix socket exists and is readable and writable by
# the current user. A unix:// DOCKER_HOST overrides the default socket path.
# When no socket file is found, a working `docker info` still counts as a
# pass (TCP or otherwise non-default endpoint).
#   Globals: DOCKER_HOST (read, optional)
#   Uses:    record_pass, record_fail
test_socket_accessible() {
    local label="Docker socket accessible"
    local socket="/var/run/docker.sock"
    if [[ "${DOCKER_HOST:-}" == unix://* ]]; then
        socket="${DOCKER_HOST#unix://}"
    fi
    if [[ -S "$socket" ]]; then
        if [[ -r "$socket" && -w "$socket" ]]; then
            record_pass "$label" "$socket"
        else
            record_fail "$label" "$socket not readable/writable"
        fi
    elif docker info >/dev/null 2>&1; then
        record_pass "$label" "non-default socket"
    else
        record_fail "$label" "$socket not found"
    fi
}

# ── 4. Container lifecycle ───────────────────────────────────────────
# Walks one container through create → start → exec → stop → rm and reports
# the first step that fails.
#   Globals: SKIP_LIFECYCLE, SMOKE_CONTAINER, TEST_IMAGE (read)
#   Uses:    remove_container, record_pass, record_fail, record_skip
test_container_lifecycle() {
    local label="Container lifecycle"
    if [[ "$SKIP_LIFECYCLE" == "true" ]]; then
        record_skip "$label" "SKIP_LIFECYCLE=true"
        return
    fi
    remove_container "$SMOKE_CONTAINER"

    local failed_step="" exec_output=""
    if ! docker create --name "$SMOKE_CONTAINER" "$TEST_IMAGE" sleep 30 >/dev/null 2>&1; then
        failed_step="create"
    elif ! docker start "$SMOKE_CONTAINER" >/dev/null 2>&1; then
        failed_step="start"
    elif ! exec_output=$(docker exec "$SMOKE_CONTAINER" echo "smoke-ok" 2>&1) ||
        [[ "$exec_output" != "smoke-ok" ]]; then
        failed_step="exec"
    elif ! docker stop -t 5 "$SMOKE_CONTAINER" >/dev/null 2>&1; then
        failed_step="stop"
    elif ! docker rm "$SMOKE_CONTAINER" >/dev/null 2>&1; then
        failed_step="rm"
    fi

    if [[ -n "$failed_step" ]]; then
        # Do not leave a half-finished container behind for later tests.
        remove_container "$SMOKE_CONTAINER"
        record_fail "$label" "docker ${failed_step} failed"
        return
    fi
    record_pass "$label" "create/start/exec/stop/rm"
}

# ── 5. Port binding ──────────────────────────────────────────────────
# Starts a tiny HTTP responder in a container published on ${SMOKE_PORT} and
# fetches it from the host with curl.
#   Globals: SKIP_LIFECYCLE, SMOKE_PORT_CONTAINER, SMOKE_PORT, TEST_IMAGE (read)
#   Uses:    has_cmd, remove_container, record_pass, record_fail, record_skip
test_port_binding() {
    local label="Port binding"
    if [[ "$SKIP_LIFECYCLE" == "true" ]]; then
        record_skip "$label" "SKIP_LIFECYCLE=true"
        return
    fi
    if ! has_cmd curl; then
        record_skip "$label" "curl not installed"
        return
    fi
    remove_container "$SMOKE_PORT_CONTAINER"

    # The container serves "smoke-ok" via busybox httpd, falling back to an
    # nc loop when httpd is not available in the image.
    if ! docker run -d --name "$SMOKE_PORT_CONTAINER" \
        -p "${SMOKE_PORT}:80" \
        "$TEST_IMAGE" sh -c 'mkdir -p /var/www && echo "smoke-ok" > /var/www/index.html && httpd -f -p 80 -h /var/www 2>/dev/null || { while true; do echo -e "HTTP/1.1 200 OK\r\nContent-Length: 9\r\n\r\nsmoke-ok\n" | nc -l -p 80 2>/dev/null || break; done; }' >/dev/null 2>&1; then
        record_fail "$label" "failed to start container with port mapping"
        return
    fi

    # Poll instead of sleeping a fixed 2 s: a slow host may need longer for
    # the responder to come up, a fast one is done after the first attempt.
    local response="" attempt
    for attempt in 1 2 3 4 5; do
        sleep 1
        response=$(curl -sf --max-time 5 "http://localhost:${SMOKE_PORT}/" 2>/dev/null) || true
        if [[ "$response" == *"smoke-ok"* ]]; then
            break
        fi
    done
    remove_container "$SMOKE_PORT_CONTAINER"

    if [[ "$response" == *"smoke-ok"* ]]; then
        record_pass "$label" "curl localhost:${SMOKE_PORT}"
    else
        record_fail "$label" "no response on localhost:${SMOKE_PORT}"
    fi
}

# ── 6.
# Container DNS ─────────────────────────────────────────────────
# Resolves ${DNS_TEST_DOMAIN} from inside a throw-away container, trying
# nslookup, then getent, then ping.
#   Globals: SKIP_LIFECYCLE, SMOKE_DNS_CONTAINER, TEST_IMAGE, DNS_TEST_DOMAIN (read)
#   Uses:    remove_container, record_pass, record_fail, record_skip
test_container_dns() {
    if [[ "$SKIP_LIFECYCLE" == "true" ]]; then
        record_skip "Container DNS" "SKIP_LIFECYCLE=true"
        return
    fi
    remove_container "$SMOKE_DNS_CONTAINER"

    local dns_output
    # The domain is handed to the inner shell as "$1" instead of being spliced
    # into the command string. The docker CLI's own stderr is discarded so a
    # failed `docker run` (pull error, daemon error) is not mistaken for
    # resolver output.
    dns_output=$(docker run --rm --name "$SMOKE_DNS_CONTAINER" "$TEST_IMAGE" \
        sh -c 'nslookup "$1" 2>/dev/null || getent hosts "$1" 2>/dev/null || ping -c1 -W3 "$1" 2>/dev/null' \
        sh "$DNS_TEST_DOMAIN" 2>/dev/null) || true

    if [[ -n "$dns_output" ]] &&
        ! grep -qi "can't resolve\|not found\|failure\|NXDOMAIN" <<<"$dns_output"; then
        record_pass "Container DNS" "${DNS_TEST_DOMAIN}"
    else
        record_fail "Container DNS" "failed to resolve ${DNS_TEST_DOMAIN}"
    fi
}

# ── 7. Volume mount ──────────────────────────────────────────────────
# Creates a named volume, writes and reads a file through a container, then
# removes the volume again.
#   Globals: SKIP_VOLUME, SMOKE_VOLUME, SMOKE_VOL_CONTAINER, TEST_IMAGE (read)
#   Uses:    remove_container, record_pass, record_fail, record_skip
test_volume_mount() {
    if [[ "$SKIP_VOLUME" == "true" ]]; then
        record_skip "Volume mount" "SKIP_VOLUME=true"
        return
    fi
    docker volume rm -f "$SMOKE_VOLUME" >/dev/null 2>&1 || true
    remove_container "$SMOKE_VOL_CONTAINER"

    if ! docker volume create "$SMOKE_VOLUME" >/dev/null 2>&1; then
        record_fail "Volume mount" "docker volume create failed"
        return
    fi

    local write_result
    # Only stdout is compared; warnings the docker CLI prints on stderr must
    # not turn an exact-match comparison into a false failure.
    write_result=$(docker run --rm --name "$SMOKE_VOL_CONTAINER" \
        -v "${SMOKE_VOLUME}:/data" "$TEST_IMAGE" \
        sh -c 'echo "smoke-vol-ok" > /data/test.txt && cat /data/test.txt' 2>/dev/null) || true

    docker volume rm -f "$SMOKE_VOLUME" >/dev/null 2>&1 || true

    if [[ "$write_result" == "smoke-vol-ok" ]]; then
        record_pass "Volume mount" "write/read verified"
    else
        record_fail "Volume mount" "write/read mismatch"
    fi
}

# ── 8. Network create/connect ────────────────────────────────────────
# Creates a bridge network and runs a container attached to it.
#   Globals: SKIP_NETWORK, SMOKE_NETWORK, SMOKE_NET_CONTAINER, TEST_IMAGE (read)
#   Uses:    remove_container, record_pass, record_fail, record_skip
test_network_create() {
    if [[ "$SKIP_NETWORK" == "true" ]]; then
        record_skip "Network create/connect" "SKIP_NETWORK=true"
        return
    fi
    docker network rm "$SMOKE_NETWORK" >/dev/null 2>&1 || true
    remove_container "$SMOKE_NET_CONTAINER"

    if ! docker network create --driver bridge "$SMOKE_NETWORK" >/dev/null 2>&1; then
        record_fail "Network create/connect" "docker network create failed"
        return
    fi

    local net_output="" attached=false
    # Success needs both a zero exit status and interface output on stdout.
    # An error message from a failed `docker run` is not proof of attachment.
    if net_output=$(docker run --rm --name "$SMOKE_NET_CONTAINER" \
        --network "$SMOKE_NETWORK" "$TEST_IMAGE" \
        sh -c 'ip addr show 2>/dev/null || ifconfig 2>/dev/null' 2>/dev/null) &&
        [[ -n "$net_output" ]]; then
        attached=true
    fi

    docker network rm "$SMOKE_NETWORK" >/dev/null 2>&1 || true

    if [[ "$attached" == true ]]; then
        record_pass "Network create/connect" "bridge network"
    else
        record_fail "Network create/connect" "container failed to attach to network"
    fi
}

# ── 9. Image pull ────────────────────────────────────────────────────
# Pulls ${TEST_IMAGE} from its registry.
#   Globals: TEST_IMAGE (read)
#   Uses:    record_pass, record_fail
test_image_pull() {
    if docker pull "$TEST_IMAGE" >/dev/null 2>&1; then
        record_pass "Image pull" "$TEST_IMAGE"
    else
        record_fail "Image pull" "failed to pull $TEST_IMAGE"
    fi
}

# ── 10. Image build ──────────────────────────────────────────────────
# Builds a one-line Dockerfile fed on stdin and removes the resulting image.
#   Globals: SKIP_BUILD, SMOKE_BUILD_TAG (read)
#   Uses:    record_pass, record_fail, record_skip
test_image_build() {
    if [[ "$SKIP_BUILD" == "true" ]]; then
        record_skip "Image build" "SKIP_BUILD=true"
        return
    fi
    docker rmi -f "$SMOKE_BUILD_TAG" >/dev/null 2>&1 || true

    if echo "FROM alpine:latest" | docker build -t "$SMOKE_BUILD_TAG" - >/dev/null 2>&1; then
        docker rmi -f "$SMOKE_BUILD_TAG" >/dev/null 2>&1 || true
        record_pass "Image build" "inline Dockerfile"
    else
        record_fail "Image build" "docker build failed"
    fi
}

# ── 11. Docker Compose ───────────────────────────────────────────────
# Prints how many times extended regex $1 occurs (case-insensitively) in the
# text on stdin. Counts occurrences rather than lines, so it works for both
# one-object-per-line JSON and a single-line JSON array.
_compose_count_matches() {
    { grep -oiE -- "$1" || true; } | wc -l | tr -d '[:space:]'
}

# Compares the number of containers in the compose project with the number
# that are running. Stopped containers are requested explicitly (`--all`) so
# that a crashed service shows up as a shortfall.
#   Globals: COMPOSE_FILE (read)
#   Uses:    has_cmd, record_pass, record_fail, record_skip
test_compose_stack() {
    local label="Compose stack"
    if [[ -z "$COMPOSE_FILE" ]]; then
        record_skip "$label" "COMPOSE_FILE not set"
        return
    fi
    if [[ ! -f "$COMPOSE_FILE" ]]; then
        record_fail "$label" "${COMPOSE_FILE} not found"
        return
    fi

    # Kept as an array so the two-word "docker compose" form is executed
    # without relying on word-splitting of a string.
    local -a compose_cmd
    if docker compose version >/dev/null 2>&1; then
        compose_cmd=(docker compose)
    elif has_cmd docker-compose; then
        compose_cmd=(docker-compose)
    else
        record_skip "$label" "neither 'docker compose' nor 'docker-compose' available"
        return
    fi

    local ps_output expected_count running_count
    ps_output=$("${compose_cmd[@]}" -f "$COMPOSE_FILE" ps --all --format json 2>/dev/null) || true

    if [[ -n "$ps_output" ]]; then
        # JSON: one "Service" key per container, whatever the line layout.
        expected_count=$(_compose_count_matches '"Service"[[:space:]]*:' <<<"$ps_output")
        if ((expected_count == 0)); then
            # No "Service" keys: fall back to counting JSON objects per line.
            expected_count=$(_compose_count_matches '^[[:space:]]*\{' <<<"$ps_output")
        fi
        running_count=$(_compose_count_matches '"State"[[:space:]]*:[[:space:]]*"running"' <<<"$ps_output")
    else
        # Table output (no JSON support). Retry without --all for very old
        # clients that reject the flag.
        ps_output=$("${compose_cmd[@]}" -f "$COMPOSE_FILE" ps --all 2>/dev/null) || true
        if [[ -z "$ps_output" ]]; then
            ps_output=$("${compose_cmd[@]}" -f "$COMPOSE_FILE" ps 2>/dev/null) || true
        fi
        if [[ -z "$ps_output" ]]; then
            record_fail "$label" "could not read compose project status"
            return
        fi
        # Drop the header row, blank lines and the dashed separator that
        # docker-compose v1 prints under the header.
        local rows
        rows=$(awk 'NR > 1 && NF && $0 !~ /^[- \t]+$/' <<<"$ps_output") || true
        expected_count=$(grep -c . <<<"$rows") || expected_count=0
        running_count=$(grep -ciE 'up|running' <<<"$rows") || running_count=0
    fi

    if [[ "$expected_count" -eq 0 ]]; then
        record_fail "$label" "no services found"
    elif [[ "$running_count" -ge "$expected_count" ]]; then
        record_pass "$label" "${running_count}/${expected_count} services running"
    else
        record_fail "$label" "${running_count}/${expected_count} services running"
    fi
}

# ── 12.
# Resource limits ──────────────────────────────────────────────
# Starts a container with --memory=64m and reads the memory limit back from
# the cgroup filesystem inside it (cgroup v2 path first, then v1).
#   Globals: SKIP_LIFECYCLE, SMOKE_MEM_CONTAINER, TEST_IMAGE (read)
#   Uses:    remove_container, record_pass, record_fail, record_skip
test_resource_limits() {
    local label="Resource limits"
    if [[ "$SKIP_LIFECYCLE" == "true" ]]; then
        record_skip "$label" "SKIP_LIFECYCLE=true"
        return
    fi
    remove_container "$SMOKE_MEM_CONTAINER"

    local raw mem_limit
    # The docker CLI's stderr is dropped: warnings printed there (for example
    # about missing swap-limit support) would otherwise be glued onto the
    # number and make it look like an unexpected cgroup value.
    raw=$(docker run --rm --name "$SMOKE_MEM_CONTAINER" \
        --memory=64m "$TEST_IMAGE" \
        sh -c 'cat /sys/fs/cgroup/memory.max 2>/dev/null || cat /sys/fs/cgroup/memory/memory.limit_in_bytes 2>/dev/null' 2>/dev/null) || true
    # Keep the last non-empty token, trimmed of surrounding whitespace.
    mem_limit=$(awk 'NF { value = $1 } END { print value }' <<<"$raw") || true

    if [[ -z "$mem_limit" ]]; then
        record_skip "$label" "cgroup memory info not available"
        return
    fi

    local limit_bytes=67108864 # 64 MiB
    if [[ "$mem_limit" == "max" ]]; then
        # cgroup v2 reports "max" when no limit is applied, i.e. the
        # requested 64 MiB cap did not take effect.
        record_fail "$label" "memory limit not enforced (got ${mem_limit})"
    elif [[ "$mem_limit" =~ ^[0-9]+$ ]]; then
        # Allow 1 MiB of slack above the requested limit.
        if ((10#$mem_limit <= limit_bytes + 1048576)); then
            local limit_mb=$((10#$mem_limit / 1048576))
            record_pass "$label" "memory cgroup enforced (${limit_mb}M)"
        else
            record_fail "$label" "memory limit not enforced (got ${mem_limit})"
        fi
    else
        record_skip "$label" "unexpected cgroup value: ${mem_limit}"
    fi
}

# ── 13.
Disk space ─────────────────────────────────────────────────── +test_disk_space() { + local df_output + df_output=$(docker system df 2>/dev/null) || true + + if [[ -z "$df_output" ]]; then + record_fail "Disk space" "docker system df failed" + return + fi + + local docker_root used_pct + docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null) || docker_root="/var/lib/docker" + used_pct=$(df "$docker_root" 2>/dev/null | tail -1 | awk '{print $5}' | tr -d '%') || used_pct=0 + + if [[ "$used_pct" -gt 80 ]]; then + record_fail "Disk space" "${used_pct}% used (threshold 80%)" + else + record_pass "Disk space" "${used_pct}% used" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +print_summary() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} Docker Smoke Tests" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + if [[ $FAIL -eq 0 ]]; then echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"; fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat </dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "ERROR: Missing required commands: ${missing[*]}" >&2 + echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 + exit 1 + fi +} + 
+validate_config() { + # Export DOCKER_HOST if set so docker CLI picks it up + if [[ -n "$DOCKER_HOST" ]]; then + export DOCKER_HOST + fi +} + +docker_cmd() { + # Run a docker command and return its output + # Returns empty string on failure + docker "$@" 2>/dev/null || echo "" +} + +add_metric() { + local name="$1" + local type="$2" + local help="$3" + local value="$4" + local labels="${5:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name}{${labels}} ${value} +" + else + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name} ${value} +" + fi +} + +add_metric_value() { + local name="$1" + local value="$2" + local labels="${3:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="${name}{${labels}} ${value} +" + else + OUTPUT+="${name} ${value} +" + fi +} + +collect_nodes() { + local nodes_json + nodes_json=$(docker_cmd node ls --format '{{json .}}') + + if [[ -z "$nodes_json" ]]; then + add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "0" + return 1 + fi + + add_metric "swarm_up" "gauge" "Docker Swarm reachability (1=up, 0=down)" "1" + + # Total node count + local node_count + node_count=$(echo "$nodes_json" | wc -l) + add_metric "swarm_node_count" "gauge" "Total number of nodes in the swarm" "${node_count}" + + # Nodes by status + local nodes_ready nodes_down + nodes_ready=$(echo "$nodes_json" | jq -r 'select(.Status == "Ready")' | jq -s 'length') + nodes_down=$(echo "$nodes_json" | jq -r 'select(.Status == "Down")' | jq -s 'length') + add_metric "swarm_nodes_ready" "gauge" "Number of nodes in ready state" "${nodes_ready}" + add_metric "swarm_nodes_down" "gauge" "Number of nodes in down state" "${nodes_down}" + + # Manager and worker counts + local managers_total workers_total + managers_total=$(echo "$nodes_json" | jq -r 'select(.ManagerStatus != "")' | jq -s 'length') + workers_total=$(echo "$nodes_json" | jq -r 'select(.ManagerStatus == "")' | jq -s 'length') + add_metric 
"swarm_managers_total" "gauge" "Total number of manager nodes" "${managers_total}" + add_metric "swarm_workers_total" "gauge" "Total number of worker nodes" "${workers_total}" + + # Leader detection — check if the current node is the leader + local is_leader + is_leader=$(echo "$nodes_json" | jq -r 'select(.Self == "true" or .Self == true) | select(.ManagerStatus == "Leader")' | jq -s 'length') + if [[ "$is_leader" -gt 0 ]]; then + add_metric "swarm_manager_leader" "gauge" "Whether this node is the leader (1=leader, 0=not leader)" "1" + else + add_metric "swarm_manager_leader" "gauge" "Whether this node is the leader (1=leader, 0=not leader)" "0" + fi + + return 0 +} + +collect_services() { + local services_json + services_json=$(docker_cmd service ls --format '{{json .}}') + + if [[ -z "$services_json" ]]; then + add_metric "swarm_services_total" "gauge" "Total number of services" "0" + return + fi + + # Total service count + local service_count + service_count=$(echo "$services_json" | wc -l) + add_metric "swarm_services_total" "gauge" "Total number of services" "${service_count}" + + # Per-service replica metrics + # docker service ls --format '{{json .}}' gives us Name and Replicas ("3/3" format) + local first_replicas=true + local first_running=true + + while IFS= read -r line; do + local service_name replicas_str desired running + + service_name=$(echo "$line" | jq -r '.Name') + replicas_str=$(echo "$line" | jq -r '.Replicas') + + # Replicas format is "RUNNING/DESIRED" (e.g. "3/3") or "RUNNING/DESIRED (max N per node)" + # Strip any parenthetical suffix + replicas_str="${replicas_str%% (*}" + + running=$(echo "$replicas_str" | cut -d'/' -f1) + desired=$(echo "$replicas_str" | cut -d'/' -f2) + + # Validate numeric + if ! [[ "$desired" =~ ^[0-9]+$ ]]; then + desired=0 + fi + if ! 
[[ "$running" =~ ^[0-9]+$ ]]; then + running=0 + fi + + if [[ "$first_replicas" == true ]]; then + OUTPUT+="# HELP swarm_service_replicas Desired replica count per service +# TYPE swarm_service_replicas gauge +" + first_replicas=false + fi + OUTPUT+="swarm_service_replicas{service=\"${service_name}\"} ${desired} +" + + if [[ "$first_running" == true ]]; then + first_running=false + fi + done <<< "$services_json" + + # Running replicas — separate HELP/TYPE block + OUTPUT+="# HELP swarm_service_replicas_running Running replica count per service +# TYPE swarm_service_replicas_running gauge +" + while IFS= read -r line; do + local service_name replicas_str running + + service_name=$(echo "$line" | jq -r '.Name') + replicas_str=$(echo "$line" | jq -r '.Replicas') + replicas_str="${replicas_str%% (*}" + running=$(echo "$replicas_str" | cut -d'/' -f1) + + if ! [[ "$running" =~ ^[0-9]+$ ]]; then + running=0 + fi + + OUTPUT+="swarm_service_replicas_running{service=\"${service_name}\"} ${running} +" + done <<< "$services_json" +} + +collect_tasks() { + # Count running tasks + local tasks_running + tasks_running=$(docker_cmd node ps --format '{{json .}}' --filter 'desired-state=running' 2>/dev/null | jq -s 'length' 2>/dev/null) + if [[ -z "$tasks_running" || "$tasks_running" == "null" ]]; then + tasks_running=0 + fi + add_metric "swarm_tasks_running" "gauge" "Total number of running tasks" "${tasks_running}" + + # Count failed tasks across all services + local tasks_failed + tasks_failed=$(docker_cmd service ls -q 2>/dev/null | while read -r svc_id; do + docker service ps "$svc_id" --format '{{json .}}' --filter 'desired-state=shutdown' 2>/dev/null + done | jq -r 'select(.CurrentState | test("^Failed|^Rejected"; "i"))' 2>/dev/null | jq -s 'length' 2>/dev/null) + if [[ -z "$tasks_failed" || "$tasks_failed" == "null" ]]; then + tasks_failed=0 + fi + add_metric "swarm_tasks_failed" "gauge" "Total number of failed tasks" "${tasks_failed}" +} + +collect_networks() { + local 
networks_json + networks_json=$(docker_cmd network ls --filter driver=overlay --format '{{json .}}') + + local network_count=0 + if [[ -n "$networks_json" ]]; then + network_count=$(echo "$networks_json" | wc -l) + fi + + add_metric "swarm_networks_total" "gauge" "Total number of overlay networks" "${network_count}" +} + +collect_raft() { + # Get Raft index from docker info + local info_json + info_json=$(docker_cmd info --format '{{json .}}') + + if [[ -z "$info_json" ]]; then + return + fi + + local raft_index + raft_index=$(echo "$info_json" | jq -r '.Swarm.Cluster.RaftIndex // .Swarm.RaftIndex // empty' 2>/dev/null) + + # Fallback — try extracting from Swarm.Cluster directly + if [[ -z "$raft_index" ]]; then + raft_index=$(echo "$info_json" | jq -r '.Swarm.Cluster.Version.Index // empty' 2>/dev/null) + fi + + if [[ -n "$raft_index" && "$raft_index" != "null" ]]; then + add_metric "swarm_raft_index" "gauge" "Raft applied index" "${raft_index}" + else + add_metric "swarm_raft_index" "gauge" "Raft applied index" "0" + fi +} + +write_output() { + if [[ "$TEXTFILE_MODE" == true ]]; then + local output_file="${TEXTFILE_DIR}/docker_swarm.prom" + local temp_file="${output_file}.$$" + + mkdir -p "$TEXTFILE_DIR" + echo "$OUTPUT" > "$temp_file" + mv "$temp_file" "$output_file" + else + echo "$OUTPUT" + fi +} + +install_cron() { + if [[ $EUID -ne 0 ]]; then + echo "ERROR: --install requires root" >&2 + exit 1 + fi + + local script_path + script_path=$(readlink -f "$0") + + local env_lines="" + if [[ -n "$DOCKER_HOST" ]]; then + env_lines="DOCKER_HOST=${DOCKER_HOST} +" + fi + + cat > /etc/cron.d/docker-swarm-exporter </dev/null +EOF + + chmod 644 /etc/cron.d/docker-swarm-exporter + echo "Installed cron job: /etc/cron.d/docker-swarm-exporter" + echo "Metrics will be written to: ${TEXTFILE_DIR}/docker_swarm.prom" +} + +# --- Main --- + +main() { + # Parse arguments + for arg in "$@"; do + case "$arg" in + --textfile) TEXTFILE_MODE=true ;; + --install) + check_dependencies + 
validate_config + install_cron + exit 0 + ;; + --help|-h) usage ;; + *) echo "Unknown option: $arg" >&2; usage ;; + esac + done + + check_dependencies + validate_config + + START_TIME=$(date +%s%N) + + # Exporter info + add_metric "swarm_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" + + # Collect metrics + if collect_nodes; then + collect_services + collect_tasks + collect_networks + collect_raft + fi + + # Exporter performance + local end_time duration + end_time=$(date +%s%N) + duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") + add_metric "swarm_exporter_duration_seconds" "gauge" "Script execution time" "$duration" + add_metric "swarm_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" + + write_output +} + +main "$@" diff --git a/docker-volume-backup.sh b/docker-volume-backup.sh new file mode 100644 index 0000000..9f26908 --- /dev/null +++ b/docker-volume-backup.sh @@ -0,0 +1,269 @@ +#!/bin/bash +############################################################# +#### Docker Volume Backup Script #### +#### Backup and restore Docker named volumes using #### +#### tar archives with optional compression #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./docker-volume-backup.sh [OPTIONS] #### +############################################################# + +set -euo pipefail + +SCRIPT_NAME=$(basename "$0") +readonly SCRIPT_NAME +readonly DEFAULT_BACKUP_DIR="/opt/docker-backups" +readonly DEFAULT_RETAIN=7 +readonly ALPINE_IMAGE="alpine:latest" + +BACKUP_DIR="$DEFAULT_BACKUP_DIR" +RETAIN="$DEFAULT_RETAIN" +MODE="" +TARGET_VOLUME="" +RESTORE_ARCHIVE="" + +# Colors +readonly RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*"; } 
# Timestamped log helpers. log_error writes to stderr, the others to stdout.
log_warn() { echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*"; }
log_error() { echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2; }
log_step() { echo -e "${BLUE}[STEP]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*"; }

# Prints the usage text and exits.
# Arguments: $1 - exit status (optional, default 0). Usage errors pass 1 so
#                 that callers and cron can tell them from a --help request.
show_help() {
    local status=${1:-0}
    cat << EOF
Usage: $SCRIPT_NAME [OPTIONS]

Backup and restore Docker named volumes using tar archives with compression.

OPTIONS:
    --backup [VOLUME]     Backup all named volumes, or a specific volume if given
    --restore ARCHIVE     Restore a volume from the specified tar.gz archive
    --list                List available backups
    --backup-dir PATH     Backup directory (default: $DEFAULT_BACKUP_DIR)
    --retain N            Number of backups to keep per volume (default: $DEFAULT_RETAIN)
    --help, -h            Show this help message

EXAMPLES:
    $SCRIPT_NAME --backup
    $SCRIPT_NAME --backup my_volume
    $SCRIPT_NAME --restore $DEFAULT_BACKUP_DIR/my_volume_20260309_143022.tar.gz
    $SCRIPT_NAME --list
    $SCRIPT_NAME --backup --backup-dir /mnt/backups --retain 14
EOF
    exit "$status"
}

# Parses the command line into MODE, TARGET_VOLUME, RESTORE_ARCHIVE,
# BACKUP_DIR and RETAIN. Exits non-zero on any usage error.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --backup)
                MODE="backup"
                shift
                # An optional volume name may follow. Anything starting with
                # "-" is the next option (so "--backup -h" still shows help).
                if [[ $# -gt 0 && "$1" != -* ]]; then
                    TARGET_VOLUME="$1"
                    shift
                fi
                ;;
            --restore)
                MODE="restore"
                if [[ $# -lt 2 ]]; then
                    log_error "--restore requires an archive path"
                    exit 1
                fi
                RESTORE_ARCHIVE="$2"
                shift 2
                ;;
            --list)
                MODE="list"
                shift
                ;;
            --backup-dir)
                if [[ $# -lt 2 ]]; then
                    log_error "--backup-dir requires a path"
                    exit 1
                fi
                BACKUP_DIR="$2"
                shift 2
                ;;
            --retain)
                if [[ $# -lt 2 ]]; then
                    log_error "--retain requires a number"
                    exit 1
                fi
                # Retention drives file deletion, so only a positive integer
                # is accepted; 0 or garbage would otherwise wipe or skip
                # backups unpredictably.
                if ! [[ "$2" =~ ^[0-9]+$ ]] || ((10#$2 < 1)); then
                    log_error "--retain requires a positive integer (got '$2')"
                    exit 1
                fi
                RETAIN=$((10#$2))
                shift 2
                ;;
            --help | -h)
                show_help
                ;;
            *)
                log_error "Unknown option: $1"
                show_help 1
                ;;
        esac
    done
    if [[ -z "$MODE" ]]; then
        log_error "No action specified. Use --backup, --restore, or --list."
        show_help 1
    fi
}

# Exits with an error unless the docker CLI exists and the daemon answers.
check_dependencies() {
    if ! command -v docker &>/dev/null; then
        log_error "docker is required but not installed"
        exit 1
    fi
    if ! docker info &>/dev/null; then
        log_error "Cannot connect to Docker daemon. Is it running?"
        exit 1
    fi
}

# Prints the volume name encoded in an archive file name
# (<volume>_YYYYMMDD_HHMMSS.tar.gz). A name that does not follow the pattern
# is printed unchanged, which callers treat as "not a backup archive".
# Arguments: $1 - archive path or file name
_archive_volume_name() {
    local file=${1##*/}
    local pattern='^(.*)_[0-9]{8}_[0-9]{6}\.tar\.gz$'
    if [[ "$file" =~ $pattern ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    else
        printf '%s\n' "$file"
    fi
}

# Archives one volume into ${BACKUP_DIR}/<volume>_<timestamp>.tar.gz using a
# helper container. The archive is written under a temporary name and renamed
# on success, so a partial file never appears under its final name.
# Arguments: $1 - volume name
# Returns:   0 on success, 1 on any failure
backup_volume() {
    local volume_name="$1"
    local timestamp archive_name final_path tmp_file size
    timestamp=$(date +%Y%m%d_%H%M%S)
    archive_name="${volume_name}_${timestamp}.tar.gz"
    final_path="${BACKUP_DIR}/${archive_name}"

    log_step "Backing up volume: ${volume_name}"

    if ! docker volume inspect "$volume_name" &>/dev/null; then
        log_error "Volume '$volume_name' does not exist"
        return 1
    fi

    # This function is called from an `if`, where `set -e` is suppressed, so
    # each preparatory step is checked explicitly.
    if ! mkdir -p "$BACKUP_DIR"; then
        log_error "Cannot create backup directory: ${BACKUP_DIR}"
        return 1
    fi
    if ! tmp_file=$(mktemp "${BACKUP_DIR}/.backup_XXXXXX.tar.gz"); then
        log_error "Cannot create temporary file in ${BACKUP_DIR}"
        return 1
    fi

    if docker run --rm \
        -v "${volume_name}:/source:ro" \
        -v "${BACKUP_DIR}:/backup" \
        "$ALPINE_IMAGE" \
        tar czf "/backup/$(basename "$tmp_file")" -C /source . 2>/dev/null; then
        mv "$tmp_file" "$final_path"
        size=$(du -h "$final_path" | cut -f1)
        log_info "Created backup: ${final_path} (${size})"
    else
        rm -f "$tmp_file"
        log_error "Failed to backup volume: ${volume_name}"
        return 1
    fi
}

# Backs up TARGET_VOLUME, or every volume docker lists when it is empty, then
# applies the retention policy.
# Returns: 1 when at least one volume failed, 0 otherwise
do_backup() {
    log_step "Starting Docker volume backup"
    log_info "Backup directory: ${BACKUP_DIR}"

    local volumes=() vol
    if [[ -n "$TARGET_VOLUME" ]]; then
        volumes=("$TARGET_VOLUME")
    else
        while IFS= read -r vol; do
            if [[ -n "$vol" ]]; then
                volumes+=("$vol")
            fi
        done < <(docker volume ls --format '{{.Name}}' | sort)
    fi

    if [[ ${#volumes[@]} -eq 0 ]]; then
        log_warn "No Docker named volumes found"
        return 0
    fi
    log_info "Found ${#volumes[@]} volume(s) to backup"

    local success=0 failed=0
    for vol in "${volumes[@]}"; do
        if backup_volume "$vol"; then
            success=$((success + 1))
        else
            failed=$((failed + 1))
        fi
    done

    apply_retention
    log_step "Backup complete: ${success} succeeded, ${failed} failed"
    if ((failed > 0)); then
        return 1
    fi
    return 0
}

# Restores RESTORE_ARCHIVE into the volume named in the archive file name,
# creating the volume if needed. Existing volume contents are removed first.
do_restore() {
    if [[ ! -f "$RESTORE_ARCHIVE" ]]; then
        log_error "Archive not found: ${RESTORE_ARCHIVE}"
        exit 1
    fi

    local basename_archive volume_name
    basename_archive=$(basename "$RESTORE_ARCHIVE")
    volume_name=$(_archive_volume_name "$basename_archive")

    if [[ -z "$volume_name" || "$volume_name" == "$basename_archive" ]]; then
        log_error "Cannot determine volume name from archive: ${basename_archive}"
        log_error "Expected format: volumename_YYYYMMDD_HHMMSS.tar.gz"
        exit 1
    fi

    log_step "Restoring volume: ${volume_name} from ${RESTORE_ARCHIVE}"

    if ! docker volume inspect "$volume_name" &>/dev/null; then
        log_info "Creating volume: ${volume_name}"
        docker volume create "$volume_name" >/dev/null
    else
        log_warn "Volume '${volume_name}' already exists — contents will be overwritten"
    fi

    local archive_abs archive_dir archive_file
    archive_abs=$(realpath "$RESTORE_ARCHIVE")
    archive_dir=$(dirname "$archive_abs")
    archive_file=$(basename "$archive_abs")

    # The archive name reaches the inner shell as "$1" rather than being
    # pasted into the command string, so spaces or shell metacharacters in a
    # file name cannot alter the command run inside the container.
    if docker run --rm \
        -v "${volume_name}:/target" \
        -v "${archive_dir}:/backup:ro" \
        "$ALPINE_IMAGE" \
        sh -c 'rm -rf /target/* /target/..?* /target/.[!.]* 2>/dev/null; tar xzf "/backup/$1" -C /target' \
        sh "$archive_file" 2>/dev/null; then
        log_info "Volume '${volume_name}' restored successfully"
    else
        log_error "Failed to restore volume: ${volume_name}"
        exit 1
    fi
}

# Prints a table of the *.tar.gz archives in BACKUP_DIR with size and the
# volume name derived from each file name.
do_list() {
    if [[ ! -d "$BACKUP_DIR" ]]; then
        log_info "No backups found (directory does not exist: ${BACKUP_DIR})"
        return 0
    fi

    local count=0 archive name size vol_name
    log_step "Available backups in ${BACKUP_DIR}:"
    printf "\n  %-40s %-10s %s\n" "ARCHIVE" "SIZE" "VOLUME"
    printf "  %-40s %-10s %s\n" "-------" "----" "------"

    for archive in "$BACKUP_DIR"/*.tar.gz; do
        [[ -f "$archive" ]] || continue # unmatched glob stays literal
        count=$((count + 1))
        name=$(basename "$archive")
        size=$(du -h "$archive" | cut -f1)
        vol_name=$(_archive_volume_name "$name")
        printf "  %-40s %-10s %s\n" "$name" "$size" "$vol_name"
    done

    echo ""
    if [[ $count -eq 0 ]]; then
        log_info "No backup archives found in ${BACKUP_DIR}"
    else
        log_info "${count} backup(s) found"
    fi
}

# Deletes all but the newest RETAIN archives of every volume in BACKUP_DIR.
# Archives are grouped by the exact <volume>_YYYYMMDD_HHMMSS.tar.gz pattern,
# so a volume named "app" never claims the archives of "app_2_data", and they
# are ordered by the timestamp in the file name. Files that do not follow the
# pattern are left alone.
apply_retention() {
    log_step "Applying retention policy (keep last ${RETAIN} per volume)"

    # Guard the deletion below against a RETAIN that was set by other means
    # than --retain and is not a positive integer.
    if ! [[ "$RETAIN" =~ ^[0-9]+$ ]] || ((10#$RETAIN < 1)); then
        log_warn "Invalid retention count '${RETAIN}' — no backups removed"
        return 0
    fi

    local pattern='^(.*)_([0-9]{8}_[0-9]{6})\.tar\.gz$'
    local listing="" archive name
    for archive in "$BACKUP_DIR"/*.tar.gz; do
        [[ -f "$archive" ]] || continue
        name=${archive##*/}
        [[ "$name" =~ $pattern ]] || continue
        # One record per archive: volume <TAB> timestamp <TAB> path
        listing+="${BASH_REMATCH[1]}"$'\t'"${BASH_REMATCH[2]}"$'\t'"${archive}"$'\n'
    done
    if [[ -z "$listing" ]]; then
        return 0
    fi

    local old_archive
    while IFS= read -r old_archive; do
        [[ -n "$old_archive" ]] || continue
        log_info "Removing old backup: $(basename "$old_archive")"
        rm -f -- "$old_archive"
    done < <(
        # Newest first within each volume; print every path past the first
        # RETAIN of its volume.
        printf '%s' "$listing" |
            LC_ALL=C sort -t $'\t' -k1,1 -k2,2r |
            awk -F '\t' -v keep="$((10#$RETAIN))" '
                ++seen[$1] > keep { sub(/^[^\t]*\t[^\t]*\t/, ""); print }'
    )
}

# Entry point: parse arguments, verify docker, dispatch on MODE.
main() {
    parse_args "$@"
    check_dependencies
    case "$MODE" in
        backup) do_backup ;;
        restore) do_restore ;;
        list) do_list ;;
    esac
}

main "$@"
diff --git a/dokku-exporter.sh b/dokku-exporter.sh
new file mode 100755
index 0000000..617d6fc
--- /dev/null
+++ b/dokku-exporter.sh
@@ -0,0 +1,410 @@
+#!/bin/bash
+################################################################################ +# Script Name: dokku-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Dokku PaaS providing operational +# metrics via the Dokku CLI — application status, plugin counts, +# domain configuration, SSL status, and host health +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Dokku installed on the local host +# - Root or dokku user access to run dokku commands +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# ./dokku-exporter.sh +# +# # HTTP server mode +# ./dokku-exporter.sh --http -p 9198 +# +# # Textfile collector mode +# ./dokku-exporter.sh --textfile +# +# Metrics Exported: +# - dokku_up - Dokku reachability (1=up, 0=down) +# - dokku_info{version} - Dokku version info +# - dokku_apps_total - Total app count +# - dokku_apps_running - Running apps +# - dokku_apps_stopped - Stopped apps +# - dokku_plugins_total - Installed plugin count +# - dokku_domains_app_total - Total app domains configured +# - dokku_ssl_enabled_total - Apps with SSL enabled +# - dokku_exporter_duration_seconds - Script execution time +# - dokku_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9198 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9198 + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done 
+} + +# Check prerequisites +# Returns: 0 if OK, 1 if error +check_prerequisites() { + if ! command -v dokku >/dev/null 2>&1; then + echo "ERROR: dokku not found" >&2 + return 1 + fi + + return 0 +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check prerequisites + if ! check_prerequisites; then + cat </dev/null) + + if [ -z "$version_output" ]; then + cat < "0.38.0") + local dokku_version + dokku_version=$(echo "$version_output" | awk '{print $NF}') + + if [ -z "$dokku_version" ]; then + cat </dev/null || true) + + local total_apps=0 + local running_apps=0 + local stopped_apps=0 + + if [ -n "$apps_list" ]; then + total_apps=$(echo "$apps_list" | wc -l) + total_apps=${total_apps:-0} + + # Count running apps by checking each app's process status + while IFS= read -r app; do + local ps_running + ps_running=$(dokku ps:report "$app" --ps-running 2>/dev/null || echo "false") + if [ "$ps_running" = "true" ]; then + running_apps=$((running_apps + 1)) + fi + done <<< "$apps_list" + + stopped_apps=$((total_apps - running_apps)) + fi + + cat </dev/null | wc -l) + plugins_count=${plugins_count:-0} + + cat </dev/null || true) + if [ -n "$app_domains" ]; then + # Domains are space-separated; count words + local domain_count + domain_count=$(echo "$app_domains" | wc -w) + total_domains=$((total_domains + domain_count)) + fi + done <<< "$apps_list" + fi + + cat </dev/null || echo "false") + if [ "$ssl_enabled" = "true" ]; 
then + ssl_enabled_count=$((ssl_enabled_count + 1)) + fi + done <<< "$apps_list" + fi + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Dokku Exporter v1.0 + +

Dokku Prometheus Exporter v1.0

+

Metrics

+

Operational metrics from the Dokku CLI.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.dokku_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/dokku-smoke-tests.sh b/dokku-smoke-tests.sh new file mode 100755 index 0000000..1114f91 --- /dev/null +++ b/dokku-smoke-tests.sh @@ -0,0 +1,455 @@ +#!/bin/bash +################################################################################ +# Script Name: dokku-smoke-tests.sh +# Version: 1.0 +# Description: Smoke test suite for Dokku PaaS — validates connectivity, +# app deployment lifecycle, plugin health, SSL 
certificates, +# and resource usage via the dokku CLI +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - bash 4+ +# - dokku binary (run on the Dokku host) +# - Root or dokku user access +# +# Usage: +# sudo ./dokku-smoke-tests.sh +# sudo ./dokku-smoke-tests.sh --skip-app --skip-ssl +# sudo ./dokku-smoke-tests.sh --format tap +# sudo ./dokku-smoke-tests.sh --format junit --junit-file results.xml +# +################################################################################ + +set -euo pipefail + +# --- Defaults --- +SKIP_APP="${SKIP_APP_LIFECYCLE:-false}" +SKIP_SSL="${SKIP_SSL:-false}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}" +DOKKU_DOMAIN="${DOKKU_DOMAIN:-}" +VERBOSE=false +USE_COLOR=true +TEST_APP_NAME="" +PASSED=0 +FAILED=0 +SKIPPED=0 +START_TIME="" +JUNIT_RESULTS=() +TAP_RESULTS=() +TEST_NUM=0 + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' + +usage() { + cat <&2 + fi +} + +pass() { + local suite="$1" msg="$2" + ((TEST_NUM++)) || true + ((PASSED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("ok $TEST_NUM - [$suite] $msg") ;; + junit) JUNIT_RESULTS+=("") ;; + *) echo -e " ${GREEN}✓${NC} $msg" ;; + esac +} + +fail() { + local suite="$1" msg="$2" detail="${3:-}" + ((TEST_NUM++)) || true + ((FAILED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("not ok $TEST_NUM - [$suite] $msg") ;; + junit) JUNIT_RESULTS+=("$detail") ;; + *) echo -e " ${RED}✗${NC} $msg${detail:+ — $detail}" ;; + esac +} + +skip() { + local suite="$1" msg="$2" + ((TEST_NUM++)) || true + ((SKIPPED++)) || true + case "$OUTPUT_FORMAT" in + tap) TAP_RESULTS+=("ok $TEST_NUM - [$suite] $msg # SKIP") ;; + junit) JUNIT_RESULTS+=("") ;; + *) echo -e " ${YELLOW}⊘${NC} $msg — skipped" ;; + esac +} + +suite_header() { + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e 
"\n${BOLD}$1${NC}" + fi +} + +# --- Cleanup --- +cleanup() { + if [[ -n "$TEST_APP_NAME" ]]; then + debug "Cleaning up test app: $TEST_APP_NAME" + dokku apps:destroy "$TEST_APP_NAME" --force >/dev/null 2>&1 || true + TEST_APP_NAME="" + fi +} +trap cleanup EXIT INT TERM + +# --- Header --- +START_TIME=$(date +%s) +HOSTNAME_STR=$(hostname -f 2>/dev/null || hostname) + +if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}Dokku Smoke Tests${NC}" + echo "Host: $HOSTNAME_STR" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" +fi + +# ===================================================== +# Suite 1: Connectivity +# ===================================================== +suite_header "Connectivity" + +# Check dokku binary +DOKKU_BIN=$(command -v dokku 2>/dev/null || true) +if [[ -n "$DOKKU_BIN" ]]; then + pass "Connectivity" "Dokku binary found — $DOKKU_BIN" +else + fail "Connectivity" "Dokku binary not found" "dokku is not in PATH" + echo -e "\n${RED}Cannot continue without dokku binary. Aborting.${NC}" >&2 + exit 1 +fi + +# Check Docker daemon +if docker info >/dev/null 2>&1; then + pass "Connectivity" "Docker daemon running" +else + fail "Connectivity" "Docker daemon not running" "docker info failed" +fi + +# Check dokku version +DOKKU_VERSION=$(dokku version 2>/dev/null | grep -oP 'dokku version \K[0-9]+\.[0-9]+\.[0-9]+' || true) +if [[ -z "$DOKKU_VERSION" ]]; then + # Try alternate format + DOKKU_VERSION=$(dokku version 2>/dev/null | grep -oP '[0-9]+\.[0-9]+\.[0-9]+' || true) +fi + +if [[ -n "$DOKKU_VERSION" ]]; then + pass "Connectivity" "Dokku version — $DOKKU_VERSION" +else + fail "Connectivity" "Dokku version" "Could not parse version string" +fi + +# Auto-detect global domain if not set +if [[ -z "$DOKKU_DOMAIN" ]]; then + DOKKU_DOMAIN=$(dokku domains:report --global 2>/dev/null | grep -i "global vhosts" | awk '{print $NF}' || true) + if [[ -z "$DOKKU_DOMAIN" ]]; then + DOKKU_DOMAIN=$(dokku domains:report --global 2>/dev/null | tail -1 | awk '{print $NF}' || 
true) + fi + debug "Auto-detected domain: $DOKKU_DOMAIN" +fi + +# ===================================================== +# Suite 2: App Lifecycle +# ===================================================== +if [[ "$SKIP_APP" == "true" ]]; then + suite_header "App Lifecycle" + skip "App Lifecycle" "Create test app" + skip "App Lifecycle" "Deploy image" + skip "App Lifecycle" "App responding" + skip "App Lifecycle" "Delete test app" +else + suite_header "App Lifecycle" + + TEST_APP_NAME="dokku-smoke-$(date +%s)" + debug "Test app name: $TEST_APP_NAME" + + # Create app + create_output=$(dokku apps:create "$TEST_APP_NAME" 2>&1) || true + debug "Create output: $create_output" + + if dokku apps:exists "$TEST_APP_NAME" >/dev/null 2>&1; then + pass "App Lifecycle" "Create test app — $TEST_APP_NAME" + else + fail "App Lifecycle" "Create test app" "$create_output" + skip "App Lifecycle" "Deploy image" + skip "App Lifecycle" "App responding" + skip "App Lifecycle" "Delete test app" + TEST_APP_NAME="" + SKIP_APP=true + fi + + if [[ "$SKIP_APP" != "true" ]]; then + # Deploy image via git:from-image + deploy_output=$(dokku git:from-image "$TEST_APP_NAME" nginxdemos/hello 2>&1) || true + debug "Deploy output: $deploy_output" + + # Check if app is running + app_running=false + for i in $(seq 1 12); do + sleep 5 + debug "Waiting for app to start... 
attempt $i/12" + ps_output=$(dokku ps:report "$TEST_APP_NAME" 2>/dev/null || true) + if echo "$ps_output" | grep -qi "running"; then + app_running=true + break + fi + # Also check container status directly + running_count=$(dokku ps:report "$TEST_APP_NAME" 2>/dev/null | grep -i "running" | wc -l || echo "0") + if [[ "$running_count" -gt 0 ]]; then + app_running=true + break + fi + done + + if [[ "$app_running" == "true" ]]; then + pass "App Lifecycle" "Deploy image — nginxdemos/hello deployed" + else + fail "App Lifecycle" "Deploy image" "App not running after 60s" + fi + + # Verify HTTP response + if [[ -n "$DOKKU_DOMAIN" ]]; then + app_url="http://${TEST_APP_NAME}.${DOKKU_DOMAIN}" + debug "App URL: $app_url" + sleep 3 + + app_http=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout 10 --max-time 30 \ + "$app_url" 2>/dev/null || echo "000") + + if [[ "$app_http" == "200" ]]; then + pass "App Lifecycle" "App responding — HTTP 200 at ${TEST_APP_NAME}.${DOKKU_DOMAIN}" + else + fail "App Lifecycle" "App responding" "HTTP $app_http at $app_url" + fi + else + # No domain configured — check container port directly + debug "No global domain — checking container directly" + port=$(dokku proxy:ports "$TEST_APP_NAME" 2>/dev/null | grep -oP ':\K[0-9]+$' | head -1 || true) + if [[ -n "$port" ]]; then + app_http=$(curl -s -o /dev/null -w "%{http_code}" \ + --connect-timeout 10 --max-time 30 \ + "http://localhost:$port" 2>/dev/null || echo "000") + if [[ "$app_http" == "200" ]]; then + pass "App Lifecycle" "App responding — HTTP 200 on port $port" + else + fail "App Lifecycle" "App responding" "HTTP $app_http on port $port" + fi + else + skip "App Lifecycle" "App responding" + fi + fi + + # Delete test app + delete_output=$(dokku apps:destroy "$TEST_APP_NAME" --force 2>&1) || true + debug "Delete output: $delete_output" + + if ! 
dokku apps:exists "$TEST_APP_NAME" >/dev/null 2>&1; then + pass "App Lifecycle" "Delete test app — cleaned up" + TEST_APP_NAME="" + else + fail "App Lifecycle" "Delete test app" "Manual cleanup may be required" + fi + fi +fi + +# ===================================================== +# Suite 3: Plugin Health +# ===================================================== +suite_header "Plugins" + +plugin_list=$(dokku plugin:list 2>/dev/null || true) +debug "Plugin list: $plugin_list" + +if [[ -n "$plugin_list" ]]; then + plugin_count=$(echo "$plugin_list" | grep -c "enabled" || echo "0") + pass "Plugins" "Plugin list — $plugin_count plugins installed" +else + fail "Plugins" "Plugin list" "dokku plugin:list failed" +fi + +# Check core plugins +CORE_PLUGINS=("nginx-vhosts" "apps" "config" "ps") +for plugin in "${CORE_PLUGINS[@]}"; do + if echo "$plugin_list" | grep -q "$plugin"; then + pass "Plugins" "Core plugin present — $plugin" + else + fail "Plugins" "Core plugin present — $plugin" "Not found in plugin list" + fi +done + +# ===================================================== +# Suite 4: SSL +# ===================================================== +if [[ "$SKIP_SSL" == "true" ]]; then + suite_header "SSL" + skip "SSL" "Letsencrypt plugin installed" + skip "SSL" "TLS certificate valid" +else + suite_header "SSL" + + # Check if letsencrypt plugin is installed + le_installed=false + if echo "$plugin_list" | grep -qi "letsencrypt"; then + le_installed=true + pass "SSL" "Letsencrypt plugin installed" + else + skip "SSL" "Letsencrypt plugin installed" + fi + + # Check global domain certificate + if [[ "$le_installed" == "true" && -n "$DOKKU_DOMAIN" ]]; then + # Check certificate via openssl if the domain resolves + cert_host="$DOKKU_DOMAIN" + cert_output=$(echo | openssl s_client -servername "$cert_host" -connect "${cert_host}:443" 2>/dev/null || true) + cert_enddate=$(echo "$cert_output" | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || true) + + if [[ -n 
"$cert_enddate" ]]; then + expiry_epoch=$(date -d "$cert_enddate" +%s 2>/dev/null || echo "0") + now_epoch=$(date +%s) + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + + if [[ "$days_left" -gt 0 ]]; then + pass "SSL" "TLS certificate valid — $days_left days remaining" + else + fail "SSL" "TLS certificate expired" "$days_left days past expiry" + fi + else + skip "SSL" "TLS certificate valid" + fi + else + skip "SSL" "TLS certificate valid" + fi +fi + +# ===================================================== +# Suite 5: Resources +# ===================================================== +suite_header "Resources" + +# Disk usage +disk_line=$(df -h / 2>/dev/null | tail -1 || true) +if [[ -n "$disk_line" ]]; then + disk_pct=$(echo "$disk_line" | awk '{print $5}' | tr -d '%') + disk_used=$(echo "$disk_line" | awk '{print $3}') + disk_total=$(echo "$disk_line" | awk '{print $2}') + pass "Resources" "Disk usage — ${disk_pct}% (${disk_used} / ${disk_total})" +else + fail "Resources" "Disk usage" "Could not read disk info" +fi + +# Docker images +image_count=$(docker images -q 2>/dev/null | wc -l || echo "0") +pass "Resources" "Docker images — $image_count images" + +# Docker volumes +volume_count=$(docker volume ls -q 2>/dev/null | wc -l || echo "0") +pass "Resources" "Docker volumes — $volume_count volumes" + +# Docker containers +container_count=$(docker ps -q 2>/dev/null | wc -l || echo "0") +pass "Resources" "Docker containers — $container_count running" + +# ===================================================== +# Summary +# ===================================================== +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +case "$OUTPUT_FORMAT" in + tap) + echo "TAP version 13" + echo "1..$TEST_NUM" + for line in "${TAP_RESULTS[@]}"; do + echo "$line" + done + echo "# passed: $PASSED" + echo "# failed: $FAILED" + echo "# skipped: $SKIPPED" + echo "# duration: ${DURATION}s" + ;; + junit) + { + echo '' + echo "" + echo " " + echo " " + echo " " + for 
result in "${JUNIT_RESULTS[@]}"; do + echo " $result" + done + echo "" + } > "$JUNIT_FILE" + echo "JUnit results written to $JUNIT_FILE" + ;; + *) + echo "" + echo "────────────────────────────────────────" + echo -e "Summary ${BOLD}$HOSTNAME_STR${NC}" + echo -e " ${GREEN}$PASSED passed${NC} ${RED}$FAILED failed${NC} ${YELLOW}$SKIPPED skipped${NC} (${DURATION}s)" + echo "────────────────────────────────────────" + if [[ "$FAILED" -eq 0 ]]; then + echo -e "${GREEN}All tests passed.${NC}" + else + echo -e "${RED}Some tests failed.${NC}" + fi + ;; +esac + +exit $((FAILED > 0 ? 1 : 0)) diff --git a/dokploy-exporter.sh b/dokploy-exporter.sh new file mode 100755 index 0000000..9f5f9c9 --- /dev/null +++ b/dokploy-exporter.sh @@ -0,0 +1,470 @@ +#!/bin/bash +################################################################################ +# Script Name: dokploy-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Dokploy PaaS providing operational +# metrics via the Dokploy API — project counts, application status, +# database breakdown by type, compose services, server info, +# and API health +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Dokploy instance running with API enabled +# - Dokploy API key (generate in Settings → API) +# - curl for API calls +# - jq for JSON parsing +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# ./dokploy-exporter.sh +# +# # HTTP server mode +# ./dokploy-exporter.sh --http -p 9197 +# +# # Textfile collector mode +# ./dokploy-exporter.sh --textfile +# +# # Custom API token and URL +# ./dokploy-exporter.sh --api-url http://dokploy.local:3000 --api-token mytoken +# +# Metrics Exported: +# - dokploy_up - API reachability (1=up, 0=down) +# - dokploy_info{version} - Dokploy version info +# - dokploy_projects_total - Total project count +# - dokploy_applications_total - Total applications across all projects +# - 
dokploy_applications_by_status{status} - Applications by status +# - dokploy_compose_services_total - Total Docker Compose services +# - dokploy_databases_total - Total managed databases +# - dokploy_databases_by_type{type} - Databases by type +# - dokploy_servers_total - Total servers managed +# - dokploy_exporter_duration_seconds - Script execution time +# - dokploy_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9197 +# Default API URL: http://localhost:3000 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9197 +API_URL="http://localhost:3000" +API_TOKEN="" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Check prerequisites +# Returns: 0 if OK, 1 if error +check_prerequisites() { + if ! command -v curl >/dev/null 2>&1; then + echo "ERROR: curl not found" >&2 + return 1 + fi + + if ! 
command -v jq >/dev/null 2>&1; then + echo "ERROR: jq not found (required for JSON parsing)" >&2 + return 1 + fi + + if [ -z "$API_TOKEN" ]; then + echo "ERROR: --api-token is required" >&2 + return 1 + fi + + return 0 +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# Make an authenticated API call +# Args: $1 - API endpoint path (e.g., /api/project.all) +# Returns: JSON response on stdout +api_call() { + local endpoint="$1" + curl -s -X GET \ + -H "x-api-key: ${API_TOKEN}" \ + -H "Accept: application/json" \ + "${API_URL}${endpoint}" 2>/dev/null +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check prerequisites + if ! 
check_prerequisites; then + cat </dev/null) + + if [ -z "$health_response" ]; then + cat </dev/null) + + if [ "$is_error" = "yes" ]; then + cat </dev/null) + dokploy_version="${dokploy_version:-unknown}" + + cat </dev/null) + total_projects=${total_projects:-0} + + # Count applications across all projects + total_apps=$(echo "$projects_response" | jq '[.[] | (.applications // []) | length] | add // 0' 2>/dev/null) + total_apps=${total_apps:-0} + + # Count applications by status + done_apps=$(echo "$projects_response" | jq '[.[] | (.applications // [])[] | select(.applicationStatus == "done")] | length' 2>/dev/null) + idle_apps=$(echo "$projects_response" | jq '[.[] | (.applications // [])[] | select(.applicationStatus == "idle")] | length' 2>/dev/null) + running_apps=$(echo "$projects_response" | jq '[.[] | (.applications // [])[] | select(.applicationStatus == "running")] | length' 2>/dev/null) + error_apps=$(echo "$projects_response" | jq '[.[] | (.applications // [])[] | select(.applicationStatus == "error")] | length' 2>/dev/null) + done_apps=${done_apps:-0} + idle_apps=${idle_apps:-0} + running_apps=${running_apps:-0} + error_apps=${error_apps:-0} + + # Count compose services across all projects + total_compose=$(echo "$projects_response" | jq '[.[] | (.compose // []) | length] | add // 0' 2>/dev/null) + total_compose=${total_compose:-0} + + # Count databases by type across all projects + pg_count=$(echo "$projects_response" | jq '[.[] | (.postgres // []) | length] | add // 0' 2>/dev/null) + mysql_count=$(echo "$projects_response" | jq '[.[] | (.mysql // []) | length] | add // 0' 2>/dev/null) + mariadb_count=$(echo "$projects_response" | jq '[.[] | (.mariadb // []) | length] | add // 0' 2>/dev/null) + mongo_count=$(echo "$projects_response" | jq '[.[] | (.mongo // []) | length] | add // 0' 2>/dev/null) + redis_count=$(echo "$projects_response" | jq '[.[] | (.redis // []) | length] | add // 0' 2>/dev/null) + pg_count=${pg_count:-0} + 
mysql_count=${mysql_count:-0} + mariadb_count=${mariadb_count:-0} + mongo_count=${mongo_count:-0} + redis_count=${redis_count:-0} + + total_databases=$((pg_count + mysql_count + mariadb_count + mongo_count + redis_count)) + fi + + cat </dev/null) + total_servers=${total_servers:-0} + fi + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Dokploy Exporter v1.0 + +

Dokploy Prometheus Exporter v1.0

+

Metrics

+

Operational metrics from the Dokploy API.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.dokploy_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/dovecot-metrics-exporter.sh b/dovecot-metrics-exporter.sh new file mode 100644 index 0000000..0ff0f32 --- /dev/null +++ b/dovecot-metrics-exporter.sh @@ -0,0 +1,372 @@ +#!/bin/bash +################################################################################ +# Script Name: dovecot-metrics-exporter.sh +# Description: Prometheus exporter for Dovecot IMAP/POP3 server metrics +# +# Collects connection counts, 
authentication stats, mailbox operations, +# process info, and protocol-level metrics from doveadm and Dovecot stats. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Usage: +# # Output to stdout +# ./dovecot-metrics-exporter.sh +# +# # Textfile collector mode (atomic write) +# ./dovecot-metrics-exporter.sh --textfile +# +# # Custom output file +# ./dovecot-metrics-exporter.sh -o /path/to/metrics.prom +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HOSTNAME=$(hostname) + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Safe integer extraction — returns 0 on failure +safe_int() { + local val="$1" + if [[ "$val" =~ ^[0-9]+$ ]]; then + echo "$val" + else + echo 0 + fi +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +generate_metrics() { + local START_TIME + START_TIME=$(date +%s.%N) + +# --- Exporter info --- +echo "# HELP dovecot_up Exporter status (1=up, 0=down)" +echo "# TYPE dovecot_up gauge" + +# Check if Dovecot is running +if systemctl is-active --quiet dovecot 2>/dev/null; then + echo "dovecot_up 1" +else + echo "dovecot_up 0" +fi + +echo "" +echo "# HELP dovecot_exporter_info Exporter version information" +echo "# TYPE dovecot_exporter_info gauge" +echo 'dovecot_exporter_info{version="1.0"} 1' +echo "" + +# --- Dovecot version --- +echo "# HELP dovecot_version_info Dovecot 
version information" +echo "# TYPE dovecot_version_info gauge" +local dovecot_version +dovecot_version=$(dovecot --version 2>/dev/null | awk '{print $1}') || dovecot_version="unknown" +echo "dovecot_version_info{version=\"${dovecot_version}\"} 1" +echo "" + +# --- Process counts --- +echo "# HELP dovecot_processes Number of running Dovecot processes by type" +echo "# TYPE dovecot_processes gauge" +for proc_type in imap pop3 lmtp managesieve submission auth anvil; do + count=$(pgrep -c "dovecot/${proc_type}" 2>/dev/null) || count=0 + echo "dovecot_processes{type=\"${proc_type}\"} ${count}" +done +local total_procs +total_procs=$(pgrep -c dovecot 2>/dev/null) || total_procs=0 +echo "dovecot_processes{type=\"total\"} ${total_procs}" +echo "" + +# --- Connected users (from doveadm) --- +echo "# HELP dovecot_connected_users Number of currently connected users by protocol" +echo "# TYPE dovecot_connected_users gauge" +local imap_users=0 pop3_users=0 lmtp_users=0 managesieve_users=0 +if command -v doveadm >/dev/null 2>&1; then + imap_users=$(doveadm who -1 2>/dev/null | grep -c 'imap' 2>/dev/null) || imap_users=0 + pop3_users=$(doveadm who -1 2>/dev/null | grep -c 'pop3' 2>/dev/null) || pop3_users=0 + lmtp_users=$(doveadm who -1 2>/dev/null | grep -c 'lmtp' 2>/dev/null) || lmtp_users=0 + managesieve_users=$(doveadm who -1 2>/dev/null | grep -c 'managesieve' 2>/dev/null) || managesieve_users=0 +fi +echo "dovecot_connected_users{protocol=\"imap\"} ${imap_users}" +echo "dovecot_connected_users{protocol=\"pop3\"} ${pop3_users}" +echo "dovecot_connected_users{protocol=\"lmtp\"} ${lmtp_users}" +echo "dovecot_connected_users{protocol=\"managesieve\"} ${managesieve_users}" +echo "" + +# --- Total connections (from doveadm who) --- +echo "# HELP dovecot_connections_total Total active connections by protocol" +echo "# TYPE dovecot_connections_total gauge" +local imap_conns=0 pop3_conns=0 +if command -v doveadm >/dev/null 2>&1; then + imap_conns=$(doveadm who -1 2>/dev/null | grep 
'imap' | awk '{sum+=$3} END {print sum+0}' 2>/dev/null) || imap_conns=0 + pop3_conns=$(doveadm who -1 2>/dev/null | grep 'pop3' | awk '{sum+=$3} END {print sum+0}' 2>/dev/null) || pop3_conns=0 +fi +echo "dovecot_connections_total{protocol=\"imap\"} ${imap_conns}" +echo "dovecot_connections_total{protocol=\"pop3\"} ${pop3_conns}" +echo "" + +# --- Authentication stats from mail.log --- +local LOG_FILE="/var/log/mail.log" +if [[ ! -f "$LOG_FILE" ]]; then + LOG_FILE="/var/log/maillog" +fi + +echo "# HELP dovecot_auth_success_total Successful authentication attempts by protocol" +echo "# TYPE dovecot_auth_success_total counter" +local imap_auth_ok=0 pop3_auth_ok=0 +if [[ -f "$LOG_FILE" ]]; then + imap_auth_ok=$(grep -c 'imap-login: Info: Login:' "$LOG_FILE" 2>/dev/null) || imap_auth_ok=0 + pop3_auth_ok=$(grep -c 'pop3-login: Info: Login:' "$LOG_FILE" 2>/dev/null) || pop3_auth_ok=0 +fi +echo "dovecot_auth_success_total{protocol=\"imap\"} ${imap_auth_ok}" +echo "dovecot_auth_success_total{protocol=\"pop3\"} ${pop3_auth_ok}" +echo "" + +echo "# HELP dovecot_auth_failed_total Failed authentication attempts by protocol" +echo "# TYPE dovecot_auth_failed_total counter" +local imap_auth_fail=0 pop3_auth_fail=0 +if [[ -f "$LOG_FILE" ]]; then + imap_auth_fail=$(grep -c 'imap-login:.*auth failed\|imap-login: Info: Aborted login' "$LOG_FILE" 2>/dev/null) || imap_auth_fail=0 + pop3_auth_fail=$(grep -c 'pop3-login:.*auth failed\|pop3-login: Info: Aborted login' "$LOG_FILE" 2>/dev/null) || pop3_auth_fail=0 +fi +echo "dovecot_auth_failed_total{protocol=\"imap\"} ${imap_auth_fail}" +echo "dovecot_auth_failed_total{protocol=\"pop3\"} ${pop3_auth_fail}" +echo "" + +# --- TLS connections --- +echo "# HELP dovecot_tls_connections_total TLS connections by status" +echo "# TYPE dovecot_tls_connections_total counter" +local tls_yes=0 tls_no=0 +if [[ -f "$LOG_FILE" ]]; then + tls_yes=$(grep -c 'Login:.*TLS' "$LOG_FILE" 2>/dev/null) || tls_yes=0 + tls_no=$(grep 'Login:' "$LOG_FILE" 2>/dev/null 
| grep -cv 'TLS' 2>/dev/null) || tls_no=0 +fi +echo "dovecot_tls_connections_total{tls=\"yes\"} ${tls_yes}" +echo "dovecot_tls_connections_total{tls=\"no\"} ${tls_no}" +echo "" + +# --- Authentication methods --- +echo "# HELP dovecot_auth_method_total Logins by authentication method" +echo "# TYPE dovecot_auth_method_total counter" +if [[ -f "$LOG_FILE" ]]; then + for method in PLAIN LOGIN CRAM-MD5 DIGEST-MD5; do + count=$(grep -c "Login:.*method=${method}" "$LOG_FILE" 2>/dev/null) || count=0 + echo "dovecot_auth_method_total{method=\"${method}\"} ${count}" + done +else + for method in PLAIN LOGIN CRAM-MD5 DIGEST-MD5; do + echo "dovecot_auth_method_total{method=\"${method}\"} 0" + done +fi +echo "" + +# --- Disconnections --- +echo "# HELP dovecot_disconnections_total Client disconnections by reason" +echo "# TYPE dovecot_disconnections_total counter" +local dc_logout=0 dc_timeout=0 dc_closed=0 dc_internal=0 +if [[ -f "$LOG_FILE" ]]; then + dc_logout=$(grep -c 'Logged out' "$LOG_FILE" 2>/dev/null) || dc_logout=0 + dc_timeout=$(grep -c 'Disconnected.*Timed out\|Connection timed out' "$LOG_FILE" 2>/dev/null) || dc_timeout=0 + dc_closed=$(grep -c 'Disconnected.*Connection closed' "$LOG_FILE" 2>/dev/null) || dc_closed=0 + dc_internal=$(grep -c 'Disconnected.*Internal error' "$LOG_FILE" 2>/dev/null) || dc_internal=0 +fi +echo "dovecot_disconnections_total{reason=\"logout\"} ${dc_logout}" +echo "dovecot_disconnections_total{reason=\"timeout\"} ${dc_timeout}" +echo "dovecot_disconnections_total{reason=\"connection_closed\"} ${dc_closed}" +echo "dovecot_disconnections_total{reason=\"internal_error\"} ${dc_internal}" +echo "" + +# --- LMTP delivery stats --- +echo "# HELP dovecot_lmtp_deliveries_total LMTP deliveries by status" +echo "# TYPE dovecot_lmtp_deliveries_total counter" +local lmtp_ok=0 lmtp_reject=0 lmtp_tempfail=0 +if [[ -f "$LOG_FILE" ]]; then + lmtp_ok=$(grep -c 'lmtp.*saved mail' "$LOG_FILE" 2>/dev/null) || lmtp_ok=0 + lmtp_reject=$(grep -c 'lmtp.*rejected' 
"$LOG_FILE" 2>/dev/null) || lmtp_reject=0 + lmtp_tempfail=$(grep -c 'lmtp.*temporary failure\|lmtp.*temp-fail' "$LOG_FILE" 2>/dev/null) || lmtp_tempfail=0 +fi +echo "dovecot_lmtp_deliveries_total{status=\"delivered\"} ${lmtp_ok}" +echo "dovecot_lmtp_deliveries_total{status=\"rejected\"} ${lmtp_reject}" +echo "dovecot_lmtp_deliveries_total{status=\"tempfail\"} ${lmtp_tempfail}" +echo "" + +# --- Sieve stats --- +echo "# HELP dovecot_sieve_actions_total Sieve filter actions" +echo "# TYPE dovecot_sieve_actions_total counter" +local sieve_filed=0 sieve_discard=0 sieve_redirect=0 sieve_reject=0 +if [[ -f "$LOG_FILE" ]]; then + sieve_filed=$(grep -c 'sieve:.*stored mail\|sieve:.*fileinto' "$LOG_FILE" 2>/dev/null) || sieve_filed=0 + sieve_discard=$(grep -c 'sieve:.*discard' "$LOG_FILE" 2>/dev/null) || sieve_discard=0 + sieve_redirect=$(grep -c 'sieve:.*redirect' "$LOG_FILE" 2>/dev/null) || sieve_redirect=0 + sieve_reject=$(grep -c 'sieve:.*reject' "$LOG_FILE" 2>/dev/null) || sieve_reject=0 +fi +echo "dovecot_sieve_actions_total{action=\"filed\"} ${sieve_filed}" +echo "dovecot_sieve_actions_total{action=\"discard\"} ${sieve_discard}" +echo "dovecot_sieve_actions_total{action=\"redirect\"} ${sieve_redirect}" +echo "dovecot_sieve_actions_total{action=\"reject\"} ${sieve_reject}" +echo "" + +# --- Dovecot stats (if old_stats or stats plugin enabled) --- +# Try doveadm stats dump for Dovecot 2.3+ +echo "# HELP dovecot_mail_commands_total Mail commands executed" +echo "# TYPE dovecot_mail_commands_total counter" +local cmds_select=0 cmds_fetch=0 cmds_store=0 cmds_search=0 cmds_copy=0 cmds_expunge=0 +if command -v doveadm >/dev/null 2>&1; then + local stats_output + stats_output=$(doveadm stats dump session 2>/dev/null | head -20) + if [[ -n "$stats_output" ]]; then + cmds_select=$(echo "$stats_output" | awk '{sum+=$4} END {print sum+0}') || cmds_select=0 + cmds_fetch=$(echo "$stats_output" | awk '{sum+=$5} END {print sum+0}') || cmds_fetch=0 + fi +fi +# Fallback: count from 
logs +if [[ -f "$LOG_FILE" ]]; then + cmds_copy=$(grep -c 'Copy\|copy' "$LOG_FILE" 2>/dev/null | head -1) || cmds_copy=0 + cmds_expunge=$(grep -c 'Expunged' "$LOG_FILE" 2>/dev/null) || cmds_expunge=0 +fi +echo "dovecot_mail_commands_total{command=\"copy\"} ${cmds_copy}" +echo "dovecot_mail_commands_total{command=\"expunge\"} ${cmds_expunge}" +echo "" + +# --- Mail storage quota (top users if doveadm quota available) --- +echo "# HELP dovecot_quota_usage_bytes User quota usage in bytes (top users)" +echo "# TYPE dovecot_quota_usage_bytes gauge" +echo "# HELP dovecot_quota_limit_bytes User quota limit in bytes" +echo "# TYPE dovecot_quota_limit_bytes gauge" +if command -v doveadm >/dev/null 2>&1; then + doveadm quota get -A 2>/dev/null | grep 'STORAGE' | head -20 | while IFS=$'\t' read -r user type value limit _; do + local usage_bytes=$((value * 1024)) + local limit_bytes=$((limit * 1024)) + echo "dovecot_quota_usage_bytes{user=\"${user}\"} ${usage_bytes}" + echo "dovecot_quota_limit_bytes{user=\"${user}\"} ${limit_bytes}" + done 2>/dev/null +fi +echo "" + +# --- Dovecot uptime --- +echo "# HELP dovecot_uptime_seconds Dovecot process uptime in seconds" +echo "# TYPE dovecot_uptime_seconds gauge" +local dovecot_pid uptime_seconds=0 +dovecot_pid=$(pgrep -o dovecot 2>/dev/null) || dovecot_pid="" +if [[ -n "$dovecot_pid" ]] && [[ -d "/proc/${dovecot_pid}" ]]; then + local start_time + start_time=$(stat -c %Y "/proc/${dovecot_pid}" 2>/dev/null) || start_time=0 + if [[ "$start_time" -gt 0 ]]; then + uptime_seconds=$(( $(date +%s) - start_time )) + fi +fi +echo "dovecot_uptime_seconds ${uptime_seconds}" +echo "" + +# --- Memory usage --- +echo "# HELP dovecot_memory_bytes Total memory usage of all Dovecot processes" +echo "# TYPE dovecot_memory_bytes gauge" +local total_mem=0 +total_mem=$(pgrep dovecot 2>/dev/null | xargs -I {} cat /proc/{}/status 2>/dev/null | awk '/VmRSS/{sum+=$2} END {print sum*1024+0}') || total_mem=0 +echo "dovecot_memory_bytes ${total_mem}" +echo "" 
+ +# --- Script execution time --- +local END_TIME +END_TIME=$(date +%s.%N) +local DURATION +DURATION=$(echo "$END_TIME - $START_TIME" | bc) + +echo "# HELP dovecot_exporter_duration_seconds Time to generate all metrics" +echo "# TYPE dovecot_exporter_duration_seconds gauge" +echo "dovecot_exporter_duration_seconds ${DURATION}" +echo "" + +echo "# HELP dovecot_exporter_last_run_timestamp Unix timestamp of last successful run" +echo "# TYPE dovecot_exporter_last_run_timestamp gauge" +echo "dovecot_exporter_last_run_timestamp $(date +%s)" +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.dovecot_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/duplicati-exporter.sh b/duplicati-exporter.sh new file mode 100755 index 0000000..7027b3f --- /dev/null +++ b/duplicati-exporter.sh @@ -0,0 +1,445 @@ +#!/bin/bash +################################################################################ +# Script Name: duplicati-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for Duplicati backups — backup job status, +# last run time, backup age, file counts, and size metrics +# +# Author: Phil Connor +# Contact: 
contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - curl installed +# - jq installed +# - netcat (nc) for HTTP mode +# +# Usage: +# ./duplicati-exporter.sh --textfile +# ./duplicati-exporter.sh --http -p 9203 +# ./duplicati-exporter.sh --url http://myhost:8200 --password secret +# DUPLICATI_PASSWORD=secret ./duplicati-exporter.sh --textfile +# +# Configuration: +# Default HTTP port: 9203 +# Default Duplicati URL: http://localhost:8200 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +EXPORTER_VERSION="1.0" +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9203 +DUPLICATI_URL="http://localhost:8200" +DUPLICATI_PASS="${DUPLICATI_PASSWORD:-}" + +show_usage() { + cat <&2; exit 1 ;; + esac + done + # Strip trailing slash from URL + DUPLICATI_URL="${DUPLICATI_URL%/}" +} + +check_dependencies() { + local missing=0 + for cmd in curl jq; do + if ! 
command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: $cmd not found" >&2 + missing=1 + fi + done + return "$missing" +} + +# Authenticate with Duplicati and store cookie jar +duplicati_auth() { + COOKIE_JAR=$(mktemp /tmp/.duplicati_cookies.XXXXXX) + trap 'rm -f "$COOKIE_JAR"' EXIT + + # If no password, try unauthenticated access + if [ -z "$DUPLICATI_PASS" ]; then + return 0 + fi + + # Get XSRF token first + local xsrf_token + xsrf_token=$(curl -s -c "$COOKIE_JAR" -b "$COOKIE_JAR" \ + "${DUPLICATI_URL}/api/v1/auth/refresh" 2>/dev/null | jq -r '.Token // empty') + + if [ -z "$xsrf_token" ]; then + # Try fetching the login page to get cookies + curl -s -c "$COOKIE_JAR" -b "$COOKIE_JAR" \ + "${DUPLICATI_URL}/" >/dev/null 2>&1 + xsrf_token=$(grep -i "xsrf" "$COOKIE_JAR" 2>/dev/null | awk '{print $NF}') + fi + + # Authenticate with password + local auth_response + auth_response=$(curl -s -c "$COOKIE_JAR" -b "$COOKIE_JAR" \ + -X POST "${DUPLICATI_URL}/api/v1/auth/login" \ + -H "Content-Type: application/json" \ + ${xsrf_token:+-H "X-XSRF-Token: $xsrf_token"} \ + -d "{\"Password\":\"${DUPLICATI_PASS}\"}" 2>/dev/null) + + if echo "$auth_response" | jq -e '.Token' >/dev/null 2>&1; then + AUTH_TOKEN=$(echo "$auth_response" | jq -r '.Token') + return 0 + fi + + return 1 +} + +# Make an authenticated API call +api_call() { + local endpoint="$1" + curl -s -b "$COOKIE_JAR" \ + ${AUTH_TOKEN:+-H "Authorization: Bearer $AUTH_TOKEN"} \ + "${DUPLICATI_URL}/api/v1/${endpoint}" 2>/dev/null +} + +# Map status string to numeric value +status_to_number() { + case "$1" in + Success|Completed) echo 1 ;; + Warning) echo 2 ;; + Error|Failed) echo 3 ;; + Fatal) echo 4 ;; + *) echo 0 ;; + esac +} + +generate_metrics() { + local script_start + script_start=$(date +%s%N) + + if ! 
check_dependencies; then + echo "# HELP duplicati_up Exporter status (1=up, 0=down)" + echo "# TYPE duplicati_up gauge" + echo "duplicati_up 0" + return + fi + + # Test server reachability + local server_up=0 + if curl -s --connect-timeout 5 "${DUPLICATI_URL}/api/v1/systeminfo" >/dev/null 2>&1; then + server_up=1 + fi + + # Authenticate + AUTH_TOKEN="" + if [ "$server_up" -eq 1 ]; then + duplicati_auth + fi + + echo "# HELP duplicati_up Duplicati server reachable (1=up, 0=down)" + echo "# TYPE duplicati_up gauge" + echo "duplicati_up $server_up" + + echo "# HELP duplicati_exporter_info Exporter version information" + echo "# TYPE duplicati_exporter_info gauge" + echo "duplicati_exporter_info{version=\"${EXPORTER_VERSION}\"} 1" + + if [ "$server_up" -eq 0 ]; then + echo "# HELP duplicati_backup_count Total number of configured backup jobs" + echo "# TYPE duplicati_backup_count gauge" + echo "duplicati_backup_count 0" + local script_end + script_end=$(date +%s) + echo "# HELP duplicati_exporter_duration_seconds Script execution time" + echo "# TYPE duplicati_exporter_duration_seconds gauge" + echo "duplicati_exporter_duration_seconds 0" + echo "# HELP duplicati_exporter_last_run_timestamp Last successful run" + echo "# TYPE duplicati_exporter_last_run_timestamp gauge" + echo "duplicati_exporter_last_run_timestamp $script_end" + return + fi + + # Fetch all backups + local backups_json + backups_json=$(api_call "backups") + + if [ -z "$backups_json" ] || ! echo "$backups_json" | jq -e '.' 
>/dev/null 2>&1; then + echo "# HELP duplicati_backup_count Total number of configured backup jobs" + echo "# TYPE duplicati_backup_count gauge" + echo "duplicati_backup_count 0" + local script_end + script_end=$(date +%s) + echo "# HELP duplicati_exporter_duration_seconds Script execution time" + echo "# TYPE duplicati_exporter_duration_seconds gauge" + echo "duplicati_exporter_duration_seconds 0" + echo "# HELP duplicati_exporter_last_run_timestamp Last successful run" + echo "# TYPE duplicati_exporter_last_run_timestamp gauge" + echo "duplicati_exporter_last_run_timestamp $script_end" + return + fi + + local backup_count + backup_count=$(echo "$backups_json" | jq 'length') + echo "# HELP duplicati_backup_count Total number of configured backup jobs" + echo "# TYPE duplicati_backup_count gauge" + echo "duplicati_backup_count ${backup_count:-0}" + + local now + now=$(date +%s) + + # Collect per-backup metrics into arrays so HELP/TYPE appears once per metric + local info_lines=() + local last_run_lines=() + local age_lines=() + local duration_lines=() + local status_lines=() + local files_lines=() + local size_lines=() + local uploaded_lines=() + local next_ts_lines=() + local next_sec_lines=() + local error_lines=() + local warning_lines=() + + while IFS= read -r backup; do + local id name target_url + id=$(echo "$backup" | jq -r '.Backup.ID // empty') + name=$(echo "$backup" | jq -r '.Backup.Name // empty') + target_url=$(echo "$backup" | jq -r '.Backup.TargetURL // empty') + + [ -z "$name" ] && continue + + local safe_name="${name//\"/\\\"}" + local safe_target="${target_url//\"/\\\"}" + + info_lines+=("duplicati_backup_info{id=\"${id}\",name=\"${safe_name}\",target_url=\"${safe_target}\"} 1") + + # Last run timestamp + local last_run_ts=0 + local last_run_raw + last_run_raw=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupDate" // empty') + if [ -n "$last_run_raw" ]; then + last_run_ts=$(date -d "$last_run_raw" +%s 2>/dev/null || echo 0) + fi + 
last_run_lines+=("duplicati_backup_last_run_timestamp{name=\"${safe_name}\"} $last_run_ts") + + # Age since last run + local age=0 + if [ "$last_run_ts" -gt 0 ]; then + age=$((now - last_run_ts)) + fi + age_lines+=("duplicati_backup_last_run_age_seconds{name=\"${safe_name}\"} $age") + + # Last duration + local duration + duration=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupDuration" // "0"') + local duration_seconds=0 + if [[ "$duration" =~ ^([0-9]+):([0-9]+):([0-9]+) ]]; then + duration_seconds=$(( BASH_REMATCH[1] * 3600 + BASH_REMATCH[2] * 60 + BASH_REMATCH[3] )) + elif [[ "$duration" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + duration_seconds="${duration%%.*}" + fi + duration_lines+=("duplicati_backup_last_duration_seconds{name=\"${safe_name}\"} $duration_seconds") + + # Last status + local status_raw status_num + status_raw=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupResult" // "Unknown"') + status_num=$(status_to_number "$status_raw") + status_lines+=("duplicati_backup_last_status{name=\"${safe_name}\",status=\"${status_raw}\"} $status_num") + + # Files examined + local files_total + files_total=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupExaminedFiles" // "0"') + files_lines+=("duplicati_backup_files_total{name=\"${safe_name}\"} ${files_total:-0}") + + # Files size + local files_size + files_size=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupSizeOfExaminedFiles" // "0"') + size_lines+=("duplicati_backup_files_size_bytes{name=\"${safe_name}\"} ${files_size:-0}") + + # Uploaded bytes + local uploaded + uploaded=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupUploadedSize" // "0"') + uploaded_lines+=("duplicati_backup_uploaded_bytes{name=\"${safe_name}\"} ${uploaded:-0}") + + # Next run timestamp + local next_run_ts=0 + local next_run_raw + next_run_raw=$(echo "$backup" | jq -r '.Schedule.Time // empty') + if [ -n "$next_run_raw" ]; then + next_run_ts=$(date -d "$next_run_raw" +%s 2>/dev/null || echo 0) + fi + 
next_ts_lines+=("duplicati_backup_next_run_timestamp{name=\"${safe_name}\"} $next_run_ts") + + # Seconds until next run + local next_run_seconds=0 + if [ "$next_run_ts" -gt "$now" ]; then + next_run_seconds=$((next_run_ts - now)) + fi + next_sec_lines+=("duplicati_backup_next_run_seconds{name=\"${safe_name}\"} $next_run_seconds") + + # Error count + local error_count + error_count=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupErrors" // "0"') + error_lines+=("duplicati_backup_error_count{name=\"${safe_name}\"} ${error_count:-0}") + + # Warning count + local warning_count + warning_count=$(echo "$backup" | jq -r '.Backup.Metadata."LastBackupWarnings" // "0"') + warning_lines+=("duplicati_backup_warning_count{name=\"${safe_name}\"} ${warning_count:-0}") + done < <(echo "$backups_json" | jq -c '.[]' 2>/dev/null) + + # Output each metric group with HELP/TYPE immediately before values + if [ ${#info_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_info Backup job information" + echo "# TYPE duplicati_backup_info gauge" + printf '%s\n' "${info_lines[@]}" + fi + if [ ${#last_run_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_last_run_timestamp Unix timestamp of last backup run" + echo "# TYPE duplicati_backup_last_run_timestamp gauge" + printf '%s\n' "${last_run_lines[@]}" + fi + if [ ${#age_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_last_run_age_seconds Seconds since last backup run" + echo "# TYPE duplicati_backup_last_run_age_seconds gauge" + printf '%s\n' "${age_lines[@]}" + fi + if [ ${#duration_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_last_duration_seconds Duration of last backup run in seconds" + echo "# TYPE duplicati_backup_last_duration_seconds gauge" + printf '%s\n' "${duration_lines[@]}" + fi + if [ ${#status_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_last_status Last backup status (Success=1, Warning=2, Error=3, Fatal=4, Unknown=0)" + echo "# TYPE duplicati_backup_last_status gauge" + printf 
'%s\n' "${status_lines[@]}" + fi + if [ ${#files_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_files_total Total files examined in last backup" + echo "# TYPE duplicati_backup_files_total gauge" + printf '%s\n' "${files_lines[@]}" + fi + if [ ${#size_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_files_size_bytes Total size of examined files in bytes" + echo "# TYPE duplicati_backup_files_size_bytes gauge" + printf '%s\n' "${size_lines[@]}" + fi + if [ ${#uploaded_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_uploaded_bytes Bytes uploaded in last backup" + echo "# TYPE duplicati_backup_uploaded_bytes gauge" + printf '%s\n' "${uploaded_lines[@]}" + fi + if [ ${#next_ts_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_next_run_timestamp Next scheduled run unix timestamp" + echo "# TYPE duplicati_backup_next_run_timestamp gauge" + printf '%s\n' "${next_ts_lines[@]}" + fi + if [ ${#next_sec_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_next_run_seconds Seconds until next scheduled run" + echo "# TYPE duplicati_backup_next_run_seconds gauge" + printf '%s\n' "${next_sec_lines[@]}" + fi + if [ ${#error_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_error_count Number of errors in last backup run" + echo "# TYPE duplicati_backup_error_count gauge" + printf '%s\n' "${error_lines[@]}" + fi + if [ ${#warning_lines[@]} -gt 0 ]; then + echo "# HELP duplicati_backup_warning_count Number of warnings in last backup run" + echo "# TYPE duplicati_backup_warning_count gauge" + printf '%s\n' "${warning_lines[@]}" + fi + + local script_end script_duration_ns script_duration + script_end=$(date +%s) + script_duration_ns=$(( $(date +%s%N) - script_start )) + script_duration=$(( script_duration_ns / 1000000000 )) + + echo "# HELP duplicati_exporter_duration_seconds Script execution time" + echo "# TYPE duplicati_exporter_duration_seconds gauge" + echo "duplicati_exporter_duration_seconds $script_duration" + echo "# HELP 
duplicati_exporter_last_run_timestamp Last successful run" + echo "# TYPE duplicati_exporter_last_run_timestamp gauge" + echo "duplicati_exporter_last_run_timestamp $script_end" +} + +run_http_server() { + echo "Starting Duplicati exporter on port $HTTP_PORT..." >&2 + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + echo "Duplicati Exporter

Duplicati Prometheus Exporter

Metrics

" + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +main() { + parse_args "$@" + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + local temp_file + temp_file=$(mktemp "${output_dir}/.duplicati_metrics.XXXXXX") + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/ebs-snapshot-manager.sh b/ebs-snapshot-manager.sh new file mode 100644 index 0000000..ba62a55 --- /dev/null +++ b/ebs-snapshot-manager.sh @@ -0,0 +1,813 @@ +#!/usr/bin/env bash + +######################################################################################### +#### ebs-snapshot-manager.sh — Create, manage, audit, and prune AWS EBS snapshots #### +#### Supports automated creation, cross-region copy, retention, and orphan detection #### +#### Requires: bash 4+, aws-cli v2, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### export AWS_PROFILE="production" #### +#### ./ebs-snapshot-manager.sh --snapshot #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +AWS_REGION="${AWS_REGION:-}" +VOLUME_IDS="${VOLUME_IDS:-}" +VOLUME_TAG_KEY="${VOLUME_TAG_KEY:-}" +VOLUME_TAG_VALUE="${VOLUME_TAG_VALUE:-}" +RETENTION_DAYS="${RETENTION_DAYS:-30}" +COPY_TO_REGION="${COPY_TO_REGION:-}" +SNAPSHOT_DESCRIPTION="${SNAPSHOT_DESCRIPTION:-Automated snapshot by ebs-snapshot-manager}" +NO_WAIT="${NO_WAIT:-false}" +DRY_RUN="${DRY_RUN:-true}" +RESTORE_AZ="${RESTORE_AZ:-}" +RESTORE_VOLUME_TYPE="${RESTORE_VOLUME_TYPE:-gp3}" +RESTORE_IOPS="${RESTORE_IOPS:-}" +RESTORE_THROUGHPUT="${RESTORE_THROUGHPUT:-}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +RUN_MODE="" +TARGET_VOLUME="" +TARGET_SNAPSHOT="" +START_TIME="" +WARNINGS=0 + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; ((WARNINGS++)) || true; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── AWS CLI wrapper ─────────────────────────────────────────────────── +aws_cmd() { + local args=("$@") + [[ -n "$AWS_REGION" ]] && args+=(--region "$AWS_REGION") + verbose "aws ${args[*]}" + 
aws "${args[@]}" +} + +# ── Dependency check ────────────────────────────────────────────────── +check_deps() { + for cmd in aws jq; do + if ! command -v "$cmd" &>/dev/null; then + err "${cmd} is required but not installed" + exit 1 + fi + done + + # Verify AWS credentials + if ! aws sts get-caller-identity &>/dev/null; then + err "AWS credentials not configured or expired" + exit 1 + fi + + # Determine region + if [[ -z "$AWS_REGION" ]]; then + AWS_REGION=$(aws configure get region 2>/dev/null || echo "") + if [[ -z "$AWS_REGION" ]]; then + err "AWS_REGION is required (set via env var or aws configure)" + exit 1 + fi + fi + + verbose "Using region: ${AWS_REGION}" + verbose "Account: $(aws sts get-caller-identity --query 'Account' --output text 2>/dev/null)" +} + +# ── Get volume list ─────────────────────────────────────────────────── +get_volumes() { + local filters=() + + if [[ -n "$VOLUME_IDS" ]]; then + # Specific volumes requested + local vol_array + IFS=',' read -ra vol_array <<< "$VOLUME_IDS" + aws_cmd ec2 describe-volumes \ + --volume-ids "${vol_array[@]}" \ + --query 'Volumes[*].VolumeId' \ + --output text | tr '\t' '\n' + return + fi + + if [[ -n "$VOLUME_TAG_KEY" ]]; then + filters+=(--filters "Name=tag:${VOLUME_TAG_KEY},Values=${VOLUME_TAG_VALUE:-*}") + fi + + aws_cmd ec2 describe-volumes \ + "${filters[@]}" \ + --query 'Volumes[*].VolumeId' \ + --output text | tr '\t' '\n' +} + +# ── Get account ID ─────────────────────────────────────────────────── +get_account_id() { + aws sts get-caller-identity --query 'Account' --output text +} + +# ══════════════════════════════════════════════════════════════════════ +# SNAPSHOT MODE +# ══════════════════════════════════════════════════════════════════════ + +do_snapshot() { + log "Creating EBS snapshots..." 
+ local volumes + volumes=$(get_volumes) + + if [[ -z "$volumes" ]]; then + warn "No volumes found matching criteria" + return + fi + + local vol_count + vol_count=$(echo "$volumes" | wc -l) + log "Found ${vol_count} volume(s) to snapshot" + + local created=0 + local failed=0 + local snapshot_ids=() + local now + now=$(date -u +%Y-%m-%dT%H:%M:%SZ) + + while IFS= read -r vol_id; do + [[ -z "$vol_id" ]] && continue + verbose "Snapshotting ${vol_id}..." + + local vol_name + # shellcheck disable=SC2016 + vol_name=$(aws_cmd ec2 describe-volumes \ + --volume-ids "$vol_id" \ + --query 'Volumes[0].Tags[?Key==`Name`].Value | [0]' \ + --output text 2>/dev/null) || vol_name="N/A" + [[ "$vol_name" == "None" ]] && vol_name="N/A" + + local snap_id + snap_id=$(aws_cmd ec2 create-snapshot \ + --volume-id "$vol_id" \ + --description "$SNAPSHOT_DESCRIPTION" \ + --tag-specifications "ResourceType=snapshot,Tags=[ + {Key=Name,Value=snap-${vol_id}-$(date +%Y%m%d)}, + {Key=CreatedBy,Value=ebs-snapshot-manager}, + {Key=CreatedAt,Value=${now}}, + {Key=VolumeId,Value=${vol_id}}, + {Key=VolumeName,Value=${vol_name}} + ]" \ + --query 'SnapshotId' \ + --output text 2>/dev/null) || snap_id="" + + if [[ -n "$snap_id" ]]; then + echo -e " ${GREEN}✓${RESET} ${vol_id} → ${snap_id} (${vol_name})" + snapshot_ids+=("$snap_id") + ((created++)) || true + else + echo -e " ${RED}✗${RESET} ${vol_id} — snapshot creation failed" + ((failed++)) || true + fi + done <<< "$volumes" + + # Wait for completion + if [[ "$NO_WAIT" != "true" && ${#snapshot_ids[@]} -gt 0 ]]; then + log "Waiting for ${#snapshot_ids[@]} snapshot(s) to complete..." + for snap_id in "${snapshot_ids[@]}"; do + if aws_cmd ec2 wait snapshot-completed --snapshot-ids "$snap_id" 2>/dev/null; then + local size + size=$(aws_cmd ec2 describe-snapshots \ + --snapshot-ids "$snap_id" \ + --query 'Snapshots[0].VolumeSize' \ + --output text 2>/dev/null) || size="?" 
# Prune mode: delete (or, in dry-run mode, only list) snapshots tagged
# CreatedBy=ebs-snapshot-manager whose StartTime is older than RETENTION_DAYS.
#
# Globals read: RETENTION_DAYS, DRY_RUN, YELLOW, GREEN, RED, RESET
# Helpers used: aws_cmd, get_account_id, log, warn, err (defined elsewhere in
#               this script)
# Exits 1 when the retention cutoff cannot be computed.
#
# Safety: a snapshot whose StartTime cannot be parsed is skipped with a
# warning. It must never be treated as "epoch 0", because that is older than
# any cutoff and would get the snapshot deleted.
do_prune() {
  local cutoff_epoch
  # GNU date first, then the BSD/macOS spelling.
  cutoff_epoch=$(date -d "-${RETENTION_DAYS} days" +%s 2>/dev/null) \
    || cutoff_epoch=$(date -v-"${RETENTION_DAYS}"d +%s 2>/dev/null) \
    || {
      err "Could not calculate retention cutoff date"
      exit 1
    }

  # The formatted cutoff is only used in the log line below, so a formatting
  # failure must not abort the run.
  local cutoff_date
  cutoff_date=$(date -d "@${cutoff_epoch}" +%Y-%m-%dT%H:%M:%S 2>/dev/null) \
    || cutoff_date=$(date -r "${cutoff_epoch}" +%Y-%m-%dT%H:%M:%S 2>/dev/null) \
    || cutoff_date="epoch ${cutoff_epoch}"

  log "Pruning snapshots older than ${RETENTION_DAYS} days (before ${cutoff_date})"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "${YELLOW}DRY RUN${RESET} — no snapshots will be deleted. Use --force to delete."
  fi

  local owner_id
  owner_id=$(get_account_id)

  local snapshots_json
  snapshots_json=$(aws_cmd ec2 describe-snapshots \
    --owner-ids "$owner_id" \
    --filters "Name=tag:CreatedBy,Values=ebs-snapshot-manager" \
    --query 'Snapshots[*].{Id:SnapshotId,Start:StartTime,Size:VolumeSize,Vol:VolumeId}' \
    --output json)

  local total
  total=$(jq 'length' <<< "$snapshots_json")

  # "now" is loop-invariant; compute it once.
  local now_epoch
  now_epoch=$(date +%s)

  local expired=0 deleted=0 failed=0 skipped=0
  local snap_id start_time size vol_id snap_epoch age_days

  # One jq call emits every snapshot as a TAB-separated row (tostring turns
  # null into "null", so no field is ever empty). The rows arrive on fd 3 so
  # commands inside the loop cannot consume them from stdin, and the loop runs
  # in the current shell so the counters survive.
  while IFS=$'\t' read -r -u 3 snap_id start_time size vol_id; do
    # GNU date understands the full ISO-8601 stamp. For BSD date, cut the
    # fractional seconds / zone suffix and parse the remainder as UTC (-u).
    if ! snap_epoch=$(date -d "$start_time" +%s 2>/dev/null) \
      && ! snap_epoch=$(date -u -jf "%Y-%m-%dT%H:%M:%S" "${start_time%%[.Z+]*}" +%s 2>/dev/null); then
      warn "Skipping ${snap_id}: cannot parse start time '${start_time}'"
      skipped=$((skipped + 1))
      continue
    fi

    if ((snap_epoch >= cutoff_epoch)); then
      continue
    fi

    expired=$((expired + 1))
    age_days=$(((now_epoch - snap_epoch) / 86400))

    if [[ "$DRY_RUN" == "true" ]]; then
      echo -e " ${YELLOW}⊘${RESET} ${snap_id} — ${age_days}d old, ${size} GiB, vol: ${vol_id} (would delete)"
    elif aws_cmd ec2 delete-snapshot --snapshot-id "$snap_id" 2>/dev/null; then
      echo -e " ${GREEN}✓${RESET} ${snap_id} — deleted (${age_days}d old, ${size} GiB)"
      deleted=$((deleted + 1))
    else
      echo -e " ${RED}✗${RESET} ${snap_id} — delete failed"
      failed=$((failed + 1))
    fi
  done 3< <(jq -r '.[] | [.Id, .Start, .Size, .Vol] | map(tostring) | @tsv' <<< "$snapshots_json")

  log "Total managed snapshots: ${total}"
  log "Expired: ${expired}, deleted: ${deleted}, failed: ${failed}, skipped: ${skipped}"
}
# Convert an ISO-8601 timestamp to epoch seconds.
# Tries GNU date first, then BSD/macOS date (fractional seconds and zone
# suffix stripped, remainder parsed as UTC). Prints the epoch on stdout and
# returns non-zero when neither form can parse the value.
_audit_epoch() {
  local stamp="$1"
  date -d "$stamp" +%s 2>/dev/null \
    || date -u -jf "%Y-%m-%dT%H:%M:%S" "${stamp%%[.Z+]*}" +%s 2>/dev/null
}

# Audit mode: print a snapshot inventory (orphaned / untagged / managed),
# summary statistics, the volumes whose latest completed snapshot is older
# than RETENTION_DAYS (or that have none), and optional Prometheus metrics.
#
# Globals read: AWS_REGION, RETENTION_DAYS, OUTPUT_FORMAT, BOLD, RED, GREEN,
#               YELLOW, RESET
# Helpers used: aws_cmd, get_account_id, log, warn (defined elsewhere in this
#               script)
#
# An account with zero snapshots no longer returns early: that is exactly the
# case where every volume is unprotected, so the volume check and the metrics
# still run.
do_audit() {
  log "Auditing EBS snapshots in ${AWS_REGION}..."

  local owner_id
  owner_id=$(get_account_id)

  local snapshots_json
  snapshots_json=$(aws_cmd ec2 describe-snapshots \
    --owner-ids "$owner_id" \
    --query 'Snapshots[*].{Id:SnapshotId,Vol:VolumeId,Size:VolumeSize,Status:State,Start:StartTime,Desc:Description,Tags:Tags}' \
    --output json)

  local total
  total=$(jq 'length' <<< "$snapshots_json")

  # Existing volume ids, one per line, for orphan detection.
  local existing_volumes
  existing_volumes=$(aws_cmd ec2 describe-volumes \
    --query 'Volumes[*].VolumeId' \
    --output text | tr '\t' '\n' | sort)

  local now_epoch
  now_epoch=$(date +%s)

  if [[ "$total" -eq 0 ]]; then
    log "No snapshots found"
  else
    echo ""
    echo -e "${BOLD}Snapshot Inventory${RESET}"
    printf " %-24s %-14s %8s %6s %s\n" "SNAPSHOT" "VOLUME" "SIZE" "AGE" "STATUS"
    echo " $(printf '%.0s─' {1..70})"

    local snap_id vol_id size status start_time is_managed tag_count
    local snap_epoch age_days status_marker

    # A single jq pass yields one TAB-separated row per snapshot; tostring
    # renders null as "null", so no column is ever empty.
    while IFS=$'\t' read -r snap_id vol_id size status start_time is_managed tag_count; do
      if snap_epoch=$(_audit_epoch "$start_time"); then
        age_days=$(((now_epoch - snap_epoch) / 86400))
      else
        age_days="?" # unknown rather than "days since 1970"
      fi

      # Exact, fixed-string, whole-line match. The here-string avoids the
      # echo|grep -q pipeline, whose status can be spoiled by SIGPIPE when
      # pipefail is active.
      if ! grep -qxF -- "$vol_id" <<< "$existing_volumes"; then
        status_marker="${RED}orphan${RESET}"
      elif [[ "$tag_count" -eq 0 ]]; then
        status_marker="${YELLOW}untagged${RESET}"
      elif [[ "$is_managed" -gt 0 ]]; then
        status_marker="${GREEN}managed${RESET}"
      else
        status_marker="${status}"
      fi

      printf " %-24s %-14s %6s G %4sd %b\n" \
        "$snap_id" "$vol_id" "$size" "$age_days" "$status_marker"
    done < <(jq -r '
      .[] | [
        .Id, .Vol, .Size, .Status, .Start,
        ((.Tags // []) | map(select(.Key == "CreatedBy" and .Value == "ebs-snapshot-manager")) | length),
        ((.Tags // []) | length)
      ] | map(tostring) | @tsv
    ' <<< "$snapshots_json")
  fi

  # Summary stats
  local total_size orphan_count untagged_count managed_count
  total_size=$(jq '[.[].Size] | add // 0' <<< "$snapshots_json")
  orphan_count=$(jq --arg vols "$existing_volumes" '
    [.[] | select(.Vol as $v | ($vols | split("\n") | map(select(. != "")) | index($v) == null))] | length
  ' <<< "$snapshots_json")
  untagged_count=$(jq '[.[] | select((.Tags // []) | length == 0)] | length' <<< "$snapshots_json")
  managed_count=$(jq '[.[] | select((.Tags // []) | map(select(.Key == "CreatedBy" and .Value == "ebs-snapshot-manager")) | length > 0)] | length' <<< "$snapshots_json")

  local monthly_cost
  monthly_cost=$(echo "$total_size * 0.05" | bc 2>/dev/null || echo "?")

  echo ""
  echo -e "${BOLD}Summary${RESET}"
  echo -e " Total snapshots: ${total}"
  echo -e " Managed snapshots: ${managed_count}"
  echo -e " Total storage: ${total_size} GiB"
  echo -e " Est. monthly cost: \$${monthly_cost}"
  echo -e " Orphaned: ${orphan_count}"
  echo -e " Untagged: ${untagged_count}"

  if [[ "$orphan_count" -gt 0 ]]; then
    echo ""
    warn "${orphan_count} orphaned snapshot(s) found — source volume no longer exists"
  fi

  # Check volumes without recent snapshots
  echo ""
  echo -e "${BOLD}Volumes Without Recent Snapshots (>${RETENTION_DAYS}d)${RESET}"

  local volumes_json
  volumes_json=$(aws_cmd ec2 describe-volumes \
    --query 'Volumes[*].{Id:VolumeId,Size:Size,State:State}' \
    --output json)

  # The cutoff does not depend on the volume: compute it once, GNU then BSD.
  local cutoff_epoch
  cutoff_epoch=$(date -d "-${RETENTION_DAYS} days" +%s 2>/dev/null) \
    || cutoff_epoch=$(date -v-"${RETENTION_DAYS}"d +%s 2>/dev/null) \
    || {
      warn "Could not calculate retention cutoff — stale-snapshot check disabled"
      cutoff_epoch=0
    }

  local v_id v_size latest_snap_time latest_epoch age
  # Rows arrive on fd 3 so aws_cmd inside the loop cannot eat them from stdin.
  while IFS=$'\t' read -r -u 3 v_id v_size; do
    latest_snap_time=$(aws_cmd ec2 describe-snapshots \
      --owner-ids "$owner_id" \
      --filters "Name=volume-id,Values=${v_id}" "Name=status,Values=completed" \
      --query 'sort_by(Snapshots, &StartTime)[-1].StartTime' \
      --output text 2>/dev/null) || latest_snap_time="None"

    if [[ "$latest_snap_time" == "None" || -z "$latest_snap_time" ]]; then
      echo -e " ${RED}✗${RESET} ${v_id} (${v_size} GiB) — ${RED}no snapshots${RESET}"
    elif ! latest_epoch=$(_audit_epoch "$latest_snap_time"); then
      warn "Cannot parse snapshot time '${latest_snap_time}' for ${v_id}"
    elif ((latest_epoch < cutoff_epoch)); then
      age=$(((now_epoch - latest_epoch) / 86400))
      echo -e " ${YELLOW}!${RESET} ${v_id} (${v_size} GiB) — last snapshot ${age}d ago"
    fi
  done 3< <(jq -r '.[] | [.Id, .Size] | map(tostring) | @tsv' <<< "$volumes_json")

  # Prometheus output
  if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then
    echo ""
    echo "# HELP ebs_snapshots_total Total EBS snapshots"
    echo "# TYPE ebs_snapshots_total gauge"
    echo "ebs_snapshots_total{region=\"${AWS_REGION}\"} ${total}"
    echo "# HELP ebs_snapshots_managed_total Managed EBS snapshots"
    echo "# TYPE ebs_snapshots_managed_total gauge"
    echo "ebs_snapshots_managed_total{region=\"${AWS_REGION}\"} ${managed_count}"
    echo "# HELP ebs_snapshots_orphaned_total Orphaned EBS snapshots"
    echo "# TYPE ebs_snapshots_orphaned_total gauge"
    echo "ebs_snapshots_orphaned_total{region=\"${AWS_REGION}\"} ${orphan_count}"
    echo "# HELP ebs_snapshots_untagged_total Untagged EBS snapshots"
    echo "# TYPE ebs_snapshots_untagged_total gauge"
    echo "ebs_snapshots_untagged_total{region=\"${AWS_REGION}\"} ${untagged_count}"
    echo "# HELP ebs_snapshots_size_gib_total Total snapshot storage in GiB"
    echo "# TYPE ebs_snapshots_size_gib_total gauge"
    echo "ebs_snapshots_size_gib_total{region=\"${AWS_REGION}\"} ${total_size}"
  fi
}
# List mode: show the account's snapshots, newest first, optionally limited
# to TARGET_VOLUME, as a table or (OUTPUT_FORMAT=json) as raw JSON.
#
# Globals read: TARGET_VOLUME, OUTPUT_FORMAT
# Helpers used: aws_cmd, get_account_id, log (defined elsewhere in this script)
do_list() {
  local owner_id
  owner_id=$(get_account_id)

  # Build the optional arguments as an array so nothing depends on unquoted
  # word splitting. This replaces a filters array that was built and never
  # passed to the CLI.
  local query_args=(--owner-ids "$owner_id")
  if [[ -n "$TARGET_VOLUME" ]]; then
    query_args+=(--filters "Name=volume-id,Values=${TARGET_VOLUME}")
  fi

  local snapshots_json
  snapshots_json=$(aws_cmd ec2 describe-snapshots \
    "${query_args[@]}" \
    --query 'sort_by(Snapshots, &StartTime) | reverse(@) | [*].{Id:SnapshotId,Vol:VolumeId,Size:VolumeSize,Status:State,Start:StartTime,Desc:Description}' \
    --output json)

  # JSON consumers always get a JSON document, "[]" included, instead of a
  # human-readable "No snapshots found" line.
  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    jq '.' <<< "$snapshots_json"
    return
  fi

  local total
  total=$(jq 'length' <<< "$snapshots_json")

  if [[ "$total" -eq 0 ]]; then
    log "No snapshots found"
    return
  fi

  echo ""
  printf " %-24s %-14s %8s %-12s %-22s %s\n" "SNAPSHOT" "VOLUME" "SIZE" "STATUS" "CREATED" "DESCRIPTION"
  echo " $(printf '%.0s─' {1..100})"

  # One jq pass per listing instead of six per snapshot. jq truncates the
  # timestamp to 19 characters and the description to 40, and flattens
  # tabs/newlines in the description so every snapshot stays on one table row.
  # The description is the last column, so it may be empty.
  local snap_id vol_id size status start_time desc
  while IFS=$'\t' read -r snap_id vol_id size status start_time desc; do
    printf " %-24s %-14s %6s G %-12s %-22s %s\n" \
      "$snap_id" "$vol_id" "$size" "$status" "$start_time" "$desc"
  done < <(jq -r '
    .[] | [
      .Id, .Vol, .Size, .Status,
      (.Start | tostring | .[0:19]),
      (.Desc | tostring | gsub("[\\t\\r\\n]"; " ") | .[0:40])
    ] | map(tostring) | join("\t")
  ' <<< "$snapshots_json")

  echo ""
  log "Total: ${total} snapshot(s)"
}
+#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./ec2-inventory-reporter.sh #### +#### ./ec2-inventory-reporter.sh --all-regions --format csv #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +AWS_REGION="${AWS_REGION:-us-east-1}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +REQUIRED_TAGS="${REQUIRED_TAGS:-Name,Environment,Owner}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +SCAN_REGION="$AWS_REGION" +ALL_REGIONS="false" +FILTER_STATE="" +FILTER_TAG_KEY="" +FILTER_TAG_VALUE="" +FILTER_TYPE="" +TAG_CHECK="false" +SG_AUDIT="false" +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET="" +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + return + fi + if [[ "$COLOR" == "auto" && ! 
# ── Logging ───────────────────────────────────────────────────────────
# Every helper writes to stderr so stdout carries report data only.
# log_debug is called inside $(get_instances ...), whose stdout is parsed as
# JSON, and log_info runs while csv/json reports are being emitted, so a log
# line on stdout corrupts the machine-readable output.
#
# The colour variables are expanded inside the format string on purpose:
# they hold backslash escapes (\033[...) that printf has to interpret.
# shellcheck disable=SC2059
log_info() { printf "${GREEN}[INFO]${RESET} %s\n" "$*" >&2; }
# shellcheck disable=SC2059
log_warn() { printf "${YELLOW}[WARN]${RESET} %s\n" "$*" >&2; }
# shellcheck disable=SC2059
log_error() { printf "${RED}[ERROR]${RESET} %s\n" "$*" >&2; }

# Print a debug line only when VERBOSE=true.
# Always returns 0. The former `[[ ... ]] && printf` one-liner returned 1
# whenever VERBOSE was off, which aborts a `set -e` script at the first bare
# log_debug call.
# shellcheck disable=SC2059
log_debug() {
  if [[ "$VERBOSE" == "true" ]]; then
    printf "${DIM}[DEBUG] %s${RESET}\n" "$*" >&2
  fi
}
# ── Cost estimation ──────────────────────────────────────────────────
# Print the estimated monthly on-demand cost of one instance.
#
# Arguments:
#   $1 - instance type, looked up in the PRICING table ($/hr)
#   $2 - instance state; anything other than "running" costs 0.00
#   $3 - optional billable hours per month (default: 730)
# Outputs:
#   "0.00" for non-running instances, "N/A" for types missing from PRICING,
#   otherwise the cost with two decimals (no trailing newline).
#
# The arithmetic uses awk rather than bc: bc is neither checked by check_deps
# nor listed in the script's requirements, and when it was missing the old
# pipeline silently produced "0.00". LC_ALL=C keeps "." as the decimal
# separator regardless of the user's locale.
estimate_cost() {
  local instance_type="$1" state="$2"
  local hours_per_month="${3:-730}"

  if [[ "$state" != "running" ]]; then
    echo "0.00"
    return
  fi

  local hourly="${PRICING[$instance_type]:-}"
  if [[ -z "$hourly" ]]; then
    echo "N/A"
    return
  fi

  LC_ALL=C awk -v rate="$hourly" -v hours="$hours_per_month" \
    'BEGIN { printf "%.2f", rate * hours }'
}
"PASS" + else + echo "MISSING: ${missing[*]}" + fi +} + +# ── Security group audit ───────────────────────────────────────────── +audit_security_groups() { + local region="$1" + shift + local sg_ids=("$@") + local findings=() + + if (( ${#sg_ids[@]} == 0 )); then + echo "—" + return + fi + + local sg_data + sg_data=$(aws ec2 describe-security-groups \ + --region "$region" \ + --group-ids "${sg_ids[@]}" \ + --output json 2>/dev/null) || { echo "ERROR"; return; } + + local open_rules + open_rules=$(echo "$sg_data" | jq -r ' + .SecurityGroups[].IpPermissions[] | + select( + (.IpRanges[]?.CidrIp == "0.0.0.0/0") or + (.Ipv6Ranges[]?.CidrIpv6 == "::/0") + ) | + select( + (.FromPort != 80 or .ToPort != 80) and + (.FromPort != 443 or .ToPort != 443) + ) | + if .FromPort == .ToPort then + "port \(.FromPort // "all")" + elif .FromPort == -1 then + "all ports" + else + "ports \(.FromPort)-\(.ToPort)" + end + ' 2>/dev/null) + + if [[ -z "$open_rules" ]]; then + echo "OK" + else + local unique + unique=$(echo "$open_rules" | sort -u | paste -sd ", " -) + echo "OPEN: $unique" + fi +} + +# ── Query EC2 instances ────────────────────────────────────────────── +get_instances() { + local region="$1" + local filters=() + + if [[ -n "$FILTER_STATE" ]]; then + filters+=("Name=instance-state-name,Values=$FILTER_STATE") + fi + if [[ -n "$FILTER_TAG_KEY" && -n "$FILTER_TAG_VALUE" ]]; then + filters+=("Name=tag:$FILTER_TAG_KEY,Values=$FILTER_TAG_VALUE") + fi + if [[ -n "$FILTER_TYPE" ]]; then + filters+=("Name=instance-type,Values=$FILTER_TYPE") + fi + + local cmd=( + aws ec2 describe-instances + --region "$region" + --output json + ) + + if (( ${#filters[@]} > 0 )); then + cmd+=(--filters "${filters[@]}") + fi + + log_debug "Running: ${cmd[*]}" + + local result="" + local next_token="" + + while true; do + local page_cmd=("${cmd[@]}") + if [[ -n "$next_token" ]]; then + page_cmd+=(--starting-token "$next_token") + fi + + local page + page=$("${page_cmd[@]}" 2>/dev/null) || { log_warn 
# ── Text table output ────────────────────────────────────────────────
# Render the human-readable inventory table for one region.
#
# Arguments:
#   $1 - region name
#   $2 - JSON array of instance objects (as produced by get_instances)
# Globals read: TAG_CHECK, SG_AUDIT, YELLOW, RED, GREEN, RESET
# Helpers used: format_uptime, estimate_cost, check_tag_compliance,
#               audit_security_groups (defined elsewhere in this script)
#
# Counters are bumped with x=$((x + 1)): a bare (( x++ )) evaluates to the old
# value, returns status 1 when that value is 0, and aborts the script under
# `set -e` on the first instance. The running total uses awk rather than bc,
# which the script never checks for.
output_text() {
  local region="$1" instances_json="$2"
  local count
  count=$(echo "$instances_json" | jq 'length')

  local account_id
  account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "unknown")

  echo "EC2 Inventory Reporter"
  echo "Account: $account_id"
  echo "Region: $region"
  echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "Instances: $count"
  echo ""

  if ((count == 0)); then
    echo " No instances found."
    echo ""
    return
  fi

  local divider="─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────"

  printf " %-21s %-14s %-10s %-15s %-16s %-17s %-16s %s\n" \
    "INSTANCE ID" "TYPE" "STATE" "AZ" "PRIVATE IP" "PUBLIC IP" "UPTIME" "EST \$/MO"
  printf " %s\n" "$divider"

  # Kept as a preformatted two-decimal string so the final print needs no
  # locale-sensitive %f conversion.
  local total_cost="0.00"
  local running=0 stopped=0 other=0
  local compliance_issues=0 sg_issues=0

  # Everything the loop assigns is declared local so nothing leaks globally.
  local iid itype istate iaz pip eip launch_time tags_json sg_json
  local uptime cost compliance sg_result
  local -a sg_arr

  while IFS=$'\t' read -r iid itype istate iaz pip eip launch_time tags_json sg_json; do
    uptime=$(format_uptime "$launch_time" "$istate")
    cost=$(estimate_cost "$itype" "$istate")

    if [[ -z "$eip" || "$eip" == "null" ]]; then
      eip="—"
    fi

    printf " %-21s %-14s %-10s %-15s %-16s %-17s %-16s %s\n" \
      "$iid" "$itype" "$istate" "$iaz" "$pip" "$eip" "$uptime" "\$$cost"

    if [[ "$cost" != "N/A" ]]; then
      total_cost=$(LC_ALL=C awk -v sum="$total_cost" -v add="$cost" \
        'BEGIN { printf "%.2f", sum + add }')
    fi

    case "$istate" in
      running) running=$((running + 1)) ;;
      stopped) stopped=$((stopped + 1)) ;;
      *) other=$((other + 1)) ;;
    esac

    if [[ "$TAG_CHECK" == "true" ]]; then
      compliance=$(check_tag_compliance "$tags_json")
      if [[ "$compliance" != "PASS" ]]; then
        compliance_issues=$((compliance_issues + 1))
        printf " ${YELLOW} ↳ Tag compliance: %s${RESET}\n" "$compliance"
      fi
    fi

    if [[ "$SG_AUDIT" == "true" && "$istate" == "running" ]]; then
      # Best effort: unreadable security-group JSON simply yields no ids.
      sg_arr=()
      mapfile -t sg_arr < <(jq -r '.[].GroupId' <<< "$sg_json" 2>/dev/null)
      if ((${#sg_arr[@]} > 0)); then
        sg_result=$(audit_security_groups "$region" "${sg_arr[@]}")
        if [[ "$sg_result" != "OK" && "$sg_result" != "—" ]]; then
          sg_issues=$((sg_issues + 1))
          printf " ${RED} ↳ SG audit: %s${RESET}\n" "$sg_result"
        fi
      fi
    fi
  done < <(echo "$instances_json" | jq -r '
    .[] |
    [
      .InstanceId,
      .InstanceType,
      (.State.Name),
      (.Placement.AvailabilityZone),
      (.PrivateIpAddress // "—"),
      (.PublicIpAddress // "null"),
      (.LaunchTime // "null"),
      (.Tags // [] | tojson),
      (.SecurityGroups // [] | tojson)
    ] | @tsv
  ')

  printf " %s\n" "$divider"

  local summary="TOTAL: $count instances"
  local parts=()
  if ((running > 0)); then parts+=("$running running"); fi
  if ((stopped > 0)); then parts+=("$stopped stopped"); fi
  if ((other > 0)); then parts+=("$other other"); fi
  if ((${#parts[@]} > 0)); then
    local joined
    joined=$(printf ", %s" "${parts[@]}")
    summary+=" (${joined:2})" # drop the leading ", "
  fi
  printf " %-70s Estimated monthly cost: \$%s\n" "$summary" "$total_cost"

  if [[ "$TAG_CHECK" == "true" ]]; then
    echo ""
    if ((compliance_issues > 0)); then
      printf " ${YELLOW}Tag compliance issues: %d instance(s) missing required tags${RESET}\n" "$compliance_issues"
    else
      printf " ${GREEN}Tag compliance: all instances have required tags${RESET}\n"
    fi
  fi

  if [[ "$SG_AUDIT" == "true" ]]; then
    if ((sg_issues > 0)); then
      printf " ${RED}Security group issues: %d instance(s) with overly permissive rules${RESET}\n" "$sg_issues"
    else
      printf " ${GREEN}Security groups: no overly permissive rules found${RESET}\n"
    fi
  fi

  echo ""
}
line="$iid,$itype,$istate,$iaz,$pip,$eip,$launch_time,\"$uptime\",$cost" + + if [[ "$TAG_CHECK" == "true" ]]; then + local compliance + compliance=$(check_tag_compliance "$tags_json") + line+=",\"$compliance\"" + fi + + if [[ "$SG_AUDIT" == "true" ]]; then + local sg_ids_list + sg_ids_list=$(echo "$sg_json" | jq -r '.[].GroupId' 2>/dev/null) + if [[ -n "$sg_ids_list" && "$istate" == "running" ]]; then + local sg_arr=() + while IFS= read -r sg; do + [[ -n "$sg" ]] && sg_arr+=("$sg") + done <<< "$sg_ids_list" + local sg_result + sg_result=$(audit_security_groups "$region" "${sg_arr[@]}") + line+=",\"$sg_result\"" + else + line+=",\"—\"" + fi + fi + + line+=",$region" + echo "$line" + + done < <(echo "$instances_json" | jq -r ' + .[] | + [ + .InstanceId, + .InstanceType, + (.State.Name), + (.Placement.AvailabilityZone), + (.PrivateIpAddress // "—"), + (.PublicIpAddress // "null"), + (.LaunchTime // "null"), + (.Tags // [] | tojson), + (.SecurityGroups // [] | tojson) + ] | @tsv + ') +} + +# ── JSON output ─────────────────────────────────────────────────────── +output_json() { + local region="$1" instances_json="$2" + local count + count=$(echo "$instances_json" | jq 'length') + + local items="[]" + + if (( count > 0 )); then + items=$(echo "$instances_json" | jq --arg region "$region" '[ + .[] | { + instance_id: .InstanceId, + type: .InstanceType, + state: .State.Name, + az: .Placement.AvailabilityZone, + private_ip: (.PrivateIpAddress // null), + public_ip: (.PublicIpAddress // null), + launch_time: (.LaunchTime // null), + ami_id: (.ImageId // null), + vpc_id: (.VpcId // null), + key_name: (.KeyName // null), + tags: (.Tags // []), + security_groups: (.SecurityGroups // []), + region: $region + } + ]') + fi + + local account_id + account_id=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "unknown") + + jq -n \ + --arg account "$account_id" \ + --arg region "$region" \ + --arg time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --argjson count 
"$count" \ + --argjson instances "$items" \ + '{ + account: $account, + region: $region, + time: $time, + instance_count: $count, + instances: $instances + }' +} + +# ── Process a single region ────────────────────────────────────────── +process_region() { + local region="$1" + local csv_header_printed="$2" + + log_debug "Querying region: $region" + + local instances_json + instances_json=$(get_instances "$region") + + local count + count=$(echo "$instances_json" | jq 'length' 2>/dev/null || echo 0) + + if (( count == 0 )) && [[ "$ALL_REGIONS" == "true" ]]; then + log_debug "No instances in $region, skipping" + return + fi + + case "$OUTPUT_FORMAT" in + text) + output_text "$region" "$instances_json" + ;; + csv) + if [[ "$csv_header_printed" == "false" ]]; then + output_csv "$region" "$instances_json" + else + output_csv "$region" "$instances_json" | tail -n +2 + fi + ;; + json) + output_json "$region" "$instances_json" + ;; + esac +} + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat < 0 )); do + case "$1" in + --region) + [[ $# -lt 2 ]] && die "--region requires a value" + SCAN_REGION="$2"; shift 2 ;; + --all-regions) + ALL_REGIONS="true"; shift ;; + --state) + [[ $# -lt 2 ]] && die "--state requires a value" + FILTER_STATE="$2"; shift 2 ;; + --tag) + [[ $# -lt 2 ]] && die "--tag requires KEY=VALUE" + [[ "$2" != *"="* ]] && die "--tag value must be KEY=VALUE" + FILTER_TAG_KEY="${2%%=*}"; FILTER_TAG_VALUE="${2#*=}"; shift 2 ;; + --type) + [[ $# -lt 2 ]] && die "--type requires a value" + FILTER_TYPE="$2"; shift 2 ;; + --format) + [[ $# -lt 2 ]] && die "--format requires a value" + OUTPUT_FORMAT="$2"; shift 2 ;; + --tag-check) + TAG_CHECK="true"; shift ;; + --sg-audit) + SG_AUDIT="true"; shift ;; + --verbose) + VERBOSE="true"; shift ;; + --no-color) + COLOR="never"; shift ;; + --help|-h) + usage ;; + *) + die "Unknown option: $1 (see --help)" ;; + esac + done + + case "$OUTPUT_FORMAT" in + text|csv|json) ;; + *) die 
# ── Main ──────────────────────────────────────────────────────────────
# Entry point: parse arguments, verify tooling and credentials, then report
# on one region or on every enabled region.
#
# Globals read:    ALL_REGIONS, OUTPUT_FORMAT, SCAN_REGION
# Globals written: START_TIME
# Helpers used:    parse_args, setup_colors, check_deps, get_all_regions,
#                  process_region, log_info, log_debug, die
#
# In all-regions mode each region is rendered into a buffer first.
# process_region prints nothing for an empty region, so separators and the
# CSV header must only be spent on regions that actually produced output.
# Otherwise the JSON array gets stray commas and the CSV header is lost when
# the first region is empty. Progress messages go to stderr so they never mix
# with the report on stdout.
main() {
  parse_args "$@"
  setup_colors
  check_deps

  START_TIME=$(date +%s)

  # log_debug may return non-zero when VERBOSE is off; don't let that trip
  # errexit.
  log_debug "Validating AWS credentials..." >&2 || true
  aws sts get-caller-identity --output text >/dev/null 2>&1 \
    || die "AWS credentials not configured or expired"

  if [[ "$ALL_REGIONS" == "true" ]]; then
    log_info "Scanning all enabled regions..." >&2

    # Catch the failure here so the explicit error message below is reached
    # instead of a silent errexit abort.
    local regions
    regions=$(get_all_regions) || regions=""
    if [[ -z "$regions" ]]; then
      die "Failed to retrieve region list"
    fi

    if [[ "$OUTPUT_FORMAT" == "json" ]]; then
      echo "["
    fi

    local region report
    local printed_any="false"

    while IFS= read -r region; do
      [[ -n "$region" ]] || continue
      log_info "Scanning $region..." >&2

      # The second argument tells process_region whether a CSV header is
      # already out; that is true exactly when an earlier region printed.
      # A non-zero status is tolerated: an empty region returns whatever its
      # last log call returned.
      report=$(process_region "$region" "$printed_any") || true
      [[ -n "$report" ]] || continue

      if [[ "$OUTPUT_FORMAT" == "json" && "$printed_any" == "true" ]]; then
        echo ","
      fi
      printf '%s\n' "$report"
      if [[ "$OUTPUT_FORMAT" == "text" ]]; then
        echo "" # command substitution stripped the report's trailing blank line
      fi
      printed_any="true"
    done <<< "$regions"

    if [[ "$OUTPUT_FORMAT" == "json" ]]; then
      echo "]"
    fi
  else
    log_info "Scanning region: $SCAN_REGION" >&2
    process_region "$SCAN_REGION" "false"
  fi

  local elapsed=$(($(date +%s) - START_TIME))
  log_info "Completed in ${elapsed}s" >&2
}
+# +# Usage: +# ./elasticsearch-exporter.sh +# ./elasticsearch-exporter.sh --textfile +# ./elasticsearch-exporter.sh --install +# +# Parameters: +# --textfile Write to textfile collector directory +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# ES_URL Elasticsearch REST API URL (default: http://localhost:9200) +# ES_USER Username for basic auth (optional) +# ES_PASS Password for basic auth (optional) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT API request timeout in seconds (default: 10) +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Metrics Exported: +# Core: +# - elasticsearch_up +# - elasticsearch_exporter_info{version} +# +# Cluster Health: +# - elasticsearch_cluster_health{status} +# - elasticsearch_cluster_nodes_total +# - elasticsearch_cluster_data_nodes +# - elasticsearch_cluster_shards_active +# - elasticsearch_cluster_shards_relocating +# - elasticsearch_cluster_shards_initializing +# - elasticsearch_cluster_shards_unassigned +# - elasticsearch_cluster_pending_tasks +# +# Cluster Stats: +# - elasticsearch_indices_total +# - elasticsearch_documents_total +# - elasticsearch_store_size_bytes +# +# Node Stats: +# - elasticsearch_jvm_heap_used_bytes{node} +# - elasticsearch_jvm_heap_max_bytes{node} +# - elasticsearch_search_query_total{node} +# - elasticsearch_indexing_index_total{node} +# - elasticsearch_circuit_breaker_tripped{node,breaker} +# +# Exporter: +# - elasticsearch_exporter_duration_seconds +# - elasticsearch_exporter_last_run_timestamp + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" +ES_URL="${ES_URL:-http://localhost:9200}" +ES_USER="${ES_USER:-}" +ES_PASS="${ES_PASS:-}" +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" 
#######################################
# Perform a GET request against the Elasticsearch REST API.
# Globals:   ES_URL, ES_USER, ES_PASS, CURL_TIMEOUT (read)
# Arguments: $1 - endpoint path appended to ES_URL (e.g. /_cluster/health)
# Outputs:   response body on stdout; an empty line when the request fails
# Returns:   always 0 (callers detect failure by testing for an empty body)
#######################################
api_get() {
    local endpoint="$1"
    local -a curl_args=(-sf --max-time "$CURL_TIMEOUT")

    if [[ -n "$ES_USER" && -n "$ES_PASS" ]]; then
        # Credentials are handed to curl through a config stream on stdin
        # rather than "-u user:pass" on the command line, so the password
        # is not exposed in the process list.
        local credentials="${ES_USER}:${ES_PASS}"
        # Inside a quoted curl config value, backslash and double quote
        # must be escaped (backslash first, so it is not double-escaped).
        credentials=${credentials//\\/\\\\}
        credentials=${credentials//\"/\\\"}

        curl "${curl_args[@]}" --config - "${ES_URL}${endpoint}" 2>/dev/null \
            <<< "user = \"${credentials}\"" || echo ""
        return 0
    fi

    curl "${curl_args[@]}" "${ES_URL}${endpoint}" 2>/dev/null || echo ""
}
#######################################
# Collect cluster-wide totals from /_cluster/stats: number of indices,
# number of documents and on-disk store size.
# Globals:   none directly (metrics are recorded through add_metric)
# Arguments: none
# Outputs:   three gauges via add_metric; nothing when the API call
#            returned an empty body
# Returns:   0
#######################################
collect_cluster_stats() {
    local stats_json
    stats_json=$(api_get "/_cluster/stats")

    [[ -n "$stats_json" ]] || return 0

    # Extract all three numbers with a single jq invocation. The explicit
    # "|| fields=..." matters: when the script runs with "set -e", an
    # unguarded failing command substitution would terminate the whole
    # exporter before the ":-0" defaults below could ever apply.
    local fields=""
    fields=$(jq -r '[(.indices.count // 0),
                     (.indices.docs.count // 0),
                     (.indices.store.size_in_bytes // 0)] | @tsv' \
        <<< "$stats_json" 2>/dev/null) || fields=""

    local indices_count="" doc_count="" store_size=""
    IFS=$'\t' read -r indices_count doc_count store_size <<< "$fields" || true

    add_metric "elasticsearch_indices_total" "gauge" "Total number of indices" "${indices_count:-0}"
    add_metric "elasticsearch_documents_total" "gauge" "Total number of documents" "${doc_count:-0}"
    add_metric "elasticsearch_store_size_bytes" "gauge" "Total store size in bytes" "${store_size:-0}"
}
"$nodes_json" | jq ".nodes[\"${node_id}\"].indices.search.query_total // 0" 2>/dev/null) + + add_metric_value "elasticsearch_search_query_total" "${query_total:-0}" "node=\"${node_name}\"" + done + + # Indexing index total per node + OUTPUT+="# HELP elasticsearch_indexing_index_total Total indexing operations per node +# TYPE elasticsearch_indexing_index_total gauge +" + + local index_total + for node_id in $node_ids; do + node_name=$(echo "$nodes_json" | jq -r ".nodes[\"${node_id}\"].name // \"${node_id}\"" 2>/dev/null) + index_total=$(echo "$nodes_json" | jq ".nodes[\"${node_id}\"].indices.indexing.index_total // 0" 2>/dev/null) + + add_metric_value "elasticsearch_indexing_index_total" "${index_total:-0}" "node=\"${node_name}\"" + done + + # Circuit breaker trips per node per breaker type + OUTPUT+="# HELP elasticsearch_circuit_breaker_tripped Circuit breaker trip count per node and breaker +# TYPE elasticsearch_circuit_breaker_tripped gauge +" + + local breaker_names breaker_name tripped + for node_id in $node_ids; do + node_name=$(echo "$nodes_json" | jq -r ".nodes[\"${node_id}\"].name // \"${node_id}\"" 2>/dev/null) + breaker_names=$(echo "$nodes_json" | jq -r ".nodes[\"${node_id}\"].breakers | keys[]" 2>/dev/null) + + for breaker_name in $breaker_names; do + tripped=$(echo "$nodes_json" | jq ".nodes[\"${node_id}\"].breakers[\"${breaker_name}\"].tripped // 0" 2>/dev/null) + + add_metric_value "elasticsearch_circuit_breaker_tripped" "${tripped:-0}" "node=\"${node_name}\",breaker=\"${breaker_name}\"" + done + done +} + +write_output() { + if [[ "$TEXTFILE_MODE" == true ]]; then + local output_file="${TEXTFILE_DIR}/elasticsearch.prom" + local temp_file="${output_file}.$$" + + mkdir -p "$TEXTFILE_DIR" + echo "$OUTPUT" > "$temp_file" + mv "$temp_file" "$output_file" + else + echo "$OUTPUT" + fi +} + +install_cron() { + if [[ $EUID -ne 0 ]]; then + echo "ERROR: --install requires root" >&2 + exit 1 + fi + + local script_path + script_path=$(readlink -f "$0") + + 
#######################################
# Entry point of the Elasticsearch exporter: parses the command line,
# gathers every metric into OUTPUT and hands it to write_output.
# Globals:   TEXTFILE_MODE, START_TIME (written); VERSION (read)
# Arguments: "$@" - any of --textfile, --install, --help / -h
# Outputs:   metrics via write_output; an error line on stderr for
#            unknown options
# Returns:   0 on a normal run; --install exits the script with 0 after
#            installing the cron job
#######################################
main() {
    # Keep the loop variable out of the global namespace.
    local arg
    for arg in "$@"; do
        case "$arg" in
            --textfile) TEXTFILE_MODE=true ;;
            --install)
                check_dependencies
                validate_config
                install_cron
                exit 0
                ;;
            --help|-h) usage ;;
            *) echo "Unknown option: $arg" >&2; usage ;;
        esac
    done

    check_dependencies
    validate_config

    START_TIME=$(date +%s%N)

    # Exporter info
    add_metric "elasticsearch_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

    # Cluster and node statistics are only queried when the health
    # endpoint answered, i.e. the cluster is reachable.
    if collect_cluster_health; then
        collect_cluster_stats
        collect_node_stats
    fi

    # Exporter performance. The duration is computed with shell integer
    # arithmetic (truncated to two decimals) instead of piping through bc,
    # so the metric no longer degrades to 0 on hosts without bc. If date
    # does not produce a purely numeric nanosecond stamp, fall back to 0.
    local end_time duration="0"
    end_time=$(date +%s%N)
    if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
        local elapsed_ns=$(( end_time - START_TIME ))
        (( elapsed_ns >= 0 )) || elapsed_ns=0
        printf -v duration '%d.%02d' \
            "$(( elapsed_ns / 1000000000 ))" \
            "$(( (elapsed_ns % 1000000000) / 10000000 ))"
    fi
    add_metric "elasticsearch_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration"
    add_metric "elasticsearch_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

    write_output
}
"ExchangeMetricsExporter" + $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + + if (-not $existingTask) { + $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`"" + + if (-not $TaskIntervalMinutes -or $TaskIntervalMinutes -le 0) { + throw "TaskIntervalMinutes must be a positive integer" + } + + $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365) + $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest + + try { + Write-Host "Creating scheduled task: $taskName" + Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports Exchange metrics for Prometheus every $TaskIntervalMinutes minutes" + + $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + if (-not $createdTask) { + throw "Failed to verify scheduled task creation" + } + Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green + } catch { + Write-Error "Failed to create auto-start task: $($_.Exception.Message)" + throw + } + } else { + Write-Host "Scheduled task '$taskName' already exists, skipping creation" + } +} + $StartTime = Get-Date $Hostname = $env:COMPUTERNAME diff --git a/expand-drive.sh b/expand-drive.sh index f85c145..d699e3a 100755 --- a/expand-drive.sh +++ b/expand-drive.sh @@ -7,10 +7,13 @@ #### Author: Phil Connor #### #### Contact: contact@mylinux.work #### #### License: MIT #### -#### Version: 2.3 #### +#### Version: 2.4 #### #### #### #### Usage: sudo ./expand-drive.sh #### ############################################################# +# v2.4 changes: +# - Fixed: grep in pipeline crashes under set -euo pipefail when no matches found. 
Added || true guard +############################################################# # Set strict error handling: # -e: Exit immediately if a command exits with a non-zero status @@ -189,7 +192,7 @@ process_partition() { # Extract partition number from device path (e.g., extract "1" from "/dev/sda1") local part_num - part_num=$(echo "$partition" | grep -o '[0-9]\+$' | tail -1) + part_num=$(echo "$partition" | { grep -o '[0-9]\+$' || true; } | tail -1) if [ -z "$part_num" ]; then log_error "Could not extract partition number from $partition" return 1 @@ -293,7 +296,7 @@ main() { # Get list of all disk devices in the system using lsblk # Filter for disk type and extract device names local devices - devices=$($LSBLK_PATH -pln -o NAME,TYPE | grep "disk" | cut -d' ' -f1) + devices=$($LSBLK_PATH -pln -o NAME,TYPE | { grep "disk" || true; } | cut -d' ' -f1) # Verify we found at least one disk device if [ -z "$devices" ]; then diff --git a/fapolicyd-log-analyzer.sh b/fapolicyd-log-analyzer.sh new file mode 100644 index 0000000..d6f30c5 --- /dev/null +++ b/fapolicyd-log-analyzer.sh @@ -0,0 +1,387 @@ +#!/bin/bash + +############################################################# +#### fapolicyd Log Analyzer Script #### +#### Parses denial logs and suggests fix commands #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### To use this script chmod it to 755 #### +#### or simply type bash #### +############################################################# + +# ── Colors ──────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +# ── Defaults ────────────────────────────────────────────── +MODE="recent" +OUTPUT_FILE="" +QUIET=0 +TOTAL_DENIALS=0 +UNIQUE_FILES=0 +SUGGESTED_FIXES=0 + +# ── Functions ───────────────────────────────────────────── + +usage() { + echo -e 
"${BOLD}fapolicyd Log Analyzer${NC}" + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --recent Analyze denials from the last hour only (default)" + echo " --all Analyze all denials in the log" + echo " --output FILE Save suggested fixes to FILE" + echo " --quiet Show suggestions only, suppress raw denial lines" + echo "" + echo "Examples:" + echo " sudo bash $0 --recent" + echo " sudo bash $0 --all --output fixes.txt" + echo " sudo bash $0 --quiet --output /tmp/fixes.txt" + exit 0 +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + echo -e "${RED}Error: This script must be run as root.${NC}" + echo "Please run with: sudo bash $0" + exit 1 + fi +} + +check_fapolicyd() { + if ! command -v fapolicyd-cli &>/dev/null; then + echo -e "${RED}Error: fapolicyd does not appear to be installed.${NC}" + echo -e "${YELLOW}Install with: dnf install fapolicyd${NC}" + exit 1 + fi + + if ! systemctl is-active --quiet fapolicyd 2>/dev/null; then + echo -e "${YELLOW}Warning: fapolicyd service is not currently running.${NC}" + echo -e "${CYAN}Continuing to analyze existing log entries...${NC}" + echo "" + fi +} + +output_line() { + local line="$1" + echo -e "$line" + if [[ -n "$OUTPUT_FILE" ]]; then + echo -e "$line" | sed 's/\x1b\[[0-9;]*m//g' >> "$OUTPUT_FILE" + fi +} + +# ── fapolicyd Analysis ─────────────────────────────────── + +parse_fapolicyd_denial() { + local line="$1" + + local dec perm fname exe trust pid + dec=$(echo "$line" | grep -oP 'dec=\K[^ ]+') + perm=$(echo "$line" | grep -oP 'perm=\K[^ ]+') + fname=$(echo "$line" | grep -oP 'fname=\K[^ ]+') + exe=$(echo "$line" | grep -oP 'exe=\K[^ ]+') + trust=$(echo "$line" | grep -oP 'trust=\K[^ ]+') + pid=$(echo "$line" | grep -oP 'pid=\K[^ ]+') + + if [[ $QUIET -eq 0 ]]; then + output_line "${RED}DENIAL:${NC} $line" + fi + + [[ -n "$dec" ]] && output_line "${CYAN} Decision:${NC} $dec" + [[ -n "$perm" ]] && output_line "${CYAN} Permission:${NC} $perm" + [[ -n 
"$fname" ]] && output_line "${CYAN} File:${NC} $fname" + [[ -n "$exe" ]] && output_line "${CYAN} Executable:${NC} $exe" + [[ -n "$trust" ]] && output_line "${CYAN} Trust status:${NC} $trust" + [[ -n "$pid" ]] && output_line "${CYAN} PID:${NC} $pid" + + suggest_fapolicyd_fix "$fname" "$exe" "$perm" "$trust" + output_line "" +} + +suggest_fapolicyd_fix() { + local fname="$1" exe="$2" perm="$3" trust="$4" + + ((SUGGESTED_FIXES++)) + + if [[ -n "$fname" ]]; then + # Check current trust status + output_line "${GREEN} Suggested fixes:${NC}" + + # Trust the file + output_line "${GREEN} 1. Add file to trust database:${NC}" + output_line "${GREEN} fapolicyd-cli --file add ${fname}${NC}" + output_line "${GREEN} fapolicyd-cli --update${NC}" + + # Check trust + output_line "${GREEN} 2. Verify trust status:${NC}" + output_line "${GREEN} fapolicyd-cli --check-path ${fname}${NC}" + + # If the file is a script or binary from a known package + if command -v rpm &>/dev/null; then + local pkg + pkg=$(rpm -qf "$fname" 2>/dev/null) + if [[ $? -eq 0 && -n "$pkg" ]]; then + output_line "${CYAN} Note: File belongs to package: ${pkg}${NC}" + output_line "${YELLOW} If the file was modified after install, consider:${NC}" + output_line "${GREEN} rpm --restore ${pkg}${NC}" + output_line "${GREEN} fapolicyd-cli --update${NC}" + fi + fi + + # If it looks like a shared library + if [[ "$fname" == *.so* ]]; then + output_line "${YELLOW} Library denial — also check:${NC}" + output_line "${GREEN} ldconfig${NC}" + output_line "${GREEN} fapolicyd-cli --update${NC}" + fi + fi + + # Suggest rule-based approach + if [[ -n "$exe" && -n "$perm" ]]; then + output_line "${GREEN} 3. 
# Classify one denial record by the permission it was denied for.
# Arguments: $1 - a single log line containing perm=... / fname=... fields
# Outputs:   one of "library", "execute", "open" or "other" on stdout
#            ("library" = an execute denial on a path matching *.so*)
# Returns:   0
categorize_fapolicyd_denial() {
    local line="$1"
    local perm="" fname=""

    # Field extraction uses bash's own regex matching rather than
    # "echo | grep -oP" pipelines: this function runs once per denial, so
    # it avoids four forks per record and needs no PCRE-enabled grep. The
    # first occurrence of each field is used. Patterns live in variables
    # so they are treated as regexes and not as quoted literals.
    local perm_re='perm=([^ ]+)'
    local fname_re='fname=([^ ]+)'

    if [[ "$line" =~ $perm_re ]]; then
        perm="${BASH_REMATCH[1]}"
    fi
    if [[ "$line" =~ $fname_re ]]; then
        fname="${BASH_REMATCH[1]}"
    fi

    case "$perm" in
        execute)
            if [[ "$fname" == *.so* ]]; then
                echo "library"
            else
                echo "execute"
            fi
            ;;
        open)
            echo "open"
            ;;
        *)
            echo "other"
            ;;
    esac
}
-f /var/log/audit/audit.log ]]; then + output_line "${RED}Error: Cannot find /var/log/audit/audit.log${NC}" + output_line "${YELLOW}Ensure auditd is running: systemctl start auditd${NC}" + return + fi + + if [[ "$MODE" == "recent" ]]; then + if command -v ausearch &>/dev/null; then + denials=$(ausearch -m FANOTIFY -ts recent 2>/dev/null | grep "type=FANOTIFY") + fi + # Fallback to manual log parsing + if [[ -z "$denials" ]]; then + local one_hour_ago + one_hour_ago=$(date -d '1 hour ago' '+%s' 2>/dev/null) + if [[ -n "$one_hour_ago" ]]; then + denials=$(awk -v cutoff="$one_hour_ago" ' + /type=FANOTIFY/ && /dec=deny/ { + match($0, /msg=audit\(([0-9]+)\./, arr) + if (arr[1] >= cutoff) print + } + ' /var/log/audit/audit.log) + fi + fi + else + denials=$(grep "type=FANOTIFY" /var/log/audit/audit.log | grep "dec=deny") + fi + + if [[ -z "$denials" ]]; then + output_line "${GREEN}No fapolicyd denials found.${NC}" + output_line "" + return + fi + + # Group denials by category + declare -A categories + local denial_count=0 + local -A seen_files + + while IFS= read -r line; do + [[ -z "$line" ]] && continue + ((denial_count++)) + local category + category=$(categorize_fapolicyd_denial "$line") + categories["$category"]+="$line"$'\n' + + local f + f=$(echo "$line" | grep -oP 'fname=\K[^ ]+') + [[ -n "$f" ]] && seen_files["$f"]=1 + done <<< "$denials" + + TOTAL_DENIALS=$denial_count + UNIQUE_FILES=${#seen_files[@]} + + # Display grouped results + for category in "execute" "library" "open" "other"; do + if [[ -n "${categories[$category]}" ]]; then + local label + case "$category" in + execute) label="Execution Denials" ;; + library) label="Library Load Denials" ;; + open) label="File Open Denials" ;; + other) label="Other Denials" ;; + esac + + output_line "${BOLD}── ${label} ──────────────────────────────────${NC}" + output_line "" + + while IFS= read -r denial_line; do + [[ -z "$denial_line" ]] && continue + parse_fapolicyd_denial "$denial_line" + done <<< 
"${categories[$category]}" + fi + done + + # Show bulk fix suggestions + if [[ ${#seen_files[@]} -gt 0 ]]; then + output_line "${BOLD}── Bulk Fix Commands ────────────────────────────${NC}" + output_line "" + output_line "${YELLOW}To trust all denied files at once:${NC}" + for f in "${!seen_files[@]}"; do + output_line "${GREEN} fapolicyd-cli --file add ${f}${NC}" + done + output_line "${GREEN} fapolicyd-cli --update${NC}" + output_line "" + fi + + # Rule file reference + output_line "${BOLD}── Rule File Reference ──────────────────────────${NC}" + output_line "" + output_line "${CYAN}Rules are loaded from:${NC}" + if [[ -d /etc/fapolicyd/rules.d ]]; then + output_line " /etc/fapolicyd/rules.d/ (drop-in directory)" + local rule_files + rule_files=$(ls /etc/fapolicyd/rules.d/ 2>/dev/null) + if [[ -n "$rule_files" ]]; then + output_line "${CYAN} Current rule files:${NC}" + while IFS= read -r rf; do + output_line " $rf" + done <<< "$rule_files" + fi + fi + if [[ -f /etc/fapolicyd/fapolicyd.rules ]]; then + output_line " /etc/fapolicyd/fapolicyd.rules (compiled rules)" + fi + output_line "" + output_line "${YELLOW}After making changes, restart the daemon:${NC}" + output_line "${GREEN} systemctl restart fapolicyd${NC}" + output_line "" +} + +# ── Summary ─────────────────────────────────────────────── + +print_summary() { + output_line "${BOLD}═══════════════════════════════════════════════════${NC}" + output_line "${BOLD} Summary${NC}" + output_line "${BOLD}═══════════════════════════════════════════════════${NC}" + output_line "" + output_line " Total denials found: ${BOLD}${TOTAL_DENIALS}${NC}" + output_line " Unique files denied: ${BOLD}${UNIQUE_FILES}${NC}" + output_line " Suggested fixes: ${BOLD}${SUGGESTED_FIXES}${NC}" + output_line "" + + if [[ -n "$OUTPUT_FILE" ]]; then + output_line "${GREEN}Suggestions saved to: ${OUTPUT_FILE}${NC}" + output_line "" + fi +} + +# ── Parse Arguments ─────────────────────────────────────── + +while [[ $# -gt 0 ]]; do + case "$1" 
in + --help|-h) + usage + ;; + --recent) + MODE="recent" + shift + ;; + --all) + MODE="all" + shift + ;; + --output) + if [[ -z "$2" || "$2" == --* ]]; then + echo -e "${RED}Error: --output requires a filename argument.${NC}" + exit 1 + fi + OUTPUT_FILE="$2" + shift 2 + ;; + --quiet|-q) + QUIET=1 + shift + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + echo "Use --help for usage information." + exit 1 + ;; + esac +done + +# ── Main ────────────────────────────────────────────────── + +check_root + +# Clear output file if specified +if [[ -n "$OUTPUT_FILE" ]]; then + > "$OUTPUT_FILE" +fi + +echo -e "${BOLD}fapolicyd Log Analyzer v1.00${NC}" +echo -e "${CYAN}Mode: ${MODE}${NC}" +echo "" + +check_fapolicyd +analyze_fapolicyd +print_summary diff --git a/file-permissions-audit.sh b/file-permissions-audit.sh new file mode 100644 index 0000000..81ab5fb --- /dev/null +++ b/file-permissions-audit.sh @@ -0,0 +1,354 @@ +#!/usr/bin/env bash + +######################################################################################### +#### file-permissions-audit.sh — Find world-writable files, SUID/SGID binaries, #### +#### and files owned by nobody or with no valid owner #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./file-permissions-audit.sh #### +#### ./file-permissions-audit.sh --scan-dirs /usr /bin /home #### +#### #### +#### See --help for all options. 
#######################################
# Describe one filesystem entry for the audit tables.
# Arguments: $1 - path to inspect
# Outputs:   "<octal> <symbolic> <owner>:<group> <type>" on stdout, where
#            <type> is dir, link or file; placeholder values
#            ("????", "??????????", "UNKNOWN:UNKNOWN") when stat fails
# Returns:   0
#######################################
get_file_info() {
    local file="$1"
    local meta ftype

    # A single stat call supplies mode and ownership. stat is run without
    # -L, so for a symlink these describe the link itself.
    meta=$(stat -c '%a %A %U:%G' -- "$file" 2>/dev/null) \
        || meta="???? ?????????? UNKNOWN:UNKNOWN"

    # Test for a symlink first: -d follows links, so checking it first
    # would label a link pointing at a directory as "dir" even though the
    # mode and owner printed next to it belong to the link.
    if [[ -L "$file" ]]; then
        ftype="link"
    elif [[ -d "$file" ]]; then
        ftype="dir"
    else
        ftype="file"
    fi

    echo "${meta} ${ftype}"
}
"UNKNOWN") + local color="$YELLOW" + if [[ "$owner" == "root" ]]; then + color="$RED" + fi + print_file_entry "$color" "$file" + count=$((count + 1)) + done < <(find "$dir" -xdev -type f -perm -4000 $exclude_args 2>/dev/null) + done + + COUNT_SUID=$count + echo "" + log "Found ${count} SUID binaries" +} + +scan_sgid() { + section_header "SGID Binaries" + + printf " ${BOLD}%-4s %-11s %-20s %-6s${RESET} %s\n" "PERM" "MODE" "OWNER:GROUP" "TYPE" "PATH" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + local count=0 + local exclude_args + exclude_args=$(build_exclude_args) + + for dir in $SCAN_DIRS; do + [[ -d "$dir" ]] || continue + # shellcheck disable=SC2086 + while IFS= read -r file; do + [[ -z "$file" ]] && continue + print_file_entry "$YELLOW" "$file" + count=$((count + 1)) + done < <(find "$dir" -xdev -type f -perm -2000 $exclude_args 2>/dev/null) + done + + COUNT_SGID=$count + echo "" + log "Found ${count} SGID binaries" +} + +scan_nobody() { + section_header "Files Owned by nobody/nogroup" + + printf " ${BOLD}%-4s %-11s %-20s %-6s${RESET} %s\n" "PERM" "MODE" "OWNER:GROUP" "TYPE" "PATH" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + local count=0 + local exclude_args + exclude_args=$(build_exclude_args) + + for dir in $SCAN_DIRS; do + [[ -d "$dir" ]] || continue + # shellcheck disable=SC2086 + while IFS= read -r file; do + [[ -z "$file" ]] && continue + print_file_entry "$YELLOW" "$file" + count=$((count + 1)) + done < <(find "$dir" -xdev \( -user nobody -o -group nogroup \) $exclude_args 2>/dev/null) + done + + COUNT_NOBODY=$count + echo "" + log "Found ${count} files owned by nobody/nogroup" +} + +scan_unowned() { + section_header "Files With No Valid Owner" + + printf " ${BOLD}%-4s %-11s %-20s %-6s${RESET} %s\n" "PERM" "MODE" "OWNER:GROUP" "TYPE" "PATH" + printf " %s\n" "$(printf '%.0s─' {1..78})" + + local count=0 + local exclude_args + exclude_args=$(build_exclude_args) + + for dir in $SCAN_DIRS; do + [[ -d "$dir" ]] || continue + # shellcheck 
#######################################
# Entry point of the permissions audit: handles the command line, prints
# the report banner, then runs every scan followed by the summary.
# Globals:   BOLD, DIM, RESET, SCAN_DIRS (read)
# Arguments: "$@" - forwarded unchanged to parse_args
# Outputs:   banner and all scan results on stdout
#######################################
main() {
    parse_args "$@"
    setup_colors

    # Banner values; a failing lookup leaves the field empty instead of
    # stopping the run.
    local banner_host banner_time
    banner_host=$(hostname -f 2>/dev/null || hostname) || true
    banner_time=$(date '+%Y-%m-%d %H:%M:%S %Z') || true

    printf '\n'
    printf '%b\n' "${BOLD}File Permissions Audit — ${banner_host}${RESET}"
    printf '%b\n' "${DIM}${banner_time}${RESET}"
    printf '%b\n' "${DIM}Scanning: ${SCAN_DIRS}${RESET}"

    # Scans run in a fixed order; the summary comes last because it
    # reports the counters the scans fill in.
    local step
    for step in \
        scan_world_writable \
        scan_suid \
        scan_sgid \
        scan_nobody \
        scan_unowned \
        print_summary; do
        "$step"
    done
}
diff --git a/firewall-rule-diff.sh b/firewall-rule-diff.sh new file mode 100644 index 0000000..8c52884 --- /dev/null +++ b/firewall-rule-diff.sh @@ -0,0 +1,620 @@ +#!/usr/bin/env bash + +###################################################################################### +#### firewall-rule-diff.sh — Detect firewall rule drift against a saved baseline #### +#### Supports UFW, iptables, and nftables. Saves snapshots, diffs against #### +#### baseline, exports Prometheus metrics via textfile collector. #### +#### Requires: bash 4+, diff, coreutils #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### sudo ./firewall-rule-diff.sh --save #### +#### sudo ./firewall-rule-diff.sh --check #### +#### #### +#### See --help for all options. #### +###################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +MODE="" # save or check +BACKEND="" # auto-detect: ufw, iptables, nftables +BASELINE_DIR="/etc/firewall-baseline" +MAX_AGE_DAYS=30 +TEXTFILE_MODE=false +PROM_FILE="/var/lib/node_exporter/firewall_drift.prom" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +JUNIT_FILE="${JUNIT_FILE:-firewall-drift-results.xml}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0 +FAIL=0 +WARN=0 +TOTAL=0 +RESULTS=() +START_TIME="" +RULES_ADDED=0 +RULES_REMOVED=0 +RULES_TOTAL=0 +DRIFT_DETECTED=0 +BASELINE_AGE=0 +DETECTED_BACKEND="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + 
# Record a failed check and emit it in the selected output format.
#
# Arguments: $1 - check name, $2 - optional detail text
# Globals:   FAIL, TOTAL, RESULTS (written), OUTPUT_FORMAT, RED, RESET (read)
# Returns:   always 0. The previous version ended on `[[ -n "$detail" ]] && echo`,
#            so a TAP-mode call without detail returned 1 and aborted the
#            caller under `set -e`.
record_fail() {
  local name="$1"
  local detail="${2:-}"

  FAIL=$((FAIL + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("FAIL|${name}|${detail}")

  case "$OUTPUT_FORMAT" in
    tap)
      echo "not ok ${TOTAL} - ${name}"
      if [[ -n "$detail" ]]; then
        echo " # ${detail}"
      fi
      ;;
    text)
      echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
      ;;
  esac
  return 0
}
# ── Parse Arguments ───────────────────────────────────────────────────
# Parse command-line options into the script's global settings.
#
# Arguments: the script's positional parameters ("$@")
# Globals:   MODE, BACKEND, BASELINE_DIR, MAX_AGE_DAYS, TEXTFILE_MODE,
#            PROM_FILE, OUTPUT_FORMAT, JUNIT_FILE, VERBOSE, COLOR (written)
# Exits:     1 on an unknown option, a missing option value, or an invalid
#            --backend / --format / --max-age value; 0 via show_help
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --save) MODE="save"; shift ;;
      --check) MODE="check"; shift ;;
      --textfile) TEXTFILE_MODE=true; shift ;;
      --verbose) VERBOSE=true; shift ;;
      --no-color) COLOR="never"; shift ;;
      -h|--help) show_help ;;
      --backend|--baseline-dir|--max-age|--prom-file|--format|--junit-file)
        # Without this guard a trailing value option dies with an
        # "unbound variable" message under `set -u`.
        if [[ $# -lt 2 ]]; then
          err "Option $1 requires an argument"
          echo "Run with --help for usage."
          exit 1
        fi
        case "$1" in
          --backend)
            # An unknown backend would otherwise produce an empty snapshot.
            case "$2" in
              ufw|iptables|nftables) BACKEND="$2" ;;
              *)
                err "Invalid backend: $2 (expected ufw, iptables, or nftables)"
                exit 1
                ;;
            esac
            ;;
          --baseline-dir) BASELINE_DIR="$2" ;;
          --max-age)
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              err "Invalid --max-age value: $2 (expected a non-negative integer)"
              exit 1
            fi
            # Force base 10 so values such as "08" are not read as octal.
            MAX_AGE_DAYS=$((10#$2))
            ;;
          --prom-file) PROM_FILE="$2" ;;
          --format)
            case "$2" in
              text|tap|junit) OUTPUT_FORMAT="$2" ;;
              *)
                err "Invalid format: $2 (expected text, tap, or junit)"
                exit 1
                ;;
            esac
            ;;
          --junit-file) JUNIT_FILE="$2" ;;
        esac
        shift 2
        ;;
      *)
        err "Unknown option: $1"
        echo "Run with --help for usage."
        exit 1
        ;;
    esac
  done

  # --check is the default mode when neither --save nor --check was given.
  if [[ -z "$MODE" ]]; then
    MODE="check"
  fi
}
# ── Snapshot Functions ────────────────────────────────────────────────

# Filter an iptables-save dump (stdin → stdout) down to its stable content.
# iptables-save emits "# Generated … on <date>" / "# Completed on <date>"
# comments and live [packets:bytes] counters on every ":CHAIN POLICY" line;
# both change between runs and would be reported as drift on every check.
_normalize_iptables_dump() {
  sed -E \
    -e '/^#/d' \
    -e 's/^(:[^[:space:]]+[[:space:]]+[^[:space:]]+)[[:space:]]+\[[0-9]+:[0-9]+\]$/\1 [0:0]/'
}

# Zero the live packet/byte values of nftables "counter" statements so that
# traffic alone does not register as a rule change.
_normalize_nftables_dump() {
  sed -E 's/counter packets [0-9]+ bytes [0-9]+/counter packets 0 bytes 0/g'
}

# Snapshot UFW state into $1 and set RULES_TOTAL to the numbered-rule count.
snapshot_ufw() {
  local dir="$1"
  # Best effort: a failing ufw call leaves an empty file rather than aborting.
  ufw status numbered > "${dir}/ufw-status.txt" 2>/dev/null || true
  ufw status verbose > "${dir}/ufw-verbose.txt" 2>/dev/null || true
  if [[ -f /etc/ufw/user.rules ]]; then
    cp /etc/ufw/user.rules "${dir}/user.rules"
  fi
  if [[ -f /etc/ufw/user6.rules ]]; then
    cp /etc/ufw/user6.rules "${dir}/user6.rules"
  fi
  # Numbered rules start with "[ n]"; grep -c exits 1 on zero matches.
  RULES_TOTAL=$(grep -cE '^\[' "${dir}/ufw-status.txt" 2>/dev/null) || RULES_TOTAL=0
  verbose "UFW snapshot: ${RULES_TOTAL} rules"
}

# Snapshot IPv4 (and, when available, IPv6) iptables rules into $1 and set
# RULES_TOTAL to the number of rule lines. Dumps are normalised first.
# NOTE: baselines saved before normalisation still contain the comment
# lines and must be re-saved once.
snapshot_iptables() {
  local dir="$1"
  { iptables-save 2>/dev/null || true; } \
    | _normalize_iptables_dump > "${dir}/iptables-v4.rules" || true
  if command -v ip6tables-save &>/dev/null; then
    { ip6tables-save 2>/dev/null || true; } \
      | _normalize_iptables_dump > "${dir}/iptables-v6.rules" || true
  fi
  # Count lines that are not comments, blanks, table headers, COMMIT or chains.
  RULES_TOTAL=$(grep -cvE '^(#|$|\*|COMMIT|:)' "${dir}/iptables-v4.rules" 2>/dev/null) || RULES_TOTAL=0
  if [[ -f "${dir}/iptables-v6.rules" ]]; then
    local v6_count
    v6_count=$(grep -cvE '^(#|$|\*|COMMIT|:)' "${dir}/iptables-v6.rules" 2>/dev/null) || v6_count=0
    RULES_TOTAL=$((RULES_TOTAL + v6_count))
  fi
  verbose "iptables snapshot: ${RULES_TOTAL} rules"
}

# Snapshot the nftables ruleset into $1 and set RULES_TOTAL.
snapshot_nftables() {
  local dir="$1"
  { nft list ruleset 2>/dev/null || true; } \
    | _normalize_nftables_dump > "${dir}/nftables.rules" || true
  RULES_TOTAL=$(grep -cE '^\s+(rule|chain|table)' "${dir}/nftables.rules" 2>/dev/null) || RULES_TOTAL=0
  verbose "nftables snapshot: ${RULES_TOTAL} lines"
}

# Write a snapshot of the active backend plus metadata files into $1.
#
# Arguments: $1 - existing directory to write into
# Globals:   DETECTED_BACKEND (read), RULES_TOTAL (written by the helpers)
# Returns:   1 without writing metadata when the backend is not supported;
#            previously this fell through and recorded an empty snapshot.
take_snapshot() {
  local dir="$1"
  case "$DETECTED_BACKEND" in
    ufw) snapshot_ufw "$dir" ;;
    iptables) snapshot_iptables "$dir" ;;
    nftables) snapshot_nftables "$dir" ;;
    *)
      err "Unsupported firewall backend: ${DETECTED_BACKEND:-<none>}"
      return 1
      ;;
  esac
  echo "$DETECTED_BACKEND" > "${dir}/backend.txt"
  date +%s > "${dir}/timestamp.txt"
  date -Is > "${dir}/timestamp-human.txt"
}
-f "${snapshot_dir}/backend.txt" ]]; then + err "Snapshot failed — ${snapshot_dir}/backend.txt not created" + exit 1 + fi + + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}Firewall Rule Diff${RESET}" + echo "Backend: ${DETECTED_BACKEND}" + echo "Baseline: ${snapshot_dir}" + echo "Time: $(cat "${snapshot_dir}/timestamp-human.txt")" + echo "Rules: ${RULES_TOTAL}" + echo "" + echo -e " ${GREEN}✓${RESET} Baseline saved — ${RULES_TOTAL} rules (${DETECTED_BACKEND})" + elif [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "1..1" + echo "ok 1 - Baseline saved (${RULES_TOTAL} rules, ${DETECTED_BACKEND})" + fi +} + +# ── Check Mode ──────────────────────────────────────────────────────── +do_check() { + START_TIME=$(date +%s) + local baseline_dir="${BASELINE_DIR}/baseline" + + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}Firewall Rule Diff${RESET}" + echo "Backend: ${DETECTED_BACKEND}" + echo "Baseline: ${baseline_dir}" + echo "Time: $(date -Is)" + echo "" + elif [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "TAP version 13" + fi + + # check baseline exists + if [[ ! 
# ── Diff Functions ────────────────────────────────────────────────────
# Compare one baseline file with its current counterpart and record the result.
#
# Arguments: $1 - label for the check, $2 - baseline file, $3 - current file
# Globals:   DRIFT_DETECTED, RULES_ADDED, RULES_REMOVED (written),
#            VERBOSE, OUTPUT_FORMAT, GREEN, RED, RESET (read)
# Calls:     record_pass, record_fail, verbose
# Returns:   always 0; the outcome is reported through record_pass/record_fail
diff_rules_file() {
  local label="$1"
  local baseline_file="$2"
  local current_file="$3"

  if [[ ! -f "$baseline_file" ]] && [[ ! -f "$current_file" ]]; then
    verbose "Both files missing for ${label} — skipping"
    return 0
  fi

  if [[ ! -f "$baseline_file" ]]; then
    record_fail "${label}" "file missing from baseline but present now"
    DRIFT_DETECTED=1
    return 0
  fi

  if [[ ! -f "$current_file" ]]; then
    record_fail "${label}" "file present in baseline but missing now"
    DRIFT_DETECTED=1
    return 0
  fi

  # diff exits 0 (identical), 1 (different) or >1 (trouble, e.g. unreadable
  # file). Trouble used to look like "no changes" because its output is empty.
  local diff_output diff_rc=0
  diff_output=$(diff --unified=0 "$baseline_file" "$current_file" 2>/dev/null) || diff_rc=$?

  if (( diff_rc > 1 )); then
    record_fail "${label}" "unable to compare files (diff exited ${diff_rc})"
    DRIFT_DETECTED=1
    return 0
  fi

  if [[ -z "$diff_output" ]]; then
    record_pass "${label}" "no changes"
    return 0
  fi

  DRIFT_DETECTED=1

  # Drop the two-line "---"/"+++" file header, then classify by first
  # character. Matching on the second character ('^-[^-]') missed every
  # removed iptables rule, since "-A …" shows up in the diff as "--A …".
  local hunks
  hunks=$(tail -n +3 <<< "$diff_output")

  local added removed
  added=$(grep -c '^+' <<< "$hunks") || added=0
  removed=$(grep -c '^-' <<< "$hunks") || removed=0

  RULES_ADDED=$((RULES_ADDED + added))
  RULES_REMOVED=$((RULES_REMOVED + removed))

  record_fail "${label}" "${added} added, ${removed} removed"

  if [[ "$VERBOSE" == "true" || "$OUTPUT_FORMAT" == "text" ]]; then
    # Show the changed lines themselves, capped at 20.
    local shown=0 line
    while IFS= read -r line; do
      case "$line" in
        +*) printf ' %b%s%b\n' "$GREEN" "$line" "$RESET" ;;
        -*) printf ' %b%s%b\n' "$RED" "$line" "$RESET" ;;
        *) continue ;;
      esac
      shown=$((shown + 1))
      if (( shown >= 20 )); then
        echo " ... (truncated)"
        break
      fi
    done <<< "$hunks"
  fi

  # Explicit success: the loop used to end on a failed `[[ … ]] && { …; }`,
  # which made the function return 1 and stopped the script under `set -e`
  # before the summary and metrics were written.
  return 0
}
# ── Summary ───────────────────────────────────────────────────────────
# Print the closing report for a check run and trigger optional outputs.
#
# Globals: START_TIME, OUTPUT_FORMAT, TOTAL, PASS, FAIL, WARN,
#          DRIFT_DETECTED, RULES_ADDED, RULES_REMOVED, DETECTED_BACKEND,
#          TEXTFILE_MODE, BOLD, RED, GREEN, RESET (read)
# Calls:   write_junit (junit format), write_prometheus (textfile mode)
print_summary() {
  local finished_at elapsed
  finished_at=$(date +%s)
  elapsed=$((finished_at - START_TIME))

  local rule="────────────────────────────────────────"

  case "$OUTPUT_FORMAT" in
    tap)
      # TAP plan line, emitted after the tests have run.
      echo "1..${TOTAL}"
      ;;
    text)
      echo ""
      echo "$rule"
      echo -e "${BOLD}Summary${RESET} ${DETECTED_BACKEND}"
      echo -e " ${PASS} passed ${FAIL} failed ${WARN} skipped (${elapsed}s)"
      if [[ $DRIFT_DETECTED -eq 1 ]]; then
        echo -e " Rules added: ${RULES_ADDED} removed: ${RULES_REMOVED}"
        echo -e " ${RED}Drift detected.${RESET}"
      else
        echo -e " ${GREEN}No drift detected.${RESET}"
      fi
      echo "$rule"
      ;;
  esac

  if [[ "$OUTPUT_FORMAT" == "junit" ]]; then
    write_junit
  fi

  if [[ "$TEXTFILE_MODE" == "true" ]]; then
    write_prometheus
  fi
}
# ── Prometheus Output ─────────────────────────────────────────────────
# Write the drift metrics to PROM_FILE in Prometheus text exposition format.
#
# Globals: PROM_FILE, DRIFT_DETECTED, RULES_ADDED, RULES_REMOVED, RULES_TOTAL,
#          BASELINE_AGE, DETECTED_BACKEND (read)
# Outputs: a warning (and no file) when the target directory is missing
write_prometheus() {
  local target_dir
  target_dir=$(dirname "$PROM_FILE")
  if [[ ! -d "$target_dir" ]]; then
    warn "Prometheus textfile directory does not exist: ${target_dir}"
    return
  fi

  # Stage next to the target and rename, so a reader never sees a
  # half-written file.
  local staging="${PROM_FILE}.$$"
  cat > "$staging" <<EOF
# HELP firewall_drift_detected Whether firewall rules differ from baseline
# TYPE firewall_drift_detected gauge
firewall_drift_detected ${DRIFT_DETECTED}
# HELP firewall_rules_added Rules added since baseline
# TYPE firewall_rules_added gauge
firewall_rules_added ${RULES_ADDED}
# HELP firewall_rules_removed Rules removed since baseline
# TYPE firewall_rules_removed gauge
firewall_rules_removed ${RULES_REMOVED}
# HELP firewall_rules_total Current total firewall rules
# TYPE firewall_rules_total gauge
firewall_rules_total ${RULES_TOTAL}
# HELP firewall_baseline_age_seconds Seconds since baseline was saved
# TYPE firewall_baseline_age_seconds gauge
firewall_baseline_age_seconds ${BASELINE_AGE}
# HELP firewall_scan_timestamp Unix timestamp of last scan
# TYPE firewall_scan_timestamp gauge
firewall_scan_timestamp $(date +%s)
# HELP firewall_backend Active firewall backend
# TYPE firewall_backend gauge
firewall_backend{backend="${DETECTED_BACKEND}"} 1
EOF

  mv "$staging" "$PROM_FILE"
  verbose "Prometheus metrics written to ${PROM_FILE}"
}
#!/bin/bash
###############################################################
#### Fix code-server Nginx Config
#### Applies X-Frame-Options + query-filter fixes
####
#### Author: Phil Connor
#### Contact: contact@mylinux.work
#### License: MIT
#### Version: 1.0
####
#### Usage: sudo ./fix-code-server-nginx.sh [domain]
###############################################################
set -euo pipefail

if [[ $EUID -ne 0 ]]; then
  echo "ERROR: This script must be run as root (sudo)."
  exit 1
fi

DOMAIN="${1:-}"
if [[ -z "$DOMAIN" ]]; then
  read -rp "Enter the code-server domain (e.g. code.mydomain.com): " DOMAIN
fi

# The domain becomes part of file paths and of grep/sed patterns that run as
# root, so restrict it to hostname characters (no "/", "|", "&", spaces, …).
if [[ ! "$DOMAIN" =~ ^[A-Za-z0-9_]([A-Za-z0-9._-]*[A-Za-z0-9_])?$ ]]; then
  echo "ERROR: '${DOMAIN}' is not a valid domain name." >&2
  exit 1
fi

HEADERS_FILE="/etc/nginx/snippets/security-headers-${DOMAIN}.conf"
SITE_CONF="/etc/nginx/conf.d/code-server.conf"

echo "=== Fix 1: X-Frame-Options DENY -> SAMEORIGIN ==="
if [[ -f "$HEADERS_FILE" ]]; then
  if grep -q 'X-Frame-Options "DENY"' "$HEADERS_FILE"; then
    sed -i 's/X-Frame-Options "DENY"/X-Frame-Options "SAMEORIGIN"/' "$HEADERS_FILE"
    echo " Updated: $HEADERS_FILE"
  else
    echo " Already set to SAMEORIGIN (or not found) in $HEADERS_FILE — skipping."
  fi
else
  echo " WARNING: $HEADERS_FILE not found. Skipping."
fi

echo ""
echo "=== Fix 2: Disable query-filter snippet ==="
QUERY_FILTER="snippets/query-filter-${DOMAIN}.conf"
# Escape the dots so they match literally, and use ONE pattern for both the
# detection and the substitution. Previously grep accepted any whitespace
# after "include" while sed required exactly one space, so the script could
# report success without having changed the file.
QUERY_FILTER_RE="${QUERY_FILTER//./\\.}"
INCLUDE_RE="^([[:space:]]*)include[[:space:]]+${QUERY_FILTER_RE}[[:space:]]*;"
if [[ -f "$SITE_CONF" ]]; then
  if grep -qE "$INCLUDE_RE" "$SITE_CONF"; then
    # "\n" in the replacement is a GNU sed extension.
    sed -i -E "s|${INCLUDE_RE}|\1# Disabled: breaks VS Code extensions\n\1# include ${QUERY_FILTER};|" "$SITE_CONF"
    if grep -qE "$INCLUDE_RE" "$SITE_CONF"; then
      echo " ERROR: could not comment out query filter in: $SITE_CONF" >&2
      exit 1
    fi
    echo " Commented out query filter in: $SITE_CONF"
  else
    echo " Query filter already disabled (or not found) in $SITE_CONF — skipping."
  fi
else
  echo " WARNING: $SITE_CONF not found. Skipping."
fi

echo ""
echo "=== Testing and reloading nginx ==="
if nginx -t; then
  systemctl reload nginx
  echo " Nginx reloaded successfully."
else
  echo " ERROR: nginx config test failed. Check the files manually."
  exit 1
fi

echo ""
echo "Done! Reload your code-server browser tab to verify."
# Extract a numeric value from radclient output.
#
# Arguments: $1 - attribute name, $2 - radclient output ("Name = value" lines)
# Outputs:   the attribute's numeric value, or 0 when the attribute is
#            missing or its value is not a number
#
# An attribute whose name equals $1 exactly wins. Otherwise the first line
# whose name contains $1 is used, which keeps callers that pass a partial
# name working. The earlier grep-based version concatenated the values of
# ALL matching lines into one bogus number.
extract_value() {
  local attr="$1"
  local data="$2"
  local val
  val=$(awk -v want="$attr" '
    {
      eq = index($0, "=")
      if (eq == 0) next
      name = substr($0, 1, eq - 1)
      gsub(/^[ \t]+|[ \t\r]+$/, "", name)
      value = substr($0, eq + 1)
      gsub(/[ \t\r]/, "", value)
      if (name == want) { exact = value; found = 2; exit }
      if (!found && index(name, want) > 0) { partial = value; found = 1 }
    }
    END {
      if (found == 2) print exact
      else if (found == 1) print partial
    }' <<< "$data")

  # Only numbers may reach the metrics output.
  if [[ "$val" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    printf '%s\n' "$val"
  else
    echo 0
  fi
}
command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +FreeRADIUS Exporter v${EXPORTER_VERSION} + +

FreeRADIUS Prometheus Exporter v${EXPORTER_VERSION}

+

Metrics

+

Authentication, accounting, and proxy statistics from the FreeRADIUS status server.

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Entry point: serve over HTTP, write a textfile atomically, or print metrics.
#
# Arguments: the script's positional parameters
# Globals:   HTTP_MODE, OUTPUT_FILE (read; set by parse_args)
# Exits:     1 when metric generation fails or yields fewer than 5 lines
main() {
  parse_args "$@"

  if [[ "$HTTP_MODE" = true ]]; then
    run_http_server
    return
  fi

  if [[ -z "$OUTPUT_FILE" ]]; then
    generate_metrics
    return
  fi

  local target_dir
  target_dir="$(dirname "$OUTPUT_FILE")"
  mkdir -p "$target_dir"

  # Stage in the destination directory so the final mv is a same-filesystem
  # rename.
  local staging
  staging=$(mktemp "${target_dir}/.freeradius_metrics.XXXXXX")

  if ! generate_metrics > "$staging" 2>/dev/null; then
    rm -f "$staging"
    echo "ERROR: Failed to generate metrics" >&2
    exit 1
  fi

  local file_lines
  file_lines=$(wc -l < "$staging" 2>/dev/null || echo 0)

  # A near-empty result is treated as a failed scrape; the previous file stays.
  if [[ "$file_lines" -lt 5 ]]; then
    rm -f "$staging"
    echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
    exit 1
  fi

  chmod 644 "$staging"
  mv -f "$staging" "$OUTPUT_FILE"

  echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}
# HTTP server mode +# ./game-server-exporter.sh --http -p 9195 +# +# # Textfile collector mode +# ./game-server-exporter.sh --textfile +# +# # Custom server addresses +# ./game-server-exporter.sh --minecraft-host mc.example.com +# +# Metrics Exported: +# - game_server_up{game,server} - Server reachability (1=up, 0=down) +# - game_server_players_online{game,server} - Online player count +# - game_server_players_max{game,server} - Maximum player slots +# - game_server_info{game,server,version,motd} - Server version info +# - game_server_tps{game="minecraft",server} - Ticks per second (Minecraft) +# - game_server_query_duration_seconds{game,server} - Query time per server +# - game_server_exporter_duration_seconds - Total script execution time +# - game_server_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9195 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9195 + +# Server configuration +MINECRAFT_HOST="" +MINECRAFT_QUERY_PORT=25565 +MINECRAFT_RCON_PORT=25575 +MINECRAFT_RCON_PASS="" +VALHEIM_HOST="" +VALHEIM_QUERY_PORT=2457 +PALWORLD_HOST="" +PALWORLD_QUERY_PORT=8212 + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Check prerequisites +# Returns: 0 if OK, 1 if error +check_prerequisites() { + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: nc (nmap-ncat) not found" >&2 + return 1 + fi + + if [ -n "$PALWORLD_HOST" ] && ! 
# Escape special characters in Prometheus label values
# Args: $1 - string to escape
# Returns: escaped string safe for Prometheus labels
#
# Backslashes and double quotes are backslash-escaped and newlines are
# removed. Output goes through printf because `echo` treats values such as
# "-n" or "-e" as options and prints nothing.
prom_escape() {
  local val="$1"
  val="${val//\\/\\\\}"   # must run first so later escapes are not doubled
  val="${val//\"/\\\"}"
  val="${val//$'\n'/}"
  printf '%s\n' "$val"
}
elif [ "$first_line" != "NO_MCSTATUS" ]; then + # mcstatus available but server is down + query_end=$(date +%s%N) + mc_query_duration=$(( (query_end - query_start) / 1000000000 )) + return + fi + fi + + # Fallback: basic TCP check if mcstatus not available or not tried yet + if [ "$mc_up" -eq 0 ] && [ -z "${py_result:-}" ] || { [ -n "${first_line:-}" ] && [ "$first_line" = "NO_MCSTATUS" ]; }; then + if nc -z -w 3 "$host" "$port" 2>/dev/null; then + mc_up=1 + # Try to read SLP response for basic info + local slp_response + slp_response=$(printf '\xfe\x01' | nc -w 3 "$host" "$port" 2>/dev/null | strings 2>/dev/null) || true + if [ -n "$slp_response" ]; then + # Legacy SLP response: §1\0\0\0\0\0 + mc_version=$(echo "$slp_response" | tr '\0' '\n' | sed -n '4p' 2>/dev/null) || mc_version="unknown" + mc_motd=$(echo "$slp_response" | tr '\0' '\n' | sed -n '5p' 2>/dev/null) || mc_motd="unknown" + mc_players_online=$(echo "$slp_response" | tr '\0' '\n' | sed -n '6p' 2>/dev/null) || mc_players_online=0 + mc_players_max=$(echo "$slp_response" | tr '\0' '\n' | sed -n '7p' 2>/dev/null) || mc_players_max=0 + # Sanitize numeric values + [[ "$mc_players_online" =~ ^[0-9]+$ ]] || mc_players_online=0 + [[ "$mc_players_max" =~ ^[0-9]+$ ]] || mc_players_max=0 + fi + fi + fi + + # Try RCON for TPS if credentials are provided and server is up + if [ "$mc_up" -eq 1 ] && [ -n "$MINECRAFT_RCON_PASS" ]; then + local tps_result + tps_result=$(python3 -c " +import sys +try: + from mcrcon import MCRcon + with MCRcon('${host}', '${MINECRAFT_RCON_PASS}', port=${MINECRAFT_RCON_PORT}) as mcr: + resp = mcr.command('tps') + # Parse TPS from response (e.g., '§6TPS from last 1m, 5m, 15m: §a20.0, §a20.0, §a20.0') + import re + nums = re.findall(r'[\d.]+', resp) + if nums: + print(nums[-1]) # Last TPS value (15m average) +except Exception: + pass +" 2>/dev/null) || true + if [ -n "$tps_result" ]; then + mc_tps="$tps_result" + fi + fi + + query_end=$(date +%s%N) + mc_query_duration=$(( (query_end - 
query_start) / 1000000000 )) +} + +# Query Valheim server using Steam A2S protocol or TCP fallback +# Args: $1 - host, $2 - port +# Sets global variables: vh_up, vh_players_online, vh_players_max, vh_version, vh_motd, vh_query_duration +query_valheim() { + local host="$1" + local port="$2" + local query_start query_end + + vh_up=0 + vh_players_online=0 + vh_players_max=0 + vh_version="unknown" + vh_motd="unknown" + vh_query_duration=0 + + query_start=$(date +%s%N) + + # Try python3 A2S query first (Steam query protocol) + if command -v python3 >/dev/null 2>&1; then + local py_result + py_result=$(python3 -c " +import sys +try: + import a2s + address = ('${host}', ${port}) + info = a2s.info(address, timeout=5) + print('UP') + print(info.player_count) + print(info.max_players) + print(info.version) + print(info.server_name.replace(chr(10), ' ')) +except ImportError: + print('NO_A2S') +except Exception: + print('DOWN') +" 2>/dev/null) || true + + local first_line + first_line=$(echo "$py_result" | head -1) + + if [ "$first_line" = "UP" ]; then + vh_up=1 + vh_players_online=$(echo "$py_result" | sed -n '2p') + vh_players_max=$(echo "$py_result" | sed -n '3p') + vh_version=$(echo "$py_result" | sed -n '4p') + vh_motd=$(echo "$py_result" | sed -n '5p') + elif [ "$first_line" != "NO_A2S" ]; then + query_end=$(date +%s%N) + vh_query_duration=$(( (query_end - query_start) / 1000000000 )) + return + fi + fi + + # Fallback: TCP port check on game port (query port - 1 is typically the game port) + if [ "$vh_up" -eq 0 ]; then + local game_port=$((port - 1)) + if nc -z -w 3 "$host" "$game_port" 2>/dev/null || nc -z -w 3 "$host" "$port" 2>/dev/null; then + vh_up=1 + fi + fi + + query_end=$(date +%s%N) + vh_query_duration=$(( (query_end - query_start) / 1000000000 )) +} + +# Query Palworld server using REST API or TCP fallback +# Args: $1 - host, $2 - port +# Sets global variables: pw_up, pw_players_online, pw_players_max, pw_version, pw_motd, pw_query_duration +query_palworld() { 
+ local host="$1" + local port="$2" + local query_start query_end + + pw_up=0 + pw_players_online=0 + pw_players_max=0 + pw_version="unknown" + pw_motd="unknown" + pw_query_duration=0 + + query_start=$(date +%s%N) + + # Try REST API query first + if command -v curl >/dev/null 2>&1; then + local api_response + api_response=$(curl -s -m 5 "http://${host}:${port}/v1/api/info" 2>/dev/null) || true + + if [ -n "$api_response" ] && command -v python3 >/dev/null 2>&1; then + local parse_result + parse_result=$(python3 -c " +import json, sys +try: + data = json.loads('''${api_response}''') + print('UP') + print(data.get('currentPlayerNum', 0)) + print(data.get('maxPlayerNum', 0)) + print(data.get('version', 'unknown')) + print(data.get('serverName', 'unknown').replace(chr(10), ' ')) +except Exception: + print('PARSE_FAIL') +" 2>/dev/null) || true + + local first_line + first_line=$(echo "$parse_result" | head -1) + + if [ "$first_line" = "UP" ]; then + pw_up=1 + pw_players_online=$(echo "$parse_result" | sed -n '2p') + pw_players_max=$(echo "$parse_result" | sed -n '3p') + pw_version=$(echo "$parse_result" | sed -n '4p') + pw_motd=$(echo "$parse_result" | sed -n '5p') + fi + elif [ -n "$api_response" ]; then + # curl got a response but no python3 to parse JSON + pw_up=1 + fi + fi + + # Fallback: TCP port check + if [ "$pw_up" -eq 0 ]; then + if nc -z -w 3 "$host" "$port" 2>/dev/null; then + pw_up=1 + fi + fi + + query_end=$(date +%s%N) + pw_query_duration=$(( (query_end - query_start) / 1000000000 )) +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check prerequisites + if ! 
check_prerequisites; then + return + fi + + # Check that at least one server is configured + if [ -z "$MINECRAFT_HOST" ] && [ -z "$VALHEIM_HOST" ] && [ -z "$PALWORLD_HOST" ]; then + echo "# No game servers configured. Use --minecraft-host, --valheim-host, or --palworld-host" >&2 + return + fi + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Game Server Exporter v1.0 + +

Game Server Prometheus Exporter v1.0

+

Metrics

+

Operational metrics from Minecraft, Valheim, and Palworld servers.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.game_server_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/gcp-cost-reporter.sh b/gcp-cost-reporter.sh new file mode 100755 index 0000000..872790d --- /dev/null +++ b/gcp-cost-reporter.sh @@ -0,0 +1,563 @@ +#!/usr/bin/env bash + +######################################################################################### +#### gcp-cost-reporter.sh — GCP cost breakdown by service, project, or label. 
#### Queries BigQuery billing export for spend data with period comparison          ####
#### Requires: bash 4.3+, gcloud CLI (bq), jq, bc, curl                              ####
####                                                                                 ####
#### Author: Phil Connor                                                             ####
#### Contact: contact@mylinux.work                                                   ####
#### License: MIT                                                                    ####
#### Version 1.00                                                                    ####
####                                                                                 ####
#### Usage:                                                                          ####
####   ./gcp-cost-reporter.sh --daily                                                ####
####                                                                                 ####
#### See --help for all options.                                                     ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
BQ_BILLING_TABLE="${BQ_BILLING_TABLE:-}"
GCP_PROJECT="${GCP_PROJECT:-}"
GROUP_BY="${GROUP_BY:-SERVICE}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"
LABEL_FILTER_KEY="${LABEL_FILTER_KEY:-}"
LABEL_FILTER_VALUE="${LABEL_FILTER_VALUE:-}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
CUSTOM_START=""
CUSTOM_END=""
SLACK_URL=""
START_TIME=""

# ── Colors ────────────────────────────────────────────────────────────
RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""

# Populate the colour globals according to $COLOR.
# "never" disables colour, "auto" enables it only when stdout is a terminal,
# any other value enables it unconditionally.
setup_colors() {
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" DIM="" RESET=""
    case "$COLOR" in
        never) return 0 ;;
        auto) [[ -t 1 ]] || return 0 ;;
    esac
    RED="\033[0;31m"
    GREEN="\033[0;32m"
    YELLOW="\033[0;33m"
    # shellcheck disable=SC2034
    BLUE="\033[0;34m"
    # shellcheck disable=SC2034
    BOLD="\033[1m"
    DIM="\033[2m"
    RESET="\033[0m"
}

# ── Logging ───────────────────────────────────────────────────────────
# All log output goes to stderr so that stdout carries only the report
# (otherwise "--format csv|json > file" would capture log lines too).
# Colour codes are passed through %b instead of being part of the format
# string, so a stray '%' in a colour variable cannot break printf.
log_info()  { printf '%b[INFO]%b %s\n' "$GREEN" "$RESET" "$*" >&2; }
log_warn()  { printf '%b[WARN]%b %s\n' "$YELLOW" "$RESET" "$*" >&2; }
log_error() { printf '%b[ERROR]%b %s\n' "$RED" "$RESET" "$*" >&2; }

# Print a debug line when VERBOSE=true.
# Always returns 0: a "[[ … ]] && printf" one-liner returns 1 when verbose
# mode is off, which terminates the script under `set -e`.
log_debug() {
    if [[ "$VERBOSE" == "true" ]]; then
        printf '%b[DEBUG] %s%b\n' "$DIM" "$*" "$RESET" >&2
    fi
    return 0
}

# ── Helpers ───────────────────────────────────────────────────────────
# Log an error and terminate with status 1.
die() { log_error "$@"; exit 1; }

# Verify that every external tool used by the script is installed and that
# bash is new enough for namerefs (local -n, bash 4.3+).
check_deps() {
    local missing=() tool
    for tool in gcloud bq jq curl bc; do
        command -v "$tool" >/dev/null 2>&1 || missing+=("$tool")
    done
    if (( ${#missing[@]} > 0 )); then
        die "Missing required tools: ${missing[*]}"
    fi

    local major="${BASH_VERSINFO[0]}" minor="${BASH_VERSINFO[1]}"
    if (( major < 4 || (major == 4 && minor < 3) )); then
        die "Requires bash 4.3+, found ${BASH_VERSION}"
    fi
}

# Validate a YYYY-MM-DD date string; dies on bad format or on a month/day
# outside 01-12 / 01-31.
# Args: $1 - date string
validate_date() {
    local d="$1"
    if [[ ! "$d" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
        die "Invalid date format: $d (expected YYYY-MM-DD)"
    fi
    # 10# forces base 10 so "08"/"09" are not read as invalid octal
    local month="${d:5:2}" day="${d:8:2}"
    if (( 10#$month < 1 || 10#$month > 12 || 10#$day < 1 || 10#$day > 31 )); then
        die "Invalid date format: $d (expected YYYY-MM-DD)"
    fi
}

# ── Date math (portable) ─────────────────────────────────────────────
# Shift a date by a number of days.
# Args: $1 - base date (YYYY-MM-DD), $2 - signed day offset (e.g. -7)
# Outputs: resulting date as YYYY-MM-DD
date_offset() {
    local base="$1" offset="$2"
    # BSD `date -v7d` would *set* the day of month; make the sign explicit
    [[ "$offset" == [+-]* ]] || offset="+${offset}"
    if date --version >/dev/null 2>&1; then
        # GNU date
        date -u -d "${base} ${offset} days" +%Y-%m-%d
    else
        # BSD/macOS date
        date -u -j -v"${offset}d" -f "%Y-%m-%d" "$base" +%Y-%m-%d
    fi
}

# Today's date in UTC as YYYY-MM-DD.
today_utc() { date -u +%Y-%m-%d; }

# First day of the month containing $1 (YYYY-MM-DD).
first_of_month() {
    local d="$1"
    echo "${d:0:8}01"
}

# First day of the month preceding the one containing $1 (YYYY-MM-DD).
first_of_prev_month() {
    local d="$1"
    local year="${d:0:4}"
    local month="${d:5:2}"
    year=$((10#$year))
    month=$((10#$month - 1))
    if (( month == 0 )); then
        month=12
        year=$((year - 1))
    fi
    printf "%04d-%02d-01" "$year" "$month"
}

# Whole days from $1 to $2 (both YYYY-MM-DD).
# Evaluated in UTC: in local time a DST switch makes one day 23 hours long
# and the integer division would drop it.
days_between() {
    local s="$1" e="$2"
    local ss se
    if date --version >/dev/null 2>&1; then
        ss=$(date -u -d "$s" +%s)
        se=$(date -u -d "$e" +%s)
    else
        ss=$(date -u -j -f "%Y-%m-%d" "$s" +%s)
        se=$(date -u -j -f "%Y-%m-%d" "$e" +%s)
    fi
    echo $(( (se - ss) / 86400 ))
}

# ── Compute date ranges ──────────────────────────────────────────────
# Derive the current and the comparison period from RUN_MODE.
# Reads globals:  RUN_MODE, CUSTOM_START, CUSTOM_END
# Sets globals:   PERIOD_START, PERIOD_END, PREV_START, PREV_END
compute_ranges() {
    local today
    today="$(today_utc)"

    case "$RUN_MODE" in
        daily)
            PERIOD_START="$(date_offset "$today" -1)"
            PERIOD_END="$today"
            PREV_START="$(date_offset "$today" -2)"
            PREV_END="$(date_offset "$today" -1)"
            ;;
        weekly)
            PERIOD_START="$(date_offset "$today" -7)"
            PERIOD_END="$today"
            PREV_START="$(date_offset "$today" -14)"
            PREV_END="$(date_offset "$today" -7)"
            ;;
        monthly)
            PERIOD_START="$(first_of_month "$today")"
            PERIOD_END="$today"
            PREV_START="$(first_of_prev_month "$today")"
            PREV_END="$PERIOD_START"
            ;;
        custom)
            PERIOD_START="$CUSTOM_START"
            PERIOD_END="$CUSTOM_END"
            local span
            span="$(days_between "$CUSTOM_START" "$CUSTOM_END")"
            # A zero or negative span would yield an offset such as "--3"
            if (( span <= 0 )); then
                die "--custom END date must be after START date"
            fi
            PREV_START="$(date_offset "$CUSTOM_START" "-$span")"
            PREV_END="$CUSTOM_START"
            ;;
        *)
            die "Unknown mode: $RUN_MODE"
            ;;
    esac

    log_debug "Current period: $PERIOD_START → $PERIOD_END"
    log_debug "Previous period: $PREV_START → $PREV_END"
}
$PERIOD_END" + log_debug "Previous period: $PREV_START → $PREV_END" +} + +# ── Build BigQuery SQL ──────────────────────────────────────────────── +build_select_column() { + case "$GROUP_BY" in + SERVICE) echo "service.description AS group_key" ;; + PROJECT) echo "project.id AS group_key" ;; + LABEL) + if [[ -z "$LABEL_FILTER_KEY" ]]; then + die "--group-by LABEL requires --label KEY=VALUE" + fi + echo "( SELECT value FROM UNNEST(labels) WHERE key = '${LABEL_FILTER_KEY}' ) AS group_key" + ;; + *) + die "Invalid --group-by value: $GROUP_BY (expected SERVICE, PROJECT, or LABEL)" + ;; + esac +} + +build_where_clause() { + local start="$1" end="$2" + local where="usage_start_time >= TIMESTAMP('${start}') AND usage_start_time < TIMESTAMP('${end}')" + + if [[ -n "$LABEL_FILTER_KEY" && -n "$LABEL_FILTER_VALUE" ]]; then + where="${where} AND EXISTS( SELECT 1 FROM UNNEST(labels) l WHERE l.key = '${LABEL_FILTER_KEY}' AND l.value = '${LABEL_FILTER_VALUE}' )" + fi + + echo "$where" +} + +build_query() { + local start="$1" end="$2" + local select_col where_clause + + select_col="$(build_select_column)" + where_clause="$(build_where_clause "$start" "$end")" + + cat </dev/null +} + +# ── Parse cost data ────────────────────────────────────────────────── +parse_costs() { + local raw="$1" + echo "$raw" | jq -r ' + .[] | + select(.group_key != null and .group_key != "") | + "\(.group_key)\t\(.total_cost)" + ' 2>/dev/null || echo "" +} + +# ── Format helpers ──────────────────────────────────────────────────── +fmt_currency() { + printf "$%.2f" "$1" +} + +fmt_delta() { + local curr="$1" prev="$2" + if (( $(echo "$prev == 0" | bc -l) )); then + echo "N/A" + return + fi + local pct + pct=$(echo "scale=1; (($curr - $prev) / $prev) * 100" | bc -l) + local sign="" + if (( $(echo "$pct > 0" | bc -l) )); then + sign="+" + fi + echo "${sign}${pct}%" +} + +print_header() { + local account_id + account_id=$(gcloud config get-value account 2>/dev/null || echo "unknown") + local project_id + 
# Print the report preamble (account, project, table, mode, timestamp).
# Reads globals: GCP_PROJECT, BQ_BILLING_TABLE, RUN_MODE, PERIOD_START, PERIOD_END
print_header() {
    local account_id
    account_id=$(gcloud config get-value account 2>/dev/null || echo "unknown")
    local project_id
    project_id="${GCP_PROJECT:-$(gcloud config get-value project 2>/dev/null || echo "unknown")}"

    echo "GCP Cost Reporter"
    echo "Account: $account_id"
    echo "Project: $project_id"
    echo "Table: $BQ_BILLING_TABLE"
    echo "Mode: $RUN_MODE"
    echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"

    if [[ "$RUN_MODE" == "custom" ]]; then
        echo "Period: $PERIOD_START → $PERIOD_END"
    fi
    echo ""
}

# Print the given words one per line in byte order; prints nothing when
# called without arguments. Used to give report rows a stable order
# (associative arrays iterate in hash order).
_sorted_keys() {
    (( $# > 0 )) || return 0
    printf '%s\n' "$@" | LC_ALL=C sort
}

# Escape a string for use inside a JSON string literal.
# Args: $1 - raw string; Outputs: escaped string (no surrounding quotes)
_json_escape() {
    local s="$1"
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# ── Text table output ────────────────────────────────────────────────
# Args: $1 - name of assoc array (key -> current cost)
#       $2 - name of assoc array (key -> previous cost)
# Uses fmt_currency / fmt_delta and bc for the totals.
output_text_table() {
    local -n curr_data=$1
    local -n prev_data=$2
    local label="SERVICE"
    case "$GROUP_BY" in
        PROJECT) label="PROJECT" ;;
        LABEL) label="LABEL" ;;
    esac
    local divider="──────────────────────────────────────────────────────────────────────"
    printf " %-38s %-12s %-12s %s\n" "$label" "COST" "PREV" "DELTA"
    printf " %s\n" "$divider"

    local -a keys=()
    mapfile -t keys < <(_sorted_keys "${!curr_data[@]}")

    local total_curr=0 total_prev=0 key cost prev_cost
    for key in ${keys[@]+"${keys[@]}"}; do
        cost="${curr_data[$key]:-0}"
        prev_cost="${prev_data[$key]:-0}"
        printf " %-38s %-12s %-12s %s\n" \
            "$key" "$(fmt_currency "$cost")" "$(fmt_currency "$prev_cost")" "$(fmt_delta "$cost" "$prev_cost")"
        total_curr=$(echo "$total_curr + $cost" | bc -l)
        total_prev=$(echo "$total_prev + $prev_cost" | bc -l)
    done
    printf " %s\n" "$divider"
    printf " %-38s %-12s %-12s %s\n" \
        "TOTAL" "$(fmt_currency "$total_curr")" "$(fmt_currency "$total_prev")" "$(fmt_delta "$total_curr" "$total_prev")"
}

# ── CSV output ────────────────────────────────────────────────────────
# Args: $1 / $2 - names of the current / previous cost assoc arrays
# Outputs: header row plus one row per key: "key",cost,previous_cost,delta_pct
output_csv() {
    local -n curr_data=$1
    local -n prev_data=$2
    local label="service"
    case "$GROUP_BY" in
        PROJECT) label="project" ;;
        LABEL) label="label" ;;
    esac
    echo "${label},cost,previous_cost,delta_pct"

    local -a keys=()
    mapfile -t keys < <(_sorted_keys "${!curr_data[@]}")

    local key cost prev_cost pct
    for key in ${keys[@]+"${keys[@]}"}; do
        cost="${curr_data[$key]:-0}"
        prev_cost="${prev_data[$key]:-0}"
        pct="0"
        if (( $(echo "$prev_cost != 0" | bc -l) )); then
            # Multiply before dividing: with scale=2 the quotient is cut to
            # two decimals, so (a/b)*100 could only ever yield whole percents
            pct=$(echo "scale=2; ($cost - $prev_cost) * 100 / $prev_cost" | bc -l)
        fi
        # CSV quoting: embedded double quotes are doubled
        echo "\"${key//\"/\"\"}\",$cost,$prev_cost,$pct"
    done
}

# ── JSON output ───────────────────────────────────────────────────────
# Args: $1 / $2 - names of the current / previous cost assoc arrays
# Reads globals: RUN_MODE, PERIOD_START, PERIOD_END, PREV_START, PREV_END, GROUP_BY
# Outputs: a single-line JSON document
output_json() {
    local -n curr_data=$1
    local -n prev_data=$2
    local label="service"
    case "$GROUP_BY" in
        PROJECT) label="project" ;;
        LABEL) label="label" ;;
    esac

    local -a keys=()
    mapfile -t keys < <(_sorted_keys "${!curr_data[@]}")

    local key joined=""
    for key in ${keys[@]+"${keys[@]}"}; do
        # Keys come from billing data (service names, label values) and are
        # escaped; a missing amount falls back to 0 so the JSON stays valid
        joined+="${joined:+,}{\"${label}\": \"$(_json_escape "$key")\", \"cost\": ${curr_data[$key]:-0}, \"previous_cost\": ${prev_data[$key]:-0}}"
    done

    printf '{"mode":"%s","period_start":"%s","period_end":"%s","previous_start":"%s","previous_end":"%s","group_by":"%s","items":[%s]}\n' \
        "$RUN_MODE" "$PERIOD_START" "$PERIOD_END" "$PREV_START" "$PREV_END" "$GROUP_BY" "$joined"
}

# ── Render report ─────────────────────────────────────────────────────
# Args: $1 - raw query result for the current period
#       $2 - raw query result for the previous period
# Outputs: the report in OUTPUT_FORMAT (text, csv or json) on stdout
render_report() {
    local curr_raw="$1" prev_raw="$2"

    declare -A curr_costs
    declare -A prev_costs
    local key amount

    # parse_costs emits "key<TAB>amount" lines
    while IFS=$'\t' read -r key amount; do
        [[ -z "$key" ]] && continue
        curr_costs["$key"]="$amount"
    done <<< "$(parse_costs "$curr_raw")"

    while IFS=$'\t' read -r key amount; do
        [[ -z "$key" ]] && continue
        prev_costs["$key"]="$amount"
    done <<< "$(parse_costs "$prev_raw")"

    # Keys seen only in the previous period still get a row (cost 0)
    for key in "${!prev_costs[@]}"; do
        if [[ -z "${curr_costs[$key]+x}" ]]; then
            curr_costs["$key"]="0"
        fi
    done

    case "$OUTPUT_FORMAT" in
        text)
            print_header
            local title="Cost Breakdown — ${PERIOD_START} → ${PERIOD_END}"
            echo "$title"
            output_text_table curr_costs prev_costs
            echo ""
            ;;
        csv)
            output_csv curr_costs prev_costs
            ;;
        json)
            output_json curr_costs prev_costs
            ;;
        *)
            die "Unknown format: $OUTPUT_FORMAT"
            ;;
    esac
}

# ── Slack webhook ─────────────────────────────────────────────────────
# Post the report to a Slack incoming webhook as a code block.
# Args: $1 - report text, $2 - webhook URL
# Returns: 0 on HTTP 200, 1 otherwise (including transport failures)
send_slack() {
    local report="$1" webhook="$2"

    log_info "Posting report to Slack..."

    local max_len=3000
    local body="$report"
    if (( ${#body} > max_len )); then
        body="${body:0:$max_len}

... (truncated — full report exceeds Slack message limit)"
    fi

    local payload
    payload=$(jq -n --arg text "\`\`\`${body}\`\`\`" '{ text: $text }')

    local http_code
    # Without the fallback a curl transport error would abort the whole
    # script under `set -e` before any message is logged.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        -X POST \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$webhook") || http_code="000"

    if [[ "$http_code" != "200" ]]; then
        log_error "Slack webhook returned HTTP $http_code"
        return 1
    fi

    log_info "Slack message posted"
}
(truncated — full report exceeds Slack message limit)" + fi + + local payload + payload=$(jq -n --arg text "\`\`\`${body}\`\`\`" '{ text: $text }') + + local http_code + http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d "$payload" \ + "$webhook") + + if [[ "$http_code" != "200" ]]; then + log_error "Slack webhook returned HTTP $http_code" + return 1 + fi + + log_info "Slack message posted" +} + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat < 0 )); do + case "$1" in + --daily|--weekly|--monthly) + RUN_MODE="${1#--}"; shift ;; + --custom) + RUN_MODE="custom" + [[ $# -lt 3 ]] && die "--custom requires START and END dates" + CUSTOM_START="$2"; CUSTOM_END="$3" + validate_date "$CUSTOM_START"; validate_date "$CUSTOM_END" + shift 3 ;; + --group-by) + [[ $# -lt 2 ]] && die "--group-by requires a value" + GROUP_BY="$2"; shift 2 ;; + --label) + [[ $# -lt 2 ]] && die "--label requires KEY=VALUE" + [[ "$2" != *"="* ]] && die "--label value must be KEY=VALUE" + LABEL_FILTER_KEY="${2%%=*}"; LABEL_FILTER_VALUE="${2#*=}"; shift 2 ;; + --format) + [[ $# -lt 2 ]] && die "--format requires a value" + OUTPUT_FORMAT="$2"; shift 2 ;; + --slack) + [[ $# -lt 2 ]] && die "--slack requires a webhook URL" + SLACK_URL="$2"; shift 2 ;; + --project) + [[ $# -lt 2 ]] && die "--project requires a project ID" + GCP_PROJECT="$2"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help|-h) usage ;; + *) die "Unknown option: $1 (see --help)" ;; + esac + done + + if [[ -z "$RUN_MODE" ]]; then log_error "No mode specified"; echo ""; usage; exit 1; fi + [[ -z "$BQ_BILLING_TABLE" ]] && die "BQ_BILLING_TABLE is required (e.g., project.dataset.gcp_billing_export_v1_XXXXXX)" + [[ -z "$SLACK_URL" && -n "$SLACK_WEBHOOK_URL" ]] && SLACK_URL="$SLACK_WEBHOOK_URL" + + case "$GROUP_BY" in + SERVICE|PROJECT|LABEL) ;; + *) die "Invalid --group-by: $GROUP_BY (expected 
SERVICE, PROJECT, or LABEL)" ;; + esac + case "$OUTPUT_FORMAT" in + text|csv|json) ;; + *) die "Invalid --format: $OUTPUT_FORMAT" ;; + esac +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + parse_args "$@" + setup_colors + check_deps + + START_TIME=$(date +%s) + + log_debug "Validating GCP credentials..." + gcloud auth print-access-token >/dev/null 2>&1 \ + || die "GCP credentials not configured or expired (run gcloud auth login)" + + compute_ranges + + log_info "Querying BigQuery billing data ($RUN_MODE, group by $GROUP_BY)..." + + local curr_raw prev_raw + curr_raw="$(query_costs "$PERIOD_START" "$PERIOD_END")" + prev_raw="$(query_costs "$PREV_START" "$PREV_END")" + + if [[ -z "$curr_raw" || "$curr_raw" == "[]" ]]; then + die "No cost data returned for $PERIOD_START → $PERIOD_END" + fi + + local report + report="$(render_report "$curr_raw" "$prev_raw")" + + echo "$report" + + if [[ -n "$SLACK_URL" ]]; then + send_slack "$report" "$SLACK_URL" + fi + + local elapsed=$(( $(date +%s) - START_TIME )) + log_info "Completed in ${elapsed}s" +} + +main "$@" diff --git a/gcp-firewall-auditor.sh b/gcp-firewall-auditor.sh new file mode 100644 index 0000000..970d528 --- /dev/null +++ b/gcp-firewall-auditor.sh @@ -0,0 +1,635 @@ +#!/usr/bin/env bash + +######################################################################################### +#### gcp-firewall-auditor.sh — Audit GCP VPC firewall rules for risky configs #### +#### Finds 0.0.0.0/0 rules, dangerous ports, overly permissive access, unused rules #### +#### Requires: bash 4+, gcloud CLI, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./gcp-firewall-auditor.sh --full #### +#### #### +#### See --help for all options. 
#########################################################################################

set -euo pipefail

# ── Colors (pre-initialized) ─────────────────────────────────────────
RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""

# Enable colour output unless COLOR=never; with COLOR=always, or when stdout
# is a terminal, the colour globals are filled with ANSI escape sequences.
setup_colors() {
    if [[ "${COLOR:-auto}" == "never" ]]; then
        return
    fi
    if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
log()  { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()  { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
# Debug output goes to stderr: gc_cmd calls verbose() while its stdout is
# being captured as JSON, so a debug line on stdout would corrupt the data.
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*" >&2; fi; }
die() { err "$*"; exit 1; }

# ── Severity counters ────────────────────────────────────────────────
TOTAL_CRIT=0
TOTAL_WARN=0
TOTAL_INFO=0
TOTAL_OK=0

# "|| true": (( x++ )) returns 1 when the old value is 0, which would trip set -e
flag_crit() { ((TOTAL_CRIT++)) || true; }
flag_warn() { ((TOTAL_WARN++)) || true; }
flag_info() { ((TOTAL_INFO++)) || true; }
flag_ok()   { ((TOTAL_OK++)) || true; }

# ── Defaults ──────────────────────────────────────────────────────────
RUN_MODE=""
DANGEROUS_PORTS="${DANGEROUS_PORTS:-22,3389,3306,5432,1433,6379,27017,9200,8080,8443}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"
GCP_PROJECT=""
VPC_NETWORK=""

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
START_TIME=""

# ── Dependency and credential checks ────────────────────────────────
check_deps() {
    command -v gcloud &>/dev/null || die "gcloud CLI is required (install: https://cloud.google.com/sdk/docs/install)"
    command -v jq &>/dev/null || die "jq is required"
}

# Ensure an active gcloud account exists and resolve the project to audit.
# Reads/sets global: GCP_PROJECT
# NOTE(review): `gcloud config set project` changes the user's persistent
# gcloud configuration; gc_cmd already passes --project, so this may be
# unnecessary — confirm no other caller relies on it before removing.
check_credentials() {
    local account
    account=$(gcloud auth list --filter="status:ACTIVE" --format="value(account)" 2>/dev/null) || account=""
    [[ -z "$account" ]] && die "No active gcloud credentials — run 'gcloud auth login'"

    if [[ -n "$GCP_PROJECT" ]]; then
        gcloud config set project "$GCP_PROJECT" --quiet 2>/dev/null \
            || die "Cannot set project: ${GCP_PROJECT}"
    else
        GCP_PROJECT=$(gcloud config get-value project 2>/dev/null) || GCP_PROJECT=""
        [[ -z "$GCP_PROJECT" || "$GCP_PROJECT" == "(unset)" ]] && die "No project set — use --project or 'gcloud config set project'"
    fi

    verbose "Account: ${account}"
    log "Project: ${GCP_PROJECT}"
}

# ── gcloud wrapper ───────────────────────────────────────────────────
# Run gcloud with the given arguments, appending --project when GCP_PROJECT is set.
gc_cmd() {
    local args=("$@")
    if [[ -n "$GCP_PROJECT" ]]; then
        args+=(--project "$GCP_PROJECT")
    fi
    verbose "gcloud ${args[*]}"
    gcloud "${args[@]}"
}

# ── Port-to-service mapping ─────────────────────────────────────────
# Args: $1 - port number; Outputs: service name, or empty string if unknown
port_to_service() {
    local port="$1"
    case "$port" in
        22) echo "SSH" ;;
        80) echo "HTTP" ;;
        443) echo "HTTPS" ;;
        3306) echo "MySQL" ;;
        5432) echo "PostgreSQL" ;;
        1433) echo "MSSQL" ;;
        3389) echo "RDP" ;;
        6379) echo "Redis" ;;
        27017) echo "MongoDB" ;;
        9200) echo "Elasticsearch" ;;
        8080) echo "HTTP-Alt" ;;
        8443) echo "HTTPS-Alt" ;;
        53) echo "DNS" ;;
        25) echo "SMTP" ;;
        5900) echo "VNC" ;;
        11211) echo "Memcached" ;;
        2379) echo "etcd" ;;
        9090) echo "Prometheus" ;;
        *) echo "" ;;
    esac
}

# ── Check if port is in dangerous list ───────────────────────────────
# Args: $1 - port; Returns: 0 if listed in DANGEROUS_PORTS, 1 otherwise
is_dangerous_port() {
    local port="$1" dp
    local -a listed=()
    # Split into an array; IFS is changed for this one `read` only
    IFS=',' read -r -a listed <<< "$DANGEROUS_PORTS"
    for dp in ${listed[@]+"${listed[@]}"}; do
        if [[ "$port" == "$dp" ]]; then
            return 0
        fi
    done
    return 1
}

# ── Check if port falls in a range ───────────────────────────────────
# Args: $1 - port number, $2 - single port or "start-end" range
# Returns: 0 when the port matches / lies inside the range, 1 otherwise
#          (also for non-numeric input, instead of an arithmetic error)
port_in_range() {
    local port="$1" range="$2"
    if [[ "$range" == *-* ]]; then
        [[ "$port" =~ ^[0-9]+$ && "$range" =~ ^([0-9]+)-([0-9]+)$ ]] || return 1
        (( 10#$port >= 10#${BASH_REMATCH[1]} && 10#$port <= 10#${BASH_REMATCH[2]} ))
    else
        [[ "$port" == "$range" ]]
    fi
}

# ── Fetch firewall rules ────────────────────────────────────────────
# Outputs: JSON array of firewall rules, filtered by VPC_NETWORK when set
fetch_rules() {
    local args=(compute firewall-rules list --format=json)
    if [[ -n "$VPC_NETWORK" ]]; then
        args+=(--filter="network~${VPC_NETWORK}")
    fi
    gc_cmd "${args[@]}" 2>/dev/null
}

# Print the first world-open source range (IPv4 0.0.0.0/0 or IPv6 ::/0) of a
# rule, or nothing when the rule is not open to the world.
# Args: $1 - one firewall rule as compact JSON
_open_source_of() {
    jq -r '[.sourceRanges[]? | select(. == "0.0.0.0/0" or . == "::/0")][0] // empty' \
        <<< "$1" 2>/dev/null || true
}

# ══════════════════════════════════════════════════════════════════════
# OPEN PORTS AUDIT
# ══════════════════════════════════════════════════════════════════════
# List enabled INGRESS rules that expose a port from DANGEROUS_PORTS to the
# whole internet and count the findings via flag_crit / flag_info.
audit_open_ports() {
    log "Auditing firewall rules for dangerous open ports..."
    log "Dangerous ports: ${DANGEROUS_PORTS}"
    echo ""

    printf " %-28s %-14s %-8s %-8s %-18s %s\n" \
        "RULE_NAME" "NETWORK" "PROTO" "PORT" "SOURCE" "SEVERITY"
    printf " %s\n" "$(printf '%.0s─' {1..95})"

    local rules_json
    rules_json=$(fetch_rules) || die "Failed to list firewall rules"

    local -a dangerous=()
    IFS=',' read -r -a dangerous <<< "$DANGEROUS_PORTS"

    local rule rule_name network open_src allowed protocol ports port_spec
    local dp svc severity color

    # Loops read from process substitution rather than a pipeline: a
    # `cmd | while` body runs in a subshell and its flag_* increments would
    # never reach the TOTAL_* counters of this shell.
    while IFS= read -r rule; do
        [[ -z "$rule" ]] && continue
        rule_name=$(jq -r '.name' <<< "$rule")
        network=$(jq -r '.network' <<< "$rule")
        network=${network##*/}

        open_src=$(_open_source_of "$rule")
        [[ -z "$open_src" ]] && continue

        while IFS= read -r allowed; do
            [[ -z "$allowed" ]] && continue
            protocol=$(jq -r '.IPProtocol' <<< "$allowed")
            ports=$(jq -r '.ports[]? // empty' <<< "$allowed" 2>/dev/null) || ports=""

            if [[ -z "$ports" ]]; then
                # No port list on the entry: every port of the protocol is allowed
                if [[ "$protocol" == "all" ]]; then
                    printf " %-28s %-14s %-8s %-8s %-18s %b%s%b\n" \
                        "${rule_name:0:27}" "${network:0:13}" "all" "all" \
                        "$open_src" "$RED" "CRITICAL" "$RESET"
                    flag_crit
                else
                    for dp in ${dangerous[@]+"${dangerous[@]}"}; do
                        printf " %-28s %-14s %-8s %-8s %-18s %b%s%b\n" \
                            "${rule_name:0:27}" "${network:0:13}" "$protocol" "$dp" \
                            "$open_src" "$RED" "CRITICAL" "$RESET"
                        flag_crit
                    done
                fi
                continue
            fi

            while IFS= read -r port_spec; do
                [[ -z "$port_spec" ]] && continue
                for dp in ${dangerous[@]+"${dangerous[@]}"}; do
                    if port_in_range "$dp" "$port_spec"; then
                        svc=$(port_to_service "$dp")
                        if [[ "$dp" == "80" || "$dp" == "443" ]]; then
                            severity="INFO"; color="$CYAN"; flag_info
                        else
                            severity="CRITICAL"; color="$RED"; flag_crit
                        fi
                        printf " %-28s %-14s %-8s %-8s %-18s %b%s%b\n" \
                            "${rule_name:0:27}" "${network:0:13}" "$protocol" \
                            "${dp} (${svc})" "$open_src" "$color" "$severity" "$RESET"
                    fi
                done
            done <<< "$ports"
        done < <(jq -c '.allowed[]? // empty' <<< "$rule" 2>/dev/null)
    done < <(jq -c '.[] | select(.direction == "INGRESS" and .disabled != true)' <<< "$rules_json" 2>/dev/null)

    echo ""
}

# ══════════════════════════════════════════════════════════════════════
# PERMISSIVE RULES AUDIT
# ══════════════════════════════════════════════════════════════════════
# List enabled INGRESS rules open to the world that allow a whole protocol
# ("all", or a protocol entry without a port list); counts via flag_crit.
audit_permissive() {
    log "Auditing overly permissive firewall rules..."
    echo ""

    printf " %-28s %-14s %-14s %-18s %s\n" \
        "RULE_NAME" "NETWORK" "PROTOCOLS" "SOURCE" "SEVERITY"
    printf " %s\n" "$(printf '%.0s─' {1..85})"

    local rules_json
    rules_json=$(fetch_rules) || die "Failed to list firewall rules"

    local rule rule_name network open_src allowed proto port_count
    local has_all_traffic proto_list

    # Process substitution keeps flag_crit in the current shell (see above)
    while IFS= read -r rule; do
        [[ -z "$rule" ]] && continue
        rule_name=$(jq -r '.name' <<< "$rule")
        network=$(jq -r '.network' <<< "$rule")
        network=${network##*/}

        open_src=$(_open_source_of "$rule")
        [[ -z "$open_src" ]] && continue

        has_all_traffic="false"
        while IFS= read -r allowed; do
            [[ -z "$allowed" ]] && continue
            proto=$(jq -r '.IPProtocol' <<< "$allowed")
            port_count=$(jq '.ports // [] | length' <<< "$allowed")

            if [[ "$proto" == "all" || "$port_count" -eq 0 ]]; then
                has_all_traffic="true"
            fi
        done < <(jq -c '.allowed[]? // empty' <<< "$rule" 2>/dev/null)

        if [[ "$has_all_traffic" == "true" ]]; then
            proto_list=$(jq -r '[.allowed[]?.IPProtocol] | join(",")' <<< "$rule" 2>/dev/null) || proto_list=""
            printf " %-28s %-14s %-14s %-18s %b%s%b\n" \
                "${rule_name:0:27}" "${network:0:13}" "${proto_list:0:13}" \
                "$open_src" "$RED" "CRITICAL" "$RESET"
            flag_crit
        fi
    done < <(jq -c '.[] | select(.direction == "INGRESS" and .disabled != true)' <<< "$rules_json" 2>/dev/null)

    echo ""
}
+ echo "" + + printf " %-28s %-14s %-14s %-18s %s\n" \ + "RULE_NAME" "NETWORK" "PROTOCOLS" "SOURCE" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..85})" + + local rules_json + rules_json=$(fetch_rules) + + echo "$rules_json" | jq -c '.[] | select(.direction == "INGRESS" and .disabled != true)' 2>/dev/null | while IFS= read -r rule; do + local rule_name network + rule_name=$(echo "$rule" | jq -r '.name') + network=$(echo "$rule" | jq -r '.network' | rev | cut -d/ -f1 | rev) + + local has_open="false" + while IFS= read -r src; do + if [[ "$src" == "0.0.0.0/0" ]]; then + has_open="true" + break + fi + done < <(echo "$rule" | jq -r '.sourceRanges[]? // empty' 2>/dev/null) + + [[ "$has_open" != "true" ]] && continue + + local has_all_traffic="false" + while IFS= read -r allowed; do + local proto + proto=$(echo "$allowed" | jq -r '.IPProtocol') + local port_count + port_count=$(echo "$allowed" | jq '.ports // [] | length') + + if [[ "$proto" == "all" ]]; then + has_all_traffic="true" + elif [[ "$port_count" -eq 0 ]]; then + has_all_traffic="true" + fi + done < <(echo "$rule" | jq -c '.allowed[]? // empty' 2>/dev/null) + + if [[ "$has_all_traffic" == "true" ]]; then + local proto_list + proto_list=$(echo "$rule" | jq -r '[.allowed[]?.IPProtocol] | join(",")' 2>/dev/null) + printf " %-28s %-14s %-14s %-18s %b%s%b\n" \ + "${rule_name:0:27}" "${network:0:13}" "${proto_list:0:13}" \ + "0.0.0.0/0" "$RED" "CRITICAL" "$RESET" + flag_crit + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# EGRESS AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_egress() { + log "Auditing egress firewall rules..." 
+ echo "" + + printf " %-28s %-14s %-14s %-18s %s\n" \ + "RULE_NAME" "NETWORK" "PROTOCOLS" "DESTINATION" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..85})" + + local rules_json + rules_json=$(fetch_rules) + + echo "$rules_json" | jq -c '.[] | select(.direction == "EGRESS" and .disabled != true)' 2>/dev/null | while IFS= read -r rule; do + local rule_name network + rule_name=$(echo "$rule" | jq -r '.name') + network=$(echo "$rule" | jq -r '.network' | rev | cut -d/ -f1 | rev) + + local has_wide="false" + while IFS= read -r dest; do + if [[ "$dest" == "0.0.0.0/0" ]]; then + has_wide="true" + break + fi + done < <(echo "$rule" | jq -r '.destinationRanges[]? // empty' 2>/dev/null) + + [[ "$has_wide" != "true" ]] && continue + + local proto_list + proto_list=$(echo "$rule" | jq -r '[.allowed[]?.IPProtocol] | join(",")' 2>/dev/null) + + local severity="WARN" color="$YELLOW" + if [[ "$proto_list" == "all" ]]; then + severity="WARN"; color="$YELLOW"; flag_warn + else + severity="INFO"; color="$CYAN"; flag_info + fi + + printf " %-28s %-14s %-14s %-18s %b%s%b\n" \ + "${rule_name:0:27}" "${network:0:13}" "${proto_list:0:13}" \ + "0.0.0.0/0" "$color" "$severity" "$RESET" + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# UNUSED RULES AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_unused() { + log "Checking for disabled or potentially unused firewall rules..." 
+ echo "" + + printf " %-28s %-14s %-10s %-10s %s\n" \ + "RULE_NAME" "NETWORK" "DIRECTION" "DISABLED" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..80})" + + local rules_json + rules_json=$(fetch_rules) + + echo "$rules_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r rule; do + local rule_name network direction disabled + rule_name=$(echo "$rule" | jq -r '.name') + network=$(echo "$rule" | jq -r '.network' | rev | cut -d/ -f1 | rev) + direction=$(echo "$rule" | jq -r '.direction') + disabled=$(echo "$rule" | jq -r '.disabled // false') + + if [[ "$disabled" == "true" ]]; then + printf " %-28s %-14s %-10s %-10s %b%s%b\n" \ + "${rule_name:0:27}" "${network:0:13}" "$direction" "YES" \ + "$YELLOW" "WARN — disabled" "$RESET" + flag_warn + continue + fi + + local target_tags + target_tags=$(echo "$rule" | jq -r '.targetTags // [] | join(",")' 2>/dev/null) + + if [[ -n "$target_tags" && "$target_tags" != "null" ]]; then + local first_tag="${target_tags%%,*}" + local instance_count + instance_count=$(gcloud compute instances list \ + --filter="tags.items=${first_tag}" \ + --format="value(name)" 2>/dev/null | wc -l) + + if [[ "$instance_count" -eq 0 ]]; then + printf " %-28s %-14s %-10s %-10s %b%s%b\n" \ + "${rule_name:0:27}" "${network:0:13}" "$direction" "NO" \ + "$YELLOW" "WARN — no targets" "$RESET" + flag_warn + else + verbose "Rule ${rule_name}: ${instance_count} matching instance(s)" + flag_ok + fi + else + flag_ok + fi + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST ALL RULES +# ══════════════════════════════════════════════════════════════════════ +list_rules() { + log "Listing all firewall rules..." 
+ echo "" + + printf " %-28s %-14s %-10s %-8s %-12s %-18s %s\n" \ + "RULE_NAME" "NETWORK" "DIR" "PROTO" "PORTS" "SOURCE/DEST" "PRIORITY" + printf " %s\n" "$(printf '%.0s─' {1..105})" + + local rules_json + rules_json=$(fetch_rules) + + echo "$rules_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r rule; do + local rule_name network direction priority + rule_name=$(echo "$rule" | jq -r '.name') + network=$(echo "$rule" | jq -r '.network' | rev | cut -d/ -f1 | rev) + direction=$(echo "$rule" | jq -r '.direction') + priority=$(echo "$rule" | jq -r '.priority') + + local cidr_list + if [[ "$direction" == "INGRESS" ]]; then + cidr_list=$(echo "$rule" | jq -r '.sourceRanges[0]? // "any"' 2>/dev/null) + else + cidr_list=$(echo "$rule" | jq -r '.destinationRanges[0]? // "any"' 2>/dev/null) + fi + + echo "$rule" | jq -c '.allowed[]? // empty' 2>/dev/null | while IFS= read -r allowed; do + local proto port_str + proto=$(echo "$allowed" | jq -r '.IPProtocol') + port_str=$(echo "$allowed" | jq -r '.ports // ["all"] | join(",")' 2>/dev/null) + [[ "$port_str" == "null" ]] && port_str="all" + + local dir_color="$CYAN" + [[ "$direction" == "EGRESS" ]] && dir_color="$YELLOW" + + printf " %-28s %-14s %b%-10s%b %-8s %-12s %-18s %s\n" \ + "${rule_name:0:27}" "${network:0:13}" "$dir_color" "$direction" "$RESET" \ + "$proto" "${port_str:0:11}" "${cidr_list:0:17}" "$priority" + done + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + local elapsed + elapsed=$(( $(date +%s) - START_TIME )) + + echo "" + echo " ══════════════════════════════════════════" + echo " Firewall Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET" + printf " %-20s %b%d%b\n" "INFO:" "$CYAN" "$TOTAL_INFO" "$RESET" 
+ printf " %-20s %b%d%b\n" "OK:" "$GREEN" "$TOTAL_OK" "$RESET" + echo " ──────────────────────────────────────────" + printf " Completed in %ds\n" "$elapsed" + echo "" + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)" + echo "" + echo " Top recommendations:" + echo " • Close 0.0.0.0/0 rules on SSH (22), RDP (3389), and database ports" + echo " • Replace all-protocol allow rules with specific port lists" + echo " • Use target tags or service accounts to scope rules" + echo " • Delete disabled rules that are no longer needed" + echo "" + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)" + echo "" + echo " Suggestions:" + echo " • Review disabled rules for deletion" + echo " • Check rules with no matching target instances" + echo " • Restrict egress where applicable" + echo "" + else + echo -e " ${GREEN}All checks passed${RESET}" + echo "" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2 + exit 1 + fi + + RUN_MODE="${modes[*]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + check_deps + check_credentials + + START_TIME=$(date +%s) + + echo "" + echo -e "${BOLD}GCP Firewall Auditor${RESET}" + echo -e "Project: ${GCP_PROJECT}" + echo -e "Mode: ${RUN_MODE}" + if [[ -n "$VPC_NETWORK" ]]; then + echo -e "Network: ${VPC_NETWORK}" + fi + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + for mode in $RUN_MODE; do + case "$mode" in + open-ports) audit_open_ports ;; + permissive) audit_permissive ;; + unused) audit_unused ;; + egress) audit_egress ;; + rules) list_rules ;; + esac + done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; 
then + exit 2 + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/gcp-snapshot-manager.sh b/gcp-snapshot-manager.sh new file mode 100644 index 0000000..054dbb6 --- /dev/null +++ b/gcp-snapshot-manager.sh @@ -0,0 +1,708 @@ +#!/usr/bin/env bash + +######################################################################################### +#### gcp-snapshot-manager.sh — Create, rotate, list, audit, and restore GCP #### +#### persistent disk snapshots via gcloud CLI. Automated retention and fleet ops #### +#### Requires: bash 4+, gcloud CLI, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./gcp-snapshot-manager.sh --snapshot --all #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + 
local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +ALSO_ROTATE="false" +INSTANCE_NAME="" +ZONE="" +TARGET_ALL="false" +SNAPSHOT_NAME="" +KEEP="${GSM_KEEP:-3}" +PREFIX="${GSM_PREFIX:-auto}" +MAX_AGE="${GSM_MAX_AGE:-7}" +OUTPUT_FORMAT="${GSM_FORMAT:-text}" +DRY_RUN="true" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" +GCP_PROJECT="" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +SNAP_CREATED=0 +SNAP_DELETED=0 +SNAP_ERRORS=0 + +# ── Dependency and credential checks ──────────────────────────────── +check_deps() { + command -v gcloud &>/dev/null || die "gcloud CLI is required" + command -v jq &>/dev/null || die "jq is required" +} + +check_credentials() { + local account + account=$(gcloud auth list --filter="status:ACTIVE" --format="value(account)" 2>/dev/null) + [[ -z "$account" ]] && die "No active gcloud credentials — run 'gcloud auth login'" + + if [[ -n "$GCP_PROJECT" ]]; then + gcloud config set project "$GCP_PROJECT" --quiet 2>/dev/null \ + || die "Cannot set project: ${GCP_PROJECT}" + else + GCP_PROJECT=$(gcloud config get-value project 2>/dev/null) + [[ -z "$GCP_PROJECT" || "$GCP_PROJECT" == "(unset)" ]] && die "No project set — use --project or 'gcloud config set project'" + fi + + log "Project: ${GCP_PROJECT}" +} + +# ── Instance helpers ───────────────────────────────────────────────── +get_all_instances() { + gcloud compute instances list --project "$GCP_PROJECT" --format=json 2>/dev/null +} + +get_boot_disk() { + local instance="$1" zone="$2" + gcloud compute instances describe "$instance" --zone "$zone" --project "$GCP_PROJECT" \ + --format='json(disks)' 2>/dev/null \ + | jq -r '.disks[] | select(.boot == true) | .source' 2>/dev/null \ + | rev | cut -d/ -f1 | rev +} + +get_instance_zone() { + local instance_json="$1" 
+ echo "$instance_json" | jq -r '.zone' | rev | cut -d/ -f1 | rev +} + +# ── Snapshot helpers ───────────────────────────────────────────────── +list_snapshots() { + gcloud compute snapshots list --project "$GCP_PROJECT" --format=json 2>/dev/null +} + +managed_snapshots() { + list_snapshots | jq --arg pfx "$PREFIX" \ + '[.[] | select(.name | startswith($pfx))]' +} + +# ══════════════════════════════════════════════════════════════════════ +# SNAPSHOT +# ══════════════════════════════════════════════════════════════════════ +do_snapshot() { + local instances_json + instances_json=$(get_all_instances) + + local instances + if [[ "$TARGET_ALL" == "true" ]]; then + instances="$instances_json" + elif [[ -n "$INSTANCE_NAME" ]]; then + instances=$(echo "$instances_json" | jq --arg n "$INSTANCE_NAME" '[.[] | select(.name == $n)]') + else + die "Specify --instance NAME or --all" + fi + + local count + count=$(echo "$instances" | jq 'length') + [[ "$count" -eq 0 ]] && die "No instances found" + + local target_label="$INSTANCE_NAME" + [[ "$TARGET_ALL" == "true" ]] && target_label="all (${count} instances)" + + section_header "Creating Snapshots" + field "Target:" "$target_label" + field "Prefix:" "$PREFIX" + echo "" + + echo "$instances" | jq -c '.[]' | while IFS= read -r inst; do + local name zone disk_name snap_name + name=$(echo "$inst" | jq -r '.name') + zone=$(get_instance_zone "$inst") + disk_name=$(get_boot_disk "$name" "$zone") + snap_name="${PREFIX}-${name}-$(date +%Y%m%d-%H%M%S)" + + if [[ -z "$disk_name" ]]; then + echo -e " ${RED}✗${RESET} ${name} (${zone}) no boot disk found" + ((SNAP_ERRORS++)) || true + continue + fi + + verbose "Snapshotting ${name} disk ${disk_name} in ${zone}" + + if gcloud compute snapshots create "$snap_name" \ + --source-disk="$disk_name" \ + --source-disk-zone="$zone" \ + --project "$GCP_PROJECT" \ + --labels="managed-by=gcp-snapshot-manager,source-instance=${name}" \ + --quiet 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} ${name} 
(${zone}) ${snap_name}" + ((SNAP_CREATED++)) || true + else + echo -e " ${RED}✗${RESET} ${name} (${zone}) failed" + ((SNAP_ERRORS++)) || true + fi + + sleep 1 + done + + echo "" + field_color "Created:" "${GREEN}${SNAP_CREATED}${RESET}" + if [[ "$SNAP_ERRORS" -gt 0 ]]; then + field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}" + fi + + if [[ "$ALSO_ROTATE" == "true" ]]; then + do_rotate + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# ROTATE +# ══════════════════════════════════════════════════════════════════════ +do_rotate() { + section_header "Rotating Snapshots" + field "Keep:" "$KEEP per instance" + field "Prefix:" "$PREFIX" + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + field "Mode:" "DRY RUN (use --force to delete)" + else + field "Mode:" "LIVE — deletions are permanent" + fi + echo "" + + local snaps + snaps=$(managed_snapshots) + + local instance_names + instance_names=$(echo "$snaps" | jq -r '.[].labels["source-instance"] // empty' | sort -u) + + if [[ -z "$instance_names" ]]; then + log "No managed snapshots found matching prefix '${PREFIX}'" + return + fi + + while IFS= read -r inst; do + [[ -z "$inst" ]] && continue + local inst_snaps + inst_snaps=$(echo "$snaps" | jq --arg inst "$inst" \ + '[.[] | select(.labels["source-instance"] == $inst)] | sort_by(.creationTimestamp) | reverse') + local total + total=$(echo "$inst_snaps" | jq 'length') + + if (( total <= KEEP )); then + verbose "${inst}: ${total} snapshots, keeping all" + continue + fi + + local to_delete + to_delete=$(echo "$inst_snaps" | jq --argjson k "$KEEP" '.[$k:]') + local del_count + del_count=$(echo "$to_delete" | jq 'length') + + echo "$to_delete" | jq -c '.[]' | while IFS= read -r snap; do + local sname + sname=$(echo "$snap" | jq -r '.name') + + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + echo -e " ${DIM}[DRY RUN]${RESET} would delete ${sname}" + else + if gcloud compute snapshots delete "$sname" \ + --project 
"$GCP_PROJECT" --quiet 2>/dev/null; then + echo -e " ${YELLOW}✓${RESET} deleted ${sname}" + ((SNAP_DELETED++)) || true + else + echo -e " ${RED}✗${RESET} failed to delete ${sname}" + ((SNAP_ERRORS++)) || true + fi + fi + done + + log "${inst}: ${total} total, keeping ${KEEP}, removing ${del_count}" + done <<< "$instance_names" + + echo "" + field_color "Deleted:" "${YELLOW}${SNAP_DELETED}${RESET}" +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST +# ══════════════════════════════════════════════════════════════════════ +do_list() { + section_header "All Snapshots" + + local snaps + snaps=$(list_snapshots) + local count + count=$(echo "$snaps" | jq 'length') + + if [[ "$count" -eq 0 ]]; then + log "No snapshots found" + return + fi + + printf " %-40s %-10s %-12s %-16s %s\n" \ + "NAME" "SIZE_GB" "AGE" "SOURCE_DISK" "SOURCE_INSTANCE" + printf " %s\n" "$(printf '%.0s─' {1..100})" + + local now + now=$(date +%s) + + echo "$snaps" | jq -c '.[]' | while IFS= read -r snap; do + local name size_gb created source_disk source_inst age_str + name=$(echo "$snap" | jq -r '.name') + size_gb=$(echo "$snap" | jq -r '.diskSizeGb // 0') + created=$(echo "$snap" | jq -r '.creationTimestamp // ""') + source_disk=$(echo "$snap" | jq -r '.sourceDisk // ""' | rev | cut -d/ -f1 | rev) + source_inst=$(echo "$snap" | jq -r '.labels["source-instance"] // "manual"') + + if [[ -n "$created" ]]; then + local snap_epoch + snap_epoch=$(date -d "$created" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + local age_days=$(( (now - snap_epoch) / 86400 )) + age_str="${age_days}d" + else + age_str="unknown" + fi + else + age_str="unknown" + fi + + printf " %-40s %-10s %-12s %-16s %s\n" \ + "${name:0:39}" "$size_gb" "$age_str" "${source_disk:0:15}" "${source_inst:0:20}" + done + + echo "" + field "Total snapshots:" "$count" +} + +# ══════════════════════════════════════════════════════════════════════ +# AUDIT +# 
# ══════════════════════════════════════════════════════════════════════
#######################################
# Print a per-instance snapshot coverage table.
#
# For every instance returned by get_all_instances, the snapshots whose
# "source-instance" label equals the instance name are counted and the
# newest one (by creationTimestamp) is located. Each instance gets one row:
#   ✗ Unprotected  no labelled snapshot exists
#   ⚠ Stale        newest snapshot is more than MAX_AGE days old
#   ✓ OK           newest snapshot is within MAX_AGE days
# A snapshot whose timestamp is empty or cannot be parsed by `date -d` is
# shown with age "unknown" and still reported as OK (the same choice
# do_status makes when it counts such instances as protected).
#
# Globals:   MAX_AGE, RED, GREEN, YELLOW, RESET (read)
# Arguments: none
# Outputs:   table on stdout
# Returns:   0 on success; exits via die if either listing call fails
#######################################
do_audit() {
    section_header "Snapshot Audit"

    # The listing helpers discard gcloud's stderr, so a failure here would
    # otherwise either kill the script silently (set -e) or fall through to
    # an empty table. Make the failure explicit instead.
    local instances_json snaps now
    instances_json=$(get_all_instances) || die "Failed to list compute instances"
    snaps=$(list_snapshots) || die "Failed to list snapshots"
    now=$(date +%s)

    printf "  %-24s %-14s %-24s %-8s %-8s %s\n" \
        "INSTANCE" "ZONE" "LATEST_SNAPSHOT" "AGE" "COUNT" "STATUS"
    printf "  %s\n" "$(printf '%.0s─' {1..100})"

    # The loop runs in a pipeline subshell; that is fine because it only
    # prints rows and does not need to hand any state back to the caller.
    jq -c '.[]' <<<"$instances_json" | while IFS= read -r inst; do
        local name zone
        name=$(jq -r '.name' <<<"$inst")
        zone=$(get_instance_zone "$inst")

        # Snapshots are tied to an instance through the label written at
        # creation time, not through the snapshot name.
        local inst_snaps snap_count
        inst_snaps=$(jq --arg inst "$name" \
            '[.[] | select(.labels["source-instance"] == $inst)]' <<<"$snaps")
        snap_count=$(jq 'length' <<<"$inst_snaps")

        if [[ "$snap_count" -eq 0 ]]; then
            printf "  %-24s %-14s %-24s %-8s %-8s %b%s%b\n" \
                "${name:0:23}" "${zone:0:13}" "(none)" "—" "0" \
                "$RED" "✗ Unprotected" "$RESET"
            continue
        fi

        local latest_name latest_date
        latest_name=$(jq -r 'sort_by(.creationTimestamp) | last | .name // ""' <<<"$inst_snaps")
        latest_date=$(jq -r 'sort_by(.creationTimestamp) | last | .creationTimestamp // ""' <<<"$inst_snaps")

        # Defaults cover both "no timestamp" and "timestamp not parseable".
        local age_str="unknown" status="✓ OK" color="$GREEN"
        local snap_epoch=0
        if [[ -n "$latest_date" ]]; then
            snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0)
        fi

        if [[ "$snap_epoch" -gt 0 ]]; then
            local age_days=$(( (now - snap_epoch) / 86400 ))
            age_str="${age_days}d"
            if (( age_days > MAX_AGE )); then
                status="⚠ Stale"
                color="$YELLOW"
            fi
        fi

        printf "  %-24s %-14s %-24s %-8s %-8s %b%s%b\n" \
            "${name:0:23}" "${zone:0:13}" "${latest_name:0:23}" \
            "$age_str" "$snap_count" "$color" "$status" "$RESET"
    done

    echo ""
}

# ══════════════════════════════════════════════════════════════════════
# RESTORE
#
══════════════════════════════════════════════════════════════════════ +do_restore() { + [[ -z "$INSTANCE_NAME" ]] && die "--restore requires --instance NAME" + [[ -z "$SNAPSHOT_NAME" ]] && die "--restore requires --snapshot-name NAME" + [[ -z "$ZONE" ]] && die "--restore requires --zone ZONE" + + section_header "Restore from Snapshot" + field "Instance:" "$INSTANCE_NAME" + field "Snapshot:" "$SNAPSHOT_NAME" + field "Zone:" "$ZONE" + echo "" + + if [[ "$FORCE" != "true" ]]; then + warn "This will stop the instance and replace its boot disk. Use --force to confirm." + return + fi + + log "Creating disk from snapshot..." + local disk_name="restored-${INSTANCE_NAME}-$(date +%Y%m%d-%H%M%S)" + + if gcloud compute disks create "$disk_name" \ + --source-snapshot="$SNAPSHOT_NAME" \ + --zone="$ZONE" \ + --project "$GCP_PROJECT" \ + --quiet 2>/dev/null; then + echo -e " ${GREEN}✓${RESET} Disk created: ${disk_name}" + else + die "Failed to create disk from snapshot" + fi + + log "Stopping instance..." + gcloud compute instances stop "$INSTANCE_NAME" \ + --zone="$ZONE" --project "$GCP_PROJECT" --quiet 2>/dev/null \ + || die "Failed to stop instance" + + local old_disk + old_disk=$(get_boot_disk "$INSTANCE_NAME" "$ZONE") + + log "Detaching old boot disk..." + gcloud compute instances detach-disk "$INSTANCE_NAME" \ + --disk="$old_disk" --zone="$ZONE" --project "$GCP_PROJECT" \ + --quiet 2>/dev/null || die "Failed to detach old disk" + + log "Attaching restored disk..." + gcloud compute instances attach-disk "$INSTANCE_NAME" \ + --disk="$disk_name" --zone="$ZONE" --boot \ + --project "$GCP_PROJECT" --quiet 2>/dev/null \ + || die "Failed to attach restored disk" + + log "Starting instance..." 
+ gcloud compute instances start "$INSTANCE_NAME" \ + --zone="$ZONE" --project "$GCP_PROJECT" --quiet 2>/dev/null + echo -e " ${GREEN}✓${RESET} Instance started with restored disk" +} + +# ══════════════════════════════════════════════════════════════════════ +# STATUS +# ══════════════════════════════════════════════════════════════════════ +do_status() { + local instances_json + instances_json=$(get_all_instances) + local snaps + snaps=$(list_snapshots) + local now + now=$(date +%s) + + local total_instances=0 total_snaps=0 total_gb=0 + local protected=0 stale=0 unprotected=0 + + while IFS= read -r inst; do + [[ -z "$inst" ]] && continue + ((total_instances++)) || true + + local name + name=$(echo "$inst" | jq -r '.name') + + local inst_snaps snap_count + inst_snaps=$(echo "$snaps" | jq --arg inst "$name" \ + '[.[] | select(.labels["source-instance"] == $inst)]') + snap_count=$(echo "$inst_snaps" | jq 'length') + total_snaps=$(( total_snaps + snap_count )) + + local gb + gb=$(echo "$inst_snaps" | jq '[.[].diskSizeGb // 0 | tonumber] | add // 0') + total_gb=$(( total_gb + gb )) + + if [[ "$snap_count" -eq 0 ]]; then + ((unprotected++)) || true + continue + fi + + local latest_date + latest_date=$(echo "$inst_snaps" | jq -r \ + 'sort_by(.creationTimestamp) | last | .creationTimestamp // ""') + + if [[ -n "$latest_date" ]]; then + local snap_epoch + snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + local age_days=$(( (now - snap_epoch) / 86400 )) + if (( age_days > MAX_AGE )); then + ((stale++)) || true + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + done < <(echo "$instances_json" | jq -c '.[]') + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat <${MAX_AGE}d):" "${YELLOW}${stale}${RESET}" + else + field_color "Stale (>${MAX_AGE}d):" "${GREEN}0${RESET}" + fi + if [[ "$unprotected" -gt 0 ]]; then + field_color "Unprotected:" 
"${RED}${unprotected}${RESET}" + else + field_color "Unprotected:" "${GREEN}0${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat </dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "ERROR: Missing required commands: ${missing[*]}" >&2 + echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 + exit 1 + fi +} + +validate_config() { + if [[ -z "$GITEA_URL" ]]; then + echo "ERROR: GITEA_URL environment variable is required" >&2 + exit 1 + fi + if [[ -z "$GITEA_TOKEN" ]]; then + echo "ERROR: GITEA_TOKEN environment variable is required" >&2 + exit 1 + fi + # Strip trailing slash + GITEA_URL="${GITEA_URL%/}" +} + +api_get() { + local endpoint="$1" + curl -sf --max-time "$CURL_TIMEOUT" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_URL}${endpoint}" 2>/dev/null || echo "" +} + +api_get_with_headers() { + local endpoint="$1" + local response + response=$(curl -sD - --max-time "$CURL_TIMEOUT" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_URL}${endpoint}" 2>/dev/null) || { echo ""; return; } + + local headers body + headers=$(echo "$response" | sed '/^\r$/q') + body=$(echo "$response" | sed '1,/^\r$/d') + + local total_count + total_count=$(echo "$headers" | grep -i '^X-Total-Count:' | tr -d '\r' | awk '{print $2}') + + echo "${total_count:-0}" + echo "$body" +} + +sanitize_label() { + local value="$1" + echo "$value" | sed 's/[^a-zA-Z0-9_\/.-]/_/g' +} + +add_metric() { + local name="$1" + local type="$2" + local help="$3" + local value="$4" + local labels="${5:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name}{${labels}} ${value} +" + else + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name} ${value} +" + fi +} + +add_metric_value() { + local name="$1" + local value="$2" + local 
labels="${3:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="${name}{${labels}} ${value} +" + else + OUTPUT+="${name} ${value} +" + fi +} + +collect_version() { + local version_json + version_json=$(api_get "/api/v1/version") + + if [[ -z "$version_json" ]]; then + add_metric "gitea_up" "gauge" "Gitea reachability (1=up, 0=down)" "0" + return 1 + fi + + add_metric "gitea_up" "gauge" "Gitea reachability (1=up, 0=down)" "1" + + local version + version=$(echo "$version_json" | jq -r '.version // empty' 2>/dev/null) + + if [[ -n "$version" ]]; then + add_metric "gitea_version_info" "gauge" "Gitea/Forgejo version" "1" "version=\"${version}\"" + fi + + return 0 +} + +collect_users() { + local response + response=$(api_get_with_headers "/api/v1/admin/users?limit=1") + + if [[ -z "$response" ]]; then + return + fi + + local total_count + total_count=$(echo "$response" | head -1) + + if [[ -n "$total_count" && "$total_count" != "0" ]]; then + add_metric "gitea_users_total" "gauge" "Total number of users" "$total_count" + fi +} + +collect_organizations() { + local response + response=$(api_get_with_headers "/api/v1/admin/orgs?limit=1") + + if [[ -z "$response" ]]; then + return + fi + + local total_count + total_count=$(echo "$response" | head -1) + + if [[ -n "$total_count" ]]; then + add_metric "gitea_organizations_total" "gauge" "Total number of organizations" "$total_count" + fi +} + +collect_repositories() { + local response + response=$(api_get_with_headers "/api/v1/repos/search?limit=1") + + if [[ -z "$response" ]]; then + return + fi + + local total_count + total_count=$(echo "$response" | head -1) + + if [[ -n "$total_count" ]]; then + add_metric "gitea_repositories_total" "gauge" "Total number of repositories" "$total_count" + fi +} + +collect_repo_details() { + local page=1 + local per_page=50 + local collected=0 + local first_page=true + + # Add HELP/TYPE lines for per-repo metrics + OUTPUT+="# HELP gitea_repo_stars Number of stars for the repository +# TYPE 
gitea_repo_stars gauge +# HELP gitea_repo_forks Number of forks for the repository +# TYPE gitea_repo_forks gauge +# HELP gitea_repo_open_issues Number of open issues for the repository +# TYPE gitea_repo_open_issues gauge +# HELP gitea_repo_open_pull_requests Number of open pull requests for the repository +# TYPE gitea_repo_open_pull_requests gauge +# HELP gitea_repo_size_bytes Repository size in bytes +# TYPE gitea_repo_size_bytes gauge +# HELP gitea_repo_is_mirror Whether the repository is a mirror (1=yes, 0=no) +# TYPE gitea_repo_is_mirror gauge +" + + while [[ $collected -lt $MAX_REPOS ]]; do + local remaining=$((MAX_REPOS - collected)) + local fetch_count=$((remaining < per_page ? remaining : per_page)) + + local repos_json + repos_json=$(api_get "/api/v1/repos/search?limit=${fetch_count}&page=${page}") + + if [[ -z "$repos_json" ]]; then + break + fi + + local repo_count + repo_count=$(echo "$repos_json" | jq -r '.data | length // 0' 2>/dev/null) + + if [[ "$repo_count" == "0" || -z "$repo_count" ]]; then + break + fi + + local i + for ((i = 0; i < repo_count && collected < MAX_REPOS; i++)); do + local full_name stars forks open_issues size mirror has_pull_requests + full_name=$(echo "$repos_json" | jq -r ".data[$i].full_name // empty" 2>/dev/null) + stars=$(echo "$repos_json" | jq -r ".data[$i].stars_count // 0" 2>/dev/null) + forks=$(echo "$repos_json" | jq -r ".data[$i].forks_count // 0" 2>/dev/null) + open_issues=$(echo "$repos_json" | jq -r ".data[$i].open_issues_count // 0" 2>/dev/null) + size=$(echo "$repos_json" | jq -r ".data[$i].size // 0" 2>/dev/null) + mirror=$(echo "$repos_json" | jq -r ".data[$i].mirror // false" 2>/dev/null) + has_pull_requests=$(echo "$repos_json" | jq -r ".data[$i].has_pull_requests // true" 2>/dev/null) + + if [[ -z "$full_name" ]]; then + continue + fi + + local safe_name + safe_name=$(sanitize_label "$full_name") + local label="repo=\"${safe_name}\"" + + # Size: API returns KB, convert to bytes + local size_bytes=$((size 
* 1024)) + + # Mirror: convert bool to 0/1 + local mirror_val=0 + if [[ "$mirror" == "true" ]]; then + mirror_val=1 + fi + + # Open PRs: fetch from repo API if pull requests are enabled + local open_prs=0 + if [[ "$has_pull_requests" == "true" ]]; then + local owner repo_name + owner=$(echo "$repos_json" | jq -r ".data[$i].owner.login // empty" 2>/dev/null) + repo_name=$(echo "$repos_json" | jq -r ".data[$i].name // empty" 2>/dev/null) + if [[ -n "$owner" && -n "$repo_name" ]]; then + local pr_response + pr_response=$(api_get_with_headers "/api/v1/repos/${owner}/${repo_name}/pulls?state=open&limit=1") + if [[ -n "$pr_response" ]]; then + open_prs=$(echo "$pr_response" | head -1) + fi + fi + fi + + add_metric_value "gitea_repo_stars" "$stars" "$label" + add_metric_value "gitea_repo_forks" "$forks" "$label" + add_metric_value "gitea_repo_open_issues" "$open_issues" "$label" + add_metric_value "gitea_repo_open_pull_requests" "${open_prs:-0}" "$label" + add_metric_value "gitea_repo_size_bytes" "$size_bytes" "$label" + add_metric_value "gitea_repo_is_mirror" "$mirror_val" "$label" + + collected=$((collected + 1)) + done + + # If we got fewer than requested, we've reached the end + if [[ $repo_count -lt $fetch_count ]]; then + break + fi + + page=$((page + 1)) + done +} + +collect_runners() { + local runners_json + runners_json=$(api_get "/api/v1/admin/runners") + + # Runner endpoint may 404 if Actions is not enabled — skip gracefully + if [[ -z "$runners_json" ]]; then + return + fi + + # Validate we got a JSON array + local is_array + is_array=$(echo "$runners_json" | jq -r 'if type == "array" then "yes" else "no" end' 2>/dev/null) + + if [[ "$is_array" != "yes" ]]; then + return + fi + + local total online offline + total=$(echo "$runners_json" | jq 'length' 2>/dev/null) + online=$(echo "$runners_json" | jq '[.[] | select(.status == "online")] | length' 2>/dev/null) + offline=$(echo "$runners_json" | jq '[.[] | select(.status != "online")] | length' 2>/dev/null) + + 
add_metric "gitea_runners_total" "gauge" "Total number of registered runners" "${total:-0}" + add_metric "gitea_runners_online" "gauge" "Number of online runners" "${online:-0}" + add_metric "gitea_runners_offline" "gauge" "Number of offline runners" "${offline:-0}" +} + +write_output() { + if [[ "$TEXTFILE_MODE" == true ]]; then + local output_file="${TEXTFILE_DIR}/gitea.prom" + local temp_file="${output_file}.$$" + + mkdir -p "$TEXTFILE_DIR" + echo "$OUTPUT" > "$temp_file" + mv "$temp_file" "$output_file" + else + echo "$OUTPUT" + fi +} + +install_cron() { + if [[ $EUID -ne 0 ]]; then + echo "ERROR: --install requires root" >&2 + exit 1 + fi + + local script_path + script_path=$(readlink -f "$0") + + cat > /etc/cron.d/gitea-exporter </dev/null +EOF + + chmod 644 /etc/cron.d/gitea-exporter + echo "Installed cron job: /etc/cron.d/gitea-exporter" + echo "Metrics will be written to: ${TEXTFILE_DIR}/gitea.prom" +} + +# --- Main --- + +main() { + # Parse arguments + for arg in "$@"; do + case "$arg" in + --textfile) TEXTFILE_MODE=true ;; + --install) + check_dependencies + validate_config + install_cron + exit 0 + ;; + --help|-h) usage ;; + *) echo "Unknown option: $arg" >&2; usage ;; + esac + done + + check_dependencies + validate_config + + START_TIME=$(date +%s%N) + + # Exporter info + add_metric "gitea_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" + + # Collect metrics + if collect_version; then + collect_users + collect_organizations + collect_repositories + collect_repo_details + collect_runners + fi + + # Exporter performance + local end_time duration + end_time=$(date +%s%N) + duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") + add_metric "gitea_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration" + add_metric "gitea_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" + + write_output +} + +main "$@" diff --git 
a/gitlab-docker-register-runner.sh b/gitlab-docker-register-runner.sh new file mode 100644 index 0000000..d46ff87 --- /dev/null +++ b/gitlab-docker-register-runner.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +############################################################################### +# register-runner.sh — GitLab Runner registration helper +# +# Registers the Docker-based GitLab Runner against your GitLab CE instance. +# Designed to work with the gitlab-docker-compose.yml stack. +# +# Usage: +# ./register-runner.sh +# ./register-runner.sh glrt-xxxxxxxxxxxxxxxxxxxx +# +# The runner token is obtained from: +# Admin Area → CI/CD → Runners → New instance runner → Create runner +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# License: MIT +############################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +GITLAB_HOSTNAME="${GITLAB_HOSTNAME:-gitlab.local}" +RUNNER_CONTAINER="${RUNNER_CONTAINER:-gitlab-runner}" +RUNNER_EXECUTOR="${RUNNER_EXECUTOR:-docker}" +RUNNER_IMAGE="${RUNNER_IMAGE:-alpine:latest}" +RUNNER_DESCRIPTION="${RUNNER_DESCRIPTION:-docker-runner}" +RUNNER_TAGS="${RUNNER_TAGS:-docker,linux}" + +# ── Colors ──────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BOLD='\033[1m' + RESET='\033[0m' +else + RED="" GREEN="" YELLOW="" BOLD="" RESET="" +fi + +log() { echo -e "${GREEN}[OK]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } + +# ── Validation ──────────────────────────────────────────────────────── +if [[ $# -lt 1 ]]; then + echo -e "${BOLD}Usage:${RESET} $(basename "$0") " + echo "" + echo "Get the token from: Admin Area → CI/CD → Runners → New instance runner" + echo "" + echo "Environment variables:" + echo " GITLAB_HOSTNAME GitLab server hostname (default: 
gitlab.local)" + echo " RUNNER_CONTAINER Runner container name (default: gitlab-runner)" + echo " RUNNER_EXECUTOR Executor type (default: docker)" + echo " RUNNER_IMAGE Default CI image (default: alpine:latest)" + echo " RUNNER_DESCRIPTION Runner description (default: docker-runner)" + echo " RUNNER_TAGS Comma-separated tags (default: docker,linux)" + exit 1 +fi + +RUNNER_TOKEN="$1" + +# Verify the runner container is running +if ! docker inspect "$RUNNER_CONTAINER" &>/dev/null; then + err "Container '${RUNNER_CONTAINER}' not found. Is the stack running?" + err "Run: docker compose up -d" + exit 1 +fi + +if [[ "$(docker inspect -f '{{.State.Running}}' "$RUNNER_CONTAINER" 2>/dev/null)" != "true" ]]; then + err "Container '${RUNNER_CONTAINER}' is not running." + exit 1 +fi + +# ── Register ────────────────────────────────────────────────────────── +echo -e "${BOLD}Registering GitLab Runner...${RESET}" +echo " GitLab URL: https://${GITLAB_HOSTNAME}" +echo " Executor: ${RUNNER_EXECUTOR}" +echo " Default image: ${RUNNER_IMAGE}" +echo " Tags: ${RUNNER_TAGS}" +echo " Description: ${RUNNER_DESCRIPTION}" +echo "" + +docker exec "$RUNNER_CONTAINER" gitlab-runner register \ + --non-interactive \ + --url "https://${GITLAB_HOSTNAME}" \ + --token "$RUNNER_TOKEN" \ + --executor "$RUNNER_EXECUTOR" \ + --docker-image "$RUNNER_IMAGE" \ + --description "$RUNNER_DESCRIPTION" \ + --tag-list "$RUNNER_TAGS" \ + --docker-network-mode "gitlab-net" \ + --docker-volumes "/var/run/docker.sock:/var/run/docker.sock" + +echo "" + +# ── Verify ──────────────────────────────────────────────────────────── +if docker exec "$RUNNER_CONTAINER" gitlab-runner list 2>&1 | grep -q "$RUNNER_DESCRIPTION"; then + log "Runner registered successfully." + echo "" + docker exec "$RUNNER_CONTAINER" gitlab-runner list +else + warn "Registration completed but runner not found in list. 
Check logs:" + echo " docker compose logs gitlab-runner" +fi diff --git a/gitlab-metrics-exporter.sh b/gitlab-metrics-exporter.sh index 5b47027..b2e7c3b 100755 --- a/gitlab-metrics-exporter.sh +++ b/gitlab-metrics-exporter.sh @@ -6,7 +6,7 @@ #### #### #### Author: Phil Connor #### #### Contact: contact@mylinux.work #### -#### Version: 1.00-030426 #### +#### Version: 1.01-210426 #### ################################################ set -o pipefail @@ -557,13 +557,13 @@ collect_local_metrics() { # GitLab version info local version_patterns="^gitlab_version_info[{ ]" local version_help="^# (HELP|TYPE) gitlab_version_info" - metrics+=$(echo "$raw_metrics" | grep -E "$version_help|$version_patterns" 2>/dev/null) + metrics+=$(echo "$raw_metrics" | grep -E "$version_help|$version_patterns" 2>/dev/null || true) metrics+=$'\n' # Puma metrics local puma_patterns="^puma_workers[{ ]|^puma_running_workers[{ ]|^puma_running[{ ]|^puma_queued_connections[{ ]|^puma_active_connections[{ ]|^puma_pool_capacity[{ ]|^puma_max_threads[{ ]|^puma_idle_threads[{ ]" local puma_help="^# (HELP|TYPE) puma_" - metrics+=$(echo "$raw_metrics" | grep -E "$puma_help|$puma_patterns" 2>/dev/null) + metrics+=$(echo "$raw_metrics" | grep -E "$puma_help|$puma_patterns" 2>/dev/null || true) metrics+=$'\n' # Sidekiq metrics (served by separate Sidekiq exporter, default localhost:8082) @@ -574,37 +574,37 @@ collect_local_metrics() { # Core Sidekiq job metrics local sidekiq_patterns="^sidekiq_running_jobs[{ ]|^sidekiq_concurrency[{ ]|^sidekiq_mem_total_bytes[{ ]|^sidekiq_jobs_failed_total[{ ]|^sidekiq_jobs_dead_total[{ ]|^sidekiq_enqueued_jobs_total[{ ]|^sidekiq_jobs_completion_seconds[_{ ]|^sidekiq_jobs_queue_duration_seconds[_{ ]|^sidekiq_jobs_cpu_seconds[_{ ]|^sidekiq_jobs_db_seconds[_{ ]|^sidekiq_jobs_gitaly_seconds[_{ ]|^sidekiq_redis_requests_total[{ ]|^sidekiq_redis_requests_duration_seconds[_{ ]" local sidekiq_help="^# (HELP|TYPE) 
sidekiq_(running_jobs|concurrency|mem_total_bytes|jobs_failed_total|jobs_dead_total|enqueued_jobs_total|jobs_completion_seconds|jobs_queue_duration_seconds|jobs_cpu_seconds|jobs_db_seconds|jobs_gitaly_seconds|redis_requests_total|redis_requests_duration_seconds)" - metrics+=$(echo "$sidekiq_raw" | grep -E "$sidekiq_help|$sidekiq_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$sidekiq_help|$sidekiq_patterns" 2>/dev/null || true) metrics+=$'\n' # CI/CD pipeline internals local ci_patterns="^pipelines_created_total[{ ]|^deployments[{ ]|^gitlab_ci_pipeline_creation_duration_seconds[_{ ]|^gitlab_ci_pipeline_failure_reasons[{ ]|^gitlab_ci_active_jobs[_{ ]" local ci_help="^# (HELP|TYPE) (pipelines_created_total|deployments|gitlab_ci_pipeline_creation_duration_seconds|gitlab_ci_pipeline_failure_reasons|gitlab_ci_active_jobs)" - metrics+=$(echo "$sidekiq_raw" | grep -E "$ci_help|$ci_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$ci_help|$ci_patterns" 2>/dev/null || true) metrics+=$'\n' # Email delivery metrics local email_patterns="^gitlab_emails_delivered_total[{ ]|^gitlab_emails_delivery_attempts_total[{ ]" local email_help="^# (HELP|TYPE) gitlab_emails_(delivered_total|delivery_attempts_total)" - metrics+=$(echo "$sidekiq_raw" | grep -E "$email_help|$email_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$email_help|$email_patterns" 2>/dev/null || true) metrics+=$'\n' # External HTTP (webhooks, integrations) local ext_http_patterns="^gitlab_external_http_total[{ ]|^gitlab_external_http_duration_seconds[_{ ]" local ext_http_help="^# (HELP|TYPE) gitlab_external_http_(total|duration_seconds)" - metrics+=$(echo "$sidekiq_raw" | grep -E "$ext_http_help|$ext_http_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$ext_http_help|$ext_http_patterns" 2>/dev/null || true) metrics+=$'\n' # Sidekiq SLI apdex/errors local sli_patterns="^gitlab_sli_sidekiq_execution_apdex_success_total[{ 
]|^gitlab_sli_sidekiq_execution_apdex_total[{ ]|^gitlab_sli_sidekiq_execution_error_total[{ ]|^gitlab_sli_sidekiq_execution_total[{ ]" local sli_help="^# (HELP|TYPE) gitlab_sli_sidekiq_execution" - metrics+=$(echo "$sidekiq_raw" | grep -E "$sli_help|$sli_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$sli_help|$sli_patterns" 2>/dev/null || true) metrics+=$'\n' # DB transaction duration, primary SQL, threads, cache, workers local extra_patterns="^gitlab_database_transaction_seconds[_{ ]|^gitlab_sql_primary_duration_seconds[_{ ]|^gitlab_ruby_threads_running_threads[{ ]|^gitlab_ruby_threads_max_expected_threads[{ ]|^limited_capacity_worker_running_jobs[{ ]|^limited_capacity_worker_max_running_jobs[{ ]|^limited_capacity_worker_remaining_work_count[{ ]|^redis_hit_miss_operations_total[{ ]" local extra_help="^# (HELP|TYPE) (gitlab_database_transaction_seconds|gitlab_sql_primary_duration_seconds|gitlab_ruby_threads_running_threads|gitlab_ruby_threads_max_expected_threads|limited_capacity_worker_running_jobs|limited_capacity_worker_max_running_jobs|limited_capacity_worker_remaining_work_count|redis_hit_miss_operations_total)" - metrics+=$(echo "$sidekiq_raw" | grep -E "$extra_help|$extra_patterns" 2>/dev/null) + metrics+=$(echo "$sidekiq_raw" | grep -E "$extra_help|$extra_patterns" 2>/dev/null || true) metrics+=$'\n' else debug_echo "Warning: Could not scrape Sidekiq exporter at $GITLAB_SIDEKIQ_URL (is sidekiq_exporter enabled?)" @@ -613,31 +613,31 @@ collect_local_metrics() { # Redis metrics local redis_patterns="^gitlab_redis_client_requests_total[{ ]|^gitlab_redis_client_exceptions_total[{ ]|^gitlab_redis_client_requests_duration_seconds[_{ ]|^gitlab_redis_client_requests_duration_seconds_sum[{ ]|^gitlab_redis_client_requests_duration_seconds_count[{ ]" local redis_help="^# (HELP|TYPE) gitlab_redis_client_(requests_total|exceptions_total|requests_duration_seconds)" - metrics+=$(echo "$raw_metrics" | grep -E "$redis_help|$redis_patterns" 2>/dev/null) 
+ metrics+=$(echo "$raw_metrics" | grep -E "$redis_help|$redis_patterns" 2>/dev/null || true) metrics+=$'\n' # Database connection pool metrics local db_patterns="^gitlab_database_connection_pool_" local db_help="^# (HELP|TYPE) gitlab_database_connection_pool_" - metrics+=$(echo "$raw_metrics" | grep -E "$db_help|$db_patterns" 2>/dev/null) + metrics+=$(echo "$raw_metrics" | grep -E "$db_help|$db_patterns" 2>/dev/null || true) metrics+=$'\n' # Process metrics (CPU, memory, file descriptors) local process_patterns="^ruby_process_resident_memory_bytes[{ ]|^ruby_process_cpu_seconds_total[{ ]|^process_open_fds[{ ]|^process_max_fds[{ ]|^ruby_gc_stat_heap_live_slots[{ ]|^ruby_gc_stat_heap_free_slots[{ ]" local process_help="^# (HELP|TYPE) (ruby_process_resident_memory_bytes|ruby_process_cpu_seconds_total|process_open_fds|process_max_fds|ruby_gc_stat_heap_live_slots|ruby_gc_stat_heap_free_slots)" - metrics+=$(echo "$raw_metrics" | grep -E "$process_help|$process_patterns" 2>/dev/null) + metrics+=$(echo "$raw_metrics" | grep -E "$process_help|$process_patterns" 2>/dev/null || true) metrics+=$'\n' # GitLab transaction/request metrics local txn_patterns="^gitlab_transaction_duration_seconds[{ _]|^gitlab_sql_duration_seconds[{ _]|^gitlab_cache_operation_duration_seconds[{ _]" local txn_help="^# (HELP|TYPE) (gitlab_transaction_duration_seconds|gitlab_sql_duration_seconds|gitlab_cache_operation_duration_seconds)" - metrics+=$(echo "$raw_metrics" | grep -E "$txn_help|$txn_patterns" 2>/dev/null) + metrics+=$(echo "$raw_metrics" | grep -E "$txn_help|$txn_patterns" 2>/dev/null || true) metrics+=$'\n' # User session and ActionCable metrics local session_patterns="^user_session_logins_total[{ ]|^action_cable_active_connections[{ ]|^action_cable_pool_current_size[{ ]" local session_help="^# (HELP|TYPE) (user_session_logins_total|action_cable_active_connections|action_cable_pool_current_size)" - metrics+=$(echo "$raw_metrics" | grep -E "$session_help|$session_patterns" 2>/dev/null) + 
metrics+=$(echo "$raw_metrics" | grep -E "$session_help|$session_patterns" 2>/dev/null || true) metrics+=$'\n' local metric_count diff --git a/gitlab-smoke-tests.ps1 b/gitlab-smoke-tests.ps1 new file mode 100644 index 0000000..735b1d7 --- /dev/null +++ b/gitlab-smoke-tests.ps1 @@ -0,0 +1,864 @@ +############################################################################### +# gitlab-smoke-tests.ps1 - Verify GitLab instance health after upgrades +# +# PowerShell port of gitlab-smoke-tests.sh. Zero external dependencies +# beyond PowerShell 5.1+ and git. Runs on Windows, Linux, and macOS. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# License: MIT +# Version 1.00 +# +# Usage: +# $env:GITLAB_URL = "https://gitlab.example.com" +# $env:GITLAB_TOKEN = "glpat-xxxxxxxxxxxx" +# .\gitlab-smoke-tests.ps1 +# .\gitlab-smoke-tests.ps1 -SkipGit -SkipRegistry +# .\gitlab-smoke-tests.ps1 -Insecure -Format junit +# .\gitlab-smoke-tests.ps1 -Format tap +############################################################################### + +[CmdletBinding()] +param( + [string]$GitLabUrl = $env:GITLAB_URL, + [string]$GitLabToken = $env:GITLAB_TOKEN, + [string]$GitLabUser = $(if ($env:GITLAB_USER) { $env:GITLAB_USER } else { "root" }), + [string]$HealthToken = $env:GITLAB_HEALTH_TOKEN, + [string]$ProjectPrefix = $(if ($env:SMOKE_PROJECT_PREFIX) { $env:SMOKE_PROJECT_PREFIX } else { "smoke-test" }), + [int]$Timeout = $(if ($env:CURL_TIMEOUT) { [int]$env:CURL_TIMEOUT } else { 10 }), + [switch]$Insecure, + [switch]$SkipGit, + [switch]$SkipRegistry, + [switch]$SkipCleanup, + [ValidateSet("text","tap","junit")] + [string]$Format = "text", + [string]$JunitFile = "smoke-results.xml", + [switch]$NoColor +) + +$ErrorActionPreference = "Continue" + +# ============================================================================ +# STATE +# ============================================================================ + +$script:Pass = 0 +$script:Fail = 0 +$script:Skip = 0 +$script:Total = 
0 +$script:Results = @() +$script:CleanupProjectId = "" +$script:TmpDir = "" +$script:StartTime = $null +$script:GitCloneOk = $false + +# ============================================================================ +# COLORS +# ============================================================================ + +function Write-Color { + param([string]$Text, [string]$Color = "White") + if ($NoColor) { + Write-Host $Text + } else { + Write-Host $Text -ForegroundColor $Color + } +} + +function Write-Log { param([string]$Msg) Write-Color "[INFO] $Msg" "Cyan" } +function Write-Warn { param([string]$Msg) Write-Color "[WARN] $Msg" "Yellow" } +function Write-Err { param([string]$Msg) Write-Color "[ERROR] $Msg" "Red" } + +# ============================================================================ +# TEST RESULT RECORDING +# ============================================================================ + +function Record-Pass { + param([string]$Name, [string]$Detail = "") + $script:Pass++ + $script:Total++ + $script:Results += [PSCustomObject]@{ Status="PASS"; Name=$Name; Detail=$Detail } + if ($Format -eq "tap") { + Write-Host "ok $($script:Total) - $Name" + } else { + $msg = " $(if($NoColor){'[PASS]'}else{[char]0x2713}) $Name" + if ($Detail) { $msg += " - $Detail" } + Write-Color $msg "Green" + } +} + +function Record-Fail { + param([string]$Name, [string]$Detail = "") + $script:Fail++ + $script:Total++ + $script:Results += [PSCustomObject]@{ Status="FAIL"; Name=$Name; Detail=$Detail } + if ($Format -eq "tap") { + Write-Host "not ok $($script:Total) - $Name" + if ($Detail) { Write-Host " # $Detail" } + } else { + $msg = " $(if($NoColor){'[FAIL]'}else{[char]0x2717}) $Name" + if ($Detail) { $msg += " - $Detail" } + Write-Color $msg "Red" + } +} + +function Record-Skip { + param([string]$Name, [string]$Reason = "") + $script:Skip++ + $script:Total++ + $script:Results += [PSCustomObject]@{ Status="SKIP"; Name=$Name; Detail=$Reason } + if ($Format -eq "tap") { + Write-Host "ok 
$($script:Total) - $Name # SKIP $Reason" + } else { + $msg = " $(if($NoColor){'[SKIP]'}else{[char]0x2298}) $Name" + if ($Reason) { $msg += " - $Reason" } + Write-Color $msg "Yellow" + } +} + +# ============================================================================ +# HTTP HELPERS +# ============================================================================ + +function Invoke-GitLabApi { + param( + [string]$Method, + [string]$Endpoint, + [string]$Body = $null, + [switch]$StatusOnly + ) + + $uri = "$GitLabUrl/api/v4$Endpoint" + $headers = @{ "Content-Type" = "application/json" } + if ($GitLabToken) { $headers["PRIVATE-TOKEN"] = $GitLabToken } + + $params = @{ + Uri = $uri + Method = $Method + Headers = $headers + TimeoutSec = $Timeout + UseBasicParsing = $true + ErrorAction = "Stop" + } + + if ($Insecure -and $PSVersionTable.PSVersion.Major -ge 7) { + $params["SkipCertificateCheck"] = $true + } + + if ($Body) { + $params["Body"] = $Body + } + + try { + if ($StatusOnly) { + $response = Invoke-WebRequest @params + return [int]$response.StatusCode + } else { + return Invoke-RestMethod @params + } + } catch { + if ($StatusOnly) { + if ($_.Exception.Response) { + return [int]$_.Exception.Response.StatusCode + } + return 0 + } + return $null + } +} + +function Invoke-HealthCheck { + param([string]$Path) + + $uri = "$GitLabUrl$Path" + if ($HealthToken) { $uri += "?token=$HealthToken" } + + $params = @{ + Uri = $uri + Method = "GET" + TimeoutSec = $Timeout + UseBasicParsing = $true + ErrorAction = "Stop" + } + + if ($Insecure -and $PSVersionTable.PSVersion.Major -ge 7) { + $params["SkipCertificateCheck"] = $true + } + + try { + $response = Invoke-WebRequest @params + return [int]$response.StatusCode + } catch { + if ($_.Exception.Response) { + return [int]$_.Exception.Response.StatusCode + } + return 0 + } +} + +# ============================================================================ +# TLS HELPER +# 
============================================================================ + +function Get-TlsCertExpiry { + param([string]$HostName, [int]$Port = 443) + + try { + $tcpClient = New-Object System.Net.Sockets.TcpClient + $tcpClient.ReceiveTimeout = $Timeout * 1000 + $tcpClient.SendTimeout = $Timeout * 1000 + $tcpClient.Connect($HostName, $Port) + + $sslStream = New-Object System.Net.Security.SslStream( + $tcpClient.GetStream(), $false, + { param($s,$c,$ch,$e) return $true } + ) + $sslStream.AuthenticateAsClient($HostName) + + $cert = $sslStream.RemoteCertificate + $expiry = [DateTime]$cert.GetExpirationDateString() + + $sslStream.Dispose() + $tcpClient.Dispose() + + return $expiry + } catch { + return $null + } +} + +# ============================================================================ +# TEST SUITES +# ============================================================================ + +# -- 1. Connectivity -------------------------------------------------------- + +function Test-Connectivity { + Write-Host "" + Write-Color "Connectivity" "White" + + # 1a. Health endpoint + $code = Invoke-HealthCheck "/-/health" + if ($code -eq 200) { + Record-Pass "GitLab health endpoint reachable" "HTTP $code" + } else { + Record-Fail "GitLab health endpoint reachable" "HTTP $code" + } + + # 1b. Readiness + $code = Invoke-HealthCheck "/-/readiness" + if ($code -eq 200) { + Record-Pass "GitLab readiness check" "HTTP $code" + } else { + Record-Fail "GitLab readiness check" "HTTP $code" + } + + # 1c. Liveness + $code = Invoke-HealthCheck "/-/liveness" + if ($code -eq 200) { + Record-Pass "GitLab liveness check" "HTTP $code" + } else { + Record-Fail "GitLab liveness check" "HTTP $code" + } + + # 1d. 
TLS certificate + if ($GitLabUrl -match "^https://") { + $hostPart = $GitLabUrl -replace "^https://", "" -replace "/.*", "" -replace ":.*", "" + $portPart = 443 + if ($GitLabUrl -match ":(\d+)") { $portPart = [int]$Matches[1] } + + $expiry = Get-TlsCertExpiry -HostName $hostPart -Port $portPart + if ($expiry) { + $daysLeft = [math]::Floor(($expiry - (Get-Date)).TotalDays) + if ($daysLeft -gt 30) { + Record-Pass "TLS certificate valid" "$daysLeft days remaining" + } elseif ($daysLeft -gt 0) { + Record-Pass "TLS certificate valid" "$daysLeft days remaining (renew soon)" + } else { + Record-Fail "TLS certificate valid" "expired or expiring in $daysLeft days" + } + } else { + Record-Skip "TLS certificate check" "could not retrieve certificate" + } + } else { + Record-Skip "TLS certificate check" "not using HTTPS" + } +} + +# -- 2. API ---------------------------------------------------------------- + +function Test-Api { + Write-Host "" + Write-Color "API" "White" + + # 2a. Version + $versionData = Invoke-GitLabApi -Method GET -Endpoint "/version" + if ($versionData -and $versionData.version) { + Record-Pass "API version endpoint" "GitLab $($versionData.version) ($($versionData.revision))" + } else { + Record-Fail "API version endpoint" "no version returned" + } + + # 2b. Authentication + $authStatus = Invoke-GitLabApi -Method GET -Endpoint "/user" -StatusOnly + if ($authStatus -eq 200) { + $userData = Invoke-GitLabApi -Method GET -Endpoint "/user" + Record-Pass "API authentication" "authenticated as $($userData.username)" + } elseif ($authStatus -eq 401) { + Record-Fail "API authentication" "token rejected (HTTP 401)" + } else { + Record-Fail "API authentication" "HTTP $authStatus" + } + + # 2c. List projects + $projStatus = Invoke-GitLabApi -Method GET -Endpoint "/projects?per_page=1" -StatusOnly + if ($projStatus -eq 200) { + Record-Pass "API list projects" "database responding" + } else { + Record-Fail "API list projects" "HTTP $projStatus" + } + + # 2d. 
List users + $userStatus = Invoke-GitLabApi -Method GET -Endpoint "/users?per_page=1" -StatusOnly + if ($userStatus -eq 200) { + Record-Pass "API list users" "user directory accessible" + } else { + Record-Fail "API list users" "HTTP $userStatus" + } + + # 2e. Sidekiq + $sidekiq = Invoke-GitLabApi -Method GET -Endpoint "/sidekiq/compound_metrics" + if ($sidekiq -and -not $sidekiq.error) { + $procCount = 0 + if ($sidekiq.processes) { $procCount = @($sidekiq.processes).Count } + Record-Pass "Sidekiq running" "$procCount process(es) responding" + } else { + Record-Fail "Sidekiq running" "could not query Sidekiq metrics" + } + + # 2f. Runners + $runnerStatus = Invoke-GitLabApi -Method GET -Endpoint "/runners/all?per_page=1" -StatusOnly + if ($runnerStatus -eq 200) { + Record-Pass "API runners endpoint" "runner management accessible" + } elseif ($runnerStatus -eq 403) { + Record-Skip "API runners endpoint" "token lacks admin scope" + } else { + Record-Fail "API runners endpoint" "HTTP $runnerStatus" + } + + # 2g. Search + $searchStatus = Invoke-GitLabApi -Method GET -Endpoint "/search?scope=projects&search=test" -StatusOnly + if ($searchStatus -eq 200) { + Record-Pass "API search" "search index responding" + } elseif ($searchStatus -eq 403) { + Record-Skip "API search" "search disabled or token lacks scope" + } else { + Record-Fail "API search" "HTTP $searchStatus" + } +} + +# -- 3. 
Git Operations ----------------------------------------------------- + +function Test-Git { + if ($SkipGit) { + Write-Host "" + Write-Color "Git Operations" "White" + Record-Skip "Git clone" "SkipGit specified" + Record-Skip "Git push" "SkipGit specified" + return + } + + Write-Host "" + Write-Color "Git Operations" "White" + + # Create test project + $projectName = "$ProjectPrefix-$([DateTimeOffset]::UtcNow.ToUnixTimeSeconds())" + $body = @{ name = $projectName; visibility = "private"; initialize_with_readme = $true } | ConvertTo-Json + $project = Invoke-GitLabApi -Method POST -Endpoint "/projects" -Body $body + + if (-not $project -or -not $project.id) { + Record-Fail "Create test project" "API returned no project ID" + Record-Skip "Git clone" "no test project" + Record-Skip "Git push" "no test project" + return + } + + $script:CleanupProjectId = $project.id + Record-Pass "Create test project" "$projectName (ID: $($project.id))" + + # Build clone URL + $httpUrl = $project.http_url_to_repo + if (-not $httpUrl) { + $httpUrl = "$GitLabUrl/$GitLabUser/$projectName.git" + } + + # Rewrite origin if API returns an internal hostname + $apiOrigin = if ($httpUrl -match "^(https?://[^/]+)") { $Matches[1] } else { "" } + if ($apiOrigin -and $apiOrigin -ne $GitLabUrl) { + $httpUrl = $httpUrl -replace [regex]::Escape($apiOrigin), $GitLabUrl + } + + # Inject token + if ($httpUrl -match "^https://") { + $cloneUrl = $httpUrl -replace "^https://", "https://oauth2:${GitLabToken}@" + } elseif ($httpUrl -match "^http://") { + $cloneUrl = $httpUrl -replace "^http://", "http://oauth2:${GitLabToken}@" + } else { + $cloneUrl = $httpUrl + } + + # Temp directory + $script:TmpDir = Join-Path ([System.IO.Path]::GetTempPath()) "gitlab-smoke-$([guid]::NewGuid().ToString('N').Substring(0,8))" + New-Item -ItemType Directory -Path $script:TmpDir -Force | Out-Null + + # Wait for repo init + Start-Sleep -Seconds 2 + + # Clone + $gitArgs = @("clone") + if ($Insecure) { $env:GIT_SSL_NO_VERIFY = 
"true" } + + $repoDir = Join-Path $script:TmpDir "repo" + $cloneOutput = & git clone $cloneUrl $repoDir 2>&1 + $cloneRc = $LASTEXITCODE + + if ($cloneRc -eq 0) { + $script:GitCloneOk = $true + Record-Pass "Git clone (HTTPS)" "Gitaly responding" + } else { + $shortErr = ($cloneOutput | Select-String -Pattern "fatal|error" | Select-Object -First 1) -replace [regex]::Escape($GitLabToken), "[REDACTED]" + Record-Fail "Git clone (HTTPS)" "$shortErr" + return + } + + # Push + Push-Location $repoDir + try { + & git config user.email "smoke-test@example.com" + & git config user.name "Smoke Test" + "smoke test $(Get-Date -Format 'yyyy-MM-ddTHH:mm:ssZ')" | Out-File -FilePath "smoke-test.txt" -Encoding utf8 + + & git add smoke-test.txt + & git commit -m "smoke test commit" 2>&1 | Out-Null + + $pushOutput = & git push origin main 2>&1 + $pushRc = $LASTEXITCODE + if ($pushRc -ne 0) { + $pushOutput = & git push origin master 2>&1 + $pushRc = $LASTEXITCODE + } + + if ($pushRc -eq 0) { + Record-Pass "Git push (HTTPS)" "write to Gitaly succeeded" + } else { + Record-Fail "Git push (HTTPS)" "push failed" + } + } finally { + Pop-Location + } +} + +# -- 4. 
Container Registry ------------------------------------------------- + +function Test-Registry { + if ($SkipRegistry) { + Write-Host "" + Write-Color "Container Registry" "White" + Record-Skip "Registry API" "SkipRegistry specified" + return + } + + Write-Host "" + Write-Color "Container Registry" "White" + + # Check if registry is enabled + $registryEnabled = "" + $settings = Invoke-GitLabApi -Method GET -Endpoint "/application/settings" + if ($settings) { + $registryEnabled = $settings.container_registry_enabled + } + + if ($registryEnabled -eq $false) { + Record-Skip "Registry API reachable" "container registry disabled in application settings" + Record-Skip "Registry project endpoint" "container registry disabled in application settings" + return + } + + # Try registry v2 API + $hostPart = $GitLabUrl -replace "^https?://", "" -replace "/.*", "" + $registryStatus = 0 + + $registryUrls = @( + "$GitLabUrl`:5050/v2/", + "https://${hostPart}:5050/v2/", + "https://registry.${hostPart}/v2/" + ) + + foreach ($regUrl in $registryUrls) { + try { + $params = @{ + Uri = $regUrl + Method = "GET" + TimeoutSec = $Timeout + UseBasicParsing = $true + ErrorAction = "Stop" + } + if ($Insecure -and $PSVersionTable.PSVersion.Major -ge 7) { + $params["SkipCertificateCheck"] = $true + } + $response = Invoke-WebRequest @params + $registryStatus = [int]$response.StatusCode + break + } catch { + if ($_.Exception.Response) { + $registryStatus = [int]$_.Exception.Response.StatusCode + if ($registryStatus -eq 401) { break } + } + } + } + + if ($registryStatus -eq 200 -or $registryStatus -eq 401) { + Record-Pass "Registry API reachable" "HTTP $registryStatus" + } elseif ($registryStatus -eq 0) { + if ($registryEnabled -eq $true) { + Record-Fail "Registry API reachable" "enabled in settings but not reachable at standard ports/hosts" + } else { + Record-Skip "Registry API reachable" "not found at standard ports/hosts (settings unreadable - may need admin token)" + } + } else { + Record-Fail 
"Registry API reachable" "HTTP $registryStatus" + } + + # Project-level registry + if ($script:CleanupProjectId) { + $regStatus = Invoke-GitLabApi -Method GET -Endpoint "/projects/$($script:CleanupProjectId)/registry/repositories" -StatusOnly + if ($regStatus -eq 200) { + Record-Pass "Registry project endpoint" "project registry accessible" + } elseif ($regStatus -eq 404) { + Record-Skip "Registry project endpoint" "container registry not enabled for project" + } else { + Record-Fail "Registry project endpoint" "HTTP $regStatus" + } + } +} + +# -- 5. CI/CD -------------------------------------------------------------- + +function Test-CICD { + Write-Host "" + Write-Color "CI/CD" "White" + + # Runners + $runners = Invoke-GitLabApi -Method GET -Endpoint "/runners/all?per_page=100" + if ($runners -is [array]) { + $runnerCount = $runners.Count + $onlineCount = @($runners | Where-Object { $_.status -eq "online" }).Count + + if ($onlineCount -gt 0) { + Record-Pass "CI/CD runners online" "$onlineCount/$runnerCount runners online" + } elseif ($runnerCount -gt 0) { + Record-Fail "CI/CD runners online" "0/$runnerCount runners online" + } else { + Record-Skip "CI/CD runners online" "no runners registered" + } + } else { + Record-Skip "CI/CD runners" "could not query runners (admin token required)" + } + + # CI/CD settings + $cicdStatus = Invoke-GitLabApi -Method GET -Endpoint "/application/settings" -StatusOnly + if ($cicdStatus -eq 200) { + Record-Pass "CI/CD settings accessible" "application settings readable" + } elseif ($cicdStatus -eq 403) { + Record-Skip "CI/CD settings accessible" "admin token required" + } else { + Record-Fail "CI/CD settings accessible" "HTTP $cicdStatus" + } +} + +# -- 6. 
Background Migrations ---------------------------------------------- + +function Test-Migrations { + Write-Host "" + Write-Color "Background Migrations" "White" + + $migrations = Invoke-GitLabApi -Method GET -Endpoint "/admin/batched_background_migrations?database=main" + + if ($migrations -is [array]) { + $totalMig = $migrations.Count + $failedMig = @($migrations | Where-Object { $_.status -eq "failed" }).Count + $activeMig = @($migrations | Where-Object { $_.status -eq "active" }).Count + $pausedMig = @($migrations | Where-Object { $_.status -eq "paused" }).Count + $finishedMig = @($migrations | Where-Object { $_.status -eq "finished" }).Count + + if ($failedMig -gt 0) { + Record-Fail "Background migrations" "$failedMig failed, $activeMig active, $pausedMig paused, $finishedMig finished of $totalMig" + } elseif ($pausedMig -gt 0) { + Record-Fail "Background migrations" "$pausedMig paused, $activeMig active, $finishedMig finished of $totalMig" + } elseif ($activeMig -gt 0) { + Record-Pass "Background migrations" "$activeMig active, $finishedMig finished of $totalMig (in progress)" + } else { + Record-Pass "Background migrations" "all $totalMig finished" + } + } else { + $migStatus = Invoke-GitLabApi -Method GET -Endpoint "/admin/batched_background_migrations?database=main" -StatusOnly + if ($migStatus -eq 403) { + Record-Skip "Background migrations" "admin token required" + } else { + Record-Skip "Background migrations" "could not query (HTTP $migStatus)" + } + } +} + +# -- 7. 
# Components ---------------------------------------------------------

# Reports on supporting components. Metadata and statistics are read from
# the API; Gitaly, PostgreSQL and Redis are only inferred from other results
# (an earlier clone, a sorted project query, an authenticated /user call).
function Test-Components {
    Write-Host ""
    Write-Color "Components" "White"

    # Metadata
    $meta = Invoke-GitLabApi -Method GET -Endpoint "/metadata"
    if (-not $meta) {
        Record-Skip "GitLab metadata" "metadata endpoint not available"
    } elseif ($meta.version) {
        $edition = "CE"
        if ($meta.enterprise -eq $true) { $edition = "EE" }
        Record-Pass "GitLab metadata" "$($meta.version) $edition"
    } else {
        Record-Pass "GitLab metadata" "endpoint reachable"
    }

    # Statistics
    $instanceStats = Invoke-GitLabApi -Method GET -Endpoint "/application/statistics"
    if (-not $instanceStats) {
        Record-Skip "Instance statistics" "admin token required"
    } elseif ($instanceStats.active_users) {
        Record-Pass "Instance statistics" "$($instanceStats.active_users) users, $($instanceStats.projects) projects, $($instanceStats.groups) groups"
    } else {
        Record-Pass "Instance statistics" "endpoint reachable"
    }

    # Gitaly (inferred): nothing is recorded when no test project exists.
    if ($script:GitCloneOk) {
        Record-Pass "Gitaly storage" "project created and cloned successfully"
    } elseif ($script:CleanupProjectId) {
        Record-Skip "Gitaly storage" "project created but clone was not tested or failed"
    }

    # PostgreSQL (inferred)
    $sortedQueryCode = Invoke-GitLabApi -Method GET -Endpoint "/projects?per_page=1&order_by=updated_at" -StatusOnly
    if ($sortedQueryCode -ne 200) {
        Record-Fail "PostgreSQL" "sorted query failed (HTTP $sortedQueryCode)"
    } else {
        Record-Pass "PostgreSQL" "database queries succeeding"
    }

    # Redis (inferred)
    $whoAmICode = Invoke-GitLabApi -Method GET -Endpoint "/user" -StatusOnly
    if ($whoAmICode -ne 200) {
        Record-Skip "Redis" "cannot verify independently"
    } else {
        Record-Pass "Redis" "session/cache operational (auth succeeded)"
    }
}

# ============================================================================
# OUTPUT
# ============================================================================

function Write-Summary {
$duration = [math]::Floor(((Get-Date) - $script:StartTime).TotalSeconds) + + Write-Host "" + $separator = [string]::new([char]0x2500, 40) + Write-Color $separator "White" + Write-Color "Summary $GitLabUrl" "White" + + $summaryLine = " $($script:Pass) passed $($script:Fail) failed $($script:Skip) skipped (${duration}s)" + Write-Host $summaryLine + Write-Color $separator "White" + + if ($script:Fail -eq 0) { + Write-Color "All tests passed." "Green" + } else { + Write-Color "$($script:Fail) test(s) failed." "Red" + } +} + +function Write-TapHeader { + Write-Host "TAP version 13" +} + +function Write-TapFooter { + Write-Host "1..$($script:Total)" + Write-Host "# pass $($script:Pass)" + Write-Host "# fail $($script:Fail)" + Write-Host "# skip $($script:Skip)" +} + +function Write-JunitReport { + $duration = [math]::Floor(((Get-Date) - $script:StartTime).TotalSeconds) + + $xml = @" + + + +"@ + + foreach ($r in $script:Results) { + $safeName = $r.Name -replace '&','&' -replace '<','<' -replace '>','>' -replace '"','"' + $safeDetail = $r.Detail -replace '&','&' -replace '<','<' -replace '>','>' -replace '"','"' + + switch ($r.Status) { + "PASS" { + $xml += "`n " + if ($r.Detail) { $xml += "`n $safeDetail" } + $xml += "`n " + } + "FAIL" { + $xml += "`n " + $xml += "`n FAILED: $safeName - $safeDetail" + $xml += "`n " + } + "SKIP" { + $xml += "`n " + $xml += "`n " + $xml += "`n " + } + } + } + + $xml += "`n " + $xml += "`n" + + $xml | Out-File -FilePath $JunitFile -Encoding utf8 + Write-Log "JUnit report written to $JunitFile" +} + +# ============================================================================ +# CLEANUP +# ============================================================================ + +function Invoke-Cleanup { + if ($script:CleanupProjectId -and -not $SkipCleanup) { + try { + Invoke-GitLabApi -Method DELETE -Endpoint "/projects/$($script:CleanupProjectId)" | Out-Null + } catch { } + } + + if ($script:TmpDir -and (Test-Path $script:TmpDir)) { + Remove-Item 
-Recurse -Force $script:TmpDir -ErrorAction SilentlyContinue + } + + if ($env:GIT_SSL_NO_VERIFY) { + Remove-Item Env:\GIT_SSL_NO_VERIFY -ErrorAction SilentlyContinue + } +} + +# ============================================================================ +# MAIN +# ============================================================================ + +function Show-Usage { + @" +Usage: .\gitlab-smoke-tests.ps1 [OPTIONS] + +Smoke-test a GitLab instance. PowerShell 5.1+, git only. +Designed for air-gapped environments. + +Required environment variables: + GITLAB_URL GitLab base URL (https://gitlab.example.com) + GITLAB_TOKEN Personal access token (api scope; admin for full coverage) + +Optional environment variables: + GITLAB_HEALTH_TOKEN Health check access token + GITLAB_USER Username for git operations (default: root) + +Parameters: + -SkipGit Skip git clone/push tests + -SkipRegistry Skip container registry tests + -SkipCleanup Don't delete the test project after run + -Insecure Allow self-signed TLS certificates + -Timeout N HTTP timeout in seconds (default: 10) + -Format FORMAT Output: text (default), tap, junit + -JunitFile FILE JUnit output path (default: smoke-results.xml) + -NoColor Disable colored output + -Verbose Show debug output + +Examples: + `$env:GITLAB_URL = "https://gitlab.example.com" + `$env:GITLAB_TOKEN = "glpat-xxxxxxxxxxxx" + .\gitlab-smoke-tests.ps1 + + .\gitlab-smoke-tests.ps1 -Insecure -Format junit + .\gitlab-smoke-tests.ps1 -SkipGit -SkipRegistry + .\gitlab-smoke-tests.ps1 -Format tap +"@ +} + +# Handle PS 5.1 TLS and self-signed certs +if ($PSVersionTable.PSVersion.Major -lt 7) { + [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 + + if ($Insecure) { + Add-Type @" +using System.Net; +using System.Security.Cryptography.X509Certificates; +public class TrustAll : ICertificatePolicy { + public bool CheckValidationResult(ServicePoint sp, X509Certificate cert, + WebRequest req, int problem) { return true; } +} +"@ 
-ErrorAction SilentlyContinue + [System.Net.ServicePointManager]::CertificatePolicy = New-Object TrustAll + } +} + +# Validate +if (-not $GitLabUrl) { + Write-Err "GITLAB_URL is required" + Write-Host "" + Show-Usage + exit 1 +} + +if (-not $GitLabToken) { + Write-Err "GITLAB_TOKEN is required" + Write-Host "" + Show-Usage + exit 1 +} + +$GitLabUrl = $GitLabUrl.TrimEnd("/") +$script:StartTime = Get-Date + +if ($Format -eq "tap") { + Write-TapHeader +} else { + Write-Host "" + Write-Color "GitLab Smoke Tests" "White" + Write-Host "Target: $GitLabUrl" + Write-Host "Time: $(Get-Date -Format 'yyyy-MM-ddTHH:mm:ssZ')" + Write-Host "" +} + +try { + Test-Connectivity + Test-Api + Test-Git + Test-Registry + Test-CICD + Test-Migrations + Test-Components +} finally { + Invoke-Cleanup +} + +if ($Format -eq "tap") { + Write-TapFooter +} elseif ($Format -eq "junit") { + Write-Summary + Write-JunitReport +} else { + Write-Summary +} + +if ($script:Fail -eq 0) { exit 0 } else { exit 1 } diff --git a/gitlab-smoke-tests.sh b/gitlab-smoke-tests.sh new file mode 100644 index 0000000..cffd7b3 --- /dev/null +++ b/gitlab-smoke-tests.sh @@ -0,0 +1,862 @@ +#!/usr/bin/env bash + +######################################################################################### +#### gitlab-smoke-tests.sh — Verify GitLab instance health after upgrades or changes #### +#### Zero external dependencies. Runs in air-gapped environments. #### +#### Requires: bash 4+, curl, git, openssl (optional) #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.02 #### +#### #### +#### Usage: #### +#### export GITLAB_URL="https://gitlab.example.com" #### +#### export GITLAB_TOKEN="glpat-xxxxxxxxxxxxxxxxxxxx" #### +#### export GITLAB_HEALTH_TOKEN="your-health-token" # optional #### +#### ./gitlab-smoke-tests.sh #### +#### #### +#### See --help for all options. 
####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Each setting may be overridden from the environment before the script runs.
GITLAB_URL="${GITLAB_URL:-}"
GITLAB_TOKEN="${GITLAB_TOKEN:-}"
GITLAB_USER="${GITLAB_USER:-root}"
SMOKE_PROJECT_PREFIX="${SMOKE_PROJECT_PREFIX:-smoke-test}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
CURL_INSECURE="${CURL_INSECURE:-false}"
SKIP_GIT="${SKIP_GIT:-false}"
SKIP_REGISTRY="${SKIP_REGISTRY:-false}"
SKIP_CLEANUP="${SKIP_CLEANUP:-false}"
GITLAB_HEALTH_TOKEN="${GITLAB_HEALTH_TOKEN:-}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" # text, tap, junit
JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=() # one "STATUS|name|detail" entry per recorded test
CLEANUP_PROJECT_ID=""
TMPDIR_SMOKE=""
START_TIME=""
GIT_CLONE_OK="false"

# ── Colors ────────────────────────────────────────────────────────────
# Sets RED/GREEN/YELLOW/BLUE/BOLD/RESET to ANSI escapes, or to empty strings
# when COLOR=never, or when COLOR is not "always" and stdout is not a terminal.
setup_colors() {
  if [[ "$COLOR" != "never" ]] && { [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; }; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }

# Debug output goes to stderr: api_curl calls verbose, and its stdout is
# captured by callers as the API response body, so a debug line on stdout
# would be mixed into the JSON (and into TAP output).
verbose() {
  if [[ "$VERBOSE" == "true" ]]; then
    echo -e "${BLUE}[DEBUG]${RESET} $*" >&2
  fi
}

# ── Test Result Recording ─────────────────────────────────────────────
# Each recorder bumps its counter and TOTAL, appends to RESULTS, and prints
# one line in TAP or text form. All of them return 0 so that a bare call is
# safe under `set -e`.
#
# Arguments: $1 - test name, $2 - optional detail / reason

record_pass() {
  local name="$1"
  local detail="${2:-}"
  PASS=$((PASS + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("PASS|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name}"
  else
    echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

record_fail() {
  local name="$1"
  local detail="${2:-}"
  FAIL=$((FAIL + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("FAIL|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "not ok ${TOTAL} - ${name}"
    # An `if` (not `[[ … ]] && echo`) so an empty detail does not make the
    # function return 1 and abort the caller under `set -e`.
    if [[ -n "$detail" ]]; then
      echo " # ${detail}"
    fi
  else
    echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

record_skip() {
  local name="$1"
  local reason="${2:-}"
  SKIP=$((SKIP + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("SKIP|${name}|${reason}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
  else
    echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"
  fi
}

# ── curl wrapper ──────────────────────────────────────────────────────
# Calls ${GITLAB_URL}/api/v4<endpoint> and prints the response body.
# Arguments: $1 - HTTP method, $2 - endpoint path, rest - extra curl options
# Returns:   curl's exit status
api_curl() {
  local method="$1"
  local endpoint="$2"
  shift 2
  local curl_opts=(-s -S --max-time "$CURL_TIMEOUT" -X "$method")

  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  if [[ -n "$GITLAB_TOKEN" ]]; then
    curl_opts+=(-H "PRIVATE-TOKEN: ${GITLAB_TOKEN}")
  fi
  curl_opts+=(-H "Content-Type: application/json")

  local url="${GITLAB_URL}/api/v4${endpoint}"
  verbose "curl ${method} ${url} $*"

  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null
}

# Same request as api_curl, but prints only the HTTP status code.
# Always returns 0: on a transfer failure curl's --write-out still prints
# "000", and swallowing the exit status keeps an unguarded
# `status=$(api_curl_status …)` from aborting the script under `set -e`.
api_curl_status() {
  local method="$1"
  local endpoint="$2"
  shift 2
  local curl_opts=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT" -X "$method")

  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi
  if [[ -n "$GITLAB_TOKEN" ]]; then
    curl_opts+=(-H "PRIVATE-TOKEN: ${GITLAB_TOKEN}")
  fi
  curl_opts+=(-H "Content-Type: application/json")

  local url="${GITLAB_URL}/api/v4${endpoint}"
  curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null || true
}

# ── JSON parsing (no jq required) ────────────────────────────────────
# Prints the first string/number value found for a key in flat JSON, or
# nothing when the key is absent. Relies on grep -P (PCRE support).
# Arguments: $1 - key, $2 - JSON text
json_value() {
  local key="$1"
  local json="$2"
  printf '%s\n' "$json" | { grep -oP "\"${key}\"\s*:\s*\"?\K[^\",}]+" || true; } | head -1
}
# Prints the first string value stored under a key in JSON text, or nothing
# when the key is absent or its value is not a string. Relies on grep -P.
# Arguments: $1 - key, $2 - JSON text
json_value_string() {
  local key="$1"
  local json="$2"
  printf '%s\n' "$json" | { grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" || true; } | head -1
}

# ── Cleanup ───────────────────────────────────────────────────────────
# EXIT handler: deletes the smoke-test project (unless SKIP_CLEANUP=true)
# and removes the temporary clone directory. Best effort by design.
cleanup() {
  if [[ -n "${CLEANUP_PROJECT_ID:-}" && "${SKIP_CLEANUP:-false}" != "true" ]]; then
    verbose "Cleaning up smoke test project (ID: ${CLEANUP_PROJECT_ID})"
    api_curl DELETE "/projects/${CLEANUP_PROJECT_ID}" >/dev/null 2>&1 || true
  fi
  if [[ -n "${TMPDIR_SMOKE:-}" && -d "$TMPDIR_SMOKE" ]]; then
    rm -rf -- "$TMPDIR_SMOKE"
  fi
}

trap cleanup EXIT

# ══════════════════════════════════════════════════════════════════════
# TEST SUITES
# ══════════════════════════════════════════════════════════════════════

# ── 1. Connectivity ──────────────────────────────────────────────────
# Probes /-/health, /-/readiness and /-/liveness, then (for https URLs)
# checks how long the server certificate remains valid.
test_connectivity() {
  echo ""
  echo -e "${BOLD}Connectivity${RESET}"

  local curl_opts=(-s -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi

  local health_qs=""
  if [[ -n "$GITLAB_HEALTH_TOKEN" ]]; then
    health_qs="?token=${GITLAB_HEALTH_TOKEN}"
  fi

  # 1a-1c. Health, readiness and liveness endpoints ("path|test name")
  local probe path label http_code
  for probe in \
    "health|GitLab health endpoint reachable" \
    "readiness|GitLab readiness check" \
    "liveness|GitLab liveness check"; do
    path="${probe%%|*}"
    label="${probe#*|}"
    http_code=$(curl "${curl_opts[@]}" "${GITLAB_URL}/-/${path}${health_qs}" 2>/dev/null) || http_code="000"
    if [[ "$http_code" == "200" ]]; then
      record_pass "$label" "HTTP ${http_code}"
    else
      record_fail "$label" "HTTP ${http_code}"
    fi
  done

  # 1d. TLS certificate validity (if HTTPS)
  if [[ "$GITLAB_URL" != https://* ]]; then
    record_skip "TLS certificate check" "not using HTTPS"
    return 0
  fi

  # Host and port come from the authority part only, so a URL with a path
  # (https://host:8443/gitlab) is still probed on its real port.
  local authority host port
  authority="${GITLAB_URL#https://}"
  authority="${authority%%/*}"
  host="${authority%%:*}"
  if [[ "$authority" == *:* ]]; then
    port="${authority##*:}"
  else
    port="443"
  fi

  local expiry
  expiry=$(openssl s_client -servername "$host" -connect "${host}:${port}" </dev/null 2>/dev/null \
    | openssl x509 -noout -enddate 2>/dev/null \
    | cut -d= -f2) || expiry=""

  if [[ -z "$expiry" ]]; then
    record_skip "TLS certificate check" "could not retrieve certificate"
    return 0
  fi

  # `date -d` is a GNU extension; where it is unavailable report a skip
  # rather than treating epoch 0 as an expiry date.
  local expiry_epoch
  if ! expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null); then
    record_skip "TLS certificate check" "could not parse certificate expiry: ${expiry}"
    return 0
  fi

  local now_epoch
  now_epoch=$(date +%s)
  local days_left=$(((expiry_epoch - now_epoch) / 86400))

  if ((days_left > 30)); then
    record_pass "TLS certificate valid" "${days_left} days remaining"
  elif ((days_left > 0)); then
    record_pass "TLS certificate valid" "${days_left} days remaining (renew soon)"
  else
    record_fail "TLS certificate valid" "expired or expiring in ${days_left} days"
  fi
}

# ── 2. API ────────────────────────────────────────────────────────────
# Exercises core REST endpoints. Every status capture falls back to "000"
# so an unreachable server is recorded as a failure instead of aborting the
# whole run under `set -e`.
test_api() {
  echo ""
  echo -e "${BOLD}API${RESET}"

  # 2a. Version endpoint
  local version_json gl_version gl_revision
  version_json=$(api_curl GET "/version" 2>/dev/null) || version_json=""
  gl_version=$(json_value_string "version" "$version_json")
  gl_revision=$(json_value_string "revision" "$version_json")

  if [[ -n "$gl_version" ]]; then
    record_pass "API version endpoint" "GitLab ${gl_version} (${gl_revision})"
  else
    record_fail "API version endpoint" "no version returned"
  fi

  # 2b. Authentication
  local auth_status
  auth_status=$(api_curl_status GET "/user") || auth_status="000"
  if [[ "$auth_status" == "200" ]]; then
    local user_json username
    user_json=$(api_curl GET "/user") || user_json=""
    username=$(json_value_string "username" "$user_json")
    record_pass "API authentication" "authenticated as ${username}"
  elif [[ "$auth_status" == "401" ]]; then
    record_fail "API authentication" "token rejected (HTTP 401)"
  else
    record_fail "API authentication" "HTTP ${auth_status}"
  fi

  # 2c. List projects (verify database queries work)
  local projects_status
  projects_status=$(api_curl_status GET "/projects?per_page=1") || projects_status="000"
  if [[ "$projects_status" == "200" ]]; then
    record_pass "API list projects" "database responding"
  else
    record_fail "API list projects" "HTTP ${projects_status}"
  fi

  # 2d. List users
  local users_status
  users_status=$(api_curl_status GET "/users?per_page=1") || users_status="000"
  if [[ "$users_status" == "200" ]]; then
    record_pass "API list users" "user directory accessible"
  else
    record_fail "API list users" "HTTP ${users_status}"
  fi

  # 2e. Sidekiq health (job processing)
  local sidekiq_json
  sidekiq_json=$(api_curl GET "/sidekiq/compound_metrics" 2>/dev/null) || sidekiq_json=""

  if [[ -n "$sidekiq_json" && "$sidekiq_json" != *"error"* ]]; then
    local processes
    processes=$(printf '%s\n' "$sidekiq_json" | { grep -oP '"hostname"\s*:' || true; } | wc -l)
    record_pass "Sidekiq running" "${processes} process(es) responding"
  else
    record_fail "Sidekiq running" "could not query Sidekiq metrics"
  fi

  # 2f. Runners endpoint
  local runners_status
  runners_status=$(api_curl_status GET "/runners/all?per_page=1") || runners_status="000"
  if [[ "$runners_status" == "200" ]]; then
    record_pass "API runners endpoint" "runner management accessible"
  elif [[ "$runners_status" == "403" ]]; then
    record_skip "API runners endpoint" "token lacks admin scope"
  else
    record_fail "API runners endpoint" "HTTP ${runners_status}"
  fi

  # 2g. Search endpoint
  local search_status
  search_status=$(api_curl_status GET "/search?scope=projects&search=test") || search_status="000"
  if [[ "$search_status" == "200" ]]; then
    record_pass "API search" "search index responding"
  elif [[ "$search_status" == "403" ]]; then
    record_skip "API search" "search disabled or token lacks scope"
  else
    record_fail "API search" "HTTP ${search_status}"
  fi
}

# ── 3.
# Git Operations ────────────────────────────────────────────────

# Prints $1 with every literal occurrence of GITLAB_TOKEN replaced by
# [REDACTED]. Uses parameter expansion rather than sed so that tokens
# containing "|", "&" or regex metacharacters are handled as plain text.
_redact_token() {
  local text="$1"
  if [[ -n "${GITLAB_TOKEN:-}" ]]; then
    text="${text//"$GITLAB_TOKEN"/[REDACTED]}"
  fi
  printf '%s\n' "$text"
}

# Creates a private test project through the API, clones it over HTTP(S)
# with the token embedded in the URL, then commits and pushes one file.
# Globals written: CLEANUP_PROJECT_ID, TMPDIR_SMOKE, GIT_CLONE_OK
test_git() {
  echo ""
  echo -e "${BOLD}Git Operations${RESET}"

  if [[ "$SKIP_GIT" == "true" ]]; then
    record_skip "Git clone" "SKIP_GIT=true"
    record_skip "Git push" "SKIP_GIT=true"
    return 0
  fi

  # Create a test project via API
  local project_name create_json project_id http_url
  project_name="${SMOKE_PROJECT_PREFIX}-$(date +%s)"
  create_json=$(api_curl POST "/projects" -d "{\"name\":\"${project_name}\",\"visibility\":\"private\",\"initialize_with_readme\":true}") || create_json=""
  project_id=$(json_value "id" "$create_json")
  http_url=$(json_value_string "http_url_to_repo" "$create_json")

  if [[ -z "$project_id" || "$project_id" == "null" ]]; then
    record_fail "Create test project" "API returned: ${create_json:0:200}"
    record_skip "Git clone" "no test project"
    record_skip "Git push" "no test project"
    return 0
  fi

  CLEANUP_PROJECT_ID="$project_id"
  record_pass "Create test project" "${project_name} (ID: ${project_id})"

  TMPDIR_SMOKE=$(mktemp -d)

  # Fallback: if http_url_to_repo wasn't parsed, construct it
  if [[ -z "$http_url" ]]; then
    http_url="${GITLAB_URL}/${GITLAB_USER}/${project_name}.git"
    verbose "http_url_to_repo not found in API response, constructed: ${http_url}"
  fi
  verbose "Clone URL (from API): ${http_url}"

  # The API may return an internal hostname that is unreachable remotely, so
  # swap its origin (scheme://host[:port]) for the origin of GITLAB_URL.
  # Only origins are compared and swapped: the API URL already carries any
  # relative-root path, so substituting the whole GITLAB_URL would repeat it.
  local origin_re='^https?://[^/]+'
  local api_origin="" gitlab_origin=""
  if [[ "$http_url" =~ $origin_re ]]; then
    api_origin="${BASH_REMATCH[0]}"
  fi
  if [[ "$GITLAB_URL" =~ $origin_re ]]; then
    gitlab_origin="${BASH_REMATCH[0]}"
  fi
  if [[ -n "$api_origin" && -n "$gitlab_origin" && "$api_origin" != "$gitlab_origin" ]]; then
    http_url="${gitlab_origin}${http_url#"$api_origin"}"
    verbose "Rewrote clone URL to: ${http_url}"
  fi

  # Inject token into URL for HTTP(S) clone
  local clone_url
  case "$http_url" in
    https://*) clone_url="https://oauth2:${GITLAB_TOKEN}@${http_url#https://}" ;;
    http://*) clone_url="http://oauth2:${GITLAB_TOKEN}@${http_url#http://}" ;;
    *) clone_url="$http_url" ;;
  esac

  local git_opts=()
  if [[ "$CURL_INSECURE" == "true" ]]; then
    git_opts+=(-c http.sslVerify=false)
  fi

  # Brief wait for repository initialization (initialize_with_readme is async)
  sleep 2

  local repo_dir="${TMPDIR_SMOKE}/repo"
  verbose "Running: git clone ${repo_dir}"
  local clone_err clone_rc=0
  # ${arr[@]+…} keeps an empty array from tripping `set -u` on older bash.
  clone_err=$(git ${git_opts[@]+"${git_opts[@]}"} clone "$clone_url" "$repo_dir" 2>&1) || clone_rc=$?

  if ((clone_rc != 0)); then
    local short_err
    # `|| true`: no matching line must not abort the script under pipefail.
    short_err=$(printf '%s\n' "$clone_err" | { grep -i -E -m 1 'fatal|error' || true; })
    short_err=$(_redact_token "$short_err")
    verbose "Full clone output: $(_redact_token "$clone_err")"
    verbose "Attempted URL: $(_redact_token "$clone_url")"
    record_fail "Git clone (HTTPS)" "${short_err:-clone failed (exit $clone_rc)}"
    return 0
  fi

  GIT_CLONE_OK="true"
  record_pass "Git clone (HTTPS)" "Gitaly responding"

  # Commit inside a subshell: the working directory of the script is never
  # changed, and a failing step is reported instead of aborting the run.
  if ! (
    cd "$repo_dir" \
      && git config user.email "smoke-test@example.com" \
      && git config user.name "Smoke Test" \
      && echo "smoke test $(date -u +%Y-%m-%dT%H:%M:%SZ)" >smoke-test.txt \
      && git add smoke-test.txt \
      && git commit -m "smoke test commit"
  ) >/dev/null 2>&1; then
    record_fail "Git push (HTTPS)" "could not create test commit"
    return 0
  fi

  # Default branch may be main or master depending on instance settings.
  if git ${git_opts[@]+"${git_opts[@]}"} -C "$repo_dir" push origin main >/dev/null 2>&1 \
    || git ${git_opts[@]+"${git_opts[@]}"} -C "$repo_dir" push origin master >/dev/null 2>&1; then
    record_pass "Git push (HTTPS)" "write to Gitaly succeeded"
  else
    record_fail "Git push (HTTPS)" "push failed"
  fi
}

# ── 4.
# Container Registry ────────────────────────────────────────────
# Probes the registry v2 API at the usual locations and, when a test project
# exists, the project-level registry endpoint of the GitLab API.
test_registry() {
  echo ""
  echo -e "${BOLD}Container Registry${RESET}"

  if [[ "$SKIP_REGISTRY" == "true" ]]; then
    record_skip "Registry API" "SKIP_REGISTRY=true"
    return 0
  fi

  # Check if registry is enabled via application settings API
  local registry_enabled="" settings_json
  settings_json=$(api_curl GET "/application/settings" 2>/dev/null) || settings_json=""
  if [[ -n "$settings_json" ]]; then
    registry_enabled=$(json_value "container_registry_enabled" "$settings_json" 2>/dev/null) || registry_enabled=""
  fi

  if [[ "$registry_enabled" == "false" ]]; then
    record_skip "Registry API reachable" "container registry disabled in application settings"
    record_skip "Registry project endpoint" "container registry disabled in application settings"
    return 0
  fi

  # Bare host name of GITLAB_URL: scheme, port and path are stripped so that
  # appending ":5050" always yields a well-formed URL.
  # NOTE(review): bracketed IPv6 literals are not handled here.
  local scheme_prefix="" authority host
  if [[ "$GITLAB_URL" == *://* ]]; then
    scheme_prefix="${GITLAB_URL%%://*}://"
  fi
  authority="${GITLAB_URL#*://}"
  authority="${authority%%/*}"
  host="${authority%%:*}"

  local curl_opts=(-s -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT")
  if [[ "$CURL_INSECURE" == "true" ]]; then
    curl_opts+=(-k)
  fi

  # Try the registry v2 API at each standard location; the first transfer
  # that completes (whatever its HTTP status) decides the result.
  local registry_status="000" candidate probe
  for candidate in \
    "${scheme_prefix}${host}:5050/v2/" \
    "https://${host}:5050/v2/" \
    "https://registry.${host}/v2/"; do
    if probe=$(curl "${curl_opts[@]}" "$candidate" 2>/dev/null); then
      registry_status="$probe"
      break
    fi
  done

  if [[ "$registry_status" == "200" || "$registry_status" == "401" ]]; then
    record_pass "Registry API reachable" "HTTP ${registry_status}"
  elif [[ "$registry_status" == "000" ]]; then
    if [[ "$registry_enabled" == "true" ]]; then
      record_fail "Registry API reachable" "enabled in settings but not reachable at standard ports/hosts"
    else
      record_skip "Registry API reachable" "not found at standard ports/hosts (settings unreadable — may need admin token)"
    fi
  else
    record_fail "Registry API reachable" "HTTP ${registry_status}"
  fi

  # Check registry via GitLab API (project-level)
  if [[ -n "$CLEANUP_PROJECT_ID" ]]; then
    local reg_status
    # Fallback keeps a transfer failure from aborting the run under `set -e`.
    reg_status=$(api_curl_status GET "/projects/${CLEANUP_PROJECT_ID}/registry/repositories") || reg_status="000"
    if [[ "$reg_status" == "200" ]]; then
      record_pass "Registry project endpoint" "project registry accessible"
    elif [[ "$reg_status" == "404" ]]; then
      record_skip "Registry project endpoint" "container registry not enabled for project"
    else
      record_fail "Registry project endpoint" "HTTP ${reg_status}"
    fi
  fi
}

# ── 5. CI/CD ──────────────────────────────────────────────────────────
# Counts registered and online runners and checks that the application
# settings endpoint is readable.
test_cicd() {
  echo ""
  echo -e "${BOLD}CI/CD${RESET}"

  # Check runners
  local runners_json
  runners_json=$(api_curl GET "/runners/all?per_page=100" 2>/dev/null) || runners_json=""

  if [[ "$runners_json" == "["* ]]; then
    # grep -o prints one line per match, so wc -l counts occurrences even
    # when the whole response is a single line of JSON.
    local runner_count online_count
    runner_count=$(printf '%s\n' "$runners_json" | { grep -oP '"id"\s*:' || true; } | wc -l)
    online_count=$(printf '%s\n' "$runners_json" | { grep -oP '"status"\s*:\s*"online"' || true; } | wc -l)

    if [[ $online_count -gt 0 ]]; then
      record_pass "CI/CD runners online" "${online_count}/${runner_count} runners online"
    elif [[ $runner_count -gt 0 ]]; then
      record_fail "CI/CD runners online" "0/${runner_count} runners online"
    else
      record_skip "CI/CD runners online" "no runners registered"
    fi
  else
    record_skip "CI/CD runners" "could not query runners (admin token required)"
  fi

  # Check CI/CD settings via API
  local cicd_status
  cicd_status=$(api_curl_status GET "/application/settings") || cicd_status="000"
  if [[ "$cicd_status" == "200" ]]; then
    record_pass "CI/CD settings accessible" "application settings readable"
  elif [[ "$cicd_status" == "403" ]]; then
    record_skip "CI/CD settings accessible" "admin token required"
  else
    record_fail "CI/CD settings accessible" "HTTP ${cicd_status}"
  fi
}

# ── 6.
# Background Migrations ─────────────────────────────────────────

# Prints how many times a PCRE pattern occurs in a text.
# Arguments: $1 - pattern (grep -P syntax), $2 - text to search
_count_matches() {
  local pattern="$1"
  local text="$2"
  # grep -o emits one line per occurrence; `|| true` covers the no-match
  # case under pipefail, and tr drops any padding some wc builds add.
  printf '%s\n' "$text" | { grep -oP -- "$pattern" || true; } | wc -l | tr -d '[:space:]'
}

# Summarises batched background migrations on the "main" database (admin
# only): fails on failed or paused migrations, passes on active/finished
# ones, and skips when the listing cannot be read.
test_migrations() {
  echo ""
  echo -e "${BOLD}Background Migrations${RESET}"

  local endpoint="/admin/batched_background_migrations?database=main"
  local migrations_json
  migrations_json=$(api_curl GET "$endpoint" 2>/dev/null) || migrations_json=""

  # Not a JSON array: fetch only the status code to explain the skip. The
  # fallback keeps a transfer failure from aborting the run under `set -e`.
  if [[ "$migrations_json" != "["* ]]; then
    local mig_status
    mig_status=$(api_curl_status GET "$endpoint") || mig_status="000"
    if [[ "$mig_status" == "403" ]]; then
      record_skip "Background migrations" "admin token required"
    else
      record_skip "Background migrations" "could not query (HTTP ${mig_status})"
    fi
    return 0
  fi

  local total_mig failed_mig active_mig paused_mig finished_mig
  total_mig=$(_count_matches '"id"\s*:' "$migrations_json")
  failed_mig=$(_count_matches '"status"\s*:\s*"failed"' "$migrations_json")
  active_mig=$(_count_matches '"status"\s*:\s*"active"' "$migrations_json")
  paused_mig=$(_count_matches '"status"\s*:\s*"paused"' "$migrations_json")
  finished_mig=$(_count_matches '"status"\s*:\s*"finished"' "$migrations_json")

  if ((failed_mig > 0)); then
    record_fail "Background migrations" "${failed_mig} failed, ${active_mig} active, ${paused_mig} paused, ${finished_mig} finished of ${total_mig}"
  elif ((paused_mig > 0)); then
    record_fail "Background migrations" "${paused_mig} paused, ${active_mig} active, ${finished_mig} finished of ${total_mig}"
  elif ((active_mig > 0)); then
    record_pass "Background migrations" "${active_mig} active, ${finished_mig} finished of ${total_mig} (in progress)"
  else
    record_pass "Background migrations" "all ${total_mig} finished"
  fi
}

# ── 7.
Storage & Components ────────────────────────────────────────── +test_components() { + echo "" + echo -e "${BOLD}Components${RESET}" + + # Metadata endpoint + local metadata_json + metadata_json=$(api_curl GET "/metadata" 2>/dev/null) || metadata_json="" + + if [[ -n "$metadata_json" ]]; then + local gl_version + gl_version=$(json_value_string "version" "$metadata_json") + local enterprise + enterprise=$(json_value "enterprise" "$metadata_json") + + if [[ -n "$gl_version" ]]; then + local edition="CE" + [[ "$enterprise" == "true" ]] && edition="EE" + record_pass "GitLab metadata" "${gl_version} ${edition}" + else + record_pass "GitLab metadata" "endpoint reachable" + fi + else + record_skip "GitLab metadata" "metadata endpoint not available" + fi + + # Statistics (admin) + local stats_json + stats_json=$(api_curl GET "/application/statistics" 2>/dev/null) || stats_json="" + + if [[ -n "$stats_json" && "$stats_json" != *"error"* && "$stats_json" != *"403"* ]]; then + local active_users + active_users=$(json_value "active_users" "$stats_json") + local projects + projects=$(json_value "projects" "$stats_json") + local groups + groups=$(json_value "groups" "$stats_json") + + if [[ -n "$active_users" ]]; then + record_pass "Instance statistics" "${active_users} users, ${projects} projects, ${groups} groups" + else + record_pass "Instance statistics" "endpoint reachable" + fi + else + record_skip "Instance statistics" "admin token required" + fi + + # Gitaly check — only report pass if clone actually succeeded + if [[ "$GIT_CLONE_OK" == "true" ]]; then + record_pass "Gitaly storage" "project created and cloned successfully" + elif [[ -n "$CLEANUP_PROJECT_ID" ]]; then + record_skip "Gitaly storage" "project created but clone was not tested or failed" + fi + + # PostgreSQL (inferred from API responsiveness) + local pg_test + pg_test=$(api_curl_status GET "/projects?per_page=1&order_by=updated_at") + if [[ "$pg_test" == "200" ]]; then + record_pass "PostgreSQL" "database 
queries succeeding" + else + record_fail "PostgreSQL" "sorted query failed (HTTP ${pg_test})" + fi + + # Redis (inferred from session/cache) + local redis_test + redis_test=$(api_curl_status GET "/user") + if [[ "$redis_test" == "200" ]]; then + record_pass "Redis" "session/cache operational (auth succeeded)" + else + record_skip "Redis" "cannot verify independently" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${GITLAB_URL}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + + if [[ $FAIL -eq 0 ]]; then + echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else + echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}" + fi +} + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +write_junit() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + for result in "${RESULTS[@]}"; do + local status name detail + status=$(echo "$result" | cut -d'|' -f1) + name=$(echo "$result" | cut -d'|' -f2) + detail=$(echo "$result" | cut -d'|' -f3) + + # XML-escape the values + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo "$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + + case "$status" in + PASS) + echo " " >> "$JUNIT_FILE" + [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + FAIL) + echo " " >> "$JUNIT_FILE" + echo " FAILED: ${name} — 
${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + SKIP) + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + esac + done + + echo " " >> "$JUNIT_FILE" + echo "" >> "$JUNIT_FILE" + + log "JUnit report written to ${JUNIT_FILE}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; } +err() { printf "${RED}✗ %s${RESET}\n" "$*" >&2; } +verbose() { [[ "$VERBOSE" == "true" ]] && printf "${DIM} %s${RESET}\n" "$*" >&2 || true; } +die() { err "$*"; exit 1; } + +# ── Help ────────────────────────────────────────────────────────────────────── + +show_help() { + cat </dev/null 2>&1; then + ver=$(gitlab-ctl version 2>/dev/null | grep -oP '\d+\.\d+\.\d+' | head -1 || true) + fi + if [[ -z "$ver" ]] && command -v dpkg >/dev/null 2>&1; then + ver=$(dpkg -l gitlab-ce gitlab-ee 2>/dev/null | awk '/^ii/{print $3}' | grep -oP '\d+\.\d+\.\d+' | head -1 || true) + fi + if [[ -z "$ver" ]] && command -v rpm >/dev/null 2>&1; then + ver=$(rpm -q gitlab-ce gitlab-ee 2>/dev/null | grep -oP '\d+\.\d+\.\d+' | head -1 || true) + fi + echo "$ver" +} + +detect_pg_version() { + local ver="" + if command -v gitlab-psql >/dev/null 2>&1; then + ver=$(gitlab-psql --version 2>/dev/null | grep -oP '\d+' | head -1 || true) + fi + if [[ -z "$ver" ]] && command -v psql >/dev/null 2>&1; then + ver=$(psql --version 2>/dev/null | grep -oP '\d+' | head -1 || true) + fi + echo "$ver" +} + +# ── Core Logic ──────────────────────────────────────────────────────────────── + +get_pg_req() { + local gl_major="$1" + local entry + for entry in "${PG_REQS[@]}"; do + IFS='|' read -r req_gl req_min req_max <<< "$entry" + if [[ "$req_gl" == "$gl_major" ]]; then + echo "${req_min}|${req_max}" + return + fi + done + echo "unknown|unknown" +} + +build_upgrade_path() { + local from_int to_int + from_int=$(version_to_int "$FROM_VERSION") + 
to_int=$(version_to_int "$TO_VERSION") + + UPGRADE_PATH=() + local entry ver ver_int conditional notes + for entry in "${STOPS[@]}"; do + IFS='|' read -r ver conditional notes <<< "$entry" + ver_int=$(version_to_int "$ver") + + if (( ver_int <= from_int )); then + continue + fi + if (( ver_int > to_int )); then + continue + fi + if [[ "$SKIP_CONDITIONAL" == "true" && "$conditional" == "1" ]]; then + verbose "Skipping conditional stop: $ver" + continue + fi + + UPGRADE_PATH+=("$entry") + done + + # Add target if it's not already in the path + local last_ver="" + if [[ ${#UPGRADE_PATH[@]} -gt 0 ]]; then + IFS='|' read -r last_ver _ _ <<< "${UPGRADE_PATH[-1]}" + fi + if [[ "$last_ver" != "$TO_VERSION" ]]; then + UPGRADE_PATH+=("${TO_VERSION}|0|Target version") + fi +} + +get_pg_warnings() { + PG_WARNINGS=() + if [[ -z "$PG_VERSION" ]]; then + return + fi + + local from_major to_major + from_major=$(version_major "$FROM_VERSION") + to_major=$(version_major "$TO_VERSION") + + local gl_major + for (( gl_major = from_major; gl_major <= to_major; gl_major++ )); do + local req + req=$(get_pg_req "$gl_major") + IFS='|' read -r pg_min pg_max <<< "$req" + if [[ "$pg_min" == "unknown" ]]; then + continue + fi + if (( PG_VERSION < pg_min )); then + # Find the last stop before this major version + local boundary_stop="" + local prev_major=$(( gl_major - 1 )) + local entry ver + for entry in "${STOPS[@]}"; do + IFS='|' read -r ver _ _ <<< "$entry" + if [[ "$(version_major "$ver")" == "$prev_major" ]]; then + boundary_stop="$ver" + fi + done + PG_WARNINGS+=("PostgreSQL ${PG_VERSION} is below minimum for GitLab ${gl_major}.x (requires ${pg_min}+)|Upgrade PostgreSQL to ${pg_min}+ before upgrading past GitLab ${boundary_stop:-${prev_major}.x}|${gl_major}|${pg_min}|${pg_max}") + fi + done +} + +estimate_downtime() { + local stop_count=${#UPGRADE_PATH[@]} + local pg_upgrade_count=${#PG_WARNINGS[@]} + + # Software time: package install + gitlab-ctl reconfigure per stop + DT_SW_LOW=$(( 
stop_count * 5 )) + DT_SW_HIGH=$(( stop_count * 15 )) + + # Background migration time per stop, based on database size + # These run between stops and must complete before proceeding + local mig_low=0 mig_high=0 + case "$DB_SIZE" in + small) mig_low=2; mig_high=10 ;; # <10GB: minutes + medium) mig_low=10; mig_high=30 ;; # 10-50GB: tens of minutes + large) mig_low=30; mig_high=90 ;; # 50-200GB: up to hours + xlarge) mig_low=60; mig_high=240 ;; # 200GB+: hours per stop + *) mig_low=0; mig_high=0 ;; # unknown: show software only + esac + DT_MIG_LOW=$(( stop_count * mig_low )) + DT_MIG_HIGH=$(( stop_count * mig_high )) + + # PostgreSQL major upgrade time + DT_PG_LOW=$(( pg_upgrade_count * 15 )) + DT_PG_HIGH=$(( pg_upgrade_count * 60 )) + + DT_GL_LOW=$(( DT_SW_LOW + DT_MIG_LOW )) + DT_GL_HIGH=$(( DT_SW_HIGH + DT_MIG_HIGH )) + DT_TOTAL_LOW=$(( DT_GL_LOW + DT_PG_LOW )) + DT_TOTAL_HIGH=$(( DT_GL_HIGH + DT_PG_HIGH )) +} + +# ── Text Output ─────────────────────────────────────────────────────────────── + +print_header() { + printf "\n${BOLD}GitLab Upgrade Path Calculator${RESET}\n" + printf "══════════════════════════════════════════════════════════════\n\n" +} + +format_path_text() { + print_header + + printf " ${BOLD}From:${RESET} %s\n" "$FROM_VERSION" + printf " ${BOLD}To:${RESET} %s\n" "$TO_VERSION" + printf " ${BOLD}Stops:${RESET} %d\n" "${#UPGRADE_PATH[@]}" + + if [[ -n "$PG_VERSION" ]]; then + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + local last_warn="${PG_WARNINGS[-1]}" + IFS='|' read -r _ _ _ final_pg_min _ <<< "$last_warn" + printf "\n ${BOLD}PostgreSQL:${RESET} ${YELLOW}Currently ${PG_VERSION} → Must upgrade to ${final_pg_min}+ before GitLab ${TO_VERSION}${RESET}\n" + else + printf "\n ${BOLD}PostgreSQL:${RESET} ${GREEN}${PG_VERSION} — compatible with target${RESET}\n" + fi + fi + + printf "\n ── Upgrade Path ──────────────────────────────────────────\n\n" + printf " ${DIM}Step Version Notes PG Required${RESET}\n" + printf " ${DIM}──── ──────────── 
─────────────────────────────────────── ──────────${RESET}\n" + + local step=0 entry ver conditional notes + for entry in "${UPGRADE_PATH[@]}"; do + IFS='|' read -r ver conditional notes <<< "$entry" + step=$((step + 1)) + + local gl_major pg_range + gl_major=$(version_major "$ver") + local req + req=$(get_pg_req "$gl_major") + IFS='|' read -r pg_min pg_max <<< "$req" + if [[ "$pg_min" != "unknown" ]]; then + pg_range="${pg_min}-${pg_max}" + else + pg_range="—" + fi + + local cond_marker="" + if [[ "$conditional" == "1" ]]; then + cond_marker=" ⓘ" + fi + + local ver_color="$RESET" + if [[ "$notes" == "Target version" ]]; then + ver_color="$GREEN" + fi + + printf " %3d ${ver_color}%-12s${RESET} %-39s %s\n" "$step" "$ver" "${notes}${cond_marker}" "$pg_range" + done + + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + printf "\n ── PostgreSQL Upgrade Required ───────────────────────────\n\n" + local warning + for warning in "${PG_WARNINGS[@]}"; do + IFS='|' read -r msg action gl_major pg_min pg_max <<< "$warning" + printf " ${YELLOW}⚠ %s${RESET}\n" "$msg" + printf " → %s\n\n" "$action" + done + fi + + estimate_downtime + printf " ── Estimated Downtime ────────────────────────────────────\n\n" + printf " Software: %d stops × 5-15 min = %d-%d min\n" "${#UPGRADE_PATH[@]}" "$DT_SW_LOW" "$DT_SW_HIGH" + printf " ${DIM}(package install + gitlab-ctl reconfigure)${RESET}\n" + if [[ -n "$DB_SIZE" ]]; then + printf " Migrations: %d stops × %s db = %d-%d min\n" "${#UPGRADE_PATH[@]}" "$DB_SIZE" "$DT_MIG_LOW" "$DT_MIG_HIGH" + printf " ${DIM}(background migrations must complete per stop)${RESET}\n" + else + printf " Migrations: ${DIM}use --db-size (small/medium/large/xlarge) for estimates${RESET}\n" + fi + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + printf " PG upgrades: %d × 15-60 min = %d-%d min\n" "${#PG_WARNINGS[@]}" "$DT_PG_LOW" "$DT_PG_HIGH" + fi + printf "\n ${BOLD}Total estimate: %d-%d min${RESET}" "$DT_TOTAL_LOW" "$DT_TOTAL_HIGH" + if (( DT_TOTAL_HIGH >= 120 )); then + local 
hours_low=$(( DT_TOTAL_LOW / 60 )) + local hours_high=$(( DT_TOTAL_HIGH / 60 )) + printf " (%d-%d hrs — plan a full maintenance window)" "$hours_low" "$hours_high" + fi + printf "\n\n" + + if [[ "$SKIP_CONDITIONAL" == "false" ]]; then + local has_conditional=false + for entry in "${UPGRADE_PATH[@]}"; do + IFS='|' read -r _ conditional _ <<< "$entry" + if [[ "$conditional" == "1" ]]; then + has_conditional=true + break + fi + done + if [[ "$has_conditional" == "true" ]]; then + printf " ${DIM}ⓘ = conditional stop (may be skippable — use --skip-conditional)${RESET}\n\n" + fi + fi +} + +format_path_json() { + local steps_json="[" + local step=0 first=true entry ver conditional notes + for entry in "${UPGRADE_PATH[@]}"; do + IFS='|' read -r ver conditional notes <<< "$entry" + step=$((step + 1)) + local gl_major req pg_min pg_max + gl_major=$(version_major "$ver") + req=$(get_pg_req "$gl_major") + IFS='|' read -r pg_min pg_max <<< "$req" + + [[ "$first" == "true" ]] || steps_json+="," + first=false + steps_json+=$(printf '{"step":%d,"version":"%s","conditional":%s,"notes":"%s","pg_min":"%s","pg_max":"%s"}' \ + "$step" "$ver" "$( [[ "$conditional" == "1" ]] && echo "true" || echo "false" )" "$notes" "$pg_min" "$pg_max") + done + steps_json+="]" + + local pg_upgrades_json="[" + first=true + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + local warning + for warning in "${PG_WARNINGS[@]}"; do + IFS='|' read -r msg action gl_major pg_min pg_max <<< "$warning" + [[ "$first" == "true" ]] || pg_upgrades_json+="," + first=false + pg_upgrades_json+=$(printf '{"before_gitlab":"%s.0.0","min_pg":"%s","max_pg":"%s"}' "$gl_major" "$pg_min" "$pg_max") + done + fi + pg_upgrades_json+="]" + + estimate_downtime + + printf '{\n' + printf ' "from": "%s",\n' "$FROM_VERSION" + printf ' "to": "%s",\n' "$TO_VERSION" + printf ' "total_stops": %d,\n' "${#UPGRADE_PATH[@]}" + printf ' "pg_current": "%s",\n' "${PG_VERSION:-null}" + printf ' "pg_upgrades_needed": %s,\n' "$pg_upgrades_json" + printf ' 
"steps": %s,\n' "$steps_json" + printf ' "db_size": "%s",\n' "${DB_SIZE:-unknown}" + printf ' "estimated_downtime_min": {"software": {"low": %d, "high": %d}, "migrations": {"low": %d, "high": %d}, "pg_upgrades": {"low": %d, "high": %d}, "total": {"low": %d, "high": %d}}\n' \ + "$DT_SW_LOW" "$DT_SW_HIGH" "$DT_MIG_LOW" "$DT_MIG_HIGH" "$DT_PG_LOW" "$DT_PG_HIGH" "$DT_TOTAL_LOW" "$DT_TOTAL_HIGH" + printf '}\n' +} + +# ── Mode: --path ────────────────────────────────────────────────────────────── + +run_path() { + if [[ -z "$FROM_VERSION" ]]; then + verbose "Detecting installed GitLab version..." + FROM_VERSION=$(detect_gitlab_version) + if [[ -z "$FROM_VERSION" ]]; then + die "Could not detect installed GitLab version. Use --from VERSION." + fi + log "Detected GitLab version: $FROM_VERSION" + fi + if [[ -z "$TO_VERSION" || "$TO_VERSION" == "latest" ]]; then + TO_VERSION="$LATEST_VERSION" + fi + + validate_version "$FROM_VERSION" + validate_version "$TO_VERSION" + + local from_int to_int + from_int=$(version_to_int "$FROM_VERSION") + to_int=$(version_to_int "$TO_VERSION") + if (( from_int >= to_int )); then + die "Target version ($TO_VERSION) must be higher than current version ($FROM_VERSION)" + fi + + build_upgrade_path + get_pg_warnings + + if [[ "$FORMAT" == "json" ]]; then + format_path_json + else + format_path_text + fi +} + +# ── Mode: --check ───────────────────────────────────────────────────────────── + +run_check() { + verbose "Detecting installed GitLab version..." + FROM_VERSION=$(detect_gitlab_version) + + if [[ -z "$FROM_VERSION" ]]; then + die "Could not detect installed GitLab version. Use --path --from VERSION instead." + fi + + log "Detected GitLab version: $FROM_VERSION" + + if [[ -z "$PG_VERSION" ]]; then + verbose "Detecting PostgreSQL version..." 
+ PG_VERSION=$(detect_pg_version) + if [[ -n "$PG_VERSION" ]]; then + log "Detected PostgreSQL version: $PG_VERSION" + fi + fi + + if [[ -z "$TO_VERSION" || "$TO_VERSION" == "latest" ]]; then + TO_VERSION="$LATEST_VERSION" + fi + + validate_version "$FROM_VERSION" + validate_version "$TO_VERSION" + + build_upgrade_path + get_pg_warnings + + if [[ "$FORMAT" == "json" ]]; then + format_path_json + else + format_path_text + fi +} + +# ── Mode: --list-stops ──────────────────────────────────────────────────────── + +run_list_stops() { + print_header + printf " ── All Known Required Upgrade Stops ──────────────────────\n\n" + printf " ${DIM}Version Type Notes PG Required${RESET}\n" + printf " ${DIM}──────────── ──────────── ─────────────────────────────────────── ──────────${RESET}\n" + + local entry ver conditional notes + for entry in "${STOPS[@]}"; do + IFS='|' read -r ver conditional notes <<< "$entry" + + local gl_major req pg_min pg_max pg_range type_label + gl_major=$(version_major "$ver") + req=$(get_pg_req "$gl_major") + IFS='|' read -r pg_min pg_max <<< "$req" + if [[ "$pg_min" != "unknown" ]]; then + pg_range="${pg_min}-${pg_max}" + else + pg_range="—" + fi + + if [[ "$conditional" == "1" ]]; then + type_label="${YELLOW}conditional${RESET} " + else + type_label="${GREEN}required${RESET} " + fi + + printf " %-12s %b %-39s %s\n" "$ver" "$type_label" "$notes" "$pg_range" + done + + printf "\n ── PostgreSQL Version Requirements ───────────────────────\n\n" + printf " ${DIM}GitLab Min PG Max PG${RESET}\n" + printf " ${DIM}───────── ──────── ────────${RESET}\n" + local pg_entry + for pg_entry in "${PG_REQS[@]}"; do + IFS='|' read -r gl_major pg_min pg_max <<< "$pg_entry" + printf " %-9s %-8s %s\n" "${gl_major}.x" "$pg_min" "$pg_max" + done + printf "\n" +} + +# ── Mode: --db-check ────────────────────────────────────────────────────────── + +run_db_check() { + if [[ -z "$PG_VERSION" ]]; then + verbose "Detecting PostgreSQL version..." 
+ PG_VERSION=$(detect_pg_version) + if [[ -z "$PG_VERSION" ]]; then + die "Could not detect PostgreSQL version. Use --pg-version VERSION." + fi + log "Detected PostgreSQL version: $PG_VERSION" + fi + + if [[ -z "$FROM_VERSION" ]]; then + FROM_VERSION=$(detect_gitlab_version) + if [[ -z "$FROM_VERSION" ]]; then + die "Could not detect GitLab version. Use --from VERSION." + fi + log "Detected GitLab version: $FROM_VERSION" + fi + + if [[ -z "$TO_VERSION" || "$TO_VERSION" == "latest" ]]; then + TO_VERSION="$LATEST_VERSION" + fi + + validate_version "$FROM_VERSION" + validate_version "$TO_VERSION" + + build_upgrade_path + get_pg_warnings + + if [[ "$FORMAT" == "json" ]]; then + local pg_json="[" + local first=true + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + local warning + for warning in "${PG_WARNINGS[@]}"; do + IFS='|' read -r msg action gl_major pg_min pg_max <<< "$warning" + [[ "$first" == "true" ]] || pg_json+="," + first=false + pg_json+=$(printf '{"message":"%s","action":"%s","gitlab_major":"%s","pg_min":"%s","pg_max":"%s"}' \ + "$msg" "$action" "$gl_major" "$pg_min" "$pg_max") + done + fi + pg_json+="]" + printf '{"pg_current":"%s","from":"%s","to":"%s","compatible":%s,"warnings":%s}\n' \ + "$PG_VERSION" "$FROM_VERSION" "$TO_VERSION" \ + "$( [[ ${#PG_WARNINGS[@]} -eq 0 ]] && echo "true" || echo "false" )" "$pg_json" + return + fi + + print_header + printf " ${BOLD}PostgreSQL Compatibility Check${RESET}\n\n" + printf " Current GitLab: %s\n" "$FROM_VERSION" + printf " Target GitLab: %s\n" "$TO_VERSION" + printf " Current PostgreSQL: %s\n\n" "$PG_VERSION" + + local from_major to_major + from_major=$(version_major "$FROM_VERSION") + to_major=$(version_major "$TO_VERSION") + + printf " ── Requirements by GitLab Version ────────────────────────\n\n" + printf " ${DIM}GitLab Min PG Max PG Your PG %s Status${RESET}\n" "$PG_VERSION" + printf " ${DIM}───────── ──────── ──────── ────────── ──────────${RESET}\n" + + local gl_major + for (( gl_major = from_major; gl_major <= 
to_major; gl_major++ )); do + local req pg_min pg_max status + req=$(get_pg_req "$gl_major") + IFS='|' read -r pg_min pg_max <<< "$req" + + if (( PG_VERSION < pg_min )); then + status="${RED}✗ Too low${RESET}" + elif (( PG_VERSION > pg_max )); then + status="${YELLOW}⚠ Too high${RESET}" + else + status="${GREEN}✓ OK${RESET}" + fi + printf " %-9s %-8s %-8s %-10s %b\n" "${gl_major}.x" "$pg_min" "$pg_max" "$PG_VERSION" "$status" + done + + if [[ ${#PG_WARNINGS[@]} -gt 0 ]]; then + printf "\n ── Action Required ───────────────────────────────────────\n\n" + local warning + for warning in "${PG_WARNINGS[@]}"; do + IFS='|' read -r msg action gl_major pg_min pg_max <<< "$warning" + printf " ${YELLOW}⚠ %s${RESET}\n" "$msg" + printf " → %s\n\n" "$action" + done + else + printf "\n ${GREEN}✓ PostgreSQL %s is compatible with the full upgrade path.${RESET}\n\n" "$PG_VERSION" + fi +} + +# ── Main ────────────────────────────────────────────────────────────────────── + +main() { + setup_colors + parse_args "$@" + setup_colors + + case "$RUN_MODE" in + path) run_path ;; + check) run_check ;; + list-stops) run_list_stops ;; + db-check) run_db_check ;; + *) die "Unknown mode: $RUN_MODE" ;; + esac +} + +main "$@" diff --git a/gitlab-upgrade.sh b/gitlab-upgrade.sh index e0ac0ff..d6620d0 100644 --- a/gitlab-upgrade.sh +++ b/gitlab-upgrade.sh @@ -7,7 +7,7 @@ #### #### #### Author: Phil Connor #### #### Contact: contact@mylinux.work #### -#### Version: 1.00-030526 #### +#### Version: 1.01-051326 #### ################################################ set -o pipefail @@ -15,7 +15,8 @@ set -o pipefail SCRIPT_NAME=$(basename "$0") readonly SCRIPT_NAME -# Required version stops (as of 2026) +# Required version stops (as of May 2026) +# Source: https://docs.gitlab.com/update/upgrade_paths/ readonly VERSION_STOPS=( "14.0.12" "14.3.6" @@ -24,15 +25,19 @@ readonly VERSION_STOPS=( "15.0.5" "15.4.6" "15.11.13" - "16.0.9" - "16.3.8" - "16.7.9" + "16.0.10" + "16.3.9" + "16.7.10" "16.11.10" - "17.0.8" 
"17.3.7" "17.5.5" "17.8.7" - "18.0.1" + "17.11.7" + "18.0.2" + "18.2.6" + "18.5.2" + "18.8.7" + "18.11.0" ) # Default configuration diff --git a/gitops-bootstrap.sh b/gitops-bootstrap.sh new file mode 100644 index 0000000..8bfb40f --- /dev/null +++ b/gitops-bootstrap.sh @@ -0,0 +1,652 @@ +#!/usr/bin/env bash +######################################################################################### +#### gitops-bootstrap.sh — Bootstrap GitOps on Kubernetes with Flux or ArgoCD #### +#### Install, configure git source, sync applications, and validate deployments #### +#### Requires: bash 4+, kubectl, git, flux CLI or argocd CLI #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./gitops-bootstrap.sh --install flux --repo git@github.com:org/infra.git #### +#### #### +#### See --help for all options. #### +######################################################################################### +set -euo pipefail + +VERSION="1.00" + +# --- ANSI color variables (pre-initialized) --- +RED="" +GREEN="" +YELLOW="" +BLUE="" +CYAN="" +BOLD="" +DIM="" +RESET="" + +# --- Defaults --- +RUN_MODE="" +GITOPS_TOOL="${GITOPS_TOOL:-flux}" +GIT_REPO="${GITOPS_REPO:-}" +GIT_BRANCH="${GITOPS_BRANCH:-main}" +GIT_PATH="${GITOPS_PATH:-./clusters/default}" +NAMESPACE="${GITOPS_NAMESPACE:-}" +KUBECONFIG_FILE="${KUBECONFIG:-}" +KUBE_CTX="${KUBE_CONTEXT:-}" +CONFIRM_YES=false +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# --- State --- +readonly SCRIPT_NAME="${0##*/}" +START_TIME=$(date +%s) + +# --- Source name used for flux commands --- +SOURCE_NAME="main" +KUSTOMIZATION_NAME="default" +APP_NAME="" + +# --- Color setup --- +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + 
YELLOW='\033[1;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# --- Logging --- +log() { printf "%b\n" "${GREEN}✔${RESET} $*"; } +warn() { printf "%b\n" "${YELLOW}⚠${RESET} $*" >&2; } +err() { printf "%b\n" "${RED}✖${RESET} $*" >&2; } +verbose() { [[ "$VERBOSE" == "true" ]] && printf "%b\n" "${DIM}▸ $*${RESET}" >&2; return 0; } +die() { err "$*"; exit 1; } + +section_header() { + printf "\n%b━━━ %s ━━━%b\n" "${BOLD}${BLUE}" "$1" "${RESET}" +} + +field() { + printf " %-22s %s\n" "$1:" "$2" +} + +field_color() { + printf " %-22s %b%s%b\n" "$1:" "$2" "$3" "${RESET}" +} + +# --- Resolve namespace default based on tool --- +resolve_namespace() { + if [[ -z "$NAMESPACE" ]]; then + if [[ "$GITOPS_TOOL" == "argocd" ]]; then + NAMESPACE="argocd" + else + NAMESPACE="flux-system" + fi + fi +} + +# --- kubectl wrapper --- +kubectl_cmd() { + local -a args=("kubectl") + [[ -n "$KUBECONFIG_FILE" ]] && args+=("--kubeconfig" "$KUBECONFIG_FILE") + [[ -n "$KUBE_CTX" ]] && args+=("--context" "$KUBE_CTX") + "${args[@]}" "$@" +} + +# --- Dependency checks --- +require_kubectl() { + command -v kubectl >/dev/null 2>&1 || die "kubectl is required but not found in PATH" +} + +require_flux() { + command -v flux >/dev/null 2>&1 || die "flux CLI is required but not found in PATH" +} + +require_argocd() { + command -v argocd >/dev/null 2>&1 || die "argocd CLI is required but not found in PATH" +} + +require_git() { + command -v git >/dev/null 2>&1 || die "git is required but not found in PATH" +} + +# --- Confirm prompt --- +confirm_action() { + local prompt="${1:-Continue?}" + if [[ "$CONFIRM_YES" == "true" ]]; then + return 0 + fi + printf "%s [y/N] " "$prompt" + read -r answer + [[ "$answer" =~ ^[Yy]$ ]] || die "Aborted" +} + +# --- Wait for pods ready in namespace --- +wait_for_pods() { + local ns="$1" + local timeout="${2:-120}" + log "Waiting for pods in namespace ${CYAN}${ns}${RESET} (timeout ${timeout}s)" + + local 
deadline=$(($(date +%s) + timeout)) + while true; do + local not_ready + not_ready=$(kubectl_cmd get pods -n "$ns" --no-headers 2>/dev/null \ + | grep -cvE 'Running|Completed|Succeeded' || true) + if [[ "$not_ready" -eq 0 ]]; then + local total + total=$(kubectl_cmd get pods -n "$ns" --no-headers 2>/dev/null | wc -l) + if [[ "$total" -gt 0 ]]; then + log "All ${total} pod(s) ready in ${ns}" + return 0 + fi + fi + if [[ $(date +%s) -ge $deadline ]]; then + warn "Timeout waiting for pods in ${ns}" + kubectl_cmd get pods -n "$ns" --no-headers 2>/dev/null || true + return 1 + fi + sleep 5 + done +} + +# ───────────────────────────────────────────────────────────────────── +# Install +# ───────────────────────────────────────────────────────────────────── + +do_install_flux() { + section_header "Installing Flux" + require_kubectl + require_flux + + log "Running pre-flight checks" + verbose "flux check --pre" + flux check --pre || die "Flux pre-flight checks failed" + + log "Installing Flux components into namespace ${CYAN}${NAMESPACE}${RESET}" + verbose "flux install --namespace=${NAMESPACE}" + flux install --namespace="$NAMESPACE" + + wait_for_pods "$NAMESPACE" + + if [[ -n "$GIT_REPO" ]]; then + log "Configuring GitRepository source" + verbose "flux create source git ${SOURCE_NAME} --url=${GIT_REPO} --branch=${GIT_BRANCH} --namespace=${NAMESPACE}" + flux create source git "$SOURCE_NAME" \ + --url="$GIT_REPO" \ + --branch="$GIT_BRANCH" \ + --namespace="$NAMESPACE" + + log "Creating Kustomization" + verbose "flux create kustomization ${KUSTOMIZATION_NAME} --source=${SOURCE_NAME} --path=${GIT_PATH} --namespace=${NAMESPACE} --prune=true" + flux create kustomization "$KUSTOMIZATION_NAME" \ + --source="$SOURCE_NAME" \ + --path="$GIT_PATH" \ + --namespace="$NAMESPACE" \ + --prune=true + fi + + section_header "Flux Installation Summary" + field "Namespace" "$NAMESPACE" + field "Git repository" "${GIT_REPO:-not configured}" + field "Branch" "$GIT_BRANCH" + field "Path" 
"$GIT_PATH" + log "Flux installation complete" +} + +do_install_argocd() { + section_header "Installing Argo CD" + require_kubectl + + local manifests_url="https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml" + + log "Creating namespace ${CYAN}${NAMESPACE}${RESET}" + kubectl_cmd create namespace "$NAMESPACE" --dry-run=client -o yaml \ + | kubectl_cmd apply -f - + + log "Applying Argo CD manifests" + verbose "kubectl apply -n ${NAMESPACE} -f ${manifests_url}" + kubectl_cmd apply -n "$NAMESPACE" -f "$manifests_url" + + wait_for_pods "$NAMESPACE" + + section_header "Argo CD Installation Summary" + field "Namespace" "$NAMESPACE" + field "Manifests" "$manifests_url" + printf "\n" + log "Retrieve initial admin password:" + printf " %bkubectl -n %s get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d%b\n" \ + "${CYAN}" "$NAMESPACE" "${RESET}" + log "Argo CD installation complete" +} + +do_install() { + case "$GITOPS_TOOL" in + flux) do_install_flux ;; + argocd) do_install_argocd ;; + *) die "Unknown GitOps tool: ${GITOPS_TOOL}. Use 'flux' or 'argocd'." 
;; + esac +} + +# ───────────────────────────────────────────────────────────────────── +# Status +# ───────────────────────────────────────────────────────────────────── + +do_status_flux() { + section_header "Flux Status" + require_kubectl + require_flux + + log "Sources" + flux get sources all --namespace="$NAMESPACE" 2>/dev/null || warn "No sources found" + + printf "\n" + log "Kustomizations" + flux get kustomizations --namespace="$NAMESPACE" 2>/dev/null || warn "No kustomizations found" + + printf "\n" + log "Helm releases" + flux get helmreleases --all-namespaces 2>/dev/null || verbose "No helm releases found" +} + +do_status_argocd() { + section_header "Argo CD Status" + require_kubectl + + log "Applications" + kubectl_cmd get applications -n "$NAMESPACE" -o wide 2>/dev/null || warn "No applications found" + + printf "\n" + log "App Projects" + kubectl_cmd get appprojects -n "$NAMESPACE" -o wide 2>/dev/null || verbose "No app projects found" +} + +do_status() { + case "$GITOPS_TOOL" in + flux) do_status_flux ;; + argocd) do_status_argocd ;; + *) die "Unknown GitOps tool: ${GITOPS_TOOL}" ;; + esac + + section_header "Pod Status (${NAMESPACE})" + kubectl_cmd get pods -n "$NAMESPACE" -o wide 2>/dev/null || warn "No pods found" +} + +# ───────────────────────────────────────────────────────────────────── +# Add Source +# ───────────────────────────────────────────────────────────────────── + +do_add_source_flux() { + section_header "Adding Git Source (Flux)" + require_flux + + [[ -z "$GIT_REPO" ]] && die "--repo is required to add a source" + + log "Creating GitRepository source ${CYAN}${SOURCE_NAME}${RESET}" + verbose "flux create source git ${SOURCE_NAME} --url=${GIT_REPO} --branch=${GIT_BRANCH} --namespace=${NAMESPACE}" + flux create source git "$SOURCE_NAME" \ + --url="$GIT_REPO" \ + --branch="$GIT_BRANCH" \ + --namespace="$NAMESPACE" + + log "Source added successfully" + flux get sources git --namespace="$NAMESPACE" +} + +do_add_source_argocd() { + 
section_header "Adding Git Source (Argo CD)" + require_argocd + + [[ -z "$GIT_REPO" ]] && die "--repo is required to add a source" + + log "Adding repository ${CYAN}${GIT_REPO}${RESET}" + verbose "argocd repo add ${GIT_REPO}" + argocd repo add "$GIT_REPO" || die "Failed to add repository" + + log "Repository added successfully" +} + +do_add_source() { + case "$GITOPS_TOOL" in + flux) do_add_source_flux ;; + argocd) do_add_source_argocd ;; + *) die "Unknown GitOps tool: ${GITOPS_TOOL}" ;; + esac +} + +# ───────────────────────────────────────────────────────────────────── +# Sync / Reconcile +# ───────────────────────────────────────────────────────────────────── + +do_sync_flux() { + section_header "Reconciling (Flux)" + require_flux + + log "Reconciling source git/${SOURCE_NAME}" + verbose "flux reconcile source git ${SOURCE_NAME} --namespace=${NAMESPACE}" + flux reconcile source git "$SOURCE_NAME" --namespace="$NAMESPACE" + + log "Reconciling kustomization ${KUSTOMIZATION_NAME}" + verbose "flux reconcile kustomization ${KUSTOMIZATION_NAME} --namespace=${NAMESPACE}" + flux reconcile kustomization "$KUSTOMIZATION_NAME" --namespace="$NAMESPACE" + + log "Reconciliation triggered" +} + +do_sync_argocd() { + section_header "Syncing (Argo CD)" + require_argocd + + if [[ -n "$APP_NAME" ]]; then + log "Syncing application ${CYAN}${APP_NAME}${RESET}" + verbose "argocd app sync ${APP_NAME}" + argocd app sync "$APP_NAME" + else + log "Syncing all applications" + local apps + apps=$(kubectl_cmd get applications -n "$NAMESPACE" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true) + if [[ -z "$apps" ]]; then + warn "No applications found to sync" + return + fi + while IFS= read -r app; do + [[ -z "$app" ]] && continue + log "Syncing ${app}" + argocd app sync "$app" || warn "Failed to sync ${app}" + done <<< "$apps" + fi + + log "Sync triggered" +} + +do_sync() { + case "$GITOPS_TOOL" in + flux) do_sync_flux ;; + argocd) do_sync_argocd ;; + *) die 
"Unknown GitOps tool: ${GITOPS_TOOL}" ;; + esac +} + +# ───────────────────────────────────────────────────────────────────── +# Validate (pre-flight) +# ───────────────────────────────────────────────────────────────────── + +do_validate() { + section_header "Pre-flight Validation" + require_kubectl + + local checks_passed=0 + local checks_failed=0 + + # Check kubectl connectivity + log "Checking kubectl connectivity" + if kubectl_cmd cluster-info >/dev/null 2>&1; then + field_color "Cluster access" "${GREEN}" "OK" + checks_passed=$((checks_passed + 1)) + else + field_color "Cluster access" "${RED}" "FAILED" + checks_failed=$((checks_failed + 1)) + fi + + # Check namespace exists + log "Checking namespace ${CYAN}${NAMESPACE}${RESET}" + if kubectl_cmd get namespace "$NAMESPACE" >/dev/null 2>&1; then + field_color "Namespace" "${GREEN}" "${NAMESPACE} exists" + checks_passed=$((checks_passed + 1)) + else + field_color "Namespace" "${YELLOW}" "${NAMESPACE} does not exist (will be created)" + checks_passed=$((checks_passed + 1)) + fi + + # Check CRDs installed + log "Checking CRDs" + if [[ "$GITOPS_TOOL" == "flux" ]]; then + if kubectl_cmd get crd gitrepositories.source.toolkit.fluxcd.io >/dev/null 2>&1; then + field_color "Flux CRDs" "${GREEN}" "installed" + checks_passed=$((checks_passed + 1)) + else + field_color "Flux CRDs" "${YELLOW}" "not installed" + checks_passed=$((checks_passed + 1)) + fi + elif [[ "$GITOPS_TOOL" == "argocd" ]]; then + if kubectl_cmd get crd applications.argoproj.io >/dev/null 2>&1; then + field_color "ArgoCD CRDs" "${GREEN}" "installed" + checks_passed=$((checks_passed + 1)) + else + field_color "ArgoCD CRDs" "${YELLOW}" "not installed" + checks_passed=$((checks_passed + 1)) + fi + fi + + # Check git repo accessible + if [[ -n "$GIT_REPO" ]]; then + log "Checking git repository accessibility" + require_git + if git ls-remote "$GIT_REPO" HEAD >/dev/null 2>&1; then + field_color "Git repository" "${GREEN}" "accessible" + 
checks_passed=$((checks_passed + 1)) + else + field_color "Git repository" "${RED}" "not accessible" + checks_failed=$((checks_failed + 1)) + fi + else + verbose "No git repository specified, skipping connectivity check" + fi + + # Check tool CLI available + log "Checking CLI tools" + if [[ "$GITOPS_TOOL" == "flux" ]]; then + if command -v flux >/dev/null 2>&1; then + local flux_ver + flux_ver=$(flux version --client 2>/dev/null | head -1 || echo "unknown") + field_color "flux CLI" "${GREEN}" "$flux_ver" + checks_passed=$((checks_passed + 1)) + else + field_color "flux CLI" "${RED}" "not found" + checks_failed=$((checks_failed + 1)) + fi + elif [[ "$GITOPS_TOOL" == "argocd" ]]; then + if command -v argocd >/dev/null 2>&1; then + local argocd_ver + argocd_ver=$(argocd version --client --short 2>/dev/null || echo "unknown") + field_color "argocd CLI" "${GREEN}" "$argocd_ver" + checks_passed=$((checks_passed + 1)) + else + field_color "argocd CLI" "${RED}" "not found" + checks_failed=$((checks_failed + 1)) + fi + fi + + section_header "Validation Summary" + field_color "Passed" "${GREEN}" "$checks_passed" + if [[ "$checks_failed" -gt 0 ]]; then + field_color "Failed" "${RED}" "$checks_failed" + die "Validation failed with ${checks_failed} error(s)" + else + field_color "Failed" "${GREEN}" "0" + log "All pre-flight checks passed" + fi +} + +# ───────────────────────────────────────────────────────────────────── +# Teardown +# ───────────────────────────────────────────────────────────────────── + +do_teardown_flux() { + section_header "Tearing Down Flux" + require_flux + + confirm_action "Remove Flux from the cluster?" 
+ + log "Uninstalling Flux" + verbose "flux uninstall --namespace=${NAMESPACE} --silent" + flux uninstall --namespace="$NAMESPACE" --silent + + log "Flux has been removed from the cluster" +} + +do_teardown_argocd() { + section_header "Tearing Down Argo CD" + require_kubectl + + confirm_action "Remove Argo CD from the cluster (delete namespace ${NAMESPACE})?" + + log "Deleting namespace ${CYAN}${NAMESPACE}${RESET}" + kubectl_cmd delete namespace "$NAMESPACE" --wait=true + + log "Argo CD has been removed from the cluster" +} + +do_teardown() { + case "$GITOPS_TOOL" in + flux) do_teardown_flux ;; + argocd) do_teardown_argocd ;; + *) die "Unknown GitOps tool: ${GITOPS_TOOL}" ;; + esac +} + +# ───────────────────────────────────────────────────────────────────── +# Help +# ───────────────────────────────────────────────────────────────────── + +show_help() { + cat </dev/null; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "# ERROR: Missing required commands: ${missing[*]}" >&2 + echo "# Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2 + exit 1 + fi +} + +validate_config() { + if [[ -z "$GLPI_URL" ]]; then + echo "# ERROR: GLPI_URL environment variable is required" >&2 + exit 1 + fi + if [[ -z "$GLPI_USER_TOKEN" ]]; then + echo "# ERROR: GLPI_USER_TOKEN environment variable is required" >&2 + exit 1 + fi + GLPI_URL="${GLPI_URL%/}" +} + +init_session() { + local auth_headers=(-H "Authorization: user_token ${GLPI_USER_TOKEN}") + if [[ -n "$GLPI_APP_TOKEN" ]]; then + auth_headers+=(-H "App-Token: ${GLPI_APP_TOKEN}") + fi + + local response + response=$(curl -sf --max-time "$CURL_TIMEOUT" \ + "${auth_headers[@]}" \ + "${GLPI_URL}/apirest.php/initSession" 2>/dev/null) || { echo ""; return 1; } + + SESSION_TOKEN=$(echo "$response" | jq -r '.session_token // empty' 2>/dev/null) + + if [[ -z "$SESSION_TOKEN" ]]; then + return 1 + fi + + return 0 +} + +kill_session() { + if [[ -n "$SESSION_TOKEN" ]]; then + local 
headers=(-H "Session-Token: ${SESSION_TOKEN}") + if [[ -n "$GLPI_APP_TOKEN" ]]; then + headers+=(-H "App-Token: ${GLPI_APP_TOKEN}") + fi + curl -sf --max-time "$CURL_TIMEOUT" \ + "${headers[@]}" \ + "${GLPI_URL}/apirest.php/killSession" &>/dev/null || true + SESSION_TOKEN="" + fi +} + +api_get() { + local endpoint="$1" + local headers=(-H "Session-Token: ${SESSION_TOKEN}" -H "Content-Type: application/json") + if [[ -n "$GLPI_APP_TOKEN" ]]; then + headers+=(-H "App-Token: ${GLPI_APP_TOKEN}") + fi + + curl -sf --max-time "$CURL_TIMEOUT" \ + "${headers[@]}" \ + "${GLPI_URL}/apirest.php/${endpoint}" 2>/dev/null || echo "" +} + +api_get_count() { + local endpoint="$1" + local range_header + range_header=$(curl -sI --max-time "$CURL_TIMEOUT" \ + -H "Session-Token: ${SESSION_TOKEN}" \ + -H "Content-Type: application/json" \ + ${GLPI_APP_TOKEN:+-H "App-Token: ${GLPI_APP_TOKEN}"} \ + "${GLPI_URL}/apirest.php/${endpoint}?range=0-0" 2>/dev/null | grep -i '^Content-Range:' | tr -d '\r') + + if [[ -n "$range_header" ]]; then + echo "$range_header" | sed 's|.*/||' + else + echo "0" + fi +} + +sanitize_label() { + local value="$1" + echo "$value" | sed 's/[^a-zA-Z0-9_ \/.-]/_/g' | sed 's/"/\\"/g' +} + +add_metric() { + local name="$1" + local type="$2" + local help="$3" + local value="$4" + local labels="${5:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name}{${labels}} ${value} +" + else + OUTPUT+="# HELP ${name} ${help} +# TYPE ${name} ${type} +${name} ${value} +" + fi +} + +add_metric_value() { + local name="$1" + local value="$2" + local labels="${3:-}" + + if [[ -n "$labels" ]]; then + OUTPUT+="${name}{${labels}} ${value} +" + else + OUTPUT+="${name} ${value} +" + fi +} + +# --- Collectors --- + +collect_tickets() { + # Total tickets (open = not closed) + local total + total=$(api_get_count "Ticket") + add_metric "glpi_tickets_total" "gauge" "Total number of tickets" "${total:-0}" + + # Tickets by status + # GLPI status 
codes: 1=New, 2=Assigned, 3=Planned, 4=Waiting, 5=Solved, 6=Closed + local status_names=("new" "assigned" "planned" "waiting" "solved" "closed") + local status_codes=(1 2 3 4 5 6) + + for i in "${!status_codes[@]}"; do + local code="${status_codes[$i]}" + local name="${status_names[$i]}" + local count + count=$(api_get_count "Ticket?searchText[status]=${code}") + add_metric "glpi_tickets_${name}" "gauge" "Tickets in ${name} status" "${count:-0}" + done + + # Tickets by urgency + # GLPI urgency: 1=Very low, 2=Low, 3=Medium, 4=High, 5=Very high + OUTPUT+="# HELP glpi_tickets_by_urgency Number of tickets by urgency level +# TYPE glpi_tickets_by_urgency gauge +" + local urgency_names=("very_low" "low" "medium" "high" "very_high") + local urgency_codes=(1 2 3 4 5) + + for i in "${!urgency_codes[@]}"; do + local code="${urgency_codes[$i]}" + local uname="${urgency_names[$i]}" + local count + count=$(api_get_count "Ticket?searchText[urgency]=${code}") + add_metric_value "glpi_tickets_by_urgency" "${count:-0}" "urgency=\"${uname}\"" + done + + # Tickets by category (top categories) + local cat_json + cat_json=$(api_get "ITILCategory?range=0-49") + + if [[ -n "$cat_json" ]]; then + local is_array + is_array=$(echo "$cat_json" | jq -r 'if type == "array" then "yes" else "no" end' 2>/dev/null) + + if [[ "$is_array" == "yes" ]]; then + local cat_count + cat_count=$(echo "$cat_json" | jq 'length' 2>/dev/null) + + if [[ "$cat_count" -gt 0 ]]; then + OUTPUT+="# HELP glpi_tickets_by_category Number of tickets per category +# TYPE glpi_tickets_by_category gauge +" + local j + for ((j = 0; j < cat_count && j < 30; j++)); do + local cat_name cat_id + cat_name=$(echo "$cat_json" | jq -r ".[$j].completename // .[$j].name // empty" 2>/dev/null) + cat_id=$(echo "$cat_json" | jq -r ".[$j].id // empty" 2>/dev/null) + + if [[ -n "$cat_name" && -n "$cat_id" ]]; then + local ticket_count + ticket_count=$(api_get_count "Ticket?searchText[itilcategories_id]=${cat_id}") + local safe_name + 
safe_name=$(sanitize_label "$cat_name") + add_metric_value "glpi_tickets_by_category" "${ticket_count:-0}" "category=\"${safe_name}\"" + fi + done + fi + fi + fi +} + +collect_assets() { + # Computers + local computers + computers=$(api_get_count "Computer") + add_metric "glpi_computers_total" "gauge" "Total number of computers" "${computers:-0}" + + # Monitors + local monitors + monitors=$(api_get_count "Monitor") + add_metric "glpi_monitors_total" "gauge" "Total number of monitors" "${monitors:-0}" + + # Network devices + local netdevices + netdevices=$(api_get_count "NetworkEquipment") + add_metric "glpi_network_devices_total" "gauge" "Total number of network devices" "${netdevices:-0}" + + # Phones + local phones + phones=$(api_get_count "Phone") + add_metric "glpi_phones_total" "gauge" "Total number of phones" "${phones:-0}" + + # Printers + local printers + printers=$(api_get_count "Printer") + add_metric "glpi_printers_total" "gauge" "Total number of printers" "${printers:-0}" + + # Software + local software + software=$(api_get_count "Software") + add_metric "glpi_software_total" "gauge" "Total number of software entries" "${software:-0}" +} + +collect_organization() { + # Users + local users + users=$(api_get_count "User") + add_metric "glpi_users_total" "gauge" "Total number of users" "${users:-0}" + + # Groups + local groups + groups=$(api_get_count "Group") + add_metric "glpi_groups_total" "gauge" "Total number of groups" "${groups:-0}" + + # Entities + local entities + entities=$(api_get_count "Entity") + add_metric "glpi_entities_total" "gauge" "Total number of entities" "${entities:-0}" + + # Locations + local locations + locations=$(api_get_count "Location") + add_metric "glpi_locations_total" "gauge" "Total number of locations" "${locations:-0}" +} + +# --- Output --- + +write_output() { + if [[ "$TEXTFILE_MODE" == true ]]; then + local output_file="${TEXTFILE_DIR}/glpi.prom" + local temp_file="${output_file}.$$" + + mkdir -p "$TEXTFILE_DIR" + echo 
"$OUTPUT" > "$temp_file" + mv "$temp_file" "$output_file" + echo "# Wrote metrics to ${output_file}" >&2 + else + echo "$OUTPUT" + fi +} + +serve_http() { + if ! command -v nc &>/dev/null && ! command -v ncat &>/dev/null; then + echo "# ERROR: nc (netcat) or ncat required for HTTP mode" >&2 + exit 1 + fi + + echo "# GLPI exporter listening on port ${HTTP_PORT}" >&2 + echo "# Metrics endpoint: http://localhost:${HTTP_PORT}/metrics" >&2 + + local nc_cmd="nc" + if command -v ncat &>/dev/null; then + nc_cmd="ncat" + fi + + while true; do + OUTPUT="" + START_TIME=$(date +%s%N) + + add_metric "glpi_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" + + if init_session; then + add_metric "glpi_up" "gauge" "GLPI API reachability (1=up, 0=down)" "1" + collect_tickets + collect_assets + collect_organization + kill_session + else + add_metric "glpi_up" "gauge" "GLPI API reachability (1=up, 0=down)" "0" + fi + + local end_time duration + end_time=$(date +%s%N) + duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") + add_metric "glpi_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration" + add_metric "glpi_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" + + local content_length=${#OUTPUT} + local response="HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\nContent-Length: ${content_length}\r\nConnection: close\r\n\r\n${OUTPUT}" + + echo -e "$response" | $nc_cmd -l -p "$HTTP_PORT" -q 1 2>/dev/null || \ + echo -e "$response" | $nc_cmd -l "$HTTP_PORT" -c 2>/dev/null || \ + echo -e "$response" | $nc_cmd -l -p "$HTTP_PORT" 2>/dev/null || true + done +} + +install_cron() { + if [[ $EUID -ne 0 ]]; then + echo "# ERROR: --install requires root" >&2 + exit 1 + fi + + local script_path + script_path=$(readlink -f "$0") + + cat > /etc/cron.d/glpi-exporter </dev/null +EOF + + chmod 644 /etc/cron.d/glpi-exporter + echo "# 
Installed cron job: /etc/cron.d/glpi-exporter" >&2 + echo "# Metrics will be written to: ${TEXTFILE_DIR}/glpi.prom" >&2 +} + +# --- Main --- + +main() { + for arg in "$@"; do + case "$arg" in + --textfile) TEXTFILE_MODE=true ;; + --http) HTTP_MODE=true ;; + -p|--port) shift; HTTP_PORT="${1:-$HTTP_PORT}" ;; + --install) + check_dependencies + validate_config + install_cron + exit 0 + ;; + --help|-h) usage ;; + *) ;; + esac + done + + check_dependencies + validate_config + + if [[ "$HTTP_MODE" == true ]]; then + serve_http + exit 0 + fi + + START_TIME=$(date +%s%N) + + add_metric "glpi_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\"" + + if init_session; then + add_metric "glpi_up" "gauge" "GLPI API reachability (1=up, 0=down)" "1" + collect_tickets + collect_assets + collect_organization + kill_session + else + add_metric "glpi_up" "gauge" "GLPI API reachability (1=up, 0=down)" "0" + fi + + local end_time duration + end_time=$(date +%s%N) + duration=$(echo "scale=2; ($end_time - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0") + add_metric "glpi_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration" + add_metric "glpi_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)" + + write_output +} + +main "$@" diff --git a/gpu-exporter.sh b/gpu-exporter.sh new file mode 100755 index 0000000..411e4a0 --- /dev/null +++ b/gpu-exporter.sh @@ -0,0 +1,440 @@ +#!/bin/bash +################################################################################ +# Script Name: gpu-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for NVIDIA GPU metrics — temperature, +# utilization, VRAM usage, power draw, fan speed, clock speeds, +# performance state, and per-process GPU memory via nvidia-smi +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - NVIDIA GPU with drivers installed +# - nvidia-smi 
available in PATH +# - netcat (nc) for HTTP mode +# +# Usage: +# ./gpu-exporter.sh # stdout +# ./gpu-exporter.sh --http -p 9195 # HTTP server +# ./gpu-exporter.sh --textfile # node_exporter textfile +# +# Metrics Exported: +# - gpu_info{gpu,name,driver_version,cuda_version} - GPU info +# - gpu_count - Number of GPUs detected +# - gpu_temperature_celsius{gpu} - Temperature +# - gpu_utilization_percent{gpu} - GPU utilization +# - gpu_memory_utilization_percent{gpu} - Memory utilization +# - gpu_memory_used_bytes{gpu} - VRAM used +# - gpu_memory_total_bytes{gpu} - Total VRAM +# - gpu_memory_free_bytes{gpu} - Free VRAM +# - gpu_power_draw_watts{gpu} - Power draw +# - gpu_power_limit_watts{gpu} - Power limit +# - gpu_fan_speed_percent{gpu} - Fan speed +# - gpu_clock_speed_mhz{gpu} - GPU clock +# - gpu_memory_clock_speed_mhz{gpu} - Memory clock +# - gpu_pstate{gpu} - Performance state +# - gpu_process_memory_bytes{gpu,pid,process_name} - Per-process memory +# - gpu_exporter_duration_seconds - Script execution time +# - gpu_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9195 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9195 + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Escape special characters in Prometheus label values +# Args: $1 - string to escape +# Returns: escaped string safe for Prometheus labels +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" 
+ val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Check nvidia-smi exists + if ! command -v nvidia-smi >/dev/null 2>&1; then + cat </dev/null | head -1) + gpu_count=${gpu_count:-0} + + # Strip whitespace + gpu_count=$(echo "$gpu_count" | tr -d '[:space:]') + + if [ "$gpu_count" -eq 0 ] 2>/dev/null; then + cat </dev/null | head -1) + driver_version=$(echo "$driver_version" | tr -d '[:space:]') + driver_version=${driver_version:-"unknown"} + + cuda_version=$(nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null | head -1) + cuda_version=$(echo "$cuda_version" | tr -d '[:space:]') + + # Fallback: parse from nvidia-smi header if query fails + if [ -z "$cuda_version" ] || [ "$cuda_version" = "[N/A]" ]; then + cuda_version=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version:\s*\K[0-9.]+' || echo "unknown") + fi + + cat </dev/null) + + if [ -n "$info_lines" ]; then + while IFS= read -r info_line; do + [ -z "$info_line" ] && continue + local g_idx g_name + g_idx=$(echo "$info_line" | cut -d',' -f1 | tr -d '[:space:]') + g_name=$(echo "$info_line" | cut -d',' -f2- | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + echo "gpu_info{gpu=\"$g_idx\",name=\"$(prom_escape "$g_name")\",driver_version=\"$(prom_escape "$driver_version")\",cuda_version=\"$(prom_escape "$cuda_version")\"} 1" + done <<< "$info_lines" + fi + + echo "" + + # ======================================================================== + # OUTPUT PER-GPU METRICS (with HELP/TYPE headers) + # ======================================================================== + + # Helper: emit a metric block for all GPUs + # Args: $1=metric_name, 
$2=help_text, $3=query_field + emit_gpu_metric() { + local metric="$1" help="$2" query="$3" + echo "# HELP $metric $help" + echo "# TYPE $metric gauge" + local lines + lines=$(nvidia-smi --query-gpu=index,"$query" --format=csv,noheader,nounits 2>/dev/null) + while IFS=', ' read -r g_idx g_val; do + g_idx=$(echo "$g_idx" | tr -d '[:space:]') + g_val=$(echo "$g_val" | tr -d '[:space:]') + [[ "$g_val" == "[N/A]" ]] && g_val=0 + echo "${metric}{gpu=\"$g_idx\"} $g_val" + done <<< "$lines" + echo "" + } + + # Helper: emit a memory metric (MiB → bytes) for all GPUs + # Args: $1=metric_name, $2=help_text, $3=query_field + emit_gpu_mem_metric() { + local metric="$1" help="$2" query="$3" + echo "# HELP $metric $help" + echo "# TYPE $metric gauge" + local lines + lines=$(nvidia-smi --query-gpu=index,"$query" --format=csv,noheader,nounits 2>/dev/null) + while IFS=', ' read -r g_idx g_val; do + g_idx=$(echo "$g_idx" | tr -d '[:space:]') + g_val=$(echo "$g_val" | tr -d '[:space:]') + [[ "$g_val" == "[N/A]" ]] && g_val=0 + local bytes + bytes=$(awk "BEGIN { printf \"%.0f\", $g_val * 1048576 }") + echo "${metric}{gpu=\"$g_idx\"} $bytes" + done <<< "$lines" + echo "" + } + + emit_gpu_metric "gpu_temperature_celsius" "GPU temperature in degrees Celsius" "temperature.gpu" + emit_gpu_metric "gpu_utilization_percent" "GPU core utilization percentage" "utilization.gpu" + emit_gpu_metric "gpu_memory_utilization_percent" "GPU memory utilization percentage" "utilization.memory" + emit_gpu_mem_metric "gpu_memory_used_bytes" "GPU memory used in bytes" "memory.used" + emit_gpu_mem_metric "gpu_memory_total_bytes" "GPU total memory in bytes" "memory.total" + emit_gpu_mem_metric "gpu_memory_free_bytes" "GPU free memory in bytes" "memory.free" + emit_gpu_metric "gpu_power_draw_watts" "GPU power draw in watts" "power.draw" + emit_gpu_metric "gpu_power_limit_watts" "GPU power limit in watts" "power.limit" + emit_gpu_metric "gpu_fan_speed_percent" "GPU fan speed percentage" "fan.speed" + 
emit_gpu_metric "gpu_clock_speed_mhz" "GPU graphics clock speed in MHz" "clocks.current.graphics" + emit_gpu_metric "gpu_memory_clock_speed_mhz" "GPU memory clock speed in MHz" "clocks.current.memory" + + # Performance state needs special handling (P0 → 0, P8 → 8, etc.) + echo "# HELP gpu_pstate GPU performance state (0=max, 12=min)" + echo "# TYPE gpu_pstate gauge" + local pstate_lines + pstate_lines=$(nvidia-smi --query-gpu=index,pstate --format=csv,noheader,nounits 2>/dev/null) + while IFS=', ' read -r g_idx g_pstate; do + g_idx=$(echo "$g_idx" | tr -d '[:space:]') + g_pstate=$(echo "$g_pstate" | tr -d '[:space:]') + local pnum=0 + if [[ "$g_pstate" =~ ^P([0-9]+)$ ]]; then + pnum="${BASH_REMATCH[1]}" + fi + echo "gpu_pstate{gpu=\"$g_idx\"} $pnum" + done <<< "$pstate_lines" + + echo "" + + # ======================================================================== + # PER-PROCESS GPU MEMORY + # ======================================================================== + + # Build UUID-to-index mapping + declare -A uuid_to_index + local uuid_lines + uuid_lines=$(nvidia-smi --query-gpu=index,uuid --format=csv,noheader 2>/dev/null) + + if [ -n "$uuid_lines" ]; then + while IFS=', ' read -r g_idx g_uuid; do + g_idx=$(echo "$g_idx" | tr -d '[:space:]') + g_uuid=$(echo "$g_uuid" | tr -d '[:space:]') + uuid_to_index["$g_uuid"]="$g_idx" + done <<< "$uuid_lines" + fi + + cat </dev/null) + + if [ -n "$process_lines" ]; then + while IFS= read -r proc_line; do + [ -z "$proc_line" ] && continue + + # Parse: uuid, pid, process_name, used_memory_mib + local proc_uuid proc_pid proc_name proc_mem_mib + proc_uuid=$(echo "$proc_line" | cut -d',' -f1 | tr -d '[:space:]') + proc_pid=$(echo "$proc_line" | cut -d',' -f2 | tr -d '[:space:]') + proc_name=$(echo "$proc_line" | cut -d',' -f3 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + proc_mem_mib=$(echo "$proc_line" | rev | cut -d',' -f1 | rev | tr -d '[:space:]') + + # Resolve UUID to GPU index + local 
proc_gpu_idx="${uuid_to_index[$proc_uuid]:-0}" + + # Handle [N/A] memory + if [ "$proc_mem_mib" = "[N/A]" ]; then + proc_mem_mib=0 + fi + + # Convert MiB to bytes + local proc_mem_bytes + proc_mem_bytes=$(awk "BEGIN { printf \"%.0f\", $proc_mem_mib * 1048576 }") + + # Extract short process name from full path + local short_name + short_name=$(basename "$proc_name" 2>/dev/null || echo "$proc_name") + + echo "gpu_process_memory_bytes{gpu=\"$proc_gpu_idx\",pid=\"$proc_pid\",process_name=\"$(prom_escape "$short_name")\"} $proc_mem_bytes" + done <<< "$process_lines" + fi + + echo "" + + # ======================================================================== + # EXPORTER RUNTIME + # ======================================================================== + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +GPU Exporter v1.0 + +

GPU Prometheus Exporter v1.0

+

Metrics

+

NVIDIA GPU metrics via nvidia-smi.

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.gpu_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename — no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/grafana-backup.sh b/grafana-backup.sh new file mode 100644 index 0000000..f2ebdc7 --- /dev/null +++ b/grafana-backup.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +################################################ +#### Grafana Backup & Restore Script #### +#### Backup dashboards, datasources, alert #### +#### rules, and folders via the HTTP API #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### 
####  Version: 1.0.0.20260309              ####
################################################

set -o pipefail

SCRIPT_NAME=$(basename "$0")
readonly SCRIPT_NAME

# Default configuration
readonly DEFAULT_BACKUP_DIR="/var/backups/grafana"
readonly DEFAULT_RETENTION_COUNT=7
readonly DEFAULT_CURL_TIMEOUT=30

# Configuration variables (can be overridden by environment)
GRAFANA_URL=${GRAFANA_URL:-}
GRAFANA_TOKEN=${GRAFANA_TOKEN:-}
BACKUP_DIR=${BACKUP_DIR:-$DEFAULT_BACKUP_DIR}
RETENTION_COUNT=${RETENTION_COUNT:-$DEFAULT_RETENTION_COUNT}

# Runtime
RUN_MODE="backup"
RESTORE_DIR=""

#######################################
# ERR trap handler: report where the script failed and exit with that status.
# Arguments: $1 - exit code of the failed command
#            $2 - line number on which it failed
#######################################
handle_error() {
    local exit_code=$1
    local line_number=$2
    echo "Error: $SCRIPT_NAME failed at line $line_number with exit code $exit_code" >&2
    exit "$exit_code"
}

# No `set -E`, so this trap only fires for top-level commands (e.g. `main`),
# not for commands that fail inside functions.
trap 'handle_error $? $LINENO' ERR

#######################################
# Print usage and exit.
# Arguments: $1 - exit status (optional, default 0)
#######################################
show_help() {
    local status="${1:-0}"
    cat << EOF
Usage: $SCRIPT_NAME [OPTIONS]

Backup Grafana dashboards, datasources, alert rules, and folders via the
HTTP API, and restore dashboards and datasources from a previous backup.
Creates timestamped backup directories with automatic retention.

OPTIONS:
    --backup          Run a full backup (default)
    --restore DIR     Restore dashboards and datasources from the backup directory DIR
    --list            List available backups
    --help, -h        Show this help message

ENVIRONMENT VARIABLES:
    GRAFANA_URL       Grafana base URL (required for --backup/--restore, e.g. http://localhost:3000)
    GRAFANA_TOKEN     Grafana API token with Admin permissions (required for --backup/--restore)
    BACKUP_DIR        Root backup directory (default: $DEFAULT_BACKUP_DIR)
    RETENTION_COUNT   Number of backups to retain (default: $DEFAULT_RETENTION_COUNT)

EXAMPLES:
    GRAFANA_URL=http://localhost:3000 GRAFANA_TOKEN=glsa_xxxx $SCRIPT_NAME --backup
    GRAFANA_URL=http://localhost:3000 GRAFANA_TOKEN=glsa_xxxx $SCRIPT_NAME --restore /var/backups/grafana/20260309-143022
    $SCRIPT_NAME --list

EOF
    exit "$status"
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --backup)
            RUN_MODE="backup"
            shift
            ;;
        --restore)
            # Check for the value first: a bare `shift 2` with one argument
            # left fails and would trip the ERR trap with a useless message.
            if [[ $# -lt 2 || -z "$2" ]]; then
                echo "Error: --restore requires a directory argument" >&2
                exit 1
            fi
            RUN_MODE="restore"
            RESTORE_DIR="$2"
            shift 2
            ;;
        --list)
            RUN_MODE="list"
            shift
            ;;
        --help|-h)
            show_help
            ;;
        *)
            # A usage error must not exit 0.
            echo "Unknown option: $1" >&2
            show_help 2 >&2
            ;;
    esac
done

#######################################
# Verify the settings and tools needed to talk to Grafana.
# Globals: GRAFANA_URL (read; trailing slash stripped), GRAFANA_TOKEN (read)
# Exits 1 when a setting or dependency is missing.
#######################################
validate_config() {
    if [[ -z "$GRAFANA_URL" ]]; then
        echo "Error: GRAFANA_URL is required" >&2
        exit 1
    fi

    if [[ -z "$GRAFANA_TOKEN" ]]; then
        echo "Error: GRAFANA_TOKEN is required" >&2
        exit 1
    fi

    # Strip trailing slash
    GRAFANA_URL="${GRAFANA_URL%/}"

    # Check dependencies
    local cmd
    for cmd in curl jq; do
        if ! command -v "$cmd" &>/dev/null; then
            echo "Error: $cmd is required but not installed" >&2
            exit 1
        fi
    done
}

#######################################
# Issue one authenticated request against the Grafana HTTP API.
# Arguments: $1 - HTTP method
#            $2 - endpoint path, appended to GRAFANA_URL
#            $3 - request body (optional)
# Outputs:   response body on stdout
# Returns:   curl's exit status (non-zero on HTTP errors because of -f)
#######################################
grafana_api() {
    local method="$1"
    local endpoint="$2"
    local data="${3:-}"
    local url="${GRAFANA_URL}${endpoint}"

    local args=(
        -sf
        --max-time "$DEFAULT_CURL_TIMEOUT"
        -H "Authorization: Bearer ${GRAFANA_TOKEN}"
        -H "Content-Type: application/json"
        -H "Accept: application/json"
        -X "$method"
    )

    if [[ -n "$data" ]]; then
        args+=(-d "$data")
    fi

    curl "${args[@]}" "$url"
}

#######################################
# Export every dashboard returned by the search API to its own JSON file.
# Arguments: $1 - backup root directory
# Returns:   0 when every dashboard was exported, 1 otherwise
#######################################
backup_dashboards() {
    local dest="$1/dashboards"
    mkdir -p "$dest" || return 1

    echo "Backing up dashboards..."

    local search_result
    search_result=$(grafana_api GET "/api/search?type=dash-db&limit=5000") || {
        echo " Error: Failed to search dashboards" >&2
        return 1
    }

    local count
    count=$(jq 'length' <<< "$search_result") || {
        echo " Error: Unexpected response from dashboard search" >&2
        return 1
    }
    echo " Found $count dashboards"

    # The loop reads from process substitution rather than a pipeline so the
    # failure counter survives (a pipeline would run the loop in a subshell).
    local uid dashboard title failed=0
    while IFS= read -r uid; do
        [[ -n "$uid" ]] || continue

        if ! dashboard=$(grafana_api GET "/api/dashboards/uid/$uid"); then
            echo " Warning: Failed to export dashboard $uid" >&2
            failed=$((failed + 1))
            continue
        fi

        title=$(jq -r '.dashboard.title // "unknown"' <<< "$dashboard" | tr '/ ' '__')
        jq '.' <<< "$dashboard" > "$dest/${uid}_${title}.json" || failed=$((failed + 1))
    done < <(jq -r '.[].uid' <<< "$search_result")

    echo " Dashboards saved to $dest"
    [[ $failed -eq 0 ]]
}

#######################################
# Shared implementation for list-style endpoints: fetch a JSON array and
# write each element to <root>/<subdir>/<id>_<name>.json.
# Arguments: $1 - backup root directory
#            $2 - sub-directory name
#            $3 - lower-case label used in messages
#            $4 - API endpoint returning a JSON array
#            $5 - jq filter yielding the element's id
#            $6 - jq filter yielding the element's display name
# Returns:   0 when every element was written, 1 otherwise
#######################################
_backup_collection() {
    local root="$1" subdir="$2" label="$3" endpoint="$4"
    local id_filter="$5" name_filter="$6"
    local dest="$root/$subdir"
    mkdir -p "$dest" || return 1

    echo "Backing up ${label}..."

    local result
    result=$(grafana_api GET "$endpoint") || {
        echo " Error: Failed to fetch ${label}" >&2
        return 1
    }

    local count
    count=$(jq 'length' <<< "$result") || {
        echo " Error: Unexpected response while fetching ${label}" >&2
        return 1
    }
    echo " Found $count ${label}"

    local item id name failed=0
    while IFS= read -r item; do
        id=$(jq -r "$id_filter" <<< "$item")
        name=$(jq -r "$name_filter" <<< "$item" | tr '/ ' '__')
        jq '.' <<< "$item" > "$dest/${id}_${name}.json" || failed=$((failed + 1))
    done < <(jq -c '.[]' <<< "$result")

    echo " ${label^} saved to $dest"
    [[ $failed -eq 0 ]]
}

# Back up all datasources. Arguments: $1 - backup root directory.
backup_datasources() {
    _backup_collection "$1" "datasources" "datasources" \
        "/api/datasources" '.id' '.name'
}

# Back up all provisioned alert rules. Arguments: $1 - backup root directory.
backup_alert_rules() {
    _backup_collection "$1" "alert_rules" "alert rules" \
        "/api/v1/provisioning/alert-rules" '.uid // .id // "unknown"' '.title // "unknown"'
}

# Back up all folders. Arguments: $1 - backup root directory.
backup_folders() {
    _backup_collection "$1" "folders" "folders" \
        "/api/folders?limit=1000" '.uid' '.title'
}

#######################################
# Re-import every dashboard JSON file found under <dir>/dashboards.
# Arguments: $1 - backup directory to restore from
# Returns:   0 (individual failures are reported as warnings)
#######################################
restore_dashboards() {
    local src="$1/dashboards"

    if [[ ! -d "$src" ]]; then
        echo " No dashboards directory found, skipping" >&2
        return 0
    fi

    echo "Restoring dashboards..."

    local count=0 file payload title
    for file in "$src"/*.json; do
        [[ -f "$file" ]] || continue

        # Clear the numeric id: it belongs to the source instance, and posting
        # an id the target does not know is rejected. The uid is kept, so
        # overwrite=true still replaces an existing copy.
        payload=$(jq '{dashboard: (.dashboard | .id = null), overwrite: true, folderId: (.meta.folderId // 0)}' "$file") || {
            echo " Warning: Failed to parse $file" >&2
            continue
        }

        if grafana_api POST "/api/dashboards/db" "$payload" >/dev/null; then
            count=$((count + 1))
        else
            title=$(basename "$file" .json)
            echo " Warning: Failed to restore dashboard $title" >&2
        fi
    done

    echo " Restored $count dashboards"
}

#######################################
# Re-create every datasource JSON file found under <dir>/datasources.
# Arguments: $1 - backup directory to restore from
# Returns:   0 (individual failures are reported as warnings)
#######################################
restore_datasources() {
    local src="$1/datasources"

    if [[ ! -d "$src" ]]; then
        echo " No datasources directory found, skipping" >&2
        return 0
    fi

    echo "Restoring datasources..."

    local count=0 file payload name
    for file in "$src"/*.json; do
        [[ -f "$file" ]] || continue

        payload=$(jq 'del(.id, .uid, .readOnly)' "$file") || {
            echo " Warning: Failed to parse $file" >&2
            continue
        }

        if grafana_api POST "/api/datasources" "$payload" >/dev/null; then
            count=$((count + 1))
        else
            name=$(basename "$file" .json)
            echo " Warning: Failed to restore datasource $name" >&2
        fi
    done

    echo " Restored $count datasources"
}

#######################################
# Delete the oldest backup directories so that at most RETENTION_COUNT remain.
# Globals: BACKUP_DIR (read), RETENTION_COUNT (read)
# Returns: 0 on success, 1 when RETENTION_COUNT is not a positive integer
#######################################
prune_backups() {
    # A value of 0 (or garbage) would otherwise delete every backup,
    # including the one that was just written.
    if [[ ! "$RETENTION_COUNT" =~ ^[1-9][0-9]*$ ]]; then
        echo " Warning: RETENTION_COUNT must be a positive integer (got '$RETENTION_COUNT'), skipping prune" >&2
        return 1
    fi

    echo "Pruning old backups (retaining $RETENTION_COUNT)..."

    # Timestamped names sort chronologically, so the oldest come first.
    local -a dirs=()
    mapfile -t dirs < <(find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d | sort)
    local backup_count=${#dirs[@]}

    if ((backup_count <= RETENTION_COUNT)); then
        echo " No pruning needed ($backup_count backups present)"
        return 0
    fi

    local remove_count=$((backup_count - RETENTION_COUNT))
    local dir
    for dir in "${dirs[@]:0:remove_count}"; do
        echo " Removing $(basename "$dir")"
        rm -rf -- "${dir:?}"
    done

    echo " Pruned $remove_count old backups"
}

#######################################
# Run a full backup into a new timestamped directory, then apply retention.
# Globals: BACKUP_DIR (read)
# Returns: 0 when every step succeeded; 1 otherwise. Pruning is skipped after
#          a failed step so a broken run never evicts older good backups.
#######################################
do_backup() {
    local timestamp dest
    timestamp=$(date +%Y%m%d-%H%M%S)
    dest="$BACKUP_DIR/$timestamp"

    mkdir -p "$dest" || {
        echo "Error: Cannot create backup directory $dest" >&2
        return 1
    }
    echo "Starting backup to $dest"

    local failures=0 step
    for step in backup_dashboards backup_datasources backup_alert_rules backup_folders; do
        "$step" "$dest" || failures=$((failures + 1))
    done

    if ((failures > 0)); then
        echo "Error: $failures backup step(s) failed; skipping pruning so older backups are kept" >&2
        echo "Backup incomplete: $dest" >&2
        return 1
    fi

    prune_backups

    echo "Backup complete: $dest"
}

#######################################
# Restore dashboards and datasources from RESTORE_DIR.
# Globals: RESTORE_DIR (read)
# Exits 1 when RESTORE_DIR is empty or not a directory.
#######################################
do_restore() {
    if [[ -z "$RESTORE_DIR" ]]; then
        echo "Error: --restore requires a directory argument" >&2
        exit 1
    fi

    if [[ ! -d "$RESTORE_DIR" ]]; then
        echo "Error: Restore directory does not exist: $RESTORE_DIR" >&2
        exit 1
    fi

    echo "Restoring from $RESTORE_DIR"

    restore_dashboards "$RESTORE_DIR"
    restore_datasources "$RESTORE_DIR"

    echo "Restore complete"
}

#######################################
# Print one summary line per backup directory found under BACKUP_DIR.
# Globals: BACKUP_DIR (read)
#######################################
do_list() {
    if [[ ! -d "$BACKUP_DIR" ]]; then
        echo "No backups found (directory does not exist: $BACKUP_DIR)"
        return 0
    fi

    local count=0 dir name
    local dashboards datasources alerts folders
    for dir in "$BACKUP_DIR"/*/; do
        [[ -d "$dir" ]] || continue
        count=$((count + 1))

        name=$(basename "$dir")
        dashboards=$(find "$dir/dashboards" -name '*.json' 2>/dev/null | wc -l)
        datasources=$(find "$dir/datasources" -name '*.json' 2>/dev/null | wc -l)
        alerts=$(find "$dir/alert_rules" -name '*.json' 2>/dev/null | wc -l)
        folders=$(find "$dir/folders" -name '*.json' 2>/dev/null | wc -l)

        printf " %s dashboards:%-4s datasources:%-4s alerts:%-4s folders:%-4s\n" \
            "$name" "$dashboards" "$datasources" "$alerts" "$folders"
    done

    if [[ $count -eq 0 ]]; then
        echo "No backups found in $BACKUP_DIR"
    else
        echo "$count backup(s) in $BACKUP_DIR"
    fi
}

#######################################
# Dispatch on RUN_MODE. Only modes that talk to Grafana validate the API
# settings; listing reads the local backup directory only.
#######################################
main() {
    case "$RUN_MODE" in
        backup)
            validate_config
            do_backup
            ;;
        restore)
            validate_config
            do_restore
            ;;
        list)
            do_list
            ;;
    esac
}

main
diff --git a/graylog-api-backup.sh b/graylog-api-backup.sh
new file mode 100755
index 0000000..0078abf
--- /dev/null
+++ b/graylog-api-backup.sh
@@ -0,0 +1,256 @@
#!/usr/bin/env bash
#
# Graylog API Backup
#
# Backs up Graylog configuration objects via the REST API.
# Exports inputs, streams, pipelines, dashboards, alerts,
# index sets, users, roles, sidecar configs, content packs,
# and lookup tables to individual JSON files in a dated directory.
#
# Usage:
#   GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-api-backup.sh
#   GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-api-backup.sh --dry-run
#   GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-api-backup.sh --install
#
# Parameters:
#   --dry-run    Show what would be backed up without writing files
#   --install    Create cron job for daily backup at 3am
#   --help       Show usage
#
# Environment:
#   GRAYLOG_URL      Graylog API base URL (required, e.g. http://localhost:9000/api)
#   GRAYLOG_TOKEN    API token (required)
#   BACKUP_DIR       Base backup directory (default: /backup/graylog-api)
#   RETENTION_DAYS   Delete backups older than this many days (default: 30)
#   CURL_TIMEOUT     API request timeout in seconds (default: 10)
#
# Author: Phil Connor
# Contact: contact@mylinux.work
# Website: https://mylinux.work
# License: MIT
# Version: 1.01

set -euo pipefail

# --- Configuration ---
# Keep in sync with the "Version:" line in the header above.
readonly VERSION="1.01"
# Assigned before being marked readonly so the assignment's status is not
# masked; parameter expansion avoids forking basename.
SCRIPT_NAME="${0##*/}"
readonly SCRIPT_NAME
GRAYLOG_URL="${GRAYLOG_URL:-}"
GRAYLOG_TOKEN="${GRAYLOG_TOKEN:-}"
BACKUP_DIR="${BACKUP_DIR:-/backup/graylog-api}"
RETENTION_DAYS="${RETENTION_DAYS:-30}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
DRY_RUN=false

# Backup endpoints: "api_path output_filename"
# Each entry is two whitespace-separated fields: the path relative to
# GRAYLOG_URL and the file name written inside the dated backup directory.
readonly ENDPOINTS=(
  "system/inputs inputs.json"
  "streams streams.json"
  "system/pipelines/pipeline pipelines.json"
  "system/pipelines/rule pipeline-rules.json"
  "system/pipelines/connections pipeline-connections.json"
  "views dashboards.json"
  "alerts/definitions alert-definitions.json"
  "alerts/notifications alert-notifications.json"
  "system/indices/index_sets index-sets.json"
  "users users.json"
  "roles roles.json"
  "sidecar/configurations sidecar-configs.json"
  "system/content_packs content-packs.json"
  "system/lookup/tables lookup-tables.json"
  "system/lookup/adapters lookup-adapters.json"
  "system/lookup/caches lookup-caches.json"
)

# --- 
# --- Functions ---

# NOTE(review): in the flattened source the heredoc body of usage() and the
# opening lines of check_dependencies() are missing (everything between
# "cat <" and "/dev/null; then" was lost). Both are restored here from the
# script's header comment and from the external commands the functions below
# call. Confirm against the upstream script.

# Print the help text and exit 0. main() relies on this function not returning.
usage() {
    cat <<EOF
Usage: ${SCRIPT_NAME:-${0##*/}} [--dry-run] [--install] [--help]

Parameters:
  --dry-run     Show what would be backed up without writing files
  --install     Create cron job for daily backup at 3am
  --help        Show usage

Environment:
  GRAYLOG_URL       Graylog API base URL (required, e.g. http://localhost:9000/api)
  GRAYLOG_TOKEN     API token (required)
  BACKUP_DIR        Base backup directory (default: /backup/graylog-api)
  RETENTION_DAYS    Delete backups older than this many days (default: 30)
  CURL_TIMEOUT      API request timeout in seconds (default: 10)
EOF
    exit 0
}

# Exit 1 with an install hint when a required external command is missing.
check_dependencies() {
    local missing=()
    local cmd
    for cmd in curl jq; do
        if ! command -v "$cmd" &>/dev/null; then
            missing+=("$cmd")
        fi
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        echo "ERROR: Missing required commands: ${missing[*]}" >&2
        echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
        exit 1
    fi
}

# Exit 1 unless GRAYLOG_URL and GRAYLOG_TOKEN are set; normalises GRAYLOG_URL
# by removing one trailing slash so paths can be appended with "/".
validate_config() {
    if [[ -z "$GRAYLOG_URL" ]]; then
        echo "ERROR: GRAYLOG_URL environment variable is required" >&2
        exit 1
    fi
    if [[ -z "$GRAYLOG_TOKEN" ]]; then
        echo "ERROR: GRAYLOG_TOKEN environment variable is required" >&2
        exit 1
    fi
    # Strip trailing slash
    GRAYLOG_URL="${GRAYLOG_URL%/}"
}

# GET ${GRAYLOG_URL}/<endpoint> and print the response body.
# Arguments: $1 - API path without a leading slash
# Outputs:   the body on success, an empty line on any curl failure
#            (HTTP errors included, because of -f). Always returns 0.
# The token is sent as the basic-auth user with the literal password "token".
api_get() {
    local endpoint="$1"
    curl -sf --max-time "$CURL_TIMEOUT" \
        -u "${GRAYLOG_TOKEN}:token" \
        -H "Accept: application/json" \
        "${GRAYLOG_URL}/${endpoint}" 2>/dev/null || echo ""
}

# Fetch one endpoint and store it pretty-printed as <output_dir>/<filename>.
# Arguments: $1 - API path, $2 - output file name, $3 - target directory
# Globals:   DRY_RUN (read)
# Outputs:   one status line on stdout
# Returns:   0 on success or dry-run; 1 when the request failed or the
#            response is not valid JSON
backup_endpoint() {
    local endpoint="$1"
    local filename="$2"
    local output_dir="$3"
    local target="${output_dir}/${filename}"
    local response staging size

    if [[ "$DRY_RUN" == true ]]; then
        printf " %-35s → %s (dry-run)\n" "$endpoint" "$filename"
        return 0
    fi

    response=$(api_get "$endpoint")

    if [[ -z "$response" ]]; then
        printf " %-35s → %-30s FAIL\n" "$endpoint" "$filename"
        return 1
    fi

    # Write to a staging file and publish it only after jq accepted the
    # payload. The caller runs this function inside an `if`, where `set -e`
    # is inactive, so an unchecked jq failure used to leave a truncated file
    # behind that was then reported as OK.
    staging="${target}.tmp.$$"
    if ! jq '.' <<<"$response" > "$staging" 2>/dev/null; then
        rm -f -- "$staging"
        printf " %-35s → %-30s FAIL\n" "$endpoint" "$filename"
        return 1
    fi
    mv -f -- "$staging" "$target"

    size=$(du -h "$target" | cut -f1)
    printf " %-35s → %-30s OK %s\n" "$endpoint" "$filename" "$size"
    return 0
}

# Delete dated backup directories under BACKUP_DIR whose mtime is older than
# RETENTION_DAYS days.
# Globals: BACKUP_DIR, RETENTION_DAYS (read)
# Only directories named with eight digits (the YYYYMMDD layout main() creates)
# are candidates, so a mis-set BACKUP_DIR cannot take unrelated directories
# with it.
cleanup_old_backups() {
    [[ -d "$BACKUP_DIR" ]] || return 0

    # find's errors are silenced below, so a bad value would otherwise just
    # disable retention without anyone noticing.
    if [[ ! "$RETENTION_DAYS" =~ ^[0-9]+$ ]]; then
        echo "WARNING: RETENTION_DAYS='${RETENTION_DAYS}' is not a whole number; retention skipped" >&2
        return 0
    fi

    local removed=0
    local dir
    while IFS= read -r -d '' dir; do
        rm -rf -- "${dir:?}"
        removed=$((removed + 1))
    done < <(find "$BACKUP_DIR" -maxdepth 1 -mindepth 1 -type d \
        -name '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]' \
        -mtime +"$RETENTION_DAYS" -print0 2>/dev/null)

    if [[ $removed -gt 0 ]]; then
        echo "Retention: removed $removed backup(s) older than ${RETENTION_DAYS} days"
    fi
    return 0
}
# Install /etc/cron.d/graylog-api-backup. Requires root; exits 1 otherwise.
# Globals: GRAYLOG_URL, GRAYLOG_TOKEN, BACKUP_DIR, RETENTION_DAYS,
#          CURL_TIMEOUT (read)
install_cron() {
    if [[ $EUID -ne 0 ]]; then
        echo "ERROR: --install requires root" >&2
        exit 1
    fi

    local script_path
    script_path=$(readlink -f "$0")

    # NOTE(review): the body of this heredoc was lost when the file was
    # flattened (only "cat > … <" and "/dev/null" survive). The entry below is
    # a reconstruction: "daily at 3am" comes from the header comment, the
    # environment is passed because the script refuses to run without
    # GRAYLOG_URL/GRAYLOG_TOKEN. Confirm against the upstream script.
    cat > /etc/cron.d/graylog-api-backup <<EOF
# Graylog API configuration backup (daily at 03:00)
0 3 * * * root GRAYLOG_URL="${GRAYLOG_URL}" GRAYLOG_TOKEN="${GRAYLOG_TOKEN}" BACKUP_DIR="${BACKUP_DIR}" RETENTION_DAYS="${RETENTION_DAYS}" CURL_TIMEOUT="${CURL_TIMEOUT}" ${script_path} >/dev/null
EOF

    # The reconstructed entry embeds the API token, so the file is kept
    # readable by root only (the original used 644).
    chmod 600 /etc/cron.d/graylog-api-backup
    echo "Installed cron job: /etc/cron.d/graylog-api-backup"
    echo "Backups will be written to: ${BACKUP_DIR}/"
}

# --- Main ---

# Entry point: parse options, back up every entry of ENDPOINTS into
# ${BACKUP_DIR}/<YYYYMMDD>, then apply retention.
# Returns: 0 when at least one endpoint was backed up (or on dry-run),
#          1 when every endpoint failed.
main() {
    # Parse arguments
    local arg
    for arg in "$@"; do
        case "$arg" in
            --dry-run) DRY_RUN=true ;;
            --install)
                check_dependencies
                validate_config
                install_cron
                exit 0
                ;;
            --help|-h) usage ;;
            *) echo "Unknown option: $arg" >&2; usage ;;
        esac
    done

    check_dependencies
    validate_config

    local today
    today=$(date +%Y%m%d)
    local output_dir="${BACKUP_DIR}/${today}"

    if [[ "$DRY_RUN" == true ]]; then
        echo "Graylog API Backup v${VERSION} (dry-run)"
    else
        mkdir -p "$output_dir"
        echo "Graylog API Backup v${VERSION}"
    fi
    echo "Target: ${output_dir}"
    echo ""

    local ok=0
    local fail=0
    local entry endpoint filename

    for entry in "${ENDPOINTS[@]}"; do
        endpoint="${entry%% *}"
        filename="${entry##* }"

        if backup_endpoint "$endpoint" "$filename" "$output_dir"; then
            ok=$((ok + 1))
        else
            fail=$((fail + 1))
        fi
    done

    echo ""

    if [[ "$DRY_RUN" == true ]]; then
        echo "Dry-run complete: ${#ENDPOINTS[@]} endpoints"
        return 0
    fi

    local total_size
    total_size=$(du -sh "$output_dir" | cut -f1)
    echo "Complete: ${ok} OK, ${fail} failed, ${total_size} total"

    # A run that saved nothing must not expire older backups: during a long
    # Graylog outage retention would otherwise delete every good copy while
    # producing only empty directories. It also has to be visible to cron and
    # monitoring, hence the non-zero status.
    if [[ $ok -eq 0 ]]; then
        rmdir -- "$output_dir" 2>/dev/null || true
        echo "ERROR: no endpoint could be backed up; existing backups were left untouched" >&2
        return 1
    fi

    cleanup_old_backups
}

main "$@"
diff --git a/graylog-exporter.sh
b/graylog-exporter.sh new file mode 100755 index 0000000..2e76cc2 --- /dev/null +++ b/graylog-exporter.sh @@ -0,0 +1,414 @@ +#!/usr/bin/env bash +# +# Graylog Prometheus Metrics Exporter +# +# Prometheus textfile collector exporter for Graylog. +# Uses the Graylog REST API to collect node status, throughput, +# journal health, buffer usage, input/stream/index counts, +# and cluster leadership state. +# +# Usage: +# GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-exporter.sh +# GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-exporter.sh --textfile +# GRAYLOG_URL="http://graylog.example.com:9000/api" GRAYLOG_TOKEN="abc123" ./graylog-exporter.sh --install +# +# Parameters: +# --textfile Write to textfile collector directory +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# GRAYLOG_URL Graylog API base URL (required, e.g. http://localhost:9000/api) +# GRAYLOG_TOKEN API token (required) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT API request timeout in seconds (default: 10) +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Metrics Exported: +# Core: +# - graylog_up +# - graylog_exporter_info{version} +# - graylog_node_is_leader +# - graylog_input_count +# +# Throughput: +# - graylog_throughput_input +# - graylog_throughput_output +# +# Journal: +# - graylog_journal_size_bytes +# - graylog_journal_uncommitted_entries +# - graylog_journal_events_read_count +# - graylog_journal_events_append_count +# +# Buffers: +# - graylog_buffer_input_usage +# - graylog_buffer_process_usage +# - graylog_buffer_output_usage +# +# Counts: +# - graylog_stream_count +# - graylog_index_count +# - graylog_sidecar_count +# - graylog_sidecar_active_count +# - graylog_content_pack_count +# +# Exporter: +# - 
#   - graylog_exporter_duration_seconds
#   - graylog_exporter_last_run_timestamp

set -euo pipefail

# --- Configuration ---

# Exposed through graylog_exporter_info{version}.
# NOTE(review): the header comment of this script still says "Version: 1.0".
readonly VERSION="1.1"

# Assigned first and frozen afterwards so that a failing basename is not
# hidden behind the exit status of `readonly`.
SCRIPT_NAME=$(basename -- "$0")
readonly SCRIPT_NAME

GRAYLOG_URL="${GRAYLOG_URL:-}"
GRAYLOG_TOKEN="${GRAYLOG_TOKEN:-}"
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
TEXTFILE_MODE=false
OUTPUT=""       # accumulated exposition text, appended to by add_metric*
START_TIME=""   # set by main() when collection starts

# --- Functions ---

# NOTE(review): in the flattened source the heredoc body of usage() and the
# opening lines of check_dependencies() are missing (everything between
# "cat <" and "/dev/null; then" was lost). Both are restored here from the
# script's header comment and from the external commands the functions below
# call. Confirm against the upstream script.

# Print the help text and exit 0. main() relies on this function not returning.
usage() {
    cat <<EOF
Usage: ${SCRIPT_NAME} [--textfile] [--install] [--help]

Parameters:
  --textfile    Write to textfile collector directory
  --install     Create cron job for automatic collection
  --help        Show usage

Environment:
  GRAYLOG_URL     Graylog API base URL (required, e.g. http://localhost:9000/api)
  GRAYLOG_TOKEN   API token (required)
  TEXTFILE_DIR    Textfile collector directory (default: /var/lib/node_exporter/textfile_collector)
  CURL_TIMEOUT    API request timeout in seconds (default: 10)
EOF
    exit 0
}

# Exit 1 with an install hint when a required external command is missing.
check_dependencies() {
    local missing=()
    local cmd
    for cmd in curl jq; do
        if ! command -v "$cmd" &>/dev/null; then
            missing+=("$cmd")
        fi
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        echo "ERROR: Missing required commands: ${missing[*]}" >&2
        echo "Install with: apt install ${missing[*]} OR dnf install ${missing[*]}" >&2
        exit 1
    fi
}

# Exit 1 unless GRAYLOG_URL and GRAYLOG_TOKEN are set; removes one trailing
# slash from GRAYLOG_URL because api_get appends paths that start with "/".
validate_config() {
    if [[ -z "$GRAYLOG_URL" ]]; then
        echo "ERROR: GRAYLOG_URL environment variable is required" >&2
        exit 1
    fi
    if [[ -z "$GRAYLOG_TOKEN" ]]; then
        echo "ERROR: GRAYLOG_TOKEN environment variable is required" >&2
        exit 1
    fi
    # Strip trailing slash
    GRAYLOG_URL="${GRAYLOG_URL%/}"
}

# GET ${GRAYLOG_URL}<endpoint> and print the response body.
# Arguments: $1 - API path including the leading slash
# Outputs:   the body on success, an empty line on any curl failure
#            (HTTP errors included, because of -f). Always returns 0.
# The token is sent as the basic-auth user with the literal password "token".
api_get() {
    local endpoint="$1"
    curl -sf --max-time "$CURL_TIMEOUT" \
        -u "${GRAYLOG_TOKEN}:token" \
        -H "Accept: application/json" \
        "${GRAYLOG_URL}${endpoint}" 2>/dev/null || echo ""
}

# Append one sample line to OUTPUT.
# Arguments: $1 - metric name, $2 - value,
#            $3 - optional label list without braces, e.g. a="b"
add_metric_value() {
    local name="$1"
    local value="$2"
    local labels="${3:-}"

    if [[ -n "$labels" ]]; then
        OUTPUT+="${name}{${labels}} ${value}"$'\n'
    else
        OUTPUT+="${name} ${value}"$'\n'
    fi
}

# Append the HELP and TYPE header plus one sample to OUTPUT.
# Arguments: $1 - metric name, $2 - type, $3 - help text, $4 - value,
#            $5 - optional label list without braces
add_metric() {
    local name="$1"
    local type="$2"
    local help="$3"
    local value="$4"
    local labels="${5:-}"

    OUTPUT+="# HELP ${name} ${help}"$'\n'
    OUTPUT+="# TYPE ${name} ${type}"$'\n'
    add_metric_value "$name" "$value" "$labels"
}

# Emit graylog_up and graylog_node_is_leader from GET /cluster.
# Returns: 0 when the API answered, 1 when it did not (graylog_up 0 is still
#          emitted) - main() uses this to skip the remaining collectors.
collect_cluster() {
    local cluster_json
    cluster_json=$(api_get "/cluster")

    if [[ -z "$cluster_json" ]]; then
        add_metric "graylog_up" "gauge" "Graylog reachability (1=up, 0=down)" "0"
        return 1
    fi

    add_metric "graylog_up" "gauge" "Graylog reachability (1=up, 0=down)" "1"

    # Leader flag of the first entry in the response; is_master is read as a
    # fallback key.
    # NOTE(review): the original comment calls the first entry the "local"
    # node; nothing here guarantees that ordering - verify on a multi-node
    # cluster.
    # A jq failure must not abort the run under `set -e`, so it degrades to
    # "not leader".
    local is_leader
    is_leader=$(jq -r '[.[]][0].is_leader // [.[]][0].is_master // false' \
        <<<"$cluster_json" 2>/dev/null) || is_leader=false

    local leader_value=0
    if [[ "$is_leader" == "true" ]]; then
        leader_value=1
    fi
    add_metric "graylog_node_is_leader" "gauge" "Whether this node is the cluster leader (1=leader, 0=follower)" "$leader_value"

    return 0
}
"gauge" "Graylog reachability (1=up, 0=down)" "1" + + # Determine leader status from the first (local) node + local is_leader + is_leader=$(echo "$cluster_json" | jq -r '[.[]][0].is_leader // [.[]][0].is_master // false' 2>/dev/null) + + if [[ "$is_leader" == "true" ]]; then + add_metric "graylog_node_is_leader" "gauge" "Whether this node is the cluster leader (1=leader, 0=follower)" "1" + else + add_metric "graylog_node_is_leader" "gauge" "Whether this node is the cluster leader (1=leader, 0=follower)" "0" + fi + + return 0 +} + +collect_throughput() { + local throughput_json + throughput_json=$(api_get "/system/throughput") + + if [[ -z "$throughput_json" ]]; then + return + fi + + local input_throughput output_throughput + input_throughput=$(echo "$throughput_json" | jq -r '.throughput.input // .throughput // 0' 2>/dev/null) + output_throughput=$(echo "$throughput_json" | jq -r '.throughput.output // 0' 2>/dev/null) + + # Fallback: some Graylog versions expose a single throughput value + if [[ "$input_throughput" == "0" || -z "$input_throughput" ]]; then + input_throughput=$(echo "$throughput_json" | jq -r '.throughput // 0' 2>/dev/null) + fi + + add_metric "graylog_throughput_input" "gauge" "Messages received per second" "${input_throughput:-0}" + add_metric "graylog_throughput_output" "gauge" "Messages written per second" "${output_throughput:-0}" +} + +collect_inputs() { + local inputs_json + inputs_json=$(api_get "/system/inputs") + + if [[ -z "$inputs_json" ]]; then + return + fi + + local input_count + input_count=$(echo "$inputs_json" | jq -r '.total // 0' 2>/dev/null) + add_metric "graylog_input_count" "gauge" "Total number of configured inputs" "${input_count:-0}" +} + +collect_journal() { + local journal_json + journal_json=$(api_get "/system/journal") + + if [[ -z "$journal_json" ]]; then + return + fi + + local journal_size uncommitted read_events append_events + journal_size=$(echo "$journal_json" | jq -r '.journal_size // 0' 2>/dev/null) + 
# Emit the three graylog_buffer_*_usage gauges from
# .buffers.<name>.utilization_percent of GET /system/buffers.
collect_buffers() {
    local buffers_json
    buffers_json=$(api_get "/system/buffers")

    if [[ -z "$buffers_json" ]]; then
        return 0
    fi

    local input_usage process_usage output_usage
    input_usage=$(jq -r '.buffers.input.utilization_percent // 0' <<<"$buffers_json" 2>/dev/null) || input_usage=0
    process_usage=$(jq -r '.buffers.process.utilization_percent // 0' <<<"$buffers_json" 2>/dev/null) || process_usage=0
    output_usage=$(jq -r '.buffers.output.utilization_percent // 0' <<<"$buffers_json" 2>/dev/null) || output_usage=0

    add_metric "graylog_buffer_input_usage" "gauge" "Input buffer utilization percentage" "${input_usage:-0}"
    add_metric "graylog_buffer_process_usage" "gauge" "Process buffer utilization percentage" "${process_usage:-0}"
    add_metric "graylog_buffer_output_usage" "gauge" "Output buffer utilization percentage" "${output_usage:-0}"
}

# Emit graylog_stream_count from the "total" member of GET /streams.
collect_streams() {
    local streams_json
    streams_json=$(api_get "/streams")

    if [[ -z "$streams_json" ]]; then
        return 0
    fi

    local stream_count
    stream_count=$(jq -r '.total // 0' <<<"$streams_json" 2>/dev/null) || stream_count=0
    add_metric "graylog_stream_count" "gauge" "Total number of streams" "${stream_count:-0}"
}

# Emit graylog_index_count from GET /system/indexer/indices.
# Two response layouts are handled:
#   - .all.indices present: its number of entries is the count (0 included)
#   - otherwise, for an object response: the number of top-level keys
# The second layout used to be tried whenever the first produced 0, so a
# cluster with no indices was reported as having as many indices as the
# response had top-level keys.
collect_indices() {
    local indices_json
    indices_json=$(api_get "/system/indexer/indices")

    if [[ -z "$indices_json" ]]; then
        return 0
    fi

    local index_count
    index_count=$(jq -r '
        if type != "object" then 0
        elif ((.all | type) == "object" and (.all | has("indices"))) then (.all.indices | length)
        else (keys | length)
        end' <<<"$indices_json" 2>/dev/null) || index_count=0

    add_metric "graylog_index_count" "gauge" "Total number of indices" "${index_count:-0}"
}
# Emit graylog_sidecar_count and graylog_sidecar_active_count from
# GET /sidecars.
collect_sidecars() {
    local sidecars_json
    sidecars_json=$(api_get "/sidecars")

    if [[ -z "$sidecars_json" ]]; then
        return 0
    fi

    # `.sidecars[]?` tolerates a response without a sidecars list; plain
    # `.sidecars[]` is a jq error on null, which under `set -e` used to abort
    # the whole exporter.
    # NOTE(review): "active" is counted as node_details.status.status == 1, as
    # in the original; confirm that 1 is the code Graylog uses for a running
    # Sidecar. The total comes from the pagination block while the active
    # count only sees the sidecars contained in this response - presumably the
    # first page; verify.
    local total_count active_count
    total_count=$(jq -r '.pagination.total // 0' <<<"$sidecars_json" 2>/dev/null) || total_count=0
    active_count=$(jq -r '[.sidecars[]? | select(.node_details.status.status == 1)] | length' \
        <<<"$sidecars_json" 2>/dev/null) || active_count=0

    add_metric "graylog_sidecar_count" "gauge" "Total number of registered Sidecars" "${total_count:-0}"
    add_metric "graylog_sidecar_active_count" "gauge" "Number of active Sidecars" "${active_count:-0}"
}

# Emit graylog_content_pack_count from the "total" member of
# GET /system/content_packs/latest.
collect_content_packs() {
    local packs_json
    packs_json=$(api_get "/system/content_packs/latest")

    if [[ -z "$packs_json" ]]; then
        return 0
    fi

    local pack_count
    pack_count=$(jq -r '.total // 0' <<<"$packs_json" 2>/dev/null) || pack_count=0
    add_metric "graylog_content_pack_count" "gauge" "Total number of content packs" "${pack_count:-0}"
}

# Write the accumulated OUTPUT either to stdout or, in textfile mode, to
# ${TEXTFILE_DIR}/graylog.prom.
# Globals: OUTPUT, TEXTFILE_MODE, TEXTFILE_DIR (read)
write_output() {
    # OUTPUT already ends with a newline, so it is written as is.
    if [[ "$TEXTFILE_MODE" != true ]]; then
        printf '%s' "$OUTPUT"
        return 0
    fi

    local output_file="${TEXTFILE_DIR}/graylog.prom"
    local temp_file="${output_file}.$$"

    mkdir -p "$TEXTFILE_DIR"
    # Written next to the target and renamed into place so that a reader never
    # sees a half-written file.
    printf '%s' "$OUTPUT" > "$temp_file"
    mv "$temp_file" "$output_file"
}

# Install /etc/cron.d/graylog-exporter. Requires root; exits 1 otherwise.
# Globals: GRAYLOG_URL, GRAYLOG_TOKEN, TEXTFILE_DIR, CURL_TIMEOUT (read)
install_cron() {
    if [[ $EUID -ne 0 ]]; then
        echo "ERROR: --install requires root" >&2
        exit 1
    fi

    local script_path
    script_path=$(readlink -f "$0")

    # NOTE(review): the body of this heredoc was lost when the file was
    # flattened (only "cat > … <" and "/dev/null" survive). The entry below is
    # a reconstruction; in particular the every-minute schedule is an
    # assumption. Confirm against the upstream script.
    cat > /etc/cron.d/graylog-exporter <<EOF
# Graylog Prometheus exporter (textfile collector)
* * * * * root GRAYLOG_URL="${GRAYLOG_URL}" GRAYLOG_TOKEN="${GRAYLOG_TOKEN}" TEXTFILE_DIR="${TEXTFILE_DIR}" CURL_TIMEOUT="${CURL_TIMEOUT}" ${script_path} --textfile >/dev/null
EOF

    # The reconstructed entry embeds the API token, so the file is kept
    # readable by root only (the original used 644).
    chmod 600 /etc/cron.d/graylog-exporter
    echo "Installed cron job: /etc/cron.d/graylog-exporter"
    echo "Metrics will be written to: ${TEXTFILE_DIR}/graylog.prom"
}
# --- Main ---

# Entry point: parse options, run every collector and write the result.
# The collectors after collect_cluster run only when the API answered.
main() {
    # Parse arguments
    local arg
    for arg in "$@"; do
        case "$arg" in
            --textfile) TEXTFILE_MODE=true ;;
            --install)
                check_dependencies
                validate_config
                install_cron
                exit 0
                ;;
            --help|-h) usage ;;
            *) echo "Unknown option: $arg" >&2; usage ;;
        esac
    done

    check_dependencies
    validate_config

    START_TIME=$(date +%s%N)

    # Exporter info
    add_metric "graylog_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

    # Collect metrics
    if collect_cluster; then
        collect_inputs
        collect_throughput
        collect_journal
        collect_buffers
        collect_streams
        collect_indices
        collect_sidecars
        collect_content_packs
    fi

    # Exporter performance
    # The elapsed time is computed with shell arithmetic instead of bc: bc is
    # not among the checked dependencies, and without it the duration was
    # silently reported as 0. Non-numeric timestamps (a date without %N
    # support) also fall back to 0.
    local end_time elapsed_ns
    local duration=0
    end_time=$(date +%s%N)
    if [[ "$START_TIME" =~ ^[0-9]+$ && "$end_time" =~ ^[0-9]+$ ]]; then
        elapsed_ns=$(( end_time - START_TIME ))
        if (( elapsed_ns >= 0 )); then
            printf -v duration '%d.%02d' \
                $(( elapsed_ns / 1000000000 )) \
                $(( (elapsed_ns % 1000000000) / 10000000 ))
        fi
    fi
    add_metric "graylog_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$duration"
    add_metric "graylog_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

    write_output
}

main "$@"
diff --git a/haproxy-metrics.sh b/haproxy-metrics.sh
new file mode 100755
index 0000000..db0c3ed
--- /dev/null
+++ b/haproxy-metrics.sh
@@ -0,0 +1,540 @@
#!/bin/bash
#############################################################
#### HAProxy Metrics Exporter for Prometheus ####
#### Extended metrics via runtime API, config parsing, ####
#### SSL cert checking, stick tables, and log analysis ####
#### ####
#### Author: Phil Connor ####
#### Contact: contact@mylinux.work ####
#### License: MIT ####
#### Version: 1.01 ####
#### ####
#### Usage: ./haproxy-metrics.sh [OPTIONS] ####
#############################################################
#
# Metrics collected (haproxy_extended_ prefix):
# - Process: status, uptime, CPU, RSS memory, workers, open FDs
# - Backend health: per-server status, weight, check
duration +# - Connections: current, rates, queue depths, session reuse +# - SSL: certificate expiry per domain +# - Stick tables: entry counts, utilization, types +# - Errors: log-parsed 4xx/5xx, connection errors, retries +# - Config: frontend/backend/server counts, ACL rules, maxconn +# - Reload: count and last timestamp +# +# Requirements: +# - Bash 4.0+ +# - socat (for HAProxy runtime API) +# - openssl (for SSL cert checks, optional) +# - nc (for HTTP server mode) +# +set -euo pipefail + +######################### +### Configuration ### +######################### + +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter}" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT="${HTTP_PORT:-9117}" +LOCK_FILE="/var/run/haproxy-metrics.lock" + +HAPROXY_CONFIG="${HAPROXY_CONFIG:-/etc/haproxy/haproxy.cfg}" +HAPROXY_SOCKET="${HAPROXY_SOCKET:-/run/haproxy/admin.sock}" +HAPROXY_LOG="${HAPROXY_LOG:-/var/log/haproxy/haproxy.log}" +CERT_DIR="${CERT_DIR:-/etc/haproxy/certs}" + +LOG_TAIL_LINES=10000 + +######################### +### Logging ### +######################### + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2; } +log_step() { echo -e "${BLUE}[STEP]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" >&2; } + +######################### +### Arguments ### +######################### + +show_help() { + cat </dev/null || true) + if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + log_error "Another instance running (PID: $pid)"; exit 1 + fi + rm -f "$LOCK_FILE" + fi + echo $$ > "$LOCK_FILE" + trap 'rm -f "$LOCK_FILE"' EXIT INT TERM +} + +######################### +### Socket Helper ### +######################### + +haproxy_cmd() { + local cmd="$1" + if [[ -S "$HAPROXY_SOCKET" ]]; then + echo "$cmd" | 
# Send one command to the HAProxy runtime socket and print the reply.
# Arguments: $1 - runtime API command, e.g. "show stat"
# Globals:   HAPROXY_SOCKET (read)
# Prints nothing when the socket does not exist or socat fails; the 5 second
# timeout keeps a hung socket from blocking the scrape. Always returns 0.
haproxy_cmd() {
    local cmd="$1"
    if [[ -S "$HAPROXY_SOCKET" ]]; then
        echo "$cmd" | timeout 5 socat - UNIX-CONNECT:"$HAPROXY_SOCKET" 2>/dev/null || true
    fi
}

#########################
### Process Metrics ###
#########################

# Print process-level metrics for the first PID that `pgrep -x haproxy`
# lists: up, version, uptime, CPU, RSS, open file descriptors, process count.
# When no haproxy process exists only haproxy_extended_up 0 is printed.
#
# Every probe is written as `value=$(pipeline) || fallback`. The previous
# `$(pipeline | wc -l || echo "0")` form printed "0" twice when the first
# stage failed under pipefail (for instance an unreadable /proc/<pid>/fd),
# which put a stray line into the exposition output.
get_process_metrics() {
    local pid uptime_s cpu mem fds workers version

    pid=$(pgrep -x haproxy 2>/dev/null | head -n 1) || true

    echo "# HELP haproxy_extended_up Whether HAProxy is running"
    echo "# TYPE haproxy_extended_up gauge"
    if [[ -z "$pid" ]]; then
        echo "haproxy_extended_up 0"
        return 0
    fi
    echo "haproxy_extended_up 1"

    # Version: first dotted number on the first line of `haproxy -v`.
    # POSIX ERE is used so that grep needs no PCRE support.
    version=$(haproxy -v 2>/dev/null | head -n 1 | grep -oE '[0-9]+\.[0-9]+[.0-9]*' | head -n 1) || true
    echo "# HELP haproxy_extended_version_info HAProxy version"
    echo "# TYPE haproxy_extended_version_info gauge"
    echo "haproxy_extended_version_info{version=\"${version:-unknown}\"} 1"

    # Uptime
    uptime_s=$(ps -o etimes= -p "$pid" 2>/dev/null | tr -d ' ') || true
    echo "# HELP haproxy_extended_uptime_seconds HAProxy process uptime"
    echo "# TYPE haproxy_extended_uptime_seconds gauge"
    echo "haproxy_extended_uptime_seconds ${uptime_s:-0}"

    # CPU
    cpu=$(ps -o %cpu= -p "$pid" 2>/dev/null | tr -d ' ') || true
    echo "# HELP haproxy_extended_cpu_percent HAProxy CPU usage"
    echo "# TYPE haproxy_extended_cpu_percent gauge"
    echo "haproxy_extended_cpu_percent ${cpu:-0}"

    # Memory: ps reports RSS in KiB. The value is validated because the
    # process may exit between pgrep and ps, leaving it empty.
    mem=$(ps -o rss= -p "$pid" 2>/dev/null | tr -d ' ') || true
    [[ "$mem" =~ ^[0-9]+$ ]] || mem=0
    mem=$(( 10#$mem * 1024 ))
    echo "# HELP haproxy_extended_memory_bytes HAProxy RSS memory"
    echo "# TYPE haproxy_extended_memory_bytes gauge"
    echo "haproxy_extended_memory_bytes $mem"

    # Open FDs
    fds=$(find "/proc/${pid}/fd" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l) || fds=0
    echo "# HELP haproxy_extended_open_fds Open file descriptors"
    echo "# TYPE haproxy_extended_open_fds gauge"
    echo "haproxy_extended_open_fds ${fds//[[:space:]]/}"

    # Worker count
    workers=$(pgrep -x haproxy 2>/dev/null | wc -l) || workers=0
    echo "# HELP haproxy_extended_worker_count HAProxy process count"
    echo "# TYPE haproxy_extended_worker_count gauge"
    echo "haproxy_extended_worker_count ${workers//[[:space:]]/}"
}
#########################
### Stats Metrics ###
#########################

# Print per-server, per-frontend and per-backend metrics from the CSV that
# `show stat` returns (the first line, the CSV header, is dropped).
# Prints nothing when the socket gives no answer.
#
# CSV columns used, counted from 0:
#   0 pxname   1 svname   2 qcur   4 scur   5 smax   17 status   18 weight
#   21 chkfail   23 lastchg   33 rate   38 check_duration (milliseconds)
get_stats_metrics() {
    local stats
    stats=$(haproxy_cmd "show stat" | tail -n +2) || true
    if [[ -z "$stats" ]]; then
        return 0
    fi

    echo "# HELP haproxy_extended_server_status Per-server health status (0=other, 1=UP, 2=MAINT, 3=DRAIN)"
    echo "# TYPE haproxy_extended_server_status gauge"
    echo "# HELP haproxy_extended_server_weight Server weight"
    echo "# TYPE haproxy_extended_server_weight gauge"
    echo "# HELP haproxy_extended_server_current_sessions Current sessions per server"
    echo "# TYPE haproxy_extended_server_current_sessions gauge"
    echo "# HELP haproxy_extended_server_max_sessions Max observed sessions per server"
    echo "# TYPE haproxy_extended_server_max_sessions gauge"
    echo "# HELP haproxy_extended_server_check_duration_seconds Health check duration"
    echo "# TYPE haproxy_extended_server_check_duration_seconds gauge"
    echo "# HELP haproxy_extended_server_check_failures_total Consecutive health check failures"
    echo "# TYPE haproxy_extended_server_check_failures_total gauge"
    echo "# HELP haproxy_extended_server_last_change_seconds Seconds since last status change"
    echo "# TYPE haproxy_extended_server_last_change_seconds gauge"

    local line pxname svname status status_val check_duration dur_s labels
    local -a col

    # Pass 1: individual servers (everything that is neither the FRONTEND nor
    # the BACKEND summary row).
    while IFS= read -r line; do
        IFS=',' read -r -a col <<<"$line"
        pxname="${col[0]:-}"
        svname="${col[1]:-}"
        if [[ -z "$pxname" || "$pxname" == "#"* ]]; then
            continue
        fi
        if [[ "$svname" == "FRONTEND" || "$svname" == "BACKEND" ]]; then
            continue
        fi

        status="${col[17]:-}"
        case "$status" in
            UP*) status_val=1 ;;
            MAINT*) status_val=2 ;;
            DRAIN*) status_val=3 ;;
            *) status_val=0 ;;
        esac

        labels="backend=\"${pxname}\",server=\"${svname}\""
        echo "haproxy_extended_server_status{${labels},state=\"${status}\"} ${status_val}"
        echo "haproxy_extended_server_weight{${labels}} ${col[18]:-0}"
        echo "haproxy_extended_server_current_sessions{${labels}} ${col[4]:-0}"
        echo "haproxy_extended_server_max_sessions{${labels}} ${col[5]:-0}"

        # Milliseconds to seconds with shell arithmetic: no bc process per
        # server, and no silent 0 when bc is not installed. 10# keeps a value
        # with leading zeros from being read as octal.
        check_duration="${col[38]:-}"
        if [[ "$check_duration" =~ ^[0-9]+$ ]]; then
            printf -v dur_s '%d.%03d' \
                $(( 10#$check_duration / 1000 )) \
                $(( 10#$check_duration % 1000 ))
            echo "haproxy_extended_server_check_duration_seconds{${labels}} ${dur_s}"
        fi

        echo "haproxy_extended_server_check_failures_total{${labels}} ${col[21]:-0}"
        echo "haproxy_extended_server_last_change_seconds{${labels}} ${col[23]:-0}"
    done <<< "$stats"

    # Frontend/backend connection metrics
    echo "# HELP haproxy_extended_frontend_current_connections Current frontend connections"
    echo "# TYPE haproxy_extended_frontend_current_connections gauge"
    echo "# HELP haproxy_extended_frontend_session_rate Current session rate per frontend"
    echo "# TYPE haproxy_extended_frontend_session_rate gauge"
    echo "# HELP haproxy_extended_backend_queue_depth Current queue depth per backend"
    echo "# TYPE haproxy_extended_backend_queue_depth gauge"

    # Pass 2: the FRONTEND and BACKEND summary rows.
    while IFS= read -r line; do
        IFS=',' read -r -a col <<<"$line"
        pxname="${col[0]:-}"
        svname="${col[1]:-}"
        if [[ -z "$pxname" || "$pxname" == "#"* ]]; then
            continue
        fi
        case "$svname" in
            FRONTEND)
                echo "haproxy_extended_frontend_current_connections{frontend=\"${pxname}\"} ${col[4]:-0}"
                echo "haproxy_extended_frontend_session_rate{frontend=\"${pxname}\"} ${col[33]:-0}"
                ;;
            BACKEND)
                echo "haproxy_extended_backend_queue_depth{backend=\"${pxname}\"} ${col[2]:-0}"
                ;;
        esac
    done <<< "$stats"
}
#########################
### SSL Certificates ###
#########################

# Print expiry metrics (seconds and whole days from now) for every *.pem and
# *.crt file below CERT_DIR. Prints nothing when the directory is missing,
# holds no such file, or openssl is not installed.
# Files openssl cannot read as a certificate are skipped.
get_ssl_metrics() {
    [[ -d "$CERT_DIR" ]] || return 0

    local certs
    certs=$(find "$CERT_DIR" \( -name "*.pem" -o -name "*.crt" \) 2>/dev/null) || true
    [[ -n "$certs" ]] || return 0

    command -v openssl >/dev/null 2>&1 || return 0

    echo "# HELP haproxy_extended_ssl_cert_expiry_seconds SSL certificate expiry in seconds from now"
    echo "# TYPE haproxy_extended_ssl_cert_expiry_seconds gauge"
    echo "# HELP haproxy_extended_ssl_cert_expiry_days SSL certificate expiry in days"
    echo "# TYPE haproxy_extended_ssl_cert_expiry_days gauge"

    local cert_file file_name cn end_date expiry_epoch now_epoch remaining_s remaining_d
    now_epoch=$(date +%s)

    while IFS= read -r cert_file; do
        [[ -f "$cert_file" ]] || continue

        # The end date is checked before it is handed to date: `date -d ""`
        # succeeds and yields today's midnight, so a file without a readable
        # certificate (a bare key, for example) used to be reported as a
        # certificate expiring today.
        end_date=$(openssl x509 -in "$cert_file" -noout -enddate 2>/dev/null | cut -d= -f2) || end_date=""
        [[ -n "$end_date" ]] || continue
        expiry_epoch=$(date -d "$end_date" +%s 2>/dev/null) || continue

        file_name=$(basename -- "$cert_file")

        # Common name: the text after the last "CN =" up to the next "," or
        # "/". Prints nothing for a subject without a CN, in which case the
        # file name without its extension is used instead.
        cn=$(openssl x509 -in "$cert_file" -noout -subject 2>/dev/null \
            | sed -n 's/.*CN[[:space:]]*=[[:space:]]*\([^,/]*\).*/\1/p') || cn=""
        if [[ -z "$cn" ]]; then
            case "$file_name" in
                *.pem) cn="${file_name%.pem}" ;;
                *.crt) cn="${file_name%.crt}" ;;
                *) cn="$file_name" ;;
            esac
        fi

        remaining_s=$((expiry_epoch - now_epoch))
        remaining_d=$((remaining_s / 86400))

        echo "haproxy_extended_ssl_cert_expiry_seconds{domain=\"${cn}\",file=\"${file_name}\"} ${remaining_s}"
        echo "haproxy_extended_ssl_cert_expiry_days{domain=\"${cn}\",file=\"${file_name}\"} ${remaining_d}"
    done <<< "$certs"
}

#########################
### Stick Tables ###
#########################

# Print entries, configured size and used ratio for every table that
# `show table` lists. Only header lines of the form
#   # table: NAME, type: TYPE, size:N, used:N
# are evaluated; everything else in the reply is ignored.
get_stick_table_metrics() {
    local tables
    tables=$(haproxy_cmd "show table")
    [[ -n "$tables" ]] || return 0

    echo "# HELP haproxy_extended_stick_table_entries Current entries in stick table"
    echo "# TYPE haproxy_extended_stick_table_entries gauge"
    echo "# HELP haproxy_extended_stick_table_size Configured max size of stick table"
    echo "# TYPE haproxy_extended_stick_table_size gauge"
    echo "# HELP haproxy_extended_stick_table_used_ratio Utilization ratio (0.0-1.0)"
    echo "# TYPE haproxy_extended_stick_table_used_ratio gauge"

    local header_re='^# table: ([^,]+), type: ([^,]+), size:([0-9]+), used:([0-9]+)'
    local line tname ttype tsize tused ratio scaled labels

    while IFS= read -r line; do
        [[ "$line" =~ $header_re ]] || continue
        tname="${BASH_REMATCH[1]}"
        ttype="${BASH_REMATCH[2]}"
        tsize="${BASH_REMATCH[3]}"
        tused="${BASH_REMATCH[4]}"

        # used/size with four decimals, computed in the shell (no bc). 10#
        # keeps numbers with leading zeros from being read as octal.
        ratio="0"
        if (( 10#$tsize > 0 )); then
            scaled=$(( 10#$tused * 10000 / 10#$tsize ))
            printf -v ratio '%d.%04d' $(( scaled / 10000 )) $(( scaled % 10000 ))
        fi

        labels="table=\"${tname}\",type=\"${ttype}\""
        echo "haproxy_extended_stick_table_entries{${labels}} ${tused}"
        echo "haproxy_extended_stick_table_size{${labels}} ${tsize}"
        echo "haproxy_extended_stick_table_used_ratio{${labels}} ${ratio}"
    done <<< "$tables"
}
#########################
### Config Metrics ###
#########################

# Print counts taken from HAPROXY_CONFIG: frontend and backend sections,
# indented server and acl lines, and the first maxconn value in the file.
# Prints nothing when the file does not exist.
# Every value falls back to 0, so a file that exists but cannot be read no
# longer produces sample lines without a value.
get_config_metrics() {
    [[ -f "$HAPROXY_CONFIG" ]] || return 0

    local frontends backends servers acls maxconn

    # grep -c prints 0 and exits 1 when nothing matches; the `|| true` only
    # discards that status, the printed count is kept.
    frontends=$(grep -c '^frontend ' -- "$HAPROXY_CONFIG" 2>/dev/null) || true
    backends=$(grep -c '^backend ' -- "$HAPROXY_CONFIG" 2>/dev/null) || true
    servers=$(grep -cE '^[[:space:]]+server[[:space:]]' -- "$HAPROXY_CONFIG" 2>/dev/null) || true
    acls=$(grep -cE '^[[:space:]]+acl[[:space:]]' -- "$HAPROXY_CONFIG" 2>/dev/null) || true
    maxconn=$(awk '/^[[:space:]]*maxconn[[:space:]]/ { print $2; exit }' "$HAPROXY_CONFIG" 2>/dev/null) || true

    echo "# HELP haproxy_extended_config_frontends Number of configured frontends"
    echo "# TYPE haproxy_extended_config_frontends gauge"
    echo "haproxy_extended_config_frontends ${frontends:-0}"

    echo "# HELP haproxy_extended_config_backends Number of configured backends"
    echo "# TYPE haproxy_extended_config_backends gauge"
    echo "haproxy_extended_config_backends ${backends:-0}"

    echo "# HELP haproxy_extended_config_servers Number of configured servers"
    echo "# TYPE haproxy_extended_config_servers gauge"
    echo "haproxy_extended_config_servers ${servers:-0}"

    echo "# HELP haproxy_extended_config_acl_rules Number of ACL rules"
    echo "# TYPE haproxy_extended_config_acl_rules gauge"
    echo "haproxy_extended_config_acl_rules ${acls:-0}"

    echo "# HELP haproxy_extended_config_maxconn Configured maxconn"
    echo "# TYPE haproxy_extended_config_maxconn gauge"
    echo "haproxy_extended_config_maxconn ${maxconn:-0}"
}
#########################
### Log Metrics ###
#########################

# Print counters derived from the last LOG_TAIL_LINES lines of HAPROXY_LOG,
# plus the size of the log file. Prints nothing when the file is missing or
# empty. The values describe the current tail of the log, which is why they
# are exported as gauges.
#
# The patterns are anchored to whole, space-delimited log fields. The earlier
# substring matches ('PR', 'SC', '+', ...) also fired on URLs, host names and
# any other text containing those characters.
# NOTE(review): the field layout assumed here (termination state directly
# followed by the actconn/feconn/beconn/srv_conn/retries counters) is that of
# HAProxy's stock httplog/tcplog formats; a custom log-format needs its own
# patterns.
get_log_metrics() {
    [[ -f "$HAPROXY_LOG" ]] || return 0

    local lines
    lines=$(tail -n "$LOG_TAIL_LINES" -- "$HAPROXY_LOG" 2>/dev/null) || true
    [[ -n "$lines" ]] || return 0

    # Termination state: two letters, optionally followed by the two cookie
    # flags of the HTTP format.
    local conn_err_re='[[:space:]](CD|SC|SD|PC|PH|PR)([-A-Z]{2})?[[:space:]]'
    local denied_re='[[:space:]](PR--|PD([-A-Z]{2})?)[[:space:]]'
    # Fifth counter after the termination state is non-zero; a leading "+"
    # on that counter is accepted.
    local retries_re='[[:space:]][-A-Za-z]{2,4}[[:space:]]+([0-9]+/){4}\+?[1-9][0-9]*'

    # grep -c prints 0 and exits 1 when nothing matches; `|| true` only
    # discards that status.
    local http_4xx http_5xx conn_err retries denied
    http_4xx=$(grep -cE ' 4[0-9]{2} ' <<<"$lines") || true
    http_5xx=$(grep -cE ' 5[0-9]{2} ' <<<"$lines") || true
    conn_err=$(grep -cE -- "$conn_err_re" <<<"$lines") || true
    retries=$(grep -cE -- "$retries_re" <<<"$lines") || true
    denied=$(grep -cE -- "$denied_re" <<<"$lines") || true

    echo "# HELP haproxy_extended_log_http_4xx_total 4xx responses in recent log"
    echo "# TYPE haproxy_extended_log_http_4xx_total gauge"
    echo "haproxy_extended_log_http_4xx_total ${http_4xx:-0}"

    echo "# HELP haproxy_extended_log_http_5xx_total 5xx responses in recent log"
    echo "# TYPE haproxy_extended_log_http_5xx_total gauge"
    echo "haproxy_extended_log_http_5xx_total ${http_5xx:-0}"

    echo "# HELP haproxy_extended_log_connection_errors Connection errors in recent log"
    echo "# TYPE haproxy_extended_log_connection_errors gauge"
    echo "haproxy_extended_log_connection_errors ${conn_err:-0}"

    echo "# HELP haproxy_extended_log_retries_total Retries in recent log"
    echo "# TYPE haproxy_extended_log_retries_total gauge"
    echo "haproxy_extended_log_retries_total ${retries:-0}"

    echo "# HELP haproxy_extended_log_denied_total Denied requests in recent log"
    echo "# TYPE haproxy_extended_log_denied_total gauge"
    echo "haproxy_extended_log_denied_total ${denied:-0}"

    # Log file size
    local log_size
    log_size=$(stat -c '%s' "$HAPROXY_LOG" 2>/dev/null) || log_size=0
    echo "# HELP haproxy_extended_log_size_bytes HAProxy log file size"
    echo "# TYPE haproxy_extended_log_size_bytes gauge"
    echo "haproxy_extended_log_size_bytes ${log_size:-0}"
}
#########################
### Reload Metrics ###
#########################

# Print the value of one "Key: value" line of `show info` output.
# Arguments: $1 - the `show info` text, $2 - key name (compared
#            case-insensitively and as a whole key)
# Outputs:   the value with all whitespace removed; nothing when the key is
#            absent. Only the first matching line is used.
_haproxy_info_value() {
    awk -F':[[:space:]]*' -v key="$2" '
        tolower($1) == tolower(key) {
            value = $2
            gsub(/[[:space:]]/, "", value)
            print value
            exit
        }' <<<"$1"
}

# Print reload count, current connections and maxconn taken from the runtime
# API's `show info`. Prints nothing when the socket gives no answer; a key
# that is missing from the reply is reported as 0.
#
# Keys are matched whole and without regard to case. The earlier
# `grep '^MaxConn:'` was case-sensitive and so could miss the key when the
# reply spells it differently, and the unanchored TotalReloads match glued
# the values of all matching lines together.
# NOTE(review): the belief that `show info` spells the key "Maxconn" is from
# memory of HAProxy's output, and whether it carries a TotalReloads key at all
# depends on the HAProxy version - verify both against the management guide.
get_reload_metrics() {
    local info
    info=$(haproxy_cmd "show info")
    [[ -n "$info" ]] || return 0

    local reloads cur_conns max_conns
    reloads=$(_haproxy_info_value "$info" "TotalReloads") || true
    cur_conns=$(_haproxy_info_value "$info" "CurrConns") || true
    max_conns=$(_haproxy_info_value "$info" "MaxConn") || true

    echo "# HELP haproxy_extended_reload_count Total HAProxy reloads"
    echo "# TYPE haproxy_extended_reload_count gauge"
    echo "haproxy_extended_reload_count ${reloads:-0}"

    echo "# HELP haproxy_extended_current_connections Total current connections from info"
    echo "# TYPE haproxy_extended_current_connections gauge"
    echo "haproxy_extended_current_connections ${cur_conns:-0}"

    echo "# HELP haproxy_extended_max_connections Configured MaxConn from info"
    echo "# TYPE haproxy_extended_max_connections gauge"
    echo "haproxy_extended_max_connections ${max_conns:-0}"
}
#########################
### Collector ###
#########################

# Print the seconds between two `date +%s%N` readings with millisecond
# precision.
# Arguments: $1 - start, $2 - end (nanoseconds since the epoch)
# Outputs:   "S.mmm"; "0" when an argument is not a number (a date without %N
#            support) or the end lies before the start.
# Shell arithmetic replaces bc, which this script does not list as a
# requirement and whose absence silently turned every duration into 0.
_haproxy_elapsed_seconds() {
    local start_ns="$1"
    local end_ns="$2"
    local delta

    if [[ "$start_ns" =~ ^[0-9]+$ && "$end_ns" =~ ^[0-9]+$ ]] && (( end_ns >= start_ns )); then
        delta=$(( end_ns - start_ns ))
        printf '%d.%03d\n' $(( delta / 1000000000 )) $(( (delta % 1000000000) / 1000000 ))
    else
        echo "0"
    fi
}

# Print scrape duration and timestamp on their own.
# NOTE(review): the two clock readings are taken back to back around a pair of
# echo calls, so the duration printed here is practically 0, and
# generate_metrics prints the same two metrics itself. Nothing in the visible
# part of the file calls this function - candidate for removal.
get_scrape_metrics() {
    local start_time end_time
    start_time=$(date +%s%N)

    echo "# HELP haproxy_extended_scrape_duration_seconds Time to collect all metrics"
    echo "# TYPE haproxy_extended_scrape_duration_seconds gauge"

    end_time=$(date +%s%N)
    echo "haproxy_extended_scrape_duration_seconds $(_haproxy_elapsed_seconds "$start_time" "$end_time")"

    echo "# HELP haproxy_extended_scrape_timestamp_seconds Unix timestamp of last scrape"
    echo "# TYPE haproxy_extended_scrape_timestamp_seconds gauge"
    echo "haproxy_extended_scrape_timestamp_seconds $(date +%s)"
}

# Run every collector in order and print the complete exposition text,
# followed by the time the collection took and the current timestamp.
generate_metrics() {
    local start_ns end_ns
    start_ns=$(date +%s%N)

    get_process_metrics
    get_stats_metrics
    get_ssl_metrics
    get_stick_table_metrics
    get_config_metrics
    get_log_metrics
    get_reload_metrics

    end_ns=$(date +%s%N)

    echo "# HELP haproxy_extended_scrape_duration_seconds Time to collect all metrics"
    echo "# TYPE haproxy_extended_scrape_duration_seconds gauge"
    echo "haproxy_extended_scrape_duration_seconds $(_haproxy_elapsed_seconds "$start_ns" "$end_ns")"

    echo "# HELP haproxy_extended_scrape_timestamp_seconds Unix timestamp of last scrape"
    echo "# TYPE haproxy_extended_scrape_timestamp_seconds gauge"
    echo "haproxy_extended_scrape_timestamp_seconds $(date +%s)"
}
"0") + + echo "# HELP haproxy_extended_scrape_duration_seconds Time to collect all metrics" + echo "# TYPE haproxy_extended_scrape_duration_seconds gauge" + echo "haproxy_extended_scrape_duration_seconds $duration" + + echo "# HELP haproxy_extended_scrape_timestamp_seconds Unix timestamp of last scrape" + echo "# TYPE haproxy_extended_scrape_timestamp_seconds gauge" + echo "haproxy_extended_scrape_timestamp_seconds $(date +%s)" +} + +######################### +### Output ### +######################### + +write_textfile() { + local metrics="$1" + local outfile="$2" + + local outdir + outdir=$(dirname "$outfile") + mkdir -p "$outdir" + + local tmpfile + tmpfile=$(mktemp "${outfile}.XXXXXX") + echo "$metrics" > "$tmpfile" + mv "$tmpfile" "$outfile" + log_info "Metrics written to $outfile" +} + +run_http_server() { + log_info "Starting HTTP server on port $HTTP_PORT" + if ! command -v nc >/dev/null 2>&1 && ! command -v ncat >/dev/null 2>&1; then + log_error "nc (netcat) is required for HTTP mode"; exit 1 + fi + + local nc_cmd="nc" + command -v ncat >/dev/null 2>&1 && nc_cmd="ncat" + + while true; do + local metrics + metrics=$(generate_metrics) + local content_length=${#metrics} + local response="HTTP/1.1 200 OK\r\nContent-Type: text/plain; charset=utf-8\r\nContent-Length: ${content_length}\r\nConnection: close\r\n\r\n${metrics}" + + echo -e "$response" | $nc_cmd -l -p "$HTTP_PORT" -q 1 2>/dev/null || \ + echo -e "$response" | $nc_cmd -l "$HTTP_PORT" 2>/dev/null || true + done +} + +######################### +### Main ### +######################### + +main() { + parse_args "$@" + acquire_lock + + if [[ "$HTTP_MODE" == true ]]; then + run_http_server + elif [[ -n "$OUTPUT_FILE" ]]; then + local metrics + metrics=$(generate_metrics) + write_textfile "$metrics" "$OUTPUT_FILE" + else + generate_metrics + fi +} + +main "$@" diff --git a/headscale-metrics-exporter.sh b/headscale-metrics-exporter.sh new file mode 100755 index 0000000..ada9c70 --- /dev/null +++ 
b/headscale-metrics-exporter.sh @@ -0,0 +1,370 @@ +#!/bin/bash +################################################################################ +# Script Name: headscale-metrics-exporter.sh +# Version: 1.0 +# Description: Prometheus textfile collector exporter for Headscale — node +# status, user counts, route health, pre-auth key inventory, +# and key expiry tracking +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Headscale CLI installed and headscale service running +# - jq for JSON parsing +# - Root/sudo access (headscale CLI requires it) +# - netcat (nc) for HTTP mode +# +# Usage: +# sudo ./headscale-metrics-exporter.sh +# sudo ./headscale-metrics-exporter.sh --http -p 9588 +# sudo ./headscale-metrics-exporter.sh --textfile +# +# Metrics exported: +# headscale_nodes_connected - Count of online nodes +# headscale_nodes_registered - Total registered nodes +# headscale_nodes_online - Per-node online status (1/0) +# headscale_users_total - Total users +# headscale_preauth_keys_total - Pre-auth keys by state +# headscale_routes_total - Routes by status +# headscale_routes_exit_nodes - Exit nodes by status +# headscale_node_key_expiry_seconds - Unix timestamp of key expiry per node +# headscale_exporter_duration_seconds - Script execution time +# headscale_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9588 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +# ============================================================================== +# CONFIGURATION VARIABLES +# ============================================================================== + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9588 + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# 
# Escape a string for use as a Prometheus label value.
#
# Arguments: $1 - raw label value
# Outputs:   the value on stdout with backslash, double quote and newline
#            replaced by \\, \" and \n respectively.
#
# printf is used instead of echo so that values which look like echo options
# (for example a node literally named "-n" or "-e") are printed verbatim
# rather than being swallowed.
prom_escape() {
    local val="$1"
    # Backslashes first, so the escapes added below are not escaped again.
    val="${val//\\/\\\\}"
    val="${val//\"/\\\"}"
    val="${val//$'\n'/\\n}"
    printf '%s\n' "$val"
}
check_headscale; then + echo "# HELP headscale_nodes_registered Total number of registered nodes" + echo "# TYPE headscale_nodes_registered gauge" + echo "headscale_nodes_registered 0" + return + fi + + # ------------------------------------------------------------------ + # Collect data from headscale CLI + # ------------------------------------------------------------------ + local nodes_json users_json routes_json + nodes_json=$(headscale nodes list -o json 2>/dev/null) + users_json=$(headscale users list -o json 2>/dev/null) + routes_json=$(headscale routes list -o json 2>/dev/null) + + if [ -z "$nodes_json" ] || [ "$nodes_json" = "null" ]; then + nodes_json="[]" + fi + if [ -z "$users_json" ] || [ "$users_json" = "null" ]; then + users_json="[]" + fi + if [ -z "$routes_json" ] || [ "$routes_json" = "null" ]; then + routes_json="[]" + fi + + # ------------------------------------------------------------------ + # Node metrics + # ------------------------------------------------------------------ + local total_nodes online_nodes + total_nodes=$(echo "$nodes_json" | jq 'length') + online_nodes=$(echo "$nodes_json" | jq '[.[] | select(.online == true)] | length') + + echo "# HELP headscale_nodes_connected Number of currently online nodes" + echo "# TYPE headscale_nodes_connected gauge" + echo "headscale_nodes_connected ${online_nodes:-0}" + echo "" + + echo "# HELP headscale_nodes_registered Total number of registered nodes" + echo "# TYPE headscale_nodes_registered gauge" + echo "headscale_nodes_registered ${total_nodes:-0}" + echo "" + + echo "# HELP headscale_nodes_online Per-node online status (1=online, 0=offline)" + echo "# TYPE headscale_nodes_online gauge" + + local node_data + node_data=$(echo "$nodes_json" | jq -r ' + .[] | + [ + .givenName // .name // "", + .user.name // "", + (.ipAddresses[0]? 
// ""), + (.online // false | tostring), + .expiry // "" + ] | @tsv + ') + + declare -a node_names=() + declare -a node_users=() + declare -a node_expiries=() + + while IFS=$'\t' read -r name user ip online expiry; do + [ -z "$name" ] && continue + local escaped_name escaped_user escaped_ip + escaped_name=$(prom_escape "$name") + escaped_user=$(prom_escape "$user") + escaped_ip=$(prom_escape "$ip") + + local online_val=0 + if [ "$online" = "true" ]; then + online_val=1 + fi + echo "headscale_nodes_online{node=\"$escaped_name\",user=\"$escaped_user\",ip=\"$escaped_ip\"} $online_val" + + node_names+=("$escaped_name") + node_users+=("$escaped_user") + node_expiries+=("$expiry") + done <<< "$node_data" + echo "" + + # ------------------------------------------------------------------ + # Key expiry metrics + # ------------------------------------------------------------------ + echo "# HELP headscale_node_key_expiry_seconds Unix timestamp of node key expiry" + echo "# TYPE headscale_node_key_expiry_seconds gauge" + + for i in "${!node_names[@]}"; do + local expiry_unix + expiry_unix=$(iso_to_unix "${node_expiries[$i]}") + echo "headscale_node_key_expiry_seconds{node=\"${node_names[$i]}\",user=\"${node_users[$i]}\"} $expiry_unix" + done + echo "" + + # ------------------------------------------------------------------ + # User metrics + # ------------------------------------------------------------------ + local total_users + total_users=$(echo "$users_json" | jq 'length') + + echo "# HELP headscale_users_total Total number of users" + echo "# TYPE headscale_users_total gauge" + echo "headscale_users_total ${total_users:-0}" + echo "" + + # ------------------------------------------------------------------ + # Pre-auth key metrics + # ------------------------------------------------------------------ + local usable_keys=0 + local expired_keys=0 + local used_keys=0 + + local user_list + user_list=$(echo "$users_json" | jq -r '.[].name // empty') + + while IFS= read -r 
username; do + [ -z "$username" ] && continue + local keys_json + keys_json=$(headscale preauthkeys list -u "$username" -o json 2>/dev/null) + if [ -z "$keys_json" ] || [ "$keys_json" = "null" ]; then + continue + fi + + local u e s + u=$(echo "$keys_json" | jq '[.[] | select(.used == false and .expiration > now)] | length' 2>/dev/null) + e=$(echo "$keys_json" | jq '[.[] | select(.used == false and .expiration <= now)] | length' 2>/dev/null) + s=$(echo "$keys_json" | jq '[.[] | select(.used == true)] | length' 2>/dev/null) + + usable_keys=$((usable_keys + ${u:-0})) + expired_keys=$((expired_keys + ${e:-0})) + used_keys=$((used_keys + ${s:-0})) + done <<< "$user_list" + + echo "# HELP headscale_preauth_keys_total Pre-auth keys by state" + echo "# TYPE headscale_preauth_keys_total gauge" + echo "headscale_preauth_keys_total{state=\"usable\"} $usable_keys" + echo "headscale_preauth_keys_total{state=\"expired\"} $expired_keys" + echo "headscale_preauth_keys_total{state=\"used\"} $used_keys" + echo "" + + # ------------------------------------------------------------------ + # Route metrics + # ------------------------------------------------------------------ + local approved_routes pending_routes disabled_routes + local approved_exit pending_exit disabled_exit + + approved_routes=$(echo "$routes_json" | jq '[.[] | select(.advertised == true and .enabled == true and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$") | not))] | length') + pending_routes=$(echo "$routes_json" | jq '[.[] | select(.advertised == true and .enabled == false and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$") | not))] | length') + disabled_routes=$(echo "$routes_json" | jq '[.[] | select(.advertised == false and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$") | not))] | length') + + approved_exit=$(echo "$routes_json" | jq '[.[] | select(.advertised == true and .enabled == true and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$")))] | length') + pending_exit=$(echo "$routes_json" | jq '[.[] | select(.advertised 
== true and .enabled == false and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$")))] | length') + disabled_exit=$(echo "$routes_json" | jq '[.[] | select(.advertised == false and (.prefix | test("^0\\.0\\.0\\.0/0$|^::/0$")))] | length') + + echo "# HELP headscale_routes_total Routes by status" + echo "# TYPE headscale_routes_total gauge" + echo "headscale_routes_total{status=\"approved\"} ${approved_routes:-0}" + echo "headscale_routes_total{status=\"pending\"} ${pending_routes:-0}" + echo "headscale_routes_total{status=\"disabled\"} ${disabled_routes:-0}" + echo "" + + echo "# HELP headscale_routes_exit_nodes Exit nodes by status" + echo "# TYPE headscale_routes_exit_nodes gauge" + echo "headscale_routes_exit_nodes{status=\"approved\"} ${approved_exit:-0}" + echo "headscale_routes_exit_nodes{status=\"pending\"} ${pending_exit:-0}" + echo "headscale_routes_exit_nodes{status=\"disabled\"} ${disabled_exit:-0}" + echo "" + + # ------------------------------------------------------------------ + # Exporter metadata + # ------------------------------------------------------------------ + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + echo "Headscale Exporter

Headscale Prometheus Exporter

Metrics

" + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================== +# MAIN +# ============================================================================== + +main() { + parse_args "$@" + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + local temp_file + temp_file=$(mktemp "${output_dir}/.headscale_metrics.XXXXXX") + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/hestia-bot-block.sh b/hestia-bot-block.sh new file mode 100755 index 0000000..e1748eb --- /dev/null +++ b/hestia-bot-block.sh @@ -0,0 +1,621 @@ +#!/bin/bash +################################################################################ +# Script Name: hestia-bot-block.sh +# Version: 2.5 +# Description: Configure AI scraper and SEO bot blocking on HestiaCP and +# VestaCP/myVesta servers. Creates an nginx map in conf.d, builds +# custom nginx templates with bot-blocking rules, and optionally +# applies them to specified domains. +# +# Supports incremental map updates and stacking on existing +# templates (e.g., geoip). Safe to re-run — merges new bots +# into existing map without losing custom additions. 
+# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - HestiaCP or VestaCP/myVesta installed and running +# - Root access +# - nginx as proxy (default HestiaCP setup) +# +# Usage: +# sudo ./hestia-bot-block.sh +# sudo ./hestia-bot-block.sh --apply user domain.com +# sudo ./hestia-bot-block.sh --apply-all user +# sudo ./hestia-bot-block.sh --base-template default-geoip +# sudo ./hestia-bot-block.sh --update-map-only +# sudo ./hestia-bot-block.sh --dry-run +# +# Changelog: +# 2.5 — 2026-05-12: Added empty-referer image scrape blocking. Headless +# bot networks (Puppeteer/Playwright on residential proxies) hit +# cover images directly with no referer — now returns 444 for any +# image request (.png/.jpg/.webp) with an empty Referer header. +# 2.4 — 2026-05-11: Added fragment-in-referer blocking (real browsers strip +# URI fragments from the Referer header). Added request method blocking +# (only GET/HEAD allowed — static sites never need POST/PUT/DELETE). +# Added ospa-radar (lead-gen/business intelligence crawler) to blocklist. +# 2.3 — 2026-05-08: Added Exabot (defunct Dassault/Exalead crawler, now +# spoofed) and Sogou (Tencent Chinese search crawler) to blocklist. +# 2.2 — 2026-05-04: Fixed custom entry preservation carrying forward bots +# that were removed from the builtin list. Previously-builtin bots +# (OAI-SearchBot, Claude-Web) are now stripped during map updates. +# 2.1 — 2026-05-04: Removed Claude-Web and OAI-SearchBot from blocklist. +# These are user-facing fetcher bots, not training crawlers. Blocking +# them prevents your content from being cited in AI answers. 
# Status-line helpers: print a colored tag followed by the message on stdout.
#
# Arguments: $* - message text
# Globals:   GREEN, YELLOW, CYAN, NC (read) - color escape sequences
#
# The color variables hold backslash escapes (e.g. '\033[0;32m'), so only they
# are expanded with %b. The message itself is printed with %s, so backslashes
# inside paths or patterns passed by callers are never interpreted.
info() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*"; }
step() { printf '%b[STEP]%b %s\n' "${CYAN}" "${NC}" "$*"; }
+read -r -d '' BOT_LIST <<'BOTLIST' || true + # AI scrapers + ~*ABEvalBot 1; + ~*GPTBot 1; + ~*ClaudeBot 1; + ~*anthropic-ai 1; + ~*CCBot 1; + ~*Bytespider 1; + ~*TikTokSpider 1; + ~*cohere-ai 1; + ~*PerplexityBot 1; + ~*Diffbot 1; + ~*MistralBot 1; + ~*YandexGPTBot 1; + ~*meta-externalagent 1; + ~*Meta-ExternalFetcher 1; + ~*meta-webindexer 1; + ~*PetalBot 1; + ~*Amazonbot 1; + ~*Amzn-SearchBot 1; + ~*AI2Bot 1; + ~*Timpibot 1; + ~*img2dataset 1; + ~*YouBot 1; + ~*HanaleiBot 1; + ~*Trafilatura 1; + + # Defunct crawlers (spoofed user agents) + ~*Exabot 1; + ~*Sogou 1; + + # SEO scrapers + ~*MJ12bot 1; + ~*SemrushBot 1; + ~*AhrefsBot 1; + ~*DotBot 1; + ~*DataForSeoBot 1; + ~*SERanking 1; + + # Vulnerability scanners + ~*Nikto 1; + ~*sqlmap 1; + ~*Nmap 1; + ~*masscan 1; + ~*ZmEu 1; + ~*Morpheus 1; + + # Lead-gen / business intelligence bots + ~*ospa-radar 1; + ~*HubSeedsBot 1; + + # AI scrapers / research bots + ~*Aranet-SearchBot 1; + ~*AzureAI-SearchBot 1; + ~*MINERVA-DeepResearch 1; + ~*NagetBot 1; + ~*LAIABot 1; + ~*pi-coding-agent 1; + + # Probe / monitoring bots + ~*CMS-Checker 1; + ~*NexoFaviconBot 1; + ~*AwarioBot 1; + ~*AwarioSmartBot 1; + ~*CopyousBot 1; + ~*SurdotlyBot 1; + ~*trendictionbot 1; + ~*wpbot 1; + ~*WebFetchTool 1; + ~*YisouSpider 1; + + # Scraping frameworks + ~*Scrapy 1; + ~*python-requests 1; + ~*Go-http-client 1; + ~*Java/ 1; + ~*libwww-perl 1; + ~*node-fetch 1; + ~*HeadlessChrome 1; + + # Outdated browsers (Chrome < 115 — almost certainly bots) + ~*Chrome/([1-9][0-9]?|10[0-9]|11[0-4])\. 
# Extract bot patterns from an existing map file (lowercase for comparison).
#
# Arguments: $1 - path to the nginx map file
# Outputs:   one lowercased pattern per line on stdout, de-duplicated;
#            nothing when the file is missing or contains no patterns.
# Returns:   0 even when nothing matched.
#
# grep exits non-zero when no line matches (or the file is unreadable). With
# `set -o pipefail` that status would become the function's status and, under
# `set -e`, abort a caller doing `var=$(get_existing_patterns …)`. "No
# patterns" is a normal result here, so the grep status is neutralised.
get_existing_patterns() {
    local file="$1"
    { grep -oP '~\*\S+|^\s*""' -- "$file" 2>/dev/null || true; } \
        | tr '[:upper:]' '[:lower:]' \
        | sort -u
}
-f "$file" ]]; then + return + fi + + local builtin_patterns + builtin_patterns=$(get_builtin_patterns) + + # Read each bot line from the existing file, keep ones not in our list + while IFS= read -r line; do + # skip comments, blank lines, map header/footer, default line + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ "$line" =~ ^[[:space:]]*$ ]] && continue + [[ "$line" =~ ^map ]] && continue + [[ "$line" =~ ^\} ]] && continue + [[ "$line" =~ default ]] && continue + + # extract the pattern from this line + local pattern + pattern=$(echo "$line" | grep -oP '~\*\S+|^\s*""' | tr '[:upper:]' '[:lower:]' | head -1) + [[ -z "$pattern" ]] && continue + + # skip if in our built-in list + if echo "$builtin_patterns" | grep -qxF "$pattern"; then + continue + fi + + # skip if in the removed list (previously builtin, intentionally dropped) + if echo "$REMOVED_BOTS" | grep -qxF "$pattern"; then + continue + fi + + echo "$line" + done < "$file" +} + +step "Configuring bot-block map at ${MAP_FILE}" + +CUSTOM_ENTRIES="" +ADDED_NEW=0 +if [[ -f "$MAP_FILE" ]]; then + # Detect custom entries added by the user + CUSTOM_ENTRIES=$(get_custom_entries "$MAP_FILE") + + # Count new bots being added + existing=$(get_existing_patterns "$MAP_FILE") + while IFS= read -r pattern; do + [[ -z "$pattern" ]] && continue + if ! 
echo "$existing" | grep -qxF "$pattern"; then + ADDED_NEW=$((ADDED_NEW + 1)) + fi + done <<< "$(get_builtin_patterns)" + + if [[ -n "$CUSTOM_ENTRIES" ]]; then + custom_count=$(echo "$CUSTOM_ENTRIES" | wc -l) + info "Found ${custom_count} custom bot entries — will preserve them" + fi +fi + +# Build the full map content +MAP_CONTENT="# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners +# Generated by hestia-bot-block.sh — https://mylinux.work +# Last updated: $(date '+%Y-%m-%d %H:%M:%S') + +map \$http_user_agent \$is_bad_bot { + default 0; + +${BOT_LIST}" + +# Append custom entries if any +if [[ -n "$CUSTOM_ENTRIES" ]]; then + MAP_CONTENT="${MAP_CONTENT} + + # Custom entries (preserved from previous configuration) +${CUSTOM_ENTRIES}" +fi + +MAP_CONTENT="${MAP_CONTENT} +}" + +if [[ "$DRY_RUN" == "true" ]]; then + if [[ -f "$MAP_FILE" ]]; then + echo " Would update: ${MAP_FILE}" + [[ -n "$CUSTOM_ENTRIES" ]] && echo " Would preserve: $(echo "$CUSTOM_ENTRIES" | wc -l) custom entries" + [[ "$ADDED_NEW" -gt 0 ]] && echo " Would add: ${ADDED_NEW} new bot patterns" + else + echo " Would create: ${MAP_FILE}" + fi +else + if [[ -f "$MAP_FILE" ]]; then + cp "$MAP_FILE" "${MAP_FILE}.bak.$(date +%s)" + if [[ "$ADDED_NEW" -gt 0 ]]; then + info "Map updated: ${MAP_FILE} (${ADDED_NEW} new patterns added)" + else + info "Map updated: ${MAP_FILE} (already current)" + fi + else + info "Map created: ${MAP_FILE}" + fi + echo "$MAP_CONTENT" > "$MAP_FILE" +fi + +# ===================================================== +# If --update-map-only, skip templates and just reload +# ===================================================== +if [[ "$UPDATE_MAP_ONLY" == "true" ]]; then + step "Testing nginx configuration" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" + else + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed — restoring backup${NC}" >&2 + latest_bak=$(ls -t "${MAP_FILE}.bak."* 
# Build the bot-blocking variant of one panel template.
#
# Arguments:
#   $1 - source (base) template path
#   $2 - destination template path
#   $3 - human-readable label used in log messages
# Globals:
#   DRY_RUN (read)             - "true" only reports what would be created
#   BOT_BLOCK_DIRECTIVE (read) - nginx snippet inserted before the first
#                                `location` line of the source
# Returns: 0 on success or when skipped; awk's status if the rewrite fails.
#
# The snippet is handed to awk through the environment, not `-v`. awk applies
# escape-sequence processing to `-v` values, which can strip the backslash
# from regexes in the snippet (e.g. "\.(png|jpg|webp)$" becoming
# ".(png|jpg|webp)$") and silently change what nginx matches. ENVIRON values
# are passed through untouched.
create_template() {
    local src="$1" dst="$2" label="$3"

    if [[ ! -f "$src" ]]; then
        warn "Source template not found: ${src} — skipping ${label}"
        return 0
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would create: ${dst} (from ${src})"
        return 0
    fi

    # Keep a timestamped copy of any previous destination before overwriting.
    if [[ -f "$dst" ]]; then
        cp "$dst" "${dst}.bak.$(date +%s)"
        warn "Existing ${label} template backed up"
    fi

    # Check if the source already has bot blocking (avoid double-injection)
    if grep -q 'is_bad_bot' "$src"; then
        cp "$src" "$dst"
        info "Created ${label} template: ${dst} (bot blocking already present in base)"
        return 0
    fi

    # Insert the directive before the first 'location' line. awk exits 3 when
    # the source has no such line so the caller is not told blocking is active
    # when nothing was injected.
    local rc=0
    BOT_BLOCK_SNIPPET="$BOT_BLOCK_DIRECTIVE" awk '
        !inserted && /^[[:space:]]*location[[:space:]]/ {
            print ENVIRON["BOT_BLOCK_SNIPPET"]
            print ""
            inserted = 1
        }
        { print }
        END { if (!inserted) exit 3 }
    ' "$src" > "$dst" || rc=$?

    case "$rc" in
        0) info "Created ${label} template: ${dst}" ;;
        3) warn "No location block found in ${src} — ${label} template created without bot blocking" ;;
        *) return "$rc" ;;
    esac
}
-f "$BASE_TPL" ]]; then + echo -e "${RED}Error: Default template not found: ${BASE_TPL}${NC}" >&2 + exit 1 + fi +fi + +step "Creating custom ${PANEL_NAME} nginx templates (${TEMPLATE_NAME}) from ${BASE_TEMPLATE}" + +# Proxy templates +create_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.tpl" \ + "HTTP (.tpl)" + +create_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.stpl" \ + "SSL (.stpl)" + +# php-fpm templates (if they exist for the base) +if [[ -d "${PANEL_TPL_DIR}/php-fpm" ]]; then + if [[ -f "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" ]]; then + create_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.tpl" \ + "php-fpm HTTP (.tpl)" + + create_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.stpl" \ + "php-fpm SSL (.stpl)" + fi +fi + +# Copy .sh hooks from the base template if they exist +for ext in tpl stpl; do + base_sh="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.${ext}.sh" + dst_sh="${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}.sh" + if [[ -f "$base_sh" && ! -f "$dst_sh" ]]; then + cp "$base_sh" "$dst_sh" + info "Copied hook: ${dst_sh}" + fi +done + +# ===================================================== +# Step 3: Validate nginx config +# ===================================================== +step "Testing nginx configuration" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" +else + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2 + echo " Restore backups from ${PANEL_TPL_DIR} and ${CONF_DIR}" >&2 + exit 1 + fi +fi + +# ===================================================== +# Step 4: Apply template (optional) +# ===================================================== +if [[ -n "$APPLY_USER" ]]; then + if ! 
command -v v-change-web-domain-proxy-tpl &>/dev/null; then + echo -e "${RED}Error: v-change-web-domain-proxy-tpl not found${NC}" >&2 + exit 1 + fi + + if [[ "$APPLY_ALL" == "true" ]]; then + step "Applying template to all domains for user: ${APPLY_USER}" + domains=$(v-list-web-domains "$APPLY_USER" plain 2>/dev/null | awk '{print $1}') + if [[ -z "$domains" ]]; then + warn "No domains found for user: ${APPLY_USER}" + else + while IFS= read -r domain; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}" + else + v-change-web-domain-proxy-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" + info "Applied to: ${domain}" + fi + done <<< "$domains" + fi + else + step "Applying template to: ${APPLY_DOMAIN}" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}" + else + v-change-web-domain-proxy-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME" + info "Applied to: ${APPLY_DOMAIN}" + fi + fi +fi + +# ===================================================== +# Step 5: Reload nginx +# ===================================================== +step "Reloading nginx" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" +else + systemctl reload nginx + info "nginx reloaded" +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Map: ${MAP_FILE}" +echo " Base: ${BASE_TEMPLATE}" +echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" +if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi +else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " 
$(basename "$0") --apply-all " +fi +echo "" +echo " To add new bots later without touching templates:" +echo " sudo $(basename "$0") --update-map-only" +echo "" +echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 444 (connection dropped) or 000 (no response)" diff --git a/hestia-geoip-setup.sh b/hestia-geoip-setup.sh new file mode 100755 index 0000000..4a9a5dc --- /dev/null +++ b/hestia-geoip-setup.sh @@ -0,0 +1,821 @@ +#!/bin/bash +################################################################################ +# Script Name: hestia-geoip-setup.sh +# Version: 1.0 +# Description: Configure GeoIP2 enriched logging on HestiaCP / VestaCP / +# myVesta servers. Installs GeoIP2 module, downloads databases +# (MaxMind or DB-IP free), creates nginx geoip2 variable mappings +# and enriched log format in conf.d, builds custom nginx templates +# with enriched access_log, and optionally applies to domains. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - HestiaCP or VestaCP/myVesta installed and running +# - nginx as proxy (default HestiaCP setup) +# - Root access +# - MaxMind account OR use --db-ip for no-signup alternative +# +# Usage: +# sudo ./hestia-geoip-setup.sh --db-ip +# sudo ./hestia-geoip-setup.sh --db-ip --base-template default-botblock +# sudo ./hestia-geoip-setup.sh --account-id 123456 --license-key ABCDEF +# sudo ./hestia-geoip-setup.sh --db-ip --apply admin example.com +# sudo ./hestia-geoip-setup.sh --db-ip --apply-all admin +# sudo ./hestia-geoip-setup.sh --dry-run --db-ip +# sudo ./hestia-geoip-setup.sh --remove +# +################################################################################ + +set -euo pipefail + +# --- Configuration --- +GEOIP_CONF="/etc/GeoIP.conf" +GEOIP_DB_DIR="/usr/share/GeoIP" +NGINX_GEOIP_CONF="/etc/nginx/conf.d/geoip2.conf" +NGINX_LOG_CONF="/etc/nginx/conf.d/enriched-log-format.conf" 
# Locate the control panel's nginx template directory.
#
# Globals written: PANEL_NAME, PANEL_TPL_DIR
# Globals read:    RED, NC
# HestiaCP is probed first, then VestaCP/myVesta; the first directory that
# exists wins and is reported through info. If neither exists an error is
# printed to stderr and the script exits with status 1.
detect_panel() {
    local candidate
    for candidate in \
        "HestiaCP:/usr/local/hestia/data/templates/web/nginx" \
        "VestaCP/myVesta:/usr/local/vesta/data/templates/web/nginx"; do
        # Each entry is "<panel name>:<template dir>".
        if [[ -d "${candidate#*:}" ]]; then
            PANEL_TPL_DIR="${candidate#*:}"
            PANEL_NAME="${candidate%%:*}"
            info "Detected ${PANEL_NAME} (${PANEL_TPL_DIR})"
            return
        fi
    done

    echo -e "${RED}Error: Neither HestiaCP nor VestaCP/myVesta found${NC}" >&2
    exit 1
}
# Save a timestamped copy of a file next to the original.
#
# Arguments: $1 - path of the file to back up
# Globals:   TIMESTAMP (read) - suffix appended as "<file>.bak.<TIMESTAMP>"
# Returns:   0 when the path is not a regular file (nothing to do);
#            otherwise the exit status of cp.
backup_file() {
    local target="$1"
    [[ -f "$target" ]] || return 0
    cp "$target" "${target}.bak.${TIMESTAMP}"
}
"Removed: ${tpl_file}" + fi + fi + done + done + + # Disable Apache enriched log config + if [[ -f "$APACHE_LOG_CONF" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: a2disconf enriched-log-format" + else + a2disconf -q enriched-log-format 2>/dev/null || true + fi + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" + echo " Would run: systemctl reload nginx" + else + step "Testing nginx configuration" + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed — restore .bak files${NC}" >&2 + exit 1 + fi + + step "Reloading nginx" + systemctl reload nginx + info "nginx reloaded" + + if systemctl is-active --quiet apache2 2>/dev/null; then + step "Reloading Apache" + systemctl reload apache2 + info "Apache reloaded" + fi + fi + + echo "" + echo -e "${BOLD}GeoIP2 configuration removed.${NC}" + echo " Domains still using the ${TEMPLATE_NAME} template should be switched back." + exit 0 +fi + +# --- Prompt for credentials if not using DB-IP --- +if [[ "$USE_DBIP" != "true" ]]; then + if [[ -z "$ACCOUNT_ID" ]]; then + read -rp "MaxMind Account ID (or Ctrl+C and rerun with --db-ip): " ACCOUNT_ID + fi + if [[ -z "$LICENSE_KEY" ]]; then + read -rp "MaxMind License Key: " LICENSE_KEY + fi + + if [[ -z "$ACCOUNT_ID" || -z "$LICENSE_KEY" ]]; then + echo -e "${RED}Error: MaxMind credentials required (or use --db-ip for no-signup alternative)${NC}" >&2 + exit 1 + fi +fi + +# ===================================================== +# Step 1: Install packages and GeoIP2 nginx module +# ===================================================== +NGINX_MOD_DIR="/usr/lib/nginx/modules" +GEOIP2_MOD="${NGINX_MOD_DIR}/ngx_http_geoip2_module.so" +LOAD_MOD_CONF="/etc/nginx/modules-enabled/50-mod-http-geoip2.conf" + +if [[ "$SKIP_PACKAGES" == "true" ]]; then + warn "Skipping package installation (--skip-packages)" +else + # Hestia uses nginx from nginx.org — Ubuntu/Debian libnginx-mod-* packages + # 
conflict with it. Install only libmaxminddb-dev + build tools, then compile + # the geoip2 module as a dynamic .so against the installed nginx. + step "Installing build dependencies" + + if [[ "$USE_DBIP" == "true" ]]; then + PACKAGES="libmaxminddb0 libmaxminddb-dev mmdb-bin curl build-essential git libpcre2-dev libssl-dev zlib1g-dev" + else + PACKAGES="libmaxminddb0 libmaxminddb-dev mmdb-bin geoipupdate build-essential git libpcre2-dev libssl-dev zlib1g-dev" + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: apt-get update && apt-get install -y ${PACKAGES}" + else + apt-get update -qq + # shellcheck disable=SC2086 + apt-get install -y $PACKAGES + info "Packages installed: ${PACKAGES}" + fi + + # Build the geoip2 dynamic module if not already present + if [[ -f "$GEOIP2_MOD" ]]; then + info "GeoIP2 module already exists: ${GEOIP2_MOD}" + else + step "Building ngx_http_geoip2_module for nginx $(nginx -v 2>&1 | grep -oP '[\d.]+')" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would compile ngx_http_geoip2_module.so" + else + NGINX_VER=$(nginx -v 2>&1 | grep -oP '[\d.]+') + BUILD_DIR=$(mktemp -d) + + cd "$BUILD_DIR" + curl -fsSL "https://nginx.org/download/nginx-${NGINX_VER}.tar.gz" -o nginx.tar.gz + tar xzf nginx.tar.gz + + git clone --depth 1 https://github.com/leev/ngx_http_geoip2_module.git + + cd "nginx-${NGINX_VER}" + ./configure --with-compat --add-dynamic-module=../ngx_http_geoip2_module + make modules + + mkdir -p "$NGINX_MOD_DIR" + cp objs/ngx_http_geoip2_module.so "$NGINX_MOD_DIR/" + cp objs/ngx_stream_geoip2_module.so "$NGINX_MOD_DIR/" 2>/dev/null || true + + cd / + rm -rf "$BUILD_DIR" + info "Built and installed: ${GEOIP2_MOD}" + fi + fi + + # Ensure the module is loaded + if [[ ! 
-f "$LOAD_MOD_CONF" ]] && [[ "$DRY_RUN" != "true" ]]; then + mkdir -p "$(dirname "$LOAD_MOD_CONF")" + echo "load_module ${GEOIP2_MOD};" > "$LOAD_MOD_CONF" + info "Created: ${LOAD_MOD_CONF}" + fi +fi + +# ===================================================== +# Step 2: Write GeoIP.conf (MaxMind only) +# ===================================================== +if [[ "$USE_DBIP" != "true" ]]; then + step "Writing MaxMind configuration to ${GEOIP_CONF}" + + GEOIP_CONF_CONTENT="${MARKER_START} +AccountID ${ACCOUNT_ID} +LicenseKey ${LICENSE_KEY} +EditionIDs GeoLite2-City GeoLite2-ASN GeoLite2-Country +DatabaseDirectory ${GEOIP_DB_DIR} +${MARKER_END}" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${GEOIP_CONF}" + else + backup_file "$GEOIP_CONF" + echo "$GEOIP_CONF_CONTENT" > "$GEOIP_CONF" + chmod 600 "$GEOIP_CONF" + info "Created: ${GEOIP_CONF}" + fi +fi + +# ===================================================== +# Step 3: Download GeoIP databases +# ===================================================== +if [[ "$USE_DBIP" == "true" ]]; then + step "Downloading DB-IP free databases to ${GEOIP_DB_DIR}" + + DBIP_MONTH=$(date +%Y-%m) + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would download: dbip-country-lite-${DBIP_MONTH}.mmdb.gz" + echo " Would download: dbip-asn-lite-${DBIP_MONTH}.mmdb.gz" + else + mkdir -p "$GEOIP_DB_DIR" + cd "$GEOIP_DB_DIR" + + curl -fsSL "https://download.db-ip.com/free/dbip-country-lite-${DBIP_MONTH}.mmdb.gz" -o dbip-country.mmdb.gz + gunzip -f dbip-country.mmdb.gz + mv dbip-country.mmdb GeoLite2-City.mmdb + info "Downloaded: DB-IP Country → GeoLite2-City.mmdb" + + curl -fsSL "https://download.db-ip.com/free/dbip-asn-lite-${DBIP_MONTH}.mmdb.gz" -o dbip-asn.mmdb.gz + gunzip -f dbip-asn.mmdb.gz + mv dbip-asn.mmdb GeoLite2-ASN.mmdb + info "Downloaded: DB-IP ASN → GeoLite2-ASN.mmdb" + + cd - >/dev/null + fi +else + step "Downloading MaxMind GeoIP2 databases to ${GEOIP_DB_DIR}" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would 
run: mkdir -p ${GEOIP_DB_DIR}" + echo " Would run: geoipupdate" + else + mkdir -p "$GEOIP_DB_DIR" + geoipupdate -v + info "GeoIP2 databases downloaded" + fi +fi + +# ===================================================== +# Step 4: Create nginx geoip2 config in conf.d +# ===================================================== +step "Creating nginx GeoIP2 config at ${NGINX_GEOIP_CONF}" + +NGINX_GEOIP_CONTENT="${MARKER_START} +# GeoIP2 variable mappings — generated by hestia-geoip-setup.sh + +geoip2 ${GEOIP_DB_DIR}/GeoLite2-City.mmdb { + auto_reload 60m; + \$geoip2_country_code country iso_code; + \$geoip2_country_name country names en; + \$geoip2_city_name city names en; +} + +geoip2 ${GEOIP_DB_DIR}/GeoLite2-ASN.mmdb { + auto_reload 60m; + \$geoip2_asn autonomous_system_number; + \$geoip2_asn_org autonomous_system_organization; +} +${MARKER_END}" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${NGINX_GEOIP_CONF}" +else + backup_file "$NGINX_GEOIP_CONF" + echo "$NGINX_GEOIP_CONTENT" > "$NGINX_GEOIP_CONF" + info "Created: ${NGINX_GEOIP_CONF}" +fi + +# ===================================================== +# Step 5: Create enriched log format in conf.d +# ===================================================== +step "Creating enriched log format at ${NGINX_LOG_CONF}" + +NGINX_LOG_CONTENT="${MARKER_START} +# Enriched log format with GeoIP2 data — generated by hestia-geoip-setup.sh + +log_format enriched '\$remote_addr - \$remote_user [\$time_local] ' + '\"\$request\" \$status \$body_bytes_sent ' + '\"\$http_referer\" \"\$http_user_agent\" ' + '\$geoip2_country_code \"\$geoip2_asn_org\"'; +${MARKER_END}" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${NGINX_LOG_CONF}" +else + backup_file "$NGINX_LOG_CONF" + echo "$NGINX_LOG_CONTENT" > "$NGINX_LOG_CONF" + info "Created: ${NGINX_LOG_CONF}" +fi + +# ===================================================== +# Step 5b: Create Apache enriched log format +# 
#######################################
# Derive an Apache vhost template that logs in the 'enriched' format.
# Copies the source template, switches every "CustomLog ....log combined"
# to 'enriched', and defines the enriched LogFormat just before the first
# CustomLog line so the vhost is self-contained.
# Globals:   DRY_RUN (read)
# Arguments: $1 - source template path
#            $2 - destination template path
#            $3 - human-readable label used in messages
# Outputs:   progress via info/warn; the dry-run notice on stdout
# Returns:   status of the last command run
#######################################
create_apache_template() {
    local src="$1" dst="$2" label="$3"
    local enriched_format

    if [[ ! -f "$src" ]]; then
        warn "Source Apache template not found: ${src} — skipping ${label}"
        return
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would create: ${dst} (from ${src})"
        return
    fi

    backup_file "$dst"

    # awk -v collapses each \\ to \, so the template receives \"%r\" etc.
    enriched_format='    LogFormat "%h %l %u %t \\"%r\\" %>s %b \\"%{Referer}i\\" \\"%{User-Agent}i\\" %{X-GeoIP-Country}i \\"%{X-GeoIP-ASN}i\\"" enriched'

    sed 's/\(CustomLog.*\.log\) combined/\1 enriched/g' "$src" \
        | awk -v fmt="$enriched_format" '
            !injected && /CustomLog/ {
                print fmt
                injected = 1
            }
            { print }
        ' > "$dst"

    # Look for a CustomLog that was actually switched. Searching for the bare
    # word "enriched" would always succeed once the LogFormat line (which
    # contains that word) has been injected, hiding templates that never
    # used the 'combined' format.
    if grep -q 'CustomLog.*\.log enriched' "$dst"; then
        info "Created ${label} template: ${dst}"
    else
        cp "$src" "$dst"
        warn "No 'combined' format found in ${src} — copied unchanged"
    fi
}

#######################################
# Derive an nginx template that logs in the 'enriched' format and forwards
# the GeoIP2 lookups to the backend as request headers.
# Globals:   DRY_RUN (read)
# Arguments: $1 - source template path
#            $2 - destination template path
#            $3 - human-readable label used in messages
# Outputs:   progress via info/warn; the dry-run notice on stdout
# Returns:   status of the last command run
#######################################
create_template() {
    local src="$1" dst="$2" label="$3"
    local has_proxy_headers

    if [[ ! -f "$src" ]]; then
        warn "Source template not found: ${src} — skipping ${label}"
        return
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo " Would create: ${dst} (from ${src})"
        return
    fi

    backup_file "$dst"

    # Adding ANY proxy_set_header to a location stops nginx from inheriting
    # the ones defined at an outer level. When the base template defines
    # none of its own, the standard set is written alongside the GeoIP
    # headers so Host and the forwarding headers are not lost (which
    # otherwise shows up as 421 errors).
    # grep -c prints 0 and exits 1 on no match, hence the "|| true".
    has_proxy_headers=$(grep -c 'proxy_set_header' "$src" || true)

    # Headers are injected once, right after the first proxy_pass directive.
    sed 's/\(access_log.*\.log\) combined;/\1 enriched;/g' "$src" \
        | awk -v d='$' -v need_std="$has_proxy_headers" '
            !injected && /proxy_pass/ {
                print
                if (need_std == 0) {
                    print "\t\tproxy_set_header Host " d "host;"
                    print "\t\tproxy_set_header X-Real-IP " d "remote_addr;"
                    print "\t\tproxy_set_header X-Forwarded-For " d "proxy_add_x_forwarded_for;"
                    print "\t\tproxy_set_header X-Forwarded-Proto " d "scheme;"
                }
                print "\t\tproxy_set_header X-GeoIP-Country " d "geoip2_country_code;"
                print "\t\tproxy_set_header X-GeoIP-ASN " d "geoip2_asn_org;"
                injected = 1
                next
            }
            { print }
        ' > "$dst"

    # Report each of the two rewrites separately so a partial result is visible.
    if grep -q "enriched" "$dst"; then
        info "Created ${label} template: ${dst} (log format updated)"
    else
        warn "No 'combined' log format found in ${src} — format not changed"
        warn " Manually change access_log format to 'enriched' in ${dst}"
    fi

    if grep -q "proxy_set_header X-GeoIP-Country" "$dst"; then
        info "Injected GeoIP proxy headers into ${dst}"
    else
        warn "No proxy_pass found in ${src} — GeoIP headers not injected"
    fi
}
# =====================================================
# Step 7: Create cron job for database updates
# =====================================================

#######################################
# Print the monthly DB-IP refresh script.
# The generated script downloads each archive to a side file, unpacks it
# next to the live database and only then renames it into place. A failed
# or partial transfer therefore never truncates the .mmdb that nginx has
# open (the previous "curl | gunzip > live.mmdb" emptied the database
# before the download had succeeded).
# Globals:   GEOIP_DB_DIR (read) - directory holding the .mmdb files
# Arguments: none
# Outputs:   the script text on stdout
#######################################
_dbip_cron_script() {
    # shellcheck disable=SC2016 — the single-quoted lines are literal script text
    printf '%s\n' \
        '#!/bin/sh' \
        '# Monthly DB-IP database update — generated by hestia-geoip-setup.sh' \
        'MONTH=$(date +%Y-%m)' \
        "cd \"${GEOIP_DB_DIR}\" || exit 1" \
        '' \
        '# $1 = DB-IP edition, $2 = target file name' \
        'fetch_db() {' \
        '    if curl -fsSL "https://download.db-ip.com/free/$1-${MONTH}.mmdb.gz" -o "$2.new.gz" \' \
        '        && gunzip -c "$2.new.gz" > "$2.new" \' \
        '        && [ -s "$2.new" ]; then' \
        '        rm -f "$2.new.gz"' \
        '        mv -f "$2.new" "$2"' \
        '    else' \
        '        rm -f "$2.new.gz" "$2.new"' \
        '        return 1' \
        '    fi' \
        '}' \
        '' \
        'fetch_db dbip-country-lite GeoLite2-City.mmdb && \' \
        '    fetch_db dbip-asn-lite GeoLite2-ASN.mmdb && \' \
        '    logger -t dbip-update "DB-IP databases updated"'
}

if [[ "$USE_DBIP" == "true" ]]; then
    CRON_FILE="$CRON_MONTHLY"
    step "Creating monthly DB-IP update script at ${CRON_FILE}"

    CRON_CONTENT=$(_dbip_cron_script)
else
    CRON_FILE="$CRON_WEEKLY"
    step "Creating weekly geoipupdate script at ${CRON_FILE}"

    CRON_CONTENT="#!/bin/sh
# Weekly GeoIP2 database update — generated by hestia-geoip-setup.sh
/usr/bin/geoipupdate -s 2>&1 | logger -t geoipupdate"
fi

if [[ "$DRY_RUN" == "true" ]]; then
    echo " Would create: ${CRON_FILE}"
else
    backup_file "$CRON_FILE"
    echo "$CRON_CONTENT" > "$CRON_FILE"
    chmod 755 "$CRON_FILE"
    info "Created: ${CRON_FILE}"
fi
else + v-change-web-domain-proxy-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME" + info "Nginx proxy template applied to: ${APPLY_DOMAIN}" + fi + fi + + # Apply Apache backend template (if Apache templates were created) + if [[ -n "$APACHE_TPL_DIR" ]] && [[ -f "${APACHE_TPL_DIR}/${TEMPLATE_NAME}.tpl" ]]; then + if command -v v-change-web-domain-tpl &>/dev/null; then + if [[ "$APPLY_ALL" == "true" ]]; then + step "Applying Apache backend template to all domains for user: ${APPLY_USER}" + while IFS= read -r domain; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}" + else + v-change-web-domain-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" + info "Apache template applied to: ${domain}" + fi + done <<< "$domains" + else + step "Applying Apache backend template to: ${APPLY_DOMAIN}" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}" + else + v-change-web-domain-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME" + info "Apache template applied to: ${APPLY_DOMAIN}" + fi + fi + fi + fi +fi + +# ===================================================== +# Step 10: Reload nginx and Apache +# ===================================================== +step "Reloading web servers" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" + echo " Would run: systemctl reload apache2" +else + systemctl reload nginx + info "nginx reloaded" + + if systemctl is-active --quiet apache2 2>/dev/null; then + systemctl reload apache2 + info "Apache reloaded" + fi +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " GeoIP config: ${NGINX_GEOIP_CONF}" +echo " Log format: ${NGINX_LOG_CONF}" +echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" +echo " Base: ${BASE_TEMPLATE}" +echo " DB 
updates: ${CRON_FILE}" +if [[ "$USE_DBIP" == "true" ]]; then + echo " DB source: DB-IP Lite (no signup)" +else + echo " DB source: MaxMind GeoLite2" +fi +if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi +else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " sudo $(basename "$0") --apply-all " +fi +echo "" +echo " To remove: sudo $(basename "$0") --remove" diff --git a/hestia-js-challenge.sh b/hestia-js-challenge.sh new file mode 100644 index 0000000..2b10f27 --- /dev/null +++ b/hestia-js-challenge.sh @@ -0,0 +1,796 @@ +#!/bin/bash +################################################################################ +# Script Name: hestia-js-challenge.sh +# Version: 3.1 +# Description: Adds a lightweight JavaScript cookie challenge to nginx on +# HestiaCP / VestaCP / myVesta servers. Creates an nginx map, +# custom templates with JS challenge rules, and a challenge HTML +# page. Bots that don't execute JavaScript are silently dropped. +# Headless Chrome bots from suspect GeoIP regions with no external +# referrer are tarpitted (served at 50 bytes/sec). +# Detects and stacks on existing templates (e.g., geoip, botblock). +# Works alongside hestia-bot-block.sh — run that first. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - HestiaCP, VestaCP, or myVesta installed +# - nginx installed and running +# - Root access +# +# Usage: +# sudo ./hestia-js-challenge.sh +# sudo ./hestia-js-challenge.sh --dry-run +# sudo ./hestia-js-challenge.sh --base-template default-botblock +# sudo ./hestia-js-challenge.sh --base-template geoip-botblock --apply-all admin +# sudo ./hestia-js-challenge.sh --remove +# +# How it works: +# 1. Whitelisted bot UAs (Googlebot, Bingbot, etc.) 
bypass the check entirely +# 2. All other visitors must have a cookie with a randomized name and token +# 3. First-time visitors get a brief redirect to a challenge page that sets +# the cookie via JS and bounces them back — takes < 100ms +# 4. Bots that don't run JS never get the cookie and get 444'd +# 5. Cookie name and token are randomized per installation — re-running the +# script rotates them, immediately invalidating old pre-set cookies +# +# Changelog: +# 3.1 — 2026-05-21: Challenge endpoint rate limiting. Headless Chrome bot farms +# (China Unicom Shanghai, China Telecom) were passing the JS challenge on +# every request by spawning fresh browser instances without persistent +# cookies. They hit /_bc on every page load (~35s intervals) while real +# users hit it once and keep the cookie for 24h. Added limit_req_zone on +# the challenge endpoint: 3 requests allowed (burst), then 1/min sustained. +# Excess requests get 444. Added --challenge-burst and --challenge-rate +# options for tuning. +# 3.0 — 2026-05-20: Referrer tracking through challenge redirect. Original +# HTTP Referer is passed as &ref= param in the 302 redirect. Challenge +# JS stores it in a _bc_ref cookie. Tarpit map: visitors from suspect +# GeoIP countries (CN by default) with no external referrer are served +# at 50 bytes/sec via limit_rate, draining headless Chrome resources. +# Requires ngx_http_geoip2_module for GeoIP-based tarpitting. +# Added --tarpit-countries and --tarpit-rate options. +# 2.0 — 2026-05-19: Randomized cookie name and token per installation. +# Cookie name is now a random 2-character suffix (e.g. _v7, _xq). +# Cookie value is now a 32-char hex token instead of static "verified". +# Values persist in /etc/nginx/js-challenge.env for --update-html-only. +# Re-running rotates credentials and invalidates old bot bypass cookies. +# Added no-cache headers on challenge page to prevent stale HTML after +# rotation. Fixed challenge page Secure flag to be conditional on HTTPS. 
#######################################
# Status-line helpers: print a coloured tag followed by the message.
# Only the colour variables are expanded for backslash escapes (%b); the
# message itself is printed verbatim (%s), so text that contains sequences
# such as \n or \. is shown exactly as passed.
# Globals:   GREEN, YELLOW, CYAN, NC (read) - ANSI colour sequences
# Arguments: message words, joined with single spaces
# Outputs:   one line on stdout
#######################################
info() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
step() { printf '%b[STEP]%b %s\n' "$CYAN" "$NC" "$*"; }
#######################################
# Create a fresh cookie name/token pair for this installation.
# The name is an underscore followed by one random byte in hex; the token
# is sixteen random bytes in hex. COOKIE_VAR is the matching nginx
# variable reference ($cookie_<name>).
# Globals:   COOKIE_NAME, COOKIE_VALUE, COOKIE_VAR (written)
# Arguments: none
# Outputs:   none
#######################################
generate_credentials() {
    local name_suffix token

    name_suffix=$(openssl rand -hex 1)
    token=$(openssl rand -hex 16)

    COOKIE_NAME="_${name_suffix}"
    COOKIE_VALUE="${token}"
    COOKIE_VAR="\$cookie_${COOKIE_NAME}"
}
command -v nginx &>/dev/null; then + echo -e "${RED}Error: nginx not found${NC}" >&2 + exit 1 +fi + +# ===================================================== +# Step 1: Create the challenge HTML page +# ===================================================== +step "Creating challenge page at ${CHALLENGE_HTML}" + +CHALLENGE_CONTENT=' + +Verifying + + + + + +' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_DIR}/" + echo " Would create: ${CHALLENGE_HTML}" +else + mkdir -p "$CHALLENGE_DIR" + echo "$CHALLENGE_CONTENT" > "$CHALLENGE_HTML" + info "Challenge page created: ${CHALLENGE_HTML}" +fi + +# ===================================================== +# If --update-html-only, stop here +# ===================================================== +if [[ "$UPDATE_HTML_ONLY" == "true" ]]; then + echo "" + echo -e "${BOLD}Done.${NC} Challenge HTML updated — templates unchanged." + echo " HTML: ${CHALLENGE_HTML}" + echo " Cookie: ${COOKIE_NAME}=${COOKIE_VALUE:0:8}..." + echo "" + echo " No nginx reload needed — the HTML is served as a static file." + exit 0 +fi + +# Save credentials for future --update-html-only runs +save_credentials + +# ===================================================== +# Step 2: Create nginx map config +# ===================================================== +step "Creating JS challenge map at ${CHALLENGE_MAP}" + +# Build the cookie variable name for nginx (e.g. _v7 → $cookie__v7) +NGINX_COOKIE_VAR="\$cookie_${COOKIE_NAME}" + +# Check if a geoip2 block already loads an mmdb anywhere in nginx config. +# If so, $geoip2_data_country_code should already be defined — don't duplicate. +# Exclude our own file and backup files from the search. +GEOIP2_BLOCK="" +if ! grep -r 'geoip2.*\.mmdb' /etc/nginx/ \ + --include='*.conf' --exclude='js-challenge.conf' --exclude='*.bak.*' \ + -q 2>/dev/null; then + GEOIP2_BLOCK=' +# ── GeoIP2: country lookup for tarpit decisions ────────────────────── +# Uses the City database (superset of Country). 
#######################################
# Print one nginx map entry per unique server_name found in the given
# config files. Each entry marks "image request + referer that starts with
# this site" as same-site.
# Skipped: the catch-all "_", names starting with a digit (IP addresses),
# and anything that is not a plain host name (wildcards, regex names),
# since those cannot be turned into a literal referer prefix.
# Arguments: config file paths; paths that are not regular files are ignored
# Outputs:   map entries on stdout, one per line, in first-seen order
# Returns:   0
#######################################
_jsc_referer_entries() {
    local conf names name
    local -a tokens=()
    local -A seen=()

    for conf in "$@"; do
        [[ -f "$conf" ]] || continue
        while IFS= read -r names; do
            # Split on whitespace without globbing, so a name such as
            # *.example.com is never expanded against the current directory.
            read -ra tokens <<< "$names" || true
            (( ${#tokens[@]} )) || continue
            for name in "${tokens[@]}"; do
                # Drop the directive terminator first so "_;" and
                # "example.com;" are judged by the bare name.
                name="${name%;}"
                if [[ -z "$name" || "$name" == "server_name" || "$name" == "_" ]]; then
                    continue
                fi
                if [[ "$name" =~ ^[0-9] || ! "$name" =~ ^[A-Za-z0-9_.-]+$ ]]; then
                    continue
                fi
                if [[ -n "${seen[$name]:-}" ]]; then
                    continue
                fi
                seen[$name]=1
                printf '    ~^1:https?://%s 1;\n' "${name//./\\.}"
            done
        done < <(grep -oP '^\s*server_name\s+\K[^;]+;?' "$conf" 2>/dev/null)
    done
    return 0
}

# Collect server_name values from nginx configs to build same-site referer map.
# This runs at script top level, so plain assignments are used ("local" is
# only valid inside a function and would abort the script under set -e).
REFERER_ENTRIES=$(_jsc_referer_entries /etc/nginx/conf.d/*.conf /etc/nginx/sites-enabled/*)

if [[ -n "$REFERER_ENTRIES" ]]; then
    # Command substitution strips the final newline; restore a real one so
    # the text that follows the entries in the map starts on its own line.
    REFERER_ENTRIES+=$'\n'
else
    warn "No server_name values found — same-site image bypass will not work"
    warn "Images behind the challenge may cause redirect loops for browsers"
fi
+limit_req_zone $binary_remote_addr zone=jschallenge:10m rate='"${CHALLENGE_RATE}"'r/m; + +# Bots that legitimately identify themselves and should bypass the JS check +map $http_user_agent $is_allowed_bot { + default 0; + + # Search engines + ~*Googlebot 1; + ~*bingbot 1; + ~*Slurp 1; + ~*DuckDuckBot 1; + ~*DuckAssistBot 1; + ~*Baiduspider 1; + ~*YandexBot 1; + ~*YandexFavicons 1; + ~*Applebot 1; + ~*Qwantbot 1; + ~*Qwantify 1; + ~*Bravebot 1; + ~*kagi-fetcher 1; + ~*Kagibot 1; + ~*Yahoo! 1; + ~*Yeti 1; + + # Social media / link previews + ~*facebookexternalhit 1; + ~*Facebot 1; + ~*Twitterbot 1; + ~*LinkedInBot 1; + ~*Slackbot 1; + ~*Slack-ImgProxy 1; + ~*Discordbot 1; + ~*TelegramBot 1; + ~*WhatsApp 1; + ~*redditbot 1; + ~*ArenaUnfurlBot 1; + + # Feed readers + ~*Feedly 1; + ~*Miniflux 1; + ~*FreshRSS 1; + ~*NewsBlur 1; + ~*Tiny\ Tiny\ RSS 1; + ~*Inoreader 1; + ~*NetNewsWire 1; + + # Monitoring / uptime + ~*UptimeRobot 1; + ~*Pingdom 1; + ~*StatusCake 1; + ~*Blackbox-Exporter 1; + + # AI answer bots (user-facing, not training crawlers) + ~*OAI-SearchBot 1; + ~*ChatGPT-User 1; + ~*Claude-Web 1; + ~*Claude-User 1; + ~*MistralAI-User 1; + + # Archive / research + ~*archive\.org_bot 1; + + # Apple Safari prefetch + ~*safarifetcherd 1; + + # Link checkers / validators + ~*W3C_Validator 1; + ~*W3C-checklink 1; + ~*LinkChecker 1; + ~*link-check 1; + + # Decentralized search + ~*yacybot 1; + + # Add your own allowed bots below +} + +# Validate the challenge cookie — exact token match +map '"${NGINX_COOKIE_VAR}"' $js_cookie_valid { + default 0; + "'"${COOKIE_VALUE}"'" 1; +} + +# Detect requests to the challenge page and download paths (prevent redirect loops) +map $uri $is_challenge_uri { + default 0; + "'"${CHALLENGE_PATH}"'" 1; + ~^/downloads/ 1; + ~*\.(css|js|woff2?)$ 1; + ~*favicon 1; + ~*apple-touch-icon 1; +} + +# Detect image sub-resource requests with same-site referer (browser loads) +# These bypass the challenge because: (a) images cannot execute JS challenges, 
+# and (b) the same-site referer proves the browser loaded a page from this domain. +# Direct image requests from scrapers (no referer or external referer) still get challenged. +map $uri $is_image_request { + default 0; + ~*\.(png|jpe?g|gif|svg|webp|ico|avif)$ 1; +} +map "$is_image_request:$http_referer" $is_samesite_image { + default 0; +'"${REFERER_ENTRIES}"'} + +# Combined check: need challenge if not allowed bot, no valid cookie, and not the challenge page +map "$is_allowed_bot:$js_cookie_valid:$is_challenge_uri:$is_samesite_image" $needs_js_challenge { + default 1; + "1:0:0:0" 0; + "1:0:0:1" 0; + "1:0:1:0" 0; + "1:0:1:1" 0; + "1:1:0:0" 0; + "1:1:0:1" 0; + "1:1:1:0" 0; + "1:1:1:1" 0; + "0:1:0:0" 0; + "0:1:0:1" 0; + "0:1:1:0" 0; + "0:1:1:1" 0; + "0:0:1:0" 0; + "0:0:1:1" 0; + "0:0:0:1" 0; +} +'"${GEOIP2_BLOCK}"' +# ── Tarpit: headless Chrome bots from suspect regions ───────────────── +# Visitors from tarpit countries with no external referrer (passed through +# the challenge redirect as the _bc_ref cookie) are served at a crawl. +# This drains headless Chrome resources (~200-500 MB RAM per instance) +# without giving the bot a clear "blocked" signal to adapt to. +# +# The _bc_ref cookie is set by the challenge page JS from the &ref= param. +# It contains the original HTTP Referer before the 302 redirect destroyed it. +# "direct" = no external referrer (typed URL or bot). Cookie expires in 120s. 
+ +# Check if visitor is from a tarpit country (requires geoip2 module) +map $geoip2_country_code $is_tarpit_country { + default 0; +'"$(for cc in $TARPIT_COUNTRIES; do echo " \"${cc}\" 1;"; done)"' +} + +# Tarpit only if: tarpit country + no external referrer + passed JS challenge +map "$is_tarpit_country:$cookie__bc_ref" $tarpit_client { + default 0; + "1:direct" 1; + "1:" 1; +} + +# Serve the challenge page +server { + listen 127.0.0.1:18444; + server_name _; + root /var/www/js-challenge; + + location / { + add_header Cache-Control "no-store, no-cache, must-revalidate" always; + add_header Pragma "no-cache" always; + try_files /challenge.html =404; + } +}' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_MAP}" +else + if [[ -f "$CHALLENGE_MAP" ]]; then + cp "$CHALLENGE_MAP" "${CHALLENGE_MAP}.bak.${TIMESTAMP}" + warn "Existing config backed up" + fi + echo "$MAP_CONTENT" > "$CHALLENGE_MAP" + info "Map config created: ${CHALLENGE_MAP}" +fi + +# ===================================================== +# Step 3: Create custom Hestia templates +# ===================================================== +JS_CHALLENGE_DIRECTIVE=' + # JS cookie challenge — added by hestia-js-challenge.sh + location = '"${CHALLENGE_PATH}"' { + limit_req zone=jschallenge burst='"${CHALLENGE_BURST}"' nodelay; + limit_req_status 444; + proxy_pass http://127.0.0.1:18444/; + } + + # Redirect non-JS visitors to challenge page (pass original referrer) + if ($needs_js_challenge) { + return 302 '"${CHALLENGE_PATH}"'?r=$request_uri&ref=$http_referer; + } + + # Tarpit headless Chrome bots from suspect GeoIP regions + if ($tarpit_client) { + set $limit_rate '"${TARPIT_RATE}"'; + }' + +create_template() { + local src="$1" dst="$2" label="$3" + + if [[ ! 
-f "$src" ]]; then + warn "Source template not found: ${src} — skipping ${label}" + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${dst} (from ${src})" + return + fi + + if [[ -f "$dst" ]]; then + cp "$dst" "${dst}.bak.${TIMESTAMP}" + warn "Existing ${label} template backed up" + fi + + # Check if the source already has JS challenge (avoid double-injection) + if grep -q 'needs_js_challenge' "$src"; then + cp "$src" "$dst" + info "Created ${label} template: ${dst} (JS challenge already present in base)" + else + # Insert JS challenge directive before the first 'location' line + awk -v block="$JS_CHALLENGE_DIRECTIVE" ' + !inserted && /^[[:space:]]*location[[:space:]]/ { + print block + print "" + inserted = 1 + } + { print } + ' "$src" > "$dst" + info "Created ${label} template: ${dst}" + fi +} + +# Resolve the base template — verify it exists +BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" +if [[ ! -f "$BASE_TPL" ]]; then + if [[ "$BASE_TEMPLATE" != "default" ]]; then + warn "Base template '${BASE_TEMPLATE}' not found — falling back to 'default'" + BASE_TEMPLATE="default" + fi + BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" + if [[ ! 
-f "$BASE_TPL" ]]; then + echo -e "${RED}Error: Default template not found: ${BASE_TPL}${NC}" >&2 + exit 1 + fi +fi + +step "Creating custom ${PANEL_NAME} nginx templates (${TEMPLATE_NAME}) from ${BASE_TEMPLATE}" + +# Proxy templates +create_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.tpl" \ + "HTTP (.tpl)" + +create_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.stpl" \ + "SSL (.stpl)" + +# php-fpm templates (if they exist for the base) +if [[ -d "${PANEL_TPL_DIR}/php-fpm" ]]; then + if [[ -f "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" ]]; then + create_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.tpl" \ + "php-fpm HTTP (.tpl)" + + create_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.stpl" \ + "php-fpm SSL (.stpl)" + fi +fi + +# Copy .sh hooks from the base template if they exist +for ext in tpl stpl; do + base_sh="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.${ext}.sh" + dst_sh="${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}.sh" + if [[ -f "$base_sh" && ! -f "$dst_sh" ]]; then + cp "$base_sh" "$dst_sh" + info "Copied hook: ${dst_sh}" + fi +done + +# ===================================================== +# Step 4: Validate nginx config +# ===================================================== +step "Testing nginx configuration" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" +else + if nginx -t 2>&1; then + info "nginx config valid" + else + echo -e "${RED}[ERROR] nginx config test failed${NC}" >&2 + echo " Restore backups from ${PANEL_TPL_DIR} and ${CONF_DIR}" >&2 + exit 1 + fi +fi + +# ===================================================== +# Step 5: Apply template (optional) +# ===================================================== +if [[ -n "$APPLY_USER" ]]; then + if ! 
command -v v-change-web-domain-proxy-tpl &>/dev/null; then + echo -e "${RED}Error: v-change-web-domain-proxy-tpl not found${NC}" >&2 + exit 1 + fi + + if [[ "$APPLY_ALL" == "true" ]]; then + step "Applying template to all domains for user: ${APPLY_USER}" + domains=$(v-list-web-domains "$APPLY_USER" plain 2>/dev/null | awk '{print $1}') + if [[ -z "$domains" ]]; then + warn "No domains found for user: ${APPLY_USER}" + else + while IFS= read -r domain; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}" + else + v-change-web-domain-proxy-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" + info "Applied to: ${domain}" + fi + done <<< "$domains" + fi + else + step "Applying template to: ${APPLY_DOMAIN}" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-proxy-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}" + else + v-change-web-domain-proxy-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME" + info "Applied to: ${APPLY_DOMAIN}" + fi + fi +fi + +# ===================================================== +# Step 6: Reload nginx +# ===================================================== +step "Reloading nginx" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" +else + systemctl reload nginx + info "nginx reloaded" +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Challenge map: ${CHALLENGE_MAP}" +echo " Challenge page: ${CHALLENGE_HTML}" +echo " State file: ${STATE_FILE}" +echo " Base template: ${BASE_TEMPLATE}" +echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" +echo " Cookie name: ${COOKIE_NAME}" +echo " Cookie token: ${COOKIE_VALUE:0:8}... 
(32 hex chars)" +echo " Cookie TTL: ${COOKIE_MAX_AGE}s" +echo " Challenge rate: ${CHALLENGE_RATE}r/m (burst: ${CHALLENGE_BURST})" +if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi +else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " sudo $(basename "$0") --apply-all " +fi +echo "" +echo " To rotate credentials (invalidate bot-cached cookies):" +echo " sudo $(basename "$0") --base-template ${BASE_TEMPLATE}" +echo "" +echo " Test (bot without cookie gets redirected to challenge):" +echo " curl -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 302" +echo "" +echo " Test (browser completes challenge — 302 → 200):" +echo " Open https://yourdomain.com in a browser" +echo " Expected: brief redirect then page loads normally" +echo "" +echo " Test (old static bypass no longer works):" +echo " curl -b '_bc=verified' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 302 (not 200 — old cookie is invalid)" +echo "" +echo " Test (rate limit on challenge endpoint):" +echo " for i in 1 2 3 4 5; do curl -o /dev/null -s -w \"\$i: %{http_code}\n\" https://yourdomain.com${CHALLENGE_PATH}; done" +echo " Expected: first 3 return 200, then 444 (rate limited)" +echo "" +echo " Test (allowed bot bypasses challenge):" +echo " curl -A 'Googlebot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" +echo " Expected: 200" diff --git a/hestia-security.sh b/hestia-security.sh new file mode 100755 index 0000000..b9f6aac --- /dev/null +++ b/hestia-security.sh @@ -0,0 +1,2513 @@ +#!/bin/bash +################################################################################ +# Script Name: hestia-security.sh +# Version: 1.1 +# Description: Unified HestiaCP/VestaCP/myVesta security toolkit. 
#               Consolidates bot-blocking, JS cookie challenge, GeoIP enriched
#               logging, and CrowdSec bouncer setup into a single
#               subcommand-based script with shared helpers.
#
# Author:       Phil Connor
# Contact:      contact@mylinux.work
# Website:      https://mylinux.work
# License:      MIT
#
# Subcommands:
#   bot-block     — AI scraper and SEO bot blocking via nginx map
#   js-challenge  — JavaScript cookie challenge for headless bot detection
#   geoip         — GeoIP2 enriched logging with country/ASN data
#   crowdsec      — CrowdSec engine + nginx lua bouncer + HestiaCP log acquisition
#   status        — Show status of all security features
#
# Usage:
#   sudo ./hestia-security.sh bot-block [OPTIONS]
#   sudo ./hestia-security.sh js-challenge [OPTIONS]
#   sudo ./hestia-security.sh geoip [OPTIONS]
#   sudo ./hestia-security.sh crowdsec [OPTIONS]
#   sudo ./hestia-security.sh status
#
################################################################################

set -euo pipefail

# =============================================================================
# SHARED HELPERS
# =============================================================================

# --- Colors (escape sequences are expanded by `echo -e` in the loggers) ---
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# --- Logging (prefixed with # for Prometheus comment compatibility) ---
# info/warn/step write to stdout; err writes to stderr.
info() { echo -e "# ${GREEN}[OK]${NC} $*"; }
warn() { echo -e "# ${YELLOW}[WARN]${NC} $*"; }
step() { echo -e "# ${CYAN}[STEP]${NC} $*"; }
err()  { echo -e "# ${RED}[ERROR]${NC} $*" >&2; }

# --- Global variables ---
CONF_DIR="/etc/nginx/conf.d"
DRY_RUN=false
TIMESTAMP=$(date +%s)   # shared suffix for every *.bak.<TIMESTAMP> created in this run

#######################################
# Abort unless running as root. A dry run is allowed for any user.
# Globals:  EUID, DRY_RUN (read)
# Returns:  0 when allowed; otherwise exits the script with status 1
#######################################
require_root() {
    if [[ $EUID -ne 0 && "$DRY_RUN" != "true" ]]; then
        err "Must run as root (or use --dry-run)"
        exit 1
    fi
}

#######################################
# Locate the control panel's nginx template directory.
# Globals:  PANEL_TPL_DIR, PANEL_NAME (written)
# Outputs:  one info line naming the detected panel
# Returns:  0 on success; exits 1 when neither panel directory exists
#######################################
detect_panel() {
    if [[ -d "/usr/local/hestia/data/templates/web/nginx" ]]; then
        PANEL_TPL_DIR="/usr/local/hestia/data/templates/web/nginx"
        PANEL_NAME="HestiaCP"
    elif [[ -d "/usr/local/vesta/data/templates/web/nginx" ]]; then
        PANEL_TPL_DIR="/usr/local/vesta/data/templates/web/nginx"
        PANEL_NAME="VestaCP/myVesta"
    else
        err "Neither HestiaCP nor VestaCP/myVesta found"
        exit 1
    fi
    info "Detected ${PANEL_NAME} (${PANEL_TPL_DIR})"
}

#######################################
# Abort when the nginx binary is not on PATH.
#######################################
require_nginx() {
    if ! command -v nginx &>/dev/null; then
        err "nginx not found"
        exit 1
    fi
}

#######################################
# Copy a file to <file>.bak.<TIMESTAMP>. Missing files are silently ignored.
# Globals:    TIMESTAMP (read)
# Arguments:  $1 - path of the file to back up
#######################################
backup_file() {
    local file="$1"
    if [[ -f "$file" ]]; then
        cp -- "$file" "${file}.bak.${TIMESTAMP}"
    fi
}

#######################################
# Run `nginx -t`; on failure optionally restore a backup over its original.
# Globals:    DRY_RUN (read)
# Arguments:  $1 - (optional) path of a "<original>.bak.<suffix>" backup that
#                  is copied back to <original> when the config test fails
# Returns:    0 when the config is valid (or in dry-run mode), 1 otherwise
#######################################
nginx_validate() {
    local backup_path="${1:-}"
    local original

    step "Testing nginx configuration"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo "  Would run: nginx -t"
        return 0
    fi

    if nginx -t 2>&1; then
        info "nginx config valid"
        return 0
    fi

    err "nginx config test failed"
    if [[ -n "$backup_path" && -f "$backup_path" ]]; then
        original="${backup_path%.bak.*}"
        # Without a ".bak." suffix the expansion is a no-op; copying a file
        # onto itself would only produce a confusing cp error.
        if [[ "$original" != "$backup_path" ]]; then
            cp -- "$backup_path" "$original"
            warn "Restored: ${backup_path}"
        fi
    fi
    return 1
}

#######################################
# Reload nginx through systemd (only reported in dry-run mode).
# Globals:  DRY_RUN (read)
#######################################
nginx_reload() {
    step "Reloading nginx"
    if [[ "$DRY_RUN" == "true" ]]; then
        echo "  Would run: systemctl reload nginx"
    else
        systemctl reload nginx
        info "nginx reloaded"
    fi
}

#######################################
# Switch one domain, or every domain of a user, to a proxy template.
# Globals:    DRY_RUN (read)
# Arguments:  $1 - panel user; empty means "nothing to apply"
#             $2 - domain (used when $3 is not "true")
#             $3 - "true" to apply to all domains of the user
#             $4 - template name
# Returns:    0 on success; exits 1 when the panel CLI is missing
#######################################
apply_template() {
    local apply_user="$1"
    local apply_domain="$2"
    local apply_all="$3"
    local template_name="$4"
    local domains domain

    if [[ -z "$apply_user" ]]; then
        return 0
    fi

    if ! command -v v-change-web-domain-proxy-tpl &>/dev/null; then
        err "v-change-web-domain-proxy-tpl not found"
        exit 1
    fi

    if [[ "$apply_all" != "true" ]]; then
        step "Applying template to: ${apply_domain}"
        if [[ "$DRY_RUN" == "true" ]]; then
            echo "  Would apply: v-change-web-domain-proxy-tpl ${apply_user} ${apply_domain} ${template_name}"
        else
            v-change-web-domain-proxy-tpl "$apply_user" "$apply_domain" "$template_name"
            info "Applied to: ${apply_domain}"
        fi
        return 0
    fi

    step "Applying template to all domains for user: ${apply_user}"
    # With `set -e -o pipefail` a failing v-list-web-domains would kill the
    # script right here, silently (its stderr is discarded). Treat a failed
    # listing as "no domains" so the warning below is actually reached.
    domains=$(v-list-web-domains "$apply_user" plain 2>/dev/null | awk '{print $1}') || domains=""
    if [[ -z "$domains" ]]; then
        warn "No domains found for user: ${apply_user}"
        return 0
    fi

    while IFS= read -r domain; do
        if [[ "$DRY_RUN" == "true" ]]; then
            echo "  Would apply: v-change-web-domain-proxy-tpl ${apply_user} ${domain} ${template_name}"
        else
            v-change-web-domain-proxy-tpl "$apply_user" "$domain" "$template_name"
            info "Applied to: ${domain}"
        fi
    done <<< "$domains"
}

#######################################
# Create a template from a base template, injecting a directive block
# immediately before the first `location` line.
# Globals:    DRY_RUN, TIMESTAMP (read)
# Arguments:  $1 - source (base) template path
#             $2 - destination template path
#             $3 - human-readable label used in messages
#             $4 - directive text to inject (written verbatim)
#             $5 - marker string; when the source already contains it the
#                  source is copied unchanged (avoids double injection)
# Returns:    0 on success or when skipped; 1 when source and destination are
#             the same file; awk's status when generation fails
#######################################
create_template() {
    local src="$1" dst="$2" label="$3" directive="$4" marker_check="$5"
    local rc=0

    if [[ ! -f "$src" ]]; then
        warn "Source template not found: ${src} — skipping ${label}"
        return 0
    fi

    # `awk ... "$src" > "$dst"` truncates the output before awk reads the
    # input, so src == dst would wipe the base template.
    if [[ "$src" -ef "$dst" ]]; then
        err "Refusing to overwrite source template with itself: ${src}"
        return 1
    fi

    if [[ "$DRY_RUN" == "true" ]]; then
        echo "  Would create: ${dst} (from ${src})"
        return 0
    fi

    if [[ -f "$dst" ]]; then
        cp -- "$dst" "${dst}.bak.${TIMESTAMP}"
        warn "Existing ${label} template backed up"
    fi

    # Marker is a plain string, not a regex.
    if grep -qF -- "$marker_check" "$src"; then
        cp -- "$src" "$dst"
        info "Created ${label} template: ${dst} (${marker_check} already present in base)"
        return 0
    fi

    # The directive is handed to awk through the environment, not `-v`:
    # `-v` assignments undergo escape processing, which mangles backslashes in
    # nginx regexes (gawk turns "\." into "." and prints a warning).
    # Exit status 3 signals "no location line found, nothing injected".
    DIRECTIVE_BLOCK="$directive" awk '
        !inserted && /^[[:space:]]*location[[:space:]]/ {
            print ENVIRON["DIRECTIVE_BLOCK"]
            print ""
            inserted = 1
        }
        { print }
        END { exit (inserted ? 0 : 3) }
    ' "$src" > "$dst" || rc=$?

    case "$rc" in
        0)
            info "Created ${label} template: ${dst}"
            ;;
        3)
            warn "No 'location' line in ${src} — ${label} template ${dst} created WITHOUT the directive"
            ;;
        *)
            err "Failed to generate ${label} template: ${dst}"
            return "$rc"
            ;;
    esac
}

#######################################
# Create the full set of panel templates (.tpl/.stpl, php-fpm variants when
# present) from a base template and copy the base template's .sh hooks.
# Globals:    PANEL_TPL_DIR, PANEL_NAME, DRY_RUN (read)
# Arguments:  $1 - base template name (falls back to "default" when missing)
#             $2 - name of the template to create
#             $3 - directive text to inject
#             $4 - marker string passed through to create_template
# Returns:    0 on success; exits 1 when no usable base template exists
#######################################
create_panel_templates() {
    local base_template="$1"
    local template_name="$2"
    local directive="$3"
    local marker_check="$4"
    local ext base_sh dst_sh

    # Resolve the base template — verify it exists
    local base_tpl="${PANEL_TPL_DIR}/${base_template}.tpl"
    if [[ ! -f "$base_tpl" ]]; then
        if [[ "$base_template" != "default" ]]; then
            warn "Base template '${base_template}' not found — falling back to 'default'"
            base_template="default"
        fi
        base_tpl="${PANEL_TPL_DIR}/${base_template}.tpl"
        if [[ ! -f "$base_tpl" ]]; then
            err "Default template not found: ${base_tpl}"
            exit 1
        fi
    fi

    step "Creating custom ${PANEL_NAME} nginx templates (${template_name}) from ${base_template}"

    # Proxy templates
    create_template \
        "${PANEL_TPL_DIR}/${base_template}.tpl" \
        "${PANEL_TPL_DIR}/${template_name}.tpl" \
        "HTTP (.tpl)" "$directive" "$marker_check"

    create_template \
        "${PANEL_TPL_DIR}/${base_template}.stpl" \
        "${PANEL_TPL_DIR}/${template_name}.stpl" \
        "SSL (.stpl)" "$directive" "$marker_check"

    # php-fpm templates (if they exist for the base)
    if [[ -d "${PANEL_TPL_DIR}/php-fpm" && -f "${PANEL_TPL_DIR}/php-fpm/${base_template}.tpl" ]]; then
        create_template \
            "${PANEL_TPL_DIR}/php-fpm/${base_template}.tpl" \
            "${PANEL_TPL_DIR}/php-fpm/${template_name}.tpl" \
            "php-fpm HTTP (.tpl)" "$directive" "$marker_check"

        create_template \
            "${PANEL_TPL_DIR}/php-fpm/${base_template}.stpl" \
            "${PANEL_TPL_DIR}/php-fpm/${template_name}.stpl" \
            "php-fpm SSL (.stpl)" "$directive" "$marker_check"
    fi

    # Copy .sh hooks from the base template if they exist. Existing hooks of
    # the target template are never overwritten; a dry run must not write.
    for ext in tpl stpl; do
        base_sh="${PANEL_TPL_DIR}/${base_template}.${ext}.sh"
        dst_sh="${PANEL_TPL_DIR}/${template_name}.${ext}.sh"
        if [[ -f "$base_sh" && ! -f "$dst_sh" ]]; then
            if [[ "$DRY_RUN" == "true" ]]; then
                echo "  Would copy hook: ${dst_sh}"
            else
                cp -- "$base_sh" "$dst_sh"
                info "Copied hook: ${dst_sh}"
            fi
        fi
    done
}
-f "$file" ]]; then + return + fi + + local builtin_patterns + builtin_patterns=$(get_builtin_patterns) + + while IFS= read -r line; do + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ "$line" =~ ^[[:space:]]*$ ]] && continue + [[ "$line" =~ ^map ]] && continue + [[ "$line" =~ ^\} ]] && continue + [[ "$line" =~ default ]] && continue + + local pattern + pattern=$(echo "$line" | grep -oP '~\*\S+|^\s*""' | tr '[:upper:]' '[:lower:]' | head -1) + [[ -z "$pattern" ]] && continue + + if echo "$builtin_patterns" | grep -qxF "$pattern"; then + continue + fi + + if echo "$REMOVED_BOTS" | grep -qxF "$pattern"; then + continue + fi + + echo "$line" + done < "$file" + } + + # ===================================================== + # Step 1: Create or update nginx map + # ===================================================== + step "Configuring bot-block map at ${MAP_FILE}" + + local CUSTOM_ENTRIES="" + local ADDED_NEW=0 + if [[ -f "$MAP_FILE" ]]; then + CUSTOM_ENTRIES=$(get_custom_entries "$MAP_FILE") + + local existing + existing=$(get_existing_patterns "$MAP_FILE") + while IFS= read -r pattern; do + [[ -z "$pattern" ]] && continue + if ! 
echo "$existing" | grep -qxF "$pattern"; then + ADDED_NEW=$((ADDED_NEW + 1)) + fi + done <<< "$(get_builtin_patterns)" + + if [[ -n "$CUSTOM_ENTRIES" ]]; then + local custom_count + custom_count=$(echo "$CUSTOM_ENTRIES" | wc -l) + info "Found ${custom_count} custom bot entries — will preserve them" + fi + fi + + # Build the full map content + local MAP_CONTENT="# Bot-blocking map for AI scrapers, SEO bots, and vulnerability scanners +# Generated by hestia-security.sh bot-block — https://mylinux.work +# Last updated: $(date '+%Y-%m-%d %H:%M:%S') + +map \$http_user_agent \$is_bad_bot { + default 0; + +${BOT_LIST}" + + if [[ -n "$CUSTOM_ENTRIES" ]]; then + MAP_CONTENT="${MAP_CONTENT} + + # Custom entries (preserved from previous configuration) +${CUSTOM_ENTRIES}" + fi + + MAP_CONTENT="${MAP_CONTENT} +}" + + if [[ "$DRY_RUN" == "true" ]]; then + if [[ -f "$MAP_FILE" ]]; then + echo " Would update: ${MAP_FILE}" + [[ -n "$CUSTOM_ENTRIES" ]] && echo " Would preserve: $(echo "$CUSTOM_ENTRIES" | wc -l) custom entries" + [[ "$ADDED_NEW" -gt 0 ]] && echo " Would add: ${ADDED_NEW} new bot patterns" + else + echo " Would create: ${MAP_FILE}" + fi + else + if [[ -f "$MAP_FILE" ]]; then + cp "$MAP_FILE" "${MAP_FILE}.bak.$(date +%s)" + if [[ "$ADDED_NEW" -gt 0 ]]; then + info "Map updated: ${MAP_FILE} (${ADDED_NEW} new patterns added)" + else + info "Map updated: ${MAP_FILE} (already current)" + fi + else + info "Map created: ${MAP_FILE}" + fi + echo "$MAP_CONTENT" > "$MAP_FILE" + fi + + # ===================================================== + # If --update-map-only, skip templates and just reload + # ===================================================== + if [[ "$UPDATE_MAP_ONLY" == "true" ]]; then + step "Testing nginx configuration" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: nginx -t" + else + if nginx -t 2>&1; then + info "nginx config valid" + else + err "nginx config test failed — restoring backup" + local latest_bak + latest_bak=$(ls -t "${MAP_FILE}.bak."* 
2>/dev/null | head -1) + if [[ -n "$latest_bak" ]]; then + cp "$latest_bak" "$MAP_FILE" + warn "Restored: ${latest_bak}" + fi + exit 1 + fi + fi + + nginx_reload + + echo "" + echo -e "${BOLD}Done.${NC} Map updated — templates unchanged." + echo " Map: ${MAP_FILE}" + echo "" + echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" + echo " Expected: 444 (connection dropped) or 000 (no response)" + return 0 + fi + + # ===================================================== + # Step 2: Create custom Hestia templates + # ===================================================== + local BOT_BLOCK_DIRECTIVE=' + # Bot blocking — added by hestia-security.sh bot-block + if ($is_bad_bot) { + return 444; + } + # Block broken srcset scrapers + if ($request_uri ~* "%20[0-9]+w,https?://") { + return 444; + } + # Block spoofed referers with fragment identifiers (real browsers strip these) + if ($http_referer ~* "#") { + return 444; + } + # Block non-GET/HEAD methods (static sites never need POST/PUT/DELETE) + if ($request_method !~ ^(GET|HEAD)$ ) { + return 444; + } + # Block empty-referer requests for images (headless bot image scraping) + set $block_image_scrape 0; + if ($uri ~* "\.(png|jpg|webp)$") { + set $block_image_scrape 1; + } + if ($http_referer = "") { + set $block_image_scrape "${block_image_scrape}1"; + } + if ($block_image_scrape = "11") { + return 444; + }' + + create_panel_templates "$BASE_TEMPLATE" "$TEMPLATE_NAME" "$BOT_BLOCK_DIRECTIVE" "is_bad_bot" + + # ===================================================== + # Step 3: Validate nginx config + # ===================================================== + if ! 
nginx_validate; then + err "Aborting — fix the config errors above" + return 1 + fi + + # ===================================================== + # Step 4: Apply template (optional) + # ===================================================== + apply_template "$APPLY_USER" "$APPLY_DOMAIN" "$APPLY_ALL" "$TEMPLATE_NAME" + + # ===================================================== + # Step 5: Reload nginx + # ===================================================== + nginx_reload + + # ===================================================== + # Summary + # ===================================================== + echo "" + echo -e "${BOLD}Done.${NC}" + echo "" + echo " Map: ${MAP_FILE}" + echo " Base: ${BASE_TEMPLATE}" + echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" + if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi + else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " $(basename "$0") bot-block --apply-all " + fi + echo "" + echo " To add new bots later without touching templates:" + echo " sudo $(basename "$0") bot-block --update-map-only" + echo "" + echo " Verify: curl -A 'GPTBot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" + echo " Expected: 444 (connection dropped) or 000 (no response)" +} + +# ============================================================================= +# SUBCOMMAND: js-challenge +# ============================================================================= + +cmd_js_challenge() { + local TEMPLATE_NAME="default-jschallenge" + local BASE_TEMPLATE="default" + local CHALLENGE_MAP="${CONF_DIR}/js-challenge.conf" + local CHALLENGE_DIR="/var/www/js-challenge" + local CHALLENGE_HTML="${CHALLENGE_DIR}/challenge.html" + local STATE_FILE="/etc/nginx/js-challenge.env" + local CHALLENGE_PATH="/_bc" + local 
APPLY_USER="" + local APPLY_DOMAIN="" + local APPLY_ALL=false + local REMOVE=false + local UPDATE_HTML_ONLY=false + local COOKIE_MAX_AGE=86400 + local TARPIT_COUNTRIES="${TARPIT_COUNTRIES:-CN}" + local TARPIT_RATE="${TARPIT_RATE:-50}" + local CHALLENGE_RATE="${CHALLENGE_RATE:-1}" + local CHALLENGE_BURST="${CHALLENGE_BURST:-3}" + local COOKIE_NAME="" + local COOKIE_VALUE="" + local COOKIE_VAR="" + + # --- Usage --- + usage_js_challenge() { + cat < "$STATE_FILE" <&2 + exit 1 + fi + elif [[ "$REMOVE" != "true" ]]; then + _jsc_generate_credentials + info "Generated new credentials — cookie: ${COOKIE_NAME} token: ${COOKIE_VALUE:0:8}..." + fi + + # ===================================================== + # Remove mode + # ===================================================== + if [[ "$REMOVE" == "true" ]]; then + step "Removing JS challenge configuration" + + # Remove templates FIRST — they reference variables defined in the + # map file, so deleting the map while templates still use those + # variables would break nginx -t + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would remove JS challenge templates from ${PANEL_TPL_DIR:-/usr/local/hestia/data/templates/web/nginx}" + else + detect_panel + for ext in tpl stpl; do + local tpl_file="${PANEL_TPL_DIR}/${TEMPLATE_NAME}.${ext}" + [[ -f "$tpl_file" ]] && rm -f "$tpl_file" && info "Removed: ${tpl_file}" + tpl_file="${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.${ext}" + [[ -f "$tpl_file" ]] && rm -f "$tpl_file" && info "Removed: ${tpl_file}" + done + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would remove: ${CHALLENGE_MAP}" + echo " Would remove: ${CHALLENGE_DIR}" + echo " Would remove: ${STATE_FILE}" + echo " Would run: nginx -t && systemctl reload nginx" + else + [[ -f "$CHALLENGE_MAP" ]] && rm -f "$CHALLENGE_MAP" && info "Removed: ${CHALLENGE_MAP}" + [[ -d "$CHALLENGE_DIR" ]] && rm -rf "$CHALLENGE_DIR" && info "Removed: ${CHALLENGE_DIR}" + [[ -f "$STATE_FILE" ]] && rm -f "$STATE_FILE" && info "Removed: ${STATE_FILE}" 
+ fi + + if nginx_validate; then + nginx_reload + else + warn "nginx config has errors — check other configs (bot-block, geoip, etc.)" + fi + + echo "" + echo -e "${BOLD}JS challenge removed.${NC}" + echo "" + echo " Note: You should also switch away from the JS challenge templates" + echo " on your domains:" + echo " v-change-web-domain-proxy-tpl " + return 0 + fi + + # --- Panel detection --- + detect_panel + require_nginx + + # ===================================================== + # Step 1: Create the challenge HTML page + # ===================================================== + step "Creating challenge page at ${CHALLENGE_HTML}" + + local CHALLENGE_CONTENT=' + +Verifying + + + + + +' + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_DIR}/" + echo " Would create: ${CHALLENGE_HTML}" + else + mkdir -p "$CHALLENGE_DIR" + echo "$CHALLENGE_CONTENT" > "$CHALLENGE_HTML" + info "Challenge page created: ${CHALLENGE_HTML}" + fi + + # ===================================================== + # If --update-html-only, stop here + # ===================================================== + if [[ "$UPDATE_HTML_ONLY" == "true" ]]; then + echo "" + echo -e "${BOLD}Done.${NC} Challenge HTML updated — templates unchanged." + echo " HTML: ${CHALLENGE_HTML}" + echo " Cookie: ${COOKIE_NAME}=${COOKIE_VALUE:0:8}..." + echo "" + echo " No nginx reload needed — the HTML is served as a static file." 
+ return 0 + fi + + # Save credentials for future --update-html-only runs + _jsc_save_credentials + + # ===================================================== + # Step 2: Create nginx map config + # ===================================================== + step "Creating JS challenge map at ${CHALLENGE_MAP}" + + local NGINX_COOKIE_VAR="\$cookie_${COOKIE_NAME}" + + # Check if geoip2 module is available and the database exists + local GEOIP2_BLOCK="" + local GEOIP2_AVAILABLE=false + local TARPIT_MAP_BLOCK="" + + # Check if the module is loadable + if nginx -V 2>&1 | grep -q 'http_geoip2' || \ + ls /etc/nginx/modules-enabled/*geoip2* 2>/dev/null | grep -q . || \ + ls /usr/lib/nginx/modules/*geoip2* 2>/dev/null | grep -q . ; then + if [[ -f /usr/share/GeoIP/GeoLite2-City.mmdb ]] || \ + [[ -f /usr/share/GeoIP/GeoLite2-Country.mmdb ]]; then + GEOIP2_AVAILABLE=true + else + warn "geoip2 module found but no GeoIP database in /usr/share/GeoIP/ — tarpit disabled" + fi + else + warn "geoip2 nginx module not installed — tarpit feature disabled" + warn "Install with: apt install libnginx-mod-http-geoip2 (Ubuntu/Debian)" + fi + + if [[ "$GEOIP2_AVAILABLE" == "true" ]]; then + # Check if geoip2 country_code is already defined elsewhere + if ! grep -r 'geoip2_country_code' /etc/nginx/ \ + --include='*.conf' --exclude='js-challenge.conf' --exclude='*.bak.*' \ + -q 2>/dev/null; then + local mmdb="/usr/share/GeoIP/GeoLite2-City.mmdb" + [[ ! -f "$mmdb" ]] && mmdb="/usr/share/GeoIP/GeoLite2-Country.mmdb" + GEOIP2_BLOCK=' +# ── GeoIP2: country lookup for tarpit decisions ────────────────────── +# Uses the City database (superset of Country). Adjust path if needed. 
+geoip2 '"${mmdb}"' { + $geoip2_country_code country iso_code; +} +' + step "No existing geoip2 country_code config found — adding to map config" + fi + fi + + # Collect server_name values from nginx configs to build same-site referer map + local REFERER_ENTRIES="" + local _jsc_domain_seen=() + for _conf in /etc/nginx/conf.d/*.conf /etc/nginx/sites-enabled/*; do + [[ -f "$_conf" ]] || continue + while read -r _sn; do + for _d in $_sn; do + [[ "$_d" == "server_name" || "$_d" == ";" || "$_d" == "_" || "$_d" =~ ^[0-9] ]] && continue + _d="${_d%;}" + [[ " ${_jsc_domain_seen[*]:-} " == *" $_d "* ]] && continue + _jsc_domain_seen+=("$_d") + local _d_escaped="${_d//./\\.}" + REFERER_ENTRIES+=" ~^1:https?://${_d_escaped} 1;\n" + done + done < <(grep -oP '^\s*server_name\s+\K[^;]+;?' "$_conf" 2>/dev/null) + done + + if [[ -z "$REFERER_ENTRIES" ]]; then + warn "No server_name values found — same-site image bypass will not work" + warn "Images behind the challenge may cause redirect loops for browsers" + fi + + local MAP_CONTENT='# JS cookie challenge — allowed bots and cookie check +# Generated by hestia-security.sh js-challenge — https://mylinux.work +# Cookie: '"${COOKIE_NAME}"' Token: '"${COOKIE_VALUE:0:8}"'... +# Generated: '"$(date -Iseconds)"' + +# ── Rate limit: challenge endpoint ─────────────────────────────────── +# Real users hit the challenge once and keep the cookie. Headless bot farms +# spawn fresh browsers per request, hitting the challenge every time. +# Rate: '"${CHALLENGE_RATE}"'r/m with burst of '"${CHALLENGE_BURST}"' — excess gets 444. 
+limit_req_zone $binary_remote_addr zone=jschallenge:10m rate='"${CHALLENGE_RATE}"'r/m; + +# Bots that legitimately identify themselves and should bypass the JS check +map $http_user_agent $is_allowed_bot { + default 0; + + # Search engines + ~*Googlebot 1; + ~*bingbot 1; + ~*Slurp 1; + ~*DuckDuckBot 1; + ~*DuckAssistBot 1; + ~*Baiduspider 1; + ~*YandexBot 1; + ~*YandexFavicons 1; + ~*Applebot 1; + ~*Qwantbot 1; + ~*Qwantify 1; + ~*Bravebot 1; + ~*kagi-fetcher 1; + ~*Kagibot 1; + ~*Yahoo! 1; + ~*Yeti 1; + + # Social media / link previews + ~*facebookexternalhit 1; + ~*Facebot 1; + ~*Twitterbot 1; + ~*LinkedInBot 1; + ~*Slackbot 1; + ~*Slack-ImgProxy 1; + ~*Discordbot 1; + ~*TelegramBot 1; + ~*WhatsApp 1; + ~*redditbot 1; + ~*ArenaUnfurlBot 1; + + # Feed readers + ~*Feedly 1; + ~*Miniflux 1; + ~*FreshRSS 1; + ~*NewsBlur 1; + ~*Tiny\ Tiny\ RSS 1; + ~*Inoreader 1; + ~*NetNewsWire 1; + + # Monitoring / uptime + ~*UptimeRobot 1; + ~*Pingdom 1; + ~*StatusCake 1; + ~*Blackbox-Exporter 1; + + # AI answer bots (user-facing, not training crawlers) + ~*OAI-SearchBot 1; + ~*ChatGPT-User 1; + ~*Claude-Web 1; + ~*Claude-User 1; + ~*MistralAI-User 1; + + # Archive / research + ~*archive\.org_bot 1; + + # Apple Safari prefetch + ~*safarifetcherd 1; + + # Link checkers / validators + ~*W3C_Validator 1; + ~*W3C-checklink 1; + ~*LinkChecker 1; + ~*link-check 1; + + # Decentralized search + ~*yacybot 1; + + # Add your own allowed bots below +} + +# Validate the challenge cookie — exact token match +map '"${NGINX_COOKIE_VAR}"' $js_cookie_valid { + default 0; + "'"${COOKIE_VALUE}"'" 1; +} + +# Detect requests to the challenge page, downloads, and static assets (prevent redirect loops) +map $uri $is_challenge_uri { + default 0; + "'"${CHALLENGE_PATH}"'" 1; + ~^/downloads/ 1; + ~*\.(css|js|woff2?)$ 1; + ~*favicon 1; + ~*apple-touch-icon 1; +} + +# Detect image sub-resource requests with same-site referer (browser loads) +# These bypass the challenge because: (a) images cannot execute JS 
challenges, +# and (b) the same-site referer proves the browser loaded a page from this domain. +# Direct image requests from scrapers (no referer or external referer) still get challenged. +map $uri $is_image_request { + default 0; + ~*\.(png|jpe?g|gif|svg|webp|ico|avif)$ 1; +} +map "$is_image_request:$http_referer" $is_samesite_image { + default 0; +'"${REFERER_ENTRIES}"'} + +# Combined check: need challenge if not allowed bot, no valid cookie, and not the challenge page +map "$is_allowed_bot:$js_cookie_valid:$is_challenge_uri:$is_samesite_image" $needs_js_challenge { + default 1; + "1:0:0:0" 0; + "1:0:0:1" 0; + "1:0:1:0" 0; + "1:0:1:1" 0; + "1:1:0:0" 0; + "1:1:0:1" 0; + "1:1:1:0" 0; + "1:1:1:1" 0; + "0:1:0:0" 0; + "0:1:0:1" 0; + "0:1:1:0" 0; + "0:1:1:1" 0; + "0:0:1:0" 0; + "0:0:1:1" 0; + "0:0:0:1" 0; +} +' + + if [[ "$GEOIP2_AVAILABLE" == "true" ]]; then + TARPIT_MAP_BLOCK="${GEOIP2_BLOCK} +# ── Tarpit: headless Chrome bots from suspect regions ───────────────── +# Visitors from tarpit countries with no external referrer (passed through +# the challenge redirect as the _bc_ref cookie) are served at a crawl. +# This drains headless Chrome resources (~200-500 MB RAM per instance) +# without giving the bot a clear \"blocked\" signal to adapt to. +# +# The _bc_ref cookie is set by the challenge page JS from the &ref= param. +# It contains the original HTTP Referer before the 302 redirect destroyed it. +# \"direct\" = no external referrer (typed URL or bot). Cookie expires in 120s. 
+ +# Check if visitor is from a tarpit country (requires geoip2 module) +map \$geoip2_country_code \$is_tarpit_country { + default 0; +$(for cc in $TARPIT_COUNTRIES; do echo " \"${cc}\" 1;"; done) +} + +# Tarpit only if: tarpit country + no external referrer + passed JS challenge +map \"\$is_tarpit_country:\$cookie__bc_ref\" \$tarpit_client { + default 0; + \"1:direct\" 1; + \"1:\" 1; +}" + fi + + MAP_CONTENT+="${TARPIT_MAP_BLOCK}" + MAP_CONTENT+=' + +# Serve the challenge page +server { + listen 127.0.0.1:18444; + server_name _; + root /var/www/js-challenge; + + location / { + add_header Cache-Control "no-store, no-cache, must-revalidate" always; + add_header Pragma "no-cache" always; + try_files /challenge.html =404; + } +}' + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CHALLENGE_MAP}" + else + if [[ -f "$CHALLENGE_MAP" ]]; then + cp "$CHALLENGE_MAP" "${CHALLENGE_MAP}.bak.${TIMESTAMP}" + warn "Existing config backed up" + fi + echo "$MAP_CONTENT" > "$CHALLENGE_MAP" + info "Map config created: ${CHALLENGE_MAP}" + fi + + # ===================================================== + # Step 3: Create custom Hestia templates + # ===================================================== + local JS_CHALLENGE_DIRECTIVE=' + # JS cookie challenge — added by hestia-security.sh js-challenge + location = '"${CHALLENGE_PATH}"' { + limit_req zone=jschallenge burst='"${CHALLENGE_BURST}"' nodelay; + limit_req_status 444; + proxy_pass http://127.0.0.1:18444/; + } + + # Redirect non-JS visitors to challenge page (pass original referrer) + if ($needs_js_challenge) { + return 302 '"${CHALLENGE_PATH}"'?r=$request_uri&ref=$http_referer; + }' + + if [[ "$GEOIP2_AVAILABLE" == "true" ]]; then + JS_CHALLENGE_DIRECTIVE+=' + + # Tarpit headless Chrome bots from suspect GeoIP regions + if ($tarpit_client) { + set $limit_rate '"${TARPIT_RATE}"'; + }' + fi + + create_panel_templates "$BASE_TEMPLATE" "$TEMPLATE_NAME" "$JS_CHALLENGE_DIRECTIVE" "needs_js_challenge" + + # 
===================================================== + # Step 4: Validate nginx config + # ===================================================== + if ! nginx_validate; then + err "Aborting — fix the config errors above" + return 1 + fi + + # ===================================================== + # Step 5: Apply template (optional) + # ===================================================== + apply_template "$APPLY_USER" "$APPLY_DOMAIN" "$APPLY_ALL" "$TEMPLATE_NAME" + + # ===================================================== + # Step 6: Reload nginx + # ===================================================== + nginx_reload + + # ===================================================== + # Summary + # ===================================================== + echo "" + echo -e "${BOLD}Done.${NC}" + echo "" + echo " Challenge map: ${CHALLENGE_MAP}" + echo " Challenge page: ${CHALLENGE_HTML}" + echo " State file: ${STATE_FILE}" + echo " Base template: ${BASE_TEMPLATE}" + echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" + echo " Cookie name: ${COOKIE_NAME}" + echo " Cookie token: ${COOKIE_VALUE:0:8}... 
(32 hex chars)" + echo " Cookie TTL: ${COOKIE_MAX_AGE}s" + echo " Challenge rate: ${CHALLENGE_RATE}r/m (burst: ${CHALLENGE_BURST})" + if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi + else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " sudo $(basename "$0") js-challenge --apply-all " + fi + echo "" + echo " To rotate credentials (invalidate bot-cached cookies):" + echo " sudo $(basename "$0") js-challenge --base-template ${BASE_TEMPLATE}" + echo "" + echo " Test (bot without cookie gets redirected to challenge):" + echo " curl -o /dev/null -s -w '%{http_code}' https://yourdomain.com" + echo " Expected: 302" + echo "" + echo " Test (browser completes challenge — 302 → 200):" + echo " Open https://yourdomain.com in a browser" + echo " Expected: brief redirect then page loads normally" + echo "" + echo " Test (old static bypass no longer works):" + echo " curl -b '_bc=verified' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" + echo " Expected: 302 (not 200 — old cookie is invalid)" + echo "" + echo " Test (rate limit on challenge endpoint):" + echo " for i in 1 2 3 4 5; do curl -o /dev/null -s -w \"\$i: %{http_code}\n\" https://yourdomain.com${CHALLENGE_PATH}; done" + echo " Expected: first 3 return 200, then 444 (rate limited)" + echo "" + echo " Test (allowed bot bypasses challenge):" + echo " curl -A 'Googlebot' -o /dev/null -s -w '%{http_code}' https://yourdomain.com" + echo " Expected: 200" +} + +# ============================================================================= +# SUBCOMMAND: geoip +# ============================================================================= + +cmd_geoip() { + local GEOIP_CONF="/etc/GeoIP.conf" + local GEOIP_DB_DIR="/usr/share/GeoIP" + local 
NGINX_GEOIP_CONF="/etc/nginx/conf.d/geoip2.conf" + local NGINX_LOG_CONF="/etc/nginx/conf.d/enriched-log-format.conf" + local APACHE_LOG_CONF="/etc/apache2/conf-available/enriched-log-format.conf" + local CRON_WEEKLY="/etc/cron.weekly/geoip-db-update" + local CRON_MONTHLY="/etc/cron.monthly/geoip-db-update" + local MARKER_START="# geoip-managed-start" + local MARKER_END="# geoip-managed-end" + local TEMPLATE_NAME="default-geoip" + local BASE_TEMPLATE="default" + local APPLY_USER="" + local APPLY_DOMAIN="" + local APPLY_ALL=false + local REMOVE=false + local SKIP_PACKAGES=false + local USE_DBIP=false + local ACCOUNT_ID="${MAXMIND_ACCOUNT_ID:-}" + local LICENSE_KEY="${MAXMIND_LICENSE_KEY:-}" + local APACHE_TPL_DIR="" + + usage_geoip() { + cat </dev/null || true + fi + fi + + if nginx_validate; then + nginx_reload + if systemctl is-active --quiet apache2 2>/dev/null; then + step "Reloading Apache" + if [[ "$DRY_RUN" != "true" ]]; then + systemctl reload apache2 + info "Apache reloaded" + fi + fi + else + warn "nginx config has errors — restore .bak files or check other configs" + fi + + echo "" + echo -e "${BOLD}GeoIP2 configuration removed.${NC}" + echo " Domains still using the ${TEMPLATE_NAME} template should be switched back." 
+ return 0 + fi + + # --- Prompt for credentials if not using DB-IP --- + if [[ "$USE_DBIP" != "true" ]]; then + if [[ -z "$ACCOUNT_ID" ]]; then + read -rp "MaxMind Account ID (or Ctrl+C and rerun with --db-ip): " ACCOUNT_ID + fi + if [[ -z "$LICENSE_KEY" ]]; then + read -rp "MaxMind License Key: " LICENSE_KEY + fi + + if [[ -z "$ACCOUNT_ID" || -z "$LICENSE_KEY" ]]; then + err "MaxMind credentials required (or use --db-ip for no-signup alternative)" + exit 1 + fi + fi + + # ===================================================== + # Step 1: Install packages and GeoIP2 nginx module + # ===================================================== + local NGINX_MOD_DIR="/usr/lib/nginx/modules" + local GEOIP2_MOD="${NGINX_MOD_DIR}/ngx_http_geoip2_module.so" + local LOAD_MOD_CONF="/etc/nginx/modules-enabled/50-mod-http-geoip2.conf" + + if [[ "$SKIP_PACKAGES" == "true" ]]; then + warn "Skipping package installation (--skip-packages)" + else + step "Installing build dependencies" + + local PACKAGES + if [[ "$USE_DBIP" == "true" ]]; then + PACKAGES="libmaxminddb0 libmaxminddb-dev mmdb-bin curl build-essential git libpcre2-dev libssl-dev zlib1g-dev" + else + PACKAGES="libmaxminddb0 libmaxminddb-dev mmdb-bin geoipupdate build-essential git libpcre2-dev libssl-dev zlib1g-dev" + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: apt-get update && apt-get install -y ${PACKAGES}" + else + apt-get update -qq + # shellcheck disable=SC2086 + apt-get install -y $PACKAGES + info "Packages installed: ${PACKAGES}" + fi + + if [[ -f "$GEOIP2_MOD" ]]; then + info "GeoIP2 module already exists: ${GEOIP2_MOD}" + else + step "Building ngx_http_geoip2_module for nginx $(nginx -v 2>&1 | grep -oP '[\d.]+')" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would compile ngx_http_geoip2_module.so" + else + local NGINX_VER + NGINX_VER=$(nginx -v 2>&1 | grep -oP '[\d.]+') + local BUILD_DIR + BUILD_DIR=$(mktemp -d) + + cd "$BUILD_DIR" + curl -fsSL 
"https://nginx.org/download/nginx-${NGINX_VER}.tar.gz" -o nginx.tar.gz + tar xzf nginx.tar.gz + + git clone --depth 1 https://github.com/leev/ngx_http_geoip2_module.git + + cd "nginx-${NGINX_VER}" + ./configure --with-compat --add-dynamic-module=../ngx_http_geoip2_module + make modules + + mkdir -p "$NGINX_MOD_DIR" + cp objs/ngx_http_geoip2_module.so "$NGINX_MOD_DIR/" + cp objs/ngx_stream_geoip2_module.so "$NGINX_MOD_DIR/" 2>/dev/null || true + + cd / + rm -rf "$BUILD_DIR" + info "Built and installed: ${GEOIP2_MOD}" + fi + fi + + if [[ ! -f "$LOAD_MOD_CONF" ]] && [[ "$DRY_RUN" != "true" ]]; then + mkdir -p "$(dirname "$LOAD_MOD_CONF")" + echo "load_module ${GEOIP2_MOD};" > "$LOAD_MOD_CONF" + info "Created: ${LOAD_MOD_CONF}" + fi + fi + + # ===================================================== + # Step 2: Write GeoIP.conf (MaxMind only) + # ===================================================== + if [[ "$USE_DBIP" != "true" ]]; then + step "Writing MaxMind configuration to ${GEOIP_CONF}" + + local GEOIP_CONF_CONTENT="${MARKER_START} +AccountID ${ACCOUNT_ID} +LicenseKey ${LICENSE_KEY} +EditionIDs GeoLite2-City GeoLite2-ASN GeoLite2-Country +DatabaseDirectory ${GEOIP_DB_DIR} +${MARKER_END}" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${GEOIP_CONF}" + else + backup_file "$GEOIP_CONF" + echo "$GEOIP_CONF_CONTENT" > "$GEOIP_CONF" + chmod 600 "$GEOIP_CONF" + info "Created: ${GEOIP_CONF}" + fi + fi + + # ===================================================== + # Step 3: Download GeoIP databases + # ===================================================== + if [[ "$USE_DBIP" == "true" ]]; then + step "Downloading DB-IP free databases to ${GEOIP_DB_DIR}" + + local DBIP_MONTH + DBIP_MONTH=$(date +%Y-%m) + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would download: dbip-country-lite-${DBIP_MONTH}.mmdb.gz" + echo " Would download: dbip-asn-lite-${DBIP_MONTH}.mmdb.gz" + else + mkdir -p "$GEOIP_DB_DIR" + cd "$GEOIP_DB_DIR" + + curl -fsSL 
"https://download.db-ip.com/free/dbip-country-lite-${DBIP_MONTH}.mmdb.gz" -o dbip-country.mmdb.gz + gunzip -f dbip-country.mmdb.gz + mv dbip-country.mmdb GeoLite2-City.mmdb + info "Downloaded: DB-IP Country → GeoLite2-City.mmdb" + + curl -fsSL "https://download.db-ip.com/free/dbip-asn-lite-${DBIP_MONTH}.mmdb.gz" -o dbip-asn.mmdb.gz + gunzip -f dbip-asn.mmdb.gz + mv dbip-asn.mmdb GeoLite2-ASN.mmdb + info "Downloaded: DB-IP ASN → GeoLite2-ASN.mmdb" + + cd - >/dev/null + fi + else + step "Downloading MaxMind GeoIP2 databases to ${GEOIP_DB_DIR}" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: mkdir -p ${GEOIP_DB_DIR}" + echo " Would run: geoipupdate" + else + mkdir -p "$GEOIP_DB_DIR" + geoipupdate -v + info "GeoIP2 databases downloaded" + fi + fi + + # ===================================================== + # Step 4: Create nginx geoip2 config in conf.d + # ===================================================== + step "Creating nginx GeoIP2 config at ${NGINX_GEOIP_CONF}" + + local NGINX_GEOIP_CONTENT + read -r -d '' NGINX_GEOIP_CONTENT < "$NGINX_GEOIP_CONF" + info "Created: ${NGINX_GEOIP_CONF}" + fi + + # ===================================================== + # Step 5: Create enriched log format in conf.d + # ===================================================== + step "Creating enriched log format at ${NGINX_LOG_CONF}" + + local NGINX_LOG_CONTENT + read -r -d '' NGINX_LOG_CONTENT < "$NGINX_LOG_CONF" + info "Created: ${NGINX_LOG_CONF}" + fi + + # ===================================================== + # Step 5b: Create Apache enriched log format + # ===================================================== + if [[ -d "/etc/apache2/conf-available" ]]; then + step "Creating Apache enriched log format at ${APACHE_LOG_CONF}" + + local APACHE_LOG_CONTENT + read -r -d '' APACHE_LOG_CONTENT <s %b \"%{Referer}i\" \"%{User-Agent}i\" %{X-GeoIP-Country}i \"%{X-GeoIP-ASN}i\"" enriched +${MARKER_END} +ALOGCONF + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: 
${APACHE_LOG_CONF}" + echo " Would enable: a2enconf enriched-log-format" + else + backup_file "$APACHE_LOG_CONF" + echo "$APACHE_LOG_CONTENT" > "$APACHE_LOG_CONF" + a2enconf -q enriched-log-format 2>/dev/null || true + info "Created and enabled: ${APACHE_LOG_CONF}" + fi + else + warn "Apache conf-available directory not found — skipping Apache log format" + fi + + # ===================================================== + # Step 6: Create custom Hestia templates + # ===================================================== + + # --- Hestia Apache template patching --- + if [[ -d "/usr/local/hestia/data/templates/web/apache2/php-fpm" ]]; then + APACHE_TPL_DIR="/usr/local/hestia/data/templates/web/apache2/php-fpm" + elif [[ -d "/usr/local/hestia/data/templates/web/apache2" ]]; then + APACHE_TPL_DIR="/usr/local/hestia/data/templates/web/apache2" + fi + + _geoip_create_apache_template() { + local src="$1" dst="$2" label="$3" + + if [[ ! -f "$src" ]]; then + warn "Source Apache template not found: ${src} — skipping ${label}" + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${dst} (from ${src})" + return + fi + + backup_file "$dst" + + local APACHE_ENRICHED_FORMAT=' LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %{X-GeoIP-Country}i \"%{X-GeoIP-ASN}i\"" enriched' + + sed 's/\(CustomLog.*\.log\) combined/\1 enriched/g' "$src" | \ + awk -v fmt="$APACHE_ENRICHED_FORMAT" ' + !injected && /CustomLog/ { + print fmt + injected = 1 + } + { print } + ' > "$dst" + + if grep -q "enriched" "$dst"; then + info "Created ${label} template: ${dst}" + else + cp "$src" "$dst" + warn "No 'combined' format found in ${src} — copied unchanged" + fi + } + + if [[ -n "$APACHE_TPL_DIR" ]]; then + local APACHE_BASE="$BASE_TEMPLATE" + if [[ ! 
-f "${APACHE_TPL_DIR}/${APACHE_BASE}.tpl" ]]; then + APACHE_BASE="default" + warn "Apache base template '${BASE_TEMPLATE}' not found — falling back to 'default'" + fi + + step "Creating custom Apache templates (${TEMPLATE_NAME}) from ${APACHE_BASE}" + + _geoip_create_apache_template \ + "${APACHE_TPL_DIR}/${APACHE_BASE}.tpl" \ + "${APACHE_TPL_DIR}/${TEMPLATE_NAME}.tpl" \ + "Apache HTTP (.tpl)" + + _geoip_create_apache_template \ + "${APACHE_TPL_DIR}/${APACHE_BASE}.stpl" \ + "${APACHE_TPL_DIR}/${TEMPLATE_NAME}.stpl" \ + "Apache SSL (.stpl)" + fi + + # --- Nginx template patching --- + _geoip_create_nginx_template() { + local src="$1" dst="$2" label="$3" + + if [[ ! -f "$src" ]]; then + warn "Source template not found: ${src} — skipping ${label}" + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${dst} (from ${src})" + return + fi + + backup_file "$dst" + + local has_proxy_headers + has_proxy_headers=$(grep -c 'proxy_set_header' "$src" || true) + + sed 's/\(access_log.*\.log\) combined;/\1 enriched;/g' "$src" | \ + awk -v d='$' -v need_std="$has_proxy_headers" ' + !injected && /proxy_pass/ { + print + if (need_std == 0) { + print "\t\tproxy_set_header Host " d "host;" + print "\t\tproxy_set_header X-Real-IP " d "remote_addr;" + print "\t\tproxy_set_header X-Forwarded-For " d "proxy_add_x_forwarded_for;" + print "\t\tproxy_set_header X-Forwarded-Proto " d "scheme;" + } + print "\t\tproxy_set_header X-GeoIP-Country " d "geoip2_country_code;" + print "\t\tproxy_set_header X-GeoIP-ASN " d "geoip2_asn_org;" + injected = 1 + next + } + { print } + ' > "$dst" + + if grep -q "enriched" "$dst"; then + info "Created ${label} template: ${dst} (log format updated)" + else + warn "No 'combined' log format found in ${src} — format not changed" + warn " Manually change access_log format to 'enriched' in ${dst}" + fi + + if grep -q "proxy_set_header X-GeoIP-Country" "$dst"; then + info "Injected GeoIP proxy headers into ${dst}" + else + warn "No proxy_pass 
found in ${src} — GeoIP headers not injected" + fi + } + + step "Creating custom ${PANEL_NAME} templates (${TEMPLATE_NAME}) from ${BASE_TEMPLATE}" + + # Resolve the base template + local BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" + if [[ ! -f "$BASE_TPL" ]]; then + if [[ "$BASE_TEMPLATE" != "default" ]]; then + warn "Base template '${BASE_TEMPLATE}' not found — falling back to 'default'" + BASE_TEMPLATE="default" + fi + BASE_TPL="${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" + if [[ ! -f "$BASE_TPL" ]]; then + err "Default template not found: ${BASE_TPL}" + exit 1 + fi + fi + + # Proxy templates (nginx as proxy for apache) + _geoip_create_nginx_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.tpl" \ + "proxy HTTP (.tpl)" + + _geoip_create_nginx_template \ + "${PANEL_TPL_DIR}/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/${TEMPLATE_NAME}.stpl" \ + "proxy SSL (.stpl)" + + # php-fpm templates (nginx standalone) + if [[ -d "${PANEL_TPL_DIR}/php-fpm" ]]; then + if [[ -f "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" ]]; then + _geoip_create_nginx_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.tpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.tpl" \ + "php-fpm HTTP (.tpl)" + + _geoip_create_nginx_template \ + "${PANEL_TPL_DIR}/php-fpm/${BASE_TEMPLATE}.stpl" \ + "${PANEL_TPL_DIR}/php-fpm/${TEMPLATE_NAME}.stpl" \ + "php-fpm SSL (.stpl)" + fi + fi + + # Copy .sh hook if base template has one + for tpl_dir in "${PANEL_TPL_DIR}" "${PANEL_TPL_DIR}/php-fpm"; do + if [[ -f "${tpl_dir}/${BASE_TEMPLATE}.sh" ]] && [[ ! 
-f "${tpl_dir}/${TEMPLATE_NAME}.sh" ]]; then + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would copy: ${tpl_dir}/${BASE_TEMPLATE}.sh → ${tpl_dir}/${TEMPLATE_NAME}.sh" + else + cp "${tpl_dir}/${BASE_TEMPLATE}.sh" "${tpl_dir}/${TEMPLATE_NAME}.sh" + info "Copied hook: ${tpl_dir}/${TEMPLATE_NAME}.sh" + fi + fi + done + + # ===================================================== + # Step 7: Create cron job for database updates + # ===================================================== + local CRON_FILE CRON_CONTENT + if [[ "$USE_DBIP" == "true" ]]; then + CRON_FILE="$CRON_MONTHLY" + step "Creating monthly DB-IP update script at ${CRON_FILE}" + + read -r -d '' CRON_CONTENT < GeoLite2-City.mmdb && \\ + curl -fsSL "https://download.db-ip.com/free/dbip-asn-lite-\${MONTH}.mmdb.gz" | gunzip > GeoLite2-ASN.mmdb && \\ + logger -t dbip-update "DB-IP databases updated" +CRONEOF + else + CRON_FILE="$CRON_WEEKLY" + step "Creating weekly geoipupdate script at ${CRON_FILE}" + + read -r -d '' CRON_CONTENT <&1 | logger -t geoipupdate +CRONEOF + fi + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${CRON_FILE}" + else + backup_file "$CRON_FILE" + echo "$CRON_CONTENT" > "$CRON_FILE" + chmod 755 "$CRON_FILE" + info "Created: ${CRON_FILE}" + fi + + # ===================================================== + # Step 8: Validate nginx config + # ===================================================== + if ! 
nginx_validate; then + err "Aborting — fix the config errors above" + return 1 + fi + + # ===================================================== + # Step 9: Apply template (optional) + # ===================================================== + apply_template "$APPLY_USER" "$APPLY_DOMAIN" "$APPLY_ALL" "$TEMPLATE_NAME" + + # Apply Apache backend template (if Apache templates were created) + if [[ -n "$APPLY_USER" && -n "$APACHE_TPL_DIR" ]] && [[ -f "${APACHE_TPL_DIR}/${TEMPLATE_NAME}.tpl" ]]; then + if command -v v-change-web-domain-tpl &>/dev/null; then + if [[ "$APPLY_ALL" == "true" ]]; then + step "Applying Apache backend template to all domains for user: ${APPLY_USER}" + local domains + domains=$(v-list-web-domains "$APPLY_USER" plain 2>/dev/null | awk '{print $1}') + while IFS= read -r domain; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-tpl ${APPLY_USER} ${domain} ${TEMPLATE_NAME}" + else + v-change-web-domain-tpl "$APPLY_USER" "$domain" "$TEMPLATE_NAME" + info "Apache template applied to: ${domain}" + fi + done <<< "$domains" + else + step "Applying Apache backend template to: ${APPLY_DOMAIN}" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would apply: v-change-web-domain-tpl ${APPLY_USER} ${APPLY_DOMAIN} ${TEMPLATE_NAME}" + else + v-change-web-domain-tpl "$APPLY_USER" "$APPLY_DOMAIN" "$TEMPLATE_NAME" + info "Apache template applied to: ${APPLY_DOMAIN}" + fi + fi + fi + fi + + # ===================================================== + # Step 10: Reload nginx and Apache + # ===================================================== + step "Reloading web servers" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl reload nginx" + echo " Would run: systemctl reload apache2" + else + systemctl reload nginx + info "nginx reloaded" + + if systemctl is-active --quiet apache2 2>/dev/null; then + systemctl reload apache2 + info "Apache reloaded" + fi + fi + + # ===================================================== + # 
Summary + # ===================================================== + echo "" + echo -e "${BOLD}Done.${NC}" + echo "" + echo " GeoIP config: ${NGINX_GEOIP_CONF}" + echo " Log format: ${NGINX_LOG_CONF}" + echo " Template: ${TEMPLATE_NAME} (.tpl + .stpl)" + echo " Base: ${BASE_TEMPLATE}" + echo " DB updates: ${CRON_FILE}" + if [[ "$USE_DBIP" == "true" ]]; then + echo " DB source: DB-IP Lite (no signup)" + else + echo " DB source: MaxMind GeoLite2" + fi + if [[ -n "$APPLY_USER" ]]; then + if [[ "$APPLY_ALL" == "true" ]]; then + echo " Applied: All domains for ${APPLY_USER}" + else + echo " Applied: ${APPLY_DOMAIN}" + fi + else + echo "" + echo " To apply to a domain:" + echo " v-change-web-domain-proxy-tpl ${TEMPLATE_NAME}" + echo "" + echo " To apply to all domains for a user:" + echo " sudo $(basename "$0") geoip --apply-all " + fi + echo "" + echo " To remove: sudo $(basename "$0") geoip --remove" +} + +# ============================================================================= +# SUBCOMMAND: crowdsec +# ============================================================================= + +cmd_crowdsec() { + local ENROLLMENT_KEY="" + local SKIP_ENGINE=false + local SKIP_BOUNCER=false + local SKIP_APPSEC=false + local REMOVE=false + local ACQUIS_FILE="/etc/crowdsec/acquis.d/hestiacp.yaml" + + usage_crowdsec() { + cat </dev/null 2>&1; then + step "Removing nginx bouncer" + apt-get remove -y crowdsec-nginx-bouncer + info "Removed: crowdsec-nginx-bouncer" + else + warn "crowdsec-nginx-bouncer not installed" + fi + + if [[ -f "$ACQUIS_FILE" ]]; then + rm -f "$ACQUIS_FILE" + info "Removed: ${ACQUIS_FILE}" + else + warn "Acquisition file not found: ${ACQUIS_FILE}" + fi + + if nginx -t 2>&1; then + systemctl reload nginx + info "nginx reloaded" + else + err "nginx config test failed after removal" + exit 1 + fi + + if systemctl is-active --quiet crowdsec 2>/dev/null; then + systemctl restart crowdsec + info "CrowdSec restarted" + fi + fi + + echo "" + echo -e "${BOLD}CrowdSec 
bouncer and acquisition removed.${NC}" + echo "" + echo " The CrowdSec engine is still installed." + echo " To fully remove: sudo apt remove crowdsec" + return 0 + fi + + # ===================================================== + # Step 1: Install CrowdSec engine + # ===================================================== + if [[ "$SKIP_ENGINE" == "true" ]]; then + warn "Skipping CrowdSec engine installation (--skip-engine)" + else + step "Installing CrowdSec engine" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: curl -s https://install.crowdsec.net | bash" + echo " Would run: apt install -y crowdsec" + else + if ! command -v cscli &>/dev/null; then + curl -s https://install.crowdsec.net | bash + apt-get install -y crowdsec + info "CrowdSec engine installed" + else + info "CrowdSec engine already installed" + fi + fi + + # Install collections + step "Installing CrowdSec collections" + + local collections=( + "crowdsecurity/nginx" + "crowdsecurity/http-cve" + "crowdsecurity/base-http-scenarios" + "crowdsecurity/sshd" + "crowdsecurity/linux" + ) + + for collection in "${collections[@]}"; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: cscli collections install ${collection}" + else + cscli collections install "$collection" --force 2>/dev/null || true + info "Installed collection: ${collection}" + fi + done + fi + + # ===================================================== + # Step 2: Configure HestiaCP log acquisition + # ===================================================== + step "Configuring HestiaCP log acquisition at ${ACQUIS_FILE}" + + local ACQUIS_CONTENT + read -r -d '' ACQUIS_CONTENT <<'ACQUISEOF' || true +# HestiaCP log acquisition — generated by hestia-security.sh crowdsec +# nginx domain access logs +filenames: + - /var/log/nginx/domains/*.log +labels: + type: nginx +--- +# nginx domain error logs +filenames: + - /var/log/nginx/domains/*.error.log +labels: + type: nginx +--- +# HestiaCP panel nginx logs +filenames: + - 
/var/log/hestia/nginx-access.log + - /var/log/hestia/nginx-error.log +labels: + type: nginx +--- +# System auth log +filenames: + - /var/log/auth.log +labels: + type: syslog +--- +# Exim mail log +filenames: + - /var/log/exim4/mainlog +labels: + type: syslog +ACQUISEOF + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${ACQUIS_FILE}" + else + mkdir -p "$(dirname "$ACQUIS_FILE")" + echo "$ACQUIS_CONTENT" > "$ACQUIS_FILE" + info "Created: ${ACQUIS_FILE}" + fi + + # ===================================================== + # Step 3: Install nginx lua bouncer + # ===================================================== + if [[ "$SKIP_BOUNCER" == "true" ]]; then + warn "Skipping nginx lua bouncer installation (--skip-bouncer)" + else + step "Installing nginx lua bouncer dependencies" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: apt install -y nginx lua5.1 libnginx-mod-http-lua luarocks gettext-base lua-cjson" + echo " Would run: apt install -y crowdsec-nginx-bouncer" + else + apt-get install -y nginx lua5.1 libnginx-mod-http-lua luarocks gettext-base lua-cjson 2>/dev/null || true + info "Lua dependencies installed" + + step "Installing crowdsec-nginx-bouncer" + apt-get install -y crowdsec-nginx-bouncer + info "crowdsec-nginx-bouncer installed" + fi + + # Install nginx-proxy parser for dual-stack + step "Installing nginx-proxy parser for dual-stack setups" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: cscli parsers install crowdsecurity/nginx-proxy" + else + cscli parsers install crowdsecurity/nginx-proxy --force 2>/dev/null || true + info "Installed parser: crowdsecurity/nginx-proxy" + fi + fi + + # ===================================================== + # Step 4: Restart nginx + # ===================================================== + step "Restarting nginx" + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: systemctl restart nginx" + else + if nginx -t 2>&1; then + systemctl restart nginx + info "nginx restarted" + else + err 
"nginx config test failed" + exit 1 + fi + fi + + # ===================================================== + # Step 5: AppSec WAF (optional) + # ===================================================== + if [[ "$SKIP_APPSEC" == "true" ]]; then + warn "Skipping AppSec component (--skip-appsec)" + else + step "Installing AppSec collections" + + local appsec_collections=( + "crowdsecurity/appsec-virtual-patching" + "crowdsecurity/appsec-generic-rules" + ) + + for collection in "${appsec_collections[@]}"; do + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: cscli collections install ${collection}" + else + cscli collections install "$collection" --force 2>/dev/null || true + info "Installed collection: ${collection}" + fi + done + + step "Adding AppSec acquisition block to ${ACQUIS_FILE}" + + local APPSEC_BLOCK + read -r -d '' APPSEC_BLOCK <<'APPSECEOF' || true +--- +# AppSec Component +listen_addr: 127.0.0.1:7422 +appsec_config: crowdsecurity/appsec-default +name: hestiacp_appsec +source: appsec +labels: + type: appsec +APPSECEOF + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would append AppSec block to: ${ACQUIS_FILE}" + else + echo "" >> "$ACQUIS_FILE" + echo "$APPSEC_BLOCK" >> "$ACQUIS_FILE" + info "AppSec acquisition block added to: ${ACQUIS_FILE}" + fi + fi + + # ===================================================== + # Step 6: Console enrollment (optional) + # ===================================================== + if [[ -n "$ENROLLMENT_KEY" ]]; then + step "Enrolling in CrowdSec Console" + + if [[ "$DRY_RUN" == "true" ]]; then + echo " Would run: cscli console enroll ${ENROLLMENT_KEY}" + echo " Would run: systemctl restart crowdsec" + else + cscli console enroll "$ENROLLMENT_KEY" + info "Enrolled in CrowdSec Console" + systemctl restart crowdsec + info "CrowdSec restarted" + fi + fi + + # ===================================================== + # Step 7: Restart CrowdSec to pick up acquisition + # ===================================================== + 
# =============================================================================
# SUBCOMMAND: status
# =============================================================================

# Print a read-only overview of every security feature this script manages.
#
# Sections, in order: panel detection, bot-block map, JS challenge, GeoIP,
# CrowdSec, and the custom security templates present in the panel's nginx
# template directory. Nothing on the system is modified.
#
# Globals:   BOLD, NC, CYAN, GREEN, YELLOW, RED (read) — presumably ANSI colour
#            codes defined elsewhere in this file; they are only interpolated
#            into `echo -e` output here.
# Arguments: -h | --help   print usage and exit 0
#            anything else print "Unknown option: …" and exit 1
# Outputs:   the status report on stdout
cmd_status() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                echo "Usage: sudo $(basename "$0") status"
                echo ""
                echo "Show status of all security features."
                exit 0
                ;;
            *) echo "Unknown option: $1"; exit 1 ;;
        esac
    done

    echo ""
    echo -e "${BOLD}=== HestiaCP Security Status ===${NC}"
    echo ""

    # --- Panel detection ---
    # tpl_dir stays empty when no supported panel is found; the template
    # listing at the end is skipped in that case.
    local tpl_dir=""
    echo -e "${CYAN}Panel:${NC}"
    if [[ -d "/usr/local/hestia/data/templates/web/nginx" ]]; then
        tpl_dir="/usr/local/hestia/data/templates/web/nginx"
        echo " Detected: HestiaCP"
        echo " Templates: ${tpl_dir}"
    elif [[ -d "/usr/local/vesta/data/templates/web/nginx" ]]; then
        tpl_dir="/usr/local/vesta/data/templates/web/nginx"
        echo " Detected: VestaCP/myVesta"
        echo " Templates: ${tpl_dir}"
    else
        echo " Not detected (neither HestiaCP nor VestaCP/myVesta found)"
    fi
    echo ""

    # --- Bot-block ---
    echo -e "${CYAN}Bot-block:${NC}"
    local bot_map="/etc/nginx/conf.d/bot-block.conf"
    if [[ -f "$bot_map" ]]; then
        local bot_count
        # grep -c already prints "0" when nothing matches (and exits 1), so a
        # "|| echo 0" fallback would emit a second "0". Swallow the status and
        # default only when grep printed nothing at all (e.g. unreadable file).
        bot_count=$(grep -c '~\*' "$bot_map" 2>/dev/null) || true
        echo -e " Status: ${GREEN}Active${NC}"
        echo " Map: ${bot_map}"
        echo " Bot patterns: ${bot_count:-0}"
    else
        echo -e " Status: ${YELLOW}Not configured${NC}"
    fi
    echo ""

    # --- JS challenge ---
    echo -e "${CYAN}JS Challenge:${NC}"
    local jsc_map="/etc/nginx/conf.d/js-challenge.conf"
    local jsc_state="/etc/nginx/js-challenge.env"
    if [[ -f "$jsc_map" ]]; then
        echo -e " Status: ${GREEN}Active${NC}"
        echo " Map: ${jsc_map}"
        if [[ -f "$jsc_state" ]]; then
            # The state file is parsed, never sourced. The value is taken as
            # the text between the first pair of single quotes on the
            # COOKIE_NAME= line. "|| true" keeps a missing key from aborting
            # the report when errexit/pipefail are active.
            local _jsc_cookie_name=""
            _jsc_cookie_name=$(grep "^COOKIE_NAME=" "$jsc_state" 2>/dev/null | cut -d"'" -f2) || true
            if [[ -n "$_jsc_cookie_name" ]]; then
                echo " Cookie name: ${_jsc_cookie_name}"
            fi
        fi
    else
        echo -e " Status: ${YELLOW}Not configured${NC}"
    fi
    echo ""

    # --- GeoIP ---
    echo -e "${CYAN}GeoIP:${NC}"
    local geoip_conf="/etc/nginx/conf.d/geoip2.conf"
    if [[ -f "$geoip_conf" ]]; then
        echo -e " Status: ${GREEN}Active${NC}"
        echo " Config: ${geoip_conf}"

        # The DB source is inferred from which update cron script exists.
        if [[ -f "/etc/cron.monthly/geoip-db-update" ]]; then
            echo " DB source: DB-IP Lite (monthly updates)"
        elif [[ -f "/etc/cron.weekly/geoip-db-update" ]]; then
            echo " DB source: MaxMind GeoLite2 (weekly updates)"
        else
            echo " DB source: Unknown (no update cron found)"
        fi

        # Show DB age (date part of the file's modification time).
        local city_db="/usr/share/GeoIP/GeoLite2-City.mmdb"
        if [[ -f "$city_db" ]]; then
            local db_date
            db_date=$(stat -c '%y' "$city_db" 2>/dev/null | cut -d' ' -f1) || true
            echo " DB date: ${db_date:-unknown}"
        fi
    else
        echo -e " Status: ${YELLOW}Not configured${NC}"
    fi
    echo ""

    # --- CrowdSec ---
    echo -e "${CYAN}CrowdSec:${NC}"
    if command -v cscli &>/dev/null; then
        if systemctl is-active --quiet crowdsec 2>/dev/null; then
            echo -e " Engine: ${GREEN}Running${NC}"
        else
            echo -e " Engine: ${RED}Stopped${NC}"
        fi

        # "dpkg -l <pkg>" also succeeds for packages that were removed but not
        # purged (state "rc"), so it would keep reporting the bouncer as
        # installed after an "apt-get remove". Check the real status instead.
        local bouncer_state=""
        bouncer_state=$(dpkg-query -W -f='${Status}' crowdsec-nginx-bouncer 2>/dev/null) || true
        if [[ "$bouncer_state" == "install ok installed" ]]; then
            echo -e " Bouncer: ${GREEN}Installed${NC}"
            local bouncer_count
            # Raw output has one header row; count the remaining rows.
            # wc always prints a number, so no echo fallback is needed.
            bouncer_count=$(cscli bouncers list -o raw 2>/dev/null | tail -n +2 | wc -l) || true
            echo " Registered bouncers: ${bouncer_count:-0}"
        else
            echo -e " Bouncer: ${YELLOW}Not installed${NC}"
        fi

        local decision_count
        decision_count=$(cscli decisions list -o raw 2>/dev/null | tail -n +2 | wc -l) || true
        echo " Active decisions: ${decision_count:-0}"
    else
        echo -e " Status: ${YELLOW}Not installed${NC}"
    fi
    echo ""

    # --- Custom templates ---
    if [[ -n "$tpl_dir" && -d "$tpl_dir" ]]; then
        echo -e "${CYAN}Custom templates:${NC}"
        local -a known_templates=(
            default-botblock
            default-jschallenge
            default-geoip
            geoip-botblock
            geoip-botblock-jsc
        )
        local found_tpl=false
        local tpl_name
        for tpl_name in "${known_templates[@]}"; do
            if [[ -f "${tpl_dir}/${tpl_name}.tpl" ]]; then
                echo " ✓ ${tpl_name}"
                found_tpl=true
            fi
        done
        if [[ "$found_tpl" == "false" ]]; then
            echo " No custom security templates found"
        fi
        echo ""
    fi
}

# =============================================================================
# MAIN DISPATCH
# 
============================================================================= + +case "${1:-}" in + bot-block) shift; cmd_bot_block "$@" ;; + js-challenge) shift; cmd_js_challenge "$@" ;; + geoip) shift; cmd_geoip "$@" ;; + crowdsec) shift; cmd_crowdsec "$@" ;; + status) shift; cmd_status "$@" ;; + -h|--help|"") usage_main ;; + *) err "Unknown command: $1"; usage_main ;; +esac diff --git a/hetzner-backup-auditor.sh b/hetzner-backup-auditor.sh new file mode 100644 index 0000000..21838e0 --- /dev/null +++ b/hetzner-backup-auditor.sh @@ -0,0 +1,586 @@ +#!/usr/bin/env bash + +######################################################################################### +#### hetzner-backup-auditor.sh — Audit backup schedules, snapshot ages, and #### +#### retention policies for Hetzner Cloud servers via the REST API #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./hetzner-backup-auditor.sh --audit #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +SERVER_ID="" +LABEL_SELECTOR="" +OUTPUT_FORMAT="${HBA_FORMAT:-table}" +MAX_AGE_HOURS="${HBA_MAX_AGE:-48}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +HCLOUD_TOKEN="${HCLOUD_TOKEN:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── API helpers ────────────────────────────────────────────────────── +hcloud_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o 
/tmp/hba_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer ${HCLOUD_TOKEN}" \ + -H "Content-Type: application/json" \ + "https://api.hetzner.cloud/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + if [[ "$http_code" =~ ^[45] ]]; then + local errmsg + errmsg=$(jq -r '.error.message // empty' /tmp/hba_resp.json 2>/dev/null) + [[ -n "$errmsg" ]] && verbose "API error: ${errmsg}" + fi + + cat /tmp/hba_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$HCLOUD_TOKEN" ]] && die "HCLOUD_TOKEN not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +urlencode() { + local string="$1" + python3 -c "import urllib.parse; print(urllib.parse.quote('$string', safe=''))" 2>/dev/null \ + || echo "$string" +} + +# ── Pagination helper ──────────────────────────────────────────────── +fetch_all() { + local endpoint="$1" key="$2" + local page=1 per_page=50 all_data="[]" + while true; do + local sep="?" 
+ [[ "$endpoint" == *"?"* ]] && sep="&" + local resp + resp=$(hcloud_api GET "${endpoint}${sep}page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq ".${key} // []" 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < per_page )) && break + ((page++)) || true + done + echo "$all_data" +} + +# ── Age helpers ────────────────────────────────────────────────────── +iso_to_epoch() { + date -d "$1" +%s 2>/dev/null || echo 0 +} + +age_hours() { + local created_epoch="$1" + local now + now=$(date +%s) + echo $(( (now - created_epoch) / 3600 )) +} + +format_age() { + local hours="$1" + if [[ "$hours" -lt 24 ]]; then + echo "${hours}h" + else + local days=$(( hours / 24 )) + local rem=$(( hours % 24 )) + echo "${days}d ${rem}h" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# AUDIT +# ══════════════════════════════════════════════════════════════════════ +do_audit() { + local query="/servers?" 
+ [[ -n "$LABEL_SELECTOR" ]] && query="${query}label_selector=$(urlencode "$LABEL_SELECTOR")&" + [[ -n "$SERVER_ID" ]] && query="/servers?id=${SERVER_ID}&" + + local servers + servers=$(fetch_all "${query%&}" "servers") + local server_count + server_count=$(echo "$servers" | jq 'length' 2>/dev/null || echo 0) + [[ "$server_count" -eq 0 ]] && die "No servers found" + + local snapshots + snapshots=$(fetch_all "/images?type=snapshot" "images") + + local now + now=$(date +%s) + local warnings=0 + local no_backup=0 + local stale=0 + local healthy=0 + local results="" + + while IFS=$'\t' read -r sid sname sstatus backup_enabled; do + [[ -z "$sid" ]] && continue + + # Find most recent snapshot for this server + local latest_snap + latest_snap=$(echo "$snapshots" | jq -r \ + --arg sid "$sid" \ + '[.[] | select(.created_from.id == ($sid | tonumber))] | sort_by(.created) | last | .created // empty' \ + 2>/dev/null) + + # Find most recent backup (backup type images) + local latest_backup_resp + latest_backup_resp=$(hcloud_api GET "/images?type=backup&sort=created:desc&page=1&per_page=1") + # Filter backups for this specific server + local server_backups + server_backups=$(fetch_all "/images?type=backup" "images") + local latest_backup + latest_backup=$(echo "$server_backups" | jq -r \ + --arg sid "$sid" \ + '[.[] | select(.created_from.id == ($sid | tonumber))] | sort_by(.created) | last | .created // empty' \ + 2>/dev/null) + + # Determine newest protection point + local newest="" + local newest_type="none" + if [[ -n "$latest_backup" && -n "$latest_snap" ]]; then + local bepoch sepoch + bepoch=$(iso_to_epoch "$latest_backup") + sepoch=$(iso_to_epoch "$latest_snap") + if [[ "$bepoch" -ge "$sepoch" ]]; then + newest="$latest_backup" + newest_type="backup" + else + newest="$latest_snap" + newest_type="snapshot" + fi + elif [[ -n "$latest_backup" ]]; then + newest="$latest_backup" + newest_type="backup" + elif [[ -n "$latest_snap" ]]; then + newest="$latest_snap" + 
newest_type="snapshot" + fi + + local age_h="—" + local status_flag="none" + if [[ -n "$newest" ]]; then + local nepoch + nepoch=$(iso_to_epoch "$newest") + age_h=$(age_hours "$nepoch") + if [[ "$age_h" -le "$MAX_AGE_HOURS" ]]; then + status_flag="ok" + ((healthy++)) || true + else + status_flag="stale" + ((stale++)) || true + ((warnings++)) || true + fi + else + ((no_backup++)) || true + ((warnings++)) || true + fi + + local backup_str="disabled" + [[ "$backup_enabled" == "true" ]] && backup_str="enabled" + + results="${results}${sid}\t${sname}\t${sstatus}\t${backup_str}\t${newest_type}\t${age_h}\t${status_flag}\n" + done < <(echo "$servers" | jq -r \ + '.[] | "\(.id)\t\(.name // "unknown")\t\(.status)\t\(.backup_window != null)"' \ + 2>/dev/null) + + case "$OUTPUT_FORMAT" in + json) + jq -n \ + --argjson servers "$server_count" \ + --argjson healthy "$healthy" \ + --argjson stale "$stale" \ + --argjson no_backup "$no_backup" \ + --argjson warnings "$warnings" \ + --argjson max_age "$MAX_AGE_HOURS" \ + '{servers: $servers, healthy: $healthy, stale: $stale, no_backup: $no_backup, warnings: $warnings, max_age_hours: $max_age}' + ;; + prometheus) + cat </dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No snapshots found" + + local now + now=$(date +%s) + + case "$OUTPUT_FORMAT" in + json) + echo "$snapshots" | jq '[.[] | { + id: .id, description: .description, + size_gb: .image_size, created: .created, + server_id: .created_from.id, server_name: .created_from.name + }]' + ;; + prometheus) + local stale_count=0 + while IFS=$'\t' read -r iid icreated; do + [[ -z "$iid" ]] && continue + local cepoch + cepoch=$(iso_to_epoch "$icreated") + local ah + ah=$(age_hours "$cepoch") + [[ "$ah" -gt "$MAX_AGE_HOURS" ]] && ((stale_count++)) || true + done < <(echo "$snapshots" | jq -r '.[] | "\(.id)\t\(.created)"' 2>/dev/null) + + cat </dev/null \ + | while IFS=$'\t' read -r iid idesc isize iserver icreated; do + local cepoch ah age_display age_color + cepoch=$(iso_to_epoch 
"$icreated") + ah=$(age_hours "$cepoch") + age_display=$(format_age "$ah") + age_color="$GREEN" + [[ "$ah" -gt "$MAX_AGE_HOURS" ]] && age_color="$YELLOW" + + printf " %-10s %-20s %-8s %-10s %-20s " \ + "$iid" "${idesc:0:18}" "${isize}GB" "${iserver:0:8}" "${icreated:0:19}" + echo -e "${age_color}${age_display}${RESET}" + done + + echo "" + field "Snapshots:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# BACKUPS +# ══════════════════════════════════════════════════════════════════════ +do_backups() { + local backups + backups=$(fetch_all "/images?type=backup&sort=created:desc" "images") + local total + total=$(echo "$backups" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No backups found" + + case "$OUTPUT_FORMAT" in + json) + echo "$backups" | jq '[.[] | { + id: .id, description: .description, + size_gb: .image_size, created: .created, + server_id: .created_from.id, server_name: .created_from.name + }]' + ;; + *) + section_header "Backups" + + printf " ${BOLD}%-10s %-20s %-8s %-10s %-20s %-8s${RESET}\n" \ + "ID" "DESCRIPTION" "SIZE" "SERVER" "CREATED" "AGE" + printf " %s\n" "$(printf '%.0s─' {1..80})" + + echo "$backups" | jq -r \ + '.[] | "\(.id)\t\(.description // "—")\t\(.image_size // 0)\t\(.created_from.name // "—")\t\(.created // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r iid idesc isize iserver icreated; do + local cepoch ah age_display age_color + cepoch=$(iso_to_epoch "$icreated") + ah=$(age_hours "$cepoch") + age_display=$(format_age "$ah") + age_color="$GREEN" + [[ "$ah" -gt "$MAX_AGE_HOURS" ]] && age_color="$YELLOW" + + printf " %-10s %-20s %-8s %-10s %-20s " \ + "$iid" "${idesc:0:18}" "${isize}GB" "${iserver:0:8}" "${icreated:0:19}" + echo -e "${age_color}${age_display}${RESET}" + done + + echo "" + field "Backups:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# 
══════════════════════════════════════════════════════════════════════ +show_help() { + cat < /var/lib/node_exporter/textfile/hetzner_backup.prom 2>/dev/null + +${BOLD}EXIT CODES${RESET} + 0 Success + 1 Runtime error +EOF +} + +# ══════════════════════════════════════════════════════════════════════ +# PARSE ARGS +# ══════════════════════════════════════════════════════════════════════ +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --audit) RUN_MODE="audit"; shift ;; + --snapshots) RUN_MODE="snapshots"; shift ;; + --backups) RUN_MODE="backups"; shift ;; + --server) SERVER_ID="${2:?--server requires an ID}"; shift 2 ;; + --label) LABEL_SELECTOR="${2:?--label requires KEY=VALUE}"; shift 2 ;; + --max-age) MAX_AGE_HOURS="${2:?--max-age requires HOURS}"; shift 2 ;; + --format) OUTPUT_FORMAT="${2:?--format requires a value}"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help|-h) setup_colors; show_help; exit 0 ;; + *) die "Unknown option: $1 (see --help)" ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + + if [[ -z "$RUN_MODE" ]]; then + RUN_MODE="audit" + fi + + check_deps + check_credentials + + START_TIME=$(date +%s) + + case "$RUN_MODE" in + audit) do_audit ;; + snapshots) do_snapshots ;; + backups) do_backups ;; + *) die "Unknown mode: ${RUN_MODE}" ;; + esac + + if [[ "$OUTPUT_FORMAT" != "prometheus" ]]; then + echo "" + field "Duration:" "$(elapsed)" + fi +} + +main "$@" diff --git a/hetzner-cost-monitor.sh b/hetzner-cost-monitor.sh new file mode 100644 index 0000000..dd72de1 --- /dev/null +++ b/hetzner-cost-monitor.sh @@ -0,0 +1,626 @@ +#!/usr/bin/env bash + +######################################################################################### +#### hetzner-cost-monitor.sh — Track and report Hetzner Cloud spending via the REST #### 
+#### API. Server, volume, snapshot, and load balancer costs with alert thresholds #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./hetzner-cost-monitor.sh --summary #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +OUTPUT_FORMAT="${HCM_FORMAT:-table}" +ALERT_THRESHOLD="${HCM_ALERT:-0}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +HCLOUD_TOKEN="${HCLOUD_TOKEN:-}" + +# ── State 
───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── API helpers ────────────────────────────────────────────────────── +hcloud_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/hcm_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer ${HCLOUD_TOKEN}" \ + -H "Content-Type: application/json" \ + "https://api.hetzner.cloud/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + if [[ "$http_code" =~ ^[45] ]]; then + local errmsg + errmsg=$(jq -r '.error.message // empty' /tmp/hcm_resp.json 2>/dev/null) + [[ -n "$errmsg" ]] && verbose "API error: ${errmsg}" + fi + + cat /tmp/hcm_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$HCLOUD_TOKEN" ]] && die "HCLOUD_TOKEN not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Pagination helper ──────────────────────────────────────────────── +fetch_all() { + local endpoint="$1" key="$2" + local page=1 per_page=50 all_data="[]" + while true; do + local sep="?" 
+ [[ "$endpoint" == *"?"* ]] && sep="&" + local resp + resp=$(hcloud_api GET "${endpoint}${sep}page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq ".${key} // []" 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < per_page )) && break + ((page++)) || true + done + echo "$all_data" +} + +# ── Cost calculation helpers ───────────────────────────────────────── +get_pricing() { + hcloud_api GET "/pricing" +} + +calc_server_costs() { + local servers="$1" pricing="$2" + local total="0" + + echo "$servers" | jq -r '.[] | "\(.server_type.name)\t\(.status)"' 2>/dev/null \ + | while IFS=$'\t' read -r stype status; do + if [[ "$status" == "running" ]]; then + local hourly + hourly=$(echo "$pricing" | jq -r \ + --arg t "$stype" \ + '.pricing.server_types[] | select(.name == $t) | .prices[0].price_hourly.gross' \ + 2>/dev/null) + [[ -n "$hourly" && "$hourly" != "null" ]] && echo "$hourly" + fi + done \ + | awk '{s+=$1} END {printf "%.2f", s * 730}' +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +do_summary() { + local pricing + pricing=$(get_pricing) + + local servers + servers=$(fetch_all "/servers" "servers") + local server_count + server_count=$(echo "$servers" | jq 'length' 2>/dev/null || echo 0) + local running_count + running_count=$(echo "$servers" | jq '[.[] | select(.status == "running")] | length' 2>/dev/null || echo 0) + + local volumes + volumes=$(fetch_all "/volumes" "volumes") + local volume_count + volume_count=$(echo "$volumes" | jq 'length' 2>/dev/null || echo 0) + local volume_gb + volume_gb=$(echo "$volumes" | jq '[.[].size] | add // 0' 2>/dev/null || echo 0) + + local images + images=$(fetch_all "/images?type=snapshot" "images") + 
local snapshot_count + snapshot_count=$(echo "$images" | jq 'length' 2>/dev/null || echo 0) + local snapshot_gb + snapshot_gb=$(echo "$images" | jq '[.[].image_size // 0] | add // 0' 2>/dev/null || echo 0) + + local lbs + lbs=$(fetch_all "/load_balancers" "load_balancers") + local lb_count + lb_count=$(echo "$lbs" | jq 'length' 2>/dev/null || echo 0) + + local ips + ips=$(fetch_all "/floating_ips" "floating_ips") + local ip_count + ip_count=$(echo "$ips" | jq 'length' 2>/dev/null || echo 0) + + # Monthly cost estimates from pricing API + local volume_price_gb + volume_price_gb=$(echo "$pricing" | jq -r '.pricing.volume.price_per_gb_per_month.gross // "0"' 2>/dev/null) + local volume_cost + volume_cost=$(awk "BEGIN {printf \"%.2f\", ${volume_gb} * ${volume_price_gb}}") + + local snapshot_price_gb + snapshot_price_gb=$(echo "$pricing" | jq -r '.pricing.image.price_per_gb_month.gross // "0"' 2>/dev/null) + local snapshot_cost + snapshot_cost=$(awk "BEGIN {printf \"%.2f\", ${snapshot_gb} * ${snapshot_price_gb}}") + + local ip_price + ip_price=$(echo "$pricing" | jq -r '.pricing.floating_ip.price_monthly.gross // "0"' 2>/dev/null) + local ip_cost + ip_cost=$(awk "BEGIN {printf \"%.2f\", ${ip_count} * ${ip_price}}") + + # Server costs — sum hourly × 730 hours/month + local server_cost="0.00" + while IFS=$'\t' read -r stype status; do + [[ -z "$stype" ]] && continue + if [[ "$status" == "running" ]]; then + local hourly + hourly=$(echo "$pricing" | jq -r \ + --arg t "$stype" \ + '[.pricing.server_types[] | select(.name == $t) | .prices[0].price_hourly.gross][0] // "0"' \ + 2>/dev/null) + server_cost=$(awk "BEGIN {printf \"%.2f\", ${server_cost} + (${hourly} * 730)}") + fi + done < <(echo "$servers" | jq -r '.[] | "\(.server_type.name)\t\(.status)"' 2>/dev/null) + + # LB costs + local lb_cost="0.00" + while IFS= read -r lbtype; do + [[ -z "$lbtype" ]] && continue + local lb_hourly + lb_hourly=$(echo "$pricing" | jq -r \ + --arg t "$lbtype" \ + 
'[.pricing.load_balancer_types[] | select(.name == $t) | .prices[0].price_hourly.gross][0] // "0"' \ + 2>/dev/null) + lb_cost=$(awk "BEGIN {printf \"%.2f\", ${lb_cost} + (${lb_hourly} * 730)}") + done < <(echo "$lbs" | jq -r '.[].load_balancer_type.name // empty' 2>/dev/null) + + local total_cost + total_cost=$(awk "BEGIN {printf \"%.2f\", ${server_cost} + ${volume_cost} + ${snapshot_cost} + ${lb_cost} + ${ip_cost}}") + + # Alert check + local alert_triggered="false" + if [[ "$ALERT_THRESHOLD" != "0" ]]; then + local over + over=$(awk "BEGIN {print (${total_cost} > ${ALERT_THRESHOLD}) ? 1 : 0}") + [[ "$over" == "1" ]] && alert_triggered="true" + fi + + case "$OUTPUT_FORMAT" in + json) + jq -n \ + --argjson servers "$server_count" \ + --argjson running "$running_count" \ + --argjson volumes "$volume_count" \ + --argjson volume_gb "$volume_gb" \ + --argjson snapshots "$snapshot_count" \ + --arg snapshot_gb "$snapshot_gb" \ + --argjson load_balancers "$lb_count" \ + --argjson floating_ips "$ip_count" \ + --arg server_cost "$server_cost" \ + --arg volume_cost "$volume_cost" \ + --arg snapshot_cost "$snapshot_cost" \ + --arg lb_cost "$lb_cost" \ + --arg ip_cost "$ip_cost" \ + --arg total_cost "$total_cost" \ + --arg alert_threshold "$ALERT_THRESHOLD" \ + --argjson alert_triggered "$alert_triggered" \ + '{ + servers: $servers, running: $running, + volumes: $volumes, volume_gb: $volume_gb, + snapshots: $snapshots, snapshot_gb: ($snapshot_gb | tonumber), + load_balancers: $load_balancers, floating_ips: $floating_ips, + monthly_estimate: { + servers: $server_cost, volumes: $volume_cost, + snapshots: $snapshot_cost, load_balancers: $lb_cost, + floating_ips: $ip_cost, total: $total_cost + }, + alert: { threshold: $alert_threshold, triggered: $alert_triggered } + }' + ;; + prometheus) + cat </dev/null || echo 0) + [[ "$server_count" -eq 0 ]] && die "No servers found" + + case "$OUTPUT_FORMAT" in + json) + echo "$servers" | jq '[.[] | { + id: .id, name: .name, status: .status, 
+ type: .server_type.name, location: .datacenter.location.name, + ip: .public_net.ipv4.ip + }]' + ;; + *) + section_header "Server Cost Breakdown" + + printf " ${BOLD}%-10s %-20s %-10s %-8s %-8s %10s${RESET}\n" \ + "ID" "NAME" "TYPE" "STATUS" "LOC" "MONTHLY €" + printf " %s\n" "$(printf '%.0s─' {1..70})" + + while IFS=$'\t' read -r sid sname stype status loc; do + [[ -z "$sid" ]] && continue + local hourly="0" + if [[ "$status" == "running" ]]; then + hourly=$(echo "$pricing" | jq -r \ + --arg t "$stype" \ + '[.pricing.server_types[] | select(.name == $t) | .prices[0].price_hourly.gross][0] // "0"' \ + 2>/dev/null) + fi + local monthly + monthly=$(awk "BEGIN {printf \"%.2f\", ${hourly} * 730}") + + local status_color="$GREEN" + [[ "$status" != "running" ]] && status_color="$DIM" + + printf " %-10s %-20s %-10s " "$sid" "${sname:0:18}" "$stype" + echo -ne "${status_color}" + printf "%-8s" "$status" + echo -ne "${RESET}" + printf " %-8s %10s\n" "${loc:0:6}" "$monthly" + done < <(echo "$servers" | jq -r \ + '.[] | "\(.id)\t\(.name // "unknown")\t\(.server_type.name // "—")\t\(.status)\t\(.datacenter.location.name // "—")"' \ + 2>/dev/null) + + echo "" + field "Servers:" "$server_count" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# RESOURCES +# ══════════════════════════════════════════════════════════════════════ +do_resources() { + case "$OUTPUT_FORMAT" in + json) + local volumes snapshots lbs ips + volumes=$(fetch_all "/volumes" "volumes") + snapshots=$(fetch_all "/images?type=snapshot" "images") + lbs=$(fetch_all "/load_balancers" "load_balancers") + ips=$(fetch_all "/floating_ips" "floating_ips") + jq -n \ + --argjson volumes "$volumes" \ + --argjson snapshots "$snapshots" \ + --argjson load_balancers "$lbs" \ + --argjson floating_ips "$ips" \ + '{volumes: $volumes, snapshots: $snapshots, load_balancers: $load_balancers, floating_ips: $floating_ips}' + ;; + *) + # Volumes + local volumes + volumes=$(fetch_all 
"/volumes" "volumes") + local vol_count + vol_count=$(echo "$volumes" | jq 'length' 2>/dev/null || echo 0) + + if [[ "$vol_count" -gt 0 ]]; then + section_header "Volumes" + printf " ${BOLD}%-12s %-20s %-8s %-10s %-8s${RESET}\n" \ + "ID" "NAME" "SIZE" "LOCATION" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..60})" + + echo "$volumes" | jq -r \ + '.[] | "\(.id)\t\(.name // "—")\t\(.size)GB\t\(.location.name // "—")\t\(.status // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r vid vname vsize vloc vstatus; do + printf " %-12s %-20s %-8s %-10s %-8s\n" \ + "$vid" "${vname:0:18}" "$vsize" "${vloc:0:8}" "$vstatus" + done + echo "" + field "Volumes:" "$vol_count" + fi + + # Snapshots + local snapshots + snapshots=$(fetch_all "/images?type=snapshot" "images") + local snap_count + snap_count=$(echo "$snapshots" | jq 'length' 2>/dev/null || echo 0) + + if [[ "$snap_count" -gt 0 ]]; then + section_header "Snapshots" + printf " ${BOLD}%-12s %-22s %-10s %-20s${RESET}\n" \ + "ID" "DESCRIPTION" "SIZE" "CREATED" + printf " %s\n" "$(printf '%.0s─' {1..66})" + + echo "$snapshots" | jq -r \ + '.[] | "\(.id)\t\(.description // "—")\t\(.image_size // 0)GB\t\(.created // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r iid idesc isize icreated; do + printf " %-12s %-22s %-10s %-20s\n" \ + "$iid" "${idesc:0:20}" "$isize" "${icreated:0:19}" + done + echo "" + field "Snapshots:" "$snap_count" + fi + + # Floating IPs + local ips + ips=$(fetch_all "/floating_ips" "floating_ips") + local ip_count + ip_count=$(echo "$ips" | jq 'length' 2>/dev/null || echo 0) + + if [[ "$ip_count" -gt 0 ]]; then + section_header "Floating IPs" + printf " ${BOLD}%-12s %-18s %-10s %-12s${RESET}\n" \ + "ID" "IP" "TYPE" "LOCATION" + printf " %s\n" "$(printf '%.0s─' {1..54})" + + echo "$ips" | jq -r \ + '.[] | "\(.id)\t\(.ip)\t\(.type)\t\(.home_location.name // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r fid fip ftype floc; do + printf " %-12s %-18s %-10s %-12s\n" \ + "$fid" "$fip" "$ftype" 
"$floc" + done + echo "" + field "Floating IPs:" "$ip_count" + fi + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat < /var/lib/node_exporter/textfile/hetzner_cost.prom 2>/dev/null + +${BOLD}EXIT CODES${RESET} + 0 Success + 1 Runtime error +EOF +} + +# ══════════════════════════════════════════════════════════════════════ +# PARSE ARGS +# ══════════════════════════════════════════════════════════════════════ +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --summary) RUN_MODE="summary"; shift ;; + --breakdown) RUN_MODE="breakdown"; shift ;; + --resources) RUN_MODE="resources"; shift ;; + --format) OUTPUT_FORMAT="${2:?--format requires a value}"; shift 2 ;; + --alert) ALERT_THRESHOLD="${2:?--alert requires a threshold}"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + --help|-h) setup_colors; show_help; exit 0 ;; + *) die "Unknown option: $1 (see --help)" ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + + if [[ -z "$RUN_MODE" ]]; then + RUN_MODE="summary" + fi + + check_deps + check_credentials + + START_TIME=$(date +%s) + + case "$RUN_MODE" in + summary) do_summary ;; + breakdown) do_breakdown ;; + resources) do_resources ;; + *) die "Unknown mode: ${RUN_MODE}" ;; + esac + + if [[ "$OUTPUT_FORMAT" != "prometheus" ]]; then + echo "" + field "Duration:" "$(elapsed)" + fi +} + +main "$@" diff --git a/hetzner-dns-manager.sh b/hetzner-dns-manager.sh new file mode 100644 index 0000000..dee8104 --- /dev/null +++ b/hetzner-dns-manager.sh @@ -0,0 +1,721 @@ +#!/usr/bin/env bash + +######################################################################################### +#### hetzner-dns-manager.sh — Manage DNS 
# ── Logging ───────────────────────────────────────────────────────────
# log      informational message on stdout.
# warn/err diagnostics on stderr.
# verbose  debug message, printed only when VERBOSE is "true". It goes to
#          stderr: hdns_api calls verbose while its stdout is being captured
#          as the JSON response, so a debug line on stdout would end up
#          inside the data handed to jq.
# die      print an error via err and exit with status 1.
log()     { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn()    { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()     { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "${VERBOSE:-false}" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*" >&2; fi; }
die()     { err "$*"; exit 1; }
#######################################
# Abort unless a Hetzner DNS API token is available.
# Globals: HETZNER_DNS_TOKEN (read)
# Returns: 0 when the token is non-empty; otherwise exits 1 via die.
#######################################
check_credentials() {
    # Written as an if-statement on purpose. With `[[ -z … ]] && die` as the
    # last command, the function returns 1 whenever the token IS set, and a
    # plain call from a script running under `set -e` then aborts.
    if [[ -z "${HETZNER_DNS_TOKEN:-}" ]]; then
        die "HETZNER_DNS_TOKEN not set"
    fi
}
// empty' 2>/dev/null) + if [[ -z "$zid" ]]; then + die "Zone not found: ${name}" + fi + echo "$zid" +} + +# ══════════════════════════════════════════════════════════════════════ +# ZONES +# ══════════════════════════════════════════════════════════════════════ +do_zones() { + local page=1 per_page=100 all_data="[]" + + while true; do + local resp + resp=$(hdns_api GET "/zones?page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq '.zones // []' 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < per_page )) && break + ((page++)) || true + done + + local total + total=$(echo "$all_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No zones found" + + case "$OUTPUT_FORMAT" in + json) + echo "$all_data" | jq '.' + ;; + prometheus) + cat </dev/null \ + | while IFS=$'\t' read -r zid name status rcount ttl; do + printf " %-34s %-18s %-10s %-8s %-8s\n" \ + "${zid:0:32}" "${name:0:16}" "$status" "$rcount" "$ttl" + done + + echo "" + field "Zones:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# RECORDS +# ══════════════════════════════════════════════════════════════════════ +do_records() { + [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN" + + local zid + zid=$(resolve_zone_id "$ZONE_NAME") + + local resp + resp=$(hdns_api GET "/records?zone_id=${zid}") + local records + records=$(echo "$resp" | jq '.records // []' 2>/dev/null) + local total + total=$(echo "$records" | jq 'length' 2>/dev/null || echo 0) + + case "$OUTPUT_FORMAT" in + json) + echo "$records" | jq '.' 
#######################################
# Create one DNS record in the zone named by ZONE_NAME.
# Globals:
#   ZONE_NAME, RECORD_TYPE, RECORD_NAME, RECORD_VALUE, RECORD_TTL (read)
#   ACTION_OK, ACTION_FAIL (incremented)
#   GREEN, RED, RESET (read)
# Outputs: one ✓/✗ status line on stdout.
# Returns: 0 after reporting the outcome; exits via die when a required
#          option is missing or RECORD_TTL is not a non-negative integer.
#######################################
do_add() {
    [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN"
    [[ -z "$RECORD_TYPE" ]] && die "Specify --type TYPE"
    [[ -z "$RECORD_NAME" ]] && die "Specify --name NAME"
    [[ -z "$RECORD_VALUE" ]] && die "Specify --value VALUE"
    # The TTL is handed to jq with --argjson, which requires valid JSON.
    # Validate here so a typo yields a clear message instead of a jq error.
    [[ "$RECORD_TTL" =~ ^[0-9]+$ ]] \
        || die "Invalid TTL (expected a non-negative integer): ${RECORD_TTL}"

    local zid
    zid=$(resolve_zone_id "$ZONE_NAME")

    local payload
    # 10# forces base 10 so a value such as 0300 becomes the JSON number 300.
    payload=$(jq -n \
        --arg zid "$zid" \
        --arg type "$RECORD_TYPE" \
        --arg name "$RECORD_NAME" \
        --arg value "$RECORD_VALUE" \
        --argjson ttl "$((10#$RECORD_TTL))" \
        '{zone_id: $zid, type: $type, name: $name, value: $value, ttl: $ttl}')

    # A failed request or an unparsable body must be reported as a failed
    # action, not abort the script, so both steps are guarded explicitly.
    local resp="" rid=""
    if resp=$(hdns_api POST "/records" -d "$payload"); then
        rid=$(jq -r '.record.id // empty' <<<"$resp" 2>/dev/null) || rid=""
    else
        resp=""
    fi

    if [[ -n "$rid" ]]; then
        echo -e " ${GREEN}✓${RESET} Record created: ${RECORD_TYPE} ${RECORD_NAME} → ${RECORD_VALUE} (ID: ${rid})"
        ((ACTION_OK++)) || true
    else
        local errmsg
        errmsg=$(jq -r '.error.message // .message // "unknown error"' <<<"$resp" 2>/dev/null) || errmsg=""
        echo -e " ${RED}✗${RESET} Failed to create record: ${errmsg:-unknown error}"
        ((ACTION_FAIL++)) || true
    fi
}
#######################################
# Delete the DNS record identified by RECORD_ID.
# Globals:
#   RECORD_ID, FORCE (read)
#   ACTION_OK, ACTION_FAIL (incremented)
#   GREEN, RED, RESET (read)
# Outputs: one ✓/✗ status line on stdout.
# Returns: 0 after reporting the outcome; exits via die when RECORD_ID is
#          missing or --force was not given.
#######################################
do_delete() {
    [[ -z "$RECORD_ID" ]] && die "Specify --record-id ID"
    [[ "$FORCE" != "true" ]] && die "Delete is destructive — use --force to confirm"

    # hdns_api returns 0 for HTTP 4xx/5xx and only hands back the body, so the
    # body is checked for an error message, using the same
    # `.error.message // .message` lookup the rest of this file uses.
    local resp="" errmsg=""
    if resp=$(hdns_api DELETE "/records/${RECORD_ID}"); then
        errmsg=$(jq -r '.error.message // .message // empty' <<<"$resp" 2>/dev/null) || errmsg=""
    else
        errmsg="API request failed"
    fi

    if [[ -n "$errmsg" ]]; then
        echo -e " ${RED}✗${RESET} Failed to delete record ${RECORD_ID}: ${errmsg}"
        ((ACTION_FAIL++)) || true
    else
        echo -e " ${GREEN}✓${RESET} Record deleted: ${RECORD_ID}"
        ((ACTION_OK++)) || true
    fi
}
#######################################
# Create many records from a CSV file with rows of the form
#   TYPE,NAME,VALUE[,TTL]
# Blank rows and rows whose first field starts with '#' are skipped. A
# missing TTL defaults to 3600. Fields are split on commas only; there is no
# CSV quoting, so a VALUE cannot itself contain a comma.
# Globals:
#   ZONE_NAME, CSV_FILE (read)
#   ACTION_OK, ACTION_FAIL (incremented)
#   GREEN, RED, RESET (read)
# Outputs: one ✓/✗ line per processed row, then a summary, on stdout.
# Returns: 0 after the summary; exits via die on missing options or file.
#######################################
do_bulk_add() {
    [[ -z "$ZONE_NAME" ]] && die "Specify --zone DOMAIN"
    [[ -z "$CSV_FILE" ]] && die "Specify --csv FILE"
    [[ ! -f "$CSV_FILE" ]] && die "CSV file not found: ${CSV_FILE}"

    local zid
    zid=$(resolve_zone_id "$ZONE_NAME")

    section_header "Bulk Add — ${ZONE_NAME}"

    local line_num=0
    local rtype rname rvalue rttl var val payload resp rid
    # `|| [[ -n … ]]` keeps a last row that has no trailing newline.
    while IFS=',' read -r rtype rname rvalue rttl || [[ -n "$rtype" ]]; do
        ((line_num++)) || true

        # Trim surrounding whitespace (including a CR from CRLF files) with
        # parameter expansion. Piping through xargs would also strip or choke
        # on quotes, which TXT record values legitimately contain.
        for var in rtype rname rvalue rttl; do
            val="${!var}"
            val="${val#"${val%%[![:space:]]*}"}"
            val="${val%"${val##*[![:space:]]}"}"
            printf -v "$var" '%s' "$val"
        done

        [[ -z "$rtype" || "$rtype" == \#* ]] && continue
        rttl="${rttl:-3600}"

        # --argjson needs valid JSON; a bad TTL fails this row only.
        if [[ ! "$rttl" =~ ^[0-9]+$ ]]; then
            echo -e " ${RED}✗${RESET} ${rtype} ${rname} → ${rvalue} (line ${line_num}: invalid TTL '${rttl}')"
            ((ACTION_FAIL++)) || true
            continue
        fi

        payload=$(jq -n \
            --arg zid "$zid" \
            --arg type "$rtype" \
            --arg name "$rname" \
            --arg value "$rvalue" \
            --argjson ttl "$((10#$rttl))" \
            '{zone_id: $zid, type: $type, name: $name, value: $value, ttl: $ttl}')

        # One failed request or unparsable reply must not abort the batch.
        rid=""
        if resp=$(hdns_api POST "/records" -d "$payload"); then
            rid=$(jq -r '.record.id // empty' <<<"$resp" 2>/dev/null) || rid=""
        fi

        if [[ -n "$rid" ]]; then
            echo -e " ${GREEN}✓${RESET} ${rtype} ${rname} → ${rvalue} (line ${line_num})"
            ((ACTION_OK++)) || true
        else
            echo -e " ${RED}✗${RESET} ${rtype} ${rname} → ${rvalue} (line ${line_num})"
            ((ACTION_FAIL++)) || true
        fi

        # Pause between requests to stay clear of the API rate limit.
        sleep 0.5
    done < "$CSV_FILE"

    echo ""
    field_color "Succeeded:" "${GREEN}${ACTION_OK}${RESET}"
    if [[ "$ACTION_FAIL" -gt 0 ]]; then
        field_color "Failed:" "${RED}${ACTION_FAIL}${RESET}"
    fi
}
jq '[.[] | select(.type == "SOA")] | length' 2>/dev/null || echo 0) + if [[ "$soa_count" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} No SOA record found" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} SOA record present" + fi + + # Check NS + local ns_count + ns_count=$(echo "$records" | jq '[.[] | select(.type == "NS")] | length' 2>/dev/null || echo 0) + if [[ "$ns_count" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${RED}✗${RESET} No NS records found" + elif [[ "$ns_count" -lt 2 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} Only ${ns_count} NS record(s) — recommend at least 2" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} ${ns_count} NS records" + fi + + # Check common types + for rtype in A AAAA MX TXT; do + local rcount + rcount=$(echo "$records" | jq --arg t "$rtype" '[.[] | select(.type == $t)] | length' 2>/dev/null || echo 0) + if [[ "$rcount" -eq 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} No ${rtype} records found" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} ${rcount} ${rtype} record(s)" + fi + done + + # Check low TTLs + local low_ttl + low_ttl=$(echo "$records" | jq '[.[] | select(.ttl < 300 and .ttl > 0)] | length' 2>/dev/null || echo 0) + if [[ "$low_ttl" -gt 0 ]]; then + ((warnings++)) || true + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${YELLOW}⚠${RESET} ${low_ttl} record(s) with TTL < 300s" + else + [[ "$OUTPUT_FORMAT" != "prometheus" ]] && echo -e " ${GREEN}✓${RESET} All TTLs ≥ 300s" + fi + + # Check wildcards + local wildcard + wildcard=$(echo "$records" | jq '[.[] | select(.name | startswith("*"))] | length' 2>/dev/null || echo 0) + if [[ "$wildcard" -gt 0 ]]; then + [[ "$OUTPUT_FORMAT" != 
"prometheus" ]] && echo -e " ${CYAN}ℹ${RESET} ${wildcard} wildcard record(s)" + fi + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +# ── Severity counters ──────────────────────────────────────────────── +TOTAL_CRIT=0 +TOTAL_WARN=0 +TOTAL_INFO=0 +TOTAL_OK=0 + +flag_crit() { ((TOTAL_CRIT++)) || true; } +flag_warn() { ((TOTAL_WARN++)) || true; } +flag_info() { ((TOTAL_INFO++)) || true; } +flag_ok() { ((TOTAL_OK++)) || true; } + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +DANGEROUS_PORTS="${DANGEROUS_PORTS:-22,3389,3306,5432,1433,6379,27017,9200,8080,8443}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +HCLOUD_TOKEN="${HCLOUD_TOKEN:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" + +# ── API helpers ────────────────────────────────────────────────────── +hetzner_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/hfa_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer ${HCLOUD_TOKEN}" \ + -H "Content-Type: application/json" \ + "https://api.hetzner.cloud/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + cat /tmp/hfa_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z 
"$HCLOUD_TOKEN" ]] && die "HCLOUD_TOKEN not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Server helpers ─────────────────────────────────────────────────── +get_all_servers() { + local page=1 per_page=50 result="[]" + while true; do + local resp + resp=$(hetzner_api GET "/servers?page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq '.servers // []' 2>/dev/null) + local count + count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$count" -eq 0 ]] && break + result=$(echo "$result" "$page_data" | jq -s '.[0] + .[1]') + local total + total=$(echo "$resp" | jq '.meta.pagination.total_entries // 0' 2>/dev/null) + (( page * per_page >= total )) && break + ((page++)) || true + done + echo "$result" +} + +# ── Firewall helpers ───────────────────────────────────────────────── +get_all_firewalls() { + local page=1 per_page=50 result="[]" + while true; do + local resp + resp=$(hetzner_api GET "/firewalls?page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq '.firewalls // []' 2>/dev/null) + local count + count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$count" -eq 0 ]] && break + result=$(echo "$result" "$page_data" | jq -s '.[0] + .[1]') + local total + total=$(echo "$resp" | jq '.meta.pagination.total_entries // 0' 2>/dev/null) + (( page * per_page >= total )) && break + ((page++)) || true + done + echo "$result" +} + +# ── Port-to-service mapping ───────────────────────────────────────── +port_to_service() { + local port="$1" + case "$port" in + 22) echo "SSH" ;; + 80) echo "HTTP" ;; + 443) echo "HTTPS" ;; + 3306) echo "MySQL" ;; + 5432) echo "PostgreSQL" ;; + 1433) echo "MSSQL" ;; + 3389) echo "RDP" ;; + 6379) echo "Redis" ;; + 27017) echo "MongoDB" ;; + 9200) echo "Elasticsearch" ;; + 8080) echo "HTTP-Alt" ;; + 8443) echo "HTTPS-Alt" ;; + 53) echo "DNS" ;; + 25) echo 
"SMTP" ;; + 5900) echo "VNC" ;; + 11211) echo "Memcached" ;; + 2379) echo "etcd" ;; + 9090) echo "Prometheus" ;; + *) echo "" ;; + esac +} + +# ── Check if port is in dangerous list ─────────────────────────────── +is_dangerous_port() { + local port="$1" + local IFS=',' + for dp in $DANGEROUS_PORTS; do + if [[ "$port" == "$dp" ]]; then + return 0 + fi + done + return 1 +} + +# ══════════════════════════════════════════════════════════════════════ +# OPEN PORTS AUDIT +# ══════════════════════════════════════════════════════════════════════ +audit_open_ports() { + log "Auditing firewall rules for dangerous open ports..." + log "Dangerous ports: ${DANGEROUS_PORTS}" + echo "" + + printf " %-10s %-22s %-8s %-8s %-18s %-12s %s\n" \ + "FW_ID" "FW_NAME" "PORT" "PROTO" "SOURCE" "SERVICE" "SEVERITY" + printf " %s\n" "$(printf '%.0s─' {1..95})" + + local fw_json + fw_json=$(get_all_firewalls) + + echo "$fw_json" | jq -c '.[]' 2>/dev/null | while IFS= read -r fw; do + local fw_id fw_name + fw_id=$(echo "$fw" | jq -r '.id' 2>/dev/null) + fw_name=$(echo "$fw" | jq -r '.name // "unnamed"' 2>/dev/null) + + echo "$fw" | jq -c '.rules[]? // empty' 2>/dev/null | while IFS= read -r rule; do + local direction protocol port_str + direction=$(echo "$rule" | jq -r '.direction // "in"' 2>/dev/null) + protocol=$(echo "$rule" | jq -r '.protocol // "tcp"' 2>/dev/null) + port_str=$(echo "$rule" | jq -r '.port // ""' 2>/dev/null) + + [[ "$direction" != "in" ]] && continue + + local has_open="false" + while IFS= read -r src; do + if [[ "$src" == "0.0.0.0/0" || "$src" == "::/0" ]]; then + has_open="true" + break + fi + done < <(echo "$rule" | jq -r '.source_ips[]? 
#######################################
# List every server and flag those that no firewall is applied to.
# A server counts as protected when its id appears in some firewall's
# applied_to list with type "server".
# Globals:   RED, GREEN, RESET (read); the severity counters are updated
#            through flag_crit / flag_ok
# Outputs:   a table on stdout, one row per server
#######################################
audit_unprotected() {
    log "Checking for servers without firewalls..."
    echo ""

    printf " %-10s %-22s %-16s %-10s %s\n" \
        "SRV_ID" "NAME" "IP" "STATUS" "FIREWALL"
    printf " %s\n" "$(printf '%.0s─' {1..75})"

    local servers fw_json protected_ids
    servers=$(get_all_servers)
    fw_json=$(get_all_firewalls)

    # One id per line; stays empty when nothing is protected or jq fails.
    protected_ids=$(jq -r \
        '[.[].applied_to[]? | select(.type == "server") | .server.id] | unique | .[]' \
        <<<"$fw_json" 2>/dev/null || true)

    # The loop reads from process substitution rather than a pipeline so it
    # runs in the current shell; in a pipeline the flag_* increments would
    # happen in a subshell and never reach the summary. One jq call emits
    # all four fields per server as a tab-separated row.
    local sid sname ip status
    while IFS=$'\t' read -r sid sname ip status; do
        # -x -F: whole-line literal match, so id 1 does not match 12.
        if grep -qxF -- "$sid" <<<"$protected_ids"; then
            printf " %-10s %-22s %-16s %-10s %b%s%b\n" \
                "$sid" "${sname:0:20}" "$ip" "$status" \
                "$GREEN" "✓ Protected" "$RESET"
            flag_ok
        else
            printf " %-10s %-22s %-16s %-10s %b%s%b\n" \
                "$sid" "${sname:0:20}" "$ip" "$status" \
                "$RED" "NONE — UNPROTECTED" "$RESET"
            flag_crit
        fi
    done < <(jq -r \
        '.[] | "\(.id)\t\(.name // "unknown")\t\(.public_net.ipv4.ip // "N/A")\t\(.status // "unknown")"' \
        <<<"$servers" 2>/dev/null)

    echo ""
}
#######################################
# List firewalls that are not applied to any resource.
# Globals:   YELLOW, RESET (read); the severity counters are updated
#            through flag_warn / flag_ok
# Outputs:   a table on stdout with one row per unused firewall; firewalls
#            that are in use are only mentioned through verbose
#######################################
audit_unused() {
    log "Checking for unused firewalls..."
    echo ""

    printf " %-10s %-28s %-8s %s\n" \
        "FW_ID" "FW_NAME" "RULES" "STATUS"
    printf " %s\n" "$(printf '%.0s─' {1..60})"

    local fw_json
    fw_json=$(get_all_firewalls)

    # The loop reads from process substitution rather than a pipeline so it
    # runs in the current shell; in a pipeline the flag_* increments would
    # happen in a subshell and never reach the summary. One jq call emits
    # id, name, rule count and applied_to count as a tab-separated row.
    local fw_id fw_name rule_count applied_count
    while IFS=$'\t' read -r fw_id fw_name rule_count applied_count; do
        if [[ "$applied_count" -eq 0 ]]; then
            printf " %-10s %-28s %-8s %b%s%b\n" \
                "$fw_id" "${fw_name:0:26}" "$rule_count" \
                "$YELLOW" "UNUSED" "$RESET"
            flag_warn
        else
            verbose "Firewall ${fw_id} (${fw_name}): applied to ${applied_count} resource(s)"
            flag_ok
        fi
    done < <(jq -r \
        '.[] | "\(.id)\t\(.name // "unnamed")\t\([.rules[]?] | length)\t\([.applied_to[]?] | length)"' \
        <<<"$fw_json" 2>/dev/null)

    echo ""
}
// empty' 2>/dev/null | head -1) + fi + [[ -z "$cidr_list" ]] && cidr_list="any" + + local svc="" + if [[ "$port_str" =~ ^[0-9]+$ ]]; then + svc=$(port_to_service "$port_str") + fi + + local dir_color="$CYAN" + [[ "$direction" == "out" ]] && dir_color="$YELLOW" + + printf " %-10s %-20s %b%-8s%b %-8s %-12s %-18s %s\n" \ + "$fw_id" "${fw_name:0:18}" "$dir_color" "$direction" "$RESET" \ + "$protocol" "${port_str:0:10}" "${cidr_list:0:16}" "${svc}" + done + done + + echo "" +} + +# ══════════════════════════════════════════════════════════════════════ +# SUMMARY +# ══════════════════════════════════════════════════════════════════════ +print_summary() { + local elapsed + elapsed=$(( $(date +%s) - START_TIME )) + + echo "" + echo " ══════════════════════════════════════════" + echo " Firewall Audit Summary" + echo " ══════════════════════════════════════════" + printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET" + printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET" + printf " %-20s %b%d%b\n" "INFO:" "$CYAN" "$TOTAL_INFO" "$RESET" + printf " %-20s %b%d%b\n" "OK:" "$GREEN" "$TOTAL_OK" "$RESET" + echo " ──────────────────────────────────────────" + printf " Completed in %ds\n" "$elapsed" + echo "" + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)" + echo "" + echo " Top recommendations:" + echo " • Assign firewalls to all unprotected servers" + echo " • Close 0.0.0.0/0 rules on SSH (22), RDP (3389), and database ports" + echo " • Replace all-port allow rules with specific port lists" + echo " • Remove unused firewalls to reduce configuration sprawl" + echo "" + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)" + echo "" + echo " Suggestions:" + echo " • Review wide CIDR rules and narrow where possible" + echo " • Delete unused firewalls" + echo " • Restrict outbound where applicable" + echo "" + else + echo -e " 
${GREEN}All checks passed${RESET}" + echo "" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2 + exit 1 + fi + + RUN_MODE="${modes[*]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ +main() { + parse_args "$@" + setup_colors + check_deps + check_credentials + + START_TIME=$(date +%s) + + echo "" + echo -e "${BOLD}Hetzner Cloud Firewall Auditor${RESET}" + echo -e "Mode: ${RUN_MODE}" + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "" + + for mode in $RUN_MODE; do + case "$mode" in + open-ports) audit_open_ports ;; + unprotected) audit_unprotected ;; + permissive) audit_permissive ;; + unused) audit_unused ;; + rules) list_rules ;; + esac + done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + exit 2 + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/hetzner-fleet-manager.sh b/hetzner-fleet-manager.sh new file mode 100755 index 0000000..8667902 --- /dev/null +++ b/hetzner-fleet-manager.sh @@ -0,0 +1,613 @@ +#!/usr/bin/env bash + +######################################################################################### +#### hetzner-fleet-manager.sh — Inventory, health checks, and bulk operations for #### +#### Hetzner Cloud servers via the REST API. Fleet-wide visibility and control #### +#### Requires: bash 4+, curl, jq #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./hetzner-fleet-manager.sh --inventory --all #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +SERVER_ID="" +TARGET_ALL="false" +LABEL_SELECTOR="" +LABEL_SUB_MODE="" +OUTPUT_FORMAT="${HFM_FORMAT:-table}" +PING_CHECK="false" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +HCLOUD_TOKEN="${HCLOUD_TOKEN:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +ACTION_OK=0 +ACTION_FAIL=0 + +# ── API helpers ────────────────────────────────────────────────────── +hcloud_api() { + local method="$1" endpoint="$2" + shift 2 + local attempt=0 max_attempts=3 + + while (( attempt < 
max_attempts )); do + local http_code + http_code=$(curl -s -o /tmp/hfm_resp.json -w "%{http_code}" \ + -X "$method" \ + -H "Authorization: Bearer ${HCLOUD_TOKEN}" \ + -H "Content-Type: application/json" \ + "https://api.hetzner.cloud/v1${endpoint}" "$@") + + verbose "API ${method} ${endpoint} → HTTP ${http_code}" + + if [[ "$http_code" == "429" ]]; then + ((attempt++)) || true + local wait=$(( attempt * 5 )) + warn "Rate limited — retrying in ${wait}s (attempt ${attempt}/${max_attempts})" + sleep "$wait" + continue + fi + + if [[ "$http_code" =~ ^[45] ]]; then + local errmsg + errmsg=$(jq -r '.error.message // empty' /tmp/hfm_resp.json 2>/dev/null) + [[ -n "$errmsg" ]] && verbose "API error: ${errmsg}" + fi + + cat /tmp/hfm_resp.json + return 0 + done + + err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}" + return 1 +} + +check_credentials() { + [[ -z "$HCLOUD_TOKEN" ]] && die "HCLOUD_TOKEN not set" +} + +check_deps() { + command -v curl &>/dev/null || die "curl is required" + command -v jq &>/dev/null || die "jq is required" +} + +# ── Server helpers ─────────────────────────────────────────────────── +get_all_server_ids() { + local page=1 per_page=50 ids="" + while true; do + local resp + resp=$(hcloud_api GET "/servers?page=${page}&per_page=${per_page}") + local page_ids + page_ids=$(echo "$resp" | jq -r '.servers[].id' 2>/dev/null) + [[ -z "$page_ids" ]] && break + ids="${ids}${ids:+$'\n'}${page_ids}" + local count + count=$(echo "$page_ids" | wc -l) + (( count < per_page )) && break + ((page++)) || true + done + echo "$ids" +} + +get_server_ids_by_label() { + local selector="$1" + local page=1 per_page=50 ids="" + while true; do + local resp + resp=$(hcloud_api GET "/servers?page=${page}&per_page=${per_page}&label_selector=$(urlencode "$selector")") + local page_ids + page_ids=$(echo "$resp" | jq -r '.servers[].id' 2>/dev/null) + [[ -z "$page_ids" ]] && break + ids="${ids}${ids:+$'\n'}${page_ids}" + local count + count=$(echo 
"$page_ids" | wc -l) + (( count < per_page )) && break + ((page++)) || true + done + echo "$ids" +} + +urlencode() { + local string="$1" + python3 -c "import urllib.parse; print(urllib.parse.quote('$string', safe=''))" 2>/dev/null \ + || echo "$string" +} + +get_server_name() { + local sid="$1" + hcloud_api GET "/servers/${sid}" \ + | jq -r '.server.name // "unknown"' 2>/dev/null +} + +get_server_ids() { + if [[ "$TARGET_ALL" == "true" ]]; then + get_all_server_ids + elif [[ -n "$SERVER_ID" ]]; then + echo "$SERVER_ID" + elif [[ -n "$LABEL_SELECTOR" ]]; then + get_server_ids_by_label "$LABEL_SELECTOR" + else + die "Specify --server ID, --all, or --label-selector KEY=VALUE" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# INVENTORY +# ══════════════════════════════════════════════════════════════════════ +do_inventory() { + local page=1 per_page=50 all_data="[]" + + local query="/servers?page=${page}&per_page=${per_page}" + [[ -n "$LABEL_SELECTOR" ]] && query="${query}&label_selector=$(urlencode "$LABEL_SELECTOR")" + + while true; do + local resp + resp=$(hcloud_api GET "$query") + local page_data + page_data=$(echo "$resp" | jq '.servers // []' 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < per_page )) && break + ((page++)) || true + query="/servers?page=${page}&per_page=${per_page}" + [[ -n "$LABEL_SELECTOR" ]] && query="${query}&label_selector=$(urlencode "$LABEL_SELECTOR")" + done + + local total + total=$(echo "$all_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$total" -eq 0 ]] && die "No servers found" + + case "$OUTPUT_FORMAT" in + json) + echo "$all_data" | jq '.' 
+ ;; + ansible) + echo "[hetzner]" + echo "$all_data" | jq -r \ + '.[] | (.public_net.ipv4.ip // "unknown") + " # " + (.name // "unknown") + " id=" + (.id | tostring)' \ + 2>/dev/null + ;; + *) + section_header "Fleet Inventory" + + printf " ${BOLD}%-10s %-20s %-13s %-16s %-10s %-10s${RESET}\n" \ + "ID" "NAME" "STATUS" "IP" "LOCATION" "TYPE" + printf " %s\n" "$(printf '%.0s─' {1..81})" + + echo "$all_data" | jq -r \ + '.[] | "\(.id)\t\(.name // "unknown")\t\(.status // "unknown")\t\(.public_net.ipv4.ip // "—")\t\(.datacenter.location.name // "—")\t\(.server_type.name // "—")"' \ + 2>/dev/null \ + | while IFS=$'\t' read -r sid name status ip location stype; do + printf " %-10s %-20s %-13s %-16s %-10s %-10s\n" \ + "$sid" "${name:0:18}" "$status" "$ip" "${location:0:8}" "$stype" + done + + echo "" + field "Total:" "$total" + ;; + esac +} + +# ══════════════════════════════════════════════════════════════════════ +# HEALTH +# ══════════════════════════════════════════════════════════════════════ +do_health() { + local ids + ids=$(get_server_ids) + [[ -z "$ids" ]] && die "No servers found" + + local running=0 stopped=0 errored=0 total_servers=0 + local results="" + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + ((total_servers++)) || true + + local resp + resp=$(hcloud_api GET "/servers/${sid}") + local name status ip + name=$(echo "$resp" | jq -r '.server.name // "unknown"' 2>/dev/null) + status=$(echo "$resp" | jq -r '.server.status // "unknown"' 2>/dev/null) + ip=$(echo "$resp" | jq -r '.server.public_net.ipv4.ip // ""' 2>/dev/null) + + local ping_result="—" + if [[ "$PING_CHECK" == "true" && -n "$ip" ]]; then + if ping -c 1 -W 3 "$ip" &>/dev/null; then + ping_result="reachable" + else + ping_result="unreachable" + fi + fi + + case "$status" in + running) ((running++)) || true ;; + off) ((stopped++)) || true ;; + *) ((errored++)) || true ;; + esac + + results="${results}${sid}\t${name}\t${status}\t${ip}\t${ping_result}\n" + done <<< "$ids" + + if [[ 
"$OUTPUT_FORMAT" == "prometheus" ]]; then + cat < /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} ${sname} (${sid}) ${action} sent" + ((ACTION_OK++)) || true + else + echo -e " ${RED}✗${RESET} ${sname} (${sid}) ${action} failed" + ((ACTION_FAIL++)) || true + fi + + sleep 1 + done <<< "$ids" + + echo "" + field_color "Succeeded:" "${GREEN}${ACTION_OK}${RESET}" + if [[ "$ACTION_FAIL" -gt 0 ]]; then + field_color "Failed:" "${RED}${ACTION_FAIL}${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# LABELS +# ══════════════════════════════════════════════════════════════════════ +do_labels() { + if [[ "$LABEL_SUB_MODE" == "list" ]]; then + local page=1 per_page=50 all_data="[]" + while true; do + local resp + resp=$(hcloud_api GET "/servers?page=${page}&per_page=${per_page}") + local page_data + page_data=$(echo "$resp" | jq '.servers // []' 2>/dev/null) + local page_count + page_count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$page_count" -eq 0 ]] && break + all_data=$(echo -e "${all_data}\n${page_data}" | jq -s 'add' 2>/dev/null) + (( page_count < per_page )) && break + ((page++)) || true + done + + local labels_json + labels_json=$(echo "$all_data" | jq '[.[].labels // {} | to_entries[]] | group_by(.key) | map({key: .[0].key, values: (map(.value) | unique)})' 2>/dev/null) + + if [[ "$OUTPUT_FORMAT" == "json" ]]; then + echo "$labels_json" | jq '.' 
+ return + fi + + section_header "Labels" + + printf " ${BOLD}%-25s %-50s${RESET}\n" "KEY" "VALUES" + printf " %s\n" "$(printf '%.0s─' {1..77})" + + echo "$labels_json" | jq -r '.[] | "\(.key)\t\(.values | join(", "))"' 2>/dev/null \ + | while IFS=$'\t' read -r lkey lvals; do + printf " %-25s %-50s\n" "${lkey:0:23}" "${lvals:0:48}" + done + + elif [[ "$LABEL_SUB_MODE" == "filter" ]]; then + [[ -z "$LABEL_SELECTOR" ]] && die "Specify --label-selector KEY=VALUE with --filter" + do_inventory + else + die "Specify --list or --filter with --labels" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +ALSO_ROTATE="false" +SERVER_ID="" +TARGET_ALL="false" +SNAPSHOT_ID="" +KEEP="${HSM_KEEP:-3}" +PREFIX="${HSM_PREFIX:-auto}" +MAX_AGE="${HSM_MAX_AGE:-7}" +OUTPUT_FORMAT="${HSM_FORMAT:-text}" +DRY_RUN="true" +FORCE="false" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── Credentials ─────────────────────────────────────────────────────── +HCLOUD_TOKEN="${HCLOUD_TOKEN:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +SNAP_CREATED=0 +SNAP_DELETED=0 +SNAP_ERRORS=0 + +# ── Cost constant ──────────────────────────────────────────────────── 
COST_PER_GB_MONTH="0.012"

# ── API helpers ──────────────────────────────────────────────────────

#######################################
# Perform one Hetzner Cloud API request and print the response body.
# Globals:   HCLOUD_TOKEN (read)
# Arguments: $1 - HTTP method (GET, POST, DELETE, ...)
#            $2 - endpoint path appended to https://api.hetzner.cloud/v1
#            $@ - any further arguments are passed to curl unchanged
# Outputs:   response body on stdout; diagnostics on stderr
# Returns:   0 once a non-429 HTTP response was received (including 4xx/5xx,
#            whose error body callers inspect themselves); 1 if curl itself
#            fails, no temp file can be created, or the request is still
#            rate limited after 3 attempts
#######################################
hetzner_api() {
  local method="$1" endpoint="$2"
  shift 2
  local attempt=0 max_attempts=3
  local resp_file http_code curl_rc delay

  # Private per-call response file: avoids a predictable name in /tmp and
  # guarantees a failed or body-less request never replays an earlier body.
  resp_file=$(mktemp "${TMPDIR:-/tmp}/hsm_resp.XXXXXX") || {
    err "Unable to create temporary file for API response"
    return 1
  }

  while (( attempt < max_attempts )); do
    curl_rc=0
    http_code=$(curl -s -o "$resp_file" -w "%{http_code}" \
      -X "$method" \
      -H "Authorization: Bearer ${HCLOUD_TOKEN}" \
      -H "Content-Type: application/json" \
      "https://api.hetzner.cloud/v1${endpoint}" "$@") || curl_rc=$?

    # Transport-level failure (DNS, TLS, connection refused, ...): there is
    # no HTTP response to hand back, so report it instead of returning 0.
    if (( curl_rc != 0 )); then
      rm -f -- "$resp_file"
      err "API request failed (curl exit ${curl_rc}): ${method} ${endpoint}"
      return 1
    fi

    # verbose prints to stdout; keep it out of the body callers capture.
    verbose "API ${method} ${endpoint} → HTTP ${http_code}" >&2

    if [[ "$http_code" == "429" ]]; then
      attempt=$(( attempt + 1 ))
      delay=$(( attempt * 5 ))   # linear back-off: 5s, 10s, 15s
      warn "Rate limited — retrying in ${delay}s (attempt ${attempt}/${max_attempts})"
      sleep "$delay"
      continue
    fi

    cat -- "$resp_file"
    rm -f -- "$resp_file"
    return 0
  done

  rm -f -- "$resp_file"
  err "API request failed after ${max_attempts} attempts: ${method} ${endpoint}"
  return 1
}

#######################################
# Abort (via die) when no API token is configured.
# Globals:   HCLOUD_TOKEN (read)
# Returns:   0 when the token is non-empty
#######################################
check_credentials() {
  # Written as an if-statement on purpose: a trailing `[[ ... ]] && die`
  # makes the function return 1 when the token IS set, which terminates a
  # caller that runs under `set -e`.
  if [[ -z "$HCLOUD_TOKEN" ]]; then
    die "HCLOUD_TOKEN not set"
  fi
}

#######################################
# Abort (via die) unless curl and jq are available on PATH.
#######################################
check_deps() {
  command -v curl &>/dev/null || die "curl is required"
  command -v jq &>/dev/null || die "jq is required"
}

# ── Server helpers ───────────────────────────────────────────────────

#######################################
# Print the ID of every server, one per line, following pagination.
# Outputs:   newline-separated server IDs (an empty line if there are none)
#######################################
get_all_server_ids() {
  local page=1 per_page=50 ids=""
  local resp page_ids count

  while true; do
    resp=$(hetzner_api GET "/servers?page=${page}&per_page=${per_page}")
    page_ids=$(echo "$resp" | jq -r '.servers[].id' 2>/dev/null)
    [[ -z "$page_ids" ]] && break
    ids="${ids}${ids:+$'\n'}${page_ids}"
    count=$(echo "$page_ids" | wc -l)
    # A short page means this was the last one.
    (( count < per_page )) && break
    page=$(( page + 1 ))
  done
  echo "$ids"
}

#######################################
# Print the name of one server, or "unknown" if the response has none.
# Arguments: $1 - server ID
#######################################
get_server_name() {
  local sid="$1"
  hetzner_api GET "/servers/${sid}" \
    | jq -r '.server.name // "unknown"' 2>/dev/null
}

#######################################
# Print the server IDs selected on the command line.
# Globals:   TARGET_ALL, SERVER_ID (read)
# Outputs:   all server IDs when TARGET_ALL is "true", otherwise SERVER_ID;
#            dies when neither is set
#######################################
get_server_ids() {
  if [[ "$TARGET_ALL" == "true" ]]; then
    get_all_server_ids
  elif [[ -n "$SERVER_ID" ]]; then
    echo "$SERVER_ID"
  else
    die "Specify --server ID or --all"
  fi
}

# ── Snapshot helpers ─────────────────────────────────────────────────
+get_snapshots() { + local page=1 result="[]" + while true; do + local resp + resp=$(hetzner_api GET "/images?type=snapshot&page=${page}&per_page=50") + local page_data + page_data=$(echo "$resp" | jq '.images // []' 2>/dev/null) + local count + count=$(echo "$page_data" | jq 'length' 2>/dev/null || echo 0) + [[ "$count" -eq 0 ]] && break + result=$(echo "$result" "$page_data" | jq -s '.[0] + .[1]') + (( count < 50 )) && break + ((page++)) || true + done + echo "$result" +} + +get_server_snapshots() { + local sid="$1" + get_snapshots | jq --argjson sid "$sid" \ + '[.[] | select(.created_from.id == $sid)]' 2>/dev/null +} + +# ══════════════════════════════════════════════════════════════════════ +# SNAPSHOT +# ══════════════════════════════════════════════════════════════════════ +do_snapshot() { + local ids + ids=$(get_server_ids) + [[ -z "$ids" ]] && die "No servers found" + + local count + count=$(echo "$ids" | grep -c . || true) + local target_label="server ${SERVER_ID}" + [[ "$TARGET_ALL" == "true" ]] && target_label="all (${count} servers)" + + section_header "Creating Snapshots" + field "Target:" "$target_label" + field "Prefix:" "$PREFIX" + echo "" + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + local snap_desc + snap_desc="${PREFIX}-$(date +%Y%m%d-%H%M%S)" + local sname + sname=$(get_server_name "$sid") + + verbose "Snapshotting ${sname} (${sid}) as ${snap_desc}" + + local resp + resp=$(hetzner_api POST "/servers/${sid}/actions/create_image" \ + -d "{\"description\": \"${snap_desc}\", \"type\": \"snapshot\"}" 2>/dev/null) + + local action_status + action_status=$(echo "$resp" | jq -r '.action.status // .error.code // "error"' 2>/dev/null) + + if [[ "$action_status" == "running" || "$action_status" == "success" ]]; then + echo -e " ${GREEN}✓${RESET} ${sname} (${sid}) ${snap_desc}" + ((SNAP_CREATED++)) || true + else + echo -e " ${RED}✗${RESET} ${sname} (${sid}) failed" + ((SNAP_ERRORS++)) || true + fi + + sleep 1 + done <<< "$ids" + + echo "" 
+ field_color "Created:" "${GREEN}${SNAP_CREATED}${RESET}" + if [[ "$SNAP_ERRORS" -gt 0 ]]; then + field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}" + fi + + if [[ "$ALSO_ROTATE" == "true" ]]; then + do_rotate + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# ROTATE +# ══════════════════════════════════════════════════════════════════════ +do_rotate() { + local ids + ids=$(get_server_ids) + [[ -z "$ids" ]] && die "No servers found" + + section_header "Rotating Snapshots" + field "Keep:" "$KEEP per server" + field "Prefix:" "$PREFIX" + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + field_color "Mode:" "${YELLOW}DRY-RUN${RESET} (use --force to delete)" + else + field_color "Mode:" "${RED}LIVE${RESET}" + fi + echo "" + + local all_snaps + all_snaps=$(get_snapshots) + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + local sname + sname=$(get_server_name "$sid") + + local managed + managed=$(echo "$all_snaps" | jq -r \ + --argjson sid "$sid" --arg prefix "$PREFIX" \ + '[.[] | select(.created_from.id == $sid) | select(.description | startswith($prefix))] | sort_by(.created) | reverse' \ + 2>/dev/null) + + local total + total=$(echo "$managed" | jq 'length' 2>/dev/null || echo 0) + + if (( total <= KEEP )); then + verbose "${sname}: ${total} managed snapshots, keeping all" + continue + fi + + local to_delete + to_delete=$(echo "$managed" | jq -r ".[$KEEP:][] | .id" 2>/dev/null) + + while IFS= read -r imgid; do + [[ -z "$imgid" ]] && continue + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + echo -e " ${YELLOW}⊘${RESET} Would delete: ${sname} (${sid}) → image ${imgid}" + else + if hetzner_api DELETE "/images/${imgid}" > /dev/null 2>&1; then + echo -e " ${GREEN}✓${RESET} Deleted: ${sname} (${sid}) → image ${imgid}" + ((SNAP_DELETED++)) || true + else + echo -e " ${RED}✗${RESET} Failed: ${sname} (${sid}) → image ${imgid}" + ((SNAP_ERRORS++)) || true + fi + sleep 1 + fi + done <<< "$to_delete" + done 
<<< "$ids" + + echo "" + if [[ "$DRY_RUN" == "true" && "$FORCE" != "true" ]]; then + log "Dry-run complete — use --force to execute" + else + field_color "Deleted:" "${GREEN}${SNAP_DELETED}${RESET}" + if [[ "$SNAP_ERRORS" -gt 0 ]]; then + field_color "Errors:" "${RED}${SNAP_ERRORS}${RESET}" + fi + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST +# ══════════════════════════════════════════════════════════════════════ +do_list() { + section_header "Snapshots" + + printf " ${BOLD}%-10s %-20s %-28s %-8s %-22s${RESET}\n" \ + "IMAGE_ID" "SERVER" "DESCRIPTION" "SIZE" "CREATED" + printf " %s\n" "$(printf '%.0s─' {1..90})" + + local all_snaps + all_snaps=$(get_snapshots) + + if [[ "$TARGET_ALL" != "true" && -n "$SERVER_ID" ]]; then + all_snaps=$(echo "$all_snaps" | jq --argjson sid "$SERVER_ID" \ + '[.[] | select(.created_from.id == $sid)]' 2>/dev/null) + fi + + echo "$all_snaps" | jq -r \ + '.[] | "\(.id)\t\(.created_from.name // "unknown")\t\(.description // "N/A")\t\(.disk_size)\t\(.created)"' \ + 2>/dev/null | while IFS=$'\t' read -r imgid server desc size created; do + printf " %-10s %-20s %-28s %5sGB %-22s\n" \ + "$imgid" "${server:0:18}" "${desc:0:26}" "$size" "${created:0:20}" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# AUDIT +# ══════════════════════════════════════════════════════════════════════ +do_audit() { + local ids + ids=$(get_all_server_ids) + [[ -z "$ids" ]] && die "No servers found" + + section_header "Snapshot Audit" + + printf " ${BOLD}%-20s %-20s %6s %6s %8s %-12s${RESET}\n" \ + "SERVER" "LATEST SNAPSHOT" "AGE" "COUNT" "COST/MO" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..80})" + + local all_snaps + all_snaps=$(get_snapshots) + + local protected=0 stale=0 unprotected=0 total_servers=0 + local total_cost=0 + local now + now=$(date +%s) + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + ((total_servers++)) || true + + local sname + 
sname=$(get_server_name "$sid") + local snaps + snaps=$(echo "$all_snaps" | jq --argjson sid "$sid" \ + '[.[] | select(.created_from.id == $sid)]' 2>/dev/null) + local snap_count + snap_count=$(echo "$snaps" | jq 'length' 2>/dev/null || echo 0) + + local server_cost="0.00" + if [[ "$snap_count" -gt 0 ]]; then + local total_gb + total_gb=$(echo "$snaps" | jq '[.[].disk_size] | add // 0' 2>/dev/null || echo 0) + server_cost=$(echo "$total_gb * $COST_PER_GB_MONTH" | bc -l 2>/dev/null | xargs printf "%.2f" 2>/dev/null || echo "0.00") + total_cost=$(echo "$total_cost + $server_cost" | bc -l 2>/dev/null || echo "$total_cost") + fi + + if [[ "$snap_count" -eq 0 ]]; then + printf " %-20s %-20s %6s %6s %7s€ " "${sname:0:18}" "(none)" "—" "0" "0.00" + echo -e "${RED}✗ Unprotected${RESET}" + ((unprotected++)) || true + continue + fi + + local latest + latest=$(echo "$snaps" | jq -r \ + 'sort_by(.created) | last' 2>/dev/null) + local latest_desc latest_date + latest_desc=$(echo "$latest" | jq -r '.description // "unknown"' 2>/dev/null) + latest_date=$(echo "$latest" | jq -r '.created // ""' 2>/dev/null) + + local age_days="?" + if [[ -n "$latest_date" ]]; then + local snap_epoch + snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + age_days=$(( (now - snap_epoch) / 86400 )) + fi + fi + + local status_str status_color + if [[ "$age_days" != "?" 
]] && (( age_days > MAX_AGE )); then + status_str="⚠ Stale" + status_color="$YELLOW" + ((stale++)) || true + else + status_str="✓ OK" + status_color="$GREEN" + ((protected++)) || true + fi + + printf " %-20s %-20s %5sd %6s %7s€ " \ + "${sname:0:18}" "${latest_desc:0:18}" "$age_days" "$snap_count" "$server_cost" + echo -e "${status_color}${status_str}${RESET}" + done <<< "$ids" + + local total_cost_fmt + total_cost_fmt=$(printf "%.2f" "$total_cost" 2>/dev/null || echo "0.00") + + echo "" + field "Servers:" "$total_servers" + field_color "Protected:" "${GREEN}${protected}${RESET}" + if [[ "$stale" -gt 0 ]]; then + field_color "Stale (>${MAX_AGE}d):" "${YELLOW}${stale}${RESET}" + fi + if [[ "$unprotected" -gt 0 ]]; then + field_color "Unprotected:" "${RED}${unprotected}${RESET}" + fi + field "Est. monthly cost:" "€${total_cost_fmt}" +} + +# ══════════════════════════════════════════════════════════════════════ +# RESTORE +# ══════════════════════════════════════════════════════════════════════ +do_restore() { + [[ -z "$SERVER_ID" ]] && die "Specify --server ID" + [[ -z "$SNAPSHOT_ID" ]] && die "Specify --snapshot-id IMAGE_ID" + + local sname + sname=$(get_server_name "$SERVER_ID") + + section_header "Restore Snapshot" + field "Server:" "${sname} (${SERVER_ID})" + field "Image:" "$SNAPSHOT_ID" + echo "" + + if [[ "$FORCE" != "true" ]]; then + echo -e " ${RED}WARNING: This will rebuild the server from the snapshot image.${RESET}" + echo -e " ${RED}All current data on the server will be destroyed.${RESET}" + echo "" + read -r -p " Type 'yes' to confirm: " confirm + if [[ "$confirm" != "yes" ]]; then + log "Restore cancelled" + return + fi + fi + + local resp + resp=$(hetzner_api POST "/servers/${SERVER_ID}/actions/rebuild" \ + -d "{\"image\": ${SNAPSHOT_ID}}" 2>/dev/null) + + local action_status + action_status=$(echo "$resp" | jq -r '.action.status // .error.code // "error"' 2>/dev/null) + + if [[ "$action_status" == "running" || "$action_status" == "success" ]]; then + 
echo -e " ${GREEN}✓${RESET} Rebuild initiated — server will restore from image ${SNAPSHOT_ID}" + log "Monitor server status — rebuild may take several minutes" + else + echo -e " ${RED}✗${RESET} Restore failed" + local error_msg + error_msg=$(echo "$resp" | jq -r '.error.message // ""' 2>/dev/null) + [[ -n "$error_msg" ]] && err "$error_msg" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# STATUS +# ══════════════════════════════════════════════════════════════════════ +do_status() { + local ids + ids=$(get_all_server_ids) + [[ -z "$ids" ]] && die "No servers found" + + local all_snaps + all_snaps=$(get_snapshots) + + local total_servers=0 total_snaps=0 total_gb=0 + local protected=0 stale=0 unprotected=0 + local now + now=$(date +%s) + + while IFS= read -r sid; do + [[ -z "$sid" ]] && continue + ((total_servers++)) || true + + local snaps + snaps=$(echo "$all_snaps" | jq --argjson sid "$sid" \ + '[.[] | select(.created_from.id == $sid)]' 2>/dev/null) + local snap_count + snap_count=$(echo "$snaps" | jq 'length' 2>/dev/null || echo 0) + total_snaps=$(( total_snaps + snap_count )) + + local gb + gb=$(echo "$snaps" | jq '[.[].disk_size] | add // 0' 2>/dev/null || echo 0) + total_gb=$(( total_gb + gb )) + + if [[ "$snap_count" -eq 0 ]]; then + ((unprotected++)) || true + continue + fi + + local latest_date + latest_date=$(echo "$snaps" | jq -r \ + 'sort_by(.created) | last | .created // ""' 2>/dev/null) + + if [[ -n "$latest_date" ]]; then + local snap_epoch + snap_epoch=$(date -d "$latest_date" +%s 2>/dev/null || echo 0) + if [[ "$snap_epoch" -gt 0 ]]; then + local age_days=$(( (now - snap_epoch) / 86400 )) + if (( age_days > MAX_AGE )); then + ((stale++)) || true + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + else + ((protected++)) || true + fi + done <<< "$ids" + + local total_cost + total_cost=$(echo "$total_gb * $COST_PER_GB_MONTH" | bc -l 2>/dev/null | xargs printf "%.2f" 2>/dev/null || echo 
"0.00") + + if [[ "$OUTPUT_FORMAT" == "prometheus" ]]; then + cat <${MAX_AGE}d):" "${YELLOW}${stale}${RESET}" + else + field_color "Stale (>${MAX_AGE}d):" "${GREEN}0${RESET}" + fi + if [[ "$unprotected" -gt 0 ]]; then + field_color "Unprotected:" "${RED}${unprotected}${RESET}" + else + field_color "Unprotected:" "${GREEN}0${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat < + +param( + [ValidateSet('stdout', 'textfile', 'http')] + [string]$Mode = 'stdout', + + [int]$Port = 9516, + + [string]$TextfileDir = 'C:\ProgramData\node_exporter', + + [switch]$InstallScheduledTask, + + [int]$TaskIntervalMinutes = 2 +) + +# Create a scheduled task to run this script every $TaskIntervalMinutes minutes +if ($InstallScheduledTask) { + $taskName = "HyperVMetricsExporter" + $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + + if (-not $existingTask) { + $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`" -Mode textfile" + + if (-not $TaskIntervalMinutes -or $TaskIntervalMinutes -le 0) { + throw "TaskIntervalMinutes must be a positive integer" + } + + $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365) + $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest + + try { + Write-Host "Creating scheduled task: $taskName" + Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports Hyper-V metrics for Prometheus every $TaskIntervalMinutes minutes" + + $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + if (-not $createdTask) { 
+ throw "Failed to verify scheduled task creation" + } + Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green + } catch { + Write-Error "Failed to create scheduled task: $($_.Exception.Message)" + throw + } + } else { + Write-Host "Scheduled task '$taskName' already exists, skipping creation" + } +} + +$ErrorActionPreference = 'SilentlyContinue' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +function Get-UnixTimestamp { + [int][double]::Parse((Get-Date -UFormat '%s')) +} + +function Format-MetricValue { + param([double]$Value, [int]$Decimals = 2) + [math]::Round($Value, $Decimals) +} + +# ============================================================================ +# VM METRICS +# ============================================================================ + +function Get-VMMetrics { + $sb = [System.Text.StringBuilder]::new() + + # Get all VMs + $vms = @() + try { + $vms = Get-VM -ErrorAction Stop + } catch { + Write-Warning "Failed to query Hyper-V VMs: $_" + return $sb.ToString() + } + + # --- hyperv_vm_state --- + $states = @('Running', 'Off', 'Saved', 'Paused', 'Starting', 'Stopping', 'Reset') + [void]$sb.AppendLine('# HELP hyperv_vm_state Current state of the VM (1=current state, 0=other states)') + [void]$sb.AppendLine('# TYPE hyperv_vm_state gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $currentState = $vm.State.ToString() + foreach ($state in $states) { + $val = if ($currentState -eq $state) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_vm_state{vm=`"$vmName`",state=`"$state`"} $val") + } + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_uptime_seconds --- + [void]$sb.AppendLine('# HELP hyperv_vm_uptime_seconds VM uptime in seconds') + [void]$sb.AppendLine('# TYPE hyperv_vm_uptime_seconds gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace 
'[\\"]', '' + $uptime = if ($vm.Uptime) { Format-MetricValue $vm.Uptime.TotalSeconds 0 } else { 0 } + [void]$sb.AppendLine("hyperv_vm_uptime_seconds{vm=`"$vmName`"} $uptime") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_generation --- + [void]$sb.AppendLine('# HELP hyperv_vm_generation VM generation (1 or 2)') + [void]$sb.AppendLine('# TYPE hyperv_vm_generation gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $gen = if ($vm.Generation) { $vm.Generation } else { 1 } + [void]$sb.AppendLine("hyperv_vm_generation{vm=`"$vmName`"} $gen") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_cpu_count --- + [void]$sb.AppendLine('# HELP hyperv_vm_cpu_count Number of virtual processors assigned to VM') + [void]$sb.AppendLine('# TYPE hyperv_vm_cpu_count gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $cpuCount = if ($vm.ProcessorCount) { $vm.ProcessorCount } else { 0 } + [void]$sb.AppendLine("hyperv_vm_cpu_count{vm=`"$vmName`"} $cpuCount") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_cpu_usage_percent --- + [void]$sb.AppendLine('# HELP hyperv_vm_cpu_usage_percent Current CPU usage percentage of the VM') + [void]$sb.AppendLine('# TYPE hyperv_vm_cpu_usage_percent gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $cpuUsage = if ($vm.CPUUsage -ne $null) { $vm.CPUUsage } else { 0 } + [void]$sb.AppendLine("hyperv_vm_cpu_usage_percent{vm=`"$vmName`"} $cpuUsage") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_memory_assigned_bytes --- + [void]$sb.AppendLine('# HELP hyperv_vm_memory_assigned_bytes Currently assigned memory in bytes') + [void]$sb.AppendLine('# TYPE hyperv_vm_memory_assigned_bytes gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $memAssigned = if ($vm.MemoryAssigned) { $vm.MemoryAssigned } else { 0 } + [void]$sb.AppendLine("hyperv_vm_memory_assigned_bytes{vm=`"$vmName`"} $memAssigned") + } + [void]$sb.AppendLine('') + + # --- 
hyperv_vm_memory_demand_bytes --- + [void]$sb.AppendLine('# HELP hyperv_vm_memory_demand_bytes Current memory demand in bytes') + [void]$sb.AppendLine('# TYPE hyperv_vm_memory_demand_bytes gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $memDemand = if ($vm.MemoryDemand) { $vm.MemoryDemand } else { 0 } + [void]$sb.AppendLine("hyperv_vm_memory_demand_bytes{vm=`"$vmName`"} $memDemand") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_memory_startup_bytes --- + [void]$sb.AppendLine('# HELP hyperv_vm_memory_startup_bytes Configured startup memory in bytes') + [void]$sb.AppendLine('# TYPE hyperv_vm_memory_startup_bytes gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $memStartup = if ($vm.MemoryStartup) { $vm.MemoryStartup } else { 0 } + [void]$sb.AppendLine("hyperv_vm_memory_startup_bytes{vm=`"$vmName`"} $memStartup") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_dynamic_memory_enabled --- + [void]$sb.AppendLine('# HELP hyperv_vm_dynamic_memory_enabled Whether dynamic memory is enabled (1=yes, 0=no)') + [void]$sb.AppendLine('# TYPE hyperv_vm_dynamic_memory_enabled gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $dynMem = if ($vm.DynamicMemoryEnabled) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_vm_dynamic_memory_enabled{vm=`"$vmName`"} $dynMem") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_vhd_size_bytes / hyperv_vm_vhd_current_size_bytes --- + [void]$sb.AppendLine('# HELP hyperv_vm_vhd_size_bytes Maximum size of the VHD/VHDX in bytes') + [void]$sb.AppendLine('# TYPE hyperv_vm_vhd_size_bytes gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + try { + $hdds = Get-VMHardDiskDrive -VMName $vm.Name -ErrorAction Stop + foreach ($hdd in $hdds) { + $vhdPath = $hdd.Path -replace '\\', '/' + try { + $vhd = Get-VHD -Path $hdd.Path -ErrorAction Stop + [void]$sb.AppendLine("hyperv_vm_vhd_size_bytes{vm=`"$vmName`",path=`"$vhdPath`"} 
$($vhd.Size)") + } catch {} + } + } catch {} + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP hyperv_vm_vhd_current_size_bytes Current file size of the VHD/VHDX in bytes') + [void]$sb.AppendLine('# TYPE hyperv_vm_vhd_current_size_bytes gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + try { + $hdds = Get-VMHardDiskDrive -VMName $vm.Name -ErrorAction Stop + foreach ($hdd in $hdds) { + $vhdPath = $hdd.Path -replace '\\', '/' + try { + $vhd = Get-VHD -Path $hdd.Path -ErrorAction Stop + [void]$sb.AppendLine("hyperv_vm_vhd_current_size_bytes{vm=`"$vmName`",path=`"$vhdPath`"} $($vhd.FileSize)") + } catch {} + } + } catch {} + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_network_adapter --- + [void]$sb.AppendLine('# HELP hyperv_vm_network_adapter_bytes_sent Total bytes sent by VM network adapter') + [void]$sb.AppendLine('# TYPE hyperv_vm_network_adapter_bytes_sent counter') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + try { + $adapters = Get-VMNetworkAdapter -VMName $vm.Name -ErrorAction Stop + foreach ($adapter in $adapters) { + $adapterName = if ($adapter.Name) { $adapter.Name } else { "Network Adapter" } + $adapterName = $adapterName -replace '[\\"]', '' + $sent = if ($adapter.BandwidthSetting) { 0 } else { 0 } + # Use performance counters for actual byte counts + try { + $counterPath = "\Hyper-V Virtual Network Adapter($vmName - $adapterName)\Bytes Sent/sec" + $counter = Get-Counter $counterPath -ErrorAction Stop + $sent = [long]$counter.CounterSamples[0].CookedValue + } catch { $sent = 0 } + [void]$sb.AppendLine("hyperv_vm_network_adapter_bytes_sent{vm=`"$vmName`",adapter=`"$adapterName`"} $sent") + } + } catch {} + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP hyperv_vm_network_adapter_bytes_received Total bytes received by VM network adapter') + [void]$sb.AppendLine('# TYPE hyperv_vm_network_adapter_bytes_received counter') + + foreach ($vm in $vms) { + $vmName = $vm.Name 
-replace '[\\"]', '' + try { + $adapters = Get-VMNetworkAdapter -VMName $vm.Name -ErrorAction Stop + foreach ($adapter in $adapters) { + $adapterName = if ($adapter.Name) { $adapter.Name } else { "Network Adapter" } + $adapterName = $adapterName -replace '[\\"]', '' + $received = 0 + try { + $counterPath = "\Hyper-V Virtual Network Adapter($vmName - $adapterName)\Bytes Received/sec" + $counter = Get-Counter $counterPath -ErrorAction Stop + $received = [long]$counter.CounterSamples[0].CookedValue + } catch { $received = 0 } + [void]$sb.AppendLine("hyperv_vm_network_adapter_bytes_received{vm=`"$vmName`",adapter=`"$adapterName`"} $received") + } + } catch {} + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_snapshot_count --- + [void]$sb.AppendLine('# HELP hyperv_vm_snapshot_count Number of checkpoints (snapshots) for the VM') + [void]$sb.AppendLine('# TYPE hyperv_vm_snapshot_count gauge') + + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $snapCount = 0 + try { + $snapCount = (Get-VMSnapshot -VMName $vm.Name -ErrorAction Stop | Measure-Object).Count + } catch {} + [void]$sb.AppendLine("hyperv_vm_snapshot_count{vm=`"$vmName`"} $snapCount") + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_replication_state --- + [void]$sb.AppendLine('# HELP hyperv_vm_replication_state VM replication state (1=current state)') + [void]$sb.AppendLine('# TYPE hyperv_vm_replication_state gauge') + + $replStates = @('Disabled', 'ReadyForInitialReplication', 'InitialReplicationInProgress', 'WaitingForInitialReplication', 'Replicating', 'Suspended', 'Error', 'FailedOver', 'Recovered') + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $currentReplState = $vm.ReplicationState.ToString() + foreach ($rs in $replStates) { + $val = if ($currentReplState -eq $rs) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_vm_replication_state{vm=`"$vmName`",state=`"$rs`"} $val") + } + } + [void]$sb.AppendLine('') + + # --- hyperv_vm_replication_health --- + 
[void]$sb.AppendLine('# HELP hyperv_vm_replication_health VM replication health (1=current health)') + [void]$sb.AppendLine('# TYPE hyperv_vm_replication_health gauge') + + $replHealths = @('NotApplicable', 'Normal', 'Warning', 'Critical') + foreach ($vm in $vms) { + $vmName = $vm.Name -replace '[\\"]', '' + $currentHealth = $vm.ReplicationHealth.ToString() + foreach ($rh in $replHealths) { + $val = if ($currentHealth -eq $rh) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_vm_replication_health{vm=`"$vmName`",health=`"$rh`"} $val") + } + } + [void]$sb.AppendLine('') + + # --- Summary metrics --- + $runningCount = ($vms | Where-Object { $_.State -eq 'Running' } | Measure-Object).Count + $stoppedCount = ($vms | Where-Object { $_.State -eq 'Off' } | Measure-Object).Count + $totalCount = $vms.Count + + [void]$sb.AppendLine('# HELP hyperv_vm_total Total number of VMs') + [void]$sb.AppendLine('# TYPE hyperv_vm_total gauge') + [void]$sb.AppendLine("hyperv_vm_total $totalCount") + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP hyperv_vm_running_total Number of running VMs') + [void]$sb.AppendLine('# TYPE hyperv_vm_running_total gauge') + [void]$sb.AppendLine("hyperv_vm_running_total $runningCount") + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP hyperv_vm_stopped_total Number of stopped VMs') + [void]$sb.AppendLine('# TYPE hyperv_vm_stopped_total gauge') + [void]$sb.AppendLine("hyperv_vm_stopped_total $stoppedCount") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# HOST METRICS +# ============================================================================ + +function Get-HostMetrics { + $sb = [System.Text.StringBuilder]::new() + + # --- hyperv_host_logical_processors --- + [void]$sb.AppendLine('# HELP hyperv_host_logical_processors Number of logical processors on the host') + [void]$sb.AppendLine('# TYPE hyperv_host_logical_processors gauge') + try { + $cpuCount = 
(Get-CimInstance Win32_Processor -ErrorAction Stop | Measure-Object -Property NumberOfLogicalProcessors -Sum).Sum + [void]$sb.AppendLine("hyperv_host_logical_processors $cpuCount") + } catch { + [void]$sb.AppendLine("hyperv_host_logical_processors 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_host_memory_total_bytes --- + [void]$sb.AppendLine('# HELP hyperv_host_memory_total_bytes Total physical memory on the host') + [void]$sb.AppendLine('# TYPE hyperv_host_memory_total_bytes gauge') + try { + $totalMem = (Get-CimInstance Win32_ComputerSystem -ErrorAction Stop).TotalPhysicalMemory + [void]$sb.AppendLine("hyperv_host_memory_total_bytes $totalMem") + } catch { + [void]$sb.AppendLine("hyperv_host_memory_total_bytes 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_host_memory_available_bytes --- + [void]$sb.AppendLine('# HELP hyperv_host_memory_available_bytes Available physical memory on the host') + [void]$sb.AppendLine('# TYPE hyperv_host_memory_available_bytes gauge') + try { + $availMem = (Get-CimInstance Win32_OperatingSystem -ErrorAction Stop).FreePhysicalMemory * 1024 + [void]$sb.AppendLine("hyperv_host_memory_available_bytes $availMem") + } catch { + [void]$sb.AppendLine("hyperv_host_memory_available_bytes 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_host_vm_count_total --- + [void]$sb.AppendLine('# HELP hyperv_host_vm_count_total Total VMs configured on host') + [void]$sb.AppendLine('# TYPE hyperv_host_vm_count_total gauge') + try { + $vmCount = (Get-VM -ErrorAction Stop | Measure-Object).Count + [void]$sb.AppendLine("hyperv_host_vm_count_total $vmCount") + } catch { + [void]$sb.AppendLine("hyperv_host_vm_count_total 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_host_vm_running_count --- + [void]$sb.AppendLine('# HELP hyperv_host_vm_running_count Number of VMs currently running') + [void]$sb.AppendLine('# TYPE hyperv_host_vm_running_count gauge') + try { + $runCount = (Get-VM -ErrorAction Stop | Where-Object { $_.State -eq 'Running' } | 
Measure-Object).Count + [void]$sb.AppendLine("hyperv_host_vm_running_count $runCount") + } catch { + [void]$sb.AppendLine("hyperv_host_vm_running_count 0") + } + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# COLLECT ALL METRICS +# ============================================================================ + +function Get-AllMetrics { + $scriptStart = Get-Date + $sb = [System.Text.StringBuilder]::new() + + # Exporter up + [void]$sb.AppendLine('# HELP hyperv_up Exporter status (1=up, 0=down)') + [void]$sb.AppendLine('# TYPE hyperv_up gauge') + [void]$sb.AppendLine('hyperv_up 1') + [void]$sb.AppendLine('') + + # Exporter info + [void]$sb.AppendLine('# HELP hyperv_exporter_info Exporter version information') + [void]$sb.AppendLine('# TYPE hyperv_exporter_info gauge') + [void]$sb.AppendLine('hyperv_exporter_info{version="1.0"} 1') + [void]$sb.AppendLine('') + + # Collect VM metrics + [void]$sb.Append((Get-VMMetrics)) + + # Collect host metrics + [void]$sb.Append((Get-HostMetrics)) + + # Exporter runtime + $scriptEnd = Get-Date + $duration = Format-MetricValue ($scriptEnd - $scriptStart).TotalSeconds + $timestamp = Get-UnixTimestamp + + [void]$sb.AppendLine('# HELP hyperv_exporter_duration_seconds Time to generate all metrics') + [void]$sb.AppendLine('# TYPE hyperv_exporter_duration_seconds gauge') + [void]$sb.AppendLine("hyperv_exporter_duration_seconds $duration") + [void]$sb.AppendLine('') + [void]$sb.AppendLine('# HELP hyperv_exporter_last_run_timestamp Unix timestamp of last successful run') + [void]$sb.AppendLine('# TYPE hyperv_exporter_last_run_timestamp gauge') + [void]$sb.AppendLine("hyperv_exporter_last_run_timestamp $timestamp") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# HTTP SERVER MODE +# ============================================================================ + +function 
Start-HttpServer { + param([int]$ListenPort) + + $prefix = "http://+:$ListenPort/" + $listener = [System.Net.HttpListener]::new() + $listener.Prefixes.Add($prefix) + + try { + $listener.Start() + Write-Host "Starting Hyper-V metrics exporter on port $ListenPort..." -ForegroundColor Green + Write-Host "Metrics available at http://localhost:$ListenPort/metrics" + + while ($listener.IsListening) { + $context = $listener.GetContext() + $request = $context.Request + $response = $context.Response + + if ($request.Url.AbsolutePath -eq '/metrics') { + $metrics = Get-AllMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($metrics) + $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8' + } + else { + $html = @" + + +Hyper-V Metrics Exporter v1.0 + +

Hyper-V Metrics Exporter v1.0

+

Metrics

+

Metrics

+
    +
  • VM state, uptime, generation
  • +
  • CPU count and usage
  • +
  • Memory assigned, demand, startup, dynamic memory
  • +
  • VHD size and current file size
  • +
  • Network adapter bytes sent/received
  • +
  • Snapshot count
  • +
  • Replication state and health
  • +
  • Host resource stats
  • +
+ + +"@ + $buffer = [System.Text.Encoding]::UTF8.GetBytes($html) + $response.ContentType = 'text/html; charset=utf-8' + } + + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.OutputStream.Close() + } + } + catch { + Write-Error "HTTP server error: $_" + Write-Error "If access denied, run: netsh http add urlacl url=http://+:$ListenPort/ user=Everyone" + } + finally { + if ($listener.IsListening) { + $listener.Stop() + } + } +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +switch ($Mode) { + 'http' { + Start-HttpServer -ListenPort $Port + } + 'textfile' { + $OutputFile = Join-Path $TextfileDir 'hyperv_metrics.prom' + + $outputDir = Split-Path $OutputFile -Parent + if (-not (Test-Path $outputDir)) { + New-Item -Path $outputDir -ItemType Directory -Force | Out-Null + } + + $tempFile = Join-Path $outputDir ".hyperv_metrics.$PID.tmp" + + try { + $metrics = Get-AllMetrics + $metrics | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + + $lineCount = ($metrics -split "`n").Count + if ($lineCount -lt 10) { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Metrics file too small ($lineCount lines), keeping previous" + exit 1 + } + + Move-Item -Path $tempFile -Destination $OutputFile -Force + Write-Host "Metrics written to $OutputFile ($lineCount lines)" -ForegroundColor Green + } + catch { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Failed to generate metrics: $_" + exit 1 + } + } + default { + Get-AllMetrics | Write-Output + } +} diff --git a/hyperv-replica-exporter.ps1 b/hyperv-replica-exporter.ps1 new file mode 100644 index 0000000..86126b9 --- /dev/null +++ b/hyperv-replica-exporter.ps1 @@ -0,0 +1,437 @@ +<# +.SYNOPSIS + Hyper-V Replica Prometheus Metrics Exporter +.DESCRIPTION + Prometheus 
exporter for Hyper-V Replica - replication health, state, last + replication time, replication frequency, pending bytes, missed replications, + VM replication count, and failover readiness. Exports metrics as + Prometheus-compatible text format. +.PARAMETER Mode + Output mode: 'stdout' (default), 'textfile', or 'http' +.PARAMETER Port + HTTP port for http mode (default: 9517) +.PARAMETER TextfileDir + Directory for textfile collector output (default: C:\ProgramData\node_exporter) +.PARAMETER InstallScheduledTask + Switch to create a scheduled task for auto-start on system boot +.PARAMETER TaskIntervalMinutes + Interval in minutes for the scheduled task (default: 2) +.NOTES + Author: Phil Connor + Contact: contact@mylinux.work + Website: https://mylinux.work + License: MIT + Version: 1.0 + + Metrics Exported: + Core Status: + - hyperv_replica_up + - hyperv_replica_exporter_info{version} + + Per-VM Replication: + - hyperv_replica_vm_state{vm, state} + - hyperv_replica_vm_health{vm, health} + - hyperv_replica_vm_last_replication_timestamp{vm} + - hyperv_replica_vm_frequency_seconds{vm} + - hyperv_replica_vm_pending_bytes{vm} + - hyperv_replica_vm_missed_count{vm} + - hyperv_replica_vm_failover_ready{vm} + - hyperv_replica_vm_role{vm, role} + + Totals: + - hyperv_replica_total_vms + - hyperv_replica_healthy_vms + + Exporter: + - hyperv_replica_exporter_duration_seconds + - hyperv_replica_exporter_last_run_timestamp +#> + +param( + [ValidateSet('stdout', 'textfile', 'http')] + [string]$Mode = 'stdout', + + [int]$Port = 9517, + + [string]$TextfileDir = 'C:\ProgramData\node_exporter', + + [switch]$InstallScheduledTask, + + [int]$TaskIntervalMinutes = 2 +) + +# Create a scheduled task to run this script every $TaskIntervalMinutes minutes +if ($InstallScheduledTask) { + $taskName = "HyperVReplicaMetricsExporter" + $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + + if (-not $existingTask) { + $taskAction = New-ScheduledTaskAction -Execute 
"powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`" -Mode textfile" + + if (-not $TaskIntervalMinutes -or $TaskIntervalMinutes -le 0) { + throw "TaskIntervalMinutes must be a positive integer" + } + + $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365) + $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest + + try { + Write-Host "Creating scheduled task: $taskName" + Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports Hyper-V Replica metrics for Prometheus every $TaskIntervalMinutes minutes" + + $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + if (-not $createdTask) { + throw "Failed to verify scheduled task creation" + } + Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green + } catch { + Write-Error "Failed to create scheduled task: $($_.Exception.Message)" + throw + } + } else { + Write-Host "Scheduled task '$taskName' already exists, skipping creation" + } +} + +$ErrorActionPreference = 'SilentlyContinue' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +function Get-UnixTimestamp { + [int][double]::Parse((Get-Date -UFormat '%s')) +} + +function Format-MetricValue { + param([double]$Value, [int]$Decimals = 2) + [math]::Round($Value, $Decimals) +} + +# ============================================================================ +# HYPER-V REPLICA METRICS +# ============================================================================ + +function Get-HyperVReplicaMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $replicas = 
@(Get-VMReplication -ErrorAction Stop) + } catch { + $replicas = @() + } + + $replicationStates = @('Replicating', 'Suspended', 'WaitingForInitialReplication', 'WaitingForStartResynchronize', 'Resynchronizing', 'ResynchronizeSuspended', 'WaitingForUpdateCompletion', 'WaitingForRepurposeCompletion', 'ReadyForInitialReplication', 'FailedOver', 'NotApplicable') + $healthStates = @('Normal', 'Warning', 'Critical') + + # --- hyperv_replica_vm_state --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_state Replication state of VM (1=current state)') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_state gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $currentState = $vm.State.ToString() + foreach ($state in $replicationStates) { + $val = if ($state -eq $currentState) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_replica_vm_state{vm=`"$vmName`",state=`"$state`"} $val") + } + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_state 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_health --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_health Replication health of VM (1=current health)') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_health gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $currentHealth = $vm.Health.ToString() + foreach ($health in $healthStates) { + $val = if ($health -eq $currentHealth) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_replica_vm_health{vm=`"$vmName`",health=`"$health`"} $val") + } + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_health 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_last_replication_timestamp --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_last_replication_timestamp Unix timestamp of last successful replication') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_last_replication_timestamp gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $lastReplTime = $vm.LastReplicationTime + if ($lastReplTime) { 
+ $epoch = [int][double]::Parse((Get-Date $lastReplTime -UFormat '%s')) + [void]$sb.AppendLine("hyperv_replica_vm_last_replication_timestamp{vm=`"$vmName`"} $epoch") + } else { + [void]$sb.AppendLine("hyperv_replica_vm_last_replication_timestamp{vm=`"$vmName`"} 0") + } + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_last_replication_timestamp 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_frequency_seconds --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_frequency_seconds Replication frequency in seconds') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_frequency_seconds gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $freqSec = $vm.FrequencySec + if (-not $freqSec) { $freqSec = 0 } + [void]$sb.AppendLine("hyperv_replica_vm_frequency_seconds{vm=`"$vmName`"} $freqSec") + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_frequency_seconds 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_pending_bytes --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_pending_bytes Pending replication data in bytes') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_pending_bytes gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $pending = $vm.PendingReplicationSize + if (-not $pending) { $pending = 0 } + [void]$sb.AppendLine("hyperv_replica_vm_pending_bytes{vm=`"$vmName`"} $pending") + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_pending_bytes 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_missed_count --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_missed_count Number of missed replication cycles') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_missed_count gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $missed = $vm.MissedReplicationCount + if (-not $missed) { $missed = 0 } + [void]$sb.AppendLine("hyperv_replica_vm_missed_count{vm=`"$vmName`"} $missed") + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_missed_count 0") 
+ } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_failover_ready --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_failover_ready Test replica available for failover (1=yes, 0=no)') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_failover_ready gauge') + try { + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $testReplicaAvailable = $vm.TestReplicaObject + $val = if ($testReplicaAvailable) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_replica_vm_failover_ready{vm=`"$vmName`"} $val") + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_failover_ready 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_vm_role --- + [void]$sb.AppendLine('# HELP hyperv_replica_vm_role Replication role of VM (1=current role)') + [void]$sb.AppendLine('# TYPE hyperv_replica_vm_role gauge') + try { + $roles = @('Primary', 'Replica') + foreach ($vm in $replicas) { + $vmName = $vm.VMName + $currentRole = $vm.ReplicationMode.ToString() + foreach ($role in $roles) { + $val = if ($role -eq $currentRole) { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_replica_vm_role{vm=`"$vmName`",role=`"$role`"} $val") + } + } + } catch { + [void]$sb.AppendLine("hyperv_replica_vm_role 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_total_vms --- + [void]$sb.AppendLine('# HELP hyperv_replica_total_vms Total VMs with replication configured') + [void]$sb.AppendLine('# TYPE hyperv_replica_total_vms gauge') + try { + $totalVms = $replicas.Count + [void]$sb.AppendLine("hyperv_replica_total_vms $totalVms") + } catch { + [void]$sb.AppendLine("hyperv_replica_total_vms 0") + } + [void]$sb.AppendLine('') + + # --- hyperv_replica_healthy_vms --- + [void]$sb.AppendLine('# HELP hyperv_replica_healthy_vms VMs with Normal replication health') + [void]$sb.AppendLine('# TYPE hyperv_replica_healthy_vms gauge') + try { + $healthyCount = ($replicas | Where-Object { $_.Health.ToString() -eq 'Normal' } | Measure-Object).Count + [void]$sb.AppendLine("hyperv_replica_healthy_vms 
$healthyCount") + } catch { + [void]$sb.AppendLine("hyperv_replica_healthy_vms 0") + } + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# COLLECT ALL METRICS +# ============================================================================ + +function Get-AllMetrics { + $scriptStart = Get-Date + $sb = [System.Text.StringBuilder]::new() + + # Exporter up + [void]$sb.AppendLine('# HELP hyperv_replica_up Hyper-V Replica exporter status (1=up, 0=down)') + [void]$sb.AppendLine('# TYPE hyperv_replica_up gauge') + try { + $vmms = Get-Service -Name vmms -ErrorAction Stop + $upVal = if ($vmms.Status -eq 'Running') { 1 } else { 0 } + [void]$sb.AppendLine("hyperv_replica_up $upVal") + } catch { + [void]$sb.AppendLine("hyperv_replica_up 0") + } + [void]$sb.AppendLine('') + + # Exporter info + [void]$sb.AppendLine('# HELP hyperv_replica_exporter_info Exporter version information') + [void]$sb.AppendLine('# TYPE hyperv_replica_exporter_info gauge') + [void]$sb.AppendLine('hyperv_replica_exporter_info{version="1.0"} 1') + [void]$sb.AppendLine('') + + # Collect Hyper-V Replica metrics + [void]$sb.Append((Get-HyperVReplicaMetrics)) + + # Exporter runtime + $scriptEnd = Get-Date + $duration = Format-MetricValue ($scriptEnd - $scriptStart).TotalSeconds + $timestamp = Get-UnixTimestamp + + [void]$sb.AppendLine('# HELP hyperv_replica_exporter_duration_seconds Time to generate all metrics') + [void]$sb.AppendLine('# TYPE hyperv_replica_exporter_duration_seconds gauge') + [void]$sb.AppendLine("hyperv_replica_exporter_duration_seconds $duration") + [void]$sb.AppendLine('') + [void]$sb.AppendLine('# HELP hyperv_replica_exporter_last_run_timestamp Unix timestamp of last successful run') + [void]$sb.AppendLine('# TYPE hyperv_replica_exporter_last_run_timestamp gauge') + [void]$sb.AppendLine("hyperv_replica_exporter_last_run_timestamp $timestamp") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# 
============================================================================ +# HTTP SERVER MODE +# ============================================================================ + +function Start-HttpServer { + param([int]$ListenPort) + + $prefix = "http://+:$ListenPort/" + $listener = [System.Net.HttpListener]::new() + $listener.Prefixes.Add($prefix) + + try { + $listener.Start() + Write-Host "Starting Hyper-V Replica metrics exporter on port $ListenPort..." -ForegroundColor Green + Write-Host "Metrics available at http://localhost:$ListenPort/metrics" + + while ($listener.IsListening) { + $context = $listener.GetContext() + $request = $context.Request + $response = $context.Response + + if ($request.Url.AbsolutePath -eq '/metrics') { + $metrics = Get-AllMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($metrics) + $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8' + } + else { + $html = @" + + +Hyper-V Replica Metrics Exporter v1.0 + +

Hyper-V Replica Metrics Exporter v1.0

+

Metrics

+

Metrics

+
    +
  • Per-VM replication state and health
  • +
  • Last replication time and frequency
  • +
  • Pending replication bytes
  • +
  • Missed replication count
  • +
  • Failover readiness
  • +
  • Replication role (Primary/Replica)
  • +
  • Total and healthy VM counts
  • +
+ + +"@ + $buffer = [System.Text.Encoding]::UTF8.GetBytes($html) + $response.ContentType = 'text/html; charset=utf-8' + } + + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.OutputStream.Close() + } + } + catch { + Write-Error "HTTP server error: $_" + Write-Error "If access denied, run: netsh http add urlacl url=http://+:$ListenPort/ user=Everyone" + } + finally { + if ($listener.IsListening) { + $listener.Stop() + } + } +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +switch ($Mode) { + 'http' { + Start-HttpServer -ListenPort $Port + } + 'textfile' { + $OutputFile = Join-Path $TextfileDir 'hyperv_replica_metrics.prom' + + $outputDir = Split-Path $OutputFile -Parent + if (-not (Test-Path $outputDir)) { + New-Item -Path $outputDir -ItemType Directory -Force | Out-Null + } + + $tempFile = Join-Path $outputDir ".hyperv_replica_metrics.$PID.tmp" + + try { + $metrics = Get-AllMetrics + $metrics | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + + $lineCount = ($metrics -split "`n").Count + if ($lineCount -lt 10) { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Metrics file too small ($lineCount lines), keeping previous" + exit 1 + } + + Move-Item -Path $tempFile -Destination $OutputFile -Force + Write-Host "Metrics written to $OutputFile ($lineCount lines)" -ForegroundColor Green + } + catch { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Failed to generate metrics: $_" + exit 1 + } + } + default { + Get-AllMetrics | Write-Output + } +} diff --git a/iam-audit.sh b/iam-audit.sh new file mode 100644 index 0000000..fba7acc --- /dev/null +++ b/iam-audit.sh @@ -0,0 +1,824 @@ +#!/usr/bin/env bash + +######################################################################################### +#### 
#### iam-audit.sh — Audit AWS IAM users, roles, policies, and access keys             ####
#### Finds stale access keys, users without MFA, unused roles, overly broad policies  ####
#### Requires: bash 4+, aws-cli v2, jq                                                ####
####                                                                                  ####
#### Author: Phil Connor                                                              ####
#### Contact: contact@mylinux.work                                                    ####
#### License: MIT                                                                     ####
#### Version 1.01                                                                     ####
####                                                                                  ####
#### Usage:                                                                           ####
####   export AWS_PROFILE="production"                                                ####
####   ./iam-audit.sh --full                                                          ####
####                                                                                  ####
#### See --help for all options.                                                      ####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting can be overridden from the environment.
AWS_REGION="${AWS_REGION:-}"
KEY_AGE_WARN_DAYS="${KEY_AGE_WARN_DAYS:-90}"
KEY_AGE_CRIT_DAYS="${KEY_AGE_CRIT_DAYS:-180}"
UNUSED_DAYS="${UNUSED_DAYS:-90}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
START_TIME=""
WARNINGS=0
TOTAL_PASS=0
TOTAL_WARN=0
TOTAL_CRIT=0

# ── Colors ────────────────────────────────────────────────────────────
# Sets RED/GREEN/YELLOW/BLUE/BOLD/RESET to ANSI escapes, or to empty strings
# when COLOR=never, or when COLOR=auto and stdout is not a terminal.
setup_colors() {
  if [[ "$COLOR" == "never" ]]; then
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
    return
  fi
  if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log     -> stdout
# warn    -> stderr, also bumps WARNINGS
# err     -> stderr
# verbose -> stderr, only when VERBOSE=true. It must NOT write to stdout:
#            aws_cmd calls it and aws_cmd is routinely used inside $(...),
#            so a stdout debug line would end up inside the captured JSON.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; ((WARNINGS++)) || true; }
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*" >&2; fi; }

# Finding counters. "|| true" keeps set -e from firing when the
# post-increment expression evaluates to 0.
pass() { ((TOTAL_PASS++)) || true; }
flag_warn() { ((TOTAL_WARN++)) || true; }
flag_crit() { ((TOTAL_CRIT++)) || true; }

# ── AWS CLI wrapper ───────────────────────────────────────────────────
# Runs `aws` with the given arguments, appending --region when AWS_REGION
# is non-empty. Output and exit status are those of the aws command.
aws_cmd() {
  local args=("$@")
  if [[ -n "$AWS_REGION" ]]; then
    args+=(--region "$AWS_REGION")
  fi
  verbose "aws ${args[*]}"
  aws "${args[@]}"
}

# ── Date math (portable) ─────────────────────────────────────────────
# Prints the current time as epoch seconds.
now_epoch() {
  date +%s
}

# Converts an ISO-8601 timestamp ($1) to epoch seconds on stdout.
# Tries GNU date first, then BSD date, then an awk fallback.
iso_to_epoch() {
  local iso_date="$1"
  local trimmed epoch

  # GNU date understands the full timestamp including its offset.
  if epoch=$(date -d "$iso_date" +%s 2>/dev/null); then
    printf '%s\n' "$epoch"
    return 0
  fi

  # BSD date: the format has no zone field, so drop the "+hh:mm" / "Z" suffix
  # and parse as UTC (-u) instead of local time.
  trimmed="${iso_date%%+*}"
  trimmed="${trimmed%Z}"
  if epoch=$(date -u -jf "%Y-%m-%dT%H:%M:%S" "$trimmed" +%s 2>/dev/null); then
    printf '%s\n' "$epoch"
    return 0
  fi

  # Fallback: parse YYYY-MM-DD with awk (date only, time of day ignored).
  # NOTE(review): mktime() is a gawk extension — confirm awk flavour on
  # platforms that reach this branch.
  echo "$iso_date" | awk -F'[-T:+]' '{
    y=$1; m=$2; d=$3
    t = mktime(y " " m " " d " 0 0 0")
    print t
  }'
}

# Prints the whole number of days elapsed since ISO-8601 timestamp $1.
days_since() {
  local iso_date="$1"
  local then_epoch now_epoch_val
  then_epoch=$(iso_to_epoch "$iso_date")
  now_epoch_val=$(now_epoch)
  echo $(((now_epoch_val - then_epoch) / 86400))
}

# ── Credential check ─────────────────────────────────────────────────
# Verifies that aws and jq are installed and that the current credentials
# can call STS. Fills AWS_REGION from the CLI config when it is empty.
# Exits 1 on any failure.
check_deps() {
  local cmd
  for cmd in aws jq; do
    if ! command -v "$cmd" &>/dev/null; then
      err "${cmd} is required but not installed"
      exit 1
    fi
  done

  # --output json: the result is parsed with jq, so do not depend on the
  # profile's default output format. stderr is discarded rather than merged
  # so CLI warnings cannot corrupt the JSON.
  local identity
  identity=$(aws sts get-caller-identity --output json 2>/dev/null) || {
    err "AWS credentials not configured, expired, or invalid"
    echo "" >&2
    echo "Supported credential methods:" >&2
    echo " • AWS_PROFILE — named profile from ~/.aws/credentials" >&2
    echo " • AWS SSO — run 'aws sso login --profile your-profile'" >&2
    echo " • Environment vars — AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY + AWS_SESSION_TOKEN" >&2
    echo " • Instance profile — automatic on EC2/ECS" >&2
    echo " • AWS_ROLE_ARN — assume role via STS" >&2
    exit 1
  }

  local arn
  arn=$(jq -r '.Arn' <<<"$identity")
  verbose "Identity: ${arn}"

  if [[ -n "${AWS_SESSION_TOKEN:-}" ]]; then
    verbose "Using temporary credentials (session token present)"
  fi

  if [[ -z "$AWS_REGION" ]]; then
    AWS_REGION=$(aws configure get region 2>/dev/null || echo "")
    if [[ -n "$AWS_REGION" ]]; then
      verbose "Using region from config: ${AWS_REGION}"
    fi
  fi

  log "Authenticated as ${arn}"
}

# ══════════════════════════════════════════════════════════════════════
# ACCESS KEY AUDIT
# ══════════════════════════════════════════════════════════════════════

# Prints one row per access key of every IAM user and classifies it:
#   Inactive key                              -> INFO (not counted)
#   age >= KEY_AGE_CRIT_DAYS                  -> CRITICAL
#   age >= KEY_AGE_WARN_DAYS                  -> WARN
#   never used and at least 30 days old       -> WARN
#   otherwise                                 -> OK
# Globals written: TOTAL_PASS / TOTAL_WARN / TOTAL_CRIT (via pass/flag_*).
audit_access_keys() {
  log "Auditing access keys..."
  echo ""

  printf " %-24s %-22s %-10s %-10s %-20s %s\n" \
    "USER" "KEY_ID" "STATUS" "AGE_DAYS" "LAST_USED" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..100})"

  local users_json
  users_json=$(aws_cmd iam list-users --output json)

  local user_count
  user_count=$(jq '.Users | length' <<<"$users_json")

  if [[ "$user_count" -eq 0 ]]; then
    log "No IAM users found"
    return
  fi

  local user_name keys_json key_count key
  local key_id status create_date age_days
  local last_used_json last_used_date last_used_display severity color

  # Both loops read from process substitution instead of a pipe: a piped
  # `while` runs in a subshell and the counter updates would be discarded.
  while IFS= read -r user_name; do
    keys_json=$(aws_cmd iam list-access-keys --user-name "$user_name" --output json 2>/dev/null) || continue

    key_count=$(jq '.AccessKeyMetadata | length' <<<"$keys_json")
    if [[ "$key_count" -eq 0 ]]; then
      verbose "User ${user_name}: no access keys"
      continue
    fi

    while IFS= read -r key; do
      key_id=$(jq -r '.AccessKeyId' <<<"$key")
      status=$(jq -r '.Status' <<<"$key")
      create_date=$(jq -r '.CreateDate' <<<"$key")
      age_days=$(days_since "$create_date")

      # Last-used lookup is best effort; a failed call is shown as "Unknown"
      # rather than being mistaken for a key that was never used.
      last_used_json=$(aws_cmd iam get-access-key-last-used --access-key-id "$key_id" --output json 2>/dev/null) || last_used_json=""
      last_used_date=$(jq -r '.AccessKeyLastUsed.LastUsedDate // "N/A"' <<<"$last_used_json" 2>/dev/null) || last_used_date=""

      if [[ -z "$last_used_date" ]]; then
        last_used_display="Unknown"
      elif [[ "$last_used_date" == "N/A" ]]; then
        last_used_display="Never"
      else
        last_used_display="${last_used_date:0:10}"
      fi

      # Determine severity
      if [[ "$status" == "Inactive" ]]; then
        severity="INFO"
      elif [[ "$age_days" -ge "$KEY_AGE_CRIT_DAYS" ]]; then
        severity="CRITICAL"
        flag_crit
      elif [[ "$age_days" -ge "$KEY_AGE_WARN_DAYS" ]]; then
        severity="WARN"
        flag_warn
      elif [[ "$last_used_display" == "Never" && "$age_days" -ge 30 ]]; then
        severity="WARN"
        flag_warn
      else
        severity="OK"
        pass
      fi

      case "$severity" in
        CRITICAL) color="$RED" ;;
        WARN) color="$YELLOW" ;;
        OK) color="$GREEN" ;;
        *) color="" ;;
      esac

      printf " %-24s %-22s %-10s %-10s %-20s %b%s%b\n" \
        "$user_name" "$key_id" "$status" "$age_days" "$last_used_display" \
        "$color" "$severity" "$RESET"
    done < <(jq -c '.AccessKeyMetadata[]' <<<"$keys_json")
  done < <(jq -r '.Users[].UserName' <<<"$users_json")

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# MFA AUDIT
# ══════════════════════════════════════════════════════════════════════

# Prints one row per IAM user with console-access and MFA status.
# A user with a login profile but no MFA device is CRITICAL; everyone
# else is OK.
# Globals written: TOTAL_PASS / TOTAL_CRIT (via pass/flag_crit).
audit_mfa() {
  log "Auditing MFA status..."
  echo ""

  printf " %-24s %-18s %-14s %s\n" \
    "USER" "CONSOLE_ACCESS" "MFA_ENABLED" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..75})"

  local users_json
  users_json=$(aws_cmd iam list-users --output json)

  local user_name has_console has_mfa severity mfa_json mfa_count color

  # Process substitution keeps the loop in the current shell so the
  # counters survive (see audit_access_keys).
  while IFS= read -r user_name; do
    has_console="No"
    has_mfa="No"

    # A login profile exists only for users with a console password.
    if aws_cmd iam get-login-profile --user-name "$user_name" &>/dev/null; then
      has_console="Yes"
    fi

    # Check MFA devices; a failed lookup counts as "no device".
    mfa_json=$(aws_cmd iam list-mfa-devices --user-name "$user_name" --output json 2>/dev/null) || mfa_json=""
    mfa_count=$(jq '.MFADevices | length' <<<"$mfa_json" 2>/dev/null) || mfa_count=0
    mfa_count="${mfa_count:-0}"

    if [[ "$mfa_count" -gt 0 ]]; then
      has_mfa="Yes"
    fi

    # Severity: console access without MFA is critical
    if [[ "$has_console" == "Yes" && "$has_mfa" == "No" ]]; then
      severity="CRITICAL"
      flag_crit
    else
      severity="OK"
      pass
    fi

    case "$severity" in
      CRITICAL) color="$RED" ;;
      WARN) color="$YELLOW" ;;
      OK) color="$GREEN" ;;
      *) color="" ;;
    esac

    printf " %-24s %-18s %-14s %b%s%b\n" \
      "$user_name" "$has_console" "$has_mfa" \
      "$color" "$severity" "$RESET"
  done < <(jq -r '.Users[].UserName' <<<"$users_json")

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# UNUSED USERS AUDIT
# ══════════════════════════════════════════════════════════════════════

# Prints one row per IAM user with the last console login and the most
# recent API use across all of the user's access keys. A user is WARN only
# when BOTH console and API activity are absent or older than UNUSED_DAYS.
# Globals written: TOTAL_PASS / TOTAL_WARN (via pass/flag_warn).
audit_unused_users() {
  log "Auditing user activity (inactive > ${UNUSED_DAYS} days)..."
  echo ""

  printf " %-24s %-16s %-16s %s\n" \
    "USER" "LAST_CONSOLE" "LAST_API" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..75})"

  local users_json
  users_json=$(aws_cmd iam list-users --output json)

  local user user_name password_last_used last_console last_api severity
  local keys_json key_id lu latest_api
  local console_inactive api_inactive console_days api_days color

  # Loops read from process substitution, not a pipe: a piped `while` runs
  # in a subshell and would discard the pass/flag_warn counter updates.
  while IFS= read -r user; do
    user_name=$(jq -r '.UserName' <<<"$user")
    password_last_used=$(jq -r '.PasswordLastUsed // "N/A"' <<<"$user")

    if [[ "$password_last_used" == "N/A" ]]; then
      last_console="Never"
    else
      last_console="${password_last_used:0:10}"
    fi

    # Find the newest last-used date over all of the user's keys.
    # (Previously done with `... | head -1 | read -r latest_api`, which set
    # the variable in a subshell, so LAST_API was always "Never".)
    last_api="Never"
    latest_api=""
    keys_json=$(aws_cmd iam list-access-keys --user-name "$user_name" --output json 2>/dev/null) || keys_json=""

    while IFS= read -r key_id; do
      lu=$(aws_cmd iam get-access-key-last-used --access-key-id "$key_id" --output json 2>/dev/null |
        jq -r '.AccessKeyLastUsed.LastUsedDate // "N/A"' 2>/dev/null) || lu="N/A"
      if [[ -z "$lu" || "$lu" == "N/A" ]]; then
        continue
      fi
      lu="${lu:0:10}"
      # YYYY-MM-DD strings order chronologically under string comparison.
      if [[ "$lu" > "$latest_api" ]]; then
        latest_api="$lu"
      fi
    done < <(jq -r '.AccessKeyMetadata[].AccessKeyId' <<<"$keys_json" 2>/dev/null)

    if [[ -n "$latest_api" ]]; then
      last_api="$latest_api"
    fi

    # Determine severity
    severity="OK"
    console_inactive=false
    api_inactive=false

    if [[ "$last_console" == "Never" ]]; then
      console_inactive=true
    else
      console_days=$(days_since "$password_last_used")
      if [[ "$console_days" -ge "$UNUSED_DAYS" ]]; then
        console_inactive=true
      fi
    fi

    if [[ "$last_api" == "Never" ]]; then
      api_inactive=true
    else
      api_days=$(days_since "${last_api}T00:00:00Z")
      if [[ "$api_days" -ge "$UNUSED_DAYS" ]]; then
        api_inactive=true
      fi
    fi

    if [[ "$console_inactive" == "true" && "$api_inactive" == "true" ]]; then
      severity="WARN"
      flag_warn
    else
      pass
    fi

    case "$severity" in
      WARN) color="$YELLOW" ;;
      OK) color="$GREEN" ;;
      *) color="" ;;
    esac

    printf " %-24s %-16s %-16s %b%s%b\n" \
      "$user_name" "$last_console" "$last_api" \
      "$color" "$severity" "$RESET"
  done < <(jq -c '.Users[]' <<<"$users_json")

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# UNUSED ROLES AUDIT
# ══════════════════════════════════════════════════════════════════════

# Prints one row per customer-managed IAM role with its last-used date.
# Service-linked roles (path /aws-service-role/) and roles whose name starts
# with AWS/aws are skipped. A role is WARN when it was last used (or, if
# never used, created) at least UNUSED_DAYS ago.
# Globals written: TOTAL_PASS / TOTAL_WARN (via pass/flag_warn).
audit_unused_roles() {
  log "Auditing role usage (inactive > ${UNUSED_DAYS} days)..."
  echo ""

  printf " %-40s %-16s %-10s %s\n" \
    "ROLE" "LAST_USED" "AGE_DAYS" "SEVERITY"
  printf " %s\n" "$(printf '%.0s─' {1..80})"

  local roles_json
  roles_json=$(aws_cmd iam list-roles --output json)

  local role role_name role_path last_used create_date age_days severity
  local role_detail last_used_date used_days color

  while IFS= read -r role; do
    role_name=$(jq -r '.RoleName' <<<"$role")
    role_path=$(jq -r '.Path' <<<"$role")
    create_date=$(jq -r '.CreateDate' <<<"$role")
    age_days=$(days_since "$create_date")

    # Skip service-linked roles
    if [[ "$role_path" == /aws-service-role/* ]]; then
      verbose "Skipping service-linked role: ${role_name}"
      continue
    fi

    # Skip AWS-managed roles
    if [[ "$role_name" == AWS* || "$role_name" == aws* ]]; then
      verbose "Skipping AWS-managed role: ${role_name}"
      continue
    fi

    # ListRoles omits RoleLastUsed from its output, so ask GetRole for it.
    # Fall back to the list entry if the per-role call is not permitted.
    last_used_date=""
    role_detail=$(aws_cmd iam get-role --role-name "$role_name" --output json 2>/dev/null) || role_detail=""
    if [[ -n "$role_detail" ]]; then
      last_used_date=$(jq -r '.Role.RoleLastUsed.LastUsedDate // "N/A"' <<<"$role_detail" 2>/dev/null) || last_used_date=""
    fi
    if [[ -z "$last_used_date" ]]; then
      last_used_date=$(jq -r '.RoleLastUsed.LastUsedDate // "N/A"' <<<"$role")
    fi

    if [[ "$last_used_date" == "N/A" ]]; then
      last_used="Never"
      used_days="$age_days" # never used: judge by how long the role has existed
    else
      last_used="${last_used_date:0:10}"
      used_days=$(days_since "$last_used_date")
    fi

    if [[ "$used_days" -ge "$UNUSED_DAYS" ]]; then
      severity="WARN"
      flag_warn
    else
      severity="OK"
      pass
    fi

    case "$severity" in
      WARN) color="$YELLOW" ;;
      OK) color="$GREEN" ;;
      *) color="" ;;
    esac

    printf " %-40s %-16s %-10s %b%s%b\n" \
      "$role_name" "$last_used" "$age_days" \
      "$color" "$severity" "$RESET"
  done < <(jq -c '.Roles[]' <<<"$roles_json")

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# POLICY AUDIT
# ══════════════════════════════════════════════════════════════════════

# Reports risky policy usage:
#   - managed policy attached directly to a user            -> WARN
#   - AdministratorAccess attached to a user or a role      -> CRITICAL
#   - inline user policy with an Allow statement granting
#     Action "*" on Resource "*"                            -> CRITICAL
# Globals written: TOTAL_WARN / TOTAL_CRIT (via flag_warn/flag_crit).
audit_policies() {
  log "Auditing IAM policies..."
  echo ""

  printf " %-28s %-10s %-30s %s\n" \
    "ENTITY" "TYPE" "POLICY" "ISSUE"
  printf " %s\n" "$(printf '%.0s─' {1..90})"

  local users_json
  users_json=$(aws_cmd iam list-users --output json)

  local user_name attached policy_name inline pol_name pol_doc

  # Check users with direct policy attachments.
  # Process substitution keeps every loop in the current shell so the
  # counters are not lost in a pipeline subshell.
  while IFS= read -r user_name; do
    # Attached managed policies directly on user
    if attached=$(aws_cmd iam list-attached-user-policies --user-name "$user_name" --output json 2>/dev/null); then
      while IFS= read -r policy_name; do
        # Flag direct attachment
        printf " %-28s %-10s %-30s %b%s%b\n" \
          "$user_name" "User" "$policy_name" \
          "$YELLOW" "Direct attachment (use groups)" "$RESET"
        flag_warn

        # Flag admin access
        if [[ "$policy_name" == "AdministratorAccess" ]]; then
          printf " %-28s %-10s %-30s %b%s%b\n" \
            "$user_name" "User" "$policy_name" \
            "$RED" "Full admin access" "$RESET"
          flag_crit
        fi
      done < <(jq -r '.AttachedPolicies[].PolicyName' <<<"$attached" 2>/dev/null)
    fi

    # Inline policies on user
    inline=$(aws_cmd iam list-user-policies --user-name "$user_name" --output json 2>/dev/null) || continue

    while IFS= read -r pol_name; do
      pol_doc=$(aws_cmd iam get-user-policy --user-name "$user_name" --policy-name "$pol_name" --output json 2>/dev/null) || continue

      # True when a single Allow statement has both Action "*" and
      # Resource "*". Statement may be one object or an array, and
      # Action/Resource may each be a string or an array.
      if jq -e '
          any(
            (.PolicyDocument.Statement | if type == "array" then .[] else . end)
            | select(.Effect == "Allow");
            ([.Action] | flatten | any(. == "*"))
            and ([.Resource] | flatten | any(. == "*"))
          )' <<<"$pol_doc" >/dev/null 2>&1; then
        printf " %-28s %-10s %-30s %b%s%b\n" \
          "$user_name" "Inline" "$pol_name" \
          "$RED" "Action:* Resource:*" "$RESET"
        flag_crit
      fi
    done < <(jq -r '.PolicyNames[]' <<<"$inline" 2>/dev/null)
  done < <(jq -r '.Users[].UserName' <<<"$users_json")

  # Check roles with AdministratorAccess
  local roles_json
  roles_json=$(aws_cmd iam list-roles --output json)

  local role role_name role_path
  while IFS= read -r role; do
    role_name=$(jq -r '.RoleName' <<<"$role")
    role_path=$(jq -r '.Path' <<<"$role")

    if [[ "$role_path" == /aws-service-role/* ]]; then
      continue
    fi

    attached=$(aws_cmd iam list-attached-role-policies --role-name "$role_name" --output json 2>/dev/null) || continue

    while IFS= read -r policy_name; do
      if [[ "$policy_name" == "AdministratorAccess" ]]; then
        printf " %-28s %-10s %-30s %b%s%b\n" \
          "$role_name" "Role" "$policy_name" \
          "$RED" "Full admin access" "$RESET"
        flag_crit
      fi
    done < <(jq -r '.AttachedPolicies[].PolicyName' <<<"$attached" 2>/dev/null)
  done < <(jq -c '.Roles[]' <<<"$roles_json")

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# ROOT ACCOUNT AUDIT
# ══════════════════════════════════════════════════════════════════════

# Reports root access keys, root MFA and the account password policy.
# Root keys present or root MFA disabled are CRITICAL; a missing password
# policy or a minimum length below 14 is WARN.
# Returns early (after a warning) when the account summary is unavailable.
audit_root_account() {
  log "Auditing root account and account settings..."
  echo ""

  # Account summary
  local summary
  summary=$(aws_cmd iam get-account-summary --output json 2>/dev/null) || {
    warn "Unable to retrieve account summary (may require root or admin permissions)"
    return
  }

  # "// 0" keeps a missing key from reaching the numeric tests as "null".
  local root_keys root_mfa
  root_keys=$(jq '.SummaryMap.AccountAccessKeysPresent // 0' <<<"$summary")
  root_mfa=$(jq '.SummaryMap.AccountMFAEnabled // 0' <<<"$summary")

  echo " Root Account Status:"
  echo " $(printf '%.0s─' {1..50})"

  if [[ "$root_keys" -gt 0 ]]; then
    printf " %-30s %b%s%b\n" "Root access keys:" "$RED" "PRESENT (remove them!)" "$RESET"
    flag_crit
  else
    printf " %-30s %b%s%b\n" "Root access keys:" "$GREEN" "None" "$RESET"
    pass
  fi

  if [[ "$root_mfa" -eq 1 ]]; then
    printf " %-30s %b%s%b\n" "Root MFA:" "$GREEN" "Enabled" "$RESET"
    pass
  else
    printf " %-30s %b%s%b\n" "Root MFA:" "$RED" "DISABLED" "$RESET"
    flag_crit
  fi

  # Password policy
  echo ""
  log "Checking password policy..."

  local policy
  if policy=$(aws_cmd iam get-account-password-policy --output json 2>/dev/null); then
    local min_len require_upper require_lower require_numbers require_symbols max_age
    min_len=$(jq '.PasswordPolicy.MinimumPasswordLength // 0' <<<"$policy")
    require_upper=$(jq '.PasswordPolicy.RequireUppercaseCharacters' <<<"$policy")
    require_lower=$(jq '.PasswordPolicy.RequireLowercaseCharacters' <<<"$policy")
    require_numbers=$(jq '.PasswordPolicy.RequireNumbers' <<<"$policy")
    require_symbols=$(jq '.PasswordPolicy.RequireSymbols' <<<"$policy")
    max_age=$(jq '.PasswordPolicy.MaxPasswordAge // 0' <<<"$policy")

    printf " %-30s %s\n" "Min password length:" "$min_len"
    printf " %-30s %s\n" "Require uppercase:" "$require_upper"
    printf " %-30s %s\n" "Require lowercase:" "$require_lower"
    printf " %-30s %s\n" "Require numbers:" "$require_numbers"
    printf " %-30s %s\n" "Require symbols:" "$require_symbols"

    if [[ "$max_age" -gt 0 ]]; then
      printf " %-30s %s days\n" "Max password age:" "$max_age"
    else
      printf " %-30s %s\n" "Max password age:" "No expiry"
    fi

    if [[ "$min_len" -lt 14 ]]; then
      flag_warn
      printf "\n %b%s%b\n" "$YELLOW" "Recommendation: Set minimum password length to 14+" "$RESET"
    else
      pass
    fi
  else
    printf " %-30s %b%s%b\n" "Password policy:" "$YELLOW" "Not configured (using AWS defaults)" "$RESET"
    flag_warn
  fi

  echo ""
}

# ══════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════

# Prints the PASS/WARN/CRITICAL totals, the elapsed time since START_TIME,
# and a short recommendation list chosen by the worst severity found.
# Globals read: START_TIME, TOTAL_PASS, TOTAL_WARN, TOTAL_CRIT,
#               KEY_AGE_WARN_DAYS, KEY_AGE_CRIT_DAYS.
print_summary() {
  local elapsed
  elapsed=$(($(now_epoch) - START_TIME))

  echo ""
  echo " ══════════════════════════════════════════"
  echo " IAM Audit Summary"
  echo " ══════════════════════════════════════════"
  printf " %-20s %b%d%b\n" "PASS:" "$GREEN" "$TOTAL_PASS" "$RESET"
  printf " %-20s %b%d%b\n" "WARN:" "$YELLOW" "$TOTAL_WARN" "$RESET"
  printf " %-20s %b%d%b\n" "CRITICAL:" "$RED" "$TOTAL_CRIT" "$RESET"
  echo " ──────────────────────────────────────────"
  printf " Completed in %ds\n" "$elapsed"
  echo ""

  if [[ "$TOTAL_CRIT" -gt 0 ]]; then
    echo -e " ${RED}${BOLD}Action required:${RESET} ${TOTAL_CRIT} critical finding(s)"
    echo ""
    echo " Top recommendations:"
    echo " • Remove root access keys immediately"
    echo " • Enable MFA for all console users"
    echo " • Rotate access keys older than ${KEY_AGE_CRIT_DAYS} days"
    echo " • Replace Action:*/Resource:* policies with least-privilege"
    echo ""
  elif [[ "$TOTAL_WARN" -gt 0 ]]; then
    echo -e " ${YELLOW}Review recommended:${RESET} ${TOTAL_WARN} warning(s)"
    echo ""
    echo " Suggestions:"
    echo " • Rotate access keys older than ${KEY_AGE_WARN_DAYS} days"
    echo " • Remove or disable inactive users"
    echo " • Use groups instead of direct policy attachments"
    echo ""
  else
    echo -e " ${GREEN}All checks passed${RESET}"
    echo ""
  fi
}

# ══════════════════════════════════════════════════════════════════════
# USAGE
══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2 + exit 1 ;; + esac + done + + if [[ ${#modes[@]} -eq 0 ]]; then + err "No audit mode specified" + echo "Run ${SCRIPT_NAME} --help for usage" >&2 + exit 1 + fi + + RUN_MODE="${modes[*]}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + check_deps + + START_TIME=$(now_epoch) + + echo "" + echo -e "${BOLD}IAM Audit${RESET}" + echo -e "Mode: ${RUN_MODE}" + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo -e "Thresholds: key-warn=${KEY_AGE_WARN_DAYS}d key-crit=${KEY_AGE_CRIT_DAYS}d unused=${UNUSED_DAYS}d" + echo "" + + for mode in $RUN_MODE; do + case "$mode" in + keys) audit_access_keys ;; + mfa) audit_mfa ;; + users) audit_unused_users ;; + roles) audit_unused_roles ;; + policies) audit_policies ;; + root) audit_root_account ;; + esac + done + + print_summary + + if [[ "$TOTAL_CRIT" -gt 0 ]]; then + exit 2 + elif [[ "$TOTAL_WARN" -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" diff --git a/iis-metrics-exporter.ps1 b/iis-metrics-exporter.ps1 new file mode 100644 index 0000000..7d348c8 --- /dev/null +++ b/iis-metrics-exporter.ps1 @@ -0,0 +1,862 @@ +<# +.SYNOPSIS + IIS Prometheus Metrics Exporter +.DESCRIPTION + Prometheus exporter for IIS (Internet Information Services). Collects metrics + via PowerShell WebAdministration module, Windows performance counters, and + IIS log analysis. Outputs Prometheus-compatible text format for consumption + by windows_exporter textfile collector. 
+.PARAMETER Mode + Output mode: 'stdout' (default), 'textfile', or 'http' +.PARAMETER Port + HTTP port for http mode (default: 9210) +.PARAMETER TextfileDir + Directory for textfile collector output (default: C:\ProgramData\node_exporter) +.PARAMETER OutputFile + Custom output file path +.PARAMETER InstallScheduledTask + Switch to create a scheduled task for auto-start on system boot +.PARAMETER TaskIntervalMinutes + Interval in minutes for the scheduled task (default: 5) +.NOTES + Author: Phil Connor + Contact: contact@mylinux.work + Website: https://mylinux.work + License: MIT + Version: 1.0 + + Metrics Exported: + Core Status: + - iis_up + - iis_exporter_info{version} + + Site Status: + - iis_site_state{site} + - iis_site_binding_info{site,protocol,address,port,hostname} + + Application Pools: + - iis_apppool_state{apppool} + - iis_apppool_worker_process_count{apppool} + - iis_apppool_recycle_count{apppool} + + Request Throughput: + - iis_current_connections{site} + - iis_requests_per_second{site} + - iis_bytes_sent_per_second{site} + - iis_bytes_received_per_second{site} + - iis_total_bytes_sent{site} + - iis_total_bytes_received{site} + - iis_requests_total{site} + + HTTP Methods: + - iis_get_requests_total{site} + - iis_post_requests_total{site} + - iis_put_requests_total{site} + - iis_delete_requests_total{site} + + HTTP Status Codes: + - iis_status_2xx_total{site} + - iis_status_3xx_total{site} + - iis_status_4xx_total{site} + - iis_status_5xx_total{site} + + SSL/TLS: + - iis_ssl_connections_current + - iis_ssl_connections_per_second + - iis_ssl_handshake_failures + + Worker Processes: + - iis_worker_cpu_percent{apppool} + - iis_worker_memory_bytes{apppool} + - iis_worker_active_requests{apppool} + - iis_worker_uptime_seconds{apppool} + + Cache: + - iis_cache_output_entries + - iis_cache_output_hits + - iis_cache_output_misses + - iis_cache_uri_hits + - iis_cache_uri_misses + - iis_cache_kernel_hits + + Failed Requests: + - iis_failed_requests_500_last_hour 
+ - iis_failed_requests_total{site} + + Exporter: + - iis_exporter_duration_seconds + - iis_exporter_last_run_timestamp +#> + +param( + [ValidateSet('stdout', 'textfile', 'http')] + [string]$Mode = 'stdout', + + [int]$Port = 9210, + + [string]$TextfileDir = 'C:\ProgramData\node_exporter', + + [string]$OutputFile, + + [switch]$InstallScheduledTask, + + [int]$TaskIntervalMinutes = 5 +) + +# Create a scheduled task to run this script every $TaskIntervalMinutes minutes +# The task will run as SYSTEM and will be set to run at startup +if ($InstallScheduledTask) { + $taskName = "IISMetricsExporter" + $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + + if (-not $existingTask) { + $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`" -Mode textfile" + + if (-not $TaskIntervalMinutes -or $TaskIntervalMinutes -le 0) { + throw "TaskIntervalMinutes must be a positive integer" + } + + $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365) + $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest + + try { + Write-Host "Creating scheduled task: $taskName" + Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports IIS metrics for Prometheus every $TaskIntervalMinutes minutes" + + $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + if (-not $createdTask) { + throw "Failed to verify scheduled task creation" + } + Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green + } catch { + Write-Error "Failed to create auto-start task: $($_.Exception.Message)" + throw + } + } else { + Write-Host "Scheduled task '$taskName' already exists, skipping 
creation" + } +} + +$ErrorActionPreference = 'SilentlyContinue' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +function Get-UnixTimestamp { + [int][double]::Parse((Get-Date -UFormat '%s')) +} + +function Format-MetricValue { + param([double]$Value, [int]$Decimals = 2) + [math]::Round($Value, $Decimals) +} + +function Get-SafeCounter { + param([string]$CounterPath) + try { + $counter = Get-Counter -Counter $CounterPath -ErrorAction Stop + if ($counter -and $counter.CounterSamples) { + return [math]::Max(0, [math]::Round($counter.CounterSamples[0].CookedValue, 2)) + } + } catch {} + return 0 +} + +function Get-SafeCounterMulti { + param([string]$CounterPath) + try { + $counter = Get-Counter -Counter $CounterPath -ErrorAction Stop + if ($counter -and $counter.CounterSamples) { + return $counter.CounterSamples + } + } catch {} + return @() +} + +# ============================================================================ +# SITE STATUS +# ============================================================================ + +function Get-SiteStatusMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + Import-Module WebAdministration -ErrorAction Stop + $sites = Get-Website -ErrorAction Stop + + [void]$sb.AppendLine('# HELP iis_site_state Site state (1=Started, 0=Stopped)') + [void]$sb.AppendLine('# TYPE iis_site_state gauge') + foreach ($site in $sites) { + $name = $site.Name + $state = if ($site.State -eq 'Started') { 1 } else { 0 } + [void]$sb.AppendLine("iis_site_state{site=`"$name`"} $state") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_site_binding_info Site binding information (always 1)') + [void]$sb.AppendLine('# TYPE iis_site_binding_info gauge') + foreach ($site in $sites) { + $name = $site.Name + foreach ($binding in $site.Bindings.Collection) { + $proto = $binding.protocol + $info = 
$binding.bindingInformation -split ':' + $addr = if ($info[0]) { $info[0] } else { '*' } + $port = if ($info[1]) { $info[1] } else { '80' } + $host = if ($info[2]) { $info[2] } else { '' } + [void]$sb.AppendLine("iis_site_binding_info{site=`"$name`",protocol=`"$proto`",address=`"$addr`",port=`"$port`",hostname=`"$host`"} 1") + } + } + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect site status metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# APPLICATION POOLS +# ============================================================================ + +function Get-AppPoolMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + Import-Module WebAdministration -ErrorAction Stop + $pools = Get-ChildItem IIS:\AppPools -ErrorAction Stop + + [void]$sb.AppendLine('# HELP iis_apppool_state Application pool state (1=Started, 0=Stopped)') + [void]$sb.AppendLine('# TYPE iis_apppool_state gauge') + foreach ($pool in $pools) { + $name = $pool.Name + $state = if ($pool.State -eq 'Started') { 1 } else { 0 } + [void]$sb.AppendLine("iis_apppool_state{apppool=`"$name`"} $state") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_apppool_worker_process_count Number of worker processes per pool') + [void]$sb.AppendLine('# TYPE iis_apppool_worker_process_count gauge') + foreach ($pool in $pools) { + $name = $pool.Name + $wpCount = 0 + try { + $workers = Get-ChildItem "IIS:\AppPools\$name\WorkerProcesses" -ErrorAction Stop + $wpCount = @($workers).Count + } catch {} + [void]$sb.AppendLine("iis_apppool_worker_process_count{apppool=`"$name`"} $wpCount") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_apppool_recycle_count Application pool recycle count') + [void]$sb.AppendLine('# TYPE iis_apppool_recycle_count gauge') + foreach ($pool in $pools) { + $name = $pool.Name + $recycleCount = Get-SafeCounter "\W3SVC_W3WP(*$name*)\Total Threads" + $recycleVal = 0 
+ try { + $recycleEvents = Get-WinEvent -FilterHashtable @{ + LogName = 'System' + ProviderName = 'WAS' + Id = 5074, 5075, 5076, 5077, 5078, 5079, 5080, 5186 + StartTime = (Get-Date).AddHours(-24) + } -ErrorAction Stop | Where-Object { $_.Message -like "*$name*" } + $recycleVal = @($recycleEvents).Count + } catch {} + [void]$sb.AppendLine("iis_apppool_recycle_count{apppool=`"$name`"} $recycleVal") + } + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect application pool metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# REQUEST THROUGHPUT +# ============================================================================ + +function Get-RequestThroughputMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $connSamples = Get-SafeCounterMulti '\Web Service(*)\Current Connections' + if ($connSamples.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_current_connections Current active connections per site') + [void]$sb.AppendLine('# TYPE iis_current_connections gauge') + foreach ($sample in $connSamples) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = [math]::Max(0, [math]::Round($sample.CookedValue)) + [void]$sb.AppendLine("iis_current_connections{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $reqSamples = Get-SafeCounterMulti '\Web Service(*)\Total Method Requests/sec' + if ($reqSamples.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_requests_per_second HTTP requests per second per site') + [void]$sb.AppendLine('# TYPE iis_requests_per_second gauge') + foreach ($sample in $reqSamples) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = Format-MetricValue $sample.CookedValue + [void]$sb.AppendLine("iis_requests_per_second{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $bytesSentSamples = 
Get-SafeCounterMulti '\Web Service(*)\Total Bytes Sent/sec' + if ($bytesSentSamples.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_bytes_sent_per_second Bytes sent per second per site') + [void]$sb.AppendLine('# TYPE iis_bytes_sent_per_second gauge') + foreach ($sample in $bytesSentSamples) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = Format-MetricValue $sample.CookedValue + [void]$sb.AppendLine("iis_bytes_sent_per_second{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $bytesRecvSamples = Get-SafeCounterMulti '\Web Service(*)\Total Bytes Received/sec' + if ($bytesRecvSamples.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_bytes_received_per_second Bytes received per second per site') + [void]$sb.AppendLine('# TYPE iis_bytes_received_per_second gauge') + foreach ($sample in $bytesRecvSamples) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = Format-MetricValue $sample.CookedValue + [void]$sb.AppendLine("iis_bytes_received_per_second{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $totalBytesSent = Get-SafeCounterMulti '\Web Service(*)\Total Bytes Sent' + if ($totalBytesSent.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_total_bytes_sent Total bytes sent per site') + [void]$sb.AppendLine('# TYPE iis_total_bytes_sent counter') + foreach ($sample in $totalBytesSent) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = [math]::Max(0, [math]::Round($sample.CookedValue)) + [void]$sb.AppendLine("iis_total_bytes_sent{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $totalBytesRecv = Get-SafeCounterMulti '\Web Service(*)\Total Bytes Received' + if ($totalBytesRecv.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_total_bytes_received Total bytes received per site') + [void]$sb.AppendLine('# TYPE 
iis_total_bytes_received counter') + foreach ($sample in $totalBytesRecv) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = [math]::Max(0, [math]::Round($sample.CookedValue)) + [void]$sb.AppendLine("iis_total_bytes_received{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + + $totalReqs = Get-SafeCounterMulti '\Web Service(*)\Total Method Requests' + if ($totalReqs.Count -gt 0) { + [void]$sb.AppendLine('# HELP iis_requests_total Total requests per site') + [void]$sb.AppendLine('# TYPE iis_requests_total counter') + foreach ($sample in $totalReqs) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = [math]::Max(0, [math]::Round($sample.CookedValue)) + [void]$sb.AppendLine("iis_requests_total{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect request throughput metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# HTTP METHODS +# ============================================================================ + +function Get-HttpMethodMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $methods = @{ + 'get' = '\Web Service(*)\Total Get Requests' + 'post' = '\Web Service(*)\Total Post Requests' + 'put' = '\Web Service(*)\Total Put Requests' + 'delete' = '\Web Service(*)\Total Delete Requests' + } + + foreach ($method in $methods.GetEnumerator()) { + $samples = Get-SafeCounterMulti $method.Value + if ($samples.Count -gt 0) { + [void]$sb.AppendLine("# HELP iis_$($method.Key)_requests_total Total $($method.Key.ToUpper()) requests per site") + [void]$sb.AppendLine("# TYPE iis_$($method.Key)_requests_total counter") + foreach ($sample in $samples) { + $instance = ($sample.Path -split '\\')[-2] -replace '[()]', '' + if ($instance -eq '_total') { continue } + $val = [math]::Max(0, 
[math]::Round($sample.CookedValue)) + [void]$sb.AppendLine("iis_$($method.Key)_requests_total{site=`"$instance`"} $val") + } + [void]$sb.AppendLine('') + } + } + } + catch { + Write-Warning "Failed to collect HTTP method metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# HTTP STATUS CODES (from IIS log analysis) +# ============================================================================ + +function Get-HttpStatusMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + Import-Module WebAdministration -ErrorAction Stop + $sites = Get-Website -ErrorAction Stop + $oneHourAgo = (Get-Date).AddHours(-1) + + $siteStatusCounts = @{} + + foreach ($site in $sites) { + $logDir = "$($site.logFile.directory)\W3SVC$($site.id)" -replace '%SystemDrive%', $env:SystemDrive + $siteStatusCounts[$site.Name] = @{ '2xx' = 0; '3xx' = 0; '4xx' = 0; '5xx' = 0 } + + if (-not (Test-Path $logDir)) { continue } + + $logFile = Get-ChildItem $logDir -Filter '*.log' -ErrorAction Stop | + Sort-Object LastWriteTime -Descending | + Select-Object -First 1 + + if (-not $logFile) { continue } + + $reader = [System.IO.StreamReader]::new($logFile.FullName) + try { + while ($null -ne ($line = $reader.ReadLine())) { + if ($line.StartsWith('#')) { continue } + $fields = $line -split '\s+' + if ($fields.Count -lt 4) { continue } + + try { + $logDate = [datetime]::ParseExact("$($fields[0]) $($fields[1])", 'yyyy-MM-dd HH:mm:ss', $null) + if ($logDate -lt $oneHourAgo) { continue } + } catch { continue } + + $statusIdx = if ($fields.Count -ge 12) { 11 } elseif ($fields.Count -ge 10) { 9 } else { continue } + $statusCode = 0 + if ([int]::TryParse($fields[$statusIdx], [ref]$statusCode)) { + if ($statusCode -ge 200 -and $statusCode -lt 300) { $siteStatusCounts[$site.Name]['2xx']++ } + elseif ($statusCode -ge 300 -and $statusCode -lt 400) { $siteStatusCounts[$site.Name]['3xx']++ } + elseif ($statusCode -ge 400 -and $statusCode -lt 
500) { $siteStatusCounts[$site.Name]['4xx']++ } + elseif ($statusCode -ge 500 -and $statusCode -lt 600) { $siteStatusCounts[$site.Name]['5xx']++ } + } + } + } + finally { + $reader.Close() + } + } + + foreach ($code in @('2xx', '3xx', '4xx', '5xx')) { + [void]$sb.AppendLine("# HELP iis_status_${code}_total Total ${code} responses per site (last hour)") + [void]$sb.AppendLine("# TYPE iis_status_${code}_total gauge") + foreach ($siteName in $siteStatusCounts.Keys) { + [void]$sb.AppendLine("iis_status_${code}_total{site=`"$siteName`"} $($siteStatusCounts[$siteName][$code])") + } + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect HTTP status code metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# SSL/TLS +# ============================================================================ + +function Get-SslMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $sslCurrent = Get-SafeCounter '\Web Service(_Total)\Current ISAPI Extension Requests' + $sslTotal = Get-SafeCounter '\Web Service(_Total)\Total Connection Attempts (all instances)' + + [void]$sb.AppendLine('# HELP iis_ssl_connections_current Current SSL connections') + [void]$sb.AppendLine('# TYPE iis_ssl_connections_current gauge') + [void]$sb.AppendLine("iis_ssl_connections_current $sslCurrent") + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_ssl_connections_per_second SSL connections per second') + [void]$sb.AppendLine('# TYPE iis_ssl_connections_per_second gauge') + $sslPerSec = Get-SafeCounter '\Web Service(_Total)\Connection Attempts/sec' + [void]$sb.AppendLine("iis_ssl_connections_per_second $sslPerSec") + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_ssl_handshake_failures SSL/TLS handshake failures') + [void]$sb.AppendLine('# TYPE iis_ssl_handshake_failures counter') + $sslFailures = 0 + try { + $sslEvents = Get-WinEvent -FilterHashtable @{ + LogName = 'System' + 
ProviderName = 'Schannel' + Level = 2, 3 + StartTime = (Get-Date).AddHours(-24) + } -ErrorAction Stop + $sslFailures = @($sslEvents).Count + } catch {} + [void]$sb.AppendLine("iis_ssl_handshake_failures $sslFailures") + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect SSL metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# WORKER PROCESSES +# ============================================================================ + +function Get-WorkerProcessMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + Import-Module WebAdministration -ErrorAction Stop + $pools = Get-ChildItem IIS:\AppPools -ErrorAction Stop + + [void]$sb.AppendLine('# HELP iis_worker_cpu_percent Worker process CPU usage percentage per pool') + [void]$sb.AppendLine('# TYPE iis_worker_cpu_percent gauge') + + $cpuLines = [System.Text.StringBuilder]::new() + $memLines = [System.Text.StringBuilder]::new() + $reqLines = [System.Text.StringBuilder]::new() + $upLines = [System.Text.StringBuilder]::new() + + foreach ($pool in $pools) { + $name = $pool.Name + try { + $workers = Get-ChildItem "IIS:\AppPools\$name\WorkerProcesses" -ErrorAction Stop + foreach ($worker in $workers) { + $pid = $worker.processId + if ($pid -and $pid -gt 0) { + $proc = Get-Process -Id $pid -ErrorAction Stop + $cpuPercent = Format-MetricValue $proc.CPU + $memBytes = $proc.WorkingSet64 + $uptimeSeconds = [math]::Round(((Get-Date) - $proc.StartTime).TotalSeconds) + [void]$cpuLines.AppendLine("iis_worker_cpu_percent{apppool=`"$name`"} $cpuPercent") + [void]$memLines.AppendLine("iis_worker_memory_bytes{apppool=`"$name`"} $memBytes") + [void]$upLines.AppendLine("iis_worker_uptime_seconds{apppool=`"$name`"} $uptimeSeconds") + } + } + + $activeReqs = Get-SafeCounter "\W3SVC_W3WP(*$name*)\Active Requests" + [void]$reqLines.AppendLine("iis_worker_active_requests{apppool=`"$name`"} $activeReqs") + } catch {} + } + + [void]$sb.AppendLine('# 
HELP iis_worker_cpu_percent Worker process CPU time per pool') + [void]$sb.AppendLine('# TYPE iis_worker_cpu_percent gauge') + [void]$sb.Append($cpuLines.ToString()) + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_worker_memory_bytes Worker process working set in bytes per pool') + [void]$sb.AppendLine('# TYPE iis_worker_memory_bytes gauge') + [void]$sb.Append($memLines.ToString()) + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_worker_active_requests Active requests per worker process per pool') + [void]$sb.AppendLine('# TYPE iis_worker_active_requests gauge') + [void]$sb.Append($reqLines.ToString()) + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_worker_uptime_seconds Worker process uptime in seconds per pool') + [void]$sb.AppendLine('# TYPE iis_worker_uptime_seconds gauge') + [void]$sb.Append($upLines.ToString()) + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect worker process metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# CACHE +# ============================================================================ + +function Get-CacheMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $cacheMetrics = @( + @{ Name = 'iis_cache_output_entries'; Counter = '\Web Service Cache\Current Files Cached'; Help = 'Current output cache entries' }, + @{ Name = 'iis_cache_output_hits'; Counter = '\Web Service Cache\File Cache Hits'; Help = 'Output cache hit count' }, + @{ Name = 'iis_cache_output_misses'; Counter = '\Web Service Cache\File Cache Misses'; Help = 'Output cache miss count' }, + @{ Name = 'iis_cache_uri_hits'; Counter = '\Web Service Cache\URI Cache Hits'; Help = 'URI cache hit count' }, + @{ Name = 'iis_cache_uri_misses'; Counter = '\Web Service Cache\URI Cache Misses'; Help = 'URI cache miss count' }, + @{ Name = 'iis_cache_kernel_hits'; Counter = '\Web Service Cache\Kernel: URI Cache Hits'; Help = 'Kernel 
cache hit count' } + ) + + foreach ($metric in $cacheMetrics) { + $val = Get-SafeCounter $metric.Counter + [void]$sb.AppendLine("# HELP $($metric.Name) $($metric.Help)") + [void]$sb.AppendLine("# TYPE $($metric.Name) gauge") + [void]$sb.AppendLine("$($metric.Name) $val") + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect cache metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# FAILED REQUESTS +# ============================================================================ + +function Get-FailedRequestMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + Import-Module WebAdministration -ErrorAction Stop + $sites = Get-Website -ErrorAction Stop + $oneHourAgo = (Get-Date).AddHours(-1) + $total500 = 0 + + [void]$sb.AppendLine('# HELP iis_failed_requests_total Total failed requests per site (from event log)') + [void]$sb.AppendLine('# TYPE iis_failed_requests_total gauge') + + foreach ($site in $sites) { + $logDir = "$($site.logFile.directory)\W3SVC$($site.id)" -replace '%SystemDrive%', $env:SystemDrive + $count500 = 0 + + if (Test-Path $logDir) { + $logFile = Get-ChildItem $logDir -Filter '*.log' -ErrorAction SilentlyContinue | + Sort-Object LastWriteTime -Descending | + Select-Object -First 1 + + if ($logFile) { + $reader = [System.IO.StreamReader]::new($logFile.FullName) + try { + while ($null -ne ($line = $reader.ReadLine())) { + if ($line.StartsWith('#')) { continue } + $fields = $line -split '\s+' + if ($fields.Count -lt 4) { continue } + + try { + $logDate = [datetime]::ParseExact("$($fields[0]) $($fields[1])", 'yyyy-MM-dd HH:mm:ss', $null) + if ($logDate -lt $oneHourAgo) { continue } + } catch { continue } + + $statusIdx = if ($fields.Count -ge 12) { 11 } elseif ($fields.Count -ge 10) { 9 } else { continue } + $statusCode = 0 + if ([int]::TryParse($fields[$statusIdx], [ref]$statusCode)) { + if ($statusCode -ge 500) { $count500++ } + } + } + } + finally 
{ + $reader.Close() + } + } + } + + $total500 += $count500 + [void]$sb.AppendLine("iis_failed_requests_total{site=`"$($site.Name)`"} $count500") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP iis_failed_requests_500_last_hour HTTP 500 errors in the last hour') + [void]$sb.AppendLine('# TYPE iis_failed_requests_500_last_hour gauge') + [void]$sb.AppendLine("iis_failed_requests_500_last_hour $total500") + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect failed request metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# COLLECT ALL METRICS +# ============================================================================ + +function Get-AllMetrics { + $scriptStart = Get-Date + $sb = [System.Text.StringBuilder]::new() + + # Exporter up + [void]$sb.AppendLine('# HELP iis_up Exporter status (1=up, 0=down)') + [void]$sb.AppendLine('# TYPE iis_up gauge') + [void]$sb.AppendLine('iis_up 1') + [void]$sb.AppendLine('') + + # Exporter info + [void]$sb.AppendLine('# HELP iis_exporter_info Exporter version information') + [void]$sb.AppendLine('# TYPE iis_exporter_info gauge') + [void]$sb.AppendLine('iis_exporter_info{version="1.0"} 1') + [void]$sb.AppendLine('') + + # Collect all sections + [void]$sb.Append((Get-SiteStatusMetrics)) + [void]$sb.Append((Get-AppPoolMetrics)) + [void]$sb.Append((Get-RequestThroughputMetrics)) + [void]$sb.Append((Get-HttpMethodMetrics)) + [void]$sb.Append((Get-HttpStatusMetrics)) + [void]$sb.Append((Get-SslMetrics)) + [void]$sb.Append((Get-WorkerProcessMetrics)) + [void]$sb.Append((Get-CacheMetrics)) + [void]$sb.Append((Get-FailedRequestMetrics)) + + # Exporter runtime + $scriptEnd = Get-Date + $duration = Format-MetricValue ($scriptEnd - $scriptStart).TotalSeconds + $timestamp = Get-UnixTimestamp + + [void]$sb.AppendLine('# HELP iis_exporter_duration_seconds Time to generate all metrics') + [void]$sb.AppendLine('# TYPE 
iis_exporter_duration_seconds gauge') + [void]$sb.AppendLine("iis_exporter_duration_seconds $duration") + [void]$sb.AppendLine('') + [void]$sb.AppendLine('# HELP iis_exporter_last_run_timestamp Unix timestamp of last successful run') + [void]$sb.AppendLine('# TYPE iis_exporter_last_run_timestamp gauge') + [void]$sb.AppendLine("iis_exporter_last_run_timestamp $timestamp") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# HTTP SERVER MODE +# ============================================================================ + +function Start-HttpServer { + param([int]$ListenPort) + + $prefix = "http://+:$ListenPort/" + $listener = [System.Net.HttpListener]::new() + $listener.Prefixes.Add($prefix) + + try { + $listener.Start() + Write-Host "Starting IIS metrics exporter on port $ListenPort..." -ForegroundColor Green + Write-Host "Metrics available at http://localhost:$ListenPort/metrics" + + while ($listener.IsListening) { + $context = $listener.GetContext() + $request = $context.Request + $response = $context.Response + + if ($request.Url.AbsolutePath -eq '/metrics') { + $metrics = Get-AllMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($metrics) + $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8' + } + else { + $html = @" + + +IIS Metrics Exporter v1.0 + +

IIS Metrics Exporter v1.0

+

Metrics

+

Sections

+
    +
  • Site status and bindings
  • +
  • Application pool state and worker process count
  • +
  • Request throughput (connections, requests/sec, bytes)
  • +
  • HTTP method counts (GET, POST, PUT, DELETE)
  • +
  • HTTP status codes (2xx, 3xx, 4xx, 5xx from log analysis)
  • +
  • SSL/TLS connections and handshake failures
  • +
  • Worker process CPU, memory, active requests, uptime
  • +
  • Cache hit/miss ratios (output, URI, kernel)
  • +
  • Failed requests (500 errors from log analysis)
  • +
+ + +"@ + $buffer = [System.Text.Encoding]::UTF8.GetBytes($html) + $response.ContentType = 'text/html; charset=utf-8' + } + + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.OutputStream.Close() + } + } + catch { + Write-Error "HTTP server error: $_" + Write-Error "If access denied, run: netsh http add urlacl url=http://+:$ListenPort/ user=Everyone" + } + finally { + if ($listener.IsListening) { + $listener.Stop() + } + } +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +switch ($Mode) { + 'http' { + Start-HttpServer -ListenPort $Port + } + 'textfile' { + if (-not $OutputFile) { + $OutputFile = Join-Path $TextfileDir 'iis_metrics.prom' + } + + $outputDir = Split-Path $OutputFile -Parent + if (-not (Test-Path $outputDir)) { + New-Item -Path $outputDir -ItemType Directory -Force | Out-Null + } + + $tempFile = Join-Path $outputDir ".iis_metrics.$PID.tmp" + + try { + $metrics = Get-AllMetrics + $metrics | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + + $lineCount = ($metrics -split "`n").Count + if ($lineCount -lt 10) { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Metrics file too small ($lineCount lines), keeping previous" + exit 1 + } + + Move-Item -Path $tempFile -Destination $OutputFile -Force + Write-Host "Metrics written to $OutputFile ($lineCount lines)" -ForegroundColor Green + } + catch { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Failed to generate metrics: $_" + exit 1 + } + } + default { + Get-AllMetrics | Write-Output + } +} diff --git a/immich-migration.sh b/immich-migration.sh new file mode 100644 index 0000000..4c6977e --- /dev/null +++ b/immich-migration.sh @@ -0,0 +1,530 @@ +#!/usr/bin/env bash + 
##########################################################################################
#### immich-migration.sh — Pre-process and bulk upload photos/videos to Immich        ####
#### Google Takeout EXIF repair, date fixing, HEIC conversion, duplicate detection,   ####
#### folder-based album creation, progress tracking, and immich-cli upload            ####
#### Requires: bash 4+, immich-cli, exiftool, jq                                      ####
####                                                                                  ####
#### Author: Phil Connor                                                              ####
#### Contact: contact@mylinux.work                                                    ####
#### License: MIT                                                                     ####
#### Version 1.00                                                                     ####
####                                                                                  ####
#### Usage:                                                                           ####
####   ./immich-migration.sh --source ~/Photos --server URL --api-key KEY             ####
####                                                                                  ####
#### See --help for all options.                                                      ####
##########################################################################################

set -euo pipefail

# ── Colors ────────────────────────────────────────────────────────────
# Pre-initialised to empty strings so the logging helpers are safe under
# `set -u` even when setup_colors is never called or stdout is not a TTY.
RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET=""

#######################################
# Enable ANSI colour escapes when stdout is a terminal.
# Globals:   RED GREEN YELLOW BLUE CYAN BOLD DIM RESET (written)
# Arguments: none
# Outputs:   nothing
#######################################
setup_colors() {
    if [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Each helper takes the message as $1. The colour variables hold backslash
# escapes (e.g. '\033[0;36m'), so they are passed through %b instead of being
# spliced into the printf format string; the bytes written are unchanged.
log_info()  { printf '%b[INFO]%b %s\n'  "$CYAN"   "$RESET" "$1"; }
log_ok()    { printf '%b[OK]%b %s\n'    "$GREEN"  "$RESET" "$1"; }
log_warn()  { printf '%b[WARN]%b %s\n'  "$YELLOW" "$RESET" "$1"; }
# Errors go to stderr.
log_error() { printf '%b[ERROR]%b %s\n' "$RED"    "$RESET" "$1" >&2; }
# Section heading surrounded by blank lines.
log_step()  { printf '\n%b── %s ──%b\n\n' "$BOLD" "$1" "$RESET"; }

#######################################
# Append one timestamped record to the log file, if one is configured.
# Globals:   LOG_FILE (read)
# Arguments: $1 - status tag (padded to 8 columns), $2 - message / file path
# Outputs:   appends "<ISO timestamp> <tag> <message>" to $LOG_FILE
# Returns:   0 when no log file is configured; otherwise printf's status
#######################################
write_log() {
    # An explicit `if` is required here: with `[[ ... ]] && printf ...` the
    # function returns 1 whenever LOG_FILE is empty, and because callers invoke
    # write_log as a plain statement, `set -e` would then abort the script.
    if [[ -n "${LOG_FILE:-}" ]]; then
        printf '%s %-8s %s\n' "$(date +%Y-%m-%dT%H:%M:%S)" "$1" "$2" >> "$LOG_FILE"
    fi
}

# ── Defaults ──────────────────────────────────────────────────────────
SOURCE_DIR=""
SERVER_URL=""
API_KEY=""
DRY_RUN=false
ALBUM_FROM_FOLDER=false
FIX_DATES=false
SKIP_DUPLICATES=false
SKIP_HEIC_CONVERT=true
LOG_FILE=""
IMMICH_CMD=""

UPLOAD_LOG=""
WORK_DIR=""
COUNT_TOTAL=0
COUNT_UPLOADED=0
COUNT_SKIPPED=0
COUNT_FAILED=0
+START_TIME=0 + +# ── Usage ───────────────────────────────────────────────────────────── +usage() { + cat </dev/null; then + IMMICH_CMD="immich" + elif command -v immich-cli &>/dev/null; then + IMMICH_CMD="immich-cli" + else + log_error "immich-cli not found. Install with: npm install -g @immich/cli" + exit 1 + fi + log_ok "immich-cli found: $IMMICH_CMD" + + for cmd in exiftool jq; do + if ! command -v "$cmd" &>/dev/null; then + log_error "$cmd not found. Install with: apt install $( [[ $cmd == exiftool ]] && echo libimage-exiftool-perl || echo "$cmd" )" + exit 1 + fi + log_ok "$cmd found" + done + + if [[ "$SKIP_HEIC_CONVERT" == false ]]; then + if command -v magick &>/dev/null || command -v convert &>/dev/null; then + log_ok "ImageMagick found (HEIC conversion enabled)" + elif command -v heif-convert &>/dev/null; then + log_ok "heif-convert found (HEIC conversion enabled)" + else + log_warn "No HEIC converter found — disabling HEIC conversion" + SKIP_HEIC_CONVERT=true + fi + fi +} + +# ── Server Connectivity ────────────────────────────────────────────── +check_server() { + log_step "Checking Server Connectivity" + + local http_code + http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "${SERVER_URL}/api/server/ping" \ + -H "x-api-key: ${API_KEY}" 2>/dev/null) || true + + if [[ "$http_code" == "200" ]]; then + log_ok "Server reachable: ${SERVER_URL}" + else + log_error "Cannot reach Immich server at ${SERVER_URL} (HTTP ${http_code})" + exit 1 + fi +} + +# ── File Discovery ──────────────────────────────────────────────────── +discover_files() { + log_step "Scanning Source Directory" + + local file_list + file_list=$(mktemp) + + find "$SOURCE_DIR" -type f \( \ + -iname '*.jpg' -o -iname '*.jpeg' -o -iname '*.png' -o -iname '*.gif' \ + -o -iname '*.bmp' -o -iname '*.tiff' -o -iname '*.tif' -o -iname '*.webp' \ + -o -iname '*.heic' -o -iname '*.heif' -o -iname '*.avif' \ + -o -iname '*.mp4' -o -iname '*.mov' -o -iname '*.avi' -o -iname '*.mkv' \ + -o 
-iname '*.m4v' -o -iname '*.3gp' -o -iname '*.wmv' -o -iname '*.mpg' \ + -o -iname '*.raw' -o -iname '*.cr2' -o -iname '*.nef' -o -iname '*.arw' \ + -o -iname '*.dng' -o -iname '*.orf' -o -iname '*.rw2' \ + \) | sort > "$file_list" + + COUNT_TOTAL=$(wc -l < "$file_list") + log_info "Found ${BOLD}${COUNT_TOTAL}${RESET} media files" + + echo "$file_list" +} + +# ── Google Takeout JSON Merge ───────────────────────────────────────── +process_takeout_json() { + local file_list="$1" + log_step "Google Takeout — JSON Sidecar Processing" + + local json_count=0 + local merged_count=0 + + while IFS= read -r media_file; do + local json_file="" + + if [[ -f "${media_file}.json" ]]; then + json_file="${media_file}.json" + elif [[ -f "${media_file%.*}.json" ]]; then + json_file="${media_file%.*}.json" + fi + + [[ -z "$json_file" ]] && continue + ((json_count++)) || true + + local taken_ts geo_lat geo_lng description + taken_ts=$(jq -r '.photoTakenTime.timestamp // empty' "$json_file" 2>/dev/null) || true + geo_lat=$(jq -r '.geoData.latitude // empty' "$json_file" 2>/dev/null) || true + geo_lng=$(jq -r '.geoData.longitude // empty' "$json_file" 2>/dev/null) || true + description=$(jq -r '.description // empty' "$json_file" 2>/dev/null) || true + + local exif_args=() + if [[ -n "$taken_ts" && "$taken_ts" != "0" ]]; then + local taken_date + taken_date=$(date -d "@${taken_ts}" +"%Y:%m:%d %H:%M:%S" 2>/dev/null) || \ + taken_date=$(date -r "${taken_ts}" +"%Y:%m:%d %H:%M:%S" 2>/dev/null) || true + if [[ -n "$taken_date" ]]; then + exif_args+=("-DateTimeOriginal=$taken_date" "-CreateDate=$taken_date") + fi + fi + + if [[ -n "$geo_lat" && -n "$geo_lng" && "$geo_lat" != "0" && "$geo_lng" != "0" ]]; then + local lat_ref="N" lng_ref="E" + [[ $(echo "$geo_lat < 0" | bc -l 2>/dev/null || echo 0) == "1" ]] && lat_ref="S" && geo_lat="${geo_lat#-}" + [[ $(echo "$geo_lng < 0" | bc -l 2>/dev/null || echo 0) == "1" ]] && lng_ref="W" && geo_lng="${geo_lng#-}" + exif_args+=("-GPSLatitude=$geo_lat" 
"-GPSLatitudeRef=$lat_ref" + "-GPSLongitude=$geo_lng" "-GPSLongitudeRef=$lng_ref") + fi + + if [[ -n "$description" ]]; then + exif_args+=("-ImageDescription=$description" "-Description=$description") + fi + + if [[ ${#exif_args[@]} -eq 0 ]]; then + continue + fi + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY-RUN] Would merge JSON metadata → $(basename "$media_file")" + else + if exiftool -overwrite_original -quiet "${exif_args[@]}" "$media_file" 2>/dev/null; then + ((merged_count++)) || true + else + log_warn "Failed to merge metadata for: $(basename "$media_file")" + fi + fi + done < "$file_list" + + log_info "Takeout sidecars found: ${json_count} | Merged: ${merged_count}" +} + +# ── Date Fixing from Filenames ──────────────────────────────────────── +fix_dates_from_filenames() { + local file_list="$1" + log_step "Fixing Missing EXIF Dates" + + local checked=0 + local fixed=0 + + while IFS= read -r file; do + local existing_date + existing_date=$(exiftool -s3 -DateTimeOriginal "$file" 2>/dev/null) || true + + if [[ -n "$existing_date" && "$existing_date" != "0000:00:00 00:00:00" ]]; then + continue + fi + + ((checked++)) || true + local basename_file + basename_file=$(basename "$file") + local extracted_date="" + + # IMG_YYYYMMDD_HHMMSS + if [[ "$basename_file" =~ ([0-9]{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])_([0-9]{2})([0-9]{2})([0-9]{2}) ]]; then + extracted_date="${BASH_REMATCH[1]}:${BASH_REMATCH[2]}:${BASH_REMATCH[3]} ${BASH_REMATCH[4]}:${BASH_REMATCH[5]}:${BASH_REMATCH[6]}" + # Screenshot_YYYY-MM-DD-HH-MM-SS or YYYY-MM-DD_HH-MM-SS + elif [[ "$basename_file" =~ ([0-9]{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])[-_]([0-9]{2})[-.]([0-9]{2})[-.]([0-9]{2}) ]]; then + extracted_date="${BASH_REMATCH[1]}:${BASH_REMATCH[2]}:${BASH_REMATCH[3]} ${BASH_REMATCH[4]}:${BASH_REMATCH[5]}:${BASH_REMATCH[6]}" + # YYYY-MM-DD (date only, no time) + elif [[ "$basename_file" =~ ([0-9]{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) ]]; then + 
extracted_date="${BASH_REMATCH[1]}:${BASH_REMATCH[2]}:${BASH_REMATCH[3]} 12:00:00" + fi + + if [[ -z "$extracted_date" ]]; then + continue + fi + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY-RUN] Would set date ${extracted_date} on $(basename "$file")" + else + if exiftool -overwrite_original -quiet \ + "-DateTimeOriginal=$extracted_date" \ + "-CreateDate=$extracted_date" \ + "$file" 2>/dev/null; then + ((fixed++)) || true + write_log "FIXED" "$file" + else + log_warn "Failed to set date on: $(basename "$file")" + fi + fi + done < "$file_list" + + log_info "Files missing dates: ${checked} | Fixed from filename: ${fixed}" +} + +# ── HEIC Conversion ────────────────────────────────────────────────── +convert_heic_files() { + local file_list="$1" + + if [[ "$SKIP_HEIC_CONVERT" == true ]]; then + return + fi + + log_step "HEIC → JPEG Conversion" + + local heic_count=0 + local converted=0 + + while IFS= read -r file; do + local ext="${file##*.}" + ext="${ext,,}" + [[ "$ext" != "heic" && "$ext" != "heif" ]] && continue + ((heic_count++)) || true + + local jpeg_file="${file%.*}.jpg" + if [[ -f "$jpeg_file" ]]; then + continue + fi + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY-RUN] Would convert $(basename "$file") → JPEG" + else + local success=false + if command -v magick &>/dev/null; then + magick "$file" "$jpeg_file" 2>/dev/null && success=true + elif command -v convert &>/dev/null; then + convert "$file" "$jpeg_file" 2>/dev/null && success=true + elif command -v heif-convert &>/dev/null; then + heif-convert "$file" "$jpeg_file" &>/dev/null && success=true + fi + + if [[ "$success" == true ]]; then + exiftool -overwrite_original -quiet -TagsFromFile "$file" "$jpeg_file" 2>/dev/null || true + ((converted++)) || true + else + log_warn "Failed to convert: $(basename "$file")" + fi + fi + done < "$file_list" + + log_info "HEIC files found: ${heic_count} | Converted: ${converted}" +} + +# ── Duplicate Detection ────────────────────────────────────────────── 
+check_duplicate() { + local file="$1" + local checksum + + checksum=$(sha256sum "$file" 2>/dev/null | awk '{print $1}') || return 1 + + if [[ -f "$UPLOAD_LOG" ]] && grep -q "^${checksum}" "$UPLOAD_LOG" 2>/dev/null; then + return 0 + fi + + return 1 +} + +record_upload() { + local file="$1" + local checksum + checksum=$(sha256sum "$file" 2>/dev/null | awk '{print $1}') || return 0 + echo "${checksum} ${file}" >> "$UPLOAD_LOG" +} + +# ── Progress Display ───────────────────────────────────────────────── +show_progress() { + local current="$1" + local total="$2" + local pct=0 + [[ "$total" -gt 0 ]] && pct=$(( current * 100 / total )) + + local elapsed=$(( $(date +%s) - START_TIME )) + local mins=$(( elapsed / 60 )) + local secs=$(( elapsed % 60 )) + + printf "\r${DIM}[%d/%d] %d%% complete | elapsed: %dm%02ds${RESET}" \ + "$current" "$total" "$pct" "$mins" "$secs" +} + +# ── Upload Files ────────────────────────────────────────────────────── +upload_files() { + local file_list="$1" + log_step "Uploading to Immich" + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY-RUN] Would upload ${COUNT_TOTAL} files to ${SERVER_URL}" + log_info "[DRY-RUN] Album mode: $( [[ "$ALBUM_FROM_FOLDER" == true ]] && echo "from folder names" || echo "none" )" + return + fi + + local current=0 + + while IFS= read -r file; do + ((current++)) || true + show_progress "$current" "$COUNT_TOTAL" + + if [[ "$SKIP_DUPLICATES" == true ]] && check_duplicate "$file"; then + ((COUNT_SKIPPED++)) || true + write_log "SKIPPED" "$file" + continue + fi + + local upload_args=("upload" "--server" "$SERVER_URL" "--key" "$API_KEY") + + if [[ "$ALBUM_FROM_FOLDER" == true ]]; then + local album_name + album_name=$(basename "$(dirname "$file")") + if [[ -n "$album_name" && "$album_name" != "." 
]]; then + upload_args+=("--album" "$album_name") + fi + fi + + upload_args+=("$file") + + if $IMMICH_CMD "${upload_args[@]}" &>/dev/null; then + ((COUNT_UPLOADED++)) || true + record_upload "$file" + write_log "UPLOADED" "$file" + else + ((COUNT_FAILED++)) || true + write_log "FAILED" "$file" + log_warn "Failed: $(basename "$file")" + fi + done < "$file_list" + + printf "\n" +} + +# ── Summary ─────────────────────────────────────────────────────────── +print_summary() { + local elapsed=$(( $(date +%s) - START_TIME )) + local mins=$(( elapsed / 60 )) + local secs=$(( elapsed % 60 )) + + log_step "Migration Summary" + + printf " ${BOLD}Source:${RESET} %s\n" "$SOURCE_DIR" + printf " ${BOLD}Server:${RESET} %s\n" "$SERVER_URL" + printf " ${BOLD}Total:${RESET} %d files\n" "$COUNT_TOTAL" + printf " ${GREEN}Uploaded:${RESET} %d\n" "$COUNT_UPLOADED" + printf " ${YELLOW}Skipped:${RESET} %d (duplicate)\n" "$COUNT_SKIPPED" + printf " ${RED}Failed:${RESET} %d\n" "$COUNT_FAILED" + printf " ${BOLD}Duration:${RESET} %dm%02ds\n" "$mins" "$secs" + + if [[ -n "$LOG_FILE" ]]; then + printf " ${BOLD}Log:${RESET} %s\n" "$LOG_FILE" + fi + + if [[ "$DRY_RUN" == true ]]; then + printf "\n ${YELLOW}(dry-run — no files were uploaded or modified)${RESET}\n" + fi + + printf "\n" +} + +# ── Cleanup ─────────────────────────────────────────────────────────── +cleanup() { + [[ -n "${WORK_DIR:-}" && -d "${WORK_DIR:-}" ]] && rm -rf "$WORK_DIR" +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + setup_colors + parse_args "$@" + + printf "\n${BOLD}Immich Migration Script${RESET}\n" + printf "${DIM}Source: %s${RESET}\n" "$SOURCE_DIR" + printf "${DIM}Server: %s${RESET}\n" "$SERVER_URL" + [[ "$DRY_RUN" == true ]] && printf "${YELLOW}Mode: DRY-RUN${RESET}\n" + printf "\n" + + START_TIME=$(date +%s) + trap cleanup EXIT + + check_deps + check_server + + local file_list + file_list=$(discover_files) + + if [[ "$COUNT_TOTAL" -eq 0 ]]; then + log_warn "No media files 
found in ${SOURCE_DIR}" + exit 0 + fi + + process_takeout_json "$file_list" + + if [[ "$FIX_DATES" == true ]]; then + fix_dates_from_filenames "$file_list" + fi + + convert_heic_files "$file_list" + + upload_files "$file_list" + + rm -f "$file_list" + + print_summary + + if [[ "$COUNT_FAILED" -gt 0 ]]; then + exit 1 + fi +} + +main "$@" diff --git a/incident-response-kit.sh b/incident-response-kit.sh new file mode 100644 index 0000000..6c31efe --- /dev/null +++ b/incident-response-kit.sh @@ -0,0 +1,580 @@ +#!/usr/bin/env bash + +######################################################################################### +#### incident-response-kit.sh — Live incident volatile data capture and IOC search #### +#### Collect processes, network, users, logs into tamper-evident archive #### +#### Requires: bash 4+, root recommended #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### sudo ./incident-response-kit.sh --collect #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -uo pipefail +# NOTE: no -e — collection must continue if individual commands fail + +#------------------------------------------------------------------------------ +# DEFAULTS +#------------------------------------------------------------------------------ +OUTPUT_DIR="/var/log/incident-response" +RUN_MODE="" +IOC_PATTERN="" +CASE_ID="$(date +%Y%m%d-%H%M%S)" +VERBOSE="${VERBOSE:-0}" +COLOR="${COLOR:-auto}" +HASH_CMD="sha256sum" + +#------------------------------------------------------------------------------ +# STATE +#------------------------------------------------------------------------------ +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +COLLECT_COUNT=0 +ERROR_COUNT=0 + +#------------------------------------------------------------------------------ +# COLORS (pre-initialized before setup_colors) +#------------------------------------------------------------------------------ +RED="" +GREEN="" +YELLOW="" +BLUE="" +CYAN="" +BOLD="" +DIM="" +RESET="" + +setup_colors() { + if [[ "${COLOR}" == "never" ]]; then + return + fi + if [[ "${COLOR}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +#------------------------------------------------------------------------------ +# LOGGING HELPERS +#------------------------------------------------------------------------------ +log() { echo -e "${GREEN}[ir-kit]${RESET} $1"; } +warn() { echo -e "${YELLOW}[ir-kit]${RESET} $1"; } +err() { echo -e "${RED}[ir-kit]${RESET} $1" >&2; } +verbose() { + if [[ "${VERBOSE}" == "1" ]]; then + echo -e "${DIM}[ir-kit]${RESET} $1" + fi +} +die() { err "$1"; exit 1; } + +section_header() { + echo -e "\n${CYAN}${BOLD}━━━ $1 ━━━${RESET}" +} + +subsection() { + echo -e " ${BLUE}▸${RESET} $1" +} + +field() { + printf " 
${BOLD}%-18s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-18s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end + end="$(date +%s)" + local diff=$(( end - START_TIME )) + echo "${diff}s" +} + +#------------------------------------------------------------------------------ +# COLLECT HELPER +#------------------------------------------------------------------------------ +collect_cmd() { + local label="$1" outfile="$2" + shift 2 + verbose "Collecting: ${label}" + if "$@" > "$outfile" 2>/dev/null; then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} ${label}" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} ${label} ${DIM}(skipped)${RESET}" + fi +} + +#------------------------------------------------------------------------------ +# MODE: FULL COLLECTION +#------------------------------------------------------------------------------ +do_collect() { + local CASE_DIR="${OUTPUT_DIR}/ir-${CASE_ID}" + mkdir -p "${CASE_DIR}" + + section_header "Incident Response — Full Collection" + field "Case ID:" "${CASE_ID}" + field "Output:" "${CASE_DIR}" + field "Started:" "$(date)" + echo "" + + # --- System Info --- + section_header "System Information" + collect_cmd "Hostname" "${CASE_DIR}/hostname.txt" hostname + collect_cmd "Date / Time" "${CASE_DIR}/date.txt" date --iso-8601=seconds + collect_cmd "Uptime" "${CASE_DIR}/uptime.txt" uptime + collect_cmd "Kernel (uname)" "${CASE_DIR}/uname.txt" uname -a + collect_cmd "OS Release" "${CASE_DIR}/os-release.txt" cat /etc/os-release + + # --- Volatile Data --- + section_header "Volatile Data" + collect_cmd "Processes (full)" "${CASE_DIR}/ps-auxwww.txt" ps auxwww + collect_cmd "Network listeners" "${CASE_DIR}/ss-tulnp.txt" ss -tulnp + collect_cmd "Network connections" "${CASE_DIR}/ss-anp.txt" ss -anp + collect_cmd "Routing table (v4)" "${CASE_DIR}/ip-route.txt" ip route + collect_cmd "Routing table (v6)" "${CASE_DIR}/ip6-route.txt" ip -6 route + collect_cmd "ARP / Neighbors" 
"${CASE_DIR}/ip-neigh.txt" ip neigh + collect_cmd "DNS resolv.conf" "${CASE_DIR}/resolv-conf.txt" cat /etc/resolv.conf + collect_cmd "Open files (summary)" "${CASE_DIR}/lsof-summary.txt" lsof -n -P +c 15 + collect_cmd "Loaded kernel modules" "${CASE_DIR}/lsmod.txt" lsmod + collect_cmd "Mount points" "${CASE_DIR}/mount.txt" mount + + # Environment variables of running processes + verbose "Collecting: Process environments" + if ls /proc/[0-9]*/environ > /dev/null 2>&1; then + ( + for f in /proc/[0-9]*/environ; do + local_pid="${f#/proc/}" + local_pid="${local_pid%/environ}" + printf "=== PID %s ===\n" "${local_pid}" + tr '\0' '\n' < "${f}" 2>/dev/null || true + echo "" + done + ) > "${CASE_DIR}/proc-environ.txt" 2>/dev/null + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Process environments" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Process environments ${DIM}(skipped)${RESET}" + fi + + # --- User Data --- + section_header "User Activity" + collect_cmd "Logged-in users (who)" "${CASE_DIR}/who.txt" who + collect_cmd "User activity (w)" "${CASE_DIR}/w.txt" w + collect_cmd "Login history (last)" "${CASE_DIR}/last.txt" last -25 + collect_cmd "Last login (lastlog)" "${CASE_DIR}/lastlog.txt" lastlog + # Users with shells + verbose "Collecting: Users with login shells" + if grep -vE '(nologin|false|sync|halt|shutdown)$' /etc/passwd > "${CASE_DIR}/users-with-shells.txt" 2>/dev/null; then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Users with login shells" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Users with login shells ${DIM}(skipped)${RESET}" + fi + # Sudo log + verbose "Collecting: Sudo log" + if grep -i sudo /var/log/auth.log > "${CASE_DIR}/sudo-log.txt" 2>/dev/null || \ + grep -i sudo /var/log/secure > "${CASE_DIR}/sudo-log.txt" 2>/dev/null; then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Sudo log" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Sudo log ${DIM}(skipped)${RESET}" + fi + + # --- Scheduled 
Tasks --- + section_header "Scheduled Tasks" + collect_cmd "System crontab" "${CASE_DIR}/crontab-system.txt" cat /etc/crontab + # User crontabs + verbose "Collecting: User crontabs" + ( + while IFS=: read -r user _; do + local_cron="$(crontab -l -u "${user}" 2>/dev/null)" || true + if [[ -n "${local_cron}" ]]; then + printf "=== %s ===\n%s\n\n" "${user}" "${local_cron}" + fi + done < /etc/passwd + ) > "${CASE_DIR}/crontab-users.txt" 2>/dev/null + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} User crontabs" + + collect_cmd "Systemd timers" "${CASE_DIR}/systemd-timers.txt" systemctl list-timers --all --no-pager + + # --- Containers --- + section_header "Container Info" + if command -v docker &>/dev/null; then + collect_cmd "Docker containers" "${CASE_DIR}/docker-ps.txt" docker ps -a --no-trunc + else + echo -e " ${DIM}⊘ docker not installed${RESET}" + fi + if command -v podman &>/dev/null; then + collect_cmd "Podman containers" "${CASE_DIR}/podman-ps.txt" podman ps -a --no-trunc + else + echo -e " ${DIM}⊘ podman not installed${RESET}" + fi + + # --- Recent Logs --- + section_header "Recent Logs" + # auth.log or secure + verbose "Collecting: Auth log" + if tail -500 /var/log/auth.log > "${CASE_DIR}/auth-log.txt" 2>/dev/null || \ + tail -500 /var/log/secure > "${CASE_DIR}/auth-log.txt" 2>/dev/null; then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Auth log (last 500 lines)" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Auth log ${DIM}(skipped)${RESET}" + fi + # syslog or messages + verbose "Collecting: Syslog" + if tail -500 /var/log/syslog > "${CASE_DIR}/syslog.txt" 2>/dev/null || \ + tail -500 /var/log/messages > "${CASE_DIR}/syslog.txt" 2>/dev/null; then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Syslog (last 500 lines)" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Syslog ${DIM}(skipped)${RESET}" + fi + # kern.log + verbose "Collecting: Kern log" + if tail -500 /var/log/kern.log > "${CASE_DIR}/kern-log.txt" 2>/dev/null; 
then + ((COLLECT_COUNT++)) + echo -e " ${GREEN}✓${RESET} Kern log (last 500 lines)" + else + ((ERROR_COUNT++)) + echo -e " ${YELLOW}⊘${RESET} Kern log ${DIM}(skipped)${RESET}" + fi + collect_cmd "Journal (last hour)" "${CASE_DIR}/journalctl-1h.txt" journalctl --since "1 hour ago" --no-pager + + # --- Firewall --- + section_header "Firewall Rules" + collect_cmd "iptables rules" "${CASE_DIR}/iptables-save.txt" iptables-save + collect_cmd "nftables rules" "${CASE_DIR}/nftables.txt" nft list ruleset + collect_cmd "UFW status" "${CASE_DIR}/ufw-status.txt" ufw status verbose + + # --- Manifest & Archive --- + section_header "Creating Tamper-Evident Archive" + + local manifest="${CASE_DIR}/manifest-sha256.txt" + verbose "Building SHA-256 manifest" + : > "${manifest}" + while IFS= read -r -d '' file; do + ${HASH_CMD} "${file}" >> "${manifest}" 2>/dev/null || true + done < <(find "${CASE_DIR}" -type f ! -name "manifest-sha256.txt" -print0 2>/dev/null) + echo -e " ${GREEN}✓${RESET} Manifest created ($(wc -l < "${manifest}") files)" + + local archive_path="${OUTPUT_DIR}/ir-${CASE_ID}.tar.gz" + tar -czf "${archive_path}" -C "$(dirname "${CASE_DIR}")" "$(basename "${CASE_DIR}")" + echo -e " ${GREEN}✓${RESET} Archive: ${archive_path}" + + local archive_hash + archive_hash="$(${HASH_CMD} "${archive_path}" | awk '{print $1}')" + echo "${archive_hash} $(basename "${archive_path}")" > "${archive_path}.sha256" + echo -e " ${GREEN}✓${RESET} Archive hash: ${archive_hash}" + + # --- Summary --- + section_header "Collection Summary" + field "Case ID:" "${CASE_ID}" + field "Files collected:" "${COLLECT_COUNT}" + field_color "Errors:" "${YELLOW}${ERROR_COUNT}${RESET}" + field "Elapsed:" "$(elapsed)" + field "Archive:" "${archive_path}" + field "SHA-256:" "${archive_hash}" + echo "" +} + +#------------------------------------------------------------------------------ +# MODE: QUICK SNAPSHOT +#------------------------------------------------------------------------------ +do_quick() { + 
local CASE_DIR="${OUTPUT_DIR}/ir-${CASE_ID}" + mkdir -p "${CASE_DIR}" + local snapshot="${CASE_DIR}/quick-snapshot.txt" + + section_header "Incident Response — 30-Second Snapshot" + { + echo "=== Quick Incident Snapshot ===" + echo "Date: $(date)" + echo "Hostname: $(hostname)" + echo "" + + echo "--- Top 20 Processes by CPU ---" + ps aux --sort=-%cpu | head -21 2>/dev/null || true + echo "" + + echo "--- Top 20 Processes by Memory ---" + ps aux --sort=-%mem | head -21 2>/dev/null || true + echo "" + + echo "--- Network Listeners ---" + ss -tulnp 2>/dev/null || true + echo "" + + echo "--- Logged-in Users ---" + who 2>/dev/null || true + echo "" + + echo "--- Last 10 Auth Failures ---" + grep -i "failed\|failure" /var/log/auth.log 2>/dev/null | tail -10 || \ + grep -i "failed\|failure" /var/log/secure 2>/dev/null | tail -10 || \ + journalctl -p err --since "1 hour ago" --no-pager 2>/dev/null | grep -i "auth\|login\|failed" | tail -10 || \ + echo "(no auth failure data found)" + echo "" + } | tee "${snapshot}" + + log "Snapshot saved: ${snapshot}" +} + +#------------------------------------------------------------------------------ +# MODE: IOC SEARCH +#------------------------------------------------------------------------------ +do_ioc() { + if [[ -z "${IOC_PATTERN}" ]]; then + die "IOC pattern required: --ioc " + fi + + section_header "IOC Search: ${IOC_PATTERN}" + subsection "Scanning live system for indicator matches" + local found=0 + + local sources=( + "Process list:ps auxwww" + "Network connections:ss -anp" + "Open files:lsof -n -P" + ) + for entry in "${sources[@]}"; do + local label="${entry%%:*}" + local cmd="${entry#*:}" + verbose "Searching ${label}" + local matches + matches="$(eval "${cmd}" 2>/dev/null | grep -i -- "${IOC_PATTERN}" 2>/dev/null)" || true + if [[ -n "${matches}" ]]; then + echo -e " ${RED}⚑${RESET} ${BOLD}${label}${RESET}" + echo "${matches}" | while IFS= read -r line; do + echo " ${line}" + done + ((found++)) || true + else + echo -e 
" ${GREEN}✓${RESET} ${label} — clean" + fi + done + + local log_files=( + "/var/log/auth.log" + "/var/log/syslog" + "/etc/hosts" + "/etc/resolv.conf" + ) + for lf in "${log_files[@]}"; do + verbose "Searching ${lf}" + local matches + matches="$(grep -i -- "${IOC_PATTERN}" "${lf}" 2>/dev/null)" || true + if [[ -n "${matches}" ]]; then + echo -e " ${RED}⚑${RESET} ${BOLD}${lf}${RESET}" + echo "${matches}" | tail -20 | while IFS= read -r line; do + echo " ${line}" + done + ((found++)) || true + else + echo -e " ${GREEN}✓${RESET} ${lf} — clean" + fi + done + + # Crontabs + verbose "Searching crontabs" + local cron_matches + cron_matches="$( + cat /etc/crontab 2>/dev/null + while IFS=: read -r user _; do + crontab -l -u "${user}" 2>/dev/null || true + done < /etc/passwd 2>/dev/null + )" || true + local cron_hits + cron_hits="$(echo "${cron_matches}" | grep -i -- "${IOC_PATTERN}" 2>/dev/null)" || true + if [[ -n "${cron_hits}" ]]; then + echo -e " ${RED}⚑${RESET} ${BOLD}Crontabs${RESET}" + echo "${cron_hits}" | while IFS= read -r line; do + echo " ${line}" + done + ((found++)) || true + else + echo -e " ${GREEN}✓${RESET} Crontabs — clean" + fi + + echo "" + if [[ "${found}" -gt 0 ]]; then + echo -e "${RED}${BOLD}⚠ IOC found in ${found} source(s)${RESET}" + else + echo -e "${GREEN}${BOLD}✓ No matches found for IOC${RESET}" + fi +} + +#------------------------------------------------------------------------------ +# MODE: TIMELINE +#------------------------------------------------------------------------------ +do_timeline() { + section_header "Incident Timeline Builder" + local tmpfile + tmpfile="$(mktemp /tmp/ir-timeline.XXXXXX)" + + verbose "Extracting events from auth.log" + if [[ -r /var/log/auth.log ]]; then + awk '{print $0}' /var/log/auth.log >> "${tmpfile}" 2>/dev/null || true + elif [[ -r /var/log/secure ]]; then + awk '{print $0}' /var/log/secure >> "${tmpfile}" 2>/dev/null || true + fi + + verbose "Extracting events from syslog" + if [[ -r /var/log/syslog ]]; 
then + awk '{print $0}' /var/log/syslog >> "${tmpfile}" 2>/dev/null || true + elif [[ -r /var/log/messages ]]; then + awk '{print $0}' /var/log/messages >> "${tmpfile}" 2>/dev/null || true + fi + + verbose "Extracting events from kern.log" + if [[ -r /var/log/kern.log ]]; then + awk '{print $0}' /var/log/kern.log >> "${tmpfile}" 2>/dev/null || true + fi + + verbose "Extracting events from journalctl" + journalctl --since "24 hours ago" --no-pager -o short-iso 2>/dev/null >> "${tmpfile}" || true + + local total_events + total_events="$(wc -l < "${tmpfile}")" + + if [[ "${total_events}" -eq 0 ]]; then + warn "No log events found — insufficient permissions or empty logs" + rm -f "${tmpfile}" + return + fi + + log "Sorting ${total_events} events chronologically..." + sort -t' ' -k1,3 "${tmpfile}" | uniq > "${tmpfile}.sorted" 2>/dev/null || \ + sort "${tmpfile}" | uniq > "${tmpfile}.sorted" 2>/dev/null + + local sorted_count + sorted_count="$(wc -l < "${tmpfile}.sorted")" + field "Total events:" "${sorted_count} (deduplicated)" + echo "" + + echo -e "${BOLD}Last 50 events:${RESET}" + tail -50 "${tmpfile}.sorted" + + echo "" + field "Full timeline:" "${tmpfile}.sorted" + rm -f "${tmpfile}" +} + +#------------------------------------------------------------------------------ +# HELP +#------------------------------------------------------------------------------ +show_help() { + cat <&2; exit 1 ;; + esac + done +} + +# ============================================================================ +# METRICS GENERATION +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + # ======================================================================== + # Prerequisite Check + # ======================================================================== + + if ! 
command -v jq >/dev/null 2>&1; then + echo "# ERROR: jq is required but not found" >&2 + cat </dev/null 2>&1; then + echo "# ERROR: incus command not found" >&2 + cat </dev/null) + + if [ -n "$pools_json" ] && [ "$pools_json" != "null" ]; then + local pool_urls + pool_urls=$(echo "$pools_json" | jq -r '.[]' 2>/dev/null) + + while IFS= read -r pool_url; do + [ -z "$pool_url" ] && continue + + local pool_name + pool_name=$(basename "$pool_url") + local esc_pool + esc_pool=$(prom_escape "$pool_name") + + local pool_config + pool_config=$(incus query "/1.0/storage-pools/$pool_name" 2>/dev/null) + local driver + driver=$(echo "$pool_config" | jq -r '.driver // "unknown"' 2>/dev/null) + local esc_driver + esc_driver=$(prom_escape "$driver") + + local resources_json + resources_json=$(incus query "/1.0/storage-pools/$pool_name/resources" 2>/dev/null) + + local total_space="" + local used_space="" + + if [ -n "$resources_json" ] && [ "$resources_json" != "null" ]; then + total_space=$(echo "$resources_json" | jq -r '.space.total // empty' 2>/dev/null) + used_space=$(echo "$resources_json" | jq -r '.space.used // empty' 2>/dev/null) + fi + + # Fallback: parse incus storage info output + if [ -z "$total_space" ] || [ -z "$used_space" ]; then + local storage_info + storage_info=$(incus storage info "$pool_name" 2>/dev/null) + + if [ -n "$storage_info" ]; then + total_space=$(echo "$storage_info" | awk '/Total space:/ { + val = $3; unit = $4 + if (unit == "GiB") val = val * 1073741824 + else if (unit == "TiB") val = val * 1099511627776 + else if (unit == "MiB") val = val * 1048576 + else if (unit == "KiB") val = val * 1024 + printf "%.0f", val + }') + used_space=$(echo "$storage_info" | awk '/Space used:/ { + val = $3; unit = $4 + if (unit == "GiB") val = val * 1073741824 + else if (unit == "TiB") val = val * 1099511627776 + else if (unit == "MiB") val = val * 1048576 + else if (unit == "KiB") val = val * 1024 + printf "%.0f", val + }') + fi + fi + + if [ -n "$total_space" ] 
&& [ -n "$used_space" ]; then + local labels="pool=\"$esc_pool\",driver=\"$esc_driver\"" + pool_total_lines="${pool_total_lines}incus_storage_pool_total_bytes{${labels}} $total_space +" + pool_used_lines="${pool_used_lines}incus_storage_pool_used_bytes{${labels}} $used_space +" + local ratio + ratio=$(awk "BEGIN { if ($total_space > 0) printf \"%.6f\", $used_space / $total_space; else print \"0\" }") + pool_ratio_lines="${pool_ratio_lines}incus_storage_pool_usage_ratio{${labels}} $ratio +" + else + echo "# WARNING: could not read storage pool '$pool_name' resources" >&2 + fi + done <<< "$pool_urls" + fi + + if [ -n "$pool_total_lines" ]; then + echo "# HELP incus_storage_pool_total_bytes Total storage pool capacity in bytes" + echo "# TYPE incus_storage_pool_total_bytes gauge" + printf '%s' "$pool_total_lines" + echo "" + fi + + if [ -n "$pool_used_lines" ]; then + echo "# HELP incus_storage_pool_used_bytes Used storage pool space in bytes" + echo "# TYPE incus_storage_pool_used_bytes gauge" + printf '%s' "$pool_used_lines" + echo "" + fi + + if [ -n "$pool_ratio_lines" ]; then + echo "# HELP incus_storage_pool_usage_ratio Storage pool used/total ratio" + echo "# TYPE incus_storage_pool_usage_ratio gauge" + printf '%s' "$pool_ratio_lines" + echo "" + fi + + # ======================================================================== + # Instance Inventory Metrics + # ======================================================================== + local instances_json + instances_json=$(incus list --format json 2>/dev/null) + + local instance_total_lines="" + local instance_info_lines="" + local snapshot_total_lines="" + local snapshot_oldest_lines="" + local snapshot_newest_lines="" + + if [ -n "$instances_json" ] && [ "$instances_json" != "null" ] && [ "$instances_json" != "[]" ]; then + # Count instances by type and status + local counts + counts=$(echo "$instances_json" | jq -r ' + group_by(.type, .status) | + .[] | + {type: .[0].type, status: .[0].status, count: 
length} | + "\(.type) \(.status) \(.count)" + ' 2>/dev/null) + + while IFS= read -r count_line; do + [ -z "$count_line" ] && continue + local inst_type inst_status inst_count + inst_type=$(echo "$count_line" | awk '{print $1}') + inst_status=$(echo "$count_line" | awk '{print $2}') + inst_count=$(echo "$count_line" | awk '{print $3}') + local esc_type esc_status + esc_type=$(prom_escape "$inst_type") + esc_status=$(prom_escape "$inst_status") + instance_total_lines="${instance_total_lines}incus_instances_total{type=\"$esc_type\",status=\"$esc_status\"} $inst_count +" + done <<< "$counts" + + # Per-instance info gauge and snapshot metrics + local instance_data + instance_data=$(echo "$instances_json" | jq -r ' + .[] | + "\(.name)\t\(.type)\t\(.status)\t\(.project // "default")" + ' 2>/dev/null) + + local now + now=$(date +%s) + + while IFS=$'\t' read -r inst_name inst_type inst_status inst_project; do + [ -z "$inst_name" ] && continue + + local esc_name esc_type esc_status esc_project + esc_name=$(prom_escape "$inst_name") + esc_type=$(prom_escape "$inst_type") + esc_status=$(prom_escape "$inst_status") + esc_project=$(prom_escape "$inst_project") + + instance_info_lines="${instance_info_lines}incus_instance_info{name=\"$esc_name\",type=\"$esc_type\",status=\"$esc_status\",project=\"$esc_project\"} 1 +" + + # Snapshot metrics + local snap_json + snap_json=$(incus query "/1.0/instances/$inst_name/snapshots" --project "$inst_project" 2>/dev/null) + + if [ -n "$snap_json" ] && [ "$snap_json" != "null" ] && [ "$snap_json" != "[]" ]; then + local snap_count + snap_count=$(echo "$snap_json" | jq 'length' 2>/dev/null) + snap_count=${snap_count:-0} + + local snap_labels="name=\"$esc_name\",project=\"$esc_project\"" + snapshot_total_lines="${snapshot_total_lines}incus_instance_snapshots_total{${snap_labels}} $snap_count +" + + if [ "$snap_count" -gt 0 ] 2>/dev/null; then + # Fetch details for each snapshot to get created_at timestamps + local snap_timestamps="" + local 
snap_urls + snap_urls=$(echo "$snap_json" | jq -r '.[]' 2>/dev/null) + + while IFS= read -r snap_url; do + [ -z "$snap_url" ] && continue + local snap_detail + snap_detail=$(incus query "$snap_url" --project "$inst_project" 2>/dev/null) + if [ -n "$snap_detail" ]; then + local created_at + created_at=$(echo "$snap_detail" | jq -r '.created_at // empty' 2>/dev/null) + if [ -n "$created_at" ]; then + local snap_epoch + snap_epoch=$(date -d "$created_at" +%s 2>/dev/null) + if [ -n "$snap_epoch" ]; then + snap_timestamps="${snap_timestamps}${snap_epoch} +" + fi + fi + fi + done <<< "$snap_urls" + + if [ -n "$snap_timestamps" ]; then + local oldest newest + oldest=$(echo "$snap_timestamps" | sort -n | head -1) + newest=$(echo "$snap_timestamps" | sort -rn | head -1) + + if [ -n "$oldest" ]; then + local oldest_age + oldest_age=$((now - oldest)) + snapshot_oldest_lines="${snapshot_oldest_lines}incus_instance_snapshot_oldest_age_seconds{${snap_labels}} $oldest_age +" + fi + + if [ -n "$newest" ]; then + local newest_age + newest_age=$((now - newest)) + snapshot_newest_lines="${snapshot_newest_lines}incus_instance_snapshot_newest_age_seconds{${snap_labels}} $newest_age +" + fi + fi + fi + else + local snap_labels="name=\"$esc_name\",project=\"$esc_project\"" + snapshot_total_lines="${snapshot_total_lines}incus_instance_snapshots_total{${snap_labels}} 0 +" + fi + done <<< "$instance_data" + fi + + if [ -n "$instance_total_lines" ]; then + echo "# HELP incus_instances_total Number of instances by type and status" + echo "# TYPE incus_instances_total gauge" + printf '%s' "$instance_total_lines" + echo "" + fi + + if [ -n "$instance_info_lines" ]; then + echo "# HELP incus_instance_info Instance information (always 1)" + echo "# TYPE incus_instance_info gauge" + printf '%s' "$instance_info_lines" + echo "" + fi + + if [ -n "$snapshot_total_lines" ]; then + echo "# HELP incus_instance_snapshots_total Number of snapshots per instance" + echo "# TYPE 
incus_instance_snapshots_total gauge" + printf '%s' "$snapshot_total_lines" + echo "" + fi + + if [ -n "$snapshot_oldest_lines" ]; then + echo "# HELP incus_instance_snapshot_oldest_age_seconds Age of oldest snapshot in seconds" + echo "# TYPE incus_instance_snapshot_oldest_age_seconds gauge" + printf '%s' "$snapshot_oldest_lines" + echo "" + fi + + if [ -n "$snapshot_newest_lines" ]; then + echo "# HELP incus_instance_snapshot_newest_age_seconds Age of newest snapshot in seconds" + echo "# TYPE incus_instance_snapshot_newest_age_seconds gauge" + printf '%s' "$snapshot_newest_lines" + echo "" + fi + + # ======================================================================== + # Image Cache Metrics + # ======================================================================== + local images_json + images_json=$(incus image list --format json 2>/dev/null) + + local images_total=0 + local image_size_lines="" + + if [ -n "$images_json" ] && [ "$images_json" != "null" ] && [ "$images_json" != "[]" ]; then + images_total=$(echo "$images_json" | jq 'length' 2>/dev/null) + images_total=${images_total:-0} + + local image_data + image_data=$(echo "$images_json" | jq -r ' + .[] | + "\(.fingerprint[0:12])\t\(((.aliases // []) | if length > 0 then .[0].name else "" end))\t\(.architecture)\t\(.size)" + ' 2>/dev/null) + + while IFS=$'\t' read -r img_fp img_alias img_arch img_size; do + [ -z "$img_fp" ] && continue + local esc_fp esc_alias esc_arch + esc_fp=$(prom_escape "$img_fp") + esc_alias=$(prom_escape "$img_alias") + esc_arch=$(prom_escape "$img_arch") + image_size_lines="${image_size_lines}incus_image_size_bytes{fingerprint=\"$esc_fp\",alias=\"$esc_alias\",arch=\"$esc_arch\"} $img_size +" + done <<< "$image_data" + fi + + echo "# HELP incus_images_total Total number of cached images" + echo "# TYPE incus_images_total gauge" + echo "incus_images_total $images_total" + echo "" + + if [ -n "$image_size_lines" ]; then + echo "# HELP incus_image_size_bytes Image size in bytes" + 
echo "# TYPE incus_image_size_bytes gauge" + printf '%s' "$image_size_lines" + echo "" + fi + + # ======================================================================== + # Built-in Incus Metrics (/1.0/metrics) + # ======================================================================== + # Pull the native metrics (CPU, memory, network, disk I/O per instance) + # so everything flows through node_exporter — no second scrape target. + local builtin_metrics + builtin_metrics=$(incus query /1.0/metrics 2>/dev/null) + + if [ -n "$builtin_metrics" ]; then + echo "# ── Built-in Incus instance metrics (/1.0/metrics) ──" + echo "$builtin_metrics" + echo "" + else + echo "# WARNING: could not read /1.0/metrics (check incus daemon status)" >&2 + fi + + # ======================================================================== + # Exporter Runtime + # ======================================================================== + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + trap 'echo "Shutting down Incus metrics exporter..." >&2; exit 0' INT TERM + + while true; do + { + read -r request + local body + if [[ "$request" =~ ^GET\ /metrics ]]; then + body=$(generate_metrics) + printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + else + body=$(cat <<'HTMLEOF' + + +Incus Metrics Exporter v1.0 + +

Incus Metrics Exporter v1.0

+

Metrics

+

Sections

+
    +
  • Storage pool usage (total, used, ratio)
  • +
  • Instance inventory (count by type/status, info per instance)
  • +
  • Snapshot counts and age per instance
  • +
  • Image cache (count and size per image)
  • +
+ + +HTMLEOF +) + printf "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + fi + } | if nc -h 2>&1 | grep -q 'GNU\|traditional'; then + nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + else + nc -l "$HTTP_PORT" 2>/dev/null + fi + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.incus_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 3 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/infra-smoke-tests.sh b/infra-smoke-tests.sh new file mode 100755 index 0000000..3434691 --- /dev/null +++ b/infra-smoke-tests.sh @@ -0,0 +1,737 @@ +#!/usr/bin/env bash + +######################################################################################### +#### infra-smoke-tests.sh — Verify Prometheus/Grafana/Alertmanager/Loki stack health #### +#### Zero external dependencies. Runs in air-gapped environments. 
#### +#### Requires: bash 4+, curl, openssl (optional) #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### export PROMETHEUS_URL="http://prometheus:9090" #### +#### export GRAFANA_URL="http://grafana:3000" #### +#### ./infra-smoke-tests.sh #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +PROMETHEUS_URL="${PROMETHEUS_URL:-}" +GRAFANA_URL="${GRAFANA_URL:-}" +GRAFANA_TOKEN="${GRAFANA_TOKEN:-}" +ALERTMANAGER_URL="${ALERTMANAGER_URL:-}" +LOKI_URL="${LOKI_URL:-}" +EXPECTED_JOBS="${EXPECTED_JOBS:-}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +CURL_INSECURE="${CURL_INSECURE:-false}" +SKIP_LOKI="${SKIP_LOKI:-false}" +SKIP_ALERTMANAGER="${SKIP_ALERTMANAGER:-false}" +SKIP_GRAFANA="${SKIP_GRAFANA:-false}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" # text, tap, junit +JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0 +FAIL=0 +SKIP=0 +TOTAL=0 +RESULTS=() +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" 
== "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording ───────────────────────────────────────────── +record_pass() { + local name="$1" + local detail="${2:-}" + ((PASS++)) || true + ((TOTAL++)) || true + RESULTS+=("PASS|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "ok ${TOTAL} - ${name}" + else + echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}" + fi +} + +record_fail() { + local name="$1" + local detail="${2:-}" + ((FAIL++)) || true + ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else + echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}" + fi +} + +record_skip() { + local name="$1" + local reason="${2:-}" + ((SKIP++)) || true + ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else + echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}" + fi +} + +# ── curl wrappers ───────────────────────────────────────────────────── +http_get() { + local url="$1" + shift + local curl_opts=(-s -S --max-time "$CURL_TIMEOUT") + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + verbose "curl GET ${url}" + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +http_get_status() { + local url="$1" + shift + local curl_opts=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT") + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + verbose "curl GET (status) ${url}" + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +http_get_with_status() { + local url="$1" + shift + local curl_opts=(-s -S -w "\n%{http_code}" --max-time "$CURL_TIMEOUT") + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + verbose "curl GET (body+status) ${url}" + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +grafana_get() { + local endpoint="$1" + local 
url="${GRAFANA_URL}${endpoint}" + if [[ -n "$GRAFANA_TOKEN" ]]; then + http_get "$url" -H "Authorization: Bearer ${GRAFANA_TOKEN}" + else + http_get "$url" + fi +} + +grafana_get_status() { + local endpoint="$1" + local url="${GRAFANA_URL}${endpoint}" + if [[ -n "$GRAFANA_TOKEN" ]]; then + http_get_status "$url" -H "Authorization: Bearer ${GRAFANA_TOKEN}" + else + http_get_status "$url" + fi +} + +# ── JSON parsing (no jq required) ──────────────────────────────────── +json_value() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:\s*\"?\K[^\",}]+" || true; } | head -1 +} + +json_value_string() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" || true; } | head -1 +} + +json_count() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"" || true; } | wc -l +} + +# ── TLS certificate check ──────────────────────────────────────────── +check_tls_cert() { + local url="$1" + local name="$2" + local host port + host=$(echo "$url" | sed -E 's|https?://||; s|/.*||; s|:.*||') + port=$(echo "$url" | grep -oP ':\K[0-9]+(?=/|$)' || echo "443") + + if ! 
command -v openssl &>/dev/null; then + record_skip "${name} TLS certificate" "openssl not available" + return + fi + + if [[ "$url" != https://* ]]; then + record_skip "${name} TLS certificate" "not HTTPS" + return + fi + + local cert_info + cert_info=$(echo | openssl s_client -servername "$host" -connect "${host}:${port}" 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null) || { + record_fail "${name} TLS certificate" "could not retrieve certificate" + return + } + + local end_date + end_date=$(echo "$cert_info" | sed -n 's/notAfter=//p') + if [[ -z "$end_date" ]]; then + record_fail "${name} TLS certificate" "could not parse expiry" + return + fi + + local end_epoch now_epoch days_left + end_epoch=$(date -d "$end_date" +%s 2>/dev/null) || { + record_fail "${name} TLS certificate" "could not parse date: ${end_date}" + return + } + now_epoch=$(date +%s) + days_left=$(( (end_epoch - now_epoch) / 86400 )) + + if [[ $days_left -lt 0 ]]; then + record_fail "${name} TLS certificate" "EXPIRED ${days_left#-} days ago" + elif [[ $days_left -lt 14 ]]; then + record_fail "${name} TLS certificate" "expires in ${days_left} days (critical)" + elif [[ $days_left -lt 30 ]]; then + record_pass "${name} TLS certificate" "${days_left} days remaining (warning)" + else + record_pass "${name} TLS certificate" "${days_left} days remaining" + fi +} + +# ── Output Functions ────────────────────────────────────────────────── +section_header() { + local name="$1" + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + echo -e "${BOLD}${name}${RESET}" + fi +} + +print_header() { + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}Infra Smoke Tests${RESET}" + [[ -n "$PROMETHEUS_URL" ]] && echo "Prometheus: ${PROMETHEUS_URL}" + [[ -n "$GRAFANA_URL" ]] && echo "Grafana: ${GRAFANA_URL}" + [[ -n "$ALERTMANAGER_URL" ]] && echo "Alertmanager: ${ALERTMANAGER_URL}" + [[ -n "$LOKI_URL" ]] && echo "Loki: ${LOKI_URL}" + echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + fi +} + +print_summary() 
{ + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + local target="${PROMETHEUS_URL:-${GRAFANA_URL:-${ALERTMANAGER_URL:-${LOKI_URL:-unknown}}}}" + + if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${target}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + + if [[ $FAIL -eq 0 ]]; then + echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else + echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}" + fi + fi +} + +print_tap_header() { echo "TAP version 13"; } +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +write_junit() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + local result + for result in "${RESULTS[@]}"; do + local status name detail + status=$(echo "$result" | cut -d'|' -f1) + name=$(echo "$result" | cut -d'|' -f2) + detail=$(echo "$result" | cut -d'|' -f3) + + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo "$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + + case "$status" in + PASS) + echo " " >> "$JUNIT_FILE" + [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + FAIL) + echo " " >> "$JUNIT_FILE" + echo " FAILED: ${name} — ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + SKIP) + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + esac + done + echo " " >> "$JUNIT_FILE" + echo "" >> "$JUNIT_FILE" +} + +# ── Test Suites ─────────────────────────────────────────────────────── + +test_connectivity() { + section_header "Connectivity" + + # Prometheus health + if [[ -n "$PROMETHEUS_URL" ]]; then + local status + 
status=$(http_get_status "${PROMETHEUS_URL}/-/healthy") || status="000" + if [[ "$status" == "200" ]]; then + record_pass "Prometheus health endpoint" "HTTP ${status}" + else + record_fail "Prometheus health endpoint" "HTTP ${status}" + fi + check_tls_cert "$PROMETHEUS_URL" "Prometheus" + fi + + # Grafana health + if [[ -n "$GRAFANA_URL" && "$SKIP_GRAFANA" != "true" ]]; then + local status + status=$(http_get_status "${GRAFANA_URL}/api/health") || status="000" + if [[ "$status" == "200" ]]; then + record_pass "Grafana health endpoint" "HTTP ${status}" + else + record_fail "Grafana health endpoint" "HTTP ${status}" + fi + check_tls_cert "$GRAFANA_URL" "Grafana" + fi + + # Alertmanager health + if [[ -n "$ALERTMANAGER_URL" && "$SKIP_ALERTMANAGER" != "true" ]]; then + local status + status=$(http_get_status "${ALERTMANAGER_URL}/-/healthy") || status="000" + if [[ "$status" == "200" ]]; then + record_pass "Alertmanager health endpoint" "HTTP ${status}" + else + record_fail "Alertmanager health endpoint" "HTTP ${status}" + fi + check_tls_cert "$ALERTMANAGER_URL" "Alertmanager" + fi + + # Loki health + if [[ -n "$LOKI_URL" && "$SKIP_LOKI" != "true" ]]; then + local status + status=$(http_get_status "${LOKI_URL}/ready") || status="000" + if [[ "$status" == "200" ]]; then + record_pass "Loki ready endpoint" "HTTP ${status}" + else + record_fail "Loki ready endpoint" "HTTP ${status}" + fi + check_tls_cert "$LOKI_URL" "Loki" + fi +} + +test_prometheus() { + if [[ -z "$PROMETHEUS_URL" ]]; then return; fi + section_header "Prometheus" + + # Targets + local body + body=$(http_get "${PROMETHEUS_URL}/api/v1/targets?state=active") || body="" + if [[ -n "$body" ]]; then + local active_count + active_count=$(echo "$body" | { grep -oP '"health"\s*:\s*"up"' || true; } | wc -l) + local total_count + total_count=$(echo "$body" | { grep -oP '"health"\s*:\s*"' || true; } | wc -l) + if [[ $total_count -gt 0 ]]; then + record_pass "Prometheus targets" "${active_count}/${total_count} targets 
up" + else + record_fail "Prometheus targets" "no targets found" + fi + else + record_fail "Prometheus targets" "could not query targets API" + fi + + # Check expected jobs if configured + if [[ -n "$EXPECTED_JOBS" ]]; then + local targets_body + targets_body=$(http_get "${PROMETHEUS_URL}/api/v1/targets") || targets_body="" + IFS=',' read -ra jobs <<< "$EXPECTED_JOBS" + local job + for job in "${jobs[@]}"; do + job=$(echo "$job" | xargs) # trim whitespace + if echo "$targets_body" | grep -qP "\"job\"\s*:\s*\"${job}\""; then + local job_health + job_health=$(echo "$targets_body" | { grep -oP "\"job\"\s*:\s*\"${job}\"[^}]*\"health\"\s*:\s*\"\K[^\"]*" || true; } | head -1) + if [[ "$job_health" == "up" ]]; then + record_pass "Expected job: ${job}" "up" + else + record_fail "Expected job: ${job}" "health: ${job_health:-unknown}" + fi + else + record_fail "Expected job: ${job}" "not found in targets" + fi + done + fi + + # Alerting rules + local rules_body + rules_body=$(http_get "${PROMETHEUS_URL}/api/v1/rules") || rules_body="" + if [[ -n "$rules_body" ]]; then + local rule_groups firing_count + rule_groups=$(echo "$rules_body" | { grep -oP '"type"\s*:\s*"alerting"' || true; } | wc -l) + firing_count=$(echo "$rules_body" | { grep -oP '"state"\s*:\s*"firing"' || true; } | wc -l) + record_pass "Prometheus alerting rules" "${rule_groups} rules loaded, ${firing_count} firing" + else + record_fail "Prometheus alerting rules" "could not query rules API" + fi + + # TSDB stats + local tsdb_body + tsdb_body=$(http_get "${PROMETHEUS_URL}/api/v1/status/tsdb") || tsdb_body="" + if [[ -n "$tsdb_body" ]]; then + local num_series + num_series=$(json_value "numSeries" "$tsdb_body") + if [[ -n "$num_series" && "$num_series" -gt 0 ]] 2>/dev/null; then + record_pass "Prometheus TSDB stats" "${num_series} time series" + else + record_pass "Prometheus TSDB stats" "responding" + fi + else + record_fail "Prometheus TSDB stats" "could not query TSDB status" + fi + + # Config reload check + 
local config_body + config_body=$(http_get "${PROMETHEUS_URL}/api/v1/status/config") || config_body="" + if [[ -n "$config_body" ]]; then + local config_status + config_status=$(json_value "status" "$config_body") + if [[ "$config_status" == "success" ]]; then + record_pass "Prometheus config" "loaded successfully" + else + record_fail "Prometheus config" "status: ${config_status:-unknown}" + fi + else + record_fail "Prometheus config" "could not query config API" + fi +} + +test_grafana() { + if [[ -z "$GRAFANA_URL" || "$SKIP_GRAFANA" == "true" ]]; then return; fi + section_header "Grafana" + + # Datasources + local ds_body + ds_body=$(grafana_get "/api/datasources") || ds_body="" + if [[ -n "$ds_body" && "$ds_body" != "null" ]]; then + local ds_count + ds_count=$(echo "$ds_body" | { grep -oP '"id"\s*:' || true; } | wc -l) + if [[ $ds_count -gt 0 ]]; then + record_pass "Grafana datasources" "${ds_count} configured" + else + record_pass "Grafana datasources" "API responding (0 datasources)" + fi + else + local ds_status + ds_status=$(grafana_get_status "/api/datasources") || ds_status="000" + if [[ "$ds_status" == "401" || "$ds_status" == "403" ]]; then + record_skip "Grafana datasources" "authentication required (HTTP ${ds_status})" + else + record_fail "Grafana datasources" "could not query datasources API" + fi + fi + + # Dashboards search + local dash_body + dash_body=$(grafana_get "/api/search?type=dash-db&limit=1000") || dash_body="" + if [[ -n "$dash_body" && "$dash_body" != "null" ]]; then + local dash_count + dash_count=$(echo "$dash_body" | { grep -oP '"uid"\s*:' || true; } | wc -l) + record_pass "Grafana dashboards" "${dash_count} found" + else + local dash_status + dash_status=$(grafana_get_status "/api/search?type=dash-db") || dash_status="000" + if [[ "$dash_status" == "401" || "$dash_status" == "403" ]]; then + record_skip "Grafana dashboards" "authentication required (HTTP ${dash_status})" + else + record_fail "Grafana dashboards" "could not query 
search API" + fi + fi + + # Auth check (org info) + local org_body + org_body=$(grafana_get "/api/org") || org_body="" + if [[ -n "$org_body" ]]; then + local org_name + org_name=$(json_value_string "name" "$org_body") + if [[ -n "$org_name" ]]; then + record_pass "Grafana authentication" "org: ${org_name}" + else + local org_status + org_status=$(grafana_get_status "/api/org") || org_status="000" + if [[ "$org_status" == "401" || "$org_status" == "403" ]]; then + record_skip "Grafana authentication" "token not provided or invalid" + else + record_pass "Grafana authentication" "API responding" + fi + fi + else + record_skip "Grafana authentication" "could not query org API" + fi +} + +test_alertmanager() { + if [[ -z "$ALERTMANAGER_URL" || "$SKIP_ALERTMANAGER" == "true" ]]; then return; fi + section_header "Alertmanager" + + # Cluster status + local cluster_body + cluster_body=$(http_get "${ALERTMANAGER_URL}/api/v2/status") || cluster_body="" + if [[ -n "$cluster_body" ]]; then + local cluster_status + cluster_status=$(json_value_string "status" "$cluster_body") + local peers + peers=$(echo "$cluster_body" | { grep -oP '"address"\s*:' || true; } | wc -l) + if [[ "$cluster_status" == "ready" || -n "$cluster_status" ]]; then + record_pass "Alertmanager cluster status" "${cluster_status:-ok}, ${peers} peer(s)" + else + record_pass "Alertmanager cluster status" "responding" + fi + else + record_fail "Alertmanager cluster status" "could not query status API" + fi + + # Active alerts + local alerts_body + alerts_body=$(http_get "${ALERTMANAGER_URL}/api/v2/alerts?active=true") || alerts_body="" + if [[ -n "$alerts_body" ]]; then + local alert_count + alert_count=$(echo "$alerts_body" | { grep -oP '"fingerprint"\s*:' || true; } | wc -l) + record_pass "Alertmanager active alerts" "${alert_count} active" + else + record_fail "Alertmanager active alerts" "could not query alerts API" + fi + + # Receivers + local receivers_body + receivers_body=$(http_get 
"${ALERTMANAGER_URL}/api/v2/receivers") || receivers_body="" + if [[ -n "$receivers_body" ]]; then + local receiver_count + receiver_count=$(echo "$receivers_body" | { grep -oP '"name"\s*:' || true; } | wc -l) + record_pass "Alertmanager receivers" "${receiver_count} configured" + else + record_fail "Alertmanager receivers" "could not query receivers API" + fi +} + +test_loki() { + if [[ -z "$LOKI_URL" || "$SKIP_LOKI" == "true" ]]; then return; fi + section_header "Loki" + + # Labels + local labels_body + labels_body=$(http_get "${LOKI_URL}/loki/api/v1/labels") || labels_body="" + if [[ -n "$labels_body" ]]; then + local labels_status + labels_status=$(json_value "status" "$labels_body") + if [[ "$labels_status" == "success" ]]; then + local label_count + label_count=$(echo "$labels_body" | { grep -oP '"[^"]+"\s*[,\]]' || true; } | wc -l) + record_pass "Loki labels" "${label_count} labels found" + else + record_fail "Loki labels" "status: ${labels_status:-unknown}" + fi + else + record_fail "Loki labels" "could not query labels API" + fi + + # Basic query + local query_body + local encoded_query + encoded_query=$(printf '%s' '{job=~".+"}' | curl -Gso /dev/null -w '%{url_effective}' --data-urlencode @- '' 2>/dev/null | sed 's/^.//') || encoded_query='%7Bjob%3D~%22.%2B%22%7D' + query_body=$(http_get "${LOKI_URL}/loki/api/v1/query?query=${encoded_query}&limit=1") || query_body="" + if [[ -n "$query_body" ]]; then + local query_status + query_status=$(json_value "status" "$query_body") + if [[ "$query_status" == "success" ]]; then + record_pass "Loki query" "query engine responding" + else + record_fail "Loki query" "status: ${query_status:-unknown}" + fi + else + record_fail "Loki query" "could not query Loki" + fi +} + +test_integration() { + if [[ -z "$PROMETHEUS_URL" || -z "$GRAFANA_URL" || "$SKIP_GRAFANA" == "true" ]]; then return; fi + section_header "Integration" + + # Grafana → Prometheus query via Grafana proxy + local ds_body + ds_body=$(grafana_get 
"/api/datasources") || ds_body="" + if [[ -z "$ds_body" || "$ds_body" == "null" ]]; then + record_skip "Grafana → Prometheus query" "could not list datasources" + return + fi + + # Find a Prometheus datasource UID + local prom_uid + prom_uid=$(echo "$ds_body" | { grep -oP '"type"\s*:\s*"prometheus"[^}]*"uid"\s*:\s*"\K[^"]*' || true; } | head -1) + if [[ -z "$prom_uid" ]]; then + prom_uid=$(echo "$ds_body" | { grep -oP '"uid"\s*:\s*"\K[^"]*' || true; } | head -1) + fi + + if [[ -z "$prom_uid" ]]; then + record_skip "Grafana → Prometheus query" "no Prometheus datasource found" + return + fi + + local proxy_body + proxy_body=$(grafana_get "/api/datasources/proxy/uid/${prom_uid}/api/v1/query?query=up") || proxy_body="" + if [[ -n "$proxy_body" ]]; then + local proxy_status + proxy_status=$(json_value "status" "$proxy_body") + if [[ "$proxy_status" == "success" ]]; then + record_pass "Grafana → Prometheus query" "proxy query succeeded via datasource ${prom_uid}" + else + record_fail "Grafana → Prometheus query" "proxy returned: ${proxy_status:-unknown}" + fi + else + record_skip "Grafana → Prometheus query" "proxy query returned empty (may need auth)" + fi +} + +# ── Main / Argument Parsing ─────────────────────────────────────────── +usage() { + cat <&2 + exit 1 +fi + +if [[ -z "$NTFY_URL" && "$TEXTFILE_ENABLED" == "false" && "$DRY_RUN" == "false" ]]; then + echo -e "${RED}Error: At least one of --ntfy-url or --textfile is required${NC}" >&2 + echo " Example: sudo $(basename "$0") --ntfy-url https://ntfy.example.com/bot-alerts" + echo " Example: sudo $(basename "$0") --textfile" + exit 1 +fi + +if [[ "$TEXTFILE_ENABLED" == "true" && ! -d "$TEXTFILE_DIR" && "$DRY_RUN" == "false" ]]; then + echo -e "${RED}Error: Textfile directory not found: ${TEXTFILE_DIR}${NC}" >&2 + echo " Create it: sudo mkdir -p ${TEXTFILE_DIR}" + exit 1 +fi + +# --- Auto-detect log directory --- +detect_log_dir() { + if [[ "$LOG_DIR" != "auto" ]]; then + if [[ ! 
-d "$LOG_DIR" ]]; then + echo -e "${RED}Error: Log directory not found: ${LOG_DIR}${NC}" >&2 + exit 1 + fi + if [[ -n "$DOMAIN" ]]; then + LOG_PATTERN="${DOMAIN}.log" + info "Using specified log directory: $LOG_DIR/$LOG_PATTERN" + else + info "Using specified log directory: $LOG_DIR" + fi + return + fi + + step "Auto-detecting web server log directory..." + + # HestiaCP / VestaCP — per-domain logs + # Check apache first: has full access logs with user agents + # (nginx domain logs are proxy logs in nginx+apache mode) + if [[ -d /var/log/apache2/domains ]]; then + LOG_DIR="/var/log/apache2/domains" + if [[ -n "$DOMAIN" ]]; then + LOG_PATTERN="${DOMAIN}.log" + if [[ ! -f "${LOG_DIR}/${LOG_PATTERN}" ]]; then + echo -e "${RED}Error: Domain log not found: ${LOG_DIR}/${LOG_PATTERN}${NC}" >&2 + exit 1 + fi + info "Detected HestiaCP/VestaCP apache: $LOG_DIR/$LOG_PATTERN" + else + LOG_PATTERN="*.log" + info "Detected HestiaCP/VestaCP apache: $LOG_DIR (all domains)" + fi + return + fi + + if [[ -d /var/log/nginx/domains ]]; then + LOG_DIR="/var/log/nginx/domains" + if [[ -n "$DOMAIN" ]]; then + LOG_PATTERN="${DOMAIN}.log" + if [[ ! 
-f "${LOG_DIR}/${LOG_PATTERN}" ]]; then + echo -e "${RED}Error: Domain log not found: ${LOG_DIR}/${LOG_PATTERN}${NC}" >&2 + exit 1 + fi + info "Detected HestiaCP/VestaCP nginx: $LOG_DIR/$LOG_PATTERN" + else + LOG_PATTERN="*.log" + info "Detected HestiaCP/VestaCP nginx: $LOG_DIR (all domains)" + fi + return + fi + + # Standard nginx + if [[ -f /var/log/nginx/access.log ]]; then + LOG_DIR="/var/log/nginx" + LOG_PATTERN="access.log" + info "Detected nginx: $LOG_DIR/$LOG_PATTERN" + return + fi + + # Apache (Debian/Ubuntu) + if [[ -f /var/log/apache2/access.log ]]; then + LOG_DIR="/var/log/apache2" + LOG_PATTERN="access.log" + info "Detected apache2: $LOG_DIR/$LOG_PATTERN" + return + fi + + # Apache (RHEL/Rocky) + if [[ -f /var/log/httpd/access_log ]]; then + LOG_DIR="/var/log/httpd" + LOG_PATTERN="access_log" + info "Detected httpd: $LOG_DIR/$LOG_PATTERN" + return + fi + + echo -e "${RED}Error: Could not auto-detect log directory. Use --log-dir to specify.${NC}" >&2 + exit 1 +} + +detect_log_dir + +# ===================================================== +# Step 1: Create directories +# ===================================================== +step "Creating directories" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: /etc/bot-monitor/" + echo " Would create: /var/lib/bot-monitor/" +else + mkdir -p /etc/bot-monitor /var/lib/bot-monitor + info "Created /etc/bot-monitor and /var/lib/bot-monitor" +fi + +# ===================================================== +# Step 2: Install known agents whitelist +# ===================================================== +step "Installing known agents whitelist" + +AGENTS_FILE="/etc/bot-monitor/known-agents.txt" + +AGENTS_CONTENT='# Known user agents — one grep pattern per line +# Agents matching these patterns are excluded from unknown bot alerts. +# Add your own patterns as needed. 
+# +# https://mylinux.work + +# --- Search engines --- +Googlebot +Bingbot +Applebot +DuckDuckBot +YandexBot +Baiduspider +Sogou +Qwantify +Qwantbot + +# --- Social media / link previews --- +facebookexternalhit +Facebot +Twitterbot +LinkedInBot +Pinterestbot +Slackbot +Discordbot +TelegramBot +WhatsApp +SkypeUriPreview +BingPreview + +# --- Browsers --- +Chrome +Firefox +Safari +Edge +Opera +Vivaldi +Brave + +# --- Monitoring / uptime --- +Uptime-Kuma +UptimeRobot +Pingdom +StatusCake +Better Uptime +Datadog +Site24x7 +Cloudflare-Healthchecks +Fastly-Healthcheck +Blackbox-Exporter +ufw-threat-feeds + +# --- Feed readers --- +Feedly +Feedbin +NewsBlur +Tiny Tiny RSS +FreshRSS +Miniflux + +# --- Tools --- +curl +Wget +HTTPie +Lynx +w3m +link-check + +# --- AI user-facing search (cite sources) --- +ChatGPT-User +Claude-User +DuckAssistBot + +# --- OS-level networking --- +WebKit.Networking +NetworkingExtension + +# --- AI scrapers (already blocked) --- +ABEvalBot +GPTBot +ClaudeBot +anthropic-ai +CCBot +Bytespider +TikTokSpider +cohere-ai +PerplexityBot +Diffbot +MistralBot +YandexGPTBot +meta-externalagent +Meta-ExternalFetcher +meta-webindexer +PetalBot +Amazonbot +Amzn-SearchBot +AI2Bot +Ai2Bot-Dolma +Timpibot +img2dataset +YouBot +HanaleiBot +Applebot-Extended +Google-Extended + +# --- SEO crawlers (already blocked) --- +MJ12bot +SemrushBot +AhrefsBot +DotBot +DataForSeoBot +SERanking + +# --- Scraping frameworks (already blocked) --- +Scrapy +python-requests +Go-http-client +Java/ +libwww-perl +trafilatura + +# --- Vulnerability scanners (already blocked) --- +Nikto +sqlmap +Nmap +masscan +ZmEu +Morpheus' + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would create: ${AGENTS_FILE}" +else + if [[ -f "$AGENTS_FILE" ]]; then + cp "$AGENTS_FILE" "${AGENTS_FILE}.bak.$(date +%s)" + warn "Existing whitelist backed up" + fi + echo "$AGENTS_CONTENT" > "$AGENTS_FILE" + info "Installed: ${AGENTS_FILE} ($(grep -cve '^\s*#' -e '^\s*$' "$AGENTS_FILE") patterns)" +fi + +# 
===================================================== +# Step 3: Install bot-monitor script +# ===================================================== +step "Installing bot-monitor script" + +SCRIPT_FILE="/usr/local/bin/bot-monitor.sh" + +cat > /tmp/bot-monitor.sh.tmp << 'SCRIPTEOF' +#!/bin/bash +# /usr/local/bin/bot-monitor.sh +# Detects unknown user agents in yesterday's web server logs. +# Sends alerts via ntfy and/or exports Prometheus metrics. +# +# Author: Phil Connor +# License: MIT + +set -euo pipefail + +# --- Configuration --- +LOG_DIR="__LOG_DIR__" +LOG_PATTERN="__LOG_PATTERN__" +KNOWN_AGENTS="/etc/bot-monitor/known-agents.txt" +STATE_DIR="/var/lib/bot-monitor" +NTFY_URL="__NTFY_URL__" +TEXTFILE_DIR="__TEXTFILE_DIR__" +TEXTFILE_ENABLED=__TEXTFILE_ENABLED__ +MIN_REQUESTS=__MIN_REQUESTS__ +HOSTNAME=$(hostname -f) + +# --- Setup --- +mkdir -p "$STATE_DIR" + +# Build grep exclusion pattern from known agents file +EXCLUDE_PATTERN=$(grep -v '^#' "$KNOWN_AGENTS" | grep -v '^$' | paste -sd'|') + +# --- Extract unknown agents from yesterday's logs --- +YESTERDAY=$(date -d yesterday +%d/%b/%Y) +UNKNOWN_FILE="$STATE_DIR/unknown-$(date -d yesterday +%Y-%m-%d).txt" + +grep "$YESTERDAY" "$LOG_DIR"/$LOG_PATTERN 2>/dev/null \ + | awk -F'"' '{print $6}' \ + | grep -v '^-$' \ + | grep -v '^$' \ + | sort | uniq -c | sort -rn \ + | grep -viE "$EXCLUDE_PATTERN" \ + | awk -v min="$MIN_REQUESTS" '$1 >= min' \ + > "$UNKNOWN_FILE" || true + +AGENT_COUNT=$(wc -l < "$UNKNOWN_FILE") +TOTAL_REQUESTS=0 +if [ "$AGENT_COUNT" -gt 0 ]; then + TOTAL_REQUESTS=$(awk '{sum += $1} END {print sum+0}' "$UNKNOWN_FILE") +fi + +# --- ntfy alert --- +if [ "$AGENT_COUNT" -gt 0 ] && [ -n "${NTFY_URL}" ]; then + SUMMARY=$(head -10 "$UNKNOWN_FILE" | while read count agent; do + printf " %6d %s\n" "$count" "$agent" + done) + + curl -fsSL \ + -H "Title: Unknown bots detected on $HOSTNAME" \ + -H "Priority: 3" \ + -H "Tags: spider,warning" \ + -d "Found $AGENT_COUNT unknown user agents yesterday: + 
+$SUMMARY + +Full list: $UNKNOWN_FILE" \ + "$NTFY_URL" > /dev/null 2>&1 +fi + +# --- Prometheus textfile metrics --- +if [ "${TEXTFILE_ENABLED}" = "true" ]; then + OUTPUT_FILE="${TEXTFILE_DIR}/bot_monitor.prom" + PROM_TMP=$(mktemp "${TEXTFILE_DIR}/.bot_monitor.XXXXXX") + + { + echo "# HELP bot_monitor_unknown_agents_total Number of unique unknown user agents detected yesterday." + echo "# TYPE bot_monitor_unknown_agents_total gauge" + echo "bot_monitor_unknown_agents_total ${AGENT_COUNT}" + echo '' + echo "# HELP bot_monitor_unknown_requests_total Total requests from unknown user agents yesterday." + echo "# TYPE bot_monitor_unknown_requests_total gauge" + echo "bot_monitor_unknown_requests_total ${TOTAL_REQUESTS}" + echo '' + echo "# HELP bot_monitor_last_scan_timestamp_seconds Unix timestamp of last bot monitor scan." + echo "# TYPE bot_monitor_last_scan_timestamp_seconds gauge" + echo "bot_monitor_last_scan_timestamp_seconds $(date +%s)" + echo '' + + if [ "$AGENT_COUNT" -gt 0 ]; then + echo "# HELP bot_monitor_agent_requests Requests per unknown user agent yesterday." 
+ echo "# TYPE bot_monitor_agent_requests gauge" + head -20 "$UNKNOWN_FILE" | while read count agent; do + safe_agent=$(echo "$agent" | sed 's/\\/\\\\/g; s/"/\\"/g' | cut -c1-128) + echo "bot_monitor_agent_requests{agent=\"${safe_agent}\"} ${count}" + done + fi + } > "$PROM_TMP" + + chmod 644 "$PROM_TMP" + mv -f "$PROM_TMP" "$OUTPUT_FILE" +fi + +# --- Cleanup state files older than 30 days --- +find "$STATE_DIR" -name "unknown-*.txt" -mtime +30 -delete 2>/dev/null || true +SCRIPTEOF + +# Replace placeholders with actual values +sed -i "s|__LOG_DIR__|${LOG_DIR}|g" /tmp/bot-monitor.sh.tmp +sed -i "s|__LOG_PATTERN__|${LOG_PATTERN}|g" /tmp/bot-monitor.sh.tmp +sed -i "s|__NTFY_URL__|${NTFY_URL}|g" /tmp/bot-monitor.sh.tmp +sed -i "s|__TEXTFILE_DIR__|${TEXTFILE_DIR}|g" /tmp/bot-monitor.sh.tmp +sed -i "s|__TEXTFILE_ENABLED__|${TEXTFILE_ENABLED}|g" /tmp/bot-monitor.sh.tmp +sed -i "s|__MIN_REQUESTS__|${MIN_REQUESTS}|g" /tmp/bot-monitor.sh.tmp + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would install: ${SCRIPT_FILE}" + echo " Log dir: ${LOG_DIR}" + echo " Log pattern: ${LOG_PATTERN}" + [[ -n "$NTFY_URL" ]] && echo " ntfy URL: ${NTFY_URL}" + [[ "$TEXTFILE_ENABLED" == "true" ]] && echo " Textfile dir: ${TEXTFILE_DIR}" + echo " Min requests: ${MIN_REQUESTS}" +else + if [[ -f "$SCRIPT_FILE" ]]; then + cp "$SCRIPT_FILE" "${SCRIPT_FILE}.bak.$(date +%s)" + warn "Existing script backed up" + fi + mv /tmp/bot-monitor.sh.tmp "$SCRIPT_FILE" + chmod +x "$SCRIPT_FILE" + info "Installed: ${SCRIPT_FILE}" +fi + +# ===================================================== +# Step 4: Install cron job in root's crontab +# ===================================================== +step "Installing cron job" + +CRON_LINE="0 ${CRON_HOUR} * * * ${SCRIPT_FILE}" + +if [[ "$DRY_RUN" == "true" ]]; then + echo " Would add to root crontab: ${CRON_LINE}" + echo " Schedule: daily at ${CRON_HOUR}:00" +else + # Check if already in crontab + if crontab -l 2>/dev/null | grep -qF "$SCRIPT_FILE"; then + warn "Cron 
entry already exists in root crontab — skipping" + else + (crontab -l 2>/dev/null; echo "${CRON_LINE}") | crontab - + info "Cron added to root crontab: daily at ${CRON_HOUR}:00" + fi +fi + +# ===================================================== +# Summary +# ===================================================== +echo "" +echo -e "${BOLD}Done.${NC}" +echo "" +echo " Script: ${SCRIPT_FILE}" +echo " Whitelist: ${AGENTS_FILE}" +echo " State dir: /var/lib/bot-monitor/" +echo " Cron: daily at ${CRON_HOUR}:00" +echo " Log source: ${LOG_DIR}/${LOG_PATTERN}" +[[ -n "$NTFY_URL" ]] && echo " Alerts: ${NTFY_URL}" +[[ "$TEXTFILE_ENABLED" == "true" ]] && echo " Metrics: ${TEXTFILE_DIR}/bot_monitor.prom" +echo "" +echo " Test manually:" +echo " sudo ${SCRIPT_FILE}" +echo "" +echo " Check results:" +echo " ls -la /var/lib/bot-monitor/" +echo " cat /var/lib/bot-monitor/unknown-\$(date -d yesterday +%Y-%m-%d).txt" +if [[ "$TEXTFILE_ENABLED" == "true" ]]; then + echo "" + echo " Check Prometheus metrics:" + echo " cat ${TEXTFILE_DIR}/bot_monitor.prom" + echo "" + echo " PromQL examples:" + echo " bot_monitor_unknown_agents_total # unique unknown agents" + echo " bot_monitor_unknown_requests_total # total requests from unknowns" + echo " bot_monitor_agent_requests # per-agent request counts" +fi +echo "" +echo " Edit whitelist to reduce noise:" +echo " nano ${AGENTS_FILE}" diff --git a/install-cadvisor.sh b/install-cadvisor.sh new file mode 100644 index 0000000..304dca7 --- /dev/null +++ b/install-cadvisor.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +############################################################# +#### cAdvisor Installer #### +#### Download, install, and configure Google cAdvisor #### +#### for Docker container metrics with Prometheus #### +#### #### +#### Supports: Docker (container) or binary (systemd) #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### Version: 1.0.0.20260308 #### 
+#############################################################
+
+set -euo pipefail
+
+SCRIPT_NAME=$(basename "$0")
+readonly SCRIPT_NAME
+
+# Defaults
+readonly DEFAULT_CADVISOR_VERSION="0.49.1"
+readonly DEFAULT_LISTEN_PORT="8080"
+readonly DEFAULT_PROMETHEUS_CONFIG="/etc/prometheus/prometheus.yml"
+readonly DEFAULT_BIND_ADDRESS="0.0.0.0"
+
+# Configuration (overridable by environment or flags)
+CADVISOR_VERSION="${CADVISOR_VERSION:-$DEFAULT_CADVISOR_VERSION}"
+LISTEN_PORT="${LISTEN_PORT:-$DEFAULT_LISTEN_PORT}"
+BIND_ADDRESS="${BIND_ADDRESS:-$DEFAULT_BIND_ADDRESS}"
+PROMETHEUS_CONFIG="${PROMETHEUS_CONFIG:-$DEFAULT_PROMETHEUS_CONFIG}"
+INSTALL_MODE="docker"
+RESTART_POLICY="unless-stopped"
+CONTAINER_NAME="cadvisor"
+ADD_TO_PROMETHEUS=false
+DRY_RUN=false
+UNINSTALL=false
+
+# Logging
+# Each helper takes the message as $1 and prefixes it with a timestamp.
+# INFO and WARN go to stdout; ERROR goes to stderr.
+log_info() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $1"
+}
+
+log_error() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >&2
+}
+
+log_warn() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARN: $1"
+}
+
+# Print the usage text to stdout and exit 0.
+show_help() {
+    cat << EOF
+Usage: $SCRIPT_NAME [OPTIONS]
+
+Download, install, and configure Google cAdvisor for Docker container monitoring.
+
+INSTALL MODES:
+    --docker                     Run cAdvisor as a Docker container (default)
+    --binary                     Download and install as a systemd service
+
+OPTIONS:
+    --version <version>          cAdvisor version to install (default: $DEFAULT_CADVISOR_VERSION)
+    --port <port>                Metrics listen port (default: $DEFAULT_LISTEN_PORT)
+    --bind <address>             Bind address (default: $DEFAULT_BIND_ADDRESS)
+    --name <name>                Docker container name (default: cadvisor)
+    --restart <policy>           Docker restart policy (default: unless-stopped)
+    --add-to-prometheus          Add scrape target to Prometheus config
+    --prometheus-config <path>   Path to prometheus.yml (default: $DEFAULT_PROMETHEUS_CONFIG)
+    --uninstall                  Remove cAdvisor installation
+    --dry-run                    Show what would be done without executing
+    --help, -h                   Show this help message
+
+ENVIRONMENT VARIABLES:
+    CADVISOR_VERSION     cAdvisor version (default: $DEFAULT_CADVISOR_VERSION)
+    LISTEN_PORT          Metrics listen port (default: $DEFAULT_LISTEN_PORT)
+    BIND_ADDRESS         Bind address (default: $DEFAULT_BIND_ADDRESS)
+    PROMETHEUS_CONFIG    Path to prometheus.yml (default: $DEFAULT_PROMETHEUS_CONFIG)
+
+EXAMPLES:
+    $SCRIPT_NAME --docker
+    $SCRIPT_NAME --docker --port 9080 --add-to-prometheus
+    $SCRIPT_NAME --binary --version 0.49.1 --add-to-prometheus
+    $SCRIPT_NAME --uninstall
+
+EOF
+    exit 0
+}
+
+#########################
+###  Parse Arguments  ###
+#########################
+# Abort with a clear message when a value-taking option is the last word on
+# the command line. Without this, "$2" trips 'set -u' with a cryptic
+# "unbound variable" error.
+#   $1 - option name, $2 - number of arguments left (option included)
+require_value() {
+    if [[ "$2" -lt 2 ]]; then
+        log_error "Option $1 requires a value"
+        exit 2
+    fi
+}
+
+# Translate command-line flags into the global configuration variables.
+# Exits 0 after --help, exits 2 on an unknown option or a missing value.
+parse_arguments() {
+    while [[ $# -gt 0 ]]; do
+        case "$1" in
+            --docker)            INSTALL_MODE="docker"; shift ;;
+            --binary)            INSTALL_MODE="binary"; shift ;;
+            --version)           require_value "$1" "$#"; CADVISOR_VERSION="$2"; shift 2 ;;
+            --port)              require_value "$1" "$#"; LISTEN_PORT="$2"; shift 2 ;;
+            --bind)              require_value "$1" "$#"; BIND_ADDRESS="$2"; shift 2 ;;
+            --name)              require_value "$1" "$#"; CONTAINER_NAME="$2"; shift 2 ;;
+            --restart)           require_value "$1" "$#"; RESTART_POLICY="$2"; shift 2 ;;
+            --add-to-prometheus) ADD_TO_PROMETHEUS=true; shift ;;
+            --prometheus-config) require_value "$1" "$#"; PROMETHEUS_CONFIG="$2"; shift 2 ;;
+            --uninstall)         UNINSTALL=true; shift ;;
+            --dry-run)           DRY_RUN=true; shift ;;
+            --help|-h)           show_help ;;
+            *)
+                log_error "Unknown option: $1"
+                # show_help ends with 'exit 0'; run it in a subshell so the
+                # usage text is still printed but the script fails.
+                ( show_help ) >&2
+                exit 2
+                ;;
+        esac
+    done
+}
+
+# Sanity-check values that are later interpolated into grep patterns,
+# docker arguments, download URLs and the systemd unit.
+validate_config() {
+    # Accept "v0.49.1" as well as "0.49.1"; the "v" is re-added where needed.
+    CADVISOR_VERSION="${CADVISOR_VERSION#v}"
+    if [[ -z "$CADVISOR_VERSION" ]]; then
+        log_error "cAdvisor version must not be empty"
+        exit 2
+    fi
+
+    if [[ ! "$LISTEN_PORT" =~ ^[0-9]{1,5}$ ]] \
+        || (( 10#$LISTEN_PORT < 1 || 10#$LISTEN_PORT > 65535 )); then
+        log_error "Invalid port: $LISTEN_PORT (expected 1-65535)"
+        exit 2
+    fi
+
+    if [[ -z "$CONTAINER_NAME" ]]; then
+        log_error "Container name must not be empty"
+        exit 2
+    fi
+}
+
+# Print the host part to use when talking to cAdvisor from this machine:
+# "localhost" for wildcard binds, otherwise the configured bind address
+# (bare IPv6 literals are bracketed so they can be used in a URL).
+metrics_host() {
+    case "$BIND_ADDRESS" in
+        ""|0.0.0.0|::|"[::]") echo "localhost" ;;
+        \[*)                  echo "$BIND_ADDRESS" ;;
+        *:*)                  echo "[$BIND_ADDRESS]" ;;
+        *)                    echo "$BIND_ADDRESS" ;;
+    esac
+}
+
+#########################
+###  Permission Check ###
+#########################
+# Exit 1 unless the script runs as root.
+check_permissions() {
+    if [[ $EUID -ne 0 ]]; then
+        log_error "This script must be run as root"
+        exit 1
+    fi
+}
+
+#########################
+### Pre-flight Checks ###
+#########################
+# Exit 1 unless the docker CLI exists and the daemon answers 'docker info'.
+check_docker() {
+    if ! command -v docker &>/dev/null; then
+        log_error "Docker is not installed. Install Docker first or use --binary mode."
+        exit 1
+    fi
+
+    if ! docker info &>/dev/null; then
+        log_error "Docker daemon is not running"
+        exit 1
+    fi
+}
+
+# Return 0 when a container named exactly $CONTAINER_NAME exists.
+#   $1 - "running" (default) to consider running containers only,
+#        "any" to include stopped ones.
+# The listing is captured first and matched as a fixed, whole-line string:
+# the name is not a regex, and 'cmd | grep -q' under pipefail can turn a
+# match into a failure when the producer is killed by SIGPIPE.
+container_exists() {
+    local scope="${1:-running}"
+    local names
+    if [[ "$scope" == "any" ]]; then
+        names=$(docker ps -a --format '{{.Names}}' 2>/dev/null) || return 1
+    else
+        names=$(docker ps --format '{{.Names}}' 2>/dev/null) || return 1
+    fi
+    grep -qxF -- "$CONTAINER_NAME" <<<"$names"
+}
+
+# Return 0 when some TCP socket is listening on $LISTEN_PORT
+# (checked with ss, falling back to netstat).
+port_in_use() {
+    local listing
+    listing=$(ss -tln 2>/dev/null || netstat -tln 2>/dev/null || true)
+    grep -qE ":${LISTEN_PORT}[[:space:]]" <<<"$listing"
+}
+
+# Returns 0 when the port is free, 1 when cAdvisor itself (container or
+# systemd service) already occupies it, and exits 1 when a foreign process
+# holds the port.
+check_port_available() {
+    if ! port_in_use; then
+        return 0
+    fi
+
+    log_warn "Port $LISTEN_PORT is already in use"
+    # Check if it's cAdvisor already running
+    if container_exists running; then
+        log_warn "cAdvisor container '$CONTAINER_NAME' is already running"
+        return 1
+    fi
+    if systemctl is-active --quiet cadvisor 2>/dev/null; then
+        log_warn "cAdvisor systemd service is already running"
+        return 1
+    fi
+    log_error "Port $LISTEN_PORT is in use by another process"
+    exit 1
+}
+
+# Print the release-asset architecture suffix for this machine.
+# Called via $(...), so the 'exit 1' ends the substitution and 'set -e'
+# aborts the caller's assignment.
+# NOTE(review): confirm the asset suffix published for 32-bit ARM.
+detect_arch() {
+    local arch
+    arch=$(uname -m)
+    case "$arch" in
+        x86_64)  echo "amd64" ;;
+        aarch64) echo "arm64" ;;
+        armv7l)  echo "armv7" ;;
+        *)       log_error "Unsupported architecture: $arch"; exit 1 ;;
+    esac
+}
+
+#########################
+###  Docker Install   ###
+#########################
+# Pull the cAdvisor image and start it as a privileged container that
+# publishes ${BIND_ADDRESS}:${LISTEN_PORT} -> 8080.
+install_docker_mode() {
+    check_docker
+
+    if ! check_port_available; then
+        log_info "cAdvisor is already running — nothing to do"
+        return
+    fi
+
+    log_info "Installing cAdvisor v${CADVISOR_VERSION} as Docker container"
+
+    local image="gcr.io/cadvisor/cadvisor:v${CADVISOR_VERSION}"
+
+    local docker_cmd=(
+        docker run -d
+        --name "$CONTAINER_NAME"
+        --restart "$RESTART_POLICY"
+        -p "${BIND_ADDRESS}:${LISTEN_PORT}:8080"
+        -v /:/rootfs:ro
+        -v /var/run:/var/run:ro
+        -v /sys:/sys:ro
+        -v /var/lib/docker/:/var/lib/docker:ro
+        -v /dev/disk/:/dev/disk:ro
+        --privileged
+        --device /dev/kmsg
+        "$image"
+    )
+
+    if [[ "$DRY_RUN" == "true" ]]; then
+        log_info "[DRY RUN] Would execute:"
+        # %q keeps arguments containing spaces copy-and-paste safe.
+        printf '  '
+        printf ' %q' "${docker_cmd[@]}"
+        printf '\n'
+        return
+    fi
+
+    # Pull image first
+    log_info "Pulling image: $image"
+    docker pull "$image"
+
+    # Remove an existing container of the same name (stopped, or running on
+    # another port) so 'docker run --name' cannot collide with it.
+    if container_exists any; then
+        log_info "Removing existing stopped container: $CONTAINER_NAME"
+        docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+    fi
+
+    # Run container
+    "${docker_cmd[@]}"
+
+    log_info "Container '$CONTAINER_NAME' started"
+
+    # A slow start is only a warning; without the guard 'set -e' would end
+    # the script here, before the Prometheus step and the summary.
+    if ! wait_for_metrics; then
+        log_warn "Continuing — inspect with: docker logs $CONTAINER_NAME"
+    fi
+}
+
+#########################
+###  Binary Install   ###
+#########################
+# Download the release binary for this architecture, install it to
+# /usr/local/bin/cadvisor and run it as the 'cadvisor' systemd service.
+install_binary_mode() {
+    local arch
+    arch=$(detect_arch)
+
+    local download_url="https://github.com/google/cadvisor/releases/download/v${CADVISOR_VERSION}/cadvisor-v${CADVISOR_VERSION}-linux-${arch}"
+    local binary_path="/usr/local/bin/cadvisor"
+
+    # Same pre-flight as Docker mode: otherwise the service fails to bind and
+    # wait_for_metrics may "succeed" against whatever else owns the port.
+    if ! check_port_available; then
+        log_info "cAdvisor is already running — nothing to do"
+        return
+    fi
+
+    log_info "Installing cAdvisor v${CADVISOR_VERSION} as systemd service (${arch})"
+
+    if [[ "$DRY_RUN" == "true" ]]; then
+        log_info "[DRY RUN] Would download: $download_url"
+        log_info "[DRY RUN] Would install to: $binary_path"
+        log_info "[DRY RUN] Would create systemd unit: /etc/systemd/system/cadvisor.service"
+        return
+    fi
+
+    if ! command -v curl &>/dev/null; then
+        log_error "curl is required to download cAdvisor"
+        exit 1
+    fi
+
+    # Download binary
+    log_info "Downloading cAdvisor binary..."
+    local temp_file
+    temp_file=$(mktemp)
+    if ! curl -fsSL -o "$temp_file" "$download_url"; then
+        rm -f "$temp_file"
+        log_error "Failed to download cAdvisor from $download_url"
+        exit 1
+    fi
+
+    # Install binary. 'install' sets an explicit 0755 mode; a chmod +x on the
+    # 0600 mktemp file would leave the binary unreadable for non-root users.
+    if ! install -m 0755 "$temp_file" "$binary_path"; then
+        rm -f "$temp_file"
+        log_error "Failed to install binary to $binary_path"
+        exit 1
+    fi
+    rm -f "$temp_file"
+    log_info "Installed binary to $binary_path"
+
+    # Create systemd unit
+    create_systemd_unit
+
+    # Enable and start
+    systemctl daemon-reload
+    systemctl enable cadvisor
+    systemctl start cadvisor
+
+    log_info "cAdvisor systemd service started"
+
+    # A slow start is only a warning (see install_docker_mode).
+    if ! wait_for_metrics; then
+        log_warn "Continuing — inspect with: journalctl -u cadvisor"
+    fi
+}
+
+# Write /etc/systemd/system/cadvisor.service using the configured port and
+# bind address. The doubled backslashes become single line continuations in
+# the generated unit file.
+create_systemd_unit() {
+    cat > /etc/systemd/system/cadvisor.service << EOF
+[Unit]
+Description=cAdvisor - Container Advisor
+Documentation=https://github.com/google/cadvisor
+After=network.target docker.service
+Wants=docker.service
+
+[Service]
+Type=simple
+ExecStart=/usr/local/bin/cadvisor \\
+    --port=${LISTEN_PORT} \\
+    --listen_ip=${BIND_ADDRESS} \\
+    --docker_only=true \\
+    --housekeeping_interval=30s \\
+    --storage_duration=2m0s
+Restart=on-failure
+RestartSec=5
+LimitNOFILE=65536
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+    log_info "Created systemd unit: /etc/systemd/system/cadvisor.service"
+}
+
+#########################
+###   Post-install    ###
+#########################
+# Poll the /metrics endpoint up to 15 times, 2 seconds apart.
+# Returns 0 as soon as it answers, 1 on timeout (callers must guard the call
+# because of 'set -e'). The probe uses the bind address when it is not a
+# wildcard, since the port is then not reachable via localhost.
+wait_for_metrics() {
+    log_info "Waiting for metrics endpoint..."
+    local url
+    url="http://$(metrics_host):${LISTEN_PORT}/metrics"
+    local -r max_attempts=15
+    local attempt
+
+    for (( attempt = 0; attempt < max_attempts; attempt++ )); do
+        if curl -sf --max-time 5 "$url" >/dev/null 2>&1; then
+            log_info "cAdvisor is responding on port $LISTEN_PORT"
+            return 0
+        fi
+        sleep 2
+    done
+
+    log_warn "cAdvisor did not respond within ${max_attempts} attempts — check logs"
+    return 1
+}
+
+# Print the host:port Prometheus should scrape.
+# The container name only resolves for a Prometheus that shares a Docker
+# network with cAdvisor; a Prometheus running as a host systemd service has
+# to use the published port instead.
+prometheus_target() {
+    if [[ "$INSTALL_MODE" == "docker" ]] \
+        && ! systemctl is-active --quiet prometheus 2>/dev/null; then
+        echo "${CONTAINER_NAME}:8080"
+    else
+        echo "$(metrics_host):${LISTEN_PORT}"
+    fi
+}
+
+# Append a 'cadvisor' scrape job to $PROMETHEUS_CONFIG (only when
+# --add-to-prometheus was given) and reload Prometheus.
+# Whenever the job is NOT written, ADD_TO_PROMETHEUS is reset to false so the
+# summary prints the snippet for manual installation instead of claiming
+# success.
+add_prometheus_scrape_config() {
+    if [[ "$ADD_TO_PROMETHEUS" != "true" ]]; then
+        return
+    fi
+
+    if [[ ! -f "$PROMETHEUS_CONFIG" ]]; then
+        log_warn "Prometheus config not found at $PROMETHEUS_CONFIG — skipping"
+        ADD_TO_PROMETHEUS=false
+        return
+    fi
+
+    # Check if cadvisor job already exists
+    if grep -q "job_name.*cadvisor" "$PROMETHEUS_CONFIG" 2>/dev/null; then
+        log_info "cAdvisor scrape target already exists in $PROMETHEUS_CONFIG"
+        return
+    fi
+
+    # The job is appended at end of file, which is only valid YAML when
+    # 'scrape_configs:' is the last top-level section. Refuse otherwise
+    # rather than corrupt the configuration.
+    local last_section
+    last_section=$(grep -E '^[A-Za-z_][A-Za-z0-9_]*:' "$PROMETHEUS_CONFIG" | tail -n 1 || true)
+    if [[ ! "$last_section" =~ ^scrape_configs:[[:space:]]*(#.*)?$ ]]; then
+        log_warn "scrape_configs is not the last section of $PROMETHEUS_CONFIG — add the cAdvisor job manually"
+        ADD_TO_PROMETHEUS=false
+        return
+    fi
+
+    if [[ "$DRY_RUN" == "true" ]]; then
+        log_info "[DRY RUN] Would add cAdvisor scrape config to $PROMETHEUS_CONFIG"
+        return
+    fi
+
+    # Backup config
+    local backup
+    backup="${PROMETHEUS_CONFIG}.bak.$(date +%s)"
+    cp -p "$PROMETHEUS_CONFIG" "$backup"
+
+    # Determine the target address
+    local target_host
+    target_host=$(prometheus_target)
+
+    cat >> "$PROMETHEUS_CONFIG" << EOF
+
+  - job_name: 'cadvisor'
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['${target_host}']
+EOF
+
+    # Validate when promtool is available; roll back instead of leaving a
+    # config Prometheus refuses to load.
+    if command -v promtool >/dev/null 2>&1 \
+        && ! promtool check config "$PROMETHEUS_CONFIG" >/dev/null 2>&1; then
+        log_error "promtool rejected the updated config — restoring $backup"
+        cp -p "$backup" "$PROMETHEUS_CONFIG"
+        ADD_TO_PROMETHEUS=false
+        return
+    fi
+
+    log_info "Added cAdvisor scrape target to $PROMETHEUS_CONFIG"
+
+    # Reload Prometheus if running
+    if systemctl is-active --quiet prometheus 2>/dev/null; then
+        if systemctl reload prometheus 2>/dev/null; then
+            log_info "Prometheus configuration reloaded"
+        elif systemctl restart prometheus 2>/dev/null; then
+            log_info "Prometheus restarted"
+        else
+            log_warn "Could not reload or restart Prometheus — reload it manually"
+        fi
+    fi
+}
+
+#########################
+###     Uninstall     ###
+#########################
+# Remove whichever installation is present: the Docker container, the
+# systemd unit and the binary. Every step is best-effort.
+uninstall_cadvisor() {
+    log_info "Uninstalling cAdvisor..."
+
+    if [[ "$DRY_RUN" == "true" ]]; then
+        log_info "[DRY RUN] Would stop and remove Docker container '$CONTAINER_NAME'"
+        log_info "[DRY RUN] Would stop and remove systemd service 'cadvisor'"
+        log_info "[DRY RUN] Would remove /usr/local/bin/cadvisor"
+        return
+    fi
+
+    # Remove Docker container
+    if container_exists any; then
+        docker stop "$CONTAINER_NAME" 2>/dev/null || true
+        docker rm "$CONTAINER_NAME" 2>/dev/null || true
+        log_info "Removed Docker container: $CONTAINER_NAME"
+    fi
+
+    # Remove systemd service
+    if [[ -f /etc/systemd/system/cadvisor.service ]]; then
+        systemctl stop cadvisor 2>/dev/null || true
+        systemctl disable cadvisor 2>/dev/null || true
+        rm -f /etc/systemd/system/cadvisor.service
+        systemctl daemon-reload
+        log_info "Removed systemd service"
+    fi
+
+    # Remove binary
+    if [[ -f /usr/local/bin/cadvisor ]]; then
+        rm -f /usr/local/bin/cadvisor
+        log_info "Removed /usr/local/bin/cadvisor"
+    fi
+
+    log_info "cAdvisor uninstalled"
+}
+
+#########################
+###      Verify       ###
+#########################
+# Print a human-readable summary of the installation (skipped on dry runs).
+verify_installation() {
+    if [[ "$DRY_RUN" == "true" ]]; then
+        return
+    fi
+
+    local base_url
+    base_url="http://$(metrics_host):${LISTEN_PORT}"
+
+    echo
+    echo "=== cAdvisor Installation Summary ==="
+    echo "  Mode:        $INSTALL_MODE"
+    echo "  Version:     $CADVISOR_VERSION"
+    echo "  Metrics URL: ${base_url}/metrics"
+    echo "  Web UI:      ${base_url}/"
+    echo
+
+    if [[ "$INSTALL_MODE" == "docker" ]]; then
+        echo "  Container:   $CONTAINER_NAME"
+        echo "  Status:      $(docker inspect -f '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null || echo 'unknown')"
+    else
+        echo "  Service:     cadvisor.service"
+        echo "  Status:      $(systemctl is-active cadvisor 2>/dev/null || echo 'unknown')"
+    fi
+
+    echo
+    echo "Verify with:"
+    echo "  curl -s ${base_url}/metrics | head -20"
+    echo
+
+    if [[ "$ADD_TO_PROMETHEUS" == "true" ]]; then
+        echo "Prometheus scrape target configured in: $PROMETHEUS_CONFIG"
+        echo
+    else
+        echo "To add to Prometheus, add this to your scrape_configs:"
+        echo
+        echo "  - job_name: 'cadvisor'"
+        echo "    scrape_interval: 15s"
+        echo "    static_configs:"
+        echo "      - targets: ['$(prometheus_target)']"
+        echo
+    fi
+}
+
+#########################
+###       Main        ###
+#########################
+# Entry point: parse and validate options, require root, then either
+# uninstall or install in the selected mode, register the scrape job and
+# print the summary.
+main() {
+    parse_arguments "$@"
+    validate_config
+    check_permissions
+
+    if [[ "$UNINSTALL" == "true" ]]; then
+        uninstall_cadvisor
+        exit 0
+    fi
+
+    log_info "Installing cAdvisor v${CADVISOR_VERSION} (mode: ${INSTALL_MODE})"
+
+    case "$INSTALL_MODE" in
+        docker) install_docker_mode ;;
+        binary) install_binary_mode ;;
+        *)      log_error "Unknown install mode: $INSTALL_MODE"; exit 1 ;;
+    esac
+
+    add_prometheus_scrape_config
+    verify_installation
+}
+
+main "$@"
diff --git a/install-code-server.sh b/install-code-server.sh
index b0d4b6e..68c416f 100644
--- a/install-code-server.sh
+++ b/install-code-server.sh
@@ -2,530 +2,645 @@
 ####################################################################
 #### Code-Server Install Script ####
-#### For RHEL/Rocky/Alma, Oracle Linux, Debian & Ubuntu ####
 #### ####
-#### Author: Phil Connor ####
+#### Installs code-server with Nginx, Apache, or Caddy reverse ####
+#### proxy, Let's Encrypt TLS certificate, and systemd service.
#### +#### #### +#### Supported: RHEL/Rocky/Alma 8+, Oracle Linux 8+, #### +#### Debian 11+, Ubuntu 20.04+ #### +#### #### +#### Author: Phil Connor #### #### Contact: contact@mylinux.work #### +#### Website: https://mylinux.work #### #### License: MIT #### -#### Version: 1.3 #### +#### Version: 2.0 #### #### #### -#### Usage: sudo ./install-code-server.sh #### +#### Usage: #### +#### Edit the User Configuration section below, then run: #### +#### sudo ./install-code-server.sh #### +#### #### +#### Or pass options on the command line: #### +#### sudo ./install-code-server.sh --server code.example.com #### +#### --email admin@example.com --http nginx #### +#### --password 'YourPassword' #### +#### #### +#### sudo ./install-code-server.sh --help #### #################################################################### -############################# -#### User Configurations #### -############################# -CODEDIR=/code # Home directory for your Code -EMAIL=admin@mydomain.com # your domain email address -HTTPTYPE=APACHE # Choose Apache, Caddy or Nginx All UPPER Case -PASSWD=pAsSwOrD # Your Password for Code-server used for Apache, Nginx and Caddy -UNAME=MyUser # Username Used for Caddy -SERVDIR=/usr/local/code-server # where you want the code-server installed -SERVERNAME=code.mydomain.cloud # server fqdn name -USRDIR=/var/lib/code-server +set -euo pipefail -######################## -#### System Configs #### -######################## -CADPASS="$(echo -e "${PASSWD}\n$PASSWD" | caddy hash-password 2>/dev/null | tail --lines=1)" -OS=$(grep PRETTY_NAME /etc/os-release | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]') -OSVER=$(grep VERSION_ID /etc/os-release | sed 's/VERSION_ID=//g' | tr -d '="' | awk -F. 
'{print $1}') +# ============================================================================ +# USER CONFIGURATION -- edit these or override with command-line options +# ============================================================================ -define() { - IFS=$'\n' read -r -d '' "$1" - } +CODEDIR="/code" # Home directory for your code +EMAIL="admin@mydomain.com" # Email for Let's Encrypt registration +HTTPTYPE="NGINX" # APACHE, NGINX, or CADDY +PASSWD="" # code-server password (prompted if empty) +UNAME="" # Username for Caddy basic auth (optional) +SERVDIR="/usr/local/code-server" # code-server install directory +SERVERNAME="" # Server FQDN (required) +USRDIR="/var/lib/code-server" # User data directory (settings, extensions) +INSTALL_CRON=0 # Set to 1 to install auto-update cron +CRON_DAYS=25 # Days between automatic updates -########################################################### -#### Detect Package Manger from OS and OSVer Variables #### -########################################################### -if [ "${OS}" = ubuntu ]; then - PAKMGR="apt-get -y" -elif [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - if [ "${OSVER}" = 7 ]; then - PAKMGR="yum -y" - fi - if [[ ${OSVER} = 8 || ${OSVER} = 9 ]]; then - PAKMGR="dnf -y" - fi -fi +# ============================================================================ +# INTERNAL VARIABLES +# ============================================================================ -################################ -#### Check if OS is Updated #### -################################ -if [ "${OS}" = ubuntu ]; then - ${PAKMGR} upgrade - ${PAKMGR} install libc6 libstdc++6 -else - ${PAKMGR} update -fi +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' +VERSION="2.0" +ARCH="" -############################################### -#### Get the latest version of Code Server #### -############################################### -get_latest_version() { - { - 
version="$(curl -fsSLI -o /dev/null -w "%{url_effective}" https://github.com/coder/code-server/releases/latest)" - version="${version#https://github.com/coder/code-server/releases/tag/}" - version="${version#v}" - echo "$version" - } +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log() { echo -e "${GREEN}[+]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[!]${NC} $1"; } +log_err() { echo -e "${RED}[x]${NC} $1" >&2; } +log_info() { echo -e "${CYAN}[>]${NC} $1"; } + +show_help() { + cat </dev/null 2>&1; then + log_err "Required command not found: $cmd" + exit 1 + fi + done + + # FQDN check + if [[ -z "$SERVERNAME" ]]; then + log_err "Server FQDN is required (--server or edit SERVERNAME in script)" + exit 1 + fi + + # HTTP type check + case "$HTTPTYPE" in + APACHE|NGINX|CADDY) ;; + *) + log_err "Invalid HTTP type: $HTTPTYPE (must be APACHE, NGINX, or CADDY)" + exit 1 + ;; + esac + + # Password -- prompt if empty + if [[ -z "$PASSWD" ]]; then + echo -n "Enter code-server password: " + read -rs PASSWD + echo + if [[ -z "$PASSWD" ]]; then + log_err "Password cannot be empty" + exit 1 + fi + fi + + # DNS check + if ! host "$SERVERNAME" >/dev/null 2>&1 && ! 
dig +short "$SERVERNAME" 2>/dev/null | grep -q .; then + log_warn "DNS lookup for $SERVERNAME failed -- certbot may fail if DNS is not configured" + fi +} + +# ============================================================================ +# SYSTEM UPDATE +# ============================================================================ + +update_system() { + log "Updating system packages" + if [[ "$PKG_TYPE" == "deb" ]]; then + apt-get -y update + apt-get -y upgrade + else + dnf -y update + fi +} + +# ============================================================================ +# INSTALL CODE-SERVER +# ============================================================================ + install_codeserver() { - { - # check if command wget exists - if ! command -v wget >/dev/null 2>&1; then - ${PAKMGR} install wget + log "Installing code-server" + + # Get latest version + local version + version=$(curl -fsSL "https://api.github.com/repos/coder/code-server/releases/latest" | \ + grep '"tag_name"' | head -1 | grep -oP 'v\K[0-9]+\.[0-9]+\.[0-9]+') + + if [[ -z "$version" ]]; then + log_err "Failed to determine latest code-server version" + exit 1 fi - cd ~/ || exit - wget "https://github.com/coder/code-server/releases/download/v$version/code-server-$version-linux-amd64.tar.gz" - tar xvf "code-server-$version-linux-amd64.tar.gz" - mkdir -p ${SERVDIR} - cp -r ~/code-server-"$version"-linux-amd64/* ${SERVDIR} - ln -s ${SERVDIR}/bin/code-server /usr/bin/code-server - # Code Directory - mkdir -p "${CODEDIR}" - # User Directory - mkdir -p "${USRDIR}" + log_info "Latest version: $version" - csserv=/lib/systemd/system - touch $csserv/code-server.service - OUTFILE1="$csserv/code-server.service" - define SFILE << EOF - [Unit] - Description=code-server - After=nginx.service + # Download + local tarball="code-server-${version}-linux-${ARCH}.tar.gz" + local url="https://github.com/coder/code-server/releases/download/v${version}/${tarball}" - [Service] - Type=simple - 
Environment=PASSWORD=$PASSWD - ExecStart=/usr/bin/code-server --bind-addr 127.0.0.1:8080 --user-data-dir ${USRDIR} --auth password - Restart=always + cd /tmp + log_info "Downloading $tarball" + curl -fsSL -o "$tarball" "$url" + tar xzf "$tarball" - [Install] - WantedBy=multi-user.target -EOF + # Install + mkdir -p "$SERVDIR" + cp -r "code-server-${version}-linux-${ARCH}"/* "$SERVDIR"/ - { - printf "%s\n" "$SFILE" | cut -c 2- - } > "$OUTFILE1" + # Create symlink + ln -sf "${SERVDIR}/bin/code-server" /usr/bin/code-server - if [ $HTTPTYPE = CADDY ]; then - sed -i 's/After=nginx.service/After=caddy.service/g' $csserv/code-server.service - sed -i 's/auth: password/auth: none/' /root/.config/code-server/config.yaml - sed -i "s|ExecStart=/usr/bin/code-server --bind-addr 127.0.0.1:8080 --user-data-dir ${USRDIR} --auth password|ExecStart=/usr/bin/code-server --bind-addr 127.0.0.1:8080 --user-data-dir ${USRDIR}|" $csserv/code-server.service - fi - - systemctl daemon-reload - systemctl start code-server - systemctl enable code-server - } + # Create directories + mkdir -p "$CODEDIR" + mkdir -p "$USRDIR" + + # Cleanup + rm -rf "/tmp/$tarball" "/tmp/code-server-${version}-linux-${ARCH}" + + # Verify + if ! command -v code-server >/dev/null 2>&1; then + log_err "code-server installation failed" + exit 1 + fi + log "code-server $(code-server --version | head -1) installed to $SERVDIR" } -######################################## -#### Install Apache, Nginx or Caddy #### -######################################## -install_http() { - { - if [ $HTTPTYPE = APACHE ]; then - csserv=/lib/systemd/system - sed -i 's/After=nginx.service/After=apache.service/g' $csserv/code-server.service - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - if ! command -v httpd &> /dev/null; then - ${PAKMGR} install httpd - systemctl enable --now httpd - fi - AOUTFILE="/etc/httpd/conf.d/code-server.conf" - elif [ "${OS}" = ubuntu ]; then - if ! 
command -v apache2 &> /dev/null; then - ${PAKMGR} install apache2 - systemctl enable --now apache2 - fi - AOUTFILE="/etc/apache2/sites-available/code-server.conf" +# ============================================================================ +# SYSTEMD SERVICE +# ============================================================================ + +create_service() { + log "Creating systemd service" + + local after_service="network.target" + case "$HTTPTYPE" in + NGINX) after_service="nginx.service" ;; + APACHE) after_service="httpd.service" ;; + CADDY) after_service="caddy.service" ;; + esac + + cat > /lib/systemd/system/code-server.service < "${confdir}/code-server.conf" < "$conffile" < + ServerName $SERVERNAME + + RewriteEngine On + RewriteCond %{HTTP:Upgrade} =websocket [NC] + RewriteRule /(.*) ws://127.0.0.1:8080/\$1 [P,L] + RewriteCond %{HTTP:Upgrade} !=websocket [NC] + RewriteRule /(.*) http://127.0.0.1:8080/\$1 [P,L] + + ProxyRequests off + ProxyPreserveHost On + ProxyPass / http://127.0.0.1:8080/ nocanon + ProxyPassReverse / http://127.0.0.1:8080/ + + RequestHeader set X-Forwarded-Proto "http" + +EOF + + if [[ "$PKG_TYPE" == "deb" ]]; then + a2ensite code-server.conf + a2dissite 000-default.conf 2>/dev/null || true + systemctl enable --now apache2 + systemctl reload apache2 + else + systemctl enable --now httpd + systemctl reload httpd + fi + + log "Apache configured and running" +} + +# ============================================================================ +# REVERSE PROXY -- CADDY +# ============================================================================ + +install_caddy() { + log "Installing Caddy" + + if [[ "$PKG_TYPE" == "deb" ]]; then + apt-get -y install debian-keyring debian-archive-keyring apt-transport-https curl + curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | \ + gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg + curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | \ + tee 
/etc/apt/sources.list.d/caddy-stable.list + apt-get -y update + apt-get -y install caddy + else + dnf -y install 'dnf-command(copr)' + dnf -y copr enable @caddy/caddy + dnf -y install caddy + fi + + # Caddy handles TLS automatically -- no certbot needed + if [[ -n "$UNAME" ]]; then + # With basic auth + local caddy_hash + caddy_hash=$(caddy hash-password --plaintext "$PASSWD" 2>/dev/null || \ + echo "$PASSWD" | caddy hash-password 2>/dev/null || echo "") + + if [[ -z "$caddy_hash" ]]; then + log_warn "Could not generate Caddy password hash -- writing config without basic auth" + cat > /etc/caddy/Caddyfile < /etc/caddy/Caddyfile < - ServerName $SERVERNAME - #ProxyPreserveHost On - RewriteEngine On - RewriteCond %{HTTP:Upgrade} =websocket [NC] - RewriteRule /(.*) ws://127.0.0.1:8080/$1 [P,L] - RewriteCond %{HTTP:Upgrade} !=websocket [NC] - RewriteRule /(.*) http://127.0.0.1:8080/$1 [P,L] - ProxyRequests off - #RequestHeader set X-Forwarded-Proto https - #RequestHeader set X-Forwarded-Port 443 - ProxyPass / http://127.0.0.1:8080/ nocanon - ProxyPassReverse / http://127.0.0.1:8080/ - + else + # Without basic auth (code-server handles its own auth) + cat > /etc/caddy/Caddyfile < "$AOUTFILE" - - systemctl daemon-reload - systemctl restart code-server - systemctl restart httpd - fi + fi - if [ $HTTPTYPE = NGINX ]; then - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - OUTFILE="/etc/yum.repos.d/nginx.repo" - define NYUM << 'EOF' - [nginx-stable] - name=nginx stable repo - baseurl=http://nginx.org/packages/centos/$releasever/$basearch/ - gpgcheck=1 - enabled=1 - gpgkey=https://nginx.org/keys/nginx_signing.key - module_hotfixes=true -EOF - { - printf "%s\n" "$NYUM" | cut -c 4- - } > "$OUTFILE" - if [ "${OSVER}" = 8 ] || [ "${OSVER}" = 9 ]; then - # shellcheck disable=2016 - sed -i 's/baseurl=http:\/\/nginx.org\/packages\/centos\/7\/$basearch\//baseurl=http:\/\/nginx.org\/packages\/centos\/8\/$basearch\//g' $OUTFILE - fi - fi 
- - if [ "${OS}" = ubuntu ]; then - ${PAKMGR} install curl gnupg2 ca-certificates lsb-release - echo "deb http://nginx.org/packages/ubuntu $(lsb_release -cs) nginx" | sudo tee /etc/apt/sources.list.d/nginx.list - echo -e "Package: *\nPin: origin nginx.org\nPin: release o=nginx\nPin-Priority: 900\n" | sudo tee /etc/apt/preferences.d/99nginx - curl -o /tmp/nginx_signing.key https://nginx.org/keys/nginx_signing.key - if [ "$OSVER" = 16 ]; then - gpg --with-fingerprint /tmp/nginx_signing.key - else - gpg --dry-run --quiet --import --import-options show-only /tmp/nginx_signing.key - fi - sudo mv /tmp/nginx_signing.key /etc/apt/trusted.gpg.d/nginx_signing.asc - sudo apt update - fi - - ${PAKMGR} install nginx + # Caddy handles its own auth -- disable code-server auth + sed -i 's/--auth password/--auth none/' /lib/systemd/system/code-server.service + systemctl daemon-reload + systemctl restart code-server - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - nxdir=/etc/nginx/conf.d - elif [ "${OS}" = ubuntu ]; then - if [ "$OSVER" = 16 ]; then - nxdir=/etc/nginx/sites-available - else - nxdir=/etc/nginx/conf.d - fi - fi - - OUTFILE2="$nxdir/code-server.conf" - define NFIG << EOF - server { - listen 80; - listen [::]:80; - server_name $SERVERNAME; - location / { - proxy_pass http://localhost:8080/; - proxy_set_header Host \$host; - proxy_set_header Upgrade \$http_upgrade; - proxy_set_header Connection upgrade; - proxy_set_header Accept-Encoding gzip; - } - } -EOF - { - printf "%s\n" "$NFIG" | cut -c 2- - } > "$OUTFILE2" - - if [ "${OS}" = ubuntu ]; then - mv $nxdir/default $nxdir/default.orig - ln -sf /etc/nginx/sites-available/code-server.conf /etc/nginx/sites-enabled/code-server.conf - else - mv $nxdir/default.conf $nxdir/default.conf.orig - fi - systemctl start nginx - systemctl enable nginx - fi - - if [ "$HTTPTYPE" = CADDY ]; then - if [ "${OS}" = ubuntu ]; then - ${PAKMGR} debian-keyring debian-archive-keyring 
apt-transport-https - curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/cfg/gpg/gpg.155B6D79CA56EA34.key' | apt-key add - - curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/cfg/setup/config.deb.txt?distro=debian&version=any-version' | tee -a /etc/apt/sources.list.d/caddy-stable.list - ${PAKMGR} update - ${PAKMGR} install caddy - elif [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - if [ "${OSVER}" = 7 ]; then - ${PAKMGR} install yum-plugin-copr - elif [ "${OSVER}" = 8 ] || [ "${OSVER}" = 9 ]; then - ${PAKMGR} install 'dnf-command(copr)' - fi - ${PAKMGR} copr enable @caddy/caddy - ${PAKMGR} install caddy - fi - - caddir=/etc/caddy - mv $caddir/Caddyfile $caddir/Caddyfile.orig - touch $caddir/Caddyfile - OUTFILE3="$caddir/Caddyfile" - define CFILE << EOF - { #### Remove these 3 lines - acme_ca https://acme-staging-v02.api.letsencrypt.org/directory #### to make server live - } #### and grab cert from letsencrypt - - $SERVERNAME { - basicauth /* { - $UNAME $CADPASS - } - reverse_proxy 127.0.0.1:8080 - } - -EOF - { - printf "%s\n" "$CFILE" | cut -c 2- - } > "$OUTFILE3" - - systemctl enable caddy - systemctl start caddy - - fi - - } + systemctl enable --now caddy + log "Caddy configured and running (TLS handled automatically)" } -########################################## -#### Install Certbot and request Cert #### -########################################## +# ============================================================================ +# LET'S ENCRYPT (for Nginx and Apache only -- Caddy handles its own TLS) +# ============================================================================ + install_certbot() { - { - if [ $HTTPTYPE = NGINX ];then - if [ "${OS}" = ubuntu ]; then - ${PAKMGR} remove letsencrypt - ${PAKMGR} remove certbot - snap install core; snap refresh core - snap install --classic certbot - ${PAKMGR} install python3-certbot-nginx - elif [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} 
= rocky || ${OS} = alma ]]; then - ${PAKMGR} remove certbot - ${PAKMGR} install epel-release - ${PAKMGR} install snapd - if [ "$OSVER" = 7 ]; then - ${PAKMGR} install python2-certbot-nginx - elif [ "${OSVER}" = 8 ] || [ "${OSVER}" = 9 ]; then - ${PAKMGR} install python3-certbot-nginx - fi - fi + if [[ "$HTTPTYPE" == "CADDY" ]]; then + log_info "Skipping certbot -- Caddy handles TLS automatically" + return fi - if [ $HTTPTYPE = APACHE ];then - if [ "${OS}" = ubuntu ]; then - ${PAKMGR} remove letsencrypt - ${PAKMGR} remove certbot - snap install core; snap refresh core - snap install --classic certbot - ${PAKMGR} install python3-certbot-apache - elif [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - ${PAKMGR} remove certbot - ${PAKMGR} install epel-release - ${PAKMGR} install snapd - if [ "$OSVER" = 7 ]; then - ${PAKMGR} install python2-certbot-apache - elif [ "${OSVER}" = 8 ] || [ "${OSVER}" = 9 ]; then - ${PAKMGR} install python3-certbot-apache - fi - fi - fi - systemctl enable --now snapd.socket - ln -s /var/lib/snapd/snap /snap - snap install core; snap refresh core - snap install --classic certbot - ln -s /snap/bin/certbot /usr/bin/certbot - - #certbot certonly --redirect --agree-tos --nginx -d $SERVERNAME -m "$EMAIL" --dry-run - if [ "$HTTPTYPE" = NGINX ]; then - certbot --non-interactive --redirect --agree-tos --nginx -d $SERVERNAME -m "$EMAIL" - systemctl restart nginx - elif [ "$HTTPTYPE" = APACHE ]; then - certbot --non-interactive --redirect --agree-tos --apache -d $SERVERNAME -m "$EMAIL" - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - systemctl restart httpd - else - systemctl restart apache2 - fi - fi - if [ $HTTPTYPE = NGINX ]; then - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - if ! 
grep "certbot" /var/spool/cron/root; then - echo "0 */12 * * * root certbot -q renew --nginx" >> /var/spool/cron/root - fi - elif [ "${OS}" = ubuntu ]; then - if ! grep "certbot" /var/spool/cron/crontabs/root; then - echo "0 */12 * * * root certbot -q renew --nginx" >> /var/spool/cron/crontabs/root - fi - fi - elif [ $HTTPTYPE = APACHE ]; then - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - if ! grep "certbot" /var/spool/cron/root; then - echo "0 */12 * * * root certbot -q renew --apache" >> /var/spool/cron/root - fi - elif [ "${OS}" = ubuntu ]; then - if ! grep "certbot" /var/spool/cron/crontabs/root; then - echo "0 */12 * * * root certbot -q renew --apache" >> /var/spool/cron/crontabs/root - fi + + log "Installing certbot for Let's Encrypt" + + if [[ "$PKG_TYPE" == "deb" ]]; then + apt-get -y install snapd + snap install core 2>/dev/null; snap refresh core 2>/dev/null + snap install --classic certbot 2>/dev/null + ln -sf /snap/bin/certbot /usr/bin/certbot + else + dnf -y install epel-release + dnf -y install certbot + if [[ "$HTTPTYPE" == "NGINX" ]]; then + dnf -y install python3-certbot-nginx + elif [[ "$HTTPTYPE" == "APACHE" ]]; then + dnf -y install python3-certbot-apache fi fi - if [[ ${OS} != "ubuntu" && ${OS} != "debian" ]]; then - grep nginx /var/log/audit/audit.log | audit2allow -M nginx 2>/dev/null || true - semodule -i nginx.pp 2>/dev/null || true - fi - } + # Request certificate + log_info "Requesting TLS certificate for $SERVERNAME" + if [[ "$HTTPTYPE" == "NGINX" ]]; then + certbot --non-interactive --redirect --agree-tos \ + --nginx -d "$SERVERNAME" -m "$EMAIL" + systemctl reload nginx + elif [[ "$HTTPTYPE" == "APACHE" ]]; then + certbot --non-interactive --redirect --agree-tos \ + --apache -d "$SERVERNAME" -m "$EMAIL" + if [[ "$PKG_TYPE" == "deb" ]]; then + systemctl reload apache2 + else + systemctl reload httpd + fi + fi + + # Certbot auto-renewal is handled by the systemd timer installed by 
certbot + if systemctl is-enabled certbot.timer >/dev/null 2>&1; then + log_info "Certbot auto-renewal timer is active" + elif systemctl is-enabled snap.certbot.renew.timer >/dev/null 2>&1; then + log_info "Certbot snap auto-renewal timer is active" + else + log_warn "No certbot auto-renewal timer found -- add a cron job for 'certbot renew'" + fi + + log "TLS certificate installed" } -function install_firewall() { - { - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - ${PAKMGR} install ipset perl-libwww-perl.noarch perl-LWP-Protocol-https.noarch perl-GDGraph perl-Sys-Syslog perl-Math-BigInt - elif [ "${OS}" = ubuntu ]; then - ${PAKMGR} install ipset libwww-perl liblwp-protocol-https-perl libgd-graph-perl - fi - cd /usr/src || exit - # rm -fv csf.tgz - wget https://download.configserver.com/csf.tgz - tar -xzf csf.tgz - cd csf || exit - ./install.sh - echo '' - echo '###########################################' - echo '#### Testing if CSF firewall will work ####' - echo '###########################################' - echo '' - perl /usr/local/csf/bin/csftest.pl - ##### Initial Settings ##### - sed -i 's/TESTING = "1"/TESTING = "0"/g' /etc/csf/csf.conf - sed -i 's/RESTRICT_SYSLOG = "0"/RESTRICT_SYSLOG = "3"/g' /etc/csf/csf.conf - sed -i '/^RESTRICT_UI/c\RESTRICT_UI = "1"' /etc/csf/csf.conf - sed -i '/^AUTO_UPDATES/c\AUTO_UPDATES = "1"' /etc/csf/csf.conf - ##### IPv4 Port Settings ##### - sed -i 's/TCP_IN = "20,21,22,25,53,80,110,143,443,465,587,993,995"/TCP_IN = "22,80,443,5666,10000"/g' /etc/csf/csf.conf - sed -i 's/TCP_OUT = "20,21,22,25,53,80,110,113,443,587,993,995"/TCP_OUT = "22,25,53,80,443,5666,10000"/g' /etc/csf/csf.conf - sed -i 's/UDP_IN = "20,21,53,80,443"/UDP_IN = "80,443"/g' /etc/csf/csf.conf - sed -i 's/UDP_OUT = "20,21,53,113,123"/UDP_OUT = "53,113,123"/g' /etc/csf/csf.conf - sed -i '/^ICMP_IN_RATE/c\ICMP_IN_RATE = "1/s"' /etc/csf/csf.conf - ##### IPv6 Port Settings ##### - sed -i 's/IPV6 = "0"/IPV6 = 
"1"/g' /etc/csf/csf.conf - sed -i 's/TCP6_IN = "20,21,22,25,53,80,110,143,443,465,587,993,995"/TCP6_IN = "22,80,443,5666"/g' /etc/csf/csf.conf - sed -i 's/TCP6_OUT = "20,21,22,25,53,80,110,113,443,587,993,995"/TCP6_OUT = "22,80,443,5666"/g' /etc/csf/csf.conf - sed -i 's/UDP6_IN = "20,21,53,80,443"/UDP6_IN = "80,443"/g' /etc/csf/csf.conf - sed -i 's/UDP6_OUT = "20,21,53,113,123"/UDP6_OUT = "53,113,123"/g' /etc/csf/csf.conf - ##### General Settings ##### - sed -i 's/SYSLOG_CHECK = "0"/SYSLOG_CHECK = "300"/g' /etc/csf/csf.conf - sed -i '/^IGNORE_ALLOW/c\IGNORE_ALLOW = "0"' /etc/csf/csf.conf - sed -i '/^LF_CSF/c\LF_CSF = "1"' /etc/csf/csf.conf - sed -i 's/LF_IPSET = "0"/LF_IPSET = "1"/g' /etc/csf/csf.conf - sed -i '/^PACKET_FILTER/c\PACKET_FILTER = "1"' /etc/csf/csf.conf - ##### SMTP Settings ##### - sed -i 's/SMTP_BLOCK = "0"/SMTP_BLOCK = "1"/g' /etc/csf/csf.conf - ##### Port Flood Settings ##### - sed -i 's/SYNFLOOD = "0"/SYNFLOOD = "1"/g' /etc/csf/csf.conf - sed -i 's/CONNLIMIT = ""/CONNLIMIT= "22;5,25;3,80;10"/g' /etc/csf/csf.conf - sed -i 's/PORTFLOOD = ""/PORTFLOOD = "22;tcp;5;300,25;tcp;5;300,80;tcp;20;5"/g' /etc/csf/csf.conf - sed -i 's/UDPFLOOD = "0"/UDPFLOOD = "1"/g' /etc/csf/csf.conf - ##### Logging Settings ##### - sed -i 's/SYSLOG = "0"/SYSLOG = "1"/g' /etc/csf/csf.conf - sed -i '/^DROP_LOGGING/c\DROP_LOGGING = "1"' /etc/csf/csf.conf - sed -i '/^DROP_ONLYRES/c\DROP_ONLYRES = "0"' /etc/csf/csf.conf - sed -i '/^UDPFLOOD_LOGGING/c\UDPFLOOD_LOGGING = "1"' /etc/csf/csf.conf - ##### Temp to Perm/Netblock Settings ##### - sed -i '/^LF_PERMBLOCK^/c\LF_PERMBLOCK = "1"' /etc/csf/csf.conf - sed -i 's/LF_NETBLOCK = "0"/LF_NETBLOCK = "1"/g' /etc/csf/csf.conf - ##### Login Failure Blocking and Alerts ##### - sed -i 's/LF_SSHD = "5"/LF_SSHD = "3"/g' /etc/csf/csf.conf - sed -i 's/LF_FTPD = "10"/LF_FTPD = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_SMTPAUTH = "0"/LF_SMTPAUTH = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_EXIMSYNTAX = "0"/LF_EXIMSYNTAX = "10"/g' /etc/csf/csf.conf - 
sed -i 's/LF_POP3D = "0"/LF_POP3D = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_IMAPD = "0"/LF_IMAPD = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_HTACCESS = "0"/LF_HTACCESS = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_MODSEC = "5"/LF_MODSEC = "3"/g' /etc/csf/csf.conf - sed -i 's/LF_CXS = "0"/LF_CXS = "1"/g' /etc/csf/csf.conf - sed -i 's/LF_SYMLINK = "0"/LF_SYMLINK = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_WEBMIN = "0"/LF_WEBMIN = "3"/g' /etc/csf/csf.conf - sed -i '/^LF_SSH_EMAIL_ALERT/c\LF_SSH_EMAIL_ALERT = "1"' /etc/csf/csf.conf - sed -i '/^LF_SU_EMAIL_ALERT/c\LF_SU_EMAIL_ALERT = "1"' /etc/csf/csf.conf - sed -i '/^LF_SUDO_EMAIL_ALERT/c\LF_SUDO_EMAIL_ALERT = "1"' /etc/csf/csf.conf - sed -i '/^LF_WEBMIN_EMAIL_ALERT/c\LF_WEBMIN_EMAIL_ALERT = "1"' /etc/csf/csf.conf - sed -i '/^LF_CONSOLE_EMAIL_ALERT/c\LF_CONSOLE_EMAIL_ALERT = "1"' /etc/csf/csf.conf - sed -i '/^LF_BLOCKINONLY/c\LF_BLOCKINONLY = "0"' /etc/csf/csf.conf - ##### Directory Watching & Integrity ##### - sed -i '/^LF_DIRWATCH^/c\LF_DIRWATCH = "300"' /etc/csf/csf.conf - sed -i '/^LF_INTEGRITY/c\LF_INTEGRITY = "3600"' /etc/csf/csf.conf - ##### Distributed Attacks ##### - sed -i 's/LF_DISTATTACK = "0"/LF_DISTATTACK = "1"/g' /etc/csf/csf.conf - sed -i 's/LF_DISTFTP = "0"/LF_DISTFTP = "5"/g' /etc/csf/csf.conf - sed -i 's/LF_DISTSMTP = "0"/LF_DISTSMTP = "5"/g' /etc/csf/csf.conf - ##### Connection Tracking ##### - sed -i 's/CT_LIMIT = "0"/CT_LIMIT = "300"/g' /etc/csf/csf.conf - ##### Process Tracking ##### - sed -i '/^PT_LIMIT/c\PT_LIMIT = "60"' /etc/csf/csf.conf - sed -i '/^PT_SKIP_HTTP/c\PT_SKIP_HTTP = "0"' /etc/csf/csf.conf - sed -i 's/PT_DELETED = "0"/PT_DELETED = "1"/g' /etc/csf/csf.conf - sed -i 's/PT_USERTIME = "1800"/PT_USERTIME = "0"/g' /etc/csf/csf.conf - sed -i 's/PT_FORKBOMB = "0"/PT_FORKBOMB = "250"/g' /etc/csf/csf.conf - ##### Port Scan Tracking ##### - sed -i 's/PS_INTERVAL = "0"/PS_INTERVAL = "300"/g' /etc/csf/csf.conf - sed -i '/^PS_EMAIL_ALERT/c\PS_EMAIL_ALERT = "1"' /etc/csf/csf.conf - ##### User ID Tracking 
##### - sed -i 's/UID_INTERVAL = "0"/UID_INTERVAL = "600"/g' /etc/csf/csf.conf - ##### Account Tracking ##### - sed -i 's/AT_ALERT = "2"/AT_ALERT = "1"/g' /etc/csf/csf.conf - systemctl enable --now csf - systemctl enable --now lfd - } +# ============================================================================ +# SELINUX (RHEL-based only) +# ============================================================================ + +configure_selinux() { + if ! command -v getenforce >/dev/null 2>&1; then + return + fi + + if [[ "$(getenforce 2>/dev/null)" == "Enforcing" ]]; then + log_info "Configuring SELinux for reverse proxy" + setsebool -P httpd_can_network_connect 1 2>/dev/null || true + fi } -function install_webmin() { - { - if [[ ${OS} = centos || ${OS} = red || ${OS} = oracle || ${OS} = rocky || ${OS} = alma ]]; then - OUTFILE="/etc/yum.repos.d/webmin.repo" - define WYUM << 'EOF' - [Webmin] - name=Webmin Distribution Neutral - #baseurl=https://download.webmin.com/download/yum - mirrorlist=https://download.webmin.com/download/yum/mirrorlist - enabled=1 -EOF - { - printf "%s\n" "$WYUM" | cut -c 3- - } > "$OUTFILE" - wget https://download.webmin.com/jcameron-key.asc - rpm --import jcameron-key.asc - if [ "${OSVER}" = 7 ]; then - ${PAKMGR} install perl-Encode-Detect perl-Net-SSLeay perl-Data-Dumper tcp_wrappers-devel perl-IO-Tty webmin unzip - elif [ "${OSVER}" = 8 ] || [ "${OSVER}" = 9 ]; then - ${PAKMGR} install perl-Encode-Detect perl-Net-SSLeay perl-Data-Dumper tcp_wrappers tcp_wrappers-libs unzip - dnf config-manager --set-enabled powertools - ${PAKMGR} install perl-IO-Tty webmin - fi - elif [ "${OS}" = ubuntu ]; then - { - echo '' - echo '############################' - echo '#### Adding Webmin Repo ####' - echo '############################' - echo '' - echo 'deb https://download.webmin.com/download/repository sarge contrib' - } >> /etc/apt/sources.list - wget https://download.webmin.com/jcameron-key.asc - apt-key add jcameron-key.asc - ${PAKMGR} install 
apt-transport-https - ${PAKMGR} update - ${PAKMGR} install webmin - fi - } +# ============================================================================ +# AUTO-UPDATE CRON +# ============================================================================ + +install_update_cron() { + if [[ $INSTALL_CRON -eq 0 ]]; then + return + fi + + log "Installing auto-update cron job" + + local update_url="https://mylinux.work/downloads/update-code-server.sh" + local script_dest="/usr/local/bin/update-code-server.sh" + + curl -fsSL -o "$script_dest" "$update_url" + chmod 700 "$script_dest" + log_info "Update script installed to $script_dest" + + # Build cron entry -- run every N days at 3:00 AM + local cron_line="0 3 */${CRON_DAYS} * * ${script_dest} 2>&1 | logger -t update-code-server" + local cron_marker="# code-server auto-update" + + # Remove existing code-server cron entries + local existing_cron + existing_cron=$(crontab -l 2>/dev/null || true) + local new_cron + new_cron=$(echo "$existing_cron" | grep -v "$cron_marker" | grep -v "update-code-server" || true) + + # Add new entry + echo "${new_cron} +${cron_line} ${cron_marker}" | crontab - + + log "Cron job installed -- updates every $CRON_DAYS days at 3:00 AM" + log_info "View with: crontab -l" } -get_latest_version -install_codeserver -install_http -install_certbot -install_firewall -install_webmin \ No newline at end of file + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "==============================================" + echo " Code-Server Install Script v${VERSION}" + echo "==============================================" + echo "" + + detect_os + detect_arch + preflight_checks + + log_info "OS: $OS_ID $OS_VERSION ($PKG_TYPE)" + log_info "Architecture: $ARCH" + log_info "Server: $SERVERNAME" + log_info "Reverse proxy: $HTTPTYPE" + log_info "Code 
directory: $CODEDIR" + log_info "Install directory: $SERVDIR" + log_info "Data directory: $USRDIR" + echo "" + + update_system + install_codeserver + create_service + + case "$HTTPTYPE" in + NGINX) install_nginx ;; + APACHE) install_apache ;; + CADDY) install_caddy ;; + esac + + configure_selinux + install_certbot + install_update_cron + + echo "" + echo "==============================================" + echo -e " ${GREEN}Installation complete${NC}" + echo "==============================================" + echo "" + echo " code-server is running at:" + echo " https://$SERVERNAME" + echo "" + echo " Reverse proxy: $HTTPTYPE" + echo " Data directory: $USRDIR" + echo " Code directory: $CODEDIR" + echo "" + echo " Manage the service:" + echo " systemctl status code-server" + echo " systemctl restart code-server" + echo "" + echo "==============================================" +} + +main "$@" diff --git a/install-fail2ban.sh b/install-fail2ban.sh new file mode 100644 index 0000000..d654e33 --- /dev/null +++ b/install-fail2ban.sh @@ -0,0 +1,643 @@ +#!/bin/bash +################################################################################ +# Script Name: install-fail2ban.sh +# Version: 1.0 +# Description: Automated Fail2ban installation with multi-jail configuration, +# custom ban actions, allowlists, Prometheus metrics integration, +# and service auto-detection for Debian/Ubuntu and RHEL/Rocky/Alma +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Usage: +# sudo ./install-fail2ban.sh +# sudo ./install-fail2ban.sh --jails "sshd,nginx-http-auth,postfix" +# sudo ./install-fail2ban.sh --allowlist "10.0.0.0/8,192.168.1.0/24" +# sudo ./install-fail2ban.sh --ban-action nftables --bantime 3600 +# sudo ./install-fail2ban.sh --dry-run +# sudo ./install-fail2ban.sh --uninstall +# +################################################################################ + +set -euo pipefail + +# 
============================================================================ +# DEFAULTS +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" +readonly LOG_FILE="/var/log/fail2ban-install.log" + +JAILS="" +ALLOWLIST="127.0.0.1/8 ::1" +BAN_ACTION="auto" +BANTIME="3600" +FINDTIME="600" +MAXRETRY="5" +BACKEND="auto" +DRY_RUN=false +UNINSTALL=false +PROMETHEUS=false +RECIDIVE=true + +# OS detection +OS_ID="" +OS_VERSION="" +PKG_MGR="" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; echo "[INFO] $(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOG_FILE" 2>/dev/null || true; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; echo "[WARN] $(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOG_FILE" 2>/dev/null || true; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; echo "[ERROR] $(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOG_FILE" 2>/dev/null || true; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; echo "[STEP] $(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOG_FILE" 2>/dev/null || true; } + +show_usage() { + cat </dev/null; then + PKG_MGR="yum" + fi + ;; + *) + log_error "Unsupported OS: $OS_ID" + exit 1 + ;; + esac + + log_info "Detected OS: $OS_ID $OS_VERSION (package manager: $PKG_MGR)" +} + +# ============================================================================ +# SERVICE DETECTION +# ============================================================================ + +detect_services() { + if [ -n "$JAILS" ]; then + log_info "Using specified jails: $JAILS" + return + fi + + log_step "Auto-detecting services for jail configuration..." 
+ local detected=() + + # SSH is always enabled + detected+=("sshd") + + # Nginx + if systemctl is-active --quiet nginx 2>/dev/null || [ -f /etc/nginx/nginx.conf ]; then + detected+=("nginx-http-auth") + detected+=("nginx-botsearch") + log_info " Detected: nginx" + fi + + # Apache + if systemctl is-active --quiet httpd 2>/dev/null || systemctl is-active --quiet apache2 2>/dev/null; then + detected+=("apache-auth") + detected+=("apache-badbots") + log_info " Detected: apache" + fi + + # Postfix + if systemctl is-active --quiet postfix 2>/dev/null; then + detected+=("postfix") + log_info " Detected: postfix" + fi + + # Dovecot + if systemctl is-active --quiet dovecot 2>/dev/null; then + detected+=("dovecot") + log_info " Detected: dovecot" + fi + + # MySQL/MariaDB + if systemctl is-active --quiet mysqld 2>/dev/null || systemctl is-active --quiet mariadb 2>/dev/null; then + detected+=("mysqld-auth") + log_info " Detected: mysql/mariadb" + fi + + # BIND + if systemctl is-active --quiet named 2>/dev/null || systemctl is-active --quiet bind9 2>/dev/null; then + detected+=("named-refused") + log_info " Detected: bind/named" + fi + + JAILS=$(IFS=','; echo "${detected[*]}") + log_info "Auto-detected jails: $JAILS" +} + +# ============================================================================ +# BAN ACTION DETECTION +# ============================================================================ + +detect_ban_action() { + if [ "$BAN_ACTION" != "auto" ]; then + log_info "Using specified ban action: $BAN_ACTION" + return + fi + + if command -v nft &>/dev/null && nft list ruleset &>/dev/null; then + BAN_ACTION="nftables-multiport" + log_info "Detected nftables — using nftables-multiport action" + elif command -v iptables &>/dev/null; then + BAN_ACTION="iptables-multiport" + log_info "Detected iptables — using iptables-multiport action" + else + BAN_ACTION="iptables-multiport" + log_warn "No firewall detected — defaulting to iptables-multiport" + fi +} + +# 
============================================================================ +# INSTALLATION +# ============================================================================ + +install_fail2ban() { + log_step "Installing Fail2ban..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would install fail2ban via $PKG_MGR" + return + fi + + case "$PKG_MGR" in + apt) + apt-get update -qq + apt-get install -y -qq fail2ban + ;; + dnf|yum) + $PKG_MGR install -y -q epel-release 2>/dev/null || true + $PKG_MGR install -y -q fail2ban fail2ban-firewalld 2>/dev/null || \ + $PKG_MGR install -y -q fail2ban + ;; + esac + + log_info "Fail2ban installed successfully" +} + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +configure_fail2ban() { + log_step "Configuring Fail2ban..." + + local jail_local="/etc/fail2ban/jail.local" + + if $DRY_RUN; then + log_info "[DRY RUN] Would write configuration to $jail_local" + show_config + return + fi + + # Backup existing config + if [ -f "$jail_local" ]; then + cp "$jail_local" "${jail_local}.bak.$(date +%s)" + log_info "Backed up existing $jail_local" + fi + + # Write jail.local + cat > "$jail_local" <> "$jail_local" + done + + # Recidive jail + if $RECIDIVE; then + cat >> "$jail_local" <<'RECIDIVEEOF' + +[recidive] +enabled = true +logpath = /var/log/fail2ban.log +bantime = 604800 +findtime = 86400 +maxretry = 3 +RECIDIVEEOF + log_info "Recidive jail enabled (repeat offenders banned for 7 days)" + fi + + log_info "Configuration written to $jail_local" +} + +write_jail_config() { + local jail="$1" + + case "$jail" in + sshd) + cat <<'EOF' + +[sshd] +enabled = true +port = ssh +logpath = %(sshd_log)s +maxretry = 5 +EOF + ;; + nginx-http-auth) + cat <<'EOF' + +[nginx-http-auth] +enabled = true +port = http,https +logpath = /var/log/nginx/error.log +maxretry = 5 +EOF + ;; + nginx-botsearch) + cat <<'EOF' + +[nginx-botsearch] 
+enabled = true +port = http,https +logpath = /var/log/nginx/access.log +maxretry = 2 +EOF + ;; + nginx-limit-req) + cat <<'EOF' + +[nginx-limit-req] +enabled = true +port = http,https +logpath = /var/log/nginx/error.log +maxretry = 5 +EOF + ;; + apache-auth) + cat <<'EOF' + +[apache-auth] +enabled = true +port = http,https +logpath = %(apache_error_log)s +maxretry = 5 +EOF + ;; + apache-badbots) + cat <<'EOF' + +[apache-badbots] +enabled = true +port = http,https +logpath = %(apache_access_log)s +maxretry = 2 +bantime = 86400 +EOF + ;; + postfix) + cat <<'EOF' + +[postfix] +enabled = true +port = smtp,465,submission +logpath = /var/log/mail.log +maxretry = 5 +EOF + ;; + dovecot) + cat <<'EOF' + +[dovecot] +enabled = true +port = pop3,pop3s,imap,imaps +logpath = /var/log/mail.log +maxretry = 5 +EOF + ;; + mysqld-auth) + cat <<'EOF' + +[mysqld-auth] +enabled = true +port = 3306 +logpath = /var/log/mysql/error.log +maxretry = 5 +EOF + ;; + named-refused) + cat <<'EOF' + +[named-refused] +enabled = true +port = domain,953 +logpath = /var/log/named/security.log +maxretry = 5 +EOF + ;; + *) + log_warn "Unknown jail: $jail — skipping" + ;; + esac +} + +show_config() { + echo "" + echo "=== Generated Configuration ===" + echo "[DEFAULT]" + echo "bantime = $BANTIME" + echo "findtime = $FINDTIME" + echo "maxretry = $MAXRETRY" + echo "banaction = $BAN_ACTION" + echo "ignoreip = $ALLOWLIST" + echo "" + IFS=',' read -ra JAIL_ARRAY <<< "$JAILS" + for jail in "${JAIL_ARRAY[@]}"; do + jail=$(echo "$jail" | xargs) + echo "[$jail] = enabled" + done + if $RECIDIVE; then + echo "[recidive] = enabled (7-day ban for repeat offenders)" + fi + echo "================================" +} + +# ============================================================================ +# PROMETHEUS INTEGRATION +# ============================================================================ + +setup_prometheus() { + if ! 
$PROMETHEUS; then + return + fi + + log_step "Setting up Prometheus metrics integration..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would download fail2ban-exporter.sh" + return + fi + + if [ -f /usr/local/bin/fail2ban-exporter.sh ]; then + log_info "fail2ban-exporter.sh already exists" + return + fi + + local exporter_url="https://mylinux.work/downloads/fail2ban-exporter.sh" + if curl -fsSL "$exporter_url" -o /usr/local/bin/fail2ban-exporter.sh 2>/dev/null; then + chmod +x /usr/local/bin/fail2ban-exporter.sh + log_info "Downloaded fail2ban-exporter.sh" + + # Add cron job + if ! crontab -l 2>/dev/null | grep -q "fail2ban-exporter"; then + (crontab -l 2>/dev/null; echo "*/2 * * * * /usr/local/bin/fail2ban-exporter.sh --textfile 2>/dev/null") | crontab - + log_info "Added cron job for fail2ban-exporter" + fi + else + log_warn "Could not download fail2ban-exporter.sh — configure manually" + fi +} + +# ============================================================================ +# SERVICE MANAGEMENT +# ============================================================================ + +start_fail2ban() { + log_step "Starting Fail2ban service..." + + if $DRY_RUN; then + log_info "[DRY RUN] Would enable and start fail2ban" + return + fi + + systemctl enable fail2ban + systemctl restart fail2ban + sleep 2 + + if systemctl is-active --quiet fail2ban; then + log_info "Fail2ban is running" + else + log_error "Fail2ban failed to start — check journalctl -u fail2ban" + exit 1 + fi +} + +# ============================================================================ +# VERIFICATION +# ============================================================================ + +verify_installation() { + log_step "Verifying installation..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would verify installation" + return + fi + + echo "" + echo "=== Fail2ban Status ===" + fail2ban-client status + echo "" + + IFS=',' read -ra JAIL_ARRAY <<< "$JAILS" + for jail in "${JAIL_ARRAY[@]}"; do + jail=$(echo "$jail" | xargs) + echo "--- $jail ---" + fail2ban-client status "$jail" 2>/dev/null || echo " (jail not active — check logs)" + echo "" + done + + if $RECIDIVE; then + echo "--- recidive ---" + fail2ban-client status recidive 2>/dev/null || echo " (recidive not active)" + echo "" + fi + + echo "=== Configuration ===" + echo "Ban action: $BAN_ACTION" + echo "Ban time: ${BANTIME}s ($(( BANTIME / 60 )) minutes)" + echo "Find time: ${FINDTIME}s ($(( FINDTIME / 60 )) minutes)" + echo "Max retry: $MAXRETRY" + echo "Ignore list: $ALLOWLIST" + echo "" + log_info "Installation complete. Logs: $LOG_FILE" +} + +# ============================================================================ +# UNINSTALL +# ============================================================================ + +uninstall_fail2ban() { + log_step "Uninstalling Fail2ban..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would stop and remove fail2ban" + return + fi + + systemctl stop fail2ban 2>/dev/null || true + systemctl disable fail2ban 2>/dev/null || true + + case "$PKG_MGR" in + apt) + apt-get purge -y -qq fail2ban + apt-get autoremove -y -qq + ;; + dnf|yum) + $PKG_MGR remove -y -q fail2ban + ;; + esac + + # Clean up config + if [ -d /etc/fail2ban ]; then + log_info "Configuration directory /etc/fail2ban preserved — remove manually if desired" + fi + + log_info "Fail2ban uninstalled" + exit 0 +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Fail2ban Install Script v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + detect_os + + if $UNINSTALL; then + uninstall_fail2ban + fi + + detect_services + detect_ban_action + install_fail2ban + configure_fail2ban + setup_prometheus + start_fail2ban + verify_installation +} + +main "$@" diff --git a/install-mattermost.sh b/install-mattermost.sh new file mode 100755 index 0000000..1763807 --- /dev/null +++ b/install-mattermost.sh @@ -0,0 +1,485 @@ +#!/bin/bash + +############################################################# +#### Mattermost Server Install Script #### +#### Bare-metal deployment with PostgreSQL and nginx #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: sudo ./install-mattermost.sh #### +#### sudo ./install-mattermost.sh --help #### +############################################################# + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION (edit these before running) +# 
============================================================================ + +MATTERMOST_VERSION="10.6.1" +DOMAIN="mattermost.example.com" +DB_NAME="mattermost" +DB_USER="mmuser" +DB_PASS="" +MM_USER="mattermost" +MM_DIR="/opt/mattermost" +DATA_DIR="/opt/mattermost/data" +NGINX_CONF=true + +# ============================================================================ +# INTERNAL +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" + +OS_ID="" +OS_FAMILY="" +PKG_MGR="" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null; then + PKG_MGR="yum" + fi + ;; + *) + log_error "Unsupported OS: $OS_ID" + exit 1 + ;; + esac + + log_info "Detected OS: $PRETTY_NAME (package manager: $PKG_MGR)" +} + +# ============================================================================ +# STEP 1: INSTALL DEPENDENCIES +# ============================================================================ + +install_dependencies() { + log_step "Installing dependencies..." + + case "$OS_FAMILY" in + debian) + apt-get update -qq + apt-get install -y -qq postgresql postgresql-contrib nginx curl jq + ;; + rhel) + $PKG_MGR install -y -q postgresql-server postgresql nginx curl jq + # Initialize PostgreSQL if not already done + if [[ ! -d /var/lib/pgsql/data ]] || [[ -z "$(ls -A /var/lib/pgsql/data 2>/dev/null)" ]]; then + log_info "Initializing PostgreSQL database..." 
+ postgresql-setup --initdb + fi + # Enable and start PostgreSQL + systemctl enable postgresql + systemctl start postgresql + ;; + esac + + # Ensure PostgreSQL is running on Debian-based as well + if [[ "$OS_FAMILY" == "debian" ]]; then + systemctl enable postgresql + systemctl start postgresql + fi + + log_info "Dependencies installed" +} + +# ============================================================================ +# STEP 2: CONFIGURE POSTGRESQL +# ============================================================================ + +configure_database() { + log_step "Configuring PostgreSQL database..." + + # Generate random password if not provided + if [[ -z "$DB_PASS" ]]; then + DB_PASS=$(openssl rand -base64 24 | tr -d '/+=') + log_info "Generated random database password" + fi + + # Create user if not exists + if sudo -u postgres psql -tAc "SELECT 1 FROM pg_roles WHERE rolname='$DB_USER'" | grep -q 1; then + log_warn "PostgreSQL user '$DB_USER' already exists — updating password" + sudo -u postgres psql -c "ALTER USER $DB_USER WITH PASSWORD '$DB_PASS';" + else + sudo -u postgres psql -c "CREATE USER $DB_USER WITH PASSWORD '$DB_PASS';" + log_info "Created PostgreSQL user: $DB_USER" + fi + + # Create database if not exists + if sudo -u postgres psql -tAc "SELECT 1 FROM pg_database WHERE datname='$DB_NAME'" | grep -q 1; then + log_warn "Database '$DB_NAME' already exists" + else + sudo -u postgres psql -c "CREATE DATABASE $DB_NAME OWNER $DB_USER;" + log_info "Created database: $DB_NAME" + fi + + sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE $DB_NAME TO $DB_USER;" + + # Ensure md5/scram-sha-256 auth for local connections on RHEL + if [[ "$OS_FAMILY" == "rhel" ]]; then + local pg_hba + pg_hba=$(sudo -u postgres psql -tAc "SHOW hba_file;") + if [[ -f "$pg_hba" ]] && grep -q 'ident' "$pg_hba"; then + log_info "Updating pg_hba.conf to allow password authentication..." 
+ sed -i 's/^\(local.*all.*all.*\)ident/\1md5/' "$pg_hba" + sed -i 's/^\(host.*all.*all.*127\.0\.0\.1\/32.*\)ident/\1md5/' "$pg_hba" + sed -i 's/^\(host.*all.*all.*::1\/128.*\)ident/\1md5/' "$pg_hba" + systemctl restart postgresql + fi + fi + + log_info "PostgreSQL configured" +} + +# ============================================================================ +# STEP 3: CREATE SYSTEM USER +# ============================================================================ + +create_system_user() { + log_step "Creating system user..." + + if id "$MM_USER" &>/dev/null; then + log_warn "System user '$MM_USER' already exists" + else + useradd --system --no-create-home --shell /usr/sbin/nologin "$MM_USER" + log_info "Created system user: $MM_USER" + fi +} + +# ============================================================================ +# STEP 4: DOWNLOAD AND EXTRACT MATTERMOST +# ============================================================================ + +download_mattermost() { + log_step "Downloading Mattermost v${MATTERMOST_VERSION}..." 
+ + local download_url="https://releases.mattermost.com/${MATTERMOST_VERSION}/mattermost-${MATTERMOST_VERSION}-linux-amd64.tar.gz" + local tmp_file="/tmp/mattermost-${MATTERMOST_VERSION}.tar.gz" + + if [[ -d "$MM_DIR" ]] && [[ -f "$MM_DIR/bin/mattermost" ]]; then + local existing_version + existing_version=$("$MM_DIR/bin/mattermost" version 2>/dev/null | grep -oP 'Version:\s*\K[\d.]+' || echo "unknown") + log_warn "Mattermost already installed at $MM_DIR (version: $existing_version)" + log_warn "Skipping download — remove $MM_DIR to reinstall" + return 0 + fi + + curl -fSL -o "$tmp_file" "$download_url" + log_info "Downloaded: $download_url" + + # Extract to /opt (creates /opt/mattermost) + tar -xzf "$tmp_file" -C /opt/ + rm -f "$tmp_file" + + # Create data directory + mkdir -p "$DATA_DIR" + + # Set ownership + chown -R "$MM_USER:$MM_USER" "$MM_DIR" + + log_info "Mattermost extracted to $MM_DIR" +} + +# ============================================================================ +# STEP 5: CONFIGURE MATTERMOST +# ============================================================================ + +configure_mattermost() { + log_step "Configuring Mattermost..." + + local config_file="$MM_DIR/config/config.json" + + if [[ ! 
-f "$config_file" ]]; then + log_error "Config file not found: $config_file" + exit 1 + fi + + local dsn="postgres://${DB_USER}:${DB_PASS}@localhost:5432/${DB_NAME}?sslmode=disable&connect_timeout=10" + + # Use jq to update configuration + local tmp_config + tmp_config=$(mktemp) + + jq \ + --arg dsn "$dsn" \ + --arg siteurl "https://${DOMAIN}" \ + --arg datadir "$DATA_DIR" \ + ' + .SqlSettings.DriverName = "postgres" | + .SqlSettings.DataSource = $dsn | + .ServiceSettings.SiteURL = $siteurl | + .ServiceSettings.ListenAddress = ":8065" | + .FileSettings.Directory = $datadir + ' "$config_file" > "$tmp_config" + + mv "$tmp_config" "$config_file" + chown "$MM_USER:$MM_USER" "$config_file" + chmod 600 "$config_file" + + log_info "Configuration updated: $config_file" +} + +# ============================================================================ +# STEP 6: CREATE SYSTEMD SERVICE +# ============================================================================ + +create_service() { + log_step "Creating systemd service..." 
+ + cat > /etc/systemd/system/mattermost.service < "$nginx_config" <<'NGINXEOF' +upstream mattermost_backend { + server localhost:8065; + keepalive 32; +} + +server { + listen 80; + server_name DOMAIN_PLACEHOLDER; + + client_max_body_size 100M; + + location ~ /api/v[0-9]+/(users/)?websocket$ { + proxy_pass http://mattermost_backend; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Frame-Options SAMEORIGIN; + proxy_buffers 256 16k; + proxy_buffer_size 16k; + proxy_read_timeout 600s; + } + + location / { + proxy_pass http://mattermost_backend; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Frame-Options SAMEORIGIN; + proxy_buffers 256 16k; + proxy_buffer_size 16k; + } +} +NGINXEOF + + # Replace domain placeholder + sed -i "s/DOMAIN_PLACEHOLDER/$DOMAIN/" "$nginx_config" + + # Enable site on Debian-based systems + if [[ "$OS_FAMILY" == "debian" ]]; then + mkdir -p /etc/nginx/sites-enabled + ln -sf "$nginx_config" /etc/nginx/sites-enabled/mattermost + fi + + # Test and reload nginx + if nginx -t 2>/dev/null; then + systemctl enable nginx + systemctl reload nginx + log_info "Nginx configured and reloaded" + else + log_error "Nginx configuration test failed" + nginx -t + exit 1 + fi +} + +# ============================================================================ +# STEP 8: SUMMARY +# ============================================================================ + +print_summary() { + echo "" + echo "============================================" + echo " Mattermost Installation Complete" + echo 
"============================================" + echo "" + log_info "Database credentials:" + echo " User: $DB_USER" + echo " Password: $DB_PASS" + echo " Database: $DB_NAME" + echo "" + log_info "Mattermost:" + echo " URL: https://$DOMAIN" + echo " Service: mattermost.service" + echo " Config: $MM_DIR/config/config.json" + echo " Data: $DATA_DIR" + echo "" + log_info "Next steps:" + echo " 1. Set up TLS: sudo certbot --nginx -d $DOMAIN" + echo " 2. Visit https://$DOMAIN to create the admin account" + echo " 3. View logs: sudo journalctl -u mattermost -f" + echo "" +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " Mattermost Install Script v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + detect_os + install_dependencies + configure_database + create_system_user + download_mattermost + configure_mattermost + create_service + configure_nginx + print_summary +} + +main "$@" diff --git a/install-node-exporter.sh b/install-node-exporter.sh new file mode 100644 index 0000000..c42ed21 --- /dev/null +++ b/install-node-exporter.sh @@ -0,0 +1,611 @@ +#!/bin/bash + +set -euo pipefail + +############################################################# +#### Prometheus Node Exporter Installer #### +#### For RHEL/Rocky/Alma, Oracle Linux, Debian & Ubuntu #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.1 #### +#### #### +#### Usage: ./install-node-exporter.sh [OPTIONS] #### +############################################################# + +# Script defaults +INSTALL_DIR="/usr/local/bin" +TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector" +SERVICE_USER="node_exporter" +PORT=9100 +COLLECTORS="" 
# Write a timestamped message to stdout and append it to $logfile.
# Globals:   logfile (read)
# Arguments: $1 - message text
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" | tee -a "$logfile"
}

# Write a timestamped ERROR message to stderr and append it to $logfile.
# Globals:   logfile (read)
# Arguments: $1 - message text
log_error() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    # tee copies the line into the log file; its stdout is sent to stderr.
    printf '[%s] ERROR: %s\n' "$stamp" "$1" | tee -a "$logfile" >&2
}

# Write a timestamped INFO message to stdout and append it to $logfile.
# Globals:   logfile (read)
# Arguments: $1 - message text
log_info() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] INFO: %s\n' "$stamp" "$1" | tee -a "$logfile"
}
# Detect the distribution and its major version.
#
# Sets OS to the lowercase token the rest of the script matches on
# (ubuntu, debian, red, centos, oracle, rocky, almalinux) and OSVER to the
# major version number.
#
# lsb_release is used when available, otherwise /etc/os-release. The two
# sources name some distributions differently: lsb_release reports
# "RedHatEnterprise[Server]" and "OracleServer", while the first word of
# PRETTY_NAME is "Red" and "Oracle". The distributor ID is therefore
# normalised so both paths produce the same token.
#
# Globals: OS, OSVER (written)
# Outputs: one INFO log line with the detected values
detect_os() {
    local raw_id

    if command -v lsb_release >/dev/null 2>&1; then
        raw_id=$(lsb_release -i | awk '{print $3}' | tr '[:upper:]' '[:lower:]')
        OSVER=$(lsb_release -r | awk '{print $2}' | cut -d. -f1)
    else
        # "|| true" keeps a missing key from failing the pipeline under pipefail.
        raw_id=$({ grep PRETTY_NAME /etc/os-release || true; } | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]')
        OSVER=$({ grep VERSION_ID /etc/os-release || true; } | sed 's/VERSION_ID=//g' | tr -d '"' | cut -d. -f1)
    fi

    # Map vendor-specific distributor IDs onto the tokens used by
    # setup_package_manager and download_and_install.
    case "$raw_id" in
        redhat*) OS="red" ;;
        oracle*) OS="oracle" ;;
        centos*) OS="centos" ;;
        rocky*)  OS="rocky" ;;
        alma*)   OS="almalinux" ;;
        *)       OS="$raw_id" ;;
    esac

    log_info "Detected OS: $OS version $OSVER"
}
# Print the latest node_exporter release version (without the leading "v").
#
# Queries the GitHub "latest release" API and extracts the tag_name field.
# Logs an error and exits 1 when no version could be extracted.
#
# Outputs: version string on stdout, e.g. "1.8.2"
get_latest_version() {
    local api_url="https://api.github.com/repos/prometheus/node_exporter/releases/latest"
    local tag

    # The tag is the 4th double-quote-delimited field of the "tag_name" line.
    tag=$(curl -s "$api_url" | grep '"tag_name"' | cut -d '"' -f 4 | sed 's/^v//')

    if [[ -n "$tag" ]]; then
        echo "$tag"
        return
    fi

    log_error "Failed to fetch latest version from GitHub API"
    exit 1
}
# Open the node_exporter port in the host firewall.
#
# Handles UFW first, then firewalld; does nothing when --no-firewall was
# given or no active firewall is found. When PROMETHEUS_IP is set the rule
# is restricted to that source address.
#
# Globals: OPEN_FIREWALL, PORT, PROMETHEUS_IP (read)
configure_firewall() {
    if [[ "$OPEN_FIREWALL" == "false" ]]; then
        log_info "Skipping firewall configuration (--no-firewall)"
        return
    fi

    # Capture the ufw status once and match the exact status line. A plain
    # grep for "active" also matches "Status: inactive", which would add a
    # rule to a disabled firewall and skip the firewalld branch. Capturing
    # the output also avoids a pipefail false negative if grep -q closes
    # the pipe early.
    local ufw_status=""
    if command -v ufw >/dev/null 2>&1; then
        ufw_status=$(ufw status 2>/dev/null) || ufw_status=""
    fi

    # UFW (Debian/Ubuntu)
    if [[ "$ufw_status" == *"Status: active"* ]]; then
        log_info "Configuring UFW firewall rule for port ${PORT}"
        if [[ -n "$PROMETHEUS_IP" ]]; then
            ufw allow from "$PROMETHEUS_IP" to any port "$PORT" proto tcp comment "node_exporter" >/dev/null
        else
            ufw allow "$PORT"/tcp comment "node_exporter" >/dev/null
        fi
        log_info "UFW rule added"
        return
    fi

    # firewalld (RHEL/Rocky/Alma/Oracle)
    if command -v firewall-cmd >/dev/null 2>&1 && systemctl is-active --quiet firewalld 2>/dev/null; then
        log_info "Configuring firewalld rule for port ${PORT}"
        if [[ -n "$PROMETHEUS_IP" ]]; then
            # Best effort: the zone, source or port may already exist from a
            # previous run, so failures here are deliberately ignored.
            firewall-cmd --permanent --new-zone=node_exporter 2>/dev/null || true
            firewall-cmd --permanent --zone=node_exporter --add-source="$PROMETHEUS_IP" 2>/dev/null || true
            firewall-cmd --permanent --zone=node_exporter --add-port="${PORT}/tcp" 2>/dev/null || true
        else
            firewall-cmd --permanent --add-port="${PORT}/tcp" >/dev/null
        fi
        firewall-cmd --reload >/dev/null
        log_info "firewalld rule added"
        return
    fi

    log_info "No active firewall detected, skipping firewall configuration"
}
# Install node_exporter, or only describe the installation in dry-run mode.
#
# Globals: DRY_RUN, SERVICE_USER, PORT, TEXTFILE_DIR, OPEN_FIREWALL,
#          COLLECTORS, NO_COLLECTORS (read)
# Returns: 0 on success; installation steps exit the script on failure
do_install() {
    if [[ "$DRY_RUN" == "true" ]]; then
        log_info "[DRY RUN] Would install node_exporter"
        log_info "[DRY RUN] User: ${SERVICE_USER}"
        log_info "[DRY RUN] Port: ${PORT}"
        log_info "[DRY RUN] Textfile dir: ${TEXTFILE_DIR}"
        log_info "[DRY RUN] Firewall: ${OPEN_FIREWALL}"
        if [[ -n "$COLLECTORS" ]]; then
            log_info "[DRY RUN] Enable collectors: ${COLLECTORS}"
        fi
        if [[ -n "$NO_COLLECTORS" ]]; then
            log_info "[DRY RUN] Disable collectors: ${NO_COLLECTORS}"
        fi
        # Explicit status: a bare "return" propagates the status of the last
        # command. With a "[[ -n ... ]] && log_info" line last, that was 1
        # whenever NO_COLLECTORS was empty, and "set -e" then aborted the
        # script after an otherwise successful dry run.
        return 0
    fi

    local version
    version=$(get_latest_version)
    log_info "Latest version: ${version}"

    create_service_user
    download_and_install "$version"
    create_textfile_dir
    create_systemd_service
    configure_firewall
    verify_installation
    print_summary
}
-x "${INSTALL_DIR}/node_exporter" ]]; then + log_error "node_exporter is not installed. Run without --update to install." + exit 1 + fi + + local current latest + current=$(get_installed_version) + latest=$(get_latest_version) + + log_info "Installed version: ${current}" + log_info "Latest version: ${latest}" + + if [[ "$current" == "$latest" ]]; then + log_info "Already up to date (v${current}), nothing to do" + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would update node_exporter from v${current} to v${latest}" + return + fi + + log_info "Updating node_exporter from v${current} to v${latest}" + + systemctl stop node_exporter + download_and_install "$latest" + systemctl start node_exporter + + verify_installation + + echo + echo "=== Node Exporter Update Summary ===" + echo " Previous version: ${current}" + echo " New version: ${latest}" + echo " Metrics URL: http://localhost:${PORT}/metrics" + echo +} + +######################### +### Uninstall Mode ### +######################### +do_uninstall() { + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would uninstall node_exporter" + log_info "[DRY RUN] Stop and disable service" + log_info "[DRY RUN] Remove ${INSTALL_DIR}/node_exporter" + log_info "[DRY RUN] Remove /etc/systemd/system/node_exporter.service" + log_info "[DRY RUN] Remove ${TEXTFILE_DIR}" + log_info "[DRY RUN] Remove user ${SERVICE_USER}" + log_info "[DRY RUN] Remove firewall rule for port ${PORT}" + return + fi + + log_info "Uninstalling node_exporter" + + # Stop and disable service + if systemctl is-active --quiet node_exporter 2>/dev/null; then + systemctl stop node_exporter + log_info "Stopped node_exporter service" + fi + if systemctl is-enabled --quiet node_exporter 2>/dev/null; then + systemctl disable node_exporter + log_info "Disabled node_exporter service" + fi + + # Remove service file + if [[ -f /etc/systemd/system/node_exporter.service ]]; then + rm -f /etc/systemd/system/node_exporter.service + 
# Abort with a clear message when a value-taking option has no value.
# Arguments: "$@" - remaining command line, starting at the option itself
_require_option_value() {
    if [[ $# -lt 2 ]]; then
        log_error "Option $1 requires a value"
        exit 1
    fi
}

# Parse command-line options into the script's global settings.
#
# A value-taking option given as the last argument is reported explicitly
# instead of dying with "unbound variable" under "set -u". The port must be
# an integer between 1 and 65535; it is stored in canonical decimal form
# because it is later used in a numeric comparison.
#
# Globals:   COLLECTORS, NO_COLLECTORS, PORT, TEXTFILE_DIR, PROMETHEUS_IP,
#            OPEN_FIREWALL, UPDATE_MODE, UNINSTALL_MODE, DRY_RUN (written)
# Arguments: "$@" - the script's command-line arguments
# Returns:   0 on success; exits 1 on invalid input, exits 0 after --help
parse_arguments() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --collectors)
                _require_option_value "$@"
                COLLECTORS="$2"
                shift 2
                ;;
            --no-collectors)
                _require_option_value "$@"
                NO_COLLECTORS="$2"
                shift 2
                ;;
            --port)
                _require_option_value "$@"
                # Digits only (at most five), then a range check. 10# forces
                # base 10 so a leading zero is not read as octal.
                if [[ ! "$2" =~ ^[0-9]{1,5}$ ]] || (( 10#$2 < 1 || 10#$2 > 65535 )); then
                    log_error "Invalid port: $2 (expected an integer between 1 and 65535)"
                    exit 1
                fi
                PORT=$(( 10#$2 ))
                shift 2
                ;;
            --textfile-dir)
                _require_option_value "$@"
                TEXTFILE_DIR="$2"
                shift 2
                ;;
            --prometheus-ip)
                _require_option_value "$@"
                PROMETHEUS_IP="$2"
                shift 2
                ;;
            --no-firewall)
                OPEN_FIREWALL=false
                shift
                ;;
            --update)
                UPDATE_MODE=true
                shift
                ;;
            --uninstall)
                UNINSTALL_MODE=true
                shift
                ;;
            --dry-run)
                DRY_RUN=true
                shift
                ;;
            --help)
                show_help
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
}
# ============================================================================
# CONFIGURATION
# ============================================================================

# Keep in sync with the "Version:" field in the file header (currently 1.1);
# this value is what the banner prints.
readonly VERSION="1.1"
readonly SCRIPT_NAME="${0##*/}"
readonly LOG_FILE="/var/log/nsd-install.log"

# Defaults (overridden by command-line options)
ROLE="primary"
MASTER_IP=""
LISTEN_IP="0.0.0.0"
ZONES=()
WITH_CONTROL=false
UNINSTALL=false
DRY_RUN=false

# Paths (set after OS detection)
NSD_CONF=""
NSD_ZONE_DIR=""
NSD_USER=""
NSD_GROUP=""

# OS detection
OS=""
OS_FAMILY=""
# Print a timestamped message to stdout. Outside dry-run mode the same line
# is also appended to LOG_FILE when the file or its directory is writable.
# Globals:   DRY_RUN, LOG_FILE (read)
# Arguments: $1 - message text
log() {
    local stamp line
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    line="[${stamp}] $1"

    if [[ "$DRY_RUN" == "false" ]]; then
        if [[ -w "$(dirname "$LOG_FILE")" || -w "$LOG_FILE" ]]; then
            echo "$line" >> "$LOG_FILE"
        fi
    fi

    echo "$line"
}

# Report an error via print_error. Outside dry-run mode a timestamped
# "ERROR:" line is also appended to LOG_FILE when it is writable.
# Globals:   DRY_RUN, LOG_FILE (read)
# Arguments: $1 - error text
log_error() {
    local stamp line
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    line="[${stamp}] ERROR: $1"

    if [[ "$DRY_RUN" == "false" ]]; then
        if [[ -w "$(dirname "$LOG_FILE")" || -w "$LOG_FILE" ]]; then
            echo "$line" >> "$LOG_FILE"
        fi
    fi

    print_error "$1"
}
# Install the NSD package with the distribution's package manager.
#
# Debian family: apt-get. RHEL family: dnf, enabling EPEL first on
# everything except Fedora. In dry-run mode only a preview line is printed.
# Exits 1 when the nsd binary is not on PATH afterwards.
#
# Globals: DRY_RUN, OS_FAMILY, OS (read)
install_nsd() {
    log "Installing NSD packages"

    if [[ "$DRY_RUN" == "true" ]]; then
        print_info "[DRY RUN] Would install NSD via package manager"
        return
    fi

    case "$OS_FAMILY" in
        debian)
            apt-get update -qq
            apt-get install -y nsd
            ;;
        rhel)
            # Fedora does not get the EPEL repository enabled.
            [[ "$OS" == "fedora" ]] || dnf install -y epel-release
            dnf install -y nsd
            ;;
    esac

    if ! command -v nsd &>/dev/null; then
        log_error "NSD installation failed"
        exit 1
    fi

    print_success "NSD installed ($(nsd -v 2>&1 | head -1))"
}
# Check the generated NSD configuration with nsd-checkconf.
#
# In dry-run mode only a preview line is printed. Exits 1 when the check
# fails.
#
# Globals: DRY_RUN, NSD_CONF (read)
validate_config() {
    if [[ "$DRY_RUN" == "true" ]]; then
        print_info "[DRY RUN] Would run nsd-checkconf $NSD_CONF"
        return
    fi

    log "Validating NSD configuration"

    if ! nsd-checkconf "$NSD_CONF"; then
        log_error "Configuration validation failed — check $NSD_CONF"
        exit 1
    fi

    print_success "Configuration validated (nsd-checkconf passed)"
}
# Remove NSD: stop and disable the service, remove the package, then delete
# /etc/nsd and /var/log/nsd.log.
#
# In dry-run mode only the planned actions are printed and the function
# returns to the caller. A real uninstall ends the script with exit 0.
#
# Globals: DRY_RUN, OS_FAMILY (read)
do_uninstall() {
    log "Uninstalling NSD"

    if [[ "$DRY_RUN" == "true" ]]; then
        local preview
        for preview in \
            "Would stop and disable NSD service" \
            "Would remove NSD package and config files"; do
            print_info "[DRY RUN] ${preview}"
        done
        return
    fi

    check_root

    # Stop service (only when it is currently running)
    if systemctl is-active --quiet nsd 2>/dev/null; then
        systemctl stop nsd
        systemctl disable nsd
        print_success "NSD service stopped and disabled"
    fi

    # Remove package
    case "$OS_FAMILY" in
        debian)
            apt-get remove --purge -y nsd
            apt-get autoremove -y
            ;;
        rhel)
            dnf remove -y nsd
            ;;
    esac

    # Remove configuration and zone files
    rm -rf /etc/nsd
    rm -f /var/log/nsd.log

    print_success "NSD removed"
    log "Uninstall complete"
    exit 0
}
"$MASTER_IP" ]]; then + log_error "--secondary requires --master-ip" + exit 1 + fi +} + +# ============================================================================ +# SUMMARY +# ============================================================================ + +print_summary() { + echo "" + echo "=== NSD Installation Complete ===" + echo "" + echo " Role: $ROLE" + echo " Listen address: $LISTEN_IP:53" + echo " Config file: $NSD_CONF" + echo " Zone directory: $NSD_ZONE_DIR" + [[ "$WITH_CONTROL" == "true" ]] && echo " nsd-control: enabled" + echo "" + + if [[ ${#ZONES[@]} -gt 0 ]]; then + echo " Zones configured:" + for zone in "${ZONES[@]}"; do + echo " • $zone" + done + echo "" + fi + + echo " Next steps:" + if [[ "$ROLE" == "primary" ]]; then + echo " 1. Edit zone files in $NSD_ZONE_DIR with real records" + echo " 2. Update serial number after each change" + echo " 3. Reload: nsd-control reload (or systemctl reload nsd)" + else + echo " 1. Ensure the master ($MASTER_IP) allows zone transfers" + echo " 2. 
# Script entry point for the NSD installer.
#
# Parses the command line, prints the banner, then either uninstalls NSD or
# runs the installation steps in order.
#
# Globals:   VERSION, UNINSTALL, DRY_RUN (read)
# Arguments: "$@" - command-line arguments, forwarded to parse_args
main() {
    parse_args "$@"

    echo ""
    echo "=== NSD Authoritative DNS Server Installer v${VERSION} ==="
    echo ""

    # Handle uninstall early
    if [[ "$UNINSTALL" == "true" ]]; then
        detect_os
        do_uninstall
        # do_uninstall exits on a real uninstall but only returns after a
        # dry-run preview. Stop here so "--uninstall --dry-run" does not
        # continue into the installation steps below.
        return 0
    fi

    # Dry-run doesn't need root for preview
    if [[ "$DRY_RUN" == "false" ]]; then
        check_root
    fi

    detect_os
    disable_resolved
    install_nsd
    generate_nsd_conf
    generate_zone_files
    setup_nsd_control
    set_permissions
    validate_config
    start_nsd
    print_summary

    log "Installation complete"
}
# Shared implementation of the log_* helpers.
#
# Prints "<color>[LEVEL]<reset> message" with echo -e, so escape sequences
# are interpreted, then appends a timestamped plain-text copy to LOG_FILE.
# A failed log-file write is ignored on purpose.
#
# Globals:   NC, LOG_FILE (read)
# Arguments: $1 - level tag, $2 - color escape, remaining - message words
_log_emit() {
    local level="$1" color="$2"
    shift 2
    echo -e "${color}[${level}]${NC} $*"
    echo "[${level}] $(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOG_FILE" 2>/dev/null || true
}

# Informational message (green tag, stdout).
log_info() { _log_emit "INFO" "$GREEN" "$@"; }

# Warning message (yellow tag, stdout).
log_warn() { _log_emit "WARN" "$YELLOW" "$@"; }

# Error message (red tag, stderr).
log_error() { _log_emit "ERROR" "$RED" "$@" >&2; }

# Step heading (cyan tag, stdout).
log_step() { _log_emit "STEP" "$CYAN" "$@"; }
# Download and unpack the OSSEC source tarball into /tmp/ossec-build.
#
# Any previous build directory is removed first. The function changes the
# working directory to the build directory and leaves it there. In dry-run
# mode only a preview line is logged.
#
# Globals: OSSEC_VERSION, DRY_RUN (read)
download_ossec() {
    log_step "Downloading OSSEC ${OSSEC_VERSION}..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would download OSSEC ${OSSEC_VERSION}"
        return
    fi

    local build_root="/tmp/ossec-build"
    local archive="ossec-${OSSEC_VERSION}.tar.gz"
    local source_url="https://github.com/ossec/ossec-hids/archive/refs/tags/${OSSEC_VERSION}.tar.gz"

    # Start from an empty build directory.
    rm -rf "$build_root"
    mkdir -p "$build_root"
    cd "$build_root"

    wget -q "$source_url" -O "$archive"
    tar xzf "$archive"

    log_info "Downloaded and extracted OSSEC ${OSSEC_VERSION}"
}
# Open the OSSEC server port (1514/udp) in the host firewall.
#
# Only relevant in server mode. Tries, in order: ufw (when active),
# firewalld (when its daemon is running), then nftables. In dry-run mode
# only a preview line is logged.
#
# Globals: INSTALL_MODE, DRY_RUN (read)
configure_firewall() {
    if [ "$INSTALL_MODE" != "server" ]; then
        return
    fi

    log_step "Opening OSSEC server port (1514/udp)..."

    if $DRY_RUN; then
        log_info "[DRY RUN] Would open port 1514/udp"
        return
    fi

    # Match the exact status line: a plain grep for "active" also matches
    # "Status: inactive" and would treat a disabled ufw as the firewall in use.
    local ufw_status=""
    if command -v ufw &>/dev/null; then
        ufw_status=$(ufw status 2>/dev/null) || ufw_status=""
    fi

    if [[ "$ufw_status" == *"Status: active"* ]]; then
        ufw allow 1514/udp
        log_info "Opened port 1514/udp via ufw"
    elif command -v firewall-cmd &>/dev/null && firewall-cmd --state &>/dev/null; then
        # --state confirms the daemon is running. Without it, firewall-cmd
        # fails on hosts where firewalld is installed but stopped, and that
        # failure would abort the installer under "set -e".
        firewall-cmd --permanent --add-port=1514/udp
        firewall-cmd --reload
        log_info "Opened port 1514/udp via firewalld"
    elif command -v nft &>/dev/null; then
        # The rule assumes an existing "inet filter" table with an "input"
        # chain. Report success only when nft accepted it.
        if nft add rule inet filter input udp dport 1514 accept 2>/dev/null; then
            log_info "Opened port 1514/udp via nftables"
        else
            log_warn "Could not add nftables rule for 1514/udp — open the port manually"
        fi
    else
        log_info "No supported firewall detected — port 1514/udp left unchanged"
    fi
}
+ + if $DRY_RUN; then + log_info "[DRY RUN] Installation summary:" + echo " Mode: ${INSTALL_MODE}" + echo " Directory: ${OSSEC_DIR}" + echo " Syscheck freq: ${SYSCHECK_FREQUENCY}s" + echo " Active response: ${ACTIVE_RESPONSE}" + echo " Rootcheck: ${ROOTCHECK}" + echo " Email: ${EMAIL_ADDR:-disabled}" + [ "$INSTALL_MODE" = "agent" ] && echo " Server IP: ${SERVER_IP}" + return + fi + + echo "" + echo "=== OSSEC Installation Summary ===" + echo "Mode: ${INSTALL_MODE}" + echo "Version: ${OSSEC_VERSION}" + echo "Install dir: ${OSSEC_DIR}" + echo "Active response: ${ACTIVE_RESPONSE}" + echo "Rootcheck: ${ROOTCHECK}" + echo "Syscheck freq: ${SYSCHECK_FREQUENCY}s ($(( SYSCHECK_FREQUENCY / 3600 ))h)" + echo "" + + echo "=== Process Status ===" + "${OSSEC_DIR}/bin/ossec-control" status + echo "" + + echo "=== Syscheck Status ===" + "${OSSEC_DIR}/bin/syscheck_control" -l 2>/dev/null | head -5 || echo "(syscheck not yet run)" + echo "" + + if [ "$INSTALL_MODE" = "server" ]; then + echo "=== Agent Management ===" + echo "To add agents:" + echo " ${OSSEC_DIR}/bin/manage_agents" + echo "" + fi + + echo "=== Useful Commands ===" + echo " Status: ${OSSEC_DIR}/bin/ossec-control status" + echo " Restart: ${OSSEC_DIR}/bin/ossec-control restart" + echo " Logs: tail -f ${OSSEC_DIR}/logs/ossec.log" + echo " Alerts: tail -f ${OSSEC_DIR}/logs/alerts/alerts.log" + echo "" + log_info "Installation complete. Logs: $LOG_FILE" +} + +# ============================================================================ +# UNINSTALL +# ============================================================================ + +uninstall_ossec() { + log_step "Uninstalling OSSEC..." 
+ + if $DRY_RUN; then + log_info "[DRY RUN] Would stop and remove OSSEC" + return + fi + + if [ -x "${OSSEC_DIR}/bin/ossec-control" ]; then + "${OSSEC_DIR}/bin/ossec-control" stop 2>/dev/null || true + fi + + if [ -d "$OSSEC_DIR" ]; then + rm -rf "$OSSEC_DIR" + log_info "Removed ${OSSEC_DIR}" + fi + + # Remove ossec user/group + userdel ossec 2>/dev/null || true + userdel ossecm 2>/dev/null || true + userdel ossecr 2>/dev/null || true + groupdel ossec 2>/dev/null || true + + log_info "OSSEC uninstalled" + exit 0 +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo "============================================" + echo " OSSEC Install Script v${VERSION}" + echo " https://mylinux.work" + echo "============================================" + echo "" + + check_root + detect_os + + if $UNINSTALL; then + uninstall_ossec + fi + + install_dependencies + download_ossec + install_ossec + configure_ossec + configure_firewall + start_ossec + verify_installation +} + +main "$@" diff --git a/install-prometheus-stack.sh b/install-prometheus-stack.sh index 4713b84..21fbb66 100755 --- a/install-prometheus-stack.sh +++ b/install-prometheus-stack.sh @@ -9,7 +9,7 @@ set -euo pipefail #### Author: Phil Connor #### #### Contact: contact@mylinux.work #### #### License: MIT #### -#### Version: 3.0 #### +#### Version: 3.1 #### #### #### #### Usage: ./install-prometheus-stack.sh [OPTIONS] #### ############################################################# @@ -148,8 +148,8 @@ detect_os() { OS=$(lsb_release -i | awk '{print $3}' | tr '[:upper:]' '[:lower:]') OSVER=$(lsb_release -r | awk '{print $2}' | cut -d. 
-f1) else - OS=$(grep PRETTY_NAME /etc/os-release | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]') - OSVER=$(grep VERSION_ID /etc/os-release | sed 's/VERSION_ID=//g' | tr -d '"' | cut -d. -f1) + OS=$({ grep PRETTY_NAME /etc/os-release || true; } | sed 's/PRETTY_NAME=//g' | tr -d '="' | awk '{print $1}' | tr '[:upper:]' '[:lower:]') + OSVER=$({ grep VERSION_ID /etc/os-release || true; } | sed 's/VERSION_ID=//g' | tr -d '"' | cut -d. -f1) fi log_info "Detected OS: $OS version $OSVER" diff --git a/install-pxe-server.sh b/install-pxe-server.sh new file mode 100644 index 0000000..5e1befb --- /dev/null +++ b/install-pxe-server.sh @@ -0,0 +1,934 @@ +#!/usr/bin/env bash +# ============================================================================ +# install-pxe-server.sh +# Automated PXE boot server setup — installs dnsmasq, TFTP, and nginx, +# configures PXE boot menus, generates Kickstart and Preseed templates +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# License: MIT +# Version: 1.0.0 +# ============================================================================ + +set -uo pipefail + +# ============================================================================ +# Defaults +# ============================================================================ +INTERFACE="" +DHCP_RANGE="10.0.0.100,10.0.0.200" +TFTP_ROOT="/srv/tftp" +HTTP_ROOT="/var/www/pxe" +DISTROS="rocky9" +SERVER_IP="" +OS_FAMILY="" +OS_ID="" +OS_VERSION="" + +# ============================================================================ +# Colour output +# ============================================================================ +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +log_info() { echo -e "${CYAN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +log_success() { echo -e "${GREEN}[OK]${NC} $*"; } + +# 
============================================================================ +# Usage +# ============================================================================ +usage() { + cat < Network interface for DHCP/TFTP binding + (default: auto-detected from default route) + --dhcp-range DHCP range as start,end + (default: 10.0.0.100,10.0.0.200) + --tftp-root TFTP root directory + (default: /srv/tftp) + --http-root HTTP document root for install media + (default: /var/www/pxe) + --distros Comma-separated list of distros to configure + (default: rocky9) + --help Print this help and exit + +Supported distros: + rocky9, rocky8, rhel9, rhel8, alma9, + ubuntu2404, ubuntu2204, debian12, debian11 + +Examples: + sudo $(basename "$0") + sudo $(basename "$0") --interface eth0 --dhcp-range 10.0.0.100,10.0.0.200 + sudo $(basename "$0") --distros rocky9,ubuntu2404,debian12 + sudo $(basename "$0") --interface ens192 --tftp-root /data/tftp --distros rocky9 + +EOF + exit 0 +} + +# ============================================================================ +# Parse arguments +# ============================================================================ +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --interface) + INTERFACE="$2" + shift 2 + ;; + --dhcp-range) + DHCP_RANGE="$2" + shift 2 + ;; + --tftp-root) + TFTP_ROOT="$2" + shift 2 + ;; + --http-root) + HTTP_ROOT="$2" + shift 2 + ;; + --distros) + DISTROS="$2" + shift 2 + ;; + --help) + usage + ;; + *) + log_error "Unknown option: $1" + usage + ;; + esac + done +} + +# ============================================================================ +# Detect OS +# ============================================================================ +detect_os() { + log_info "Detecting operating system..." + + if [[ ! 
-f /etc/os-release ]]; then + log_error "Cannot detect OS — /etc/os-release not found" + exit 1 + fi + + # shellcheck disable=SC1091 + source /etc/os-release + + OS_ID="${ID}" + OS_VERSION="${VERSION_ID}" + + case "${OS_ID}" in + debian|ubuntu) + OS_FAMILY="debian" + ;; + rocky|rhel|almalinux|centos) + OS_FAMILY="rhel" + ;; + *) + log_error "Unsupported OS: ${OS_ID} ${OS_VERSION}" + log_error "Supported: Debian 11+, Ubuntu 22.04+, RHEL/Rocky/Alma 8+" + exit 1 + ;; + esac + + log_success "Detected ${OS_ID} ${OS_VERSION} (${OS_FAMILY} family)" +} + +# ============================================================================ +# Detect network interface +# ============================================================================ +detect_interface() { + if [[ -n "${INTERFACE}" ]]; then + if ! ip link show "${INTERFACE}" &>/dev/null; then + log_error "Interface ${INTERFACE} does not exist" + echo "Available interfaces:" + ip -o link show | awk -F': ' '{print " " $2}' + exit 1 + fi + log_info "Using specified interface: ${INTERFACE}" + else + log_info "Auto-detecting network interface..." 
+ INTERFACE=$(ip route show default 2>/dev/null | awk '{print $5; exit}') + + if [[ -z "${INTERFACE}" ]]; then + log_error "Cannot detect default network interface" + log_error "Specify one with --interface" + exit 1 + fi + log_success "Detected interface: ${INTERFACE}" + fi + + SERVER_IP=$(ip -4 addr show "${INTERFACE}" | awk '/inet / {split($2,a,"/"); print a[1]; exit}') + + if [[ -z "${SERVER_IP}" ]]; then + log_error "Cannot determine IP address for interface ${INTERFACE}" + exit 1 + fi + + log_success "Server IP: ${SERVER_IP}" +} + +# ============================================================================ +# Validate distro list +# ============================================================================ +validate_distros() { + local valid_distros="rocky9 rocky8 rhel9 rhel8 alma9 ubuntu2404 ubuntu2204 debian12 debian11" + + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + + for distro in "${DISTRO_LIST[@]}"; do + local found=false + for valid in ${valid_distros}; do + if [[ "${distro}" == "${valid}" ]]; then + found=true + break + fi + done + if [[ "${found}" == "false" ]]; then + log_error "Invalid distro: ${distro}" + log_error "Valid options: ${valid_distros}" + exit 1 + fi + done + + log_success "Distros to configure: ${DISTROS}" +} + +# ============================================================================ +# Install packages +# ============================================================================ +install_packages() { + log_info "Installing packages..." + + if [[ "${OS_FAMILY}" == "debian" ]]; then + export DEBIAN_FRONTEND=noninteractive + apt-get update -qq + apt-get install -y -qq \ + dnsmasq \ + tftpd-hpa \ + syslinux-common \ + pxelinux \ + nginx \ + wget \ + curl \ + >/dev/null 2>&1 + + if [[ $? -ne 0 ]]; then + log_error "Package installation failed" + exit 1 + fi + elif [[ "${OS_FAMILY}" == "rhel" ]]; then + dnf install -y -q \ + dnsmasq \ + tftp-server \ + syslinux \ + nginx \ + wget \ + curl \ + >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]]; then + log_error "Package installation failed" + exit 1 + fi + fi + + log_success "Packages installed" +} + +# ============================================================================ +# Configure dnsmasq +# ============================================================================ +configure_dnsmasq() { + log_info "Configuring dnsmasq..." + + local dhcp_start dhcp_end + dhcp_start=$(echo "${DHCP_RANGE}" | cut -d',' -f1) + dhcp_end=$(echo "${DHCP_RANGE}" | cut -d',' -f2) + + # Disable default dnsmasq DNS to avoid conflicts + if [[ -f /etc/dnsmasq.conf ]]; then + cp /etc/dnsmasq.conf /etc/dnsmasq.conf.bak + fi + + mkdir -p /etc/dnsmasq.d + + cat > /etc/dnsmasq.d/pxe.conf <> /etc/dnsmasq.conf + fi + fi + + log_success "dnsmasq configured at /etc/dnsmasq.d/pxe.conf" +} + +# ============================================================================ +# Setup TFTP directory +# ============================================================================ +setup_tftp() { + log_info "Setting up TFTP directory..." 
+ + mkdir -p "${TFTP_ROOT}/pxelinux.cfg" + mkdir -p "${TFTP_ROOT}/grub" + + # Copy syslinux/pxelinux files + if [[ "${OS_FAMILY}" == "debian" ]]; then + local pxe_src="/usr/lib/PXELINUX" + local sys_src="/usr/lib/syslinux/modules/bios" + + if [[ -f "${pxe_src}/pxelinux.0" ]]; then + cp "${pxe_src}/pxelinux.0" "${TFTP_ROOT}/" + else + log_warn "pxelinux.0 not found at ${pxe_src}/pxelinux.0" + fi + + for file in ldlinux.c32 menu.c32 libmenu.c32 libutil.c32; do + if [[ -f "${sys_src}/${file}" ]]; then + cp "${sys_src}/${file}" "${TFTP_ROOT}/" + else + log_warn "${file} not found at ${sys_src}/${file}" + fi + done + + elif [[ "${OS_FAMILY}" == "rhel" ]]; then + local sys_src="/usr/share/syslinux" + + for file in pxelinux.0 ldlinux.c32 menu.c32 libmenu.c32 libutil.c32; do + if [[ -f "${sys_src}/${file}" ]]; then + cp "${sys_src}/${file}" "${TFTP_ROOT}/" + else + log_warn "${file} not found at ${sys_src}/${file}" + fi + done + fi + + # Create distro directories in TFTP root + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + for distro in "${DISTRO_LIST[@]}"; do + mkdir -p "${TFTP_ROOT}/${distro}" + done + + # Set permissions + chmod -R 755 "${TFTP_ROOT}" + + if [[ "${OS_FAMILY}" == "debian" ]]; then + chown -R tftp:tftp "${TFTP_ROOT}" + + # Configure tftpd-hpa + cat > /etc/default/tftpd-hpa < "${menu_file}" <<'MENU_HEADER' +DEFAULT menu.c32 +PROMPT 0 +TIMEOUT 300 +ONTIMEOUT local + +MENU TITLE ===== PXE Boot Server ===== +MENU COLOR border 30;44 #40ffffff #a0000000 std +MENU COLOR title 1;36;44 #9033cccc #a0000000 std +MENU COLOR sel 7;37;40 #e0ffffff #20ffffff all +MENU COLOR unsel 37;44 #50ffffff #a0000000 std +MENU COLOR help 37;40 #c0ffffff #a0000000 std +MENU COLOR timeout_msg 37;40 #80ffffff #00000000 std +MENU COLOR timeout 1;37;40 #c0ffffff #00000000 std + +LABEL local + MENU LABEL Boot from local disk + MENU DEFAULT + LOCALBOOT 0 + +MENU_HEADER + + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + for distro in "${DISTRO_LIST[@]}"; do + case "${distro}" in + 
rocky9) + cat >> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" <> "${menu_file}" < "${ks_file}" <> /root/ks-post.log +%end +EOF + + log_success "Kickstart template created: ${ks_file}" +} + +# ============================================================================ +# Generate Preseed template +# ============================================================================ +generate_preseed() { + local distro="$1" + local preseed_dir="${HTTP_ROOT}/preseed" + local preseed_file="${preseed_dir}/preseed-${distro}.cfg" + + mkdir -p "${preseed_dir}" + + log_info "Generating Preseed template: ${preseed_file}" + + local mirror_host="deb.debian.org" + local mirror_dir="/debian" + if [[ "${distro}" == ubuntu* ]]; then + mirror_host="archive.ubuntu.com" + mirror_dir="/ubuntu" + fi + + cat > "${preseed_file}" <> /target/root/preseed-post.log + +### Reboot after install +d-i finish-install/reboot_in_progress note +d-i debian-installer/exit/poweroff boolean false +EOF + + log_success "Preseed template created: ${preseed_file}" +} + +# ============================================================================ +# Configure nginx +# ============================================================================ +configure_nginx() { + log_info "Configuring nginx..." 
+ + mkdir -p "${HTTP_ROOT}"/{ks,preseed} + + # Create distro directories in HTTP root + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + for distro in "${DISTRO_LIST[@]}"; do + mkdir -p "${HTTP_ROOT}/${distro}" + done + + # Remove default site if present + rm -f /etc/nginx/sites-enabled/default 2>/dev/null + rm -f /etc/nginx/conf.d/default.conf 2>/dev/null + + cat > /etc/nginx/conf.d/pxe.conf </dev/null; then + log_success "nginx configured at /etc/nginx/conf.d/pxe.conf" + else + log_error "nginx configuration test failed" + nginx -t + exit 1 + fi +} + +# ============================================================================ +# Configure firewall +# ============================================================================ +configure_firewall() { + log_info "Configuring firewall..." + + if command -v firewall-cmd &>/dev/null && systemctl is-active --quiet firewalld; then + log_info "Detected firewalld" + firewall-cmd --permanent --add-port=67/udp # DHCP + firewall-cmd --permanent --add-port=68/udp # DHCP client + firewall-cmd --permanent --add-port=69/udp # TFTP + firewall-cmd --permanent --add-port=80/tcp # HTTP + firewall-cmd --permanent --add-port=4011/udp # ProxyDHCP + firewall-cmd --reload + log_success "Firewall ports opened (firewalld)" + + elif command -v ufw &>/dev/null && ufw status | grep -q "Status: active"; then + log_info "Detected ufw" + ufw allow 67/udp comment "DHCP server" + ufw allow 68/udp comment "DHCP client" + ufw allow 69/udp comment "TFTP" + ufw allow 80/tcp comment "HTTP" + ufw allow 4011/udp comment "ProxyDHCP" + log_success "Firewall ports opened (ufw)" + + else + log_warn "No active firewall detected — skipping firewall configuration" + log_warn "Manually open ports: 67/udp, 68/udp, 69/udp, 80/tcp, 4011/udp" + fi +} + +# ============================================================================ +# Enable services +# ============================================================================ +enable_services() { + log_info 
"Enabling and starting services..." + + # dnsmasq + systemctl enable dnsmasq + systemctl restart dnsmasq + if systemctl is-active --quiet dnsmasq; then + log_success "dnsmasq is running" + else + log_error "dnsmasq failed to start" + journalctl -u dnsmasq --no-pager -n 10 + fi + + # TFTP + if [[ "${OS_FAMILY}" == "debian" ]]; then + systemctl enable tftpd-hpa + systemctl restart tftpd-hpa + if systemctl is-active --quiet tftpd-hpa; then + log_success "tftpd-hpa is running" + else + log_error "tftpd-hpa failed to start" + journalctl -u tftpd-hpa --no-pager -n 10 + fi + elif [[ "${OS_FAMILY}" == "rhel" ]]; then + systemctl enable tftp.socket + systemctl restart tftp.socket + if systemctl is-active --quiet tftp.socket; then + log_success "tftp.socket is running" + else + log_error "tftp.socket failed to start" + journalctl -u tftp.socket --no-pager -n 10 + fi + fi + + # nginx + systemctl enable nginx + systemctl restart nginx + if systemctl is-active --quiet nginx; then + log_success "nginx is running" + else + log_error "nginx failed to start" + journalctl -u nginx --no-pager -n 10 + fi +} + +# ============================================================================ +# Print summary +# ============================================================================ +print_summary() { + local dhcp_start dhcp_end + dhcp_start=$(echo "${DHCP_RANGE}" | cut -d',' -f1) + dhcp_end=$(echo "${DHCP_RANGE}" | cut -d',' -f2) + + echo "" + echo "╔══════════════════════════════════════════════════════════════╗" + echo "║ PXE Boot Server — Setup Complete ║" + echo "╠══════════════════════════════════════════════════════════════╣" + echo "║ ║" + printf "║ Server IP: %-41s║\n" "${SERVER_IP}" + printf "║ Interface: %-41s║\n" "${INTERFACE}" + printf "║ DHCP Range: %-41s║\n" "${dhcp_start} — ${dhcp_end}" + printf "║ TFTP Root: %-41s║\n" "${TFTP_ROOT}" + printf "║ HTTP Root: %-41s║\n" "${HTTP_ROOT}" + printf "║ Distros: %-41s║\n" "${DISTROS}" + echo "║ ║" + echo 
"╠══════════════════════════════════════════════════════════════╣" + echo "║ Configuration Files ║" + echo "╠══════════════════════════════════════════════════════════════╣" + printf "║ dnsmasq: %-41s║\n" "/etc/dnsmasq.d/pxe.conf" + printf "║ PXE menu: %-41s║\n" "${TFTP_ROOT}/pxelinux.cfg/default" + printf "║ nginx: %-41s║\n" "/etc/nginx/conf.d/pxe.conf" + echo "║ ║" + + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + for distro in "${DISTRO_LIST[@]}"; do + case "${distro}" in + rocky*|rhel*|alma*) + printf "║ Kickstart: %-41s║\n" "http://${SERVER_IP}/ks/ks-${distro}.cfg" + ;; + ubuntu*|debian*) + printf "║ Preseed: %-41s║\n" "http://${SERVER_IP}/preseed/preseed-${distro}.cfg" + ;; + esac + done + + echo "║ ║" + echo "╠══════════════════════════════════════════════════════════════╣" + echo "║ Next Steps ║" + echo "╠══════════════════════════════════════════════════════════════╣" + echo "║ ║" + echo "║ 1. Mount or extract distro ISOs into the HTTP root: ║" + + for distro in "${DISTRO_LIST[@]}"; do + printf "║ mount -o loop,ro %-33s║\n" "${HTTP_ROOT}/${distro}/" + done + + echo "║ ║" + echo "║ 2. Copy kernel + initrd to TFTP root: ║" + + for distro in "${DISTRO_LIST[@]}"; do + printf "║ %s -> %-43s║\n" "vmlinuz/initrd" "${TFTP_ROOT}/${distro}/" + done + + echo "║ ║" + echo "║ 3. Edit Kickstart/Preseed templates: ║" + echo "║ - Set root/user password hashes ║" + echo "║ - Adjust partitioning layout ║" + echo "║ - Add site-specific packages ║" + echo "║ ║" + echo "║ 4. 
PXE boot a target machine and select a distro ║" + echo "║ ║" + echo "╚══════════════════════════════════════════════════════════════╝" + echo "" +} + +# ============================================================================ +# Main +# ============================================================================ +main() { + echo "" + echo "================================================" + echo " PXE Boot Server — Automated Setup" + echo " Version 1.0.0" + echo "================================================" + echo "" + + # Check root + if [[ "${EUID}" -ne 0 ]]; then + log_error "This script must be run as root" + exit 1 + fi + + parse_args "$@" + detect_os + detect_interface + validate_distros + install_packages + configure_dnsmasq + setup_tftp + create_pxe_menu + + # Generate answer file templates based on selected distros + IFS=',' read -ra DISTRO_LIST <<< "${DISTROS}" + for distro in "${DISTRO_LIST[@]}"; do + case "${distro}" in + rocky*|rhel*|alma*) + generate_kickstart "${distro}" + ;; + ubuntu*|debian*) + generate_preseed "${distro}" + ;; + esac + done + + configure_nginx + configure_firewall + enable_services + print_summary + + log_success "PXE boot server setup complete" +} + +main "$@" diff --git a/install-snort.sh b/install-snort.sh new file mode 100644 index 0000000..642d05c --- /dev/null +++ b/install-snort.sh @@ -0,0 +1,547 @@ +#!/bin/bash +################################################################################ +# Script Name: install-snort.sh +# Description: Automated Snort 3 IDS/IPS installation from source with +# rule management on Ubuntu/Debian and RHEL/Rocky/Alma/Fedora +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.01 +# +# Usage: +# sudo ./install-snort.sh +# sudo ./install-snort.sh --iface eth0 --oinkcode YOUR_CODE +# sudo ./install-snort.sh --community-rules +# sudo ./install-snort.sh --dry-run +# 
+################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +SNORT_VERSION="3.1.84.0" +DAQ_VERSION="3.0.14" +IFACE="" +OINKCODE="" +COMMUNITY_RULES=true +REGISTERED_RULES=false +HOME_NET="[192.168.0.0/16,10.0.0.0/8,172.16.0.0/12]" +INSTALL_DIR="/usr/local" +CONFIG_DIR="/etc/snort" +LOG_DIR="/var/log/snort" +RULE_DIR="/etc/snort/rules" +DRY_RUN=false +UNINSTALL=false +SKIP_BUILD=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null | awk '/default/ {print $5; exit}') + if [[ -z "$IFACE" ]]; then + IFACE=$(ip -o link show up 2>/dev/null | awk -F': ' '!/lo/{print $2; exit}') + fi + if [[ -z "$IFACE" ]]; then + log_error "Cannot auto-detect interface. Use --iface" + exit 1 + fi + log_info "Auto-detected interface: $IFACE" + fi +} + +# ============================================================================ +# BUILD DEPENDENCIES +# ============================================================================ + +install_deps_debian() { + log_step "Installing build dependencies (Debian/Ubuntu)..." 
+ apt-get update -qq + apt-get install -y -qq \ + build-essential cmake pkg-config \ + libhwloc-dev libluajit-5.1-dev libssl-dev \ + libpcap-dev libpcre3-dev zlib1g-dev \ + libdumbnet-dev liblzma-dev libsafec-dev \ + libunwind-dev uuid-dev \ + flex bison \ + libflatbuffers-dev flatbuffers-compiler \ + libhyperscan-dev \ + libjemalloc-dev \ + git wget curl jq \ + cpputest libcpputest-dev +} + +install_deps_rhel() { + log_step "Installing build dependencies (RHEL/Rocky/Alma)..." + if [[ "$OS_ID" != "fedora" ]]; then + dnf install -y -q epel-release + dnf config-manager --set-enabled crb 2>/dev/null || \ + dnf config-manager --set-enabled powertools 2>/dev/null || true + fi + dnf groupinstall -y -q "Development Tools" + dnf install -y -q \ + cmake3 pkgconfig \ + hwloc-devel luajit-devel openssl-devel \ + libpcap-devel pcre-devel zlib-devel \ + libdnet-devel xz-devel libsafec-devel \ + libunwind-devel uuid-devel \ + flex bison \ + flatbuffers-devel flatbuffers-compiler \ + hyperscan-devel \ + jemalloc-devel \ + git wget curl jq +} + +install_dependencies() { + case "$OS_FAMILY" in + debian) install_deps_debian ;; + rhel) install_deps_rhel ;; + esac +} + +# ============================================================================ +# BUILD & INSTALL +# ============================================================================ + +build_libdaq() { + log_step "Building libdaq $DAQ_VERSION..." + + local build_dir="/tmp/snort-build" + mkdir -p "$build_dir" + cd "$build_dir" + + if [[ ! -d "libdaq-$DAQ_VERSION" ]]; then + wget -q "https://github.com/snort3/libdaq/archive/refs/tags/v$DAQ_VERSION.tar.gz" \ + -O "libdaq-$DAQ_VERSION.tar.gz" + tar xzf "libdaq-$DAQ_VERSION.tar.gz" + fi + + cd "libdaq-$DAQ_VERSION" + ./bootstrap 2>/dev/null || true + ./configure --prefix="$INSTALL_DIR" + make -j"$(nproc)" + make install + + ldconfig + log_info "libdaq $DAQ_VERSION installed" +} + +build_snort() { + log_step "Building Snort $SNORT_VERSION..." 
+ + local build_dir="/tmp/snort-build" + mkdir -p "$build_dir" + cd "$build_dir" + + if [[ ! -d "snort3-$SNORT_VERSION" ]]; then + wget -q "https://github.com/snort3/snort3/archive/refs/tags/$SNORT_VERSION.tar.gz" \ + -O "snort3-$SNORT_VERSION.tar.gz" + tar xzf "snort3-$SNORT_VERSION.tar.gz" + fi + + cd "snort3-$SNORT_VERSION" + mkdir -p build && cd build + + cmake .. \ + -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DENABLE_LARGE_PCAP=ON \ + -DENABLE_JEMALLOC=ON \ + 2>&1 | tail -5 + + make -j"$(nproc)" + make install + + ldconfig + log_info "Snort $SNORT_VERSION installed to $INSTALL_DIR" + + # Verify + "$INSTALL_DIR/bin/snort" -V 2>&1 | head -3 +} + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +create_snort_user() { + if ! id snort &>/dev/null; then + groupadd -r snort 2>/dev/null || true + useradd -r -g snort -s /sbin/nologin -d /var/log/snort snort 2>/dev/null || true + log_info "Created snort user and group" + fi +} + +configure_directories() { + log_step "Creating directory structure..." + + mkdir -p "$CONFIG_DIR" + mkdir -p "$CONFIG_DIR/rules" + mkdir -p "$CONFIG_DIR/builtin_rules" + mkdir -p "$CONFIG_DIR/so_rules" + mkdir -p "$CONFIG_DIR/lists" + mkdir -p "$LOG_DIR" + mkdir -p /var/lib/snort + + chown -R snort:snort "$LOG_DIR" + chown -R snort:snort /var/lib/snort +} + +configure_snort() { + log_step "Configuring Snort..." + + # Copy default config if not present + if [[ ! 
-f "$CONFIG_DIR/snort.lua" ]]; then + if [[ -f "$INSTALL_DIR/etc/snort/snort.lua" ]]; then + cp "$INSTALL_DIR/etc/snort/snort.lua" "$CONFIG_DIR/snort.lua" + cp "$INSTALL_DIR/etc/snort/snort_defaults.lua" "$CONFIG_DIR/" 2>/dev/null || true + cp "$INSTALL_DIR/etc/snort/file_magic.lua" "$CONFIG_DIR/" 2>/dev/null || true + fi + fi + + # Create local.lua overrides + cat > "$CONFIG_DIR/local.lua" </dev/null || true | awk '{s+=$1} END {print s+0}') + log_info "Total rules installed: $rule_count" +} + +setup_rule_update_cron() { + log_step "Setting up weekly rule update cron..." + + cat > /etc/cron.d/snort-rule-update </dev/null && systemctl restart snort 2>/dev/null +CRONEOF + + chmod 644 /etc/cron.d/snort-rule-update + log_info "Weekly rule update cron job created (Sunday 03:00)" +} + +# ============================================================================ +# SYSTEMD SERVICE +# ============================================================================ + +create_service() { + log_step "Creating Snort systemd service..." + + cat > /etc/systemd/system/snort.service </dev/null || true + fi +} + +# ============================================================================ +# VALIDATION +# ============================================================================ + +validate_installation() { + log_step "Validating Snort installation..." 
+ + "$INSTALL_DIR/bin/snort" -c "$CONFIG_DIR/snort.lua" --warn-all -T 2>&1 | tail -5 || { + log_warn "Config validation produced warnings" + } + + echo "" + log_info "===== Installation Summary =====" + log_info "Snort version: $SNORT_VERSION" + log_info "DAQ version: $DAQ_VERSION" + log_info "Interface: $IFACE" + log_info "Config: $CONFIG_DIR/snort.lua" + log_info "Rules: $RULE_DIR/" + log_info "Logs: $LOG_DIR/" + log_info "HOME_NET: $HOME_NET" + echo "" +} + +# ============================================================================ +# UNINSTALL +# ============================================================================ + +uninstall_snort() { + log_step "Uninstalling Snort..." + + systemctl stop snort 2>/dev/null || true + systemctl disable snort 2>/dev/null || true + rm -f /etc/systemd/system/snort.service + rm -f /etc/cron.d/snort-rule-update + systemctl daemon-reload + + rm -rf "$INSTALL_DIR/bin/snort" + rm -rf "$INSTALL_DIR/lib/snort" + rm -rf "$INSTALL_DIR/etc/snort" + + log_info "Snort binaries removed" + log_info "Config ($CONFIG_DIR/) and logs ($LOG_DIR/) left intact" + log_info "Remove manually if no longer needed" +} + +# ============================================================================ +# DRY RUN +# ============================================================================ + +dry_run() { + echo "" + log_info "===== DRY RUN — No changes will be made =====" + echo "" + log_info "OS: $PRETTY_NAME" + log_info "Snort version: $SNORT_VERSION" + log_info "DAQ version: $DAQ_VERSION" + log_info "Interface: $IFACE" + log_info "HOME_NET: $HOME_NET" + log_info "Install prefix: $INSTALL_DIR" + log_info "Config dir: $CONFIG_DIR" + log_info "Log dir: $LOG_DIR" + log_info "Community rules: $COMMUNITY_RULES" + log_info "Registered rules: $REGISTERED_RULES" + echo "" + log_info "Actions that would be performed:" + echo " 1. Install build dependencies" + echo " 2. Build libdaq $DAQ_VERSION from source" + echo " 3. 
Build Snort $SNORT_VERSION from source" + echo " 4. Create snort user and group" + echo " 5. Create directory structure" + echo " 6. Write Snort configuration" + echo " 7. Download and install rules" + echo " 8. Create weekly rule update cron job" + echo " 9. Create and start systemd service" + echo "" +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + check_root + detect_os + detect_interface + + if [[ "$UNINSTALL" == true ]]; then + uninstall_snort + exit 0 + fi + + if [[ "$DRY_RUN" == true ]]; then + dry_run + exit 0 + fi + + echo "" + log_info "===== Snort 3 IDS/IPS Installer =====" + echo "" + + if [[ "$SKIP_BUILD" == false ]]; then + install_dependencies + build_libdaq + build_snort + fi + + create_snort_user + configure_directories + configure_snort + download_rules + setup_rule_update_cron + create_service + validate_installation + + echo "" + log_info "===== Installation Complete =====" + log_info "View alerts: tail -f $LOG_DIR/alert_json.txt | jq ." 
+ log_info "Test config: snort -c $CONFIG_DIR/snort.lua -T" + log_info "Service status: systemctl status snort" + echo "" +} + +main "$@" diff --git a/install-suricata.sh b/install-suricata.sh new file mode 100644 index 0000000..dfdb3b0 --- /dev/null +++ b/install-suricata.sh @@ -0,0 +1,580 @@ +#!/bin/bash +################################################################################ +# Script Name: install-suricata.sh +# Description: Automated Suricata IDS/IPS installation with multi-source +# rule selection on Ubuntu/Debian and RHEL/Rocky/Alma/Fedora +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.01 +# +# Usage: +# sudo ./install-suricata.sh +# sudo ./install-suricata.sh --iface eth0 --rules et/open,oisf/trafficid +# sudo ./install-suricata.sh --ips --iface ens192 +# sudo ./install-suricata.sh --dry-run +# +################################################################################ + +set -euo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +IFACE="" +MODE="ids" +RULE_SOURCES="et/open" +HOME_NET="[192.168.0.0/16,10.0.0.0/8,172.16.0.0/12]" +THREADS="auto" +EVE_LOG="/var/log/suricata/eve.json" +SURICATA_CONF="/etc/suricata/suricata.yaml" +DRY_RUN=false +UNINSTALL=false +SKIP_RULES=false + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +log_step() { echo -e "${CYAN}[STEP]${NC} $*"; } + +show_usage() { + cat </dev/null | awk '/default/ {print $5; exit}') + if [[ -z 
"$IFACE" ]]; then + IFACE=$(ip -o link show up 2>/dev/null | awk -F': ' '!/lo/{print $2; exit}') + fi + if [[ -z "$IFACE" ]]; then + log_error "Cannot auto-detect network interface. Use --iface" + exit 1 + fi + log_info "Auto-detected interface: $IFACE" + else + if ! ip link show "$IFACE" &>/dev/null; then + log_error "Interface $IFACE does not exist" + exit 1 + fi + log_info "Using interface: $IFACE" + fi +} + +detect_threads() { + if [[ "$THREADS" == "auto" ]]; then + THREADS=$(nproc 2>/dev/null || echo 2) + # Reserve 1 core for the management thread, minimum 1 worker + local workers=$((THREADS - 1)) + [[ $workers -lt 1 ]] && workers=1 + THREADS=$workers + log_info "Auto-detected CPU threads: $THREADS workers" + fi +} + +# ============================================================================ +# INSTALLATION +# ============================================================================ + +install_suricata_debian() { + log_step "Adding Suricata PPA / repository..." + + if [[ "$OS_ID" == "ubuntu" ]]; then + apt-get update -qq + apt-get install -y -qq software-properties-common + add-apt-repository -y ppa:oisf/suricata-stable + apt-get update -qq + else + # Debian + apt-get update -qq + apt-get install -y -qq gnupg2 apt-transport-https + fi + + log_step "Installing Suricata and dependencies..." + apt-get install -y -qq suricata suricata-update jq + + log_info "Suricata installed: $(suricata --build-info | head -1)" +} + +install_suricata_rhel() { + log_step "Installing EPEL and Suricata repository..." + + if [[ "$OS_ID" != "fedora" ]]; then + dnf install -y -q epel-release + fi + + dnf install -y -q dnf-plugins-core + dnf copr enable -y @oisf/suricata-7.0 2>/dev/null || true + + log_step "Installing Suricata and dependencies..." + dnf install -y -q suricata jq + + # Install suricata-update if not bundled + if ! 
command -v suricata-update &>/dev/null; then + dnf install -y -q python3-pip + pip3 install suricata-update + fi + + log_info "Suricata installed: $(suricata --build-info | head -1)" +} + +install_suricata() { + if command -v suricata &>/dev/null; then + log_warn "Suricata is already installed: $(suricata -V 2>&1 | head -1)" + log_info "Proceeding with configuration..." + return 0 + fi + + case "$OS_FAMILY" in + debian) install_suricata_debian ;; + rhel) install_suricata_rhel ;; + esac +} + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +backup_config() { + if [[ -f "$SURICATA_CONF" ]]; then + local backup="${SURICATA_CONF}.bak.$(date +%Y%m%d%H%M%S)" + cp "$SURICATA_CONF" "$backup" + log_info "Backed up existing config to $backup" + fi +} + +configure_suricata() { + log_step "Configuring Suricata..." + + backup_config + + # Set HOME_NET + if grep -q 'HOME_NET:' "$SURICATA_CONF"; then + sed -i "s|HOME_NET:.*|HOME_NET: \"$HOME_NET\"|" "$SURICATA_CONF" + log_info "Set HOME_NET: $HOME_NET" + fi + + # Set EXTERNAL_NET + if grep -q 'EXTERNAL_NET:' "$SURICATA_CONF"; then + sed -i 's|EXTERNAL_NET:.*|EXTERNAL_NET: "!\$HOME_NET"|' "$SURICATA_CONF" + fi + + # Set interface + sed -i "s|interface: eth0|interface: $IFACE|g" "$SURICATA_CONF" 2>/dev/null || true + + # Configure af-packet interface + if grep -q 'af-packet:' "$SURICATA_CONF"; then + # Update the first af-packet interface entry + sed -i "/af-packet:/,/- interface:/{s|- interface:.*|- interface: $IFACE|}" "$SURICATA_CONF" 2>/dev/null || true + fi + + # Set threading + if [[ "$THREADS" -gt 1 ]]; then + sed -i "s|#\? 
#######################################
# Enable the requested suricata-update rule sources and download the rules.
# Globals:   SKIP_RULES (read), RULE_SOURCES (read, comma-separated),
#            OINKCODE (read, optional; required for et/pro),
#            SOURCES (written: RULE_SOURCES split on commas),
#            SURICATA_RULES_FILE (read, optional override of the merged
#            rules file used for the final count)
# Outputs:   progress via log_step / log_info / log_warn
# Returns:   0 on success; non-zero if "suricata-update update" fails
#######################################
configure_rules() {
    if [[ "$SKIP_RULES" == true ]]; then
        log_info "Skipping rule download (--skip-rules)"
        return 0
    fi

    log_step "Configuring rule sources..."

    local rules_file="${SURICATA_RULES_FILE:-/var/lib/suricata/rules/suricata.rules}"
    local src
    local -a enable_args

    # Parse comma-separated rule sources
    IFS=',' read -ra SOURCES <<< "$RULE_SOURCES"

    # The ${arr[@]+...} form keeps "set -u" quiet when the list is empty.
    for src in ${SOURCES[@]+"${SOURCES[@]}"}; do
        # Trim leading/trailing whitespace without spawning a subprocess.
        src="${src#"${src%%[![:space:]]*}"}"
        src="${src%"${src##*[![:space:]]}"}"
        # Skip blanks produced by stray commas ("a,,b" or a trailing ",").
        [[ -n "$src" ]] || continue

        log_info "Enabling rule source: $src"

        enable_args=("$src")
        case "$src" in
            et/pro)
                if [[ -z "${OINKCODE:-}" ]]; then
                    log_warn "ET Pro requires OINKCODE environment variable — skipping"
                    continue
                fi
                # The parameter must be its own argv word; passing
                # "et/pro secret-code=..." as one word names a source that
                # does not exist.
                enable_args+=("secret-code=$OINKCODE")
                ;;
            et/open | oisf/trafficid | ptresearch/attackdetection | \
            sslbl/ssl-fp-blacklist | sslbl/ja3-fingerprints | \
            etnetera/aggressive | tgreen/hunting | malsilo/win-malware | \
            stamus/lateral)
                ;;
            *)
                log_warn "Unknown rule source: $src — trying as custom source"
                ;;
        esac

        # Still best-effort, but a failure is now reported for every source
        # instead of only for custom ones.
        suricata-update enable-source "${enable_args[@]}" 2>/dev/null ||
            log_warn "Failed to enable source: $src"
    done

    log_step "Downloading and installing rules..."
    suricata-update update

    # Show enabled sources
    log_info "Enabled rule sources:"
    suricata-update list-sources --enabled 2>/dev/null || true

    # Count installed rules. grep -c prints nothing when the file is missing,
    # so fall back to 0 rather than logging an empty value.
    local total_rules
    total_rules=$(grep -Ec '^(alert|drop|reject|pass)' "$rules_file" 2>/dev/null || true)
    log_info "Total rules installed: ${total_rules:-0}"
}
#######################################
# Run the Suricata config self-test, report on the EVE log, and print the
# installation summary.
# Globals:   SURICATA_CONF, EVE_LOG, MODE, IFACE, RULE_SOURCES, THREADS,
#            HOME_NET (all read)
# Outputs:   status via log_step / log_info / log_warn, summary on stdout
# Returns:   0 — a failing config test is reported as a warning only
#######################################
validate_installation() {
    log_step "Validating Suricata installation..."

    # Test config. The test is run once and its output captured: piping a
    # failing "suricata -T" into tail would trip "set -o pipefail" and abort
    # the script in exactly the case this branch is meant to report.
    log_info "Testing configuration..."
    local test_output
    test_output=$(suricata -T -c "$SURICATA_CONF" 2>&1) || true

    if [[ "$test_output" == *'Configuration provided was successfully loaded'* ]]; then
        log_info "Configuration is valid"
    else
        log_warn "Configuration test produced warnings (may still be functional)"
        tail -5 <<< "$test_output"
    fi

    # Check EVE log
    if [[ -f "$EVE_LOG" ]]; then
        log_info "EVE log exists: $EVE_LOG"
        local line_count
        line_count=$(wc -l < "$EVE_LOG" 2>/dev/null || echo 0)
        log_info "EVE log entries: $line_count"
    else
        log_info "EVE log will be created once traffic is processed"
    fi

    # Show status
    echo ""
    log_info "===== Installation Summary ====="
    log_info "Mode: $MODE"
    log_info "Interface: $IFACE"
    log_info "Config: $SURICATA_CONF"
    log_info "EVE log: $EVE_LOG"
    log_info "Rules: $RULE_SOURCES"
    log_info "Threads: $THREADS"
    log_info "HOME_NET: $HOME_NET"
    echo ""
    suricata -V 2>&1 || true
}
+uninstall_suricata() { + log_step "Uninstalling Suricata..." + + systemctl stop suricata 2>/dev/null || true + systemctl disable suricata 2>/dev/null || true + + case "$OS_FAMILY" in + debian) + apt-get remove --purge -y suricata suricata-update 2>/dev/null || true + apt-get autoremove -y 2>/dev/null || true + ;; + rhel) + dnf remove -y suricata 2>/dev/null || true + ;; + esac + + rm -f /etc/cron.d/suricata-update + + log_info "Suricata removed" + log_info "Configuration files in /etc/suricata/ were left intact" + log_info "Log files in /var/log/suricata/ were left intact" + log_info "Rule files in /var/lib/suricata/ were left intact" + log_info "Remove these directories manually if no longer needed" +} + +# ============================================================================ +# DRY RUN +# ============================================================================ + +dry_run() { + echo "" + log_info "===== DRY RUN — No changes will be made =====" + echo "" + log_info "OS: $PRETTY_NAME" + log_info "Interface: $IFACE" + log_info "Mode: $MODE" + log_info "Rules: $RULE_SOURCES" + log_info "HOME_NET: $HOME_NET" + log_info "Threads: $THREADS" + log_info "EVE log: $EVE_LOG" + log_info "Config: $SURICATA_CONF" + echo "" + log_info "Actions that would be performed:" + echo " 1. Install Suricata via $( [[ $OS_FAMILY == debian ]] && echo 'apt' || echo 'dnf')" + echo " 2. Configure $SURICATA_CONF" + echo " 3. Set HOME_NET: $HOME_NET" + echo " 4. Set interface: $IFACE" + echo " 5. Set worker threads: $THREADS" + echo " 6. Enable rule sources: $RULE_SOURCES" + echo " 7. Download and install rules via suricata-update" + echo " 8. Create daily rule update cron job" + echo " 9. Enable and start suricata systemd service" + if [[ "$MODE" == "ips" ]]; then + echo " 10. 
Configure NFQ inline mode (IPS)" + fi + echo "" +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + check_root + detect_os + detect_interface + detect_threads + + if [[ "$UNINSTALL" == true ]]; then + uninstall_suricata + exit 0 + fi + + if [[ "$DRY_RUN" == true ]]; then + dry_run + exit 0 + fi + + echo "" + log_info "===== Suricata IDS/IPS Installer =====" + echo "" + + install_suricata + configure_suricata + configure_rules + setup_rule_update_cron + configure_service + validate_installation + + echo "" + log_info "===== Installation Complete =====" + log_info "Monitor alerts: tail -f $EVE_LOG | jq 'select(.event_type==\"alert\")'" + log_info "View stats: suricatasc -c dump-counters" + log_info "Reload rules: suricatasc -c reload-rules" + echo "" +} + +main "$@" diff --git a/install-unbound.sh b/install-unbound.sh new file mode 100644 index 0000000..afc29ac --- /dev/null +++ b/install-unbound.sh @@ -0,0 +1,951 @@ +#!/bin/bash +################################################################################ +# Script Name: install-unbound.sh +# Version: 1.1 +# Description: Install and configure Unbound recursive DNS resolver with +# DNSSEC validation, DNS-over-TLS forwarding, local zones, +# unbound-control, and optional NSD stub-zone pairing +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Date: 2026-03-31 +# +# Supported OS: +# - Ubuntu / Debian +# - RHEL / AlmaLinux / Rocky Linux / Fedora +# +# Usage: +# sudo ./install-unbound.sh +# sudo ./install-unbound.sh --forward cloudflare --dot +# sudo ./install-unbound.sh --forward google --local-zone home.lab +# sudo ./install-unbound.sh --nsd-stub example.com --nsd-port 5353 +# sudo ./install-unbound.sh --uninstall +# ./install-unbound.sh --dry-run +# 
+################################################################################ + +set -euo pipefail + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +readonly VERSION="1.0" +readonly SCRIPT_NAME="${0##*/}" +readonly LOG_FILE="/var/log/unbound-install.log" + +# Defaults +MODE="recursive" # recursive (root hints) or forward +FORWARDER="" # cloudflare, google, quad9, or custom IP +USE_DOT=false # DNS-over-TLS for forwarding +LOCAL_ZONES=() # Local zone names to create +LOCAL_ENTRIES=() # Individual local-data entries +NSD_STUB_ZONES=() # Zones to stub to NSD +NSD_PORT="5353" # Port NSD listens on +LISTEN_IP="127.0.0.1" # Default: localhost only +LISTEN_ALL=false # Listen on all interfaces +THREADS="" # Auto-detect from CPU cores +CACHE_SIZE="" # Auto-detect from RAM +UNINSTALL=false +DRY_RUN=false + +# Paths (set after OS detection) +UNBOUND_CONF="" +UNBOUND_CONF_DIR="" +UNBOUND_USER="" +TRUST_ANCHOR="" +ROOT_HINTS="" +TLS_BUNDLE="" + +# OS detection +OS="" +OS_FAMILY="" + +# ============================================================================ +# COLOR OUTPUT +# ============================================================================ + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +print_success() { echo -e "${GREEN} ✓ $1${NC}"; } +print_error() { echo -e "${RED} ✗ $1${NC}" >&2; } +print_warning() { echo -e "${YELLOW} ⚠ $1${NC}"; } +print_info() { echo -e " → $1"; } +print_header() { echo -e "\n${CYAN}━━━ $1 ━━━${NC}"; } + +# ============================================================================ +# LOGGING +# ============================================================================ + +log() { + local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $1" + if [[ "$DRY_RUN" == "false" ]] && [[ -w "$(dirname "$LOG_FILE")" || -w "$LOG_FILE" ]]; then + echo "$msg" >> "$LOG_FILE" + fi + 
echo "$msg" +} + +log_error() { + local msg="[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" + if [[ "$DRY_RUN" == "false" ]] && [[ -w "$(dirname "$LOG_FILE")" || -w "$LOG_FILE" ]]; then + echo "$msg" >> "$LOG_FILE" + fi + print_error "$1" +} + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_help() { + cat </dev/null; then + print_warning "systemd-resolved is running and binds to port 53" + if [[ "$DRY_RUN" == "true" ]]; then + print_info "[DRY RUN] Would disable systemd-resolved" + return + fi + + log "Disabling systemd-resolved to free port 53" + systemctl stop systemd-resolved + systemctl disable systemd-resolved + + if [[ -L /etc/resolv.conf ]]; then + rm -f /etc/resolv.conf + echo "nameserver 1.1.1.1" > /etc/resolv.conf + fi + + print_success "Disabled systemd-resolved" + else + print_info "systemd-resolved not running — no action needed" + fi +} + +# ============================================================================ +# INSTALL UNBOUND +# ============================================================================ + +install_unbound() { + print_header "Installing Unbound" + + if command -v unbound &>/dev/null; then + print_info "Unbound is already installed" + return + fi + + if [[ "$DRY_RUN" == "true" ]]; then + print_info "[DRY RUN] Would install unbound packages" + return + fi + + log "Installing Unbound packages" + + case "$OS_FAMILY" in + debian) + apt-get update -qq + apt-get install -y -qq unbound unbound-host dns-root-data > /dev/null + ;; + rhel) + if [[ "$OS" != "fedora" ]]; then + dnf install -y -q epel-release > /dev/null 2>&1 || true + fi + dnf install -y -q unbound > /dev/null + ;; + esac + + print_success "Unbound installed" +} + +# ============================================================================ +# AUTO-DETECT TUNING +# 
#######################################
# Download the root hints file for recursive mode.
# Globals:   MODE, DRY_RUN, ROOT_HINTS, UNBOUND_USER (all read)
# Outputs:   status via print_info / print_success / print_warning
# Returns:   0; a failed download is reported as a warning only
#######################################
setup_root_hints() {
    if [[ "$MODE" != "recursive" ]]; then
        return
    fi

    print_info "Setting up root hints"

    if [[ "$DRY_RUN" == "true" ]]; then
        print_info "[DRY RUN] Would download root hints"
        return
    fi

    local hints_url="https://www.internic.net/domain/named.cache"
    local tmp_file="${ROOT_HINTS}.tmp"
    local hints_dir
    hints_dir=$(dirname "$ROOT_HINTS")
    mkdir -p "$hints_dir"

    # Try wget first, then curl if wget is missing or fails.
    local fetched=false
    if command -v wget &>/dev/null &&
        wget -qO "$tmp_file" "$hints_url" 2>/dev/null; then
        fetched=true
    elif command -v curl &>/dev/null &&
        curl -fsSL -o "$tmp_file" "$hints_url" 2>/dev/null; then
        fetched=true
    fi

    # Only install a non-empty download; the live hints file is replaced by a
    # single mv so it is never left half-written.
    if [[ "$fetched" == true && -s "$tmp_file" ]]; then
        mv "$tmp_file" "$ROOT_HINTS"
        chown "$UNBOUND_USER":"$UNBOUND_USER" "$ROOT_HINTS" 2>/dev/null || true
        print_success "Root hints downloaded"
    else
        # wget -O creates the output file even when the transfer fails, so
        # remove the partial file instead of leaving it next to the real one.
        rm -f -- "$tmp_file"
        print_warning "Could not download root hints — using package default"
    fi
}
#######################################
# Generate the unbound-control key/certificate pair if it does not exist.
# Globals:   DRY_RUN (read)
# Outputs:   status via print_info / print_success / print_warning /
#            print_error
# Returns:   0 when keys exist, were generated, or the setup tool is absent;
#            1 when unbound-control-setup runs but fails
#######################################
setup_control() {
    print_info "Setting up unbound-control"

    if [[ "$DRY_RUN" == "true" ]]; then
        print_info "[DRY RUN] Would generate unbound-control keys"
        return
    fi

    if [[ -f /etc/unbound/unbound_control.key ]]; then
        print_info "unbound-control keys already exist"
        return
    fi

    if ! command -v unbound-control-setup &>/dev/null; then
        print_warning "unbound-control-setup not found"
        return
    fi

    # The tool's output is discarded, so a bare call failing under "set -e"
    # would end the installer without any explanation. Test it explicitly
    # and say what went wrong before propagating the failure.
    if unbound-control-setup > /dev/null 2>&1; then
        print_success "unbound-control keys generated"
    else
        print_error "unbound-control-setup failed — unbound-control keys were not generated"
        return 1
    fi
}
${NSD_STUB_ZONES[*]} (port $NSD_PORT)" + return + fi + + # Backup existing config + if [[ -f "$UNBOUND_CONF" ]]; then + cp "$UNBOUND_CONF" "${UNBOUND_CONF}.bak.$(date +%Y%m%d%H%M%S)" + print_info "Backed up existing config" + fi + + mkdir -p "$UNBOUND_CONF_DIR" + + # ── Main config ── + cat > "$UNBOUND_CONF" <> "$UNBOUND_CONF" <> "$UNBOUND_CONF" + done + + cat >> "$UNBOUND_CONF" <> "$UNBOUND_CONF" + echo "" >> "$UNBOUND_CONF" + fi + + # TLS bundle (for DoT) + if [[ "$USE_DOT" == "true" ]]; then + echo " tls-cert-bundle: \"$TLS_BUNDLE\"" >> "$UNBOUND_CONF" + echo "" >> "$UNBOUND_CONF" + fi + + # Include directory + echo " include: \"$UNBOUND_CONF_DIR/*.conf\"" >> "$UNBOUND_CONF" + echo "" >> "$UNBOUND_CONF" + + # Remote control + cat >> "$UNBOUND_CONF" < "$conf_file" <> "$conf_file" <> "$conf_file" <> "$conf_file" <> "$conf_file" <> "$conf_file" <> "$conf_file" <> "$conf_file" < "$conf_file" <> "$conf_file" + done + + for entry in "${LOCAL_ENTRIES[@]}"; do + echo " local-data: \"$entry\"" >> "$conf_file" + done + + print_success "Local zones config written (${#LOCAL_ZONES[@]} zones, ${#LOCAL_ENTRIES[@]} records)" +} + +# ============================================================================ +# NSD STUB ZONE CONFIGURATION +# ============================================================================ + +generate_nsd_stub_config() { + local conf_file="$UNBOUND_CONF_DIR/nsd-stubs.conf" + + cat > "$conf_file" <> "$conf_file" </dev/null && ufw status | grep -q "active"; then + ufw allow 53/tcp > /dev/null 2>&1 + ufw allow 53/udp > /dev/null 2>&1 + print_success "Opened ports 53/tcp and 53/udp (ufw)" + elif command -v firewall-cmd &>/dev/null && systemctl is-active --quiet firewalld; then + firewall-cmd --permanent --add-service=dns > /dev/null 2>&1 + firewall-cmd --reload > /dev/null 2>&1 + print_success "Opened DNS ports (firewalld)" + else + print_warning "No active firewall detected — skipping" + fi +} + +# 
#######################################
# Validate the generated configuration, (re)start the unbound service and
# run a quick resolution test against 127.0.0.1.
# Globals:   DRY_RUN, UNBOUND_USER, UNBOUND_CONF (all read)
# Outputs:   status via print_header / print_info / print_success /
#            print_warning / print_error; checkconf or journal output on error
# Returns:   0 on success; exits 1 if the config is invalid or the service
#            is not active after the restart
#######################################
validate_and_start() {
    print_header "Validating and Starting Unbound"

    if [[ "$DRY_RUN" == "true" ]]; then
        print_info "[DRY RUN] Would validate config and start unbound"
        return
    fi

    # Create pid directory
    mkdir -p /run/unbound
    chown "$UNBOUND_USER":"$UNBOUND_USER" /run/unbound

    # Set ownership
    chown -R "$UNBOUND_USER":"$UNBOUND_USER" /etc/unbound/ 2>/dev/null || true
    chmod 640 "$UNBOUND_CONF"

    # Validate
    if unbound-checkconf "$UNBOUND_CONF" > /dev/null 2>&1; then
        print_success "Configuration validated (unbound-checkconf)"
    else
        print_error "Configuration validation failed:"
        # Re-run unsilenced purely to show the diagnostics.
        unbound-checkconf "$UNBOUND_CONF" || true
        exit 1
    fi

    # Enable and start. Neither call may trip "set -e" on its own: the
    # is-active check below is the single place that decides success, and it
    # is the branch that prints the journal for the operator.
    systemctl enable unbound > /dev/null 2>&1 ||
        print_warning "Could not enable unbound at boot"
    systemctl restart unbound || true

    if systemctl is-active --quiet unbound; then
        print_success "Unbound is running"
    else
        print_error "Unbound failed to start"
        journalctl -u unbound --no-pager -n 10 || true
        exit 1
    fi

    # Quick test
    sleep 1
    if ! command -v dig &>/dev/null; then
        print_warning "dig not found — skipping DNS resolution test"
        return 0
    fi

    # dig exits 0 for any reply, including SERVFAIL, so also require an
    # actual answer in the +short output.
    local answer
    answer=$(dig @127.0.0.1 example.com A +short +time=5 2>/dev/null) || answer=""
    if [[ -n "$answer" ]]; then
        print_success "DNS resolution test passed"
    else
        print_warning "DNS resolution test failed — check logs with: journalctl -u unbound"
    fi
}
#######################################
# Print the post-install summary and a short list of useful commands.
# Globals:   CACHE_SIZE, UNBOUND_CONF, UNBOUND_CONF_DIR, LISTEN_ALL,
#            LISTEN_IP, MODE, FORWARDER, USE_DOT, THREADS, LOCAL_ZONES,
#            NSD_STUB_ZONES, NSD_PORT, LOG_FILE (all read)
# Outputs:   summary text on stdout
#######################################
print_summary() {
    local rrset_size=$((CACHE_SIZE * 2))

    # LISTEN_ALL always holds "true" or "false", never the empty string, so
    # ${LISTEN_ALL:+...}/${LISTEN_ALL:-...} cannot choose between the two
    # addresses. Compare the value, the same way generate_config does.
    local listen_addr="$LISTEN_IP"
    if [[ "$LISTEN_ALL" == "true" ]]; then
        listen_addr="0.0.0.0"
    fi

    print_header "Installation Complete"

    echo ""
    echo " Unbound recursive DNS resolver is running."
    echo ""
    echo " Config: $UNBOUND_CONF"
    echo " Config dir: $UNBOUND_CONF_DIR"
    echo " Listen: ${listen_addr}:53"
    echo " Mode: $MODE"
    if [[ -n "$FORWARDER" ]]; then
        echo " Forwarder: $FORWARDER (DoT: $USE_DOT)"
    fi
    echo " Threads: $THREADS"
    echo " Cache: msg=${CACHE_SIZE}m, rrset=${rrset_size}m"
    echo " DNSSEC: enabled"
    if [[ ${#LOCAL_ZONES[@]} -gt 0 ]]; then
        echo " Local zones: ${LOCAL_ZONES[*]}"
    fi
    if [[ ${#NSD_STUB_ZONES[@]} -gt 0 ]]; then
        echo " NSD stubs: ${NSD_STUB_ZONES[*]} (port $NSD_PORT)"
    fi
    echo " Log: $LOG_FILE"
    echo ""
    echo " Useful commands:"
    echo " sudo unbound-control status"
    echo " sudo unbound-control stats_noreset"
    echo " dig @127.0.0.1 example.com A"
    echo " sudo journalctl -u unbound -f"
    echo ""
}
--local-zone) + LOCAL_ZONES+=("${2:?'--local-zone requires a zone name'}") + shift 2 + ;; + --local-data) + LOCAL_ENTRIES+=("${2:?'--local-data requires a record'}") + shift 2 + ;; + --nsd-stub) + NSD_STUB_ZONES+=("${2:?'--nsd-stub requires a zone name'}") + shift 2 + ;; + --nsd-port) + NSD_PORT="${2:?'--nsd-port requires a port number'}" + shift 2 + ;; + --threads) + THREADS="${2:?'--threads requires a number'}" + shift 2 + ;; + --cache-size) + CACHE_SIZE="${2:?'--cache-size requires a size in MB'}" + shift 2 + ;; + --uninstall) + UNINSTALL=true + shift + ;; + --dry-run) + DRY_RUN=true + shift + ;; + -h|--help) + show_help + ;; + --version) + show_version + ;; + *) + log_error "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac + done + + # Validate + if [[ "$USE_DOT" == "true" && "$MODE" != "forward" ]]; then + log_error "--dot requires --forward (DoT is for forwarding to upstream)" + exit 1 + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + echo "" + echo " ╔══════════════════════════════════════════╗" + echo " ║ Unbound Installer v$VERSION ║" + echo " ║ https://mylinux.work ║" + echo " ╚══════════════════════════════════════════╝" + echo "" + + [[ "$DRY_RUN" == "true" ]] && print_warning "DRY RUN MODE — no changes will be made" + + # Root check (skip for dry-run) + if [[ "$DRY_RUN" == "false" ]]; then + check_root + fi + + detect_os + + if [[ "$UNINSTALL" == "true" ]]; then + do_uninstall + exit 0 + fi + + detect_tuning + disable_resolved + install_unbound + setup_trust_anchor + setup_root_hints + setup_control + generate_config + configure_firewall + validate_and_start + print_summary + + log "Installation complete" +} + +main "$@" diff --git a/install-webtop.sh b/install-webtop.sh new file mode 100755 index 0000000..18a2f65 --- /dev/null +++ b/install-webtop.sh @@ -0,0 
+1,368 @@ +#!/usr/bin/env bash + +###################################################################################### +#### install-webtop.sh — Deploy LinuxServer.io Webtop behind nginx reverse proxy #### +#### Installs Docker, runs Webtop container with persistent config, configures #### +#### nginx with basic auth and WebSocket proxy for KasmVNC. #### +#### Requires: Ubuntu 22.04+, root access #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### sudo ./install-webtop.sh --domain webtop.example.com #### +#### sudo ./install-webtop.sh --domain webtop.example.com --no-ssl #### +#### sudo ./install-webtop.sh --domain webtop.example.com --allow-ip 1.2.3.4 #### +#### #### +#### See --help for all options. #### +###################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +DOMAIN="" +DESKTOP="fedora-mate" +WEBTOP_IMAGE="" +CONTAINER_NAME="webtop" +CONFIG_DIR="/opt/webtop" +INTERNAL_PORT="3000" +TIMEZONE="${TZ:-America/Chicago}" +SHM_SIZE="1gb" +ALLOW_IP="" +NO_SSL=false +AUTH_USER="admin" +AUTH_PASS="" +SKIP_DOCKER=false + +# ── Colors ──────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + RED='\033[0;31m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' +else + GREEN="" YELLOW="" RED="" BLUE="" BOLD="" RESET="" +fi + +log() { echo -e "${GREEN}[OK]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*"; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +step() { echo -e "\n${BOLD}── $* ──${RESET}"; } + +# ── Help ────────────────────────────────────────────────────────────── +show_help() { + cat <<'EOF' +Usage: install-webtop.sh [OPTIONS] + +Deploy LinuxServer.io Webtop (Fedora MATE) behind nginx reverse proxy +with basic auth, WebSocket 
#######################################
# Parse command-line options into the script's global settings.
# Globals (written): DOMAIN, DESKTOP, WEBTOP_IMAGE, CONTAINER_NAME,
#   CONFIG_DIR, INTERNAL_PORT, TIMEZONE, SHM_SIZE, ALLOW_IP, AUTH_USER,
#   AUTH_PASS, NO_SSL, SKIP_DOCKER
# Arguments: the script's own "$@"
# Exits: 1 on an unknown option, on a value option given without a value,
#   when --domain is missing, or when --desktop is not one of the known
#   desktops (only checked when --image was not supplied).
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Every option in this list consumes the following argument. Check it
        # is present first, otherwise "$2" trips `set -u` with a cryptic
        # "unbound variable" message instead of a usable error.
        case "$1" in
            --domain|--desktop|--image|--name|--config-dir|--port|--tz|--shm-size|--allow-ip|--auth-user|--auth-pass)
                if [[ $# -lt 2 ]]; then
                    err "$1 requires a value"
                    echo "Run with --help for usage."
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --domain)      DOMAIN="$2"; shift 2 ;;
            --desktop)     DESKTOP="$2"; shift 2 ;;
            --image)       WEBTOP_IMAGE="$2"; shift 2 ;;
            --name)        CONTAINER_NAME="$2"; shift 2 ;;
            --config-dir)  CONFIG_DIR="$2"; shift 2 ;;
            --port)        INTERNAL_PORT="$2"; shift 2 ;;
            --tz)          TIMEZONE="$2"; shift 2 ;;
            --shm-size)    SHM_SIZE="$2"; shift 2 ;;
            --allow-ip)    ALLOW_IP="$2"; shift 2 ;;
            --auth-user)   AUTH_USER="$2"; shift 2 ;;
            --auth-pass)   AUTH_PASS="$2"; shift 2 ;;
            --no-ssl)      NO_SSL=true; shift ;;
            --skip-docker) SKIP_DOCKER=true; shift ;;
            -h|--help)     show_help ;;
            *) err "Unknown option: $1"; echo "Run with --help for usage."; exit 1 ;;
        esac
    done

    if [[ -z "$DOMAIN" ]]; then
        err "--domain is required"
        exit 1
    fi

    # resolve image from desktop choice (--image overrides)
    if [[ -z "$WEBTOP_IMAGE" ]]; then
        local -a valid_desktops=(
            fedora-mate fedora-xfce fedora-kde
            ubuntu-mate ubuntu-xfce ubuntu-kde
            alpine-xfce alpine-kde
        )
        local candidate
        local desktop_ok=false

        # Exact string comparison. A `grep -w` against the joined list treats
        # the user's value as a regex and also accepts fragments such as
        # "kde" or "fedora", because "-" is a word boundary.
        for candidate in "${valid_desktops[@]}"; do
            if [[ "$DESKTOP" == "$candidate" ]]; then
                desktop_ok=true
                break
            fi
        done

        if [[ "$desktop_ok" != "true" ]]; then
            err "Unknown desktop: ${DESKTOP}"
            echo "Valid options: ${valid_desktops[*]}"
            exit 1
        fi
        WEBTOP_IMAGE="lscr.io/linuxserver/webtop:${DESKTOP}"
    fi
}
#######################################
# Create the nginx basic-auth file and store the credentials for root.
# Globals:  AUTH_USER (read), AUTH_PASS (read; generated with
#           `openssl rand` and written back when empty), CONFIG_DIR (read)
# Outputs:  /etc/nginx/.htpasswd_webtop (root:www-data, 640) and
#           ${CONFIG_DIR}/.credentials (created with mode 600)
# Exits:    1 if htpasswd fails, with htpasswd's own output in the message
#######################################
setup_auth() {
    step "Basic auth"

    local htpasswd_file="/etc/nginx/.htpasswd_webtop"
    local creds_file="${CONFIG_DIR}/.credentials"
    local htpasswd_out

    if [[ -z "$AUTH_PASS" ]]; then
        AUTH_PASS=$(openssl rand -base64 16)
    fi

    # -i reads the password from stdin, so it never appears in the process
    # list the way `htpasswd -b USER PASS` does. htpasswd's chatter is
    # captured rather than discarded so a failure can be reported.
    if ! htpasswd_out=$(printf '%s\n' "${AUTH_PASS}" \
            | htpasswd -i -c "${htpasswd_file}" "${AUTH_USER}" 2>&1); then
        err "htpasswd failed: ${htpasswd_out}"
        exit 1
    fi
    chown root:www-data "${htpasswd_file}"
    chmod 640 "${htpasswd_file}"

    # save credentials to a root-only file
    # Create it under a restrictive umask (after removing any stale copy) so
    # the plaintext password is never readable by others, not even for the
    # moment between the write and a later chmod.
    (
        umask 077
        rm -f -- "${creds_file}"
        printf 'username=%s\npassword=%s\n' "${AUTH_USER}" "${AUTH_PASS}" > "${creds_file}"
    )

    log "Basic auth configured (user: ${AUTH_USER})"
    log "Credentials saved to ${creds_file} (root-only)"
}
#######################################
# Configure UFW for SSH and the nginx ports, then make sure UFW is enabled.
# Globals:  ALLOW_IP (read) - when set, ports 80/443 are opened for that
#           source only; otherwise the 'Nginx Full' profile is allowed.
# Notes:    UFW evaluates rules in the order they were added and stops at
#           the first match. The per-IP allow rules are therefore added
#           BEFORE the catch-all deny rules, and the blanket 'Nginx Full'
#           allow is not added (and is removed if a previous run added it)
#           when ALLOW_IP is set.
#           An allow rule for a different ALLOW_IP from an earlier run is
#           not removed here.
# Exits:    1 if UFW has to be enabled and `ufw --force enable` fails.
#######################################
setup_ufw() {
    step "Firewall"

    if ! command -v ufw &>/dev/null; then
        warn "UFW not found — skipping firewall config"
        return
    fi

    ufw allow OpenSSH >/dev/null 2>&1 || true

    if [[ -n "$ALLOW_IP" ]]; then
        # Drop rules from earlier runs that would otherwise sit in front of
        # (and therefore override) the rules added below. Failure just means
        # the rule was not there.
        ufw delete allow 'Nginx Full' >/dev/null 2>&1 || true
        ufw delete deny from any to any port 80 >/dev/null 2>&1 || true
        ufw delete deny from any to any port 443 >/dev/null 2>&1 || true

        if ufw allow from "${ALLOW_IP}" to any port 80 >/dev/null 2>&1 \
            && ufw allow from "${ALLOW_IP}" to any port 443 >/dev/null 2>&1; then
            # Only add the deny rules once the allow rules are in place.
            ufw deny from any to any port 80 >/dev/null 2>&1 || true
            ufw deny from any to any port 443 >/dev/null 2>&1 || true
            log "Access restricted to ${ALLOW_IP}"
        else
            warn "Could not add UFW allow rules for ${ALLOW_IP} — check the address; no nginx rules were added"
        fi
    else
        ufw allow 'Nginx Full' >/dev/null 2>&1 || true
        log "nginx ports open (no IP restriction)"
        warn "Consider using --allow-ip to restrict access"
    fi

    # Capture the status first instead of piping into `grep -q`, so an early
    # grep exit cannot surface as a pipeline failure under `pipefail`.
    local status_out
    status_out=$(ufw status 2>/dev/null) || status_out=""
    if ! grep -q "^Status: active" <<< "${status_out}"; then
        if ! ufw --force enable >/dev/null 2>&1; then
            err "Failed to enable UFW"
            exit 1
        fi
        log "UFW enabled"
    fi
}
+#### Version: 1.0 #### +#### #### +#### Usage: .\install-windows-exporter.ps1 [OPTIONS] #### +############################################################# + +param( + [string]$ServerType = "standard", # standard, iis, sql, ad, rdp, all + [string]$ExtraCollectors = "", # additional collectors (comma-separated) + [string]$RemoveCollectors = "", # collectors to remove from the profile + [int]$Port = 9182, + [string]$InstallDir = "C:\Program Files\windows_exporter", + [string]$PrometheusIP = "", # restrict firewall to this IP + [switch]$NoFirewall, + [switch]$NoDefenderExclusion, + [switch]$Update, + [switch]$Uninstall, + [switch]$DryRun, + [switch]$Help +) + +$ErrorActionPreference = "Stop" + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +$ServiceName = "windows_exporter" +$ServiceDisplayName = "Prometheus Windows Exporter" +$FirewallRuleName = "Prometheus Windows Exporter (TCP-In)" +$GitHubApiUrl = "https://api.github.com/repos/prometheus-community/windows_exporter/releases/latest" +$TextFileDir = Join-Path $InstallDir "textfile_inputs" +$TargetExe = Join-Path $InstallDir "windows_exporter.exe" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +function Write-ColorOutput { + param([string]$Message, [string]$Color = "Green") + $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss" + Write-Host "[$timestamp] $Message" -ForegroundColor $Color +} + +function Show-Help { + Write-Host @" + +Prometheus Windows Exporter Installer v1.0 +https://mylinux.work + +USAGE: + .\install-windows-exporter.ps1 [OPTIONS] + +OPTIONS: + -ServerType Server profile: standard, iis, sql, ad, rdp, all (default: standard) + -ExtraCollectors Additional collectors, comma-separated (e.g. 
"service,hyperv") + -RemoveCollectors Remove collectors from the profile (e.g. "tcp,time") + -Port Metrics listen port (default: 9182) + -InstallDir Installation directory (default: C:\Program Files\windows_exporter) + -PrometheusIP Restrict firewall rule to this source IP + -NoFirewall Skip firewall rule creation + -NoDefenderExclusion Skip Windows Defender exclusion + -Update Update existing installation to latest version + -Uninstall Remove windows_exporter completely + -DryRun Show what would be done without making changes + -Help Show this help + +SERVER PROFILES: + standard [defaults],process,tcp,time + iis [defaults],process,tcp,time,iis + sql [defaults],process,tcp,time,mssql + ad [defaults],process,tcp,time,ad,dns + rdp [defaults],process,tcp,time,remote_fx + all [defaults],process,tcp,time,iis,mssql,ad,dns,remote_fx + + [defaults] = cpu,cs,logical_disk,os,system,net,cache,logon,memory + +EXAMPLES: + # Standard install + .\install-windows-exporter.ps1 + + # IIS server with restricted firewall + .\install-windows-exporter.ps1 -ServerType iis -PrometheusIP "10.0.0.5" + + # SQL server with extra service collector + .\install-windows-exporter.ps1 -ServerType sql -ExtraCollectors "service" + + # Update to latest version + .\install-windows-exporter.ps1 -Update + + # Uninstall + .\install-windows-exporter.ps1 -Uninstall +"@ +} + +# ============================================================================ +# ADMINISTRATOR CHECK +# ============================================================================ + +function Test-Administrator { + $currentUser = [Security.Principal.WindowsIdentity]::GetCurrent() + $principal = New-Object Security.Principal.WindowsPrincipal($currentUser) + return $principal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator) +} + +# ============================================================================ +# SERVER PROFILE BUILDER +# ============================================================================ + 
function Get-LatestRelease {
    <#
    .SYNOPSIS
        Looks up the newest windows_exporter release via the GitHub API.
    .DESCRIPTION
        Queries $GitHubApiUrl and picks the first release asset whose name
        contains "amd64" and ends in ".exe". Terminates the script with exit
        code 1 when the API call fails or no such asset exists.
    .OUTPUTS
        Hashtable with the keys Version (release tag), DownloadUrl and
        AssetName.
    #>
    Write-ColorOutput "Querying GitHub for latest release..."
    try {
        # Add TLS 1.2 to the protocols this session may negotiate. Older
        # Windows PowerShell / .NET Framework defaults may not offer it, and
        # the HTTPS call below then fails with a generic connection error.
        # NOTE(review): assumed relevant for the Server 2016 targets named in
        # the script header - confirm on a stock install.
        [Net.ServicePointManager]::SecurityProtocol = [Net.ServicePointManager]::SecurityProtocol -bor [Net.SecurityProtocolType]::Tls12

        $release = Invoke-RestMethod -Uri $GitHubApiUrl -UseBasicParsing
        $asset = $release.assets |
            Where-Object { $_.name -match "amd64" -and $_.name -match "\.exe$" } |
            Select-Object -First 1

        if (-not $asset) {
            Write-ColorOutput "ERROR: Could not find amd64 .exe asset in latest release" "Red"
            exit 1
        }

        return @{
            Version     = $release.tag_name
            DownloadUrl = $asset.browser_download_url
            AssetName   = $asset.name
        }
    } catch {
        Write-ColorOutput "ERROR: Failed to query GitHub API: $($_.Exception.Message)" "Red"
        exit 1
    }
}
function Add-FirewallRule {
    <#
    .SYNOPSIS
        (Re)creates the inbound firewall rule for the exporter port.
    .DESCRIPTION
        Does nothing when -NoFirewall was given. Otherwise any rule already
        carrying $FirewallRuleName is replaced by a new TCP allow rule for
        $Port, limited to $PrometheusIP when that is set. In -DryRun mode the
        function only reports what it would do and leaves the firewall
        untouched.
    #>
    if ($NoFirewall) {
        Write-ColorOutput "Skipping firewall rule (-NoFirewall specified)" "Yellow"
        return
    }

    $existing = Get-NetFirewallRule -DisplayName $FirewallRuleName -ErrorAction SilentlyContinue

    # The dry-run check must come before the removal below; otherwise a dry
    # run would delete the existing rule and leave the port closed.
    if ($DryRun) {
        $scope = if ($PrometheusIP) { $PrometheusIP } else { "Any" }
        if ($existing) {
            Write-ColorOutput "[DRY RUN] Would remove existing firewall rule" "Yellow"
        }
        Write-ColorOutput "[DRY RUN] Would create firewall rule: port $Port, source: $scope" "Yellow"
        return
    }

    # Remove existing rule if present
    if ($existing) {
        Remove-NetFirewallRule -DisplayName $FirewallRuleName -ErrorAction SilentlyContinue
        Write-ColorOutput "Removed existing firewall rule"
    }

    $params = @{
        DisplayName = $FirewallRuleName
        Direction   = "Inbound"
        Protocol    = "TCP"
        LocalPort   = $Port
        Action      = "Allow"
        Profile     = "Any"
        Description = "Allow Prometheus to scrape windows_exporter metrics on port $Port"
    }

    if ($PrometheusIP) {
        $params.RemoteAddress = $PrometheusIP
        Write-ColorOutput "Creating firewall rule: port $Port restricted to $PrometheusIP"
    } else {
        Write-ColorOutput "Creating firewall rule: port $Port open to all sources"
    }

    New-NetFirewallRule @params | Out-Null
    Write-ColorOutput "Firewall rule created"
}
+ Start-Sleep -Seconds 5 + + $metricsUrl = "http://localhost:$Port/metrics" + try { + $response = Invoke-WebRequest -Uri $metricsUrl -UseBasicParsing -TimeoutSec 10 + if ($response.StatusCode -eq 200) { + Write-ColorOutput "Metrics endpoint responding at $metricsUrl" "Green" + return $true + } + } catch { + Write-ColorOutput "WARNING: Metrics endpoint not responding at $metricsUrl" "Yellow" + Write-ColorOutput "The service may still be starting. Check: Get-Service $ServiceName" "Yellow" + return $false + } + return $false +} + +# ============================================================================ +# UNINSTALL MODE +# ============================================================================ + +function Invoke-Uninstall { + Write-ColorOutput "=== Uninstalling Windows Exporter ===" "Cyan" + + $service = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue + if (-not $service -and -not (Test-Path $InstallDir)) { + Write-ColorOutput "Windows Exporter does not appear to be installed" "Yellow" + exit 0 + } + + Write-Host "" + Write-Host "This will remove:" -ForegroundColor Yellow + Write-Host " - Service: $ServiceName" + Write-Host " - Directory: $InstallDir" + Write-Host " - Firewall rule: $FirewallRuleName" + Write-Host " - Defender exclusion: $InstallDir" + Write-Host "" + + if (-not $DryRun) { + $confirm = Read-Host "Are you sure? 
(y/N)" + if ($confirm -ne "y" -and $confirm -ne "Y") { + Write-ColorOutput "Uninstall cancelled" "Yellow" + exit 0 + } + } + + if ($DryRun) { + Write-ColorOutput "[DRY RUN] Would uninstall windows_exporter" "Yellow" + return + } + + # Stop and remove service + Remove-ExistingService + + # Remove firewall rule + Remove-FirewallRule + + # Remove Defender exclusion + Remove-DefenderExclusion + + # Remove install directory + if (Test-Path $InstallDir) { + Remove-Item -Path $InstallDir -Recurse -Force + Write-ColorOutput "Install directory removed: $InstallDir" + } + + Write-Host "" + Write-ColorOutput "=== Windows Exporter uninstalled successfully ===" "Green" +} + +# ============================================================================ +# UPDATE MODE +# ============================================================================ + +function Invoke-Update { + Write-ColorOutput "=== Updating Windows Exporter ===" "Cyan" + + $service = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue + if (-not $service) { + Write-ColorOutput "ERROR: Service '$ServiceName' not found. Use install mode instead." "Red" + exit 1 + } + + $release = Get-LatestRelease + Write-ColorOutput "Latest version: $($release.Version)" + + # Check installed version if binary exists + if (Test-Path $TargetExe) { + try { + $installedVersion = (Get-Item $TargetExe).VersionInfo.ProductVersion + if ($installedVersion) { + Write-ColorOutput "Installed version: $installedVersion" + $latestClean = $release.Version -replace '^v', '' + if ($installedVersion -eq $latestClean) { + Write-ColorOutput "Already running the latest version. No update needed." 
"Green" + exit 0 + } + } + } catch { + Write-ColorOutput "Could not determine installed version, proceeding with update" "Yellow" + } + } + + if ($DryRun) { + Write-ColorOutput "[DRY RUN] Would update to $($release.Version)" "Yellow" + return + } + + # Download new binary + $tempExe = Join-Path $env:TEMP "windows_exporter_update.exe" + Write-ColorOutput "Downloading $($release.AssetName)..." + Invoke-WebRequest -Uri $release.DownloadUrl -OutFile $tempExe -UseBasicParsing + + # Stop service, replace binary, start service + Stop-ExistingService + Copy-Item -Path $tempExe -Destination $TargetExe -Force + Remove-Item $tempExe -Force -ErrorAction SilentlyContinue + Write-ColorOutput "Binary updated" + + Start-Service -Name $ServiceName + $verified = Test-MetricsEndpoint + + Write-Host "" + Write-ColorOutput "=== Update Summary ===" "Cyan" + Write-ColorOutput "Version: $($release.Version)" + Write-ColorOutput "Status: $(if ($verified) { 'Verified' } else { 'Started (verification pending)' })" + Write-ColorOutput "Metrics URL: http://localhost:$Port/metrics" +} + +# ============================================================================ +# INSTALL MODE +# ============================================================================ + +function Invoke-Install { + Write-ColorOutput "=== Installing Windows Exporter ===" "Cyan" + Write-ColorOutput "Server type: $ServerType" + Write-ColorOutput "Port: $Port" + Write-ColorOutput "Install dir: $InstallDir" + + # --- Create directories --- + if (-not $DryRun) { + if (-not (Test-Path $InstallDir)) { + New-Item -Path $InstallDir -ItemType Directory -Force | Out-Null + Write-ColorOutput "Created directory: $InstallDir" + } + if (-not (Test-Path $TextFileDir)) { + New-Item -Path $TextFileDir -ItemType Directory -Force | Out-Null + Write-ColorOutput "Created directory: $TextFileDir" + } + } else { + Write-ColorOutput "[DRY RUN] Would create: $InstallDir" "Yellow" + Write-ColorOutput "[DRY RUN] Would create: $TextFileDir" "Yellow" 
+ } + + # --- Defender exclusion --- + Add-DefenderExclusion + + # --- Get latest release --- + $release = Get-LatestRelease + Write-ColorOutput "Latest version: $($release.Version)" + + # --- Check if already installed with same version --- + if (Test-Path $TargetExe) { + try { + $installedVersion = (Get-Item $TargetExe).VersionInfo.ProductVersion + $latestClean = $release.Version -replace '^v', '' + if ($installedVersion -and $installedVersion -eq $latestClean) { + Write-ColorOutput "Version $installedVersion already installed, skipping download" "Yellow" + $skipDownload = $true + } + } catch { } + } + + # --- Download binary --- + $tempExe = Join-Path $env:TEMP "windows_exporter_install.exe" + if (-not $skipDownload) { + Write-ColorOutput "Downloading $($release.AssetName)..." + if (-not $DryRun) { + Invoke-WebRequest -Uri $release.DownloadUrl -OutFile $tempExe -UseBasicParsing + Write-ColorOutput "Download complete" + } else { + Write-ColorOutput "[DRY RUN] Would download: $($release.DownloadUrl)" "Yellow" + } + } + + # --- Stop and remove existing service --- + Remove-ExistingService + + # --- Copy binary --- + if (-not $skipDownload -and -not $DryRun) { + Copy-Item -Path $tempExe -Destination $TargetExe -Force + Remove-Item $tempExe -Force -ErrorAction SilentlyContinue + Write-ColorOutput "Binary installed to: $TargetExe" + } + + # --- Build collector string --- + $collectorString = Get-CollectorString + + # --- Create service --- + New-ExporterService -CollectorString $collectorString + + # --- Firewall rule --- + Add-FirewallRule + + # --- Start service --- + if (-not $DryRun) { + Write-ColorOutput "Starting service..." 
+ Start-Service -Name $ServiceName + $verified = Test-MetricsEndpoint + } + + # --- Summary --- + Write-Host "" + Write-ColorOutput "=== Installation Summary ===" "Cyan" + Write-ColorOutput "Version: $($release.Version)" + Write-ColorOutput "Binary: $TargetExe" + Write-ColorOutput "Port: $Port" + Write-ColorOutput "Collectors: $collectorString" + Write-ColorOutput "Textfile dir: $TextFileDir" + Write-ColorOutput "Firewall: $(if ($NoFirewall) { 'Skipped' } elseif ($PrometheusIP) { "Restricted to $PrometheusIP" } else { 'Open' })" + Write-ColorOutput "Defender: $(if ($NoDefenderExclusion) { 'Skipped' } else { 'Exclusion added' })" + + if (-not $DryRun) { + Write-ColorOutput "Status: $(if ($verified) { 'Verified - metrics endpoint responding' } else { 'Started (verification pending)' })" + Write-ColorOutput "Verify URL: http://localhost:$Port/metrics" "Cyan" + } + + Write-Host "" + Write-ColorOutput "=== Installation complete ===" "Green" +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Show help +if ($Help) { + Show-Help + exit 0 +} + +# Check administrator privileges +if (-not (Test-Administrator)) { + Write-ColorOutput "ERROR: This script must be run as Administrator" "Red" + Write-Host "Right-click PowerShell and select 'Run as administrator'" -ForegroundColor Yellow + exit 1 +} + +if ($DryRun) { + Write-ColorOutput "=== DRY RUN MODE - No changes will be made ===" "Yellow" + Write-Host "" +} + +# Route to the appropriate mode +if ($Uninstall) { + Invoke-Uninstall +} elseif ($Update) { + Invoke-Update +} else { + Invoke-Install +} diff --git a/ip-intel-exporter.sh b/ip-intel-exporter.sh new file mode 100755 index 0000000..d932563 --- /dev/null +++ b/ip-intel-exporter.sh @@ -0,0 +1,545 @@ +#!/bin/bash +################################################################################ +# Script Name: ip-intel-exporter.sh +# Version: 
# Sanitize a string for use as a Prometheus label value.
# Arguments: $1 - raw value; may contain backslash escapes such as \xNN
#                 (a missing argument is treated as the empty string)
# Outputs:   the sanitized value on stdout, without a trailing newline
#
# Escape sequences are first decoded with printf '%b' (so \xNN byte pairs
# become real UTF-8), then the three characters that are special inside a
# label value are escaped: backslash, double quote and line feed. The line
# feed case matters because decoding can itself produce one (from "\n" or
# "\x0a"), and a raw newline inside a label would split the metric line.
prom_label() {
    local val
    local nl=$'\n'
    val=$(printf '%b' "${1-}")
    val="${val//\\/\\\\}"
    val="${val//\"/\\\"}"
    val="${val//$nl/\\n}"
    printf '%s' "$val"
}
country = "" + asn_org = "" + if (format == "enriched") { + # Match trailing: XX "Some ASN Org" + if (match(line, /([A-Z]{2}|-) "([^"]*)"[[:space:]]*$/, m)) { + country = m[1] + asn_org = m[2] + } + } + + # Count total requests per IP + ip_total[ip]++ + + # Count by status class + if (status >= 400) { + ip_blocked[ip]++ + blocked_total++ + } + + # Count by country + if (country != "" && country != "-") { + country_req[country]++ + if (status >= 400) country_blocked[country]++ + } + + # Count by ASN + if (asn_org != "" && asn_org != "-") { + asn_req[asn_org]++ + if (status >= 400) asn_blocked[asn_org]++ + + # Classify cloud provider + lower_asn = tolower(asn_org) + provider = "other" + if (lower_asn ~ /amazon|aws/) provider = "aws" + else if (lower_asn ~ /cloudflare/) provider = "cloudflare" + else if (lower_asn ~ /google/) provider = "gcp" + else if (lower_asn ~ /microsoft|azure/) provider = "azure" + else if (lower_asn ~ /digitalocean/) provider = "digitalocean" + else if (lower_asn ~ /hetzner/) provider = "hetzner" + + provider_req[provider]++ + if (status >= 400) provider_blocked[provider]++ + } + + # POST probes (POST returning 4xx/5xx) + if (method == "POST" && status >= 400) post_probes++ + + # Scanner detection + if (tolower(line) ~ /(nikto|sqlmap|nmap|masscan|zgrab|zmeu|morpheus)/) scanners++ + + # Empty user agent + if (line ~ /" "-"$/ || line ~ /" ""$/ || line ~ /" ""-"/) empty_ua++ + + total_requests++ + } + + END { + # Output delimited data for bash to parse + print "TOTAL_REQUESTS=" total_requests + print "BLOCKED_TOTAL=" blocked_total+0 + print "UNIQUE_IPS=" length(ip_total) + print "UNIQUE_BLOCKED_IPS=" length(ip_blocked) + print "POST_PROBES=" post_probes+0 + print "SCANNERS=" scanners+0 + print "EMPTY_UA=" empty_ua+0 + + # Country requests + for (c in country_req) print "COUNTRY_REQ|" c "|" country_req[c] + for (c in country_blocked) print "COUNTRY_BLK|" c "|" country_blocked[c] + + # ASN requests (top 20) + for (a in asn_req) print "ASN_REQ|" a 
"|" asn_req[a] + for (a in asn_blocked) print "ASN_BLK|" a "|" asn_blocked[a] + + # Provider + for (p in provider_req) print "PROVIDER_REQ|" p "|" provider_req[p] + for (p in provider_blocked) print "PROVIDER_BLK|" p "|" provider_blocked[p] + + # Top blocked IPs (by blocked count) + PROCINFO["sorted_in"] = "@val_num_desc" + n = 0 + for (ip in ip_blocked) { + if (n++ >= 10) break + print "TOP_BLOCKED|" ip "|" ip_blocked[ip] + } + } + ' format="$LOG_FORMAT" "$NGINX_LOG" 2>/dev/null) + + # --- Parse awk output --- + local total_requests=0 unique_ips=0 unique_blocked=0 + local blocked_total=0 + local post_probes=0 scanners=0 empty_ua=0 + + declare -A country_req=() country_blk=() asn_req=() asn_blk=() provider_req=() provider_blk=() + declare -a top_blocked_ips=() + + while IFS= read -r line; do + case "$line" in + TOTAL_REQUESTS=*) total_requests="${line#*=}" ;; + BLOCKED_TOTAL=*) blocked_total="${line#*=}" ;; + UNIQUE_IPS=*) unique_ips="${line#*=}" ;; + UNIQUE_BLOCKED_IPS=*) unique_blocked="${line#*=}" ;; + POST_PROBES=*) post_probes="${line#*=}" ;; + SCANNERS=*) scanners="${line#*=}" ;; + EMPTY_UA=*) empty_ua="${line#*=}" ;; + COUNTRY_REQ\|*) + IFS='|' read -r _ key val <<< "$line" + country_req["$key"]="$val" ;; + COUNTRY_BLK\|*) + IFS='|' read -r _ key val <<< "$line" + country_blk["$key"]="$val" ;; + ASN_REQ\|*) + IFS='|' read -r _ key val <<< "$line" + key=$(prom_label "$key") + asn_req["$key"]=$(( ${asn_req["$key"]:-0} + val )) ;; + ASN_BLK\|*) + IFS='|' read -r _ key val <<< "$line" + key=$(prom_label "$key") + asn_blk["$key"]=$(( ${asn_blk["$key"]:-0} + val )) ;; + PROVIDER_REQ\|*) + IFS='|' read -r _ key val <<< "$line" + provider_req["$key"]="$val" ;; + PROVIDER_BLK\|*) + IFS='|' read -r _ key val <<< "$line" + provider_blk["$key"]="$val" ;; + TOP_BLOCKED\|*) + top_blocked_ips+=("$line") ;; + esac + done <<< "$awk_output" + + # --- Build Prometheus metrics --- + + metrics+="# HELP ip_intel_up Exporter status (1=up) +# TYPE ip_intel_up gauge +ip_intel_up 1 + 
+# HELP ip_intel_requests_total Total requests in log +# TYPE ip_intel_requests_total gauge +ip_intel_requests_total ${total_requests:-0} + +# HELP ip_intel_blocked_total Total blocked requests (4xx/5xx) +# TYPE ip_intel_blocked_total gauge +ip_intel_blocked_total ${blocked_total:-0} + +# HELP ip_intel_unique_ips_total Unique IPs seen +# TYPE ip_intel_unique_ips_total gauge +ip_intel_unique_ips_total ${unique_ips:-0} + +# HELP ip_intel_unique_blocked_ips_total Unique IPs returning 4xx/5xx +# TYPE ip_intel_unique_blocked_ips_total gauge +ip_intel_unique_blocked_ips_total ${unique_blocked:-0} + +# HELP ip_intel_post_probe_total POST requests returning 4xx/5xx +# TYPE ip_intel_post_probe_total gauge +ip_intel_post_probe_total ${post_probes:-0} + +# HELP ip_intel_scanner_total Requests from known scanner user agents +# TYPE ip_intel_scanner_total gauge +ip_intel_scanner_total ${scanners:-0} + +# HELP ip_intel_empty_ua_total Requests with empty user agent +# TYPE ip_intel_empty_ua_total gauge +ip_intel_empty_ua_total ${empty_ua:-0} + +" + + # Country metrics + if [[ ${#country_req[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_requests_by_country Requests by country code +# TYPE ip_intel_requests_by_country gauge +" + for c in "${!country_req[@]}"; do + metrics+="ip_intel_requests_by_country{country=\"${c}\"} ${country_req[$c]} +" + done + metrics+=" +" + fi + + if [[ ${#country_blk[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_blocked_by_country Blocked requests by country code +# TYPE ip_intel_blocked_by_country gauge +" + for c in "${!country_blk[@]}"; do + metrics+="ip_intel_blocked_by_country{country=\"${c}\"} ${country_blk[$c]} +" + done + metrics+=" +" + fi + + # ASN metrics + if [[ ${#asn_req[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_requests_by_asn Requests by ASN organization +# TYPE ip_intel_requests_by_asn gauge +" + for a in "${!asn_req[@]}"; do + metrics+="ip_intel_requests_by_asn{asn_org=\"${a}\"} ${asn_req[$a]} +" + done + metrics+=" +" + fi + + if 
[[ ${#asn_blk[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_blocked_by_asn Blocked requests by ASN organization +# TYPE ip_intel_blocked_by_asn gauge +" + for a in "${!asn_blk[@]}"; do + metrics+="ip_intel_blocked_by_asn{asn_org=\"${a}\"} ${asn_blk[$a]} +" + done + metrics+=" +" + fi + + # Provider metrics + if [[ ${#provider_req[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_requests_by_provider Requests by cloud provider +# TYPE ip_intel_requests_by_provider gauge +" + for p in "${!provider_req[@]}"; do + metrics+="ip_intel_requests_by_provider{provider=\"${p}\"} ${provider_req[$p]} +" + done + metrics+=" +" + fi + + if [[ ${#provider_blk[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_blocked_by_provider Blocked requests by cloud provider +# TYPE ip_intel_blocked_by_provider gauge +" + for p in "${!provider_blk[@]}"; do + metrics+="ip_intel_blocked_by_provider{provider=\"${p}\"} ${provider_blk[$p]} +" + done + metrics+=" +" + fi + + # Top blocked IPs + if [[ ${#top_blocked_ips[@]} -gt 0 ]]; then + metrics+="# HELP ip_intel_top_blocked_ip_requests Top blocked IPs by blocked request count +# TYPE ip_intel_top_blocked_ip_requests gauge +" + for entry in "${top_blocked_ips[@]}"; do + IFS='|' read -r _ ip count <<< "$entry" + metrics+="ip_intel_top_blocked_ip_requests{ip=\"${ip}\"} ${count} +" + done + metrics+=" +" + fi + + # Exporter metadata + local end_time + end_time=$(date +%s%N) + local duration + duration=$(echo "scale=3; ($end_time - $start_time) / 1000000000" | bc 2>/dev/null || echo "0") + + metrics+="# HELP ip_intel_exporter_duration_seconds Script execution time +# TYPE ip_intel_exporter_duration_seconds gauge +ip_intel_exporter_duration_seconds ${duration} + +# HELP ip_intel_exporter_last_run_timestamp Last successful run (unix timestamp) +# TYPE ip_intel_exporter_last_run_timestamp gauge +ip_intel_exporter_last_run_timestamp ${now} +" + + echo "$metrics" +} + +# ============================================================================ +# OUTPUT 
HANDLING +# ============================================================================ + +output_metrics() { + local metrics + metrics=$(collect_metrics) + + if [[ -n "$OUTPUT_FILE" ]]; then + echo "$metrics" > "${OUTPUT_FILE}.tmp" + mv "${OUTPUT_FILE}.tmp" "$OUTPUT_FILE" + log "Metrics written to ${OUTPUT_FILE}" + else + echo "$metrics" + fi +} + +serve_http() { + log "Starting HTTP server on port ${HTTP_PORT}" + log "Metrics endpoint: http://localhost:${HTTP_PORT}/metrics" + + if ! command -v nc &>/dev/null && ! command -v ncat &>/dev/null; then + err "netcat (nc/ncat) not found — required for HTTP mode" + exit 1 + fi + + local nc_cmd="nc" + command -v ncat &>/dev/null && nc_cmd="ncat" + + while true; do + local metrics + metrics=$(collect_metrics) + local content_length=${#metrics} + local response="HTTP/1.1 200 OK\r\nContent-Type: text/plain; charset=utf-8\r\nContent-Length: ${content_length}\r\nConnection: close\r\n\r\n${metrics}" + echo -e "$response" | $nc_cmd -l -p "$HTTP_PORT" -q 1 2>/dev/null || \ + echo -e "$response" | $nc_cmd -l "$HTTP_PORT" 2>/dev/null || true + done +} + +# ============================================================================ +# MAIN +# ============================================================================ + +parse_args "$@" +preflight + +if [[ "$HTTP_MODE" == "true" ]]; then + serve_http +else + output_metrics +fi diff --git a/iptables-blocklist-metrics.sh b/iptables-blocklist-metrics.sh index 3db8486..a3e6b28 100755 --- a/iptables-blocklist-metrics.sh +++ b/iptables-blocklist-metrics.sh @@ -1,12 +1,29 @@ #!/bin/bash ################################################################################ # Script Name: iptables-blocklist-metrics.sh -# Version: 2.0 +# Version: 2.03 # Description: Prometheus exporter for iptables threat feed blocking metrics # Author: Phil Connor # Contact: contact@mylinux.work # Website: https://mylinux.work # License: MIT +# +# Changes in v2.03: +# - Fixed blocked_total metric type from 
counter to gauge (rolling window) +# - Fixed HTTP response headers (printf with proper \r\n termination) +# - Fixed rule-stats grep prefix-matching wrong feed (word-boundary match) +# - Cached iptables -L INPUT output once per scrape instead of per-feed +# - Removed unused get_iptables_rule_stats helper function +# - Used FEEDS_ARRAY instead of grep on config file for ipset status check +# - Removed unnecessary sync call in textfile output path +# +# Changes in v2.02: +# - Added journal data caching (single journalctl call instead of per-feed) +# - Added feeds config caching into array +# - Fixed ipset member counting to use Members: section +# - Added SCRIPT_VERSION variable for version strings +# - Added scrape timestamp metric +# - Fixed hardcoded version in info metric and HTML page ################################################################################ # Ensure PATH includes sbin (for ipset/iptables when run from cron) @@ -38,14 +55,18 @@ FEEDS_CONFIG="$CONFIG_DIR/feeds.conf" IPSET_PREFIX="iptables-feed" WHITELIST_IPSET="iptables-whitelist" WHITELIST_IPSET_V6="iptables-whitelist-v6" -LOG_FILE="/var/log/iptables-threats.log" TEXTFILE_DIR="/var/lib/node_exporter" OUTPUT_FILE="" HTTP_MODE=false HTTP_PORT=9419 -SCRIPT_START_TIME=$(date +%s) LOCK_FILE="/var/run/iptables-blocklist-metrics.lock" +SCRIPT_VERSION="2.03" + +# Global cache variables +JOURNAL_1H="" +JOURNAL_24H="" +FEEDS_ARRAY=() show_usage() { cat </dev/null | grep '\[THREAT' || echo "") + JOURNAL_24H=$(timeout 30 journalctl -k --since "24 hours ago" 2>/dev/null | grep '\[THREAT' || echo "") +} + +# Parse feeds config ONCE into array +cache_feeds_config() { + FEEDS_ARRAY=() + if [ -f "$FEEDS_CONFIG" ]; then + while IFS='|' read -r enabled name url type description; do + [[ "$enabled" =~ ^#.*$ ]] && continue + [[ -z "$enabled" ]] && continue + FEEDS_ARRAY+=("$enabled|$name|$url|$type|$description") + done < "$FEEDS_CONFIG" + fi +} + get_ipset_size() { local ipset_name="$1" - local size - 
size=$(ipset list "$ipset_name" 2>/dev/null | grep '^[0-9a-fA-F.:]' | wc -l 2>/dev/null) - echo "${size:-0}" + local count + count=$(ipset list "$ipset_name" 2>/dev/null | sed -n '/^Members:$/,$p' | tail -n +2 | wc -l) + echo "${count:-0}" } get_feed_blocks() { local feed="$1" local period="$2" + local data + + case "$period" in + "1 hour ago") data="$JOURNAL_1H" ;; + "24 hours ago") data="$JOURNAL_24H" ;; + *) echo 0; return ;; + esac + + if [ -z "$data" ]; then echo 0; return; fi local count - count=$(journalctl -k --since "$period" 2>/dev/null | grep "\[THREAT:${feed}\]" | wc -l 2>/dev/null) + count=$(printf '%s' "$data" | grep -c "\[THREAT:${feed}\]" 2>/dev/null) echo "${count:-0}" } get_feed_blocks_v6() { local feed="$1" local period="$2" + local data + + case "$period" in + "1 hour ago") data="$JOURNAL_1H" ;; + "24 hours ago") data="$JOURNAL_24H" ;; + *) echo 0; return ;; + esac + + if [ -z "$data" ]; then echo 0; return; fi local count - count=$(journalctl -k --since "$period" 2>/dev/null | grep "\[THREAT-v6:${feed}\]" | wc -l 2>/dev/null) + count=$(printf '%s' "$data" | grep -c "\[THREAT-v6:${feed}\]" 2>/dev/null) echo "${count:-0}" } @@ -128,13 +185,6 @@ get_cache_age() { fi } -get_iptables_rule_stats() { - local chain="$1" - local feed="$2" - # Extract packet and byte counts from iptables -L -v -n -x (exact numbers, no human-readable K/M/G) - iptables -L "$chain" -v -n -x 2>/dev/null | grep "${IPSET_PREFIX}-${feed}" | head -1 | awk '{print $1"|"$2}' -} - get_total_unique_ips() { local ip_version="$1" local count=0 @@ -189,7 +239,8 @@ get_total_cache_size() { acquire_lock() { if [ -f "$LOCK_FILE" ]; then - local pid=$(cat "$LOCK_FILE" 2>/dev/null) + local pid + pid=$(cat "$LOCK_FILE" 2>/dev/null) if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then echo "ERROR: Another instance is already running (PID: $pid)" >&2 exit 1 @@ -207,17 +258,20 @@ cleanup() { } generate_metrics() { - local start_time=$(date +%s) - local current_time=$(date +%s) + local 
start_time + start_time=$(date +%s) + + local iptables_input_stats + iptables_input_stats=$(iptables -L INPUT -v -n -x 2>/dev/null) cat </dev/null || echo 0) +iptables_blocklist_enabled_feeds $(printf '%s\n' "${FEEDS_ARRAY[@]}" | grep -c '^1|') # HELP iptables_blocklist_ipset_size Number of IPs per feed ipset # TYPE iptables_blocklist_ipset_size gauge @@ -226,7 +280,7 @@ EOF # Only export metrics for ipsets that actually exist for ipset_name in $(ipset list -n 2>/dev/null | grep "^${IPSET_PREFIX}-"); do # Extract feed name and IP version - local feed_name="${ipset_name#${IPSET_PREFIX}-}" + local feed_name="${ipset_name#"${IPSET_PREFIX}"-}" local ip_version="4" if [[ "$feed_name" =~ -v6$ ]]; then @@ -236,39 +290,37 @@ EOF # Get status from config local status="disabled" - if grep -q "^1|${feed_name}|" "$FEEDS_CONFIG" 2>/dev/null; then + if printf '%s\n' "${FEEDS_ARRAY[@]}" | grep -q "^1|${feed_name}|" 2>/dev/null; then status="enabled" fi - local size=$(get_ipset_size "$ipset_name") + local size + size=$(get_ipset_size "$ipset_name") echo "iptables_blocklist_ipset_size{feed=\"$feed_name\",ip_version=\"$ip_version\",status=\"$status\"} $size" done cat </dev/null; then - effectiveness_v4=$(awk "BEGIN {printf \"%.2f\", ($blocks_v4 / $ipset_size) * 1000}" 2>/dev/null || echo "0") - effectiveness_v6=$(awk "BEGIN {printf \"%.2f\", ($blocks_v6 / $ipset_size) * 1000}" 2>/dev/null || echo "0") - else - effectiveness_v4="0" - effectiveness_v6="0" - fi - - echo "iptables_blocklist_effectiveness{feed=\"$name\",ip_version=\"4\"} $effectiveness_v4" - echo "iptables_blocklist_effectiveness{feed=\"$name\",ip_version=\"6\"} $effectiveness_v6" - done < "$FEEDS_CONFIG" - fi + for feed_line in "${FEEDS_ARRAY[@]}"; do + IFS='|' read -r enabled name url type description <<< "$feed_line" + [ "$enabled" != "1" ] && continue + + local ipset_size blocks_v4 blocks_v6 effectiveness_v4 effectiveness_v6 + ipset_size=$(get_ipset_size "${IPSET_PREFIX}-${name}") + blocks_v4=$(get_feed_blocks 
"$name" "24 hours ago") + blocks_v6=$(get_feed_blocks_v6 "$name" "24 hours ago") + + # Strip whitespace and ensure integers + ipset_size=$(echo "$ipset_size" | tr -d '\n' | tr -d ' ') + blocks_v4=$(echo "$blocks_v4" | tr -d '\n' | tr -d ' ') + blocks_v6=$(echo "$blocks_v6" | tr -d '\n' | tr -d ' ') + ipset_size=${ipset_size:-0} + blocks_v4=${blocks_v4:-0} + blocks_v6=${blocks_v6:-0} + + if [ "$ipset_size" -gt 0 ] 2>/dev/null; then + effectiveness_v4=$(awk "BEGIN {printf \"%.2f\", ($blocks_v4 / $ipset_size) * 1000}" 2>/dev/null || echo "0") + effectiveness_v6=$(awk "BEGIN {printf \"%.2f\", ($blocks_v6 / $ipset_size) * 1000}" 2>/dev/null || echo "0") + else + effectiveness_v4="0" + effectiveness_v6="0" + fi + + echo "iptables_blocklist_effectiveness{feed=\"$name\",ip_version=\"4\"} $effectiveness_v4" + echo "iptables_blocklist_effectiveness{feed=\"$name\",ip_version=\"6\"} $effectiveness_v6" + done # Feed update/cache metrics cat </dev/null; then - ratio_v4=$(awk "BEGIN {printf \"%.4f\", $v4_size / $total}" 2>/dev/null || echo "0") - ratio_v6=$(awk "BEGIN {printf \"%.4f\", $v6_size / $total}" 2>/dev/null || echo "0") - else - ratio_v4="0" - ratio_v6="0" - fi - - echo "iptables_blocklist_ip_version_ratio{feed=\"$name\",version=\"4\"} $ratio_v4" - echo "iptables_blocklist_ip_version_ratio{feed=\"$name\",version=\"6\"} $ratio_v6" - done < "$FEEDS_CONFIG" - fi + for feed_line in "${FEEDS_ARRAY[@]}"; do + IFS='|' read -r enabled name url type description <<< "$feed_line" + + local v4_size v6_size total ratio_v4 ratio_v6 + v4_size=$(get_ipset_size "${IPSET_PREFIX}-${name}") + v6_size=$(get_ipset_size "${IPSET_PREFIX}-${name}-v6") + + v4_size=${v4_size:-0} + v6_size=${v6_size:-0} + total=$((v4_size + v6_size)) + + if [ "$total" -gt 0 ] 2>/dev/null; then + ratio_v4=$(awk "BEGIN {printf \"%.4f\", $v4_size / $total}" 2>/dev/null || echo "0") + ratio_v6=$(awk "BEGIN {printf \"%.4f\", $v6_size / $total}" 2>/dev/null || echo "0") + else + ratio_v4="0" + ratio_v6="0" + fi + + echo 
"iptables_blocklist_ip_version_ratio{feed=\"$name\",version=\"4\"} $ratio_v4" + echo "iptables_blocklist_ip_version_ratio{feed=\"$name\",version=\"6\"} $ratio_v6" + done # Total metrics cat </dev/null | wc -l) # TYPE iptables_blocklist_rule_packets counter EOF - if [ -f "$FEEDS_CONFIG" ]; then - while IFS='|' read -r enabled name url type description; do - [[ "$enabled" =~ ^#.*$ ]] && continue - [[ -z "$enabled" ]] && continue - [ "$enabled" != "1" ] && continue - - local stats_log stats_drop packets_log bytes_log packets_drop bytes_drop - - stats_log=$(iptables -L INPUT -v -n -x 2>/dev/null | grep "${IPSET_PREFIX}-${name}" | grep LOG | head -1 | awk '{print $1"|"$2}') - stats_drop=$(iptables -L INPUT -v -n -x 2>/dev/null | grep "${IPSET_PREFIX}-${name}" | grep DROP | head -1 | awk '{print $1"|"$2}') - - if [ -n "$stats_log" ]; then - packets_log=$(echo "$stats_log" | cut -d'|' -f1) - bytes_log=$(echo "$stats_log" | cut -d'|' -f2) - echo "iptables_blocklist_rule_packets{feed=\"$name\",ip_version=\"4\",action=\"log\"} ${packets_log:-0}" - fi - - if [ -n "$stats_drop" ]; then - packets_drop=$(echo "$stats_drop" | cut -d'|' -f1) - bytes_drop=$(echo "$stats_drop" | cut -d'|' -f2) - echo "iptables_blocklist_rule_packets{feed=\"$name\",ip_version=\"4\",action=\"drop\"} ${packets_drop:-0}" - fi - done < "$FEEDS_CONFIG" - fi + for feed_line in "${FEEDS_ARRAY[@]}"; do + IFS='|' read -r enabled name url type description <<< "$feed_line" + [ "$enabled" != "1" ] && continue + + local stats_log stats_drop packets_log bytes_log packets_drop bytes_drop + + stats_log=$(echo "$iptables_input_stats" | grep " ${IPSET_PREFIX}-${name} " | grep LOG | head -1 | awk '{print $1"|"$2}') + stats_drop=$(echo "$iptables_input_stats" | grep " ${IPSET_PREFIX}-${name} " | grep DROP | head -1 | awk '{print $1"|"$2}') + + if [ -n "$stats_log" ]; then + packets_log=$(echo "$stats_log" | cut -d'|' -f1) + bytes_log=$(echo "$stats_log" | cut -d'|' -f2) + echo 
"iptables_blocklist_rule_packets{feed=\"$name\",ip_version=\"4\",action=\"log\"} ${packets_log:-0}" + fi + + if [ -n "$stats_drop" ]; then + packets_drop=$(echo "$stats_drop" | cut -d'|' -f1) + bytes_drop=$(echo "$stats_drop" | cut -d'|' -f2) + echo "iptables_blocklist_rule_packets{feed=\"$name\",ip_version=\"4\",action=\"drop\"} ${packets_drop:-0}" + fi + done cat </dev/null | grep "${IPSET_PREFIX}-${name}" | grep LOG | head -1 | awk '{print $1"|"$2}') - stats_drop=$(iptables -L INPUT -v -n -x 2>/dev/null | grep "${IPSET_PREFIX}-${name}" | grep DROP | head -1 | awk '{print $1"|"$2}') - - if [ -n "$stats_log" ]; then - packets_log=$(echo "$stats_log" | cut -d'|' -f1) - bytes_log=$(echo "$stats_log" | cut -d'|' -f2) - echo "iptables_blocklist_rule_bytes{feed=\"$name\",ip_version=\"4\",action=\"log\"} ${bytes_log:-0}" - fi - - if [ -n "$stats_drop" ]; then - packets_drop=$(echo "$stats_drop" | cut -d'|' -f1) - bytes_drop=$(echo "$stats_drop" | cut -d'|' -f2) - echo "iptables_blocklist_rule_bytes{feed=\"$name\",ip_version=\"4\",action=\"drop\"} ${bytes_drop:-0}" - fi - done < "$FEEDS_CONFIG" - fi + for feed_line in "${FEEDS_ARRAY[@]}"; do + IFS='|' read -r enabled name url type description <<< "$feed_line" + [ "$enabled" != "1" ] && continue + + local stats_log stats_drop packets_log bytes_log packets_drop bytes_drop + + stats_log=$(echo "$iptables_input_stats" | grep " ${IPSET_PREFIX}-${name} " | grep LOG | head -1 | awk '{print $1"|"$2}') + stats_drop=$(echo "$iptables_input_stats" | grep " ${IPSET_PREFIX}-${name} " | grep DROP | head -1 | awk '{print $1"|"$2}') + + if [ -n "$stats_log" ]; then + packets_log=$(echo "$stats_log" | cut -d'|' -f1) + bytes_log=$(echo "$stats_log" | cut -d'|' -f2) + echo "iptables_blocklist_rule_bytes{feed=\"$name\",ip_version=\"4\",action=\"log\"} ${bytes_log:-0}" + fi + + if [ -n "$stats_drop" ]; then + packets_drop=$(echo "$stats_drop" | cut -d'|' -f1) + bytes_drop=$(echo "$stats_drop" | cut -d'|' -f2) + echo 
"iptables_blocklist_rule_bytes{feed=\"$name\",ip_version=\"4\",action=\"drop\"} ${bytes_drop:-0}" + fi + done cat <iptables Blocklist Metrics Exporter" + printf "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n" + echo "

iptables Blocklist Metrics Exporter v${SCRIPT_VERSION}

" echo "

Per-feed threat blocking statistics

" echo "

Metrics

" fi @@ -598,12 +632,17 @@ main() { if [ "$HTTP_MODE" = true ]; then run_http_server elif [ -n "$OUTPUT_FILE" ]; then + # Cache data before generating metrics + cache_journal_data + cache_feeds_config + # Ensure output directory exists mkdir -p "$(dirname "$OUTPUT_FILE")" # Create temp file in /tmp (not in node_exporter directory!) # This prevents node_exporter from seeing partial writes - local temp_file=$(mktemp /tmp/iptables_metrics.XXXXXX) + local temp_file + temp_file=$(mktemp /tmp/iptables_metrics.XXXXXX) # Generate metrics to temp file generate_metrics > "$temp_file" @@ -617,10 +656,9 @@ main() { # Ensure node_exporter user can read it chmod 644 "$OUTPUT_FILE" - - # Force filesystem sync (optional but helps) - sync else + cache_journal_data + cache_feeds_config generate_metrics fi } diff --git a/iptables-blocklists.sh b/iptables-blocklists.sh index 537d8a0..4980dc3 100755 --- a/iptables-blocklists.sh +++ b/iptables-blocklists.sh @@ -1,13 +1,23 @@ #!/bin/bash ################################################################################ # Script Name: iptables-blocklists.sh -# Version: 1.0 +# Version: 1.04 # Description: Per-feed iptables threat intelligence blocking with ipset # Author: Phil Connor # Contact: contact@mylinux.work # Website: https://mylinux.work # License: MIT ################################################################################ +# Changelog: +# v1.04 - Fix IPv6 parser: reject timestamps (e.g. 14:34:21) misidentified +# as IPv6 addresses by requiring hex letters, 3+ groups, or :: +# v1.03 - Fix remove-feed/disable-feed: delete iptables rules before destroying +# ipsets. Remove dangerous ipset destruction of disabled feeds during +# update. Always swap ipsets even when feed is empty (clears stale +# blocks). Use -I (insert) for IPv6 rules instead of -A (append). +# Fix show-stats to count both IPv4+IPv6 blocks with cached journal. +# Add curl requirement check. Fix ipset member counting. 
+################################################################################ # Don't use 'set -e' - causes issues with ipset error handling CONFIG_DIR="/etc/iptables-threats" @@ -45,6 +55,7 @@ COMMANDS: whitelist-add IP Add IP/CIDR to whitelist whitelist-init Initialize whitelist with RFC1918/Docker networks whitelist-list Show all whitelisted IPs + clean-cache Remove cache files for disabled feeds OPTIONS: -h, --help Show this help message @@ -77,7 +88,53 @@ EOF } log_message() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" + local msg + msg="[$(date '+%Y-%m-%d %H:%M:%S')] $1" + echo "$msg" + echo "$msg" >> "$LOG_FILE" 2>/dev/null || true +} + +# Iterate over enabled feeds in $FEEDS_CONFIG, calling the provided callback +# function with arguments: name url type description +# Usage: for_each_enabled_feed my_callback_function +for_each_enabled_feed() { + local callback="$1" + [ -f "$FEEDS_CONFIG" ] || return 0 + + local enabled name url type description + while IFS='|' read -r enabled name url type description; do + [[ "$enabled" =~ ^#.*$ ]] && continue + [[ -z "$enabled" ]] && continue + [ "$enabled" != "1" ] && continue + "$callback" "$name" "$url" "$type" "$description" + done < "$FEEDS_CONFIG" +} + +# Iterate over ALL feeds (enabled + disabled), calling the provided callback +# function with arguments: enabled name url type description +for_each_feed() { + local callback="$1" + [ -f "$FEEDS_CONFIG" ] || return 0 + + local enabled name url type description + while IFS='|' read -r enabled name url type description; do + [[ "$enabled" =~ ^#.*$ ]] && continue + [[ -z "$enabled" ]] && continue + "$callback" "$enabled" "$name" "$url" "$type" "$description" + done < "$FEEDS_CONFIG" +} + +validate_feed_name() { + local name="$1" + if [ -z "$name" ]; then + echo "ERROR: Feed name cannot be empty"; return 1 + fi + if [[ ! 
"$name" =~ ^[a-zA-Z0-9_-]+$ ]]; then + echo "ERROR: Feed name '$name' contains invalid characters (only a-z, 0-9, _, - allowed)"; return 1 + fi + if [ "${#name}" -gt 20 ]; then + echo "ERROR: Feed name '$name' too long (max 20 chars, ipset name limit)"; return 1 + fi } parse_args() { @@ -89,7 +146,7 @@ parse_args() { --no-auto-update) ENABLE_AUTO_UPDATE=false; shift ;; --no-ipv6) ENABLE_IPV6=false; shift ;; --update-interval) UPDATE_INTERVAL="$2"; shift 2 ;; - install|update|apply-rules|test-rules|list-feeds|show-stats|whitelist-init|whitelist-list) COMMAND="$1"; shift ;; + install|update|apply-rules|test-rules|list-feeds|show-stats|whitelist-init|whitelist-list|clean-cache) COMMAND="$1"; shift ;; add-feed) COMMAND="add-feed"; FEED_NAME="$2"; FEED_URL="$3"; shift 3 ;; remove-feed|enable-feed|disable-feed) COMMAND="$1"; FEED_NAME="$2"; shift 2 ;; whitelist-add) COMMAND="whitelist-add"; WHITELIST_IP="$2"; shift 2 ;; @@ -115,6 +172,10 @@ check_requirements() { exit 1 fi fi + + command -v curl >/dev/null 2>&1 || { echo "ERROR: curl required"; exit 1; } + + ensure_ipsets_exist } create_directory_structure() { @@ -137,16 +198,29 @@ cleanup_old_backups() { } initialize_feeds_config() { - [ -f "$FEEDS_CONFIG" ] && return - + local has_feeds + has_feeds=$(grep -c '^[01]|' "$FEEDS_CONFIG" 2>/dev/null || true) + + if [ -f "$FEEDS_CONFIG" ] && [ "$has_feeds" -gt 0 ]; then + log_message "Feeds configuration already exists with $has_feeds feeds" + return + fi + + log_message "Creating feeds configuration..." 
+ + [ -f "$FEEDS_CONFIG" ] && mv "$FEEDS_CONFIG" "${FEEDS_CONFIG}.old-$(date +%Y%m%d-%H%M%S)" + cat > "$FEEDS_CONFIG" <<'EOF' # Threat Intelligence Feeds Configuration # Format: ENABLED|NAME|URL|TYPE|DESCRIPTION # # ENABLED: 1 (enabled) or 0 (disabled) # NAME: Unique feed identifier -# URL: Feed URL -# TYPE: Format type (plain, cidr, commented, custom) +# URL: Feed URL (http/https) or local file (file:///path/to/file) +# TYPE: Format type (plain, cidr, commented) +# plain - One IP/CIDR per line, no comments +# cidr - IP/CIDR with optional inline comments/fields +# commented - Lines starting with # or ; are ignored, IPs extracted # DESCRIPTION: Feed description 1|cinsarmy|http://cinsscore.com/list/ci-badguys.txt|plain|CINS Army Malicious IPs @@ -181,95 +255,144 @@ EOF chmod 600 "$FEEDS_CONFIG" } +_ensure_feed_ipset() { + local name="$1" + + ipset list "${IPSET_PREFIX}-${name}" >/dev/null 2>&1 || \ + ipset create "${IPSET_PREFIX}-${name}" hash:net family inet hashsize 4096 maxelem 200000 2>/dev/null || true + + if [ "$ENABLE_IPV6" = true ]; then + ipset list "${IPSET_PREFIX}-${name}-v6" >/dev/null 2>&1 || \ + ipset create "${IPSET_PREFIX}-${name}-v6" hash:net family inet6 hashsize 4096 maxelem 200000 2>/dev/null || true + fi +} + +ensure_ipsets_exist() { + if [ -f /etc/sysconfig/ipset ]; then + ipset restore -f /etc/sysconfig/ipset 2>/dev/null || true + elif [ -f /etc/iptables/ipsets ]; then + ipset restore -f /etc/iptables/ipsets 2>/dev/null || true + fi + + ipset list "$WHITELIST_IPSET" >/dev/null 2>&1 || \ + ipset create "$WHITELIST_IPSET" hash:net family inet hashsize 1024 maxelem 10000 2>/dev/null || true + + if [ "$ENABLE_IPV6" = true ]; then + ipset list "$WHITELIST_IPSET_V6" >/dev/null 2>&1 || \ + ipset create "$WHITELIST_IPSET_V6" hash:net family inet6 hashsize 1024 maxelem 10000 2>/dev/null || true + fi + + for_each_enabled_feed _ensure_feed_ipset +} + setup_ipsets() { log_message "Setting up per-feed ipsets..." - - # Whitelist + if ! 
ipset list "$WHITELIST_IPSET" >/dev/null 2>&1; then ipset create "$WHITELIST_IPSET" hash:net family inet hashsize 1024 maxelem 10000 ipset add "$WHITELIST_IPSET" 127.0.0.1 2>/dev/null || true fi - + if [ "$ENABLE_IPV6" = true ] && ! ipset list "$WHITELIST_IPSET_V6" >/dev/null 2>&1; then ipset create "$WHITELIST_IPSET_V6" hash:net family inet6 hashsize 1024 maxelem 10000 ipset add "$WHITELIST_IPSET_V6" ::1 2>/dev/null || true fi - - # Create ipset per feed - while IFS='|' read -r enabled name url type description; do - [[ "$enabled" =~ ^#.*$ ]] && continue - [[ -z "$enabled" ]] && continue - [ "$enabled" != "1" ] && continue - - if ! ipset list "${IPSET_PREFIX}-${name}" >/dev/null 2>&1; then - ipset create "${IPSET_PREFIX}-${name}" hash:net family inet hashsize 4096 maxelem 200000 - fi - - if [ "$ENABLE_IPV6" = true ] && ! ipset list "${IPSET_PREFIX}-${name}-v6" >/dev/null 2>&1; then - ipset create "${IPSET_PREFIX}-${name}-v6" hash:net family inet6 hashsize 4096 maxelem 200000 - fi - done < "$FEEDS_CONFIG" + + for_each_enabled_feed _ensure_feed_ipset } download_feed() { - curl -f -s -m 30 -L "$1" -o "$2" 2>/dev/null + local url="$1" output="$2" + + # Local file support: file:///path or file://path + if [[ "$url" == file://* ]]; then + local local_path="${url#file://}" + if [ ! -f "$local_path" ]; then + log_message " Local file not found: $local_path" + return 1 + fi + cp "$local_path" "$output" 2>/dev/null || return 1 + return 0 + fi + + local http_code + http_code=$(curl -f -s -m 60 --connect-timeout 10 -L \ + -A "iptables-threat-feeds-per-feed/1.0" \ + -w "%{http_code}" -o "$output" "$url" 2>/dev/null) || true + + if [ ! 
-s "$output" ]; then + log_message " Download failed for $url (HTTP $http_code, empty response)" + return 1 + fi + return 0 } parse_feed() { local file="$1" type="$2" out_v4="$3" out_v6="$4" - true > "$out_v4" - true > "$out_v6" - + + : > "$out_v4" + : > "$out_v6" + + local cleaned + cleaned=$(mktemp) + tr -d '\r' < "$file" > "$cleaned" + + # IPv6 filter: require either a hex letter [a-fA-F], or 3+ colon-separated + # groups, or ::. This excludes timestamps like 14:34:21 which only have + # digits and exactly two colon-separated groups. + local v6_filter='([a-fA-F]|:.*:.*:|::)' + case "$type" in plain) - grep -E '^[0-9.]+(/[0-9]+)?$' "$file" >> "$out_v4" 2>/dev/null || true - [ "$ENABLE_IPV6" = true ] && grep -E '^[0-9a-fA-F:]+(/[0-9]+)?$' "$file" | grep ':' >> "$out_v6" 2>/dev/null || true + grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?$' "$cleaned" >> "$out_v4" 2>/dev/null || true + if [ "$ENABLE_IPV6" = true ]; then + grep -E '^[0-9a-fA-F:]+(/[0-9]+)?$' "$cleaned" | grep ':' \ + | grep -E "$v6_filter" >> "$out_v6" 2>/dev/null || true + fi ;; cidr) - grep -E '^[0-9.]+' "$file" | cut -d' ' -f1 | cut -d'#' -f1 | grep -v '^$' >> "$out_v4" 2>/dev/null || true - [ "$ENABLE_IPV6" = true ] && grep -E '^[0-9a-fA-F:]+' "$file" | grep ':' | cut -d' ' -f1 | cut -d'#' -f1 >> "$out_v6" 2>/dev/null || true + grep -oE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?' "$cleaned" \ + | grep -v '^$' >> "$out_v4" 2>/dev/null || true + if [ "$ENABLE_IPV6" = true ]; then + grep -oE '^[0-9a-fA-F:]+(/[0-9]+)?' "$cleaned" \ + | grep ':' | grep -E "$v6_filter" | grep -v '^$' >> "$out_v6" 2>/dev/null || true + fi ;; commented) - grep -v -E '^[#;]|^$' "$file" | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?' >> "$out_v4" 2>/dev/null || true - [ "$ENABLE_IPV6" = true ] && grep -v -E '^[#;]|^$' "$file" | grep -oE '[0-9a-fA-F:]+(/[0-9]+)?' 
| grep -E '^[0-9a-fA-F]{1,4}:[0-9a-fA-F:]+' >> "$out_v6" 2>/dev/null || true + grep -v -E '^[#;]|^$' "$cleaned" \ + | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?' >> "$out_v4" 2>/dev/null || true + if [ "$ENABLE_IPV6" = true ]; then + grep -v -E '^[#;]|^$' "$cleaned" \ + | grep -oE '[0-9a-fA-F:]+(/[0-9]+)?' \ + | grep -E '^[0-9a-fA-F]{1,4}:[0-9a-fA-F:]+' \ + | grep -E "$v6_filter" >> "$out_v6" 2>/dev/null || true + fi ;; esac + + rm -f "$cleaned" } update_feeds() { log_message "Starting per-feed update (FAST ipset restore mode)..." # Auto-cleanup cache and ipsets for disabled feeds - local enabled_feeds=$(grep '^1|' "$FEEDS_CONFIG" 2>/dev/null | cut -d'|' -f2) + local enabled_feeds + enabled_feeds=$(grep '^1|' "$FEEDS_CONFIG" 2>/dev/null | cut -d'|' -f2) local cleaned_cache=0 - local cleaned_ipsets=0 # Clean cache files for cache_file in "$CACHE_DIR"/*.raw "$CACHE_DIR"/*-v4.parsed "$CACHE_DIR"/*-v6.parsed "$CACHE_DIR"/*-v4.restore "$CACHE_DIR"/*-v6.restore; do [ -f "$cache_file" ] || continue - local bn=$(basename "$cache_file") + local bn + bn=$(basename "$cache_file") local fn="${bn%%.raw}"; fn="${fn%%-v4.parsed}"; fn="${fn%%-v6.parsed}"; fn="${fn%%-v4.restore}"; fn="${fn%%-v6.restore}" if ! 
echo "$enabled_feeds" | grep -q "^${fn}$"; then rm -f "$cache_file" && cleaned_cache=$((cleaned_cache + 1)) fi done - # Clean ipsets for disabled feeds - while IFS='|' read -r enabled name url type description; do - [[ "$enabled" =~ ^#.*$ ]] && continue - [[ -z "$enabled" ]] && continue - [ "$enabled" = "1" ] && continue - - if ipset list "${IPSET_PREFIX}-${name}" >/dev/null 2>&1; then - ipset destroy "${IPSET_PREFIX}-${name}" 2>/dev/null && cleaned_ipsets=$((cleaned_ipsets + 1)) - fi - if ipset list "${IPSET_PREFIX}-${name}-v6" >/dev/null 2>&1; then - ipset destroy "${IPSET_PREFIX}-${name}-v6" 2>/dev/null && cleaned_ipsets=$((cleaned_ipsets + 1)) - fi - done < "$FEEDS_CONFIG" - [ "$cleaned_cache" -gt 0 ] && log_message " Cleaned $cleaned_cache stale cache files" - [ "$cleaned_ipsets" -gt 0 ] && log_message " Destroyed $cleaned_ipsets stale ipsets" local total=0 failed=0 @@ -290,36 +413,34 @@ update_feeds() { c4=$(wc -l < "$v4" 2>/dev/null || echo 0) [ "$ENABLE_IPV6" = true ] && c6=$(wc -l < "$v6" 2>/dev/null || echo 0) - # FAST IPv4: Use ipset restore - if [ "$c4" -gt 0 ]; then - # Ensure target ipset exists for swap - if ! 
ipset list "${IPSET_PREFIX}-${name}" >/dev/null 2>&1; then - ipset create "${IPSET_PREFIX}-${name}" hash:net family inet hashsize 4096 maxelem 200000 - fi - - { - echo "create ${IPSET_PREFIX}-${name}-tmp hash:net family inet hashsize 4096 maxelem 200000" - echo "flush ${IPSET_PREFIX}-${name}-tmp" - while IFS= read -r ip; do - [ -z "$ip" ] && continue - echo "add ${IPSET_PREFIX}-${name}-tmp $ip" - done < "$v4" - echo "swap ${IPSET_PREFIX}-${name} ${IPSET_PREFIX}-${name}-tmp" - echo "destroy ${IPSET_PREFIX}-${name}-tmp" - } > "$CACHE_DIR/${name}-v4.restore" - - ipset restore < "$CACHE_DIR/${name}-v4.restore" 2>/dev/null || { - log_message " ⚠ Batch load failed for $name IPv4, using fallback" - ipset flush "${IPSET_PREFIX}-${name}" 2>/dev/null || true - while IFS= read -r ip; do - [ -z "$ip" ] && continue - ipset add "${IPSET_PREFIX}-${name}" "$ip" 2>/dev/null || true - done < "$v4" - } + # FAST IPv4: Use ipset restore (always swap, even if empty, to clear stale entries) + # Ensure target ipset exists for swap + if ! 
ipset list "${IPSET_PREFIX}-${name}" >/dev/null 2>&1; then + ipset create "${IPSET_PREFIX}-${name}" hash:net family inet hashsize 4096 maxelem 200000 fi - # FAST IPv6: Use ipset restore - if [ "$ENABLE_IPV6" = true ] && [ "$c6" -gt 0 ]; then + { + echo "create ${IPSET_PREFIX}-${name}-tmp hash:net family inet hashsize 4096 maxelem 200000" + echo "flush ${IPSET_PREFIX}-${name}-tmp" + while IFS= read -r ip; do + [ -z "$ip" ] && continue + echo "add ${IPSET_PREFIX}-${name}-tmp $ip" + done < "$v4" + echo "swap ${IPSET_PREFIX}-${name} ${IPSET_PREFIX}-${name}-tmp" + echo "destroy ${IPSET_PREFIX}-${name}-tmp" + } > "$CACHE_DIR/${name}-v4.restore" + + ipset restore < "$CACHE_DIR/${name}-v4.restore" 2>/dev/null || { + log_message " ⚠ Batch load failed for $name IPv4, using fallback" + ipset flush "${IPSET_PREFIX}-${name}" 2>/dev/null || true + while IFS= read -r ip; do + [ -z "$ip" ] && continue + ipset add "${IPSET_PREFIX}-${name}" "$ip" 2>/dev/null || true + done < "$v4" + } + + # FAST IPv6: Use ipset restore (always swap, even if empty, to clear stale entries) + if [ "$ENABLE_IPV6" = true ]; then # Ensure target ipset exists for swap if ! 
ipset list "${IPSET_PREFIX}-${name}-v6" >/dev/null 2>&1; then ipset create "${IPSET_PREFIX}-${name}-v6" hash:net family inet6 hashsize 4096 maxelem 200000 @@ -391,6 +512,7 @@ apply_iptables_rules() { # Add per-feed rules local line=2 + local v6_line=2 while IFS='|' read -r enabled name url type description; do [[ "$enabled" =~ ^#.*$ ]] && continue [[ -z "$enabled" ]] && continue @@ -404,8 +526,10 @@ apply_iptables_rules() { # IPv6 if [ "$ENABLE_IPV6" = true ]; then - ip6tables -A INPUT -m set --match-set "${IPSET_PREFIX}-${name}-v6" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT-v6:${name}] " - ip6tables -A INPUT -m set --match-set "${IPSET_PREFIX}-${name}-v6" src -j DROP + ip6tables -I INPUT $v6_line -m set --match-set "${IPSET_PREFIX}-${name}-v6" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT-v6:${name}] " + v6_line=$((v6_line + 1)) + ip6tables -I INPUT $v6_line -m set --match-set "${IPSET_PREFIX}-${name}-v6" src -j DROP + v6_line=$((v6_line + 1)) fi done < "$FEEDS_CONFIG" @@ -429,9 +553,11 @@ apply_iptables_rules() { setup_iptables_persistence() { log_message "Setting up iptables persistence..." 
- - # Create systemd service for iptables restore - cat > /etc/systemd/system/iptables-restore.service <<'EOF' + + local script_path + script_path=$(readlink -f "$0") + + cat > /etc/systemd/system/iptables-restore.service </dev/null || ipset restore -f /etc/iptables/ipsets 2>/dev/null || true' +ExecStart=/bin/bash -c 'ipset restore -f /etc/sysconfig/ipset 2>/dev/null || ipset restore -f /etc/iptables/ipsets 2>/dev/null || true; ${script_path} _ensure-ipsets 2>/dev/null || true' ExecStart=/bin/bash -c 'iptables-restore /etc/sysconfig/iptables 2>/dev/null || iptables-restore /etc/iptables/rules.v4 2>/dev/null || true' ExecStart=/bin/bash -c 'ip6tables-restore /etc/sysconfig/ip6tables 2>/dev/null || ip6tables-restore /etc/iptables/rules.v6 2>/dev/null || true' [Install] WantedBy=multi-user.target EOF - + systemctl daemon-reload systemctl enable iptables-restore.service 2>/dev/null || true log_message "✓ iptables persistence configured" @@ -456,7 +582,8 @@ EOF setup_auto_update() { [ "$ENABLE_AUTO_UPDATE" = false ] && return - local script=$(readlink -f "$0") + local script + script=$(readlink -f "$0") cat > /etc/systemd/system/iptables-threat-feeds-update.service </dev/null | grep '\[THREAT' || true) + echo "Per-Feed Blocking Statistics" printf "%-25s %10s %10s %12s\n" "FEED" "IPv4" "IPv6" "BLOCKS(1h)" echo "-------------------------------------------------------------------" @@ -494,35 +624,45 @@ cmd_show_stats() { [[ -z "$enabled" ]] && continue [ "$enabled" != "1" ] && continue - local v4 v6=0 blocks - v4=$(ipset list "${IPSET_PREFIX}-${name}" 2>/dev/null | grep -c '^[0-9.]' || echo 0) - [ "$ENABLE_IPV6" = true ] && v6=$(ipset list "${IPSET_PREFIX}-${name}-v6" 2>/dev/null | grep -c '^[0-9a-fA-F:]' || echo 0) - blocks=$(journalctl -k --since "1 hour ago" 2>/dev/null | grep -c "\[THREAT:${name}\]" || echo 0) + local v4 v6=0 blocks_v4=0 blocks_v6=0 + v4=$(ipset list "${IPSET_PREFIX}-${name}" 2>/dev/null | sed -n '/^Members:$/,$p' | tail -n +2 | wc -l) + [ 
"$ENABLE_IPV6" = true ] && v6=$(ipset list "${IPSET_PREFIX}-${name}-v6" 2>/dev/null | sed -n '/^Members:$/,$p' | tail -n +2 | wc -l) - printf "%-25s %10s %10s %12s\n" "$name" "$v4" "$v6" "$blocks" + if [ -n "$journal_1h" ]; then + blocks_v4=$(printf '%s' "$journal_1h" | grep -c "\[THREAT:${name}\]" 2>/dev/null || true) + blocks_v6=$(printf '%s' "$journal_1h" | grep -c "\[THREAT-v6:${name}\]" 2>/dev/null || true) + fi + local blocks=$(( ${blocks_v4:-0} + ${blocks_v6:-0} )) + + printf "%-25s %10s %10s %12s\n" "$name" "${v4:-0}" "${v6:-0}" "$blocks" done < "$FEEDS_CONFIG" } +_print_feed() { + local enabled="$1" name="$2" _url="$3" _type="$4" description="$5" + printf "%-10s %-25s %s\n" "$([ "$enabled" = "1" ] && echo "ENABLED" || echo "DISABLED")" "$name" "$description" +} + cmd_list_feeds() { printf "%-10s %-25s %s\n" "STATUS" "NAME" "DESC" - while IFS='|' read -r enabled name url type description; do - [[ "$enabled" =~ ^#.*$ ]] && continue - [[ -z "$enabled" ]] && continue - printf "%-10s %-25s %s\n" "$([ "$enabled" = "1" ] && echo "ENABLED" || echo "DISABLED")" "$name" "$description" - done < "$FEEDS_CONFIG" + for_each_feed _print_feed } cmd_whitelist_add() { [ -z "$WHITELIST_IP" ] && { echo "Usage: $0 whitelist-add "; exit 1; } - if echo "$WHITELIST_IP" | grep -q ':'; then - ipset add "$WHITELIST_IPSET_V6" "$WHITELIST_IP" 2>/dev/null && \ - log_message "✓ Added to IPv6 whitelist: $WHITELIST_IP" || \ - { echo "Failed to add $WHITELIST_IP"; exit 1; } + if [[ "$WHITELIST_IP" == *:* ]]; then + if ipset add "$WHITELIST_IPSET_V6" "$WHITELIST_IP" 2>/dev/null; then + log_message "✓ Added to IPv6 whitelist: $WHITELIST_IP" + else + echo "Failed to add $WHITELIST_IP"; exit 1 + fi else - ipset add "$WHITELIST_IPSET" "$WHITELIST_IP" 2>/dev/null && \ - log_message "✓ Added to IPv4 whitelist: $WHITELIST_IP" || \ - { echo "Failed to add $WHITELIST_IP"; exit 1; } + if ipset add "$WHITELIST_IPSET" "$WHITELIST_IP" 2>/dev/null; then + log_message "✓ Added to IPv4 whitelist: 
$WHITELIST_IP" + else + echo "Failed to add $WHITELIST_IP"; exit 1 + fi fi ipset save > /etc/sysconfig/ipset 2>/dev/null || ipset save > /etc/iptables/ipsets 2>/dev/null || true @@ -586,6 +726,7 @@ cmd_whitelist_list() { cmd_add_feed() { [ -z "$FEED_NAME" ] || [ -z "$FEED_URL" ] && { echo "Usage: $0 add-feed "; exit 1; } + validate_feed_name "$FEED_NAME" || exit 1 grep -q "^[01]|${FEED_NAME}|" "$FEEDS_CONFIG" 2>/dev/null && { echo "Feed exists"; exit 1; } echo "1|${FEED_NAME}|${FEED_URL}|plain|Custom: ${FEED_NAME}" >> "$FEEDS_CONFIG" log_message "✓ Added feed: $FEED_NAME" @@ -593,15 +734,23 @@ cmd_add_feed() { cmd_remove_feed() { [ -z "$FEED_NAME" ] && { echo "Usage: $0 remove-feed "; exit 1; } - sed -i "/|${FEED_NAME}|/d" "$FEEDS_CONFIG" - - # Remove ipsets and rules + + # Remove rules for this feed first (while ipsets still exist) + iptables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT:${FEED_NAME}] " 2>/dev/null || true + iptables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}" src -j DROP 2>/dev/null || true + if [ "$ENABLE_IPV6" = true ]; then + ip6tables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}-v6" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT-v6:${FEED_NAME}] " 2>/dev/null || true + ip6tables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}-v6" src -j DROP 2>/dev/null || true + fi + + # Now safe to destroy ipsets ipset destroy "${IPSET_PREFIX}-${FEED_NAME}" 2>/dev/null || true ipset destroy "${IPSET_PREFIX}-${FEED_NAME}-v6" 2>/dev/null || true - + + # Remove from config last + sed -i "/|${FEED_NAME}|/d" "$FEEDS_CONFIG" + log_message "✓ Removed feed: $FEED_NAME" - log_message "Reapplying rules..." 
- apply_iptables_rules } cmd_enable_feed() { @@ -622,15 +771,23 @@ cmd_enable_feed() { cmd_disable_feed() { [ -z "$FEED_NAME" ] && { echo "Usage: $0 disable-feed "; exit 1; } - sed -i "s/^1|${FEED_NAME}|/0|${FEED_NAME}|/" "$FEEDS_CONFIG" - - # Destroy ipsets to clear metrics + + # Remove rules for this feed first (while ipsets still exist) + iptables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT:${FEED_NAME}] " 2>/dev/null || true + iptables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}" src -j DROP 2>/dev/null || true + if [ "$ENABLE_IPV6" = true ]; then + ip6tables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}-v6" src -m limit --limit 5/min -j LOG --log-prefix "[THREAT-v6:${FEED_NAME}] " 2>/dev/null || true + ip6tables -D INPUT -m set --match-set "${IPSET_PREFIX}-${FEED_NAME}-v6" src -j DROP 2>/dev/null || true + fi + + # Now safe to destroy ipsets ipset destroy "${IPSET_PREFIX}-${FEED_NAME}" 2>/dev/null || true ipset destroy "${IPSET_PREFIX}-${FEED_NAME}-v6" 2>/dev/null || true - + + # Mark disabled in config + sed -i "s/^1|${FEED_NAME}|/0|${FEED_NAME}|/" "$FEEDS_CONFIG" + log_message "✓ Disabled: $FEED_NAME" - log_message "Reapplying rules..." - apply_iptables_rules } cmd_install() { @@ -704,19 +861,20 @@ cmd_test_rules() { echo "IPv6 rules that would be created:" echo " 1. Whitelist bypass: -I INPUT 1 -m set --match-set $WHITELIST_IPSET_V6 src -j ACCEPT" - local v6_count=0 + local v6_line=2 while IFS='|' read -r enabled name url type description; do [[ "$enabled" =~ ^#.*$ ]] && continue [[ -z "$enabled" ]] && continue [ "$enabled" != "1" ] && continue - v6_count=$((v6_count + 1)) - echo " $((v6_count * 2)). [${name}] LOG: -A INPUT -m set --match-set ${IPSET_PREFIX}-${name}-v6 src -j LOG" - echo " $((v6_count * 2 + 1)). [${name}] DROP: -A INPUT -m set --match-set ${IPSET_PREFIX}-${name}-v6 src -j DROP" + echo " $v6_line. 
[${name}] LOG: -I INPUT $v6_line -m set --match-set ${IPSET_PREFIX}-${name}-v6 src -j LOG" + v6_line=$((v6_line + 1)) + echo " $v6_line. [${name}] DROP: -I INPUT $v6_line -m set --match-set ${IPSET_PREFIX}-${name}-v6 src -j DROP" + v6_line=$((v6_line + 1)) done < "$FEEDS_CONFIG" echo "" - echo "Total IPv6 rules: $((v6_count * 2 + 1))" + echo "Total IPv6 rules: $((v6_line - 1))" fi echo "" @@ -726,7 +884,44 @@ cmd_test_rules() { echo "==========================================" } +cmd_clean_cache() { + log_message "Cleaning cache for disabled feeds..." + + local removed=0 + local kept=0 + + local enabled_feeds + enabled_feeds=$(grep '^1|' "$FEEDS_CONFIG" 2>/dev/null | cut -d'|' -f2) + + for cache_file in "$CACHE_DIR"/*.raw "$CACHE_DIR"/*-v4.parsed "$CACHE_DIR"/*-v6.parsed "$CACHE_DIR"/*-v4.restore "$CACHE_DIR"/*-v6.restore; do + [ -f "$cache_file" ] || continue + + local bn feed_name + bn=$(basename "$cache_file") + feed_name="${bn%%.raw}" + feed_name="${feed_name%%-v4.parsed}" + feed_name="${feed_name%%-v6.parsed}" + feed_name="${feed_name%%-v4.restore}" + feed_name="${feed_name%%-v6.restore}" + + if ! 
grep -q "^${feed_name}$" <<< "$enabled_feeds"; then + rm -f "$cache_file" + removed=$((removed + 1)) + else + kept=$((kept + 1)) + fi + done + + log_message "Removed $removed cache files, kept $kept active feeds" +} + main() { + # Internal command used by iptables-restore.service at boot + if [ "${1:-}" = "_ensure-ipsets" ]; then + ensure_ipsets_exist + exit 0 + fi + parse_args "$@" case "$COMMAND" in install) cmd_install ;; @@ -751,6 +946,7 @@ main() { whitelist-add) cmd_whitelist_add ;; whitelist-init) cmd_whitelist_init ;; whitelist-list) cmd_whitelist_list ;; + clean-cache) cmd_clean_cache ;; esac } diff --git a/jenkins-backup.sh b/jenkins-backup.sh new file mode 100644 index 0000000..a7a5706 --- /dev/null +++ b/jenkins-backup.sh @@ -0,0 +1,636 @@ +#!/usr/bin/env bash + +######################################################################################### +#### jenkins-backup.sh — Backup and restore Jenkins configuration and jobs #### +#### Supports JENKINS_HOME tar, job XML export, and credential backup #### +#### Requires: bash 4+, tar, curl #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### export JENKINS_HOME="/var/lib/jenkins" #### +#### ./jenkins-backup.sh --backup #### +#### #### +#### See --help for all options. 
#### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +JENKINS_HOME="${JENKINS_HOME:-/var/lib/jenkins}" +BACKUP_DIR="${BACKUP_DIR:-/var/backups/jenkins}" +RETENTION_COUNT="${RETENTION_COUNT:-7}" +JENKINS_URL="${JENKINS_URL:-}" +JENKINS_USER="${JENKINS_USER:-}" +JENKINS_TOKEN="${JENKINS_TOKEN:-}" +BACKUP_TYPE="${BACKUP_TYPE:-full}" +EXCLUDE_PLUGINS="${EXCLUDE_PLUGINS:-false}" +CURL_TIMEOUT="${CURL_TIMEOUT:-30}" +CURL_INSECURE="${CURL_INSECURE:-false}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +RUN_MODE="backup" +RESTORE_DIR="" +DRY_RUN="false" +CONFIG_ONLY="false" +API_BACKUP="false" +TMPDIR_WORK="" +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Cleanup ─────────────────────────────────────────────────────────── +cleanup() { + if [[ -n "$TMPDIR_WORK" && -d "$TMPDIR_WORK" ]]; then + verbose "Cleaning up temp directory: $TMPDIR_WORK" + rm -rf "$TMPDIR_WORK" + fi +} + +trap cleanup EXIT + +# ── Helpers ─────────────────────────────────────────────────────────── +human_size() { 
+ local bytes="$1" + if [[ "$bytes" -ge 1073741824 ]]; then + echo "$(( bytes / 1073741824 ))G" + elif [[ "$bytes" -ge 1048576 ]]; then + echo "$(( bytes / 1048576 ))M" + elif [[ "$bytes" -ge 1024 ]]; then + echo "$(( bytes / 1024 ))K" + else + echo "${bytes}B" + fi +} + +elapsed_time() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + local mins=$(( duration / 60 )) + local secs=$(( duration % 60 )) + if [[ "$mins" -gt 0 ]]; then + echo "${mins}m ${secs}s" + else + echo "${secs}s" + fi +} + +jenkins_api() { + local endpoint="$1" + local output="${2:-}" + local url="${JENKINS_URL%/}${endpoint}" + local curl_opts=(-s -S --max-time "$CURL_TIMEOUT" -f) + + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + + if [[ -n "$JENKINS_USER" && -n "$JENKINS_TOKEN" ]]; then + curl_opts+=(-u "${JENKINS_USER}:${JENKINS_TOKEN}") + fi + + if [[ -n "$output" ]]; then + curl "${curl_opts[@]}" "$url" -o "$output" + else + curl "${curl_opts[@]}" "$url" + fi +} + +# ── Show Help ───────────────────────────────────────────────────────── +show_help() { + cat << EOF +Usage: $SCRIPT_NAME [OPTIONS] + +Backup and restore Jenkins configuration, jobs, credentials, and plugins. +Supports full JENKINS_HOME archives, config-only snapshots, and remote +API exports with automatic retention. 
+ +OPTIONS: + --backup Run a backup (default) + --config-only Backup configuration files only (no workspaces/builds) + --api-backup Export job configs via Jenkins API (remote, no filesystem) + --restore DIR Restore from the specified backup directory or archive + --dry-run With --restore, show what would be restored without changes + --list List available backups + --verify DIR Verify backup integrity via checksums + --help, -h Show this help message + +ENVIRONMENT VARIABLES: + JENKINS_HOME Jenkins home directory (default: /var/lib/jenkins) + BACKUP_DIR Root backup directory (default: /var/backups/jenkins) + RETENTION_COUNT Number of backups to retain (default: 7) + JENKINS_URL Jenkins base URL (required for --api-backup) + JENKINS_USER Jenkins username (required for --api-backup) + JENKINS_TOKEN Jenkins API token (required for --api-backup) + BACKUP_TYPE Backup type: full, config, api (default: full) + EXCLUDE_PLUGINS Skip plugin JPI files in config backup (default: false) + CURL_TIMEOUT HTTP timeout in seconds (default: 30) + CURL_INSECURE Allow self-signed certs (default: false) + VERBOSE Enable verbose output (default: false) + COLOR Color output: auto, always, never (default: auto) + +EXAMPLES: + # Full JENKINS_HOME backup + JENKINS_HOME=/var/lib/jenkins ./jenkins-backup.sh --backup + + # Config-only backup (XML configs + plugins, no build data) + ./jenkins-backup.sh --config-only + + # Remote API backup (no filesystem access needed) + JENKINS_URL=http://localhost:8080 JENKINS_USER=admin JENKINS_TOKEN=xxxx \\ + ./jenkins-backup.sh --api-backup + + # Restore from a backup + ./jenkins-backup.sh --restore /var/backups/jenkins/20260404-120000-full + + # Dry-run restore + ./jenkins-backup.sh --restore /var/backups/jenkins/20260404-120000-full --dry-run + + # List available backups + ./jenkins-backup.sh --list + + # Verify a backup + ./jenkins-backup.sh --verify /var/backups/jenkins/20260404-120000-full + +EOF + exit 0 +} + +# ── Parse Arguments 
─────────────────────────────────────────────────── +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --backup) RUN_MODE="backup"; shift ;; + --config-only) RUN_MODE="backup"; CONFIG_ONLY="true"; BACKUP_TYPE="config"; shift ;; + --api-backup) RUN_MODE="backup"; API_BACKUP="true"; BACKUP_TYPE="api"; shift ;; + --restore) + RUN_MODE="restore" + if [[ $# -lt 2 ]]; then + err "--restore requires a directory argument" + exit 1 + fi + RESTORE_DIR="$2"; shift 2 + ;; + --dry-run) DRY_RUN="true"; shift ;; + --list) RUN_MODE="list"; shift ;; + --verify) + RUN_MODE="verify" + if [[ $# -lt 2 ]]; then + err "--verify requires a directory argument" + exit 1 + fi + RESTORE_DIR="$2"; shift 2 + ;; + --help|-h) show_help ;; + *) + err "Unknown option: $1" + echo "Run '$SCRIPT_NAME --help' for usage." >&2 + exit 1 + ;; + esac + done +} + +# ── Validation ──────────────────────────────────────────────────────── +validate_backup() { + if [[ "$API_BACKUP" == "true" ]]; then + if [[ -z "$JENKINS_URL" ]]; then + err "JENKINS_URL is required for --api-backup" + exit 1 + fi + if [[ -z "$JENKINS_USER" || -z "$JENKINS_TOKEN" ]]; then + err "JENKINS_USER and JENKINS_TOKEN are required for --api-backup" + exit 1 + fi + if ! command -v curl &>/dev/null; then + err "curl is required but not installed" + exit 1 + fi + else + if [[ ! -d "$JENKINS_HOME" ]]; then + err "JENKINS_HOME does not exist: $JENKINS_HOME" + exit 1 + fi + for cmd in tar sha256sum; do + if ! 
command -v "$cmd" &>/dev/null; then + err "$cmd is required but not installed" + exit 1 + fi + done + fi + + mkdir -p "$BACKUP_DIR" +} + +# ── Full Backup ─────────────────────────────────────────────────────── +do_full_backup() { + local timestamp + timestamp=$(date +%Y%m%d-%H%M%S) + local dest="$BACKUP_DIR/${timestamp}-full" + local archive="$dest/jenkins-full-${timestamp}.tar.gz" + + mkdir -p "$dest" + log "Starting full backup of $JENKINS_HOME" + verbose "Destination: $dest" + + local tar_excludes=( + --exclude='workspace' + --exclude='*/builds/*/archive' + --exclude='*.log' + ) + + tar czf "$archive" \ + "${tar_excludes[@]}" \ + -C "$(dirname "$JENKINS_HOME")" \ + "$(basename "$JENKINS_HOME")" 2>/dev/null || { + err "Failed to create tar archive" + rm -rf "$dest" + exit 1 + } + + sha256sum "$archive" > "$dest/checksums.sha256" + echo "full" > "$dest/.backup-type" + + local size + size=$(stat -c%s "$archive" 2>/dev/null || stat -f%z "$archive" 2>/dev/null || echo 0) + log "Full backup complete: $(human_size "$size") in $(elapsed_time)" + log "Archive: $archive" +} + +# ── Config-Only Backup ──────────────────────────────────────────────── +do_config_backup() { + local timestamp + timestamp=$(date +%Y%m%d-%H%M%S) + local dest="$BACKUP_DIR/${timestamp}-config" + local archive="$dest/jenkins-config-${timestamp}.tar.gz" + + mkdir -p "$dest" + log "Starting config-only backup of $JENKINS_HOME" + verbose "Destination: $dest" + + TMPDIR_WORK=$(mktemp -d) + local staging="$TMPDIR_WORK/jenkins-config" + mkdir -p "$staging" + + # Core config files + for f in config.xml credentials.xml hudson.model.UpdateCenter.xml jenkins.model.JenkinsLocationConfiguration.xml; do + if [[ -f "$JENKINS_HOME/$f" ]]; then + cp "$JENKINS_HOME/$f" "$staging/" + verbose " Copied $f" + fi + done + + # Directories: users, secrets, nodes + for d in users secrets nodes; do + if [[ -d "$JENKINS_HOME/$d" ]]; then + cp -r "$JENKINS_HOME/$d" "$staging/" + verbose " Copied $d/" + fi + done + + # Job 
configs (config.xml only) + if [[ -d "$JENKINS_HOME/jobs" ]]; then + find "$JENKINS_HOME/jobs" -name config.xml -type f | while IFS= read -r jobxml; do + local relpath + relpath="${jobxml#"$JENKINS_HOME"/}" + local target_dir + target_dir="$staging/$(dirname "$relpath")" + mkdir -p "$target_dir" + cp "$jobxml" "$target_dir/" + done + verbose " Copied job configs" + fi + + # Plugin JPI files + if [[ "$EXCLUDE_PLUGINS" != "true" && -d "$JENKINS_HOME/plugins" ]]; then + mkdir -p "$staging/plugins" + find "$JENKINS_HOME/plugins" -maxdepth 1 -name '*.jpi' -exec cp {} "$staging/plugins/" \; + local plugin_count + plugin_count=$(find "$staging/plugins" -name '*.jpi' 2>/dev/null | wc -l) + verbose " Copied $plugin_count plugin files" + fi + + tar czf "$archive" -C "$TMPDIR_WORK" "jenkins-config" 2>/dev/null || { + err "Failed to create config archive" + rm -rf "$dest" + exit 1 + } + + sha256sum "$archive" > "$dest/checksums.sha256" + echo "config" > "$dest/.backup-type" + + local size + size=$(stat -c%s "$archive" 2>/dev/null || stat -f%z "$archive" 2>/dev/null || echo 0) + log "Config backup complete: $(human_size "$size") in $(elapsed_time)" + log "Archive: $archive" +} + +# ── API Backup ──────────────────────────────────────────────────────── +do_api_backup() { + local timestamp + timestamp=$(date +%Y%m%d-%H%M%S) + local dest="$BACKUP_DIR/${timestamp}-api" + + mkdir -p "$dest/jobs" "$dest/plugins" + log "Starting API backup from $JENKINS_URL" + verbose "Destination: $dest" + + # Get job list — parse JSON without jq + local job_list_json + job_list_json=$(jenkins_api "/api/json?tree=jobs[name]") || { + err "Failed to fetch job list from Jenkins API" + rm -rf "$dest" + exit 1 + } + + # Extract job names from JSON using grep/sed + local job_names + job_names=$(echo "$job_list_json" | { grep -o '"name" *: *"[^"]*"' || true; } | sed 's/.*: *"//;s/"$//') + + local job_count=0 + local job_fail=0 + if [[ -n "$job_names" ]]; then + while IFS= read -r job_name; do + [[ -z 
"$job_name" ]] && continue + local encoded_name + encoded_name="${job_name// /%20}" + if jenkins_api "/job/${encoded_name}/config.xml" "$dest/jobs/${job_name}.xml" 2>/dev/null; then + verbose " Exported job: $job_name" + job_count=$((job_count + 1)) + else + warn "Failed to export job: $job_name" + job_fail=$((job_fail + 1)) + fi + done <<< "$job_names" + fi + log "Exported $job_count jobs ($job_fail failed)" + + # Plugin list + local plugin_json + if plugin_json=$(jenkins_api "/pluginManager/api/json?depth=1&tree=plugins[shortName,version,enabled,active]"); then + echo "$plugin_json" > "$dest/plugins/plugin-list.json" + local plugin_count + plugin_count=$(echo "$plugin_json" | { grep -o '"shortName"' || true; } | wc -l) + log "Exported plugin list: $plugin_count plugins" + else + warn "Failed to fetch plugin list" + fi + + # Create checksum file for all exported files + if command -v sha256sum &>/dev/null; then + find "$dest" -type f ! -name 'checksums.sha256' -exec sha256sum {} + > "$dest/checksums.sha256" 2>/dev/null || true + fi + echo "api" > "$dest/.backup-type" + + local size + size=$(du -sb "$dest" 2>/dev/null | cut -f1) + size="${size:-0}" + log "API backup complete: $(human_size "$size") in $(elapsed_time)" + log "Backup directory: $dest" +} + +# ── Backup Entry Point ─────────────────────────────────────────────── +do_backup() { + START_TIME=$(date +%s) + validate_backup + + if [[ "$API_BACKUP" == "true" ]]; then + do_api_backup + elif [[ "$CONFIG_ONLY" == "true" ]]; then + do_config_backup + else + do_full_backup + fi + + prune_backups +} + +# ── Prune ───────────────────────────────────────────────────────────── +prune_backups() { + log "Pruning old backups (retaining $RETENTION_COUNT)..." 
+ + local backup_count + backup_count=$(find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d | wc -l) + + if [[ "$backup_count" -le "$RETENTION_COUNT" ]]; then + log " No pruning needed ($backup_count backups present)" + return 0 + fi + + local remove_count=$((backup_count - RETENTION_COUNT)) + find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d | sort | head -n "$remove_count" | while IFS= read -r dir; do + log " Removing $(basename "$dir")" + rm -rf "$dir" + done + + log " Pruned $remove_count old backups" +} + +# ── Restore ─────────────────────────────────────────────────────────── +do_restore() { + if [[ -z "$RESTORE_DIR" ]]; then + err "--restore requires a directory argument" + exit 1 + fi + + if [[ ! -d "$RESTORE_DIR" ]]; then + err "Restore directory does not exist: $RESTORE_DIR" + exit 1 + fi + + # Find the tar archive in the backup directory + local archive + archive=$(find "$RESTORE_DIR" -maxdepth 1 -name '*.tar.gz' -type f | head -1) + + if [[ -z "$archive" ]]; then + err "No tar archive found in: $RESTORE_DIR" + exit 1 + fi + + log "Restore source: $archive" + log "Restore target: $JENKINS_HOME" + + if [[ "$DRY_RUN" == "true" ]]; then + log "${BOLD}Dry run — listing archive contents:${RESET}" + tar tzf "$archive" | head -50 + local total + total=$(tar tzf "$archive" | wc -l) + log " ($total files total)" + log "Dry run complete — no changes made" + return 0 + fi + + # Verify checksum if available + if [[ -f "$RESTORE_DIR/checksums.sha256" ]]; then + log "Verifying backup integrity..." + if (cd "$RESTORE_DIR" && sha256sum -c checksums.sha256 --quiet 2>/dev/null); then + log " ${GREEN}Checksums verified${RESET}" + else + warn "Checksum verification failed — proceed with caution" + fi + fi + + if [[ ! -d "$JENKINS_HOME" ]]; then + mkdir -p "$JENKINS_HOME" + fi + + log "Restoring archive..." 
+ tar xzf "$archive" -C "$(dirname "$JENKINS_HOME")" || { + err "Failed to extract archive" + exit 1 + } + + warn "Jenkins must be restarted for changes to take effect" + warn " sudo systemctl restart jenkins" + log "Restore complete" +} + +# ── List ────────────────────────────────────────────────────────────── +do_list() { + if [[ ! -d "$BACKUP_DIR" ]]; then + log "No backups found (directory does not exist: $BACKUP_DIR)" + return 0 + fi + + local count=0 + local format=" %-28s %-8s %s\n" + + printf "\n" + # shellcheck disable=SC2059 + printf "$format" "BACKUP" "TYPE" "SIZE" + # shellcheck disable=SC2059 + printf "$format" "----------------------------" "--------" "--------" + + for dir in "$BACKUP_DIR"/*/; do + [[ -d "$dir" ]] || continue + count=$((count + 1)) + + local name + name=$(basename "$dir") + + local btype="unknown" + if [[ -f "$dir/.backup-type" ]]; then + btype=$(cat "$dir/.backup-type") + fi + + local size + size=$(du -sh "$dir" 2>/dev/null | cut -f1) + size="${size:-?}" + + # shellcheck disable=SC2059 + printf "$format" "$name" "$btype" "$size" + done + + printf "\n" + if [[ "$count" -eq 0 ]]; then + log "No backups found in $BACKUP_DIR" + else + log "$count backup(s) in $BACKUP_DIR" + fi +} + +# ── Verify ──────────────────────────────────────────────────────────── +do_verify() { + if [[ -z "$RESTORE_DIR" ]]; then + err "--verify requires a directory argument" + exit 1 + fi + + if [[ ! -d "$RESTORE_DIR" ]]; then + err "Backup directory does not exist: $RESTORE_DIR" + exit 1 + fi + + log "Verifying backup: $RESTORE_DIR" + + # Check backup type + local btype="unknown" + if [[ -f "$RESTORE_DIR/.backup-type" ]]; then + btype=$(cat "$RESTORE_DIR/.backup-type") + fi + log " Backup type: $btype" + + # Check for checksums + if [[ -f "$RESTORE_DIR/checksums.sha256" ]]; then + log " Verifying checksums..." 
+ if (cd "$RESTORE_DIR" && sha256sum -c checksums.sha256 2>/dev/null); then + log " ${GREEN}All checksums passed${RESET}" + else + err "Checksum verification FAILED" + exit 1 + fi + else + warn "No checksum file found — cannot verify integrity" + fi + + # Archive contents summary + local archive + archive=$(find "$RESTORE_DIR" -maxdepth 1 -name '*.tar.gz' -type f | head -1) + if [[ -n "$archive" ]]; then + local size + size=$(stat -c%s "$archive" 2>/dev/null || stat -f%z "$archive" 2>/dev/null || echo 0) + log " Archive: $(basename "$archive") ($(human_size "$size"))" + local file_count + file_count=$(tar tzf "$archive" | wc -l) + log " Files in archive: $file_count" + fi + + # API backup — list exported files + if [[ "$btype" == "api" ]]; then + local job_count + job_count=$(find "$RESTORE_DIR/jobs" -name '*.xml' 2>/dev/null | wc -l) + log " Exported jobs: $job_count" + if [[ -f "$RESTORE_DIR/plugins/plugin-list.json" ]]; then + local plugin_count + plugin_count=$({ grep -o '"shortName"' "$RESTORE_DIR/plugins/plugin-list.json" || true; } | wc -l) + log " Plugins recorded: $plugin_count" + fi + fi + + log "Verification complete" +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + setup_colors + + parse_args "$@" + + case "$RUN_MODE" in + backup) do_backup ;; + restore) do_restore ;; + list) do_list ;; + verify) do_verify ;; + *) err "Unknown mode: $RUN_MODE"; exit 1 ;; + esac +} + +main "$@" diff --git a/jenkins-exporter.sh b/jenkins-exporter.sh new file mode 100644 index 0000000..2617172 --- /dev/null +++ b/jenkins-exporter.sh @@ -0,0 +1,418 @@ +#!/usr/bin/env bash +# +# Jenkins Prometheus Metrics Exporter +# +# Prometheus textfile collector exporter for Jenkins. +# Uses the Jenkins JSON API to collect build queue length, executor +# utilization, job success/failure rates, build duration, node status, +# disk usage, and JVM heap statistics. 
+# +# Usage: +# JENKINS_URL="https://jenkins.example.com" JENKINS_USER="admin" JENKINS_TOKEN="xxx" ./jenkins-exporter.sh +# JENKINS_URL="https://jenkins.example.com" JENKINS_USER="admin" JENKINS_TOKEN="xxx" ./jenkins-exporter.sh --textfile +# JENKINS_URL="https://jenkins.example.com" JENKINS_USER="admin" JENKINS_TOKEN="xxx" ./jenkins-exporter.sh --install +# +# Parameters: +# --textfile Write to textfile collector directory +# --install Create cron job for automatic collection +# --help Show usage +# +# Environment: +# JENKINS_URL Jenkins base URL (required) +# JENKINS_USER Jenkins username (required) +# JENKINS_TOKEN Jenkins API token (required) +# TEXTFILE_DIR Textfile collector directory (default: /var/lib/node_exporter/textfile_collector) +# CURL_TIMEOUT API request timeout in seconds (default: 10) +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# Version: 1.0 +# +# Metrics Exported: +# Core: +# - jenkins_up +# - jenkins_exporter_info{version} +# +# Build Queue: +# - jenkins_build_queue_length +# +# Executors: +# - jenkins_executors_total +# - jenkins_executors_busy +# - jenkins_executors_idle +# +# Jobs: +# - jenkins_jobs_total +# - jenkins_job_last_build_duration_seconds{job} +# - jenkins_job_last_build_result{job} +# +# Nodes: +# - jenkins_nodes_total +# - jenkins_nodes_online +# - jenkins_nodes_offline +# +# Disk: +# - jenkins_disk_usage_bytes +# +# JVM: +# - jenkins_jvm_heap_used_bytes +# - jenkins_jvm_heap_max_bytes +# +# Exporter: +# - jenkins_exporter_duration_seconds +# - jenkins_exporter_last_run_timestamp + +set -euo pipefail + +# --- Configuration --- +readonly VERSION="1.0" +readonly SCRIPT_NAME="$(basename "$0")" +JENKINS_URL="${JENKINS_URL:-}" +JENKINS_USER="${JENKINS_USER:-}" +JENKINS_TOKEN="${JENKINS_TOKEN:-}" +TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile_collector}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +TEXTFILE_MODE=false +OUTPUT="" +START_TIME="" + +# --- Functions 
# Append one complete metric (HELP line, TYPE line and a single sample) to
# the global OUTPUT buffer.
# Globals:   OUTPUT (appended to)
# Arguments: $1 - metric name
#            $2 - metric type (e.g. gauge)
#            $3 - help text
#            $4 - sample value
#            $5 - optional, pre-formatted label list (e.g. job="x")
add_metric() {
    local metric="$1"
    local kind="$2"
    local description="$3"
    local sample="$4"
    local label_set="${5:-}"

    # The series identifier only carries braces when labels were supplied.
    local series="$metric"
    if [[ -n "$label_set" ]]; then
        series="${metric}{${label_set}}"
    fi

    OUTPUT+="# HELP ${metric} ${description}"$'\n'
    OUTPUT+="# TYPE ${metric} ${kind}"$'\n'
    OUTPUT+="${series} ${sample}"$'\n'
}
build queue" "${queue_length:-0}" + fi +} + +collect_executors() { + local computer_json + computer_json=$(api_get "/computer/api/json") + + if [[ -z "$computer_json" ]]; then + return + fi + + local total_executors busy_executors idle_executors + total_executors=$(echo "$computer_json" | jq -r '.totalExecutors // 0' 2>/dev/null) + busy_executors=$(echo "$computer_json" | jq -r '.busyExecutors // 0' 2>/dev/null) + idle_executors=$((total_executors - busy_executors)) + + add_metric "jenkins_executors_total" "gauge" "Total number of executors" "${total_executors:-0}" + add_metric "jenkins_executors_busy" "gauge" "Number of busy executors" "${busy_executors:-0}" + add_metric "jenkins_executors_idle" "gauge" "Number of idle executors" "${idle_executors:-0}" + + # Node status + local nodes_total nodes_online nodes_offline + nodes_total=$(echo "$computer_json" | jq -r '.computer | length // 0' 2>/dev/null) + nodes_offline=$(echo "$computer_json" | jq -r '[.computer[] | select(.offline == true)] | length // 0' 2>/dev/null) + nodes_online=$((nodes_total - nodes_offline)) + + add_metric "jenkins_nodes_total" "gauge" "Total number of nodes" "${nodes_total:-0}" + add_metric "jenkins_nodes_online" "gauge" "Number of online nodes" "${nodes_online:-0}" + add_metric "jenkins_nodes_offline" "gauge" "Number of offline nodes" "${nodes_offline:-0}" + + # JVM stats from the master node monitor data + local heap_used heap_max + heap_used=$(echo "$computer_json" | jq -r '.computer[0].monitorData["hudson.node_monitors.SwapSpaceMonitor"]["totalPhysicalMemory"] // empty' 2>/dev/null) + + # Use the built-in master node's memory monitor + local master_monitor + master_monitor=$(echo "$computer_json" | jq -r '.computer[] | select(.displayName == "Built-In Node" or .displayName == "master") | .monitorData // empty' 2>/dev/null) + + if [[ -n "$master_monitor" ]]; then + heap_used=$(echo "$master_monitor" | jq -r '.["hudson.node_monitors.SwapSpaceMonitor"]["availablePhysicalMemory"] // empty' 
# Collect job metrics: the total number of top-level jobs plus, per job, the
# duration (seconds) and result (1=SUCCESS, 0=anything else) of its last build.
# Globals:   OUTPUT (appended to through add_metric / add_metric_value)
# Arguments: none
# Outputs:   a warning on stderr for every job skipped because its sanitised
#            label collides with one already exported
# Returns:   0; emits nothing when the API request yields no data
collect_jobs() {
    local jobs_json
    jobs_json=$(api_get "/api/json?tree=jobs[name,lastBuild[duration,result]]")

    if [[ -z "$jobs_json" ]]; then
        return
    fi

    local job_count
    # Best effort: an unparsable response must not abort the whole exporter.
    job_count=$(echo "$jobs_json" | jq -r '.jobs | length // 0' 2>/dev/null) || job_count=0
    add_metric "jenkins_jobs_total" "gauge" "Total number of jobs" "${job_count:-0}"

    # Single jq pass: one "duration|result|name" row per job. The name goes
    # last so that read assigns it the remainder of the line. This replaces
    # two jq invocations per job, each of which re-parsed the whole document.
    local job_rows
    job_rows=$(printf '%s\n' "$jobs_json" | jq -r '
        .jobs[]?
        | [ (.lastBuild.duration // "" | tostring),
            (.lastBuild.result // "" | tostring),
            (.name // "" | tostring) ]
        | join("|")' 2>/dev/null) || job_rows=""

    local first_duration=true
    local first_result=true
    local seen_labels=$'\n'
    local duration result job_name safe_name duration_seconds result_value

    while IFS='|' read -r duration result job_name; do
        [[ -z "$job_name" ]] && continue

        # Sanitise job name for Prometheus labels. Everything outside
        # [a-zA-Z0-9_-] (including backslash, which would otherwise form an
        # invalid escape inside the quoted label value) becomes "_".
        safe_name=${job_name//[^a-zA-Z0-9_-]/_}

        # Distinct names can sanitise to the same label ("my job"/"my_job").
        # Emitting both would produce duplicate series, so later ones are
        # skipped with a warning.
        if [[ "$seen_labels" == *$'\n'"$safe_name"$'\n'* ]]; then
            echo "WARNING: skipping job '${job_name}': label '${safe_name}' already exported" >&2
            continue
        fi
        seen_labels+="${safe_name}"$'\n'

        # Duration arrives in integer milliseconds; convert to seconds with
        # two decimals using shell arithmetic (no bc process per job).
        if [[ "$duration" =~ ^[0-9]+$ ]]; then
            printf -v duration_seconds '%d.%02d' \
                "$((10#$duration / 1000))" "$(((10#$duration % 1000) / 10))"
            if [[ "$first_duration" == true ]]; then
                add_metric "jenkins_job_last_build_duration_seconds" "gauge" "Last build duration per job" "$duration_seconds" "job=\"${safe_name}\""
                first_duration=false
            else
                add_metric_value "jenkins_job_last_build_duration_seconds" "$duration_seconds" "job=\"${safe_name}\""
            fi
        fi

        if [[ -n "$result" && "$result" != "null" ]]; then
            if [[ "$result" == "SUCCESS" ]]; then
                result_value=1
            else
                result_value=0
            fi
            if [[ "$first_result" == true ]]; then
                add_metric "jenkins_job_last_build_result" "gauge" "Last build result (1=SUCCESS, 0=FAILURE)" "$result_value" "job=\"${safe_name}\""
                first_result=false
            else
                add_metric_value "jenkins_job_last_build_result" "$result_value" "job=\"${safe_name}\""
            fi
        fi
    done <<< "$job_rows"
}
# Entry point of the exporter: parse flags, validate the environment, run
# every collector and hand the accumulated metrics to write_output.
# Globals:   TEXTFILE_MODE (written), START_TIME (written), VERSION (read)
# Arguments: command-line flags (--textfile, --install, --help/-h)
main() {
    # Walk the command line; --install performs the cron setup and exits
    # immediately.
    # NOTE(review): usage is presumably expected to exit — its body is not
    # visible here.
    while (( $# > 0 )); do
        case "$1" in
            --textfile)
                TEXTFILE_MODE=true
                ;;
            --install)
                check_dependencies
                validate_config
                install_cron
                exit 0
                ;;
            --help|-h)
                usage
                ;;
            *)
                echo "Unknown option: $1" >&2
                usage
                ;;
        esac
        shift
    done

    check_dependencies
    validate_config

    START_TIME=$(date +%s%N)

    # Exporter info
    add_metric "jenkins_exporter_info" "gauge" "Exporter version information" "1" "version=\"${VERSION}\""

    # The remaining collectors only run when the health probe succeeded.
    local collector
    if collect_health; then
        for collector in collect_queue collect_executors collect_jobs collect_disk_usage; do
            "$collector"
        done
    fi

    # Exporter performance
    local finished_ns elapsed
    finished_ns=$(date +%s%N)
    elapsed=$(echo "scale=2; ($finished_ns - $START_TIME) / 1000000000" | bc 2>/dev/null || echo "0")
    add_metric "jenkins_exporter_duration_seconds" "gauge" "Time to generate all metrics" "$elapsed"
    add_metric "jenkins_exporter_last_run_timestamp" "gauge" "Unix timestamp of last successful run" "$(date +%s)"

    write_output
}
# Initialise the colour escape variables used by the logging helpers.
# Colours are enabled when COLOR is "always", or when COLOR is anything other
# than "never" and stdout is a terminal; otherwise every variable is empty.
# Globals:   COLOR (read); RED, GREEN, YELLOW, BLUE, BOLD, RESET (written)
setup_colors() {
    # Start from the colourless state, then opt in.
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""

    [[ "$COLOR" != "never" ]] || return 0

    if [[ "$COLOR" == "always" || -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
    fi
}
# Record a failed test and print it in the active output format.
# Globals:   FAIL, TOTAL (incremented); RESULTS (appended);
#            OUTPUT_FORMAT, RED, RESET (read)
# Arguments: $1 - test name
#            $2 - optional detail text
# Returns:   always 0, so a call without detail cannot trip `set -e`
record_fail() {
    local name="$1"
    local detail="${2:-}"
    ((FAIL++)) || true
    ((TOTAL++)) || true
    RESULTS+=("FAIL|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "not ok ${TOTAL} - ${name}"
        # Use a full `if`: a trailing `[[ -n "$detail" ]] && echo …` would make
        # the function return 1 whenever detail is empty, terminating the
        # script under errexit.
        if [[ -n "$detail" ]]; then
            echo " # ${detail}"
        fi
    else
        echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
    fi
    return 0
}
"$url" 2>/dev/null +} + +api_curl_headers() { + local endpoint="$1" + shift + local curl_opts=(-s -S -D - -o /dev/null --max-time "$CURL_TIMEOUT") + + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + [[ -n "$JENKINS_USER" && -n "$JENKINS_TOKEN" ]] && curl_opts+=(-u "${JENKINS_USER}:${JENKINS_TOKEN}") + + local url="${JENKINS_URL}${endpoint}" + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +# ── JSON parsing (no jq required) ──────────────────────────────────── +json_value() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:\s*\"?\K[^\",}]+" || true; } | head -1 +} + +json_value_string() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" || true; } | head -1 +} + +json_count() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:" || true; } | wc -l +} + +# ══════════════════════════════════════════════════════════════════════ +# TEST SUITES +# ══════════════════════════════════════════════════════════════════════ + +# ── 1. Connectivity ────────────────────────────────────────────────── +test_connectivity() { + echo "" + echo -e "${BOLD}Connectivity${RESET}" + + # 1a. HTTP(S) reachable + local curl_opts=(-s -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT" -L) + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + + local http_code + http_code=$(curl "${curl_opts[@]}" "${JENKINS_URL}/" 2>/dev/null) || http_code="000" + + if [[ "$http_code" =~ ^(200|403)$ ]]; then + record_pass "Jenkins reachable" "HTTP ${http_code}" + else + record_fail "Jenkins reachable" "HTTP ${http_code}" + fi + + # 1b. Login page accessible + local login_code + login_code=$(curl "${curl_opts[@]}" "${JENKINS_URL}/login" 2>/dev/null) || login_code="000" + + if [[ "$login_code" == "200" ]]; then + record_pass "Login page accessible" "HTTP ${login_code}" + else + record_fail "Login page accessible" "HTTP ${login_code}" + fi + + # 1c. 
# ── 2. API Authentication ────────────────────────────────────────────
# Check that the JSON API answers with the configured credentials, extract
# the Jenkins version from the response headers and probe the CSRF crumb
# issuer.
# Globals:   JENKINS_VERSION, CRUMB_VALUE (written); BOLD, RESET (read)
# Outputs:   one result line per check through record_pass/fail/skip
# Returns:   returns early after the first check when the API is unreachable
#            or rejects the credentials
test_api_auth() {
    echo ""
    echo -e "${BOLD}API Authentication${RESET}"

    # 2a. JSON API reachable
    # When curl fails (connection refused, timeout) the wrapper returns
    # non-zero; the guard keeps `set -e` from aborting the whole run so the
    # failure is recorded like any other result.
    local api_status
    api_status=$(api_curl_status "/api/json") || api_status="${api_status:-000}"

    if [[ "$api_status" == "200" ]]; then
        record_pass "JSON API reachable" "HTTP ${api_status}"
    elif [[ "$api_status" == "403" ]]; then
        record_fail "JSON API reachable" "HTTP 403 — authentication failed"
        return
    else
        record_fail "JSON API reachable" "HTTP ${api_status}"
        return
    fi

    # 2b. Authentication — extract version from headers
    local headers
    headers=$(api_curl_headers "/api/json") || headers=""

    JENKINS_VERSION=$(echo "$headers" | { grep -i '^X-Jenkins:' || true; } | tr -d '\r' | awk '{print $2}')

    if [[ -n "$JENKINS_VERSION" ]]; then
        record_pass "API authentication" "Jenkins ${JENKINS_VERSION}"
    else
        record_pass "API authentication" "authenticated (version unknown)"
    fi

    # 2c. CRUMB (CSRF protection token)
    local crumb_json
    crumb_json=$(api_curl "/crumbIssuer/api/json" 2>/dev/null) || crumb_json=""

    if [[ -n "$crumb_json" ]]; then
        local crumb_header
        crumb_header=$(json_value_string "crumbRequestField" "$crumb_json")
        CRUMB_VALUE=$(json_value_string "crumb" "$crumb_json")
        verbose "CSRF crumb: ${crumb_header}=${CRUMB_VALUE}"

        if [[ -n "$CRUMB_VALUE" ]]; then
            record_pass "CSRF crumb available" "crumb retrieved"
        else
            record_skip "CSRF crumb available" "crumb issuer returned empty"
        fi
    else
        record_skip "CSRF crumb available" "crumb issuer not enabled"
    fi
}
Executor status + local computer_json + computer_json=$(api_curl "/computer/api/json" 2>/dev/null) || computer_json="" + + if [[ -n "$computer_json" ]]; then + local total_executors + total_executors=$(json_value "totalExecutors" "$computer_json") + local busy_executors + busy_executors=$(json_value "busyExecutors" "$computer_json") + + if [[ -n "$total_executors" ]]; then + local free_executors=$(( total_executors - busy_executors )) + record_pass "Executor status" "${busy_executors}/${total_executors} busy, ${free_executors} idle" + else + record_skip "Executor status" "could not parse executor count" + fi + else + record_fail "Executor status" "could not reach /computer/api/json" + fi + + # 3c. Build queue + local queue_json + queue_json=$(api_curl "/queue/api/json" 2>/dev/null) || queue_json="" + + if [[ -n "$queue_json" ]]; then + local queue_items + queue_items=$(echo "$queue_json" | { grep -oP '"id"\s*:' || true; } | wc -l) + + if [[ $queue_items -eq 0 ]]; then + record_pass "Build queue" "empty" + elif [[ $queue_items -lt 10 ]]; then + record_pass "Build queue" "${queue_items} item(s) queued" + else + record_fail "Build queue" "${queue_items} items queued (possible bottleneck)" + fi + else + record_fail "Build queue" "could not reach /queue/api/json" + fi + + # 3d. System info (admin only) + local sysinfo_status + sysinfo_status=$(api_curl_status "/manage/systemInfo") + + if [[ "$sysinfo_status" == "200" ]]; then + record_pass "System info accessible" "admin access confirmed" + elif [[ "$sysinfo_status" == "403" ]]; then + record_skip "System info accessible" "admin access required" + else + record_fail "System info accessible" "HTTP ${sysinfo_status}" + fi +} + +# ── 4. 
Agents/Nodes ────────────────────────────────────────────────── +test_agents() { + echo "" + echo -e "${BOLD}Agents${RESET}" + + local computer_json + computer_json=$(api_curl "/computer/api/json?depth=1" 2>/dev/null) || computer_json="" + + if [[ -z "$computer_json" ]]; then + record_fail "Agent list" "could not reach /computer/api/json" + return + fi + + # Count nodes + local node_count + node_count=$(echo "$computer_json" | { grep -oP '"displayName"\s*:' || true; } | wc -l) + + if [[ $node_count -eq 0 ]]; then + record_skip "Agent list" "no nodes found" + return + fi + + record_pass "Agent list" "${node_count} node(s) registered" + + # Master node online + local master_offline + # The built-in node is typically first and named "master" or "(built-in)" + # Check if any node has offline=false + master_offline=$(echo "$computer_json" | { grep -oP '"offline"\s*:\s*\K(true|false)' || true; } | head -1) + + if [[ "$master_offline" == "false" ]]; then + record_pass "Built-in node online" "controller node available" + elif [[ "$master_offline" == "true" ]]; then + record_fail "Built-in node online" "controller node offline" + else + record_skip "Built-in node online" "could not determine status" + fi + + # Count online vs offline + local online_count + online_count=$(echo "$computer_json" | { grep -oP '"offline"\s*:\s*false' || true; } | wc -l) + local offline_count + offline_count=$(echo "$computer_json" | { grep -oP '"offline"\s*:\s*true' || true; } | wc -l) + + if [[ $offline_count -eq 0 ]]; then + record_pass "Agent availability" "${online_count}/${node_count} online" + elif [[ $online_count -gt 0 ]]; then + record_fail "Agent availability" "${online_count}/${node_count} online, ${offline_count} offline" + else + record_fail "Agent availability" "all ${node_count} agents offline" + fi + + # Temporarily offline agents (manually taken offline) + local temp_offline + temp_offline=$(echo "$computer_json" | { grep -oP '"temporarilyOffline"\s*:\s*true' || true; } | wc -l) 
# ── 5. Jobs ───────────────────────────────────────────────────────────
# Summarise job health from the top-level job list: total count, a per-status
# breakdown derived from each job's "color" field, and the number of jobs
# currently waiting in the queue.
# Globals:   BOLD, RESET (read)
# Outputs:   result lines through record_pass/fail/skip
# Returns:   returns early when the API is unreachable or no job is configured
test_jobs() {
    echo ""
    echo -e "${BOLD}Jobs${RESET}"

    local jobs_json
    jobs_json=$(api_curl "/api/json?tree=jobs[name,color,url,inQueue]" 2>/dev/null) || jobs_json=""

    if [[ -z "$jobs_json" ]]; then
        record_fail "Job list" "could not reach /api/json"
        return
    fi

    local job_count
    job_count=$(echo "$jobs_json" | { grep -oP '"name"\s*:' || true; } | wc -l)

    if [[ $job_count -eq 0 ]]; then
        record_skip "Job list" "no jobs configured"
        return
    fi

    record_pass "Job list" "${job_count} job(s) found"

    # Count by status (color field). Each spec is "<color prefix>:<label>";
    # the prefix also matches the animated variants (e.g. blue_anime).
    local status_summary="" red_count=0
    local spec color label count
    for spec in "blue:passing" "red:failing" "yellow:unstable" \
                "disabled:disabled" "notbuilt:not built" "aborted:aborted"; do
        color=${spec%%:*}
        label=${spec#*:}
        count=$(echo "$jobs_json" | { grep -oP "\"color\"\s*:\s*\"${color}[^\"]*\"" || true; } | wc -l)
        count=$((count))   # normalise any padding emitted by wc
        if [[ "$color" == "red" ]]; then
            red_count=$count
        fi
        # Build the ", "-separated summary by hand: "${arr[*]}" joins with
        # only the first character of IFS, and expanding an empty array is an
        # unbound-variable error under `set -u` on bash older than 4.4.
        if (( count > 0 )); then
            status_summary+="${status_summary:+, }${count} ${label}"
        fi
    done

    if (( red_count > 0 )); then
        record_fail "Job health" "${status_summary}"
    else
        # Items without a color field (presumably folders/multibranch
        # containers — TODO confirm) leave the summary empty.
        record_pass "Job health" "${status_summary:-no build status reported}"
    fi

    # Stuck builds (in queue)
    local in_queue_count
    in_queue_count=$(echo "$jobs_json" | { grep -oP '"inQueue"\s*:\s*true' || true; } | wc -l)

    if [[ $in_queue_count -gt 0 ]]; then
        record_pass "Queued jobs" "${in_queue_count} job(s) waiting in queue"
    fi
}
# ── 7. Disk & Resources ──────────────────────────────────────────────
# Report how full the filesystem holding JENKINS_HOME is and how much space
# JENKINS_HOME itself occupies. Only meaningful when run on the Jenkins host.
# Globals:   SKIP_DISK, JENKINS_HOME, BOLD, RESET (read)
# Outputs:   result lines through record_pass/fail/skip
# Returns:   returns early when skipped, when JENKINS_HOME is not a local
#            directory, or when the usage figure cannot be determined
test_disk() {
    echo ""
    echo -e "${BOLD}Disk & Resources${RESET}"

    if [[ "$SKIP_DISK" == "true" ]]; then
        record_skip "Disk space check" "SKIP_DISK=true"
        return
    fi

    # Check if JENKINS_HOME exists locally
    if [[ ! -d "$JENKINS_HOME" ]]; then
        record_skip "Disk space check" "JENKINS_HOME not found locally at ${JENKINS_HOME}"
        return
    fi

    # Get disk usage percentage for JENKINS_HOME partition.
    # -P forces the POSIX one-record-per-line layout; without it df may wrap
    # long device names onto a second line and shift the columns.
    local disk_usage
    disk_usage=$(df -P "$JENKINS_HOME" 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%') || disk_usage=""

    # Reject anything that is not a plain number (empty, "-", …) so the
    # numeric comparisons below never see garbage and fall through to
    # "critical".
    if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
        record_skip "Disk space check" "could not determine disk usage"
        return
    fi

    local disk_avail
    disk_avail=$(df -Ph "$JENKINS_HOME" 2>/dev/null | awk 'NR==2 {print $4}') || disk_avail="unknown"

    if [[ $disk_usage -lt 70 ]]; then
        record_pass "Disk space" "${disk_usage}% used (${disk_avail} free)"
    elif [[ $disk_usage -lt 85 ]]; then
        record_pass "Disk space" "${disk_usage}% used (${disk_avail} free) — monitor closely"
    elif [[ $disk_usage -lt 95 ]]; then
        record_fail "Disk space" "${disk_usage}% used (${disk_avail} free) — cleanup needed"
    else
        record_fail "Disk space" "${disk_usage}% used (${disk_avail} free) — critical"
    fi

    # JENKINS_HOME size. du exits non-zero when it cannot read some
    # sub-directory but still prints a total; keep that total instead of
    # discarding it.
    local home_size
    home_size=$({ du -sh "$JENKINS_HOME" 2>/dev/null || true; } | awk '{print $1}') || home_size=""

    if [[ -n "$home_size" ]]; then
        record_pass "JENKINS_HOME size" "${home_size} at ${JENKINS_HOME}"
    fi
}
# Emit the closing TAP lines: the test plan followed by pass/fail/skip totals.
# Globals:   TOTAL, PASS, FAIL, SKIP (read)
# Outputs:   four lines on stdout
print_tap_footer() {
    printf '%s\n' \
        "1..${TOTAL}" \
        "# pass ${PASS}" \
        "# fail ${FAIL}" \
        "# skip ${SKIP}"
}
Progress, Done, etc.) +# - Issue counts by priority (Highest, High, Medium, Low, Lowest) +# - Issue counts by type (Bug, Story, Task, Epic, Sub-task) +# - Issue counts by project +# - Backlog size and age (unresolved issues) +# - Created vs resolved issue rates (24h, 7d) +# - Average resolution time (resolved last 7d) +# - Overdue issues count +# - Sprint active issue counts (if boards configured) +# - Component issue counts +# - Assignee workload (top 15) +# - Unassigned issue count +# - SLA/resolution time buckets +# - Login activity from security log (logins, failed logins, per-user) +# - Plugin/add-on inventory (installed, enabled, disabled, user-installed) +# - Service Desk queues, request types, SLA metrics, CSAT, organizations +# - Scrape metadata +# +# Modes: +# --textfile Write to node_exporter textfile collector +# --http Run HTTP server for direct Prometheus scraping +# stdout Default: print metrics to stdout +# +# Requirements: +# - curl and jq must be available +# - Jira base URL, username, and API token (or password) +# - For Jira Cloud: use email + API token +# - For Jira Server/DC: use username + password or PAT +# +# Configuration: +# Set environment variables or edit the defaults below: +# JIRA_BASE_URL - Full URL including port +# e.g. https://yourcompany.atlassian.net (Cloud) +# e.g. http://localhost:8080 (Server/DC) +# JIRA_USER - e.g. user or user@company.com (not needed for PAT) +# JIRA_API_TOKEN - API token, password, or Personal Access Token (PAT) +# JIRA_PROJECTS - Comma-separated project keys (default: all) +# JIRA_HOME - Jira home/data directory (for security log parsing) +# e.g. /mnt/ebs/application-data/jira +# +# Authentication: +# Uses Bearer token authentication (Authorization: Bearer ). 
+# - Jira Cloud: Generate API token at +# https://id.atlassian.com/manage-profile/security/api-tokens +# - Jira Server/DC: Generate a PAT via Profile → Personal Access Tokens +# (requires Jira 8.14+) +# +# Troubleshooting: +# All metrics return zero: +# 1. Verify JIRA_BASE_URL includes the port (e.g. http://localhost:8080, +# NOT http://localhost). Test with: +# curl -H "Authorization: Bearer $JIRA_API_TOKEN" \ +# "$JIRA_BASE_URL/rest/api/2/search?jql=&maxResults=0" +# You should see a JSON response with "total" > 0. +# 2. If the above returns blank, the URL or port is wrong. +# 3. If it returns HTML or a 404, check for a context path +# (e.g. http://localhost:8080/jira). +# +# 403 Forbidden / AUTHENTICATION_DENIED: +# - Basic auth may be blocked. This script uses Bearer token auth +# which requires a PAT (Server/DC) or API token (Cloud). +# - CAPTCHA may be triggered from prior failed logins. Clear it via +# Jira Admin → User Management → find user → Reset failed login count. +# - Ensure the X-Atlassian-Token: no-check header is present (included +# by default in this script). +# +# 401 Unauthorized: +# - PAT may be expired or revoked. Generate a new one. +# - Verify the token is correct (no trailing whitespace/newline). +# +# Partial metrics (some sections zero): +# - The user associated with the PAT must have Browse Projects +# permission on the target projects. +# - Login metrics require JIRA_HOME to be set correctly and the +# script must have read access to the Jira security log at +# $JIRA_HOME/log/atlassian-jira-security.log +# - Service Desk metrics require Jira Service Management. +# +# Login metrics all zero: +# - Verify JIRA_HOME points to the correct Jira data directory. +# - Check the security log exists: +# ls $JIRA_HOME/log/atlassian-jira-security.log +# - Ensure the user running this script can read the log file. +# - Note: The audit log REST API (/rest/api/2/auditing/record) +# does not exist on Jira 10.x. 
This script parses the security +# log file directly instead. +# +# Changelog: +# 1.4 - Auto-discover all projects when JIRA_PROJECTS is not set +# Fixed per-project metrics (HELP/TYPE headers always emitted) +# Fixed missing metrics: all HELP/TYPE headers now always present +# even when data is empty (assignee, component, plugin, service +# desk queue/org, user logins, resolution time) +# Per-project metrics now grouped by metric name per Prometheus +# exposition format (HELP/TYPE then all values, not interleaved) +# Switched login metrics from audit log REST API (404 on Jira 10.x) +# to parsing $JIRA_HOME/log/atlassian-jira-security.log directly +# Added JIRA_HOME configuration variable and --jira-home CLI option +# Added troubleshooting section for login metrics +# 1.3 - Switched authentication from basic auth to Bearer token (PAT) +# Fixed JQL URL encoding (use curl --data-urlencode) +# Added troubleshooting documentation +# 1.2 - Added plugin/add-on inventory metrics from UPM API +# Added Service Desk metrics (queues, requests, SLAs, CSAT, +# request types, organizations) +# 1.1 - Added login activity metrics from Jira audit log +# (total logins, failed logins, unique users, per-user counts) +# 1.0 - Initial release +################################################################################ + +SCRIPT_VERSION="1.4" +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9418 +LOCK_FILE="/var/run/jira-metrics.lock" + +# Jira connection settings (edit these or override via environment variables) +JIRA_BASE_URL="${JIRA_BASE_URL:-https://yourcompany.atlassian.net}" +JIRA_USER="${JIRA_USER:-user@company.com}" +JIRA_API_TOKEN="${JIRA_API_TOKEN:-your-api-token-here}" +JIRA_PROJECTS="${JIRA_PROJECTS:-}" # comma-separated, empty = all (e.g. 
#######################################
# Take the single-instance lock by writing this shell's PID to LOCK_FILE.
# Globals:   LOCK_FILE (read)
# Outputs:   diagnostics on stderr
# Exits:     1 when another live instance holds the lock
# Side effects: installs EXIT/INT/TERM traps; EXIT runs cleanup
#######################################
acquire_lock() {
    local pid

    # noclobber makes the redirection fail if the file already exists, so
    # "check and create" is one atomic step instead of a test followed by a write.
    if ! ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
        if [ -e "$LOCK_FILE" ]; then
            pid=$(cat "$LOCK_FILE" 2>/dev/null)
            # Only a plain positive integer is a PID; anything else is a stale file.
            case "$pid" in
                '' | *[!0-9]*) pid="" ;;
            esac
            if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
                echo "ERROR: Another instance is already running (PID: $pid)" >&2
                exit 1
            fi
            echo "Removing stale lock file" >&2
            rm -f "$LOCK_FILE"
        fi

        if ! ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
            if [ -e "$LOCK_FILE" ]; then
                # Someone else won the race after the stale file was removed.
                echo "ERROR: Another instance is already running" >&2
                exit 1
            fi
            # Unwritable location: keep the previous best-effort behaviour.
            echo "WARNING: could not create lock file ${LOCK_FILE}; continuing without a lock" >&2
        fi
    fi

    # Traps are installed only once the lock is ours, so a refused instance
    # never deletes the running instance's lock file.
    trap cleanup EXIT
    # A signal handler that returns lets the script carry on; exit explicitly
    # (the EXIT trap then removes the lock).
    trap 'exit 130' INT
    trap 'exit 143' TERM
}
#######################################
# Build the JQL project clause for the configured projects.
# Globals:   JIRA_PROJECTS (read) - comma-separated project keys
# Outputs:   'project in ("A","B")' on stdout, or an empty line when no
#            usable project key is configured
#######################################
get_project_filter() {
    local -a projects=()
    local -a quoted=()
    local proj

    if [ -z "$JIRA_PROJECTS" ]; then
        echo ""
        return
    fi

    IFS=',' read -ra projects <<< "$JIRA_PROJECTS"
    for proj in "${projects[@]}"; do
        proj=${proj// /}            # spaces around keys are not significant
        # Skip blanks left by "A,,B" or a trailing comma; an empty "" entry
        # would make the JQL invalid.
        [ -n "$proj" ] || continue
        quoted+=("\"${proj}\"")
    done

    if [ "${#quoted[@]}" -eq 0 ]; then
        echo ""
        return
    fi

    # Scoped to this function: "${quoted[*]}" joins on the first IFS character.
    local IFS=','
    echo "project in (${quoted[*]})"
}
#######################################
# Cache per-project issue counts (open, total, created 7d, resolved 7d).
# When JIRA_PROJECTS is empty the project keys are discovered through
# jira_api "/project" and stored back into JIRA_PROJECTS.
# Globals:   JIRA_PROJECTS (read, written on auto-discovery), CACHE_DIR (read)
# Outputs:   $CACHE_DIR/proj_{open,total,created_7d,resolved_7d}_<lowercased key>
#######################################
cache_project_data() {
    local proj_list="$JIRA_PROJECTS"
    local -a projects=()
    local proj safe_proj
    local in_flight=0
    # Each project starts four background searches; cap how many projects run
    # at once so auto-discovering every project does not open hundreds of
    # simultaneous connections.
    local max_parallel_projects=8

    if [ -z "$proj_list" ]; then
        proj_list=$(jira_api "/project" | jq -r '.[].key' 2>/dev/null | paste -sd ',' -)
        if [ -z "$proj_list" ]; then
            return 0
        fi
        JIRA_PROJECTS="$proj_list"
    fi

    IFS=',' read -ra projects <<< "$proj_list"
    for proj in "${projects[@]}"; do
        proj=${proj// /}
        # Blank entries ("A,,B", trailing comma) would query project = "".
        [ -n "$proj" ] || continue
        safe_proj=$(printf '%s' "$proj" | tr '[:upper:]' '[:lower:]')

        jira_search_count "project = \"${proj}\" AND resolution = Unresolved" > "$CACHE_DIR/proj_open_${safe_proj}" &
        jira_search_count "project = \"${proj}\"" > "$CACHE_DIR/proj_total_${safe_proj}" &
        jira_search_count "project = \"${proj}\" AND created >= -7d" > "$CACHE_DIR/proj_created_7d_${safe_proj}" &
        jira_search_count "project = \"${proj}\" AND resolved >= -7d" > "$CACHE_DIR/proj_resolved_7d_${safe_proj}" &

        in_flight=$((in_flight + 1))
        if [ "$in_flight" -ge "$max_parallel_projects" ]; then
            wait
            in_flight=0
        fi
    done
    wait
}
#######################################
# Derive last-24h login metrics from the Jira security log.
# Globals:   JIRA_HOME, CACHE_DIR (read)
# Outputs:   $CACHE_DIR/logins_{total,success,failed,unique_users} (always)
#            $CACHE_DIR/logins_per_user as "user|count", top 15 (only when
#            the log contains recent lines)
# Notes:     only POSIX awk/grep features are used, so it also works with
#            awks that lack regex intervals and greps that lack -P.
#######################################
cache_login_data() {
    local security_log="${JIRA_HOME}/log/atlassian-jira-security.log"
    local key since_date recent_lines users
    local success_count=0 failed_count=0 unique_users=0

    # Zero every counter up front so each early return leaves a complete set.
    for key in logins_total logins_success logins_failed logins_unique_users; do
        echo "0" > "$CACHE_DIR/$key"
    done

    [ -f "$security_log" ] || return 0

    # GNU date first, BSD date as fallback.
    # NOTE(review): if both fail, since_date is empty and every timestamped
    # line counts as "recent" - same as before; confirm that is acceptable.
    since_date=$(date -d '24 hours ago' '+%Y-%m-%d %H:%M' 2>/dev/null \
        || date -v-1d '+%Y-%m-%d %H:%M' 2>/dev/null)

    # Lines start with "YYYY-MM-DD HH:MM..."; that prefix sorts
    # lexicographically, so a string comparison selects the window.
    # The digit classes are spelled out because interval expressions such as
    # {4} are not supported by every awk.
    recent_lines=$(awk -v since="$since_date" '
        /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/ && substr($0, 1, 16) >= since
    ' "$security_log" 2>/dev/null)

    [ -n "$recent_lines" ] || return 0

    # grep -c prints 0 (and exits 1) when nothing matches; only the output matters.
    success_count=$(printf '%s\n' "$recent_lines" | grep -c 'has PASSED authentication')
    failed_count=$(printf '%s\n' "$recent_lines" | grep -c 'tried to login but')
    success_count=${success_count:-0}
    failed_count=${failed_count:-0}

    echo "$success_count" > "$CACHE_DIR/logins_success"
    echo "$failed_count" > "$CACHE_DIR/logins_failed"
    echo "$(( success_count + failed_count ))" > "$CACHE_DIR/logins_total"

    # One user name per successful login: the text between the quotes in
    # "The user '<name>' has PASSED authentication" (\047 is a single quote).
    users=$(printf '%s\n' "$recent_lines" | awk '
        /has PASSED authentication/ {
            start = index($0, "The user \047")
            if (!start) next
            rest = substr($0, start + 10)
            stop = index(rest, "\047")
            name = stop ? substr(rest, 1, stop - 1) : rest
            if (name != "") print name
        }')

    if [ -n "$users" ]; then
        unique_users=$(printf '%s\n' "$users" | sort -u | wc -l | tr -d '[:space:]')
        # uniq -c emits "<count> <name>"; strip the count so names that
        # contain spaces survive intact.
        printf '%s\n' "$users" | sort | uniq -c | sort -rn | head -15 \
            | awk '{ n = $1; sub(/^[ \t]*[0-9]+[ \t]+/, ""); print $0 "|" n }' \
            > "$CACHE_DIR/logins_per_user"
    else
        : > "$CACHE_DIR/logins_per_user"
    fi
    echo "${unique_users:-0}" > "$CACHE_DIR/logins_unique_users"
}
--------------------------------------------------------------------------- +# Plugin / add-on metrics (UPM API - Jira Server/DC) +# --------------------------------------------------------------------------- + +cache_plugin_data() { + local result + result=$(curl -s --max-time "$CURL_TIMEOUT" \ + -H "Authorization: Bearer ${JIRA_API_TOKEN}" \ + -H "Accept: application/vnd.atl.plugins.installed+json" \ + "${JIRA_BASE_URL}/rest/plugins/1.0/" 2>/dev/null) + + if [ -n "$result" ] && echo "$result" | jq -e '.plugins' >/dev/null 2>&1; then + echo "$result" | jq -r '.plugins | length' > "$CACHE_DIR/plugins_total" 2>/dev/null + echo "$result" | jq -r '[.plugins[]? | select(.enabled == true)] | length' > "$CACHE_DIR/plugins_enabled" 2>/dev/null + echo "$result" | jq -r '[.plugins[]? | select(.enabled == false)] | length' > "$CACHE_DIR/plugins_disabled" 2>/dev/null + echo "$result" | jq -r '[.plugins[]? | select(.userInstalled == true)] | length' > "$CACHE_DIR/plugins_user_installed" 2>/dev/null + echo "$result" | jq -r '[.plugins[]? | select(.userInstalled == false or .userInstalled == null)] | length' > "$CACHE_DIR/plugins_system" 2>/dev/null + + echo "$result" | jq -r ' + [.plugins[]? 
| select(.userInstalled == true)] | + .[] | + "\(.name // .key)|\(if .enabled then 1 else 0 end)|\(.version // "unknown")" + ' 2>/dev/null > "$CACHE_DIR/plugins_detail" + else + echo "0" > "$CACHE_DIR/plugins_total" + echo "0" > "$CACHE_DIR/plugins_enabled" + echo "0" > "$CACHE_DIR/plugins_disabled" + echo "0" > "$CACHE_DIR/plugins_user_installed" + echo "0" > "$CACHE_DIR/plugins_system" + fi +} + +# --------------------------------------------------------------------------- +# Service Desk metrics (Jira Service Desk / Service Management API) +# --------------------------------------------------------------------------- + +servicedesk_api() { + local endpoint="$1" + curl -s --max-time "$CURL_TIMEOUT" \ + -H "Authorization: Bearer ${JIRA_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -H "X-ExperimentalApi: opt-in" \ + "${JIRA_BASE_URL}/rest/servicedeskapi${endpoint}" 2>/dev/null +} + +cache_servicedesk_data() { + local sd_list + sd_list=$(servicedesk_api "/servicedesk") + + if [ -z "$sd_list" ] || ! echo "$sd_list" | jq -e '.values' >/dev/null 2>&1; then + echo "0" > "$CACHE_DIR/sd_available" + return + fi + + echo "1" > "$CACHE_DIR/sd_available" + + local sd_count + sd_count=$(echo "$sd_list" | jq -r '.size // (.values | length)' 2>/dev/null) + echo "${sd_count:-0}" > "$CACHE_DIR/sd_count" + + echo "$sd_list" | jq -r '.values[]? | "\(.id)|\(.projectKey // "unknown")|\(.projectName // "unknown")"' \ + 2>/dev/null > "$CACHE_DIR/sd_list" + + local tmp_queues="$CACHE_DIR/sd_queues_all" + local tmp_request_types="$CACHE_DIR/sd_request_types_all" + local tmp_orgs="$CACHE_DIR/sd_orgs_all" + : > "$tmp_queues" + : > "$tmp_request_types" + : > "$tmp_orgs" + + while IFS='|' read -r sd_id sd_key _sd_name; do + [ -z "$sd_id" ] && continue + + local queues + queues=$(servicedesk_api "/servicedesk/${sd_id}/queue") + if [ -n "$queues" ] && echo "$queues" | jq -e '.values' >/dev/null 2>&1; then + echo "$queues" | jq -r --arg proj "$sd_key" ' + .values[]? 
| "\($proj)|\(.id)|\(.name // "unknown")|\(.issueCount // 0)" + ' 2>/dev/null >> "$tmp_queues" + fi + + local rtypes + rtypes=$(servicedesk_api "/servicedesk/${sd_id}/requesttype") + if [ -n "$rtypes" ] && echo "$rtypes" | jq -e '.values' >/dev/null 2>&1; then + echo "$rtypes" | jq -r --arg proj "$sd_key" ' + .values[]? | "\($proj)|\(.id)|\(.name // "unknown")" + ' 2>/dev/null >> "$tmp_request_types" + fi + + local orgs + orgs=$(servicedesk_api "/servicedesk/${sd_id}/organization") + if [ -n "$orgs" ] && echo "$orgs" | jq -e '.values' >/dev/null 2>&1; then + local org_count + org_count=$(echo "$orgs" | jq -r '.size // (.values | length)' 2>/dev/null) + echo "${sd_key}|${org_count:-0}" >> "$tmp_orgs" + fi + done < "$CACHE_DIR/sd_list" + + local pf + pf=$(get_project_filter) + local pf_and="" + [ -n "$pf" ] && pf_and="${pf} AND " + + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND resolution = Unresolved" \ + > "$CACHE_DIR/sd_open_requests" 2>/dev/null & + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND resolved >= -24h" \ + > "$CACHE_DIR/sd_resolved_24h" 2>/dev/null & + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND resolved >= -7d" \ + > "$CACHE_DIR/sd_resolved_7d" 2>/dev/null & + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND created >= -24h" \ + > "$CACHE_DIR/sd_created_24h" 2>/dev/null & + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND created >= -7d" \ + > "$CACHE_DIR/sd_created_7d" 2>/dev/null & + + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND priority in (Highest, High) AND resolution = Unresolved" \ + > "$CACHE_DIR/sd_high_priority_open" 2>/dev/null & + + jira_search_count "${pf_and}\"Customer Request Type\" is not EMPTY AND due < now() AND resolution = Unresolved" \ + > "$CACHE_DIR/sd_breached_sla" 2>/dev/null & + + wait + + local csat_result + csat_result=$(jira_search "${pf_and}\"Customer Request Type\" is not 
#######################################
# Read one cached value, with all whitespace removed.
# Globals:   CACHE_DIR (read)
# Arguments: $1 - cache key (file name inside CACHE_DIR)
#            $2 - value to use when the key is missing, empty or "null"
#                 (default: 0)
# Outputs:   the value on stdout; nothing on stderr for missing keys
#######################################
read_cache() {
    local key="$1" default="${2:-0}"
    local file="$CACHE_DIR/$key"
    local val=""

    # Test for the file first: a failing "< file" redirection is reported by
    # the shell before a later "2>/dev/null" takes effect, so every missing
    # key would otherwise print "No such file or directory".
    if [ -f "$file" ] && [ -r "$file" ]; then
        val=$(tr -d '[:space:]' 2>/dev/null < "$file")
    fi

    if [ -z "$val" ] || [ "$val" = "null" ]; then
        echo "$default"
    else
        echo "$val"
    fi
}
type_bug=$(read_cache "type_bug") + type_story=$(read_cache "type_story") + type_task=$(read_cache "type_task") + type_epic=$(read_cache "type_epic") + type_sub_task=$(read_cache "type_sub_task") + type_incident=$(read_cache "type_incident") + + local backlog_total overdue unassigned + backlog_total=$(read_cache "backlog_total") + overdue=$(read_cache "overdue") + unassigned=$(read_cache "unassigned") + + local created_24h resolved_24h created_7d resolved_7d created_30d resolved_30d + created_24h=$(read_cache "created_24h") + resolved_24h=$(read_cache "resolved_24h") + created_7d=$(read_cache "created_7d") + resolved_7d=$(read_cache "resolved_7d") + created_30d=$(read_cache "created_30d") + resolved_30d=$(read_cache "resolved_30d") + + local backlog_older_7d backlog_older_30d backlog_older_90d backlog_older_365d + backlog_older_7d=$(read_cache "backlog_older_7d") + backlog_older_30d=$(read_cache "backlog_older_30d") + backlog_older_90d=$(read_cache "backlog_older_90d") + backlog_older_365d=$(read_cache "backlog_older_365d") + + local high_priority_unresolved critical_bugs + high_priority_unresolved=$(read_cache "high_priority_unresolved") + critical_bugs=$(read_cache "critical_bugs") + + cat </dev/null) + if [ -n "$rt_data" ] && [ "$rt_data" != "null" ]; then + rt_avg=$(echo "$rt_data" | cut -d'|' -f1) + rt_min=$(echo "$rt_data" | cut -d'|' -f2) + rt_max=$(echo "$rt_data" | cut -d'|' -f3) + rt_count=$(echo "$rt_data" | cut -d'|' -f4) + fi + fi + cat <&2 + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4; charset=utf-8\r\n\r\n" + cache_all_jira_data + cache_project_data + cache_assignee_data + cache_component_data + cache_resolution_time_data + cache_login_data + cache_plugin_data + cache_servicedesk_data + generate_metrics + else + printf "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n" + echo "

Jira Exporter v${SCRIPT_VERSION}

# Run every Jira collector in the order the metric generator expects.
_jira_collect_all() {
    cache_all_jira_data
    cache_project_data
    cache_assignee_data
    cache_component_data
    cache_resolution_time_data
    cache_login_data
    cache_plugin_data
    cache_servicedesk_data
}

# Remove the temporary cache directory created during collection.
_jira_drop_cache() {
    if [ -n "${CACHE_DIR:-}" ] && [ -d "$CACHE_DIR" ]; then
        rm -rf -- "$CACHE_DIR"
    fi
    CACHE_DIR=""
}

#######################################
# Entry point: parse options, then serve over HTTP, write a textfile, or
# print metrics to stdout.
# Globals:   HTTP_MODE, OUTPUT_FILE (read); CACHE_DIR (reset after use)
# Arguments: the script's command-line arguments
# Outputs:   metrics on stdout, or written to OUTPUT_FILE (mode 644)
#######################################
main() {
    parse_args "$@"
    preflight_check

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
        return
    fi

    acquire_lock
    _jira_collect_all

    if [ -n "$OUTPUT_FILE" ]; then
        local out_dir temp_file
        out_dir=$(dirname "$OUTPUT_FILE")
        mkdir -p "$out_dir"

        # Stage the file next to its destination: a rename within one
        # filesystem is atomic, so a reader sees either the old or the new
        # file, never a missing or half-written one.
        temp_file=$(mktemp "${out_dir}/.jira_metrics.XXXXXX") || {
            echo "ERROR: cannot create temporary file in ${out_dir}" >&2
            _jira_drop_cache
            exit 1
        }

        generate_metrics > "$temp_file"

        # Set the final mode before publishing so the file is never visible
        # with mktemp's restrictive permissions.
        chmod 644 "$temp_file"
        mv -f "$temp_file" "$OUTPUT_FILE"
    else
        generate_metrics
    fi

    _jira_drop_cache
}
+# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - journalctl (systemd journal) +# - netcat (nc) for HTTP mode +# - Standard Unix tools (awk, sort, uniq, grep) +# +# Performance: +# Journal output is cached per priority+period combination — journalctl +# is called once per unique combo, not once per metric. Per-unit counts +# are extracted from a single cached 24h query. +# Typical run time: a few seconds even on busy systems. +# +# Usage: +# # Output to stdout +# ./journal-error-exporter.sh +# +# # HTTP server mode +# ./journal-error-exporter.sh --http -p 9201 +# +# # Textfile collector mode +# ./journal-error-exporter.sh --textfile +# +# Metrics Exported: +# Core Status: +# - journal_error_up - Exporter status (1=up, 0=down) +# - journal_error_exporter_info{version} - Exporter version +# +# Message Counts: +# - journal_error_messages_total{priority,period} - Messages per priority per period +# +# Per-Unit Breakdown (24h, top 20): +# - journal_error_unit_messages{unit,priority} - Per-unit message count by priority +# +# Top Offenders: +# - journal_error_top_unit_count{unit} - Top 20 units by err+crit+alert+emerg (24h) +# +# Rates: +# - journal_error_rate_per_hour{priority} - Average messages per hour (24h) +# +# Journal Health: +# - journal_error_journal_disk_usage_bytes - Journal disk usage +# - journal_error_exporter_duration_seconds - Script execution time +# - journal_error_exporter_last_run_timestamp - Last run timestamp +# +# Configuration: +# Default HTTP port: 9201 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9201 
#######################################
# Count journal entries of exactly one priority since a point in time.
# journalctl runs once per priority+period pair; later calls are served from
# the cache directory.
# Globals:   CACHE_DIR (read)
# Arguments: $1 - priority number (0-4)
#            $2 - period accepted by journalctl --since, e.g. "24 hours ago"
# Outputs:   the number of entries on stdout
#######################################
get_priority_count() {
    local priority_num="$1"
    local period="$2"
    local cache_file=""
    local count

    # Cache only when a cache directory exists; otherwise the file name would
    # resolve to "/priority_..." in the filesystem root.
    if [ -n "${CACHE_DIR:-}" ] && [ -d "$CACHE_DIR" ]; then
        cache_file="$CACHE_DIR/priority_${priority_num}_${period// /_}"
        if [ -f "$cache_file" ]; then
            cat "$cache_file"
            return
        fi
    fi

    # JSON output is one object per entry, so counting lines counts entries.
    # The default short format spreads a multi-line message over several
    # lines and would inflate the count.
    count=$(journalctl --priority="${priority_num}..${priority_num}" --since "$period" \
        --output=json --no-pager -q 2>/dev/null | wc -l)
    count=${count//[[:space:]]/}
    count=${count:-0}

    if [ -n "$cache_file" ]; then
        echo "$count" > "$cache_file"
    fi
    echo "$count"
}
# Print the unit/process identifier of every 24h journal entry at exactly
# priority $1, one per line. The extraction mirrors get_top_units so the
# names line up with its output.
_journal_units_for_priority() {
    journalctl --priority="${1}..${1}" --since "24 hours ago" --no-pager -q 2>/dev/null \
        | awk '{print $5}' | sed 's/\[.*//; s/://'
}

#######################################
# Count the 24h journal entries of one unit at one priority.
# The journal is scanned once per priority and the extracted identifiers are
# cached; every unit lookup after that is a search of the cached list.
# Globals:   CACHE_DIR (read)
# Arguments: $1 - unit name, $2 - priority number (0-4)
# Outputs:   the number of entries on stdout
#######################################
get_unit_priority_count() {
    local unit="$1"
    local priority_num="$2"
    local units_file
    local count

    if [ -n "${CACHE_DIR:-}" ] && [ -d "$CACHE_DIR" ]; then
        # Keyed by priority only, so the unit name never becomes part of a
        # file name (unit names may contain "/").
        units_file="$CACHE_DIR/units_p${priority_num}"
        if [ ! -f "$units_file" ]; then
            _journal_units_for_priority "$priority_num" > "$units_file"
        fi
        # -F: the unit name is literal text, not a regex; -x: whole line;
        # --: a name starting with "-" is not an option.
        count=$(grep -cxF -- "$unit" "$units_file" 2>/dev/null)
    else
        count=$(_journal_units_for_priority "$priority_num" | grep -cxF -- "$unit" 2>/dev/null)
    fi

    echo "${count:-0}"
}
+ # or similar with G/K/B suffixes + local size_str + size_str=$(echo "$output" | grep -oE '[0-9]+(\.[0-9]+)?[KMGTB]+' | head -1) + + if [ -z "$size_str" ]; then + echo "0" + return + fi + + local number suffix + number=$(echo "$size_str" | grep -oE '[0-9]+(\.[0-9]+)?') + suffix=$(echo "$size_str" | grep -oE '[KMGTB]+$') + + case "$suffix" in + B) awk "BEGIN {printf \"%.0f\", $number}" ;; + K) awk "BEGIN {printf \"%.0f\", $number * 1024}" ;; + M) awk "BEGIN {printf \"%.0f\", $number * 1024 * 1024}" ;; + G) awk "BEGIN {printf \"%.0f\", $number * 1024 * 1024 * 1024}" ;; + T) awk "BEGIN {printf \"%.0f\", $number * 1024 * 1024 * 1024 * 1024}" ;; + *) echo "0" ;; + esac +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Verify journalctl is available + if ! command -v journalctl >/dev/null 2>&1; then + cat </dev/null; then + rate=$(awk "BEGIN {printf \"%.2f\", $total / 24}" 2>/dev/null || echo "0.00") + else + rate="0.00" + fi + echo "journal_error_rate_per_hour{priority=\"$priority_label\"} $rate" + done + + echo "" + + # ======================================================================== + # Journal Health + # ======================================================================== + local disk_usage + disk_usage=$(get_journal_disk_usage) + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! 
command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Journal Error Exporter v1.0 + +

Journal Error Exporter v1.0

+

Metrics

+

Metric Categories

+
    +
  • Core Status: exporter up/down, version info
  • +
  • Message Counts: per-priority counts (emerg/alert/crit/err/warning) per period
  • +
  • Per-Unit Breakdown: per-unit message counts by priority (24h, top 20)
  • +
  • Top Offenders: top 20 units by error+ count (24h)
  • +
  • Rates: average messages per hour by priority (24h)
  • +
  • Journal Health: disk usage, exporter runtime
  • +
+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.journal_error_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must have content + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/k6-test-runner.sh b/k6-test-runner.sh new file mode 100644 index 0000000..ea5bf7a --- /dev/null +++ b/k6-test-runner.sh @@ -0,0 +1,559 @@ +#!/usr/bin/env bash + +######################################################################################### +#### k6-test-runner.sh — Run k6 load tests with Prometheus push and formatted #### +#### reports. 
Execute test scripts, push metrics, compare runs, threshold checks #### +#### Requires: bash 4+, k6 #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### ./k6-test-runner.sh --run ./tests/load.js #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Colors (pre-initialized) ───────────────────────────────────────── +RED="" GREEN="" YELLOW="" BLUE="" CYAN="" BOLD="" DIM="" RESET="" + +setup_colors() { + if [[ "${COLOR:-auto}" == "never" ]]; then + return + fi + if [[ "${COLOR:-auto}" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + DIM='\033[2m' + RESET='\033[0m' + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } +die() { err "$*"; exit 1; } + +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-22s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-22s${RESET} %b\n" "$1" "$2" +} + +elapsed() { + local end_time + end_time=$(date +%s) + echo "$(( end_time - START_TIME ))s" +} + +# ── Defaults ────────────────────────────────────────────────────────── +RUN_MODE="" +TEST_SCRIPT="" +K6_PATH="${K6_PATH:-k6}" +VUS="${K6_VUS:-10}" +DURATION="${K6_DURATION:-30s}" +PUSH_GW="${K6_PUSH_GATEWAY:-}" +RESULTS_DIR="${K6_RESULTS_DIR:-./k6-results}" +OUTPUT_FORMAT="${K6_FORMAT:-text}" +THRESHOLDS_FILE="" +COMPARE_RUN="" +LIST_DIR="" +INSPECT_SCRIPT="" +VERBOSE="${VERBOSE:-false}" 
+COLOR="${COLOR:-auto}" +declare -a ENV_VARS=() +TAGS="${K6_TAGS:-}" + +# ── State ───────────────────────────────────────────────────────────── +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME +START_TIME="" +RUN_ID="" + +# ── Dependency checks ──────────────────────────────────────────────── +require_k6() { + if ! command -v "$K6_PATH" &>/dev/null; then + die "k6 not found: ${K6_PATH}. Install from https://k6.io/docs/get-started/installation/" + fi + verbose "k6 found: $(command -v "$K6_PATH") ($("$K6_PATH" version 2>/dev/null | head -1))" +} + +require_jq() { + if ! command -v jq &>/dev/null; then + die "jq is required for result parsing" + fi +} + +# ── Format helpers ─────────────────────────────────────────────────── +format_duration_ms() { + local ms="$1" + if command -v awk &>/dev/null; then + if awk "BEGIN{exit ($ms >= 1000) ? 0 : 1}" 2>/dev/null; then + awk "BEGIN{printf \"%.2fs\", $ms / 1000}" + else + awk "BEGIN{printf \"%.1fms\", $ms}" + fi + else + echo "${ms}ms" + fi +} + +format_rate() { + local rate="$1" + if command -v awk &>/dev/null; then + awk "BEGIN{printf \"%.1f/s\", $rate}" + else + echo "${rate}/s" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# RUN MODE +# ══════════════════════════════════════════════════════════════════════ +do_run() { + [[ -z "$TEST_SCRIPT" ]] && die "No test script specified" + [[ ! 
-f "$TEST_SCRIPT" ]] && die "Test script not found: ${TEST_SCRIPT}" + require_k6 + + RUN_ID="$(date +%Y%m%d-%H%M%S)" + local run_dir="${RESULTS_DIR}/${RUN_ID}" + mkdir -p "$run_dir" + + section_header "k6 Load Test" + field "Test script:" "$TEST_SCRIPT" + field "Virtual users:" "$VUS" + field "Duration:" "$DURATION" + field "Run ID:" "$RUN_ID" + [[ -n "$PUSH_GW" ]] && field "Push gateway:" "$PUSH_GW" + echo "" + + # Build k6 command + local -a k6_args=("run") + k6_args+=("--vus" "$VUS") + k6_args+=("--duration" "$DURATION") + k6_args+=("--summary-export" "${run_dir}/summary.json") + k6_args+=("--out" "json=${run_dir}/results.json") + + if [[ -n "$PUSH_GW" ]]; then + k6_args+=("--out" "experimental-prometheus-rw=${PUSH_GW}") + fi + + if [[ -n "$THRESHOLDS_FILE" && -f "$THRESHOLDS_FILE" ]]; then + k6_args+=("--config" "$THRESHOLDS_FILE") + fi + + for ev in "${ENV_VARS[@]+"${ENV_VARS[@]}"}"; do + k6_args+=("-e" "$ev") + done + + if [[ -n "$TAGS" ]]; then + k6_args+=("--tag" "$TAGS") + fi + + k6_args+=("$TEST_SCRIPT") + + verbose "Command: ${K6_PATH} ${k6_args[*]}" + + # Save run metadata + cat > "${run_dir}/metadata.json" <&1 | tee "${run_dir}/output.log" || exit_code=$? + + echo "" + + # Parse and display results + if [[ -f "${run_dir}/summary.json" ]]; then + display_results "${run_dir}/summary.json" + else + warn "No summary file generated" + fi + + section_header "Run Summary" + field "Run ID:" "$RUN_ID" + field "Results:" "$run_dir" + field "Duration:" "$(elapsed)" + + if [[ $exit_code -ne 0 ]]; then + field_color "Status:" "${RED}FAILED (exit ${exit_code})${RESET}" + else + field_color "Status:" "${GREEN}PASSED${RESET}" + fi + + return "$exit_code" +} + +display_results() { + local summary_file="$1" + + if ! 
command -v jq &>/dev/null; then + warn "jq not available — skipping detailed results" + return + fi + + section_header "Test Results" + + local http_reqs http_req_dur_avg http_req_dur_p95 http_req_dur_p99 + local http_req_failed iterations data_received data_sent + + http_reqs=$(jq -r '.metrics.http_reqs.values.count // 0' "$summary_file" 2>/dev/null || echo 0) + http_req_dur_avg=$(jq -r '.metrics.http_req_duration.values.avg // 0' "$summary_file" 2>/dev/null || echo 0) + http_req_dur_p95=$(jq -r '.metrics.http_req_duration.values["p(95)"] // 0' "$summary_file" 2>/dev/null || echo 0) + http_req_dur_p99=$(jq -r '.metrics.http_req_duration.values["p(99)"] // 0' "$summary_file" 2>/dev/null || echo 0) + http_req_failed=$(jq -r '.metrics.http_req_failed.values.rate // 0' "$summary_file" 2>/dev/null || echo 0) + iterations=$(jq -r '.metrics.iterations.values.count // 0' "$summary_file" 2>/dev/null || echo 0) + data_received=$(jq -r '.metrics.data_received.values.count // 0' "$summary_file" 2>/dev/null || echo 0) + data_sent=$(jq -r '.metrics.data_sent.values.count // 0' "$summary_file" 2>/dev/null || echo 0) + + local iter_rate + iter_rate=$(jq -r '.metrics.iterations.values.rate // 0' "$summary_file" 2>/dev/null || echo 0) + + printf " ${BOLD}%-28s %s${RESET}\n" "METRIC" "VALUE" + printf " %s\n" "$(printf '%.0s─' {1..50})" + printf " %-28s %s\n" "HTTP Requests" "$http_reqs" + printf " %-28s %s\n" "Request Duration (avg)" "$(format_duration_ms "$http_req_dur_avg")" + printf " %-28s %s\n" "Request Duration (p95)" "$(format_duration_ms "$http_req_dur_p95")" + printf " %-28s %s\n" "Request Duration (p99)" "$(format_duration_ms "$http_req_dur_p99")" + + local fail_pct + fail_pct=$(awk "BEGIN{printf \"%.2f\", $http_req_failed * 100}" 2>/dev/null || echo "0") + if awk "BEGIN{exit ($http_req_failed > 0) ? 
0 : 1}" 2>/dev/null; then + printf " %-28s ${RED}%s%%${RESET}\n" "Failed Requests" "$fail_pct" + else + printf " %-28s ${GREEN}%s%%${RESET}\n" "Failed Requests" "$fail_pct" + fi + + printf " %-28s %s\n" "Iterations" "$iterations" + printf " %-28s %s\n" "Iteration Rate" "$(format_rate "$iter_rate")" + printf " %-28s %s\n" "Data Received" "$(numfmt --to=iec "$data_received" 2>/dev/null || echo "${data_received} B")" + printf " %-28s %s\n" "Data Sent" "$(numfmt --to=iec "$data_sent" 2>/dev/null || echo "${data_sent} B")" +} + +# ══════════════════════════════════════════════════════════════════════ +# LIST MODE +# ══════════════════════════════════════════════════════════════════════ +do_list() { + local dir="${LIST_DIR:-.}" + [[ ! -d "$dir" ]] && die "Directory not found: ${dir}" + + section_header "Available Test Scripts" + + local count=0 + printf " ${BOLD}%-40s %10s %s${RESET}\n" "SCRIPT" "SIZE" "MODIFIED" + printf " %s\n" "$(printf '%.0s─' {1..65})" + + while IFS= read -r -d '' f; do + local name size modified + name=$(basename "$f") + size=$(stat --printf="%s" "$f" 2>/dev/null || stat -f%z "$f" 2>/dev/null || echo 0) + modified=$(stat --printf="%y" "$f" 2>/dev/null | cut -d' ' -f1 || stat -f"%Sm" -t "%Y-%m-%d" "$f" 2>/dev/null || echo "unknown") + local human_size + human_size=$(numfmt --to=iec "$size" 2>/dev/null || echo "${size}B") + printf " %-40s %10s %s\n" "${name:0:38}" "$human_size" "$modified" + ((count++)) || true + done < <(find "$dir" -maxdepth 2 -name '*.js' -type f -print0 2>/dev/null | sort -z) + + echo "" + field "Total scripts:" "$count" + + if [[ "$count" -eq 0 ]]; then + warn "No .js test scripts found in ${dir}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# COMPARE MODE +# ══════════════════════════════════════════════════════════════════════ +do_compare() { + [[ -z "$COMPARE_RUN" ]] && die "No run ID specified for comparison" + require_jq + + local 
prev_summary="${RESULTS_DIR}/${COMPARE_RUN}/summary.json" + [[ ! -f "$prev_summary" ]] && die "Previous run not found: ${prev_summary}" + + # Find most recent run (excluding the comparison target) + local latest_dir="" + while IFS= read -r d; do + local base + base=$(basename "$d") + if [[ "$base" != "$COMPARE_RUN" && -f "${d}/summary.json" ]]; then + latest_dir="$d" + break + fi + done < <(find "$RESULTS_DIR" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort -r) + + if [[ -z "$latest_dir" ]]; then + die "No other runs found to compare against" + fi + + local curr_summary="${latest_dir}/summary.json" + local curr_id + curr_id=$(basename "$latest_dir") + + section_header "Run Comparison" + field "Current run:" "$curr_id" + field "Previous run:" "$COMPARE_RUN" + echo "" + + printf " ${BOLD}%-24s %14s %14s %10s${RESET}\n" "METRIC" "CURRENT" "PREVIOUS" "DELTA" + printf " %s\n" "$(printf '%.0s─' {1..66})" + + local metrics=("http_reqs.values.count" "http_req_duration.values.avg" "http_req_duration.values[\"p(95)\"]" "iterations.values.count") + local labels=("HTTP Requests" "Duration (avg ms)" "Duration (p95 ms)" "Iterations") + + for i in "${!metrics[@]}"; do + local metric="${metrics[$i]}" + local label="${labels[$i]}" + local curr_val prev_val + curr_val=$(jq -r ".metrics.${metric} // 0" "$curr_summary" 2>/dev/null || echo 0) + prev_val=$(jq -r ".metrics.${metric} // 0" "$prev_summary" 2>/dev/null || echo 0) + + local delta delta_pct color + delta=$(awk "BEGIN{printf \"%.1f\", $curr_val - $prev_val}" 2>/dev/null || echo "0") + if awk "BEGIN{exit ($prev_val > 0) ? 0 : 1}" 2>/dev/null; then + delta_pct=$(awk "BEGIN{printf \"%.1f%%\", (($curr_val - $prev_val) / $prev_val) * 100}" 2>/dev/null || echo "0%") + else + delta_pct="N/A" + fi + + color="$RESET" + if awk "BEGIN{exit ($delta > 0) ? 0 : 1}" 2>/dev/null; then + color="$RED" + elif awk "BEGIN{exit ($delta < 0) ? 
0 : 1}" 2>/dev/null; then + color="$GREEN" + fi + + printf " %-24s %14s %14s ${color}%10s${RESET}\n" "$label" \ + "$(awk "BEGIN{printf \"%.1f\", $curr_val}")" \ + "$(awk "BEGIN{printf \"%.1f\", $prev_val}")" \ + "$delta_pct" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# REPORT MODE +# ══════════════════════════════════════════════════════════════════════ +do_report() { + [[ ! -d "$RESULTS_DIR" ]] && die "Results directory not found: ${RESULTS_DIR}" + + section_header "Test Run History" + + printf " ${BOLD}%-20s %-30s %8s %12s %s${RESET}\n" "RUN ID" "SCRIPT" "VUS" "DURATION" "STATUS" + printf " %s\n" "$(printf '%.0s─' {1..80})" + + local count=0 + while IFS= read -r d; do + local meta="${d}/metadata.json" + [[ ! -f "$meta" ]] && continue + + local run_id script vus duration + run_id=$(basename "$d") + if command -v jq &>/dev/null; then + script=$(jq -r '.script // "unknown"' "$meta" 2>/dev/null || echo "unknown") + vus=$(jq -r '.vus // "?"' "$meta" 2>/dev/null || echo "?") + duration=$(jq -r '.duration // "?"' "$meta" 2>/dev/null || echo "?") + else + script="(jq required)" + vus="?" + duration="?" + fi + + local status_icon="${GREEN}✓${RESET}" + if [[ -f "${d}/summary.json" ]]; then + local fail_rate + fail_rate=$(jq -r '.metrics.http_req_failed.values.rate // 0' "${d}/summary.json" 2>/dev/null || echo 0) + if awk "BEGIN{exit ($fail_rate > 0) ? 
0 : 1}" 2>/dev/null; then + status_icon="${RED}✗${RESET}" + fi + else + status_icon="${YELLOW}?${RESET}" + fi + + printf " %-20s %-30s %8s %12s %b\n" "$run_id" "${script:0:28}" "$vus" "$duration" "$status_icon" + ((count++)) || true + done < <(find "$RESULTS_DIR" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort -r) + + echo "" + field "Total runs:" "$count" + + if [[ "$count" -eq 0 ]]; then + warn "No test runs found in ${RESULTS_DIR}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# INSPECT MODE +# ══════════════════════════════════════════════════════════════════════ +do_inspect() { + [[ -z "$INSPECT_SCRIPT" ]] && die "No test script specified" + [[ ! -f "$INSPECT_SCRIPT" ]] && die "Test script not found: ${INSPECT_SCRIPT}" + + section_header "Test Inspection" + field "Script:" "$INSPECT_SCRIPT" + field "Size:" "$(stat --printf="%s" "$INSPECT_SCRIPT" 2>/dev/null || stat -f%z "$INSPECT_SCRIPT" 2>/dev/null || echo unknown) bytes" + echo "" + + log "k6 command that would execute:" + echo "" + echo -e " ${DIM}${K6_PATH} run \\" + echo -e " --vus ${VUS} \\" + echo -e " --duration ${DURATION} \\" + [[ -n "$PUSH_GW" ]] && echo -e " --out experimental-prometheus-rw=${PUSH_GW} \\" + [[ -n "$THRESHOLDS_FILE" ]] && echo -e " --config ${THRESHOLDS_FILE} \\" + for ev in "${ENV_VARS[@]+"${ENV_VARS[@]}"}"; do + echo -e " -e ${ev} \\" + done + echo -e " ${INSPECT_SCRIPT}${RESET}" + + echo "" + if command -v "$K6_PATH" &>/dev/null; then + field_color "k6 version:" "${GREEN}$("$K6_PATH" version 2>/dev/null | head -1)${RESET}" + else + field_color "k6 status:" "${RED}not installed${RESET}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ +show_help() { + cat <&2; exit 1 ;; + esac + done +} + +get_keepalived_pid() { + local pid="" + if [ -f /run/keepalived.pid ]; then + pid=$(cat /run/keepalived.pid 2>/dev/null) + elif [ -f 
/var/run/keepalived.pid ]; then + pid=$(cat /var/run/keepalived.pid 2>/dev/null) + fi + if [ -z "$pid" ] || ! kill -0 "$pid" 2>/dev/null; then + pid=$(pgrep -f "keepalived" | head -1) + fi + echo "$pid" +} + +check_keepalived() { + if pgrep keepalived >/dev/null 2>&1; then + return 0 + fi + if systemctl is-active keepalived >/dev/null 2>&1; then + return 0 + fi + return 1 +} + +map_state() { + case "$1" in + MASTER) echo 2 ;; + BACKUP) echo 1 ;; + FAULT) echo 0 ;; + INIT) echo 0 ;; + *) echo 0 ;; + esac +} + +dump_keepalived_data() { + local pid + pid=$(get_keepalived_pid) + [ -z "$pid" ] && return 1 + kill -USR1 "$pid" 2>/dev/null || return 1 + sleep "$SIGNAL_WAIT" + [ -f "$DATA_FILE" ] || return 1 + return 0 +} + +dump_keepalived_stats() { + local pid + pid=$(get_keepalived_pid) + [ -z "$pid" ] && return 1 + kill -USR2 "$pid" 2>/dev/null || return 1 + sleep "$SIGNAL_WAIT" + [ -f "$STATS_FILE" ] || return 1 + return 0 +} + +check_vip_assigned() { + local vip="$1" + local ip_only="${vip%%/*}" + if ip addr show 2>/dev/null | grep -qw "$ip_only"; then + echo 1 + else + echo 0 + fi +} + +# ────────────────────────────────────────────────────────────────────────────── +# METRIC GENERATION +# ────────────────────────────────────────────────────────────────────────────── + +parse_data_file() { + [ -f "$DATA_FILE" ] || return + local current_instance="" + local in_vip_block=false + + while IFS= read -r line; do + if [[ "$line" =~ ^[[:space:]]*VRRP\ Instance:\ (.+)$ ]]; then + current_instance="${BASH_REMATCH[1]}" + current_instance=$(echo "$current_instance" | xargs) + in_vip_block=false + continue + fi + [ -z "$current_instance" ] && continue + + if [[ "$line" =~ ^[[:space:]]*State\ =\ (.+)$ ]]; then + local state="${BASH_REMATCH[1]}" + state=$(echo "$state" | xargs) + local state_val + state_val=$(map_state "$state") + echo "keepalived_vrrp_state{instance=\"$current_instance\"} $state_val" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Wantstate\ =\ (.+)$ ]]; 
then + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Last\ transition\ =\ ([0-9]+) ]]; then + local ts="${BASH_REMATCH[1]}" + echo "keepalived_vrrp_last_transition_timestamp{instance=\"$current_instance\"} $ts" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Base\ priority\ =\ ([0-9]+) ]]; then + echo "keepalived_vrrp_priority{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Effective\ priority\ =\ ([0-9]+) ]]; then + echo "keepalived_vrrp_effective_priority{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Advert\ interval\ =\ ([0-9]+) ]]; then + echo "keepalived_vrrp_advert_interval{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Virtual\ Router\ ID\ =\ ([0-9]+) ]]; then + echo "keepalived_vrrp_virtual_router_id{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + in_vip_block=false + elif [[ "$line" =~ ^[[:space:]]*Virtual\ IP ]]; then + in_vip_block=true + elif $in_vip_block && [[ "$line" =~ ^[[:space:]]+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(/[0-9]+)?) ]]; then + local vip="${BASH_REMATCH[1]}" + vip=$(echo "$vip" | xargs) + local assigned + assigned=$(check_vip_assigned "$vip") + echo "keepalived_vrrp_vip_status{instance=\"$current_instance\",vip=\"$vip\"} $assigned" + elif $in_vip_block && [[ ! 
"$line" =~ ^[[:space:]] ]]; then + in_vip_block=false + fi + done < "$DATA_FILE" +} + +parse_stats_file() { + [ -f "$STATS_FILE" ] || return + local current_instance="" + local in_adverts=false + + while IFS= read -r line; do + if [[ "$line" =~ ^[[:space:]]*VRRP\ Instance:\ (.+)$ ]]; then + current_instance="${BASH_REMATCH[1]}" + current_instance=$(echo "$current_instance" | xargs) + in_adverts=false + continue + fi + [ -z "$current_instance" ] && continue + + if [[ "$line" =~ ^[[:space:]]*Advertisements: ]]; then + in_adverts=true + elif $in_adverts && [[ "$line" =~ ^[[:space:]]*Received:\ ([0-9]+) ]]; then + echo "keepalived_vrrp_adverts_received_total{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + elif $in_adverts && [[ "$line" =~ ^[[:space:]]*Sent:\ ([0-9]+) ]]; then + echo "keepalived_vrrp_adverts_sent_total{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + in_adverts=false + elif [[ "$line" =~ ^[[:space:]]*Became\ master:\ ([0-9]+) ]]; then + echo "keepalived_vrrp_became_master_total{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + elif [[ "$line" =~ ^[[:space:]]*Released\ master:\ ([0-9]+) ]]; then + echo "keepalived_vrrp_released_master_total{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + elif [[ "$line" =~ ^[[:space:]]*Gratuitous\ ARP:\ ([0-9]+) ]]; then + echo "keepalived_vrrp_garp_sent_total{instance=\"$current_instance\"} ${BASH_REMATCH[1]}" + fi + done < "$STATS_FILE" +} + +generate_metrics() { + local script_start + script_start=$(date +%s) + + if ! 
check_keepalived; then + echo "# HELP keepalived_up Keepalived exporter status" + echo "# TYPE keepalived_up gauge" + echo "keepalived_up 0" + echo "" + echo "# HELP keepalived_process_running Keepalived process status (1=running, 0=stopped)" + echo "# TYPE keepalived_process_running gauge" + echo "keepalived_process_running 0" + return + fi + + echo "# HELP keepalived_up Keepalived exporter status" + echo "# TYPE keepalived_up gauge" + echo "keepalived_up 1" + echo "" + echo "# HELP keepalived_process_running Keepalived process status (1=running, 0=stopped)" + echo "# TYPE keepalived_process_running gauge" + echo "keepalived_process_running 1" + echo "" + + # Signal keepalived to dump data and stats + local has_data=false + local has_stats=false + if dump_keepalived_data; then + has_data=true + fi + if dump_keepalived_stats; then + has_stats=true + fi + + if ! $has_data && ! $has_stats; then + echo "# No VRRP data available — could not signal keepalived or read dump files" + echo "" + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + cat <&2 + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + echo "Keepalived Exporter

Keepalived Prometheus Exporter

Metrics

" + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ────────────────────────────────────────────────────────────────────────────── +# MAIN +# ────────────────────────────────────────────────────────────────────────────── + +main() { + parse_args "$@" + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + local temp_file + temp_file=$(mktemp "${output_dir}/.keepalived_metrics.XXXXXX") + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/kubernetes-smoke-tests.sh b/kubernetes-smoke-tests.sh new file mode 100644 index 0000000..f3f7c4b --- /dev/null +++ b/kubernetes-smoke-tests.sh @@ -0,0 +1,565 @@ +#!/usr/bin/env bash + +##################################################################################### +#### kubernetes-smoke-tests.sh — Verify Kubernetes cluster health #### +#### Checks API server, etcd, CoreDNS, scheduling, services, PVC, RBAC, certs. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./kubernetes-smoke-tests.sh #### +#### KUBECONFIG=/path/to/kubeconfig ./kubernetes-smoke-tests.sh #### +#### #### +#### See --help for all options. 
#### +##################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +TEST_NAMESPACE="${TEST_NAMESPACE:-smoke-test-$$}" +TEST_IMAGE="${TEST_IMAGE:-busybox:latest}" +SKIP_PVC="${SKIP_PVC:-false}" +SKIP_NETPOL="${SKIP_NETPOL:-false}" +SKIP_SCHEDULING="${SKIP_SCHEDULING:-false}" +POD_TIMEOUT="${POD_TIMEOUT:-60}" +STORAGE_CLASS="${STORAGE_CLASS:-}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0 +FAIL=0 +SKIP=0 +TOTAL=0 +RESULTS=() +START_TIME=$(date +%s) +NS_CREATED="false" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Helpers ─────────────────────────────────────────────────────────── +log() { [[ "$VERBOSE" == "true" ]] && echo -e "${BLUE}# $*${RESET}" >&2 || true; } + +pass() { + ((TOTAL++)) || true + ((PASS++)) || true + RESULTS+=("ok $TOTAL - $1") + [[ "$OUTPUT_FORMAT" == "text" ]] && echo -e "${GREEN} PASS${RESET} $1" >&2 || true +} + +fail() { + ((TOTAL++)) || true + ((FAIL++)) || true + RESULTS+=("not ok $TOTAL - $1") + [[ "$OUTPUT_FORMAT" == "text" ]] && echo -e "${RED} FAIL${RESET} $1" >&2 || true +} + +skip() { + ((TOTAL++)) || true + ((SKIP++)) || true + RESULTS+=("ok $TOTAL - $1 # SKIP $2") + [[ "$OUTPUT_FORMAT" == "text" ]] && echo -e "${YELLOW} SKIP${RESET} $1 ($2)" >&2 || true +} + +wait_for_pod() { + local name="$1" ns="$2" timeout="$3" + local deadline=$(($(date +%s) + timeout)) + while [[ $(date +%s) -lt $deadline 
]]; do + local phase + phase=$(kubectl get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [[ "$phase" == "Running" ]]; then + return 0 + elif [[ "$phase" == "Failed" || "$phase" == "Error" ]]; then + return 1 + fi + sleep 2 + done + return 1 +} + +wait_for_pvc() { + local name="$1" ns="$2" timeout="$3" + local deadline=$(($(date +%s) + timeout)) + while [[ $(date +%s) -lt $deadline ]]; do + local phase + phase=$(kubectl get pvc "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [[ "$phase" == "Bound" ]]; then + return 0 + fi + sleep 2 + done + return 1 +} + +wait_for_endpoint() { + local name="$1" ns="$2" timeout="$3" + local deadline=$(($(date +%s) + timeout)) + while [[ $(date +%s) -lt $deadline ]]; do + local addrs + addrs=$(kubectl get endpoints "$name" -n "$ns" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || echo "") + if [[ -n "$addrs" ]]; then + return 0 + fi + sleep 2 + done + return 1 +} + +# ── Cleanup ─────────────────────────────────────────────────────────── +cleanup() { + if [[ "$NS_CREATED" == "true" ]]; then + log "Cleaning up namespace $TEST_NAMESPACE" + kubectl delete namespace "$TEST_NAMESPACE" --ignore-not-found --wait=false &>/dev/null || true + fi +} +trap cleanup EXIT + +# ── Help ────────────────────────────────────────────────────────────── +show_help() { + cat <<'HELP' +kubernetes-smoke-tests.sh — Verify Kubernetes cluster health + +Environment variables: + KUBECONFIG Path to kubeconfig (default: kubectl default) + TEST_NAMESPACE Namespace for test resources (default: smoke-test-) + TEST_IMAGE Image for test pods (default: busybox:latest) + SKIP_PVC Skip PVC test (default: false) + SKIP_NETPOL Skip network policy test (default: false) + SKIP_SCHEDULING Skip pod scheduling test (default: false) + POD_TIMEOUT Seconds to wait for pods (default: 60) + STORAGE_CLASS StorageClass for PVC test (default: cluster default) + OUTPUT_FORMAT Output format: text, tap (default: text) + 
COLOR Color output: auto, always, never (default: auto) + VERBOSE Verbose logging: true/false (default: false) + +Exit codes: + 0 All tests passed + 1 One or more tests failed + 2 Script error +HELP + exit 0 +} + +[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && show_help + +# ── Preflight ───────────────────────────────────────────────────────── +setup_colors + +if ! command -v kubectl &>/dev/null; then + echo "ERROR: kubectl not found" >&2 + exit 2 +fi + +[[ "$OUTPUT_FORMAT" == "text" ]] && echo -e "${BOLD}Kubernetes Smoke Tests${RESET}" >&2 +[[ "$OUTPUT_FORMAT" == "text" ]] && echo -e "Namespace: ${BLUE}$TEST_NAMESPACE${RESET}" >&2 +[[ "$OUTPUT_FORMAT" == "text" ]] && echo "" >&2 + +# ── Tests ───────────────────────────────────────────────────────────── + +test_api_server() { + log "Checking API server" + if kubectl cluster-info &>/dev/null; then + pass "API server reachable" + else + fail "API server unreachable" + fi +} + +test_nodes_ready() { + log "Checking node readiness" + local not_ready + not_ready=$(kubectl get nodes --no-headers 2>/dev/null | grep -v " Ready " | wc -l) + local total + total=$(kubectl get nodes --no-headers 2>/dev/null | wc -l) + if [[ "$not_ready" -eq 0 && "$total" -gt 0 ]]; then + pass "all $total nodes Ready" + elif [[ "$total" -eq 0 ]]; then + fail "no nodes found" + else + fail "$not_ready of $total nodes not Ready" + fi +} + +test_control_plane() { + log "Checking control plane pods" + local components=("kube-apiserver" "kube-controller-manager" "kube-scheduler" "etcd") + local all_ok=true + local missing=() + for comp in "${components[@]}"; do + local count + count=$(kubectl get pods -n kube-system -l "component=$comp" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + if [[ "$count" -eq 0 ]]; then + # Try by name prefix for managed clusters + count=$(kubectl get pods -n kube-system --no-headers 2>/dev/null | grep "^${comp}" | grep -c "Running" || true) + fi + if [[ "$count" -eq 0 ]]; then + all_ok=false 
+ missing+=("$comp") + fi + done + if [[ "$all_ok" == "true" ]]; then + pass "control plane pods healthy" + else + fail "control plane pods missing: ${missing[*]}" + fi +} + +test_coredns_running() { + log "Checking CoreDNS" + local running + running=$(kubectl get pods -n kube-system -l k8s-app=kube-dns --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) + if [[ "$running" -gt 0 ]]; then + pass "CoreDNS running ($running pods)" + else + fail "CoreDNS not running" + fi +} + +test_dns_resolution() { + log "Testing DNS resolution inside cluster" + if [[ "$SKIP_SCHEDULING" == "true" ]]; then + skip "DNS resolution" "SKIP_SCHEDULING=true" + return + fi + + # Create namespace if needed + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + kubectl run smoke-dns-$$ \ + --namespace="$TEST_NAMESPACE" \ + --image="$TEST_IMAGE" \ + --restart=Never \ + --command -- sleep 300 &>/dev/null 2>&1 || true + + if wait_for_pod "smoke-dns-$$" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then + if kubectl exec "smoke-dns-$$" -n "$TEST_NAMESPACE" -- \ + nslookup kubernetes.default.svc.cluster.local &>/dev/null 2>&1; then + pass "DNS resolution working (kubernetes.default)" + else + fail "DNS resolution failed inside pod" + fi + else + fail "DNS test pod did not reach Running state" + fi + kubectl delete pod "smoke-dns-$$" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true +} + +test_pod_scheduling() { + if [[ "$SKIP_SCHEDULING" == "true" ]]; then + skip "pod scheduling" "SKIP_SCHEDULING=true" + return + fi + log "Testing pod scheduling" + + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + kubectl run smoke-sched-$$ \ + --namespace="$TEST_NAMESPACE" \ + --image="$TEST_IMAGE" \ + --restart=Never \ + --command -- sleep 10 &>/dev/null 2>&1 || true + + if wait_for_pod "smoke-sched-$$" "$TEST_NAMESPACE" 
"$POD_TIMEOUT"; then + pass "pod scheduling working" + else + fail "pod did not reach Running within ${POD_TIMEOUT}s" + fi + kubectl delete pod "smoke-sched-$$" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true +} + +test_service_endpoint() { + if [[ "$SKIP_SCHEDULING" == "true" ]]; then + skip "service endpoint" "SKIP_SCHEDULING=true" + return + fi + log "Testing service endpoint creation" + + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + # Create deployment + kubectl create deployment smoke-svc-$$ \ + --namespace="$TEST_NAMESPACE" \ + --image="$TEST_IMAGE" \ + -- sleep 300 &>/dev/null 2>&1 || true + + # Expose as service + kubectl expose deployment "smoke-svc-$$" \ + --namespace="$TEST_NAMESPACE" \ + --port=80 --target-port=80 &>/dev/null 2>&1 || true + + # Wait for endpoint + if wait_for_endpoint "smoke-svc-$$" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then + pass "service endpoint has addresses" + else + fail "service endpoint has no addresses" + fi + + kubectl delete deployment "smoke-svc-$$" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true + kubectl delete service "smoke-svc-$$" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true +} + +test_pvc_provisioning() { + if [[ "$SKIP_PVC" == "true" ]]; then + skip "PVC provisioning" "SKIP_PVC=true" + return + fi + log "Testing PVC provisioning" + + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + local sc_spec="" + if [[ -n "$STORAGE_CLASS" ]]; then + sc_spec="storageClassName: $STORAGE_CLASS" + fi + + cat </dev/null +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: smoke-pvc-$$ +spec: + accessModes: [ReadWriteOnce] + ${sc_spec} + resources: + requests: + storage: 1Gi +PVC_EOF + + if wait_for_pvc "smoke-pvc-$$" "$TEST_NAMESPACE" "$POD_TIMEOUT"; then + pass "PVC provisioned and Bound" + else + fail "PVC did not 
reach Bound state" + fi + kubectl delete pvc "smoke-pvc-$$" -n "$TEST_NAMESPACE" --ignore-not-found &>/dev/null || true +} + +test_rbac() { + log "Testing RBAC" + + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + # Create ServiceAccount + kubectl create serviceaccount "smoke-sa-$$" \ + -n "$TEST_NAMESPACE" &>/dev/null 2>&1 || true + + # Create Role + cat </dev/null +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: smoke-role-$$ +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +ROLE_EOF + + # Create RoleBinding + cat </dev/null +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: smoke-rb-$$ +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: smoke-role-$$ +subjects: + - kind: ServiceAccount + name: smoke-sa-$$ + namespace: $TEST_NAMESPACE +RB_EOF + + # Verify RBAC + if kubectl auth can-i get pods \ + --namespace="$TEST_NAMESPACE" \ + --as="system:serviceaccount:${TEST_NAMESPACE}:smoke-sa-$$" &>/dev/null 2>&1; then + pass "RBAC working (ServiceAccount can get pods)" + else + fail "RBAC check failed" + fi +} + +test_network_policy() { + if [[ "$SKIP_NETPOL" == "true" ]]; then + skip "network policy" "SKIP_NETPOL=true" + return + fi + if [[ "$SKIP_SCHEDULING" == "true" ]]; then + skip "network policy" "SKIP_SCHEDULING=true" + return + fi + log "Testing network policy enforcement" + + if [[ "$NS_CREATED" != "true" ]]; then + kubectl create namespace "$TEST_NAMESPACE" &>/dev/null || true + NS_CREATED="true" + fi + + # Create a target pod + kubectl run smoke-netpol-target-$$ \ + --namespace="$TEST_NAMESPACE" \ + --image="$TEST_IMAGE" \ + --restart=Never \ + --command -- sleep 300 &>/dev/null 2>&1 || true + + if ! 
#######################################
# Smoke test: the API server's TLS certificate has at least 30 days left.
#
# The endpoint is taken from the current kubeconfig context. Host and port
# are split with parameter expansion: the previous `grep -o ':[0-9]*'` also
# matched the colon of "https:", which produced a port with an embedded
# newline and made every run end in "could not read certificate".
#
# Globals:   none
# Arguments: none
# Outputs:   reports the result through log/pass/fail/skip
# Returns:   status of the helper that reported the result
#######################################
test_cert_expiry() {
    log "Checking API server certificate expiry"

    local api_server
    api_server=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null || echo "")

    if [[ -z "$api_server" ]]; then
        skip "certificate expiry" "cannot determine API server address"
        return
    fi

    # scheme://host[:port][/path]  ->  host, port
    local endpoint host port
    endpoint="${api_server#*://}"   # drop the scheme, if any
    endpoint="${endpoint%%/*}"      # drop any URL path (proxied/managed clusters)
    if [[ "$endpoint" == *:* && "$endpoint" != *\] ]]; then
        host="${endpoint%:*}"       # split on the LAST colon so "[::1]:6443" works
        port="${endpoint##*:}"
    else
        host="$endpoint"
        port=""
    fi
    [[ "$port" =~ ^[0-9]+$ ]] || port=443

    if ! command -v openssl &>/dev/null; then
        skip "certificate expiry" "openssl not installed"
        return
    fi

    # stdin from /dev/null makes s_client exit right after the handshake.
    # "|| true": a failed handshake is reported as a skip below, not as a
    # script abort when errexit/pipefail are active.
    local end_date
    end_date=$(openssl s_client -connect "${host}:${port}" -servername "$host" </dev/null 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) || true

    if [[ -z "$end_date" ]]; then
        skip "certificate expiry" "could not read certificate"
        return
    fi

    # GNU date first, BSD/macOS date as fallback; 0 marks "unparseable".
    local exp_epoch now_epoch days_left
    exp_epoch=$(date -d "$end_date" +%s 2>/dev/null || date -jf "%b %d %T %Y %Z" "$end_date" +%s 2>/dev/null || echo 0)
    now_epoch=$(date +%s)

    if [[ "$exp_epoch" -eq 0 ]]; then
        skip "certificate expiry" "could not parse date"
        return
    fi

    days_left=$(( (exp_epoch - now_epoch) / 86400 ))
    if [[ "$days_left" -ge 30 ]]; then
        pass "API server cert valid ($days_left days remaining)"
    else
        fail "API server cert expiring in $days_left days (min 30)"
    fi
}
echo "TAP version 13" + echo "1..$TOTAL" + for r in "${RESULTS[@]}"; do echo "$r"; done +fi + +echo "" +if [[ "$OUTPUT_FORMAT" == "text" ]]; then + echo -e "${BOLD}Results:${RESET} pass=$PASS fail=$FAIL skip=$SKIP total=$TOTAL (${DURATION}s)" +else + echo "# pass=$PASS fail=$FAIL skip=$SKIP total=$TOTAL duration=${DURATION}s" +fi + +[[ "$FAIL" -eq 0 ]] diff --git a/lambda-deployer.sh b/lambda-deployer.sh new file mode 100755 index 0000000..0adae80 --- /dev/null +++ b/lambda-deployer.sh @@ -0,0 +1,579 @@ +#!/usr/bin/env bash + +######################################################################################### +#### lambda-deployer.sh — Package, deploy, and schedule Python Lambda functions #### +#### Supports dependency bundling, EventBridge scheduling, invocation, and log tail #### +#### Requires: bash 4+, aws-cli v2, jq, zip, pip3 #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### export AWS_PROFILE="production" #### +#### ./lambda-deployer.sh --deploy --function-name my-func --role-arn #### +#### #### +#### See --help for all options. 
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Each setting is seeded from the environment variable of the same name and
# falls back to the default shown here when that variable is unset or empty.
AWS_REGION="${AWS_REGION:-}"
FUNCTION_NAME="${FUNCTION_NAME:-}"
LAMBDA_RUNTIME="${LAMBDA_RUNTIME:-python3.12}"
LAMBDA_HANDLER="${LAMBDA_HANDLER:-lambda_function.lambda_handler}"
LAMBDA_ROLE_ARN="${LAMBDA_ROLE_ARN:-}"
LAMBDA_TIMEOUT="${LAMBDA_TIMEOUT:-30}"
LAMBDA_MEMORY="${LAMBDA_MEMORY:-128}"
LAMBDA_ENV_VARS="${LAMBDA_ENV_VARS:-}"
LAMBDA_LAYERS="${LAMBDA_LAYERS:-}"
SOURCE_DIR="${SOURCE_DIR:-.}"
SCHEDULE_EXPRESSION="${SCHEDULE_EXPRESSION:-}"
PAYLOAD="${PAYLOAD:-}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
RUN_MODE=""
ZIP_FILE=""
START_TIME=""
TEMP_DIR=""

# ── Colors ────────────────────────────────────────────────────────────
RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""

# Populate the color variables according to $COLOR:
#   never  -> all empty
#   always -> ANSI escapes
#   other  -> ANSI escapes only when stdout is a terminal
setup_colors() {
    if [[ "$COLOR" == "never" ]]; then
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
        return
    fi
    if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
    else
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log is the only helper that writes to stdout. Diagnostics (warn, err,
# verbose) go to stderr. verbose in particular MUST NOT write to stdout:
# aws_cmd calls it, and aws_cmd's stdout is captured with $(...) to obtain
# ARNs and JSON, so a debug line on stdout would corrupt those values.
log()     { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn()    { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()     { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*" >&2; fi; }

# Print an error and terminate the script with status 1.
die() {
    err "$@"
    exit 1
}

# ── Cleanup ───────────────────────────────────────────────────────────
# Remove the scratch directory recorded in TEMP_DIR, if one was recorded.
cleanup() {
    if [[ -n "$TEMP_DIR" ]] && [[ -d "$TEMP_DIR" ]]; then
        verbose "Cleaning up temp directory: $TEMP_DIR"
        rm -rf -- "$TEMP_DIR"
    fi
}

trap cleanup EXIT
# ── AWS CLI wrapper ───────────────────────────────────────────────────
# Run `aws` with the given arguments, appending --region when AWS_REGION is
# set. Callers capture this function's stdout, so the debug trace is forced
# onto stderr here regardless of where verbose itself writes.
aws_cmd() {
    local args=("$@")
    if [[ -n "$AWS_REGION" ]]; then
        args+=(--region "$AWS_REGION")
    fi
    verbose "aws ${args[*]}" >&2
    aws "${args[@]}"
}

# ── Resolve region ────────────────────────────────────────────────────
# Fill AWS_REGION from the aws-cli configuration when it is empty; abort if
# no region can be determined.
resolve_region() {
    if [[ -z "$AWS_REGION" ]]; then
        AWS_REGION="$(aws configure get region 2>/dev/null || true)"
    fi
    if [[ -z "$AWS_REGION" ]]; then
        die "Cannot determine AWS region. Set AWS_REGION or configure aws-cli."
    fi
    verbose "Region: $AWS_REGION"
}

# ── Dependency checks ────────────────────────────────────────────────
# Abort unless aws, jq, zip and pip3 are available and Bash is 4 or newer.
#
# Written with `if` on purpose: a function whose last command is
# `[[ cond ]] && die ...` returns 1 whenever cond is false, and under
# `set -e` that non-zero return aborts the caller silently even though
# every check passed.
check_dependencies() {
    local missing=()
    local cmd
    for cmd in aws jq zip pip3; do
        command -v "$cmd" &>/dev/null || missing+=("$cmd")
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        die "Missing required tools: ${missing[*]}"
    fi
    if [[ "${BASH_VERSINFO[0]}" -lt 4 ]]; then
        die "Bash 4+ required (found ${BASH_VERSION})"
    fi
    return 0
}

# ── Print header ──────────────────────────────────────────────────────
print_header() {
    echo -e "${BOLD}Lambda Deployer${RESET}"
    echo "Region: $AWS_REGION"
    echo "Mode: $RUN_MODE"
    echo "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo ""
}

# ── Elapsed time ──────────────────────────────────────────────────────
# Print the whole seconds elapsed since START_TIME.
elapsed() {
    local end
    end=$(date +%s)
    echo $(( end - START_TIME ))
}

# ── Build environment variables JSON ─────────────────────────────────
# Convert "K1=V1,K2=V2" into {"Variables":{"K1":"V1","K2":"V2"}}.
# Prints nothing for an empty argument. Only the first "=" of a pair splits
# key from value. Backslashes and double quotes are escaped so the result
# stays valid JSON.
# Limitation: pairs are split on ",", so a value cannot contain a comma.
# Arguments: $1 - comma-separated KEY=VALUE list
build_env_vars_json() {
    local env_str="$1"
    if [[ -z "$env_str" ]]; then
        return 0
    fi

    local -a pairs
    IFS=',' read -ra pairs <<< "$env_str"

    local json="" pair key value
    for pair in "${pairs[@]}"; do
        key="${pair%%=*}"
        value="${pair#*=}"
        # Escape backslashes first, then quotes, so added escapes are not re-escaped.
        key="${key//\\/\\\\}"
        key="${key//\"/\\\"}"
        value="${value//\\/\\\\}"
        value="${value//\"/\\\"}"
        json+="${json:+,}\"${key}\":\"${value}\""
    done
    printf '%s\n' "{\"Variables\":{${json}}}"
}

# ── Build layers array ───────────────────────────────────────────────
# Turn "arn1,arn2" into "arn1 arn2" (callers word-split the result).
build_layers_args() {
    if [[ -z "$1" ]]; then
        return 0
    fi
    echo "${1//,/ }"
}

# ── Package mode ──────────────────────────────────────────────────────
# Bundle requirements.txt dependencies (if the file exists) and the top-level
# *.py files of SOURCE_DIR into /tmp/lambda-<FUNCTION_NAME>.zip.
# Globals: FUNCTION_NAME, SOURCE_DIR (read); TEMP_DIR, ZIP_FILE (written)
do_package() {
    if [[ -z "$FUNCTION_NAME" ]]; then
        die "--function-name is required for package mode"
    fi

    local src_dir
    src_dir="$(realpath "$SOURCE_DIR" 2>/dev/null)" || die "Source directory not found: $SOURCE_DIR"
    [[ -d "$src_dir" ]] || die "Source directory not found: $src_dir"

    log "Packaging function ${BOLD}$FUNCTION_NAME${RESET}..."

    TEMP_DIR="$(mktemp -d)" || die "mktemp failed"
    local pkg_dir="$TEMP_DIR/package"
    mkdir -p "$pkg_dir"

    # Install dependencies
    local req_file="$src_dir/requirements.txt"
    if [[ -f "$req_file" ]]; then
        log "Installing dependencies from requirements.txt..."
        # pip's stderr is left visible so a failure explains itself.
        pip3 install -r "$req_file" -t "$pkg_dir" --quiet --disable-pip-version-check \
            || die "pip3 install failed"

        # Clean up pip metadata to reduce zip size.
        # NOTE(review): removing *.dist-info can break packages that look up
        # their own metadata at runtime (importlib.metadata) — confirm this
        # is acceptable for the functions being deployed.
        find "$pkg_dir" -type d \( -name "__pycache__" -o -name "*.dist-info" -o -name "*.egg-info" \) \
            -exec rm -rf {} + 2>/dev/null || true

        # grep -c prints "0" AND exits 1 when nothing matches; "|| echo 0"
        # would therefore yield "0<newline>0". "|| true" keeps the single count.
        local pkg_count
        pkg_count=$(grep -cE '^[^#[:space:]]' "$req_file" || true)
        log "Collected ${pkg_count:-0} package(s)"
    else
        warn "No requirements.txt found — packaging handler only"
    fi

    # Copy handler code (top-level .py files only)
    local py_count=0
    local f
    while IFS= read -r -d '' f; do
        cp -- "$f" "$pkg_dir/"
        py_count=$((py_count + 1))
    done < <(find "$src_dir" -maxdepth 1 -name "*.py" -print0)

    if [[ $py_count -eq 0 ]]; then
        die "No .py files found in $src_dir"
    fi
    verbose "Copied $py_count Python file(s)"

    # Create zip. Remove any earlier archive first: zip UPDATES an existing
    # file in place, so files deleted from the source would otherwise survive
    # in the deployment package.
    ZIP_FILE="/tmp/lambda-${FUNCTION_NAME}.zip"
    rm -f -- "$ZIP_FILE"
    (cd "$pkg_dir" && zip -r -q "$ZIP_FILE" .) \
        || die "Failed to create zip"

    local size
    size=$(du -h "$ZIP_FILE" | cut -f1)
    log "Created deployment package: ${BOLD}$ZIP_FILE${RESET} ($size)"
}

# ── Check if function exists ─────────────────────────────────────────
# Returns 0 when `aws lambda get-function` succeeds for $1, non-zero otherwise.
function_exists() {
    local name="$1"
    aws_cmd lambda get-function --function-name "$name" &>/dev/null
}
creating..." + log "Creating function $FUNCTION_NAME ($LAMBDA_RUNTIME, $LAMBDA_MEMORY MB, ${LAMBDA_TIMEOUT}s timeout)" + + local create_args=( + lambda create-function + --function-name "$FUNCTION_NAME" + --runtime "$LAMBDA_RUNTIME" + --handler "$LAMBDA_HANDLER" + --role "$LAMBDA_ROLE_ARN" + --timeout "$LAMBDA_TIMEOUT" + --memory-size "$LAMBDA_MEMORY" + --zip-file "fileb://$ZIP_FILE" + ) + + if [[ -n "$LAMBDA_ENV_VARS" ]]; then + local env_json + env_json=$(build_env_vars_json "$LAMBDA_ENV_VARS") + create_args+=(--environment "$env_json") + fi + + if [[ -n "$LAMBDA_LAYERS" ]]; then + local layers + layers=$(build_layers_args "$LAMBDA_LAYERS") + # shellcheck disable=SC2086,SC2206 + create_args+=(--layers $layers) + fi + + local fn_arn + fn_arn=$(aws_cmd "${create_args[@]}" --output text --query 'FunctionArn') \ + || die "Failed to create function" + + # Wait for function to become active + verbose "Waiting for function to become active..." + aws_cmd lambda wait function-active-v2 \ + --function-name "$FUNCTION_NAME" 2>/dev/null || true + + echo -e " ${GREEN}✓${RESET} Function created: $fn_arn" + fi +} + +# ── Schedule mode ───────────────────────────────────────────────────── +do_schedule() { + [[ -z "$FUNCTION_NAME" ]] && die "--function-name is required for schedule mode" + [[ -z "$SCHEDULE_EXPRESSION" ]] && die "--schedule-expression is required for schedule mode" + + log "Configuring EventBridge schedule for ${BOLD}$FUNCTION_NAME${RESET}..." 
# ── Invoke mode ───────────────────────────────────────────────────────
# Invoke FUNCTION_NAME synchronously, then print the invocation status, the
# response body and the decoded tail of the execution log.
#
# Globals: FUNCTION_NAME, PAYLOAD (read)
#
# Changes relative to the earlier version:
#   * --cli-binary-format raw-in-base64-out accompanies --payload; AWS CLI v2
#     otherwise expects the payload to be base64 and rejects raw JSON.
#   * --output json is forced so the jq lookups work whatever output format
#     the user's CLI profile selects.
#   * stderr is captured separately instead of being merged into the JSON.
#   * temporary files are removed on the failure path too.
do_invoke() {
    if [[ -z "$FUNCTION_NAME" ]]; then
        die "--function-name is required for invoke mode"
    fi

    log "Invoking ${BOLD}$FUNCTION_NAME${RESET}..."

    local invoke_args=(
        lambda invoke
        --function-name "$FUNCTION_NAME"
        --log-type Tail
        --output json
    )

    if [[ -n "$PAYLOAD" ]]; then
        invoke_args+=(--cli-binary-format raw-in-base64-out --payload "$PAYLOAD")
        verbose "Payload: $PAYLOAD"
    fi

    local output_file err_file
    output_file="$(mktemp)" || die "mktemp failed"
    err_file="$(mktemp)" || { rm -f -- "$output_file"; die "mktemp failed"; }

    # The response body goes to $output_file; stdout carries the invocation
    # metadata (StatusCode, FunctionError, LogResult).
    local response
    if ! response=$(aws_cmd "${invoke_args[@]}" "$output_file" 2>"$err_file"); then
        local reason
        reason="$(cat -- "$err_file" 2>/dev/null || true)"
        rm -f -- "$output_file" "$err_file"
        die "Invoke failed: ${reason:-$response}"
    fi
    rm -f -- "$err_file"

    local func_error
    func_error=$(echo "$response" | jq -r '.FunctionError // empty' 2>/dev/null || echo "")

    if [[ -n "$func_error" ]]; then
        echo -e " ${RED}✗${RESET} Function error: $func_error"
    else
        echo -e " ${GREEN}✓${RESET} Status: $(echo "$response" | jq -r '.StatusCode // 200' 2>/dev/null)"
    fi

    echo ""
    echo -e "${BOLD}Response:${RESET}"
    # Pretty-print when the body is JSON, otherwise show it verbatim.
    jq '.' "$output_file" 2>/dev/null || cat "$output_file"
    echo ""

    local log_result
    log_result=$(echo "$response" | jq -r '.LogResult // empty' 2>/dev/null || echo "")
    if [[ -n "$log_result" ]]; then
        echo -e "${BOLD}Execution Log:${RESET}"
        echo "$log_result" | base64 --decode 2>/dev/null || true
    fi

    rm -f -- "$output_file"
}
# ── List mode ─────────────────────────────────────────────────────────
# Print a table (name, runtime, code size, last-modified) of the Lambda
# functions in AWS_REGION, followed by the total count.
#
# Null fields are replaced with "-" before the rows are read: functions
# deployed from a container image have no Runtime, and because tab is an IFS
# whitespace character, `read` collapses the resulting empty TSV field and
# shifts every later column (the size test then ran against a timestamp).
do_list() {
    log "Listing Lambda functions in ${BOLD}$AWS_REGION${RESET}..."
    echo ""

    local functions
    functions=$(aws_cmd lambda list-functions \
        --query 'Functions[*].[FunctionName,Runtime,CodeSize,LastModified]' \
        --output json 2>/dev/null) \
        || die "Failed to list functions"

    local count
    count=$(jq 'length' <<< "$functions" 2>/dev/null || echo "0")

    if [[ "$count" -eq 0 ]]; then
        log "No Lambda functions found in $AWS_REGION"
        return
    fi

    printf "  ${BOLD}%-25s %-14s %-10s %s${RESET}\n" "FUNCTION" "RUNTIME" "SIZE" "LAST MODIFIED"
    echo "  ─────────────────────────────────────────────────────────────────"

    local name runtime size modified human_size short_modified
    while IFS=$'\t' read -r name runtime size modified; do
        # Defensive: treat anything that is not a plain integer as 0 bytes.
        [[ "$size" =~ ^[0-9]+$ ]] || size=0

        # Convert bytes to human-readable
        if [[ "$size" -ge 1048576 ]]; then
            human_size="$(LC_ALL=C awk -v b="$size" 'BEGIN{printf "%.1f MB", b/1048576}')"
        elif [[ "$size" -ge 1024 ]]; then
            human_size="$(LC_ALL=C awk -v b="$size" 'BEGIN{printf "%.1f KB", b/1024}')"
        else
            human_size="${size} B"
        fi

        # Trim the UTC offset and fractional seconds from the timestamp
        short_modified="${modified%%+*}"
        short_modified="${short_modified%.*}"

        printf "  %-25s %-14s %-10s %s\n" "$name" "$runtime" "$human_size" "$short_modified"
    done < <(jq -r '.[] | [ (.[0] // "-"), (.[1] // "-"), (.[2] // 0), (.[3] // "-") ] | @tsv' \
                <<< "$functions" 2>/dev/null)

    echo ""
    log "Total: $count function(s)"
}
--verbose Debug output + --no-color No ANSI colors -h, --help This help +EOF +} + +# ── Parse arguments ─────────────────────────────────────────────────── +parse_args() { + if [[ $# -eq 0 ]]; then + usage + exit 0 + fi + + while [[ $# -gt 0 ]]; do + case "$1" in + --package|--deploy|--schedule|--invoke|--logs|--list) + RUN_MODE="${1#--}"; shift ;; + --function-name) [[ $# -lt 2 ]] && die "$1 requires a value"; FUNCTION_NAME="$2"; shift 2 ;; + --runtime) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_RUNTIME="$2"; shift 2 ;; + --handler) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_HANDLER="$2"; shift 2 ;; + --role-arn) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_ROLE_ARN="$2"; shift 2 ;; + --timeout) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_TIMEOUT="$2"; shift 2 ;; + --memory) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_MEMORY="$2"; shift 2 ;; + --env-vars) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_ENV_VARS="$2"; shift 2 ;; + --layers) [[ $# -lt 2 ]] && die "$1 requires a value"; LAMBDA_LAYERS="$2"; shift 2 ;; + --schedule-expression) [[ $# -lt 2 ]] && die "$1 requires a value"; SCHEDULE_EXPRESSION="$2"; shift 2 ;; + --payload) [[ $# -lt 2 ]] && die "$1 requires a value"; PAYLOAD="$2"; shift 2 ;; + --source-dir) [[ $# -lt 2 ]] && die "$1 requires a value"; SOURCE_DIR="$2"; shift 2 ;; + --verbose) VERBOSE="true"; shift ;; + --no-color) COLOR="never"; shift ;; + -h|--help) usage; exit 0 ;; + *) die "Unknown option: $1 (see --help)" ;; + esac + done + + if [[ -z "$RUN_MODE" ]]; then err "No mode specified"; echo ""; usage; exit 1; fi +} + +# ── Main ────────────────────────────────────────────────────────────── +main() { + parse_args "$@" + setup_colors + check_dependencies + resolve_region + + START_TIME=$(date +%s) + print_header + + case "$RUN_MODE" in + package) + do_package + ;; + deploy) + do_deploy + ;; + schedule) + do_schedule + ;; + invoke) + do_invoke + ;; + logs) + do_logs + ;; + list) + do_list + ;; + *) + 
die "Unknown mode: $RUN_MODE" + ;; + esac + + if [[ "$RUN_MODE" != "logs" ]]; then + log "Completed in $(elapsed)s" + fi +} + +main "$@" diff --git a/laps-metrics-exporter.ps1 b/laps-metrics-exporter.ps1 new file mode 100644 index 0000000..87a7c34 --- /dev/null +++ b/laps-metrics-exporter.ps1 @@ -0,0 +1,274 @@ +<# +.SYNOPSIS + Windows LAPS Prometheus Metrics Exporter +.DESCRIPTION + Prometheus exporter for Windows Local Administrator Password Solution (LAPS). + Monitors password expiration, rotation status, coverage across OUs, and + stale passwords. Exports metrics for windows_exporter textfile collector + or standalone HTTP listener. +.PARAMETER TextFile + Write to windows_exporter textfile directory +.PARAMETER OutFile + Write metrics to a specific file path +.PARAMETER Install + Create a scheduled task for automatic collection +.PARAMETER Listen + Start HTTP listener on specified address:port +.PARAMETER Interval + Collection interval in seconds for scheduled task (default: 300) +.NOTES + Author: Phil Connor + Contact: contact@mylinux.work + Website: https://mylinux.work + License: MIT + Version: 1.0 +#> + +param( + [switch]$TextFile, + [string]$OutFile = "", + [switch]$Install, + [string]$Listen = "", + [int]$Interval = 300 +) + +$ErrorActionPreference = "SilentlyContinue" +$Version = "1.0" +$TextfileDir = "C:\ProgramData\node_exporter" +$MetricPrefix = "windows_laps" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +function Get-PrometheusEscape { + param([string]$Value) + $Value -replace '\\', '\\' -replace '"', '\"' -replace "`n", '' +} + +function Write-MetricHeader { + param([string]$Name, [string]$Help, [string]$Type) + "# HELP $Name $Help" + "# TYPE $Name $Type" +} + +# ============================================================================ +# METRIC GENERATION +# 
function Get-LapsMetrics {
    # Builds the complete Prometheus text exposition for the LAPS exporter and
    # returns it as one newline-joined string.
    #
    # Sections: exporter status/info, per-domain computer counts and password
    # expiry, per-OU breakdown, LAPS-related GPO count, exporter runtime.
    # When no LAPS tooling is detected only status, info and duration are
    # emitted.
    $startTime = Get-Date
    $metrics = [System.Collections.ArrayList]::new()

    # Write-MetricHeader emits TWO strings (HELP and TYPE). ArrayList.Add()
    # would store that pair as a single nested element, and the final -join
    # would render it with a space, fusing "# TYPE ..." into the HELP line.
    # AddRange keeps every header line as its own element.
    $addHeader = {
        param([string]$Name, [string]$Help, [string]$Type)
        $metrics.AddRange(@(Write-MetricHeader $Name $Help $Type))
    }

    # Check if the legacy LAPS module is available
    $lapsAvailable = $false
    if (Get-Module -ListAvailable -Name "LAPS" -ErrorAction SilentlyContinue) {
        Import-Module LAPS -ErrorAction SilentlyContinue
        $lapsAvailable = $true
    }
    # Also check for the Windows LAPS cmdlets
    if (Get-Command "Get-LapsAADPassword" -ErrorAction SilentlyContinue) {
        $lapsAvailable = $true
    }

    $up = if ($lapsAvailable) { 1 } else { 0 }

    & $addHeader "${MetricPrefix}_up" "LAPS exporter status (1=up, 0=down)" "gauge"
    [void]$metrics.Add("${MetricPrefix}_up $up")
    [void]$metrics.Add("")

    & $addHeader "${MetricPrefix}_exporter_info" "Exporter version information" "gauge"
    [void]$metrics.Add("${MetricPrefix}_exporter_info{version=`"$Version`"} 1")
    [void]$metrics.Add("")

    if ($up -eq 0) {
        $duration = [math]::Round(((Get-Date) - $startTime).TotalSeconds, 2)
        & $addHeader "${MetricPrefix}_exporter_duration_seconds" "Script execution time" "gauge"
        [void]$metrics.Add("${MetricPrefix}_exporter_duration_seconds $duration")
        return ($metrics -join "`n")
    }

    # ========================================================================
    # COMPUTER ACCOUNT LAPS STATUS
    # ========================================================================

    try {
        # Only the expiration attribute is requested. The cleartext password
        # attribute (ms-Mcs-AdmPwd) was previously fetched as well but never
        # used; an exporter has no reason to pull passwords into memory.
        $computers = Get-ADComputer -Filter {Enabled -eq $true} `
            -Properties "ms-Mcs-AdmPwdExpirationTime" -ErrorAction Stop

        $totalComputers = @($computers).Count
        $lapsManaged = 0
        $lapsUnmanaged = 0
        $passwordExpired = 0
        $passwordExpiringSoon = 0
        $now = Get-Date
        $soonThreshold = $now.AddDays(7)
        $ouStats = @{}

        foreach ($computer in $computers) {
            $expTime = $computer."ms-Mcs-AdmPwdExpirationTime"
            $isManaged = [bool]($expTime -and $expTime -gt 0)

            if ($isManaged) {
                $lapsManaged++
                $expDate = [DateTime]::FromFileTime($expTime)

                if ($expDate -lt $now) {
                    $passwordExpired++
                } elseif ($expDate -lt $soonThreshold) {
                    $passwordExpiringSoon++
                }
            } else {
                $lapsUnmanaged++
            }

            # Per-OU bucket: the first OU= component of the DN, "Default" if none.
            # @() is required: with exactly one OU the pipeline yields a scalar
            # string, and indexing a string with [0] returns its first
            # character ("O") instead of the OU component.
            $ouParts = @(($computer.DistinguishedName -split ",") | Where-Object { $_ -match "^OU=" })
            $ou = if ($ouParts.Count -gt 0) { ($ouParts[0] -replace "^OU=", "") } else { "Default" }

            if (-not $ouStats.ContainsKey($ou)) {
                $ouStats[$ou] = @{ Total = 0; Managed = 0 }
            }
            $ouStats[$ou].Total++
            if ($isManaged) {
                $ouStats[$ou].Managed++
            }
        }

        & $addHeader "${MetricPrefix}_computers_total" "Total enabled computer accounts" "gauge"
        [void]$metrics.Add("${MetricPrefix}_computers_total $totalComputers")
        [void]$metrics.Add("")

        & $addHeader "${MetricPrefix}_computers_managed" "Computers with LAPS password set" "gauge"
        [void]$metrics.Add("${MetricPrefix}_computers_managed $lapsManaged")
        [void]$metrics.Add("")

        & $addHeader "${MetricPrefix}_computers_unmanaged" "Computers without LAPS password" "gauge"
        [void]$metrics.Add("${MetricPrefix}_computers_unmanaged $lapsUnmanaged")
        [void]$metrics.Add("")

        & $addHeader "${MetricPrefix}_passwords_expired" "Computers with expired LAPS passwords" "gauge"
        [void]$metrics.Add("${MetricPrefix}_passwords_expired $passwordExpired")
        [void]$metrics.Add("")

        & $addHeader "${MetricPrefix}_passwords_expiring_soon" "Computers with LAPS passwords expiring within 7 days" "gauge"
        [void]$metrics.Add("${MetricPrefix}_passwords_expiring_soon $passwordExpiringSoon")
        [void]$metrics.Add("")

        # Coverage percentage
        $coverage = if ($totalComputers -gt 0) { [math]::Round(($lapsManaged / $totalComputers) * 100, 1) } else { 0 }
        & $addHeader "${MetricPrefix}_coverage_percent" "LAPS coverage percentage" "gauge"
        [void]$metrics.Add("${MetricPrefix}_coverage_percent $coverage")
        [void]$metrics.Add("")

        # ====================================================================
        # PER-OU BREAKDOWN
        # ====================================================================

        & $addHeader "${MetricPrefix}_ou_computers_total" "Computers per OU" "gauge"
        foreach ($ou in $ouStats.Keys) {
            $ouName = Get-PrometheusEscape $ou
            [void]$metrics.Add("${MetricPrefix}_ou_computers_total{ou=`"$ouName`"} $($ouStats[$ou].Total)")
        }
        [void]$metrics.Add("")

        & $addHeader "${MetricPrefix}_ou_computers_managed" "LAPS-managed computers per OU" "gauge"
        foreach ($ou in $ouStats.Keys) {
            $ouName = Get-PrometheusEscape $ou
            [void]$metrics.Add("${MetricPrefix}_ou_computers_managed{ou=`"$ouName`"} $($ouStats[$ou].Managed)")
        }
        [void]$metrics.Add("")

    } catch {
        [void]$metrics.Add("# ERROR: Failed to query AD computers: $_")
        [void]$metrics.Add("")
    }

    # ========================================================================
    # LAPS GPO STATUS
    # ========================================================================

    try {
        $gpos = Get-GPO -All -ErrorAction Stop | Where-Object {
            $_.DisplayName -match "LAPS|Local Administrator Password"
        }

        $lapsGpos = @($gpos).Count
        & $addHeader "${MetricPrefix}_gpo_count" "LAPS-related GPOs" "gauge"
        [void]$metrics.Add("${MetricPrefix}_gpo_count $lapsGpos")
        [void]$metrics.Add("")
    } catch {
        # GPO query may not be available on all systems
    }

    # ========================================================================
    # EXPORTER RUNTIME
    # ========================================================================

    $duration = [math]::Round(((Get-Date) - $startTime).TotalSeconds, 2)
    # UTC epoch seconds as an integer. Get-Date -UFormat %s is based on local
    # time in Windows PowerShell and returns a culture-formatted string.
    $timestamp = [DateTimeOffset]::UtcNow.ToUnixTimeSeconds()

    & $addHeader "${MetricPrefix}_exporter_duration_seconds" "Script execution time" "gauge"
    [void]$metrics.Add("${MetricPrefix}_exporter_duration_seconds $duration")
    [void]$metrics.Add("")

    & $addHeader "${MetricPrefix}_exporter_last_run_timestamp" "Unix timestamp of last run" "gauge"
    [void]$metrics.Add("${MetricPrefix}_exporter_last_run_timestamp $timestamp")

    return ($metrics -join "`n")
}
+ + while ($listener.IsListening) { + $context = $listener.GetContext() + $response = $context.Response + $output = Get-LapsMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($output) + $response.ContentType = "text/plain; version=0.0.4" + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.Close() + } +} elseif ($TextFile -or $OutFile -ne "") { + $outputPath = if ($OutFile -ne "") { $OutFile } else { Join-Path $TextfileDir "laps.prom" } + $outputDir = Split-Path $outputPath -Parent + if (-not (Test-Path $outputDir)) { New-Item -ItemType Directory -Path $outputDir -Force | Out-Null } + $tempFile = Join-Path $outputDir ".laps-metrics.tmp" + $metricsOutput = Get-LapsMetrics + $metricsOutput | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + Move-Item -Path $tempFile -Destination $outputPath -Force + Write-Host "Metrics written to $outputPath" +} else { + Get-LapsMetrics +} diff --git a/lighthouse-audit.sh b/lighthouse-audit.sh new file mode 100755 index 0000000..646ac49 --- /dev/null +++ b/lighthouse-audit.sh @@ -0,0 +1,320 @@ +#!/usr/bin/env bash + +######################################################################################### +#### lighthouse-audit.sh — Automated Lighthouse audits with Prometheus integration #### +#### Runs headless Chrome via Lighthouse CLI, extracts scores with jq, pushes #### +#### metrics to Pushgateway — saves HTML reports and color-codes output #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./lighthouse-audit.sh https://mylinux.work #### +#### ./lighthouse-audit.sh --file urls.txt --pushgateway http://localhost:9091 #### +#### #### +#### See --help for all options. 
####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting except URLS_FILE and CHROME_FLAGS can be overridden from the
# environment.
URLS_FILE=""
OUTPUT_DIR="${OUTPUT_DIR:-/var/lib/lighthouse/reports}"
PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-}"
CHROME_FLAGS="--headless --no-sandbox --disable-dev-shm-usage --disable-gpu"
JOB_NAME="${JOB_NAME:-lighthouse}"
LIGHTHOUSE_TIMEOUT="${LIGHTHOUSE_TIMEOUT:-60}"
RUNS="${RUNS:-1}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
TEMP_DIR=""

# ── Colors ────────────────────────────────────────────────────────────
# Populate the colour variables used by the logging helpers.
# COLOR=never  → all empty
# COLOR=always → ANSI escapes
# anything else → ANSI escapes only when stdout is a terminal
setup_colors() {
    if [[ "$COLOR" == "never" ]]; then
        RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
        return
    fi
    if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        CYAN='\033[0;36m'
        BOLD='\033[1m'
        DIM='\033[2m'
        RESET='\033[0m'
    else
        RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log/verbose write to stdout; warn/err write to stderr.
log()     { echo -e "${CYAN}[INFO]${RESET} $*"; }
warn()    { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()     { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; }

# ── Cleanup ───────────────────────────────────────────────────────────
# Remove the scratch directory, if one was created. Runs on every exit path.
cleanup() {
    if [[ -n "$TEMP_DIR" && -d "$TEMP_DIR" ]]; then
        rm -rf -- "$TEMP_DIR"
    fi
}
trap cleanup EXIT

# ── Color-coded score output ─────────────────────────────────────────
# Print a score wrapped in a colour: green >= 90, yellow >= 50, red otherwise.
# Arguments: $1 - score; may carry a fractional part or be non-numeric
# Outputs:   the score exactly as given, coloured, on stdout
#
# Only the integer part is compared. Bash arithmetic cannot parse "87.5", and
# under `set -u` a word such as "null" would be read as an unbound variable
# name and abort the script. Anything that is not a non-negative number is
# shown in red.
color_score() {
    local score="${1:-}"
    local whole="${score%%.*}"
    local tint="$RED"

    if [[ "$whole" =~ ^[0-9]+$ ]]; then
        whole=$((10#$whole))   # force base 10 so "08" is not parsed as octal
        if (( whole >= 90 )); then
            tint="$GREEN"
        elif (( whole >= 50 )); then
            tint="$YELLOW"
        fi
    fi

    echo -e "${tint}${score}${RESET}"
}

#
══════════════════════════════════════════════════════════════════════ +# AUDIT +# ══════════════════════════════════════════════════════════════════════ + +run_audit() { + local url="$1" + local date_stamp + date_stamp=$(date +%Y-%m-%d_%H%M%S) + local slug + slug=$(echo "$url" | sed 's|https\?://||;s|/|_|g;s|[^a-zA-Z0-9_.-]||g') + + log "Auditing: ${url}" + + local perf_total=0 a11y_total=0 bp_total=0 seo_total=0 + local lcp_total=0 cls_total=0 tbt_total=0 + local best_json="" + local successful_runs=0 + + for ((run=1; run<=RUNS; run++)); do + [[ "$RUNS" -gt 1 ]] && log " Run ${run}/${RUNS}" + + local json_file="${TEMP_DIR}/${slug}_run${run}.json" + + lighthouse "$url" \ + --output json \ + --output-path "$json_file" \ + --chrome-flags="$CHROME_FLAGS" \ + --max-wait-for-load "$((LIGHTHOUSE_TIMEOUT * 1000))" \ + --quiet 2>/dev/null || { + warn "Lighthouse failed for ${url} (run ${run})" + continue + } + + local perf a11y bp seo lcp cls tbt + perf=$(jq -r '.categories.performance.score * 100 | floor' "$json_file") + a11y=$(jq -r '.categories.accessibility.score * 100 | floor' "$json_file") + bp=$(jq -r '.categories["best-practices"].score * 100 | floor' "$json_file") + seo=$(jq -r '.categories.seo.score * 100 | floor' "$json_file") + lcp=$(jq -r '.audits["largest-contentful-paint"].numericValue / 1000' "$json_file") + cls=$(jq -r '.audits["cumulative-layout-shift"].numericValue' "$json_file") + tbt=$(jq -r '.audits["total-blocking-time"].numericValue' "$json_file") + + perf_total=$(echo "$perf_total + $perf" | bc) + a11y_total=$(echo "$a11y_total + $a11y" | bc) + bp_total=$(echo "$bp_total + $bp" | bc) + seo_total=$(echo "$seo_total + $seo" | bc) + lcp_total=$(echo "$lcp_total + $lcp" | bc) + cls_total=$(echo "$cls_total + $cls" | bc) + tbt_total=$(echo "$tbt_total + $tbt" | bc) + + successful_runs=$((successful_runs + 1)) + best_json="$json_file" + done + + if [[ -z "$best_json" ]]; then + warn "All runs failed for ${url}" + return 1 + fi + + # Average scores + 
local perf_avg a11y_avg bp_avg seo_avg lcp_avg cls_avg tbt_avg + perf_avg=$(echo "$perf_total / $successful_runs" | bc) + a11y_avg=$(echo "$a11y_total / $successful_runs" | bc) + bp_avg=$(echo "$bp_total / $successful_runs" | bc) + seo_avg=$(echo "$seo_total / $successful_runs" | bc) + lcp_avg=$(echo "scale=2; $lcp_total / $successful_runs" | bc) + cls_avg=$(echo "scale=3; $cls_total / $successful_runs" | bc) + tbt_avg=$(echo "scale=0; $tbt_total / $successful_runs" | bc) + + # Generate HTML report from last run's JSON + local report_file="${OUTPUT_DIR}/${slug}_${date_stamp}.html" + lighthouse "$url" \ + --output html \ + --output-path "$report_file" \ + --chrome-flags="$CHROME_FLAGS" \ + --max-wait-for-load "$((LIGHTHOUSE_TIMEOUT * 1000))" \ + --quiet 2>/dev/null || true + + # Print results + log "Results for ${url}:" + printf " ${BOLD}Performance:${RESET} %s\n" "$(color_score "$perf_avg")" + printf " ${BOLD}Accessibility:${RESET} %s\n" "$(color_score "$a11y_avg")" + printf " ${BOLD}Best Practices:${RESET} %s\n" "$(color_score "$bp_avg")" + printf " ${BOLD}SEO:${RESET} %s\n" "$(color_score "$seo_avg")" + printf " LCP: %ss CLS: %s TBT: %sms\n" "$lcp_avg" "$cls_avg" "$tbt_avg" + [[ -f "$report_file" ]] && log "Report: ${report_file}" + + # Push to Prometheus Pushgateway + if [[ -n "$PUSHGATEWAY_URL" ]]; then + local encoded_url + encoded_url=$(echo "$url" | sed 's|/|%2F|g;s|:|%3A|g') + + cat <&2 + exit 1 ;; + *) POSITIONAL_URLS+=("$1"); shift ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + # Collect URLs + URLS=() + + for u in "${POSITIONAL_URLS[@]+"${POSITIONAL_URLS[@]}"}"; do + URLS+=("$u") + done + + if [[ -n "$URLS_FILE" ]]; then + if [[ ! 
-f "$URLS_FILE" ]]; then + err "URLs file not found: ${URLS_FILE}" + exit 1 + fi + while IFS= read -r line; do + line=$(echo "$line" | xargs) + [[ -z "$line" || "$line" == \#* ]] && continue + URLS+=("$line") + done < "$URLS_FILE" + fi + + if [[ ! -t 0 ]]; then + while IFS= read -r line; do + line=$(echo "$line" | xargs) + [[ -z "$line" || "$line" == \#* ]] && continue + URLS+=("$line") + done + fi + + if [[ ${#URLS[@]} -eq 0 ]]; then + err "No URLs provided (see --help)" + exit 1 + fi + + # Validate dependencies + command -v lighthouse >/dev/null 2>&1 || { err "lighthouse not found — npm install -g lighthouse"; exit 1; } + command -v jq >/dev/null 2>&1 || { err "jq not found — apt install jq"; exit 1; } + + mkdir -p "$OUTPUT_DIR" + TEMP_DIR=$(mktemp -d) + + log "Lighthouse Audit — $(date -u +%Y-%m-%dT%H:%M:%SZ)" + log "URLs: ${#URLS[@]} | Runs: ${RUNS} | Timeout: ${LIGHTHOUSE_TIMEOUT}s" + + for url in "${URLS[@]}"; do + run_audit "$url" || true + [[ ${#URLS[@]} -gt 1 ]] && sleep 5 + done + + log "Done." +} + +main "$@" diff --git a/linux-baseline-checks.sh b/linux-baseline-checks.sh new file mode 100644 index 0000000..ea56ae5 --- /dev/null +++ b/linux-baseline-checks.sh @@ -0,0 +1,924 @@ +#!/usr/bin/env bash + +######################################################################################### +#### linux-baseline-checks.sh — Post-provision baseline security validation #### +#### Zero external dependencies. Validates freshly provisioned or hardened servers. #### +#### Requires: bash 4+, curl (optional), openssl (optional), ss #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./linux-baseline-checks.sh #### +#### ./linux-baseline-checks.sh --skip-updates --format tap #### +#### #### +#### See --help for all options. 
####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# All of these can be overridden from the environment.
SKIP_SSH="${SKIP_SSH:-false}"
SKIP_FIREWALL="${SKIP_FIREWALL:-false}"
SKIP_UPDATES="${SKIP_UPDATES:-false}"
EXPECTED_SERVICES="${EXPECTED_SERVICES:-}"
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"   # text, tap, junit
JUNIT_FILE="${JUNIT_FILE:-baseline-results.xml}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()
START_TIME=""

# ── Colors ────────────────────────────────────────────────────────────
# Populate the colour variables used by the output helpers.
# COLOR=never  → all empty
# COLOR=always → ANSI escapes
# anything else → ANSI escapes only when stdout is a terminal
setup_colors() {
    if [[ "$COLOR" == "never" ]]; then
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
        return
    fi
    if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
    else
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log/verbose write to stdout; warn/err write to stderr.
log()     { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn()    { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err()     { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; }

# ── Test Result Recording ─────────────────────────────────────────────
# Each recorder bumps its counter and TOTAL, appends "STATUS|name|detail" to
# RESULTS, and prints one line in TAP or human-readable form.
# Arguments: $1 - check name, $2 - optional detail / reason

record_pass() {
    local name="$1"
    local detail="${2:-}"
    ((PASS++)) || true    # (( )) returns 1 when the old value is 0; guard for set -e
    ((TOTAL++)) || true
    RESULTS+=("PASS|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "ok ${TOTAL} - ${name}"
    else
        echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
    fi
}

record_fail() {
    local name="$1"
    local detail="${2:-}"
    ((FAIL++)) || true
    ((TOTAL++)) || true
    RESULTS+=("FAIL|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "not ok ${TOTAL} - ${name}"
        # A plain `if`, not `[[ … ]] && echo`: the short-circuit form made the
        # function return 1 on an empty detail, which aborts under `set -e`.
        if [[ -n "$detail" ]]; then
            echo " # ${detail}"
        fi
    else
        echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
    fi
}

record_skip() {
    local name="$1"
    local reason="${2:-}"
    ((SKIP++)) || true
    ((TOTAL++)) || true
    RESULTS+=("SKIP|${name}|${reason}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
    else
        echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"
    fi
}

# ── Helper: read sshd config value ───────────────────────────────────
# Print the global value of an sshd_config keyword, or nothing if it is unset.
# Arguments:
#   $1 - keyword (matched case-insensitively)
#   $2 - main config file   (default /etc/ssh/sshd_config)
#   $3 - drop-in directory  (default /etc/ssh/sshd_config.d)
#
# sshd uses the FIRST value it obtains for a keyword, so this returns the first
# match, not the last. Drop-in *.conf files are read in sorted (glob) order
# ahead of the main file, which matches the stock layout where the Include
# line sits at the top of sshd_config.
# NOTE(review): a non-default Include position is not modelled.
# Lines after a `Match` keyword are conditional and are skipped. The Match
# state is reset per file; that is assumed to mirror sshd — confirm.
sshd_config_value() {
    local key="$1"
    local main_config="${2:-/etc/ssh/sshd_config}"
    local dropin_dir="${3:-/etc/ssh/sshd_config.d}"
    local value=""
    local -a dropins=()
    local candidate

    if [[ -d "$dropin_dir" ]]; then
        for candidate in "$dropin_dir"/*.conf; do
            if [[ -f "$candidate" ]]; then
                dropins+=("$candidate")
            fi
        done
    fi

    # [ \t] instead of [[:space:]]: old mawk builds lack POSIX character classes.
    value=$(awk -v want="$key" '
        BEGIN { want = tolower(want) }
        FNR == 1 { in_match = 0 }
        {
            line = $0
            sub(/^[ \t]+/, "", line)
            if (line == "" || substr(line, 1, 1) == "#") next
            if (!match(line, /^[A-Za-z0-9]+/)) next
            kw = tolower(substr(line, 1, RLENGTH))
            rest = substr(line, RLENGTH + 1)
            if (kw == "match") { in_match = 1; next }
            if (in_match || kw != want) next
            # keyword and argument are separated by blanks or a single "="
            sub(/^[ \t]*=?[ \t]*/, "", rest)
            split(rest, words, /[ \t]+/)
            print words[1]
            exit
        }' ${dropins[@]+"${dropins[@]}"} "$main_config" 2>/dev/null) || true

    echo "$value"
}

# True when the first line of `ufw status` reports an enabled firewall.
# "Status: inactive" contains the substring "active", so require that the
# word is not preceded by another letter.
_ufw_status_is_active() {
    local re='(^|[^[:alpha:]])active'
    [[ "${1:-}" =~ $re ]]
}

# ══════════════════════════════════════════════════════════════════════
# TEST SUITES
# ══════════════════════════════════════════════════════════════════════

# ── 1. SSH Hardening ─────────────────────────────────────────────────
# Checks PermitRootLogin, PasswordAuthentication, Protocol, Port and
# PubkeyAuthentication as read by sshd_config_value.
test_ssh() {
    if [[ "$SKIP_SSH" == "true" ]]; then
        echo ""
        echo -e "${BOLD}SSH Hardening${RESET}"
        record_skip "SSH hardening" "SKIP_SSH=true"
        return
    fi

    echo ""
    echo -e "${BOLD}SSH Hardening${RESET}"

    if [[ ! -f /etc/ssh/sshd_config ]]; then
        record_skip "SSH hardening" "sshd_config not found"
        return
    fi

    # PermitRootLogin
    local root_login
    root_login=$(sshd_config_value "PermitRootLogin")
    verbose "PermitRootLogin = ${root_login:-}"

    if [[ "$root_login" == "no" || "$root_login" == "prohibit-password" ]]; then
        record_pass "PermitRootLogin disabled" "${root_login}"
    elif [[ -z "$root_login" ]]; then
        record_fail "PermitRootLogin disabled" "not explicitly set (default may allow root)"
    else
        record_fail "PermitRootLogin disabled" "set to ${root_login}"
    fi

    # PasswordAuthentication
    local pass_auth
    pass_auth=$(sshd_config_value "PasswordAuthentication")
    verbose "PasswordAuthentication = ${pass_auth:-}"

    if [[ "$pass_auth" == "no" ]]; then
        record_pass "PasswordAuthentication disabled" "key-only auth enforced"
    elif [[ -z "$pass_auth" ]]; then
        record_fail "PasswordAuthentication disabled" "not explicitly set (default is yes)"
    else
        record_fail "PasswordAuthentication disabled" "set to ${pass_auth}"
    fi

    # Protocol 2 (older systems may have Protocol directive)
    local protocol
    protocol=$(sshd_config_value "Protocol")
    verbose "Protocol = ${protocol:-}"

    if [[ -z "$protocol" || "$protocol" == "2" ]]; then
        record_pass "SSH Protocol 2" "${protocol:-default (2)}"
    else
        record_fail "SSH Protocol 2" "set to ${protocol}"
    fi

    # Non-standard port detection
    local ssh_port
    ssh_port=$(sshd_config_value "Port")
    if [[ -z "$ssh_port" ]]; then
        ssh_port="22"
    fi
    verbose "SSH Port = ${ssh_port}"

    if [[ "$ssh_port" != "22" ]]; then
        record_pass "SSH non-standard port" "port ${ssh_port}"
    else
        record_fail "SSH non-standard port" "still using default port 22"
    fi

    # PubkeyAuthentication
    local pubkey_auth
    pubkey_auth=$(sshd_config_value "PubkeyAuthentication")
    verbose "PubkeyAuthentication = ${pubkey_auth:-}"

    if [[ -z "$pubkey_auth" || "$pubkey_auth" == "yes" ]]; then
        record_pass "PubkeyAuthentication enabled" "${pubkey_auth:-default (yes)}"
    else
        record_fail "PubkeyAuthentication enabled" "set to ${pubkey_auth}"
    fi
}

# ── 2. Firewall ──────────────────────────────────────────────────────
# Probes ufw and firewalld; iptables is consulted only when neither reports
# an active firewall.
test_firewall() {
    if [[ "$SKIP_FIREWALL" == "true" ]]; then
        echo ""
        echo -e "${BOLD}Firewall${RESET}"
        record_skip "Firewall" "SKIP_FIREWALL=true"
        return
    fi

    echo ""
    echo -e "${BOLD}Firewall${RESET}"

    local fw_found=false

    # ufw
    if command -v ufw &>/dev/null; then
        local ufw_status
        ufw_status=$(ufw status 2>/dev/null | head -1) || ufw_status=""
        verbose "ufw status: ${ufw_status}"

        if _ufw_status_is_active "$ufw_status"; then
            record_pass "ufw active" "firewall enabled"
            fw_found=true

            # Default deny
            local ufw_default
            ufw_default=$(ufw status verbose 2>/dev/null | grep "Default:" | head -1) || ufw_default=""
            if [[ "$ufw_default" == *"deny"* || "$ufw_default" == *"reject"* ]]; then
                record_pass "ufw default deny policy" "${ufw_default}"
            else
                record_fail "ufw default deny policy" "${ufw_default:-unknown}"
            fi
        else
            record_fail "ufw active" "${ufw_status:-not running}"
        fi
    fi

    # firewalld
    if command -v firewall-cmd &>/dev/null; then
        local fwd_state
        fwd_state=$(firewall-cmd --state 2>/dev/null) || fwd_state=""
        verbose "firewalld state: ${fwd_state}"

        if [[ "$fwd_state" == "running" ]]; then
            record_pass "firewalld active" "firewall running"
            fw_found=true

            # Default zone
            local default_zone
            default_zone=$(firewall-cmd --get-default-zone 2>/dev/null) || default_zone=""
            if [[ -n "$default_zone" ]]; then
                record_pass "firewalld default zone" "${default_zone}"
            fi
        elif [[ -z "$fwd_state" ]]; then
            verbose "firewalld not running"
        else
            record_fail "firewalld active" "${fwd_state}"
        fi
    fi

    # iptables (fallback)
    if [[ "$fw_found" == "false" ]] && command -v iptables &>/dev/null; then
        local ipt_rules
        ipt_rules=$(iptables -L INPUT -n 2>/dev/null | wc -l) || ipt_rules=0
        verbose "iptables INPUT rules: ${ipt_rules}"

        # The listing carries two header lines before the first rule.
        if [[ "$ipt_rules" -gt 2 ]]; then
            record_pass "iptables rules present" "$((ipt_rules - 2)) rules in INPUT chain"
            fw_found=true

            # Check default policy
            local ipt_policy
            ipt_policy=$(iptables -L INPUT -n 2>/dev/null | head -1 | grep -oP '\(policy \K[^)]+') || ipt_policy=""
            if [[ "$ipt_policy" == "DROP" || "$ipt_policy" == "REJECT" ]]; then
                record_pass "iptables default deny policy" "INPUT policy ${ipt_policy}"
            else
                record_fail "iptables default deny policy" "INPUT policy ${ipt_policy:-ACCEPT}"
            fi
        fi
    fi

    if [[ "$fw_found" == "false" ]]; then
        record_fail "Firewall active" "no firewall detected (ufw/firewalld/iptables)"
    fi
}

# ── 3. Time Sync ─────────────────────────────────────────────────────
# Prefers chrony; falls back to systemd-timesyncd via timedatectl.
test_timesync() {
    echo ""
    echo -e "${BOLD}Time Sync${RESET}"

    local sync_found=false

    # chrony
    if command -v chronyc &>/dev/null; then
        local chrony_tracking
        chrony_tracking=$(chronyc tracking 2>/dev/null) || chrony_tracking=""

        if [[ -n "$chrony_tracking" ]]; then
            record_pass "chronyd running" "chrony active"
            sync_found=true

            # Check sources
            local chrony_sources
            chrony_sources=$(chronyc sources 2>/dev/null | grep -c '^\^' 2>/dev/null) || chrony_sources=0
            if [[ "$chrony_sources" -gt 0 ]]; then
                record_pass "NTP sources reachable" "${chrony_sources} source(s) configured"
            else
                record_fail "NTP sources reachable" "no sources responding"
            fi

            # Drift check (system time offset)
            local offset
            offset=$(echo "$chrony_tracking" | grep -oP 'System time\s*:\s*\K[0-9.]+') || offset=""
            if [[ -n "$offset" ]]; then
                # Only the integer part is compared — anything under 1 second is fine
                local int_offset
                int_offset=$(echo "$offset" | cut -d. -f1)
                if [[ "${int_offset:-0}" -eq 0 ]]; then
                    record_pass "Time drift acceptable" "${offset} seconds offset"
                else
                    record_fail "Time drift acceptable" "${offset} seconds offset (>1s)"
                fi
            fi
        fi
    fi

    # systemd-timesyncd
    if [[ "$sync_found" == "false" ]] && command -v timedatectl &>/dev/null; then
        local timesync_status
        timesync_status=$(timedatectl show --property=NTPSynchronized --value 2>/dev/null) || timesync_status=""
        verbose "NTPSynchronized = ${timesync_status}"

        if [[ "$timesync_status" == "yes" ]]; then
            record_pass "systemd-timesyncd synchronized" "NTP active"
            sync_found=true
        elif systemctl is-active systemd-timesyncd &>/dev/null; then
            record_pass "systemd-timesyncd running" "service active"
            sync_found=true
            record_fail "NTP synchronized" "timesyncd running but not synchronized"
        fi
    fi

    if [[ "$sync_found" == "false" ]]; then
        record_fail "Time sync service" "no NTP service detected (chrony/systemd-timesyncd)"
    fi
}

# ── 4. Services ──────────────────────────────────────────────────────
# Checks sshd, cron or timers, a deny-list of legacy services, and any
# comma-separated names supplied in EXPECTED_SERVICES.
test_services() {
    echo ""
    echo -e "${BOLD}Services${RESET}"

    local svc

    # sshd running
    if systemctl is-active sshd &>/dev/null || systemctl is-active ssh &>/dev/null; then
        record_pass "sshd running" "SSH daemon active"
    else
        record_fail "sshd running" "SSH daemon not active"
    fi

    # cron / systemd-timers
    if systemctl is-active cron &>/dev/null || systemctl is-active crond &>/dev/null; then
        record_pass "cron active" "cron daemon running"
    elif systemctl list-timers --no-pager 2>/dev/null | grep -q "\.timer"; then
        record_pass "systemd timers active" "timers configured"
    else
        record_fail "cron/timers active" "no cron or systemd timers found"
    fi

    # Unnecessary services check
    local unnecessary_services=("cups" "cups-browsed" "avahi-daemon" "telnet" "rsh" "rlogin" "tftp" "xinetd")
    local bad_found=false
    local bad_list=""

    for svc in "${unnecessary_services[@]}"; do
        if systemctl is-active "$svc" &>/dev/null; then
            bad_found=true
            bad_list="${bad_list:+${bad_list}, }${svc}"
        fi
    done

    if [[ "$bad_found" == "true" ]]; then
        record_fail "No unnecessary services" "running: ${bad_list}"
    else
        record_pass "No unnecessary services" "cups/avahi/telnet/rsh/tftp not running"
    fi

    # Expected services (user-defined)
    if [[ -n "$EXPECTED_SERVICES" ]]; then
        local -a exp_svcs=()
        IFS=',' read -ra exp_svcs <<< "$EXPECTED_SERVICES"
        for svc in "${exp_svcs[@]}"; do
            svc=$(echo "$svc" | xargs)   # trim whitespace
            if systemctl is-active "$svc" &>/dev/null; then
                record_pass "Expected service: ${svc}" "running"
            else
                record_fail "Expected service: ${svc}" "not running"
            fi
        done
    fi
}

# ── 5. Users ─────────────────────────────────────────────────────────
# Checks UID 0 accounts, empty password fields, PASS_MAX_DAYS and the mode of
# /etc/shadow.
test_users() {
    echo ""
    echo -e "${BOLD}Users${RESET}"

    # UID 0 accounts (should only be root)
    local uid0_accounts
    uid0_accounts=$(awk -F: '$3 == 0 { print $1 }' /etc/passwd 2>/dev/null) || uid0_accounts=""
    local uid0_count
    uid0_count=$(echo "$uid0_accounts" | grep -c . 2>/dev/null) || uid0_count=0

    if [[ "$uid0_count" -eq 1 && "$uid0_accounts" == "root" ]]; then
        record_pass "No extra UID 0 accounts" "only root has UID 0"
    elif [[ "$uid0_count" -gt 1 ]]; then
        record_fail "No extra UID 0 accounts" "UID 0: ${uid0_accounts//$'\n'/, }"
    else
        record_pass "No extra UID 0 accounts" "only root has UID 0"
    fi

    # Empty passwords
    if [[ -r /etc/shadow ]]; then
        local empty_real
        empty_real=$(awk -F: '$2 == "" { print $1 }' /etc/shadow 2>/dev/null) || empty_real=""

        if [[ -z "$empty_real" ]]; then
            record_pass "No empty passwords" "all accounts have passwords or are locked"
        else
            record_fail "No empty passwords" "empty password: ${empty_real//$'\n'/, }"
        fi
    else
        record_skip "No empty passwords" "/etc/shadow not readable (run as root)"
    fi

    # Password aging (PASS_MAX_DAYS in login.defs)
    if [[ -f /etc/login.defs ]]; then
        local max_days
        # Take a single value; a duplicated or non-numeric entry must not
        # reach the integer comparisons below.
        max_days=$(awk '$1 == "PASS_MAX_DAYS" { print $2; exit }' /etc/login.defs 2>/dev/null) || max_days=""
        verbose "PASS_MAX_DAYS = ${max_days:-}"

        if [[ ! "$max_days" =~ ^[0-9]+$ ]]; then
            record_fail "Password aging configured" "PASS_MAX_DAYS not set or invalid"
        elif (( 10#$max_days > 365 )); then
            record_fail "Password aging configured" "PASS_MAX_DAYS=${max_days} (>365)"
        elif (( 10#$max_days > 0 )); then
            record_pass "Password aging configured" "PASS_MAX_DAYS=${max_days}"
        else
            record_fail "Password aging configured" "PASS_MAX_DAYS not set or invalid"
        fi
    else
        record_skip "Password aging configured" "/etc/login.defs not found"
    fi

    # /etc/shadow permissions
    if [[ -f /etc/shadow ]]; then
        local shadow_perms
        shadow_perms=$(stat -c '%a' /etc/shadow 2>/dev/null) || shadow_perms=""
        verbose "/etc/shadow permissions: ${shadow_perms}"

        # "640 or stricter" = no permission bit outside 0640. stat prints mode
        # 0000 as "0", so comparing against the string "000" never matched.
        if [[ "$shadow_perms" =~ ^[0-7]+$ ]] && (( (8#$shadow_perms & ~8#640 & 8#7777) == 0 )); then
            record_pass "/etc/shadow permissions" "${shadow_perms}"
        else
            record_fail "/etc/shadow permissions" "${shadow_perms} (expected 640 or stricter)"
        fi
    fi
}

# ── 6.
# Filesystem ────────────────────────────────────────────────────

# True when an octal mode string (as printed by `stat -c %a`) has the sticky
# bit (01000) set. Looking only at the first character is wrong both ways:
# "177" starts with 1 without being sticky, and "3777" is sticky without
# starting with 1.
_mode_has_sticky_bit() {
    local mode="${1:-}"
    [[ "$mode" =~ ^[0-7]+$ ]] || return 1
    (( (8#$mode & 8#1000) != 0 ))
}

# Checks /tmp mount options, separate /var and /home mounts, and the sticky
# bit on /tmp.
test_filesystem() {
    echo ""
    echo -e "${BOLD}Filesystem${RESET}"

    # /tmp noexec/nosuid
    local tmp_opts
    tmp_opts=$(findmnt -n -o OPTIONS /tmp 2>/dev/null) || tmp_opts=""
    verbose "/tmp mount options: ${tmp_opts:-}"

    if [[ -n "$tmp_opts" ]]; then
        local tmp_pass=true
        local tmp_detail=""

        if [[ "$tmp_opts" == *"noexec"* ]]; then
            tmp_detail="noexec"
        else
            tmp_pass=false
        fi

        if [[ "$tmp_opts" == *"nosuid"* ]]; then
            tmp_detail="${tmp_detail:+${tmp_detail},}nosuid"
        else
            tmp_pass=false
        fi

        if [[ "$tmp_pass" == "true" ]]; then
            record_pass "/tmp noexec,nosuid" "${tmp_detail}"
        else
            record_fail "/tmp noexec,nosuid" "missing options (current: ${tmp_opts})"
        fi
    else
        # findmnt printed nothing, so /tmp is not a mount point of its own
        record_fail "/tmp noexec,nosuid" "/tmp not mounted as separate partition"
    fi

    # /var separate partition
    local var_mount
    var_mount=$(findmnt -n -o SOURCE /var 2>/dev/null) || var_mount=""
    local root_mount
    root_mount=$(findmnt -n -o SOURCE / 2>/dev/null) || root_mount=""
    verbose "/var source: ${var_mount:-}"

    if [[ -n "$var_mount" && "$var_mount" != "$root_mount" ]]; then
        record_pass "/var separate partition" "${var_mount}"
    else
        record_fail "/var separate partition" "not on separate partition"
    fi

    # /home separate partition
    local home_mount
    home_mount=$(findmnt -n -o SOURCE /home 2>/dev/null) || home_mount=""
    verbose "/home source: ${home_mount:-}"

    if [[ -n "$home_mount" && "$home_mount" != "$root_mount" ]]; then
        record_pass "/home separate partition" "${home_mount}"
    else
        record_fail "/home separate partition" "not on separate partition"
    fi

    # Sticky bit on /tmp
    local sticky
    sticky=$(stat -c '%a' /tmp 2>/dev/null) || sticky=""
    verbose "/tmp mode: ${sticky}"

    if _mode_has_sticky_bit "$sticky"; then
        record_pass "Sticky bit on /tmp" "mode ${sticky}"
    else
        record_fail "Sticky bit on /tmp" "mode ${sticky:-unknown} (expected 1777)"
    fi
}

# ── 7. Kernel ─────────────────────────────────────────────────────────
# Reads ASLR, SYN-cookie and IP-forwarding settings from /proc/sys, and checks
# how core dumps are handled.
test_kernel() {
    echo ""
    echo -e "${BOLD}Kernel${RESET}"

    # ASLR
    local aslr
    aslr=$(cat /proc/sys/kernel/randomize_va_space 2>/dev/null) || aslr=""
    verbose "ASLR (randomize_va_space) = ${aslr}"

    if [[ "$aslr" == "2" ]]; then
        record_pass "ASLR enabled" "randomize_va_space=2 (full)"
    elif [[ "$aslr" == "1" ]]; then
        record_pass "ASLR enabled" "randomize_va_space=1 (partial)"
    else
        record_fail "ASLR enabled" "randomize_va_space=${aslr:-not readable}"
    fi

    # SYN cookies
    local syncookies
    syncookies=$(cat /proc/sys/net/ipv4/tcp_syncookies 2>/dev/null) || syncookies=""
    verbose "tcp_syncookies = ${syncookies}"

    if [[ "$syncookies" == "1" ]]; then
        record_pass "SYN cookies enabled" "tcp_syncookies=1"
    else
        record_fail "SYN cookies enabled" "tcp_syncookies=${syncookies:-not readable}"
    fi

    # IP forwarding (should be disabled unless router)
    local ip_forward
    ip_forward=$(cat /proc/sys/net/ipv4/ip_forward 2>/dev/null) || ip_forward=""
    verbose "ip_forward = ${ip_forward}"

    if [[ "$ip_forward" == "0" ]]; then
        record_pass "IP forwarding disabled" "ip_forward=0"
    else
        record_fail "IP forwarding disabled" "ip_forward=${ip_forward:-not readable} (enabled — expected unless router)"
    fi

    # Core dumps restricted
    local core_pattern
    core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null) || core_pattern=""
    local core_limit
    core_limit=$(ulimit -c 2>/dev/null) || core_limit=""
    verbose "core_pattern = ${core_pattern}"
    verbose "ulimit -c = ${core_limit}"

    if [[ "$core_limit" == "0" ]]; then
        record_pass "Core dumps restricted" "ulimit -c = 0"
    elif [[ "$core_pattern" == *"systemd-coredump"* || "$core_pattern" == "|"* ]]; then
        # a leading "|" means the kernel pipes dumps to a handler program
        record_pass "Core dumps restricted" "handled by ${core_pattern%% *}"
    else
        record_fail "Core dumps restricted" "core dumps enabled (ulimit -c = ${core_limit})"
    fi
}

# ── 8.
# Updates ────────────────────────────────────────────────────────

# Count packages in simulated `apt-get -s upgrade` output read from stdin.
# Arguments: $1 - "all" (every package) or "security" (only packages whose
#                 line mentions a security archive, case-insensitively)
# Only "Inst" lines are counted: every package also has a "Conf" line naming
# the same archive, so grepping all lines counted each update twice.
_apt_pending_count() {
    local which="${1:-all}"
    awk -v which="$which" '
        /^Inst / { if (which == "all" || tolower($0) ~ /security/) n++ }
        END { print n + 0 }'
}

# Count security advisories in `dnf|yum updateinfo list` output on stdin.
# The second column is matched in two forms: a severity label ending in
# "/Sec." (e.g. "Important/Sec."), or the literal word "security".
# NOTE(review): the "/Sec." form is what dnf 4 / yum print and "security" is
# what dnf 5 prints, as far as known — confirm against the target releases.
_rpm_security_count() {
    awk '$2 ~ /\/Sec\.$/ || tolower($2) == "security" { n++ }
         END { print n + 0 }'
}

# Reports pending (security) updates using apt-get, dnf or yum, whichever is
# found first.
test_updates() {
    if [[ "$SKIP_UPDATES" == "true" ]]; then
        echo ""
        echo -e "${BOLD}Updates${RESET}"
        record_skip "Pending updates" "SKIP_UPDATES=true"
        return
    fi

    echo ""
    echo -e "${BOLD}Updates${RESET}"

    # Detect package manager and check for pending security updates
    if command -v apt-get &>/dev/null; then
        verbose "Package manager: apt"
        # Run the simulation once and derive both counts from it.
        local apt_sim
        apt_sim=$(apt-get -s upgrade 2>/dev/null) || apt_sim=""
        local updates
        updates=$(_apt_pending_count all <<< "$apt_sim")
        local security
        security=$(_apt_pending_count security <<< "$apt_sim")

        if [[ "$security" -gt 0 ]]; then
            record_fail "Security updates pending" "${security} security, ${updates} total"
        elif [[ "$updates" -gt 0 ]]; then
            record_pass "No security updates pending" "${updates} non-security updates available"
        else
            record_pass "System up to date" "no pending updates"
        fi

    elif command -v dnf &>/dev/null; then
        verbose "Package manager: dnf"
        local dnf_out
        dnf_out=$(dnf updateinfo list --security 2>/dev/null) || dnf_out=""
        local sec_updates
        sec_updates=$(_rpm_security_count <<< "$dnf_out")

        if [[ "$sec_updates" -gt 0 ]]; then
            record_fail "Security updates pending" "${sec_updates} security updates"
        else
            record_pass "No security updates pending" "dnf reports no security updates"
        fi

    elif command -v yum &>/dev/null; then
        verbose "Package manager: yum"
        local yum_out
        yum_out=$(yum updateinfo list security 2>/dev/null) || yum_out=""
        local yum_sec
        yum_sec=$(_rpm_security_count <<< "$yum_out")

        if [[ "$yum_sec" -gt 0 ]]; then
            record_fail "Security updates pending" "${yum_sec} security updates"
        else
            record_pass "No security updates pending" "yum reports no security updates"
        fi

    else
        record_skip "Pending updates" "no supported package manager found (apt/dnf/yum)"
    fi
}

# ── 9.
# Permissions ───────────────────────────────────────────────────
# Succeeds when octal mode $1 grants nothing beyond octal mode $2
# (i.e. every bit set in $1 is also set in $2). Fails on empty or
# non-octal input.
_perms_within() {
  local actual=$1 limit=$2
  [[ "$actual" =~ ^[0-7]{1,4}$ ]] || return 1
  (( (8#$actual & ~8#$limit) == 0 ))
}

# Checks the modes of /etc/passwd, /etc/shadow and /etc/gshadow and looks for
# world-writable regular files under /etc (two levels deep).
# Reads:   BOLD, RESET
# Calls:   verbose, record_pass, record_fail
# Outputs: the "Permissions" section heading on stdout
test_permissions() {
  echo ""
  echo -e "${BOLD}Permissions${RESET}"

  # /etc/passwd (should be 644)
  local passwd_perms
  passwd_perms=$(stat -c '%a' /etc/passwd 2>/dev/null) || passwd_perms=""
  verbose "/etc/passwd permissions: ${passwd_perms}"

  if [[ "$passwd_perms" == "644" ]]; then
    record_pass "/etc/passwd permissions" "${passwd_perms}"
  else
    record_fail "/etc/passwd permissions" "${passwd_perms} (expected 644)"
  fi

  # /etc/shadow and /etc/gshadow (should be 640 or stricter).
  # Compared as a bit mask rather than against a list of literals: stat prints
  # mode 0000 as "0", and modes such as 400 or 440 are also stricter than 640.
  local secret_file secret_perms
  for secret_file in /etc/shadow /etc/gshadow; do
    [[ -f "$secret_file" ]] || continue
    secret_perms=$(stat -c '%a' "$secret_file" 2>/dev/null) || secret_perms=""
    verbose "${secret_file} permissions: ${secret_perms}"

    if _perms_within "$secret_perms" 640; then
      record_pass "${secret_file} permissions" "${secret_perms}"
    else
      record_fail "${secret_file} permissions" "${secret_perms} (expected 640 or stricter)"
    fi
  done

  # World-writable files in /etc. Count every hit (no head cap) and do not let
  # a non-zero find status (e.g. an unreadable directory) discard the results.
  local ww_count
  ww_count=$({ find /etc -maxdepth 2 -type f -perm -0002 2>/dev/null || true; } \
    | grep -c .) || true
  ww_count=${ww_count:-0}

  if [[ "$ww_count" -eq 0 ]]; then
    record_pass "No world-writable files in /etc" "clean"
  else
    record_fail "No world-writable files in /etc" "${ww_count} found"
  fi
}

# ── 10.
# Logging ──────────────────────────────────────────────────────
# Checks that a log daemon is active and that /var/log is not more permissive
# than 755.
# Reads:   BOLD, RESET
# Calls:   verbose, record_pass, record_fail
# Outputs: the "Logging" section heading on stdout
test_logging() {
  echo ""
  echo -e "${BOLD}Logging${RESET}"

  # rsyslog or journald — either one is enough, both are reported if active
  local logging_found=false

  if systemctl is-active rsyslog &>/dev/null; then
    record_pass "rsyslog running" "syslog daemon active"
    logging_found=true
  fi

  if systemctl is-active systemd-journald &>/dev/null; then
    record_pass "journald running" "systemd journal active"
    logging_found=true
  fi

  if [[ "$logging_found" == "false" ]]; then
    record_fail "Logging service" "neither rsyslog nor journald running"
  fi

  # /var/log permissions: pass when no bit outside 755 is set, so that every
  # stricter mode (750, 711, 700, ...) is accepted, not just a fixed list.
  local varlog_perms
  varlog_perms=$(stat -c '%a' /var/log 2>/dev/null) || varlog_perms=""
  verbose "/var/log permissions: ${varlog_perms}"

  if [[ "$varlog_perms" =~ ^[0-7]{1,4}$ ]] \
    && (( (8#$varlog_perms & ~8#755) == 0 )); then
    record_pass "/var/log permissions" "${varlog_perms}"
  else
    record_fail "/var/log permissions" "${varlog_perms} (expected 755 or stricter)"
  fi
}

# ══════════════════════════════════════════════════════════════════════
# OUTPUT
# ══════════════════════════════════════════════════════════════════════

# Prints the human-readable result summary: host name, pass/fail/skip
# counters and elapsed seconds since START_TIME.
# Reads: PASS, FAIL, SKIP, START_TIME and the colour variables.
print_summary() {
  local end_time duration host_name
  end_time=$(date +%s)
  duration=$(( end_time - START_TIME ))
  # Named host_name so the local does not shadow the hostname command.
  host_name=$(hostname 2>/dev/null || echo "unknown")

  echo ""
  echo -e "${BOLD}────────────────────────────────────────${RESET}"
  echo -e "${BOLD}Summary${RESET} ${host_name}"
  echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)"
  echo -e "${BOLD}────────────────────────────────────────${RESET}"

  if [[ "$FAIL" -eq 0 ]]; then
    echo -e "${GREEN}${BOLD}All checks passed.${RESET}"
  else
    echo -e "${RED}${BOLD}${FAIL} check(s) failed.${RESET}"
  fi
}

# Emits the TAP version line that opens a TAP stream.
print_tap_header() {
  echo "TAP version 13"
}

# Emits the trailing TAP plan ("1..N") followed by the counters as comments.
# Reads: TOTAL, PASS, FAIL, SKIP.
print_tap_footer() {
  echo "1..${TOTAL}"
  echo "# pass ${PASS}"
  echo "# fail ${FAIL}"
  echo "# skip ${SKIP}"
}
+write_junit() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + for result in "${RESULTS[@]}"; do + local status name detail + status=$(echo "$result" | cut -d'|' -f1) + name=$(echo "$result" | cut -d'|' -f2) + detail=$(echo "$result" | cut -d'|' -f3) + + # XML-escape the values + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo "$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + + case "$status" in + PASS) + echo " " >> "$JUNIT_FILE" + [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + FAIL) + echo " " >> "$JUNIT_FILE" + echo " FAILED: ${name} — ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + SKIP) + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + esac + done + + echo " " >> "$JUNIT_FILE" + echo "" >> "$JUNIT_FILE" + + log "JUnit report written to ${JUNIT_FILE}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat </dev/null || echo 'unknown')" + echo -e "Time: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo -e "Kernel: $(uname -r)" + echo "" + fi + + # Run test suites + test_ssh + test_firewall + test_timesync + test_services + test_users + test_filesystem + test_kernel + test_updates + test_permissions + test_logging + + # Output + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + print_tap_footer + elif [[ "$OUTPUT_FORMAT" == "junit" ]]; then + print_summary + write_junit + else + print_summary + fi + + # Exit code + [[ $FAIL -eq 0 ]] && exit 0 || exit 1 +} + +main "$@" diff --git a/linux-log-analyzer.sh b/linux-log-analyzer.sh new file mode 100755 index 0000000..feccfb6 --- /dev/null +++ b/linux-log-analyzer.sh @@ -0,0 +1,1717 @@ +#!/bin/bash +################################################################################ +# Script Name: 
linux-log-analyzer.sh +# Version: 1.00 +# Description: Analyze Linux system logs across six log types — system, auth, +# Docker, Java, GitLab, and PostgreSQL. Parses syslog, auth.log, +# journalctl, Java/log4j logs, GitLab JSON logs, and PostgreSQL +# logs to surface errors, failed logins, service failures, stack +# traces, slow queries, and crash events. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - bash, awk, grep, sort +# - journalctl (optional, for systemd-based systems) +# - Root/sudo for auth and system log types +# +# Usage: +# sudo ./linux-log-analyzer.sh --type system +# sudo ./linux-log-analyzer.sh --type auth --since 24h +# sudo ./linux-log-analyzer.sh --type docker +# ./linux-log-analyzer.sh --type java --log /opt/app/logs/application.log +# sudo ./linux-log-analyzer.sh --type gitlab +# ./linux-log-analyzer.sh --type postgresql +# sudo ./linux-log-analyzer.sh --all-types +# sudo ./linux-log-analyzer.sh --type system --json +# sudo ./linux-log-analyzer.sh --all-types --no-color > report.txt +# +# Log Types: +# system - syslog/messages: service failures, OOM, disk errors, panics +# auth - auth.log/secure: SSH brute force, sudo, account changes +# docker - Docker daemon: container crashes, restart loops, OOM +# java - log4j/logback/catalina: stack traces, OOM, GC pauses, errors +# gitlab - GitLab Omnibus: 5xx errors, slow requests, Sidekiq failures +# postgresql - PostgreSQL: deadlocks, slow queries, connection errors +################################################################################ + +set -uo pipefail + +# ============================================================================ +# DEFAULTS +# ============================================================================ + +VERSION="1.00" +LOG_TYPE="" +ALL_TYPES=false +SINCE="" +SINCE_EPOCH="" +TOP_N=20 +JSON_MODE=false +NO_COLOR=false +OUTPUT_FILE="" +CUSTOM_LOG="" + +# Colors +RED='\033[0;31m'
+YELLOW='\033[1;33m' +GREEN='\033[0;32m' +CYAN='\033[0;36m' +BOLD='\033[1m' +DIM='\033[2m' +NC='\033[0m' + +# JSON accumulator +JSON_OUTPUT="" + +# ============================================================================ +# USAGE +# ============================================================================ + +show_usage() { + cat < report.txt + sudo $0 --type system --json | jq . + +EOF + exit 0 +} + +# ============================================================================ +# HELPERS +# ============================================================================ + +disable_colors() { + RED="" YELLOW="" GREEN="" CYAN="" BOLD="" DIM="" NC="" +} + +log_info() { echo -e "${CYAN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "This log type requires root. Run with: sudo $0 $*" + return 1 + fi + return 0 +} + +section_header() { + local title="$1" + echo "" + echo -e "${BOLD}====================================================" + echo -e " ${title}" + echo -e "====================================================${NC}" +} + +subsection_header() { + local title="$1" + echo "" + echo -e "${BOLD}── ${title} ─────────────────────────────────────────${NC}" +} + +# Convert relative time strings to epoch +parse_since() { + local since="$1" + local now + now=$(date +%s) + + case "$since" in + *h) + local hours="${since%h}" + SINCE_EPOCH=$((now - hours * 3600)) + ;; + *d) + local days="${since%d}" + SINCE_EPOCH=$((now - days * 86400)) + ;; + *) + # Try ISO date + SINCE_EPOCH=$(date -d "$since" +%s 2>/dev/null || echo "") + if [[ -z "$SINCE_EPOCH" ]]; then + log_error "Invalid --since value: $since" + exit 1 + fi + ;; + esac +} + +# Get journalctl --since string from SINCE +journalctl_since_arg() { + if [[ -n "$SINCE" ]]; then + case "$SINCE" in + *h) echo "--since=${SINCE%h} hours ago" ;; + *d) echo "--since=${SINCE%d} days ago" ;; + *) 
echo "--since=$SINCE" ;; + esac + fi +} + +# Filter lines by timestamp — uses a reference date to avoid per-line date calls +filter_by_time() { + if [[ -z "$SINCE_EPOCH" ]]; then + cat + return + fi + local cutoff_date + cutoff_date=$(date -d "@${SINCE_EPOCH}" "+%Y-%m-%d %H:%M:%S" 2>/dev/null) + local cutoff_syslog + cutoff_syslog=$(date -d "@${SINCE_EPOCH}" "+%b %d %H:%M:%S" 2>/dev/null) + local current_year + current_year=$(date +%Y) + + awk -v cutoff_iso="$cutoff_date" -v cutoff_syslog="$cutoff_syslog" -v yr="$current_year" ' + BEGIN { + # Month name to number for syslog comparison + split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", m) + for (i = 1; i <= 12; i++) mon_num[m[i]] = sprintf("%02d", i) + } + { + # ISO: 2026-04-13 14:22:03 or 2026-04-13T14:22:03 + if (match($0, /^[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + gsub(/T/, " ", ts) + if (ts >= cutoff_iso) print + next + } + # Syslog: Apr 13 14:22:03 + if (match($0, /^[A-Z][a-z]{2} [ 0-9][0-9] [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + # Convert to sortable: YYYY-MM-DD HH:MM:SS + mon_name = substr(ts, 1, 3) + day = substr(ts, 5, 2) + gsub(/ /, "0", day) + time_part = substr(ts, 8) + sortable = yr "-" mon_num[mon_name] "-" day " " time_part + if (sortable >= cutoff_iso) print + next + } + # No timestamp — include line (continuation / multi-line) + print + }' +} + +# Find first available log file from a list +find_log() { + for f in "$@"; do + if [[ -r "$f" ]]; then + echo "$f" + return 0 + fi + done + return 1 +} + +# Read log source — file or journalctl +read_log_source() { + local log_file="$1" + local journal_unit="${2:-}" + + if [[ -n "$CUSTOM_LOG" && -r "$CUSTOM_LOG" ]]; then + cat "$CUSTOM_LOG" | filter_by_time + return + fi + + if [[ -n "$log_file" && -r "$log_file" ]]; then + cat "$log_file" | filter_by_time + return + fi + + if [[ -n "$journal_unit" ]] && command -v journalctl &>/dev/null; then + local since_arg + 
since_arg=$(journalctl_since_arg) + if [[ -n "$since_arg" ]]; then + journalctl -u "$journal_unit" --no-pager -q $since_arg 2>/dev/null + else + journalctl -u "$journal_unit" --no-pager -q 2>/dev/null + fi + return + fi + + return 1 +} + +# Emit a JSON key-value pair (accumulates into JSON_OUTPUT) +json_add() { + local key="$1" + local value="$2" + if [[ -n "$JSON_OUTPUT" ]]; then + JSON_OUTPUT="${JSON_OUTPUT}," + fi + JSON_OUTPUT="${JSON_OUTPUT}\"${key}\":${value}" +} + +json_string() { + local s="$1" + s="${s//\\/\\\\}" + s="${s//\"/\\\"}" + printf '"%s"' "$s" +} + +# ============================================================================ +# SYSTEM LOG ANALYZER +# ============================================================================ + +analyze_system() { + if ! check_root; then return 1; fi + + local log_file + log_file=$(find_log /var/log/syslog /var/log/messages) + + local log_data + log_data=$(read_log_source "${log_file:-}" "") + + # Also pull journalctl if available and no file source + if [[ -z "$log_data" ]] && command -v journalctl &>/dev/null; then + local since_arg + since_arg=$(journalctl_since_arg) + if [[ -n "$since_arg" ]]; then + log_data=$(journalctl --no-pager -q $since_arg 2>/dev/null) + else + log_data=$(journalctl --no-pager -q --since "24 hours ago" 2>/dev/null) + fi + fi + + if [[ -z "$log_data" ]]; then + log_warn "No system log sources found (syslog, messages, or journalctl)" + return 1 + fi + + local total_lines + total_lines=$(echo "$log_data" | wc -l) + + if ! $JSON_MODE; then + section_header "System Log Analysis" + echo "" + echo -e " Source: ${DIM}${log_file:-journalctl}${NC}" + echo -e " Lines: ${total_lines}" + fi + + # Service failures + local service_failures + service_failures=$(echo "$log_data" | grep -iE "(failed|error|fatal)" | \ + grep -oP '(\S+\.service)' | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! 
$JSON_MODE; then + subsection_header "Service Failures" + if [[ -n "$service_failures" ]]; then + printf " ${BOLD}%-4s %-35s %s${NC}\n" "#" "Service" "Failures" + echo "$service_failures" | awk '{printf " %-4d %-35s %d\n", NR, $2, $1}' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # OOM kills + local oom_kills + oom_kills=$(echo "$log_data" | grep -i "oom-kill\|oom_kill\|out of memory\|killed process" | head -"$TOP_N") + + local oom_count=0 + if [[ -n "$oom_kills" ]]; then + oom_count=$(echo "$oom_kills" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "OOM Kills" + if [[ -n "$oom_kills" ]]; then + echo "$oom_kills" | while IFS= read -r line; do + local proc + proc=$(echo "$line" | grep -oP "process \d+ \(\K[^)]+|Killed process \d+ \(\K[^)]+|oom_kill.*task=\K\S+" | head -1) + local pid + pid=$(echo "$line" | grep -oP "process \K\d+|Killed process \K\d+" | head -1) + local ts + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + printf " %-22s PID %-7s %s\n" "${ts:-unknown}" "${pid:-?}" "${proc:-unknown}" + done + else + echo " None found." + fi + fi + + # Disk errors + local disk_errors + disk_errors=$(echo "$log_data" | grep -iE "(I/O error|EXT[234]-fs.*(warning|error)|XFS.*(error|corruption)|BTRFS.*error|SMART.*error|end_request.*I/O|medium error|blk_update_request.*error|Buffer I/O error|remount.*read-only)" | head -"$TOP_N") + + local disk_error_count=0 + if [[ -n "$disk_errors" ]]; then + disk_error_count=$(echo "$disk_errors" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "Disk Errors" + if [[ -n "$disk_errors" ]]; then + echo "$disk_errors" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9]{2}:[0-9]{2}:[0-9]{2}") + local msg + msg=$(echo "$line" | sed -E 's/^.*kernel(\[[0-9.]+\])?: //' | head -c 80) + printf " %-22s %s\n" "${ts:-unknown}" "$msg" + done + else + echo " None found." + fi + fi + + # Failed logins (from syslog) + local failed_logins + failed_logins=$(echo "$log_data" | grep -iE "(failed password|authentication failure|failed login)" | \ + grep -oP "user[ =]\K\S+|for \K\S+(?= from)" | sort | uniq -c | sort -rn | head -"$TOP_N") + + local failed_login_total=0 + if [[ -n "$failed_logins" ]]; then + failed_login_total=$(echo "$failed_logins" | awk '{s+=$1} END {print s+0}') + fi + + if ! $JSON_MODE; then + subsection_header "Failed Logins" + if [[ -n "$failed_logins" ]]; then + printf " ${BOLD}%-4s %-25s %s${NC}\n" "#" "User" "Failures" + echo "$failed_logins" | awk '{printf " %-4d %-25s %d\n", NR, $2, $1}' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Sudo usage + local sudo_usage + sudo_usage=$(echo "$log_data" | grep -i "sudo:" | grep "COMMAND=" | \ + grep -oP "^\S+\s+\S+\s+\S+\s+\S+\s+\K\S+(?=\s)" | sort | uniq -c | sort -rn | head -"$TOP_N") + + local sudo_total=0 + if [[ -n "$sudo_usage" ]]; then + sudo_total=$(echo "$sudo_usage" | awk '{s+=$1} END {print s+0}') + fi + + if ! $JSON_MODE; then + subsection_header "Sudo Usage" + if [[ -n "$sudo_usage" ]]; then + printf " ${BOLD}%-4s %-25s %s${NC}\n" "#" "User" "Commands" + echo "$sudo_usage" | awk '{printf " %-4d %-25s %d\n", NR, $2, $1}' | head -"$TOP_N" + else + echo " None found." 
+ fi + fi + + # Kernel panics / segfaults + local kernel_panics + kernel_panics=$(echo "$log_data" | grep -iE "(kernel panic|segfault|general protection fault|BUG:|Oops:)" | head -"$TOP_N") + + local panic_count=0 + if [[ -n "$kernel_panics" ]]; then + panic_count=$(echo "$kernel_panics" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Kernel Panics / Segfaults" + if [[ -n "$kernel_panics" ]]; then + echo "$kernel_panics" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + local msg + msg=$(echo "$line" | sed 's/^.*\(kernel\)[^:]*: //') + printf " %-22s %s\n" "${ts:-unknown}" "${msg:0:80}" + done + else + echo " None found." + fi + fi + + # Cron failures + local cron_failures + cron_failures=$(echo "$log_data" | grep -iE "(cron.*error|CRON.*FAILED|cron.*exit status [^0])" | head -"$TOP_N") + + local cron_fail_count=0 + if [[ -n "$cron_failures" ]]; then + cron_fail_count=$(echo "$cron_failures" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Cron Failures" + if [[ -n "$cron_failures" ]]; then + echo "$cron_failures" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + local msg + msg=$(echo "$line" | sed 's/^.*CRON[^:]*: //' | head -c 80) + printf " %-22s %s\n" "${ts:-unknown}" "$msg" + done + else + echo " None found." + fi + fi + + # Top error patterns + local error_patterns + error_patterns=$(echo "$log_data" | grep -iE "(error|fail|fatal|critical)" | \ + sed -E 's/^[0-9]{4}-[0-9]{2}-[0-9]{2}T[^ ]+ [^ ]+ [^:]+: //' | \ + sed -E 's/^[A-Z][a-z]{2} +[0-9]{1,2} [0-9:]{8} [^ ]+ [^:]+: //' | \ + sed -E ':a; s/^(time|ts|level|caller|source|component|host)=[^ ]+ //; ta' | \ + sed -E 's/^\[[0-9T:.Z+ -]+ [A-Z]+ [^]]+\] //' | \ + sed 's/[0-9]\{2,\}/#/g' | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! 
$JSON_MODE; then + subsection_header "Top Error Patterns" + if [[ -n "$error_patterns" ]]; then + printf " ${BOLD}%-4s %-6s %s${NC}\n" "#" "Count" "Pattern" + echo "$error_patterns" | awk '{ + count = $1 + $1 = "" + sub(/^ +/, "") + msg = substr($0, 1, 70) + printf " %-4d %-6d %s\n", NR, count, msg + }' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Service failure count + local svc_fail_count=0 + if [[ -n "$service_failures" ]]; then + svc_fail_count=$(echo "$service_failures" | awk '{s+=$1} END {print s+0}') + fi + + # Summary + if ! $JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "Service failures:" "$svc_fail_count" + printf " %-25s %d\n" "OOM kills:" "$oom_count" + printf " %-25s %d\n" "Disk errors:" "$disk_error_count" + printf " %-25s %d\n" "Failed logins:" "$failed_login_total" + printf " %-25s %d\n" "Sudo commands:" "$sudo_total" + printf " %-25s %d\n" "Kernel panics:" "$panic_count" + printf " %-25s %d\n" "Cron failures:" "$cron_fail_count" + printf " %-25s %d\n" "Total lines parsed:" "$total_lines" + else + json_add "system" "{\"service_failures\":${svc_fail_count},\"oom_kills\":${oom_count},\"disk_errors\":${disk_error_count},\"failed_logins\":${failed_login_total},\"sudo_commands\":${sudo_total},\"kernel_panics\":${panic_count},\"cron_failures\":${cron_fail_count},\"total_lines\":${total_lines}}" + fi +} + +# ============================================================================ +# AUTH LOG ANALYZER +# ============================================================================ + +analyze_auth() { + if ! check_root; then return 1; fi + + local log_file + log_file=$(find_log /var/log/auth.log /var/log/secure) + + local log_data + log_data=$(read_log_source "${log_file:-}" "ssh") + + if [[ -z "$log_data" ]]; then + log_warn "No auth log sources found (auth.log, secure, or journalctl)" + return 1 + fi + + local total_lines + total_lines=$(echo "$log_data" | wc -l) + + if ! 
$JSON_MODE; then + section_header "Auth Log Analysis" + echo "" + echo -e " Source: ${DIM}${log_file:-journalctl}${NC}" + echo -e " Lines: ${total_lines}" + fi + + # SSH login failures by IP + local ssh_fail_ips + ssh_fail_ips=$(echo "$log_data" | grep -i "failed password" | \ + grep -oP "from \K[0-9a-f.:]+(?= port)" | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "SSH Failed Logins by IP" + if [[ -n "$ssh_fail_ips" ]]; then + printf " ${BOLD}%-4s %-40s %s${NC}\n" "#" "IP Address" "Failures" + echo "$ssh_fail_ips" | awk '{printf " %-4d %-40s %d\n", NR, $2, $1}' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Brute force detection (>10 failures) + local brute_force + brute_force=$(echo "$log_data" | grep -i "failed password" | \ + grep -oP "from \K[0-9a-f.:]+(?= port)" | sort | uniq -c | sort -rn | awk '$1 > 10') + + local brute_force_count=0 + if [[ -n "$brute_force" ]]; then + brute_force_count=$(echo "$brute_force" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Brute Force Suspects (>10 failures)" + if [[ -n "$brute_force" ]]; then + printf " ${BOLD}%-40s %s${NC}\n" "IP Address" "Failures" + echo "$brute_force" | awk '{printf " %-40s %s\n", $2, $1}' + echo "" + echo -e " ${RED}${brute_force_count} IP(s) with >10 failed attempts${NC}" + else + echo -e " ${GREEN}None found.${NC}" + fi + fi + + # SSH login successes by user + local ssh_success + ssh_success=$(echo "$log_data" | grep -i "accepted" | \ + grep -oP "for \K\S+(?= from)" | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "SSH Successful Logins by User" + if [[ -n "$ssh_success" ]]; then + printf " ${BOLD}%-4s %-25s %s${NC}\n" "#" "User" "Logins" + echo "$ssh_success" | awk '{printf " %-4d %-25s %d\n", NR, $2, $1}' | head -"$TOP_N" + else + echo " None found." 
+ fi + fi + + # SSH key vs password + local key_logins=0 pass_logins=0 + key_logins=$(echo "$log_data" | grep -ic "accepted publickey" || true) + pass_logins=$(echo "$log_data" | grep -ic "accepted password" || true) + + if ! $JSON_MODE; then + subsection_header "Auth Method Breakdown" + printf " %-25s %d\n" "SSH key:" "$key_logins" + printf " %-25s %d\n" "Password:" "$pass_logins" + fi + + # Root login attempts + local root_attempts + root_attempts=$(echo "$log_data" | grep -iE "(failed password|accepted).* for root " | wc -l || true) + local root_success + root_success=$(echo "$log_data" | grep -i "accepted.* for root " | wc -l || true) + + if ! $JSON_MODE; then + subsection_header "Root Login Attempts" + printf " %-25s %d\n" "Total attempts:" "$root_attempts" + printf " %-25s %d\n" "Successful:" "$root_success" + if [[ "$root_success" -gt 0 ]]; then + echo "" + echo "$log_data" | grep -i "accepted.* for root " | tail -5 | while IFS= read -r line; do + local ts ip + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + ip=$(echo "$line" | grep -oP "from \K[0-9a-f.:]+") + printf " %-22s from %s\n" "${ts:-unknown}" "${ip:-unknown}" + done + fi + fi + + # Sudo failures + local sudo_failures + sudo_failures=$(echo "$log_data" | grep -iE "sudo.*authentication failure|sudo.*incorrect password|sudo.*3 incorrect" | head -"$TOP_N") + + local sudo_fail_count=0 + if [[ -n "$sudo_failures" ]]; then + sudo_fail_count=$(echo "$sudo_failures" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Sudo Failures" + if [[ -n "$sudo_failures" ]]; then + echo "$sudo_failures" | while IFS= read -r line; do + local ts user + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + user=$(echo "$line" | grep -oP "user[ =]\K\S+|;\s+\K\S+(?=\s+:)" | head -1) + printf " %-22s user: %s\n" "${ts:-unknown}" "${user:-unknown}" + done + else + echo " None found." 
+ fi + fi + + # Account changes (useradd, usermod, groupadd, userdel, passwd) + local account_changes + account_changes=$(echo "$log_data" | grep -iE "(useradd|usermod|userdel|groupadd|groupdel|passwd|new user|new group|delete user)" | head -"$TOP_N") + + local account_change_count=0 + if [[ -n "$account_changes" ]]; then + account_change_count=$(echo "$account_changes" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Account Changes" + if [[ -n "$account_changes" ]]; then + echo "$account_changes" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + msg=$(echo "$line" | sed 's/^.*\(useradd\|usermod\|userdel\|groupadd\|passwd\)/\1/' | head -c 70) + printf " %-22s %s\n" "${ts:-unknown}" "$msg" + done + else + echo " None found." + fi + fi + + # Totals + local total_failures + total_failures=$(echo "$log_data" | grep -ic "failed password" || true) + local total_success + total_success=$(echo "$log_data" | grep -ic "accepted" || true) + local total_auth=$((total_failures + total_success)) + local success_rate=0 + if [[ "$total_auth" -gt 0 ]]; then + success_rate=$(awk "BEGIN {printf \"%.1f\", ($total_success / $total_auth) * 100}") + fi + + if ! 
$JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "Total auth events:" "$total_auth" + printf " %-25s %d\n" "Successful logins:" "$total_success" + printf " %-25s %d\n" "Failed logins:" "$total_failures" + printf " %-25s %s%%\n" "Success rate:" "$success_rate" + printf " %-25s %d\n" "Brute force IPs:" "$brute_force_count" + printf " %-25s %d\n" "Sudo failures:" "$sudo_fail_count" + printf " %-25s %d\n" "Account changes:" "$account_change_count" + else + json_add "auth" "{\"total_auth_events\":${total_auth},\"successful_logins\":${total_success},\"failed_logins\":${total_failures},\"success_rate\":${success_rate},\"brute_force_ips\":${brute_force_count},\"sudo_failures\":${sudo_fail_count},\"account_changes\":${account_change_count}}" + fi +} + +# ============================================================================ +# DOCKER LOG ANALYZER +# ============================================================================ + +analyze_docker() { + if ! command -v docker &>/dev/null && ! 
[[ -r /var/log/docker.log ]]; then + log_warn "Docker not found and no /var/log/docker.log — skipping" + return 1 + fi + + local log_data="" + + # Try journalctl first (most common on systemd systems) + if command -v journalctl &>/dev/null; then + local since_arg + since_arg=$(journalctl_since_arg) + if [[ -n "$since_arg" ]]; then + log_data=$(journalctl -u docker.service --no-pager -q $since_arg 2>/dev/null) + else + log_data=$(journalctl -u docker.service --no-pager -q 2>/dev/null) + fi + fi + + # Fallback to docker.log + if [[ -z "$log_data" && -r /var/log/docker.log ]]; then + log_data=$(cat /var/log/docker.log | filter_by_time) + fi + + # Also pull docker events if docker is available + local docker_ps_data="" + if command -v docker &>/dev/null; then + docker_ps_data=$(docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.CreatedAt}}' 2>/dev/null || true) + fi + + if [[ -z "$log_data" && -z "$docker_ps_data" ]]; then + log_warn "No Docker log data available" + return 1 + fi + + local total_lines=0 + if [[ -n "$log_data" ]]; then + total_lines=$(echo "$log_data" | wc -l) + fi + + if ! $JSON_MODE; then + section_header "Docker Log Analysis" + echo "" + echo -e " Source: ${DIM}journalctl/docker.log${NC}" + echo -e " Lines: ${total_lines}" + fi + + # Container start/stop/restart events + local container_events + container_events=$(echo "$log_data" | grep -oP "container \K(start|stop|restart|kill|die|create|destroy)\S*" 2>/dev/null | \ + sort | uniq -c | sort -rn) + + if ! $JSON_MODE; then + subsection_header "Container Events" + if [[ -n "$container_events" ]]; then + printf " ${BOLD}%-20s %s${NC}\n" "Event" "Count" + echo "$container_events" | awk '{printf " %-20s %d\n", $2, $1}' + else + echo " None found in daemon logs." + fi + fi + + # Container restart loops (from docker ps) + local restart_loops="" + if [[ -n "$docker_ps_data" ]]; then + restart_loops=$(echo "$docker_ps_data" | grep -i "restarting" || true) + fi + + if ! 
$JSON_MODE; then + subsection_header "Containers in Restart Loop" + if [[ -n "$restart_loops" ]]; then + echo "$restart_loops" | while IFS=$'\t' read -r name status created; do + printf " %-30s %s\n" "$name" "$status" + done + else + echo -e " ${GREEN}None found.${NC}" + fi + fi + + # OOM kills in containers + local docker_oom + docker_oom=$(echo "$log_data" | grep -iE "(oom|out of memory)" | head -"$TOP_N") + + local docker_oom_count=0 + if [[ -n "$docker_oom" ]]; then + docker_oom_count=$(echo "$docker_oom" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Container OOM Kills" + if [[ -n "$docker_oom" ]]; then + echo "$docker_oom" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + local msg + msg=$(echo "$line" | sed 's/^.*dockerd[^:]*: //' | head -c 80) + printf " %-22s %s\n" "${ts:-unknown}" "$msg" + done + else + echo " None found." + fi + fi + + # Docker daemon errors + local daemon_errors + daemon_errors=$(echo "$log_data" | grep -iE "(error|fatal|panic)" | \ + grep -v "level=info" | grep -v "level=warning" | head -"$TOP_N") + + local daemon_error_count=0 + if [[ -n "$daemon_errors" ]]; then + daemon_error_count=$(echo "$daemon_errors" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Daemon Errors" + if [[ -n "$daemon_errors" ]]; then + echo "$daemon_errors" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + msg=$(echo "$line" | grep -oP 'msg="\K[^"]+|msg=\K\S+' | head -1) + printf " %-22s %s\n" "${ts:-unknown}" "${msg:-$(echo "$line" | tail -c 80)}" + done + else + echo " None found." 
+ fi + fi + + # Health check failures + local health_failures + health_failures=$(echo "$log_data" | grep -iE "(health.*unhealthy|health check|health_status)" | head -"$TOP_N") + + local health_fail_count=0 + if [[ -n "$health_failures" ]]; then + health_fail_count=$(echo "$health_failures" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Health Check Failures" + if [[ -n "$health_failures" ]]; then + echo "$health_failures" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+|^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9:]+") + msg=$(echo "$line" | sed 's/^.*dockerd[^:]*: //' | head -c 80) + printf " %-22s %s\n" "${ts:-unknown}" "$msg" + done + else + echo " None found." + fi + fi + + # Docker warnings + local daemon_warnings + daemon_warnings=$(echo "$log_data" | grep -i "level=warning\|WARN" | \ + grep -oP 'msg="\K[^"]+|msg=\K\S+' | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "Top Warning Patterns" + if [[ -n "$daemon_warnings" ]]; then + printf " ${BOLD}%-6s %s${NC}\n" "Count" "Warning" + echo "$daemon_warnings" | awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, substr($0, 1, 70) + }' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Exited containers (from docker ps -a) + local exited_containers="" + if [[ -n "$docker_ps_data" ]]; then + exited_containers=$(echo "$docker_ps_data" | grep -i "exited" | head -"$TOP_N" || true) + fi + + if ! $JSON_MODE; then + subsection_header "Exited Containers" + if [[ -n "$exited_containers" ]]; then + printf " ${BOLD}%-30s %s${NC}\n" "Container" "Status" + echo "$exited_containers" | while IFS=$'\t' read -r name status created; do + printf " %-30s %s\n" "$name" "$status" + done + else + echo " None found." + fi + fi + + local restart_count=0 + if [[ -n "$restart_loops" ]]; then + restart_count=$(echo "$restart_loops" | wc -l) + fi + + if ! 
$JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "Daemon errors:" "$daemon_error_count" + printf " %-25s %d\n" "OOM kills:" "$docker_oom_count" + printf " %-25s %d\n" "Health check failures:" "$health_fail_count" + printf " %-25s %d\n" "Restart loops:" "$restart_count" + printf " %-25s %d\n" "Log lines parsed:" "$total_lines" + else + json_add "docker" "{\"daemon_errors\":${daemon_error_count},\"oom_kills\":${docker_oom_count},\"health_check_failures\":${health_fail_count},\"restart_loops\":${restart_count},\"total_lines\":${total_lines}}" + fi +} + +# ============================================================================ +# JAVA LOG ANALYZER +# ============================================================================ + +analyze_java() { + local log_file="" + + if [[ -n "$CUSTOM_LOG" ]]; then + log_file="$CUSTOM_LOG" + else + # Search common Java/Tomcat log locations + log_file=$(find_log \ + /opt/tomcat/logs/catalina.out \ + /var/log/tomcat*/catalina.out \ + /opt/tomcat*/logs/catalina.out \ + /var/log/wildfly/server.log \ + /opt/wildfly/standalone/log/server.log \ + /var/log/jetty/jetty.log \ + /opt/sonarqube/logs/sonar.log \ + /opt/nexus/sonatype-work/nexus3/log/nexus.log \ + /opt/jenkins/log/jenkins.log \ + /var/log/jenkins/jenkins.log) + fi + + if [[ -z "$log_file" || ! -r "$log_file" ]]; then + log_warn "No Java log sources found — use --log to specify the path" + return 1 + fi + + local log_data + log_data=$(cat "$log_file" | filter_by_time) + + if [[ -z "$log_data" ]]; then + log_warn "No log data in ${log_file} for the specified time range" + return 1 + fi + + local total_lines + total_lines=$(echo "$log_data" | wc -l) + + if ! 
$JSON_MODE; then + section_header "Java Log Analysis" + echo "" + echo -e " Source: ${DIM}${log_file}${NC}" + echo -e " Lines: ${total_lines}" + fi + + # ERROR / WARN / FATAL counts + local error_count warn_count fatal_count + error_count=$(echo "$log_data" | grep -cE "\bERROR\b" || true) + warn_count=$(echo "$log_data" | grep -cE "\bWARN\b" || true) + fatal_count=$(echo "$log_data" | grep -cE "\bFATAL\b" || true) + + if ! $JSON_MODE; then + subsection_header "Log Level Breakdown" + printf " %-15s %d\n" "FATAL:" "$fatal_count" + printf " %-15s %d\n" "ERROR:" "$error_count" + printf " %-15s %d\n" "WARN:" "$warn_count" + fi + + # Stack traces (lines starting with "at " or "Caused by:") + local stack_traces + stack_traces=$(echo "$log_data" | grep -cE "^\s+at |^Caused by:" || true) + local exception_types + exception_types=$(echo "$log_data" | grep -oP "^\S+Exception|^\S+Error|Caused by: \K\S+Exception|\S+Exception(?=:)" | \ + sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "Exceptions / Stack Traces" + printf " %-25s %d\n" "Stack trace lines:" "$stack_traces" + echo "" + if [[ -n "$exception_types" ]]; then + printf " ${BOLD}%-4s %-6s %s${NC}\n" "#" "Count" "Exception" + echo "$exception_types" | awk '{printf " %-4d %-6d %s\n", NR, $1, $2}' | head -"$TOP_N" + else + echo " No exceptions found." + fi + fi + + # OOM errors + local oom_errors + oom_errors=$(echo "$log_data" | grep -iE "(OutOfMemoryError|java.lang.OutOfMemoryError|GC overhead limit exceeded|Java heap space|Metaspace)" | head -"$TOP_N") + + local oom_count=0 + if [[ -n "$oom_errors" ]]; then + oom_count=$(echo "$oom_errors" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "OutOfMemoryErrors" + if [[ -n "$oom_errors" ]]; then + echo "$oom_errors" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9]{2}:[0-9]{2}:[0-9]{2}|^[A-Z][a-z]{2} [0-9]{2}, [0-9]{4} [0-9:]+") + local msg + msg=$(echo "$line" | grep -oP "OutOfMemoryError.*|GC overhead limit exceeded|Java heap space|Metaspace" | head -c 70) + printf " %-22s %s\n" "${ts:-unknown}" "${msg:-OOM}" + done + echo "" + echo -e " ${RED}${oom_count} OOM event(s) — check JVM heap settings${NC}" + else + echo -e " ${GREEN}None found.${NC}" + fi + fi + + # GC pauses (from GC log lines embedded in app logs or GC-related messages) + local gc_events + gc_events=$(echo "$log_data" | grep -iE "(GC pause|Full GC|GC\(|Allocation Failure|G1 Evacuation Pause|CMS-concurrent|to-space exhausted)" | head -"$TOP_N") + + local gc_count=0 + if [[ -n "$gc_events" ]]; then + gc_count=$(echo "$gc_events" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "GC Events" + if [[ -n "$gc_events" ]]; then + # Show Full GC vs minor GC breakdown + local full_gc_count minor_gc_count + full_gc_count=$(echo "$gc_events" | grep -c "Full GC" || true) + minor_gc_count=$((gc_count - full_gc_count)) + printf " %-25s %d\n" "Full GC:" "$full_gc_count" + printf " %-25s %d\n" "Minor GC:" "$minor_gc_count" + printf " %-25s %d\n" "Total:" "$gc_count" + else + echo " None found in application log (check dedicated GC log if separate)." + fi + fi + + # Thread deadlocks + local deadlocks + deadlocks=$(echo "$log_data" | grep -iE "(deadlock|DEADLOCK|Found one Java-level deadlock)" | head -"$TOP_N") + + local deadlock_count=0 + if [[ -n "$deadlocks" ]]; then + deadlock_count=$(echo "$deadlocks" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "Thread Deadlocks" + if [[ -n "$deadlocks" ]]; then + echo "$deadlocks" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2}[ T][0-9]{2}:[0-9]{2}:[0-9]{2}|^[A-Z][a-z]{2} [0-9]{2}, [0-9]{4} [0-9:]+") + printf " %s\n" "${ts:-unknown} — deadlock detected" + done + echo "" + echo -e " ${RED}${deadlock_count} deadlock(s) found${NC}" + else + echo " None found." + fi + fi + + # Connection/network errors + local conn_errors + conn_errors=$(echo "$log_data" | grep -iE "(ConnectException|SocketTimeoutException|ConnectionRefused|Connection reset|NoRouteToHostException|UnknownHostException)" | \ + grep -oP "\S*(Exception|Error)\S*" | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "Connection / Network Errors" + if [[ -n "$conn_errors" ]]; then + printf " ${BOLD}%-6s %s${NC}\n" "Count" "Exception" + echo "$conn_errors" | awk '{printf " %-6d %s\n", $1, $2}' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Top ERROR messages (deduped) + local error_patterns + error_patterns=$(echo "$log_data" | grep -E "\bERROR\b" | \ + sed 's/^[0-9T:.+Z -]*//' | sed 's/\[.*\] //' | \ + sed 's/[0-9]\{3,\}/#/g' | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "Top ERROR Patterns" + if [[ -n "$error_patterns" ]]; then + printf " ${BOLD}%-6s %s${NC}\n" "Count" "Pattern" + echo "$error_patterns" | awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, substr($0, 1, 65) + }' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Top WARN messages (deduped) + local warn_patterns + warn_patterns=$(echo "$log_data" | grep -E "\bWARN\b" | \ + sed 's/^[0-9T:.+Z -]*//' | sed 's/\[.*\] //' | \ + sed 's/[0-9]\{3,\}/#/g' | sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! 
$JSON_MODE; then + subsection_header "Top WARN Patterns" + if [[ -n "$warn_patterns" ]]; then + printf " ${BOLD}%-6s %s${NC}\n" "Count" "Pattern" + echo "$warn_patterns" | awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, substr($0, 1, 65) + }' | head -"$TOP_N" + else + echo " None found." + fi + fi + + # Unique exception count + local unique_exceptions=0 + if [[ -n "$exception_types" ]]; then + unique_exceptions=$(echo "$exception_types" | wc -l) + fi + + if ! $JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "FATAL entries:" "$fatal_count" + printf " %-25s %d\n" "ERROR entries:" "$error_count" + printf " %-25s %d\n" "WARN entries:" "$warn_count" + printf " %-25s %d\n" "Stack trace lines:" "$stack_traces" + printf " %-25s %d\n" "Unique exceptions:" "$unique_exceptions" + printf " %-25s %d\n" "OOM errors:" "$oom_count" + printf " %-25s %d\n" "GC events:" "$gc_count" + printf " %-25s %d\n" "Deadlocks:" "$deadlock_count" + printf " %-25s %d\n" "Total lines parsed:" "$total_lines" + else + json_add "java" "{\"fatal\":${fatal_count},\"error\":${error_count},\"warn\":${warn_count},\"stack_trace_lines\":${stack_traces},\"unique_exceptions\":${unique_exceptions},\"oom_errors\":${oom_count},\"gc_events\":${gc_count},\"deadlocks\":${deadlock_count},\"total_lines\":${total_lines}}" + fi +} + +# ============================================================================ +# GITLAB LOG ANALYZER +# ============================================================================ + +analyze_gitlab() { + local gitlab_log_dir="/var/log/gitlab" + + if [[ -n "$CUSTOM_LOG" ]]; then + gitlab_log_dir="$CUSTOM_LOG" + fi + + if [[ ! -d "$gitlab_log_dir" ]]; then + log_warn "GitLab log directory not found: ${gitlab_log_dir}" + return 1 + fi + + if ! 
$JSON_MODE; then + section_header "GitLab Log Analysis" + echo "" + echo -e " Source: ${DIM}${gitlab_log_dir}${NC}" + fi + + # Production JSON log (Rails) + local prod_log="${gitlab_log_dir}/gitlab-rails/production_json.log" + local total_requests=0 error_5xx=0 slow_requests=0 auth_failures=0 + + if [[ -r "$prod_log" ]]; then + local prod_data + prod_data=$(cat "$prod_log" | filter_by_time) + total_requests=$(echo "$prod_data" | wc -l) + + # 5xx errors + local fivexx_data + fivexx_data=$(echo "$prod_data" | awk -F'"status":' '{ + if (NF > 1) { + split($2, a, /[,}]/) + status = a[1] + 0 + if (status >= 500) print + } + }') + + error_5xx=0 + if [[ -n "$fivexx_data" ]]; then + error_5xx=$(echo "$fivexx_data" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "HTTP 5xx Errors (Rails)" + if [[ "$error_5xx" -gt 0 ]]; then + # Top paths with 5xx + echo "$fivexx_data" | awk -F'"path":"' '{ + if (NF > 1) { + split($2, a, "\"") + print a[1] + } + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{printf " %-6d %s\n", $1, $2}' + else + echo -e " ${GREEN}None found.${NC}" + fi + fi + + # Slow requests (>5s) + local slow_data + slow_data=$(echo "$prod_data" | awk -F'"duration_s":' '{ + if (NF > 1) { + split($2, a, /[,}]/) + dur = a[1] + 0 + if (dur > 5.0) print + } + }') + + slow_requests=0 + if [[ -n "$slow_data" ]]; then + slow_requests=$(echo "$slow_data" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Slow Requests (>5s)" + if [[ "$slow_requests" -gt 0 ]]; then + echo "$slow_data" | awk -F'"' '{ + path = ""; dur = "" + for (i = 1; i <= NF; i++) { + if ($i == "path") path = $(i+2) + if ($i == "duration_s") { + split($(i+1), d, /[:,}]/) + dur = d[2] + } + } + if (path != "") printf " %.2fs %s\n", dur+0, path + }' | sort -rn | head -"$TOP_N" + else + echo " None found." 
+ fi + fi + + # Auth failures + local auth_fail_data + auth_fail_data=$(echo "$prod_data" | grep -i '"status":401\|"status":403' || true) + auth_failures=0 + if [[ -n "$auth_fail_data" ]]; then + auth_failures=$(echo "$auth_fail_data" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Authentication Failures (401/403)" + if [[ "$auth_failures" -gt 0 ]]; then + echo "$auth_fail_data" | awk -F'"' '{ + path = ""; remote_ip = "" + for (i = 1; i <= NF; i++) { + if ($i == "path") path = $(i+2) + if ($i == "remote_ip") remote_ip = $(i+2) + } + if (remote_ip != "") printf " %-20s %s\n", remote_ip, path + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{printf " %-6d %-20s %s\n", $1, $2, $3}' + else + echo " None found." + fi + fi + + # Top endpoints by request count + if ! $JSON_MODE; then + subsection_header "Top Endpoints by Request Count" + echo "$prod_data" | awk -F'"path":"' '{ + if (NF > 1) { + split($2, a, "\"") + print a[1] + } + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{printf " %-6d %s\n", $1, $2}' + fi + else + if ! $JSON_MODE; then + log_warn "GitLab production log not found: ${prod_log}" + fi + fi + + # Sidekiq failures + local sidekiq_log="${gitlab_log_dir}/sidekiq/current" + local sidekiq_failures=0 + + if [[ -r "$sidekiq_log" ]]; then + local sidekiq_data + sidekiq_data=$(cat "$sidekiq_log" | filter_by_time) + + local sidekiq_fail_data + sidekiq_fail_data=$(echo "$sidekiq_data" | grep -i '"job_status":"fail"\|"severity":"ERROR"' || true) + + if [[ -n "$sidekiq_fail_data" ]]; then + sidekiq_failures=$(echo "$sidekiq_fail_data" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "Sidekiq Failed Jobs" + if [[ "$sidekiq_failures" -gt 0 ]]; then + echo "$sidekiq_fail_data" | awk -F'"' '{ + job_class = ""; error = "" + for (i = 1; i <= NF; i++) { + if ($i == "class") job_class = $(i+2) + if ($i == "error_class") error = $(i+2) + } + if (job_class != "") printf " %s — %s\n", job_class, error + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, $0 + }' + else + echo " None found." + fi + fi + fi + + # Gitaly errors + local gitaly_log="${gitlab_log_dir}/gitaly/current" + local gitaly_errors=0 + + if [[ -r "$gitaly_log" ]]; then + local gitaly_data + gitaly_data=$(cat "$gitaly_log" | filter_by_time) + + local gitaly_error_data + gitaly_error_data=$(echo "$gitaly_data" | grep -iE '"level":"error"|"level":"fatal"' || true) + + if [[ -n "$gitaly_error_data" ]]; then + gitaly_errors=$(echo "$gitaly_error_data" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Gitaly Errors" + if [[ "$gitaly_errors" -gt 0 ]]; then + echo "$gitaly_error_data" | awk -F'"' '{ + msg = "" + for (i = 1; i <= NF; i++) { + if ($i == "msg" || $i == "error") { + msg = $(i+2) + break + } + } + if (msg != "") print msg + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, substr($0, 1, 70) + }' + else + echo " None found." + fi + fi + fi + + # Error rate + local error_rate="0.0" + if [[ "$total_requests" -gt 0 ]]; then + error_rate=$(awk "BEGIN {printf \"%.2f\", ($error_5xx / $total_requests) * 100}") + fi + + if ! 
$JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "Total requests:" "$total_requests" + printf " %-25s %d (%.2f%%)\n" "5xx errors:" "$error_5xx" "$error_rate" + printf " %-25s %d\n" "Slow requests (>5s):" "$slow_requests" + printf " %-25s %d\n" "Auth failures:" "$auth_failures" + printf " %-25s %d\n" "Sidekiq failures:" "$sidekiq_failures" + printf " %-25s %d\n" "Gitaly errors:" "$gitaly_errors" + else + json_add "gitlab" "{\"total_requests\":${total_requests},\"5xx_errors\":${error_5xx},\"error_rate\":${error_rate},\"slow_requests\":${slow_requests},\"auth_failures\":${auth_failures},\"sidekiq_failures\":${sidekiq_failures},\"gitaly_errors\":${gitaly_errors}}" + fi +} + +# ============================================================================ +# POSTGRESQL LOG ANALYZER +# ============================================================================ + +analyze_postgresql() { + local log_file="" + + if [[ -n "$CUSTOM_LOG" ]]; then + log_file="$CUSTOM_LOG" + else + # Search common PostgreSQL log locations + log_file=$(find_log \ + /var/log/postgresql/postgresql-*-main.log \ + /var/log/postgresql/postgresql.log \ + /var/lib/pgsql/data/log/postgresql-*.log \ + /var/lib/pgsql/data/pg_log/postgresql-*.log \ + /var/log/pgsql/postgresql.log) + fi + + local log_data="" + + if [[ -n "$log_file" && -r "$log_file" ]]; then + log_data=$(cat "$log_file" | filter_by_time) + elif command -v journalctl &>/dev/null; then + local since_arg + since_arg=$(journalctl_since_arg) + if [[ -n "$since_arg" ]]; then + log_data=$(journalctl -u postgresql* --no-pager -q $since_arg 2>/dev/null) + else + log_data=$(journalctl -u postgresql* --no-pager -q 2>/dev/null) + fi + fi + + if [[ -z "$log_data" ]]; then + log_warn "No PostgreSQL log sources found" + return 1 + fi + + local total_lines + total_lines=$(echo "$log_data" | wc -l) + + if ! 
$JSON_MODE; then + section_header "PostgreSQL Log Analysis" + echo "" + echo -e " Source: ${DIM}${log_file:-journalctl}${NC}" + echo -e " Lines: ${total_lines}" + fi + + # FATAL/ERROR/PANIC counts + local fatal_count error_count panic_count + fatal_count=$(echo "$log_data" | grep -c " FATAL: " || true) + error_count=$(echo "$log_data" | grep -c " ERROR: " || true) + panic_count=$(echo "$log_data" | grep -c " PANIC: " || true) + + if ! $JSON_MODE; then + subsection_header "Severity Breakdown" + printf " %-15s %d\n" "PANIC:" "$panic_count" + printf " %-15s %d\n" "FATAL:" "$fatal_count" + printf " %-15s %d\n" "ERROR:" "$error_count" + fi + + # Connection errors + local conn_errors + conn_errors=$(echo "$log_data" | grep -iE "(too many connections|connection refused|remaining connection slots|sorry, too many clients)" | head -"$TOP_N") + + local conn_error_count=0 + if [[ -n "$conn_errors" ]]; then + conn_error_count=$(echo "$conn_errors" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Connection Errors" + if [[ -n "$conn_errors" ]]; then + echo "$conn_errors" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9:]+|^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+") + msg=$(echo "$line" | grep -oP "(FATAL|ERROR): \K.*" | head -c 70) + printf " %-22s %s\n" "${ts:-unknown}" "${msg:-$line}" + done + else + echo -e " ${GREEN}None found.${NC}" + fi + fi + + # Deadlocks + local deadlocks + deadlocks=$(echo "$log_data" | grep -i "deadlock detected" | head -"$TOP_N") + + local deadlock_count=0 + if [[ -n "$deadlocks" ]]; then + deadlock_count=$(echo "$deadlocks" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "Deadlocks" + if [[ -n "$deadlocks" ]]; then + echo "$deadlocks" | while IFS= read -r line; do + local ts + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9:]+|^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+") + printf " %s\n" "${ts:-unknown}" + done + echo "" + echo " Total: ${deadlock_count}" + else + echo " None found." + fi + fi + + # Slow queries (log_min_duration_statement) + local slow_queries + slow_queries=$(echo "$log_data" | grep -i "duration:" | grep -v "LOG: connection\|LOG: disconnection" | head -"$TOP_N") + + local slow_query_count=0 + if [[ -n "$slow_queries" ]]; then + slow_query_count=$(echo "$slow_queries" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Slow Queries" + if [[ "$slow_query_count" -gt 0 ]]; then + echo "$slow_queries" | while IFS= read -r line; do + local ts dur stmt + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9:]+|^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+") + dur=$(echo "$line" | grep -oP "duration: \K[0-9.]+ ms") + stmt=$(echo "$line" | grep -oP "statement: \K.*" | head -c 60) + printf " %-22s %-12s %s\n" "${ts:-unknown}" "${dur:-?}" "${stmt:-...}" + done + else + echo " None found (requires log_min_duration_statement in postgresql.conf)." + fi + fi + + # Checkpoint warnings + local checkpoint_warns + checkpoint_warns=$(echo "$log_data" | grep -iE "(checkpoint.*too frequent|checkpoints are occurring too frequently|checkpoint.*taking)" | head -"$TOP_N") + + local checkpoint_count=0 + if [[ -n "$checkpoint_warns" ]]; then + checkpoint_count=$(echo "$checkpoint_warns" | wc -l) + fi + + if ! 
$JSON_MODE; then + subsection_header "Checkpoint Warnings" + if [[ -n "$checkpoint_warns" ]]; then + echo "$checkpoint_warns" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9:]+|^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+") + msg=$(echo "$line" | grep -oP "LOG: \K.*|WARNING: \K.*" | head -c 70) + printf " %-22s %s\n" "${ts:-unknown}" "${msg:-checkpoint warning}" + done + else + echo " None found." + fi + fi + + # WAL/replication errors + local wal_errors + wal_errors=$(echo "$log_data" | grep -iE "(WAL.*error|replication.*error|could not receive|streaming replication|recovery target)" | head -"$TOP_N") + + local wal_error_count=0 + if [[ -n "$wal_errors" ]]; then + wal_error_count=$(echo "$wal_errors" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "WAL / Replication Errors" + if [[ -n "$wal_errors" ]]; then + echo "$wal_errors" | while IFS= read -r line; do + local ts msg + ts=$(echo "$line" | grep -oP "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9:]+|^[A-Z][a-z]{2} [ 0-9]{2} [0-9:]+") + msg=$(echo "$line" | grep -oP "(FATAL|ERROR|LOG): \K.*" | head -c 70) + printf " %-22s %s\n" "${ts:-unknown}" "${msg:-$line}" + done + else + echo " None found." + fi + fi + + # Auth failures + local pg_auth_failures + pg_auth_failures=$(echo "$log_data" | grep -iE "(password authentication failed|no pg_hba.conf entry|authentication failed)" | head -"$TOP_N") + + local pg_auth_fail_count=0 + if [[ -n "$pg_auth_failures" ]]; then + pg_auth_fail_count=$(echo "$pg_auth_failures" | wc -l) + fi + + if ! $JSON_MODE; then + subsection_header "Authentication Failures" + if [[ -n "$pg_auth_failures" ]]; then + echo "$pg_auth_failures" | awk -F'"' '{ + # Try to extract user and database + line = $0 + } + { + match($0, /user "([^"]+)"/, u) + match($0, /database "([^"]+)"/, d) + user = (u[1] != "") ? u[1] : "?" + db = (d[1] != "") ? d[1] : "?" 
+ printf " user=%-15s db=%-15s\n", user, db + }' | sort | uniq -c | sort -rn | head -"$TOP_N" | \ + awk '{printf " %-6d %s %s\n", $1, $2, $3}' + else + echo " None found." + fi + fi + + # Top error patterns + local pg_error_patterns + pg_error_patterns=$(echo "$log_data" | grep -E " (ERROR|FATAL): " | \ + sed 's/^[^:]*: //' | sed 's/[0-9]\{2,\}/#/g' | \ + sort | uniq -c | sort -rn | head -"$TOP_N") + + if ! $JSON_MODE; then + subsection_header "Top Error Patterns" + if [[ -n "$pg_error_patterns" ]]; then + printf " ${BOLD}%-6s %s${NC}\n" "Count" "Pattern" + echo "$pg_error_patterns" | awk '{ + count = $1; $1 = "" + sub(/^ +/, "") + printf " %-6d %s\n", count, substr($0, 1, 65) + }' | head -"$TOP_N" + else + echo " None found." + fi + fi + + if ! $JSON_MODE; then + section_header "Summary" + echo "" + printf " %-25s %d\n" "PANIC entries:" "$panic_count" + printf " %-25s %d\n" "FATAL entries:" "$fatal_count" + printf " %-25s %d\n" "ERROR entries:" "$error_count" + printf " %-25s %d\n" "Connection errors:" "$conn_error_count" + printf " %-25s %d\n" "Deadlocks:" "$deadlock_count" + printf " %-25s %d\n" "Slow queries:" "$slow_query_count" + printf " %-25s %d\n" "Checkpoint warnings:" "$checkpoint_count" + printf " %-25s %d\n" "WAL/replication errors:" "$wal_error_count" + printf " %-25s %d\n" "Auth failures:" "$pg_auth_fail_count" + printf " %-25s %d\n" "Total lines parsed:" "$total_lines" + else + json_add "postgresql" "{\"panic\":${panic_count},\"fatal\":${fatal_count},\"error\":${error_count},\"connection_errors\":${conn_error_count},\"deadlocks\":${deadlock_count},\"slow_queries\":${slow_query_count},\"checkpoint_warnings\":${checkpoint_count},\"wal_errors\":${wal_error_count},\"auth_failures\":${pg_auth_fail_count},\"total_lines\":${total_lines}}" + fi +} + +# ============================================================================ +# ARGUMENT PARSING +# ============================================================================ + +parse_args() { + while [[ $# 
-gt 0 ]]; do + case "$1" in + -h|--help) show_usage ;; + --type) LOG_TYPE="$2"; shift 2 ;; + --all-types) ALL_TYPES=true; shift ;; + --since) SINCE="$2"; parse_since "$2"; shift 2 ;; + --top) TOP_N="$2"; shift 2 ;; + --json) JSON_MODE=true; shift ;; + --no-color) NO_COLOR=true; shift ;; + --output) OUTPUT_FILE="$2"; shift 2 ;; + --log) CUSTOM_LOG="$2"; shift 2 ;; + *) log_error "Unknown option: $1"; show_usage ;; + esac + done + + if [[ -z "$LOG_TYPE" ]] && ! $ALL_TYPES; then + log_error "Specify --type or --all-types" + echo "" + show_usage + fi +} + +# ============================================================================ +# MAIN +# ============================================================================ + +main() { + parse_args "$@" + + # Disable colors if requested or not a terminal + if $NO_COLOR || [[ ! -t 1 ]]; then + disable_colors + fi + + # Redirect output to file if specified + if [[ -n "$OUTPUT_FILE" ]]; then + disable_colors + exec > >(tee "$OUTPUT_FILE") 2>&1 + fi + + if ! $JSON_MODE; then + echo -e "${BOLD}Linux Log Analyzer v${VERSION}${NC}" + if [[ -n "$SINCE" ]]; then + echo -e "Since: ${SINCE}" + fi + fi + + if $ALL_TYPES; then + # Run all available analyzers + local ran_any=false + + for type in system auth docker java gitlab postgresql; do + if ! 
$JSON_MODE; then + echo "" + echo -e "${DIM}────────────────────────────────────────────────────${NC}" + echo -e "${BOLD} Analyzing: ${type}${NC}" + echo -e "${DIM}────────────────────────────────────────────────────${NC}" + fi + + case "$type" in + system) analyze_system && ran_any=true || log_warn "system: skipped" ;; + auth) analyze_auth && ran_any=true || log_warn "auth: skipped" ;; + docker) analyze_docker && ran_any=true || log_warn "docker: skipped" ;; + java) analyze_java && ran_any=true || log_warn "java: skipped" ;; + gitlab) analyze_gitlab && ran_any=true || log_warn "gitlab: skipped" ;; + postgresql) analyze_postgresql && ran_any=true || log_warn "postgresql: skipped" ;; + esac + done + + if ! $ran_any; then + log_error "No log sources found for any type" + exit 1 + fi + else + case "$LOG_TYPE" in + system) analyze_system ;; + auth) analyze_auth ;; + docker) analyze_docker ;; + java) analyze_java ;; + gitlab) analyze_gitlab ;; + postgresql) analyze_postgresql ;; + *) log_error "Unknown log type: $LOG_TYPE"; show_usage ;; + esac + fi + + if $JSON_MODE; then + echo "{${JSON_OUTPUT}}" + fi +} + +main "$@" diff --git a/log-disk-analyzer.sh b/log-disk-analyzer.sh new file mode 100755 index 0000000..828ddbd --- /dev/null +++ b/log-disk-analyzer.sh @@ -0,0 +1,355 @@ +#!/usr/bin/env bash + +######################################################################################### +#### log-disk-analyzer.sh — Analyze log directory disk usage and report problems #### +#### Reports largest files, growth rates, unrotated logs, broken symlinks, #### +#### empty files, and subdirectory breakdown #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.00 #### +#### #### +#### Usage: #### +#### ./log-disk-analyzer.sh #### +#### ./log-disk-analyzer.sh --path /opt/app/logs --top 10 #### +#### ./log-disk-analyzer.sh --json #### +#### #### +#### See --help for all options. 
####
#########################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# LOG_PATH, TOP_COUNT and COLOR honour a value already present in the
# environment; the remaining settings always start from these literals.
LOG_PATH="${LOG_PATH:-/var/log}"
TOP_COUNT="${TOP_COUNT:-20}"
COLOR="${COLOR:-auto}"
TEXTFILE_DIR="/var/lib/node_exporter"
PROM_FILE=""
JSON_OUTPUT="false"
VERBOSE="false"

# ── State ─────────────────────────────────────────────────────────────
# Counters and parallel arrays shared by the report functions.
# FILE_LIST[i] / SIZE_LIST[i] describe the same file.
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
TOTAL_SIZE=0 TOTAL_FILES=0 EMPTY_COUNT=0 BROKEN_COUNT=0 UNROTATED_COUNT=0
FILE_LIST=() SIZE_LIST=() RECOMMENDATIONS=()

# ── Colors ────────────────────────────────────────────────────────────
# Set RED/GREEN/YELLOW/BOLD/DIM/RESET according to $COLOR:
#   never   -> all empty
#   always  -> ANSI sequences unconditionally
#   other   -> ANSI sequences only when stdout is a terminal
# The values are stored as literal backslash sequences, so they must be
# emitted through `echo -e` or printf's %b to take effect.
setup_colors() {
  if [[ "$COLOR" != "never" ]] && { [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; }; then
    RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[0;33m'
    BOLD='\033[1m' DIM='\033[2m' RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" BOLD="" DIM="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Diagnostics go to stderr. The ${VAR:-} form keeps these usable under
# `set -u` even when setup_colors has not run yet (e.g. early failures).
warn() { echo -e "${YELLOW:-}[WARN]${RESET:-} $*" >&2; }
err() { echo -e "${RED:-}[ERROR]${RESET:-} $*" >&2; }

# ── Helpers ───────────────────────────────────────────────────────────
# Print a byte count in B / KB / MB / GB (1024-based) without a trailing
# newline. Values of 1 KB and above carry one decimal digit, truncated
# (not rounded). Pure integer arithmetic: no external process per call
# and no dependence on the locale's decimal separator.
# Arguments: $1 - size in bytes (non-negative integer)
human_size() {
  local b="$1" unit label
  if [[ "$b" -ge 1073741824 ]]; then
    unit=1073741824 label="GB"
  elif [[ "$b" -ge 1048576 ]]; then
    unit=1048576 label="MB"
  elif [[ "$b" -ge 1024 ]]; then
    unit=1024 label="KB"
  else
    printf "%d B" "$b"
    return
  fi
  # whole part, then the first decimal digit derived from the remainder
  printf '%d.%d %s' "$(( b / unit ))" "$(( (b % unit) * 10 / unit ))" "$label"
}

# Print the age of a file in whole days, based on its modification time.
# The result is clamped to a minimum of 1 so callers can divide by it.
# Arguments: $1 - path to the file
# Returns:   1 (printing nothing) when the file cannot be stat'ed
file_age_days() {
  local mtime now age
  mtime=$(stat -c %Y -- "$1" 2>/dev/null) || return 1
  now=$(date +%s)
  age=$(( (now - mtime) / 86400 ))
  if (( age < 1 )); then
    age=1
  fi
  echo "$age"
}

# Print a horizontal rule made of 62 box-drawing characters.
separator() { printf " %s\n" "$(printf '%.0s─' {1..62})"; }

# Escape a string for embedding between double quotes in JSON.
# Backslash and double quote are escaped first; control characters below
# 0x20 are then emitted as \n, \r, \t or \u00XX so that unusual file names
# cannot produce an invalid document.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout, no trailing newline
json_escape() {
  local s="$1" code hex ch
  s="${s//\\/\\\\}"
  s="${s//\"/\\\"}"
  # Only pay for the control-character pass when one is actually present.
  if [[ "$s" == *[[:cntrl:]]* ]]; then
    s="${s//$'\n'/\\n}"
    s="${s//$'\r'/\\r}"
    s="${s//$'\t'/\\t}"
    for code in {1..31}; do
      printf -v hex '%02x' "$code"
      printf -v ch "\\x${hex}"
      s="${s//"$ch"/\\u00${hex}}"
    done
  fi
  printf '%s' "$s"
}

# ── Collect all
files ───────────────────────────────────────────────── +collect_files() { + while IFS= read -r line; do + local size="${line%% *}" file="${line#* }" + [[ -z "$size" || -z "$file" ]] && continue + FILE_LIST+=("$file"); SIZE_LIST+=("$size") + TOTAL_SIZE=$((TOTAL_SIZE + size)); TOTAL_FILES=$((TOTAL_FILES + 1)) + done < <(find "$LOG_PATH" -type f -printf '%s\t%p\n' 2>/dev/null | sort -t$'\t' -k1 -rn) +} + +# ── Top files by size ───────────────────────────────────────────────── +print_top_files() { + local limit="$TOP_COUNT" + [[ "$VERBOSE" == "true" ]] && limit="${#FILE_LIST[@]}" + [[ "$limit" -gt "${#FILE_LIST[@]}" ]] && limit="${#FILE_LIST[@]}" + echo ""; echo -e " ${BOLD}TOP ${limit} FILES BY SIZE${RESET}"; separator + for (( i = 0; i < limit; i++ )); do + local size_h color="" + size_h="$(human_size "${SIZE_LIST[$i]}")" + [[ "${SIZE_LIST[$i]}" -ge 104857600 ]] && color="$RED" + [[ "${SIZE_LIST[$i]}" -lt 104857600 && "${SIZE_LIST[$i]}" -ge 52428800 ]] && color="$YELLOW" + printf " %b%-10s%b %s\n" "$color" "$size_h" "$RESET" "${FILE_LIST[$i]}" + done +} + +# ── Growth rates ────────────────────────────────────────────────────── +print_growth_rates() { + local limit="$TOP_COUNT" + [[ "$VERBOSE" == "true" ]] && limit="${#FILE_LIST[@]}" + [[ "$limit" -gt "${#FILE_LIST[@]}" ]] && limit="${#FILE_LIST[@]}" + local -a rates=() rate_files=() rate_ages=() + for (( i = 0; i < ${#FILE_LIST[@]}; i++ )); do + local age; age=$(file_age_days "${FILE_LIST[$i]}") || continue + rates+=("$(( SIZE_LIST[i] / age ))"); rate_files+=("${FILE_LIST[$i]}"); rate_ages+=("$age") + done + [[ ${#rates[@]} -eq 0 ]] && return + local -a sorted_idx=() + mapfile -t sorted_idx < <(for i in "${!rates[@]}"; do echo "$i ${rates[$i]}"; done | sort -k2 -rn | head -n "$limit" | awk '{print $1}') + echo ""; echo -e " ${BOLD}GROWTH RATE (SIZE / AGE)${RESET}"; separator + for idx in "${sorted_idx[@]}"; do + printf " %-16s %s (%d days old)\n" "$(human_size "${rates[$idx]}")/day" "${rate_files[$idx]}" 
"${rate_ages[$idx]}" + done +} + +# ── Unrotated logs ──────────────────────────────────────────────────── +print_unrotated() { + echo ""; echo -e " ${BOLD}UNROTATED LOGS (>100 MB, >7 days old)${RESET}"; separator + local found=0 + while IFS= read -r file; do + [[ -z "$file" ]] && continue + local size age; size=$(stat -c %s "$file" 2>/dev/null) || continue + [[ "$size" -lt 104857600 ]] && continue + age=$(file_age_days "$file") || continue; [[ "$age" -lt 7 ]] && continue + found=1; UNROTATED_COUNT=$((UNROTATED_COUNT + 1)) + local size_h; size_h="$(human_size "$size")" + printf " %b%-10s%b %3d days %s\n" "$RED" "$size_h" "$RESET" "$age" "$file" + RECOMMENDATIONS+=("Rotate $(basename "$file") — ${size_h} and ${age} days old") + done < <(find "$LOG_PATH" -type f -name '*.log' 2>/dev/null) + [[ "$found" -eq 0 ]] && echo -e " ${GREEN}None found${RESET}" +} + +# ── Empty files ─────────────────────────────────────────────────────── +print_empty_files() { + echo ""; echo -e " ${BOLD}EMPTY FILES${RESET}"; separator + local found=0 + while IFS= read -r file; do + [[ -z "$file" ]] && continue; found=1; EMPTY_COUNT=$((EMPTY_COUNT + 1)) + echo " $file" + done < <(find "$LOG_PATH" -type f -empty 2>/dev/null) + [[ "$found" -eq 0 ]] && echo -e " ${GREEN}None found${RESET}" + if [[ "$EMPTY_COUNT" -gt 0 ]]; then + RECOMMENDATIONS+=("Remove ${EMPTY_COUNT} empty log files to reclaim inodes") + fi +} + +# ── Broken symlinks ────────────────────────────────────────────────── +print_broken_symlinks() { + echo ""; echo -e " ${BOLD}BROKEN SYMLINKS${RESET}"; separator + local found=0 + while IFS= read -r link; do + [[ -z "$link" ]] && continue; found=1; BROKEN_COUNT=$((BROKEN_COUNT + 1)) + local target; target=$(readlink "$link" 2>/dev/null || echo "unknown") + printf " %b%s%b -> %s\n" "$YELLOW" "$link" "$RESET" "$target" + done < <(find "$LOG_PATH" -xtype l 2>/dev/null) + [[ "$found" -eq 0 ]] && echo -e " ${GREEN}None found${RESET}" + if [[ "$BROKEN_COUNT" -gt 0 ]]; then + 
RECOMMENDATIONS+=("Fix ${BROKEN_COUNT} broken symlinks") + fi +} + +# ── Subdirectory breakdown ──────────────────────────────────────────── +print_subdir_breakdown() { + echo ""; echo -e " ${BOLD}DISK USAGE BY SUBDIRECTORY${RESET}"; separator + du -d 1 "$LOG_PATH" 2>/dev/null | sort -rn | while IFS=$'\t' read -r kb path; do + [[ "$path" == "$LOG_PATH" ]] && continue + printf " %-10s %s\n" "$(human_size $((kb * 1024)))" "$path" + done || true +} + +# ── Summary ─────────────────────────────────────────────────────────── +print_summary() { + echo ""; echo -e " ${BOLD}Summary${RESET}" + printf " %-20s %s\n" "Total size:" "$(human_size "$TOTAL_SIZE")" + printf " %-20s %d\n" "Total files:" "$TOTAL_FILES" + printf " %-20s %d\n" "Empty files:" "$EMPTY_COUNT" + printf " %-20s %d\n" "Broken symlinks:" "$BROKEN_COUNT" + printf " %-20s %d\n" "Unrotated logs:" "$UNROTATED_COUNT" + if [[ ${#RECOMMENDATIONS[@]} -gt 0 ]]; then + echo ""; echo -e " ${BOLD}Recommendations:${RESET}" + for rec in "${RECOMMENDATIONS[@]}"; do echo -e " ${YELLOW}•${RESET} $rec"; done + fi + echo "" +} + +# ── JSON output ─────────────────────────────────────────────────────── +print_json() { + local limit="$TOP_COUNT" i first + [[ "$VERBOSE" == "true" ]] && limit="${#FILE_LIST[@]}" + [[ "$limit" -gt "${#FILE_LIST[@]}" ]] && limit="${#FILE_LIST[@]}" + printf '{"path":"%s","top_files":[' "$(json_escape "$LOG_PATH")" + for (( i = 0; i < limit; i++ )); do + [[ "$i" -gt 0 ]] && printf ',' + printf '{"file":"%s","size":%s}' "$(json_escape "${FILE_LIST[$i]}")" "${SIZE_LIST[$i]}" + done + printf '],"growth_rates":[' + first=1 + for (( i = 0; i < limit && i < ${#FILE_LIST[@]}; i++ )); do + local age; age=$(file_age_days "${FILE_LIST[$i]}") || continue + [[ "$first" -eq 0 ]] && printf ','; first=0 + printf '{"file":"%s","bytes_per_day":%d,"age_days":%d}' \ + "$(json_escape "${FILE_LIST[$i]}")" "$(( SIZE_LIST[i] / age ))" "$age" + done + printf '],"unrotated_logs":[' + first=1 + while IFS= read -r file; do + [[ 
-z "$file" ]] && continue + local size age; size=$(stat -c %s "$file" 2>/dev/null) || continue + [[ "$size" -lt 104857600 ]] && continue + age=$(file_age_days "$file") || continue; [[ "$age" -lt 7 ]] && continue + [[ "$first" -eq 0 ]] && printf ','; first=0 + printf '{"file":"%s","size":%d,"age_days":%d}' "$(json_escape "$file")" "$size" "$age" + done < <(find "$LOG_PATH" -type f -name '*.log' 2>/dev/null) + printf '],"empty_files":[' + first=1 + while IFS= read -r file; do + [[ -z "$file" ]] && continue; [[ "$first" -eq 0 ]] && printf ','; first=0 + printf '"%s"' "$(json_escape "$file")" + done < <(find "$LOG_PATH" -type f -empty 2>/dev/null) + printf '],"broken_symlinks":[' + first=1 + while IFS= read -r link; do + [[ -z "$link" ]] && continue; [[ "$first" -eq 0 ]] && printf ','; first=0 + printf '{"link":"%s","target":"%s"}' "$(json_escape "$link")" "$(json_escape "$(readlink "$link" 2>/dev/null || echo unknown)")" + done < <(find "$LOG_PATH" -xtype l 2>/dev/null) + printf '],"summary":{"total_size":%d,"total_files":%d,"empty_files":%d,"broken_symlinks":%d,"unrotated_logs":%d}}\n' \ + "$TOTAL_SIZE" "$TOTAL_FILES" "$EMPTY_COUNT" "$BROKEN_COUNT" "$UNROTATED_COUNT" +} + +# ── Prometheus output ───────────────────────────────────────────────── +write_prometheus() { + local file="$1" + local output_dir + output_dir="$(dirname "$file")" + mkdir -p "$output_dir" + local tmp + tmp=$(mktemp "${output_dir}/.log_disk.XXXXXX") + { + echo "# HELP log_disk_total_bytes Total size of log directory in bytes" + echo "# TYPE log_disk_total_bytes gauge" + printf 'log_disk_total_bytes{path="%s"} %d\n' "$LOG_PATH" "$TOTAL_SIZE" + echo "# HELP log_disk_total_files Total number of files in log directory" + echo "# TYPE log_disk_total_files gauge" + printf 'log_disk_total_files{path="%s"} %d\n' "$LOG_PATH" "$TOTAL_FILES" + echo "# HELP log_disk_empty_files Number of empty files in log directory" + echo "# TYPE log_disk_empty_files gauge" + printf 'log_disk_empty_files{path="%s"} %d\n' 
"$LOG_PATH" "$EMPTY_COUNT" + echo "# HELP log_disk_broken_symlinks Number of broken symlinks in log directory" + echo "# TYPE log_disk_broken_symlinks gauge" + printf 'log_disk_broken_symlinks{path="%s"} %d\n' "$LOG_PATH" "$BROKEN_COUNT" + echo "# HELP log_disk_unrotated_logs Number of unrotated log files" + echo "# TYPE log_disk_unrotated_logs gauge" + printf 'log_disk_unrotated_logs{path="%s"} %d\n' "$LOG_PATH" "$UNROTATED_COUNT" + } > "$tmp" + chmod 644 "$tmp" + mv -f "$tmp" "$file" + verbose "Metrics written to ${file}" +} + +# ══════════════════════════════════════════════════════════════════════ +# USAGE +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; exit 1 ;; + *) err "Unexpected argument: $1"; echo "Run ${SCRIPT_NAME} --help for usage" >&2; exit 1 ;; + esac + done +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +main() { + parse_args "$@" + setup_colors + + if [[ ! 
-d "$LOG_PATH" ]]; then + err "Directory not found: $LOG_PATH" + exit 1 + fi + + collect_files + + if [[ "$JSON_OUTPUT" == "true" ]]; then + print_json + else + echo "" + echo -e "${BOLD}Log Disk Analyzer${RESET}" + echo -e "${DIM}Path: ${LOG_PATH}${RESET}" + print_top_files + print_growth_rates + print_unrotated + print_empty_files + print_broken_symlinks + print_subdir_breakdown + print_summary + fi + + if [[ -n "$PROM_FILE" ]]; then + write_prometheus "$PROM_FILE" + fi +} + +main "$@" diff --git a/login-attempt-exporter.sh b/login-attempt-exporter.sh new file mode 100755 index 0000000..a3d9362 --- /dev/null +++ b/login-attempt-exporter.sh @@ -0,0 +1,799 @@ +#!/bin/bash +################################################################################ +# Script Name: login-attempt-exporter.sh +# Version: 1.1 +# Description: Prometheus exporter for SSH/PAM login attempts providing +# comprehensive metrics for monitoring authentication activity, +# failed logins, brute force detection, and threat analysis +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Access to auth logs (/var/log/auth.log, /var/log/secure, or journalctl) +# - netcat (nc) for HTTP mode +# - Standard Unix tools (grep, awk, sort, uniq) +# +# Performance: +# Log lines are cached per period — the log is parsed once per period +# (1h, 24h, 7d), not once per metric. Timestamps are parsed natively +# in awk using mktime() with no subprocess spawning per line. +# Typical run time: a few seconds even on large auth logs. 
+# +# Usage: +# # Output to stdout +# ./login-attempt-exporter.sh +# +# # HTTP server mode +# ./login-attempt-exporter.sh --http -p 9198 +# +# # Textfile collector mode +# ./login-attempt-exporter.sh --textfile +# +# Metrics Exported: +# Core Status: +# - login_attempt_up - Exporter status (1=up, 0=down) +# - login_attempt_exporter_info{version,log_source} - Exporter info +# +# Attempt Counters: +# - login_attempt_failed_total{period} - Failed attempts per period +# - login_attempt_success_total{period} - Successful logins per period +# - login_attempt_invalid_user_total{period} - Invalid user attempts +# - login_attempt_preauth_disconnect_total{period} - Pre-auth disconnects +# +# Authentication Methods: +# - login_attempt_auth_method_count{method,period} - Attempts by method +# +# Unique Counts: +# - login_attempt_unique_source_ips{status,period} - Unique IPs +# - login_attempt_unique_users{status,period} - Unique usernames +# +# Top Attackers: +# - login_attempt_top_failed_ip_count{ip} - Top source IPs (24h) +# - login_attempt_top_failed_user_count{user} - Top targeted users (24h) +# - login_attempt_top_success_user_count{user} - Top successful users (24h) +# +# Threat Detection: +# - login_attempt_brute_force_score{ip} - Brute force heuristic score +# - login_attempt_root_attempts{period} - Root login attempts +# +# Rates: +# - login_attempt_failed_per_hour - Average failed/hour (24h) +# - login_attempt_success_rate - Success ratio (24h) +# +# Log Health: +# - login_attempt_log_size_bytes - Auth log file size +# - login_attempt_log_age_seconds - Seconds since last modification +# - login_attempt_exporter_duration_seconds - Script execution time +# - login_attempt_exporter_last_run_timestamp - Last run unix timestamp +# +# Configuration: +# Default HTTP port: 9198 +# Textfile directory: /var/lib/node_exporter +# Log sources: /var/log/auth.log, /var/log/secure, journalctl +# +################################################################################ + +# 
============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9198 +AUTH_LOG="" +LOG_SOURCE="" +CACHE_DIR="" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Detect the correct auth log source +# Sets AUTH_LOG and LOG_SOURCE global variables +# Returns: 0 if found, 1 if no log source available +detect_auth_log() { + if [ -r "/var/log/auth.log" ]; then + AUTH_LOG="/var/log/auth.log" + LOG_SOURCE="auth.log" + return 0 + elif [ -r "/var/log/secure" ]; then + AUTH_LOG="/var/log/secure" + LOG_SOURCE="secure" + return 0 + elif command -v journalctl >/dev/null 2>&1 && journalctl -u sshd -n 1 >/dev/null 2>&1; then + AUTH_LOG="" + LOG_SOURCE="journalctl" + return 0 + fi + return 1 +} + +# Set up a temp directory for caching filtered log lines per period +# Called once at the start of generate_metrics +setup_cache() { + CACHE_DIR=$(mktemp -d /tmp/login-attempt-exporter.XXXXXX) +} + +# Clean up cached log data +cleanup_cache() { + [ -n "$CACHE_DIR" ] && rm -rf "$CACHE_DIR" + CACHE_DIR="" +} + +# Get log lines filtered by time period (cached per period) +# The log is read once per period and cached to a temp file. +# Subsequent calls for the same period return the cached result. 
# Args: $1 - period ("1 hour ago", "24 hours ago", "7 days ago")
# Returns: Filtered log lines on stdout
# Notes: When CACHE_DIR is empty or not a directory the log is filtered on
#        every call instead of writing a cache file to an unintended path.
get_log_lines() {
    local period="$1"

    # No usable cache directory: stream the filtered lines without caching
    if [ -z "${CACHE_DIR:-}" ] || [ ! -d "$CACHE_DIR" ]; then
        _filter_log_lines "$period"
        return
    fi

    local cache_key="${period// /_}"
    local cache_file="$CACHE_DIR/$cache_key"

    # Populate the cache on first use; the file is created (possibly empty)
    # even when no log source is configured, so the cat below cannot fail
    if [ ! -f "$cache_file" ]; then
        _filter_log_lines "$period" > "$cache_file"
    fi

    cat "$cache_file"
}

# Read the configured log source and emit only entries newer than the period
# Globals: LOG_SOURCE, AUTH_LOG (read)
# Args: $1 - period understood by `date -d` and `journalctl --since`
# Returns: Matching log lines on stdout (nothing if no source is readable)
_filter_log_lines() {
    local period="$1"

    if [ "$LOG_SOURCE" = "journalctl" ]; then
        journalctl -u sshd --since "$period" --no-pager 2>/dev/null
        return 0
    fi

    if [ -z "$AUTH_LOG" ] || [ ! -r "$AUTH_LOG" ]; then
        return 0
    fi

    local cutoff_timestamp now_timestamp current_year
    cutoff_timestamp=$(date -d "$period" +%s 2>/dev/null || echo 0)
    now_timestamp=$(date +%s)
    current_year=$(date +%Y)

    # Use awk mktime() for native timestamp parsing — no subprocess per line
    # Supports both traditional syslog (Apr 20 10:08:23) and
    # ISO 8601 / RFC 3339 (2026-04-20T10:08:23.066974+02:00) formats.
    # NOTE(review): the ISO branch ignores the UTC offset and treats the stamp
    # as local time, exactly as before — confirm logs are written in local time.
    awk -v cutoff="$cutoff_timestamp" -v now="$now_timestamp" -v year="$current_year" '
    BEGIN {
        m["Jan"]=1; m["Feb"]=2; m["Mar"]=3; m["Apr"]=4
        m["May"]=5; m["Jun"]=6; m["Jul"]=7; m["Aug"]=8
        m["Sep"]=9; m["Oct"]=10; m["Nov"]=11; m["Dec"]=12
    }
    {
        if ($1 in m) {
            # Traditional syslog: Apr 20 10:08:23 (the stamp carries no year)
            split($3, t, ":")
            stamp = m[$1] " " ($2 + 0) " " t[1] " " t[2] " " t[3]
            ts = mktime(year " " stamp)
            # More than a day ahead of now means the entry is from last year
            if (ts > now + 86400) ts = mktime((year - 1) " " stamp)
            if (ts >= cutoff) print
        } else if ($1 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]/) {
            # ISO 8601: 2026-04-20T10:08:23.066974+02:00
            # Digits are spelled out because some awk builds lack {n} intervals
            split($1, dt, "[-T:.+]")
            ts = mktime(dt[1] " " (dt[2] + 0) " " (dt[3] + 0) " " (dt[4] + 0) " " (dt[5] + 0) " " (dt[6] + 0))
            if (ts >= cutoff) print
        }
    }' "$AUTH_LOG" 2>/dev/null
    return 0
}

# Count log lines in a period that match an extended regular expression
# Args: $1 - period, $2 - ERE pattern
# Returns: Number of matching lines (0 when nothing matches or grep fails)
_count_log_matches() {
    local period="$1"
    local pattern="$2"
    local count
    count=$(get_log_lines "$period" | grep -cE -- "$pattern" 2>/dev/null)
    echo "${count:-0}"
}

# Extract every dotted-quad IPv4 address from stdin, one per line
_extract_ipv4() {
    grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+'
}

# Extract the targeted username from "Failed password" lines on stdin
# Handles both "for invalid user NAME" and "for NAME"
_extract_failed_users() {
    sed -n 's/.*Failed password for invalid user \([^ ]*\).*/\1/p; s/.*Failed password for \([^ ]*\).*/\1/p'
}

# Extract the username (the word after the first "for") from "Accepted" lines
_extract_accepted_users() {
    awk '{for(i=1;i<=NF;i++) if($i=="for") {print $(i+1); break}}'
}

# Count failed password attempts in a time period
# Args: $1 - period (e.g., "1 hour ago")
# Returns: Number of failed attempts
count_failed_attempts() {
    _count_log_matches "$1" "sshd.*Failed password"
}

# Count successful logins (password + publickey) in a time period
# Args: $1 - period (e.g., "1 hour ago")
# Returns: Number of successful logins
count_successful_logins() {
    _count_log_matches "$1" "sshd.*Accepted"
}

# Count invalid user attempts in a time period
# Args: $1 - period (e.g., "1 hour ago")
# Returns: Number of invalid user attempts
count_invalid_users() {
    _count_log_matches "$1" "sshd.*Invalid user"
}

# Count pre-auth disconnects in a time period
# Args: $1 - period (e.g., "1 hour ago")
# Returns: Number of pre-auth disconnect events
count_preauth_disconnects() {
    _count_log_matches "$1" "sshd.*(Connection closed by.*\[preauth\]|Disconnected from.*\[preauth\])"
}

# Get authentication method counts for a time period
# Args: $1 - period (e.g., "24 hours ago")
# Returns: Lines of "method count" format (password, publickey, keyboard-interactive)
get_auth_method_counts() {
    local period="$1"

    # Single pass over the cached lines; "+ 0" forces 0 for unseen methods
    get_log_lines "$period" | awk '
        /sshd.*Accepted password/             { password++ }
        /sshd.*Accepted publickey/            { publickey++ }
        /sshd.*Accepted keyboard-interactive/ { kbdint++ }
        END {
            print "password", password + 0
            print "publickey", publickey + 0
            print "keyboard-interactive", kbdint + 0
        }'
}

# Get top source IPs from failed attempts (24h)
# Args: $1 - limit (default: 10)
# Returns: Lines with "count IP" format, sorted by count descending
get_top_failed_ips() {
    local limit="${1:-10}"
    get_log_lines "24 hours ago" | grep "sshd.*Failed password" | \
        _extract_ipv4 | \
        sort | uniq -c | sort -rn | head -n "$limit"
}

# Get most targeted usernames from failed attempts (24h)
# Args: $1 - limit (default: 10)
# Returns: Lines with "count username" format, sorted by count descending
get_top_failed_users() {
    local limit="${1:-10}"
    get_log_lines "24 hours ago" | grep "sshd.*Failed password" | \
        _extract_failed_users | \
        sort | uniq -c | sort -rn | head -n "$limit"
}

# Get users with most successful logins (24h)
# Args: $1 - limit (default: 10)
# Returns: Lines with "count username" format, sorted by count descending
get_top_successful_users() {
    local limit="${1:-10}"
    get_log_lines "24 hours ago" | grep "sshd.*Accepted" | \
        _extract_accepted_users | \
        sort | uniq -c | sort -rn | head -n "$limit"
}

# Count unique source IPs in a time period
# Args: $1 - period, $2 - status (failed/success)
# Returns: Number of unique IPs
get_unique_source_ips() {
    local period="$1"
    local status="$2"
    local pattern count

    if [ "$status" = "failed" ]; then
        pattern="sshd.*Failed password"
    else
        pattern="sshd.*Accepted"
    fi

    count=$(get_log_lines "$period" | grep "$pattern" | \
        _extract_ipv4 | \
        sort -u | wc -l 2>/dev/null)
    echo "${count:-0}"
}

# Count unique usernames in a time period
# Args: $1 - period, $2 - status (failed/success)
# Returns: Number of unique usernames
get_unique_users() {
    local period="$1"
    local status="$2"
    local count

    if [ "$status" = "failed" ]; then
        count=$(get_log_lines "$period" | grep "sshd.*Failed password" | \
            _extract_failed_users | \
            sort -u | wc -l 2>/dev/null)
    else
        count=$(get_log_lines "$period" | grep "sshd.*Accepted" | \
            _extract_accepted_users | \
            sort -u | wc -l 2>/dev/null)
    fi
    echo "${count:-0}"
}

# Detect brute force candidates: IPs with 10+ failed attempts in 1h
hitting 3+ unique users +# Returns: Lines of "ip score" where score = attempts × unique_users +get_brute_force_candidates() { + local lines + lines=$(get_log_lines "1 hour ago" | grep "sshd.*Failed password") + + [ -z "$lines" ] && return + + # Extract IP and username pairs, then compute per-IP stats + echo "$lines" | \ + awk '{ + ip = ""; user = "" + for (i=1; i<=NF; i++) { + if ($i ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) ip = $i + if ($i == "for" && $(i+1) == "invalid" && $(i+2) == "user") { user = $(i+3); i+=3 } + else if ($i == "for") { user = $(i+1) } + } + if (ip != "" && user != "") { + attempts[ip]++ + users[ip, user] = 1 + } + } + END { + for (ip in attempts) { + unique = 0 + for (key in users) { + split(key, parts, SUBSEP) + if (parts[1] == ip) unique++ + } + if (attempts[ip] >= 10 && unique >= 3) { + score = attempts[ip] * unique + print ip, score + } + } + }' +} + +# Count login attempts targeting root +# Args: $1 - period (e.g., "1 hour ago") +# Returns: Number of root login attempts +get_root_login_attempts() { + local period="$1" + local count + count=$(get_log_lines "$period" | grep -c "sshd.*Failed password for root" 2>/dev/null) + echo "${count:-0}" +} + +# Calculate average failed attempts per hour over the last 24h +# Returns: Floating point average +get_hourly_attempt_rate() { + local total + total=$(count_failed_attempts "24 hours ago") + total=${total:-0} + + if [ "$total" -gt 0 ] 2>/dev/null; then + awk "BEGIN {printf \"%.2f\", $total / 24}" 2>/dev/null || echo "0.00" + else + echo "0.00" + fi +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +# Generate all Prometheus metrics +# Returns: Prometheus text format metrics on stdout +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Detect auth log source + if ! 
detect_auth_log; then + cat </dev/null; then + success_rate=$(awk "BEGIN {printf \"%.4f\", $success_24h / $total_24h}" 2>/dev/null || echo "0") + else + success_rate="0" + fi + + cat </dev/null || echo "0") + log_age=$(($(date +%s) - $(stat -c %Y "$AUTH_LOG" 2>/dev/null || echo 0))) + fi + + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + # Infinite loop accepting HTTP requests + while true; do + { + read -r request + # Check if request is for /metrics endpoint + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else # Serve HTML landing page for other requests + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Login Attempt Exporter v1.0 + +

Login Attempt Exporter v1.0

+

Metrics

+

Metric Categories

+
    +
  • Core Status: exporter up/down, version info
  • +
  • Attempt Counters: failed, successful, invalid user, pre-auth disconnects
  • +
  • Authentication Methods: password, publickey, keyboard-interactive
  • +
  • Unique Counts: unique source IPs and usernames
  • +
  • Top Attackers: top failed IPs, targeted users, successful users
  • +
  • Threat Detection: brute force scores, root login attempts
  • +
  • Rates: failed attempts per hour, success ratio
  • +
  • Log Health: log size, age, exporter runtime
  • +
+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null # -q 1: wait 1 second after EOF before closing + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +# Main entry point - routes to appropriate output mode +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + # Run HTTP server (blocks until killed) + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + # Textfile collector mode: write atomically using temp file + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + # Create temp file in SAME directory for atomic rename (same filesystem) + local temp_file + temp_file=$(mktemp "${output_dir}/.login_attempt_metrics.XXXXXX") + + # Generate metrics to temp file + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + # Validate: file must exist, have content, and contain enough metric lines + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + # Set permissions before move + chmod 644 "$temp_file" + + # Atomic rename - no gap where file is missing + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + # Default: output to stdout + generate_metrics + fi +} + +# Execute main function with all script arguments +main "$@" diff --git a/logrotate-check-exporter.sh b/logrotate-check-exporter.sh new file mode 100755 index 0000000..688dca8 --- /dev/null +++ b/logrotate-check-exporter.sh @@ -0,0 +1,385 @@ +#!/usr/bin/env bash + +######################################################################################### +#### logrotate-check-exporter.sh — Logrotate health 
metrics for Prometheus #### +#### Tracks rotation timestamps, log file sizes/ages, and stale log detection #### +#### Requires: bash 4+, coreutils #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.2 #### +#### #### +#### Usage: #### +#### ./logrotate-check-exporter.sh # stdout #### +#### ./logrotate-check-exporter.sh --textfile # node_exporter textfile #### +#### ./logrotate-check-exporter.sh --daemon # continuous collection #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +SCRIPT_NAME=$(basename "$0") +readonly SCRIPT_NAME + +# Default configuration +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +LOG_DIR="${LOG_DIR:-/var/log}" +WATCH_PATHS="" +readonly DEFAULT_STALE_THRESHOLD=172800 +readonly DEFAULT_COLLECTION_INTERVAL=60 + +# Configuration variables (can be overridden by environment) +STALE_THRESHOLD=${STALE_THRESHOLD:-$DEFAULT_STALE_THRESHOLD} +COLLECTION_INTERVAL=${COLLECTION_INTERVAL:-$DEFAULT_COLLECTION_INTERVAL} +DEBUG=${DEBUG:-} + +# Runtime flags +RUN_MODE="once" + +debug_echo() { + if [[ -n "$DEBUG" ]]; then + echo "[DEBUG] $*" >&2 + fi +} + +show_help() { + cat << EOF +Usage: $SCRIPT_NAME [OPTIONS] + +Logrotate health metrics for Prometheus (v1.2). + +Parses the logrotate status file to track rotation timestamps, monitors log file +sizes and ages, and detects stale logs that haven't been rotated within a +configurable threshold. + +By default, discovers all log files in $LOG_DIR (excluding compressed archives). +Use --watch to monitor specific files instead. 
+ +MODES: + --textfile Write to node_exporter textfile collector + --daemon Run continuously at COLLECTION_INTERVAL + (default) Output metrics to stdout + +OPTIONS: + --watch PATHS Comma-separated log file paths to monitor instead of auto-discovery + --log-dir DIR Base directory for auto-discovery (default: /var/log) + -o, --output Output file path + -h, --help Show this help message + +ENVIRONMENT VARIABLES: + LOG_DIR Base directory for log file discovery (default: /var/log) + STALE_THRESHOLD Seconds before a file is considered stale (default: $DEFAULT_STALE_THRESHOLD = 48h) + COLLECTION_INTERVAL Seconds between collections in daemon mode (default: $DEFAULT_COLLECTION_INTERVAL) + DEBUG Enable debug output + +EXAMPLES: + $SCRIPT_NAME # Auto-discover all logs in /var/log + $SCRIPT_NAME --textfile # Write to textfile collector + $SCRIPT_NAME --watch /var/log/syslog,/var/log/auth.log # Monitor specific files + $SCRIPT_NAME --log-dir /opt/app/logs # Scan a custom log directory + $SCRIPT_NAME --daemon # Continuous collection + +METRICS: + - logrotate_last_run_timestamp Unix timestamp of last logrotate run + - logrotate_status Whether logrotate has run recently (1/0) + - logrotate_files_total Files tracked in logrotate status + - logrotate_stale_files_total Files not rotated within threshold + - log_file_size_bytes Size of monitored log files + - log_file_age_seconds Age of monitored log files + +EOF + exit 0 +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --textfile) OUTPUT_FILE="$TEXTFILE_DIR/logrotate_check.prom"; shift ;; + --daemon) RUN_MODE="daemon"; shift ;; + --watch) WATCH_PATHS="${2:?--watch requires paths}"; shift 2 ;; + --log-dir) LOG_DIR="${2:?--log-dir requires a path}"; shift 2 ;; + -o|--output) OUTPUT_FILE="${2:?--output requires a path}"; shift 2 ;; + --help|-h) show_help ;; + *) echo "Unknown option: $1" >&2; show_help ;; + esac +done + +# Locate the logrotate status file +find_status_file() { + if [[ -f /var/lib/logrotate/status ]]; 
then + echo "/var/lib/logrotate/status" + elif [[ -f /var/lib/logrotate.status ]]; then + echo "/var/lib/logrotate.status" + else + debug_echo "No logrotate status file found" + echo "" + fi +} + +# Validate output directory exists when writing to file +validate_output() { + if [[ -n "$OUTPUT_FILE" ]]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + if [[ ! -d "$output_dir" ]]; then + echo "Error: Output directory not found: $output_dir" >&2 + echo "Create it: sudo mkdir -p $output_dir" >&2 + exit 1 + fi + fi +} + +# Collect logrotate status metrics +collect_logrotate_status() { + local status_file + status_file=$(find_status_file) + + if [[ -z "$status_file" ]]; then + cat </dev/null || echo 0) + + # Determine if logrotate is stale + local now + now=$(date +%s) + local age=$(( now - last_run_ts )) + local status=1 + if [[ $age -gt $STALE_THRESHOLD ]]; then + status=0 + fi + + # Count total tracked files and stale files in status + local files_total=0 + local stale_files=0 + + while IFS= read -r line; do + # Skip header line and empty lines + [[ "$line" =~ ^\".*\" ]] || continue + + files_total=$(( files_total + 1 )) + + # Extract the date portion — format: "filename" date + local date_str + date_str=$(echo "$line" | sed 's/^"[^"]*"[[:space:]]*//') + + if [[ -n "$date_str" ]]; then + local file_ts + file_ts=$(date -d "$date_str" +%s 2>/dev/null || echo 0) + if [[ $file_ts -gt 0 ]]; then + local file_age=$(( now - file_ts )) + if [[ $file_age -gt $STALE_THRESHOLD ]]; then + stale_files=$(( stale_files + 1 )) + fi + fi + fi + done < "$status_file" + + debug_echo "Status file: $files_total files tracked, $stale_files stale" + + cat </dev/null | sort +} + +# Collect log file size and age metrics +collect_log_files() { + local now + now=$(date +%s) + + local size_lines="" + local age_lines="" + local has_entries=0 + + if [[ -n "$WATCH_PATHS" ]]; then + # Explicit watch list + IFS=',' read -ra paths <<< "$WATCH_PATHS" + for path in "${paths[@]}"; do + 
path=$(echo "$path" | xargs) + [[ -f "$path" ]] || continue + + has_entries=1 + + local size + size=$(stat -c %s "$path" 2>/dev/null || echo 0) + local mtime + mtime=$(stat -c %Y "$path" 2>/dev/null || echo 0) + local age=$(( now - mtime )) + + size_lines+="log_file_size_bytes{path=\"${path}\"} ${size}\n" + age_lines+="log_file_age_seconds{path=\"${path}\"} ${age}\n" + debug_echo "Log file: $path size=$size age=${age}s" + done + else + # Auto-discover log files + debug_echo "Auto-discovering log files in $LOG_DIR" + while IFS= read -r path; do + [[ -f "$path" ]] || continue + + has_entries=1 + + local size + size=$(stat -c %s "$path" 2>/dev/null || echo 0) + local mtime + mtime=$(stat -c %Y "$path" 2>/dev/null || echo 0) + local age=$(( now - mtime )) + + size_lines+="log_file_size_bytes{path=\"${path}\"} ${size}\n" + age_lines+="log_file_age_seconds{path=\"${path}\"} ${age}\n" + debug_echo "Log file: $path size=$size age=${age}s" + done < <(discover_log_files) + fi + + if [[ $has_entries -eq 1 ]]; then + echo "# HELP log_file_size_bytes Size of monitored log file in bytes." + echo "# TYPE log_file_size_bytes gauge" + echo -e "$size_lines" + + echo "# HELP log_file_age_seconds Seconds since last modification of monitored log file." 
+ echo "# TYPE log_file_age_seconds gauge" + echo -e "$age_lines" + fi +} + +# Collect exporter metadata +collect_metadata() { + local duration="$1" + local success="$2" + + cat < "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [[ "$file_lines" -lt 5 ]]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + debug_echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" +} + +# Main +main() { + validate_output + + case "$RUN_MODE" in + once) + if [[ -n "$OUTPUT_FILE" ]]; then + write_metrics + else + generate_metrics + fi + ;; + daemon) + [[ -z "$OUTPUT_FILE" ]] && OUTPUT_FILE="$TEXTFILE_DIR/logrotate_check.prom" + validate_output + echo "$SCRIPT_NAME running in daemon mode (interval: ${COLLECTION_INTERVAL}s)" >&2 + while true; do + write_metrics + sleep "$COLLECTION_INTERVAL" + done + ;; + esac +} + +main diff --git a/logrotate-smoke-tests.sh b/logrotate-smoke-tests.sh new file mode 100644 index 0000000..d44892e --- /dev/null +++ b/logrotate-smoke-tests.sh @@ -0,0 +1,532 @@ +#!/usr/bin/env bash + +############################################################################################ +#### logrotate-smoke-tests.sh — Verify logrotate configs, rotation, and log hygiene #### +#### Zero external dependencies. Read-only — never modifies configs or log files. #### +#### Requires: bash 4+, coreutils #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.01 #### +#### #### +#### Usage: #### +#### sudo ./logrotate-smoke-tests.sh #### +#### #### +#### See --help for all options. 
####
############################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting can be overridden from the environment.
LOG_DIR="${LOG_DIR:-/var/log}"
MAX_LOG_SIZE_MB="${MAX_LOG_SIZE_MB:-500}"
MAX_DISK_USAGE_PCT="${MAX_DISK_USAGE_PCT:-80}"
MAX_ARCHIVE_AGE_DAYS="${MAX_ARCHIVE_AGE_DAYS:-90}"
SKIP_SYNTAX="${SKIP_SYNTAX:-false}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" # text, tap, junit
JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}"
VERBOSE="${VERBOSE:-false}"
COLOR="${COLOR:-auto}"

# ── State ─────────────────────────────────────────────────────────────
PASS=0
FAIL=0
SKIP=0
TOTAL=0
RESULTS=()
START_TIME=""

# ── Colors ────────────────────────────────────────────────────────────
# Set the color variables used by the logging and result helpers.
# COLOR=never disables them, COLOR=always forces them, anything else enables
# them only when stdout is a terminal.
setup_colors() {
  if [[ "$COLOR" != "never" ]] && { [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; }; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    RESET='\033[0m'
  else
    RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
  fi
}

# ── Logging ───────────────────────────────────────────────────────────
# log and verbose write to stdout; warn and err write to stderr.
log() { echo -e "${BLUE}[INFO]${RESET} $*"; }
warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; }
err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; }
verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; }

# ── Test Result Recording ─────────────────────────────────────────────
# Each recorder bumps its counter and TOTAL, appends "STATUS|name|detail" to
# RESULTS, and prints one line (TAP syntax when OUTPUT_FORMAT=tap).
# They always return 0 so a recorded result can never abort the run under
# `set -e`.

# Record a passing test.
# Args: $1 - test name, $2 - optional detail
record_pass() {
  local name="$1"
  local detail="${2:-}"
  PASS=$((PASS + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("PASS|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name}"
  else
    echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

# Record a failing test.
# Args: $1 - test name, $2 - optional detail
record_fail() {
  local name="$1"
  local detail="${2:-}"
  FAIL=$((FAIL + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("FAIL|${name}|${detail}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "not ok ${TOTAL} - ${name}"
    # An explicit if keeps the function status 0 when there is no detail;
    # a trailing `[[ … ]] && echo` would return 1 and trip `set -e`.
    if [[ -n "$detail" ]]; then
      echo " # ${detail}"
    fi
  else
    echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}"
  fi
}

# Record a skipped test.
# Args: $1 - test name, $2 - optional reason
record_skip() {
  local name="$1"
  local reason="${2:-}"
  SKIP=$((SKIP + 1))
  TOTAL=$((TOTAL + 1))
  RESULTS+=("SKIP|${name}|${reason}")
  if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
    echo "ok ${TOTAL} - ${name} # SKIP ${reason}"
  else
    echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}"
  fi
}

# ══════════════════════════════════════════════════════════════════════
# TEST SUITES
# ══════════════════════════════════════════════════════════════════════

# ── 1. Binary & Config ───────────────────────────────────────────────
# Check that the logrotate binary, /etc/logrotate.conf and a non-empty
# /etc/logrotate.d/ are present. Records one result per check.
test_binary_and_config() {
  if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
    echo ""
    echo -e "${BOLD}Binary & Config${RESET}"
  fi

  # 1. logrotate binary exists and is executable
  local lr_path
  lr_path=$(command -v logrotate 2>/dev/null) || lr_path=""
  if [[ -n "$lr_path" && -x "$lr_path" ]]; then
    record_pass "logrotate binary exists" "$lr_path"
  else
    record_fail "logrotate binary exists" "logrotate not found in PATH"
  fi

  # 2. /etc/logrotate.conf exists and is readable
  if [[ -r /etc/logrotate.conf ]]; then
    record_pass "/etc/logrotate.conf exists and is readable"
  else
    record_fail "/etc/logrotate.conf exists and is readable" "file missing or not readable"
  fi

  # 4. /etc/logrotate.d/ directory exists with configs
  if [[ -d /etc/logrotate.d ]]; then
    local conf_count
    # A find error (e.g. unreadable directory) must not abort under pipefail
    conf_count=$({ find /etc/logrotate.d -maxdepth 1 -type f 2>/dev/null || true; } | wc -l)
    if [[ "$conf_count" -gt 0 ]]; then
      record_pass "/etc/logrotate.d/ directory exists" "${conf_count} config files"
    else
      record_fail "/etc/logrotate.d/ directory exists" "directory exists but contains no config files"
    fi
  else
    record_fail "/etc/logrotate.d/ directory exists" "directory missing"
  fi
}

# ── 2.
# Config Syntax ─────────────────────────────────────────────────
# Validate /etc/logrotate.conf and every regular file in /etc/logrotate.d with
# `logrotate -d`. Records a skip when SKIP_SYNTAX=true or when logrotate is
# not found in PATH.
test_config_syntax() {
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        echo ""
        echo -e "${BOLD:-}Config Syntax${RESET:-}"
    fi

    if [[ "$SKIP_SYNTAX" == "true" ]]; then
        record_skip "logrotate.conf syntax valid" "SKIP_SYNTAX=true"
        return
    fi

    local lr_path
    lr_path=$(command -v logrotate 2>/dev/null) || lr_path=""
    if [[ -z "$lr_path" ]]; then
        record_skip "logrotate.conf syntax valid" "logrotate binary not found"
        return
    fi

    # 3. logrotate config syntax valid
    local syntax_output
    if syntax_output=$("$lr_path" -d /etc/logrotate.conf 2>&1); then
        record_pass "logrotate.conf syntax valid"
        verbose "logrotate -d output: ${syntax_output:0:200}"
    else
        record_fail "logrotate.conf syntax valid" "logrotate -d returned errors"
        verbose "logrotate -d output: ${syntax_output:0:500}"
    fi

    # 5. Individual config syntax check
    if [[ -d /etc/logrotate.d ]]; then
        local cfg cfg_name cfg_output
        for cfg in /etc/logrotate.d/*; do
            [[ -f "$cfg" ]] || continue   # also skips the literal pattern when nothing matches
            cfg_name=${cfg##*/}
            if cfg_output=$("$lr_path" -d "$cfg" 2>&1); then
                record_pass "logrotate.d/${cfg_name}" "syntax valid"
            else
                record_fail "logrotate.d/${cfg_name}" "syntax error"
                verbose "logrotate -d ${cfg} output: ${cfg_output:0:500}"
            fi
        done
    fi
}

# Count .gz/.xz files up to two levels below LOG_DIR and print the number.
# Extra arguments are passed to find as additional tests (e.g. -mtime +90).
# find's exit status is deliberately ignored: it is non-zero for a missing
# LOG_DIR or a single unreadable subdirectory, and with `set -e -o pipefail`
# that would otherwise abort the whole script from inside an assignment.
_count_log_archives() {
    {
        find "$LOG_DIR" -maxdepth 2 -type f \( -name '*.gz' -o -name '*.xz' \) "$@" 2>/dev/null || true
    } | wc -l
}

# ── 3. Log Hygiene ───────────────────────────────────────────────────
# Check that compressed archives exist, that no uncompressed file exceeds
# MAX_LOG_SIZE_MB, and that LOG_DIR is root-owned and not world-writable.
test_log_hygiene() {
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        echo ""
        echo -e "${BOLD:-}Log Hygiene${RESET:-}"
    fi

    # 6. Compressed log archives exist where expected
    local archive_count
    archive_count=$(_count_log_archives)
    if [[ "$archive_count" -gt 0 ]]; then
        record_pass "Compressed log archives exist" "${archive_count} archives found"
    else
        record_fail "Compressed log archives exist" "no .gz or .xz files found in ${LOG_DIR}"
    fi

    # 7. No oversized unrotated logs
    # Keep whatever find printed even when it exits non-zero (e.g. one
    # unreadable subdirectory); discarding it would turn a partial scan into
    # a false pass.
    local -a oversized=()
    mapfile -t oversized < <(
        find "$LOG_DIR" -maxdepth 2 -type f -not -name '*.gz' -not -name '*.xz' \
            -size +"${MAX_LOG_SIZE_MB}M" 2>/dev/null || true
    )
    if [[ ${#oversized[@]} -eq 0 ]]; then
        record_pass "No oversized unrotated logs" "all under ${MAX_LOG_SIZE_MB} MB"
    else
        record_fail "No oversized unrotated logs" \
            "${#oversized[@]} file(s) exceed ${MAX_LOG_SIZE_MB} MB (e.g. ${oversized[0]})"
    fi

    # 8. Log directory permissions
    if [[ -d "$LOG_DIR" ]]; then
        local dir_owner dir_perms
        dir_owner=$(stat -c '%U' "$LOG_DIR" 2>/dev/null) || dir_owner="unknown"
        dir_perms=$(stat -c '%a' "$LOG_DIR" 2>/dev/null) || dir_perms="000"

        # Last octal digit is the "other" class; 2, 3, 6 and 7 include write.
        local world_writable=false
        if [[ "${dir_perms: -1}" =~ [2367] ]]; then
            world_writable=true
        fi

        if [[ "$dir_owner" == "root" && "$world_writable" == "false" ]]; then
            record_pass "${LOG_DIR} permissions" "owner ${dir_owner}, mode ${dir_perms}"
        elif [[ "$dir_owner" != "root" ]]; then
            record_fail "${LOG_DIR} permissions" "owner is ${dir_owner}, expected root"
        else
            record_fail "${LOG_DIR} permissions" "world-writable (mode ${dir_perms})"
        fi
    else
        record_fail "${LOG_DIR} permissions" "directory does not exist"
    fi
}

# ── 4. Disk & Retention ──────────────────────────────────────────────
# Check filesystem usage for LOG_DIR against MAX_DISK_USAGE_PCT, look for
# archives older than MAX_ARCHIVE_AGE_DAYS, and verify that the logrotate
# status file was modified within the last two days.
test_disk_and_retention() {
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        echo ""
        echo -e "${BOLD:-}Disk & Retention${RESET:-}"
    fi

    # 9. /var/log disk usage under threshold
    # -P forces one line per filesystem so a long device name cannot push the
    # capacity column onto a third line.
    local usage_pct
    usage_pct=$(df -P "$LOG_DIR" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}') || usage_pct=""
    if [[ "$usage_pct" =~ ^[0-9]+$ ]]; then
        if [[ "$usage_pct" -lt "$MAX_DISK_USAGE_PCT" ]]; then
            record_pass "${LOG_DIR} disk usage" "${usage_pct}% (under ${MAX_DISK_USAGE_PCT}% threshold)"
        else
            record_fail "${LOG_DIR} disk usage" "${usage_pct}% (exceeds ${MAX_DISK_USAGE_PCT}% threshold)"
        fi
    else
        record_skip "${LOG_DIR} disk usage" "could not determine disk usage"
    fi

    # 10. Stale .gz/.xz files older than retention threshold
    local stale_count
    stale_count=$(_count_log_archives -mtime +"${MAX_ARCHIVE_AGE_DAYS}")
    if [[ "$stale_count" -eq 0 ]]; then
        record_pass "No stale archives" "none older than ${MAX_ARCHIVE_AGE_DAYS} days"
    else
        record_fail "No stale archives" "${stale_count} archive(s) older than ${MAX_ARCHIVE_AGE_DAYS} days"
    fi

    # 11. logrotate status file exists and is recent
    local status_file="/var/lib/logrotate/logrotate.status"
    if [[ ! -f "$status_file" ]]; then
        # Some distros use /var/lib/logrotate.status
        status_file="/var/lib/logrotate.status"
    fi

    if [[ -f "$status_file" ]]; then
        local status_mtime now age_days
        status_mtime=$(stat -c '%Y' "$status_file" 2>/dev/null) || status_mtime=0
        now=$(date +%s)
        age_days=$(( (now - status_mtime) / 86400 ))

        if [[ "$age_days" -le 2 ]]; then
            local status_date
            status_date=$(date -d "@${status_mtime}" +%Y-%m-%d 2>/dev/null) || status_date="unknown"
            record_pass "logrotate status file" "updated ${status_date}"
        else
            record_fail "logrotate status file" "last updated ${age_days} days ago"
        fi
    else
        record_fail "logrotate status file" "not found at /var/lib/logrotate/logrotate.status or /var/lib/logrotate.status"
    fi
}

# ── 5.
Common Misconfigs ───────────────────────────────────────────── +test_misconfigs() { + if [[ "$OUTPUT_FORMAT" != "tap" ]]; then + echo "" + echo -e "${BOLD}Common Misconfigs${RESET}" + fi + + if [[ ! -d /etc/logrotate.d ]]; then + record_skip "Config 'rotate' directive check" "/etc/logrotate.d not found" + record_skip "Config 'compress' directive check" "/etc/logrotate.d not found" + return + fi + + # 12a. Check for missing 'rotate' directive + local missing_rotate=() + local cfg + for cfg in /etc/logrotate.d/*; do + [[ -f "$cfg" ]] || continue + if ! grep -qE '^\s*rotate\s+' "$cfg" 2>/dev/null; then + # Check if the main config has a global rotate + if ! grep -qE '^\s*rotate\s+' /etc/logrotate.conf 2>/dev/null; then + missing_rotate+=("$(basename "$cfg")") + fi + fi + done + + if [[ ${#missing_rotate[@]} -eq 0 ]]; then + record_pass "All configs have 'rotate' directive" + else + record_fail "All configs have 'rotate' directive" "missing in: ${missing_rotate[*]}" + fi + + # 12b. Check for missing 'compress' directive + local missing_compress=() + for cfg in /etc/logrotate.d/*; do + [[ -f "$cfg" ]] || continue + if ! grep -qE '^\s*(compress|nocompress)' "$cfg" 2>/dev/null; then + if ! 
grep -qE '^\s*compress' /etc/logrotate.conf 2>/dev/null; then + missing_compress+=("$(basename "$cfg")") + fi + fi + done + + if [[ ${#missing_compress[@]} -eq 0 ]]; then + record_pass "All configs have 'compress' directive" + else + record_fail "All configs have 'compress' directive" "missing in: ${missing_compress[*]}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${LOG_DIR}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + + if [[ $FAIL -eq 0 ]]; then + echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else + echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}" + fi +} + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +write_junit() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + for result in "${RESULTS[@]}"; do + local status name detail + status=$(echo "$result" | cut -d'|' -f1) + name=$(echo "$result" | cut -d'|' -f2) + detail=$(echo "$result" | cut -d'|' -f3) + + # XML-escape the values + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo "$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + + case "$status" in + PASS) + echo " " >> "$JUNIT_FILE" + [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + FAIL) + echo " " >> "$JUNIT_FILE" + echo " FAILED: ${name} — ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + 
;; + SKIP) + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + esac + done + + echo " " >> "$JUNIT_FILE" + echo "" >> "$JUNIT_FILE" + + log "JUnit report written to ${JUNIT_FILE}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; exit 1 ;; + esac + done +} + +prom_escape() { + local val="$1" + val="${val//\\/\\\\}" + val="${val//\"/\\\"}" + val="${val//$'\n'/}" + echo "$val" +} + +get_report_value() { + local key="$1" + grep "^${key}=" "$LYNIS_REPORT" 2>/dev/null | head -1 | cut -d'=' -f2- +} + +count_report_key() { + local key="$1" + grep -c "^${key}" "$LYNIS_REPORT" 2>/dev/null || true +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Run audit if requested + if [ "$RUN_AUDIT" = true ]; then + lynis audit system --quick --no-colors --quiet 2>/dev/null + fi + + # Check if report exists + if [ ! 
-f "$LYNIS_REPORT" ]; then + cat </dev/null || true) + + local tests_passed tests_failed + tests_passed=$(grep "^test_result=OK" "$LYNIS_REPORT" 2>/dev/null | wc -l) + tests_failed=$(grep "^test_result=WARNING\|^test_result=SUGGESTION\|^test_result=DIFFERENT\|^test_result=WEAK" "$LYNIS_REPORT" 2>/dev/null | wc -l) + local tests_skipped_count + tests_skipped_count=$(grep -c "^test_skipped=" "$LYNIS_REPORT" 2>/dev/null || true) + + cat </dev/null; then + firewall_active=1 + fi + + if get_report_value "malware_scanner_installed" | grep -qi "1\|yes\|true" 2>/dev/null; then + malware_scanner=1 + fi + + vulnerable_packages=$(get_report_value "vulnerable_packages_found") + vulnerable_packages=${vulnerable_packages:-0} + + local plugins_enabled + plugins_enabled=$(get_report_value "plugins_enabled") + plugins_enabled=${plugins_enabled:-0} + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +Lynis Exporter v$EXPORTER_VERSION + +

Lynis Prometheus Exporter v$EXPORTER_VERSION

+

Metrics

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.lynis_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 5 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines)" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/mail-smoke-tests.sh b/mail-smoke-tests.sh new file mode 100755 index 0000000..014b73f --- /dev/null +++ b/mail-smoke-tests.sh @@ -0,0 +1,529 @@ +#!/usr/bin/env bash + +##################################################################################### +#### mail-smoke-tests.sh — Verify mail infrastructure is healthy #### +#### Checks Postfix, Dovecot, SMTP/IMAP, STARTTLS, SPF/DKIM/DMARC, queues. #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: MAIL_DOMAIN=example.com ./mail-smoke-tests.sh #### +#### SMTP_HOST=mail.example.com IMAP_PORT=993 ./mail-smoke-tests.sh #### +#### #### +#### See --help for all options. 
####
#####################################################################################

set -euo pipefail

# ── Defaults ──────────────────────────────────────────────────────────
# Every setting below may be overridden from the environment.
MAIL_DOMAIN="${MAIL_DOMAIN:-}"
SMTP_HOST="${SMTP_HOST:-localhost}"
SMTP_PORT="${SMTP_PORT:-25}"
SUBMISSION_PORT="${SUBMISSION_PORT:-587}"
IMAP_HOST="${IMAP_HOST:-localhost}"
IMAP_PORT="${IMAP_PORT:-993}"
IMAP_USER="${IMAP_USER:-}"
IMAP_PASS="${IMAP_PASS:-}"
DKIM_SELECTOR="${DKIM_SELECTOR:-default}"
MAX_QUEUE_SIZE="${MAX_QUEUE_SIZE:-50}"
RELAY_TEST_ADDR="${RELAY_TEST_ADDR:-test@example.com}"
SSL_WARN_DAYS="${SSL_WARN_DAYS:-30}"
SKIP_DNS="${SKIP_DNS:-false}"
SKIP_IMAP="${SKIP_IMAP:-false}"
SKIP_RELAY="${SKIP_RELAY:-false}"
OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}"
COLOR="${COLOR:-auto}"
VERBOSE="${VERBOSE:-false}"

# ── State ─────────────────────────────────────────────────────────────
# Counters and the "STATUS|name|detail" result list filled by record_*.
PASS=0; FAIL=0; SKIP=0; TOTAL=0
RESULTS=()
START_TIME=""

# ── Colors ────────────────────────────────────────────────────────────
# Set RED/GREEN/YELLOW/BLUE/BOLD/RESET: escape sequences when COLOR=always or
# stdout is a terminal, empty strings when COLOR=never or output is redirected.
setup_colors() {
    if [[ "$COLOR" == "never" ]]; then
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
        return
    fi
    if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        BOLD='\033[1m'
        RESET='\033[0m'
    else
        RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET=""
    fi
}

# ── Logging ───────────────────────────────────────────────────────────
# Only the color codes go through %b; the message is printed with %s so that
# backslashes in server banners or DNS records are shown literally.
log()  { printf '%b[INFO]%b %s\n' "${BLUE:-}" "${RESET:-}" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${RESET:-}" "$*" >&2; }
err()  { printf '%b[ERROR]%b %s\n' "${RED:-}" "${RESET:-}" "$*" >&2; }
verbose() {
    if [[ "$VERBOSE" == "true" ]]; then
        printf '%b[DEBUG]%b %s\n' "${BLUE:-}" "${RESET:-}" "$*"
    fi
}

# Print one human-readable result line: "<color><symbol><reset> name[ — detail]".
# Arguments: $1 color escape, $2 symbol, $3 test name, $4 optional detail.
_print_result_line() {
    local color="$1" symbol="$2" name="$3" detail="${4:-}"
    printf ' %b%s%b %s%s\n' "$color" "$symbol" "${RESET:-}" "$name" "${detail:+ — ${detail}}"
}

# ── Test Result Recording ─────────────────────────────────────────────
# Each recorder bumps its counter and TOTAL, appends to RESULTS and prints the
# result (TAP line when OUTPUT_FORMAT=tap, colored line otherwise).
# Arguments: $1 test name, $2 optional detail/reason. Always returns 0.
record_pass() {
    local name="$1" detail="${2:-}"
    PASS=$((PASS + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("PASS|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        printf 'ok %s - %s%s\n' "$TOTAL" "$name" "${detail:+ (${detail})}"
    else
        _print_result_line "${GREEN:-}" "✓" "$name" "$detail"
    fi
}

record_fail() {
    local name="$1" detail="${2:-}"
    FAIL=$((FAIL + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("FAIL|${name}|${detail}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        printf 'not ok %s - %s\n' "$TOTAL" "$name"
        # An `if` (not `[[ … ]] && echo`) so an empty detail cannot make the
        # function return 1 and abort the script under `set -e`.
        if [[ -n "$detail" ]]; then
            printf ' # %s\n' "$detail"
        fi
    else
        _print_result_line "${RED:-}" "✗" "$name" "$detail"
    fi
}

record_skip() {
    local name="$1" reason="${2:-}"
    SKIP=$((SKIP + 1))
    TOTAL=$((TOTAL + 1))
    RESULTS+=("SKIP|${name}|${reason}")
    if [[ "$OUTPUT_FORMAT" == "tap" ]]; then
        printf 'ok %s - %s # SKIP %s\n' "$TOTAL" "$name" "$reason"
    else
        _print_result_line "${YELLOW:-}" "⊘" "$name" "$reason"
    fi
}

# ── Helpers ───────────────────────────────────────────────────────────
# Succeeds when the named command is found by `command -v`.
has_cmd() { command -v "$1" >/dev/null 2>&1; }

# Print a blank line and a bold section title; prints nothing in TAP mode.
section() {
    if [[ "$OUTPUT_FORMAT" != "tap" ]]; then
        echo ""
        printf '%b%s%b\n' "${BOLD:-}" "$1" "${RESET:-}"
    fi
}

# ── Cleanup ───────────────────────────────────────────────────────────
# EXIT handler; currently only emits a debug message.
# shellcheck disable=SC2317
cleanup() {
    verbose "Cleaning up temporary files..."
}
trap cleanup EXIT

# ══════════════════════════════════════════════════════════════════════
# TEST FUNCTIONS
# ══════════════════════════════════════════════════════════════════════

# ── 1. Postfix running ───────────────────────────────────────────────
# Uses `systemctl is-active postfix` when systemctl exists, otherwise looks
# for a process named exactly "master" with pgrep.
test_postfix_running() {
    if has_cmd systemctl; then
        if systemctl is-active --quiet postfix 2>/dev/null; then
            record_pass "Postfix running" "systemctl active"
        else
            record_fail "Postfix running" "systemctl inactive"
        fi
    elif pgrep -x master >/dev/null 2>&1; then
        record_pass "Postfix running" "master process found"
    else
        record_fail "Postfix running" "master process not found"
    fi
}

# ── 2.
# Dovecot running ──────────────────────────────────────────────
# Uses `systemctl is-active dovecot` when systemctl exists, otherwise looks
# for a process named exactly "dovecot". Skipped when SKIP_IMAP=true.
test_dovecot_running() {
    if [[ "$SKIP_IMAP" == "true" ]]; then record_skip "Dovecot running" "SKIP_IMAP=true"; return; fi
    if has_cmd systemctl; then
        if systemctl is-active --quiet dovecot 2>/dev/null; then
            record_pass "Dovecot running" "systemctl active"
        else
            record_fail "Dovecot running" "systemctl inactive"
        fi
    elif pgrep -x dovecot >/dev/null 2>&1; then
        record_pass "Dovecot running" "dovecot process found"
    else
        record_fail "Dovecot running" "dovecot process not found"
    fi
}

# ── 3. SMTP connect ─────────────────────────────────────────────────
# Connect to SMTP_HOST:SMTP_PORT with nc, send QUIT and require a reply line
# that starts with 220.
test_smtp_connect() {
    local output exit_code=0
    verbose "Testing SMTP connection to ${SMTP_HOST}:${SMTP_PORT}"
    # Host and port are passed as separate arguments, never spliced into a
    # `bash -c` string, so shell metacharacters in them cannot run commands.
    output=$(timeout 10 nc -w5 "$SMTP_HOST" "$SMTP_PORT" 2>&1 <<<'QUIT') || exit_code=$?
    local banner
    # Anchored: a bare "220" elsewhere (an address, an error text) is not a banner.
    banner=$(grep -m1 '^220' <<<"$output" | tr -d '\r') || banner=""
    if [[ $exit_code -eq 0 && -n "$banner" ]]; then
        record_pass "SMTP connect (${SMTP_HOST}:${SMTP_PORT})" "${banner:0:60}"
    else
        record_fail "SMTP connect (${SMTP_HOST}:${SMTP_PORT})" "no 220 banner received"
    fi
}

# ── 4. SMTP EHLO ────────────────────────────────────────────────────
# Send EHLO and count the reply lines that start with 250.
# SMTP_STEP_DELAY (seconds, default 1) is the pause between EHLO and QUIT.
test_smtp_ehlo() {
    local ehlo_name
    ehlo_name=$(hostname -f 2>/dev/null || echo "localhost")
    local delay="${SMTP_STEP_DELAY:-1}"
    local output exit_code=0
    verbose "Sending EHLO to ${SMTP_HOST}:${SMTP_PORT}"
    output=$({
        printf 'EHLO %s\r\n' "$ehlo_name"
        sleep "$delay"
        printf 'QUIT\r\n'
    } 2>/dev/null | timeout 10 nc -w5 "$SMTP_HOST" "$SMTP_PORT" 2>&1) || exit_code=$?
    local caps
    # grep -c prints 0 and exits 1 when nothing matches.
    caps=$(grep -c '^250' <<<"$output") || caps=0
    if [[ "$caps" -gt 0 ]]; then
        record_pass "SMTP EHLO" "${caps} capabilities advertised"
    else
        record_fail "SMTP EHLO" "no 250 response to EHLO"
    fi
}

# ── 5.
# IMAP connect ─────────────────────────────────────────────────
# Connect to IMAP_HOST:IMAP_PORT (TLS via openssl s_client on port 993, plain
# nc otherwise), send LOGOUT and require an untagged "* OK" or "* PREAUTH"
# greeting. Skipped when SKIP_IMAP=true.
test_imap_connect() {
    if [[ "$SKIP_IMAP" == "true" ]]; then record_skip "IMAP connect" "SKIP_IMAP=true"; return; fi
    local output exit_code=0
    verbose "Testing IMAP connection to ${IMAP_HOST}:${IMAP_PORT}"
    # Host and port are passed as arguments, never spliced into a `bash -c`
    # string, so shell metacharacters in them cannot run commands.
    if [[ "$IMAP_PORT" == "993" ]]; then
        output=$(timeout 10 openssl s_client -quiet -connect "${IMAP_HOST}:${IMAP_PORT}" \
            2>/dev/null <<<'a1 LOGOUT') || exit_code=$?
    else
        output=$(timeout 10 nc -w5 "$IMAP_HOST" "$IMAP_PORT" 2>&1 <<<'a1 LOGOUT') || exit_code=$?
    fi
    if grep -qiE '^\* (OK|PREAUTH)' <<<"$output"; then
        record_pass "IMAP connect (${IMAP_HOST}:${IMAP_PORT})" "server responded"
    else
        record_fail "IMAP connect (${IMAP_HOST}:${IMAP_PORT})" "no IMAP banner received"
    fi
}

# Print $1 as an IMAP quoted string: wrapped in double quotes with backslash
# and double-quote characters backslash-escaped.
_imap_quote() {
    local text="$1"
    text="${text//\\/\\\\}"
    text="${text//\"/\\\"}"
    printf '"%s"' "$text"
}

# ── 6. IMAP login ───────────────────────────────────────────────────
# Log in with IMAP_USER/IMAP_PASS and require a tagged "a1 OK" reply.
# Skipped when SKIP_IMAP=true or when either credential is empty.
# IMAP_STEP_DELAY (seconds, default 1) is the pause before each command.
test_imap_login() {
    if [[ "$SKIP_IMAP" == "true" ]]; then record_skip "IMAP login" "SKIP_IMAP=true"; return; fi
    if [[ -z "$IMAP_USER" || -z "$IMAP_PASS" ]]; then
        record_skip "IMAP login" "IMAP_USER/IMAP_PASS not set"
        return
    fi
    local delay="${IMAP_STEP_DELAY:-1}"
    local login_line
    # Quoted strings let credentials contain spaces and other specials.
    login_line="a1 LOGIN $(_imap_quote "$IMAP_USER") $(_imap_quote "$IMAP_PASS")"
    local output exit_code=0
    verbose "Attempting IMAP login as ${IMAP_USER}"
    # The credentials travel over stdin from the printf builtin only; they are
    # never part of a command line, so they are not visible in `ps` output and
    # cannot be interpreted by a shell.
    if [[ "$IMAP_PORT" == "993" ]]; then
        output=$({
            sleep "$delay"
            printf '%s\r\n' "$login_line"
            sleep "$delay"
            printf 'a2 LOGOUT\r\n'
        } 2>/dev/null | timeout 10 openssl s_client -quiet -connect "${IMAP_HOST}:${IMAP_PORT}" 2>/dev/null) || exit_code=$?
    else
        output=$({
            sleep "$delay"
            printf '%s\r\n' "$login_line"
            sleep "$delay"
            printf 'a2 LOGOUT\r\n'
        } 2>/dev/null | timeout 10 nc -w5 "$IMAP_HOST" "$IMAP_PORT" 2>&1) || exit_code=$?
    fi
    if grep -q '^a1 OK' <<<"$output"; then
        record_pass "IMAP login" "${IMAP_USER}"
    else
        record_fail "IMAP login" "login failed for ${IMAP_USER}"
    fi
}

# ── 7.
# SPF record ───────────────────────────────────────────────────
# Look for a "v=spf1" TXT record on MAIL_DOMAIN using dig.
# Skipped when SKIP_DNS=true, MAIL_DOMAIN is empty or dig is missing.
test_spf_record() {
    if [[ "$SKIP_DNS" == "true" ]]; then record_skip "SPF record" "SKIP_DNS=true"; return; fi
    if [[ -z "$MAIL_DOMAIN" ]]; then record_skip "SPF record" "MAIL_DOMAIN not set"; return; fi
    if ! has_cmd dig; then record_skip "SPF record" "dig not installed"; return; fi
    local output spf
    output=$(dig +short TXT "${MAIL_DOMAIN}" 2>/dev/null) || true
    spf=$(grep -i -m1 'v=spf1' <<<"$output" | tr -d '"') || spf=""
    if [[ -n "$spf" ]]; then
        record_pass "SPF record (${MAIL_DOMAIN})" "${spf:0:60}"
    else
        record_fail "SPF record (${MAIL_DOMAIN})" "no v=spf1 TXT record found"
    fi
}

# ── 8. DKIM record ──────────────────────────────────────────────────
# Look for a TXT record at <DKIM_SELECTOR>._domainkey.<MAIL_DOMAIN> that
# contains "v=DKIM1" or "p=". Same skip conditions as the SPF check.
test_dkim_record() {
    if [[ "$SKIP_DNS" == "true" ]]; then record_skip "DKIM record" "SKIP_DNS=true"; return; fi
    if [[ -z "$MAIL_DOMAIN" ]]; then record_skip "DKIM record" "MAIL_DOMAIN not set"; return; fi
    if ! has_cmd dig; then record_skip "DKIM record" "dig not installed"; return; fi
    local selector_domain="${DKIM_SELECTOR}._domainkey.${MAIL_DOMAIN}"
    local output
    output=$(dig +short TXT "${selector_domain}" 2>/dev/null) || true
    if [[ -n "$output" ]] && grep -qi 'v=DKIM1\|p=' <<<"$output"; then
        record_pass "DKIM record (${selector_domain})" "key present"
    else
        record_fail "DKIM record (${selector_domain})" "no DKIM TXT record found"
    fi
}

# ── 9. DMARC record ─────────────────────────────────────────────────
# Look for a "v=DMARC1" TXT record at _dmarc.<MAIL_DOMAIN>.
# Same skip conditions as the SPF check.
test_dmarc_record() {
    if [[ "$SKIP_DNS" == "true" ]]; then record_skip "DMARC record" "SKIP_DNS=true"; return; fi
    if [[ -z "$MAIL_DOMAIN" ]]; then record_skip "DMARC record" "MAIL_DOMAIN not set"; return; fi
    if ! has_cmd dig; then record_skip "DMARC record" "dig not installed"; return; fi
    local dmarc_domain="_dmarc.${MAIL_DOMAIN}"
    local output dmarc
    output=$(dig +short TXT "${dmarc_domain}" 2>/dev/null) || true
    dmarc=$(grep -i -m1 'v=DMARC1' <<<"$output" | tr -d '"') || dmarc=""
    if [[ -n "$dmarc" ]]; then
        record_pass "DMARC record (${dmarc_domain})" "${dmarc:0:60}"
    else
        record_fail "DMARC record (${dmarc_domain})" "no v=DMARC1 TXT record found"
    fi
}

# ── 10. Mail queue size ─────────────────────────────────────────────
# Count queue entries in `postqueue -p` output (lines starting with a
# hexadecimal queue-ID character) and compare against MAX_QUEUE_SIZE.
test_mail_queue() {
    if ! has_cmd postqueue; then record_skip "Mail queue size" "postqueue not available"; return; fi
    local queue_output queue_count
    queue_output=$(postqueue -p 2>/dev/null) || true
    if grep -q "Mail queue is empty" <<<"$queue_output"; then
        record_pass "Mail queue size" "queue empty"
        return
    fi
    # grep -c prints 0 and exits 1 when nothing matches.
    queue_count=$(grep -c '^[A-F0-9]' <<<"$queue_output") || queue_count=0
    if [[ "$queue_count" -le "$MAX_QUEUE_SIZE" ]]; then
        record_pass "Mail queue size" "${queue_count} messages (<= ${MAX_QUEUE_SIZE})"
    else
        record_fail "Mail queue size" "${queue_count} messages (> ${MAX_QUEUE_SIZE})"
    fi
}

# ── 11. Open relay test ─────────────────────────────────────────────
# Run EHLO / MAIL FROM / RCPT TO / QUIT against SMTP_HOST:SMTP_PORT and judge
# the server by its reply to RCPT TO:<RELAY_TEST_ADDR>.
#   RELAY_TEST_FROM  envelope sender (default relay-test@example.org)
#   SMTP_STEP_DELAY  seconds to pause before each command (default 1)
# Result: fail when RCPT TO gets a 2xx reply, pass on a 4xx/5xx reply (to RCPT
# TO or earlier in the dialogue), skip when the server sent no usable reply.
test_open_relay() {
    if [[ "$SKIP_RELAY" == "true" ]]; then record_skip "Open relay test" "SKIP_RELAY=true"; return; fi
    local ehlo_name
    ehlo_name=$(hostname -f 2>/dev/null || echo "localhost")
    local delay="${SMTP_STEP_DELAY:-1}"
    local mail_from="${RELAY_TEST_FROM:-relay-test@example.org}"
    local output exit_code=0
    verbose "Testing open relay via ${SMTP_HOST}:${SMTP_PORT}"
    # Commands are written by builtins and piped to nc; nothing configurable is
    # spliced into a `bash -c` string.
    output=$({
        sleep "$delay"
        printf 'EHLO %s\r\n' "$ehlo_name"
        sleep "$delay"
        printf 'MAIL FROM:<%s>\r\n' "$mail_from"
        sleep "$delay"
        printf 'RCPT TO:<%s>\r\n' "$RELAY_TEST_ADDR"
        sleep "$delay"
        printf 'QUIT\r\n'
    } 2>/dev/null | timeout 10 nc -w5 "$SMTP_HOST" "$SMTP_PORT" 2>&1) || exit_code=$?

    # Keep only final reply lines ("NNN text"), dropping "NNN-" continuation
    # lines. In order they answer: greeting, EHLO, MAIL FROM, RCPT TO, QUIT.
    local -a replies=()
    mapfile -t replies < <(tr -d '\r' <<<"$output" | grep -E '^[0-9]{3}( |$)' || true)
    local rcpt_reply="${replies[3]:-}"
    verbose "RCPT TO reply: ${rcpt_reply:-<none>}"

    case "$rcpt_reply" in
        2*)
            record_fail "Open relay test" "server accepted relay — possible open relay"
            ;;
        4* | 5*)
            record_pass "Open relay test" "relay correctly refused"
            ;;
        *)
            # The dialogue ended early; a 4xx/5xx anywhere still means the
            # server refused us before relaying anything.
            local reply refused=false
            for reply in "${replies[@]+"${replies[@]}"}"; do
                if [[ "$reply" == [45]* ]]; then refused=true; fi
            done
            if [[ "$refused" == "true" ]]; then
                record_pass "Open relay test" "relay correctly refused"
            else
                record_skip "Open relay test" "no reply to RCPT TO (inconclusive)"
            fi
            ;;
    esac
}

# ── 12. SSL/TLS on SMTP ─────────────────────────────────────────────
# Negotiate STARTTLS on SMTP_HOST:SUBMISSION_PORT and check the certificate's
# expiry against SSL_WARN_DAYS. The handshake counts as successful only when
# the server presented a certificate that `openssl x509` can parse; the
# "CONNECTED" line alone just means the TCP connection opened.
test_smtp_tls() {
    if ! has_cmd openssl; then record_skip "SMTP STARTTLS" "openssl not installed"; return; fi
    local label="SMTP STARTTLS (${SMTP_HOST}:${SUBMISSION_PORT})"
    local output exit_code=0
    verbose "Testing STARTTLS on ${SMTP_HOST}:${SUBMISSION_PORT}"
    output=$(timeout 10 openssl s_client -starttls smtp -connect "${SMTP_HOST}:${SUBMISSION_PORT}" \
        -servername "${SMTP_HOST}" </dev/null 2>&1) || exit_code=$?

    # x509 reads the first PEM certificate in the s_client output and prints
    # "notAfter=<date>"; it fails when no certificate is present.
    local enddate
    enddate=$(openssl x509 -noout -enddate <<<"$output" 2>/dev/null) || enddate=""
    if [[ -z "$enddate" ]]; then
        record_fail "$label" "STARTTLS handshake failed"
        return
    fi

    local expiry_date="${enddate#*notAfter=}"
    local expiry_epoch now_epoch days_left
    expiry_epoch=$(date -d "$expiry_date" +%s 2>/dev/null) || expiry_epoch=0
    now_epoch=$(date +%s)
    if [[ "$expiry_epoch" -le 0 ]]; then
        # Date could not be parsed; report the handshake only.
        record_pass "$label" "TLS handshake OK"
        return
    fi

    days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
    if [[ "$days_left" -lt "$SSL_WARN_DAYS" ]]; then
        record_fail "$label" "cert expires in ${days_left}d (< ${SSL_WARN_DAYS}d)"
    else
        record_pass "$label" "cert valid, ${days_left}d remaining"
    fi
}

# ── 13.
SSL/TLS on IMAP ───────────────────────────────────────────── +test_imap_tls() { + if [[ "$SKIP_IMAP" == "true" ]]; then record_skip "IMAP TLS" "SKIP_IMAP=true"; return; fi + if ! has_cmd openssl; then record_skip "IMAP TLS" "openssl not installed"; return; fi + local output exit_code=0 + verbose "Testing TLS on ${IMAP_HOST}:${IMAP_PORT}" + output=$(timeout 10 openssl s_client -connect "${IMAP_HOST}:${IMAP_PORT}" -servername "${IMAP_HOST}" &1) || exit_code=$? + if echo "$output" | grep -qi "connected\|verify return\|SSL handshake"; then + local expiry_line expiry_date days_left + expiry_line=$(echo "$output" | grep -i "notAfter" | head -1) || true + if [[ -n "$expiry_line" ]]; then + expiry_date=$(echo "$expiry_line" | sed 's/.*notAfter=//') || true + if [[ -n "$expiry_date" ]]; then + local expiry_epoch now_epoch + expiry_epoch=$(date -d "$expiry_date" +%s 2>/dev/null) || expiry_epoch=0 + now_epoch=$(date +%s) + if [[ "$expiry_epoch" -gt 0 ]]; then + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + if [[ "$days_left" -lt "$SSL_WARN_DAYS" ]]; then + record_fail "IMAP TLS (${IMAP_HOST}:${IMAP_PORT})" "cert expires in ${days_left}d (< ${SSL_WARN_DAYS}d)" + return + fi + record_pass "IMAP TLS (${IMAP_HOST}:${IMAP_PORT})" "cert valid, ${days_left}d remaining" + return + fi + fi + fi + record_pass "IMAP TLS (${IMAP_HOST}:${IMAP_PORT})" "TLS handshake OK" + else + record_fail "IMAP TLS (${IMAP_HOST}:${IMAP_PORT})" "TLS handshake failed" + fi +} + +# ── 14. Deferred mail check ───────────────────────────────────────── +test_deferred_queue() { + if ! 
has_cmd postqueue; then record_skip "Deferred mail check" "postqueue not available"; return; fi + local deferred_count=0 + if [[ -d /var/spool/postfix/deferred ]]; then + deferred_count=$(find /var/spool/postfix/deferred -type f 2>/dev/null | wc -l) || deferred_count=0 + fi + if [[ "$deferred_count" -eq 0 ]]; then + record_pass "Deferred mail check" "no deferred messages" + else + record_fail "Deferred mail check" "${deferred_count} deferred message(s)" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +print_summary() { + local end_time; end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} Mail Smoke Tests" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + if [[ $FAIL -eq 0 ]]; then echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}"; fi +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; exit 1 ;; + esac + done +} + +# ── Metrics generation ──────────────────────────────────────────────────────── + +generate_metrics() { + local script_start + script_start=$(date +%s) + + # Prerequisite checks + local prereq_fail="" + command -v curl >/dev/null 2>&1 || prereq_fail="curl is required but not found" + [ -z "$prereq_fail" ] && ! 
command -v jq >/dev/null 2>&1 && prereq_fail="jq is required but not found" + [ -z "$prereq_fail" ] && [ -z "$MAILCOW_API_KEY" ] && prereq_fail="MAILCOW_API_KEY is required but not set" + + if [ -n "$prereq_fail" ]; then + echo "# ERROR: $prereq_fail" >&2 + cat </dev/null) + if [ -n "$version_json" ] && [ "$version_json" != "null" ]; then + local mc_version + mc_version=$(echo "$version_json" | jq -r '.version // "unknown"' 2>/dev/null) + mc_version=$(prom_escape "$mc_version") + cat <&2 + exporter_up=0 + fi + + # ── Container Health ───────────────────────────────────────────── + local containers_json container_lines="" + containers_json=$($api_curl "${MAILCOW_URL}/api/v1/get/status/containers" 2>/dev/null) + if [ -n "$containers_json" ] && [ "$containers_json" != "null" ]; then + local container_data + container_data=$(echo "$containers_json" | jq -r ' + to_entries[] | "\(.key)\t\(.value.state // "unknown")" + ' 2>/dev/null) + if [ -n "$container_data" ]; then + while IFS=$'\t' read -r c_name c_state; do + [ -z "$c_name" ] && continue + local esc_name up_val + esc_name=$(prom_escape "$c_name") + [ "$c_state" = "running" ] && up_val=1 || up_val=0 + container_lines="${container_lines}mailcow_container_up{name=\"${esc_name}\"} ${up_val} +" + done <<< "$container_data" + fi + else + echo "# WARNING: could not read /api/v1/get/status/containers" >&2 + exporter_up=0 + fi + + if [ -n "$container_lines" ]; then + echo "# HELP mailcow_container_up Container running status (1=running, 0=down)" + echo "# TYPE mailcow_container_up gauge" + printf '%s' "$container_lines" + echo "" + fi + + # ── Domain Metrics ─────────────────────────────────────────────── + local domains_json domain_count=0 domain_info_lines="" + domains_json=$($api_curl "${MAILCOW_URL}/api/v1/get/domain/all" 2>/dev/null) + if [ -n "$domains_json" ] && [ "$domains_json" != "null" ]; then + domain_count=$(echo "$domains_json" | jq 'length' 2>/dev/null) + domain_count=${domain_count:-0} + local domain_data + 
domain_data=$(echo "$domains_json" | jq -r ' + .[] | "\(.domain_name)\t\(.active // 0)" + ' 2>/dev/null) + if [ -n "$domain_data" ]; then + while IFS=$'\t' read -r d_name d_active; do + [ -z "$d_name" ] && continue + local esc_domain esc_active + esc_domain=$(prom_escape "$d_name") + esc_active=$(prom_escape "$d_active") + domain_info_lines="${domain_info_lines}mailcow_domain_info{domain=\"${esc_domain}\",active=\"${esc_active}\"} 1 +" + done <<< "$domain_data" + fi + else + echo "# WARNING: could not read /api/v1/get/domain/all" >&2 + fi + + cat </dev/null) + if [ -n "$mailboxes_json" ] && [ "$mailboxes_json" != "null" ]; then + mailbox_count=$(echo "$mailboxes_json" | jq 'length' 2>/dev/null) + mailbox_count=${mailbox_count:-0} + local mailbox_data + mailbox_data=$(echo "$mailboxes_json" | jq -r ' + .[] | "\(.username)\t\(.domain)\t\(.quota_used // 0)\t\(.quota // 0)\t\(.messages // 0)\t\(.active // 0)" + ' 2>/dev/null) + if [ -n "$mailbox_data" ]; then + while IFS=$'\t' read -r mb_user mb_domain mb_quota_used mb_quota_total mb_messages mb_active; do + [ -z "$mb_user" ] && continue + local esc_mb esc_mbd + esc_mb=$(prom_escape "$mb_user") + esc_mbd=$(prom_escape "$mb_domain") + local labels="mailbox=\"${esc_mb}\",domain=\"${esc_mbd}\"" + quota_used_lines="${quota_used_lines}mailcow_mailbox_quota_used_bytes{${labels}} ${mb_quota_used} +" + quota_total_lines="${quota_total_lines}mailcow_mailbox_quota_total_bytes{${labels}} ${mb_quota_total} +" + messages_lines="${messages_lines}mailcow_mailbox_messages{${labels}} ${mb_messages} +" + active_lines="${active_lines}mailcow_mailbox_active{${labels}} ${mb_active} +" + done <<< "$mailbox_data" + fi + else + echo "# WARNING: could not read /api/v1/get/mailbox/all" >&2 + fi + + cat </dev/null) + if [ -n "$rspamd_json" ] && [ "$rspamd_json" != "null" ]; then + rspamd_scanned=$(echo "$rspamd_json" | jq 'length' 2>/dev/null) + rspamd_scanned=${rspamd_scanned:-0} + local action_data + action_data=$(echo "$rspamd_json" | jq -r ' 
+ group_by(.action) | .[] | "\(.[0].action)\t\(length)" + ' 2>/dev/null) + if [ -n "$action_data" ]; then + while IFS=$'\t' read -r act_name act_count; do + [ -z "$act_name" ] && continue + local esc_action + esc_action=$(prom_escape "$act_name") + action_lines="${action_lines}mailcow_rspamd_action_total{action=\"${esc_action}\"} ${act_count} +" + done <<< "$action_data" + fi + else + echo "# WARNING: could not read /api/v1/get/logs/rspamd-history/100" >&2 + fi + + cat </dev/null) + if [ -n "$quarantine_json" ] && [ "$quarantine_json" != "null" ]; then + quarantine_count=$(echo "$quarantine_json" | jq 'length' 2>/dev/null) + quarantine_count=${quarantine_count:-0} + else + echo "# WARNING: could not read /api/v1/get/quarantine/all" >&2 + fi + + cat </dev/null) + if [ -n "$ratelimit_json" ] && [ "$ratelimit_json" != "null" ]; then + echo "# Rate limit data retrieved successfully" + else + echo "# WARNING: could not read /api/v1/get/rl-mbox" >&2 + fi + + # ── Exporter Runtime ───────────────────────────────────────────── + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "# ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + trap 'echo "# Shutting down mailcow exporter..." >&2; exit 0' INT TERM + + while true; do + { + read -r request + local body + if [[ "$request" =~ ^GET\ /metrics ]]; then + body=$(generate_metrics) + printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + else + body='Mailcow Exporter v1.0

Mailcow Exporter v1.0

Metrics

' + printf "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + fi + } | if nc -h 2>&1 | grep -q 'GNU\|traditional'; then + nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + else + nc -l "$HTTP_PORT" 2>/dev/null + fi + done +} + +# ── Main execution ──────────────────────────────────────────────────────────── + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.mailcow_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "# ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 3 ]; then + rm -f "$temp_file" + echo "# ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "# Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/mailcow-smoke-tests.sh b/mailcow-smoke-tests.sh new file mode 100755 index 0000000..1270fc7 --- /dev/null +++ b/mailcow-smoke-tests.sh @@ -0,0 +1,643 @@ +#!/usr/bin/env bash + +######################################################################################### +#### mailcow-smoke-tests.sh — Verify Mailcow instance health after upgrades #### +#### Zero external dependencies. 
Requires: bash 4+, curl, openssl, nc (netcat) #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version 1.0 #### +#### #### +#### Usage: #### +#### export MAILCOW_URL="https://mail.example.com" #### +#### export MAILCOW_API_KEY="your-api-key" #### +#### ./mailcow-smoke-tests.sh #### +#### #### +#### See --help for all options. #### +######################################################################################### + +set -euo pipefail + +# ── Defaults ────────────────────────────────────────────────────────── +MAILCOW_URL="${MAILCOW_URL:-}" +MAILCOW_API_KEY="${MAILCOW_API_KEY:-}" +MAILCOW_DIR="${MAILCOW_DIR:-/opt/mailcow-dockerized}" +CURL_TIMEOUT="${CURL_TIMEOUT:-10}" +CURL_INSECURE="${CURL_INSECURE:-false}" +SKIP_SMTP="${SKIP_SMTP:-false}" +SKIP_IMAP="${SKIP_IMAP:-false}" +SKIP_CLAMD="${SKIP_CLAMD:-false}" +OUTPUT_FORMAT="${OUTPUT_FORMAT:-text}" # text, tap, junit +JUNIT_FILE="${JUNIT_FILE:-smoke-results.xml}" +VERBOSE="${VERBOSE:-false}" +COLOR="${COLOR:-auto}" + +# ── State ───────────────────────────────────────────────────────────── +PASS=0 +FAIL=0 +SKIP=0 +TOTAL=0 +RESULTS=() +START_TIME="" + +# ── Colors ──────────────────────────────────────────────────────────── +setup_colors() { + if [[ "$COLOR" == "never" ]]; then + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + return + fi + if [[ "$COLOR" == "always" ]] || [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + BOLD='\033[1m' + RESET='\033[0m' + else + RED="" GREEN="" YELLOW="" BLUE="" BOLD="" RESET="" + fi +} + +# ── Logging ─────────────────────────────────────────────────────────── +log() { echo -e "${BLUE}[INFO]${RESET} $*"; } +warn() { echo -e "${YELLOW}[WARN]${RESET} $*" >&2; } +err() { echo -e "${RED}[ERROR]${RESET} $*" >&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${BLUE}[DEBUG]${RESET} $*"; fi; } + +# ── Test Result Recording 
───────────────────────────────────────────── +record_pass() { + local name="$1" + local detail="${2:-}" + ((PASS++)) || true + ((TOTAL++)) || true + RESULTS+=("PASS|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "ok ${TOTAL} - ${name}" + else + echo -e " ${GREEN}✓${RESET} ${name}${detail:+ — ${detail}}" + fi +} + +record_fail() { + local name="$1" + local detail="${2:-}" + ((FAIL++)) || true + ((TOTAL++)) || true + RESULTS+=("FAIL|${name}|${detail}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "not ok ${TOTAL} - ${name}" + [[ -n "$detail" ]] && echo " # ${detail}" + else + echo -e " ${RED}✗${RESET} ${name}${detail:+ — ${detail}}" + fi +} + +record_skip() { + local name="$1" + local reason="${2:-}" + ((SKIP++)) || true + ((TOTAL++)) || true + RESULTS+=("SKIP|${name}|${reason}") + if [[ "$OUTPUT_FORMAT" == "tap" ]]; then + echo "ok ${TOTAL} - ${name} # SKIP ${reason}" + else + echo -e " ${YELLOW}⊘${RESET} ${name}${reason:+ — ${reason}}" + fi +} + +# ── curl wrapper ────────────────────────────────────────────────────── +api_curl() { + local endpoint="$1" + shift + local curl_opts=(-s -S --max-time "$CURL_TIMEOUT") + + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + curl_opts+=(-H "X-API-Key: ${MAILCOW_API_KEY}") + curl_opts+=(-H "Content-Type: application/json") + + local url="${MAILCOW_URL}${endpoint}" + verbose "curl GET ${url}" + + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +api_curl_status() { + local endpoint="$1" + shift + local curl_opts=(-s -S -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT") + + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + curl_opts+=(-H "X-API-Key: ${MAILCOW_API_KEY}") + curl_opts+=(-H "Content-Type: application/json") + + local url="${MAILCOW_URL}${endpoint}" + curl "${curl_opts[@]}" "$@" "$url" 2>/dev/null +} + +# ── JSON parsing (no jq required) ──────────────────────────────────── +json_value() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP 
"\"${key}\"\s*:\s*\"?\K[^\",}]+" || true; } | head -1 +} + +json_value_string() { + local key="$1" + local json="$2" + echo "$json" | { grep -oP "\"${key}\"\s*:\s*\"\K[^\"]*" || true; } | head -1 +} + +# ── Host extraction ─────────────────────────────────────────────────── +get_mailcow_host() { + echo "$MAILCOW_URL" | sed 's|https\?://||' | cut -d/ -f1 | cut -d: -f1 +} + +# ── Port check ──────────────────────────────────────────────────────── +check_port() { + local host="$1" + local port="$2" + nc -z -w 3 "$host" "$port" >/dev/null 2>&1 +} + +# ══════════════════════════════════════════════════════════════════════ +# TEST SUITES +# ══════════════════════════════════════════════════════════════════════ + +# ── 1. Connectivity ────────────────────────────────────────────────── +test_connectivity() { + echo "" + echo -e "${BOLD}Connectivity${RESET}" + + # 1a. Mailcow UI reachable + local curl_opts=(-s -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT") + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + + local http_code + http_code=$(curl "${curl_opts[@]}" "${MAILCOW_URL}/" 2>/dev/null) || http_code="000" + + if [[ "$http_code" == "200" || "$http_code" == "301" || "$http_code" == "302" ]]; then + record_pass "Mailcow UI reachable" "HTTP ${http_code}" + else + record_fail "Mailcow UI reachable" "HTTP ${http_code}" + fi + + # 1b. 
TLS certificate validity + if [[ "$MAILCOW_URL" == https://* ]]; then + local host + host=$(get_mailcow_host) + local port + port=$(echo "$MAILCOW_URL" | grep -oP ':\K[0-9]+$' || echo "443") + + local expiry + expiry=$(echo | openssl s_client -servername "$host" -connect "${host}:${port}" 2>/dev/null | \ + openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) || expiry="" + + if [[ -n "$expiry" ]]; then + local expiry_epoch + expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null) || expiry_epoch=0 + local now_epoch + now_epoch=$(date +%s) + local days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + + if [[ $days_left -gt 30 ]]; then + record_pass "TLS certificate valid" "${days_left} days remaining" + elif [[ $days_left -gt 0 ]]; then + record_pass "TLS certificate valid" "${days_left} days remaining (renew soon)" + else + record_fail "TLS certificate valid" "expired or expiring in ${days_left} days" + fi + else + record_skip "TLS certificate check" "could not retrieve certificate" + fi + else + record_skip "TLS certificate check" "not using HTTPS" + fi +} + +# ── 2. API ──────────────────────────────────────────────────────────── +test_api() { + echo "" + echo -e "${BOLD}API${RESET}" + + # 2a. API authentication — version endpoint + local version_json + version_json=$(api_curl "/api/v1/get/status/version" 2>/dev/null) || version_json="" + + local mc_version + mc_version=$(json_value "version" "$version_json") + + if [[ -n "$mc_version" && "$mc_version" != "null" ]]; then + record_pass "API authentication" "Mailcow ${mc_version}" + elif [[ "$version_json" == *"error"* || "$version_json" == *"unauthorized"* ]]; then + record_fail "API authentication" "API key rejected" + else + record_fail "API authentication" "no version returned" + fi + + # 2b. 
Container status + local containers_json + containers_json=$(api_curl "/api/v1/get/status/containers" 2>/dev/null) || containers_json="" + + if [[ -n "$containers_json" && "$containers_json" != *"error"* ]]; then + local running_count=0 + local stopped_count=0 + local container_names="" + + # Count running vs non-running containers + running_count=$(echo "$containers_json" | { grep -oP '"state"\s*:\s*"running"' || true; } | wc -l) + stopped_count=$(echo "$containers_json" | { grep -oP '"state"\s*:\s*"(?!running)[^"]*"' || true; } | wc -l) + + if [[ $running_count -gt 0 && $stopped_count -eq 0 ]]; then + record_pass "Container status" "all ${running_count} containers running" + elif [[ $running_count -gt 0 ]]; then + record_fail "Container status" "${running_count} running, ${stopped_count} not running" + else + record_fail "Container status" "no running containers found" + fi + else + record_fail "Container status" "could not query container status" + fi + + # 2c. List domains + local domains_status + domains_status=$(api_curl_status "/api/v1/get/domain/all") + if [[ "$domains_status" == "200" ]]; then + local domains_json + domains_json=$(api_curl "/api/v1/get/domain/all" 2>/dev/null) || domains_json="" + local domain_count + domain_count=$(echo "$domains_json" | { grep -oP '"domain_name"\s*:' || true; } | wc -l) + record_pass "List domains" "${domain_count} domain(s) found" + else + record_fail "List domains" "HTTP ${domains_status}" + fi + + # 2d. 
List mailboxes + local mailboxes_status + mailboxes_status=$(api_curl_status "/api/v1/get/mailbox/all") + if [[ "$mailboxes_status" == "200" ]]; then + local mailboxes_json + mailboxes_json=$(api_curl "/api/v1/get/mailbox/all" 2>/dev/null) || mailboxes_json="" + local mailbox_count + mailbox_count=$(echo "$mailboxes_json" | { grep -oP '"username"\s*:' || true; } | wc -l) + record_pass "List mailboxes" "${mailbox_count} mailbox(es) found" + else + record_fail "List mailboxes" "HTTP ${mailboxes_status}" + fi +} + +# ── 3. Mail Services ───────────────────────────────────────────────── +test_mail_services() { + echo "" + echo -e "${BOLD}Mail Services${RESET}" + + local host + host=$(get_mailcow_host) + + # 3a. SMTP + if [[ "$SKIP_SMTP" == "true" ]]; then + record_skip "SMTP port 25" "SKIP_SMTP=true" + record_skip "SMTP port 587" "SKIP_SMTP=true" + else + if check_port "$host" 25; then + record_pass "SMTP port 25" "accepting connections" + else + record_fail "SMTP port 25" "not reachable" + fi + + if check_port "$host" 587; then + record_pass "SMTP port 587 (submission)" "accepting connections" + else + record_fail "SMTP port 587 (submission)" "not reachable" + fi + fi + + # 3b. IMAP + if [[ "$SKIP_IMAP" == "true" ]]; then + record_skip "IMAP port 143" "SKIP_IMAP=true" + record_skip "IMAP port 993" "SKIP_IMAP=true" + else + if check_port "$host" 143; then + record_pass "IMAP port 143" "accepting connections" + else + record_fail "IMAP port 143" "not reachable" + fi + + if check_port "$host" 993; then + record_pass "IMAP port 993 (IMAPS)" "accepting connections" + else + record_fail "IMAP port 993 (IMAPS)" "not reachable" + fi + fi + + # 3c. 
POP3 (optional — skip if not available) + if check_port "$host" 110; then + record_pass "POP3 port 110" "accepting connections" + else + record_skip "POP3 port 110" "not reachable (may be disabled)" + fi + + if check_port "$host" 995; then + record_pass "POP3 port 995 (POP3S)" "accepting connections" + else + record_skip "POP3 port 995 (POP3S)" "not reachable (may be disabled)" + fi +} + +# ── 4. Spam Filter ─────────────────────────────────────────────────── +test_spam_filter() { + echo "" + echo -e "${BOLD}Spam Filter${RESET}" + + # 4a. rspamd — check container status from API + local containers_json + containers_json=$(api_curl "/api/v1/get/status/containers" 2>/dev/null) || containers_json="" + + if [[ -n "$containers_json" ]]; then + local rspamd_state + rspamd_state=$(echo "$containers_json" | { grep -oP '"rspamd-mailcow[^}]*"state"\s*:\s*"\K[^"]*' || true; } | head -1) + + if [[ -z "$rspamd_state" ]]; then + # Try alternate pattern — look for rspamd in container names + rspamd_state=$(echo "$containers_json" | { grep -B5 '"rspamd' || true; } | { grep -oP '"state"\s*:\s*"\K[^"]*' || true; } | head -1) + fi + + if [[ "$rspamd_state" == "running" ]]; then + record_pass "rspamd running" "spam filter operational" + elif [[ -n "$rspamd_state" ]]; then + record_fail "rspamd running" "state: ${rspamd_state}" + else + record_skip "rspamd status" "could not determine rspamd container state" + fi + else + record_fail "rspamd status" "could not query containers" + fi + + # 4b. 
ClamAV + if [[ "$SKIP_CLAMD" == "true" ]]; then + record_skip "ClamAV status" "SKIP_CLAMD=true" + elif [[ -n "$containers_json" ]]; then + local clamd_state + clamd_state=$(echo "$containers_json" | { grep -oP '"clamd-mailcow[^}]*"state"\s*:\s*"\K[^"]*' || true; } | head -1) + + if [[ -z "$clamd_state" ]]; then + clamd_state=$(echo "$containers_json" | { grep -B5 '"clamd' || true; } | { grep -oP '"state"\s*:\s*"\K[^"]*' || true; } | head -1) + fi + + if [[ "$clamd_state" == "running" ]]; then + record_pass "ClamAV running" "antivirus operational" + elif [[ -n "$clamd_state" ]]; then + record_fail "ClamAV running" "state: ${clamd_state}" + else + record_skip "ClamAV status" "ClamAV container not found (may be disabled)" + fi + else + record_skip "ClamAV status" "could not query containers" + fi +} + +# ── 5. Webmail ──────────────────────────────────────────────────────── +test_webmail() { + echo "" + echo -e "${BOLD}Webmail${RESET}" + + local curl_opts=(-s -o /dev/null -w "%{http_code}" --max-time "$CURL_TIMEOUT") + [[ "$CURL_INSECURE" == "true" ]] && curl_opts+=(-k) + + local sogo_code + sogo_code=$(curl "${curl_opts[@]}" "${MAILCOW_URL}/SOGo" 2>/dev/null) || sogo_code="000" + + if [[ "$sogo_code" == "200" || "$sogo_code" == "301" || "$sogo_code" == "302" ]]; then + record_pass "SOGo reachable" "HTTP ${sogo_code}" + elif [[ "$sogo_code" == "000" ]]; then + record_fail "SOGo reachable" "connection failed" + else + record_fail "SOGo reachable" "HTTP ${sogo_code}" + fi +} + +# ── 6. 
Quarantine ──────────────────────────────────────────────────── +test_quarantine() { + echo "" + echo -e "${BOLD}Quarantine${RESET}" + + local quarantine_json + quarantine_json=$(api_curl "/api/v1/get/quarantine/all" 2>/dev/null) || quarantine_json="" + + local quarantine_status + quarantine_status=$(api_curl_status "/api/v1/get/quarantine/all") + + if [[ "$quarantine_status" == "200" ]]; then + local q_count + q_count=$(echo "$quarantine_json" | { grep -oP '"id"\s*:' || true; } | wc -l) + record_pass "Quarantine endpoint" "${q_count} item(s) in quarantine" + else + record_fail "Quarantine endpoint" "HTTP ${quarantine_status}" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# OUTPUT +# ══════════════════════════════════════════════════════════════════════ + +print_summary() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + echo "" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + echo -e "${BOLD}Summary${RESET} ${MAILCOW_URL}" + echo -e " ${GREEN}${PASS} passed${RESET} ${RED}${FAIL} failed${RESET} ${YELLOW}${SKIP} skipped${RESET} (${duration}s)" + echo -e "${BOLD}────────────────────────────────────────${RESET}" + + if [[ $FAIL -eq 0 ]]; then + echo -e "${GREEN}${BOLD}All tests passed.${RESET}" + else + echo -e "${RED}${BOLD}${FAIL} test(s) failed.${RESET}" + fi +} + +print_tap_header() { + echo "TAP version 13" +} + +print_tap_footer() { + echo "1..${TOTAL}" + echo "# pass ${PASS}" + echo "# fail ${FAIL}" + echo "# skip ${SKIP}" +} + +write_junit() { + local end_time + end_time=$(date +%s) + local duration=$(( end_time - START_TIME )) + + cat > "$JUNIT_FILE" < + + +JUNIT_EOF + + for result in "${RESULTS[@]}"; do + local status name detail + status=$(echo "$result" | cut -d'|' -f1) + name=$(echo "$result" | cut -d'|' -f2) + detail=$(echo "$result" | cut -d'|' -f3) + + # XML-escape the values + name=$(echo "$name" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + detail=$(echo 
"$detail" | sed 's/&/\&/g; s//\>/g; s/"/\"/g') + + case "$status" in + PASS) + echo " " >> "$JUNIT_FILE" + [[ -n "$detail" ]] && echo " ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + FAIL) + echo " " >> "$JUNIT_FILE" + echo " FAILED: ${name} — ${detail}" >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + SKIP) + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + echo " " >> "$JUNIT_FILE" + ;; + esac + done + + echo " " >> "$JUNIT_FILE" + echo "" >> "$JUNIT_FILE" + + log "JUnit report written to ${JUNIT_FILE}" +} + +# ══════════════════════════════════════════════════════════════════════ +# MAIN +# ══════════════════════════════════════════════════════════════════════ + +usage() { + cat <&2; } +verbose() { if [[ "$VERBOSE" == "true" ]]; then echo -e "${DIM}[DEBUG]${RESET} $*"; fi; } + +# ── Helpers ─────────────────────────────────────────────────────────── +section_header() { + echo "" + echo -e " ${BOLD}${CYAN}── $1 ──${RESET}" + echo "" +} + +field() { + printf " ${BOLD}%-24s${RESET} %s\n" "$1" "$2" +} + +field_color() { + printf " ${BOLD}%-24s${RESET} %b\n" "$1" "$2" +} + +human_bytes() { + local bytes="$1" + if [[ "$bytes" -ge 1073741824 ]]; then + awk "BEGIN { printf \"%.1f GiB\", $bytes / 1073741824 }" + elif [[ "$bytes" -ge 1048576 ]]; then + awk "BEGIN { printf \"%.1f MiB\", $bytes / 1048576 }" + elif [[ "$bytes" -ge 1024 ]]; then + awk "BEGIN { printf \"%.1f KiB\", $bytes / 1024 }" + else + echo "${bytes} B" + fi +} + +human_kb() { + local kb="$1" + human_bytes $((kb * 1024)) +} + +should_show() { + local section="$1" + if [[ "$RUN_ALL" == "true" ]]; then + return 0 + fi + case "$section" in + oom) [[ "$RUN_OOM" == "true" ]] ;; + top) [[ "$RUN_TOP" == "true" ]] ;; + swap) [[ "$RUN_SWAP" == "true" ]] ;; + slab) [[ "$RUN_SLAB" == "true" ]] ;; + psi) [[ "$RUN_PSI" == "true" ]] ;; + fragmentation) [[ "$RUN_FRAG" == "true" ]] ;; + summary) [[ "$RUN_SUMMARY" == "true" ]] ;; + *) return 1 ;; + esac +} + +# 
══════════════════════════════════════════════════════════════════════ +# OOM KILL HISTORY +# ══════════════════════════════════════════════════════════════════════ + +show_oom() { + section_header "OOM Kill History" + + local oom_lines="" + + # Try journalctl first + if command -v journalctl &>/dev/null; then + oom_lines=$(journalctl -k --since "7 days ago" --no-pager 2>/dev/null | grep -i "killed process" || true) + fi + + # Fallback to dmesg + if [[ -z "$oom_lines" ]]; then + oom_lines=$(dmesg 2>/dev/null | grep -i "killed process" || true) + fi + + if [[ -z "$oom_lines" ]]; then + echo -e " No recent OOM kills found ${GREEN}✓${RESET}" + OOM_COUNT=0 + return + fi + + OOM_COUNT=$(echo "$oom_lines" | wc -l) + echo -e " Found ${RED}${OOM_COUNT}${RESET} OOM event(s) in the last 7 days" + echo "" + + echo "$oom_lines" | tail -20 | while IFS= read -r line; do + local proc_name pid + proc_name=$(echo "$line" | grep -oP "Killed process \d+ \(\K[^)]+") + pid=$(echo "$line" | grep -oP "Killed process \K\d+") + local ts + ts=$(echo "$line" | awk '{print $1, $2, $3}') + printf " ${RED}✗${RESET} %-22s %-20s (PID %s)\n" "$ts" "${proc_name:-unknown}" "${pid:-?}" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# TOP MEMORY CONSUMERS +# ══════════════════════════════════════════════════════════════════════ + +show_top() { + section_header "Top Memory Consumers" + + printf " ${BOLD}%-8s %-12s %-12s %-12s %s${RESET}\n" "PID" "USER" "RSS" "VSZ" "COMMAND" + printf " %s\n" "$(printf '%.0s─' {1..60})" + + ps axo pid,user,rss,vsz,comm --sort=-rss 2>/dev/null | tail -n +2 | head -"$TOP_N" | while IFS= read -r line; do + local pid user rss vsz cmd + pid=$(echo "$line" | awk '{print $1}') + user=$(echo "$line" | awk '{print $2}') + rss=$(echo "$line" | awk '{print $3}') + vsz=$(echo "$line" | awk '{print $4}') + cmd=$(echo "$line" | awk '{print $5}') + + local rss_h vsz_h + rss_h=$(human_kb "$rss") + vsz_h=$(human_kb "$vsz") + + printf " %-8s %-12s 
%-12s %-12s %s\n" "$pid" "${user:0:12}" "$rss_h" "$vsz_h" "${cmd:0:30}" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# SWAP USAGE +# ══════════════════════════════════════════════════════════════════════ + +show_swap() { + section_header "Swap Usage" + + if [[ -f /proc/meminfo ]]; then + local swap_total_kb swap_free_kb swap_used_kb + swap_total_kb=$(awk '/^SwapTotal:/ {print $2}' /proc/meminfo) + swap_free_kb=$(awk '/^SwapFree:/ {print $2}' /proc/meminfo) + swap_used_kb=$((swap_total_kb - swap_free_kb)) + + if [[ "$swap_total_kb" -eq 0 ]]; then + field "Swap:" "Not configured" + SWAP_PCT=0 + return + fi + + SWAP_PCT=$(awk "BEGIN { printf \"%.0f\", $swap_used_kb * 100 / $swap_total_kb }") + + local color="$GREEN" + if [[ "$SWAP_PCT" -ge 80 ]]; then color="$RED" + elif [[ "$SWAP_PCT" -ge 50 ]]; then color="$YELLOW" + fi + + field "Total swap:" "$(human_kb "$swap_total_kb")" + field_color "Used swap:" "${color}$(human_kb "$swap_used_kb") (${SWAP_PCT}%)${RESET}" + field "Free swap:" "$(human_kb "$swap_free_kb")" + fi + + # Swap in/out rates from vmstat + if command -v vmstat &>/dev/null; then + local si so + si=$(vmstat 1 2 2>/dev/null | tail -1 | awk '{print $7}') + so=$(vmstat 1 2 2>/dev/null | tail -1 | awk '{print $8}') + if [[ -n "$si" && -n "$so" ]]; then + field "Swap in rate:" "${si} KB/s" + field "Swap out rate:" "${so} KB/s" + fi + fi + + # Per-process swap + echo "" + echo -e " ${BOLD}Top swap consumers:${RESET}" + printf " %-8s %-12s %s\n" "PID" "SWAP" "COMMAND" + + local found=0 + for proc_dir in /proc/[0-9]*; do + local pid="${proc_dir##*/}" + local swap_kb=0 + if [[ -r "${proc_dir}/status" ]]; then + swap_kb=$(awk '/^VmSwap:/ {print $2}' "${proc_dir}/status" 2>/dev/null || echo "0") + fi + if [[ "${swap_kb:-0}" -gt 0 ]]; then + local cmd_name + cmd_name=$(cat "${proc_dir}/comm" 2>/dev/null || echo "?") + echo "${swap_kb} ${pid} ${cmd_name}" + fi + done 2>/dev/null | sort -rn | head -"$TOP_N" | while IFS= read -r 
line; do + local skb pid cmd + skb=$(echo "$line" | awk '{print $1}') + pid=$(echo "$line" | awk '{print $2}') + cmd=$(echo "$line" | awk '{print $3}') + printf " %-8s %-12s %s\n" "$pid" "$(human_kb "$skb")" "$cmd" + found=1 + done + + if [[ "$found" -eq 0 ]]; then + echo " No processes using swap" + fi +} + +# ══════════════════════════════════════════════════════════════════════ +# SLAB CACHES +# ══════════════════════════════════════════════════════════════════════ + +show_slab() { + section_header "Slab Caches" + + if [[ ! -r /proc/slabinfo ]]; then + warn "/proc/slabinfo not readable (requires root)" + return + fi + + printf " ${BOLD}%-30s %10s %10s %12s${RESET}\n" "CACHE" "NUM_OBJS" "OBJ_SIZE" "TOTAL_SIZE" + printf " %s\n" "$(printf '%.0s─' {1..65})" + + # Parse slabinfo (skip header lines) + tail -n +3 /proc/slabinfo 2>/dev/null | awk '{ + name=$1; num_objs=$3; obj_size=$4; + total = num_objs * obj_size; + print total, name, num_objs, obj_size + }' | sort -rn | head -"$TOP_N" | while IFS= read -r line; do + local total name num_objs obj_size + total=$(echo "$line" | awk '{print $1}') + name=$(echo "$line" | awk '{print $2}') + num_objs=$(echo "$line" | awk '{print $3}') + obj_size=$(echo "$line" | awk '{print $4}') + + local total_h + total_h=$(human_bytes "$total") + + printf " %-30s %10s %10s %12s\n" "${name:0:30}" "$num_objs" "${obj_size}B" "$total_h" + done + + # Total slab usage + local slab_total_kb + slab_total_kb=$(awk '/^Slab:/ {print $2}' /proc/meminfo 2>/dev/null || echo "0") + echo "" + field "Total slab memory:" "$(human_kb "$slab_total_kb")" +} + +# ══════════════════════════════════════════════════════════════════════ +# PSI METRICS +# ══════════════════════════════════════════════════════════════════════ + +show_psi() { + section_header "PSI Metrics (Pressure Stall Information)" + + if [[ ! 
-f /proc/pressure/memory ]]; then + echo -e " ${DIM}PSI not available (requires kernel 4.20+)${RESET}" + return + fi + + printf " ${BOLD}%-18s %8s %8s %8s %14s${RESET}\n" "PRESSURE" "avg10" "avg60" "avg300" "total (µs)" + printf " %s\n" "$(printf '%.0s─' {1..58})" + + while IFS= read -r line; do + local ptype avg10 avg60 avg300 total + ptype=$(echo "$line" | awk '{print $1}') + avg10=$(echo "$line" | grep -oP 'avg10=\K[0-9.]+') + avg60=$(echo "$line" | grep -oP 'avg60=\K[0-9.]+') + avg300=$(echo "$line" | grep -oP 'avg300=\K[0-9.]+') + total=$(echo "$line" | grep -oP 'total=\K[0-9]+') + + printf " %-18s %8s %8s %8s %14s\n" "$ptype" "$avg10" "$avg60" "$avg300" "$total" + + # Track for health score + if [[ "$ptype" == "some" ]]; then + PSI_SOME_AVG300=$(echo "$avg300" | cut -d. -f1) + fi + done < /proc/pressure/memory +} + +# ══════════════════════════════════════════════════════════════════════ +# FRAGMENTATION +# ══════════════════════════════════════════════════════════════════════ + +show_fragmentation() { + section_header "Memory Fragmentation (buddyinfo)" + + if [[ ! -f /proc/buddyinfo ]]; then + warn "/proc/buddyinfo not available" + return + fi + + echo -e " ${BOLD}Free pages by order (0=4K, 1=8K, 2=16K, ... 
10=4M):${RESET}" + echo "" + + while IFS= read -r line; do + local node zone orders + node=$(echo "$line" | awk '{print $2}') + zone=$(echo "$line" | awk '{print $4}') + orders=$(echo "$line" | awk '{for(i=5;i<=NF;i++) printf "%7s", $i; print ""}') + + printf " %-6s %-10s %s\n" "$node" "$zone" "$orders" + done < /proc/buddyinfo + + echo "" + echo -e " ${DIM}Low counts at higher orders indicate fragmentation${RESET}" +} + +# ══════════════════════════════════════════════════════════════════════ +# HEALTH SUMMARY +# ══════════════════════════════════════════════════════════════════════ + +show_summary() { + section_header "Memory Health Summary" + + SCORE=100 + local findings=() + + # OOM kills penalty (heavy: -30 per kill, max -60) + if [[ "$OOM_COUNT" -gt 0 ]]; then + local oom_penalty=$((OOM_COUNT * 30)) + [[ "$oom_penalty" -gt 60 ]] && oom_penalty=60 + SCORE=$((SCORE - oom_penalty)) + findings+=("${RED}⚠${RESET} ${OOM_COUNT} OOM kill(s) in last 7 days") + else + findings+=("${GREEN}✓${RESET} No recent OOM kills") + fi + + # PSI penalty (medium: -15 if avg300 some > 5) + if [[ -f /proc/pressure/memory ]]; then + if [[ "$PSI_SOME_AVG300" -gt 5 ]]; then + SCORE=$((SCORE - 15)) + findings+=("${YELLOW}⚠${RESET} PSI avg300 some > 5.0 — sustained memory contention") + elif [[ "$PSI_SOME_AVG300" -gt 1 ]]; then + SCORE=$((SCORE - 5)) + findings+=("${YELLOW}⚠${RESET} PSI avg300 some > 1.0 — mild memory contention") + else + findings+=("${GREEN}✓${RESET} PSI levels normal") + fi + fi + + # Swap penalty (medium: -15 if >50% used) + if [[ "$SWAP_PCT" -gt 80 ]]; then + SCORE=$((SCORE - 20)) + findings+=("${RED}⚠${RESET} Swap ${SWAP_PCT}% used — heavy swap pressure") + elif [[ "$SWAP_PCT" -gt 50 ]]; then + SCORE=$((SCORE - 15)) + findings+=("${YELLOW}⚠${RESET} Swap ${SWAP_PCT}% used — moderate swap pressure") + elif [[ "$SWAP_PCT" -gt 20 ]]; then + SCORE=$((SCORE - 5)) + findings+=("${YELLOW}⚠${RESET} Swap ${SWAP_PCT}% used — light swap usage") + else + 
findings+=("${GREEN}✓${RESET} No active swap pressure") + fi + + # Memory usage penalty + if [[ -f /proc/meminfo ]]; then + local total_kb avail_kb mem_pct + total_kb=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo) + avail_kb=$(awk '/^MemAvailable:/ {print $2}' /proc/meminfo) + mem_pct=$(awk "BEGIN { printf \"%.0f\", ($total_kb - $avail_kb) * 100 / $total_kb }") + if [[ "$mem_pct" -ge 95 ]]; then + SCORE=$((SCORE - 15)) + findings+=("${RED}⚠${RESET} Memory ${mem_pct}% used — critically low available memory") + elif [[ "$mem_pct" -ge 90 ]]; then + SCORE=$((SCORE - 10)) + findings+=("${YELLOW}⚠${RESET} Memory ${mem_pct}% used — high memory utilization") + fi + fi + + [[ "$SCORE" -lt 0 ]] && SCORE=0 + + # Determine rating + local rating color + if [[ "$SCORE" -ge 90 ]]; then + rating="Excellent"; color="$GREEN" + elif [[ "$SCORE" -ge 75 ]]; then + rating="Good"; color="$GREEN" + elif [[ "$SCORE" -ge 50 ]]; then + rating="Fair"; color="$YELLOW" + elif [[ "$SCORE" -ge 25 ]]; then + rating="Poor"; color="$RED" + else + rating="Critical"; color="$RED" + fi + + field_color "Overall score:" "${color}${SCORE} / 100 (${rating})${RESET}" + echo "" + + for f in "${findings[@]}"; do + echo -e " $f" + done +} + +# ══════════════════════════════════════════════════════════════════════ +# HELP +# ══════════════════════════════════════════════════════════════════════ + +show_help() { + cat <&2; echo "Run $SCRIPT_NAME --help for usage" >&2; exit 1 ;; + esac + done + + setup_colors + + echo "" + echo -e "${BOLD}Memory Pressure Analysis — $(hostname -f 2>/dev/null || hostname)${RESET}" + echo -e "${DIM}$(date '+%Y-%m-%d %H:%M:%S %Z')${RESET}" + + should_show "oom" && show_oom + should_show "top" && show_top + should_show "swap" && show_swap + should_show "slab" && show_slab + should_show "psi" && show_psi + should_show "fragmentation" && show_fragmentation + should_show "summary" && show_summary + + echo "" +} + +main "$@" diff --git a/memory-pressure-exporter.sh 
b/memory-pressure-exporter.sh new file mode 100644 index 0000000..4345ee2 --- /dev/null +++ b/memory-pressure-exporter.sh @@ -0,0 +1,723 @@ +#!/bin/bash +################################################################################ +# Script Name: memory-pressure-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for memory and swap pressure metrics. +# Exports PSI stall information, OOM kill events, swap activity +# rates, NUMA memory balance, slab pressure, transparent hugepage +# stats, and zone watermark proximity. +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - Standard Unix tools (awk, grep, cat) +# - netcat (nc) for HTTP mode +# - Optional: journalctl (OOM tracking), kernel 4.20+ (PSI), +# multi-node NUMA. Each section is skipped gracefully if unavailable. +# +# Usage: +# # Output to stdout +# ./memory-pressure-exporter.sh +# +# # HTTP server mode +# ./memory-pressure-exporter.sh --http -p 9198 +# +# # Textfile collector mode +# ./memory-pressure-exporter.sh --textfile +# +# Metrics Exported: +# Core Status: +# - memory_pressure_up - Exporter status (1=up, 0=down) +# - memory_pressure_exporter_info{version} - Exporter version +# +# PSI Memory (if /proc/pressure/memory exists): +# - memory_pressure_psi_some_avg10 - PSI memory some avg10 +# - memory_pressure_psi_some_avg60 - PSI memory some avg60 +# - memory_pressure_psi_some_avg300 - PSI memory some avg300 +# - memory_pressure_psi_some_total_microseconds - PSI memory some total +# - memory_pressure_psi_full_avg10 - PSI memory full avg10 +# - memory_pressure_psi_full_avg60 - PSI memory full avg60 +# - memory_pressure_psi_full_avg300 - PSI memory full avg300 +# - memory_pressure_psi_full_total_microseconds - PSI memory full total +# +# PSI I/O (if /proc/pressure/io exists): +# - memory_pressure_psi_io_some_avg10 - PSI I/O some avg10 +# - memory_pressure_psi_io_some_avg60 - PSI I/O some avg60 +# - 
memory_pressure_psi_io_some_avg300 - PSI I/O some avg300 +# - memory_pressure_psi_io_some_total_microseconds - PSI I/O some total +# - memory_pressure_psi_io_full_avg10 - PSI I/O full avg10 +# - memory_pressure_psi_io_full_avg60 - PSI I/O full avg60 +# - memory_pressure_psi_io_full_avg300 - PSI I/O full avg300 +# - memory_pressure_psi_io_full_total_microseconds - PSI I/O full total +# +# OOM Kills (if journalctl available): +# - memory_pressure_oom_kills_24h - OOM kills in last 24 hours +# - memory_pressure_oom_last_kill_timestamp - Unix timestamp of last OOM +# - memory_pressure_oom_last_victim{process} - Last killed process (1) +# +# Swap Activity: +# - memory_pressure_swap_in_pages_per_sec - Swap in pages/sec +# - memory_pressure_swap_out_pages_per_sec - Swap out pages/sec +# - memory_pressure_swap_in_bytes_per_sec - Swap in bytes/sec +# - memory_pressure_swap_out_bytes_per_sec - Swap out bytes/sec +# +# NUMA (if multi-node): +# - memory_pressure_numa_total_bytes{node} - Total memory per node +# - memory_pressure_numa_free_bytes{node} - Free memory per node +# - memory_pressure_numa_used_percent{node} - Usage percentage per node +# +# Transparent Hugepages: +# - memory_pressure_thp_fault_alloc_total - THP fault allocations +# - memory_pressure_thp_collapse_alloc_total - THP collapse allocations +# - memory_pressure_thp_fault_fallback_total - THP fault fallbacks +# - memory_pressure_compact_stall_total - Compaction stalls +# +# Slab: +# - memory_pressure_slab_reclaimable_bytes - Reclaimable slab +# - memory_pressure_slab_unreclaimable_bytes - Unreclaimable slab +# - memory_pressure_slab_total_bytes - Total slab +# - memory_pressure_slab_unreclaimable_percent - Unreclaimable percentage +# +# Zone Watermarks: +# - memory_pressure_zone_free_pages{zone} - Current free pages +# - memory_pressure_zone_min_pages{zone} - Min watermark +# - memory_pressure_zone_low_pages{zone} - Low watermark +# - memory_pressure_zone_high_pages{zone} - High watermark +# - 
# Parse a PSI file (/proc/pressure/memory or /proc/pressure/io).
# Args:    $1 - path of the PSI file
# Output:  one line per record: "type avg10 avg60 avg300 total"
#          (type is the first field of the line, e.g. "some" or "full");
#          a key that does not appear on the line is reported as 0
# Returns: non-zero without output when the file does not exist
get_psi_stats() {
    local src="$1"

    if [ ! -f "$src" ]; then
        return 1
    fi

    awk '{
        # Start every record from zeroed slots, then overwrite the ones
        # that are present as key=value fields.
        v["avg10"] = v["avg60"] = v["avg300"] = v["total"] = 0
        for (n = 2; n <= NF; n++) {
            split($n, pair, "=")
            if (pair[1] in v) {
                v[pair[1]] = pair[2]
            }
        }
        print $1, v["avg10"], v["avg60"], v["avg300"], v["total"]
    }' "$src"
}
# Read the cumulative swap-in / swap-out page counters.
# Args:    $1 - (optional) vmstat-format file to read; defaults to /proc/vmstat
# Output:  "pswpin pswpout" on one line. A counter that is not present in the
#          file is printed as 0 rather than as an empty field, so callers can
#          always do arithmetic on both values.
get_swap_counters() {
    local vmstat_file="${1:-/proc/vmstat}"

    awk '
        /^pswpin /  { pin = $2 }
        /^pswpout / { pout = $2 }
        END {
            # Print the raw text when present (no numeric re-formatting),
            # fall back to 0 when the counter was never seen.
            print (pin == "" ? 0 : pin), (pout == "" ? 0 : pout)
        }
    ' "$vmstat_file" 2>/dev/null
}
# Parse zoneinfo for the Normal and DMA32 zones.
# Args:    $1 - (optional) zoneinfo-format file; defaults to /proc/zoneinfo
# Output:  one line per matching zone: "zone free min low high" (page counts)
#
# A zone's line is emitted when its "high" watermark is read; everything
# after that (per-cpu pagesets etc.) is ignored until the next zone header.
get_zone_watermarks() {
    local zoneinfo_file="${1:-/proc/zoneinfo}"

    awk '
        /^Node [0-9]+, zone +[A-Za-z0-9]+/ {
            zone = $NF
            # Start each zone from zero so a field missing from this zone
            # cannot inherit the value read for the previous one.
            free = min_wm = low_wm = 0
        }
        zone == "Normal" || zone == "DMA32" {
            if ($1 == "pages" && $2 == "free") free = $3
            if ($1 == "min") min_wm = $2
            if ($1 == "low") low_wm = $2
            if ($1 == "high") {
                high_wm = $2
                print zone, free+0, min_wm+0, low_wm+0, high_wm+0
                zone = ""
            }
        }
    ' "$zoneinfo_file" 2>/dev/null
}

Memory Pressure Exporter v1.0

+

Metrics

+

Sections (auto-detected)

+
    +
  • PSI memory and I/O pressure (requires kernel 4.20+)
  • +
  • OOM kill tracking (requires journalctl)
  • +
  • Swap activity rates
  • +
  • NUMA memory balance (requires multi-node system)
  • +
  • Transparent hugepage and compaction stats
  • +
  • Slab memory pressure
  • +
  • Zone watermark proximity
  • +
# Entry point: parse arguments, then run in one of three modes.
#   HTTP mode     - HTTP_MODE=true: hand over to run_http_server
#   textfile mode - OUTPUT_FILE set: write metrics to a temp file in the
#                   target directory and rename it into place, so a reader
#                   never sees a half-written file
#   stdout mode   - otherwise: print metrics to stdout
#
# Globals read: HTTP_MODE, OUTPUT_FILE (both set by parse_args)
# Calls:        parse_args, run_http_server, generate_metrics
# Exits 1 when the output directory/temp file cannot be created, metric
# generation fails, the result is suspiciously small (< 10 lines) or the
# final rename fails; the previous output file is left untouched then.
main() {
    parse_args "$@"

    if [ "$HTTP_MODE" = true ]; then
        run_http_server
        return
    fi

    if [ -z "$OUTPUT_FILE" ]; then
        generate_metrics
        return
    fi

    local output_dir
    output_dir="$(dirname "$OUTPUT_FILE")"
    if ! mkdir -p "$output_dir"; then
        echo "ERROR: Cannot create output directory $output_dir" >&2
        exit 1
    fi

    # Temp file lives next to the target so the final mv is a same-filesystem rename
    local temp_file
    if ! temp_file=$(mktemp "${output_dir}/.memory_pressure_metrics.XXXXXX"); then
        echo "ERROR: Cannot create temporary file in $output_dir" >&2
        exit 1
    fi

    if ! generate_metrics > "$temp_file" 2>/dev/null; then
        rm -f "$temp_file"
        echo "ERROR: Failed to generate metrics" >&2
        exit 1
    fi

    local file_lines
    file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)

    # Sanity check: refuse to replace a good file with a near-empty one
    if [ "$file_lines" -lt 10 ]; then
        rm -f "$temp_file"
        echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
        exit 1
    fi

    chmod 644 "$temp_file"
    if ! mv -f "$temp_file" "$OUTPUT_FILE"; then
        rm -f "$temp_file"
        echo "ERROR: Failed to move metrics into place at $OUTPUT_FILE" >&2
        exit 1
    fi

    echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
}
# Populate the global color variables used by all output helpers.
# COLOR=never  -> all variables empty
# COLOR=always -> ANSI escape sequences
# anything else (e.g. auto) -> escapes only when stdout is a terminal
# Globals written: RED GREEN YELLOW CYAN BOLD DIM RESET
setup_colors() {
    local use_ansi=false

    case "$COLOR" in
        never)
            ;;
        always)
            use_ansi=true
            ;;
        *)
            if [[ -t 1 ]]; then
                use_ansi=true
            fi
            ;;
    esac

    if [[ "$use_ansi" != true ]]; then
        RED="" GREEN="" YELLOW="" CYAN="" BOLD="" DIM="" RESET=""
        return
    fi

    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    DIM='\033[2m'
    RESET='\033[0m'
}
# Pick a color for a percentage-style value.
# Args:   $1 - integer value to classify
#         $2 - warning threshold  (default 75)
#         $3 - critical threshold (default 90)
# Output: $RED when value >= critical, $YELLOW when value >= warning,
#         $GREEN otherwise. When the value is not an integer (empty, "N/A",
#         a decimal, ...) an empty string is printed: an unknown reading gets
#         no color instead of tripping an arithmetic error inside [[ ]].
# Globals read: RED, YELLOW, GREEN
threshold_color() {
    local value="$1"
    local warn_at="${2:-75}"
    local crit_at="${3:-90}"

    if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
        echo ""
        return 0
    fi

    if [[ "$value" -ge "$crit_at" ]]; then
        echo "$RED"
    elif [[ "$value" -ge "$warn_at" ]]; then
        echo "$YELLOW"
    else
        echo "$GREEN"
    fi
}
"$mem_info" | cut -d'|' -f1) + mem_used=$(echo "$mem_info" | cut -d'|' -f2) + mem_total=$(echo "$mem_info" | cut -d'|' -f3) + updates=$(get_pending_updates) + users_count=$(get_logged_in_users) + failed_count=$(get_failed_services) + + local dc mc lc + dc=$(threshold_color "$disk_pct" 75 90) + mc=$(threshold_color "$mem_pct" 75 90) + local load_1m + load_1m=$(echo "$load" | awk '{print $1}') + lc=$(load_color "$load_1m") + + echo -e " ${BOLD}Hostname:${RESET} $host" + echo -e " ${BOLD}IP Address:${RESET} $ip" + echo -e " ${BOLD}Uptime:${RESET} $up" + echo -e " ${BOLD}Disk (root):${RESET} ${dc}${disk_pct}%${RESET}" + echo -e " ${BOLD}Load Average:${RESET} ${lc}${load}${RESET}" + echo -e " ${BOLD}Memory:${RESET} ${mc}${mem_used}M / ${mem_total}M (${mem_pct}%)${RESET}" + echo -e " ${BOLD}Pending Updates:${RESET} $updates" + echo -e " ${BOLD}Logged-in Users:${RESET} $users_count" + echo -e " ${BOLD}Failed Services:${RESET} $failed_count" +} + +generate_motd_box() { + local host ip up disk_pct load mem_info mem_pct mem_used mem_total + local updates users_count failed_count + + host=$(get_hostname) + ip=$(get_primary_ip) + up=$(get_uptime) + disk_pct=$(get_disk_usage) + load=$(get_load_average) + mem_info=$(get_memory_usage) + mem_pct=$(echo "$mem_info" | cut -d'|' -f1) + mem_used=$(echo "$mem_info" | cut -d'|' -f2) + mem_total=$(echo "$mem_info" | cut -d'|' -f3) + updates=$(get_pending_updates) + users_count=$(get_logged_in_users) + failed_count=$(get_failed_services) + + local dc mc lc + dc=$(threshold_color "$disk_pct" 75 90) + mc=$(threshold_color "$mem_pct" 75 90) + local load_1m + load_1m=$(echo "$load" | awk '{print $1}') + lc=$(load_color "$load_1m") + + local fc_color="$GREEN" + if [[ "$failed_count" != "N/A" && "$failed_count" -gt 0 ]]; then + fc_color="$RED" + fi + + # Build rows as "label|value" pairs to measure widest content + local label_w=18 + local rows=( + "Hostname:|$host" + "IP Address:|$ip" + "Uptime:|$up" + "Disk (root):|${disk_pct}%" + "Load 
# Print one content row of the status box, padded so the right border lines up.
# Usage: _box_row "Label:" "value" [color_prefix]
#   $1 - label text
#   $2 - value text
#   $3 - optional color escape applied to the value only
# Globals read: BOX_W (inner width, default 56), CYAN, RESET
_box_row() {
    local label="$1" value="$2" tint="${3:-}"
    local width="${BOX_W:-56}"

    # Width is measured on the visible text only (no ANSI sequences)
    local plain=" ${label} ${value}"
    local gap=$(( width - ${#plain} ))
    if (( gap < 0 )); then
        gap=0
    fi

    # Filler of exactly $gap spaces
    local filler
    printf -v filler '%*s' "$gap" ''

    if [[ -z "$tint" ]]; then
        printf " ${CYAN}║${RESET} %s %s%s ${CYAN}║${RESET}\n" "$label" "$value" "$filler"
    else
        # %b lets the color argument's backslash escapes be interpreted
        printf " ${CYAN}║${RESET} %s %b%s${RESET}%s ${CYAN}║${RESET}\n" "$label" "$tint" "$value" "$filler"
    fi
}
Queries DMVs (Dynamic Management + Views) and system catalog views to collect instance health, database sizes, + backup freshness, connection counts, wait statistics, buffer cache performance, + SQL Agent job status, and TempDB usage. Outputs Prometheus-compatible text + format for consumption by windows_exporter textfile collector. +.PARAMETER Mode + Output mode: 'stdout' (default), 'textfile', or 'http' +.PARAMETER SqlInstance + SQL Server instance name or hostname (default: localhost) +.PARAMETER ConnectionString + Full connection string (overrides SqlInstance) +.PARAMETER TextfileDir + Directory for textfile collector output +.PARAMETER HttpPort + HTTP port for http mode (default: 9399) +.PARAMETER InstallScheduledTask + Switch to create a scheduled task for periodic collection +.PARAMETER TaskIntervalMinutes + Interval in minutes for the scheduled task (default: 5) +.NOTES + Author: Phil Connor + Contact: contact@mylinux.work + Website: https://mylinux.work + License: MIT + Version: 1.0 + + Metrics Exported: + Instance: + - mssql_up + - mssql_instance_uptime_seconds + - mssql_version_info{version,edition} + + Database Sizes: + - mssql_database_size_bytes{database} + - mssql_database_log_size_bytes{database} + - mssql_database_log_usage_percent{database} + + Backup Freshness: + - mssql_backup_age_full_hours{database} + - mssql_backup_age_diff_hours{database} + + Connections: + - mssql_connections_active{database} + - mssql_sessions_total + - mssql_sessions_blocked + + Wait Statistics: + - mssql_wait_time_seconds{wait_type} + - mssql_waiting_tasks{wait_type} + + Buffer Cache: + - mssql_buffer_cache_hit_ratio + - mssql_page_life_expectancy_seconds + + SQL Agent Jobs: + - mssql_agent_job_status{job} + - mssql_agent_job_duration_seconds{job} + + TempDB: + - mssql_tempdb_size_bytes + - mssql_tempdb_usage_bytes + + Errors: + - mssql_error_count + + Exporter: + - mssql_collector_duration_seconds +#> + +param( + [ValidateSet('stdout', 'textfile', 'http')] + 
[string]$Mode = 'stdout', + + [string]$SqlInstance = 'localhost', + + [string]$ConnectionString, + + [string]$TextfileDir = 'C:\Program Files\windows_exporter\textfile_inputs', + + [int]$HttpPort = 9399, + + [switch]$InstallScheduledTask, + + [int]$TaskIntervalMinutes = 5 +) + +# Handle --textfile and --http as positional arguments +if ($args -contains '--textfile') { $Mode = 'textfile' } +if ($args -contains '--http') { $Mode = 'http' } + +# ============================================================================ +# SCHEDULED TASK INSTALLATION +# ============================================================================ + +if ($InstallScheduledTask) { + $taskName = "MSSQLMetricsExporter" + $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + + if (-not $existingTask) { + $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$($MyInvocation.MyCommand.Path)`" -Mode textfile" + + if (-not $TaskIntervalMinutes -or $TaskIntervalMinutes -le 0) { + throw "TaskIntervalMinutes must be a positive integer" + } + + $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365) + $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest + + try { + Write-Host "Creating scheduled task: $taskName" + Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Exports MSSQL metrics for Prometheus every $TaskIntervalMinutes minutes" + + $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue + if (-not $createdTask) { + throw "Failed to verify scheduled task creation" + } + Write-Host "Successfully created scheduled task: $taskName" -ForegroundColor Green + } catch { + Write-Error "Failed to create auto-start task: 
function Get-UnixTimestamp {
    <#
    .SYNOPSIS
        Returns the current time as whole seconds since 1970-01-01 00:00:00 UTC.
    .DESCRIPTION
        Computed from [datetime]::UtcNow rather than `Get-Date -UFormat '%s'`:
        on Windows PowerShell the latter is derived from local time, so the
        result is shifted by the machine's UTC offset. The value is floored
        and returned as [long], so it neither rounds up nor overflows Int32.
    .OUTPUTS
        System.Int64
    #>
    $epoch = New-Object System.DateTime 1970, 1, 1, 0, 0, 0, ([System.DateTimeKind]::Utc)
    [long][math]::Floor(([datetime]::UtcNow - $epoch).TotalSeconds)
}
function Get-DatabaseSizeMetrics {
    <#
    .SYNOPSIS
        Builds the Prometheus text for per-database data/log file size and
        log space usage.
    .DESCRIPTION
        Emits mssql_database_size_bytes, mssql_database_log_size_bytes and
        mssql_database_log_usage_percent, each labelled with {database}.
        Sizes come from sys.master_files for databases with state = 0;
        log usage comes from DBCC SQLPERF(LOGSPACE).
        Relies on Invoke-SqlQuery, Format-Label and Format-MetricValue
        defined elsewhere in this script. A failed or empty query simply
        omits that metric family.
    .OUTPUTS
        System.String - metric text (possibly empty).
    #>
    $sb = [System.Text.StringBuilder]::new()

    try {
        # mf.size is an INT count of 8 KB pages. Cast to BIGINT before summing
        # and multiplying, otherwise the arithmetic is done in INT and raises
        # an overflow error once a database's files exceed ~2 GiB.
        # ELSE 0 keeps the sums non-NULL for databases lacking a file type.
        $sizes = Invoke-SqlQuery "
            SELECT
                d.name AS database_name,
                SUM(CASE WHEN mf.type = 0 THEN CAST(mf.size AS BIGINT) ELSE 0 END) * 8 * 1024 AS data_size_bytes,
                SUM(CASE WHEN mf.type = 1 THEN CAST(mf.size AS BIGINT) ELSE 0 END) * 8 * 1024 AS log_size_bytes
            FROM sys.databases d
            JOIN sys.master_files mf ON d.database_id = mf.database_id
            WHERE d.state = 0
            GROUP BY d.name"

        if ($sizes) {
            [void]$sb.AppendLine('# HELP mssql_database_size_bytes Data file size per database in bytes')
            [void]$sb.AppendLine('# TYPE mssql_database_size_bytes gauge')
            foreach ($row in $sizes) {
                $dbName = Format-Label $row.database_name
                $dataSize = if ($row.data_size_bytes) { $row.data_size_bytes } else { 0 }
                [void]$sb.AppendLine("mssql_database_size_bytes{database=`"$dbName`"} $dataSize")
            }
            [void]$sb.AppendLine('')

            [void]$sb.AppendLine('# HELP mssql_database_log_size_bytes Log file size per database in bytes')
            [void]$sb.AppendLine('# TYPE mssql_database_log_size_bytes gauge')
            foreach ($row in $sizes) {
                $dbName = Format-Label $row.database_name
                $logSize = if ($row.log_size_bytes) { $row.log_size_bytes } else { 0 }
                [void]$sb.AppendLine("mssql_database_log_size_bytes{database=`"$dbName`"} $logSize")
            }
            [void]$sb.AppendLine('')
        }

        # Log usage percentage
        $logUsage = Invoke-SqlQuery "DBCC SQLPERF(LOGSPACE) WITH NO_INFOMSGS"
        if ($logUsage) {
            [void]$sb.AppendLine('# HELP mssql_database_log_usage_percent Log file usage percentage per database')
            [void]$sb.AppendLine('# TYPE mssql_database_log_usage_percent gauge')
            foreach ($row in $logUsage) {
                $dbName = Format-Label $row.'Database Name'
                $usage = Format-MetricValue $row.'Log Space Used (%)'
                [void]$sb.AppendLine("mssql_database_log_usage_percent{database=`"$dbName`"} $usage")
            }
            [void]$sb.AppendLine('')
        }
    }
    catch {
        Write-Warning "Failed to collect database size metrics: $_"
    }

    $sb.ToString()
}
'I' + WHERE d.database_id > 4 AND d.state = 0 + GROUP BY d.name" + + if ($diffBackups) { + [void]$sb.AppendLine('# HELP mssql_backup_age_diff_hours Hours since last differential backup per database (-1 = no backup)') + [void]$sb.AppendLine('# TYPE mssql_backup_age_diff_hours gauge') + foreach ($row in $diffBackups) { + $dbName = Format-Label $row.database_name + [void]$sb.AppendLine("mssql_backup_age_diff_hours{database=`"$dbName`"} $($row.hours_since_backup)") + } + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect backup metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# CONNECTION METRICS +# ============================================================================ + +function Get-ConnectionMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + # Active connections per database + $connections = Invoke-SqlQuery " + SELECT DB_NAME(database_id) AS database_name, COUNT(*) AS active_connections + FROM sys.dm_exec_sessions + WHERE is_user_process = 1 AND database_id > 0 + GROUP BY database_id" + + if ($connections) { + [void]$sb.AppendLine('# HELP mssql_connections_active Active connections per database') + [void]$sb.AppendLine('# TYPE mssql_connections_active gauge') + foreach ($row in $connections) { + $dbName = Format-Label $row.database_name + [void]$sb.AppendLine("mssql_connections_active{database=`"$dbName`"} $($row.active_connections)") + } + [void]$sb.AppendLine('') + } + + # Total active sessions + $totalSessions = Invoke-SqlQuery "SELECT COUNT(*) AS total FROM sys.dm_exec_sessions WHERE is_user_process = 1" + if ($totalSessions) { + [void]$sb.AppendLine('# HELP mssql_sessions_total Total active user sessions') + [void]$sb.AppendLine('# TYPE mssql_sessions_total gauge') + [void]$sb.AppendLine("mssql_sessions_total $($totalSessions.total)") + [void]$sb.AppendLine('') + } + + # Blocked sessions + $blocked = Invoke-SqlQuery "SELECT COUNT(*) AS blocked 
FROM sys.dm_exec_requests WHERE blocking_session_id > 0" + if ($blocked) { + [void]$sb.AppendLine('# HELP mssql_sessions_blocked Currently blocked sessions') + [void]$sb.AppendLine('# TYPE mssql_sessions_blocked gauge') + [void]$sb.AppendLine("mssql_sessions_blocked $($blocked.blocked)") + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect connection metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# WAIT STATISTICS +# ============================================================================ + +function Get-WaitMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $waits = Invoke-SqlQuery " + SELECT TOP 10 + wait_type, + wait_time_ms / 1000.0 AS wait_time_seconds, + waiting_tasks_count + FROM sys.dm_os_wait_stats + WHERE wait_type NOT IN ( + 'SLEEP_TASK', 'BROKER_TO_FLUSH', 'SQLTRACE_BUFFER_FLUSH', + 'CLR_AUTO_EVENT', 'CLR_MANUAL_EVENT', 'LAZYWRITER_SLEEP', + 'CHECKPOINT_QUEUE', 'WAITFOR', 'XE_TIMER_EVENT', + 'BROKER_EVENTHANDLER', 'FT_IFTS_SCHEDULER_IDLE_WAIT', + 'XE_DISPATCHER_WAIT', 'SQLTRACE_INCREMENTAL_FLUSH_SLEEP', + 'HADR_FILESTREAM_IOMGR_IOCOMPLETION', 'DIRTY_PAGE_POLL', + 'SP_SERVER_DIAGNOSTICS_SLEEP', 'BROKER_TASK_STOP', + 'HADR_LOGCAPTURE_WAIT', 'ONDEMAND_TASK_QUEUE', + 'DBMIRROR_EVENTS_QUEUE', 'QDS_PERSIST_TASK_MAIN_LOOP_SLEEP', + 'QDS_ASYNC_QUEUE', 'QDS_CLEANUP_STALE_QUERIES_TASK_MAIN_LOOP_SLEEP', + 'DISPATCHER_QUEUE_SEMAPHORE', 'REQUEST_FOR_DEADLOCK_SEARCH', + 'HADR_TIMER_TASK', 'BROKER_RECEIVE_WAITFOR', + 'PREEMPTIVE_XE_GETTARGETSTATE', 'PREEMPTIVE_XE_SESSIONCOMMIT', + 'SLEEP_BPOOL_FLUSH', 'SLEEP_DBSTARTUP', 'SLEEP_DCOMSTARTUP', + 'SLEEP_MASTERDBREADY', 'SLEEP_MASTERMDREADY', 'SLEEP_MASTERUPGRADED', + 'SLEEP_MSDBSTARTUP', 'SLEEP_SYSTEMTASK', 'SLEEP_TEMPDBSTARTUP', + 'SNI_HTTP_ACCEPT', 'WAIT_XTP_OFFLINE_CKPT_NEW_LOG' + ) + AND wait_time_ms > 0 + ORDER BY wait_time_ms DESC" + + if ($waits) { + [void]$sb.AppendLine('# HELP 
mssql_wait_time_seconds Cumulative wait time by wait type') + [void]$sb.AppendLine('# TYPE mssql_wait_time_seconds gauge') + foreach ($row in $waits) { + $waitType = Format-Label $row.wait_type + $waitTime = Format-MetricValue $row.wait_time_seconds + [void]$sb.AppendLine("mssql_wait_time_seconds{wait_type=`"$waitType`"} $waitTime") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP mssql_waiting_tasks Waiting task count by wait type') + [void]$sb.AppendLine('# TYPE mssql_waiting_tasks gauge') + foreach ($row in $waits) { + $waitType = Format-Label $row.wait_type + [void]$sb.AppendLine("mssql_waiting_tasks{wait_type=`"$waitType`"} $($row.waiting_tasks_count)") + } + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect wait metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# BUFFER CACHE METRICS +# ============================================================================ + +function Get-BufferCacheMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + # Buffer cache hit ratio + $hitRatio = Invoke-SqlQuery " + SELECT + CAST( + (SELECT cntr_value FROM sys.dm_os_performance_counters + WHERE counter_name = 'Buffer cache hit ratio' + AND object_name LIKE '%Buffer Manager%') * 100.0 + / + NULLIF((SELECT cntr_value FROM sys.dm_os_performance_counters + WHERE counter_name = 'Buffer cache hit ratio base' + AND object_name LIKE '%Buffer Manager%'), 0) + AS DECIMAL(5,2)) AS hit_ratio" + + if ($hitRatio -and $hitRatio.hit_ratio) { + [void]$sb.AppendLine('# HELP mssql_buffer_cache_hit_ratio Buffer cache hit ratio (0-100)') + [void]$sb.AppendLine('# TYPE mssql_buffer_cache_hit_ratio gauge') + [void]$sb.AppendLine("mssql_buffer_cache_hit_ratio $(Format-MetricValue $hitRatio.hit_ratio)") + [void]$sb.AppendLine('') + } + + # Page life expectancy + $ple = Invoke-SqlQuery " + SELECT cntr_value AS ple_seconds + FROM sys.dm_os_performance_counters + WHERE 
counter_name = 'Page life expectancy' + AND object_name LIKE '%Buffer Manager%'" + + if ($ple) { + [void]$sb.AppendLine('# HELP mssql_page_life_expectancy_seconds Page life expectancy in seconds') + [void]$sb.AppendLine('# TYPE mssql_page_life_expectancy_seconds gauge') + [void]$sb.AppendLine("mssql_page_life_expectancy_seconds $($ple.ple_seconds)") + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect buffer cache metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# SQL AGENT JOB METRICS +# ============================================================================ + +function Get-AgentJobMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $jobs = Invoke-SqlQuery " + SELECT + j.name AS job_name, + CASE h.run_status + WHEN 1 THEN 1 + ELSE 0 + END AS last_run_succeeded, + CASE + WHEN h.run_duration IS NULL THEN 0 + ELSE (h.run_duration / 10000) * 3600 + + ((h.run_duration / 100) % 100) * 60 + + (h.run_duration % 100) + END AS duration_seconds + FROM msdb.dbo.sysjobs j + OUTER APPLY ( + SELECT TOP 1 run_status, run_duration + FROM msdb.dbo.sysjobhistory + WHERE job_id = j.job_id AND step_id = 0 + ORDER BY run_date DESC, run_time DESC + ) h + WHERE j.enabled = 1" + + if ($jobs) { + [void]$sb.AppendLine('# HELP mssql_agent_job_status Last run outcome per job (1=success, 0=fail)') + [void]$sb.AppendLine('# TYPE mssql_agent_job_status gauge') + foreach ($row in $jobs) { + $jobName = Format-Label $row.job_name + [void]$sb.AppendLine("mssql_agent_job_status{job=`"$jobName`"} $($row.last_run_succeeded)") + } + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP mssql_agent_job_duration_seconds Last run duration per job in seconds') + [void]$sb.AppendLine('# TYPE mssql_agent_job_duration_seconds gauge') + foreach ($row in $jobs) { + $jobName = Format-Label $row.job_name + [void]$sb.AppendLine("mssql_agent_job_duration_seconds{job=`"$jobName`"} 
$($row.duration_seconds)") + } + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect SQL Agent job metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# TEMPDB METRICS +# ============================================================================ + +function Get-TempDbMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $tempdb = Invoke-SqlQuery " + SELECT + SUM(size) * 8 * 1024 AS total_size_bytes, + SUM(FILEPROPERTY(name, 'SpaceUsed')) * 8 * 1024 AS used_bytes + FROM tempdb.sys.database_files + WHERE type = 0" + + if ($tempdb) { + [void]$sb.AppendLine('# HELP mssql_tempdb_size_bytes TempDB total data file size in bytes') + [void]$sb.AppendLine('# TYPE mssql_tempdb_size_bytes gauge') + $totalSize = if ($tempdb.total_size_bytes) { $tempdb.total_size_bytes } else { 0 } + [void]$sb.AppendLine("mssql_tempdb_size_bytes $totalSize") + [void]$sb.AppendLine('') + + [void]$sb.AppendLine('# HELP mssql_tempdb_usage_bytes TempDB space used in bytes') + [void]$sb.AppendLine('# TYPE mssql_tempdb_usage_bytes gauge') + $usedBytes = if ($tempdb.used_bytes) { $tempdb.used_bytes } else { 0 } + [void]$sb.AppendLine("mssql_tempdb_usage_bytes $usedBytes") + [void]$sb.AppendLine('') + } + } + catch { + Write-Warning "Failed to collect TempDB metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# ERROR COUNT +# ============================================================================ + +function Get-ErrorMetrics { + $sb = [System.Text.StringBuilder]::new() + + try { + $errors = Invoke-SqlQuery " + SELECT COUNT(*) AS error_count + FROM sys.dm_exec_requests + WHERE status = 'running' AND total_elapsed_time > 0 + AND sql_handle IS NOT NULL" + + [void]$sb.AppendLine('# HELP mssql_error_count Active error count from current requests') + [void]$sb.AppendLine('# TYPE mssql_error_count gauge') + $count = if ($errors 
-and $errors.error_count) { $errors.error_count } else { 0 } + [void]$sb.AppendLine("mssql_error_count $count") + [void]$sb.AppendLine('') + } + catch { + Write-Warning "Failed to collect error metrics: $_" + } + + $sb.ToString() +} + +# ============================================================================ +# COLLECT ALL METRICS +# ============================================================================ + +function Get-AllMetrics { + $scriptStart = Get-Date + $sb = [System.Text.StringBuilder]::new() + + # Test connectivity + $testResult = Invoke-SqlQuery "SELECT 1 AS connected" + $isUp = if ($testResult) { 1 } else { 0 } + + [void]$sb.AppendLine('# HELP mssql_up SQL Server reachable (1=up, 0=down)') + [void]$sb.AppendLine('# TYPE mssql_up gauge') + [void]$sb.AppendLine("mssql_up $isUp") + [void]$sb.AppendLine('') + + if ($isUp -eq 1) { + [void]$sb.Append((Get-InstanceMetrics)) + [void]$sb.Append((Get-DatabaseSizeMetrics)) + [void]$sb.Append((Get-BackupMetrics)) + [void]$sb.Append((Get-ConnectionMetrics)) + [void]$sb.Append((Get-WaitMetrics)) + [void]$sb.Append((Get-BufferCacheMetrics)) + [void]$sb.Append((Get-AgentJobMetrics)) + [void]$sb.Append((Get-TempDbMetrics)) + [void]$sb.Append((Get-ErrorMetrics)) + } + + # Collector duration + $scriptEnd = Get-Date + $duration = Format-MetricValue ($scriptEnd - $scriptStart).TotalSeconds + + [void]$sb.AppendLine('# HELP mssql_collector_duration_seconds Time to generate all metrics') + [void]$sb.AppendLine('# TYPE mssql_collector_duration_seconds gauge') + [void]$sb.AppendLine("mssql_collector_duration_seconds $duration") + [void]$sb.AppendLine('') + + $sb.ToString() +} + +# ============================================================================ +# HTTP SERVER MODE +# ============================================================================ + +function Start-HttpServer { + param([int]$ListenPort) + + $prefix = "http://+:$ListenPort/" + $listener = [System.Net.HttpListener]::new() + 
$listener.Prefixes.Add($prefix) + + try { + $listener.Start() + Write-Host "Starting MSSQL metrics exporter on port $ListenPort..." -ForegroundColor Green + Write-Host "Metrics available at http://localhost:$ListenPort/metrics" + + while ($listener.IsListening) { + $context = $listener.GetContext() + $request = $context.Request + $response = $context.Response + + if ($request.Url.AbsolutePath -eq '/metrics') { + $metrics = Get-AllMetrics + $buffer = [System.Text.Encoding]::UTF8.GetBytes($metrics) + $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8' + } + else { + $html = @" + + +MSSQL Metrics Exporter v1.0 + +

MSSQL Metrics Exporter v1.0

+

Metrics

+

Sections

+
    +
  • Instance health (version, uptime)
  • +
  • Database sizes (data files, log files, log usage)
  • +
  • Backup freshness (full, differential)
  • +
  • Connections and sessions (active, blocked)
  • +
  • Wait statistics (top 10 waits)
  • +
  • Buffer cache (hit ratio, page life expectancy)
  • +
  • SQL Agent jobs (status, duration)
  • +
  • TempDB (size, usage)
  • +
+ + +"@ + $buffer = [System.Text.Encoding]::UTF8.GetBytes($html) + $response.ContentType = 'text/html; charset=utf-8' + } + + $response.ContentLength64 = $buffer.Length + $response.OutputStream.Write($buffer, 0, $buffer.Length) + $response.OutputStream.Close() + } + } + catch { + Write-Error "HTTP server error: $_" + Write-Error "If access denied, run: netsh http add urlacl url=http://+:$ListenPort/ user=Everyone" + } + finally { + if ($listener.IsListening) { + $listener.Stop() + } + } +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +switch ($Mode) { + 'http' { + Start-HttpServer -ListenPort $HttpPort + } + 'textfile' { + $outputFile = Join-Path $TextfileDir 'mssql_metrics.prom' + + $outputDir = Split-Path $outputFile -Parent + if (-not (Test-Path $outputDir)) { + New-Item -Path $outputDir -ItemType Directory -Force | Out-Null + } + + $tempFile = Join-Path $outputDir ".mssql_metrics.$PID.tmp" + + try { + $metrics = Get-AllMetrics + $metrics | Out-File -FilePath $tempFile -Encoding utf8 -NoNewline + + $lineCount = ($metrics -split "`n").Count + if ($lineCount -lt 5) { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Metrics file too small ($lineCount lines), keeping previous" + exit 1 + } + + Move-Item -Path $tempFile -Destination $outputFile -Force + Write-Host "Metrics written to $outputFile ($lineCount lines)" -ForegroundColor Green + } + catch { + Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue + Write-Error "Failed to generate metrics: $_" + exit 1 + } + } + default { + Get-AllMetrics | Write-Output + } +} diff --git a/mysql-exporter.sh b/mysql-exporter.sh new file mode 100644 index 0000000..8829dcd --- /dev/null +++ b/mysql-exporter.sh @@ -0,0 +1,432 @@ +#!/bin/bash +################################################################################ +# Script Name: 
mysql-exporter.sh +# Version: 1.0 +# Description: Prometheus textfile exporter for MySQL — server status, +# connections, QPS, slow queries, buffer pool, replication, +# thread counts, and table lock waits via mysql CLI +# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - mysql CLI client +# - Credentials via ~/.my.cnf or --user/--password flags +# - Network access to MySQL server +# - netcat (nc) for HTTP mode +# +# Usage: +# # Output to stdout +# ./mysql-exporter.sh +# +# # HTTP server mode +# ./mysql-exporter.sh --http -p 9207 +# +# # Textfile collector mode +# ./mysql-exporter.sh --textfile +# +# Metrics Exported: +# - mysql_overview_up - Exporter status (1=up, 0=down) +# - mysql_overview_version_info{version} - Server version +# - mysql_overview_uptime_seconds - Server uptime +# - mysql_overview_active_connections - Current active connections +# - mysql_overview_max_connections - Maximum allowed connections +# - mysql_overview_aborted_connections_total - Aborted connections +# - mysql_overview_aborted_clients_total - Aborted clients +# - mysql_overview_queries_total - Total queries executed +# - mysql_overview_slow_queries_total - Total slow queries +# - mysql_overview_tmp_disk_tables_total - Temp tables on disk +# - mysql_overview_threads_running - Running threads +# - mysql_overview_threads_cached - Cached threads +# - mysql_overview_threads_connected - Connected threads +# - mysql_overview_buffer_pool_size_bytes - Buffer pool total size +# - mysql_overview_buffer_pool_used_bytes - Buffer pool used size +# - mysql_overview_buffer_pool_hit_rate - Buffer pool hit rate +# - mysql_overview_table_lock_waits_total - Table lock waits +# - mysql_overview_slave_running - Replication running status +# - mysql_overview_seconds_behind_master - Replication lag +# - mysql_overview_relay_log_space_bytes - Relay log space +# - mysql_overview_binlog_size_bytes - Binary log size +# - 
mysql_overview_exporter_duration_seconds - Script execution time +# - mysql_overview_exporter_last_run_timestamp - Last successful run +# +# Configuration: +# Default HTTP port: 9207 +# Textfile directory: /var/lib/node_exporter +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9207 +MYSQL_USER="" +MYSQL_PASSWORD="" +MYSQL_HOST="127.0.0.1" +MYSQL_PORT="3306" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# Build mysql command with optional credentials +build_mysql_cmd() { + local cmd="mysql -h $MYSQL_HOST -P $MYSQL_PORT" + [ -n "$MYSQL_USER" ] && cmd="$cmd -u $MYSQL_USER" + [ -n "$MYSQL_PASSWORD" ] && cmd="$cmd -p$MYSQL_PASSWORD" + cmd="$cmd --batch --skip-column-names" + echo "$cmd" +} + +# Check if mysql client is available and server is reachable +check_mysql() { + if ! command -v mysql >/dev/null 2>&1; then + echo "ERROR: mysql client not found" >&2 + return 1 + fi + + local mysql_cmd + mysql_cmd=$(build_mysql_cmd) + if ! 
$mysql_cmd -e "SELECT 1" >/dev/null 2>&1; then + echo "ERROR: Cannot connect to MySQL server" >&2 + return 1 + fi + + return 0 +} + +# Execute a MySQL query and return the result +mysql_query() { + local query="$1" + local mysql_cmd + mysql_cmd=$(build_mysql_cmd) + $mysql_cmd -e "$query" 2>/dev/null +} + +# Get a specific global status variable +get_status_var() { + local var_name="$1" + mysql_query "SHOW GLOBAL STATUS LIKE '$var_name'" | awk '{print $2}' +} + +# Get a specific global variable +get_variable() { + local var_name="$1" + mysql_query "SHOW GLOBAL VARIABLES LIKE '$var_name'" | awk '{print $2}' +} + +# ============================================================================ +# METRIC GENERATION +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + if ! check_mysql; then + cat </dev/null) + uptime=$(get_status_var "Uptime") + max_conn=$(get_variable "max_connections") + active_conn=$(get_status_var "Threads_connected") + + local queries slow_queries tmp_disk_tables + queries=$(get_status_var "Queries") + slow_queries=$(get_status_var "Slow_queries") + tmp_disk_tables=$(get_status_var "Created_tmp_disk_tables") + + local threads_running threads_cached threads_connected + threads_running=$(get_status_var "Threads_running") + threads_cached=$(get_status_var "Threads_cached") + threads_connected=$(get_status_var "Threads_connected") + + local bp_size bp_pages_total bp_pages_free bp_read_requests bp_reads + bp_size=$(get_variable "innodb_buffer_pool_size") + bp_pages_total=$(get_status_var "Innodb_buffer_pool_pages_total") + bp_pages_free=$(get_status_var "Innodb_buffer_pool_pages_free") + bp_read_requests=$(get_status_var "Innodb_buffer_pool_read_requests") + bp_reads=$(get_status_var "Innodb_buffer_pool_reads") + + local table_lock_waits aborted_conn aborted_clients + table_lock_waits=$(get_status_var "Table_locks_waited") + aborted_conn=$(get_status_var 
"Aborted_connects") + aborted_clients=$(get_status_var "Aborted_clients") + + # Calculate buffer pool metrics + local bp_used_pages bp_used_bytes bp_hit_rate + bp_pages_total=${bp_pages_total:-0} + bp_pages_free=${bp_pages_free:-0} + bp_used_pages=$((bp_pages_total - bp_pages_free)) + # InnoDB page size is 16384 bytes by default + bp_used_bytes=$((bp_used_pages * 16384)) + + bp_read_requests=${bp_read_requests:-0} + bp_reads=${bp_reads:-0} + if [ "$bp_read_requests" -gt 0 ] 2>/dev/null; then + bp_hit_rate=$(awk "BEGIN {printf \"%.6f\", 1 - ($bp_reads / $bp_read_requests)}" 2>/dev/null || echo "0") + else + bp_hit_rate="1.000000" + fi + + # Replication status + local slave_running seconds_behind relay_log_space + local repl_output + repl_output=$(mysql_query "SHOW SLAVE STATUS\G" 2>/dev/null) + if [ -n "$repl_output" ]; then + slave_running=$(echo "$repl_output" | grep "Slave_SQL_Running:" | awk '{print $2}') + [ "$slave_running" = "Yes" ] && slave_running=1 || slave_running=0 + seconds_behind=$(echo "$repl_output" | grep "Seconds_Behind_Master:" | awk '{print $2}') + [ "$seconds_behind" = "NULL" ] && seconds_behind=-1 + relay_log_space=$(echo "$repl_output" | grep "Relay_Log_Space:" | awk '{print $2}') + else + slave_running=0 + seconds_behind=-1 + relay_log_space=0 + fi + + # Binary log size + local binlog_size + binlog_size=$(mysql_query "SHOW BINARY LOGS" 2>/dev/null | awk '{sum += $2} END {print sum+0}') + + # Output all metrics + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + while true; do + { + read -r request + if [[ "$request" =~ ^GET\ /metrics ]]; then + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\n\r" + generate_metrics + else + echo -e "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r" + cat < + +MySQL Exporter + +

MySQL Prometheus Exporter

+

Metrics

+ + +EOF + fi + } | nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.mysql_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 10 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/n8n-exporter.sh b/n8n-exporter.sh new file mode 100755 index 0000000..e8b0d67 --- /dev/null +++ b/n8n-exporter.sh @@ -0,0 +1,446 @@ +#!/bin/bash +################################################################################ +# Script Name: n8n-exporter.sh +# Version: 1.0 +# Description: Prometheus exporter for n8n workflow automation — pulls built-in +# /metrics data and supplements with API-sourced per-workflow info, +# execution stats, credential counts, and health checks. Designed +# for node_exporter textfile collector. 
+# +# Author: Phil Connor +# Contact: contact@mylinux.work +# Website: https://mylinux.work +# License: MIT +# +# Prerequisites: +# - n8n with N8N_METRICS=true (for /metrics endpoint) +# - curl and jq +# - n8n API key (for supplemental metrics — optional) +# - netcat (nc) for HTTP mode +# +# Usage: +# ./n8n-exporter.sh # stdout +# ./n8n-exporter.sh --textfile # node_exporter textfile +# ./n8n-exporter.sh --http -p 9200 # HTTP server +# ./n8n-exporter.sh --url http://n8n:5678 --api-key KEY +# +################################################################################ + +# ============================================================================ +# CONFIGURATION VARIABLES +# ============================================================================ + +TEXTFILE_DIR="/var/lib/node_exporter" +OUTPUT_FILE="" +HTTP_MODE=false +HTTP_PORT=9200 + +# Source environment file if present (for cron deployments) +[ -f /etc/default/n8n-exporter ] && . /etc/default/n8n-exporter + +N8N_URL="${N8N_URL:-http://localhost:5678}" +N8N_API_KEY="${N8N_API_KEY:-}" + +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +prom_escape() { + local s="$1" + s=${s//\\/\\\\} + s=${s//\"/\\\"} + s=${s//$'\n'/\\n} + printf '%s\n' "$s" +} + +show_usage() { + cat <&2; exit 1 ;; + esac + done +} + +# ============================================================================ +# METRICS GENERATION +# ============================================================================ + +generate_metrics() { + local script_start + script_start=$(date +%s) + + # ======================================================================== + # Prerequisite Check + # ======================================================================== + + if ! 
command -v curl >/dev/null 2>&1; then + echo "# ERROR: curl is required but not found" >&2 + cat </dev/null 2>&1; then + echo "# WARNING: jq not found, skipping supplemental API metrics" >&2 + N8N_API_KEY="" + fi + + # ======================================================================== + # Exporter Identity + # ======================================================================== + local exporter_up=1 + + cat </dev/null) + + if [ -n "$builtin_metrics" ]; then + echo "# ── Built-in n8n metrics (/metrics) ──" + echo "$builtin_metrics" + echo "" + else + echo "# WARNING: could not read ${N8N_URL}/metrics (check n8n is running with N8N_METRICS=true)" >&2 + exporter_up=0 + fi + + # ======================================================================== + # Step 2: Health Check (/healthz) + # ======================================================================== + local health_status + local health_http_code + health_http_code=$(curl -s --max-time 5 -o /dev/null -w '%{http_code}' "${N8N_URL}/healthz" 2>/dev/null) + + if [ $? 
-eq 0 ] && [ "$health_http_code" = "200" ]; then + health_status=1 + else + health_status=0 + fi + + cat <&2 + else + local api_header="X-N8N-API-KEY: ${N8N_API_KEY}" + + # ==================================================================== + # Step 3: Workflow Info (/api/v1/workflows) + # ==================================================================== + local workflows_json + workflows_json=$(curl -sf --max-time 10 -H "$api_header" "${N8N_URL}/api/v1/workflows?limit=250" 2>/dev/null) + + local workflow_info_lines="" + + if [ -n "$workflows_json" ] && [ "$workflows_json" != "null" ]; then + local workflow_data + workflow_data=$(echo "$workflows_json" | jq -r ' + .data // [] | .[] | + "\(.id)\t\(.name)\t\(.active)\t\((.tags // []) | map(.name) | join(","))" + ' 2>/dev/null) + + if [ -n "$workflow_data" ]; then + while IFS=$'\t' read -r wf_id wf_name wf_active wf_tags; do + [ -z "$wf_id" ] && continue + local esc_id esc_name esc_active esc_tags + esc_id=$(prom_escape "$wf_id") + esc_name=$(prom_escape "$wf_name") + esc_active=$(prom_escape "$wf_active") + esc_tags=$(prom_escape "$wf_tags") + workflow_info_lines="${workflow_info_lines}n8n_exporter_workflow_info{id=\"${esc_id}\",name=\"${esc_name}\",active=\"${esc_active}\",tags=\"${esc_tags}\"} 1 +" + done <<< "$workflow_data" + fi + else + echo "# WARNING: could not read /api/v1/workflows" >&2 + fi + + if [ -n "$workflow_info_lines" ]; then + echo "# HELP n8n_exporter_workflow_info Workflow information (always 1)" + echo "# TYPE n8n_exporter_workflow_info gauge" + printf '%s' "$workflow_info_lines" + echo "" + fi + + # ==================================================================== + # Step 4: Execution Stats (/api/v1/executions) + # ==================================================================== + local executions_json + executions_json=$(curl -sf --max-time 10 -H "$api_header" "${N8N_URL}/api/v1/executions?limit=100" 2>/dev/null) + + local last_exec_lines="" + local error_count_lines="" + + if [ -n 
"$executions_json" ] && [ "$executions_json" != "null" ]; then + # Extract per-workflow last execution timestamp and error counts + local exec_stats + exec_stats=$(echo "$executions_json" | jq -r ' + .data // [] | + group_by(.workflowId) | .[] | + { + id: .[0].workflowId, + name: (.[0].workflowData.name // "unknown"), + last_finished: (map(select(.stoppedAt != null) | .stoppedAt) | sort | last // ""), + errors: (map(select(.status == "error" or .status == "failed")) | length) + } | + "\(.id)\t\(.name)\t\(.last_finished)\t\(.errors)" + ' 2>/dev/null) + + if [ -n "$exec_stats" ]; then + while IFS=$'\t' read -r ex_wf_id ex_wf_name ex_last_ts ex_errors; do + [ -z "$ex_wf_id" ] && continue + local esc_ex_id esc_ex_name + esc_ex_id=$(prom_escape "$ex_wf_id") + esc_ex_name=$(prom_escape "$ex_wf_name") + local labels="id=\"${esc_ex_id}\",name=\"${esc_ex_name}\"" + + # Last execution timestamp + if [ -n "$ex_last_ts" ] && [ "$ex_last_ts" != "null" ] && [ "$ex_last_ts" != "" ]; then + local epoch_ts + epoch_ts=$(date -d "$ex_last_ts" +%s 2>/dev/null) + if [ -n "$epoch_ts" ]; then + last_exec_lines="${last_exec_lines}n8n_exporter_workflow_last_execution_timestamp{${labels}} ${epoch_ts} +" + fi + fi + + # Error count + ex_errors=${ex_errors:-0} + error_count_lines="${error_count_lines}n8n_exporter_workflow_errors_recent{${labels}} ${ex_errors} +" + done <<< "$exec_stats" + fi + else + echo "# WARNING: could not read /api/v1/executions" >&2 + fi + + if [ -n "$last_exec_lines" ]; then + echo "# HELP n8n_exporter_workflow_last_execution_timestamp Unix timestamp of last finished execution per workflow" + echo "# TYPE n8n_exporter_workflow_last_execution_timestamp gauge" + printf '%s' "$last_exec_lines" + echo "" + fi + + if [ -n "$error_count_lines" ]; then + echo "# HELP n8n_exporter_workflow_errors_recent Errored executions per workflow (from last 100 executions)" + echo "# TYPE n8n_exporter_workflow_errors_recent gauge" + printf '%s' "$error_count_lines" + echo "" + fi + + # 
==================================================================== + # Step 5: Credentials by Type (/api/v1/credentials) + # ==================================================================== + local credentials_json + credentials_json=$(curl -sf --max-time 10 -H "$api_header" "${N8N_URL}/api/v1/credentials?limit=250" 2>/dev/null) + + local cred_type_lines="" + + if [ -n "$credentials_json" ] && [ "$credentials_json" != "null" ]; then + local cred_counts + cred_counts=$(echo "$credentials_json" | jq -r ' + .data // [] | + group_by(.type) | .[] | + "\(.[0].type)\t\(length)" + ' 2>/dev/null) + + if [ -n "$cred_counts" ]; then + while IFS=$'\t' read -r cred_type cred_count; do + [ -z "$cred_type" ] && continue + local esc_cred_type + esc_cred_type=$(prom_escape "$cred_type") + cred_type_lines="${cred_type_lines}n8n_exporter_credentials_by_type{type=\"${esc_cred_type}\"} ${cred_count} +" + done <<< "$cred_counts" + fi + else + echo "# WARNING: could not read /api/v1/credentials" >&2 + fi + + if [ -n "$cred_type_lines" ]; then + echo "# HELP n8n_exporter_credentials_by_type Number of credentials by type" + echo "# TYPE n8n_exporter_credentials_by_type gauge" + printf '%s' "$cred_type_lines" + echo "" + fi + fi + + # ======================================================================== + # Exporter Runtime + # ======================================================================== + local script_end script_duration + script_end=$(date +%s) + script_duration=$((script_end - script_start)) + + cat <&2 + + if ! command -v nc >/dev/null 2>&1; then + echo "ERROR: netcat (nc) required for HTTP mode" >&2 + exit 1 + fi + + trap 'echo "Shutting down n8n exporter..." 
>&2; exit 0' INT TERM + + while true; do + { + read -r request + local body + if [[ "$request" =~ ^GET\ /metrics ]]; then + body=$(generate_metrics) + printf "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + else + body=$(cat <<'HTMLEOF' + + +n8n Exporter v1.0 + +

n8n Exporter v1.0

+

Metrics

+

Sections

+
    +
  • Built-in n8n /metrics (version, workflows, executions, cache, queue)
  • +
  • Health check status
  • +
  • Per-workflow info (name, active, tags)
  • +
  • Per-workflow execution stats (last run, error count)
  • +
  • Credentials by type
  • +
+ + +HTMLEOF +) + printf "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: %d\r\nConnection: close\r\n\r\n%s" "${#body}" "$body" + fi + } | if nc -h 2>&1 | grep -q 'GNU\|traditional'; then + nc -l -p "$HTTP_PORT" -q 1 2>/dev/null + else + nc -l "$HTTP_PORT" 2>/dev/null + fi + done +} + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +main() { + parse_args "$@" + + if [ "$HTTP_MODE" = true ]; then + run_http_server + elif [ -n "$OUTPUT_FILE" ]; then + local output_dir + output_dir="$(dirname "$OUTPUT_FILE")" + mkdir -p "$output_dir" + + local temp_file + temp_file=$(mktemp "${output_dir}/.n8n_metrics.XXXXXX") + + if ! generate_metrics > "$temp_file" 2>/dev/null; then + rm -f "$temp_file" + echo "ERROR: Failed to generate metrics" >&2 + exit 1 + fi + + local file_lines + file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0) + + if [ "$file_lines" -lt 3 ]; then + rm -f "$temp_file" + echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2 + exit 1 + fi + + chmod 644 "$temp_file" + mv -f "$temp_file" "$OUTPUT_FILE" + + echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2 + else + generate_metrics + fi +} + +main "$@" diff --git a/nagios-core-installer.sh b/nagios-core-installer.sh new file mode 100644 index 0000000..e6c03fd --- /dev/null +++ b/nagios-core-installer.sh @@ -0,0 +1,588 @@ +#!/bin/bash +############################################################# +#### Nagios Core Installer #### +#### Automated source compilation of Nagios Core, #### +#### plugins, web server config, and systemd setup #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 2.0 #### +#### #### +#### Usage: ./nagios-core-installer.sh [OPTIONS] #### +############################################################# +# +# Supported operating 
systems: +# - Ubuntu 20.04, 22.04, 24.04 +# - Debian 11, 12 +# - RHEL 8, 9 +# - Rocky Linux 8, 9 +# - Alma Linux 8, 9 +# +# What this script does: +# 1. Installs build dependencies +# 2. Creates nagios user and nagcmd group +# 3. Downloads and compiles Nagios Core from source +# 4. Downloads and compiles Nagios Plugins +# 5. Configures Apache or Nginx with authentication +# 6. Optionally installs NRPE daemon +# 7. Enables and starts the systemd service +# 8. Validates configuration +# +set -euo pipefail + +######################### +### Configuration ### +######################### + +NAGIOS_VERSION="${NAGIOS_VERSION:-4.5.9}" +PLUGINS_VERSION="${PLUGINS_VERSION:-2.4.12}" +NRPE_VERSION="${NRPE_VERSION:-4.1.3}" +WEBSERVER="${WEBSERVER:-apache}" +NAGIOS_ADMIN_USER="${NAGIOS_ADMIN_USER:-nagiosadmin}" +NAGIOS_ADMIN_PASS="" +INSTALL_NRPE=false +NAGIOS_HOME="/usr/local/nagios" +BUILD_DIR="/tmp/nagios-build" + +######################### +### Logging ### +######################### + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +log_step() { + echo -e "${BLUE}[STEP]${NC} $1" +} + +######################### +### Parse Arguments ### +######################### + +show_help() { + cat </dev/null; then + groupadd nagcmd + log_info "Created group: nagcmd" + fi + + if ! 
id nagios &>/dev/null; then + useradd -r -g nagcmd -d "$NAGIOS_HOME" -s /bin/false nagios + log_info "Created user: nagios" + fi + + # Add web server user to nagcmd group + local www_user + if [[ "$OS_FAMILY" == "debian" ]]; then + www_user="www-data" + else + www_user="apache" + [[ "$WEBSERVER" == "nginx" ]] && www_user="nginx" + fi + + usermod -aG nagcmd "$www_user" 2>/dev/null || true + usermod -aG nagcmd nagios 2>/dev/null || true + + log_info "User setup complete" +} + +######################### +### Compile Nagios ### +######################### + +compile_nagios() { + log_step "Downloading and compiling Nagios Core $NAGIOS_VERSION..." + + mkdir -p "$BUILD_DIR" + cd "$BUILD_DIR" + + local tarball="nagios-${NAGIOS_VERSION}.tar.gz" + local url="https://github.com/NagiosEnterprises/nagioscore/releases/download/nagios-${NAGIOS_VERSION}/${tarball}" + + if [[ ! -f "$tarball" ]]; then + wget -q "$url" -O "$tarball" + fi + + tar xzf "$tarball" + cd "nagios-${NAGIOS_VERSION}" + + ./configure --with-command-group=nagcmd \ + --with-httpd-conf=/etc/apache2/sites-enabled \ + --with-nagios-user=nagios \ + --with-nagios-group=nagcmd \ + > /dev/null 2>&1 + + make all > /dev/null 2>&1 + make install > /dev/null 2>&1 + make install-init > /dev/null 2>&1 + make install-commandmode > /dev/null 2>&1 + make install-config > /dev/null 2>&1 + + if [[ "$WEBSERVER" == "apache" ]]; then + make install-webconf > /dev/null 2>&1 + fi + + log_info "Nagios Core $NAGIOS_VERSION compiled and installed to $NAGIOS_HOME" +} + +######################### +### Compile Plugins ### +######################### + +compile_plugins() { + log_step "Downloading and compiling Nagios Plugins $PLUGINS_VERSION..." + + cd "$BUILD_DIR" + + local tarball="nagios-plugins-${PLUGINS_VERSION}.tar.gz" + local url="https://github.com/nagios-plugins/nagios-plugins/releases/download/release-${PLUGINS_VERSION}/${tarball}" + + if [[ ! 
-f "$tarball" ]]; then + wget -q "$url" -O "$tarball" + fi + + tar xzf "$tarball" + cd "nagios-plugins-${PLUGINS_VERSION}" + + ./configure --with-nagios-user=nagios --with-nagios-group=nagcmd \ + > /dev/null 2>&1 + + make > /dev/null 2>&1 + make install > /dev/null 2>&1 + + log_info "Nagios Plugins $PLUGINS_VERSION compiled and installed" +} + +######################### +### Web Server Config ### +######################### + +configure_apache() { + log_step "Configuring Apache for Nagios..." + + if [[ "$OS_FAMILY" == "debian" ]]; then + a2enmod rewrite cgi 2>/dev/null || true + + cat > /etc/apache2/sites-enabled/nagios.conf <<'APACHECONF' +ScriptAlias /nagios/cgi-bin "/usr/local/nagios/sbin" +Alias /nagios "/usr/local/nagios/share" + + + Options ExecCGI + AllowOverride None + AuthType Basic + AuthName "Nagios Access" + AuthUserFile /usr/local/nagios/etc/htpasswd.users + Require valid-user + + + + Options None + AllowOverride None + AuthType Basic + AuthName "Nagios Access" + AuthUserFile /usr/local/nagios/etc/htpasswd.users + Require valid-user + +APACHECONF + else + cat > /etc/httpd/conf.d/nagios.conf <<'APACHECONF' +ScriptAlias /nagios/cgi-bin "/usr/local/nagios/sbin" +Alias /nagios "/usr/local/nagios/share" + + + Options ExecCGI + AllowOverride None + AuthType Basic + AuthName "Nagios Access" + AuthUserFile /usr/local/nagios/etc/htpasswd.users + Require valid-user + + + + Options None + AllowOverride None + AuthType Basic + AuthName "Nagios Access" + AuthUserFile /usr/local/nagios/etc/htpasswd.users + Require valid-user + +APACHECONF + fi + + log_info "Apache configured" +} + +configure_nginx() { + log_step "Configuring Nginx for Nagios..." 
+ + local php_sock + if [[ "$OS_FAMILY" == "debian" ]]; then + php_sock=$(find /var/run/php/ -name "php*-fpm.sock" 2>/dev/null | head -1) + [[ -z "$php_sock" ]] && php_sock="/var/run/php/php-fpm.sock" + else + php_sock="/run/php-fpm/www.sock" + fi + + cat > /etc/nginx/conf.d/nagios.conf < /dev/null 2>&1 + + make all > /dev/null 2>&1 + make install > /dev/null 2>&1 + make install-config > /dev/null 2>&1 + make install-init > /dev/null 2>&1 + + systemctl enable nrpe + systemctl start nrpe + + log_info "NRPE $NRPE_VERSION installed and started" +} + +######################### +### Systemd Setup ### +######################### + +setup_systemd() { + log_step "Configuring systemd services..." + + systemctl daemon-reload + + # Enable and start Nagios + systemctl enable nagios + systemctl start nagios + + # Enable and restart web server + if [[ "$WEBSERVER" == "apache" ]]; then + if [[ "$OS_FAMILY" == "debian" ]]; then + systemctl enable apache2 + systemctl restart apache2 + else + systemctl enable httpd + systemctl restart httpd + fi + else + systemctl enable nginx + systemctl restart nginx + systemctl enable fcgiwrap 2>/dev/null || true + systemctl start fcgiwrap 2>/dev/null || true + systemctl enable php-fpm 2>/dev/null || systemctl enable "php*-fpm" 2>/dev/null || true + systemctl restart php-fpm 2>/dev/null || systemctl restart "php*-fpm" 2>/dev/null || true + fi + + log_info "Systemd services enabled and started" +} + +######################### +### Validate Config ### +######################### + +validate_config() { + log_step "Validating Nagios configuration..." 
+ + if "$NAGIOS_HOME/bin/nagios" -v "$NAGIOS_HOME/etc/nagios.cfg" > /dev/null 2>&1; then + log_info "Configuration validation passed" + else + log_warn "Configuration validation returned warnings — review with:" + log_warn " $NAGIOS_HOME/bin/nagios -v $NAGIOS_HOME/etc/nagios.cfg" + fi +} + +######################### +### Cleanup ### +######################### + +cleanup() { + log_step "Cleaning up build directory..." + rm -rf "$BUILD_DIR" + log_info "Build directory removed" +} + +######################### +### Summary ### +######################### + +show_summary() { + local ip + ip=$(hostname -I 2>/dev/null | awk '{print $1}') + [[ -z "$ip" ]] && ip="" + + echo "" + echo "=============================================" + echo " Nagios Core Installation Complete" + echo "=============================================" + echo "" + echo " Nagios Core: $NAGIOS_VERSION" + echo " Plugins: $PLUGINS_VERSION" + echo " Web Server: $WEBSERVER" + echo " Install Path: $NAGIOS_HOME" + echo "" + echo " Web Interface: http://${ip}/nagios" + echo " Username: $NAGIOS_ADMIN_USER" + echo "" + if [[ "$INSTALL_NRPE" == "true" ]]; then + echo " NRPE: $NRPE_VERSION (installed)" + fi + echo "" + echo " Config dir: $NAGIOS_HOME/etc/" + echo " Validate: $NAGIOS_HOME/bin/nagios -v $NAGIOS_HOME/etc/nagios.cfg" + echo " Service: systemctl status nagios" + echo "" + echo "=============================================" +} + +######################### +### Main ### +######################### + +main() { + parse_args "$@" + check_root + detect_os + prompt_password + install_dependencies + create_nagios_user + compile_nagios + compile_plugins + configure_webserver + install_nrpe + setup_systemd + validate_config + cleanup + show_summary +} + +main "$@" diff --git a/nagios-to-prometheus-bridge.sh b/nagios-to-prometheus-bridge.sh new file mode 100644 index 0000000..e613ce6 --- /dev/null +++ b/nagios-to-prometheus-bridge.sh @@ -0,0 +1,260 @@ +#!/bin/bash 
+############################################################# +#### Nagios to Prometheus Bridge #### +#### Run Nagios check plugins and convert output to #### +#### Prometheus textfile collector format #### +#### #### +#### Author: Phil Connor #### +#### Contact: contact@mylinux.work #### +#### License: MIT #### +#### Version: 1.0 #### +#### #### +#### Usage: ./nagios-to-prometheus-bridge.sh [OPTIONS] #### +############################################################# +# +# Converts Nagios plugin output (exit codes + performance data) +# into Prometheus .prom files for the node_exporter textfile collector. +# +# Metrics generated: +# - nagios_check_status (exit code: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN) +# - nagios_check_duration_seconds (execution time) +# - nagios_check_last_run_timestamp (unix timestamp) +# - nagios_check_perfdata_