Add all 44 scripts, update CI: error severity baseline, PowerShell validation, multi-distro testing
Amp-Thread-ID: https://ampcode.com/threads/T-019cc404-c628-759e-a50b-f5eeea35b91f Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
+513
@@ -0,0 +1,513 @@
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Monitors Salt Minion service status and exports metrics for Prometheus windows_exporter.
|
||||
|
||||
.DESCRIPTION
|
||||
This script checks the status of the Salt Minion service and creates Prometheus-formatted metrics.
|
||||
The metrics are written to a text file that can be consumed by the windows_exporter.
|
||||
It can also create a scheduled task to run periodically.
|
||||
|
||||
.PARAMETER ValidateNotNullOrEmpty
|
||||
Switch to validate that the MetricsFilePath parameter is not null or empty.
|
||||
|
||||
.PARAMETER ValidateScript
|
||||
Validate that the MetricsFilePath parameter is a valid Windows path.
|
||||
|
||||
.PARAMETER MetricsFilePath
|
||||
The path where the Prometheus metrics file will be written.
|
||||
|
||||
.PARAMETER InstallScheduledTask
|
||||
Switch to create a scheduled task for periodic monitoring.
|
||||
|
||||
.PARAMETER TaskIntervalMinutes
|
||||
The interval in minutes for the scheduled task. Default is 15 minutes.
|
||||
|
||||
.PARAMETER TimeoutSeconds
|
||||
Timeout in seconds for service status checks. Default is 30 seconds.
|
||||
|
||||
.PARAMETER TimeoutSeconds
|
||||
Timeout in seconds for service status checks. Default is 30 seconds.
|
||||
|
||||
.PARAMETER SaltMasterPort
|
||||
The port number for the Salt Master. Default is 4505.
|
||||
|
||||
.PARAMETER DryRun
|
||||
Switch to output metrics to console instead of writing to file.
|
||||
|
||||
.PARAMETER Verbose
|
||||
Switch to enable verbose debug output for troubleshooting.
|
||||
|
||||
.PARAMETER Quiet
|
||||
Switch to suppress non-error output (useful for scheduled tasks).
|
||||
|
||||
.PARAMETER NoCron
|
||||
Switch to skip scheduled task installation.
|
||||
|
||||
.PARAMETER Version
|
||||
Switch to display script version and exit.
|
||||
|
||||
.NOTES
|
||||
Version: 3.3.0-20250915
|
||||
Author: Phil Connor, contact@mylinux.work
|
||||
License: MIT
|
||||
Created: 2025-01-24, loosely based on my salt_status.sh used with the Linux servers.
|
||||
#>
|
||||
|
||||
param(
    # NOTE: keep this a simple (non-advanced) script. Adding [CmdletBinding()] or a
    # [Parameter()] attribute would introduce the common -Verbose parameter, which
    # collides with the custom $Verbose switch declared below.

    # Destination of the Prometheus text file. An explicitly supplied value must be
    # a drive-letter path (e.g. C:\...) whose parent directory already exists.
    [ValidateNotNullOrEmpty()]
    [ValidateScript({
        $parentPath = Split-Path $_ -Parent
        if ($parentPath -and -not (Test-Path $parentPath)) {
            throw "Directory does not exist: $parentPath"
        }
        if ($_ -match '^[A-Za-z]:\\') {
            return $true
        }
        throw "Invalid file path format"
    })]
    [string]$MetricsFilePath = "$env:ProgramFiles\windows_exporter\textfile_inputs\salt_status.prom",

    # Register a scheduled task that re-runs this script periodically
    [switch]$InstallScheduledTask = $false,

    # Scheduled task interval: 1 to 1440 minutes
    [ValidateRange(1, 1440)]
    [int]$TaskIntervalMinutes = 15,

    # Timeout for service status checks: 1 to 300 seconds
    [ValidateRange(1, 300)]
    [int]$TimeoutSeconds = 30,

    # Salt Master port; must be a valid TCP port number
    [ValidateRange(1, 65535)]
    [int]$SaltMasterPort = 4505,

    [switch]$DryRun = $false,   # Output metrics to console instead of file
    [switch]$Verbose = $false,  # Enable verbose debug output
    [switch]$Quiet = $false,    # Suppress non-error output
    [switch]$NoCron = $false,   # Skip scheduled task installation
    [switch]$Version = $false   # Show version and exit
)
|
||||
|
||||
# -Version: print the script identity banner and exit before doing any work.
# NOTE(review): the contact address printed here differs from the one in the
# .NOTES help section - confirm which one is current.
if ($Version) {
    @(
        "Salt Status Monitor PowerShell Script"
        "Version: 3.3.0-20250915"
        "Author: Phil Connor pconnor@ara.com"
    ) | ForEach-Object { Write-Host $_ }
    exit 0
}
|
||||
|
||||
# Translate the -Verbose / -Quiet switches into PowerShell stream preferences.
# -Quiet is applied last, so it wins when both switches are supplied.
if ($Verbose) {
    $VerbosePreference = $InformationPreference = 'Continue'
}
if ($Quiet) {
    $VerbosePreference = $InformationPreference = $WarningPreference = 'SilentlyContinue'
}
|
||||
|
||||
# Logging functions
|
||||
function Write-VerboseLog {
    # Writes a timestamped, cyan "[VERBOSE]" line to the host, but only when the
    # script-level $Verbose switch is set; otherwise it does nothing.
    param([string]$Message)

    if (-not $Verbose) { return }

    $stamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
    Write-Host ('[VERBOSE] {0} {1}' -f $stamp, $Message) -ForegroundColor Cyan
}
|
||||
|
||||
function Write-InfoLog {
    # Writes a timestamped, green "[INFO]" line to the host unless the
    # script-level $Quiet switch is set.
    param([string]$Message)

    if ($Quiet) { return }

    $stamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
    Write-Host ('[INFO] {0} {1}' -f $stamp, $Message) -ForegroundColor Green
}
|
||||
|
||||
# Optionally register a SYSTEM scheduled task that re-runs this script every
# $TaskIntervalMinutes minutes. Skipped when -NoCron is given or when a task
# with the same name already exists.
if ($InstallScheduledTask -and -not $NoCron) {
    $taskName = "SaltMinionStatusCheck"
    $existingTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue

    if (-not $existingTask) {
        # Forward the effective settings so the scheduled runs behave like this
        # invocation instead of silently falling back to the script defaults.
        $taskArguments = @(
            '-NoProfile'
            '-ExecutionPolicy Bypass'
            "-File `"$($MyInvocation.MyCommand.Path)`""
            "-MetricsFilePath `"$MetricsFilePath`""
            "-TimeoutSeconds $TimeoutSeconds"
            "-SaltMasterPort $SaltMasterPort"
        ) -join ' '
        $taskAction = New-ScheduledTaskAction -Execute "powershell.exe" -Argument $taskArguments

        # Defensive re-check; the ValidateRange attribute on the parameter
        # already rejects non-positive values supplied on the command line.
        if ($TaskIntervalMinutes -le 0) {
            throw "TaskIntervalMinutes must be a positive integer"
        }

        # First run one minute from now, then repeat at the requested interval.
        # NOTE: the repetition window is 365 days, so the task stops repeating
        # after one year unless it is re-registered.
        $taskTrigger = New-ScheduledTaskTrigger -Once -At (Get-Date).AddMinutes(1) -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) -RepetitionDuration (New-TimeSpan -Days 365)
        $taskPrincipal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest

        try {
            Write-InfoLog "Creating scheduled task: $taskName"
            # Discard the returned task object so it does not leak into the script output.
            Register-ScheduledTask -TaskName $taskName -Action $taskAction -Trigger $taskTrigger -Principal $taskPrincipal -Description "Monitors Salt Minion status every $TaskIntervalMinutes minutes" | Out-Null

            # Verify the task was created
            $createdTask = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue
            if (-not $createdTask) {
                throw "Failed to verify scheduled task creation"
            }
            Write-InfoLog "Successfully created scheduled task: $taskName"
        } catch {
            Write-Error "Failed to create auto-start task: $($_.Exception.Message)"
            throw
        }
    } else {
        Write-InfoLog "Scheduled task $taskName already exists. Skipping creation."
    }
}
|
||||
|
||||
# Reports whether a command can be resolved in the current session
function Test-CommandAvailability {
    # Returns $true when Get-Command resolves $Command without error; otherwise
    # writes a warning naming the command and returns $false.
    param([string]$Command)

    $isAvailable = $false
    try {
        $null = Get-Command $Command -ErrorAction Stop
        $isAvailable = $true
    } catch {
        Write-Warning "Required command '$Command' is not available"
    }
    return $isAvailable
}
|
||||
|
||||
# Function to check whether netstat lists a socket on the salt-master port
function Test-Port4505Connection {
    # Scans "netstat -an" output for an address ending in ":<Port>".
    # Parameters:
    #   Port - port number to look for; defaults to the script-level $SaltMasterPort.
    # Returns: $true when at least one matching row is found, $false otherwise
    #          (including on any error).
    # NOTE(review): the pattern matches the port in either the local or the remote
    # column and in any connection state, and it does not match bracketed IPv6
    # addresses - confirm this is the intended definition of "connected".
    param(
        [int]$Port = $SaltMasterPort
    )

    try {
        # Use netstat to check for sockets on the salt-master port
        $portCheck = netstat -an 2>$null | Select-String "\s+[^:]+:$Port\s+"

        if ($null -ne $portCheck) {
            Write-VerboseLog "Port $Port is in use and has active connections"
            return $true
        }

        Write-VerboseLog "No active connections found on port $Port"
        return $false
    } catch [System.Management.Automation.ActionPreferenceStopException] {
        # Deliberately quiet, but still report an explicit negative result
        # instead of falling off the end of the function with no output.
        return $false
    } catch {
        # Log any other unexpected errors and return failure status
        Write-Warning "Failed to check port $Port : $($_.Exception.Message)"
        return $false
    }
}
|
||||
|
||||
# Function to check that salt-call answers test.ping within a time limit
function Test-SaltPing {
    # Runs "salt-call test.ping --local" in a background job and reports whether
    # it answered True before the timeout.
    # Parameters:
    #   TimeoutSeconds - seconds to wait for the job; defaults to the script-level
    #                    $TimeoutSeconds.
    # Returns: $true when the output contains a True result, otherwise $false
    #          (command missing, timeout, empty output, unexpected output, error).
    # NOTE(review): --local presumably runs the call without contacting the
    # master, so this may only validate the local minion install - confirm.
    param(
        [int]$TimeoutSeconds = $TimeoutSeconds
    )

    if (-not (Test-CommandAvailability "salt-call")) {
        Write-Warning "Salt-call command not found"
        return $false
    }

    $job = $null
    try {
        # A job is used so a hanging salt-call can be abandoned after the timeout.
        $job = Start-Job -ScriptBlock { salt-call test.ping --local 2>$null } -ErrorAction Stop
        $completed = $job | Wait-Job -Timeout $TimeoutSeconds
        if (-not $completed) {
            Write-Warning "Salt-call test.ping timed out after $TimeoutSeconds seconds"
            return $false
        }

        $saltTest = $job | Receive-Job -ErrorAction SilentlyContinue
        if ($null -eq $saltTest) {
            # Reported as a warning (not Write-Host) so it honours -Quiet and is
            # consistent with the other failure paths in this function.
            Write-Warning "No response from salt-call test.ping"
            return $false
        }

        # Multi-line output arrives as an array; flatten it for regex matching.
        if ($saltTest -is [array]) {
            $saltTest = $saltTest -join "`n"
        }

        if ($saltTest -match "local:\s*True" -or $saltTest -match "^\s*True\s*$") {
            Write-VerboseLog "Salt-call test.ping returned True"
            return $true
        }

        Write-VerboseLog "Salt-call test.ping failed or returned unexpected output: $saltTest"
        return $false
    } catch {
        Write-Warning "Salt-Call failed: $($_.Exception.Message)"
        return $false
    } finally {
        # Always dispose of the job; stop it first if it is still running.
        if ($null -ne $job) {
            try {
                if ($job.State -eq 'Running') {
                    $job | Stop-Job -Force -ErrorAction SilentlyContinue
                }
            } finally {
                $job | Remove-Job -Force -ErrorAction SilentlyContinue
            }
        }
    }
}
|
||||
|
||||
# Function to check whether a string is a valid Prometheus metric name
function Test-PrometheusMetricName {
    # Returns $true when $MetricName matches [a-zA-Z_:][a-zA-Z0-9_:]* in full,
    # otherwise $false.
    param([string]$MetricName)

    # -cmatch keeps the character classes strictly ASCII-cased, and \A ... \z
    # anchor to the true start/end of the string ('$' would also accept a
    # trailing newline, which would corrupt the metrics file).
    return ($MetricName -cmatch '\A[a-zA-Z_:][a-zA-Z0-9_:]*\z')
}
|
||||
|
||||
# Function to append one metric (HELP, TYPE and sample lines) to a metrics array
function Add-PrometheusMetric {
    # Parameters:
    #   Name         - metric name; checked with Test-PrometheusMetricName
    #   Help         - text for the "# HELP" line
    #   Type         - text for the "# TYPE" line
    #   Value        - sample value, rendered via string interpolation
    #   MetricsArray - [ref] to the collection the three lines are appended to
    # Writes a warning and appends nothing when the name is invalid.
    param(
        [string]$Name,
        [string]$Help,
        [string]$Type,
        [object]$Value,
        [ref]$MetricsArray
    )

    $nameIsValid = Test-PrometheusMetricName $Name
    if (-not $nameIsValid) {
        Write-Warning "Invalid metric name: $Name"
        return
    }

    $renderedLines = "# HELP $Name $Help", "# TYPE $Name $Type", "$Name $Value"
    foreach ($renderedLine in $renderedLines) {
        $MetricsArray.Value += $renderedLine
    }
}
|
||||
|
||||
# Function to report the state of the salt-minion Windows service
function Test-SaltMinionService {
    # Returns 2 when the service is not found, 1 when it is Running, and 0 when
    # it is in any other state or the lookup throws.
    try {
        $minionService = Get-Service -Name "salt-minion" -ErrorAction SilentlyContinue
        if ($null -eq $minionService) {
            Write-Warning "Salt-minion service not found"
            return 2
        }

        # $true -> 1 (running), $false -> 0 (not running)
        return [int]($minionService.Status -eq 'Running')
    } catch {
        Write-Warning "Failed to check salt-minion service status: $($_.Exception.Message)"
        return 0
    }
}
|
||||
|
||||
# Function to get Salt version
function Get-SaltVersion {
    # Returns the first "<digits>.<digits>" token printed by "salt-call --version"
    # as a string, or "0" when salt-call is unavailable, prints no such token,
    # or fails.
    if (-not (Test-CommandAvailability "salt-call")) {
        return "0"
    }

    try {
        # Flatten to one string first: applying -match to an array filters the
        # array and does NOT populate $Matches, so multi-line output would
        # otherwise yield a stale or missing capture.
        $versionText = @(& salt-call --version 2>$null) -join "`n"
        if ($versionText -match "(\d+\.\d+)") {
            return $Matches[1]
        }
        return "0"
    } catch {
        Write-Warning "Failed to get Salt version: $($_.Exception.Message)"
        return "0"
    }
}
|
||||
|
||||
# Function to total the working set of all salt-minion processes
function Get-SaltMemoryUsage {
    # Returns the sum of WorkingSet64 over every process named "salt-minion",
    # or 0 when none is found or the lookup throws.
    try {
        $minionProcesses = Get-Process -Name "salt-minion" -ErrorAction SilentlyContinue
        if ($null -eq $minionProcesses) {
            return 0
        }

        $workingSetBytes = 0
        # ForEach-Object runs its block in this scope, so the accumulator updates.
        $minionProcesses | ForEach-Object { $workingSetBytes += $_.WorkingSet64 }
        return $workingSetBytes
    } catch {
        Write-Warning "Failed to get salt-minion memory usage: $($_.Exception.Message)"
        return 0
    }
}
|
||||
|
||||
# Function to get last successful communication timestamp
function Get-LastCommunicationTimestamp {
    # Returns the current Unix time (whole seconds since 1970-01-01 UTC) when
    # Test-SaltPing succeeds right now; returns 0 when salt-call is unavailable,
    # the ping fails, or an error occurs.
    if (-not (Test-CommandAvailability "salt-call")) {
        return 0
    }

    try {
        $pingResult = Test-SaltPing
        if ($pingResult) {
            # Computed from UTC with plain .NET arithmetic: this avoids parsing
            # the text produced by "Get-Date -UFormat %s" (culture-sensitive,
            # and not reliably UTC-based on Windows PowerShell) and avoids the
            # 32-bit [int] cast that overflows in 2038.
            $epoch = New-Object System.DateTime -ArgumentList 1970, 1, 1, 0, 0, 0, ([System.DateTimeKind]::Utc)
            return [long][Math]::Floor(([System.DateTime]::UtcNow - $epoch).TotalSeconds)
        }
        return 0
    } catch {
        Write-Warning "Failed to get last communication timestamp: $($_.Exception.Message)"
        return 0
    }
}
|
||||
|
||||
# Function to count recent Salt errors in Windows Event Log
function Get-SaltErrorCount {
    # Returns the number of Error-level Application-log events from the
    # "salt-minion" provider in the last 24 hours (0 when none are returned).
    # If the query throws, falls back to counting "[ERROR]" lines in the last
    # 1000 lines of the minion log file, or 0 when that file is absent/unreadable.
    try {
        $windowStart = (Get-Date).AddHours(-24)
        # The event source is selected with the ProviderName key; "Source" is
        # not a recognised -FilterHashtable key.
        $errorEvents = Get-WinEvent -FilterHashtable @{
            LogName      = 'Application'
            ProviderName = 'salt-minion'
            Level        = 2 # Error level
            StartTime    = $windowStart
        } -ErrorAction SilentlyContinue

        # @() gives a reliable count for zero, one, or many results.
        return @($errorEvents).Count
    } catch {
        # Fallback: try to read from salt log file if it exists
        $logPath = "${env:ProgramData}\Salt Project\Salt\var\log\salt\minion"
        if (Test-Path $logPath) {
            try {
                $logContent = Get-Content $logPath -Tail 1000 -ErrorAction SilentlyContinue
                $errorLines = @($logContent | Where-Object { $_ -match "\[ERROR\]" })
                return $errorLines.Count
            } catch {
                return 0
            }
        }
        return 0
    }
}
|
||||
|
||||
# Function to export Prometheus metrics
function Export-PrometheusMetrics {
    # Runs every check and returns the Prometheus text lines (HELP/TYPE/sample
    # per metric). A check that throws is recorded in a local error list and its
    # metric is emitted with value 0; a missing netstat/salt-call command yields
    # value 2 for the corresponding status metric.
    # NOTE: Get-LastCommunicationTimestamp performs its own Test-SaltPing, so the
    # ping is executed twice per export.
    $startTime = Get-Date
    $metrics = @()
    $errors = @()

    try {
        # Connection status metric (salt-master port)
        $connectionMetric = @{
            Name = "minion_connection_status"
            Help = "Shows if Salt-Minion is connected to Salt-Master."
            Type = "gauge"
        }
        try {
            if (-not (Test-CommandAvailability "netstat")) {
                $errors += "netstat command not found"
                $connectionStatus = 2
            } else {
                $connectionStatus = if (Test-Port4505Connection) { 1 } else { 0 }
            }
            Add-PrometheusMetric @connectionMetric -Value $connectionStatus -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Port 4505 check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @connectionMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Salt ping metric
        $pingMetric = @{
            Name = "minion_ping_status"
            Help = "Shows if Salt-Minion is able to ping Salt-Master."
            Type = "gauge"
        }
        try {
            if (-not (Test-CommandAvailability "salt-call")) {
                $errors += "salt-call command not found"
                $pingStatus = 2
            } else {
                $pingStatus = if (Test-SaltPing) { 1 } else { 0 }
            }
            Add-PrometheusMetric @pingMetric -Value $pingStatus -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Salt ping check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @pingMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Service status metric
        $serviceMetric = @{
            Name = "minion_service_status"
            Help = "Shows if Salt-Minion service is active."
            Type = "gauge"
        }
        try {
            $serviceStatus = Test-SaltMinionService
            Add-PrometheusMetric @serviceMetric -Value $serviceStatus -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Service status check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @serviceMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Last communication timestamp
        $lastCommMetric = @{
            Name = "minion_last_communication_timestamp"
            Help = "Timestamp of last successful communication with Salt-Master."
            Type = "gauge"
        }
        try {
            $lastComm = Get-LastCommunicationTimestamp
            Add-PrometheusMetric @lastCommMetric -Value $lastComm -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Last communication check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @lastCommMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Salt version metric
        $versionMetric = @{
            Name = "minion_version"
            Help = "Salt-Minion version number."
            Type = "gauge"
        }
        try {
            # Named $saltVersion so it cannot be confused with the script's
            # -Version switch (PowerShell variable names are case-insensitive).
            $saltVersion = Get-SaltVersion
            Add-PrometheusMetric @versionMetric -Value $saltVersion -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Version check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @versionMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Memory usage metric
        $memoryMetric = @{
            Name = "minion_memory_usage_bytes"
            Help = "Salt-Minion process memory usage in bytes."
            Type = "gauge"
        }
        try {
            $memoryUsage = Get-SaltMemoryUsage
            Add-PrometheusMetric @memoryMetric -Value $memoryUsage -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Memory usage check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @memoryMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Error count metric. Declared as a gauge: the value is a count over a
        # sliding window and can go down, which a Prometheus counter must never do.
        $errorCountMetric = @{
            Name = "minion_error_count"
            Help = "Number of error entries in Salt-Minion log file."
            Type = "gauge"
        }
        try {
            $errorCount = Get-SaltErrorCount
            Add-PrometheusMetric @errorCountMetric -Value $errorCount -MetricsArray ([ref]$metrics)
        } catch {
            $errors += "Error count check failed: $($_.Exception.Message)"
            Add-PrometheusMetric @errorCountMetric -Value 0 -MetricsArray ([ref]$metrics)
        }

        # Windows-specific: Script execution error count
        # NOTE(review): this is a per-run count exposed as a counter; the type is
        # left unchanged because of the "_total" name suffix - confirm intent.
        Add-PrometheusMetric -Name "windows_salt_script_errors_total" -Help "Total number of errors during script execution" -Type "counter" -Value $errors.Count -MetricsArray ([ref]$metrics)

        # Windows-specific: Script runtime
        $scriptRuntime = (Get-Date) - $startTime
        Add-PrometheusMetric -Name "windows_salt_script_runtime_seconds" -Help "Total script execution time in seconds" -Type "gauge" -Value $scriptRuntime.TotalSeconds -MetricsArray ([ref]$metrics)
    } finally {
        # Summarise collection problems regardless of success/failure
        if ($errors.Count -gt 0) {
            Write-Warning "Script completed with $($errors.Count) errors"
        }
    }

    return $metrics
}
|
||||
|
||||
|
||||
# Output metrics to console (-DryRun) or to the metrics file
try {
    # Export metrics as an array of strings
    $exportedMetrics = Export-PrometheusMetrics
    if ($null -eq $exportedMetrics) {
        throw "Export-PrometheusMetrics returned null"
    }

    if ($DryRun) {
        # Dry run mode: output to console
        Write-Host "=== DRY RUN MODE - Metrics that would be written to $MetricsFilePath ===" -ForegroundColor Yellow
        $exportedMetrics | ForEach-Object { Write-Host $_ }
        Write-Host "=== END DRY RUN OUTPUT ===" -ForegroundColor Yellow
    } else {
        # Normal mode: write to a temporary file, then move it into place so a
        # reader never sees a half-written metrics file. Retried on I/O errors.
        $maxRetries = 3
        $tempFilePath = "$MetricsFilePath.$PID.tmp"
        try {
            for ($attempt = 1; $attempt -le $maxRetries; $attempt++) {
                try {
                    # -ErrorAction Stop turns write failures into terminating
                    # errors; without it the catch blocks below never run and
                    # the script would exit 0 after a failed write.
                    $exportedMetrics | Out-File -FilePath $tempFilePath -Encoding UTF8 -Force -ErrorAction Stop
                    Move-Item -LiteralPath $tempFilePath -Destination $MetricsFilePath -Force -ErrorAction Stop
                    break
                } catch [System.IO.IOException] {
                    if ($attempt -ge $maxRetries) {
                        throw
                    }
                    # Wait 100ms before retrying
                    Start-Sleep -Milliseconds 100
                }
            }
        } finally {
            # Do not leave a stray temporary file behind after a failure.
            if (Test-Path -LiteralPath $tempFilePath) {
                Remove-Item -LiteralPath $tempFilePath -Force -ErrorAction SilentlyContinue
            }
        }
    }
} catch {
    Write-Error "Failed to export metrics: $($_.Exception.Message)"
    exit 1
}
|
||||
|
||||
# Debug aid: uncomment the following line to re-collect the metrics into $exportedMetrics (run with -DryRun to print them to the console)
|
||||
# $exportedMetrics = Export-PrometheusMetrics
|
||||
|
||||
Reference in New Issue
Block a user