<#
.SYNOPSIS
    Windows Failover Cluster Health Prometheus Exporter

.DESCRIPTION
    Prometheus exporter for Windows Failover Clustering - cluster state, node status,
    resource group health, cluster shared volumes, network health, quorum status,
    and recent failover events via PowerShell.

.PARAMETER Mode
    Output mode: 'stdout' (default), 'textfile', or 'http'

.PARAMETER Port
    HTTP port for http mode (default: 9540)

.PARAMETER TextfileDir
    Directory for textfile collector output (default: C:\ProgramData\node_exporter)

.PARAMETER InstallScheduledTask
    Switch to create a scheduled task for automatic execution

.PARAMETER TaskIntervalMinutes
    Interval in minutes for the scheduled task (default: 2)

.NOTES
    Author:  Phil Connor
    Contact: contact@mylinux.work
    Website: https://mylinux.work
    License: MIT
    Version: 1.0

    Metrics Exported:
      Cluster:
        - windows_cluster_up
        - windows_cluster_exporter_info{version}
        - windows_cluster_node_count
        - windows_cluster_node_up{node,state}
        - windows_cluster_quorum_state{type}
      Resource Groups:
        - windows_cluster_group_state{group,node,state}
        - windows_cluster_group_count
        - windows_cluster_group_online_count
      Resources:
        - windows_cluster_resource_state{resource,group,type,node,state}
      CSV (Cluster Shared Volumes):
        - windows_cluster_csv_total_bytes{name,path}
        - windows_cluster_csv_used_bytes{name,path}
        - windows_cluster_csv_free_bytes{name,path}
        - windows_cluster_csv_utilization{name,path}
      Networks:
        - windows_cluster_network_state{network,state,role}
      Events:
        - windows_cluster_failover_events_total
        - windows_cluster_last_failover_timestamp
      Exporter:
        - windows_cluster_exporter_duration_seconds
        - windows_cluster_exporter_last_run_timestamp
#>

param(
    [ValidateSet('stdout','textfile','http')]
    [string]$Mode = 'stdout',
    [int]$Port = 9540,
    [string]$TextfileDir = 'C:\ProgramData\node_exporter',
    [switch]$InstallScheduledTask,
    [int]$TaskIntervalMinutes = 2
)

# Collection is best-effort: a failing probe must not abort the whole run.
# Code paths that need to observe a failure pass -ErrorAction Stop explicitly.
$ErrorActionPreference = 'SilentlyContinue'
$Version = '1.0'

# ============================================================================
# SCHEDULED TASK INSTALLER
# ============================================================================

if ($InstallScheduledTask) {
    try {
        $scriptPath = $MyInvocation.MyCommand.Path
        $taskArgs = "-NoProfile -ExecutionPolicy Bypass -File `"$scriptPath`" -Mode textfile"

        # Forward a caller-supplied output directory so the task writes where the
        # installer was told to. The trailing backslash is trimmed because `\"`
        # would otherwise escape the closing quote on the command line.
        if ($PSBoundParameters.ContainsKey('TextfileDir')) {
            $taskArgs += " -TextfileDir `"$($TextfileDir.TrimEnd('\'))`""
        }

        $action = New-ScheduledTaskAction -Execute 'powershell.exe' `
            -Argument $taskArgs -ErrorAction Stop
        $trigger = New-ScheduledTaskTrigger -RepetitionInterval (New-TimeSpan -Minutes $TaskIntervalMinutes) `
            -Once -At (Get-Date) -ErrorAction Stop
        $settings = New-ScheduledTaskSettingsSet -AllowStartIfOnBatteries -DontStopIfGoingOnBatteries `
            -StartWhenAvailable -ExecutionTimeLimit (New-TimeSpan -Minutes 5) -ErrorAction Stop
        Register-ScheduledTask -TaskName 'WindowsClusterHealthExporter' -Action $action -Trigger $trigger `
            -Settings $settings -RunLevel Highest -User 'SYSTEM' -Force -ErrorAction Stop
    }
    catch {
        # Write-Warning is used because Write-Error is muted by the global
        # 'SilentlyContinue' preference.
        Write-Warning "Failed to install scheduled task: $($_.Exception.Message)"
        exit 1
    }

    Write-Host 'Scheduled task "WindowsClusterHealthExporter" installed successfully.'
    exit 0
}

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# Escapes a string for use as a Prometheus label value:
#   \  ->  \\      "  ->  \"      newline  ->  \n
function Get-PrometheusEscape {
    param([string]$Value)

    # The -replace operator treats the pattern as a regex, but in the replacement
    # string only '$' is special; backslashes are literal. The replacement for a
    # single backslash is therefore exactly two characters.
    $Value -replace '\\', '\\' -replace '"', '\"' -replace "`n", '\n'
}

# Emits the "# HELP" and "# TYPE" lines for one metric family (two strings).
function Write-MetricHeader {
    param([string]$Name, [string]$Type, [string]$Help)

    "# HELP $Name $Help"
    "# TYPE $Name $Type"
}

# ============================================================================
# METRIC COLLECTION
# ============================================================================

# Collects all cluster metrics and returns them as one string in Prometheus text
# format, lines joined by LF with no trailing line feed (callers that write a
# file or an HTTP body append it).
#
# If the FailoverClusters module cannot be loaded or Get-Cluster fails, only
# "windows_cluster_up 0" (with its header) is returned. Every later section is
# independent: a failure in one section omits that section's metrics only.
function Get-ClusterHealthMetrics {
    $startTime = Get-Date
    $epoch = [datetime]::new(1970, 1, 1, 0, 0, 0, [System.DateTimeKind]::Utc)
    $metrics = [System.Collections.Generic.List[string]]::new()

    # --- Exporter status ---
    $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_up' 'gauge' 'Cluster exporter status (1=up, 0=down)'))
    try {
        Import-Module FailoverClusters -ErrorAction Stop
        $cluster = Get-Cluster -ErrorAction Stop
        $metrics.Add('windows_cluster_up 1')
    }
    catch {
        $metrics.Add('windows_cluster_up 0')
        return ($metrics -join "`n")
    }

    $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_exporter_info' 'gauge' 'Exporter version information'))
    $metrics.Add("windows_cluster_exporter_info{version=`"$Version`"} 1")

    # --- Node status ---
    try {
        $nodes = Get-ClusterNode -ErrorAction Stop
        $nodeCount = ($nodes | Measure-Object).Count

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_node_count' 'gauge' 'Total number of cluster nodes'))
        $metrics.Add("windows_cluster_node_count $nodeCount")

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_node_up' 'gauge' 'Node state (1=Up, 0=other)'))
        foreach ($node in $nodes) {
            $nodeName = Get-PrometheusEscape $node.Name
            $stateStr = $node.State.ToString()
            $stateVal = if ($stateStr -eq 'Up') { 1 } else { 0 }
            $metrics.Add("windows_cluster_node_up{node=`"$nodeName`",state=`"$stateStr`"} $stateVal")
        }
    }
    catch {
        # Node enumeration failed; node metrics are omitted for this run.
    }

    # --- Quorum ---
    try {
        $quorum = Get-ClusterQuorum -ErrorAction Stop
        $qType = Get-PrometheusEscape $quorum.QuorumType.ToString()
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_quorum_state' 'gauge' 'Quorum status (1=healthy)'))
        $metrics.Add("windows_cluster_quorum_state{type=`"$qType`"} 1")
    }
    catch {
        # Quorum info unavailable; metric omitted for this run.
    }

    # --- Resource groups ---
    try {
        $groups = Get-ClusterGroup -ErrorAction Stop
        $groupCount = ($groups | Measure-Object).Count
        $onlineCount = ($groups | Where-Object { $_.State -eq 'Online' } | Measure-Object).Count

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_group_count' 'gauge' 'Total resource groups'))
        $metrics.Add("windows_cluster_group_count $groupCount")

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_group_online_count' 'gauge' 'Online resource groups'))
        $metrics.Add("windows_cluster_group_online_count $onlineCount")

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_group_state' 'gauge' 'Resource group state (1=Online, 0=other)'))
        foreach ($group in $groups) {
            $gName = Get-PrometheusEscape $group.Name
            # String expansion instead of .ToString(): a null owner becomes an
            # empty label rather than throwing and dropping the remaining groups.
            $gNode = Get-PrometheusEscape "$($group.OwnerNode)"
            $gState = $group.State.ToString()
            $gVal = if ($gState -eq 'Online') { 1 } else { 0 }
            $metrics.Add("windows_cluster_group_state{group=`"$gName`",node=`"$gNode`",state=`"$gState`"} $gVal")
        }
    }
    catch {
        # Group enumeration failed; group metrics may be partial or omitted.
    }

    # --- Resources ---
    try {
        $resources = Get-ClusterResource -ErrorAction Stop
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_resource_state' 'gauge' 'Resource state (1=Online, 0=other)'))
        foreach ($res in $resources) {
            $rName = Get-PrometheusEscape $res.Name
            # Null-safe string expansion, same reasoning as for group owners.
            $rGroup = Get-PrometheusEscape "$($res.OwnerGroup)"
            $rType = Get-PrometheusEscape "$($res.ResourceType)"
            $rNode = Get-PrometheusEscape "$($res.OwnerNode)"
            $rState = $res.State.ToString()
            $rVal = if ($rState -eq 'Online') { 1 } else { 0 }
            $metrics.Add("windows_cluster_resource_state{resource=`"$rName`",group=`"$rGroup`",type=`"$rType`",node=`"$rNode`",state=`"$rState`"} $rVal")
        }
    }
    catch {
        # Resource enumeration failed; resource metrics may be partial or omitted.
    }

    # --- Cluster Shared Volumes ---
    try {
        $csvs = Get-ClusterSharedVolume -ErrorAction Stop

        # Samples are buffered per metric family so that each family's header and
        # samples are emitted as one contiguous group, as the text exposition
        # format requires.
        $csvTotalLines = [System.Collections.Generic.List[string]]::new()
        $csvUsedLines = [System.Collections.Generic.List[string]]::new()
        $csvFreeLines = [System.Collections.Generic.List[string]]::new()
        $csvUtilLines = [System.Collections.Generic.List[string]]::new()

        foreach ($csv in $csvs) {
            $csvName = Get-PrometheusEscape $csv.Name
            foreach ($vol in $csv.SharedVolumeInfo) {
                $partition = $vol.Partition
                if ($null -eq $partition) {
                    # No size data for this volume; a sample without a value
                    # would make the whole exposition unparseable.
                    continue
                }

                $csvPath = Get-PrometheusEscape $vol.FriendlyVolumeName
                $total = $partition.Size
                $free = $partition.FreeSpace
                $used = $total - $free
                $util = if ($total -gt 0) { [math]::Round(($used / $total) * 100, 2) } else { 0 }

                $labels = "name=`"$csvName`",path=`"$csvPath`""
                $csvTotalLines.Add("windows_cluster_csv_total_bytes{$labels} $total")
                $csvUsedLines.Add("windows_cluster_csv_used_bytes{$labels} $used")
                $csvFreeLines.Add("windows_cluster_csv_free_bytes{$labels} $free")
                $csvUtilLines.Add("windows_cluster_csv_utilization{$labels} $util")
            }
        }

        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_csv_total_bytes' 'gauge' 'CSV total size in bytes'))
        $metrics.AddRange($csvTotalLines)
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_csv_used_bytes' 'gauge' 'CSV used space in bytes'))
        $metrics.AddRange($csvUsedLines)
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_csv_free_bytes' 'gauge' 'CSV free space in bytes'))
        $metrics.AddRange($csvFreeLines)
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_csv_utilization' 'gauge' 'CSV utilization percentage'))
        $metrics.AddRange($csvUtilLines)
    }
    catch {
        # CSV info unavailable (not all clusters use CSV)
    }

    # --- Networks ---
    try {
        $networks = Get-ClusterNetwork -ErrorAction Stop
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_network_state' 'gauge' 'Network state (1=Up, 0=other)'))
        foreach ($net in $networks) {
            $netName = Get-PrometheusEscape $net.Name
            $netState = $net.State.ToString()
            $netRole = $net.Role.ToString()
            $netVal = if ($netState -eq 'Up') { 1 } else { 0 }
            $metrics.Add("windows_cluster_network_state{network=`"$netName`",state=`"$netState`",role=`"$netRole`"} $netVal")
        }
    }
    catch {
        # Network info unavailable; network metrics omitted for this run.
    }

    # --- Failover events ---
    # $events stays $null when the log cannot be read (metrics omitted) and is an
    # empty array when the log is readable but holds no matching events (count 0).
    $events = $null
    try {
        $events = @(Get-WinEvent -FilterHashtable @{
            LogName = 'Microsoft-Windows-FailoverClustering/Operational'
            ID      = 1069, 1070, 1641
        } -MaxEvents 100 -ErrorAction Stop)
    }
    catch {
        # Get-WinEvent reports "no matching events" as an error; on a healthy
        # cluster that is the normal case and must be exported as 0.
        if ($_.FullyQualifiedErrorId -like 'NoMatchingEventsFound*') {
            $events = @()
        }
    }

    if ($null -ne $events) {
        # The count is bounded by -MaxEvents above.
        $cutoff = (Get-Date).AddHours(-24)
        $last24h = ($events | Where-Object { $_.TimeCreated -gt $cutoff } | Measure-Object).Count
        $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_failover_events_total' 'gauge' 'Failover events in last 24h'))
        $metrics.Add("windows_cluster_failover_events_total $last24h")

        if ($events.Count -gt 0) {
            # Convert to UTC before subtracting the epoch; otherwise the value is
            # shifted by the host's time-zone offset. $events[0] is used as the
            # most recent event, as in the original ordering assumption.
            $lastEvent = [long][math]::Round(($events[0].TimeCreated.ToUniversalTime() - $epoch).TotalSeconds)
            $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_last_failover_timestamp' 'gauge' 'Unix timestamp of last failover event'))
            $metrics.Add("windows_cluster_last_failover_timestamp $lastEvent")
        }
    }

    # --- Duration and timestamp ---
    $duration = [math]::Round(((Get-Date) - $startTime).TotalSeconds, 2)
    # Computed from UtcNow rather than "Get-Date -UFormat %s", which yields a
    # formatted string and is based on local time in Windows PowerShell.
    $timestamp = [long][math]::Round(([datetime]::UtcNow - $epoch).TotalSeconds)

    $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_exporter_duration_seconds' 'gauge' 'Script execution time'))
    $metrics.Add("windows_cluster_exporter_duration_seconds $duration")

    $metrics.AddRange([string[]](Write-MetricHeader 'windows_cluster_exporter_last_run_timestamp' 'gauge' 'Unix timestamp of last successful run'))
    $metrics.Add("windows_cluster_exporter_last_run_timestamp $timestamp")

    return ($metrics -join "`n")
}

# ============================================================================
# OUTPUT
# ============================================================================

switch ($Mode) {
    'stdout' {
        Get-ClusterHealthMetrics
    }

    'textfile' {
        try {
            if (-not (Test-Path $TextfileDir)) {
                New-Item -ItemType Directory -Path $TextfileDir -Force -ErrorAction Stop | Out-Null
            }

            $tempFile = Join-Path $TextfileDir "windows-cluster-health-metrics.tmp"
            $finalFile = Join-Path $TextfileDir "windows-cluster-health-metrics.prom"

            # .NET file APIs resolve relative paths against the process working
            # directory, not the PowerShell location, so resolve explicitly.
            $tempFilePath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($tempFile)

            # The exposition format requires the last line to end with a line feed.
            # UTF8Encoding($false) writes no byte-order mark; Out-File -Encoding utf8
            # adds one on Windows PowerShell 5.1.
            $content = (Get-ClusterHealthMetrics) + "`n"
            [System.IO.File]::WriteAllText($tempFilePath, $content, [System.Text.UTF8Encoding]::new($false))

            # Write to a temp file, then move, so a reader never sees a partial file.
            Move-Item -Path $tempFile -Destination $finalFile -Force -ErrorAction Stop
            Write-Host "Wrote metrics to $finalFile"
        }
        catch {
            Write-Warning "Failed to write metrics to ${TextfileDir}: $($_.Exception.Message)"
            exit 1
        }
    }

    'http' {
        $prefix = "http://+:$Port/metrics/"
        $listener = [System.Net.HttpListener]::new()
        $listener.Prefixes.Add($prefix)

        try {
            $listener.Start()
        }
        catch {
            Write-Warning "Failed to start HTTP listener on ${prefix}: $($_.Exception.Message)"
            exit 1
        }
        Write-Host "Listening on port $Port..."

        try {
            while ($listener.IsListening) {
                $context = $listener.GetContext()
                $response = $context.Response

                # A failure while serving one request (for example a client that
                # disconnects mid-write) must not stop the exporter.
                try {
                    $metricsOutput = (Get-ClusterHealthMetrics) + "`n"
                    $buffer = [System.Text.Encoding]::UTF8.GetBytes($metricsOutput)
                    $response.ContentType = 'text/plain; version=0.0.4; charset=utf-8'
                    $response.ContentLength64 = $buffer.Length
                    $response.OutputStream.Write($buffer, 0, $buffer.Length)
                    $response.OutputStream.Close()
                    $response.Close()
                }
                catch {
                    Write-Warning "Failed to serve metrics request: $($_.Exception.Message)"
                    $response.Abort()
                }
            }
        }
        finally {
            $listener.Stop()
            $listener.Close()
        }
    }
}