#!/bin/bash
################################################################################
# Script Name: postfix-metrics.sh
# Description: Prometheus exporter for Postfix mail server metrics
#
# Usage:
#   # Output to stdout
#   ./postfix-metrics.sh
#
#   # Textfile collector mode (atomic write)
#   ./postfix-metrics.sh --textfile
#
#   # Custom output file
#   ./postfix-metrics.sh -o /path/to/metrics.prom
#
################################################################################

# ============================================================================
# CONFIGURATION VARIABLES
# ============================================================================
TEXTFILE_DIR="/var/lib/node_exporter"
OUTPUT_FILE=""
HTTP_MODE=false
HTTP_PORT=9192
QUEUE_DIR="/var/spool/postfix"
LOG_FILE="/var/log/mail.log"
HOSTNAME=$(hostname)

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

# NOTE(review): show_usage (and what appears to be the tail of an argument
# parser) arrived truncated -- the here-document and the option loop are
# missing. The surviving text is kept verbatim on the next line, commented
# out, so nothing is lost; restore the real definitions from version control.
# show_usage() { cat <&2; exit 1 ;; esac done }

#######################################
# Count the lines that match a grep expression.
# Arguments: passed straight to grep (options, pattern, file...).
# Outputs:   one integer on stdout; 0 when nothing matches or when grep
#            fails (for example the log file is missing or unreadable).
# Returns:   0
#######################################
grep_count() {
  local result
  # -h drops the "file:" prefix grep adds for several files, so the per-file
  # counts can be summed into the single number a metric sample needs.
  # grep's stderr is discarded on purpose: an absent log simply counts as 0.
  result=$(grep -c -h "$@" 2>/dev/null \
    | awk '{ total += $1 } END { print total + 0 }')
  printf '%s\n' "${result:-0}"
}

# NOTE(review): generate_metrics continues beyond this span, so its opening is
# left byte-for-byte as received (still collapsed onto one line) rather than
# being reflowed in isolation.
# ============================================================================ # METRIC GENERATION # ============================================================================ generate_metrics() { local START_TIME START_TIME=$(date +%s.%N) # Queue sizes echo "# HELP postfix_queue_size Number of messages in each Postfix queue" echo "# TYPE postfix_queue_size gauge" for queue in incoming active deferred hold corrupt; do count=$(find "${QUEUE_DIR}/${queue}" -type f 2>/dev/null | wc -l) echo "postfix_queue_size{queue=\"${queue}\",hostname=\"${HOSTNAME}\"} ${count}" done # Oldest message in queue (seconds) echo "# HELP postfix_queue_oldest_seconds Age of oldest message in queue" echo "# TYPE postfix_queue_oldest_seconds gauge" for queue in deferred hold; do oldest=$(find 
"${QUEUE_DIR}/${queue}" -type f -printf '%T@\n' 2>/dev/null | sort -n | head -1) if [[ -n "$oldest" ]]; then age=$(echo "$(date +%s) - ${oldest%.*}" | bc) else age=0 fi echo "postfix_queue_oldest_seconds{queue=\"${queue}\",hostname=\"${HOSTNAME}\"} ${age}" done # Message counters by status echo "# HELP postfix_messages_total Total messages by status" echo "# TYPE postfix_messages_total counter" for status in sent bounced deferred expired; do count=$(grep_count "status=${status}" "$LOG_FILE") echo "postfix_messages_total{status=\"${status}\",hostname=\"${HOSTNAME}\"} ${count}" done rejected=$(grep_count 'reject:' "$LOG_FILE") echo "postfix_messages_total{status=\"rejected\",hostname=\"${HOSTNAME}\"} ${rejected}" # SMTP connections echo "# HELP postfix_smtp_connections SMTP connection stats" echo "# TYPE postfix_smtp_connections counter" connections=$(grep_count 'connect from' "$LOG_FILE") disconnections=$(grep_count 'disconnect from' "$LOG_FILE") echo "postfix_smtp_connections{type=\"connect\",hostname=\"${HOSTNAME}\"} ${connections}" echo "postfix_smtp_connections{type=\"disconnect\",hostname=\"${HOSTNAME}\"} ${disconnections}" # Connection timeouts echo "# HELP postfix_timeout_total Connection timeout events" echo "# TYPE postfix_timeout_total counter" timeout_count=$(grep_count 'timeout after' "$LOG_FILE") echo "postfix_timeout_total{hostname=\"${HOSTNAME}\"} ${timeout_count}" # SASL authentication echo "# HELP postfix_sasl_auth_total SASL authentication attempts" echo "# TYPE postfix_sasl_auth_total counter" sasl_success=$(grep_count 'sasl_username=' "$LOG_FILE") sasl_fail=$(grep_count 'authentication failed' "$LOG_FILE") echo "postfix_sasl_auth_total{result=\"success\",hostname=\"${HOSTNAME}\"} ${sasl_success}" echo "postfix_sasl_auth_total{result=\"failed\",hostname=\"${HOSTNAME}\"} ${sasl_fail}" # Message sizes (bytes) echo "# HELP postfix_message_size_bytes_total Total bytes of messages processed" echo "# TYPE postfix_message_size_bytes_total counter" 
total_bytes=$(grep -oP 'size=\K\d+' "$LOG_FILE" 2>/dev/null | awk '{sum+=$1} END {print sum+0}') echo "postfix_message_size_bytes_total{hostname=\"${HOSTNAME}\"} ${total_bytes}" echo "# HELP postfix_message_size_bytes_avg Average message size" echo "# TYPE postfix_message_size_bytes_avg gauge" avg_size=$(grep -oP 'size=\K\d+' "$LOG_FILE" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) print int(sum/count); else print 0}') echo "postfix_message_size_bytes_avg{hostname=\"${HOSTNAME}\"} ${avg_size}" echo "# HELP postfix_message_size_bytes_max Largest message size" echo "# TYPE postfix_message_size_bytes_max gauge" max_size=$(grep -oP 'size=\K\d+' "$LOG_FILE" 2>/dev/null | sort -rn | head -1) echo "postfix_message_size_bytes_max{hostname=\"${HOSTNAME}\"} ${max_size:-0}" # Per-recipient domain stats (top domains) echo "# HELP postfix_recipient_domain_total Messages per recipient domain" echo "# TYPE postfix_recipient_domain_total counter" grep -oP 'to=<[^@]+@\K[^>]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -20 | while read -r count domain; do echo "postfix_recipient_domain_total{domain=\"${domain}\",hostname=\"${HOSTNAME}\"} ${count}" done # Sender domain stats echo "# HELP postfix_sender_domain_total Messages per sender domain" echo "# TYPE postfix_sender_domain_total counter" grep -oP 'from=<[^@]+@\K[^>]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -20 | while read -r count domain; do echo "postfix_sender_domain_total{domain=\"${domain}\",hostname=\"${HOSTNAME}\"} ${count}" done # Bounce reasons echo "# HELP postfix_bounce_reason_total Bounces by reason" echo "# TYPE postfix_bounce_reason_total counter" bounce_user=$(grep_count 'User unknown' "$LOG_FILE") bounce_quota=$(grep_count -i 'over quota\|mailbox full' "$LOG_FILE") bounce_spam=$(grep_count -i 'blocked\|spam\|blacklist' "$LOG_FILE") bounce_dns=$(grep_count 'Host or domain name not found' "$LOG_FILE") bounce_refused=$(grep_count 'Connection refused' "$LOG_FILE") echo 
"postfix_bounce_reason_total{reason=\"user_unknown\",hostname=\"${HOSTNAME}\"} ${bounce_user}" echo "postfix_bounce_reason_total{reason=\"over_quota\",hostname=\"${HOSTNAME}\"} ${bounce_quota}" echo "postfix_bounce_reason_total{reason=\"spam_blocked\",hostname=\"${HOSTNAME}\"} ${bounce_spam}" echo "postfix_bounce_reason_total{reason=\"dns_error\",hostname=\"${HOSTNAME}\"} ${bounce_dns}" echo "postfix_bounce_reason_total{reason=\"connection_refused\",hostname=\"${HOSTNAME}\"} ${bounce_refused}" # Relay stats echo "# HELP postfix_relay_total Messages by relay" echo "# TYPE postfix_relay_total counter" grep -oP 'relay=\K[^,\[]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -10 | while read -r count relay; do echo "postfix_relay_total{relay=\"${relay}\",hostname=\"${HOSTNAME}\"} ${count}" done # Client connections (top IPs) echo "# HELP postfix_client_connections_total Connections per client IP" echo "# TYPE postfix_client_connections_total counter" grep -oP 'connect from \S+\[\K[^\]]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -10 | while read -r count ip; do echo "postfix_client_connections_total{client_ip=\"${ip}\",hostname=\"${HOSTNAME}\"} ${count}" done # TLS stats echo "# HELP postfix_tls_connections_total TLS connection statistics" echo "# TYPE postfix_tls_connections_total counter" tls_in=$(grep_count 'Anonymous TLS connection established from' "$LOG_FILE") tls_out=$(grep_count 'Anonymous TLS connection established to' "$LOG_FILE") verified_in=$(grep_count 'Trusted TLS connection established from' "$LOG_FILE") verified_out=$(grep_count 'Trusted TLS connection established to' "$LOG_FILE") untrusted_in=$(grep_count 'Untrusted TLS connection established from' "$LOG_FILE") untrusted_out=$(grep_count 'Untrusted TLS connection established to' "$LOG_FILE") echo "postfix_tls_connections_total{direction=\"inbound\",verified=\"anonymous\",hostname=\"${HOSTNAME}\"} ${tls_in}" echo 
"postfix_tls_connections_total{direction=\"outbound\",verified=\"anonymous\",hostname=\"${HOSTNAME}\"} ${tls_out}" echo "postfix_tls_connections_total{direction=\"inbound\",verified=\"trusted\",hostname=\"${HOSTNAME}\"} ${verified_in}" echo "postfix_tls_connections_total{direction=\"outbound\",verified=\"trusted\",hostname=\"${HOSTNAME}\"} ${verified_out}" echo "postfix_tls_connections_total{direction=\"inbound\",verified=\"untrusted\",hostname=\"${HOSTNAME}\"} ${untrusted_in}" echo "postfix_tls_connections_total{direction=\"outbound\",verified=\"untrusted\",hostname=\"${HOSTNAME}\"} ${untrusted_out}" # TLS protocol versions echo "# HELP postfix_tls_protocol_total TLS protocol version usage" echo "# TYPE postfix_tls_protocol_total counter" for proto in TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; do count=$(grep_count "${proto} with cipher" "$LOG_FILE") echo "postfix_tls_protocol_total{protocol=\"${proto}\",hostname=\"${HOSTNAME}\"} ${count}" done # Delay stats (queue time) echo "# HELP postfix_delay_seconds_total Total delay time in seconds" echo "# TYPE postfix_delay_seconds_total counter" total_delay=$(grep -oP 'delay=\K[\d.]+' "$LOG_FILE" 2>/dev/null | awk '{sum+=$1} END {print sum+0}') echo "postfix_delay_seconds_total{hostname=\"${HOSTNAME}\"} ${total_delay}" echo "# HELP postfix_delay_seconds_avg Average delivery delay" echo "# TYPE postfix_delay_seconds_avg gauge" avg_delay=$(grep -oP 'delay=\K[\d.]+' "$LOG_FILE" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') echo "postfix_delay_seconds_avg{hostname=\"${HOSTNAME}\"} ${avg_delay}" echo "# HELP postfix_delay_seconds_max Maximum delivery delay" echo "# TYPE postfix_delay_seconds_max gauge" max_delay=$(grep -oP 'delay=\K[\d.]+' "$LOG_FILE" 2>/dev/null | sort -rn | head -1) echo "postfix_delay_seconds_max{hostname=\"${HOSTNAME}\"} ${max_delay:-0}" # Postfix process count echo "# HELP postfix_processes Number of running postfix processes" echo "# TYPE postfix_processes gauge" 
proc_count=$(pgrep -c -f "postfix" 2>/dev/null) || proc_count=0 echo "postfix_processes{hostname=\"${HOSTNAME}\"} ${proc_count}" # Mail loop detection echo "# HELP postfix_mail_loop_total Detected mail loops" echo "# TYPE postfix_mail_loop_total counter" loops=$(grep_count 'mail forwarding loop' "$LOG_FILE") echo "postfix_mail_loop_total{hostname=\"${HOSTNAME}\"} ${loops}" # Service status echo "# HELP postfix_up Postfix service status (1=running, 0=stopped)" echo "# TYPE postfix_up gauge" if postfix status &>/dev/null || systemctl is-active postfix &>/dev/null; then echo "postfix_up{hostname=\"${HOSTNAME}\"} 1" else echo "postfix_up{hostname=\"${HOSTNAME}\"} 0" fi # Queue age distribution (messages by age bucket) echo "# HELP postfix_queue_age_bucket Messages in deferred queue by age" echo "# TYPE postfix_queue_age_bucket gauge" now=$(date +%s) for mins in 5 15 60 360 1440; do count=$(find "${QUEUE_DIR}/deferred" -type f -mmin +${mins} 2>/dev/null | wc -l) echo "postfix_queue_age_bucket{le=\"${mins}m\",hostname=\"${HOSTNAME}\"} ${count}" done # Delivery attempts (retries) echo "# HELP postfix_delivery_attempts_total Delivery attempts by result" echo "# TYPE postfix_delivery_attempts_total counter" first_attempt=$(grep_count 'delay=.*delays=0/' "$LOG_FILE") retry_attempt=$(grep -c 'status=deferred.*will be retried' "$LOG_FILE" 2>/dev/null) || retry_attempt=0 echo "postfix_delivery_attempts_total{type=\"first\",hostname=\"${HOSTNAME}\"} ${first_attempt}" echo "postfix_delivery_attempts_total{type=\"retry\",hostname=\"${HOSTNAME}\"} ${retry_attempt}" # DSN status codes breakdown echo "# HELP postfix_dsn_total Delivery Status Notification codes" echo "# TYPE postfix_dsn_total counter" for dsn in "2.0.0" "4.7.1" "5.1.1" "5.1.2" "5.2.1" "5.2.2" "5.4.1" "5.7.1"; do count=$(grep_count "dsn=${dsn}" "$LOG_FILE") echo "postfix_dsn_total{code=\"${dsn}\",hostname=\"${HOSTNAME}\"} ${count}" done # Delay breakdown by phase echo "# HELP postfix_delay_phase_seconds_total Delay 
time by phase" echo "# TYPE postfix_delay_phase_seconds_total counter" grep -oP 'delays=\K[\d.]+/[\d.]+/[\d.]+/[\d.]+' "$LOG_FILE" 2>/dev/null | awk -F'/' '{ before_qmgr+=$1; in_qmgr+=$2; conn_setup+=$3; transmission+=$4 } END { print "before_qmgr " before_qmgr+0 print "in_qmgr " in_qmgr+0 print "conn_setup " conn_setup+0 print "transmission " transmission+0 }' | while read -r phase total; do echo "postfix_delay_phase_seconds_total{phase=\"${phase}\",hostname=\"${HOSTNAME}\"} ${total}" done # RBL rejections (per blocklist) echo "# HELP postfix_rbl_reject_total Rejections by RBL" echo "# TYPE postfix_rbl_reject_total counter" for rbl in "zen.spamhaus.org" "bl.spamcop.net" "b.barracudacentral.org" "dnsbl.sorbs.net"; do count=$(grep_count "${rbl}" "$LOG_FILE") echo "postfix_rbl_reject_total{rbl=\"${rbl}\",hostname=\"${HOSTNAME}\"} ${count}" done # Invalid HELO/EHLO attempts echo "# HELP postfix_helo_invalid_total Invalid HELO/EHLO attempts" echo "# TYPE postfix_helo_invalid_total counter" helo_invalid=$(grep_count 'Helo command rejected' "$LOG_FILE") echo "postfix_helo_invalid_total{hostname=\"${HOSTNAME}\"} ${helo_invalid}" # Anvil rate limiting echo "# HELP postfix_rate_limited_total Anvil rate limit events" echo "# TYPE postfix_rate_limited_total counter" rate_conn=$(grep_count 'anvil.*connection rate' "$LOG_FILE") rate_msg=$(grep_count 'anvil.*message rate' "$LOG_FILE") rate_rcpt=$(grep_count 'anvil.*recipient rate' "$LOG_FILE") echo "postfix_rate_limited_total{type=\"connection\",hostname=\"${HOSTNAME}\"} ${rate_conn}" echo "postfix_rate_limited_total{type=\"message\",hostname=\"${HOSTNAME}\"} ${rate_msg}" echo "postfix_rate_limited_total{type=\"recipient\",hostname=\"${HOSTNAME}\"} ${rate_rcpt}" # Milter/content filter rejections echo "# HELP postfix_milter_reject_total Milter rejection events" echo "# TYPE postfix_milter_reject_total counter" milter_reject=$(grep_count 'milter-reject' "$LOG_FILE") echo "postfix_milter_reject_total{hostname=\"${HOSTNAME}\"} 
${milter_reject}" # Header/body checks rejections echo "# HELP postfix_header_checks_reject_total Header/body check rejections" echo "# TYPE postfix_header_checks_reject_total counter" header_reject=$(grep_count 'header_checks:' "$LOG_FILE") body_reject=$(grep_count 'body_checks:' "$LOG_FILE") echo "postfix_header_checks_reject_total{type=\"header\",hostname=\"${HOSTNAME}\"} ${header_reject}" echo "postfix_header_checks_reject_total{type=\"body\",hostname=\"${HOSTNAME}\"} ${body_reject}" # Policy daemon deferrals echo "# HELP postfix_policyd_total Policy daemon events" echo "# TYPE postfix_policyd_total counter" policyd_defer=$(grep_count 'policy.*DEFER' "$LOG_FILE") policyd_reject=$(grep_count 'policy.*REJECT' "$LOG_FILE") echo "postfix_policyd_total{action=\"defer\",hostname=\"${HOSTNAME}\"} ${policyd_defer}" echo "postfix_policyd_total{action=\"reject\",hostname=\"${HOSTNAME}\"} ${policyd_reject}" # DKIM signing (if OpenDKIM is used) echo "# HELP postfix_dkim_total DKIM signing/verification results" echo "# TYPE postfix_dkim_total counter" dkim_signed=$(grep_count 'DKIM-Signature field added' "$LOG_FILE") dkim_pass=$(grep_count 'dkim=pass' "$LOG_FILE") dkim_fail=$(grep_count 'dkim=fail' "$LOG_FILE") echo "postfix_dkim_total{action=\"signed\",hostname=\"${HOSTNAME}\"} ${dkim_signed}" echo "postfix_dkim_total{result=\"pass\",hostname=\"${HOSTNAME}\"} ${dkim_pass}" echo "postfix_dkim_total{result=\"fail\",hostname=\"${HOSTNAME}\"} ${dkim_fail}" # SPF results echo "# HELP postfix_spf_total SPF check results" echo "# TYPE postfix_spf_total counter" for result in pass fail softfail neutral none permerror temperror; do count=$(grep_count -i "spf=${result}\|SPF: ${result}" "$LOG_FILE") echo "postfix_spf_total{result=\"${result}\",hostname=\"${HOSTNAME}\"} ${count}" done # DMARC results (if OpenDMARC is used) # OpenDMARC logs: "opendmarc[PID]: QUEUEID: domain.com pass/fail/none" echo "# HELP postfix_dmarc_total DMARC check results" echo "# TYPE postfix_dmarc_total 
counter" for result in pass fail none; do count=$(grep -cE "opendmarc\[.*\]: [A-F0-9]+: [^ ]+ ${result}$" "$LOG_FILE" 2>/dev/null) || count=0 echo "postfix_dmarc_total{result=\"${result}\",hostname=\"${HOSTNAME}\"} ${count}" done # Hourly volume (traffic patterns) echo "# HELP postfix_hourly_volume Messages processed per hour" echo "# TYPE postfix_hourly_volume gauge" current_date=$(date +%b" "%d) for hour in $(seq -w 0 23); do count=$(grep_count "^${current_date} ${hour}:" "$LOG_FILE" | grep -c 'status=sent' 2>/dev/null) || count=0 count=$(grep "^${current_date} ${hour}:" "$LOG_FILE" 2>/dev/null | grep -c 'status=sent') || count=0 echo "postfix_hourly_volume{hour=\"${hour}\",hostname=\"${HOSTNAME}\"} ${count}" done # Recent throughput (last 5/15/60 minutes) echo "# HELP postfix_messages_recent Messages sent in recent time windows" echo "# TYPE postfix_messages_recent gauge" for mins in 5 15 60; do since=$(date -d "${mins} minutes ago" '+%b %d %H:%M' 2>/dev/null) || since="" if [[ -n "$since" ]]; then count=$(awk -v since="$since" '$0 >= since && /status=sent/' "$LOG_FILE" 2>/dev/null | wc -l) else count=0 fi echo "postfix_messages_recent{window=\"${mins}m\",hostname=\"${HOSTNAME}\"} ${count}" done # Active SMTP sessions estimate echo "# HELP postfix_smtp_sessions_active Estimated active SMTP sessions" echo "# TYPE postfix_smtp_sessions_active gauge" smtp_procs=$(pgrep -c -x smtp 2>/dev/null) || smtp_procs=0 smtpd_procs=$(pgrep -c -x smtpd 2>/dev/null) || smtpd_procs=0 echo "postfix_smtp_sessions_active{type=\"outbound\",hostname=\"${HOSTNAME}\"} ${smtp_procs}" echo "postfix_smtp_sessions_active{type=\"inbound\",hostname=\"${HOSTNAME}\"} ${smtpd_procs}" # Qmgr active recipients echo "# HELP postfix_qmgr_recipients Active recipients in queue manager" echo "# TYPE postfix_qmgr_recipients gauge" active_recipients=$(find "${QUEUE_DIR}/active" -type f -exec cat {} \; 2>/dev/null | wc -l) || active_recipients=0 echo "postfix_qmgr_recipients{hostname=\"${HOSTNAME}\"} 
${active_recipients}" # Estimated queue memory usage (based on file sizes) echo "# HELP postfix_queue_size_bytes Total size of queue files in bytes" echo "# TYPE postfix_queue_size_bytes gauge" for queue in incoming active deferred hold; do size=$(du -sb "${QUEUE_DIR}/${queue}" 2>/dev/null | cut -f1) || size=0 echo "postfix_queue_size_bytes{queue=\"${queue}\",hostname=\"${HOSTNAME}\"} ${size}" done # Warnings and fatal errors echo "# HELP postfix_log_events_total Log events by severity" echo "# TYPE postfix_log_events_total counter" warnings=$(grep_count 'warning:' "$LOG_FILE") fatals=$(grep_count 'fatal:' "$LOG_FILE") panics=$(grep_count 'panic:' "$LOG_FILE") echo "postfix_log_events_total{level=\"warning\",hostname=\"${HOSTNAME}\"} ${warnings}" echo "postfix_log_events_total{level=\"fatal\",hostname=\"${HOSTNAME}\"} ${fatals}" echo "postfix_log_events_total{level=\"panic\",hostname=\"${HOSTNAME}\"} ${panics}" # SMTP response codes echo "# HELP postfix_smtp_response_total SMTP response codes" echo "# TYPE postfix_smtp_response_total counter" smtp_2xx=$(grep_count 'status=sent' "$LOG_FILE") smtp_4xx=$(grep_count 'status=deferred' "$LOG_FILE") smtp_5xx=$(grep_count 'status=bounced' "$LOG_FILE") echo "postfix_smtp_response_total{code=\"2xx\",hostname=\"${HOSTNAME}\"} ${smtp_2xx}" echo "postfix_smtp_response_total{code=\"4xx\",hostname=\"${HOSTNAME}\"} ${smtp_4xx}" echo "postfix_smtp_response_total{code=\"5xx\",hostname=\"${HOSTNAME}\"} ${smtp_5xx}" # Specific SMTP error codes (check multiple patterns) # Postfix logs SMTP errors in various formats: # - "said: 550 5.1.1 User unknown" # - "status=bounced (host ... 
said: 550 ...)" # - "dsn=5.1.1" (DSN codes start with same digit) # - Remote server responses with just the code echo "# HELP postfix_smtp_error_code_total Specific SMTP error codes" echo "# TYPE postfix_smtp_error_code_total counter" for code in 421 450 451 452 500 501 502 503 504 550 551 552 553 554; do # Multiple patterns: "said: 550", "(550 ", "smtp.*550", host responses count=$(grep -cE "(said: ${code}|said:${code}|\(${code} |host .*\[.*\].*${code} |smtp.*${code}[^0-9])" "$LOG_FILE" 2>/dev/null) || count=0 echo "postfix_smtp_error_code_total{code=\"${code}\",hostname=\"${HOSTNAME}\"} ${count}" done # TLS cipher suites (top 10) # Requires smtpd_tls_loglevel=1 and smtp_tls_loglevel=1 in main.cf # Postfix logs: "TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)" echo "# HELP postfix_tls_cipher_total TLS cipher suite usage" echo "# TYPE postfix_tls_cipher_total counter" cipher_output=$({ grep -oP 'with cipher \K[A-Za-z0-9_-]+' "$LOG_FILE" 2>/dev/null grep -oP 'cipher=\K[A-Za-z0-9_-]+' "$LOG_FILE" 2>/dev/null } | sort | uniq -c | sort -rn | head -10) if [[ -n "$cipher_output" ]]; then echo "$cipher_output" | while read -r count cipher; do [[ -n "$cipher" ]] && echo "postfix_tls_cipher_total{cipher=\"${cipher}\",hostname=\"${HOSTNAME}\"} ${count}" done else echo "postfix_tls_cipher_total{cipher=\"unknown\",hostname=\"${HOSTNAME}\"} 0" fi # TLS certificate expiry (check multiple locations) echo "# HELP postfix_cert_expiry_seconds Seconds until TLS certificate expires" echo "# TYPE postfix_cert_expiry_seconds gauge" CERT_FILE="" for cert in "/etc/ssl/certs/postfix.pem" \ "/home/user-data/ssl/ssl_certificate.pem" \ "/etc/letsencrypt/live/$(hostname)/fullchain.pem" \ "/etc/letsencrypt/live/$(hostname -f)/fullchain.pem" \ "/etc/ssl/certs/ssl-cert-snakeoil.pem"; do if [[ -f "$cert" ]]; then CERT_FILE="$cert" break fi done cert_seconds=0 if [[ -n "$CERT_FILE" ]] && command -v openssl &>/dev/null; then expiry=$(openssl x509 -enddate -noout -in "$CERT_FILE" 
2>/dev/null | cut -d= -f2) if [[ -n "$expiry" ]]; then expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null) || expiry_epoch=0 now=$(date +%s) cert_seconds=$((expiry_epoch - now)) fi fi echo "postfix_cert_expiry_seconds{hostname=\"${HOSTNAME}\"} ${cert_seconds}" # LMTP delivery metrics (Postfix side) # Matches: "postfix/lmtp[PID]: ... status=sent" echo "# HELP postfix_lmtp_delivery_total LMTP delivery stats" echo "# TYPE postfix_lmtp_delivery_total counter" lmtp_sent=$(grep_count 'postfix/lmtp\[.*status=sent' "$LOG_FILE") lmtp_deferred=$(grep_count 'postfix/lmtp\[.*status=deferred' "$LOG_FILE") lmtp_bounced=$(grep_count 'postfix/lmtp\[.*status=bounced' "$LOG_FILE") echo "postfix_lmtp_delivery_total{status=\"sent\",hostname=\"${HOSTNAME}\"} ${lmtp_sent}" echo "postfix_lmtp_delivery_total{status=\"deferred\",hostname=\"${HOSTNAME}\"} ${lmtp_deferred}" echo "postfix_lmtp_delivery_total{status=\"bounced\",hostname=\"${HOSTNAME}\"} ${lmtp_bounced}" echo "# HELP postfix_lmtp_connections_total LMTP connection events" echo "# TYPE postfix_lmtp_connections_total counter" lmtp_connect=$(grep_count 'postfix/lmtp\[.*connect' "$LOG_FILE") lmtp_disconnect=$(grep_count 'postfix/lmtp\[.*disconnect' "$LOG_FILE") lmtp_timeout=$(grep_count 'postfix/lmtp\[.*timeout' "$LOG_FILE") lmtp_refused=$(grep_count 'postfix/lmtp\[.*Connection refused' "$LOG_FILE") echo "postfix_lmtp_connections_total{type=\"connect\",hostname=\"${HOSTNAME}\"} ${lmtp_connect}" echo "postfix_lmtp_connections_total{type=\"disconnect\",hostname=\"${HOSTNAME}\"} ${lmtp_disconnect}" echo "postfix_lmtp_connections_total{type=\"timeout\",hostname=\"${HOSTNAME}\"} ${lmtp_timeout}" echo "postfix_lmtp_connections_total{type=\"refused\",hostname=\"${HOSTNAME}\"} ${lmtp_refused}" echo "# HELP postfix_lmtp_delay_seconds LMTP delivery delay stats" echo "# TYPE postfix_lmtp_delay_seconds gauge" lmtp_avg_delay=$(grep 'postfix/lmtp\[' "$LOG_FILE" 2>/dev/null | grep -oP 'delay=\K[\d.]+' | awk '{sum+=$1; count++} END {if(count>0) 
printf "%.2f", sum/count; else print 0}') lmtp_max_delay=$(grep 'postfix/lmtp\[' "$LOG_FILE" 2>/dev/null | grep -oP 'delay=\K[\d.]+' | sort -rn | head -1) echo "postfix_lmtp_delay_seconds{stat=\"avg\",hostname=\"${HOSTNAME}\"} ${lmtp_avg_delay}" echo "postfix_lmtp_delay_seconds{stat=\"max\",hostname=\"${HOSTNAME}\"} ${lmtp_max_delay:-0}" # Dovecot LMTP/LDA delivery stats (check multiple log locations) DOVECOT_LOG="" for log in "/var/log/dovecot.log" "/var/log/mail.log" "/var/log/syslog"; do if [[ -f "$log" ]] && grep -q 'dovecot' "$log" 2>/dev/null; then DOVECOT_LOG="$log" break fi done if [[ -n "$DOVECOT_LOG" ]]; then echo "# HELP postfix_dovecot_delivery_total Dovecot local delivery stats" echo "# TYPE postfix_dovecot_delivery_total counter" lmtp_delivered=$(grep_count 'lmtp.*saved mail' "$DOVECOT_LOG") lda_delivered=$(grep_count 'lda.*saved mail' "$DOVECOT_LOG") echo "postfix_dovecot_delivery_total{type=\"lmtp\",hostname=\"${HOSTNAME}\"} ${lmtp_delivered}" echo "postfix_dovecot_delivery_total{type=\"lda\",hostname=\"${HOSTNAME}\"} ${lda_delivered}" echo "# HELP postfix_dovecot_sieve_total Dovecot sieve filter actions" echo "# TYPE postfix_dovecot_sieve_total counter" sieve_fileinto=$(grep_count 'sieve.*fileinto' "$DOVECOT_LOG") sieve_discard=$(grep_count 'sieve.*discard' "$DOVECOT_LOG") sieve_redirect=$(grep_count 'sieve.*redirect' "$DOVECOT_LOG") echo "postfix_dovecot_sieve_total{action=\"fileinto\",hostname=\"${HOSTNAME}\"} ${sieve_fileinto}" echo "postfix_dovecot_sieve_total{action=\"discard\",hostname=\"${HOSTNAME}\"} ${sieve_discard}" echo "postfix_dovecot_sieve_total{action=\"redirect\",hostname=\"${HOSTNAME}\"} ${sieve_redirect}" echo "# HELP postfix_dovecot_auth_total Dovecot authentication attempts" echo "# TYPE postfix_dovecot_auth_total counter" auth_success=$(grep_count 'auth.*successful' "$DOVECOT_LOG") auth_fail=$(grep_count 'auth.*failed' "$DOVECOT_LOG") echo "postfix_dovecot_auth_total{result=\"success\",hostname=\"${HOSTNAME}\"} ${auth_success}" 
echo "postfix_dovecot_auth_total{result=\"failed\",hostname=\"${HOSTNAME}\"} ${auth_fail}" echo "# HELP postfix_dovecot_imap_connections_total Dovecot IMAP connections" echo "# TYPE postfix_dovecot_imap_connections_total counter" imap_login=$(grep_count 'imap-login:.*Login' "$DOVECOT_LOG") imap_disconnect=$(grep_count 'imap.*Disconnected' "$DOVECOT_LOG") echo "postfix_dovecot_imap_connections_total{type=\"login\",hostname=\"${HOSTNAME}\"} ${imap_login}" echo "postfix_dovecot_imap_connections_total{type=\"disconnect\",hostname=\"${HOSTNAME}\"} ${imap_disconnect}" echo "# HELP postfix_dovecot_pop3_connections_total Dovecot POP3 connections" echo "# TYPE postfix_dovecot_pop3_connections_total counter" pop3_login=$(grep_count 'pop3-login:.*Login' "$DOVECOT_LOG") pop3_disconnect=$(grep_count 'pop3.*Disconnected' "$DOVECOT_LOG") echo "postfix_dovecot_pop3_connections_total{type=\"login\",hostname=\"${HOSTNAME}\"} ${pop3_login}" echo "postfix_dovecot_pop3_connections_total{type=\"disconnect\",hostname=\"${HOSTNAME}\"} ${pop3_disconnect}" fi # SpamAssassin metrics (supports spamd, spampd, and amavis) SPAM_LOG="/var/log/mail.log" # Detect which spam daemon is in use (check spampd first as it's more specific) if grep -q 'spampd' "$SPAM_LOG" 2>/dev/null; then SPAM_DAEMON="spampd" elif grep -q 'spamd\[' "$SPAM_LOG" 2>/dev/null; then SPAM_DAEMON="spamd" elif grep -q 'amavis' "$SPAM_LOG" 2>/dev/null; then SPAM_DAEMON="amavis" else SPAM_DAEMON="" fi if [[ -n "$SPAM_DAEMON" ]]; then echo "# HELP postfix_spamassassin_total SpamAssassin scan results" echo "# TYPE postfix_spamassassin_total counter" if [[ "$SPAM_DAEMON" == "spampd" ]]; then # spampd format: "clean message <...> (SCORE/THRESHOLD)" or "identified spam <...> (SCORE/THRESHOLD)" spam_identified=$(grep_count 'spampd.*identified spam' "$SPAM_LOG") ham_clean=$(grep_count 'spampd.*clean message' "$SPAM_LOG") elif [[ "$SPAM_DAEMON" == "amavis" ]]; then spam_identified=$(grep_count 'amavis.*Blocked SPAM' "$SPAM_LOG") 
ham_clean=$(grep_count 'amavis.*Passed CLEAN' "$SPAM_LOG") else spam_identified=$(grep_count 'spamd.*identified spam' "$SPAM_LOG") ham_clean=$(grep_count 'spamd.*clean message' "$SPAM_LOG") fi echo "postfix_spamassassin_total{result=\"spam\",hostname=\"${HOSTNAME}\"} ${spam_identified}" echo "postfix_spamassassin_total{result=\"ham\",hostname=\"${HOSTNAME}\"} ${ham_clean}" echo "# HELP postfix_spamassassin_score_total SpamAssassin score distribution" echo "# TYPE postfix_spamassassin_score_total counter" if [[ "$SPAM_DAEMON" == "spampd" ]]; then # spampd format: (SCORE/THRESHOLD) like (-0.30/5.00) or (15.2/5.0) score_neg=$(grep -oP 'spampd.*\(\K-[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | wc -l) score_0_5=$(grep -oP 'spampd.*\(\K-?[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 0 && $1 < 5 {count++} END {print count+0}') score_5_10=$(grep -oP 'spampd.*\(\K-?[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 5 && $1 < 10 {count++} END {print count+0}') score_10_plus=$(grep -oP 'spampd.*\(\K-?[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 10 {count++} END {print count+0}') elif [[ "$SPAM_DAEMON" == "amavis" ]]; then score_neg=$(grep -oP 'amavis.*Hits: \K-[\d.]+' "$SPAM_LOG" 2>/dev/null | wc -l) score_0_5=$(grep -oP 'amavis.*Hits: \K-?[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 0 && $1 < 5 {count++} END {print count+0}') score_5_10=$(grep -oP 'amavis.*Hits: \K-?[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 5 && $1 < 10 {count++} END {print count+0}') score_10_plus=$(grep -oP 'amavis.*Hits: \K-?[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 10 {count++} END {print count+0}') else score_neg=0 score_0_5=$(grep -oP 'spamd.*score=\K[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 0 && $1 < 5 {count++} END {print count+0}') score_5_10=$(grep -oP 'spamd.*score=\K[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 5 && $1 < 10 {count++} END {print count+0}') score_10_plus=$(grep -oP 'spamd.*score=\K[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '$1 >= 10 {count++} END {print count+0}') fi echo 
"postfix_spamassassin_score_total{bucket=\"negative\",hostname=\"${HOSTNAME}\"} ${score_neg:-0}" echo "postfix_spamassassin_score_total{bucket=\"0-5\",hostname=\"${HOSTNAME}\"} ${score_0_5}" echo "postfix_spamassassin_score_total{bucket=\"5-10\",hostname=\"${HOSTNAME}\"} ${score_5_10}" echo "postfix_spamassassin_score_total{bucket=\"10+\",hostname=\"${HOSTNAME}\"} ${score_10_plus}" echo "# HELP postfix_spamassassin_score_avg Average SpamAssassin score" echo "# TYPE postfix_spamassassin_score_avg gauge" if [[ "$SPAM_DAEMON" == "spampd" ]]; then avg_score=$(grep -oP 'spampd.*\(\K-?[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') elif [[ "$SPAM_DAEMON" == "amavis" ]]; then avg_score=$(grep -oP 'amavis.*Hits: \K-?[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') else avg_score=$(grep -oP 'spamd.*score=\K[\d.]+' "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') fi echo "postfix_spamassassin_score_avg{hostname=\"${HOSTNAME}\"} ${avg_score}" echo "# HELP postfix_spamassassin_score_max Maximum SpamAssassin score seen" echo "# TYPE postfix_spamassassin_score_max gauge" if [[ "$SPAM_DAEMON" == "spampd" ]]; then max_score=$(grep -oP 'spampd.*\(\K-?[\d.]+(?=/)' "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) elif [[ "$SPAM_DAEMON" == "amavis" ]]; then max_score=$(grep -oP 'amavis.*Hits: \K-?[\d.]+' "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) else max_score=$(grep -oP 'spamd.*score=\K[\d.]+' "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) fi echo "postfix_spamassassin_score_max{hostname=\"${HOSTNAME}\"} ${max_score:-0}" # Messages scanned total echo "# HELP postfix_spamassassin_scanned_total Total messages scanned" echo "# TYPE postfix_spamassassin_scanned_total counter" scanned_total=$((spam_identified + ham_clean)) echo "postfix_spamassassin_scanned_total{hostname=\"${HOSTNAME}\"} 
${scanned_total}" echo "# HELP postfix_spamassassin_scan_time_seconds SpamAssassin scan time stats" echo "# TYPE postfix_spamassassin_scan_time_seconds gauge" if [[ "$SPAM_DAEMON" == "spampd" ]]; then # spampd format: "in 2.15s" avg_time=$(grep -oP 'spampd.* in \K[\d.]+(?=s)' "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') max_time=$(grep -oP 'spampd.* in \K[\d.]+(?=s)' "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) else avg_time=$(grep -oP "${SPAM_DAEMON}.* in \K[\d.]+(?= seconds)" "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.2f", sum/count; else print 0}') max_time=$(grep -oP "${SPAM_DAEMON}.* in \K[\d.]+(?= seconds)" "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) fi echo "postfix_spamassassin_scan_time_seconds{stat=\"avg\",hostname=\"${HOSTNAME}\"} ${avg_time:-0}" echo "postfix_spamassassin_scan_time_seconds{stat=\"max\",hostname=\"${HOSTNAME}\"} ${max_time:-0}" # spampd-specific: message size stats if [[ "$SPAM_DAEMON" == "spampd" ]]; then echo "# HELP postfix_spamassassin_message_size_bytes SpamAssassin processed message sizes" echo "# TYPE postfix_spamassassin_message_size_bytes gauge" avg_size=$(grep -oP 'spampd.*, \K\d+(?= bytes)' "$SPAM_LOG" 2>/dev/null | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}') max_size=$(grep -oP 'spampd.*, \K\d+(?= bytes)' "$SPAM_LOG" 2>/dev/null | sort -rn | head -1) echo "postfix_spamassassin_message_size_bytes{stat=\"avg\",hostname=\"${HOSTNAME}\"} ${avg_size:-0}" echo "postfix_spamassassin_message_size_bytes{stat=\"max\",hostname=\"${HOSTNAME}\"} ${max_size:-0}" echo "# HELP postfix_spamassassin_threshold SpamAssassin spam threshold" echo "# TYPE postfix_spamassassin_threshold gauge" threshold=$(grep -oP 'spampd.*/-?\K[\d.]+(?=\))' "$SPAM_LOG" 2>/dev/null | head -1) echo "postfix_spamassassin_threshold{hostname=\"${HOSTNAME}\"} ${threshold:-5}" fi # SpamAssassin rules (only available with spamd or if 
logging to separate file) # NOTE: spampd (used by Mail-in-a-Box) does NOT log individual rules to mail.log # Rules are only available if using standalone spamd with verbose logging or a separate log file SA_RULES_LOG="" for log in "/var/log/spamassassin.log" "/var/log/spamd.log" "$SPAM_LOG"; do if [[ -f "$log" ]] && grep -q 'tests=' "$log" 2>/dev/null; then SA_RULES_LOG="$log" break fi done if [[ -n "$SA_RULES_LOG" ]]; then echo "# HELP postfix_spamassassin_rules_total Top SpamAssassin rules triggered" echo "# TYPE postfix_spamassassin_rules_total counter" grep -oP 'tests=\K[^,\]\s]+' "$SA_RULES_LOG" 2>/dev/null | tr ',' '\n' | tr -d ' ' | sort | uniq -c | sort -rn | head -15 | while read -r count rule; do [[ -n "$rule" ]] && echo "postfix_spamassassin_rules_total{rule=\"${rule}\",hostname=\"${HOSTNAME}\"} ${count}" done fi # Daemon status echo "# HELP postfix_spamassassin_up SpamAssassin daemon status" echo "# TYPE postfix_spamassassin_up gauge" if pgrep -f "${SPAM_DAEMON}" &>/dev/null; then echo "postfix_spamassassin_up{daemon=\"${SPAM_DAEMON}\",hostname=\"${HOSTNAME}\"} 1" else echo "postfix_spamassassin_up{daemon=\"${SPAM_DAEMON}\",hostname=\"${HOSTNAME}\"} 0" fi echo "# HELP postfix_spamassassin_processes Number of spam daemon processes" echo "# TYPE postfix_spamassassin_processes gauge" spam_procs=$(pgrep -c -f "${SPAM_DAEMON}" 2>/dev/null) || spam_procs=0 echo "postfix_spamassassin_processes{daemon=\"${SPAM_DAEMON}\",hostname=\"${HOSTNAME}\"} ${spam_procs}" fi # Greylisting stats (postgrey) echo "# HELP postfix_greylist_total Greylisting events" echo "# TYPE postfix_greylist_total counter" greylist_defer=$(grep_count 'action=greylist' "$LOG_FILE") greylist_pass=$(grep_count 'action=pass.*reason=triplet' "$LOG_FILE") greylist_whitelist=$(grep_count 'action=pass.*reason=client whitelist\|action=pass, reason=client AWL' "$LOG_FILE") echo "postfix_greylist_total{action=\"defer\",hostname=\"${HOSTNAME}\"} ${greylist_defer}" echo 
"postfix_greylist_total{action=\"pass\",hostname=\"${HOSTNAME}\"} ${greylist_pass}" echo "postfix_greylist_total{action=\"whitelist\",hostname=\"${HOSTNAME}\"} ${greylist_whitelist}" echo "# HELP postfix_greylist_reason_total Greylisting by reason" echo "# TYPE postfix_greylist_reason_total counter" grey_new=$(grep_count 'reason=new' "$LOG_FILE") grey_early=$(grep_count 'reason=early-retry' "$LOG_FILE") grey_triplet=$(grep_count 'reason=triplet found' "$LOG_FILE") echo "postfix_greylist_reason_total{reason=\"new\",hostname=\"${HOSTNAME}\"} ${grey_new}" echo "postfix_greylist_reason_total{reason=\"early_retry\",hostname=\"${HOSTNAME}\"} ${grey_early}" echo "postfix_greylist_reason_total{reason=\"triplet_found\",hostname=\"${HOSTNAME}\"} ${grey_triplet}" echo "# HELP postfix_greylist_delay_seconds Greylist delay statistics" echo "# TYPE postfix_greylist_delay_seconds gauge" avg_delay=$(grep -oP 'delay=\K\d+' "$LOG_FILE" 2>/dev/null | grep -v '^0$' | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}') max_delay=$(grep -oP 'postgrey.*delay=\K\d+' "$LOG_FILE" 2>/dev/null | sort -rn | head -1) echo "postfix_greylist_delay_seconds{type=\"avg\",hostname=\"${HOSTNAME}\"} ${avg_delay:-0}" echo "postfix_greylist_delay_seconds{type=\"max\",hostname=\"${HOSTNAME}\"} ${max_delay:-0}" echo "# HELP postfix_greylist_clients_total Unique greylisted client IPs" echo "# TYPE postfix_greylist_clients_total gauge" grey_clients=$(grep 'action=greylist' "$LOG_FILE" 2>/dev/null | grep -oP 'client_address=\K[^,]+' | sort -u | wc -l) echo "postfix_greylist_clients_total{hostname=\"${HOSTNAME}\"} ${grey_clients:-0}" echo "# HELP postfix_greylist_top_senders Top greylisted sender domains" echo "# TYPE postfix_greylist_top_senders counter" grep 'action=greylist' "$LOG_FILE" 2>/dev/null | grep -oP 'sender=\K[^,]+' | sed 's/.*@//' | sort | uniq -c | sort -rn | head -10 | while read -r count domain; do [[ -n "$domain" ]] && echo 
"postfix_greylist_top_senders{domain=\"${domain}\",hostname=\"${HOSTNAME}\"} ${count}" done # Cleanup daemon stats (total messages entering system) echo "# HELP postfix_cleanup_total Messages processed by cleanup daemon" echo "# TYPE postfix_cleanup_total counter" cleanup_count=$(grep_count 'message-id=' "$LOG_FILE") echo "postfix_cleanup_total{hostname=\"${HOSTNAME}\"} ${cleanup_count}" # Virtual mailbox errors echo "# HELP postfix_virtual_errors_total Virtual mailbox lookup errors" echo "# TYPE postfix_virtual_errors_total counter" virtual_not_found=$(grep_count 'mailbox not found\|User unknown in virtual' "$LOG_FILE") echo "postfix_virtual_errors_total{hostname=\"${HOSTNAME}\"} ${virtual_not_found}" # Address verification failures echo "# HELP postfix_address_verify_total Address verification events" echo "# TYPE postfix_address_verify_total counter" verify_fail=$(grep_count 'address verification failed' "$LOG_FILE") verify_success=$(grep_count 'address verification succeeded\|cache hit' "$LOG_FILE") echo "postfix_address_verify_total{result=\"failed\",hostname=\"${HOSTNAME}\"} ${verify_fail}" echo "postfix_address_verify_total{result=\"success\",hostname=\"${HOSTNAME}\"} ${verify_success}" # Postfix master process uptime (based on pid file age) echo "# HELP postfix_master_uptime_seconds Postfix master process uptime" echo "# TYPE postfix_master_uptime_seconds gauge" MASTER_PID_FILE="/var/spool/postfix/pid/master.pid" if [[ -f "$MASTER_PID_FILE" ]]; then master_start=$(stat -c %Y "$MASTER_PID_FILE" 2>/dev/null) || master_start=0 if [[ $master_start -gt 0 ]]; then uptime_seconds=$(($(date +%s) - master_start)) else uptime_seconds=0 fi else uptime_seconds=0 fi echo "postfix_master_uptime_seconds{hostname=\"${HOSTNAME}\"} ${uptime_seconds}" # DNS lookup failures echo "# HELP postfix_dns_errors_total DNS lookup errors" echo "# TYPE postfix_dns_errors_total counter" dns_not_found=$(grep_count 'Host not found\|Name service error\|Host or domain name not found' 
"$LOG_FILE") dns_timeout=$(grep_count 'DNS lookup.*timeout\|name server.*timeout' "$LOG_FILE") dns_servfail=$(grep_count 'SERVFAIL\|server failure' "$LOG_FILE") echo "postfix_dns_errors_total{type=\"not_found\",hostname=\"${HOSTNAME}\"} ${dns_not_found}" echo "postfix_dns_errors_total{type=\"timeout\",hostname=\"${HOSTNAME}\"} ${dns_timeout}" echo "postfix_dns_errors_total{type=\"servfail\",hostname=\"${HOSTNAME}\"} ${dns_servfail}" # STARTTLS usage - count TLS connections vs total SMTP connections # "used" = successful TLS connections (inbound + outbound) # "total" = total SMTP connections for ratio calculation echo "# HELP postfix_starttls_total STARTTLS connection counts" echo "# TYPE postfix_starttls_total counter" starttls_inbound=$(grep_count 'TLS connection established from' "$LOG_FILE") starttls_outbound=$(grep_count 'TLS connection established to' "$LOG_FILE") echo "postfix_starttls_total{type=\"inbound\",hostname=\"${HOSTNAME}\"} ${starttls_inbound}" echo "postfix_starttls_total{type=\"outbound\",hostname=\"${HOSTNAME}\"} ${starttls_outbound}" # Sender/recipient access rejections echo "# HELP postfix_access_reject_total Sender/recipient access rejections" echo "# TYPE postfix_access_reject_total counter" sender_reject=$(grep_count 'Sender address rejected' "$LOG_FILE") recipient_reject=$(grep_count 'Recipient address rejected' "$LOG_FILE") client_reject=$(grep_count 'Client host rejected' "$LOG_FILE") echo "postfix_access_reject_total{type=\"sender\",hostname=\"${HOSTNAME}\"} ${sender_reject}" echo "postfix_access_reject_total{type=\"recipient\",hostname=\"${HOSTNAME}\"} ${recipient_reject}" echo "postfix_access_reject_total{type=\"client\",hostname=\"${HOSTNAME}\"} ${client_reject}" # Queue filesystem usage echo "# HELP postfix_queue_filesystem_usage_percent Queue filesystem usage percentage" echo "# TYPE postfix_queue_filesystem_usage_percent gauge" queue_usage=$(df "${QUEUE_DIR}" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}') || queue_usage=0 echo 
"postfix_queue_filesystem_usage_percent{hostname=\"${HOSTNAME}\"} ${queue_usage:-0}" # Postfix file descriptor count (for master process) echo "# HELP postfix_file_descriptors Open file descriptors by postfix" echo "# TYPE postfix_file_descriptors gauge" if [[ -f "$MASTER_PID_FILE" ]]; then master_pid=$(tr -d '[:space:]' < "$MASTER_PID_FILE" 2>/dev/null) if [[ -n "$master_pid" ]] && [[ -d "/proc/${master_pid}/fd" ]]; then fd_count=$(find "/proc/${master_pid}/fd" -maxdepth 1 2>/dev/null | wc -l) else fd_count=0 fi else fd_count=0 fi echo "postfix_file_descriptors{hostname=\"${HOSTNAME}\"} ${fd_count}" # Script execution time # Dovecot IMAP/POP3 login metrics echo "# HELP dovecot_logins_total Successful logins by protocol" echo "# TYPE dovecot_logins_total counter" imap_logins=$(grep_count 'imap-login: Info: Login:' "$LOG_FILE") pop3_logins=$(grep_count 'pop3-login: Info: Login:' "$LOG_FILE") echo "dovecot_logins_total{protocol=\"imap\",hostname=\"${HOSTNAME}\"} ${imap_logins}" echo "dovecot_logins_total{protocol=\"pop3\",hostname=\"${HOSTNAME}\"} ${pop3_logins}" echo "# HELP dovecot_login_auth_method_total Logins by authentication method" echo "# TYPE dovecot_login_auth_method_total counter" for method in PLAIN LOGIN CRAM-MD5 DIGEST-MD5; do count=$(grep_count "Login:.*method=${method}" "$LOG_FILE") echo "dovecot_login_auth_method_total{method=\"${method}\",hostname=\"${HOSTNAME}\"} ${count}" done echo "# HELP dovecot_login_tls_total Logins with/without TLS" echo "# TYPE dovecot_login_tls_total counter" tls_logins=$(grep -c 'Login:.*TLS' "$LOG_FILE" 2>/dev/null) || tls_logins=0 notls_logins=$(grep 'Login:' "$LOG_FILE" 2>/dev/null | grep -cv 'TLS') || notls_logins=0 echo "dovecot_login_tls_total{tls=\"yes\",hostname=\"${HOSTNAME}\"} ${tls_logins}" echo "dovecot_login_tls_total{tls=\"no\",hostname=\"${HOSTNAME}\"} ${notls_logins}" echo "# HELP dovecot_login_failed_total Failed login attempts" echo "# TYPE dovecot_login_failed_total counter" imap_failed=$(grep_count 
'imap-login: Info: Aborted login\|imap-login:.*auth failed' "$LOG_FILE")
    # (Tail of generate_metrics: statements unchanged, one per line.)
    pop3_failed=$(grep_count 'pop3-login: Info: Aborted login\|pop3-login:.*auth failed' "$LOG_FILE")
    echo "dovecot_login_failed_total{protocol=\"imap\",hostname=\"${HOSTNAME}\"} ${imap_failed}"
    echo "dovecot_login_failed_total{protocol=\"pop3\",hostname=\"${HOSTNAME}\"} ${pop3_failed}"

    echo "# HELP dovecot_login_user_total Logins per user (top 20)"
    echo "# TYPE dovecot_login_user_total counter"
    grep -oP 'Login: user=<\K[^>]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -20 | while read -r count user; do
        echo "dovecot_login_user_total{user=\"${user}\",hostname=\"${HOSTNAME}\"} ${count}"
    done

    echo "# HELP dovecot_login_client_ip_total Logins per client IP (top 20)"
    echo "# TYPE dovecot_login_client_ip_total counter"
    grep -oP 'Login:.*rip=\K[^,]+' "$LOG_FILE" 2>/dev/null | sort | uniq -c | sort -rn | head -20 | while read -r count ip; do
        echo "dovecot_login_client_ip_total{client_ip=\"${ip}\",hostname=\"${HOSTNAME}\"} ${count}"
    done

    local END_TIME
    END_TIME=$(date +%s.%N)
    local DURATION
    DURATION=$(echo "$END_TIME - $START_TIME" | bc)

    echo "# HELP postfix_collector_duration_seconds Time taken to collect metrics"
    echo "# TYPE postfix_collector_duration_seconds gauge"
    echo "postfix_collector_duration_seconds{hostname=\"${HOSTNAME}\"} ${DURATION}"

    echo "# HELP postfix_collector_last_run_timestamp Unix timestamp of last collection"
    echo "# TYPE postfix_collector_last_run_timestamp gauge"
    echo "postfix_collector_last_run_timestamp{hostname=\"${HOSTNAME}\"} $(date +%s)"
}

# ============================================================================
# HTTP SERVER MODE
# ============================================================================

#######################################
# Serve metrics over HTTP using netcat, one connection at a time.
# A request line starting with "GET /metrics" is answered with the output of
# generate_metrics; every other request gets a small HTML index page.
# Globals:   HTTP_PORT (read)
# Outputs:   status and error messages on stderr; HTTP responses via nc
# Returns:   loops forever; exits 1 if nc is missing or the FIFO cannot be
#            created
#######################################
run_http_server() {
    echo "Starting Postfix metrics exporter on port $HTTP_PORT..." >&2

    if ! command -v nc >/dev/null 2>&1; then
        echo "ERROR: netcat (nc) required for HTTP mode" >&2
        exit 1
    fi

    # The handler runs on the left side of the pipe into nc, so its stdin is
    # NOT the socket. nc's stdout is looped back through a FIFO so that
    # `read` actually sees the client's request line.
    local fifo_dir request_fifo request
    if ! fifo_dir=$(mktemp -d); then
        echo "ERROR: Cannot create temporary directory for request FIFO" >&2
        exit 1
    fi
    request_fifo="${fifo_dir}/request"
    if ! mkfifo "$request_fifo"; then
        rm -rf -- "$fifo_dir"
        echo "ERROR: Cannot create request FIFO" >&2
        exit 1
    fi
    # Path is expanded now (not at trap time) because the locals are gone
    # once the function's scope is left.
    trap "rm -rf -- $(printf '%q' "$fifo_dir")" EXIT

    while true; do
        {
            request=""
            read -r request
            if [[ "$request" =~ ^GET\ /metrics ]]; then
                printf 'HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nConnection: close\r\n\r\n'
                # Keep collectors from consuming leftover request bytes.
                generate_metrics < /dev/null
            else
                printf 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nConnection: close\r\n\r\n'
                # NOTE(review): the original page markup was lost; this is
                # rebuilt around the text that survived — confirm contents.
                cat <<'EOF'
<html>
<head><title>Postfix Metrics Exporter</title></head>
<body>
<h1>Postfix Prometheus Exporter</h1>
<p><a href="/metrics">Metrics</a></p>
<h2>Available Metrics</h2>
</body>
</html>
EOF
            fi
        } < "$request_fifo" | nc -l -p "$HTTP_PORT" -q 1 > "$request_fifo" 2>/dev/null \
            || sleep 1  # nc failed (e.g. port busy): avoid a busy loop
    done
}

# ============================================================================
# MAIN EXECUTION
# ============================================================================

#######################################
# Entry point: parse arguments and run in one of three modes.
#   HTTP_MODE is "true"      -> run_http_server
#   OUTPUT_FILE is non-empty -> write metrics to that file via temp + rename
#   otherwise                -> print metrics to stdout
# Globals:   HTTP_MODE, OUTPUT_FILE (read; expected to be set by parse_args)
# Arguments: the script's command-line arguments
# Outputs:   metrics on stdout or in OUTPUT_FILE; diagnostics on stderr
# Returns:   exits 1 on any failure in textfile mode
#######################################
main() {
    parse_args "$@"

    if [[ "$HTTP_MODE" == true ]]; then
        run_http_server
    elif [[ -n "$OUTPUT_FILE" ]]; then
        # Textfile collector mode: write atomically using a temp file
        local output_dir
        output_dir="$(dirname -- "$OUTPUT_FILE")"
        if ! mkdir -p -- "$output_dir"; then
            echo "ERROR: Cannot create output directory $output_dir" >&2
            exit 1
        fi

        # Temp file lives in the SAME directory so the final mv is a rename
        # on one filesystem.
        local temp_file
        if ! temp_file=$(mktemp "${output_dir}/.postfix_metrics.XXXXXX"); then
            echo "ERROR: Cannot create temp file in $output_dir" >&2
            exit 1
        fi
        # Remove the temp file on every exit path until it has been renamed.
        trap "rm -f -- $(printf '%q' "$temp_file")" EXIT

        if ! generate_metrics > "$temp_file" 2>/dev/null; then
            echo "ERROR: Failed to generate metrics" >&2
            exit 1
        fi

        # Validate: refuse to replace the previous file with a near-empty one
        local file_lines
        file_lines=$(wc -l < "$temp_file" 2>/dev/null || echo 0)
        if (( file_lines < 10 )); then
            echo "ERROR: Metrics file too small ($file_lines lines), keeping previous" >&2
            exit 1
        fi

        # mktemp creates the file 0600; open it up before it becomes visible
        chmod 644 "$temp_file"

        # Atomic rename - no gap where the file is missing
        if ! mv -f -- "$temp_file" "$OUTPUT_FILE"; then
            echo "ERROR: Cannot move metrics into place at $OUTPUT_FILE" >&2
            exit 1
        fi
        trap - EXIT

        echo "Metrics written to $OUTPUT_FILE ($file_lines lines)" >&2
    else
        # Default: output to stdout
        generate_metrics
    fi
}

# Execute main function with all script arguments
main "$@"