From ec6a341f3e65281209b08bc27d0140f6969a1269 Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 12 Apr 2026 15:36:54 +0800
Subject: [PATCH] =?UTF-8?q?feat(m5):=20ADR-074=20M5=20=E2=80=94=20Docker?=
 =?UTF-8?q?=20188=20/=20Redis=20Streams=20/=20PostgreSQL=20=E7=A3=81?=
 =?UTF-8?q?=E7=A2=9F=E5=91=8A=E8=AD=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

新增 awoooi_infrastructure_detailed 告警群組:
- DockerContainerUnhealthyDetailed: 188 容器無活動 > 2min
- RedisStreamBacklogHigh: Stream 積壓 > 500 筆
- PostgreSQLDiskGrowthRate: 磁碟 1h 增長 > 500MB

2026-04-12 ogt (ADR-074 M5)

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (this section is ignored by git am):
- Line breaks and YAML indentation were reconstructed from a
  whitespace-collapsed copy of this patch; verify the context lines against
  the target file, or apply with `git apply --ignore-whitespace`.
- The post-image blob id on the "index" line predates the rule fixes below.
- The e-mail addresses after From: and Co-Authored-By: appear to be missing.

 k8s/monitoring/flywheel-alerts.yaml | 62 +++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/k8s/monitoring/flywheel-alerts.yaml b/k8s/monitoring/flywheel-alerts.yaml
index e7f854e7..787afe18 100644
--- a/k8s/monitoring/flywheel-alerts.yaml
+++ b/k8s/monitoring/flywheel-alerts.yaml
@@ -117,6 +117,68 @@ spec:
             description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
             runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
 
+    - name: awoooi_infrastructure_detailed
+      interval: 60s
+      rules:
+
+        # P1: a container on Docker host 192.168.0.188 has stopped reporting.
+        # container_last_seen comes from cAdvisor and is only a timestamp, so
+        # this catches vanished containers, not Docker "unhealthy" status.
+        # Do not test "count(...) == 0": count() yields no series instead of 0
+        # when nothing matches, so such a clause can never fire.
+        # NOTE(review): with "for: 5m" the series must stay exported ~7 min
+        # after the container disappears; confirm cAdvisor retains it that long.
+        - alert: DockerContainerUnhealthyDetailed
+          expr: |
+            time() - container_last_seen{instance=~"192.168.0.188.*", name!=""} > 120
+          for: 5m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "188 主機容器 {{ $labels.name }} 異常"
+            description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
+            runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"
+
+        # P1: Redis Streams backlog too high (alert stream or incident stream).
+        - alert: RedisStreamBacklogHigh
+          expr: awoooi_redis_stream_len > 500
+          for: 10m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
+            description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
+            runbook: "檢查 consumer group lag:XINFO GROUPS {{ $labels.stream }}"
+
+        # P1: disk usage growing too fast (more than 500MB within 1 hour).
+        # Compares used bytes (size - avail) on "/" now against one hour ago.
+        # "offset" must directly follow each selector; PromQL rejects it after
+        # a parenthesised expression.
+        - alert: PostgreSQLDiskGrowthRate
+          expr: |
+            (
+              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+            )
+            - (
+              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+            )
+            > 524288000
+          for: 5m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "188 主機磁碟 1 小時增長超過 500MB"
+            description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
+            runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
+
     - name: awoooi_host_connectivity
       interval: 60s
       rules: