diff --git a/k8s/monitoring/flywheel-alerts.yaml b/k8s/monitoring/flywheel-alerts.yaml
index e7f854e7..787afe18 100644
--- a/k8s/monitoring/flywheel-alerts.yaml
+++ b/k8s/monitoring/flywheel-alerts.yaml
@@ -117,6 +117,68 @@ spec:
             description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
             runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
 
+    - name: awoooi_infrastructure_detailed
+      interval: 60s
+      rules:
+
+        # P1: unhealthy container on Docker host 188 (docker inspect health=unhealthy).
+        # node-exporter + cAdvisor expose container_last_seen / container_tasks_state.
+        # NOTE(review): count() yields no sample (not 0) when nothing matches, so the
+        # first branch of the expression looks unable to fire - confirm the intent.
+        - alert: DockerContainerUnhealthyDetailed
+          expr: |
+            count by (name, instance) (
+              container_tasks_state{state="running", instance=~"192.168.0.188.*"}
+            ) == 0
+            or
+            container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
+          for: 5m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "188 主機容器 {{ $labels.name }} 異常"
+            description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
+            runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"
+
+        # P1: Redis Streams backlog too high (alert stream or incident stream).
+        - alert: RedisStreamBacklogHigh
+          expr: awoooi_redis_stream_len > 500
+          for: 10m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
+            description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
+            runbook: "檢查 consumer group lag:XINFO GROUPS {{ $labels.stream }}"
+
+        # P1: PostgreSQL disk growing too fast (more than 500MB within 1 hour).
+        # offset is attached to each selector: PromQL rejects it after a
+        # parenthesized expression.
+        - alert: PostgreSQLDiskGrowthRate
+          expr: |
+            (
+              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+            )
+            - (
+              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+            )
+            > 524288000
+          for: 5m
+          labels:
+            severity: warning
+            alert_category: infrastructure
+            notification_type: TYPE-3
+          annotations:
+            summary: "188 主機磁碟 1 小時增長超過 500MB"
+            description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
+            runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
+
     - name: awoooi_host_connectivity
       interval: 60s
       rules: