feat(m5): ADR-074 M5 — Docker 188 / Redis Streams / PostgreSQL 磁碟告警

新增 awoooi_infrastructure_detailed 告警群組:
- DockerContainerUnhealthyDetailed: 188 容器無活動 > 2min
- RedisStreamBacklogHigh: Stream 積壓 > 500 筆
- PostgreSQLDiskGrowthRate: 磁碟 1h 增長 > 500MB

2026-04-12 ogt (ADR-074 M5)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 15:36:54 +08:00
parent c1c96ab47b
commit ec6a341f3e

View File

@@ -117,6 +117,64 @@ spec:
description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
- name: awoooi_infrastructure_detailed
  # Detailed infrastructure alerts (ADR-074 M5) for host 192.168.0.188:
  # Docker container liveness, Redis Streams backlog, and root-filesystem
  # growth. Every rule in this group is evaluated once per minute.
  interval: 60s
  rules:
    # P1: a Docker container on host 188 has not been seen for over 2 minutes.
    #
    # Relies on the container_last_seen series (cAdvisor-style metric).
    # An earlier draft also OR-ed in
    #   count by (name, instance) (container_tasks_state{state="running"}) == 0
    # but an aggregation over an empty vector yields an empty vector, never
    # 0, so that branch could not fire; it was removed rather than kept as
    # dead code. Detecting "exists but not running" needs a different signal.
    #
    # The instance matcher escapes the dots and allows only an optional
    # ":port" suffix, so look-alike targets such as 192.168.0.1880 are
    # excluded (Prometheus regex matchers are fully anchored).
    #
    # NOTE(review): the condition must hold for `for: 5m` on top of the
    # 120 s threshold, so this only fires if the exporter keeps publishing
    # container_last_seen for a vanished container that long - confirm
    # against the exporter's behaviour in this environment.
    - alert: DockerContainerUnhealthyDetailed
      expr: |
        container_last_seen{instance=~"192[.]168[.]0[.]188(:[0-9]+)?", name!=""}
          < (time() - 120)
      for: 5m
      labels:
        severity: warning
        alert_category: infrastructure
        notification_type: TYPE-3
      annotations:
        summary: "188 主機容器 {{ $labels.name }} 異常"
        description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動。"
        runbook: "SSH 到 192.168.0.188,docker inspect {{ $labels.name }} 確認健康狀態"

    # P1: Redis Streams backlog too high (alert stream or incident stream).
    #
    # Fires when a stream has held more than 500 entries for 10 minutes.
    # NOTE(review): awoooi_redis_stream_len is presumably a per-stream gauge
    # carrying a `stream` label (the summary template reads it) - confirm
    # against the exporter that publishes it.
    - alert: RedisStreamBacklogHigh
      expr: awoooi_redis_stream_len > 500
      for: 10m
      labels:
        severity: warning
        alert_category: infrastructure
        notification_type: TYPE-3
      annotations:
        summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
        description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
        runbook: "檢查 consumer group lag:XINFO GROUPS <stream-key>"

    # P1: disk usage growing too fast (more than 500 MiB within 1 hour).
    #
    # Compares used bytes (size - avail) on the root filesystem of host 188
    # now against the same quantity one hour ago. The `offset 1h` modifier
    # is attached to each selector because PromQL only accepts `offset`
    # directly after a selector or subquery; applying it to a parenthesised
    # expression is a parse error and the rule would fail to load.
    #
    # The metric covers the whole "/" mount, not PostgreSQL alone; the
    # annotations merely name PostgreSQL WAL / logs as likely causes.
    - alert: PostgreSQLDiskGrowthRate
      expr: |
        (
          node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
          - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
        )
        - (
          node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
          - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
        )
        > 500 * 1024 * 1024
      for: 5m
      labels:
        severity: warning
        alert_category: infrastructure
        notification_type: TYPE-3
      annotations:
        summary: "188 主機磁碟 1 小時增長超過 500MB"
        description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
        runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
- name: awoooi_host_connectivity
interval: 60s
rules: