feat(m5): ADR-074 M5 — Docker 188 / Redis Streams / PostgreSQL 磁碟告警
新增 awoooi_infrastructure_detailed 告警群組:
- DockerContainerUnhealthyDetailed: 188 容器無活動 > 2min
- RedisStreamBacklogHigh: Stream 積壓 > 500 筆
- PostgreSQLDiskGrowthRate: 磁碟 1h 增長 > 500MB

2026-04-12 ogt (ADR-074 M5)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -117,6 +117,64 @@ spec:
|
||||
description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
|
||||
runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
|
||||
|
||||
- name: awoooi_infrastructure_detailed
|
||||
interval: 60s
|
||||
rules:
|
||||
|
||||
# P1: a container on the 192.168.0.188 Docker host has stopped reporting.
# Fires when container_last_seen (exposed by the node-exporter + cAdvisor
# stack) is more than 120 s behind the evaluation time, sustained for 5m.
# This rule does NOT read the Docker health-check status
# (`docker inspect` health=unhealthy); it only detects stale containers.
#
# The former first branch,
#   count by (name, instance) (container_tasks_state{state="running", ...}) == 0
# was removed because it could never fire: count() yields the number of
# series in each group, which is at least 1 for every group that exists.
# NOTE(review): if "no running tasks" detection is wanted, the aggregation
# would have to be sum(), not count() - but first confirm that cAdvisor on
# this host actually populates container_tasks_state, otherwise every
# container would match.
- alert: DockerContainerUnhealthyDetailed
  expr: |
    container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
  for: 5m
  labels:
    severity: warning
    alert_category: infrastructure
    notification_type: TYPE-3
  annotations:
    summary: "188 主機容器 {{ $labels.name }} 異常"
    description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
    runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"
|
||||
|
||||
# P1: Redis Streams backlog too high (alert stream or incident stream).
# Fires when awoooi_redis_stream_len stays above 500 entries for 10 minutes.
- alert: RedisStreamBacklogHigh
  expr: awoooi_redis_stream_len > 500
  for: 10m
  labels:
    alert_category: infrastructure
    notification_type: TYPE-3
    severity: warning
  annotations:
    description: 'Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。'
    runbook: '檢查 consumer group lag:XINFO GROUPS <stream-key>'
    summary: 'Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆'
|
||||
|
||||
# P1: disk usage on the 188 host is growing too fast
# (more than 500 MB = 524288000 bytes within one hour).
# Used bytes are computed as size - avail for the "/" mountpoint; the alert
# compares the current value with the value one hour earlier.
#
# The `offset 1h` modifier is attached to each selector individually:
# PromQL only accepts offset directly after an instant/range vector selector
# or a subquery, so the previous `( size - avail ) offset 1h` form was a
# parse error and the rule could not be loaded.
- alert: PostgreSQLDiskGrowthRate
  expr: |
    (
      node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
      - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
    )
    - (
      node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
      - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
    )
    > 524288000
  for: 5m
  labels:
    severity: warning
    alert_category: infrastructure
    notification_type: TYPE-3
  annotations:
    summary: "188 主機磁碟 1 小時增長超過 500MB"
    description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
    runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
|
||||
|
||||
- name: awoooi_host_connectivity
|
||||
interval: 60s
|
||||
rules:
|
||||
|
||||
Reference in New Issue
Block a user