fix(monitoring): keep host alert ssh diagnostics canonical

2026-05-05 23:57:53 +08:00
parent 85d5b5c823
commit 2f50c67f5c
2 changed files with 22 additions and 2 deletions
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -39,6 +39,14 @@
 - `.gitea/workflows/cd.yaml` 的 `Acquire Docker Build Lock` 新增 `EMPTY_LOCK_SECONDS=300`。
 - lock 超過 5 分鐘且 host 上沒有 active docker build/push 時，自動移除空鎖後重新嘗試取得 lock；真正超過 2 小時的 stale lock 仍保留原有強制清理邏輯。

+## 2026-05-05 | Prometheus canonical alert source 補齊 SSH 診斷標籤
+
+**背景**：`scripts/ops/deploy-alerts.sh` 實際部署 `ops/monitoring/alerts-unified.yml`，但 repo 內 `alerts.yml` 比 canonical source 多了 HostHighCpuLoad、HostOutOfMemory、HostOutOfDiskSpace、HostDiskUsageHigh 的 SSH 診斷 annotation / bare-metal routing label。
+
+**本次修補**：
+- 將 canonical `ops/monitoring/alerts-unified.yml` 補齊 SSH diagnosis action、host_resource category、`mcp_provider=ssh_host` 與 guarded disk-prune route，避免下次 deploy-alerts 覆蓋掉 live baseline。
+- 維持原則：host/Docker 高負載先只讀診斷；stateful DB/ClickHouse/Harbor/Sentry 不允許通用 restart。
+
 ## 2026-05-05 | 重開機後排程與 startup baseline 修復

 **背景**：四台主機非預期重開機後，統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復，不能只看容器 `healthy`。
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -49,6 +49,9 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} CPU 高負載"
          description: "CPU 使用率超過 90% 持續 10 分鐘；若 load5/core 未超過 1.5，先視為容量觀察與診斷，不直接修復。"
+          # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnosis instead of kubectl
+          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷；禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
+          runbook: "host CPU 高負載排查：先 SSH ps aux 看 top 進程；若為第三方服務（Sentry/ClickHouse 等）寫 ADR 升級資源或調 limit，禁止 kubectl restart 跨 domain"

      - alert: HostLoadAverageSustainedHigh
        # 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
@@ -83,6 +86,9 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} 記憶體不足"
          description: "記憶體使用率超過 85%"
+          # 2026-05-02 ogt + Claude Sonnet 4.6: steer the LLM toward SSH diagnosis
+          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷；禁 kubectl restart — 主因常為第三方服務)"
+          runbook: "host 記憶體不足排查：SSH 看 top 進程；若為第三方服務需擴容或調 limit"

      - alert: HostOutOfDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
@@ -1097,13 +1103,19 @@ groups:
        labels:
          severity: warning
          layer: systemd-188
-          alert_category: infrastructure
+          alert_category: host_resource
          notification_type: TYPE-3
-          auto_repair: "false"
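+          # 2026-05-02 ogt + Claude Sonnet 4.6: ADR-068 flywheel — disk-full SOP
+          # auto_repair changed from "false" to "true"; routed to the ssh_host MCP Group B tool `ssh_docker_prune`
+          # The tool has a built-in >=75% disk-usage guard and is a no-op below that threshold, to avoid accidental deletion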
+          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
          supersedes: PostgreSQLDiskGrowthRate
        annotations:
          summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
          description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
+          auto_repair_action: "ssh {{ $labels.instance }} docker prune (image+volume+builder; gated by 75% disk usage)"
          runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"

      - alert: HostDiskUsageCritical