# Commit note — 2026-04-12 ogt (ADR-074 M4):
#   - scripts/cron_backup_restore_test.sh: Velero restore dry-run 腳本
#   - k8s/awoooi-prod/16-cronjob-backup-restore-test.yaml: 每週日 02:00 台北執行
#   - k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml: 腳本 ConfigMap
#   - flywheel-alerts.yaml: BackupRestoreTestFailed + BackupRestoreTestStale 告警
#   失敗時寫入 node-exporter textfile → Prometheus 告警 → TYPE-3 Incident
#   Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# Original file listing: 136 lines · 5.3 KiB · YAML
# =============================================================================
# Flywheel health alert rules — ADR-074 M1
# =============================================================================
# Prometheus PrometheusRule CRD — flywheel self-monitoring alerts.
# Data source: /api/v1/stats/flywheel/metrics (awoooi-flywheel scrape job)
#
# Deploy: kubectl apply -f k8s/monitoring/flywheel-alerts.yaml
#
# 2026-04-12 ogt (ADR-074 M1)
# =============================================================================
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flywheel-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # Flywheel pipeline health, evaluated every 5 minutes.
    - name: awoooi_flywheel_health
      interval: 5m
      rules:

        # P0: no playbooks at all → the flywheel learning node is non-functional.
        - alert: FlywheelPlaybookZero
          expr: awoooi_flywheel_playbook_count == 0
          for: 1h
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 Playbook 數量為 0"
            description: "Playbook 數量持續 1 小時為 0,飛輪學習節點完全失效。"
            runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"

        # P0: execution success rate is extremely low (below 10%).
        # NOTE(review): marked P0 but severity is `warning`, unlike the other
        # P0 rules in this file which are `critical` — confirm intended severity.
        - alert: FlywheelExecutionSuccessLow
          expr: awoooi_flywheel_execution_success_rate < 0.1
          for: 2h
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪自動修復成功率低於 10%"
            description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。"
            runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"

        # P0: many KM entries are not vectorized → RAG cannot use historical cases.
        # NOTE(review): marked P0 but severity is `warning` — confirm intended severity.
        - alert: FlywheelKMVectorizationLow
          expr: awoooi_flywheel_km_unvectorized_count > 10
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 KM 未向量化數量 > 10"
            description: "{{ $value }} 筆 KM 條目尚未向量化,RAG 查詢品質下降。"
            runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"

        # P1: abnormal alertname NULL rate (above 5%).
        - alert: FlywheelAlertnameNullHigh
          expr: awoooi_flywheel_alertname_null_rate > 0.05
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 alertname NULL 率超過 5%"
            description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
            runbook: "執行 scripts/backfill_alertname.py 回填"

        # P1: incidents stuck for more than 24 hours.
        # NOTE(review): this is the only rule without a `runbook` annotation.
        - alert: FlywheelIncidentsStuck
          expr: awoooi_flywheel_incidents_stuck > 5
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
            description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"

    # Backup restore dry-run results, evaluated hourly.
    # NOTE(review): the `for:` durations below (5m / 10m) are shorter than the
    # 1h group interval, so a pending alert is only re-checked at the next
    # hourly evaluation — confirm this delay is acceptable.
    - name: awoooi_backup_restore
      interval: 1h
      rules:

        # P0: backup restore dry-run failed.
        - alert: BackupRestoreTestFailed
          expr: awoooi_backup_restore_test_success == 0
          for: 5m
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "備份還原 dry-run 測試失敗"
            description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
            runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"

        # P1: backup restore test has not run for more than 8 days
        # (weekly schedule broken). 691200 s = 8 * 86400 s.
        - alert: BackupRestoreTestStale
          expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "備份還原測試超過 8 天未執行"
            description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
            runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"

    # Host-to-host connectivity probes, evaluated every 60 seconds.
    - name: awoooi_host_connectivity
      interval: 60s
      rules:

        # P0: network partition between hosts.
        - alert: HostNetworkPartition
          expr: probe_success{job="host-connectivity"} == 0
          for: 5m
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "主機 {{ $labels.instance }} 無法連通"
            description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
            runbook: "SSH 檢查路由和防火牆規則"