feat(sprint5.1): Data Safety Guardrails 全鏈路整合 (L1-L5)
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m33s
Type Sync Check / check-type-sync (push) Failing after 58s

Layer 0 - K8s RBAC:
  - k8s/rbac/api-velero-reader.yaml: awoooi-executor SA Velero backup reader

Layer 1 - DB Migration (已在 188 執行):
  - M-002: approval_records 新增 approval_level/votes/required_votes
  - M-003: alert_event_type ENUM 新增 8 個值

Layer 2 - IaC:
  - ops/config/service-registry.yaml: 全服務 Stateful 分級清單 (BLOCK/CRITICAL_HITL/STANDARD_HITL/AUTO)

Layer 3 - Python Services:
  - service_registry.py: 讀取 YAML,提供 is_blocked/requires_multisig/get_required_votes
  - velero_client.py: kubectl 查詢 Velero 備份年齡,失敗 fallback 999h
  - preflight_service.py: Pre-flight 安全檢查 (Q2/Q4 決策)

Layer 1-M001 - Playbook model:
  - playbook.py: 新增 requires_approval_level/stateful_targets/requires_pre_backup

Layer 4 - 業務邏輯:
  - alert_operation_log_repository.py: 新增 8 個 event_type (Guardrail/Pre-flight/MultiSig/備份)
  - auto_repair_service.py: 注入 Service Registry Guardrail 檢查 (BLOCK → 直接拒絕)
  - webhooks.py: ALERT_RECEIVED 溯源記錄 + auto_repair flag Q9 + Langfuse trace_id Q10
  - db/models.py: ApprovalRecord 同步 approval_level/votes/required_votes 欄位
  - docker-health-monitor.sh: 純感知層改造(移除所有 docker restart 邏輯)

Layer 5 - Telegram 通知:
  - telegram_gateway.py: T1-T6 六個新通知方法 (Guardrail/Pre-flight/Backup/MultiSig/ChangeApplied)

參考: ADR-062 Data Safety Guardrails, ADR-063 Service Registry IaC

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-08 16:24:09 +08:00
parent 6f7a4be2c7
commit 88696dba9b
14 changed files with 997 additions and 191 deletions

View File

@@ -1,11 +1,13 @@
#!/usr/bin/env bash
# docker-health-monitor.sh
# Plan A: Docker 容器健康監控 + 自動修復
# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作)
#
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
# 設定: /etc/awoooi-ops/secrets.env
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 首席架構裁示: Intent→Action→Result 三段式,禁止靜默修復
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
# 注意: 禁止在此腳本中執行 docker restart / docker start
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
set -euo pipefail
@@ -19,48 +21,45 @@ fi
# Configuration defaults. Each `: "${VAR:=value}"` line assigns the default
# only when VAR is unset or empty, so values already present in the
# environment are kept.
: "${AWOOOI_API_URL:=https://awoooi.wooo.work}"
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
: "${WEBHOOK_HMAC_SECRET:=}"
: "${COOLDOWN_SECONDS:=300}"
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
# Cooldown: avoid sending the webhook repeatedly for the same container within
# a short time (de-duplication, not a repair cooldown).
: "${SEND_COOLDOWN_SECONDS:=300}"
# Directory holding one "<container>.cooldown" timestamp file per container.
# NOTE(review): the default lives under /tmp at a fixed path — confirm that no
# untrusted local user can pre-create or write files there.
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
# Create the cooldown directory (and any missing parents) if it does not exist.
mkdir -p "$COOLDOWN_DIR"
# ─── Exclusion lists (auto-repair forbidden) ────────────────────────────────
# Each list is a colon-delimited string with a leading and trailing ':' so a
# name can be matched as ":name:".
# Membership test: echo ":list:" | grep -q ":name:"
# Category 1: databases — restart forbidden
EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:"
# Category 2: Redis — restart forbidden
EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:"
# Category 3: monitoring stack, exited → docker start (protects the WAL)
MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:"
# Category 4: monitoring stack, everything else → alert only
EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:"
# Category 5: critical systems — always forbidden (a Gitea restart would kill
# active SSH sessions)
EXCLUDED_CRITICAL_LIST=":gitea:"
# ─── 工具函數 ────────────────────────────────────────────────────────────────
# Write one timestamped log line to stdout.
# Arguments: $@ - message words; they are joined into a single string.
# Outputs:   "[YYYY-MM-DD HH:MM:SS +ZZZZ] <message>" followed by a newline.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
in_list() {
local name=":${1}:"
local list="$2"
[[ "$list" == *"$name"* ]]
# Send-cooldown check: avoids sending the webhook for the same container again
# within a short time.
# Globals:   COOLDOWN_DIR (read), SEND_COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   a COOLDOWN log line when the container is still cooling down,
#            a WARN log line when the cooldown file content is not usable
# Returns:   0 if the last notification is newer than SEND_COOLDOWN_SECONDS,
#            1 otherwise (no file, unusable file, or cooldown elapsed)
is_in_send_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_sent="" now elapsed

  [[ -f "$cooldown_file" ]] || return 1

  # Read only the first line; an empty or unreadable file leaves last_sent
  # empty instead of aborting the script under `set -e`.
  read -r last_sent < "$cooldown_file" || true

  # The value is used in arithmetic below. Anything that is not a plain
  # decimal number would either be a syntax error or be evaluated as an
  # arithmetic expression (which can run commands), so reject it.
  if [[ ! "$last_sent" =~ ^[0-9]{1,18}$ ]]; then
    log "WARN: ${container} cooldown file content is invalid, ignoring it"
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_sent ))
  if (( elapsed < SEND_COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 距上次通知 ${elapsed}s跳過冷卻期 ${SEND_COOLDOWN_SECONDS}s"
    return 0
  fi
  return 1
}
# 計算 HMAC-SHA256 簽章
sign_payload() {
local payload="$1"
printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256
# Record the current epoch time as the last-notification time of a container.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
# Outputs:   overwrites "${COOLDOWN_DIR}/<container>.cooldown"
set_send_cooldown() {
  local container="$1"
  local stamp_file="${COOLDOWN_DIR}/${container}.cooldown"
  date +%s > "$stamp_file"
}
# 傳送 Telegram(Fallback:AWOOOI API down 時直接呼叫 Bot API)
# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
send_telegram_direct() {
local message="$1"
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
log "WARN: Telegram 未設定,跳過通知"
log "WARN: Telegram 未設定,跳過 Fallback"
return 0
fi
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
@@ -69,186 +68,113 @@ send_telegram_direct() {
> /dev/null 2>&1 || true
}
# 傳送 AWOOOI Webhook,若失敗則 Fallback 至 Telegram Bot API
send_awoooi_alert() {
local title="$1"
local message="$2"
local severity="${3:-WARNING}"
local source="docker-health-monitor"
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC)
send_to_awoooi() {
local container="$1"
local status="$2" # unhealthy | exited | dead
local hostname
hostname=$(hostname)
local now_ts
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
# 組裝 Alertmanager 格式 JSON(符合現有 AlertmanagerPayload schema)
local payload
payload=$(printf '{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \
"$title" "$message" "$severity" "$source")
local timestamp
timestamp=$(date -u +%s)
local signature
signature=$(sign_payload "${timestamp}${payload}")
payload=$(cat <<JSON
{
"version": "4",
"groupKey": "docker-health-${hostname}-${container}",
"status": "firing",
"alerts": [{
"status": "firing",
"labels": {
"alertname": "DockerContainerUnhealthy",
"container": "${container}",
"host": "${hostname}",
"layer": "docker",
"severity": "warning",
"auto_repair": "true",
"source": "docker-health-monitor"
},
"annotations": {
"summary": "容器 ${container} 狀態異常: ${status}",
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
},
"startsAt": "${now_ts}"
}]
}
JSON
)
local http_code
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/custom-alert" \
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/alertmanager" \
-H "Content-Type: application/json" \
-H "X-Timestamp: ${timestamp}" \
-H "X-Signature: sha256=${signature}" \
-d "$payload" \
--connect-timeout 10 \
--max-time 30 2>/dev/null) || http_code="0"
if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
set_send_cooldown "$container"
else
log "WARN: AWOOOI API 回應 ${http_code}Fallback 到 Telegram Bot API"
send_telegram_direct "[docker-health-monitor Fallback]&#10;${title}&#10;${message}"
send_telegram_direct "🚨 [docker-health-monitor Fallback]&#10;主機: ${hostname}&#10;容器: ${container}&#10;狀態: ${status}&#10;(API 不可達,請人工處理)"
set_send_cooldown "$container"
fi
}
# Cooldown check: avoids repairing the same container again within a short
# time.
# Globals:   COOLDOWN_DIR (read), COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   a COOLDOWN log line when the container is still cooling down,
#            a WARN log line when the cooldown file content is not usable
# Returns:   0 if the recorded time is newer than COOLDOWN_SECONDS,
#            1 otherwise (no file, unusable file, or cooldown elapsed)
is_in_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_repair="" now elapsed

  [[ -f "$cooldown_file" ]] || return 1

  # Read only the first line; an empty or unreadable file leaves last_repair
  # empty instead of aborting the script under `set -e`.
  read -r last_repair < "$cooldown_file" || true

  # The value is used in arithmetic below. Anything that is not a plain
  # decimal number would either be a syntax error or be evaluated as an
  # arithmetic expression (which can run commands), so reject it.
  if [[ ! "$last_repair" =~ ^[0-9]{1,18}$ ]]; then
    log "WARN: ${container} cooldown file content is invalid, ignoring it"
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_repair ))
  if (( elapsed < COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)"
    return 0
  fi
  return 1
}
# Record the current epoch time in a container's cooldown file.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
# Outputs:   overwrites "${COOLDOWN_DIR}/<container>.cooldown"
set_cooldown() {
  local container="$1"
  local stamp_file="${COOLDOWN_DIR}/${container}.cooldown"
  date +%s > "$stamp_file"
}
# ─── 核心:處理不健康容器 ───────────────────────────────────────────────────
handle_unhealthy_container() {
local container="$1"
local status="$2" # unhealthy | exited | dead
# ─── 核心:掃描所有容器 ─────────────────────────────────────────────────────
check_containers() {
local hostname
hostname=$(hostname)
log "DETECTED: ${container} 狀態=${status} on ${hostname}"
# 取得所有容器(含停止的)
while IFS=$'\t' read -r container_id container_name state health; do
# 跳過 header 或空行
[[ -z "$container_name" ]] && continue
# ── 排除清單判斷 ─────────────────────────────────────────────────────────
if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then
log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警"
send_awoooi_alert \
"[${hostname}] 關鍵服務異常: ${container}" \
"容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_DB_LIST"; then
log "SKIP: ${container} 屬於資料庫排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] 資料庫容器異常: ${container}" \
"容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_REDIS_LIST"; then
log "SKIP: ${container} 屬於 Redis 排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] Redis 容器異常: ${container}" \
"容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then
log "SKIP: ${container} 屬於監控棧排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] 監控元件異常: ${container}" \
"容器 ${container} 狀態=${status}。請人工處理。" \
"WARNING"
return
fi
# ── 冷卻期判斷 ────────────────────────────────────────────────────────────
if is_in_cooldown "$container"; then
log "SKIP: ${container} 在冷卻期內,跳過本次修復"
return
fi
# ── 決定修復動作 ─────────────────────────────────────────────────────────
local action_cmd="docker restart"
local action_desc="docker restart"
if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then
action_cmd="docker start"
action_desc="docker start保護 WAL非 restart"
fi
# ── Phase 1: Intent決策意圖通知──────────────────────────────────────
log "INTENT: 即將對 ${container} 執行 ${action_desc}"
send_awoooi_alert \
"[${hostname}] 自動修復 Intent: ${container}" \
"偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc}2 秒後開始修復。" \
"WARNING"
sleep 2
# ── Phase 2: Action執行修復──────────────────────────────────────────
log "ACTION: 執行 ${action_cmd} ${container}"
set_cooldown "$container"
local repair_ok=false
if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then
repair_ok=true
fi
# ── Phase 3: Result執行結果通知──────────────────────────────────────
if $repair_ok; then
log "RESULT: ${container} 修復成功"
send_awoooi_alert \
"[${hostname}] 自動修復成功: ${container}" \
"容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}" \
"INFO"
else
log "RESULT: ${container} 修復失敗!需人工介入"
send_awoooi_alert \
"[${hostname}] 自動修復失敗: ${container}" \
"容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \
"CRITICAL"
fi
}
# ─── 主流程 ──────────────────────────────────────────────────────────────────
main() {
log "===== docker-health-monitor 啟動 (host=$(hostname)) ====="
# 取得所有容器狀態
# docker ps -a 格式: Names / Health / State
while IFS=$'\t' read -r name health_status container_status; do
[[ -z "$name" ]] && continue
local needs_repair=false
local needs_alert=false
local detected_status=""
if [[ "$health_status" == "unhealthy" ]]; then
needs_repair=true
# 偵測 exited / dead
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
needs_alert=true
detected_status="$state"
fi
# 偵測 unhealthy(health check 存在且失敗)
if [[ "$health" == "unhealthy" ]]; then
needs_alert=true
detected_status="unhealthy"
elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then
needs_repair=true
detected_status="$container_status"
elif [[ "$health_status" == "starting" ]]; then
log "INFO: ${name} health=starting等待中跳過"
continue
fi
if $needs_repair; then
handle_unhealthy_container "$name" "$detected_status"
if $needs_alert; then
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
# 冷卻期去重
if is_in_send_cooldown "$container_name"; then
continue
fi
# 送 Webhook — 只感知,不修復
send_to_awoooi "$container_name" "$detected_status"
fi
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
awk -F'\t' '{
health = ""
if ($4 ~ /\(unhealthy\)/) health = "unhealthy"
else if ($4 ~ /\(healthy\)/) health = "healthy"
print $1 "\t" $2 "\t" $3 "\t" health
}')
}
done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null)
log "===== docker-health-monitor 完成 ====="
# ─── Main ───────────────────────────────────────────────────────────────────
# Entry point: logs a start banner, runs one container scan, logs a completion
# banner. Positional arguments are accepted but not used.
main() {
  local -r start_banner="=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
  local -r done_banner="=== 掃描完成 ==="

  log "$start_banner"
  check_containers
  log "$done_banner"
}
main "$@"