feat(sprint5.1): Data Safety Guardrails 全鏈路整合 (L1-L5)
Layer 0 - K8s RBAC:
- k8s/rbac/api-velero-reader.yaml: awoooi-executor SA Velero backup reader

Layer 1 - DB Migration (已在 188 執行):
- M-002: approval_records 新增 approval_level/votes/required_votes
- M-003: alert_event_type ENUM 新增 8 個值

Layer 2 - IaC:
- ops/config/service-registry.yaml: 全服務 Stateful 分級清單 (BLOCK/CRITICAL_HITL/STANDARD_HITL/AUTO)

Layer 3 - Python Services:
- service_registry.py: 讀取 YAML,提供 is_blocked/requires_multisig/get_required_votes
- velero_client.py: kubectl 查詢 Velero 備份年齡,失敗 fallback 999h
- preflight_service.py: Pre-flight 安全檢查 (Q2/Q4 決策)

Layer 1-M001 - Playbook model:
- playbook.py: 新增 requires_approval_level/stateful_targets/requires_pre_backup

Layer 4 - 業務邏輯:
- alert_operation_log_repository.py: 新增 8 個 event_type (Guardrail/Pre-flight/MultiSig/備份)
- auto_repair_service.py: 注入 Service Registry Guardrail 檢查 (BLOCK → 直接拒絕)
- webhooks.py: ALERT_RECEIVED 溯源記錄 + auto_repair flag Q9 + Langfuse trace_id Q10
- db/models.py: ApprovalRecord 同步 approval_level/votes/required_votes 欄位
- docker-health-monitor.sh: 純感知層改造(移除所有 docker restart 邏輯)

Layer 5 - Telegram 通知:
- telegram_gateway.py: T1-T6 六個新通知方法 (Guardrail/Pre-flight/Backup/MultiSig/ChangeApplied)

參考: ADR-062 Data Safety Guardrails, ADR-063 Service Registry IaC

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
# docker-health-monitor.sh
|
||||
# Plan A: Docker 容器健康監控 + 自動修復
|
||||
# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作)
|
||||
#
|
||||
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
|
||||
# 設定: /etc/awoooi-ops/secrets.env
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 首席架構師裁示: Intent→Action→Result 三段式,禁止靜默修復
|
||||
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
|
||||
# 注意: 禁止在此腳本中執行 docker restart / docker start
|
||||
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -19,48 +21,45 @@ fi
|
||||
: "${AWOOOI_API_URL:=https://awoooi.wooo.work}"
|
||||
: "${TELEGRAM_BOT_TOKEN:=}"
|
||||
: "${TELEGRAM_CHAT_ID:=}"
|
||||
: "${WEBHOOK_HMAC_SECRET:=}"
|
||||
: "${COOLDOWN_SECONDS:=300}"
|
||||
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
|
||||
# 冷卻期:避免同一容器在短時間內重複發送 webhook(去重,非修復冷卻)
|
||||
: "${SEND_COOLDOWN_SECONDS:=300}"
|
||||
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
|
||||
|
||||
mkdir -p "$COOLDOWN_DIR"
|
||||
|
||||
# ─── 排除清單(禁止自動修復)───────────────────────────────────────────────
|
||||
# 判斷方式: echo ":list:" | grep -q ":name:"
|
||||
# 分類一:資料庫 — 禁止 restart
|
||||
EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:"
|
||||
# 分類二:Redis — 禁止 restart
|
||||
EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:"
|
||||
# 分類三:監控棧 exited → docker start(保護 WAL)
|
||||
MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:"
|
||||
# 分類四:監控棧 其他 → 僅告警
|
||||
EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:"
|
||||
# 分類五:關鍵系統 — 永遠禁止(Gitea restart 會殺活躍 SSH)
|
||||
EXCLUDED_CRITICAL_LIST=":gitea:"
|
||||
|
||||
# ─── 工具函數 ────────────────────────────────────────────────────────────────
|
||||
#######################################
# Print one timestamped log line.
# Arguments: $* - message words; joined with single spaces.
# Outputs:   "[YYYY-MM-DD HH:MM:SS +ZZZZ] <message>" on stdout.
#######################################
log() {
  # printf (not echo) so the message is emitted verbatim: echo may interpret
  # backslash sequences when the shell's xpg_echo option is enabled.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
|
||||
|
||||
in_list() {
|
||||
local name=":${1}:"
|
||||
local list="$2"
|
||||
[[ "$list" == *"$name"* ]]
|
||||
#######################################
# Webhook de-duplication check: tells the caller whether a notification for
# this container was already sent within the send cooldown window.
# Globals:   COOLDOWN_DIR (read), SEND_COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   one log line (via log) when the notification is suppressed, or
#            a WARN line when the marker file holds an invalid value.
# Returns:   0 if still inside the cooldown window (caller should skip),
#            1 otherwise (no marker, expired marker, or invalid marker).
#######################################
is_in_send_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_sent="" now elapsed

  [[ -f "$cooldown_file" ]] || return 1

  # Only the first line matters; an unreadable or empty file leaves
  # last_sent empty and is rejected by the validation below.
  read -r last_sent < "$cooldown_file" || true

  # The value is about to be used in arithmetic. Bash evaluates arithmetic
  # operands recursively, so an empty value is a syntax error and a crafted
  # value such as 'x[$(cmd)]' would run cmd. Accept plain digits only.
  if [[ ! "$last_sent" =~ ^[0-9]+$ ]]; then
    log "WARN: ${container} 冷卻檔內容無效,忽略並視為不在冷卻期"
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  elapsed=$(( now - 10#$last_sent ))
  if (( elapsed < SEND_COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 距上次通知 ${elapsed}s,跳過(冷卻期 ${SEND_COOLDOWN_SECONDS}s)"
    return 0
  fi
  return 1
}
|
||||
|
||||
# 計算 HMAC-SHA256 簽章
|
||||
sign_payload() {
|
||||
local payload="$1"
|
||||
printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256
|
||||
#######################################
# Record "a notification for this container was just sent" by storing the
# current epoch seconds in the container's cooldown marker file.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
# Outputs:   a WARN log line (via log) if the marker cannot be written.
# Returns:   always 0; failing to persist de-duplication state must not
#            abort the scan of the remaining containers under `set -e`.
#######################################
set_send_cooldown() {
  local container="$1"
  local marker="${COOLDOWN_DIR}/${container}.cooldown"
  local staging="${marker}.tmp.$$"

  # Write to a staging file and rename it into place, so a reader never
  # sees a truncated (empty) marker if the write is interrupted or fails.
  if { date +%s > "$staging"; } 2>/dev/null \
    && mv -f -- "$staging" "$marker" 2>/dev/null; then
    return 0
  fi

  rm -f -- "$staging" 2>/dev/null || true
  log "WARN: ${container} 無法寫入冷卻檔 ${marker},下次掃描可能重複通知"
  return 0
}
|
||||
|
||||
# 傳送 Telegram(Fallback:AWOOOI API down 時直接呼叫 Bot API)
|
||||
# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
|
||||
send_telegram_direct() {
|
||||
local message="$1"
|
||||
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
|
||||
log "WARN: Telegram 未設定,跳過通知"
|
||||
log "WARN: Telegram 未設定,跳過 Fallback"
|
||||
return 0
|
||||
fi
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
||||
@@ -69,186 +68,113 @@ send_telegram_direct() {
|
||||
> /dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
# 傳送 AWOOOI Webhook(若失敗則 Fallback 至 Telegram Bot API)
|
||||
send_awoooi_alert() {
|
||||
local title="$1"
|
||||
local message="$2"
|
||||
local severity="${3:-WARNING}"
|
||||
local source="docker-health-monitor"
|
||||
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
|
||||
# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC)
|
||||
send_to_awoooi() {
|
||||
local container="$1"
|
||||
local status="$2" # unhealthy | exited | dead
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
local now_ts
|
||||
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
# 組裝 Alertmanager 格式 JSON(符合現有 AlertmanagerPayload schema)
|
||||
local payload
|
||||
payload=$(printf '{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \
|
||||
"$title" "$message" "$severity" "$source")
|
||||
|
||||
local timestamp
|
||||
timestamp=$(date -u +%s)
|
||||
local signature
|
||||
signature=$(sign_payload "${timestamp}${payload}")
|
||||
payload=$(cat <<JSON
|
||||
{
|
||||
"version": "4",
|
||||
"groupKey": "docker-health-${hostname}-${container}",
|
||||
"status": "firing",
|
||||
"alerts": [{
|
||||
"status": "firing",
|
||||
"labels": {
|
||||
"alertname": "DockerContainerUnhealthy",
|
||||
"container": "${container}",
|
||||
"host": "${hostname}",
|
||||
"layer": "docker",
|
||||
"severity": "warning",
|
||||
"auto_repair": "true",
|
||||
"source": "docker-health-monitor"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "容器 ${container} 狀態異常: ${status}",
|
||||
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
|
||||
},
|
||||
"startsAt": "${now_ts}"
|
||||
}]
|
||||
}
|
||||
JSON
|
||||
)
|
||||
|
||||
local http_code
|
||||
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/custom-alert" \
|
||||
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/alertmanager" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Timestamp: ${timestamp}" \
|
||||
-H "X-Signature: sha256=${signature}" \
|
||||
-d "$payload" \
|
||||
--connect-timeout 10 \
|
||||
--max-time 30 2>/dev/null) || http_code="0"
|
||||
|
||||
if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then
|
||||
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
|
||||
log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
|
||||
set_send_cooldown "$container"
|
||||
else
|
||||
log "WARN: AWOOOI API 回應 ${http_code},Fallback 到 Telegram Bot API"
|
||||
send_telegram_direct "[docker-health-monitor Fallback] ${title} ${message}"
|
||||
send_telegram_direct "🚨 [docker-health-monitor Fallback] 主機: ${hostname} 容器: ${container} 狀態: ${status} (API 不可達,請人工處理)"
|
||||
set_send_cooldown "$container"
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Cooldown check: tells the caller whether this container was already
# handled within the last COOLDOWN_SECONDS.
# Globals:   COOLDOWN_DIR (read), COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   one log line (via log) when still cooling down, or a WARN line
#            when the marker file holds an invalid value.
# Returns:   0 if still inside the cooldown window, 1 otherwise
#            (no marker, expired marker, or invalid marker).
#######################################
is_in_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_repair="" now elapsed

  [[ -f "$cooldown_file" ]] || return 1

  # Only the first line matters; an unreadable or empty file leaves
  # last_repair empty and is rejected by the validation below.
  read -r last_repair < "$cooldown_file" || true

  # The value feeds an arithmetic expression. Bash evaluates arithmetic
  # operands recursively, so an empty value is a syntax error and a crafted
  # value such as 'x[$(cmd)]' would run cmd. Accept plain digits only.
  if [[ ! "$last_repair" =~ ^[0-9]+$ ]]; then
    log "WARN: ${container} 冷卻檔內容無效,忽略並視為不在冷卻期"
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  elapsed=$(( now - 10#$last_repair ))
  if (( elapsed < COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)"
    return 0
  fi
  return 1
}
|
||||
|
||||
#######################################
# Start the cooldown window for a container by storing the current epoch
# seconds in its cooldown marker file.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
# Outputs:   a WARN log line (via log) if the marker cannot be written.
# Returns:   always 0; failing to persist cooldown state must not abort
#            the rest of the run under `set -e`.
#######################################
set_cooldown() {
  local container="$1"
  local marker="${COOLDOWN_DIR}/${container}.cooldown"
  local staging="${marker}.tmp.$$"

  # Write to a staging file and rename it into place, so a reader never
  # sees a truncated (empty) marker if the write is interrupted or fails.
  if { date +%s > "$staging"; } 2>/dev/null \
    && mv -f -- "$staging" "$marker" 2>/dev/null; then
    return 0
  fi

  rm -f -- "$staging" 2>/dev/null || true
  log "WARN: ${container} 無法寫入冷卻檔 ${marker}"
  return 0
}
|
||||
|
||||
# ─── 核心:處理不健康容器 ───────────────────────────────────────────────────
|
||||
handle_unhealthy_container() {
|
||||
local container="$1"
|
||||
local status="$2" # unhealthy | exited | dead
|
||||
# ─── 核心:掃描所有容器 ─────────────────────────────────────────────────────
|
||||
check_containers() {
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
log "DETECTED: ${container} 狀態=${status} on ${hostname}"
|
||||
# 取得所有容器(含停止的)
|
||||
while IFS=$'\t' read -r container_id container_name state health; do
|
||||
# 跳過 header 或空行
|
||||
[[ -z "$container_name" ]] && continue
|
||||
|
||||
# ── 排除清單判斷 ─────────────────────────────────────────────────────────
|
||||
|
||||
if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then
|
||||
log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 關鍵服務異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_DB_LIST"; then
|
||||
log "SKIP: ${container} 屬於資料庫排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 資料庫容器異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_REDIS_LIST"; then
|
||||
log "SKIP: ${container} 屬於 Redis 排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] Redis 容器異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then
|
||||
log "SKIP: ${container} 屬於監控棧排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 監控元件異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。請人工處理。" \
|
||||
"WARNING"
|
||||
return
|
||||
fi
|
||||
|
||||
# ── 冷卻期判斷 ────────────────────────────────────────────────────────────
|
||||
if is_in_cooldown "$container"; then
|
||||
log "SKIP: ${container} 在冷卻期內,跳過本次修復"
|
||||
return
|
||||
fi
|
||||
|
||||
# ── 決定修復動作 ─────────────────────────────────────────────────────────
|
||||
local action_cmd="docker restart"
|
||||
local action_desc="docker restart"
|
||||
if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then
|
||||
action_cmd="docker start"
|
||||
action_desc="docker start(保護 WAL,非 restart)"
|
||||
fi
|
||||
|
||||
# ── Phase 1: Intent(決策意圖通知)──────────────────────────────────────
|
||||
log "INTENT: 即將對 ${container} 執行 ${action_desc}"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復 Intent: ${container}" \
|
||||
"偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc},2 秒後開始修復。" \
|
||||
"WARNING"
|
||||
|
||||
sleep 2
|
||||
|
||||
# ── Phase 2: Action(執行修復)──────────────────────────────────────────
|
||||
log "ACTION: 執行 ${action_cmd} ${container}"
|
||||
set_cooldown "$container"
|
||||
|
||||
local repair_ok=false
|
||||
if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then
|
||||
repair_ok=true
|
||||
fi
|
||||
|
||||
# ── Phase 3: Result(執行結果通知)──────────────────────────────────────
|
||||
if $repair_ok; then
|
||||
log "RESULT: ${container} 修復成功"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復成功: ${container}" \
|
||||
"容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}。" \
|
||||
"INFO"
|
||||
else
|
||||
log "RESULT: ${container} 修復失敗!需人工介入"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復失敗: ${container}" \
|
||||
"容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \
|
||||
"CRITICAL"
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── 主流程 ──────────────────────────────────────────────────────────────────
|
||||
main() {
|
||||
log "===== docker-health-monitor 啟動 (host=$(hostname)) ====="
|
||||
|
||||
# 取得所有容器狀態
|
||||
# docker ps -a 格式: Names / Health / State
|
||||
while IFS=$'\t' read -r name health_status container_status; do
|
||||
[[ -z "$name" ]] && continue
|
||||
|
||||
local needs_repair=false
|
||||
local needs_alert=false
|
||||
local detected_status=""
|
||||
|
||||
if [[ "$health_status" == "unhealthy" ]]; then
|
||||
needs_repair=true
|
||||
# 偵測 exited / dead
|
||||
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
|
||||
needs_alert=true
|
||||
detected_status="$state"
|
||||
fi
|
||||
|
||||
# 偵測 unhealthy(health check 存在且失敗)
|
||||
if [[ "$health" == "unhealthy" ]]; then
|
||||
needs_alert=true
|
||||
detected_status="unhealthy"
|
||||
elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then
|
||||
needs_repair=true
|
||||
detected_status="$container_status"
|
||||
elif [[ "$health_status" == "starting" ]]; then
|
||||
log "INFO: ${name} health=starting,等待中跳過"
|
||||
continue
|
||||
fi
|
||||
|
||||
if $needs_repair; then
|
||||
handle_unhealthy_container "$name" "$detected_status"
|
||||
if $needs_alert; then
|
||||
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
|
||||
|
||||
# 冷卻期去重
|
||||
if is_in_send_cooldown "$container_name"; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# 送 Webhook — 只感知,不修復
|
||||
send_to_awoooi "$container_name" "$detected_status"
|
||||
fi
|
||||
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
|
||||
awk -F'\t' '{
|
||||
health = ""
|
||||
if ($4 ~ /\(unhealthy\)/) health = "unhealthy"
|
||||
else if ($4 ~ /\(healthy\)/) health = "healthy"
|
||||
print $1 "\t" $2 "\t" $3 "\t" health
|
||||
}')
|
||||
}
|
||||
|
||||
done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null)
|
||||
|
||||
log "===== docker-health-monitor 完成 ====="
|
||||
# ─── Main ───────────────────────────────────────────────────────────────────
|
||||
#######################################
# Script entry point: logs a start banner, runs one container scan through
# check_containers, then logs a completion banner.
# Arguments: none used.
# Outputs:   two banner lines via log, plus whatever check_containers logs.
#######################################
main() {
  local -r start_banner="=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
  local -r done_banner="=== 掃描完成 ==="

  log "$start_banner"
  check_containers
  log "$done_banner"
}
|
||||
|
||||
main "$@"
|
||||
|
||||
Reference in New Issue
Block a user