From 90647f294d2689007a8f4258b6afd8defd6d4b4c Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 19:47:24 +0800 Subject: [PATCH] fix(recovery): add controlled harbor repair mode --- .gitea/workflows/cd.yaml | 2 + .../test_cd_controlled_runtime_profile.py | 22 +++ scripts/reboot-recovery/harbor-watchdog.sh | 139 +++++++++++++++++- 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 400c152ab..65267d417 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -962,6 +962,8 @@ jobs: if [ "${attempt}" -ge "${LOGIN_ATTEMPTS}" ]; then echo "BLOCKER harbor_registry_public_route_unavailable registry_v2_status=${registry_status}" + echo "NEXT_ACTION run_on_110_local_console_or_restored_ssh: sudo /usr/local/bin/harbor-watchdog.sh --check" + echo "NEXT_ACTION if_check_confirms_unhealthy_on_110: sudo /usr/local/bin/harbor-watchdog.sh --repair-once" exit 1 fi diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index 082ba834d..684ae81e9 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -80,11 +80,33 @@ def test_harbor_login_has_public_route_retry_and_safe_secret_transport() -> None assert "docker login" in block assert "--password-stdin" in block assert "BLOCKER harbor_registry_public_route_unavailable" in block + assert "sudo /usr/local/bin/harbor-watchdog.sh --check" in block + assert "sudo /usr/local/bin/harbor-watchdog.sh --repair-once" in block assert "sleep \"${LOGIN_SLEEP_SECONDS}\"" in block assert "${HARBOR_PASSWORD}" in block assert "--password " not in block +def test_harbor_watchdog_exposes_controlled_check_and_one_shot_repair() -> None: + text = (ROOT / "scripts/reboot-recovery/harbor-watchdog.sh").read_text( + encoding="utf-8" + ) + + assert "--check" in text + assert "--repair-once" in text + assert "--apply-once" in text + assert "check_only=true" in text + assert "docker_compose_action_performed=false" in text + assert "container_remove_performed=false" in text + assert "AWOOI_ALLOW_NON_110_HARBOR_REPAIR" not in text + assert "AWOOOI_ALLOW_NON_110_HARBOR_REPAIR" in text + assert "EXPECTED_HOST_IP" in text + assert "192.168.0.110" in text + assert "REFUSE harbor repair" in text + assert "require_expected_host_for_apply || return 1" in text + assert "while true" in text + + def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None: text = _workflow_text() assert "onboarding warning-step workflow is" in text diff --git a/scripts/reboot-recovery/harbor-watchdog.sh b/scripts/reboot-recovery/harbor-watchdog.sh index 32565fdbb..a5c57e9c4 100644 --- a/scripts/reboot-recovery/harbor-watchdog.sh +++ b/scripts/reboot-recovery/harbor-watchdog.sh @@ -18,22 +18,121 @@ HARBOR_DIR="/home/wooo/harbor/harbor" LOG="/var/log/harbor-watchdog.log" LOCKFILE="/var/lock/harbor-repair.lock" CHECK_INTERVAL=60 # 秒 +MODE="${1:---watch}" +EXPECTED_HOST_IP="${AWOOOI_HARBOR_EXPECTED_HOST_IP:-192.168.0.110}" # 注意:不使用 set -e,watchdog 是長駐 loop,-e 會在任何指令失敗時終止腳本 # 所有關鍵指令的退出碼需明確捕捉(I4) set -uo pipefail -log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*" | tee -a "$LOG"; } +log() { + local line + line="[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*" + if { [ -e "$LOG" ] && [ -w "$LOG" ]; } || { [ ! -e "$LOG" ] && [ -w "$(dirname "$LOG")" ]; }; then + echo "$line" | tee -a "$LOG" + else + echo "$line" + fi +} -harbor_is_healthy() { +usage() { + cat <<'EOF' +Usage: harbor-watchdog.sh [--watch|--check|--repair-once|--apply-once] + +Modes: + --watch Default systemd watchdog loop. + --check Read-only Harbor readiness check; performs no Docker action. + --repair-once Run one bounded Harbor repair cycle on 192.168.0.110. + --apply-once Alias of --repair-once. + +Safety: + Repair modes fail closed unless the local host has 192.168.0.110, or + AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1 is explicitly set. +EOF +} + +probe_harbor_code() { local code # 注意:Harbor /v2/ 正常回傳 401(需認證),不能用 -f(-f 把 401 視為失敗 exit 22) # 使用 127.0.0.1 避免 IPv6 解析問題 code=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://127.0.0.1:5000/v2/ 2>/dev/null || echo "000") + if [ -z "$code" ]; then + code="000" + fi + echo "$code" +} + +harbor_is_healthy() { + local code + code="$(probe_harbor_code)" [ "$code" = "401" ] || [ "$code" = "200" ] } +host_has_expected_ip() { + command -v ip >/dev/null 2>&1 || return 1 + ip -o -4 addr show 2>/dev/null | grep -q " ${EXPECTED_HOST_IP}/" +} + +require_expected_host_for_apply() { + if [ "${AWOOOI_ALLOW_NON_110_HARBOR_REPAIR:-0}" = "1" ]; then + log "AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1,略過 expected host guard" + return 0 + fi + if host_has_expected_ip; then + return 0 + fi + log "REFUSE harbor repair: expected_host_ip=${EXPECTED_HOST_IP} not present; run on 110 local console/root shell" + return 1 +} + +check_harbor() { + local code ready docker_cli docker_active dir_exists compose_exists lock_exists host_match + code="$(probe_harbor_code)" + ready=false + if [ "$code" = "401" ] || [ "$code" = "200" ]; then + ready=true + fi + docker_cli=false + command -v docker >/dev/null 2>&1 && docker_cli=true + docker_active="unknown" + if command -v systemctl >/dev/null 2>&1; then + docker_active="$(systemctl is-active docker 2>/dev/null || true)" + fi + dir_exists=false + [ -d "$HARBOR_DIR" ] && dir_exists=true + compose_exists=false + [ -f "$HARBOR_DIR/docker-compose.yml" ] && compose_exists=true + lock_exists=false + [ -e "$LOCKFILE" ] && lock_exists=true + host_match=false + host_has_expected_ip && host_match=true + + cat <"$LOCKFILE" if ! flock -n 9; then @@ -81,7 +180,10 @@ repair_harbor() { # Phase 4: 啟動全組件(I2: 捕捉 stderr) log "Phase 4: 啟動 Harbor 全組件..." - docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done || true + if ! docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done; then + log "❌ Harbor 全組件啟動指令失敗" + return 1 + fi # S1: 等 30s 讓 harbor-core 完成初始化(原 10s 不夠) log "等待 30s 讓 harbor-core 完成初始化..." @@ -89,12 +191,43 @@ repair_harbor() { if harbor_is_healthy; then log "✅ Harbor 修復成功" + return 0 else log "❌ Harbor 修復後仍不健康,需人工介入" log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 60 && docker compose up -d" + return 1 fi } +case "$MODE" in + --check) + check_harbor + exit 0 + ;; + --repair-once|--apply-once) + log "=== Harbor Watchdog 單次修復開始 ===" + if harbor_is_healthy; then + log "Harbor 已健康,跳過修復" + check_harbor + exit 0 + fi + repair_harbor + rc=$? + check_harbor + exit "$rc" + ;; + --watch) + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + exit 2 + ;; +esac + log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ===" # I3: while loop 本體用 || true 保護,子 shell 異常不終止整個 watchdog