fix(recovery): add controlled harbor repair mode
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-30 19:47:24 +08:00
parent d735dea704
commit 90647f294d
3 changed files with 160 additions and 3 deletions

View File

@@ -962,6 +962,8 @@ jobs:
if [ "${attempt}" -ge "${LOGIN_ATTEMPTS}" ]; then
echo "BLOCKER harbor_registry_public_route_unavailable registry_v2_status=${registry_status}"
echo "NEXT_ACTION run_on_110_local_console_or_restored_ssh: sudo /usr/local/bin/harbor-watchdog.sh --check"
echo "NEXT_ACTION if_check_confirms_unhealthy_on_110: sudo /usr/local/bin/harbor-watchdog.sh --repair-once"
exit 1
fi

View File

@@ -80,11 +80,33 @@ def test_harbor_login_has_public_route_retry_and_safe_secret_transport() -> None
assert "docker login" in block
assert "--password-stdin" in block
assert "BLOCKER harbor_registry_public_route_unavailable" in block
assert "sudo /usr/local/bin/harbor-watchdog.sh --check" in block
assert "sudo /usr/local/bin/harbor-watchdog.sh --repair-once" in block
assert "sleep \"${LOGIN_SLEEP_SECONDS}\"" in block
assert "${HARBOR_PASSWORD}" in block
assert "--password " not in block
def test_harbor_watchdog_exposes_controlled_check_and_one_shot_repair() -> None:
    """Check that harbor-watchdog.sh contains its controlled-mode markers.

    Reads scripts/reboot-recovery/harbor-watchdog.sh below ROOT and verifies,
    in order, that each expected substring is present (or, for the misspelled
    override variable, absent) in the script text.
    """
    script_path = ROOT / "scripts/reboot-recovery/harbor-watchdog.sh"
    text = script_path.read_text(encoding="utf-8")

    # (substring, must_be_present) pairs, evaluated in this order.
    expectations = (
        ("--check", True),
        ("--repair-once", True),
        ("--apply-once", True),
        ("check_only=true", True),
        ("docker_compose_action_performed=false", True),
        ("container_remove_performed=false", True),
        # Guards against the override variable being misspelled with two O's.
        ("AWOOI_ALLOW_NON_110_HARBOR_REPAIR", False),
        ("AWOOOI_ALLOW_NON_110_HARBOR_REPAIR", True),
        ("EXPECTED_HOST_IP", True),
        ("192.168.0.110", True),
        ("REFUSE harbor repair", True),
        ("require_expected_host_for_apply || return 1", True),
        ("while true", True),
    )
    for marker, must_be_present in expectations:
        assert (marker in text) is must_be_present
def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None:
text = _workflow_text()
assert "onboarding warning-step workflow is" in text

View File

@@ -18,22 +18,121 @@ HARBOR_DIR="/home/wooo/harbor/harbor"
# Log file that log() appends to when it is writable.
LOG="/var/log/harbor-watchdog.log"
# Lock file that repair_harbor opens on fd 9 and flock()s so that it does not
# run concurrently with another repair.
LOCKFILE="/var/lock/harbor-repair.lock"
CHECK_INTERVAL=60 # seconds
# Run mode taken from the first CLI argument; defaults to --watch when absent.
MODE="${1:---watch}"
# IPv4 address the host must own before a repair is allowed; can be overridden
# through the AWOOOI_HARBOR_EXPECTED_HOST_IP environment variable.
EXPECTED_HOST_IP="${AWOOOI_HARBOR_EXPECTED_HOST_IP:-192.168.0.110}"
# Note: `set -e` is deliberately NOT used. The watchdog is a long-running loop,
# and -e would terminate the script as soon as any command fails.
# The exit codes of all critical commands must be captured explicitly (I4).
set -uo pipefail
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*" | tee -a "$LOG"; }
#######################################
# Print a timestamped "[watchdog]" message on stdout and, when possible,
# append the same line to the log file.
# Globals:   LOG (read) - path of the log file
# Arguments: message words; joined with spaces
# Outputs:   the formatted line on stdout; also appended to $LOG when the
#            file is writable, or when it does not exist yet and its
#            directory is writable
#######################################
log() {
  local entry can_append=false
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*"

  if [ -e "$LOG" ]; then
    # Existing log file: only append when we may write to it.
    if [ -w "$LOG" ]; then
      can_append=true
    fi
  elif [ -w "$(dirname "$LOG")" ]; then
    # No log file yet: tee can create it if the directory is writable.
    can_append=true
  fi

  if [ "$can_append" = true ]; then
    printf '%s\n' "$entry" | tee -a "$LOG"
  else
    printf '%s\n' "$entry"
  fi
}
harbor_is_healthy() {
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   usage, mode list and safety note on stdout
#######################################
usage() {
  printf '%s\n' \
    'Usage: harbor-watchdog.sh [--watch|--check|--repair-once|--apply-once]' \
    'Modes:' \
    '--watch Default systemd watchdog loop.' \
    '--check Read-only Harbor readiness check; performs no Docker action.' \
    '--repair-once Run one bounded Harbor repair cycle on 192.168.0.110.' \
    '--apply-once Alias of --repair-once.' \
    'Safety:' \
    'Repair modes fail closed unless the local host has 192.168.0.110, or' \
    'AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1 is explicitly set.'
}
#######################################
# Probe the local Harbor registry endpoint and print the HTTP status code.
# Arguments: none
# Outputs:   exactly one three-digit status on stdout; "000" when curl
#            fails or reports anything that is not three digits
# Returns:   0
#######################################
probe_harbor_code() {
  local code
  # Note: Harbor's /v2/ normally answers 401 (authentication required), so
  # curl -f must not be used: -f would treat 401 as a failure (exit 22).
  # 127.0.0.1 is used to avoid IPv6 resolution problems.
  #
  # The fallback lives OUTSIDE the command substitution. On a failed
  # transfer curl has already written "000" through -w, so an inner
  # `|| echo "000"` would append a second copy and yield "000000".
  code="$(curl -s --max-time 5 -o /dev/null -w '%{http_code}' \
    http://127.0.0.1:5000/v2/ 2>/dev/null)" || code="000"

  # Normalise empty or malformed output so callers always see three digits.
  case "$code" in
    [0-9][0-9][0-9]) ;;
    *) code="000" ;;
  esac

  printf '%s\n' "$code"
}
#######################################
# Report whether the local Harbor registry endpoint looks ready.
# Arguments: none
# Returns:   0 when probe_harbor_code prints 401 or 200, otherwise 1
#######################################
harbor_is_healthy() {
  case "$(probe_harbor_code)" in
    401 | 200) return 0 ;;
    *) return 1 ;;
  esac
}
#######################################
# Check whether this host owns the expected IPv4 address.
# Globals:   EXPECTED_HOST_IP (read)
# Arguments: none
# Returns:   0 when `ip -o -4 addr show` lists " <EXPECTED_HOST_IP>/",
#            1 otherwise (also when the value is empty, `ip` is missing,
#            or `ip` fails)
#######################################
host_has_expected_ip() {
  local addrs

  # Fail closed on an empty expectation instead of matching " /".
  [ -n "${EXPECTED_HOST_IP:-}" ] || return 1
  command -v ip >/dev/null 2>&1 || return 1

  # Capture the listing first. Piping straight into `grep -q` lets grep
  # exit on the first hit; with `set -o pipefail` the writer's SIGPIPE then
  # turns a real match into a failure on hosts with many addresses.
  addrs="$(ip -o -4 addr show 2>/dev/null)" || return 1

  # A quoted expansion in a case pattern is matched literally, so dots or
  # other regex metacharacters in EXPECTED_HOST_IP cannot widen the match.
  case "$addrs" in
    *" ${EXPECTED_HOST_IP}/"*) return 0 ;;
    *) return 1 ;;
  esac
}
#######################################
# Gate for repair modes: allow the repair only on the expected host unless
# the operator has explicitly overridden the guard.
# Globals:   AWOOOI_ALLOW_NON_110_HARBOR_REPAIR (read), EXPECTED_HOST_IP (read)
# Arguments: none
# Outputs:   a log line when the override is used or the repair is refused
# Returns:   0 when the repair may proceed, 1 when it is refused
#######################################
require_expected_host_for_apply() {
  local override="${AWOOOI_ALLOW_NON_110_HARBOR_REPAIR:-0}"

  if [ "$override" != "1" ]; then
    # No override: the host itself must carry the expected address.
    if ! host_has_expected_ip; then
      log "REFUSE harbor repair: expected_host_ip=${EXPECTED_HOST_IP} not present; run on 110 local console/root shell"
      return 1
    fi
    return 0
  fi

  # Explicit override: record that the host guard was skipped.
  log "AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1略過 expected host guard"
  return 0
}
#######################################
# Print a key=value readiness report for Harbor.
# Globals:   EXPECTED_HOST_IP, HARBOR_DIR, LOCKFILE (all read)
# Arguments: none
# Outputs:   the AWOOOI_HARBOR_WATCHDOG_CHECK report on stdout
#######################################
check_harbor() {
  local status is_ready=false
  local has_docker=false svc_state="unknown"
  local has_dir=false has_compose=false has_lock=false on_expected_host=false

  # HTTP status of the local registry endpoint; 401 and 200 count as ready.
  status="$(probe_harbor_code)"
  case "$status" in
    401 | 200) is_ready=true ;;
  esac

  if command -v docker >/dev/null 2>&1; then
    has_docker=true
  fi

  # Service state is only queried when systemctl exists; else "unknown".
  if command -v systemctl >/dev/null 2>&1; then
    svc_state="$(systemctl is-active docker 2>/dev/null || true)"
  fi

  if [ -d "$HARBOR_DIR" ]; then
    has_dir=true
  fi
  if [ -f "$HARBOR_DIR/docker-compose.yml" ]; then
    has_compose=true
  fi
  if [ -e "$LOCKFILE" ]; then
    has_lock=true
  fi
  if host_has_expected_ip; then
    on_expected_host=true
  fi

  printf '%s\n' \
    "AWOOOI_HARBOR_WATCHDOG_CHECK" \
    "mode=check" \
    "target=127.0.0.1:5000/v2/" \
    "expected_host_ip=${EXPECTED_HOST_IP}" \
    "expected_host_ip_present=${on_expected_host}" \
    "harbor_dir=${HARBOR_DIR}" \
    "harbor_dir_exists=${has_dir}" \
    "harbor_compose_exists=${has_compose}" \
    "docker_cli_available=${has_docker}" \
    "docker_service_state=${svc_state}" \
    "lockfile=${LOCKFILE}" \
    "lockfile_exists=${has_lock}" \
    "harbor_local_v2_http_status=${status}" \
    "harbor_ready=${is_ready}" \
    "check_only=true" \
    "docker_compose_action_performed=false" \
    "container_remove_performed=false" \
    "service_restart_performed=false" \
    "host_reboot_performed=false"
}
repair_harbor() {
require_expected_host_for_apply || return 1
# I1: lockfile 防止與 startup-110.sh 並行修復
exec 9>"$LOCKFILE"
if ! flock -n 9; then
@@ -81,7 +180,10 @@ repair_harbor() {
# Phase 4: 啟動全組件I2: 捕捉 stderr
log "Phase 4: 啟動 Harbor 全組件..."
docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done || true
if ! docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done; then
log "❌ Harbor 全組件啟動指令失敗"
return 1
fi
# S1: 等 30s 讓 harbor-core 完成初始化(原 10s 不夠)
log "等待 30s 讓 harbor-core 完成初始化..."
@@ -89,12 +191,43 @@ repair_harbor() {
if harbor_is_healthy; then
log "✅ Harbor 修復成功"
return 0
else
log "❌ Harbor 修復後仍不健康,需人工介入"
log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 60 && docker compose up -d"
return 1
fi
}
# Mode dispatch. Every arm except --watch exits here; --watch falls through
# to the code that follows this case statement.
case "$MODE" in
  --check)
    # Report only; always exits 0 once the report has been printed.
    check_harbor
    exit 0
    ;;
  --repair-once | --apply-once)
    log "=== Harbor Watchdog 單次修復開始 ==="
    if harbor_is_healthy; then
      log "Harbor 已健康,跳過修復"
      check_harbor
      exit 0
    fi
    # Keep the repair status so the follow-up report cannot overwrite it.
    rc=0
    repair_harbor || rc=$?
    check_harbor
    exit "$rc"
    ;;
  --watch) ;;
  -h | --help)
    usage
    exit 0
    ;;
  *)
    # Unknown mode: name the offending argument and send all diagnostics to
    # stderr so stdout stays clean for callers that capture it.
    printf 'harbor-watchdog: unknown mode: %s\n' "$MODE" >&2
    usage >&2
    exit 2
    ;;
esac
log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ==="
# I3: while loop 本體用 || true 保護,子 shell 異常不終止整個 watchdog