fix(recovery): add controlled harbor repair mode
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
This commit is contained in:
@@ -962,6 +962,8 @@ jobs:
|
||||
|
||||
if [ "${attempt}" -ge "${LOGIN_ATTEMPTS}" ]; then
|
||||
echo "BLOCKER harbor_registry_public_route_unavailable registry_v2_status=${registry_status}"
|
||||
echo "NEXT_ACTION run_on_110_local_console_or_restored_ssh: sudo /usr/local/bin/harbor-watchdog.sh --check"
|
||||
echo "NEXT_ACTION if_check_confirms_unhealthy_on_110: sudo /usr/local/bin/harbor-watchdog.sh --repair-once"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
@@ -80,11 +80,33 @@ def test_harbor_login_has_public_route_retry_and_safe_secret_transport() -> None
|
||||
assert "docker login" in block
|
||||
assert "--password-stdin" in block
|
||||
assert "BLOCKER harbor_registry_public_route_unavailable" in block
|
||||
assert "sudo /usr/local/bin/harbor-watchdog.sh --check" in block
|
||||
assert "sudo /usr/local/bin/harbor-watchdog.sh --repair-once" in block
|
||||
assert "sleep \"${LOGIN_SLEEP_SECONDS}\"" in block
|
||||
assert "${HARBOR_PASSWORD}" in block
|
||||
assert "--password " not in block
|
||||
|
||||
|
||||
def test_harbor_watchdog_exposes_controlled_check_and_one_shot_repair() -> None:
    """Pin the safety contract of harbor-watchdog.sh by scanning its source text.

    The script is read as UTF-8 text and never executed. Every marker listed
    below must occur verbatim; the misspelled override variable must not.
    """
    script_path = ROOT / "scripts/reboot-recovery/harbor-watchdog.sh"
    text = script_path.read_text(encoding="utf-8")

    required_markers = (
        # Command-line modes.
        "--check",
        "--repair-once",
        "--apply-once",
        # Flags printed by the read-only check report.
        "check_only=true",
        "docker_compose_action_performed=false",
        "container_remove_performed=false",
        # Expected-host guard in front of repair.
        "AWOOOI_ALLOW_NON_110_HARBOR_REPAIR",
        "EXPECTED_HOST_IP",
        "192.168.0.110",
        "REFUSE harbor repair",
        "require_expected_host_for_apply || return 1",
        # NOTE(review): presumably the default watch loop - confirm against the script.
        "while true",
    )
    for marker in required_markers:
        assert marker in text

    # Guards against the one-letter-short spelling ("AWOOI") of the override variable.
    assert "AWOOI_ALLOW_NON_110_HARBOR_REPAIR" not in text
|
||||
|
||||
|
||||
def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None:
|
||||
text = _workflow_text()
|
||||
assert "onboarding warning-step workflow is" in text
|
||||
|
||||
@@ -18,22 +18,121 @@ HARBOR_DIR="/home/wooo/harbor/harbor"
|
||||
LOG="/var/log/harbor-watchdog.log"
|
||||
LOCKFILE="/var/lock/harbor-repair.lock"
|
||||
CHECK_INTERVAL=60 # 秒
|
||||
MODE="${1:---watch}"
|
||||
EXPECTED_HOST_IP="${AWOOOI_HARBOR_EXPECTED_HOST_IP:-192.168.0.110}"
|
||||
|
||||
# 注意:不使用 set -e,watchdog 是長駐 loop,-e 會在任何指令失敗時終止腳本
|
||||
# 所有關鍵指令的退出碼需明確捕捉(I4)
|
||||
set -uo pipefail
|
||||
|
||||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*" | tee -a "$LOG"; }
|
||||
# Print one timestamped "[watchdog]" line on stdout and, when possible, append
# the same line to $LOG.
#
# Arguments: the message words (joined via "$*").
# The log file is only written when it already exists and is writable, or when
# it does not exist yet and its parent directory is writable; otherwise the
# line goes to stdout only.
log() {
  local stamp entry can_append

  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  entry="[${stamp}] [watchdog] $*"

  can_append=false
  if [ -e "$LOG" ]; then
    if [ -w "$LOG" ]; then
      can_append=true
    fi
  elif [ -w "$(dirname "$LOG")" ]; then
    can_append=true
  fi

  if [ "$can_append" = true ]; then
    echo "$entry" | tee -a "$LOG"
  else
    echo "$entry"
  fi
}
|
||||
|
||||
harbor_is_healthy() {
|
||||
# Print the command-line help text (modes and safety notes) on stdout.
# Takes no arguments; the text is static and nothing is expanded.
usage() {
  printf '%s\n' \
    'Usage: harbor-watchdog.sh [--watch|--check|--repair-once|--apply-once]' \
    '' \
    'Modes:' \
    '--watch Default systemd watchdog loop.' \
    '--check Read-only Harbor readiness check; performs no Docker action.' \
    '--repair-once Run one bounded Harbor repair cycle on 192.168.0.110.' \
    '--apply-once Alias of --repair-once.' \
    '' \
    'Safety:' \
    'Repair modes fail closed unless the local host has 192.168.0.110, or' \
    'AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1 is explicitly set.'
}
|
||||
|
||||
# Probe the local Harbor registry endpoint and print its HTTP status code.
#
# Output: exactly three digits on stdout; "000" when curl fails or prints
#         anything that is not a three-digit code.
#
# Harbor's /v2/ normally answers 401 (authentication required), so curl's -f
# must not be used: -f would turn 401 into a failure (exit 22).
# 127.0.0.1 is used instead of a hostname to avoid IPv6 resolution problems.
probe_harbor_code() {
  local code

  # The fallback is applied OUTSIDE the command substitution. curl already
  # prints "000" through -w when it gets no response, so an inner
  # `|| echo "000"` would append a second copy and yield "000000".
  code="$(curl -s --max-time 5 -o /dev/null -w '%{http_code}' http://127.0.0.1:5000/v2/ 2>/dev/null)" || code="000"

  # Normalise empty or malformed output so callers always see three digits.
  case "$code" in
    [0-9][0-9][0-9]) ;;
    *) code="000" ;;
  esac

  echo "$code"
}
|
||||
|
||||
# Succeed (exit 0) when the local Harbor probe reports HTTP 401 or 200,
# fail (exit 1) for every other status code.
harbor_is_healthy() {
  case "$(probe_harbor_code)" in
    200|401) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
# Succeed when one of the local IPv4 addresses equals $EXPECTED_HOST_IP.
#
# Returns 1 when the `ip` command is unavailable, when it fails, or when the
# address is not present.
host_has_expected_ip() {
  local addrs

  command -v ip >/dev/null 2>&1 || return 1

  # Capture first instead of piping into `grep -q`: with `set -o pipefail`,
  # grep exiting on its first match can kill `ip` with SIGPIPE and turn a real
  # match into a failure.
  addrs="$(ip -o -4 addr show 2>/dev/null)" || return 1

  # -F compares the address literally; as a regex every "." would match any
  # character. The leading space and trailing "/" anchor the whole address.
  grep -qF -- " ${EXPECTED_HOST_IP}/" <<<"$addrs"
}
|
||||
|
||||
# Gate for repair modes: succeed only when repair is allowed on this host.
#
# Returns 0 when AWOOOI_ALLOW_NON_110_HARBOR_REPAIR is exactly "1" (the
# override is logged) or when the host carries $EXPECTED_HOST_IP.
# Otherwise logs a REFUSE line and returns 1.
require_expected_host_for_apply() {
  local override="${AWOOOI_ALLOW_NON_110_HARBOR_REPAIR:-0}"

  if [ "$override" != "1" ]; then
    if host_has_expected_ip; then
      return 0
    fi
    log "REFUSE harbor repair: expected_host_ip=${EXPECTED_HOST_IP} not present; run on 110 local console/root shell"
    return 1
  fi

  # Explicit operator override: the host guard is skipped, and that is logged.
  log "AWOOOI_ALLOW_NON_110_HARBOR_REPAIR=1,略過 expected host guard"
  return 0
}
|
||||
|
||||
# Read-only readiness report: gather local facts and print them as key=value
# lines on stdout, headed by AWOOOI_HARBOR_WATCHDOG_CHECK.
#
# Facts gathered: the HTTP status from probe_harbor_code, whether the docker
# CLI is on PATH, `systemctl is-active docker` (or "unknown" when systemctl is
# absent), presence of $HARBOR_DIR, its docker-compose.yml and $LOCKFILE, and
# whether the host carries $EXPECTED_HOST_IP.
# The function only inspects state; the trailing *_performed=false lines are
# fixed text stating that.
check_harbor() {
  local status_code
  local is_ready=false
  local has_docker_cli=false
  local docker_state="unknown"
  local has_dir=false
  local has_compose=false
  local has_lock=false
  local on_expected_host=false

  status_code="$(probe_harbor_code)"
  # 401 (authentication required) and 200 both count as ready.
  case "$status_code" in
    200|401) is_ready=true ;;
  esac

  if command -v docker >/dev/null 2>&1; then
    has_docker_cli=true
  fi

  if command -v systemctl >/dev/null 2>&1; then
    # `|| true`: is-active exits non-zero for inactive units; keep its text.
    docker_state="$(systemctl is-active docker 2>/dev/null || true)"
  fi

  if [ -d "$HARBOR_DIR" ]; then
    has_dir=true
  fi
  if [ -f "$HARBOR_DIR/docker-compose.yml" ]; then
    has_compose=true
  fi
  if [ -e "$LOCKFILE" ]; then
    has_lock=true
  fi
  if host_has_expected_ip; then
    on_expected_host=true
  fi

  printf '%s\n' \
    "AWOOOI_HARBOR_WATCHDOG_CHECK" \
    "mode=check" \
    "target=127.0.0.1:5000/v2/" \
    "expected_host_ip=${EXPECTED_HOST_IP}" \
    "expected_host_ip_present=${on_expected_host}" \
    "harbor_dir=${HARBOR_DIR}" \
    "harbor_dir_exists=${has_dir}" \
    "harbor_compose_exists=${has_compose}" \
    "docker_cli_available=${has_docker_cli}" \
    "docker_service_state=${docker_state}" \
    "lockfile=${LOCKFILE}" \
    "lockfile_exists=${has_lock}" \
    "harbor_local_v2_http_status=${status_code}" \
    "harbor_ready=${is_ready}" \
    "check_only=true" \
    "docker_compose_action_performed=false" \
    "container_remove_performed=false" \
    "service_restart_performed=false" \
    "host_reboot_performed=false"
}
|
||||
|
||||
repair_harbor() {
|
||||
require_expected_host_for_apply || return 1
|
||||
|
||||
# I1: lockfile 防止與 startup-110.sh 並行修復
|
||||
exec 9>"$LOCKFILE"
|
||||
if ! flock -n 9; then
|
||||
@@ -81,7 +180,10 @@ repair_harbor() {
|
||||
|
||||
# Phase 4: 啟動全組件(I2: 捕捉 stderr)
|
||||
log "Phase 4: 啟動 Harbor 全組件..."
|
||||
docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done || true
|
||||
if ! docker compose up -d 2>&1 | while IFS= read -r line; do log " $line"; done; then
|
||||
log "❌ Harbor 全組件啟動指令失敗"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# S1: 等 30s 讓 harbor-core 完成初始化(原 10s 不夠)
|
||||
log "等待 30s 讓 harbor-core 完成初始化..."
|
||||
@@ -89,12 +191,43 @@ repair_harbor() {
|
||||
|
||||
if harbor_is_healthy; then
|
||||
log "✅ Harbor 修復成功"
|
||||
return 0
|
||||
else
|
||||
log "❌ Harbor 修復後仍不健康,需人工介入"
|
||||
log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 60 && docker compose up -d"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Dispatch on the requested mode. Every mode except --watch exits here;
# --watch falls through to the code that follows this statement.
case "$MODE" in
  -h|--help)
    usage
    exit 0
    ;;
  --watch)
    # Nothing to do here: execution continues after the case statement.
    :
    ;;
  --check)
    check_harbor
    exit 0
    ;;
  --repair-once|--apply-once)
    log "=== Harbor Watchdog 單次修復開始 ==="
    if ! harbor_is_healthy; then
      # One repair attempt; the report is printed afterwards either way and
      # the repair's own status becomes the exit code.
      repair_harbor
      rc=$?
      check_harbor
      exit "$rc"
    fi
    log "Harbor 已健康,跳過修復"
    check_harbor
    exit 0
    ;;
  *)
    # Unknown mode: show help and fail with exit code 2.
    usage
    exit 2
    ;;
esac
|
||||
|
||||
log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ==="
|
||||
|
||||
# I3: while loop 本體用 || true 保護,子 shell 異常不終止整個 watchdog
|
||||
|
||||
Reference in New Issue
Block a user