fix(recovery): include 110 controlled lane verifier
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-30 21:41:31 +08:00
parent 182d22b8ac
commit 214f4c1e88
5 changed files with 54 additions and 2 deletions

View File

@@ -50672,6 +50672,7 @@ production browser smoke:
- verifier 僅讀 metadata,不讀 `.runner` 內容、不印 runner token;檢查 110 host selector、controlled drain lane `capacity=1`、`awoooi-host:host` / `awoooi-ubuntu` labels、ELF binary、registration metadata 存在、systemd CPU / memory / tasks / `NoNewPrivileges` guardrails、legacy runner fail-closed、root restore-source left `0`、active action container / heavy process / load 壓力。
- `awoooi-cd-lane-drain.service`、`awoooi-startup-110.sh` 產生的 controlled drain unit 新增 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`,避免 service active 但未註冊時假裝可承接 `awoooi-host` queue。
- `ops/runner/verify-awoooi-non110-cd-closure.py` 的 Harbor 110 no-matching next action 改為先在 110 跑 `check-awoooi-110-controlled-cd-lane-readiness.sh`,通過後再恢復 `awoooi-host` control path 並重讀 queue/closure。
- `recover-110-control-path-and-harbor-local.sh` `--check` 串入 controlled lane verifier;`deploy-to-110.sh` 同步安裝 `/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh`,讓 110 local console / root shell recovery 包同時涵蓋 SSH metadata、Harbor watchdog 與 `awoooi-host` lane readiness。
**本地驗證結果**
- `pytest ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py ops/runner/test_verify_awoooi_non110_cd_closure.py ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_guard_gitea_runner_pressure.py scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py -q`:`56 passed`

View File

@@ -171,8 +171,10 @@ def test_deploy_to_110_syncs_local_control_path_recovery_helpers() -> None:
assert "repair-110-ssh-publickey-auth-local.sh" in text
assert "recover-110-control-path-and-harbor-local.sh" in text
assert "check-awoooi-110-controlled-cd-lane-readiness.sh" in text
assert "/usr/local/bin/repair-110-ssh-publickey-auth-local.sh" in text
assert "/usr/local/bin/recover-110-control-path-and-harbor-local.sh" in text
assert "/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh" in text
def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None:

View File

@@ -9,6 +9,7 @@
# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail
# user@address target used by the scp/ssh steps of this script.
HOST="wooo@192.168.0.110"
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Absolute path two directory levels above SCRIPT_DIR; used to reach files
# under ops/runner/ (presumably the repository root — confirm against layout).
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
echo "=== 部署 awoooi-startup-110 + harbor-watchdog 到 192.168.0.110 ==="
@@ -18,6 +19,7 @@ scp "$SCRIPT_DIR/awoooi-startup-110.sh" "$HOST:/tmp/awoooi-startup-110.sh"
scp "$SCRIPT_DIR/awoooi-startup-110.service" "$HOST:/tmp/awoooi-startup-110.service"
scp "$SCRIPT_DIR/repair-110-ssh-publickey-auth-local.sh" "$HOST:/tmp/repair-110-ssh-publickey-auth-local.sh"
scp "$SCRIPT_DIR/recover-110-control-path-and-harbor-local.sh" "$HOST:/tmp/recover-110-control-path-and-harbor-local.sh"
scp "$ROOT_DIR/ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh" "$HOST:/tmp/check-awoooi-110-controlled-cd-lane-readiness.sh"
# 2. 上傳 watchdog
echo "[2/5] 上傳 harbor-watchdog..."
@@ -32,6 +34,8 @@ ssh "$HOST" "sudo cp /tmp/awoooi-startup-110.sh /usr/local/bin/awoooi-startup-11
sudo chmod +x /usr/local/bin/repair-110-ssh-publickey-auth-local.sh && \
sudo cp /tmp/recover-110-control-path-and-harbor-local.sh /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \
sudo chmod +x /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \
sudo cp /tmp/check-awoooi-110-controlled-cd-lane-readiness.sh /usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh && \
sudo chmod +x /usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh && \
sudo cp /tmp/awoooi-startup-110.service /etc/systemd/system/awoooi-startup-110.service && \
sudo systemctl daemon-reload && \
sudo systemctl enable awoooi-startup-110.service && \

View File

@@ -13,6 +13,7 @@ EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP:-192.168.0.110}"
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Optional helper path taken from AWOOOI_110_SSH_REPAIR_SCRIPT; empty when the
# environment variable is unset or empty.
SSH_REPAIR_SCRIPT="${AWOOOI_110_SSH_REPAIR_SCRIPT:-}"
# Optional helper path taken from AWOOOI_HARBOR_WATCHDOG_SCRIPT; empty when the
# environment variable is unset or empty.
HARBOR_WATCHDOG_SCRIPT="${AWOOOI_HARBOR_WATCHDOG_SCRIPT:-}"
# Optional helper path taken from AWOOOI_110_CONTROLLED_LANE_VERIFIER_SCRIPT.
# When empty or not executable, resolve_controlled_lane_verifier_script tries
# its built-in locations instead.
CONTROLLED_LANE_VERIFIER_SCRIPT="${AWOOOI_110_CONTROLLED_LANE_VERIFIER_SCRIPT:-}"
# Defaults to "0" when RELOAD_SSH is unset or empty.
# NOTE(review): the consumer of this flag is not visible here — confirm usage.
RELOAD_SSH="${RELOAD_SSH:-0}"
usage() {
@@ -20,7 +21,7 @@ usage() {
Usage: recover-110-control-path-and-harbor-local.sh [--check|--apply-ssh-metadata|--repair-harbor-once|--apply-all]
Modes:
--check Read-only checks for SSH metadata and Harbor readiness.
--check Read-only checks for SSH metadata, Harbor readiness, and controlled CD lane readiness.
--apply-ssh-metadata Fix TARGET_USER home/.ssh/authorized_keys metadata only.
--repair-harbor-once Run one bounded Harbor watchdog repair cycle only.
--apply-all Apply SSH metadata repair, then one Harbor repair cycle.
@@ -120,6 +121,22 @@ resolve_harbor_watchdog_script() {
return 1
}
# Locate the controlled CD lane readiness verifier.
#
# Candidates are tried in this order and the first executable one wins:
#   1. the path in CONTROLLED_LANE_VERIFIER_SCRIPT (skipped when empty)
#   2. the copy under /usr/local/bin
#   3. the copy under ops/runner, addressed relative to SCRIPT_DIR
#
# Prints the chosen path on stdout and returns 0; prints nothing and
# returns 1 when no candidate is executable.
resolve_controlled_lane_verifier_script() {
    local installed_copy="/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh"
    local repo_copy="$SCRIPT_DIR/../../ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh"
    local candidate

    for candidate in "$CONTROLLED_LANE_VERIFIER_SCRIPT" "$installed_copy" "$repo_copy"; do
        # The -n guard keeps an empty override from being tested as a path.
        if [ -n "$candidate" ] && [ -x "$candidate" ]; then
            printf '%s\n' "$candidate"
            return 0
        fi
    done

    return 1
}
run_ssh_check() {
local script
if ! script="$(resolve_ssh_repair_script)"; then
@@ -158,9 +175,20 @@ run_harbor_repair_once() {
"$script" --repair-once
}
# Run the controlled CD lane readiness verifier.
#
# The verifier path comes from resolve_controlled_lane_verifier_script. When
# none can be resolved, a CONTROLLED_LANE_VERIFIER_SCRIPT_STATUS=missing line
# is printed and the function returns 1. Otherwise the verifier is executed
# with TARGET_HOST_IP set to EXPECTED_HOST_IP for that one invocation, and
# its exit status becomes this function's status.
run_controlled_lane_check() {
    local verifier_path

    verifier_path="$(resolve_controlled_lane_verifier_script)" || {
        echo "CONTROLLED_LANE_VERIFIER_SCRIPT_STATUS=missing"
        return 1
    }

    TARGET_HOST_IP="$EXPECTED_HOST_IP" "$verifier_path"
}
echo "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=${MODE} target_user=${TARGET_USER}"
echo "expected_host_ip=${EXPECTED_HOST_IP}"
echo "operation_boundary_secret_value_read=false"
echo "operation_boundary_runner_token_read=false"
echo "operation_boundary_raw_runner_registration_read=false"
echo "operation_boundary_host_reboot_performed=false"
echo "operation_boundary_docker_daemon_restart_performed=false"
echo "operation_boundary_node_drain_performed=false"
@@ -169,6 +197,7 @@ case "$MODE" in
check)
run_ssh_check || true
run_harbor_check || true
run_controlled_lane_check || true
;;
apply_ssh_metadata)
run_ssh_apply
@@ -179,6 +208,7 @@ case "$MODE" in
apply_all)
run_ssh_apply
run_harbor_repair_once
run_controlled_lane_check || true
;;
*)
echo "Unknown internal mode: $MODE" >&2

View File

@@ -18,11 +18,15 @@ def test_recover_110_orchestrator_contracts() -> None:
assert "--repair-harbor-once" in text
assert "--apply-all" in text
assert "operation_boundary_secret_value_read=false" in text
assert "operation_boundary_runner_token_read=false" in text
assert "operation_boundary_raw_runner_registration_read=false" in text
assert "operation_boundary_host_reboot_performed=false" in text
assert "operation_boundary_docker_daemon_restart_performed=false" in text
assert "repair-110-ssh-publickey-auth-local.sh" in text
assert "harbor-watchdog.sh" in text
assert "check-awoooi-110-controlled-cd-lane-readiness.sh" in text
assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text
assert "cat \"$CD_LANE_DRAIN_DIR/data/.runner\"" not in text
forbidden = [
"systemctl restart docker",
@@ -41,6 +45,7 @@ def test_recover_110_orchestrator_contracts() -> None:
def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> None:
ssh_helper = tmp_path / "ssh-helper.sh"
harbor_helper = tmp_path / "harbor-helper.sh"
lane_helper = tmp_path / "lane-helper.sh"
ssh_helper.write_text(
"#!/usr/bin/env bash\n"
"echo SSH_HELPER_MODE=$1\n"
@@ -53,7 +58,13 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N
"echo HARBOR_RUNTIME_WRITE=false\n",
encoding="utf-8",
)
for helper in (ssh_helper, harbor_helper):
lane_helper.write_text(
"#!/usr/bin/env bash\n"
"echo LANE_VERIFIER_MODE=check\n"
"echo LANE_VERIFIER_REGISTRATION_CONTENT_READ=false\n",
encoding="utf-8",
)
for helper in (ssh_helper, harbor_helper, lane_helper):
helper.chmod(helper.stat().st_mode | stat.S_IXUSR)
env = {
@@ -61,6 +72,7 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N
"ALLOW_NON_110": "1",
"AWOOOI_110_SSH_REPAIR_SCRIPT": str(ssh_helper),
"AWOOOI_HARBOR_WATCHDOG_SCRIPT": str(harbor_helper),
"AWOOOI_110_CONTROLLED_LANE_VERIFIER_SCRIPT": str(lane_helper),
}
result = subprocess.run(
["bash", str(RECOVERY), "--check"],
@@ -75,6 +87,9 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N
assert "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=check" in result.stdout
assert "SSH_HELPER_MODE=--check" in result.stdout
assert "HARBOR_HELPER_MODE=--check" in result.stdout
assert "LANE_VERIFIER_MODE=check" in result.stdout
assert "SSH_METADATA_WRITE=false" in result.stdout
assert "HARBOR_RUNTIME_WRITE=false" in result.stdout
assert "LANE_VERIFIER_REGISTRATION_CONTENT_READ=false" in result.stdout
assert "operation_boundary_secret_value_read=false" in result.stdout
assert "operation_boundary_raw_runner_registration_read=false" in result.stdout