fix(cd): keep ops recovery checks on controlled profile
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
ogt
2026-07-01 23:40:18 +08:00
parent 670cf9afd6
commit 9f4f1b417c
5 changed files with 158 additions and 20 deletions

View File

@@ -400,6 +400,7 @@ def build_readback(
latest_harbor_110_repair_log_text
)
latest_cd_status = latest_cd_run.get("status", "")
latest_cd_success = latest_cd_status == "Success"
latest_cd_visible_blocked = latest_cd_status == "Blocked"
latest_cd_waiting = latest_cd_status == "Waiting"
host_pressure_waiting_from_stale_jobs = (
@@ -512,7 +513,41 @@ def build_readback(
current_cd_waiting_behind_harbor_110_repair_running = (
latest_cd_waiting and harbor_110_repair_running
)
harbor_110_repair_blocked = (
harbor_110_repair_historical_after_latest_cd_success = bool(
latest_cd_success
and latest_cd_run_id
and harbor_110_repair_run_id
and harbor_110_repair_run_id != latest_cd_run_id
)
effective_remote_ssh_publickey_auth_stalled = bool(
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_control_channel_unavailable = bool(
harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_local_registry_v2_unavailable = bool(
harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_remote_public_registry_v2_unavailable = bool(
harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_failed = bool(
harbor_110_repair_failed
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_jobs_stale_or_mismatched = bool(
harbor_110_repair_jobs_stale_or_mismatched
and not harbor_110_repair_historical_after_latest_cd_success
)
effective_harbor_110_repair_visible_failure_jobs_api_stale = bool(
harbor_110_repair_visible_failure_jobs_api_stale
and not harbor_110_repair_historical_after_latest_cd_success
)
harbor_110_repair_blocked_raw = (
harbor_110_repair_status_blocked
or harbor_110_repair_failed
or bool(harbor_110_repair_no_matching_runner_label)
@@ -520,11 +555,17 @@ def build_readback(
or harbor_110_repair_visible_running_jobs_api_stale
or bool(harbor_110_repair_log_classifier["failure_classifier"])
)
harbor_110_repair_blocked = bool(
harbor_110_repair_blocked_raw
and not harbor_110_repair_historical_after_latest_cd_success
)
safe_next_action = _queue_safe_next_action(
latest_cd_waiting=latest_cd_waiting,
latest_cd_status=latest_cd_status,
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
cd_jobs_stale_or_mismatched=(
cd_jobs_stale_or_mismatched and not latest_cd_success
),
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
effective_host_pressure_classifier=effective_tests_log_classifier[
"host_pressure_classifier"
@@ -540,34 +581,36 @@ def build_readback(
],
harbor_110_repair_no_matching_runner_label=(
harbor_110_repair_no_matching_runner_label
if not harbor_110_repair_historical_after_latest_cd_success
else ""
),
harbor_110_repair_waiting=harbor_110_repair_waiting,
harbor_110_repair_running=harbor_110_repair_running,
harbor_110_repair_failed=harbor_110_repair_failed,
harbor_110_repair_failed=effective_harbor_110_repair_failed,
harbor_110_repair_waiting_after_cd_harbor_blocker=(
harbor_110_repair_waiting_after_cd_harbor_blocker
and not harbor_110_repair_historical_after_latest_cd_success
),
harbor_110_repair_jobs_stale_or_mismatched=(
harbor_110_repair_jobs_stale_or_mismatched
effective_harbor_110_repair_jobs_stale_or_mismatched
),
harbor_110_repair_jobs_payload_classifier=(
harbor_110_repair_jobs_payload_classifier
),
harbor_110_repair_visible_running_jobs_api_stale=(
harbor_110_repair_visible_running_jobs_api_stale
and not harbor_110_repair_historical_after_latest_cd_success
),
harbor_110_repair_visible_failure_jobs_api_stale=(
harbor_110_repair_visible_failure_jobs_api_stale
effective_harbor_110_repair_visible_failure_jobs_api_stale
),
current_cd_waiting_behind_harbor_110_repair_running=(
current_cd_waiting_behind_harbor_110_repair_running
),
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
"remote_control_channel_unavailable"
],
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
"remote_ssh_publickey_auth_stalled"
],
remote_control_channel_unavailable=(
effective_remote_control_channel_unavailable
),
remote_ssh_publickey_auth_stalled=effective_remote_ssh_publickey_auth_stalled,
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
"remote_ssh_publickey_offer_timeout"
],
@@ -684,10 +727,16 @@ def build_readback(
"latest_visible_harbor_110_repair_waiting": harbor_110_repair_waiting,
"latest_visible_harbor_110_repair_running": harbor_110_repair_running,
"latest_visible_harbor_110_repair_failed": harbor_110_repair_failed,
"latest_visible_harbor_110_repair_historical_after_latest_cd_success": (
harbor_110_repair_historical_after_latest_cd_success
),
"latest_visible_harbor_110_repair_status_blocked": (
harbor_110_repair_status_blocked
),
"latest_visible_harbor_110_repair_blocked": harbor_110_repair_blocked,
"latest_visible_harbor_110_repair_blocked_raw": (
harbor_110_repair_blocked_raw
),
"latest_visible_harbor_110_repair_log_http_status": (
latest_harbor_110_repair_log_http_status
),
@@ -872,13 +921,13 @@ def build_readback(
else "blocked_latest_visible_cd_run"
if latest_cd_visible_blocked
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
if effective_remote_ssh_publickey_auth_stalled
else "blocked_harbor_110_remote_control_channel_unavailable"
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
if effective_remote_control_channel_unavailable
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
if harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
if effective_remote_local_registry_v2_unavailable
else "blocked_harbor_public_registry_v2_unavailable_after_remote_repair"
if harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
if effective_remote_public_registry_v2_unavailable
else "blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback"
if (
latest_cd_waiting
@@ -891,7 +940,7 @@ def build_readback(
else "blocked_harbor_110_repair_failed"
if (
build_log_classifier["harbor_public_route_blocked_or_retrying"]
and harbor_110_repair_failed
and effective_harbor_110_repair_failed
)
else (
"blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
@@ -925,13 +974,13 @@ def build_readback(
else "harbor_110_repair_running"
if harbor_110_repair_running
else "blocked_harbor_110_repair_failed"
if harbor_110_repair_failed
if effective_harbor_110_repair_failed
else "blocked_harbor_110_repair_run"
if harbor_110_repair_blocked
else "harbor_110_repair_jobs_stale_or_mismatched"
if harbor_110_repair_jobs_stale_or_mismatched
if effective_harbor_110_repair_jobs_stale_or_mismatched
else "cd_jobs_stale_or_mismatched"
if cd_jobs_stale_or_mismatched
if cd_jobs_stale_or_mismatched and not latest_cd_success
else "no_matching_runner_not_visible"
),
"readback": readback,
@@ -1014,7 +1063,11 @@ def build_readback(
"harbor_110_repair_waiting": harbor_110_repair_waiting,
"harbor_110_repair_running": harbor_110_repair_running,
"harbor_110_repair_failed": harbor_110_repair_failed,
"harbor_110_repair_historical_after_latest_cd_success": (
harbor_110_repair_historical_after_latest_cd_success
),
"harbor_110_repair_blocked": harbor_110_repair_blocked,
"harbor_110_repair_blocked_raw": harbor_110_repair_blocked_raw,
"harbor_110_repair_waiting_after_cd_harbor_blocker": (
harbor_110_repair_waiting_after_cd_harbor_blocker
),

View File

@@ -703,6 +703,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
expected_sources = [
"docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md)",
"docs/runbooks/FULL-STACK-COLD-START-SOP.md)",
"docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)",
"docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md)",
"ops/monitoring/alerts-unified.yml)",
"ops/monitoring/alerts.yml)",
@@ -725,6 +726,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
"scripts/reboot-recovery/full-stack-recovery-scorecard.sh)",
"scripts/reboot-recovery/awoooi-startup-110.sh)",
"scripts/reboot-recovery/harbor-watchdog.sh)",
"scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)",
"scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)",
"scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)",
"scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)",
@@ -750,6 +752,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
"../../scripts/reboot-recovery/momo-source-arrival-gate.py",
"../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh",
"../../scripts/reboot-recovery/harbor-watchdog.sh",
"../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh",
"../../scripts/reboot-recovery/awoooi-startup-110.sh",
"../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh",
"../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh",

View File

@@ -142,6 +142,20 @@ def _actions_html_cd_failed_harbor_repair_failed() -> str:
)
def _actions_html_cd_success_harbor_repair_failed() -> str:
    """Return actions-page HTML derived from the running/waiting fixture.

    The base fixture is rewritten step by step: the first "Running" tooltip
    becomes "Success", the first "Waiting" tooltip becomes "Failure", every
    "4061"/"4060" is renumbered to "4314"/"4307", and the commit title is
    swapped.
    """
    # NOTE(review): presumably the first "Running" tooltip belongs to the CD
    # run and the first "Waiting" tooltip to the Harbor repair run -- confirm
    # against the base fixture, which is not visible here.
    html = _actions_html_cd_running_harbor_repair_waiting()

    # Tooltip rewrites are limited to the first occurrence (count=1).
    html = html.replace(
        'data-tooltip-content="Running"',
        'data-tooltip-content="Success"',
        1,
    )
    html = html.replace(
        'data-tooltip-content="Waiting"',
        'data-tooltip-content="Failure"',
        1,
    )

    # Run-number rewrites apply to every occurrence; order is kept as-is.
    html = html.replace("4061", "4314")
    html = html.replace("4060", "4307")

    old_title = "fix(cd): keep harbor repair workflow on controlled profile"
    new_title = "feat(web): surface AI automation production proof"
    return html.replace(old_title, new_title)
def _actions_html_harbor_repair_waiting_with_workflow_no_matching() -> str:
return """
<div class="menu">
@@ -696,6 +710,49 @@ def test_harbor_ssh_command_path_ready_overrides_raw_publickey_stall() -> None:
assert classifier["failure_classifier"] == ""
def test_latest_cd_success_makes_old_harbor_repair_failure_historical() -> None:
    """A failed Harbor repair run is reported as historical after CD success.

    Feeds ``build_readback`` the CD-success / repair-failure actions HTML plus
    a publickey-auth-stalled repair log, then checks that the raw failure
    signals stay visible while the effective ``blocked`` flags are cleared and
    the overall status is ``no_matching_runner_not_visible``.
    """
    readback_module = _load_module()

    # Keyword arguments are built in the same order the call would evaluate
    # them, so fixture helpers run in an unchanged sequence.
    build_kwargs = {
        "actions_html": _actions_html_cd_success_harbor_repair_failed(),
        "actions_list_http_status": 401,
        "actions_list_payload": {"message": "token is required"},
        "cd_jobs_http_status": 200,
        "cd_jobs_payload": {"jobs": [], "total_count": 0},
        "harbor_110_repair_jobs_http_status": 200,
        "harbor_110_repair_jobs_payload": (
            _harbor_110_repair_stale_code_review_jobs()
        ),
        "latest_harbor_110_repair_log_http_status": 200,
        "latest_harbor_110_repair_log_text": (
            _harbor_110_repair_publickey_auth_stalled_log()
        ),
    }
    result = readback_module.build_readback(**build_kwargs)

    # Readback section: raw signals remain, the effective block is lifted.
    readback = result["readback"]
    assert readback["latest_visible_cd_run_status"] == "Success"
    historical_flag = readback[
        "latest_visible_harbor_110_repair_historical_after_latest_cd_success"
    ]
    assert historical_flag is True
    assert readback["latest_visible_harbor_110_repair_failed"] is True
    assert readback["latest_visible_harbor_110_repair_blocked_raw"] is True
    assert readback["latest_visible_harbor_110_repair_blocked"] is False
    auth_stalled_flag = readback[
        "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
    ]
    assert auth_stalled_flag is True

    assert result["status"] == "no_matching_runner_not_visible"

    # Rollups section mirrors the same historical / raw / effective split.
    rollups = result["rollups"]
    rollup_historical_flag = rollups[
        "harbor_110_repair_historical_after_latest_cd_success"
    ]
    assert rollup_historical_flag is True
    assert rollups["harbor_110_repair_blocked"] is False
    assert rollups["harbor_110_repair_blocked_raw"] is True
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
module = _load_module()
payload = module.build_readback(