fix(cd): keep ops recovery checks on controlled profile
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 41s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -400,6 +400,7 @@ def build_readback(
|
||||
latest_harbor_110_repair_log_text
|
||||
)
|
||||
latest_cd_status = latest_cd_run.get("status", "")
|
||||
latest_cd_success = latest_cd_status == "Success"
|
||||
latest_cd_visible_blocked = latest_cd_status == "Blocked"
|
||||
latest_cd_waiting = latest_cd_status == "Waiting"
|
||||
host_pressure_waiting_from_stale_jobs = (
|
||||
@@ -512,7 +513,41 @@ def build_readback(
|
||||
current_cd_waiting_behind_harbor_110_repair_running = (
|
||||
latest_cd_waiting and harbor_110_repair_running
|
||||
)
|
||||
harbor_110_repair_blocked = (
|
||||
harbor_110_repair_historical_after_latest_cd_success = bool(
|
||||
latest_cd_success
|
||||
and latest_cd_run_id
|
||||
and harbor_110_repair_run_id
|
||||
and harbor_110_repair_run_id != latest_cd_run_id
|
||||
)
|
||||
effective_remote_ssh_publickey_auth_stalled = bool(
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_control_channel_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_local_registry_v2_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_remote_public_registry_v2_unavailable = bool(
|
||||
harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_failed = bool(
|
||||
harbor_110_repair_failed
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_jobs_stale_or_mismatched = bool(
|
||||
harbor_110_repair_jobs_stale_or_mismatched
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
effective_harbor_110_repair_visible_failure_jobs_api_stale = bool(
|
||||
harbor_110_repair_visible_failure_jobs_api_stale
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
harbor_110_repair_blocked_raw = (
|
||||
harbor_110_repair_status_blocked
|
||||
or harbor_110_repair_failed
|
||||
or bool(harbor_110_repair_no_matching_runner_label)
|
||||
@@ -520,11 +555,17 @@ def build_readback(
|
||||
or harbor_110_repair_visible_running_jobs_api_stale
|
||||
or bool(harbor_110_repair_log_classifier["failure_classifier"])
|
||||
)
|
||||
harbor_110_repair_blocked = bool(
|
||||
harbor_110_repair_blocked_raw
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
)
|
||||
safe_next_action = _queue_safe_next_action(
|
||||
latest_cd_waiting=latest_cd_waiting,
|
||||
latest_cd_status=latest_cd_status,
|
||||
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
|
||||
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
|
||||
cd_jobs_stale_or_mismatched=(
|
||||
cd_jobs_stale_or_mismatched and not latest_cd_success
|
||||
),
|
||||
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
|
||||
effective_host_pressure_classifier=effective_tests_log_classifier[
|
||||
"host_pressure_classifier"
|
||||
@@ -540,34 +581,36 @@ def build_readback(
|
||||
],
|
||||
harbor_110_repair_no_matching_runner_label=(
|
||||
harbor_110_repair_no_matching_runner_label
|
||||
if not harbor_110_repair_historical_after_latest_cd_success
|
||||
else ""
|
||||
),
|
||||
harbor_110_repair_waiting=harbor_110_repair_waiting,
|
||||
harbor_110_repair_running=harbor_110_repair_running,
|
||||
harbor_110_repair_failed=harbor_110_repair_failed,
|
||||
harbor_110_repair_failed=effective_harbor_110_repair_failed,
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker=(
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
harbor_110_repair_jobs_stale_or_mismatched=(
|
||||
harbor_110_repair_jobs_stale_or_mismatched
|
||||
effective_harbor_110_repair_jobs_stale_or_mismatched
|
||||
),
|
||||
harbor_110_repair_jobs_payload_classifier=(
|
||||
harbor_110_repair_jobs_payload_classifier
|
||||
),
|
||||
harbor_110_repair_visible_running_jobs_api_stale=(
|
||||
harbor_110_repair_visible_running_jobs_api_stale
|
||||
and not harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
harbor_110_repair_visible_failure_jobs_api_stale=(
|
||||
harbor_110_repair_visible_failure_jobs_api_stale
|
||||
effective_harbor_110_repair_visible_failure_jobs_api_stale
|
||||
),
|
||||
current_cd_waiting_behind_harbor_110_repair_running=(
|
||||
current_cd_waiting_behind_harbor_110_repair_running
|
||||
),
|
||||
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
|
||||
"remote_control_channel_unavailable"
|
||||
],
|
||||
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_auth_stalled"
|
||||
],
|
||||
remote_control_channel_unavailable=(
|
||||
effective_remote_control_channel_unavailable
|
||||
),
|
||||
remote_ssh_publickey_auth_stalled=effective_remote_ssh_publickey_auth_stalled,
|
||||
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
],
|
||||
@@ -684,10 +727,16 @@ def build_readback(
|
||||
"latest_visible_harbor_110_repair_waiting": harbor_110_repair_waiting,
|
||||
"latest_visible_harbor_110_repair_running": harbor_110_repair_running,
|
||||
"latest_visible_harbor_110_repair_failed": harbor_110_repair_failed,
|
||||
"latest_visible_harbor_110_repair_historical_after_latest_cd_success": (
|
||||
harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
"latest_visible_harbor_110_repair_status_blocked": (
|
||||
harbor_110_repair_status_blocked
|
||||
),
|
||||
"latest_visible_harbor_110_repair_blocked": harbor_110_repair_blocked,
|
||||
"latest_visible_harbor_110_repair_blocked_raw": (
|
||||
harbor_110_repair_blocked_raw
|
||||
),
|
||||
"latest_visible_harbor_110_repair_log_http_status": (
|
||||
latest_harbor_110_repair_log_http_status
|
||||
),
|
||||
@@ -872,13 +921,13 @@ def build_readback(
|
||||
else "blocked_latest_visible_cd_run"
|
||||
if latest_cd_visible_blocked
|
||||
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
|
||||
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
if effective_remote_ssh_publickey_auth_stalled
|
||||
else "blocked_harbor_110_remote_control_channel_unavailable"
|
||||
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
|
||||
if effective_remote_control_channel_unavailable
|
||||
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
|
||||
if harbor_110_repair_log_classifier["local_registry_v2_unavailable"]
|
||||
if effective_remote_local_registry_v2_unavailable
|
||||
else "blocked_harbor_public_registry_v2_unavailable_after_remote_repair"
|
||||
if harbor_110_repair_log_classifier["public_registry_v2_unavailable"]
|
||||
if effective_remote_public_registry_v2_unavailable
|
||||
else "blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback"
|
||||
if (
|
||||
latest_cd_waiting
|
||||
@@ -891,7 +940,7 @@ def build_readback(
|
||||
else "blocked_harbor_110_repair_failed"
|
||||
if (
|
||||
build_log_classifier["harbor_public_route_blocked_or_retrying"]
|
||||
and harbor_110_repair_failed
|
||||
and effective_harbor_110_repair_failed
|
||||
)
|
||||
else (
|
||||
"blocked_harbor_public_route_unavailable_after_harbor_110_repair_success"
|
||||
@@ -925,13 +974,13 @@ def build_readback(
|
||||
else "harbor_110_repair_running"
|
||||
if harbor_110_repair_running
|
||||
else "blocked_harbor_110_repair_failed"
|
||||
if harbor_110_repair_failed
|
||||
if effective_harbor_110_repair_failed
|
||||
else "blocked_harbor_110_repair_run"
|
||||
if harbor_110_repair_blocked
|
||||
else "harbor_110_repair_jobs_stale_or_mismatched"
|
||||
if harbor_110_repair_jobs_stale_or_mismatched
|
||||
if effective_harbor_110_repair_jobs_stale_or_mismatched
|
||||
else "cd_jobs_stale_or_mismatched"
|
||||
if cd_jobs_stale_or_mismatched
|
||||
if cd_jobs_stale_or_mismatched and not latest_cd_success
|
||||
else "no_matching_runner_not_visible"
|
||||
),
|
||||
"readback": readback,
|
||||
@@ -1014,7 +1063,11 @@ def build_readback(
|
||||
"harbor_110_repair_waiting": harbor_110_repair_waiting,
|
||||
"harbor_110_repair_running": harbor_110_repair_running,
|
||||
"harbor_110_repair_failed": harbor_110_repair_failed,
|
||||
"harbor_110_repair_historical_after_latest_cd_success": (
|
||||
harbor_110_repair_historical_after_latest_cd_success
|
||||
),
|
||||
"harbor_110_repair_blocked": harbor_110_repair_blocked,
|
||||
"harbor_110_repair_blocked_raw": harbor_110_repair_blocked_raw,
|
||||
"harbor_110_repair_waiting_after_cd_harbor_blocker": (
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker
|
||||
),
|
||||
|
||||
@@ -703,6 +703,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
expected_sources = [
|
||||
"docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md)",
|
||||
"docs/runbooks/FULL-STACK-COLD-START-SOP.md)",
|
||||
"docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)",
|
||||
"docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md)",
|
||||
"ops/monitoring/alerts-unified.yml)",
|
||||
"ops/monitoring/alerts.yml)",
|
||||
@@ -725,6 +726,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
"scripts/reboot-recovery/full-stack-recovery-scorecard.sh)",
|
||||
"scripts/reboot-recovery/awoooi-startup-110.sh)",
|
||||
"scripts/reboot-recovery/harbor-watchdog.sh)",
|
||||
"scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)",
|
||||
"scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)",
|
||||
"scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)",
|
||||
"scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)",
|
||||
@@ -750,6 +752,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N
|
||||
"../../scripts/reboot-recovery/momo-source-arrival-gate.py",
|
||||
"../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh",
|
||||
"../../scripts/reboot-recovery/harbor-watchdog.sh",
|
||||
"../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh",
|
||||
"../../scripts/reboot-recovery/awoooi-startup-110.sh",
|
||||
"../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh",
|
||||
"../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh",
|
||||
|
||||
@@ -142,6 +142,20 @@ def _actions_html_cd_failed_harbor_repair_failed() -> str:
|
||||
)
|
||||
|
||||
|
||||
def _actions_html_cd_success_harbor_repair_failed() -> str:
    """Return an Actions HTML fixture with a "Success" and a "Failure" run.

    The markup is derived from
    ``_actions_html_cd_running_harbor_repair_waiting()`` by rewriting the
    first "Running" tooltip to "Success", the first "Waiting" tooltip to
    "Failure", the run ids 4061/4060 to 4314/4307, and the commit title.
    """
    html = _actions_html_cd_running_harbor_repair_waiting()

    # Status tooltips: only the first occurrence of each is rewritten.
    html = html.replace(
        'data-tooltip-content="Running"', 'data-tooltip-content="Success"', 1
    )
    html = html.replace(
        'data-tooltip-content="Waiting"', 'data-tooltip-content="Failure"', 1
    )

    # Run ids are rewritten everywhere they appear; order matters because the
    # replacements are applied sequentially on the same string.
    for old_run_id, new_run_id in (("4061", "4314"), ("4060", "4307")):
        html = html.replace(old_run_id, new_run_id)

    return html.replace(
        "fix(cd): keep harbor repair workflow on controlled profile",
        "feat(web): surface AI automation production proof",
    )
|
||||
|
||||
|
||||
def _actions_html_harbor_repair_waiting_with_workflow_no_matching() -> str:
|
||||
return """
|
||||
<div class="menu">
|
||||
@@ -696,6 +710,49 @@ def test_harbor_ssh_command_path_ready_overrides_raw_publickey_stall() -> None:
|
||||
assert classifier["failure_classifier"] == ""
|
||||
|
||||
|
||||
def test_latest_cd_success_makes_old_harbor_repair_failure_historical() -> None:
    """A failed Harbor repair run is reported as historical after a CD success.

    Builds a readback from the "CD success / Harbor repair failed" HTML
    fixture and checks that the raw failure signals stay visible
    (``*_failed``, ``*_blocked_raw``, the publickey-stall flag) while the
    effective ``*_blocked`` flags and the overall status are not blocked.
    """
    module = _load_module()

    # Inputs are assembled in the same order the keyword arguments are passed.
    inputs = {
        "actions_html": _actions_html_cd_success_harbor_repair_failed(),
        "actions_list_http_status": 401,
        "actions_list_payload": {"message": "token is required"},
        "cd_jobs_http_status": 200,
        "cd_jobs_payload": {"jobs": [], "total_count": 0},
        "harbor_110_repair_jobs_http_status": 200,
        "harbor_110_repair_jobs_payload": (
            _harbor_110_repair_stale_code_review_jobs()
        ),
        "latest_harbor_110_repair_log_http_status": 200,
        "latest_harbor_110_repair_log_text": (
            _harbor_110_repair_publickey_auth_stalled_log()
        ),
    }
    payload = module.build_readback(**inputs)

    readback = payload["readback"]
    repair_prefix = "latest_visible_harbor_110_repair_"

    assert readback["latest_visible_cd_run_status"] == "Success"
    assert readback[repair_prefix + "historical_after_latest_cd_success"] is True

    # Raw signals from the old repair run are still reported as-is ...
    assert readback[repair_prefix + "failed"] is True
    assert readback[repair_prefix + "blocked_raw"] is True
    # ... but the effective blocked flag is cleared.
    assert readback[repair_prefix + "blocked"] is False
    assert readback[repair_prefix + "remote_ssh_publickey_auth_stalled"] is True

    assert payload["status"] == "no_matching_runner_not_visible"

    rollups = payload["rollups"]
    assert rollups["harbor_110_repair_historical_after_latest_cd_success"] is True
    assert rollups["harbor_110_repair_blocked"] is False
    assert rollups["harbor_110_repair_blocked_raw"] is True
|
||||
|
||||
|
||||
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
|
||||
module = _load_module()
|
||||
payload = module.build_readback(
|
||||
|
||||
Reference in New Issue
Block a user