From d7f62b623700e7c45cd3fc9cbfe741013b7065c1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 08:06:01 +0800 Subject: [PATCH] feat(recovery): expose 110 control path readiness --- ...or_registry_controlled_recovery_receipt.py | 192 ++++++++++++++++++ ...or_registry_controlled_recovery_receipt.py | 75 +++++++ 2 files changed, 267 insertions(+) diff --git a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py index 33221af25..193addaa7 100644 --- a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py @@ -75,6 +75,11 @@ def validate_harbor_registry_controlled_recovery_receipt( controlled_cd_lane=controlled_cd_lane, verifier=verifier, ) + control_path_readiness = _control_path_readiness( + ssh_diagnosis=ssh_diagnosis, + verifier=verifier, + gitea_queue=gitea_queue, + ) return { "schema_version": _SCHEMA_VERSION, @@ -117,6 +122,7 @@ def validate_harbor_registry_controlled_recovery_receipt( "post_apply_verifier": verifier, "gitea_actions_queue": gitea_queue, "deploy_marker": deploy_marker, + "control_path_readiness": control_path_readiness, }, "local_console_phase_readback": local_console_phase_readback, "controlled_apply_policy": { @@ -253,6 +259,28 @@ def validate_harbor_registry_controlled_recovery_receipt( "deploy_marker_production_image_matches_expected": deploy_marker[ "production_image_matches_expected" ], + "control_path_readiness_status": control_path_readiness["status"], + "control_path_readiness_primary_blocker": control_path_readiness[ + "primary_blocker" + ], + "control_path_readiness_blocker_count": control_path_readiness[ + "blocker_count" + ], + "control_path_readiness_node_high_load": control_path_readiness[ + "node_high_load" + ], + "control_path_readiness_runner_systemctl_timeout": ( + control_path_readiness["runner_systemctl_show_timeout"] + ), + "control_path_readiness_awoooi_host_unavailable": ( + control_path_readiness["awoooi_host_runner_unavailable"] + ), + "control_path_readiness_registry_v2_public_ready": ( + control_path_readiness["registry_v2_public_ready"] + ), + "control_path_readiness_registry_v2_internal_ready": ( + control_path_readiness["registry_v2_internal_ready"] + ), "metadata_writeback_contract_ready": True, "local_console_phase_count": local_console_phase_readback["phase_count"], "local_console_completed_phase_count": local_console_phase_readback[ @@ -280,6 +308,170 @@ def validate_harbor_registry_controlled_recovery_receipt( } +def _control_path_readiness( + *, + ssh_diagnosis: dict[str, Any], + verifier: dict[str, Any], + gitea_queue: dict[str, Any], +) -> dict[str, Any]: + public_registry_ready = bool(verifier["public_registry_v2_ready"]) + internal_registry_ready = bool(verifier["internal_registry_v2_ready"]) + queue_no_matching_runner = bool( + gitea_queue["harbor_110_repair_no_matching_runner"] + ) + queue_jobs_stale = bool(gitea_queue["harbor_110_repair_jobs_stale_or_mismatched"]) + queue_jobs_cross_workflow = bool( + gitea_queue["harbor_110_repair_jobs_cross_workflow_mismatch"] + ) + cd_jobs_stale = bool(gitea_queue["cd_run_jobs_stale_or_mismatched"]) + cd_jobs_head_sha_mismatch = bool(gitea_queue["cd_run_jobs_head_sha_mismatch"]) + runner_timeout = bool(ssh_diagnosis["runner_systemctl_show_timeout_seen"]) + node_high_load = bool(ssh_diagnosis["node_high_load_seen"]) + publickey_offer_timeout = bool(ssh_diagnosis["publickey_offer_timeout_seen"]) + server_accepts_key_then_timeout = bool( + ssh_diagnosis["server_accepts_key_then_timeout_seen"] + ) + awoooi_host_unavailable = queue_no_matching_runner or queue_jobs_stale + registry_v2_ready = public_registry_ready and internal_registry_ready + signal_ids = _control_path_signal_ids( + node_high_load=node_high_load, + publickey_offer_timeout=publickey_offer_timeout, + server_accepts_key_then_timeout=server_accepts_key_then_timeout, + runner_timeout=runner_timeout, + queue_no_matching_runner=queue_no_matching_runner, + queue_jobs_stale=queue_jobs_stale, + queue_jobs_cross_workflow=queue_jobs_cross_workflow, + cd_jobs_head_sha_mismatch=cd_jobs_head_sha_mismatch, + cd_jobs_stale=cd_jobs_stale, + public_registry_ready=public_registry_ready, + internal_registry_ready=internal_registry_ready, + ) + status = _control_path_readiness_status( + node_high_load=node_high_load, + runner_timeout=runner_timeout, + ssh_timeout=publickey_offer_timeout or server_accepts_key_then_timeout, + awoooi_host_unavailable=awoooi_host_unavailable, + registry_v2_ready=registry_v2_ready, + signal_ids=signal_ids, + ) + return { + "status": status, + "primary_blocker": signal_ids[0] if signal_ids else "", + "safe_next_action": _control_path_safe_next_action(status=status), + "signal_ids": signal_ids, + "blocker_count": len(signal_ids), + "ssh_diagnosis_receipt_seen": ssh_diagnosis["receipt_seen"], + "gitea_queue_readback_seen": gitea_queue["receipt_seen"], + "node_high_load": node_high_load, + "node_load_classifier": ssh_diagnosis["node_load_classifier"], + "node_load1_per_cpu": ssh_diagnosis["node_load1_per_cpu"], + "ssh_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"], + "ssh_publickey_offer_timeout": publickey_offer_timeout, + "ssh_server_accepts_key_then_timeout": server_accepts_key_then_timeout, + "runner_systemctl_show_timeout": runner_timeout, + "awoooi_host_runner_unavailable": awoooi_host_unavailable, + "harbor_110_repair_no_matching_runner": queue_no_matching_runner, + "harbor_110_repair_no_matching_runner_label": gitea_queue[ + "harbor_110_repair_no_matching_runner_label" + ], + "harbor_110_repair_jobs_stale_or_mismatched": queue_jobs_stale, + "harbor_110_repair_jobs_cross_workflow_mismatch": queue_jobs_cross_workflow, + "cd_jobs_stale_or_mismatched": cd_jobs_stale, + "cd_jobs_head_sha_mismatch": cd_jobs_head_sha_mismatch, + "registry_v2_public_http_status": verifier[ + "public_registry_v2_http_status" + ], + "registry_v2_internal_http_status": verifier[ + "internal_registry_v2_http_status" + ], + "registry_v2_public_ready": public_registry_ready, + "registry_v2_internal_ready": internal_registry_ready, + "registry_v2_ready": registry_v2_ready, + "metadata_only": True, + "raw_output_returned": False, + } + + +def _control_path_signal_ids( + *, + node_high_load: bool, + publickey_offer_timeout: bool, + server_accepts_key_then_timeout: bool, + runner_timeout: bool, + queue_no_matching_runner: bool, + queue_jobs_stale: bool, + queue_jobs_cross_workflow: bool, + cd_jobs_head_sha_mismatch: bool, + cd_jobs_stale: bool, + public_registry_ready: bool, + internal_registry_ready: bool, +) -> list[str]: + signal_ids: list[str] = [] + if node_high_load: + signal_ids.append("ssh_publickey_node_high_load_on_110") + if publickey_offer_timeout: + signal_ids.append("ssh_publickey_offer_timeout_on_wooo") + if server_accepts_key_then_timeout: + signal_ids.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo") + if runner_timeout: + signal_ids.append("runner_systemctl_show_timeout_on_110") + if queue_no_matching_runner: + signal_ids.append("gitea_queue_harbor_110_repair_no_matching_runner") + if queue_jobs_cross_workflow: + signal_ids.append("gitea_queue_harbor_110_repair_jobs_cross_workflow_mismatch") + if queue_jobs_stale: + signal_ids.append("gitea_queue_harbor_110_repair_jobs_stale_or_mismatched") + if cd_jobs_head_sha_mismatch: + signal_ids.append("gitea_queue_cd_jobs_head_sha_mismatch") + if cd_jobs_stale: + signal_ids.append("gitea_queue_cd_jobs_stale_or_mismatched") + if not public_registry_ready: + signal_ids.append("public_registry_v2_verifier_not_green") + if not internal_registry_ready: + signal_ids.append("internal_registry_v2_verifier_not_green") + return signal_ids + + +def _control_path_readiness_status( + *, + node_high_load: bool, + runner_timeout: bool, + ssh_timeout: bool, + awoooi_host_unavailable: bool, + registry_v2_ready: bool, + signal_ids: list[str], +) -> str: + if not signal_ids: + return "ready" + if node_high_load and awoooi_host_unavailable: + return "blocked_110_high_load_and_awoooi_host_control_path_unavailable" + if awoooi_host_unavailable: + return "blocked_awoooi_host_runner_queue_unavailable" + if node_high_load: + return "blocked_110_node_high_load" + if runner_timeout or ssh_timeout: + return "blocked_110_ssh_or_runner_control_path_timeout" + if not registry_v2_ready: + return "blocked_registry_v2_verifier_not_green" + return "blocked_control_path_evidence_not_clear" + + +def _control_path_safe_next_action(*, status: str) -> str: + if status == "ready": + return "retry_gitea_cd_then_verify_deploy_marker_and_priority_readback" + if status == "blocked_110_high_load_and_awoooi_host_control_path_unavailable": + return "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers" + if status == "blocked_awoooi_host_runner_queue_unavailable": + return "run_awoooi_host_lane_readiness_verifier_before_controlled_lane_restore" + if status == "blocked_110_node_high_load": + return "wait_for_110_load_to_normalize_then_rerun_readonly_control_path_probe" + if status == "blocked_110_ssh_or_runner_control_path_timeout": + return "rerun_non_secret_110_ssh_and_runner_control_path_diagnosis" + if status == "blocked_registry_v2_verifier_not_green": + return "rerun_public_and_internal_registry_v2_verifier_before_cd_retry" + return "normalize_control_path_evidence_then_retry_readback" + + def _local_console_phase_readback( *, ssh_diagnosis: dict[str, Any], diff --git a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py index 7530dac66..744e0d9eb 100644 --- a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py @@ -292,6 +292,81 @@ BLOCKER_COUNT=1 assert payload["rollups"]["controlled_cd_lane_blocker_count"] == 1 +def test_harbor_recovery_receipt_surfaces_control_path_readiness_blocker() -> None: + diagnosis_output = _ssh_publickey_diagnosis_output().replace( + "rc=124 classification=server_accepts_key_then_timeout", + "rc=255 classification=publickey_offer_timeout", + ).replace( + "NODE_LOAD1_PER_CPU=0.93", + "NODE_LOAD1_PER_CPU=4.53", + ).replace( + "NODE_LOAD_CLASSIFIER=load_not_high", + "NODE_LOAD_CLASSIFIER=high_load", + ) + + payload = validate_harbor_registry_controlled_recovery_receipt( + { + "ssh_publickey_diagnosis_output": diagnosis_output, + "public_registry_v2_http_status": 502, + "internal_registry_v2_http_status": 502, + "gitea_actions_queue_readback": _gitea_queue_cd_jobs_head_sha_mismatch(), + } + ) + + readiness = payload["readback"]["control_path_readiness"] + assert readiness["status"] == ( + "blocked_110_high_load_and_awoooi_host_control_path_unavailable" + ) + assert readiness["primary_blocker"] == "ssh_publickey_node_high_load_on_110" + assert readiness["safe_next_action"] == ( + "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers" + ) + assert readiness["node_high_load"] is True + assert readiness["node_load_classifier"] == "high_load" + assert readiness["node_load1_per_cpu"] == 4.53 + assert readiness["ssh_publickey_offer_timeout"] is True + assert readiness["runner_systemctl_show_timeout"] is True + assert readiness["awoooi_host_runner_unavailable"] is True + assert readiness["harbor_110_repair_no_matching_runner"] is True + assert readiness["harbor_110_repair_no_matching_runner_label"] == "awoooi-host" + assert readiness["harbor_110_repair_jobs_stale_or_mismatched"] is True + assert readiness["cd_jobs_head_sha_mismatch"] is True + assert readiness["registry_v2_public_http_status"] == 502 + assert readiness["registry_v2_internal_http_status"] == 502 + assert readiness["registry_v2_ready"] is False + assert readiness["metadata_only"] is True + assert readiness["raw_output_returned"] is False + assert readiness["signal_ids"] == [ + "ssh_publickey_node_high_load_on_110", + "ssh_publickey_offer_timeout_on_wooo", + "runner_systemctl_show_timeout_on_110", + "gitea_queue_harbor_110_repair_no_matching_runner", + "gitea_queue_harbor_110_repair_jobs_stale_or_mismatched", + "gitea_queue_cd_jobs_head_sha_mismatch", + "gitea_queue_cd_jobs_stale_or_mismatched", + "public_registry_v2_verifier_not_green", + "internal_registry_v2_verifier_not_green", + ] + assert payload["rollups"]["control_path_readiness_status"] == ( + "blocked_110_high_load_and_awoooi_host_control_path_unavailable" + ) + assert payload["rollups"]["control_path_readiness_blocker_count"] == 9 + assert payload["rollups"]["control_path_readiness_node_high_load"] is True + assert ( + payload["rollups"]["control_path_readiness_runner_systemctl_timeout"] + is True + ) + assert payload["rollups"]["control_path_readiness_awoooi_host_unavailable"] is True + assert ( + payload["rollups"]["control_path_readiness_registry_v2_public_ready"] + is False + ) + assert ( + payload["rollups"]["control_path_readiness_registry_v2_internal_ready"] + is False + ) + + def test_harbor_recovery_receipt_surfaces_gitea_queue_blockers() -> None: payload = validate_harbor_registry_controlled_recovery_receipt( {