feat(recovery): expose 110 control path readiness
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 08:06:01 +08:00
parent ecce119bbb
commit d7f62b6237
2 changed files with 267 additions and 0 deletions

View File

@@ -75,6 +75,11 @@ def validate_harbor_registry_controlled_recovery_receipt(
controlled_cd_lane=controlled_cd_lane,
verifier=verifier,
)
control_path_readiness = _control_path_readiness(
ssh_diagnosis=ssh_diagnosis,
verifier=verifier,
gitea_queue=gitea_queue,
)
return {
"schema_version": _SCHEMA_VERSION,
@@ -117,6 +122,7 @@ def validate_harbor_registry_controlled_recovery_receipt(
"post_apply_verifier": verifier,
"gitea_actions_queue": gitea_queue,
"deploy_marker": deploy_marker,
"control_path_readiness": control_path_readiness,
},
"local_console_phase_readback": local_console_phase_readback,
"controlled_apply_policy": {
@@ -253,6 +259,28 @@ def validate_harbor_registry_controlled_recovery_receipt(
"deploy_marker_production_image_matches_expected": deploy_marker[
"production_image_matches_expected"
],
"control_path_readiness_status": control_path_readiness["status"],
"control_path_readiness_primary_blocker": control_path_readiness[
"primary_blocker"
],
"control_path_readiness_blocker_count": control_path_readiness[
"blocker_count"
],
"control_path_readiness_node_high_load": control_path_readiness[
"node_high_load"
],
"control_path_readiness_runner_systemctl_timeout": (
control_path_readiness["runner_systemctl_show_timeout"]
),
"control_path_readiness_awoooi_host_unavailable": (
control_path_readiness["awoooi_host_runner_unavailable"]
),
"control_path_readiness_registry_v2_public_ready": (
control_path_readiness["registry_v2_public_ready"]
),
"control_path_readiness_registry_v2_internal_ready": (
control_path_readiness["registry_v2_internal_ready"]
),
"metadata_writeback_contract_ready": True,
"local_console_phase_count": local_console_phase_readback["phase_count"],
"local_console_completed_phase_count": local_console_phase_readback[
@@ -280,6 +308,170 @@ def validate_harbor_registry_controlled_recovery_receipt(
}
def _control_path_readiness(
    *,
    ssh_diagnosis: dict[str, Any],
    verifier: dict[str, Any],
    gitea_queue: dict[str, Any],
) -> dict[str, Any]:
    """Build the control-path readiness summary from three readback dicts.

    Boolean signals are read from the registry verifier, the Gitea queue
    readback and the SSH diagnosis. From them this derives the ordered list
    of signal ids, an overall status string and the matching safe next
    action, and returns all of that together with the underlying flags.

    Args:
        ssh_diagnosis: SSH diagnosis readback; the ``*_seen`` flags, node
            load fields, ``ssh_port_tcp_open`` and ``receipt_seen`` are read.
        verifier: Registry verifier readback; the public/internal ``/v2``
            ready flags and HTTP statuses are read.
        gitea_queue: Gitea queue readback; the harbor-110 repair and CD job
            flags, the runner label and ``receipt_seen`` are read.

    Returns:
        A dict with ``status``, ``primary_blocker`` (first signal id, or
        ``""`` when there are none), ``safe_next_action``, ``signal_ids``,
        ``blocker_count`` and the individual flag/readback values.

    Raises:
        KeyError: If any of the keys read from the inputs is missing.
    """
    # Registry verifier flags.
    registry_public_ok = bool(verifier["public_registry_v2_ready"])
    registry_internal_ok = bool(verifier["internal_registry_v2_ready"])

    # Gitea queue flags.
    no_matching_runner = bool(gitea_queue["harbor_110_repair_no_matching_runner"])
    repair_jobs_stale = bool(
        gitea_queue["harbor_110_repair_jobs_stale_or_mismatched"]
    )
    repair_jobs_cross_workflow = bool(
        gitea_queue["harbor_110_repair_jobs_cross_workflow_mismatch"]
    )
    cd_stale = bool(gitea_queue["cd_run_jobs_stale_or_mismatched"])
    cd_head_sha_mismatch = bool(gitea_queue["cd_run_jobs_head_sha_mismatch"])

    # SSH diagnosis flags.
    systemctl_timeout = bool(ssh_diagnosis["runner_systemctl_show_timeout_seen"])
    high_load = bool(ssh_diagnosis["node_high_load_seen"])
    offer_timeout = bool(ssh_diagnosis["publickey_offer_timeout_seen"])
    accepts_key_then_timeout = bool(
        ssh_diagnosis["server_accepts_key_then_timeout_seen"]
    )

    # Derived flags: either queue symptom marks the host runner unavailable;
    # the registry counts as ready only when both endpoints are ready.
    host_unavailable = no_matching_runner or repair_jobs_stale
    registry_ok = registry_public_ok and registry_internal_ok

    blockers = _control_path_signal_ids(
        node_high_load=high_load,
        publickey_offer_timeout=offer_timeout,
        server_accepts_key_then_timeout=accepts_key_then_timeout,
        runner_timeout=systemctl_timeout,
        queue_no_matching_runner=no_matching_runner,
        queue_jobs_stale=repair_jobs_stale,
        queue_jobs_cross_workflow=repair_jobs_cross_workflow,
        cd_jobs_head_sha_mismatch=cd_head_sha_mismatch,
        cd_jobs_stale=cd_stale,
        public_registry_ready=registry_public_ok,
        internal_registry_ready=registry_internal_ok,
    )
    readiness_status = _control_path_readiness_status(
        node_high_load=high_load,
        runner_timeout=systemctl_timeout,
        ssh_timeout=offer_timeout or accepts_key_then_timeout,
        awoooi_host_unavailable=host_unavailable,
        registry_v2_ready=registry_ok,
        signal_ids=blockers,
    )

    summary: dict[str, Any] = {
        "status": readiness_status,
        # The first emitted signal id doubles as the primary blocker.
        "primary_blocker": next(iter(blockers), ""),
        "safe_next_action": _control_path_safe_next_action(status=readiness_status),
        "signal_ids": blockers,
        "blocker_count": len(blockers),
        "ssh_diagnosis_receipt_seen": ssh_diagnosis["receipt_seen"],
        "gitea_queue_readback_seen": gitea_queue["receipt_seen"],
        "node_high_load": high_load,
        "node_load_classifier": ssh_diagnosis["node_load_classifier"],
        "node_load1_per_cpu": ssh_diagnosis["node_load1_per_cpu"],
        "ssh_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"],
        "ssh_publickey_offer_timeout": offer_timeout,
        "ssh_server_accepts_key_then_timeout": accepts_key_then_timeout,
        "runner_systemctl_show_timeout": systemctl_timeout,
        "awoooi_host_runner_unavailable": host_unavailable,
        "harbor_110_repair_no_matching_runner": no_matching_runner,
        "harbor_110_repair_no_matching_runner_label": gitea_queue[
            "harbor_110_repair_no_matching_runner_label"
        ],
        "harbor_110_repair_jobs_stale_or_mismatched": repair_jobs_stale,
        "harbor_110_repair_jobs_cross_workflow_mismatch": repair_jobs_cross_workflow,
        "cd_jobs_stale_or_mismatched": cd_stale,
        "cd_jobs_head_sha_mismatch": cd_head_sha_mismatch,
        "registry_v2_public_http_status": verifier["public_registry_v2_http_status"],
        "registry_v2_internal_http_status": verifier[
            "internal_registry_v2_http_status"
        ],
        "registry_v2_public_ready": registry_public_ok,
        "registry_v2_internal_ready": registry_internal_ok,
        "registry_v2_ready": registry_ok,
        "metadata_only": True,
        "raw_output_returned": False,
    }
    return summary
def _control_path_signal_ids(
*,
node_high_load: bool,
publickey_offer_timeout: bool,
server_accepts_key_then_timeout: bool,
runner_timeout: bool,
queue_no_matching_runner: bool,
queue_jobs_stale: bool,
queue_jobs_cross_workflow: bool,
cd_jobs_head_sha_mismatch: bool,
cd_jobs_stale: bool,
public_registry_ready: bool,
internal_registry_ready: bool,
) -> list[str]:
signal_ids: list[str] = []
if node_high_load:
signal_ids.append("ssh_publickey_node_high_load_on_110")
if publickey_offer_timeout:
signal_ids.append("ssh_publickey_offer_timeout_on_wooo")
if server_accepts_key_then_timeout:
signal_ids.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo")
if runner_timeout:
signal_ids.append("runner_systemctl_show_timeout_on_110")
if queue_no_matching_runner:
signal_ids.append("gitea_queue_harbor_110_repair_no_matching_runner")
if queue_jobs_cross_workflow:
signal_ids.append("gitea_queue_harbor_110_repair_jobs_cross_workflow_mismatch")
if queue_jobs_stale:
signal_ids.append("gitea_queue_harbor_110_repair_jobs_stale_or_mismatched")
if cd_jobs_head_sha_mismatch:
signal_ids.append("gitea_queue_cd_jobs_head_sha_mismatch")
if cd_jobs_stale:
signal_ids.append("gitea_queue_cd_jobs_stale_or_mismatched")
if not public_registry_ready:
signal_ids.append("public_registry_v2_verifier_not_green")
if not internal_registry_ready:
signal_ids.append("internal_registry_v2_verifier_not_green")
return signal_ids
def _control_path_readiness_status(
*,
node_high_load: bool,
runner_timeout: bool,
ssh_timeout: bool,
awoooi_host_unavailable: bool,
registry_v2_ready: bool,
signal_ids: list[str],
) -> str:
if not signal_ids:
return "ready"
if node_high_load and awoooi_host_unavailable:
return "blocked_110_high_load_and_awoooi_host_control_path_unavailable"
if awoooi_host_unavailable:
return "blocked_awoooi_host_runner_queue_unavailable"
if node_high_load:
return "blocked_110_node_high_load"
if runner_timeout or ssh_timeout:
return "blocked_110_ssh_or_runner_control_path_timeout"
if not registry_v2_ready:
return "blocked_registry_v2_verifier_not_green"
return "blocked_control_path_evidence_not_clear"
def _control_path_safe_next_action(*, status: str) -> str:
if status == "ready":
return "retry_gitea_cd_then_verify_deploy_marker_and_priority_readback"
if status == "blocked_110_high_load_and_awoooi_host_control_path_unavailable":
return "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers"
if status == "blocked_awoooi_host_runner_queue_unavailable":
return "run_awoooi_host_lane_readiness_verifier_before_controlled_lane_restore"
if status == "blocked_110_node_high_load":
return "wait_for_110_load_to_normalize_then_rerun_readonly_control_path_probe"
if status == "blocked_110_ssh_or_runner_control_path_timeout":
return "rerun_non_secret_110_ssh_and_runner_control_path_diagnosis"
if status == "blocked_registry_v2_verifier_not_green":
return "rerun_public_and_internal_registry_v2_verifier_before_cd_retry"
return "normalize_control_path_evidence_then_retry_readback"
def _local_console_phase_readback(
*,
ssh_diagnosis: dict[str, Any],

View File

@@ -292,6 +292,81 @@ BLOCKER_COUNT=1
assert payload["rollups"]["controlled_cd_lane_blocker_count"] == 1
def test_harbor_recovery_receipt_surfaces_control_path_readiness_blocker() -> None:
    """Readiness readback and rollups report the combined high-load blocker.

    The SSH diagnosis fixture is rewritten to a publickey offer timeout with
    a high node load, both registry statuses are 502, and the Gitea queue
    fixture is the CD head-SHA mismatch readback.
    """
    expected_status = "blocked_110_high_load_and_awoooi_host_control_path_unavailable"

    # Rewrite the baseline diagnosis output into the high-load variant.
    diagnosis_output = _ssh_publickey_diagnosis_output()
    for original_text, replacement_text in (
        (
            "rc=124 classification=server_accepts_key_then_timeout",
            "rc=255 classification=publickey_offer_timeout",
        ),
        ("NODE_LOAD1_PER_CPU=0.93", "NODE_LOAD1_PER_CPU=4.53"),
        ("NODE_LOAD_CLASSIFIER=load_not_high", "NODE_LOAD_CLASSIFIER=high_load"),
    ):
        diagnosis_output = diagnosis_output.replace(original_text, replacement_text)

    receipt_input = {
        "ssh_publickey_diagnosis_output": diagnosis_output,
        "public_registry_v2_http_status": 502,
        "internal_registry_v2_http_status": 502,
        "gitea_actions_queue_readback": _gitea_queue_cd_jobs_head_sha_mismatch(),
    }
    payload = validate_harbor_registry_controlled_recovery_receipt(receipt_input)

    readiness = payload["readback"]["control_path_readiness"]
    rollups = payload["rollups"]

    # Detailed readback.
    assert readiness["status"] == expected_status
    assert readiness["primary_blocker"] == "ssh_publickey_node_high_load_on_110"
    assert readiness["safe_next_action"] == (
        "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers"
    )
    assert readiness["node_high_load"] is True
    assert readiness["node_load_classifier"] == "high_load"
    assert readiness["node_load1_per_cpu"] == 4.53
    assert readiness["ssh_publickey_offer_timeout"] is True
    assert readiness["runner_systemctl_show_timeout"] is True
    assert readiness["awoooi_host_runner_unavailable"] is True
    assert readiness["harbor_110_repair_no_matching_runner"] is True
    assert readiness["harbor_110_repair_no_matching_runner_label"] == "awoooi-host"
    assert readiness["harbor_110_repair_jobs_stale_or_mismatched"] is True
    assert readiness["cd_jobs_head_sha_mismatch"] is True
    assert readiness["registry_v2_public_http_status"] == 502
    assert readiness["registry_v2_internal_http_status"] == 502
    assert readiness["registry_v2_ready"] is False
    assert readiness["metadata_only"] is True
    assert readiness["raw_output_returned"] is False
    assert readiness["signal_ids"] == [
        "ssh_publickey_node_high_load_on_110",
        "ssh_publickey_offer_timeout_on_wooo",
        "runner_systemctl_show_timeout_on_110",
        "gitea_queue_harbor_110_repair_no_matching_runner",
        "gitea_queue_harbor_110_repair_jobs_stale_or_mismatched",
        "gitea_queue_cd_jobs_head_sha_mismatch",
        "gitea_queue_cd_jobs_stale_or_mismatched",
        "public_registry_v2_verifier_not_green",
        "internal_registry_v2_verifier_not_green",
    ]

    # Flattened rollups mirror the readback.
    assert rollups["control_path_readiness_status"] == expected_status
    assert rollups["control_path_readiness_blocker_count"] == 9
    assert rollups["control_path_readiness_node_high_load"] is True
    assert rollups["control_path_readiness_runner_systemctl_timeout"] is True
    assert rollups["control_path_readiness_awoooi_host_unavailable"] is True
    assert rollups["control_path_readiness_registry_v2_public_ready"] is False
    assert rollups["control_path_readiness_registry_v2_internal_ready"] is False
def test_harbor_recovery_receipt_surfaces_gitea_queue_blockers() -> None:
payload = validate_harbor_registry_controlled_recovery_receipt(
{