feat(recovery): expose 110 control path readiness
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -75,6 +75,11 @@ def validate_harbor_registry_controlled_recovery_receipt(
|
||||
controlled_cd_lane=controlled_cd_lane,
|
||||
verifier=verifier,
|
||||
)
|
||||
control_path_readiness = _control_path_readiness(
|
||||
ssh_diagnosis=ssh_diagnosis,
|
||||
verifier=verifier,
|
||||
gitea_queue=gitea_queue,
|
||||
)
|
||||
|
||||
return {
|
||||
"schema_version": _SCHEMA_VERSION,
|
||||
@@ -117,6 +122,7 @@ def validate_harbor_registry_controlled_recovery_receipt(
|
||||
"post_apply_verifier": verifier,
|
||||
"gitea_actions_queue": gitea_queue,
|
||||
"deploy_marker": deploy_marker,
|
||||
"control_path_readiness": control_path_readiness,
|
||||
},
|
||||
"local_console_phase_readback": local_console_phase_readback,
|
||||
"controlled_apply_policy": {
|
||||
@@ -253,6 +259,28 @@ def validate_harbor_registry_controlled_recovery_receipt(
|
||||
"deploy_marker_production_image_matches_expected": deploy_marker[
|
||||
"production_image_matches_expected"
|
||||
],
|
||||
"control_path_readiness_status": control_path_readiness["status"],
|
||||
"control_path_readiness_primary_blocker": control_path_readiness[
|
||||
"primary_blocker"
|
||||
],
|
||||
"control_path_readiness_blocker_count": control_path_readiness[
|
||||
"blocker_count"
|
||||
],
|
||||
"control_path_readiness_node_high_load": control_path_readiness[
|
||||
"node_high_load"
|
||||
],
|
||||
"control_path_readiness_runner_systemctl_timeout": (
|
||||
control_path_readiness["runner_systemctl_show_timeout"]
|
||||
),
|
||||
"control_path_readiness_awoooi_host_unavailable": (
|
||||
control_path_readiness["awoooi_host_runner_unavailable"]
|
||||
),
|
||||
"control_path_readiness_registry_v2_public_ready": (
|
||||
control_path_readiness["registry_v2_public_ready"]
|
||||
),
|
||||
"control_path_readiness_registry_v2_internal_ready": (
|
||||
control_path_readiness["registry_v2_internal_ready"]
|
||||
),
|
||||
"metadata_writeback_contract_ready": True,
|
||||
"local_console_phase_count": local_console_phase_readback["phase_count"],
|
||||
"local_console_completed_phase_count": local_console_phase_readback[
|
||||
@@ -280,6 +308,170 @@ def validate_harbor_registry_controlled_recovery_receipt(
|
||||
}
|
||||
|
||||
|
||||
def _control_path_readiness(
    *,
    ssh_diagnosis: dict[str, Any],
    verifier: dict[str, Any],
    gitea_queue: dict[str, Any],
) -> dict[str, Any]:
    """Build the control-path readiness section of the recovery receipt.

    Coerces the individual signals read from the three inputs to booleans,
    derives the ordered blocker list, an overall status and a suggested next
    action from them, and returns everything in one flat mapping.

    Args:
        ssh_diagnosis: SSH diagnosis mapping. Keys read:
            ``runner_systemctl_show_timeout_seen``, ``node_high_load_seen``,
            ``publickey_offer_timeout_seen``,
            ``server_accepts_key_then_timeout_seen``, ``receipt_seen``,
            ``node_load_classifier``, ``node_load1_per_cpu`` and
            ``ssh_port_tcp_open``.
        verifier: Registry verifier mapping. Keys read:
            ``public_registry_v2_ready``, ``internal_registry_v2_ready``,
            ``public_registry_v2_http_status`` and
            ``internal_registry_v2_http_status``.
        gitea_queue: Gitea queue mapping. Keys read:
            ``harbor_110_repair_no_matching_runner``,
            ``harbor_110_repair_no_matching_runner_label``,
            ``harbor_110_repair_jobs_stale_or_mismatched``,
            ``harbor_110_repair_jobs_cross_workflow_mismatch``,
            ``cd_run_jobs_stale_or_mismatched``,
            ``cd_run_jobs_head_sha_mismatch`` and ``receipt_seen``.

    Returns:
        A mapping holding ``status``, ``primary_blocker`` (the first signal
        id, or ``""`` when there is none), ``safe_next_action``,
        ``signal_ids``, ``blocker_count`` and the individual signals.
        ``metadata_only`` is always ``True`` and ``raw_output_returned`` is
        always ``False``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    public_registry_ready = bool(verifier["public_registry_v2_ready"])
    internal_registry_ready = bool(verifier["internal_registry_v2_ready"])
    queue_no_matching_runner = bool(
        gitea_queue["harbor_110_repair_no_matching_runner"]
    )
    queue_jobs_stale = bool(
        gitea_queue["harbor_110_repair_jobs_stale_or_mismatched"]
    )
    queue_jobs_cross_workflow = bool(
        gitea_queue["harbor_110_repair_jobs_cross_workflow_mismatch"]
    )
    cd_jobs_stale = bool(gitea_queue["cd_run_jobs_stale_or_mismatched"])
    cd_jobs_head_sha_mismatch = bool(
        gitea_queue["cd_run_jobs_head_sha_mismatch"]
    )
    runner_timeout = bool(ssh_diagnosis["runner_systemctl_show_timeout_seen"])
    node_high_load = bool(ssh_diagnosis["node_high_load_seen"])
    publickey_offer_timeout = bool(
        ssh_diagnosis["publickey_offer_timeout_seen"]
    )
    server_accepts_key_then_timeout = bool(
        ssh_diagnosis["server_accepts_key_then_timeout_seen"]
    )

    # Derived flags: either repair-queue symptom marks the awoooi-host runner
    # as unavailable; the registry counts as ready only when both the public
    # and the internal endpoint are ready.
    awoooi_host_unavailable = queue_no_matching_runner or queue_jobs_stale
    registry_v2_ready = public_registry_ready and internal_registry_ready
    ssh_timeout = publickey_offer_timeout or server_accepts_key_then_timeout

    signal_ids = _control_path_signal_ids(
        node_high_load=node_high_load,
        publickey_offer_timeout=publickey_offer_timeout,
        server_accepts_key_then_timeout=server_accepts_key_then_timeout,
        runner_timeout=runner_timeout,
        queue_no_matching_runner=queue_no_matching_runner,
        queue_jobs_stale=queue_jobs_stale,
        queue_jobs_cross_workflow=queue_jobs_cross_workflow,
        cd_jobs_head_sha_mismatch=cd_jobs_head_sha_mismatch,
        cd_jobs_stale=cd_jobs_stale,
        public_registry_ready=public_registry_ready,
        internal_registry_ready=internal_registry_ready,
    )
    status = _control_path_readiness_status(
        node_high_load=node_high_load,
        runner_timeout=runner_timeout,
        ssh_timeout=ssh_timeout,
        awoooi_host_unavailable=awoooi_host_unavailable,
        registry_v2_ready=registry_v2_ready,
        signal_ids=signal_ids,
    )
    return {
        "status": status,
        "primary_blocker": signal_ids[0] if signal_ids else "",
        "safe_next_action": _control_path_safe_next_action(status=status),
        "signal_ids": signal_ids,
        "blocker_count": len(signal_ids),
        "ssh_diagnosis_receipt_seen": ssh_diagnosis["receipt_seen"],
        "gitea_queue_readback_seen": gitea_queue["receipt_seen"],
        "node_high_load": node_high_load,
        "node_load_classifier": ssh_diagnosis["node_load_classifier"],
        "node_load1_per_cpu": ssh_diagnosis["node_load1_per_cpu"],
        "ssh_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"],
        "ssh_publickey_offer_timeout": publickey_offer_timeout,
        "ssh_server_accepts_key_then_timeout": server_accepts_key_then_timeout,
        "runner_systemctl_show_timeout": runner_timeout,
        "awoooi_host_runner_unavailable": awoooi_host_unavailable,
        "harbor_110_repair_no_matching_runner": queue_no_matching_runner,
        "harbor_110_repair_no_matching_runner_label": gitea_queue[
            "harbor_110_repair_no_matching_runner_label"
        ],
        "harbor_110_repair_jobs_stale_or_mismatched": queue_jobs_stale,
        "harbor_110_repair_jobs_cross_workflow_mismatch": queue_jobs_cross_workflow,
        "cd_jobs_stale_or_mismatched": cd_jobs_stale,
        "cd_jobs_head_sha_mismatch": cd_jobs_head_sha_mismatch,
        "registry_v2_public_http_status": verifier[
            "public_registry_v2_http_status"
        ],
        "registry_v2_internal_http_status": verifier[
            "internal_registry_v2_http_status"
        ],
        "registry_v2_public_ready": public_registry_ready,
        "registry_v2_internal_ready": internal_registry_ready,
        "registry_v2_ready": registry_v2_ready,
        "metadata_only": True,
        "raw_output_returned": False,
    }
|
||||
|
||||
|
||||
def _control_path_signal_ids(
|
||||
*,
|
||||
node_high_load: bool,
|
||||
publickey_offer_timeout: bool,
|
||||
server_accepts_key_then_timeout: bool,
|
||||
runner_timeout: bool,
|
||||
queue_no_matching_runner: bool,
|
||||
queue_jobs_stale: bool,
|
||||
queue_jobs_cross_workflow: bool,
|
||||
cd_jobs_head_sha_mismatch: bool,
|
||||
cd_jobs_stale: bool,
|
||||
public_registry_ready: bool,
|
||||
internal_registry_ready: bool,
|
||||
) -> list[str]:
|
||||
signal_ids: list[str] = []
|
||||
if node_high_load:
|
||||
signal_ids.append("ssh_publickey_node_high_load_on_110")
|
||||
if publickey_offer_timeout:
|
||||
signal_ids.append("ssh_publickey_offer_timeout_on_wooo")
|
||||
if server_accepts_key_then_timeout:
|
||||
signal_ids.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo")
|
||||
if runner_timeout:
|
||||
signal_ids.append("runner_systemctl_show_timeout_on_110")
|
||||
if queue_no_matching_runner:
|
||||
signal_ids.append("gitea_queue_harbor_110_repair_no_matching_runner")
|
||||
if queue_jobs_cross_workflow:
|
||||
signal_ids.append("gitea_queue_harbor_110_repair_jobs_cross_workflow_mismatch")
|
||||
if queue_jobs_stale:
|
||||
signal_ids.append("gitea_queue_harbor_110_repair_jobs_stale_or_mismatched")
|
||||
if cd_jobs_head_sha_mismatch:
|
||||
signal_ids.append("gitea_queue_cd_jobs_head_sha_mismatch")
|
||||
if cd_jobs_stale:
|
||||
signal_ids.append("gitea_queue_cd_jobs_stale_or_mismatched")
|
||||
if not public_registry_ready:
|
||||
signal_ids.append("public_registry_v2_verifier_not_green")
|
||||
if not internal_registry_ready:
|
||||
signal_ids.append("internal_registry_v2_verifier_not_green")
|
||||
return signal_ids
|
||||
|
||||
|
||||
def _control_path_readiness_status(
|
||||
*,
|
||||
node_high_load: bool,
|
||||
runner_timeout: bool,
|
||||
ssh_timeout: bool,
|
||||
awoooi_host_unavailable: bool,
|
||||
registry_v2_ready: bool,
|
||||
signal_ids: list[str],
|
||||
) -> str:
|
||||
if not signal_ids:
|
||||
return "ready"
|
||||
if node_high_load and awoooi_host_unavailable:
|
||||
return "blocked_110_high_load_and_awoooi_host_control_path_unavailable"
|
||||
if awoooi_host_unavailable:
|
||||
return "blocked_awoooi_host_runner_queue_unavailable"
|
||||
if node_high_load:
|
||||
return "blocked_110_node_high_load"
|
||||
if runner_timeout or ssh_timeout:
|
||||
return "blocked_110_ssh_or_runner_control_path_timeout"
|
||||
if not registry_v2_ready:
|
||||
return "blocked_registry_v2_verifier_not_green"
|
||||
return "blocked_control_path_evidence_not_clear"
|
||||
|
||||
|
||||
def _control_path_safe_next_action(*, status: str) -> str:
|
||||
if status == "ready":
|
||||
return "retry_gitea_cd_then_verify_deploy_marker_and_priority_readback"
|
||||
if status == "blocked_110_high_load_and_awoooi_host_control_path_unavailable":
|
||||
return "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers"
|
||||
if status == "blocked_awoooi_host_runner_queue_unavailable":
|
||||
return "run_awoooi_host_lane_readiness_verifier_before_controlled_lane_restore"
|
||||
if status == "blocked_110_node_high_load":
|
||||
return "wait_for_110_load_to_normalize_then_rerun_readonly_control_path_probe"
|
||||
if status == "blocked_110_ssh_or_runner_control_path_timeout":
|
||||
return "rerun_non_secret_110_ssh_and_runner_control_path_diagnosis"
|
||||
if status == "blocked_registry_v2_verifier_not_green":
|
||||
return "rerun_public_and_internal_registry_v2_verifier_before_cd_retry"
|
||||
return "normalize_control_path_evidence_then_retry_readback"
|
||||
|
||||
|
||||
def _local_console_phase_readback(
|
||||
*,
|
||||
ssh_diagnosis: dict[str, Any],
|
||||
|
||||
@@ -292,6 +292,81 @@ BLOCKER_COUNT=1
|
||||
assert payload["rollups"]["controlled_cd_lane_blocker_count"] == 1
|
||||
|
||||
|
||||
def test_harbor_recovery_receipt_surfaces_control_path_readiness_blocker() -> None:
    """Check the readiness readback and rollups when every blocker fires.

    The baseline SSH diagnosis output is rewritten to report a public-key
    offer timeout and high node load, both registry endpoints answer 502,
    and the queue readback carries a CD head-SHA mismatch.
    """
    # Rewrite the baseline diagnosis: swap the timeout classification and
    # raise the load figures.
    diagnosis_output = (
        _ssh_publickey_diagnosis_output()
        .replace(
            "rc=124 classification=server_accepts_key_then_timeout",
            "rc=255 classification=publickey_offer_timeout",
        )
        .replace(
            "NODE_LOAD1_PER_CPU=0.93",
            "NODE_LOAD1_PER_CPU=4.53",
        )
        .replace(
            "NODE_LOAD_CLASSIFIER=load_not_high",
            "NODE_LOAD_CLASSIFIER=high_load",
        )
    )

    payload = validate_harbor_registry_controlled_recovery_receipt(
        {
            "ssh_publickey_diagnosis_output": diagnosis_output,
            "public_registry_v2_http_status": 502,
            "internal_registry_v2_http_status": 502,
            "gitea_actions_queue_readback": _gitea_queue_cd_jobs_head_sha_mismatch(),
        }
    )

    readiness = payload["readback"]["control_path_readiness"]
    assert readiness["status"] == (
        "blocked_110_high_load_and_awoooi_host_control_path_unavailable"
    )
    assert readiness["primary_blocker"] == "ssh_publickey_node_high_load_on_110"
    assert readiness["safe_next_action"] == (
        "hold_110_capacity_protection_then_rerun_readonly_awoooi_host_and_registry_verifiers"
    )
    assert readiness["node_high_load"] is True
    assert readiness["node_load_classifier"] == "high_load"
    assert readiness["node_load1_per_cpu"] == 4.53
    assert readiness["ssh_publickey_offer_timeout"] is True
    assert readiness["runner_systemctl_show_timeout"] is True
    assert readiness["awoooi_host_runner_unavailable"] is True
    assert readiness["harbor_110_repair_no_matching_runner"] is True
    assert readiness["harbor_110_repair_no_matching_runner_label"] == "awoooi-host"
    assert readiness["harbor_110_repair_jobs_stale_or_mismatched"] is True
    assert readiness["cd_jobs_head_sha_mismatch"] is True
    assert readiness["registry_v2_public_http_status"] == 502
    assert readiness["registry_v2_internal_http_status"] == 502
    assert readiness["registry_v2_ready"] is False
    assert readiness["metadata_only"] is True
    assert readiness["raw_output_returned"] is False
    # The exact order matters: the first id doubles as the primary blocker.
    assert readiness["signal_ids"] == [
        "ssh_publickey_node_high_load_on_110",
        "ssh_publickey_offer_timeout_on_wooo",
        "runner_systemctl_show_timeout_on_110",
        "gitea_queue_harbor_110_repair_no_matching_runner",
        "gitea_queue_harbor_110_repair_jobs_stale_or_mismatched",
        "gitea_queue_cd_jobs_head_sha_mismatch",
        "gitea_queue_cd_jobs_stale_or_mismatched",
        "public_registry_v2_verifier_not_green",
        "internal_registry_v2_verifier_not_green",
    ]

    # The rollups expose the same values under prefixed keys.
    assert payload["rollups"]["control_path_readiness_status"] == (
        "blocked_110_high_load_and_awoooi_host_control_path_unavailable"
    )
    assert payload["rollups"]["control_path_readiness_blocker_count"] == 9
    assert payload["rollups"]["control_path_readiness_node_high_load"] is True
    assert (
        payload["rollups"]["control_path_readiness_runner_systemctl_timeout"]
        is True
    )
    assert payload["rollups"]["control_path_readiness_awoooi_host_unavailable"] is True
    assert (
        payload["rollups"]["control_path_readiness_registry_v2_public_ready"]
        is False
    )
    assert (
        payload["rollups"]["control_path_readiness_registry_v2_internal_ready"]
        is False
    )
|
||||
|
||||
|
||||
def test_harbor_recovery_receipt_surfaces_gitea_queue_blockers() -> None:
|
||||
payload = validate_harbor_registry_controlled_recovery_receipt(
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user