fix(agent): publish queue safe next action
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 18:25:30 +08:00
parent 3c187aa8a5
commit d873a3258c
9 changed files with 628 additions and 3 deletions

View File

@@ -512,6 +512,63 @@ def build_readback(
or harbor_110_repair_visible_running_jobs_api_stale
or bool(harbor_110_repair_log_classifier["failure_classifier"])
)
safe_next_action = _queue_safe_next_action(
latest_cd_waiting=latest_cd_waiting,
latest_cd_status=latest_cd_status,
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
effective_host_pressure_classifier=effective_tests_log_classifier[
"host_pressure_classifier"
],
effective_host_pressure_blocked_or_waiting=effective_tests_log_classifier[
"host_pressure_blocked_or_waiting"
],
build_harbor_public_route_blocked=build_log_classifier[
"harbor_public_route_blocked"
],
build_harbor_public_route_retrying_unavailable=build_log_classifier[
"harbor_public_route_retrying_unavailable"
],
harbor_110_repair_no_matching_runner_label=(
harbor_110_repair_no_matching_runner_label
),
harbor_110_repair_waiting=harbor_110_repair_waiting,
harbor_110_repair_running=harbor_110_repair_running,
harbor_110_repair_failed=harbor_110_repair_failed,
harbor_110_repair_waiting_after_cd_harbor_blocker=(
harbor_110_repair_waiting_after_cd_harbor_blocker
),
harbor_110_repair_jobs_stale_or_mismatched=(
harbor_110_repair_jobs_stale_or_mismatched
),
harbor_110_repair_jobs_payload_classifier=(
harbor_110_repair_jobs_payload_classifier
),
harbor_110_repair_visible_running_jobs_api_stale=(
harbor_110_repair_visible_running_jobs_api_stale
),
harbor_110_repair_visible_failure_jobs_api_stale=(
harbor_110_repair_visible_failure_jobs_api_stale
),
current_cd_waiting_behind_harbor_110_repair_running=(
current_cd_waiting_behind_harbor_110_repair_running
),
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
"remote_control_channel_unavailable"
],
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
"remote_ssh_publickey_auth_stalled"
],
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
"remote_ssh_publickey_offer_timeout"
],
remote_ssh_server_accepts_key_then_session_timeout=(
harbor_110_repair_log_classifier[
"remote_ssh_server_accepts_key_then_session_timeout"
]
),
)
readback = {
"actions_page_visible_run_count": len(visible_runs),
@@ -728,6 +785,20 @@ def build_readback(
"current_cd_waiting_behind_harbor_110_repair_running": (
current_cd_waiting_behind_harbor_110_repair_running
),
"safe_next_action_id": safe_next_action["action_id"],
"safe_next_action_stage": safe_next_action["stage"],
"safe_next_action": safe_next_action["action"],
"safe_next_action_reason": safe_next_action["reason"],
"safe_next_action_command": safe_next_action["command"],
"safe_next_action_post_verifier": safe_next_action["post_verifier"],
"safe_next_action_requires_local_console": safe_next_action[
"requires_local_console"
],
"safe_next_action_metadata_only": safe_next_action["metadata_only"],
"safe_next_action_blocker_fields": safe_next_action["blocker_fields"],
"safe_next_action_forbidden_actions": safe_next_action[
"forbidden_actions"
],
"latest_visible_cd_host_pressure_classifier": effective_tests_log_classifier[
"host_pressure_classifier"
],
@@ -1014,6 +1085,12 @@ def build_readback(
"harbor_110_repair_visible_running_jobs_api_stale": (
harbor_110_repair_visible_running_jobs_api_stale
),
"safe_next_action_id": safe_next_action["action_id"],
"safe_next_action_stage": safe_next_action["stage"],
"safe_next_action_requires_local_console": safe_next_action[
"requires_local_console"
],
"safe_next_action_metadata_only": safe_next_action["metadata_only"],
},
"operation_boundaries": {
"public_gitea_read_only": True,
@@ -1029,6 +1106,283 @@ def build_readback(
}
def _queue_safe_next_action(
    *,
    latest_cd_waiting: bool,
    latest_cd_status: str,
    latest_cd_no_matching_runner_label: str,
    cd_jobs_stale_or_mismatched: bool,
    cd_jobs_payload_classifier: str,
    effective_host_pressure_classifier: str,
    effective_host_pressure_blocked_or_waiting: bool,
    build_harbor_public_route_blocked: bool,
    build_harbor_public_route_retrying_unavailable: bool,
    harbor_110_repair_no_matching_runner_label: str,
    harbor_110_repair_waiting: bool,
    harbor_110_repair_running: bool,
    harbor_110_repair_failed: bool,
    harbor_110_repair_waiting_after_cd_harbor_blocker: bool,
    harbor_110_repair_jobs_stale_or_mismatched: bool,
    harbor_110_repair_jobs_payload_classifier: str,
    harbor_110_repair_visible_running_jobs_api_stale: bool,
    harbor_110_repair_visible_failure_jobs_api_stale: bool,
    current_cd_waiting_behind_harbor_110_repair_running: bool,
    remote_control_channel_unavailable: bool,
    remote_ssh_publickey_auth_stalled: bool,
    remote_ssh_publickey_offer_timeout: bool,
    remote_ssh_server_accepts_key_then_session_timeout: bool,
) -> dict[str, Any]:
    """Select one safe next action from the queue readback flags.

    The checks run in a fixed priority order and the first one that matches
    wins:

    1. SSH publickey authentication stalled
    2. remote control channel unavailable
    3. current CD waiting behind a running Harbor 110 repair
    4. latest CD waiting, or no matching runner label for it
    5. Harbor 110 repair has no matching runner label
    6. Harbor 110 repair waiting
    7. Harbor 110 repair failed
    8. Harbor public route blocked or retrying-unavailable
    9. host pressure blocked or waiting
    10. stale or mismatched CD jobs payload
    11. stale or mismatched Harbor repair jobs payload
    12. fallback: continue the public queue readback

    All arguments are keyword-only and are only tested for truthiness, except
    ``latest_cd_status`` and the ``*_classifier`` strings, which are also
    embedded in (or used as) the ``reason`` text. ``harbor_110_repair_running``
    is accepted but not consulted by any branch.

    Returns:
        A dict with the keys ``action_id``, ``stage``, ``action``, ``reason``,
        ``command``, ``post_verifier``, ``requires_local_console``,
        ``metadata_only`` (always ``True``), ``blocker_fields`` and
        ``forbidden_actions`` (the same fixed list for every branch).
    """
    forbidden = [
        "read_runner_token_or_runner_file",
        "restart_docker_daemon",
        "reboot_host",
        "node_drain",
        "workflow_dispatch",
        "force_push_or_ref_delete",
        "raw_secret_volume_read",
    ]
    # Most branches both run and verify with the same queue readback command.
    queue_readback = "read-public-gitea-actions-queue.py --json"

    def action(
        *,
        action_id: str,
        stage: str,
        text: str,
        reason: str,
        command: str,
        post_verifier: str,
        blocker_fields: list[str],
        requires_local_console: bool = False,
    ) -> dict[str, Any]:
        """Assemble the action payload shared by every branch."""
        return {
            "action_id": action_id,
            "stage": stage,
            "action": text,
            "reason": reason,
            "command": command,
            "post_verifier": post_verifier,
            "requires_local_console": requires_local_console,
            "metadata_only": True,
            "blocker_fields": blocker_fields,
            "forbidden_actions": forbidden,
        }

    if remote_ssh_publickey_auth_stalled:
        blocker_fields = [
            "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
        ]
        # The two finer-grained SSH flags only add detail to this branch;
        # they do not select it on their own.
        if remote_ssh_publickey_offer_timeout:
            blocker_fields.append(
                "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout"
            )
        if remote_ssh_server_accepts_key_then_session_timeout:
            blocker_fields.append(
                "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"
            )
        return action(
            action_id=(
                "run_110_local_ssh_session_control_path_recovery_then_verify_cd_and_deploy_marker_readback"
            ),
            stage="local_console_control_path_receipt_required",
            text=(
                "Use the 110 local-console controlled recovery package to verify "
                "SSH account/session metadata, then rerun registry, queue, CD, "
                "and deploy-marker readbacks."
            ),
            reason=(
                "The public queue shows TCP/SSH reachability but publickey "
                "authentication or session setup stalls; remote SSH cannot safely "
                "repair itself."
            ),
            command="recover-110-control-path-and-harbor-local.sh --check",
            post_verifier=(
                "read-public-gitea-actions-queue.py --json && "
                "check-awoooi-110-controlled-cd-lane-readiness.sh && "
                "curl -k https://registry.wooo.work/v2/"
            ),
            blocker_fields=blocker_fields,
            requires_local_console=True,
        )

    if remote_control_channel_unavailable:
        return action(
            action_id="restore_110_remote_control_channel_readback_from_local_console",
            stage="local_console_control_channel_readback_required",
            text=(
                "Verify the 110 local control path and publish a metadata-only "
                "receipt before retrying Harbor or CD closure."
            ),
            reason="The bounded remote SSH control channel is unavailable.",
            command="recover-110-control-path-and-harbor-local.sh --check",
            post_verifier=queue_readback,
            blocker_fields=[
                "latest_visible_harbor_110_repair_remote_control_channel_unavailable"
            ],
            requires_local_console=True,
        )

    if current_cd_waiting_behind_harbor_110_repair_running:
        blocker_fields = ["current_cd_waiting_behind_harbor_110_repair_running"]
        if harbor_110_repair_visible_running_jobs_api_stale:
            blocker_fields.append("harbor_110_repair_visible_running_jobs_api_stale")
        return action(
            action_id=(
                "refresh_harbor_110_repair_log_truth_then_verify_cd_waiting_state"
            ),
            stage="queue_truth_refresh_required",
            text=(
                "Keep the visible CD wait state, ignore stale Harbor jobs API "
                "payloads, and refresh the Harbor repair log plus queue readback."
            ),
            reason=(
                "Current CD is waiting behind the visible Harbor repair lane, "
                "while the jobs API may belong to another workflow or stale run."
            ),
            command=queue_readback,
            post_verifier=queue_readback,
            blocker_fields=blocker_fields,
        )

    if latest_cd_waiting or latest_cd_no_matching_runner_label:
        blocker_fields = ["latest_visible_cd_run_waiting"]
        if latest_cd_no_matching_runner_label:
            blocker_fields.append("latest_visible_cd_no_matching_runner_label")
        return action(
            action_id="verify_non110_runner_lane_before_retrying_current_cd",
            stage="non110_runner_lane_readiness_required",
            text=(
                "Verify the non-110 controlled CD runner lane and keep legacy or "
                "generic runner labels closed."
            ),
            reason=(
                f"The latest visible CD run is {latest_cd_status or 'unknown'} "
                "or has no matching controlled runner label."
            ),
            command="check-awoooi-non110-runner-readiness.sh",
            post_verifier=queue_readback,
            blocker_fields=blocker_fields,
        )

    if harbor_110_repair_no_matching_runner_label:
        return action(
            action_id="verify_awoooi_host_controlled_repair_runner_lane",
            stage="awoooi_host_runner_lane_readiness_required",
            text=(
                "Verify the awoooi-host controlled repair lane before retrying "
                "Harbor local repair."
            ),
            reason="The Harbor 110 repair workflow has no matching controlled runner.",
            command="check-awoooi-110-controlled-cd-lane-readiness.sh",
            post_verifier=queue_readback,
            blocker_fields=[
                "latest_visible_harbor_110_repair_no_matching_runner_label"
            ],
        )

    if harbor_110_repair_waiting or harbor_110_repair_waiting_after_cd_harbor_blocker:
        blocker_fields = ["latest_visible_harbor_110_repair_waiting"]
        if harbor_110_repair_jobs_stale_or_mismatched:
            blocker_fields.append("harbor_110_repair_jobs_stale_or_mismatched")
        return action(
            action_id="wait_for_harbor_110_repair_or_refresh_queue_truth",
            stage="harbor_110_repair_queue_wait",
            text=(
                "Wait for the visible Harbor 110 repair lane or refresh queue truth "
                "when the jobs API is stale."
            ),
            reason="The Harbor 110 repair workflow is visible but not complete.",
            command=queue_readback,
            post_verifier=queue_readback,
            blocker_fields=blocker_fields,
        )

    if harbor_110_repair_failed:
        blocker_fields = ["latest_visible_harbor_110_repair_failed"]
        if harbor_110_repair_visible_failure_jobs_api_stale:
            blocker_fields.append("harbor_110_repair_visible_failure_jobs_api_stale")
        if harbor_110_repair_jobs_payload_classifier:
            blocker_fields.append("harbor_110_repair_jobs_payload_classifier")
        return action(
            action_id="use_harbor_repair_log_classifier_then_submit_recovery_receipt",
            stage="harbor_110_repair_failure_receipt_required",
            text=(
                "Use the Harbor repair log classifier as truth, quarantine stale "
                "jobs API payloads, and submit the metadata-only recovery receipt."
            ),
            reason="The visible Harbor repair run failed.",
            command=queue_readback,
            post_verifier="harbor-registry-controlled-recovery-receipt readback",
            blocker_fields=blocker_fields,
        )

    if build_harbor_public_route_blocked or build_harbor_public_route_retrying_unavailable:
        return action(
            action_id="run_harbor_registry_v2_verifier_before_repair",
            stage="registry_v2_verifier_required",
            text=(
                "Verify public registry /v2/ first; only use Harbor repair if the "
                "route is still below 200/401."
            ),
            reason="The CD log still carries Harbor public route unavailable evidence.",
            command="curl -k https://registry.wooo.work/v2/",
            post_verifier=queue_readback,
            # Both route fields are listed regardless of which one triggered.
            blocker_fields=[
                "latest_visible_cd_harbor_public_route_blocked",
                "latest_visible_cd_harbor_public_route_retrying_unavailable",
            ],
        )

    if effective_host_pressure_blocked_or_waiting:
        return action(
            action_id="wait_host_pressure_gate_then_rerun_cd_readback",
            stage="host_pressure_gate_wait",
            text=(
                "Keep the host pressure gate fail-closed and rerun CD readback "
                "after pressure clears."
            ),
            # Fall back to a fixed reason when the classifier string is empty.
            reason=effective_host_pressure_classifier or "host pressure is active",
            command="awoooi-wait-host-web-build-pressure.sh",
            post_verifier=queue_readback,
            blocker_fields=["latest_visible_cd_host_pressure_classifier"],
        )

    if cd_jobs_stale_or_mismatched:
        return action(
            action_id="ignore_stale_cd_jobs_api_payload_and_poll_visible_cd_or_marker",
            stage="cd_jobs_api_stale_payload_quarantine",
            text=(
                "Ignore stale CD jobs API payloads, poll the visible CD run/logs, "
                "and verify production deploy marker before closing."
            ),
            reason=cd_jobs_payload_classifier or "CD jobs API payload is stale",
            command=queue_readback,
            post_verifier="awoooi production deploy marker readback",
            blocker_fields=["cd_run_jobs_payload_classifier"],
        )

    if harbor_110_repair_jobs_stale_or_mismatched:
        return action(
            action_id="ignore_stale_harbor_jobs_api_payload_and_poll_visible_repair_log",
            stage="harbor_jobs_api_stale_payload_quarantine",
            text=(
                "Ignore stale Harbor repair jobs API payloads and poll the visible "
                "repair log or queue status."
            ),
            reason=harbor_110_repair_jobs_payload_classifier
            or "Harbor repair jobs API payload is stale",
            command=queue_readback,
            post_verifier=queue_readback,
            blocker_fields=["harbor_110_repair_jobs_payload_classifier"],
        )

    # No specific blocker matched: keep observing the public queue.
    return action(
        action_id="continue_public_gitea_queue_readback",
        stage="queue_observation",
        text="Continue public queue readback without credentials or runtime writes.",
        reason="No more specific queue blocker was visible in the public readback.",
        command=queue_readback,
        post_verifier=queue_readback,
        blocker_fields=[],
    )
def classify_cd_build_log(text: str) -> dict[str, Any]:
attempt_statuses: list[str] = []
attempt_numbers: list[int] = []
@@ -1416,6 +1770,9 @@ def _human_summary(payload: dict[str, Any]) -> str:
"CURRENT_CD_WAITING_BEHIND_HARBOR_110_REPAIR_RUNNING="
f"{int(readback['current_cd_waiting_behind_harbor_110_repair_running'])}"
),
f"SAFE_NEXT_ACTION_ID={readback['safe_next_action_id']}",
f"SAFE_NEXT_ACTION_STAGE={readback['safe_next_action_stage']}",
f"SAFE_NEXT_ACTION_COMMAND={readback['safe_next_action_command']}",
"WRITE_PERFORMED=false",
"TOKEN_COLLECTED=false",
]

View File

@@ -657,6 +657,18 @@ def test_harbor_ssh_blocker_takes_precedence_over_current_cd_waiting() -> None:
payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
is True
)
assert payload["readback"]["safe_next_action_id"] == (
"run_110_local_ssh_session_control_path_recovery_then_verify_cd_and_deploy_marker_readback"
)
assert payload["readback"]["safe_next_action_stage"] == (
"local_console_control_path_receipt_required"
)
assert payload["readback"]["safe_next_action_requires_local_console"] is True
assert payload["readback"]["safe_next_action_metadata_only"] is True
assert "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled" in (
payload["readback"]["safe_next_action_blocker_fields"]
)
assert payload["rollups"]["safe_next_action_requires_local_console"] is True
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
@@ -920,6 +932,7 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_AUTH_PERMISSION_DENIED=False"
in summary
)
assert "SAFE_NEXT_ACTION_STAGE=local_console_control_path_receipt_required" in summary
assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False
assert payload["operation_boundaries"]["host_write_performed"] is False
@@ -1280,6 +1293,15 @@ def test_build_readback_flags_stale_cd_jobs_api_payload() -> None:
assert payload["readback"]["cd_run_jobs_payload_classifier"] == (
"cd_jobs_api_head_sha_mismatch_for_visible_cd_run"
)
assert payload["readback"]["safe_next_action_id"] == (
"ignore_stale_cd_jobs_api_payload_and_poll_visible_cd_or_marker"
)
assert payload["readback"]["safe_next_action_stage"] == (
"cd_jobs_api_stale_payload_quarantine"
)
assert payload["readback"]["safe_next_action_blocker_fields"] == [
"cd_run_jobs_payload_classifier"
]
assert payload["rollups"]["cd_run_jobs_stale_or_mismatched"] is True
assert payload["rollups"]["cd_run_jobs_payload_classifier"] == (
"cd_jobs_api_head_sha_mismatch_for_visible_cd_run"