fix(agent): publish queue safe next action
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 37s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -512,6 +512,63 @@ def build_readback(
|
||||
or harbor_110_repair_visible_running_jobs_api_stale
|
||||
or bool(harbor_110_repair_log_classifier["failure_classifier"])
|
||||
)
|
||||
safe_next_action = _queue_safe_next_action(
|
||||
latest_cd_waiting=latest_cd_waiting,
|
||||
latest_cd_status=latest_cd_status,
|
||||
latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label,
|
||||
cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched,
|
||||
cd_jobs_payload_classifier=cd_jobs_payload_classifier,
|
||||
effective_host_pressure_classifier=effective_tests_log_classifier[
|
||||
"host_pressure_classifier"
|
||||
],
|
||||
effective_host_pressure_blocked_or_waiting=effective_tests_log_classifier[
|
||||
"host_pressure_blocked_or_waiting"
|
||||
],
|
||||
build_harbor_public_route_blocked=build_log_classifier[
|
||||
"harbor_public_route_blocked"
|
||||
],
|
||||
build_harbor_public_route_retrying_unavailable=build_log_classifier[
|
||||
"harbor_public_route_retrying_unavailable"
|
||||
],
|
||||
harbor_110_repair_no_matching_runner_label=(
|
||||
harbor_110_repair_no_matching_runner_label
|
||||
),
|
||||
harbor_110_repair_waiting=harbor_110_repair_waiting,
|
||||
harbor_110_repair_running=harbor_110_repair_running,
|
||||
harbor_110_repair_failed=harbor_110_repair_failed,
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker=(
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker
|
||||
),
|
||||
harbor_110_repair_jobs_stale_or_mismatched=(
|
||||
harbor_110_repair_jobs_stale_or_mismatched
|
||||
),
|
||||
harbor_110_repair_jobs_payload_classifier=(
|
||||
harbor_110_repair_jobs_payload_classifier
|
||||
),
|
||||
harbor_110_repair_visible_running_jobs_api_stale=(
|
||||
harbor_110_repair_visible_running_jobs_api_stale
|
||||
),
|
||||
harbor_110_repair_visible_failure_jobs_api_stale=(
|
||||
harbor_110_repair_visible_failure_jobs_api_stale
|
||||
),
|
||||
current_cd_waiting_behind_harbor_110_repair_running=(
|
||||
current_cd_waiting_behind_harbor_110_repair_running
|
||||
),
|
||||
remote_control_channel_unavailable=harbor_110_repair_log_classifier[
|
||||
"remote_control_channel_unavailable"
|
||||
],
|
||||
remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_auth_stalled"
|
||||
],
|
||||
remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
],
|
||||
remote_ssh_server_accepts_key_then_session_timeout=(
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_server_accepts_key_then_session_timeout"
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
readback = {
|
||||
"actions_page_visible_run_count": len(visible_runs),
|
||||
@@ -728,6 +785,20 @@ def build_readback(
|
||||
"current_cd_waiting_behind_harbor_110_repair_running": (
|
||||
current_cd_waiting_behind_harbor_110_repair_running
|
||||
),
|
||||
"safe_next_action_id": safe_next_action["action_id"],
|
||||
"safe_next_action_stage": safe_next_action["stage"],
|
||||
"safe_next_action": safe_next_action["action"],
|
||||
"safe_next_action_reason": safe_next_action["reason"],
|
||||
"safe_next_action_command": safe_next_action["command"],
|
||||
"safe_next_action_post_verifier": safe_next_action["post_verifier"],
|
||||
"safe_next_action_requires_local_console": safe_next_action[
|
||||
"requires_local_console"
|
||||
],
|
||||
"safe_next_action_metadata_only": safe_next_action["metadata_only"],
|
||||
"safe_next_action_blocker_fields": safe_next_action["blocker_fields"],
|
||||
"safe_next_action_forbidden_actions": safe_next_action[
|
||||
"forbidden_actions"
|
||||
],
|
||||
"latest_visible_cd_host_pressure_classifier": effective_tests_log_classifier[
|
||||
"host_pressure_classifier"
|
||||
],
|
||||
@@ -1014,6 +1085,12 @@ def build_readback(
|
||||
"harbor_110_repair_visible_running_jobs_api_stale": (
|
||||
harbor_110_repair_visible_running_jobs_api_stale
|
||||
),
|
||||
"safe_next_action_id": safe_next_action["action_id"],
|
||||
"safe_next_action_stage": safe_next_action["stage"],
|
||||
"safe_next_action_requires_local_console": safe_next_action[
|
||||
"requires_local_console"
|
||||
],
|
||||
"safe_next_action_metadata_only": safe_next_action["metadata_only"],
|
||||
},
|
||||
"operation_boundaries": {
|
||||
"public_gitea_read_only": True,
|
||||
@@ -1029,6 +1106,283 @@ def build_readback(
|
||||
}
|
||||
|
||||
|
||||
def _queue_safe_next_action(
|
||||
*,
|
||||
latest_cd_waiting: bool,
|
||||
latest_cd_status: str,
|
||||
latest_cd_no_matching_runner_label: str,
|
||||
cd_jobs_stale_or_mismatched: bool,
|
||||
cd_jobs_payload_classifier: str,
|
||||
effective_host_pressure_classifier: str,
|
||||
effective_host_pressure_blocked_or_waiting: bool,
|
||||
build_harbor_public_route_blocked: bool,
|
||||
build_harbor_public_route_retrying_unavailable: bool,
|
||||
harbor_110_repair_no_matching_runner_label: str,
|
||||
harbor_110_repair_waiting: bool,
|
||||
harbor_110_repair_running: bool,
|
||||
harbor_110_repair_failed: bool,
|
||||
harbor_110_repair_waiting_after_cd_harbor_blocker: bool,
|
||||
harbor_110_repair_jobs_stale_or_mismatched: bool,
|
||||
harbor_110_repair_jobs_payload_classifier: str,
|
||||
harbor_110_repair_visible_running_jobs_api_stale: bool,
|
||||
harbor_110_repair_visible_failure_jobs_api_stale: bool,
|
||||
current_cd_waiting_behind_harbor_110_repair_running: bool,
|
||||
remote_control_channel_unavailable: bool,
|
||||
remote_ssh_publickey_auth_stalled: bool,
|
||||
remote_ssh_publickey_offer_timeout: bool,
|
||||
remote_ssh_server_accepts_key_then_session_timeout: bool,
|
||||
) -> dict[str, Any]:
|
||||
forbidden = [
|
||||
"read_runner_token_or_runner_file",
|
||||
"restart_docker_daemon",
|
||||
"reboot_host",
|
||||
"node_drain",
|
||||
"workflow_dispatch",
|
||||
"force_push_or_ref_delete",
|
||||
"raw_secret_volume_read",
|
||||
]
|
||||
|
||||
def action(
|
||||
*,
|
||||
action_id: str,
|
||||
stage: str,
|
||||
text: str,
|
||||
reason: str,
|
||||
command: str,
|
||||
post_verifier: str,
|
||||
blocker_fields: list[str],
|
||||
requires_local_console: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
return {
|
||||
"action_id": action_id,
|
||||
"stage": stage,
|
||||
"action": text,
|
||||
"reason": reason,
|
||||
"command": command,
|
||||
"post_verifier": post_verifier,
|
||||
"requires_local_console": requires_local_console,
|
||||
"metadata_only": True,
|
||||
"blocker_fields": blocker_fields,
|
||||
"forbidden_actions": forbidden,
|
||||
}
|
||||
|
||||
if remote_ssh_publickey_auth_stalled:
|
||||
blocker_fields = ["latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"]
|
||||
if remote_ssh_publickey_offer_timeout:
|
||||
blocker_fields.append(
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout"
|
||||
)
|
||||
if remote_ssh_server_accepts_key_then_session_timeout:
|
||||
blocker_fields.append(
|
||||
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"
|
||||
)
|
||||
return action(
|
||||
action_id=(
|
||||
"run_110_local_ssh_session_control_path_recovery_then_verify_cd_and_deploy_marker_readback"
|
||||
),
|
||||
stage="local_console_control_path_receipt_required",
|
||||
text=(
|
||||
"Use the 110 local-console controlled recovery package to verify "
|
||||
"SSH account/session metadata, then rerun registry, queue, CD, "
|
||||
"and deploy-marker readbacks."
|
||||
),
|
||||
reason=(
|
||||
"The public queue shows TCP/SSH reachability but publickey "
|
||||
"authentication or session setup stalls; remote SSH cannot safely "
|
||||
"repair itself."
|
||||
),
|
||||
command="recover-110-control-path-and-harbor-local.sh --check",
|
||||
post_verifier=(
|
||||
"read-public-gitea-actions-queue.py --json && "
|
||||
"check-awoooi-110-controlled-cd-lane-readiness.sh && "
|
||||
"curl -k https://registry.wooo.work/v2/"
|
||||
),
|
||||
blocker_fields=blocker_fields,
|
||||
requires_local_console=True,
|
||||
)
|
||||
|
||||
if remote_control_channel_unavailable:
|
||||
return action(
|
||||
action_id="restore_110_remote_control_channel_readback_from_local_console",
|
||||
stage="local_console_control_channel_readback_required",
|
||||
text=(
|
||||
"Verify the 110 local control path and publish a metadata-only "
|
||||
"receipt before retrying Harbor or CD closure."
|
||||
),
|
||||
reason="The bounded remote SSH control channel is unavailable.",
|
||||
command="recover-110-control-path-and-harbor-local.sh --check",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=[
|
||||
"latest_visible_harbor_110_repair_remote_control_channel_unavailable"
|
||||
],
|
||||
requires_local_console=True,
|
||||
)
|
||||
|
||||
if current_cd_waiting_behind_harbor_110_repair_running:
|
||||
blocker_fields = ["current_cd_waiting_behind_harbor_110_repair_running"]
|
||||
if harbor_110_repair_visible_running_jobs_api_stale:
|
||||
blocker_fields.append("harbor_110_repair_visible_running_jobs_api_stale")
|
||||
return action(
|
||||
action_id=(
|
||||
"refresh_harbor_110_repair_log_truth_then_verify_cd_waiting_state"
|
||||
),
|
||||
stage="queue_truth_refresh_required",
|
||||
text=(
|
||||
"Keep the visible CD wait state, ignore stale Harbor jobs API "
|
||||
"payloads, and refresh the Harbor repair log plus queue readback."
|
||||
),
|
||||
reason=(
|
||||
"Current CD is waiting behind the visible Harbor repair lane, "
|
||||
"while the jobs API may belong to another workflow or stale run."
|
||||
),
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=blocker_fields,
|
||||
)
|
||||
|
||||
if latest_cd_waiting or latest_cd_no_matching_runner_label:
|
||||
blocker_fields = ["latest_visible_cd_run_waiting"]
|
||||
if latest_cd_no_matching_runner_label:
|
||||
blocker_fields.append("latest_visible_cd_no_matching_runner_label")
|
||||
return action(
|
||||
action_id="verify_non110_runner_lane_before_retrying_current_cd",
|
||||
stage="non110_runner_lane_readiness_required",
|
||||
text=(
|
||||
"Verify the non-110 controlled CD runner lane and keep legacy or "
|
||||
"generic runner labels closed."
|
||||
),
|
||||
reason=(
|
||||
f"The latest visible CD run is {latest_cd_status or 'unknown'} "
|
||||
"or has no matching controlled runner label."
|
||||
),
|
||||
command="check-awoooi-non110-runner-readiness.sh",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=blocker_fields,
|
||||
)
|
||||
|
||||
if harbor_110_repair_no_matching_runner_label:
|
||||
return action(
|
||||
action_id="verify_awoooi_host_controlled_repair_runner_lane",
|
||||
stage="awoooi_host_runner_lane_readiness_required",
|
||||
text=(
|
||||
"Verify the awoooi-host controlled repair lane before retrying "
|
||||
"Harbor local repair."
|
||||
),
|
||||
reason="The Harbor 110 repair workflow has no matching controlled runner.",
|
||||
command="check-awoooi-110-controlled-cd-lane-readiness.sh",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=[
|
||||
"latest_visible_harbor_110_repair_no_matching_runner_label"
|
||||
],
|
||||
)
|
||||
|
||||
if harbor_110_repair_waiting or harbor_110_repair_waiting_after_cd_harbor_blocker:
|
||||
blocker_fields = ["latest_visible_harbor_110_repair_waiting"]
|
||||
if harbor_110_repair_jobs_stale_or_mismatched:
|
||||
blocker_fields.append("harbor_110_repair_jobs_stale_or_mismatched")
|
||||
return action(
|
||||
action_id="wait_for_harbor_110_repair_or_refresh_queue_truth",
|
||||
stage="harbor_110_repair_queue_wait",
|
||||
text=(
|
||||
"Wait for the visible Harbor 110 repair lane or refresh queue truth "
|
||||
"when the jobs API is stale."
|
||||
),
|
||||
reason="The Harbor 110 repair workflow is visible but not complete.",
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=blocker_fields,
|
||||
)
|
||||
|
||||
if harbor_110_repair_failed:
|
||||
blocker_fields = ["latest_visible_harbor_110_repair_failed"]
|
||||
if harbor_110_repair_visible_failure_jobs_api_stale:
|
||||
blocker_fields.append("harbor_110_repair_visible_failure_jobs_api_stale")
|
||||
if harbor_110_repair_jobs_payload_classifier:
|
||||
blocker_fields.append("harbor_110_repair_jobs_payload_classifier")
|
||||
return action(
|
||||
action_id="use_harbor_repair_log_classifier_then_submit_recovery_receipt",
|
||||
stage="harbor_110_repair_failure_receipt_required",
|
||||
text=(
|
||||
"Use the Harbor repair log classifier as truth, quarantine stale "
|
||||
"jobs API payloads, and submit the metadata-only recovery receipt."
|
||||
),
|
||||
reason="The visible Harbor repair run failed.",
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="harbor-registry-controlled-recovery-receipt readback",
|
||||
blocker_fields=blocker_fields,
|
||||
)
|
||||
|
||||
if build_harbor_public_route_blocked or build_harbor_public_route_retrying_unavailable:
|
||||
return action(
|
||||
action_id="run_harbor_registry_v2_verifier_before_repair",
|
||||
stage="registry_v2_verifier_required",
|
||||
text=(
|
||||
"Verify public registry /v2/ first; only use Harbor repair if the "
|
||||
"route is still below 200/401."
|
||||
),
|
||||
reason="The CD log still carries Harbor public route unavailable evidence.",
|
||||
command="curl -k https://registry.wooo.work/v2/",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=[
|
||||
"latest_visible_cd_harbor_public_route_blocked",
|
||||
"latest_visible_cd_harbor_public_route_retrying_unavailable",
|
||||
],
|
||||
)
|
||||
|
||||
if effective_host_pressure_blocked_or_waiting:
|
||||
return action(
|
||||
action_id="wait_host_pressure_gate_then_rerun_cd_readback",
|
||||
stage="host_pressure_gate_wait",
|
||||
text=(
|
||||
"Keep the host pressure gate fail-closed and rerun CD readback "
|
||||
"after pressure clears."
|
||||
),
|
||||
reason=effective_host_pressure_classifier or "host pressure is active",
|
||||
command="awoooi-wait-host-web-build-pressure.sh",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=["latest_visible_cd_host_pressure_classifier"],
|
||||
)
|
||||
|
||||
if cd_jobs_stale_or_mismatched:
|
||||
return action(
|
||||
action_id="ignore_stale_cd_jobs_api_payload_and_poll_visible_cd_or_marker",
|
||||
stage="cd_jobs_api_stale_payload_quarantine",
|
||||
text=(
|
||||
"Ignore stale CD jobs API payloads, poll the visible CD run/logs, "
|
||||
"and verify production deploy marker before closing."
|
||||
),
|
||||
reason=cd_jobs_payload_classifier or "CD jobs API payload is stale",
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="awoooi production deploy marker readback",
|
||||
blocker_fields=["cd_run_jobs_payload_classifier"],
|
||||
)
|
||||
|
||||
if harbor_110_repair_jobs_stale_or_mismatched:
|
||||
return action(
|
||||
action_id="ignore_stale_harbor_jobs_api_payload_and_poll_visible_repair_log",
|
||||
stage="harbor_jobs_api_stale_payload_quarantine",
|
||||
text=(
|
||||
"Ignore stale Harbor repair jobs API payloads and poll the visible "
|
||||
"repair log or queue status."
|
||||
),
|
||||
reason=harbor_110_repair_jobs_payload_classifier
|
||||
or "Harbor repair jobs API payload is stale",
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=["harbor_110_repair_jobs_payload_classifier"],
|
||||
)
|
||||
|
||||
return action(
|
||||
action_id="continue_public_gitea_queue_readback",
|
||||
stage="queue_observation",
|
||||
text="Continue public queue readback without credentials or runtime writes.",
|
||||
reason="No more specific queue blocker was visible in the public readback.",
|
||||
command="read-public-gitea-actions-queue.py --json",
|
||||
post_verifier="read-public-gitea-actions-queue.py --json",
|
||||
blocker_fields=[],
|
||||
)
|
||||
|
||||
|
||||
def classify_cd_build_log(text: str) -> dict[str, Any]:
|
||||
attempt_statuses: list[str] = []
|
||||
attempt_numbers: list[int] = []
|
||||
@@ -1416,6 +1770,9 @@ def _human_summary(payload: dict[str, Any]) -> str:
|
||||
"CURRENT_CD_WAITING_BEHIND_HARBOR_110_REPAIR_RUNNING="
|
||||
f"{int(readback['current_cd_waiting_behind_harbor_110_repair_running'])}"
|
||||
),
|
||||
f"SAFE_NEXT_ACTION_ID={readback['safe_next_action_id']}",
|
||||
f"SAFE_NEXT_ACTION_STAGE={readback['safe_next_action_stage']}",
|
||||
f"SAFE_NEXT_ACTION_COMMAND={readback['safe_next_action_command']}",
|
||||
"WRITE_PERFORMED=false",
|
||||
"TOKEN_COLLECTED=false",
|
||||
]
|
||||
|
||||
@@ -657,6 +657,18 @@ def test_harbor_ssh_blocker_takes_precedence_over_current_cd_waiting() -> None:
|
||||
payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
|
||||
is True
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_id"] == (
|
||||
"run_110_local_ssh_session_control_path_recovery_then_verify_cd_and_deploy_marker_readback"
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_stage"] == (
|
||||
"local_console_control_path_receipt_required"
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_requires_local_console"] is True
|
||||
assert payload["readback"]["safe_next_action_metadata_only"] is True
|
||||
assert "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled" in (
|
||||
payload["readback"]["safe_next_action_blocker_fields"]
|
||||
)
|
||||
assert payload["rollups"]["safe_next_action_requires_local_console"] is True
|
||||
|
||||
|
||||
def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None:
|
||||
@@ -920,6 +932,7 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
|
||||
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_AUTH_PERMISSION_DENIED=False"
|
||||
in summary
|
||||
)
|
||||
assert "SAFE_NEXT_ACTION_STAGE=local_console_control_path_receipt_required" in summary
|
||||
assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False
|
||||
assert payload["operation_boundaries"]["host_write_performed"] is False
|
||||
|
||||
@@ -1280,6 +1293,15 @@ def test_build_readback_flags_stale_cd_jobs_api_payload() -> None:
|
||||
assert payload["readback"]["cd_run_jobs_payload_classifier"] == (
|
||||
"cd_jobs_api_head_sha_mismatch_for_visible_cd_run"
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_id"] == (
|
||||
"ignore_stale_cd_jobs_api_payload_and_poll_visible_cd_or_marker"
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_stage"] == (
|
||||
"cd_jobs_api_stale_payload_quarantine"
|
||||
)
|
||||
assert payload["readback"]["safe_next_action_blocker_fields"] == [
|
||||
"cd_run_jobs_payload_classifier"
|
||||
]
|
||||
assert payload["rollups"]["cd_run_jobs_stale_or_mismatched"] is True
|
||||
assert payload["rollups"]["cd_run_jobs_payload_classifier"] == (
|
||||
"cd_jobs_api_head_sha_mismatch_for_visible_cd_run"
|
||||
|
||||
Reference in New Issue
Block a user