From 97899ff5ad94c71eb649b174468871ceeb683bf6 Mon Sep 17 00:00:00 2001 From: ogt Date: Thu, 2 Jul 2026 00:23:02 +0800 Subject: [PATCH] fix(api): clear resolved ai loop control blocker --- .../awoooi_priority_work_order_readback.py | 219 ++++++++++++++---- ...awoooi_priority_work_order_readback_api.py | 20 ++ docs/LOGBOOK.md | 16 ++ 3 files changed, 216 insertions(+), 39 deletions(-) diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index 2c19ebe7..9b104c73 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -510,6 +510,82 @@ def apply_ai_loop_current_blocker_execution_queue( first_item = _dict(queue[0]) blocker_id = str(first_item.get("blocker_id") or "") + registry_v2_ready = bool(first_item.get("registry_v2_ready") is True) + registry_v2_status_classifier = str( + first_item.get("registry_v2_status_classifier") or "" + ) + deployment_closure_state = str(first_item.get("deployment_closure_state") or "") + deploy_marker_readback_required = bool( + first_item.get("deploy_marker_readback_required") is True + ) + current_cd_run_id = str(first_item.get("current_cd_run_id") or "") + current_cd_run_status = str(first_item.get("current_cd_run_status") or "") + current_cd_commit_sha = str(first_item.get("current_cd_commit_sha") or "") + cd_failed_after_registry_ready = bool( + first_item.get("cd_failed_after_registry_ready") is True + ) + harbor_110_repair_run_id = str(first_item.get("harbor_110_repair_run_id") or "") + harbor_110_repair_run_status = str( + first_item.get("harbor_110_repair_run_status") or "" + ) + harbor_110_repair_failure_classifier = str( + first_item.get("harbor_110_repair_failure_classifier") or "" + ) + harbor_110_repair_failed_after_registry_ready = bool( + first_item.get("harbor_110_repair_failed_after_registry_ready") is True + ) + external_blocker = str(first_item.get("external_control_path_blocker") or "") + pressure_blocker = str(first_item.get("control_path_pressure_blocker") or "") + node_load_classifier = str(first_item.get("node_load_classifier") or "") + + state = _dict(payload.setdefault("mainline_execution_state", {})) + current_head = _dict(payload.get("current_head")) + production_readback_verified = bool( + state.get("current_main_cd_run_status") == "production_readback_verified" + and current_head.get("production_source_truth_available") is True + and _is_sha(str(state.get("latest_successful_deployed_source_sha") or "")) + ) + deploy_marker_resolved_by_production_readback = bool( + production_readback_verified + and (deploy_marker_readback_required or cd_failed_after_registry_ready) + ) + queue_resolved_by_production_readback = bool( + production_readback_verified + and registry_v2_ready + and ( + deploy_marker_readback_required + or cd_failed_after_registry_ready + or harbor_110_repair_failed_after_registry_ready + ) + ) + if queue_resolved_by_production_readback: + _record_ai_loop_current_blocker_production_resolution( + payload=payload, + state=state, + queue_count=len(queue), + blocker_id=blocker_id, + registry_v2_ready=registry_v2_ready, + registry_v2_status_classifier=registry_v2_status_classifier, + deployment_closure_state=deployment_closure_state, + deploy_marker_readback_required=deploy_marker_readback_required, + current_cd_run_id=current_cd_run_id, + current_cd_run_status=current_cd_run_status, + current_cd_commit_sha=current_cd_commit_sha, + cd_failed_after_registry_ready=cd_failed_after_registry_ready, + harbor_110_repair_run_id=harbor_110_repair_run_id, + harbor_110_repair_run_status=harbor_110_repair_run_status, + harbor_110_repair_failure_classifier=( + harbor_110_repair_failure_classifier + ), + harbor_110_repair_failed_after_registry_ready=( + harbor_110_repair_failed_after_registry_ready + ), + external_blocker=external_blocker, + pressure_blocker=pressure_blocker, + node_load_classifier=node_load_classifier, + ) + return + if not _ai_loop_current_blocker_can_override( status=str(payload.get("status") or ""), blocker_id=blocker_id, @@ -575,9 +651,6 @@ def apply_ai_loop_current_blocker_execution_queue( _dict(item) for item in _list(context.get("log_source_tagging_contract")) ] forbidden_runtime_actions = _strings(first_item.get("forbidden_runtime_actions")) - external_blocker = str(first_item.get("external_control_path_blocker") or "") - pressure_blocker = str(first_item.get("control_path_pressure_blocker") or "") - node_load_classifier = str(first_item.get("node_load_classifier") or "") runtime_write_gate = str(first_item.get("runtime_write_gate") or "") safe_next_action_id = str(first_item.get("safe_next_action_id") or "") safe_next_action_stage = str(first_item.get("safe_next_action_stage") or "") @@ -592,42 +665,6 @@ def apply_ai_loop_current_blocker_execution_queue( safe_next_action_blocker_fields = _strings( first_item.get("safe_next_action_blocker_fields") ) - registry_v2_ready = bool(first_item.get("registry_v2_ready") is True) - registry_v2_status_classifier = str( - first_item.get("registry_v2_status_classifier") or "" - ) - deployment_closure_state = str(first_item.get("deployment_closure_state") or "") - deploy_marker_readback_required = bool( - first_item.get("deploy_marker_readback_required") is True - ) - current_cd_run_id = str(first_item.get("current_cd_run_id") or "") - current_cd_run_status = str(first_item.get("current_cd_run_status") or "") - current_cd_commit_sha = str(first_item.get("current_cd_commit_sha") or "") - cd_failed_after_registry_ready = bool( - first_item.get("cd_failed_after_registry_ready") is True - ) - harbor_110_repair_run_id = str(first_item.get("harbor_110_repair_run_id") or "") - harbor_110_repair_run_status = str( - first_item.get("harbor_110_repair_run_status") or "" - ) - harbor_110_repair_failure_classifier = str( - first_item.get("harbor_110_repair_failure_classifier") or "" - ) - harbor_110_repair_failed_after_registry_ready = bool( - first_item.get("harbor_110_repair_failed_after_registry_ready") is True - ) - - state = _dict(payload.setdefault("mainline_execution_state", {})) - current_head = _dict(payload.get("current_head")) - production_readback_verified = bool( - state.get("current_main_cd_run_status") == "production_readback_verified" - and current_head.get("production_source_truth_available") is True - and _is_sha(str(state.get("latest_successful_deployed_source_sha") or "")) - ) - deploy_marker_resolved_by_production_readback = bool( - production_readback_verified - and (deploy_marker_readback_required or cd_failed_after_registry_ready) - ) active_deployment_closure_state = ( "production_readback_verified" if deploy_marker_resolved_by_production_readback @@ -1132,6 +1169,110 @@ def apply_ai_loop_current_blocker_execution_queue( ) +def _record_ai_loop_current_blocker_production_resolution( + *, + payload: dict[str, Any], + state: dict[str, Any], + queue_count: int, + blocker_id: str, + registry_v2_ready: bool, + registry_v2_status_classifier: str, + deployment_closure_state: str, + deploy_marker_readback_required: bool, + current_cd_run_id: str, + current_cd_run_status: str, + current_cd_commit_sha: str, + cd_failed_after_registry_ready: bool, + harbor_110_repair_run_id: str, + harbor_110_repair_run_status: str, + harbor_110_repair_failure_classifier: str, + harbor_110_repair_failed_after_registry_ready: bool, + external_blocker: str, + pressure_blocker: str, + node_load_classifier: str, +) -> None: + """Record a resolved AI-loop queue item without reopening active P0 state.""" + production_sha = str(state.get("latest_successful_deployed_source_sha") or "") + production_run_id = str(state.get("current_main_cd_run_id") or "") + production_run_status = str(state.get("current_main_cd_run_status") or "") + resolved_fields: dict[str, Any] = { + "ai_loop_current_blocker_execution_queue_count": queue_count, + "ai_loop_current_blocker_id": blocker_id, + "ai_loop_current_blocker_resolved_by_production_readback": True, + "ai_loop_current_blocker_registry_v2_ready": registry_v2_ready, + "ai_loop_current_blocker_registry_v2_status_classifier": ( + registry_v2_status_classifier + ), + "ai_loop_current_blocker_deployment_closure_state": ( + "production_readback_verified" + ), + "ai_loop_current_blocker_deploy_marker_resolved_by_production_readback": ( + True + ), + "ai_loop_current_blocker_deploy_marker_readback_required": False, + "ai_loop_current_blocker_current_cd_run_id": production_run_id, + "ai_loop_current_blocker_current_cd_run_status": production_run_status, + "ai_loop_current_blocker_current_cd_commit_sha": production_sha, + "ai_loop_current_blocker_cd_failed_after_registry_ready": False, + "ai_loop_current_blocker_harbor_110_repair_failed_after_registry_ready": False, + "ai_loop_current_blocker_control_path_blocker": "", + "ai_loop_current_blocker_control_path_pressure_blocker": "", + "ai_loop_current_blocker_safe_next_action": "", + "ai_loop_current_blocker_safe_next_action_id": "", + "ai_loop_current_blocker_safe_next_action_stage": "", + "ai_loop_current_blocker_safe_next_action_command": "", + "ai_loop_current_blocker_safe_next_action_post_verifier": "", + "ai_loop_current_blocker_safe_next_action_requires_local_console": False, + "ai_loop_current_blocker_safe_next_action_blocker_fields": [], + "ai_loop_current_blocker_node_load_classifier": node_load_classifier, + "ai_loop_current_blocker_historical_deployment_closure_state": ( + deployment_closure_state + ), + "ai_loop_current_blocker_historical_deploy_marker_readback_required": ( + deploy_marker_readback_required + ), + "ai_loop_current_blocker_historical_current_cd_run_id": current_cd_run_id, + "ai_loop_current_blocker_historical_current_cd_run_status": ( + current_cd_run_status + ), + "ai_loop_current_blocker_historical_current_cd_commit_sha": ( + current_cd_commit_sha + ), + "ai_loop_current_blocker_historical_cd_failed_after_registry_ready": ( + cd_failed_after_registry_ready + ), + "ai_loop_current_blocker_historical_harbor_110_repair_run_id": ( + harbor_110_repair_run_id + ), + "ai_loop_current_blocker_historical_harbor_110_repair_run_status": ( + harbor_110_repair_run_status + ), + "ai_loop_current_blocker_historical_harbor_110_repair_failure_classifier": ( + harbor_110_repair_failure_classifier + ), + "ai_loop_current_blocker_historical_harbor_110_repair_failed_after_registry_ready": ( + harbor_110_repair_failed_after_registry_ready + ), + "ai_loop_current_blocker_historical_control_path_blocker": ( + external_blocker + ), + "ai_loop_current_blocker_historical_control_path_pressure_blocker": ( + pressure_blocker + ), + } + state.update(resolved_fields) + summary = _dict(payload.setdefault("summary", {})) + summary.update(resolved_fields) + + for item in _list(payload.get("in_progress_or_blocked_in_priority_order")): + workplan = _dict(item) + if workplan.get("workplan_id") != "P0-006": + continue + evidence = _dict(workplan.setdefault("evidence", {})) + evidence.update(resolved_fields) + break + + def _ai_loop_current_blocker_can_override( *, status: str, diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index 5bd3910a..469de8df 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -841,7 +841,13 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af state = payload["mainline_execution_state"] evidence = payload["in_progress_or_blocked_in_priority_order"][0]["evidence"] blockers = state["active_p0_live_active_blockers"] + assert payload["status"] == "p0_006_blocked_reboot_auto_recovery_slo_not_ready" + assert state["active_p0_state"] == "blocked_reboot_auto_recovery_slo_not_ready" + assert state["next_executable_mainline_workplan_id"] == ( + "P0-006-REBOOT-AUTO-RECOVERY-SLO-SCORECARD" + ) assert state["current_main_cd_run_status"] == "production_readback_verified" + assert state["ai_loop_current_blocker_resolved_by_production_readback"] is True assert state["ai_loop_current_blocker_deploy_marker_readback_required"] is False assert state["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False assert ( @@ -854,14 +860,24 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af assert state["ai_loop_current_blocker_current_cd_run_id"] == ( f"production_readback:{runtime_short_sha}" ) + assert state["ai_loop_current_blocker_control_path_blocker"] == "" + assert state["ai_loop_current_blocker_historical_control_path_blocker"] == ( + "remote_ssh_publickey_offer_timeout" + ) assert state["ai_loop_current_blocker_historical_current_cd_run_id"] == "4258" assert state["ai_loop_current_blocker_historical_current_cd_run_status"] == ( "Failure" ) assert "deploy_marker_readback_required_after_registry_ready" not in blockers assert "current_cd_failure_after_registry_ready" not in blockers + assert "remote_ssh_publickey_offer_timeout" not in blockers + assert evidence["ai_loop_current_blocker_resolved_by_production_readback"] is True assert evidence["ai_loop_current_blocker_deploy_marker_readback_required"] is False assert evidence["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False + assert evidence["ai_loop_current_blocker_control_path_blocker"] == "" + assert evidence["ai_loop_current_blocker_historical_control_path_blocker"] == ( + "remote_ssh_publickey_offer_timeout" + ) assert evidence[ "ai_loop_current_blocker_historical_deploy_marker_readback_required" ] is True @@ -874,6 +890,10 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af assert payload["summary"]["ai_loop_current_blocker_current_cd_run_status"] == ( "production_readback_verified" ) + assert payload["summary"]["ai_loop_current_blocker_control_path_blocker"] == "" + assert payload["summary"][ + "ai_loop_current_blocker_historical_control_path_blocker" + ] == "remote_ssh_publickey_offer_timeout" assert all( "P0-006-CD-DEPLOY-MARKER-READBACK" not in item for item in payload["next_execution_order"] diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c7e61a60..32bb25e0 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51968,6 +51968,22 @@ production browser smoke: - rebase 到 Gitea main 最新 deploy marker,commit / push 本次 source 修法與 receipt。 - 讀回 Gitea CD / Prometheus;下一條主線是把 `RebootAutoRecoverySLOMissed` 的 6 個 blocker 轉成可自動判斷與可回滾 remediation,不消音告警。 +## 2026-07-02 — P0 AI loop stale control-path blocker production readback 修正 + +**完成內容**: +- 修正 `awoooi-priority-work-order-readback` 的 AI loop current blocker overlay:當 production source truth 已讀回 `production_readback_verified`、registry `/v2/` 已 ready,且舊 deploy marker / CD failure / Harbor 110 repair failure 已被 production readback 收斂時,不再把舊 `remote_ssh_publickey_offer_timeout` 視為 active P0 blocker。 +- API 現在保留 historical evidence:`ai_loop_current_blocker_historical_control_path_blocker=remote_ssh_publickey_offer_timeout`,但 active 欄位回到空值,並讓 active P0 回到 `P0-006-REBOOT-AUTO-RECOVERY-SLO-SCORECARD`。 +- 測試新增 active P0 / historical blocker / active blocker exclusion 斷言,避免已成功部署的舊 Harbor/110 failure 再覆蓋主線順序。 + +**本地驗證結果**: +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awoooi_priority_work_order_readback_api.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_verify_awoooi_non110_cd_closure.py -q`:`86 passed`。 +- `python3.11 -m py_compile apps/api/src/services/awoooi_priority_work_order_readback.py ops/runner/read-public-gitea-actions-queue.py ops/runner/verify-awoooi-non110-cd-closure.py`:通過。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall 操作,沒有 workflow_dispatch,沒有 force push。 + ## 2026-07-01 — 23:28 P0 110 sustained CPU pressure alert / controlled quota / alert-chain readback **完成內容**: