fix(api): clear resolved ai loop control blocker
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
This commit is contained in:
@@ -510,6 +510,82 @@ def apply_ai_loop_current_blocker_execution_queue(
|
||||
|
||||
first_item = _dict(queue[0])
|
||||
blocker_id = str(first_item.get("blocker_id") or "")
|
||||
registry_v2_ready = bool(first_item.get("registry_v2_ready") is True)
|
||||
registry_v2_status_classifier = str(
|
||||
first_item.get("registry_v2_status_classifier") or ""
|
||||
)
|
||||
deployment_closure_state = str(first_item.get("deployment_closure_state") or "")
|
||||
deploy_marker_readback_required = bool(
|
||||
first_item.get("deploy_marker_readback_required") is True
|
||||
)
|
||||
current_cd_run_id = str(first_item.get("current_cd_run_id") or "")
|
||||
current_cd_run_status = str(first_item.get("current_cd_run_status") or "")
|
||||
current_cd_commit_sha = str(first_item.get("current_cd_commit_sha") or "")
|
||||
cd_failed_after_registry_ready = bool(
|
||||
first_item.get("cd_failed_after_registry_ready") is True
|
||||
)
|
||||
harbor_110_repair_run_id = str(first_item.get("harbor_110_repair_run_id") or "")
|
||||
harbor_110_repair_run_status = str(
|
||||
first_item.get("harbor_110_repair_run_status") or ""
|
||||
)
|
||||
harbor_110_repair_failure_classifier = str(
|
||||
first_item.get("harbor_110_repair_failure_classifier") or ""
|
||||
)
|
||||
harbor_110_repair_failed_after_registry_ready = bool(
|
||||
first_item.get("harbor_110_repair_failed_after_registry_ready") is True
|
||||
)
|
||||
external_blocker = str(first_item.get("external_control_path_blocker") or "")
|
||||
pressure_blocker = str(first_item.get("control_path_pressure_blocker") or "")
|
||||
node_load_classifier = str(first_item.get("node_load_classifier") or "")
|
||||
|
||||
state = _dict(payload.setdefault("mainline_execution_state", {}))
|
||||
current_head = _dict(payload.get("current_head"))
|
||||
production_readback_verified = bool(
|
||||
state.get("current_main_cd_run_status") == "production_readback_verified"
|
||||
and current_head.get("production_source_truth_available") is True
|
||||
and _is_sha(str(state.get("latest_successful_deployed_source_sha") or ""))
|
||||
)
|
||||
deploy_marker_resolved_by_production_readback = bool(
|
||||
production_readback_verified
|
||||
and (deploy_marker_readback_required or cd_failed_after_registry_ready)
|
||||
)
|
||||
queue_resolved_by_production_readback = bool(
|
||||
production_readback_verified
|
||||
and registry_v2_ready
|
||||
and (
|
||||
deploy_marker_readback_required
|
||||
or cd_failed_after_registry_ready
|
||||
or harbor_110_repair_failed_after_registry_ready
|
||||
)
|
||||
)
|
||||
if queue_resolved_by_production_readback:
|
||||
_record_ai_loop_current_blocker_production_resolution(
|
||||
payload=payload,
|
||||
state=state,
|
||||
queue_count=len(queue),
|
||||
blocker_id=blocker_id,
|
||||
registry_v2_ready=registry_v2_ready,
|
||||
registry_v2_status_classifier=registry_v2_status_classifier,
|
||||
deployment_closure_state=deployment_closure_state,
|
||||
deploy_marker_readback_required=deploy_marker_readback_required,
|
||||
current_cd_run_id=current_cd_run_id,
|
||||
current_cd_run_status=current_cd_run_status,
|
||||
current_cd_commit_sha=current_cd_commit_sha,
|
||||
cd_failed_after_registry_ready=cd_failed_after_registry_ready,
|
||||
harbor_110_repair_run_id=harbor_110_repair_run_id,
|
||||
harbor_110_repair_run_status=harbor_110_repair_run_status,
|
||||
harbor_110_repair_failure_classifier=(
|
||||
harbor_110_repair_failure_classifier
|
||||
),
|
||||
harbor_110_repair_failed_after_registry_ready=(
|
||||
harbor_110_repair_failed_after_registry_ready
|
||||
),
|
||||
external_blocker=external_blocker,
|
||||
pressure_blocker=pressure_blocker,
|
||||
node_load_classifier=node_load_classifier,
|
||||
)
|
||||
return
|
||||
|
||||
if not _ai_loop_current_blocker_can_override(
|
||||
status=str(payload.get("status") or ""),
|
||||
blocker_id=blocker_id,
|
||||
@@ -575,9 +651,6 @@ def apply_ai_loop_current_blocker_execution_queue(
|
||||
_dict(item) for item in _list(context.get("log_source_tagging_contract"))
|
||||
]
|
||||
forbidden_runtime_actions = _strings(first_item.get("forbidden_runtime_actions"))
|
||||
external_blocker = str(first_item.get("external_control_path_blocker") or "")
|
||||
pressure_blocker = str(first_item.get("control_path_pressure_blocker") or "")
|
||||
node_load_classifier = str(first_item.get("node_load_classifier") or "")
|
||||
runtime_write_gate = str(first_item.get("runtime_write_gate") or "")
|
||||
safe_next_action_id = str(first_item.get("safe_next_action_id") or "")
|
||||
safe_next_action_stage = str(first_item.get("safe_next_action_stage") or "")
|
||||
@@ -592,42 +665,6 @@ def apply_ai_loop_current_blocker_execution_queue(
|
||||
safe_next_action_blocker_fields = _strings(
|
||||
first_item.get("safe_next_action_blocker_fields")
|
||||
)
|
||||
registry_v2_ready = bool(first_item.get("registry_v2_ready") is True)
|
||||
registry_v2_status_classifier = str(
|
||||
first_item.get("registry_v2_status_classifier") or ""
|
||||
)
|
||||
deployment_closure_state = str(first_item.get("deployment_closure_state") or "")
|
||||
deploy_marker_readback_required = bool(
|
||||
first_item.get("deploy_marker_readback_required") is True
|
||||
)
|
||||
current_cd_run_id = str(first_item.get("current_cd_run_id") or "")
|
||||
current_cd_run_status = str(first_item.get("current_cd_run_status") or "")
|
||||
current_cd_commit_sha = str(first_item.get("current_cd_commit_sha") or "")
|
||||
cd_failed_after_registry_ready = bool(
|
||||
first_item.get("cd_failed_after_registry_ready") is True
|
||||
)
|
||||
harbor_110_repair_run_id = str(first_item.get("harbor_110_repair_run_id") or "")
|
||||
harbor_110_repair_run_status = str(
|
||||
first_item.get("harbor_110_repair_run_status") or ""
|
||||
)
|
||||
harbor_110_repair_failure_classifier = str(
|
||||
first_item.get("harbor_110_repair_failure_classifier") or ""
|
||||
)
|
||||
harbor_110_repair_failed_after_registry_ready = bool(
|
||||
first_item.get("harbor_110_repair_failed_after_registry_ready") is True
|
||||
)
|
||||
|
||||
state = _dict(payload.setdefault("mainline_execution_state", {}))
|
||||
current_head = _dict(payload.get("current_head"))
|
||||
production_readback_verified = bool(
|
||||
state.get("current_main_cd_run_status") == "production_readback_verified"
|
||||
and current_head.get("production_source_truth_available") is True
|
||||
and _is_sha(str(state.get("latest_successful_deployed_source_sha") or ""))
|
||||
)
|
||||
deploy_marker_resolved_by_production_readback = bool(
|
||||
production_readback_verified
|
||||
and (deploy_marker_readback_required or cd_failed_after_registry_ready)
|
||||
)
|
||||
active_deployment_closure_state = (
|
||||
"production_readback_verified"
|
||||
if deploy_marker_resolved_by_production_readback
|
||||
@@ -1132,6 +1169,110 @@ def apply_ai_loop_current_blocker_execution_queue(
|
||||
)
|
||||
|
||||
|
||||
def _record_ai_loop_current_blocker_production_resolution(
    *,
    payload: dict[str, Any],
    state: dict[str, Any],
    queue_count: int,
    blocker_id: str,
    registry_v2_ready: bool,
    registry_v2_status_classifier: str,
    deployment_closure_state: str,
    deploy_marker_readback_required: bool,
    current_cd_run_id: str,
    current_cd_run_status: str,
    current_cd_commit_sha: str,
    cd_failed_after_registry_ready: bool,
    harbor_110_repair_run_id: str,
    harbor_110_repair_run_status: str,
    harbor_110_repair_failure_classifier: str,
    harbor_110_repair_failed_after_registry_ready: bool,
    external_blocker: str,
    pressure_blocker: str,
    node_load_classifier: str,
) -> None:
    """Mark the head AI-loop queue item as resolved by production readback.

    The same set of fields is written to three places:

    * ``state`` (updated in place),
    * ``payload["summary"]`` (created as an empty dict when absent),
    * the ``evidence`` dict of the first entry in
      ``payload["in_progress_or_blocked_in_priority_order"]`` whose
      ``workplan_id`` is ``"P0-006"``; nothing is written when no such
      entry exists.

    Active ``ai_loop_current_blocker_*`` fields are reset: failure flags
    become ``False``, blocker / safe-next-action fields are emptied, and the
    CD run id / status / commit sha are taken from the production values
    already stored in ``state``. The values read from the queue item are
    preserved under ``ai_loop_current_blocker_historical_*`` keys.

    Args:
        payload: Readback payload that owns ``summary`` and the workplan list.
        state: Mainline execution state; also the source of the production
            run id, run status and deployed source sha.
        queue_count: Number of items in the blocker execution queue.
        blocker_id: Identifier of the queue item being resolved.
        registry_v2_ready: Passed through unchanged to the active field.
        registry_v2_status_classifier: Passed through unchanged.
        deployment_closure_state: Queue value, kept only as historical.
        deploy_marker_readback_required: Queue value, kept only as historical.
        current_cd_run_id: Queue value, kept only as historical.
        current_cd_run_status: Queue value, kept only as historical.
        current_cd_commit_sha: Queue value, kept only as historical.
        cd_failed_after_registry_ready: Queue value, kept only as historical.
        harbor_110_repair_run_id: Queue value, kept only as historical.
        harbor_110_repair_run_status: Queue value, kept only as historical.
        harbor_110_repair_failure_classifier: Queue value, kept only as
            historical.
        harbor_110_repair_failed_after_registry_ready: Queue value, kept only
            as historical.
        external_blocker: Queue control-path blocker, kept only as historical.
        pressure_blocker: Queue control-path pressure blocker, kept only as
            historical.
        node_load_classifier: Passed through unchanged to the active field.
    """
    # Production truth comes from the state itself, not from the queue item.
    deployed_sha = str(state.get("latest_successful_deployed_source_sha") or "")
    main_cd_run_id = str(state.get("current_main_cd_run_id") or "")
    main_cd_run_status = str(state.get("current_main_cd_run_status") or "")

    # Fields describing the *current* (now resolved) blocker view.
    active_fields: dict[str, Any] = {
        "ai_loop_current_blocker_execution_queue_count": queue_count,
        "ai_loop_current_blocker_id": blocker_id,
        "ai_loop_current_blocker_resolved_by_production_readback": True,
        "ai_loop_current_blocker_registry_v2_ready": registry_v2_ready,
        "ai_loop_current_blocker_registry_v2_status_classifier": (
            registry_v2_status_classifier
        ),
        "ai_loop_current_blocker_deployment_closure_state": (
            "production_readback_verified"
        ),
        "ai_loop_current_blocker_deploy_marker_resolved_by_production_readback": True,
        "ai_loop_current_blocker_deploy_marker_readback_required": False,
        "ai_loop_current_blocker_current_cd_run_id": main_cd_run_id,
        "ai_loop_current_blocker_current_cd_run_status": main_cd_run_status,
        "ai_loop_current_blocker_current_cd_commit_sha": deployed_sha,
        "ai_loop_current_blocker_cd_failed_after_registry_ready": False,
        "ai_loop_current_blocker_harbor_110_repair_failed_after_registry_ready": False,
        "ai_loop_current_blocker_control_path_blocker": "",
        "ai_loop_current_blocker_control_path_pressure_blocker": "",
        "ai_loop_current_blocker_safe_next_action": "",
        "ai_loop_current_blocker_safe_next_action_id": "",
        "ai_loop_current_blocker_safe_next_action_stage": "",
        "ai_loop_current_blocker_safe_next_action_command": "",
        "ai_loop_current_blocker_safe_next_action_post_verifier": "",
        "ai_loop_current_blocker_safe_next_action_requires_local_console": False,
        "ai_loop_current_blocker_safe_next_action_blocker_fields": [],
        "ai_loop_current_blocker_node_load_classifier": node_load_classifier,
    }

    # What the queue item originally reported, retained as evidence only.
    historical_fields: dict[str, Any] = {
        "ai_loop_current_blocker_historical_deployment_closure_state": (
            deployment_closure_state
        ),
        "ai_loop_current_blocker_historical_deploy_marker_readback_required": (
            deploy_marker_readback_required
        ),
        "ai_loop_current_blocker_historical_current_cd_run_id": current_cd_run_id,
        "ai_loop_current_blocker_historical_current_cd_run_status": (
            current_cd_run_status
        ),
        "ai_loop_current_blocker_historical_current_cd_commit_sha": (
            current_cd_commit_sha
        ),
        "ai_loop_current_blocker_historical_cd_failed_after_registry_ready": (
            cd_failed_after_registry_ready
        ),
        "ai_loop_current_blocker_historical_harbor_110_repair_run_id": (
            harbor_110_repair_run_id
        ),
        "ai_loop_current_blocker_historical_harbor_110_repair_run_status": (
            harbor_110_repair_run_status
        ),
        "ai_loop_current_blocker_historical_harbor_110_repair_failure_classifier": (
            harbor_110_repair_failure_classifier
        ),
        "ai_loop_current_blocker_historical_harbor_110_repair_failed_after_registry_ready": (
            harbor_110_repair_failed_after_registry_ready
        ),
        "ai_loop_current_blocker_historical_control_path_blocker": external_blocker,
        "ai_loop_current_blocker_historical_control_path_pressure_blocker": (
            pressure_blocker
        ),
    }

    # Active keys first, historical keys second, so insertion order is stable.
    resolved_fields: dict[str, Any] = {**active_fields, **historical_fields}

    state.update(resolved_fields)
    _dict(payload.setdefault("summary", {})).update(resolved_fields)

    # Only the first P0-006 workplan receives the evidence; the scan is lazy,
    # so entries after the match are never inspected.
    p0_006_workplan = next(
        (
            candidate
            for candidate in map(
                _dict,
                _list(payload.get("in_progress_or_blocked_in_priority_order")),
            )
            if candidate.get("workplan_id") == "P0-006"
        ),
        None,
    )
    if p0_006_workplan is not None:
        _dict(p0_006_workplan.setdefault("evidence", {})).update(resolved_fields)
|
||||
|
||||
|
||||
def _ai_loop_current_blocker_can_override(
|
||||
*,
|
||||
status: str,
|
||||
|
||||
@@ -841,7 +841,13 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af
|
||||
state = payload["mainline_execution_state"]
|
||||
evidence = payload["in_progress_or_blocked_in_priority_order"][0]["evidence"]
|
||||
blockers = state["active_p0_live_active_blockers"]
|
||||
assert payload["status"] == "p0_006_blocked_reboot_auto_recovery_slo_not_ready"
|
||||
assert state["active_p0_state"] == "blocked_reboot_auto_recovery_slo_not_ready"
|
||||
assert state["next_executable_mainline_workplan_id"] == (
|
||||
"P0-006-REBOOT-AUTO-RECOVERY-SLO-SCORECARD"
|
||||
)
|
||||
assert state["current_main_cd_run_status"] == "production_readback_verified"
|
||||
assert state["ai_loop_current_blocker_resolved_by_production_readback"] is True
|
||||
assert state["ai_loop_current_blocker_deploy_marker_readback_required"] is False
|
||||
assert state["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False
|
||||
assert (
|
||||
@@ -854,14 +860,24 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af
|
||||
assert state["ai_loop_current_blocker_current_cd_run_id"] == (
|
||||
f"production_readback:{runtime_short_sha}"
|
||||
)
|
||||
assert state["ai_loop_current_blocker_control_path_blocker"] == ""
|
||||
assert state["ai_loop_current_blocker_historical_control_path_blocker"] == (
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
)
|
||||
assert state["ai_loop_current_blocker_historical_current_cd_run_id"] == "4258"
|
||||
assert state["ai_loop_current_blocker_historical_current_cd_run_status"] == (
|
||||
"Failure"
|
||||
)
|
||||
assert "deploy_marker_readback_required_after_registry_ready" not in blockers
|
||||
assert "current_cd_failure_after_registry_ready" not in blockers
|
||||
assert "remote_ssh_publickey_offer_timeout" not in blockers
|
||||
assert evidence["ai_loop_current_blocker_resolved_by_production_readback"] is True
|
||||
assert evidence["ai_loop_current_blocker_deploy_marker_readback_required"] is False
|
||||
assert evidence["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False
|
||||
assert evidence["ai_loop_current_blocker_control_path_blocker"] == ""
|
||||
assert evidence["ai_loop_current_blocker_historical_control_path_blocker"] == (
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
)
|
||||
assert evidence[
|
||||
"ai_loop_current_blocker_historical_deploy_marker_readback_required"
|
||||
] is True
|
||||
@@ -874,6 +890,10 @@ def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_af
|
||||
assert payload["summary"]["ai_loop_current_blocker_current_cd_run_status"] == (
|
||||
"production_readback_verified"
|
||||
)
|
||||
assert payload["summary"]["ai_loop_current_blocker_control_path_blocker"] == ""
|
||||
assert payload["summary"][
|
||||
"ai_loop_current_blocker_historical_control_path_blocker"
|
||||
] == "remote_ssh_publickey_offer_timeout"
|
||||
assert all(
|
||||
"P0-006-CD-DEPLOY-MARKER-READBACK" not in item
|
||||
for item in payload["next_execution_order"]
|
||||
|
||||
@@ -51968,6 +51968,22 @@ production browser smoke:
|
||||
- rebase 到 Gitea main 最新 deploy marker,commit / push 本次 source 修法與 receipt。
|
||||
- 讀回 Gitea CD / Prometheus;下一條主線是把 `RebootAutoRecoverySLOMissed` 的 6 個 blocker 轉成可自動判斷與可回滾 remediation,不消音告警。
|
||||
|
||||
## 2026-07-02 — P0 AI loop stale control-path blocker production readback 修正
|
||||
|
||||
**完成內容**:
|
||||
- 修正 `awoooi-priority-work-order-readback` 的 AI loop current blocker overlay:當 production source truth 已讀回 `production_readback_verified`、registry `/v2/` 已 ready,且舊 deploy marker / CD failure / Harbor 110 repair failure 已被 production readback 收斂時,不再把舊 `remote_ssh_publickey_offer_timeout` 視為 active P0 blocker。
|
||||
- API 現在保留 historical evidence:`ai_loop_current_blocker_historical_control_path_blocker=remote_ssh_publickey_offer_timeout`,但 active 欄位回到空值,並讓 active P0 回到 `P0-006-REBOOT-AUTO-RECOVERY-SLO-SCORECARD`。
|
||||
- 測試新增 active P0 / historical blocker / active blocker exclusion 斷言,避免已成功部署的舊 Harbor/110 failure 再覆蓋主線順序。
|
||||
|
||||
**本地驗證結果**:
|
||||
- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awoooi_priority_work_order_readback_api.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_verify_awoooi_non110_cd_closure.py -q`:`86 passed`。
|
||||
- `python3.11 -m py_compile apps/api/src/services/awoooi_priority_work_order_readback.py ops/runner/read-public-gitea-actions-queue.py ops/runner/verify-awoooi-non110-cd-closure.py`:通過。
|
||||
- `git diff --check`:通過。
|
||||
|
||||
**仍維持**:
|
||||
- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub / gh / GitHub API / GitHub Actions。
|
||||
- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall 操作,沒有 workflow_dispatch,沒有 force push。
|
||||
|
||||
## 2026-07-01 — 23:28 P0 110 sustained CPU pressure alert / controlled quota / alert-chain readback
|
||||
|
||||
**完成內容**:
|
||||
|
||||
Reference in New Issue
Block a user