fix(agent): surface harbor cd retry receipt blocker
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -146,6 +146,15 @@ def validate_harbor_registry_controlled_recovery_receipt(
|
||||
"gitea_queue_harbor_110_repair_run_status": gitea_queue[
|
||||
"latest_visible_harbor_110_repair_run_status"
|
||||
],
|
||||
"gitea_queue_current_cd_inflight_classifier": gitea_queue[
|
||||
"current_cd_inflight_classifier"
|
||||
],
|
||||
"gitea_queue_current_cd_harbor_latest_registry_v2_status": (
|
||||
gitea_queue["current_cd_harbor_latest_registry_v2_status"]
|
||||
),
|
||||
"gitea_queue_current_cd_harbor_retrying_unavailable": gitea_queue[
|
||||
"current_cd_harbor_retrying_unavailable"
|
||||
],
|
||||
"deploy_marker_readback_seen": deploy_marker["receipt_seen"],
|
||||
"deploy_marker_verified": deploy_marker["deploy_marker_verified"],
|
||||
"deploy_marker_blocker_count": deploy_marker["blocker_count"],
|
||||
@@ -372,6 +381,9 @@ def _gitea_queue_readback(value: Any) -> dict[str, Any]:
|
||||
"latest_visible_cd_run_id": "",
|
||||
"latest_visible_cd_run_status": "",
|
||||
"latest_visible_cd_run_commit_sha": "",
|
||||
"current_cd_inflight_classifier": "",
|
||||
"current_cd_harbor_latest_registry_v2_status": "",
|
||||
"current_cd_harbor_retrying_unavailable": False,
|
||||
"latest_visible_harbor_110_repair_run_id": "",
|
||||
"latest_visible_harbor_110_repair_run_status": "",
|
||||
"harbor_110_repair_no_matching_runner_label": "",
|
||||
@@ -410,6 +422,24 @@ def _gitea_queue_readback(value: Any) -> dict[str, Any]:
|
||||
rollups.get("harbor_110_repair_blocked") is True
|
||||
or readback.get("latest_visible_harbor_110_repair_blocked") is True
|
||||
)
|
||||
current_cd_inflight_classifier = str(
|
||||
rollups.get("current_main_cd_inflight_classifier")
|
||||
or readback.get("latest_visible_cd_inflight_classifier")
|
||||
or ""
|
||||
)
|
||||
current_cd_latest_registry_status = str(
|
||||
rollups.get("current_main_cd_harbor_latest_registry_v2_status")
|
||||
or readback.get("latest_visible_cd_harbor_latest_registry_v2_status")
|
||||
or ""
|
||||
)
|
||||
current_cd_harbor_retrying = bool(
|
||||
rollups.get("current_main_cd_harbor_public_route_retrying_unavailable")
|
||||
is True
|
||||
or readback.get("latest_visible_cd_harbor_public_route_retrying_unavailable")
|
||||
is True
|
||||
or current_cd_inflight_classifier
|
||||
== "harbor_registry_public_route_unavailable_pending_retry"
|
||||
)
|
||||
boundary_violation = any(
|
||||
operation_boundaries.get(flag) is True
|
||||
for flag in (
|
||||
@@ -424,6 +454,7 @@ def _gitea_queue_readback(value: Any) -> dict[str, Any]:
|
||||
blockers = _gitea_queue_blockers(
|
||||
no_matching_runner=bool(no_matching_label),
|
||||
jobs_stale=jobs_stale,
|
||||
current_cd_harbor_retrying=current_cd_harbor_retrying,
|
||||
blocked=blocked,
|
||||
boundary_violation=boundary_violation,
|
||||
)
|
||||
@@ -438,6 +469,11 @@ def _gitea_queue_readback(value: Any) -> dict[str, Any]:
|
||||
"latest_visible_cd_run_commit_sha": str(
|
||||
readback.get("latest_visible_cd_run_commit_sha") or ""
|
||||
),
|
||||
"current_cd_inflight_classifier": current_cd_inflight_classifier,
|
||||
"current_cd_harbor_latest_registry_v2_status": (
|
||||
current_cd_latest_registry_status
|
||||
),
|
||||
"current_cd_harbor_retrying_unavailable": current_cd_harbor_retrying,
|
||||
"latest_visible_harbor_110_repair_run_id": str(
|
||||
readback.get("latest_visible_harbor_110_repair_run_id") or ""
|
||||
),
|
||||
@@ -471,10 +507,13 @@ def _gitea_queue_blockers(
|
||||
*,
|
||||
no_matching_runner: bool,
|
||||
jobs_stale: bool,
|
||||
current_cd_harbor_retrying: bool,
|
||||
blocked: bool,
|
||||
boundary_violation: bool,
|
||||
) -> list[str]:
|
||||
blockers: list[str] = []
|
||||
if current_cd_harbor_retrying:
|
||||
blockers.append("gitea_queue_current_cd_harbor_retrying_unavailable")
|
||||
if no_matching_runner:
|
||||
blockers.append("gitea_queue_harbor_110_repair_no_matching_runner")
|
||||
elif blocked:
|
||||
|
||||
@@ -219,6 +219,50 @@ def test_harbor_recovery_receipt_requires_deploy_marker_http_readback() -> None:
|
||||
assert payload["rollups"]["deploy_marker_verified"] is False
|
||||
|
||||
|
||||
def test_harbor_recovery_receipt_surfaces_inflight_cd_harbor_retry() -> None:
    """An in-flight CD Harbor retry must surface as a receipt blocker.

    The queue fixture reports a ``Running`` CD run whose in-flight classifier
    is the "public route unavailable, pending retry" state with a registry
    ``/v2`` status of ``502``. The validator is expected to stay blocked, list
    the retry blocker ahead of the Harbor repair queue blockers, and echo the
    retry fields in both the queue readback and the rollups.
    """
    retry_classifier = "harbor_registry_public_route_unavailable_pending_retry"
    expected_blockers = [
        "harbor_watchdog_check_receipt_missing",
        "public_registry_v2_verifier_not_green",
        "internal_registry_v2_verifier_not_green",
        "gitea_queue_current_cd_harbor_retrying_unavailable",
        "gitea_queue_harbor_110_repair_no_matching_runner",
        "gitea_queue_harbor_110_repair_jobs_stale_or_mismatched",
    ]

    payload = validate_harbor_registry_controlled_recovery_receipt(
        {
            "gitea_actions_queue_readback": (
                _gitea_queue_no_matching_runner_with_cd_retry()
            ),
        }
    )

    # Overall verdict: still blocked, same next step, blockers in this order.
    assert payload["status"] == "blocked_waiting_harbor_controlled_recovery_receipt"
    assert (
        payload["safe_next_step"]
        == "submit_non_secret_ssh_local_or_harbor_watchdog_receipt"
    )
    assert payload["active_blockers"] == expected_blockers

    # Per-queue readback echoes the CD retry state.
    queue = payload["readback"]["gitea_actions_queue"]
    assert queue["latest_visible_cd_run_status"] == "Running"
    assert queue["current_cd_inflight_classifier"] == retry_classifier
    assert queue["current_cd_harbor_latest_registry_v2_status"] == "502"
    assert queue["current_cd_harbor_retrying_unavailable"] is True

    # Rollups mirror the same fields under the ``gitea_queue_`` prefix.
    rollups = payload["rollups"]
    assert rollups["gitea_queue_current_cd_inflight_classifier"] == retry_classifier
    assert (
        rollups["gitea_queue_current_cd_harbor_latest_registry_v2_status"] == "502"
    )
    assert rollups["gitea_queue_current_cd_harbor_retrying_unavailable"] is True
    assert rollups["gitea_queue_blocker_count"] == 3
|
||||
|
||||
|
||||
def test_harbor_recovery_receipt_endpoint_redacts_raw_output() -> None:
|
||||
app = FastAPI()
|
||||
app.include_router(router, prefix="/api/v1")
|
||||
@@ -392,3 +436,27 @@ def _deploy_marker_stale() -> dict:
|
||||
"raw_session_or_sqlite_read_allowed": False,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _gitea_queue_no_matching_runner_with_cd_retry() -> dict:
    """Build the no-matching-runner queue fixture with a CD Harbor retry added.

    Starts from ``_gitea_queue_no_matching_runner()`` and overlays, on both its
    ``readback`` and ``rollups`` sections, a CD run that is retrying against an
    unavailable Harbor public route (classifier set, registry ``/v2`` status
    ``"502"``, retrying flag ``True``). The readback overlay also marks the
    latest visible CD run as ``"Running"``.
    """
    retry_classifier = "harbor_registry_public_route_unavailable_pending_retry"

    readback_overlay = {
        "latest_visible_cd_run_status": "Running",
        "latest_visible_cd_inflight_classifier": retry_classifier,
        "latest_visible_cd_harbor_latest_registry_v2_status": "502",
        "latest_visible_cd_harbor_public_route_retrying_unavailable": True,
    }
    rollups_overlay = {
        "current_main_cd_inflight_classifier": retry_classifier,
        "current_main_cd_harbor_latest_registry_v2_status": "502",
        "current_main_cd_harbor_public_route_retrying_unavailable": True,
    }

    fixture = _gitea_queue_no_matching_runner()
    fixture["readback"].update(readback_overlay)
    fixture["rollups"].update(rollups_overlay)
    return fixture
|
||||
|
||||
@@ -1,3 +1,18 @@
|
||||
## 2026-06-30 — 23:18 Harbor recovery receipt absorbs live CD 502 retry
|
||||
|
||||
**照主線修正的問題**:
|
||||
- 最新 Gitea main `a7b79b7b feat(agent): expose harbor receipt input contract` 的 CD `#4113` 仍是 `Running`,但 public queue 已讀到 `latest_visible_cd_inflight_classifier=harbor_registry_public_route_unavailable_pending_retry`、latest registry `/v2` status `502`、Harbor login attempt `8`。
|
||||
- Harbor repair workflow `#4112` 仍 `Waiting`,`latest_visible_harbor_110_repair_no_matching_runner_label=awoooi-host`,jobs API 仍是 stale/mismatched `ai-code-review`。這代表主線下一步仍是 110 control path / local Harbor repair,不是等 CD 自然完成。
|
||||
- `harbor_registry_controlled_recovery_receipt` 現在把 live queue 的 in-flight CD retry 轉成 receipt blocker:`gitea_queue_current_cd_harbor_retrying_unavailable`,並露出 `current_cd_inflight_classifier`、`current_cd_harbor_latest_registry_v2_status`、`current_cd_harbor_retrying_unavailable` 與 rollup。
|
||||
- 用 live queue 重跑 receipt validator 回 `status=blocked_waiting_harbor_controlled_recovery_receipt`、active blockers 包含 `harbor_watchdog_check_receipt_missing`、`public_registry_v2_verifier_not_green`、`internal_registry_v2_verifier_not_green`、`gitea_queue_current_cd_harbor_retrying_unavailable`、`gitea_queue_harbor_110_repair_no_matching_runner`、`gitea_queue_harbor_110_repair_jobs_stale_or_mismatched`。
|
||||
|
||||
**驗證**:
|
||||
- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api pytest apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py -q` 通過(5 passed)。
|
||||
- `py_compile`、`ruff check` 通過。
|
||||
- Live receipt readback 確認 `gitea_queue_current_cd_harbor_retrying_unavailable=true`、latest registry `/v2` status `502`、`gitea_queue_blocker_count=3`。
|
||||
|
||||
**邊界**:只改 receipt validator / tests / LOGBOOK / workplan;只讀 public Gitea queue 與本機 source;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。
|
||||
|
||||
## 2026-06-30 — 22:55 Harbor CD running retry blocker readback
|
||||
|
||||
**照主線修正的問題**:
|
||||
|
||||
@@ -18,8 +18,8 @@
|
||||
| P0-1 | BLOCKED | 全主機 cold-start / 10 分鐘自動恢復 SLO | 22:55 live cold-start artifact `/tmp/awoooi-cold-start-live-after-ff.log` 回 `PASS=68 WARN=4 BLOCKED=4`;blockers 是 110 registry external `/v2`、110 SSH read-only check、K3s registry pull refused by `110:5000`、SigNoz TLS / 502。22:31 SLO scorecard `/tmp/awoooi-reboot-slo-live-20260630-2231-scorecard.json` 仍回 `can_claim_all_services_recovered_within_target=false`;22:28 post-reboot summary `/tmp/awoooi-post-reboot-readiness-20260630-222856/summary.txt` 回 `SERVICE_GREEN=0`、`PRODUCT_DATA_GREEN=0`、`BACKUP_CORE_GREEN=0`、`HOST_188_SERVICE_GREEN=0`。 | 先修第一個 runtime blocker:110 control path / Harbor registry `/v2`。重跑同一 summary / cold-start / SLO scorecard 到 `SERVICE_GREEN=1`、`POST_START_BLOCKED=0`、`PASS` 無 BLOCKED、all-host required observed/reachable 且 `awoooi_reboot_auto_recovery_slo_ready=1`;不可只用 route 200 或 CD `Running` 宣稱恢復。 |
|
||||
| P0-2 | DONE_THIS_INCIDENT | 使用者可見 502:Tsenyang | `www.tsenyang.com` / `tsenyang.com` 由 502 恢復為 200;188 `tsenyang-website` container running;local `127.0.0.1:3000` 回 200。 | 下次同類 502 先查 release symlink / image / container;不先動 Nginx、DNS、DB、主機重啟。 |
|
||||
| P0-3 | BLOCKED | StockPlatform data freshness | 22:50 public `/api/v1/system/freshness` 與 `/api/v1/system/ingestion` 回 `status=not_configured`、`blockers=["postgres_not_ready"]`;public route `https://stock.wooo.work/` 為 200 只代表網站可達,不代表資料最新。 | 恢復 110 control path 後,read-only 查 `/home/wooo/stockplatform-v2` compose / DB schema / migration status;禁止 fake freshness、manual DB rows、restore/prune。 |
|
||||
| P0-4 | BLOCKED | AWOOOI production 版本最新性 | Gitea SSH `main` 最新已到 `152482c4f feat(agent): expose local console recovery phases`。Public Gitea queue 22:55 讀到 latest CD `#4105 Running`,但 `current_main_cd_inflight_classifier=harbor_registry_public_route_unavailable_pending_retry`、latest registry `/v2` status `502`、controlled repair skip reason `not_110_host`。production 尚不能證明已部署最新 source。 | 補 deploy marker / runtime SHA / endpoint readback 一致;Harbor `/v2` 恢復前 CD 無法把最新 source 發到 production,未一致前不可宣稱 AWOOOI 最新。 |
|
||||
| P0-5 | BLOCKED | 110 control path / Harbor registry `/v2` | 22:55 queue readback 回 `status=blocked_harbor_110_repair_no_matching_runner`,latest CD `#4105 Running` 但 registry retrying unavailable `true`、latest `/v2` status `502`;Harbor repair workflow 仍 `Waiting`,no-matching label `awoooi-host`。22:55 cold-start 同步證明 110 registry `/v2` blocked、110 SSH read-only check blocked、K3s pull refused by `110:5000`。 | 讓 110-local repair workflow 或 110 console/local script 真正執行 `recover-110-control-path-and-harbor-local.sh --check` / `--apply-all`,並讀回 public/internal `/v2` 為 `200/401`。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 |
|
||||
| P0-4 | BLOCKED | AWOOOI production 版本最新性 | Gitea SSH `main` 最新已到 `a7b79b7b feat(agent): expose harbor receipt input contract`。Public Gitea queue 23:18 讀到 latest CD `#4113 Running`,但 `current_main_cd_inflight_classifier=harbor_registry_public_route_unavailable_pending_retry`、latest registry `/v2` status `502`、Harbor login attempt `8`。production 尚不能證明已部署最新 source。 | 補 deploy marker / runtime SHA / endpoint readback 一致;Harbor `/v2` 恢復前 CD 無法把最新 source 發到 production,未一致前不可宣稱 AWOOOI 最新。 |
|
||||
| P0-5 | BLOCKED | 110 control path / Harbor registry `/v2` | 23:18 queue readback 回 `status=blocked_harbor_110_repair_no_matching_runner`;Harbor repair workflow `#4112` 仍 `Waiting`,no-matching label `awoooi-host`,jobs API 仍 stale/mismatched `ai-code-review`。Harbor receipt validator 對 live queue 回 `status=blocked_waiting_harbor_controlled_recovery_receipt`,active blockers 含 `gitea_queue_current_cd_harbor_retrying_unavailable`、`gitea_queue_harbor_110_repair_no_matching_runner`、`gitea_queue_harbor_110_repair_jobs_stale_or_mismatched`、`public_registry_v2_verifier_not_green`、`internal_registry_v2_verifier_not_green`。22:55 cold-start 同步證明 110 registry `/v2` blocked、110 SSH read-only check blocked、K3s pull refused by `110:5000`。 | 讓 110-local repair workflow 或 110 console/local script 真正執行 `recover-110-control-path-and-harbor-local.sh --check` / `--apply-all`,並讀回 public/internal `/v2` 為 `200/401`。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 |
|
||||
| P0-6 | BLOCKED_BACKUP_COMPLETENESS | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 4 個 public repo;`stockplatform-v2` public page/API 404,但 internal `git ls-remote` 成功;188 `/home/ollama/backup/110/gitea` 起初為空。已建立 verified emergency bundle `/home/ollama/backup/110/gitea/git-bundles/20260630-190931`:4 個 public/internal repo bundle verify + checksum 成功,`AwoooGo`、`stockplatform-v2`、`vibework` 因 private auth fail-closed。20:18 summary 因 110 `backup-status` 不可讀回,`BACKUP_CORE_GREEN=0`、`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`。 | 188 `gitea_repo_mirror_from_110` subtree metric / alert 已補;下一步仍是恢復 110 SSH command path 後跑正式 `gitea dump`、private repo 非互動備份、repo count、backup-status 與 restore drill readback。unknown 不得當作 backup / DR green。 |
|
||||
| P0-7 | SOURCE_READY_RUNTIME_BLOCKED | 99 VMware / VM autostart | repo 已有 `windows99-vmware-autostart.ps1`;22:05 host probe 讀到 99 ping reachable 但 `boot_id=reachable_unknown_boot` / uptime unknown,111 不可達,112/120/121/188 可讀,188 startup unit failed/degraded。先前只讀 readback 顯示 99 RDP 3389 / SSH 22 可達、WinRM 5985 fail,`administrator@192.168.0.99` SSH publickey denied。 | 恢復 99 可控通道或由 console 套用腳本;完成後讀回 111/188/120/121/112 boot evidence,要求 all-host required observed/reachable 且 99 不再是 unknown uptime。 |
|
||||
| P0-8 | SOURCE_READY_RUNTIME_BLOCKED | 502 maintenance fallback / Telegram / backup alert | L0/L1 fallback runbook、Nginx snippet、reboot / backup alert rules 已在 source;runtime 尚需部署與外部 L1 provider readback。 | L0 以測試 vhost 驗證 `X-AWOOOI-Fallback`;L1 需外部雲端/CDN probe;Telegram 以脫敏 alert receipt 驗證。 |
|
||||
|
||||
Reference in New Issue
Block a user