diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index b571aacfc..d37f76312 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -1706,58 +1706,78 @@ jobs: import json import os import sys + import time import urllib.request expected = os.environ["IMAGE_TAG"].strip().lower() expected_short = expected[:10] url = "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench" - try: - with urllib.request.urlopen(url, timeout=20) as response: - payload = json.load(response) - except Exception as exc: - print( - "production_workbench_deploy_readback_failed=" - f"{type(exc).__name__}", - file=sys.stderr, - ) - raise SystemExit(1) from exc - - summary = payload.get("summary") if isinstance(payload, dict) else {} - if not isinstance(summary, dict): - summary = {} - runtime_short = str( - summary.get("production_deploy_runtime_build_commit_short_sha") or "" - ) - desired_short = str( - summary.get("production_deploy_desired_main_api_image_tag_short_sha") - or "" - ) - desired_status = str( - summary.get( - "production_deploy_desired_main_api_image_tag_readback_status" - ) - or "" - ) - matches_main = summary.get("production_deploy_image_tag_matches_main") is True - if ( - runtime_short != expected_short - or desired_short != expected_short - or desired_status != "ok" - or not matches_main - ): - print( - "production_deploy_readback_mismatch=" - f"expected={expected_short};runtime={runtime_short};" - f"desired={desired_short};desired_status={desired_status};" - f"matches_main={matches_main}", - file=sys.stderr, - ) - raise SystemExit(1) + attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18")) + sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10")) + last_error = "" + for attempt in range(1, attempts + 1): + try: + with urllib.request.urlopen(url, timeout=20) as response: + payload = json.load(response) + except Exception as exc: + last_error = f"fetch_failed={type(exc).__name__}" + print( + "production_deploy_readback_attempt=" + f"{attempt}/{attempts};{last_error}", + file=sys.stderr, + ) + else: + summary = payload.get("summary") if isinstance(payload, dict) else {} + if not isinstance(summary, dict): + summary = {} + runtime_short = str( + summary.get("production_deploy_runtime_build_commit_short_sha") + or "" + ) + desired_short = str( + summary.get( + "production_deploy_desired_main_api_image_tag_short_sha" + ) + or "" + ) + desired_status = str( + summary.get( + "production_deploy_desired_main_api_image_tag_readback_status" + ) + or "" + ) + matches_main = ( + summary.get("production_deploy_image_tag_matches_main") is True + ) + if ( + runtime_short == expected_short + and desired_short == expected_short + and desired_status == "ok" + and matches_main + ): + print( + "✅ Production deploy readback matches Gitea main desired " + f"image tag ({expected_short}) on attempt {attempt}/{attempts}" + ) + raise SystemExit(0) + last_error = ( + f"expected={expected_short};runtime={runtime_short};" + f"desired={desired_short};desired_status={desired_status};" + f"matches_main={matches_main}" + ) + print( + "production_deploy_readback_attempt=" + f"{attempt}/{attempts};{last_error}", + file=sys.stderr, + ) + if attempt < attempts: + time.sleep(sleep_seconds) print( - "✅ Production deploy readback matches Gitea main desired image tag " - f"({expected_short})" + "production_deploy_readback_mismatch=" + last_error, + file=sys.stderr, ) + raise SystemExit(1) PY fi diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 9d3a65656..bf45d6392 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,20 @@ +## 2026-06-30 — 09:24 production deploy readback bounded poll + +**照主線處理的問題**: +- Gitea CD `#4014` 對 `eb137bb4e fix(recovery): bound cold-start monitor probes` 轉 Failure;public job log 顯示 tests job 成功、build-and-deploy 已 build/push、deploy marker `90daa544f chore(cd): deploy eb137bb [skip ci]` 已 push、ArgoCD `Synced/Healthy`、三個 deployment rollout 成功、API health 200。 +- 失敗點不是 runner label、runner token 或測試,而是 deploy 完成後立刻讀 production Workbench 仍看到上一版:`production_deploy_readback_mismatch=expected=eb137bb4e0;runtime=4295b3383a;desired=4295b3383a;desired_status=ok;matches_main=True`。 +- `.gitea/workflows/cd.yaml` 已把 production deploy readback 從單次 request 改成 bounded polling:預設 `DEPLOY_READBACK_ATTEMPTS=18`、`DEPLOY_READBACK_SLEEP_SECONDS=10`,每次輸出 `production_deploy_readback_attempt=`;若最後仍 mismatch 才 fail-closed。 +- `ops/runner/test_cd_controlled_runtime_profile.py` 已鎖住 readback polling contract,避免未來又回到 rollout 後單次瞬讀 false failure。 + +**驗證**: +- CD profile guard + bounded monitor tests:`29 passed`。 +- Gitea runner pressure guard:`workflow_files=11`、`auto_branch_events_on_110=0`、`generic_runner_labels=0`。 +- Gitea step env secret guard:`no Gitea run/with secrets or legacy Telegram routes`。 +- `git diff --check`:通過。 +- Production Workbench 在 rollout 後延遲收斂已讀回 `production_deploy_runtime_build_commit_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_readback_status=ok`、`production_deploy_image_tag_matches_main=true`。 + +**邊界**:未 workflow_dispatch,未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未手動改 K8s / DB,未讀 secret / token / raw sessions / SQLite / `.env`,未使用 GitHub / `gh` / GitHub API。 + ## 2026-06-30 — 09:07 P0-006 cold-start monitor bounded probe hardening **照主線處理的問題**: diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index fbb2a6d10..f4362cdab 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -55,6 +55,10 @@ def test_cd_requires_production_deploy_readback_after_rollout() -> None: assert "apps/api/tests/test_awoooi_production_deploy_readback_blocker.py)" in text assert "tests/test_awoooi_production_deploy_readback_blocker.py" in text assert "production_deploy_readback_mismatch=" in text + assert 'attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18"))' in text + assert 'sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))' in text + assert "production_deploy_readback_attempt=" in text + assert "time.sleep(sleep_seconds)" in text assert "production_deploy_runtime_build_commit_short_sha" in text assert "production_deploy_desired_main_api_image_tag_short_sha" in text assert "production_deploy_desired_main_api_image_tag_readback_status" in text