From fdbbe408c392a77f0a72757c4601027b95244b0f Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 17:03:30 +0800 Subject: [PATCH] fix(db): cap production connection pool budget --- .gitea/workflows/cd.yaml | 18 ++++++++ docs/LOGBOOK.md | 40 +++++++++++++++++ k8s/awoooi-prod/06-deployment-api.yaml | 7 +++ k8s/awoooi-prod/08-deployment-worker.yaml | 6 +++ .../10-deployment-auto-repair-canary.yaml | 6 ++- .../test_cd_controlled_runtime_profile.py | 44 +++++++++++++++++++ 6 files changed, 119 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 46cec185..7afa436f 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -238,8 +238,14 @@ jobs: # controlled profile so non-110 CD does not fall into B5's Docker # socket path just because the previous deploy recorded image # truth. + k8s/awoooi-prod/04-configmap.yaml) + ;; k8s/awoooi-prod/06-deployment-api.yaml) ;; + k8s/awoooi-prod/08-deployment-worker.yaml) + ;; + k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml) + ;; k8s/awoooi-prod/kustomization.yaml) ;; product.awoooi.yaml) @@ -294,6 +300,10 @@ jobs: ;; apps/api/src/api/v1/webhooks.py) ;; + apps/api/src/core/config.py) + ;; + apps/api/src/db/base.py) + ;; apps/api/src/services/agent_replay_normalizer.py) ;; apps/api/src/services/ai_agent_log_intelligence_integration_readback.py) @@ -438,8 +448,12 @@ jobs: ;; apps/api/tests/test_awooop_operator_timeline_labels.py) ;; + apps/api/tests/test_config_url_validation.py) + ;; apps/api/tests/test_delivery_closure_workbench_api.py) ;; + apps/api/tests/test_runtime_bootstrap_guards.py) + ;; apps/api/tests/test_backup_dr_target_inventory.py) ;; apps/api/tests/test_backup_dr_target_inventory_api.py) @@ -668,6 +682,8 @@ jobs: if [ "${AWOOOI_CD_TEST_PROFILE:-full}" = "controlled-runtime" ]; then echo "✅ controlled-runtime profile: running focused replay/auto-approve/copy tests" python3.11 -m py_compile \ + src/core/config.py \ + src/db/base.py \ src/api/v1/platform/events.py \ src/api/v1/agents.py \ src/api/v1/iwooos.py \ @@ -768,7 +784,9 @@ jobs: tests/test_destructive_patterns.py \ tests/test_approval_pending_visibility.py \ tests/test_awooop_operator_timeline_labels.py::test_outbound_timeline_title_labels_runbook_review \ + tests/test_config_url_validation.py \ tests/test_delivery_closure_workbench_api.py \ + tests/test_runtime_bootstrap_guards.py \ tests/test_backup_dr_target_inventory.py \ tests/test_backup_dr_target_inventory_api.py \ tests/test_backup_dr_readiness_matrix.py \ diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 75006f39..22244f28 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,15 @@ +## 2026-07-01 — 17:08 Gitea CD #4269 B5 socket blocker / DB pool rollout profile 收斂 + +**照主線修正的問題**: +- Gitea API、private repo heads、Harbor health 與 registry `/v2/` 已讀回正常;上一輪 CD `#4268` tests / build / deploy 成功,production deploy readback 已能對齊 Gitea desired image tag `1578b13fad`。 +- 後續 DB pool source 修復 commit `4561c65fe` 觸發 CD `#4269`,API 全量測試已跑到 `3642 passed, 23 skipped`,真正失敗點是 B5 真 DB 整合段落在 non-110 runner 內讀回 `BLOCKER b5_docker_socket_unavailable`,導致 build/deploy 被擋;這是 runner Docker socket 能力缺失,不是 DB pool 測試本身失敗。 +- 本輪把 production DB pool / rollout guard 的 source 與 tests 納入 `.gitea/workflows/cd.yaml` controlled-runtime profile:`apps/api/src/core/config.py`、`apps/api/src/db/base.py`、`apps/api/tests/test_config_url_validation.py`、`apps/api/tests/test_runtime_bootstrap_guards.py`、`k8s/awoooi-prod/04-configmap.yaml`、`06-deployment-api.yaml`、`08-deployment-worker.yaml`、`10-deployment-auto-repair-canary.yaml`。 +- controlled-runtime focused pytest 補跑 `test_config_url_validation.py` 與 `test_runtime_bootstrap_guards.py`,同時保留 `ops/runner/test_cd_controlled_runtime_profile.py` guard,避免 DB pool / GitOps rollout 小修再次掉入 B5 Docker socket 路徑。 + +**邊界**:未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未重啟主機 / Docker / Nginx / K3s / DB / firewall;未 force push。 + +**下一步**:完成 rebase、跑本地精準驗證、推 Gitea main 觸發下一個 CD,讀回 tests / build-and-deploy / post-deploy 與 production image freshness;若仍失敗,繼續只針對最新 run 的失敗段落收斂。 + ## 2026-07-01 — 16:32 AI Loop registry-ready / deploy-marker blocker separation **照主線修正的問題**: @@ -51679,6 +51691,34 @@ production browser smoke: **下一步**: - commit / push 後讀回新的 Gitea CD run;確認 tests、build、deploy readback 都通過,再讀 production health / ArgoCD / image tag 證據。 +## 2026-07-01 — 17:02 Production 502 / DB connection budget recovery + +**完成內容**: +- Gitea CD `#4268` 最終 `Success`,deploy readback 在第 `30/36` 次收斂:`Production deploy readback matches Gitea main desired image tag (1578b13fad)`。 +- CD rollout 期間 public API 一度 502;K8s 讀回 `awoooi-api` 為 `CrashLoopBackOff`,API logs 顯示根因是 `asyncpg.exceptions.TooManyConnectionsError: too many connections for role "awoooi"`。 +- 受控恢復:暫時 scale down worker / auto-repair-canary 釋放連線並刪除 CrashLoop API pod,API 於 `16:57` 讀回 `health_http=200`;ArgoCD 隨後恢復 worker / canary,四個 deployment 全部 Ready。 +- production deploy summary 讀回:`runtime=1578b13fad`、`desired=1578b13fad`、`desired_status=ok`、`production_deploy_image_tag_matches_main=True`、`production_deploy_status=closure_verified`。 +- Source 防再發修法: + - `apps/api/src/core/config.py` 新增 `DATABASE_POOL_SIZE`、`DATABASE_MAX_OVERFLOW`。 + - `apps/api/src/db/base.py` 改用設定值建立 SQLAlchemy async engine。 + - prod API / worker manifest 設 `DATABASE_POOL_SIZE=1`、`DATABASE_MAX_OVERFLOW=0`。 + - worker / auto-repair-canary rollout 改 `maxSurge=0`、`maxUnavailable=1`,避免 rollout 時新舊 pod 疊加搶 DB 連線。 + +**本地驗證結果**: +- `python3.11 -m py_compile apps/api/src/core/config.py apps/api/src/db/base.py ops/runner/test_cd_controlled_runtime_profile.py`:通過。 +- `DATABASE_URL=postgresql://test:test@localhost:5432/test python3.11 -m pytest apps/api/tests/test_config_url_validation.py ops/runner/test_cd_controlled_runtime_profile.py -q`:`53 passed`。 +- `python3 ops/runner/guard-gitea-runner-pressure.py --root .`:通過。 +- `node scripts/ci/check-gitea-step-env-secrets.js`:通過。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。 + +**下一步**: +- commit / push DB pool budget 修法;讀回 Gitea CD、production health、deployment image/env 與 DB connection error 是否消失。 + ## 2026-07-01 — 08:55 Truth-chain hot lookup helper test 對齊 **完成內容**: diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index 30322e43..71ada64d 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -90,6 +90,13 @@ spec: # Production readback compares runtime image truth against this # GitOps desired tag instead of doing a slow Gitea raw fetch. value: "1578b13fad9fcbc749179aea22ff24959b5886cc" + - name: DATABASE_POOL_SIZE + # 2026-07-01 Codex: production role `awoooi` currently has a low + # connection limit. Keep API pool conservative until DB role + # limit is raised and verified. + value: "1" + - name: DATABASE_MAX_OVERFLOW + value: "0" - name: IWOOOS_WAZUH_READONLY_ENABLED # 2026-06-30 Codex: controlled GitOps enablement after owner # metadata, manager registry acceptance, dry-run, rollback, and diff --git a/k8s/awoooi-prod/08-deployment-worker.yaml b/k8s/awoooi-prod/08-deployment-worker.yaml index 7c037564..8fc97f45 100644 --- a/k8s/awoooi-prod/08-deployment-worker.yaml +++ b/k8s/awoooi-prod/08-deployment-worker.yaml @@ -85,6 +85,12 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name + - name: DATABASE_POOL_SIZE + # 2026-07-01 Codex: keep worker DB usage inside the current + # production role connection budget during reboot rollouts. + value: "1" + - name: DATABASE_MAX_OVERFLOW + value: "0" resources: requests: cpu: "100m" diff --git a/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml b/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml index 6818b012..3f308bc0 100644 --- a/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml +++ b/k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml @@ -24,8 +24,10 @@ spec: strategy: type: RollingUpdate rollingUpdate: - maxSurge: 1 - maxUnavailable: 0 + # 2026-07-01 Codex: keep canary rollout non-overlapping during + # post-reboot DB connection pressure recovery. + maxSurge: 0 + maxUnavailable: 1 template: metadata: labels: diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index 47335b88..5d96159e 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -11,6 +11,11 @@ HARBOR_110_REPAIR_WORKFLOW = ( ROOT / ".gitea" / "workflows" / "harbor-110-local-repair.yaml" ) WAIT_HOST_PRESSURE = ROOT / "scripts" / "ci" / "wait-host-web-build-pressure.sh" +PROD_API_DEPLOYMENT = ROOT / "k8s" / "awoooi-prod" / "06-deployment-api.yaml" +PROD_WORKER_DEPLOYMENT = ROOT / "k8s" / "awoooi-prod" / "08-deployment-worker.yaml" +PROD_CANARY_DEPLOYMENT = ( + ROOT / "k8s" / "awoooi-prod" / "10-deployment-auto-repair-canary.yaml" +) def _workflow_text() -> str: @@ -35,10 +40,49 @@ def test_product_manifest_changes_stay_on_controlled_runtime_profile() -> None: def test_deploy_marker_k8s_files_stay_on_controlled_runtime_profile() -> None: text = _workflow_text() assert "build-and-deploy writes only these GitOps" in text + assert "k8s/awoooi-prod/04-configmap.yaml)" in text assert "k8s/awoooi-prod/06-deployment-api.yaml)" in text + assert "k8s/awoooi-prod/08-deployment-worker.yaml)" in text + assert "k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml)" in text assert "k8s/awoooi-prod/kustomization.yaml)" in text +def test_prod_db_pool_recovery_sources_stay_on_controlled_runtime_profile() -> None: + text = _workflow_text() + expected_sources = [ + "apps/api/src/core/config.py)", + "apps/api/src/db/base.py)", + "apps/api/tests/test_config_url_validation.py)", + "apps/api/tests/test_runtime_bootstrap_guards.py)", + "src/core/config.py", + "src/db/base.py", + "tests/test_config_url_validation.py", + "tests/test_runtime_bootstrap_guards.py", + ] + for source in expected_sources: + assert source in text + + +def test_prod_db_pool_budget_and_non_overlap_rollouts_are_source_controlled() -> None: + api = PROD_API_DEPLOYMENT.read_text(encoding="utf-8") + worker = PROD_WORKER_DEPLOYMENT.read_text(encoding="utf-8") + canary = PROD_CANARY_DEPLOYMENT.read_text(encoding="utf-8") + + assert "DATABASE_POOL_SIZE" in api + assert "value: \"1\"" in api + assert "DATABASE_MAX_OVERFLOW" in api + assert "value: \"0\"" in api + + assert "DATABASE_POOL_SIZE" in worker + assert "DATABASE_MAX_OVERFLOW" in worker + assert "maxSurge: 0" in worker + assert "maxUnavailable: 1" in worker + + assert "post-reboot DB connection pressure recovery" in canary + assert "maxSurge: 0" in canary + assert "maxUnavailable: 1" in canary + + def test_workflow_secret_transport_sources_stay_on_controlled_runtime_profile() -> None: text = _workflow_text() assert "workflow secret-transport and guard-only" in text