fix(api): skip bootstrap ddl statement timeout
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m7s
CD Pipeline / build-and-deploy (push) Successful in 5m1s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 57s
CD Pipeline / post-deploy-checks (push) Successful in 3m23s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m7s
CD Pipeline / build-and-deploy (push) Successful in 5m1s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 57s
CD Pipeline / post-deploy-checks (push) Successful in 3m23s
This commit is contained in:
@@ -216,6 +216,29 @@ def _is_database_connection_budget_error(exc: BaseException) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_database_bootstrap_ddl_timeout(exc: BaseException) -> bool:
|
||||||
|
"""Return True only for optional bootstrap DDL canceled by DB timeout."""
|
||||||
|
seen: set[int] = set()
|
||||||
|
current: BaseException | None = exc
|
||||||
|
timeout_markers = (
|
||||||
|
"querycancelederror",
|
||||||
|
"canceling statement due to statement timeout",
|
||||||
|
"statement timeout",
|
||||||
|
)
|
||||||
|
|
||||||
|
while current is not None and id(current) not in seen:
|
||||||
|
seen.add(id(current))
|
||||||
|
message = f"{type(current).__name__}: {current}".lower()
|
||||||
|
if any(marker in message for marker in timeout_markers):
|
||||||
|
return True
|
||||||
|
current = (
|
||||||
|
getattr(current, "orig", None)
|
||||||
|
or getattr(current, "__cause__", None)
|
||||||
|
or getattr(current, "__context__", None)
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def init_db() -> None:
|
async def init_db() -> None:
|
||||||
"""
|
"""
|
||||||
Initialize database tables
|
Initialize database tables
|
||||||
@@ -251,6 +274,14 @@ async def init_db() -> None:
|
|||||||
timeout_seconds=_DB_BOOTSTRAP_DDL_WAIT_SECONDS,
|
timeout_seconds=_DB_BOOTSTRAP_DDL_WAIT_SECONDS,
|
||||||
lock_name=_DB_BOOTSTRAP_LOCK_NAME,
|
lock_name=_DB_BOOTSTRAP_LOCK_NAME,
|
||||||
)
|
)
|
||||||
|
except DBAPIError as exc:
|
||||||
|
if not _is_database_bootstrap_ddl_timeout(exc):
|
||||||
|
raise
|
||||||
|
logger.warning(
|
||||||
|
"database_bootstrap_statement_timeout_skipped",
|
||||||
|
error_type=type(exc).__name__,
|
||||||
|
lock_name=_DB_BOOTSTRAP_LOCK_NAME,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
await lock_conn.execute(
|
await lock_conn.execute(
|
||||||
text("SELECT pg_advisory_unlock(hashtext(:lock_name))"),
|
text("SELECT pg_advisory_unlock(hashtext(:lock_name))"),
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from collections.abc import Awaitable
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from sqlalchemy.exc import DBAPIError
|
||||||
|
|
||||||
|
|
||||||
class _FakeScalarResult:
|
class _FakeScalarResult:
|
||||||
@@ -190,6 +191,29 @@ async def test_init_db_releases_bootstrap_lock_when_ddl_times_out(monkeypatch):
|
|||||||
assert any("pg_advisory_unlock" in stmt for stmt in fake_engine.lock_conn.statements)
|
assert any("pg_advisory_unlock" in stmt for stmt in fake_engine.lock_conn.statements)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_init_db_skips_bootstrap_when_postgres_statement_timeout(monkeypatch):
    """init_db() must not raise when bootstrap DDL hits a statement timeout.

    The DDL runner is replaced with a stub that raises a ``DBAPIError``
    wrapping a "canceling statement due to statement timeout" error. The
    test then checks the statements recorded on the fake lock connection:
    the lock attempt comes first, an unlock was issued, and a COMMIT was sent.
    """
    from src.db import base as db_base

    engine_stub = _FakeEngine()

    async def _raise_statement_timeout(_engine: object) -> None:
        driver_error = Exception("canceling statement due to statement timeout")
        raise DBAPIError(
            "ALTER TABLE approval_records ADD COLUMN IF NOT EXISTS telegram_message_id",
            {},
            driver_error,
        )

    monkeypatch.setattr(db_base, "get_engine", lambda: engine_stub)
    monkeypatch.setattr(db_base, "_run_init_db_ddl", _raise_statement_timeout)

    # Must complete without propagating the DBAPIError.
    await db_base.init_db()

    recorded = engine_stub.lock_conn.statements
    assert "pg_try_advisory_lock" in recorded[0]
    assert any("pg_advisory_unlock" in stmt for stmt in recorded)
    assert "COMMIT" in recorded
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_signal_worker_initializes_worker_redis_pool_before_tasks(monkeypatch):
|
async def test_signal_worker_initializes_worker_redis_pool_before_tasks(monkeypatch):
|
||||||
from src.workers import signal_worker
|
from src.workers import signal_worker
|
||||||
|
|||||||
@@ -1,3 +1,18 @@
|
|||||||
|
## 2026-07-02 — 14:12 API rollout CrashLoop root cause 與 bootstrap DDL timeout 修復
|
||||||
|
|
||||||
|
**完成內容**:
|
||||||
|
- `f9469bcc2` CD rollout 期間 production API 變成 `502`;read-only K8s 查詢確認 `awoooi-api` 新 pod `CrashLoopBackOff`,web / worker / auto-repair-canary 皆 Running。
|
||||||
|
- `kubectl logs --previous` 顯示 API startup 在 `init_db()` 執行 `ALTER TABLE approval_records ADD COLUMN IF NOT EXISTS telegram_message_id...` 時被 PostgreSQL `statement_timeout` 取消,SQLAlchemy 包成 `DBAPIError` 後重新 raise,導致 container exit 3。
|
||||||
|
- `apps/api/src/db/base.py` 新增 `_is_database_bootstrap_ddl_timeout()`,只針對 optional bootstrap DDL 的 `QueryCanceledError` / `canceling statement due to statement timeout` fail-visible skip;其他 DBAPIError 仍 raise,避免吞真正 migration / SQL 錯誤。
|
||||||
|
- `apps/api/tests/test_runtime_bootstrap_guards.py` 新增 regression:模擬 `DBAPIError(... statement timeout ...)` 時,`init_db()` 必須釋放 advisory lock、commit unlock,且不讓 API startup crash。
|
||||||
|
|
||||||
|
**驗證**:
|
||||||
|
- `python3.11 -m py_compile apps/api/src/db/base.py`
|
||||||
|
- `DATABASE_URL=postgresql+asyncpg://test:test@localhost/test python3.11 -m pytest apps/api/tests/test_runtime_bootstrap_guards.py -q`:`9 passed`。
|
||||||
|
|
||||||
|
**仍維持**:
|
||||||
|
- 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機 / Docker / Nginx / K3s / DB / firewall;K8s 查詢為 read-only。
|
||||||
|
|
||||||
## 2026-07-02 — 13:55 Telegram 告警 receipt / AI route coverage 缺口讀回
|
## 2026-07-02 — 13:55 Telegram 告警 receipt / AI route coverage 缺口讀回
|
||||||
|
|
||||||
**完成內容**:
|
**完成內容**:
|
||||||
|
|||||||
Reference in New Issue
Block a user