diff --git a/apps/api/src/services/run_state_machine.py b/apps/api/src/services/run_state_machine.py index 3fda1f26..e84c2b41 100644 --- a/apps/api/src/services/run_state_machine.py +++ b/apps/api/src/services/run_state_machine.py @@ -29,7 +29,7 @@ from __future__ import annotations import socket import uuid -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta from typing import TYPE_CHECKING import structlog @@ -66,6 +66,11 @@ TERMINAL_STATES = frozenset({"completed", "failed", "cancelled", "timeout"}) _WORKER_ID = f"{socket.gethostname()}:{uuid.uuid4().hex[:8]}" +def _utc_now_naive() -> datetime: + """Return UTC now matching AwoooP timestamp-without-timezone columns.""" + return datetime.now(UTC).replace(tzinfo=None) + + # ───────────────────────────────────────────────────────────────────────────── # FSM 驗證 # ───────────────────────────────────────────────────────────────────────────── @@ -98,8 +103,8 @@ async def acquire_pending_run( 同時只有一個 worker 可取得同一筆 run(PostgreSQL SKIP LOCKED 保證)。 Returns None 表示目前沒有待處理的 run。 """ - lease_until = datetime.now(timezone.utc) + timedelta(seconds=LEASE_TTL_SECONDS) - now = datetime.now(timezone.utc) + now = _utc_now_naive() + lease_until = now + timedelta(seconds=LEASE_TTL_SECONDS) async with get_db_context(project_id) as db: # SKIP LOCKED:其他 worker 已鎖定的 row 直接跳過 @@ -154,9 +159,10 @@ async def acquire_pending_run( return run -async def heartbeat(run_id: "UUID", project_id: str) -> None: +async def heartbeat(run_id: UUID, project_id: str) -> None: """更新 run 的 heartbeat + 延長 lease TTL""" - new_lease = datetime.now(timezone.utc) + timedelta(seconds=LEASE_TTL_SECONDS) + now = _utc_now_naive() + new_lease = now + timedelta(seconds=LEASE_TTL_SECONDS) async with get_db_context(project_id) as db: await db.execute( update(AwoooPRunState) @@ -165,14 +171,14 @@ async def heartbeat(run_id: "UUID", project_id: str) -> None: AwoooPRunState.state == "running", ) .values( - heartbeat_at=datetime.now(timezone.utc), + heartbeat_at=now, lease_until=new_lease, ) ) async def transition( - run_id: "UUID", + run_id: UUID, project_id: str, to_state: str, *, @@ -214,7 +220,7 @@ async def transition( if step_count_delta: values["step_count"] = AwoooPRunState.step_count + step_count_delta if to_state in TERMINAL_STATES: - values["completed_at"] = datetime.now(timezone.utc) + values["completed_at"] = _utc_now_naive() values["lease_until"] = None values["worker_id"] = None @@ -245,7 +251,7 @@ async def reap_stale_runs(project_id: str) -> int: Returns: 處理的 stale run 數 """ - now = datetime.now(timezone.utc) + now = _utc_now_naive() reaped = 0 async with get_db_context(project_id) as db: diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 70691cc6..dfba3860 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -27,6 +27,7 @@ - `ops/monitoring/alerts*.yml`:新增 `HostLoadAverageSustainedHigh`、`DockerContainerCpuSustainedHigh`、`DockerContainerCpuRunawayCritical`、`DockerContainerMemoryLimitPressure`、`DockerContainerRestartSpike`。 - `apps/api/alert_rules.yaml`:新增 Docker/Host 過載路由,強制走 `SSH_DIAGNOSE`,禁止通用 docker restart。 - API GitOps:用最新 `main` (`a57e3d3d`) 加本次兩個 API 修補檔,在 188 建置並推送 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-a57e3d3`;`k8s/awoooi-prod/kustomization.yaml` 指向此 tag,避免手動 `kubectl set image` 被 Argo 回滾。 +- API follow-up:新 image 上線後發現 AwoooP worker stale reaper 送 timezone-aware datetime 到 `TIMESTAMP WITHOUT TIME ZONE` 欄位,補 `_utc_now_naive()`,重建 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-e8e6748` 並將 GitOps tag 更新到此版。 - `docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md`:記錄 live 配額盤點、baseline policy、反模式與下一步 rollout 順序。 - Prometheus 已 reload,97 條規則載入;新 baseline rules 全部存在。 @@ -38,8 +39,9 @@ - Prometheus 新 baseline alerts 查詢目前無 firing。 - 新規則目前 pending:110 `HostLoadAverageSustainedHigh`、110 `DockerContainerCpuSustainedHigh` for Sentry ClickHouse。 - `apps/api/.venv/bin/python -m pytest apps/api/tests/test_classify_alert_early.py apps/api/tests/test_alert_rule_engine_validation.py -q` → 89 passed。 +- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/run_state_machine.py` + `py_compile` → passed。 - `ruff check apps/api/src/services/proactive_inspector.py`、`py_compile scripts/ops/docker-stats-textfile-exporter.py`、`git diff --check` → passed。 -- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-a57e3d3`。 +- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-e8e6748`。 **下一步**: - 不要再降低 ClickHouse / Kafka memory limit;先觀察 backlog drain。 diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index b3300fc3..a910aa7f 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -39,7 +39,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: resource-baseline-20260505-a57e3d3 + newTag: resource-baseline-20260505-e8e6748 - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web newTag: 00684403887745e35848bbbab5ac795cfdd6fd58