fix(awooop): use naive utc for run lease timestamps
This commit is contained in:
@@ -29,7 +29,7 @@ from __future__ import annotations
|
||||
|
||||
import socket
|
||||
import uuid
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
@@ -66,6 +66,11 @@ TERMINAL_STATES = frozenset({"completed", "failed", "cancelled", "timeout"})
|
||||
_WORKER_ID = f"{socket.gethostname()}:{uuid.uuid4().hex[:8]}"
|
||||
|
||||
|
||||
def _utc_now_naive() -> datetime:
|
||||
"""Return UTC now matching AwoooP timestamp-without-timezone columns."""
|
||||
return datetime.now(UTC).replace(tzinfo=None)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# FSM 驗證
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
@@ -98,8 +103,8 @@ async def acquire_pending_run(
|
||||
同時只有一個 worker 可取得同一筆 run(PostgreSQL SKIP LOCKED 保證)。
|
||||
Returns None 表示目前沒有待處理的 run。
|
||||
"""
|
||||
lease_until = datetime.now(timezone.utc) + timedelta(seconds=LEASE_TTL_SECONDS)
|
||||
now = datetime.now(timezone.utc)
|
||||
now = _utc_now_naive()
|
||||
lease_until = now + timedelta(seconds=LEASE_TTL_SECONDS)
|
||||
|
||||
async with get_db_context(project_id) as db:
|
||||
# SKIP LOCKED:其他 worker 已鎖定的 row 直接跳過
|
||||
@@ -154,9 +159,10 @@ async def acquire_pending_run(
|
||||
return run
|
||||
|
||||
|
||||
async def heartbeat(run_id: "UUID", project_id: str) -> None:
|
||||
async def heartbeat(run_id: UUID, project_id: str) -> None:
|
||||
"""更新 run 的 heartbeat + 延長 lease TTL"""
|
||||
new_lease = datetime.now(timezone.utc) + timedelta(seconds=LEASE_TTL_SECONDS)
|
||||
now = _utc_now_naive()
|
||||
new_lease = now + timedelta(seconds=LEASE_TTL_SECONDS)
|
||||
async with get_db_context(project_id) as db:
|
||||
await db.execute(
|
||||
update(AwoooPRunState)
|
||||
@@ -165,14 +171,14 @@ async def heartbeat(run_id: "UUID", project_id: str) -> None:
|
||||
AwoooPRunState.state == "running",
|
||||
)
|
||||
.values(
|
||||
heartbeat_at=datetime.now(timezone.utc),
|
||||
heartbeat_at=now,
|
||||
lease_until=new_lease,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
async def transition(
|
||||
run_id: "UUID",
|
||||
run_id: UUID,
|
||||
project_id: str,
|
||||
to_state: str,
|
||||
*,
|
||||
@@ -214,7 +220,7 @@ async def transition(
|
||||
if step_count_delta:
|
||||
values["step_count"] = AwoooPRunState.step_count + step_count_delta
|
||||
if to_state in TERMINAL_STATES:
|
||||
values["completed_at"] = datetime.now(timezone.utc)
|
||||
values["completed_at"] = _utc_now_naive()
|
||||
values["lease_until"] = None
|
||||
values["worker_id"] = None
|
||||
|
||||
@@ -245,7 +251,7 @@ async def reap_stale_runs(project_id: str) -> int:
|
||||
|
||||
Returns: 處理的 stale run 數
|
||||
"""
|
||||
now = datetime.now(timezone.utc)
|
||||
now = _utc_now_naive()
|
||||
reaped = 0
|
||||
|
||||
async with get_db_context(project_id) as db:
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
- `ops/monitoring/alerts*.yml`:新增 `HostLoadAverageSustainedHigh`、`DockerContainerCpuSustainedHigh`、`DockerContainerCpuRunawayCritical`、`DockerContainerMemoryLimitPressure`、`DockerContainerRestartSpike`。
|
||||
- `apps/api/alert_rules.yaml`:新增 Docker/Host 過載路由,強制走 `SSH_DIAGNOSE`,禁止通用 docker restart。
|
||||
- API GitOps:用最新 `main` (`a57e3d3d`) 加本次兩個 API 修補檔,在 188 建置並推送 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-a57e3d3`;`k8s/awoooi-prod/kustomization.yaml` 指向此 tag,避免手動 `kubectl set image` 被 Argo 回滾。
|
||||
- API follow-up:新 image 上線後發現 AwoooP worker stale reaper 送 timezone-aware datetime 到 `TIMESTAMP WITHOUT TIME ZONE` 欄位,補 `_utc_now_naive()`,重建 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-e8e6748` 並將 GitOps tag 更新到此版。
|
||||
- `docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md`:記錄 live 配額盤點、baseline policy、反模式與下一步 rollout 順序。
|
||||
- Prometheus 已 reload,97 條規則載入;新 baseline rules 全部存在。
|
||||
|
||||
@@ -38,8 +39,9 @@
|
||||
- Prometheus 新 baseline alerts 查詢目前無 firing。
|
||||
- 新規則目前 pending:110 `HostLoadAverageSustainedHigh`、110 `DockerContainerCpuSustainedHigh` for Sentry ClickHouse。
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_classify_alert_early.py apps/api/tests/test_alert_rule_engine_validation.py -q` → 89 passed。
|
||||
- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/run_state_machine.py` + `py_compile` → passed。
|
||||
- `ruff check apps/api/src/services/proactive_inspector.py`、`py_compile scripts/ops/docker-stats-textfile-exporter.py`、`git diff --check` → passed。
|
||||
- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-a57e3d3`。
|
||||
- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-e8e6748`。
|
||||
|
||||
**下一步**:
|
||||
- 不要再降低 ClickHouse / Kafka memory limit;先觀察 backlog drain。
|
||||
|
||||
@@ -39,7 +39,7 @@ resources:
|
||||
images:
|
||||
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/api
|
||||
newTag: resource-baseline-20260505-a57e3d3
|
||||
newTag: resource-baseline-20260505-e8e6748
|
||||
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/web
|
||||
newTag: 00684403887745e35848bbbab5ac795cfdd6fd58
|
||||
|
||||
Reference in New Issue
Block a user