feat(ai-ops): ADR-012 Phase 2/3/4 完整實作
All checks were successful
CD Pipeline / deploy (push) Successful in 1m11s
Phase 2 — Hermes L1 Observer 真實接入:
- services/event_router.py::_hermes_observe() 呼叫 hermes3:latest
@192.168.0.111:11434/api/generate,做 stack trace 翻譯
- 輸出 JSON {summary, probable_cause, actions},容錯 markdown fence
- scheduler.py run_auto_import_task / run_momo_task 兩個 outer
except 改走 event_router.dispatch(),帶完整 trace
Phase 3 — NemoTron L2 Investigator 規則式實作:
- event_router._L2_RULES: event_type → [(action, params)] 規則表
• db_connection_error → query_km + retry_task(60s backoff)
• crawler_timeout → silence_alert(30min) + retry_task(300s)
• nim_quota_exhausted → silence_alert(720min)
• embedding_failure → silence_alert(10min)
- agent_actions.retry_task 真實實作: threading.Timer + exponential
backoff (60→120→240s) + _retry_state 追蹤 + ALLOWED_RETRY_TASKS
白名單 + 非 scheduler 容器回 'deferred'
Phase 4 — L3 HITL Ops 擴充:
- agent_actions: pause_task / resume_task / force_retry_now / is_task_paused
- OPS_ACTIONS 白名單與 SAFE_ACTIONS 嚴格分離(L2 不可呼叫 L3)
- telegram_templates.ops_action_request(): 4 按鈕 inline keyboard
(暫停1h / 暫停6h / 立即重試 / 解除暫停)
- telegram_bot_service._handle_ops_callback(): 接 momo:ops:<action>:<task>
- scheduler.py run_momo_task + run_auto_import_task 開頭加
is_task_paused() 檢查(Phase 4 暫停機制生效)
安全邊界(ADR-012 §①):
- L1 Hermes 只讀 → 失敗降 L0 + 🟡 標記
- L2 NemoTron 只碰 ai_insights + 發 Telegram + SAFE_ACTIONS
- L3 OpenClaw 任意動作必經 HITL inline keyboard 批准
- 不做容器重啟按鈕(需 docker socket,風險過高)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
70
scheduler.py
@@ -200,6 +200,15 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
|
||||
|
||||
def run_momo_task():
|
||||
"""V8.1 邏輯:處理所有分類並存入資料庫"""
|
||||
# ADR-012 Phase 4: HITL 暫停檢查
|
||||
try:
|
||||
from services.agent_actions import is_task_paused
|
||||
if is_task_paused("run_momo_task"):
|
||||
logging.info("[Crawler] [MOMO] ⏸️ 任務被 HITL 暫停中,本次跳過")
|
||||
return
|
||||
except Exception:
|
||||
pass # agent_actions 未就緒時不阻塞排程
|
||||
|
||||
try:
|
||||
# V-New: 每次執行任務時,動態從 JSON 檔案重新讀取分類
|
||||
# 這解決了「修改設定需重啟」的問題,也避免了重啟造成的系統崩潰
|
||||
@@ -452,9 +461,26 @@ def run_momo_task():
|
||||
logging.error(f"[Crawler] [MOMO] ❌ 發送通知失敗 | Error: {e}")
|
||||
|
||||
except Exception as e:
|
||||
import traceback as _tb
|
||||
logging.error(f"[Crawler] [MOMO] 🚨 任務中斷 | Error: {e}")
|
||||
stats = { "status": "Failed", "error": str(e) }
|
||||
_save_stats('momo_task', stats)
|
||||
# ADR-012 Phase 2: 走 EventRouter(Hermes L1 翻譯 + 三層式訊息)
|
||||
try:
|
||||
from services.event_router import dispatch as _dispatch
|
||||
_dispatch({
|
||||
"source": "Scheduler.MOMOCrawler",
|
||||
"event_type": "crawler_timeout",
|
||||
"severity": "alert",
|
||||
"title": "MOMO 爬蟲任務中斷",
|
||||
"status": "任務失敗",
|
||||
"impact": "P1 - 熱銷商品監控中斷",
|
||||
"summary": str(e)[:200],
|
||||
"trace": _tb.format_exc(),
|
||||
"payload": {"task_name": "run_momo_task"},
|
||||
})
|
||||
except Exception as _router_e:
|
||||
logging.error(f"[Crawler] [MOMO] event_router 失敗: {_router_e}")
|
||||
finally:
|
||||
logging.info("[Crawler] [MOMO] 🏁 所有類別爬取結束")
|
||||
|
||||
@@ -1391,6 +1417,15 @@ def run_auto_import_task():
|
||||
V-New: 自動從 Google Drive 匯入當日業績
|
||||
每半小時檢查一次 Google Drive 是否有新的 Excel 檔案
|
||||
"""
|
||||
# ADR-012 Phase 4: HITL 暫停檢查
|
||||
try:
|
||||
from services.agent_actions import is_task_paused
|
||||
if is_task_paused("run_auto_import_task"):
|
||||
logging.info("[Scheduler] [AutoImport] ⏸️ 任務被 HITL 暫停中,本次跳過")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
from services.import_service import import_service
|
||||
from services.notification_manager import NotificationManager
|
||||
@@ -1507,26 +1542,37 @@ def run_auto_import_task():
|
||||
_save_stats('auto_import_task', stats)
|
||||
|
||||
except Exception as e:
|
||||
import traceback as _tb
|
||||
logging.error(f"[Scheduler] [AutoImport] 🚨 自動匯入任務異常 | Error: {e}")
|
||||
stats = {"status": "Failed", "error": str(e)}
|
||||
_save_stats('auto_import_task', stats)
|
||||
|
||||
# V-New: 任務異常時也發送通知
|
||||
# ADR-012 Phase 2: 改走 EventRouter(Hermes L1 翻譯 + 降級鏈)
|
||||
# LINE 通道保留(event_router 只處理 Telegram)
|
||||
try:
|
||||
from services.event_router import dispatch as _dispatch
|
||||
_dispatch({
|
||||
"source": "Scheduler.AutoImport",
|
||||
"event_type": "db_connection_error" if "translate host" in str(e).lower() or "operational" in str(e).lower() else "import_failure",
|
||||
"severity": "alert",
|
||||
"title": "當日業績自動匯入異常",
|
||||
"status": "匯入失敗",
|
||||
"impact": "P1 - 當日業績未更新",
|
||||
"summary": str(e)[:200],
|
||||
"trace": _tb.format_exc(),
|
||||
"payload": {"task_name": "run_auto_import_task"},
|
||||
})
|
||||
except Exception as _router_e:
|
||||
logging.error(f"[Scheduler] [AutoImport] event_router 失敗: {_router_e}")
|
||||
|
||||
# LINE 通知保留(獨立通道,不經 event_router)
|
||||
try:
|
||||
from services.notification_manager import NotificationManager
|
||||
now_str = datetime.now(TAIPEI_TZ).strftime('%Y-%m-%d %H:%M')
|
||||
message = (
|
||||
f"🚨 當日業績自動匯入異常 ({now_str})\n"
|
||||
f"{'='*30}\n"
|
||||
f"❌ 系統錯誤:{str(e)}\n"
|
||||
f"{'='*30}\n"
|
||||
f"請聯絡系統管理員"
|
||||
)
|
||||
notifier = NotificationManager()
|
||||
notifier._send_line_messages([message])
|
||||
notifier._send_telegram_messages([message])
|
||||
message = f"🚨 當日業績自動匯入異常 ({now_str})\n系統錯誤:{str(e)[:200]}"
|
||||
NotificationManager()._send_line_messages([message])
|
||||
except Exception as notify_error:
|
||||
logging.error(f"[Scheduler] [AutoImport] ❌ 發送異常通知時發生錯誤 | Error: {notify_error}")
|
||||
logging.error(f"[Scheduler] [AutoImport] ❌ LINE 通知失敗 | Error: {notify_error}")
|
||||
|
||||
def run_competitor_price_feeder_task():
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user