fix(alertmanager): keep auto repair moving on ai fallback
This commit is contained in:
@@ -139,6 +139,38 @@ def _should_use_alertmanager_rule_first(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _analyze_alertmanager_with_timeout(
|
||||||
|
openclaw,
|
||||||
|
alert_context: dict,
|
||||||
|
*,
|
||||||
|
alert_id: str,
|
||||||
|
alertname: str,
|
||||||
|
) -> tuple:
|
||||||
|
"""Run Alertmanager AI analysis without letting it block the workflow forever."""
|
||||||
|
|
||||||
|
try:
|
||||||
|
return await asyncio.wait_for(
|
||||||
|
openclaw.analyze_alert(alert_context),
|
||||||
|
timeout=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
except TimeoutError:
|
||||||
|
logger.warning(
|
||||||
|
"alertmanager_openclaw_timeout_fallback",
|
||||||
|
alert_id=alert_id,
|
||||||
|
alertname=alertname,
|
||||||
|
timeout_sec=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
return None, "fallback_timeout", "", None, "", 0, 0.0
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning(
|
||||||
|
"alertmanager_openclaw_failed_fallback",
|
||||||
|
alert_id=alert_id,
|
||||||
|
alertname=alertname,
|
||||||
|
error=str(exc),
|
||||||
|
)
|
||||||
|
return None, "fallback_error", "", None, "", 0, 0.0
|
||||||
|
|
||||||
|
|
||||||
async def _escalate_auto_repair_unavailable(
|
async def _escalate_auto_repair_unavailable(
|
||||||
*,
|
*,
|
||||||
incident_id: str,
|
incident_id: str,
|
||||||
@@ -796,6 +828,7 @@ async def verify_webhook_signature(
|
|||||||
|
|
||||||
# 戰略 B: 滑動時間窗 (ADR-073: 5 分鐘改 30 分鐘,防同一問題反覆重建 Incident,2026-04-12 ogt)
|
# 戰略 B: 滑動時間窗 (ADR-073: 5 分鐘改 30 分鐘,防同一問題反覆重建 Incident,2026-04-12 ogt)
|
||||||
DEBOUNCE_WINDOW_MINUTES = 30
|
DEBOUNCE_WINDOW_MINUTES = 30
|
||||||
|
ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -1109,7 +1142,12 @@ async def receive_alert(
|
|||||||
# 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
|
# 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
|
||||||
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
|
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||||||
openclaw = get_openclaw()
|
openclaw = get_openclaw()
|
||||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
|
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
|
||||||
|
openclaw,
|
||||||
|
alert_context,
|
||||||
|
alert_id=alert_id,
|
||||||
|
alertname=alert.alert_type,
|
||||||
|
)
|
||||||
|
|
||||||
if analysis_result:
|
if analysis_result:
|
||||||
# LLM 分析成功
|
# LLM 分析成功
|
||||||
@@ -1815,7 +1853,12 @@ async def _process_new_alert_background(
|
|||||||
record_alert_chain_success("alertmanager")
|
record_alert_chain_success("alertmanager")
|
||||||
return
|
return
|
||||||
|
|
||||||
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
|
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
|
||||||
|
openclaw,
|
||||||
|
alert_context,
|
||||||
|
alert_id=alert_id,
|
||||||
|
alertname=alertname,
|
||||||
|
)
|
||||||
|
|
||||||
if analysis_result:
|
if analysis_result:
|
||||||
risk_mapping = {
|
risk_mapping = {
|
||||||
@@ -2115,11 +2158,17 @@ async def _process_new_alert_background(
|
|||||||
else:
|
else:
|
||||||
# LLM 失敗 - 使用預設值
|
# LLM 失敗 - 使用預設值
|
||||||
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
# 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
|
||||||
|
_matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
|
||||||
|
rule_id=str(rule_response.get("rule_id", "")),
|
||||||
|
alertname=alertname,
|
||||||
|
affected_services=[target_resource] if target_resource else [],
|
||||||
|
severity="medium",
|
||||||
|
)
|
||||||
_approval_metadata_cs4 = {
|
_approval_metadata_cs4 = {
|
||||||
"source": "fallback",
|
"source": "fallback",
|
||||||
"confidence_score": None,
|
"confidence_score": None,
|
||||||
"is_rule_based": False,
|
"is_rule_based": False,
|
||||||
"playbook_id": None,
|
"playbook_id": _matched_playbook_id_cs4,
|
||||||
}
|
}
|
||||||
fallback_create = ApprovalRequestCreate(
|
fallback_create = ApprovalRequestCreate(
|
||||||
action="OBSERVE",
|
action="OBSERVE",
|
||||||
@@ -2134,6 +2183,7 @@ async def _process_new_alert_background(
|
|||||||
dry_run_checks=[],
|
dry_run_checks=[],
|
||||||
requested_by="OpenClaw (fallback)",
|
requested_by="OpenClaw (fallback)",
|
||||||
metadata=_approval_metadata_cs4,
|
metadata=_approval_metadata_cs4,
|
||||||
|
matched_playbook_id=_matched_playbook_id_cs4,
|
||||||
)
|
)
|
||||||
|
|
||||||
approval = await service.create_approval_with_fingerprint(
|
approval = await service.create_approval_with_fingerprint(
|
||||||
@@ -2205,6 +2255,37 @@ async def _process_new_alert_background(
|
|||||||
annotations=alert_context.get("annotations", {}),
|
annotations=alert_context.get("annotations", {}),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_is_heartbeat = is_heartbeat_alertname(alertname)
|
||||||
|
if can_auto_repair and not _is_heartbeat:
|
||||||
|
await _try_auto_repair_background(
|
||||||
|
incident_id=fallback_incident_id,
|
||||||
|
approval_id=str(approval.id),
|
||||||
|
alert_type=alert_type,
|
||||||
|
target_resource=target_resource,
|
||||||
|
namespace=namespace,
|
||||||
|
)
|
||||||
|
elif not can_auto_repair and not _is_heartbeat:
|
||||||
|
from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
|
||||||
|
_op_log_fallback = get_alert_operation_log_repository()
|
||||||
|
await _op_log_fallback.append(
|
||||||
|
"GUARDRAIL_BLOCKED",
|
||||||
|
incident_id=fallback_incident_id,
|
||||||
|
approval_id=str(approval.id),
|
||||||
|
actor="prometheus-rule",
|
||||||
|
action_detail=f"Prometheus rule 設定 auto_repair=false,fallback 轉人工: {alertname}",
|
||||||
|
success=False,
|
||||||
|
context={"alertname": alertname, "auto_repair_flag": False},
|
||||||
|
)
|
||||||
|
await _escalate_auto_repair_unavailable(
|
||||||
|
incident_id=fallback_incident_id,
|
||||||
|
approval_id=str(approval.id),
|
||||||
|
alert_type=alert_type,
|
||||||
|
target_resource=target_resource,
|
||||||
|
namespace=namespace,
|
||||||
|
failure_reason="Prometheus rule auto_repair=false,fallback 未進入自動修復評估",
|
||||||
|
attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
|
||||||
|
)
|
||||||
|
|
||||||
await _push_to_telegram_background(
|
await _push_to_telegram_background(
|
||||||
approval_id=str(approval.id),
|
approval_id=str(approval.id),
|
||||||
risk_level="medium",
|
risk_level="medium",
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
|
import asyncio
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from src.api.v1.webhooks import (
|
from src.api.v1.webhooks import (
|
||||||
|
_analyze_alertmanager_with_timeout,
|
||||||
_should_bypass_alertmanager_llm,
|
_should_bypass_alertmanager_llm,
|
||||||
_should_use_alertmanager_rule_first,
|
_should_use_alertmanager_rule_first,
|
||||||
)
|
)
|
||||||
@@ -111,6 +115,43 @@ def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
|
|||||||
assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
|
assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_alertmanager_analysis_timeout_returns_fallback(monkeypatch):
    """A provider slower than the configured timeout yields the timeout fallback tuple."""
    from src.api.v1 import webhooks as webhooks_module

    class _StalledOpenClaw:
        # Stub whose analysis outlasts the patched timeout below.
        async def analyze_alert(self, alert_context):
            await asyncio.sleep(1)
            return "unexpected"

    # Shrink the module-level timeout so the helper gives up almost immediately.
    monkeypatch.setattr(webhooks_module, "ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS", 0.01)

    canary = "AwoooPTimeoutCanary"
    expected = (None, "fallback_timeout", "", None, "", 0, 0.0)

    outcome = await _analyze_alertmanager_with_timeout(
        _StalledOpenClaw(),
        {"alertname": canary},
        alert_id="alert-timeout",
        alertname=canary,
    )

    assert outcome == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_alertmanager_analysis_error_returns_fallback():
    """A provider that raises yields the error fallback tuple instead of propagating."""

    class _FailingOpenClaw:
        # Stub that fails as soon as analysis is requested.
        async def analyze_alert(self, alert_context):
            raise RuntimeError("provider chain failed")

    canary = "AwoooPErrorCanary"
    expected = (None, "fallback_error", "", None, "", 0, 0.0)

    outcome = await _analyze_alertmanager_with_timeout(
        _FailingOpenClaw(),
        {"alertname": canary},
        alert_id="alert-error",
        alertname=canary,
    )

    assert outcome == expected
|
||||||
|
|
||||||
|
|
||||||
def test_resolved_guard_stamp_without_timestamp_is_clean():
|
def test_resolved_guard_stamp_without_timestamp_is_clean():
|
||||||
assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"
|
assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user