fix(aiops): Task2 drift auto-adopt 根因修復 + Task3 coverage gap 規則自動生成
All checks were successful
Code Review / ai-code-review (push) Successful in 48s
All checks were successful
Code Review / ai-code-review (push) Successful in 48s
Task 2 — Drift 自動採納修根因:
根因: _analyze_and_notify() 中 report 是 in-memory 物件,
update_interpretation() 只更新 DB,不回寫 report.interpretation,
導致 auto_adopt_if_safe() 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」
→ Drift 自動採納 0 筆
修法: report.interpretation = interpretation(DB 寫入後立即回寫記憶體)
附加: DRIFT_AUTO_ADOPT_ENABLED flag(default=True,回滾: kubectl set env ...=false)
Task 3 — Coverage Gap → AI 規則自動生成執行器:
根因: evaluate_once() 只分析 red 缺口,但無執行器將分析轉為實際規則
→ alert_rule_catalog 的 ai_generated source 永遠為 0 條
修法: 新增 _auto_create_rules_for_uncovered_assets(run_id)
· 查 auto_alerting=red 的 top 5 host/k8s_workload asset
· 依 asset_type 生成範本化 PromQL rule(host→up, k8s→replicas_available)
· UPSERT 進 alert_rule_catalog(source='ai_generated', review_status='pending_review')
· Redis 24h 冷卻防重複,Redis 不可用時降級繼續
附加: COVERAGE_AUTO_RULE_ENABLED flag(default=True,回滾: kubectl set env ...=false)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -177,34 +177,43 @@ async def _analyze_and_notify(report: DriftReport) -> None:
|
||||
interpretation = await interpreter.analyze(report)
|
||||
repo = get_drift_repository()
|
||||
await repo.update_interpretation(report.report_id, interpretation)
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: 修根因 — report 是 in-memory 物件,
|
||||
# update_interpretation 只更新 DB,不會回寫 report.interpretation,
|
||||
# 導致 auto_adopt_if_safe 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」條件
|
||||
report.interpretation = interpretation
|
||||
|
||||
# 2026-04-24: 嘗試低風險自動採納
|
||||
auto_adopted = False
|
||||
auto_block_reason = ""
|
||||
from src.core.config import get_settings as _gs
|
||||
_drift_auto_enabled = getattr(_gs(), "DRIFT_AUTO_ADOPT_ENABLED", True)
|
||||
if not _drift_auto_enabled:
|
||||
auto_block_reason = "DRIFT_AUTO_ADOPT_ENABLED=false,功能已停用"
|
||||
try:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
auto_result = await adopt_svc.auto_adopt_if_safe(report)
|
||||
if auto_result.get("success"):
|
||||
# 自動採納成功:更新狀態,跳過人工卡片
|
||||
await repo.update_status(
|
||||
report.report_id,
|
||||
DriftStatus.ADOPTED,
|
||||
resolved_at=now_taipei(),
|
||||
)
|
||||
auto_adopted = True
|
||||
_logger.info(
|
||||
"drift_auto_adopted",
|
||||
report_id=report.report_id,
|
||||
pr_url=auto_result.get("pr_url"),
|
||||
)
|
||||
else:
|
||||
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
|
||||
_logger.info(
|
||||
"drift_auto_adopt_skipped",
|
||||
report_id=report.report_id,
|
||||
reason=auto_block_reason,
|
||||
skipped=auto_result.get("skipped", True),
|
||||
)
|
||||
if _drift_auto_enabled:
|
||||
adopt_svc = get_drift_adopt_service()
|
||||
auto_result = await adopt_svc.auto_adopt_if_safe(report)
|
||||
if auto_result.get("success"):
|
||||
# 自動採納成功:更新狀態,跳過人工卡片
|
||||
await repo.update_status(
|
||||
report.report_id,
|
||||
DriftStatus.ADOPTED,
|
||||
resolved_at=now_taipei(),
|
||||
)
|
||||
auto_adopted = True
|
||||
_logger.info(
|
||||
"drift_auto_adopted",
|
||||
report_id=report.report_id,
|
||||
pr_url=auto_result.get("pr_url"),
|
||||
)
|
||||
else:
|
||||
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
|
||||
_logger.info(
|
||||
"drift_auto_adopt_skipped",
|
||||
report_id=report.report_id,
|
||||
reason=auto_block_reason,
|
||||
skipped=auto_result.get("skipped", True),
|
||||
)
|
||||
except Exception as e:
|
||||
auto_block_reason = f"auto adopt error: {str(e)[:120]}"
|
||||
_logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e))
|
||||
|
||||
@@ -662,6 +662,24 @@ class Settings(BaseSettings):
|
||||
default=True,
|
||||
description="ADR-091 T1: True=AI 自學規則雙寫 alert_rule_catalog DB, False=僅 YAML(回滾用)",
|
||||
)
|
||||
# ==========================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Drift 自動採納開關
|
||||
# 根因修復後啟用(report.interpretation in-memory 未更新 bug 已修)
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api DRIFT_AUTO_ADOPT_ENABLED=false
|
||||
# ==========================================================================
|
||||
DRIFT_AUTO_ADOPT_ENABLED: bool = Field(
|
||||
default=True,
|
||||
description="2026-05-04: True=啟用 drift auto_adopt_if_safe 自動採納低風險漂移, False=回滾停用",
|
||||
)
|
||||
# ==========================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成
|
||||
# evaluate_once() 末段:對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api COVERAGE_AUTO_RULE_ENABLED=false
|
||||
# ==========================================================================
|
||||
COVERAGE_AUTO_RULE_ENABLED: bool = Field(
|
||||
default=True,
|
||||
description="2026-05-04: True=coverage 缺口自動生成 alert_rule_catalog(source='ai_generated',review_status='pending_review'), False=停用",
|
||||
)
|
||||
# 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator 信號分類層補 PDI
|
||||
# 路徑 A 已啟用:DA 只取 PDI 已收集的 raw 資料做業務邏輯分類(OOMKilled/CrashLoop 等),
|
||||
# 不重複呼叫 K8s/SignOz API(純邏輯分類,不打外部服務)。
|
||||
|
||||
@@ -86,6 +86,7 @@ async def evaluate_once() -> dict[str, int]:
|
||||
"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0,
|
||||
"playbook_updated": 0, "remediation_updated": 0,
|
||||
"rule_matching_updated": 0, "rule_creation_updated": 0,
|
||||
"rules_auto_created": 0,
|
||||
}
|
||||
error_msg: str | None = None
|
||||
|
||||
@@ -129,6 +130,13 @@ async def evaluate_once() -> dict[str, int]:
|
||||
stats["llm_analyzed"] = True
|
||||
await _send_telegram_gaps(red_summary, llm_analysis)
|
||||
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
|
||||
# 對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
|
||||
# COVERAGE_AUTO_RULE_ENABLED flag 控制(預設啟用)
|
||||
if getattr(settings, "COVERAGE_AUTO_RULE_ENABLED", True):
|
||||
created = await _auto_create_rules_for_uncovered_assets(run_id)
|
||||
stats["rules_auto_created"] = created
|
||||
|
||||
await _log_aol(stats, duration_ms, error_msg)
|
||||
|
||||
logger.info(
|
||||
@@ -140,6 +148,7 @@ async def evaluate_once() -> dict[str, int]:
|
||||
remediation=stats["remediation_updated"],
|
||||
rule_matching=stats["rule_matching_updated"],
|
||||
rule_creation=stats["rule_creation_updated"],
|
||||
rules_auto_created=stats.get("rules_auto_created", 0),
|
||||
llm_analyzed=bool(llm_analysis),
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
@@ -744,3 +753,162 @@ async def _log_aol(stats: dict[str, int], duration_ms: int, error: str | None) -
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("coverage_evaluator_aol_failed", error=str(e))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
|
||||
# ============================================================================
|
||||
|
||||
_COVERAGE_RULE_COOLDOWN_SEC = 86400 # 每個 asset 24h 冷卻,避免重複建規則
|
||||
|
||||
|
||||
def _escape_promql_string(value: str) -> str:
    """Escape ``value`` for embedding in a double-quoted PromQL string literal."""
    return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def _build_coverage_rule(
    asset_type: str,
    asset_key: str,
    name: str,
    host: str,
    namespace: str,
    internal_ip: str,
) -> dict | None:
    """Build the templated alert rule for one uncovered asset.

    Returns a dict with ``rule_name``, ``expr``, ``severity``, ``labels``,
    ``annotations`` and ``duration_seconds``, or ``None`` when the asset type
    is unsupported or the asset lacks the field needed to build a selector.
    """
    import re

    # rule_name is the UNIQUE key in alert_rule_catalog; keep it alphanumeric.
    safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60]
    description = f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve"

    if asset_type == "host":
        ip_for_match = internal_ip or host
        if not ip_for_match:
            return None
        # The value lands inside a regex matcher: escape regex metacharacters
        # first (so "." only matches a literal dot), then escape the result
        # for the surrounding PromQL string literal.
        instance_re = _escape_promql_string(re.escape(ip_for_match))
        return {
            "rule_name": f"CoverageAuto_HostDown_{safe_key}",
            "expr": f'up{{instance=~"{instance_re}:.*"}} == 0',
            "severity": "warning",
            "labels": {
                "host": host or ip_for_match,
                "layer": "infrastructure",
                "source": "coverage_auto",
            },
            "annotations": {
                "summary": f"主機 {host or ip_for_match} 無 Prometheus 探測響應",
                "description": description,
            },
            "duration_seconds": 120,
        }

    if asset_type == "k8s_workload":
        if not name:
            return None
        # NOTE(review): the template assumes the workload is a Deployment —
        # confirm whether other workload kinds can reach this branch.
        ns_selector = (
            f',namespace="{_escape_promql_string(namespace)}"' if namespace else ""
        )
        deployment = _escape_promql_string(name)
        return {
            "rule_name": f"CoverageAuto_WorkloadDown_{safe_key}",
            "expr": (
                "kube_deployment_status_replicas_available"
                f'{{deployment="{deployment}"{ns_selector}}} == 0'
            ),
            "severity": "warning",
            "labels": {
                "namespace": namespace or "default",
                "deployment": name,
                "source": "coverage_auto",
            },
            "annotations": {
                "summary": f"{name} 在 {namespace or 'default'} 無可用副本",
                "description": description,
            },
            "duration_seconds": 180,
        }

    return None


async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
    """Create pending-review alert rules for assets whose auto_alerting is red.

    Flow:
      1. Select up to 5 ``host`` / ``k8s_workload`` assets of the given run
         whose ``auto_alerting`` coverage status is ``red``.
      2. Skip assets that still have a Redis cooldown key (24h).
      3. Build a templated PromQL rule according to the asset type.
      4. Insert it into ``alert_rule_catalog`` with ``source='ai_generated'``
         and ``review_status='pending_review'``; an existing ``rule_name`` is
         left untouched (``ON CONFLICT DO NOTHING``).
      5. Return the number of rules that were actually inserted.

    Design rules:
      - Rules are only ever created as ``pending_review``, never approved.
      - ``rule_name`` has the form ``CoverageAuto_{type}_{safe_key}``.
      - Redis is best-effort: when it is unavailable the cooldown is skipped
        and rule creation continues.

    Args:
        run_id: Coverage run identifier (cast to ``uuid`` in SQL). A falsy
            value short-circuits to 0.

    Returns:
        Number of newly inserted rules. Errors are logged, never raised.
    """
    from sqlalchemy import text as _sql
    from src.db.base import get_db_context
    import json as _j

    if not run_id:
        return 0

    created = 0
    try:
        # Read phase: materialise the rows, then release this DB context
        # before the per-asset writes open their own.
        async with get_db_context() as db:
            rows = await db.execute(
                _sql("""
                    SELECT ai.asset_id, ai.asset_key, ai.asset_type,
                           ai.name, ai.host, ai.namespace,
                           ai.metadata->>'internal_ip' AS internal_ip
                    FROM asset_coverage_snapshot cs
                    JOIN asset_inventory ai ON cs.asset_id = ai.asset_id
                    WHERE cs.run_id = CAST(:rid AS uuid)
                      AND cs.dimension = 'auto_alerting'
                      AND cs.coverage_status = 'red'
                      AND ai.asset_type IN ('host', 'k8s_workload')
                    ORDER BY ai.asset_type, ai.asset_key
                    LIMIT 5
                """),
                {"rid": run_id},
            )
            assets = rows.fetchall()

        # Redis is optional; a missing client only disables the cooldown.
        redis = None
        if assets:
            try:
                from src.core.redis_client import get_redis

                redis = get_redis()
            except Exception as e:
                logger.debug("coverage_auto_rule_redis_unavailable", error=str(e))

        for asset in assets:
            asset_key = str(asset.asset_key or "")
            asset_type = str(asset.asset_type or "")

            # 24h per-asset cooldown to avoid re-creating the same rule.
            cooldown_key = f"coverage_rule_created:{asset_key}"
            if redis is not None:
                try:
                    if await redis.get(cooldown_key):
                        logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key)
                        continue
                except Exception as e:
                    # Cooldown lookup failed: fall through and build the rule.
                    logger.debug(
                        "coverage_auto_rule_cooldown_check_failed",
                        asset_key=asset_key,
                        error=str(e),
                    )

            rule = _build_coverage_rule(
                asset_type=asset_type,
                asset_key=asset_key,
                name=str(asset.name or ""),
                host=str(asset.host or ""),
                namespace=str(asset.namespace or ""),
                internal_ip=str(asset.internal_ip or ""),
            )
            if rule is None:
                continue
            rule_name = rule["rule_name"]

            try:
                async with get_db_context() as db:
                    result = await db.execute(
                        _sql("""
                            INSERT INTO alert_rule_catalog (
                                rule_name, source, expr, duration_seconds,
                                severity, labels, annotations,
                                created_by_agent, review_status,
                                created_at, updated_at
                            ) VALUES (
                                :rname, 'ai_generated', :expr, :dur,
                                :sev, CAST(:labels AS jsonb), CAST(:ann AS jsonb),
                                'coverage_evaluator', 'pending_review',
                                NOW(), NOW()
                            )
                            ON CONFLICT (rule_name) DO NOTHING
                            RETURNING rule_name
                        """),
                        {
                            "rname": rule_name[:200],
                            "expr": rule["expr"][:4000],
                            "dur": rule["duration_seconds"],
                            "sev": rule["severity"],
                            "labels": _j.dumps(rule["labels"], ensure_ascii=False),
                            "ann": _j.dumps(rule["annotations"], ensure_ascii=False),
                        },
                    )
                    # RETURNING yields a row only when a row was inserted, so
                    # a conflict on rule_name is not counted as a creation.
                    inserted = result.fetchone() is not None

                if inserted:
                    created += 1
                    logger.info(
                        "coverage_auto_rule_created",
                        rule_name=rule_name,
                        asset_key=asset_key,
                        asset_type=asset_type,
                    )
                else:
                    logger.debug(
                        "coverage_auto_rule_exists",
                        rule_name=rule_name,
                        asset_key=asset_key,
                    )

                # Arm the cooldown whether or not the row was new, so an
                # already-existing rule is not re-attempted for 24h.
                if redis is not None:
                    try:
                        await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC)
                    except Exception as e:
                        logger.debug(
                            "coverage_auto_rule_cooldown_set_failed",
                            asset_key=asset_key,
                            error=str(e),
                        )

            except Exception as e:
                logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e))

    except Exception as e:
        logger.warning("coverage_auto_create_rules_failed", error=str(e))

    if created > 0:
        logger.info("coverage_auto_rules_summary", created=created)

    return created
|
||||
|
||||
Reference in New Issue
Block a user