fix(aiops): Task2 drift auto-adopt 根因修復 + Task3 coverage gap 規則自動生成
All checks were successful
Code Review / ai-code-review (push) Successful in 48s

Task 2 — Drift 自動採納修根因:
  根因: _analyze_and_notify() 中 report 是 in-memory 物件,
        update_interpretation() 只更新 DB,不回寫 report.interpretation,
        導致 auto_adopt_if_safe() 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」
        → Drift 自動採納 0 筆
  修法: report.interpretation = interpretation(DB 寫入後立即回寫記憶體)
  附加: DRIFT_AUTO_ADOPT_ENABLED flag(default=True,回滾: kubectl set env ...=false)

Task 3 — Coverage Gap → AI 規則自動生成執行器:
  根因: evaluate_once() 只分析 red 缺口,但無執行器將分析轉為實際規則
        → alert_rule_catalog 的 ai_generated source 永遠為 0 條
  修法: 新增 _auto_create_rules_for_uncovered_assets(run_id)
    · 查 auto_alerting=red 的 top 5 host/k8s_workload asset
    · 依 asset_type 生成範本化 PromQL rule(host→up, k8s→replicas_available)
    · UPSERT 進 alert_rule_catalog(source='ai_generated', review_status='pending_review')
    · Redis 24h 冷卻防重複,Redis 不可用時降級繼續
  附加: COVERAGE_AUTO_RULE_ENABLED flag(default=True,回滾: kubectl set env ...=false)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-05-04 14:22:51 +08:00
parent 54a4e59af9
commit 72cd79ed8b
3 changed files with 218 additions and 23 deletions

View File

@@ -177,34 +177,43 @@ async def _analyze_and_notify(report: DriftReport) -> None:
interpretation = await interpreter.analyze(report)
repo = get_drift_repository()
await repo.update_interpretation(report.report_id, interpretation)
# 2026-05-04 ogt + Claude Sonnet 4.6: 修根因 — report 是 in-memory 物件,
# update_interpretation 只更新 DB,不會回寫 report.interpretation,
# 導致 auto_adopt_if_safe 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」條件
report.interpretation = interpretation
# 2026-04-24: 嘗試低風險自動採納
auto_adopted = False
auto_block_reason = ""
from src.core.config import get_settings as _gs
_drift_auto_enabled = getattr(_gs(), "DRIFT_AUTO_ADOPT_ENABLED", True)
if not _drift_auto_enabled:
auto_block_reason = "DRIFT_AUTO_ADOPT_ENABLED=false功能已停用"
try:
adopt_svc = get_drift_adopt_service()
auto_result = await adopt_svc.auto_adopt_if_safe(report)
if auto_result.get("success"):
# 自動採納成功:更新狀態,跳過人工卡片
await repo.update_status(
report.report_id,
DriftStatus.ADOPTED,
resolved_at=now_taipei(),
)
auto_adopted = True
_logger.info(
"drift_auto_adopted",
report_id=report.report_id,
pr_url=auto_result.get("pr_url"),
)
else:
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
_logger.info(
"drift_auto_adopt_skipped",
report_id=report.report_id,
reason=auto_block_reason,
skipped=auto_result.get("skipped", True),
)
if _drift_auto_enabled:
adopt_svc = get_drift_adopt_service()
auto_result = await adopt_svc.auto_adopt_if_safe(report)
if auto_result.get("success"):
# 自動採納成功:更新狀態,跳過人工卡片
await repo.update_status(
report.report_id,
DriftStatus.ADOPTED,
resolved_at=now_taipei(),
)
auto_adopted = True
_logger.info(
"drift_auto_adopted",
report_id=report.report_id,
pr_url=auto_result.get("pr_url"),
)
else:
auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped"
_logger.info(
"drift_auto_adopt_skipped",
report_id=report.report_id,
reason=auto_block_reason,
skipped=auto_result.get("skipped", True),
)
except Exception as e:
auto_block_reason = f"auto adopt error: {str(e)[:120]}"
_logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e))

View File

@@ -662,6 +662,24 @@ class Settings(BaseSettings):
default=True,
description="ADR-091 T1: True=AI 自學規則雙寫 alert_rule_catalog DB, False=僅 YAML回滾用",
)
# ==========================================================================
# 2026-05-04 ogt + Claude Sonnet 4.6: Drift 自動採納開關
# 根因修復後啟用(report.interpretation in-memory 未更新 bug 已修)
# 回滾指令: kubectl set env deployment/awoooi-api DRIFT_AUTO_ADOPT_ENABLED=false
# ==========================================================================
DRIFT_AUTO_ADOPT_ENABLED: bool = Field(
default=True,
description="2026-05-04: True=啟用 drift auto_adopt_if_safe 自動採納低風險漂移, False=回滾停用",
)
# ==========================================================================
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成
# evaluate_once() 末段:對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
# 回滾指令: kubectl set env deployment/awoooi-api COVERAGE_AUTO_RULE_ENABLED=false
# ==========================================================================
COVERAGE_AUTO_RULE_ENABLED: bool = Field(
default=True,
description="2026-05-04: True=coverage 缺口自動生成 alert_rule_catalogsource='ai_generated'review_status='pending_review', False=停用",
)
# 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator 信號分類層補 PDI
# 路徑 A 已啟用:DA 只取 PDI 已收集的 raw 資料做業務邏輯分類(OOMKilled/CrashLoop 等),
# 不重複呼叫 K8s/SignOz API(純邏輯分類,不打外部服務)

View File

@@ -86,6 +86,7 @@ async def evaluate_once() -> dict[str, int]:
"monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0,
"playbook_updated": 0, "remediation_updated": 0,
"rule_matching_updated": 0, "rule_creation_updated": 0,
"rules_auto_created": 0,
}
error_msg: str | None = None
@@ -129,6 +130,13 @@ async def evaluate_once() -> dict[str, int]:
stats["llm_analyzed"] = True
await _send_telegram_gaps(red_summary, llm_analysis)
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
# 對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄
# COVERAGE_AUTO_RULE_ENABLED flag 控制(預設啟用)
if getattr(settings, "COVERAGE_AUTO_RULE_ENABLED", True):
created = await _auto_create_rules_for_uncovered_assets(run_id)
stats["rules_auto_created"] = created
await _log_aol(stats, duration_ms, error_msg)
logger.info(
@@ -140,6 +148,7 @@ async def evaluate_once() -> dict[str, int]:
remediation=stats["remediation_updated"],
rule_matching=stats["rule_matching_updated"],
rule_creation=stats["rule_creation_updated"],
rules_auto_created=stats.get("rules_auto_created", 0),
llm_analyzed=bool(llm_analysis),
duration_ms=duration_ms,
)
@@ -744,3 +753,162 @@ async def _log_aol(stats: dict[str, int], duration_ms: int, error: str | None) -
)
except Exception as e:
logger.warning("coverage_evaluator_aol_failed", error=str(e))
# ============================================================================
# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器
# ============================================================================
_COVERAGE_RULE_COOLDOWN_SEC = 86400 # 每個 asset 24h 冷卻,避免重複建規則
async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int:
"""
對 auto_alerting=red 的 top 3 asset 自動生成 alert_rule_catalog 記錄。
流程:
1. 查最新 run 中 auto_alerting=red 的 host/k8s_workload最多 5 筆)
2. 每個 asset 用 Redis 24h 冷卻防重複
3. 依 asset_type 建立範本化 PromQL rule
4. UPSERT 進 alert_rule_catalogsource='ai_generated', review_status='pending_review'
5. 回傳成功建立數量
設計鐵律:
- 只建 pending_review不自動 approve
- rule_name UNIQUE 鍵CoverageAuto_{type}_{safe_key}
- Redis 不可用時跳過冷卻檢查(不中斷主流程)
"""
from sqlalchemy import text as _sql
from src.db.base import get_db_context
import json as _j
import re
if not run_id:
return 0
created = 0
try:
async with get_db_context() as db:
# 查 auto_alerting=red 的 host 和 k8s_workload asset最多 5 筆)
rows = await db.execute(
_sql("""
SELECT ai.asset_id, ai.asset_key, ai.asset_type,
ai.name, ai.host, ai.namespace,
ai.metadata->>'internal_ip' AS internal_ip
FROM asset_coverage_snapshot cs
JOIN asset_inventory ai ON cs.asset_id = ai.asset_id
WHERE cs.run_id = CAST(:rid AS uuid)
AND cs.dimension = 'auto_alerting'
AND cs.coverage_status = 'red'
AND ai.asset_type IN ('host', 'k8s_workload')
ORDER BY ai.asset_type, ai.asset_key
LIMIT 5
"""),
{"rid": run_id},
)
assets = rows.fetchall()
for asset in assets:
asset_key = str(asset.asset_key or "")
asset_type = str(asset.asset_type or "")
name = str(asset.name or "")
host = str(asset.host or "")
namespace = str(asset.namespace or "")
internal_ip = str(asset.internal_ip or "")
# Redis 24h 冷卻
cooldown_key = f"coverage_rule_created:{asset_key}"
try:
from src.core.redis_client import get_redis
redis = get_redis()
already = await redis.get(cooldown_key)
if already:
logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key)
continue
except Exception:
pass # Redis 不可用,繼續建規則
# 建立 PromQL 規則
safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60]
if asset_type == "host":
ip_for_match = internal_ip or host
if not ip_for_match:
continue
rule_name = f"CoverageAuto_HostDown_{safe_key}"
expr = f'up{{instance=~"{ip_for_match}:.*"}} == 0'
severity = "warning"
labels = {"host": host or ip_for_match, "layer": "infrastructure", "source": "coverage_auto"}
annotations = {
"summary": f"主機 {host or ip_for_match} 無 Prometheus 探測響應",
"description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve",
}
duration_seconds = 120
elif asset_type == "k8s_workload":
if not name:
continue
rule_name = f"CoverageAuto_WorkloadDown_{safe_key}"
ns_selector = f',namespace="{namespace}"' if namespace else ""
expr = f'kube_deployment_status_replicas_available{{deployment="{name}"{ns_selector}}} == 0'
severity = "warning"
labels = {"namespace": namespace or "default", "deployment": name, "source": "coverage_auto"}
annotations = {
"summary": f"{name}{namespace or 'default'} 無可用副本",
"description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve",
}
duration_seconds = 180
else:
continue
# UPSERT 進 alert_rule_catalogsource='ai_generated'
try:
async with get_db_context() as db:
await db.execute(
_sql("""
INSERT INTO alert_rule_catalog (
rule_name, source, expr, duration_seconds,
severity, labels, annotations,
created_by_agent, review_status,
created_at, updated_at
) VALUES (
:rname, 'ai_generated', :expr, :dur,
:sev, CAST(:labels AS jsonb), CAST(:ann AS jsonb),
'coverage_evaluator', 'pending_review',
NOW(), NOW()
)
ON CONFLICT (rule_name) DO NOTHING
"""),
{
"rname": rule_name[:200],
"expr": expr[:4000],
"dur": duration_seconds,
"sev": severity,
"labels": _j.dumps(labels, ensure_ascii=False),
"ann": _j.dumps(annotations, ensure_ascii=False),
},
)
created += 1
logger.info(
"coverage_auto_rule_created",
rule_name=rule_name,
asset_key=asset_key,
asset_type=asset_type,
)
# 設置 Redis 冷卻
try:
from src.core.redis_client import get_redis
redis = get_redis()
await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC)
except Exception:
pass
except Exception as e:
logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e))
except Exception as e:
logger.warning("coverage_auto_create_rules_failed", error=str(e))
if created > 0:
logger.info("coverage_auto_rules_summary", created=created)
return created