diff --git a/apps/api/src/api/v1/drift.py b/apps/api/src/api/v1/drift.py index e450620a..219317b8 100644 --- a/apps/api/src/api/v1/drift.py +++ b/apps/api/src/api/v1/drift.py @@ -177,34 +177,43 @@ async def _analyze_and_notify(report: DriftReport) -> None: interpretation = await interpreter.analyze(report) repo = get_drift_repository() await repo.update_interpretation(report.report_id, interpretation) + # 2026-05-04 ogt + Claude Sonnet 4.6: 修根因 — report 是 in-memory 物件, + # update_interpretation 只更新 DB,不會回寫 report.interpretation, + # 導致 auto_adopt_if_safe 永遠看到 None → 觸發「尚無 Nemotron 意圖分析」條件 + report.interpretation = interpretation # 2026-04-24: 嘗試低風險自動採納 auto_adopted = False auto_block_reason = "" + from src.core.config import get_settings as _gs + _drift_auto_enabled = getattr(_gs(), "DRIFT_AUTO_ADOPT_ENABLED", True) + if not _drift_auto_enabled: + auto_block_reason = "DRIFT_AUTO_ADOPT_ENABLED=false,功能已停用" try: - adopt_svc = get_drift_adopt_service() - auto_result = await adopt_svc.auto_adopt_if_safe(report) - if auto_result.get("success"): - # 自動採納成功:更新狀態,跳過人工卡片 - await repo.update_status( - report.report_id, - DriftStatus.ADOPTED, - resolved_at=now_taipei(), - ) - auto_adopted = True - _logger.info( - "drift_auto_adopted", - report_id=report.report_id, - pr_url=auto_result.get("pr_url"), - ) - else: - auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped" - _logger.info( - "drift_auto_adopt_skipped", - report_id=report.report_id, - reason=auto_block_reason, - skipped=auto_result.get("skipped", True), - ) + if _drift_auto_enabled: + adopt_svc = get_drift_adopt_service() + auto_result = await adopt_svc.auto_adopt_if_safe(report) + if auto_result.get("success"): + # 自動採納成功:更新狀態,跳過人工卡片 + await repo.update_status( + report.report_id, + DriftStatus.ADOPTED, + resolved_at=now_taipei(), + ) + auto_adopted = True + _logger.info( + "drift_auto_adopted", + report_id=report.report_id, + pr_url=auto_result.get("pr_url"), + ) + else: + auto_block_reason = auto_result.get("reason", "") or "auto adopt skipped" + _logger.info( + "drift_auto_adopt_skipped", + report_id=report.report_id, + reason=auto_block_reason, + skipped=auto_result.get("skipped", True), + ) except Exception as e: auto_block_reason = f"auto adopt error: {str(e)[:120]}" _logger.warning("drift_auto_adopt_error", report_id=report.report_id, error=str(e)) diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index 25ce767e..40bd83bc 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -662,6 +662,24 @@ class Settings(BaseSettings): default=True, description="ADR-091 T1: True=AI 自學規則雙寫 alert_rule_catalog DB, False=僅 YAML(回滾用)", ) + # ========================================================================== + # 2026-05-04 ogt + Claude Sonnet 4.6: Drift 自動採納開關 + # 根因修復後啟用(report.interpretation in-memory 未更新 bug 已修) + # 回滾指令: kubectl set env deployment/awoooi-api DRIFT_AUTO_ADOPT_ENABLED=false + # ========================================================================== + DRIFT_AUTO_ADOPT_ENABLED: bool = Field( + default=True, + description="2026-05-04: True=啟用 drift auto_adopt_if_safe 自動採納低風險漂移, False=回滾停用", + ) + # ========================================================================== + # 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成 + # evaluate_once() 末段:對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄 + # 回滾指令: kubectl set env deployment/awoooi-api COVERAGE_AUTO_RULE_ENABLED=false + # ========================================================================== + COVERAGE_AUTO_RULE_ENABLED: bool = Field( + default=True, + description="2026-05-04: True=coverage 缺口自動生成 alert_rule_catalog(source='ai_generated',review_status='pending_review'), False=停用", + ) # 2026-04-27 P3.1-T2-PathA by Claude — DiagAggregator 信號分類層補 PDI # 路徑 A 已啟用:DA 只取 PDI 已收集的 raw 資料做業務邏輯分類(OOMKilled/CrashLoop 等), # 不重複呼叫 K8s/SignOz API(純邏輯分類,不打外部服務)。 diff --git a/apps/api/src/jobs/coverage_evaluator_job.py b/apps/api/src/jobs/coverage_evaluator_job.py index a254eb61..87e74fb7 100644 --- a/apps/api/src/jobs/coverage_evaluator_job.py +++ b/apps/api/src/jobs/coverage_evaluator_job.py @@ -86,6 +86,7 @@ async def evaluate_once() -> dict[str, int]: "monitoring_updated": 0, "alerting_updated": 0, "km_updated": 0, "playbook_updated": 0, "remediation_updated": 0, "rule_matching_updated": 0, "rule_creation_updated": 0, + "rules_auto_created": 0, } error_msg: str | None = None @@ -129,6 +130,13 @@ async def evaluate_once() -> dict[str, int]: stats["llm_analyzed"] = True await _send_telegram_gaps(red_summary, llm_analysis) + # 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器 + # 對 auto_alerting=red 的 asset 自動生成 alert_rule_catalog 記錄 + # COVERAGE_AUTO_RULE_ENABLED flag 控制(預設啟用) + if getattr(settings, "COVERAGE_AUTO_RULE_ENABLED", True): + created = await _auto_create_rules_for_uncovered_assets(run_id) + stats["rules_auto_created"] = created + await _log_aol(stats, duration_ms, error_msg) logger.info( @@ -140,6 +148,7 @@ async def evaluate_once() -> dict[str, int]: remediation=stats["remediation_updated"], rule_matching=stats["rule_matching_updated"], rule_creation=stats["rule_creation_updated"], + rules_auto_created=stats.get("rules_auto_created", 0), llm_analyzed=bool(llm_analysis), duration_ms=duration_ms, ) @@ -744,3 +753,162 @@ async def _log_aol(stats: dict[str, int], duration_ms: int, error: str | None) - ) except Exception as e: logger.warning("coverage_evaluator_aol_failed", error=str(e)) + + +# ============================================================================ +# 2026-05-04 ogt + Claude Sonnet 4.6: Coverage Gap → AI 規則自動生成執行器 +# ============================================================================ + +_COVERAGE_RULE_COOLDOWN_SEC = 86400 # 每個 asset 24h 冷卻,避免重複建規則 + + +async def _auto_create_rules_for_uncovered_assets(run_id: str | None) -> int: + """ + 對 auto_alerting=red 的 top 3 asset 自動生成 alert_rule_catalog 記錄。 + + 流程: + 1. 查最新 run 中 auto_alerting=red 的 host/k8s_workload(最多 5 筆) + 2. 每個 asset 用 Redis 24h 冷卻防重複 + 3. 依 asset_type 建立範本化 PromQL rule + 4. UPSERT 進 alert_rule_catalog(source='ai_generated', review_status='pending_review') + 5. 回傳成功建立數量 + + 設計鐵律: + - 只建 pending_review,不自動 approve + - rule_name UNIQUE 鍵:CoverageAuto_{type}_{safe_key} + - Redis 不可用時跳過冷卻檢查(不中斷主流程) + """ + from sqlalchemy import text as _sql + from src.db.base import get_db_context + import json as _j + import re + + if not run_id: + return 0 + + created = 0 + try: + async with get_db_context() as db: + # 查 auto_alerting=red 的 host 和 k8s_workload asset(最多 5 筆) + rows = await db.execute( + _sql(""" + SELECT ai.asset_id, ai.asset_key, ai.asset_type, + ai.name, ai.host, ai.namespace, + ai.metadata->>'internal_ip' AS internal_ip + FROM asset_coverage_snapshot cs + JOIN asset_inventory ai ON cs.asset_id = ai.asset_id + WHERE cs.run_id = CAST(:rid AS uuid) + AND cs.dimension = 'auto_alerting' + AND cs.coverage_status = 'red' + AND ai.asset_type IN ('host', 'k8s_workload') + ORDER BY ai.asset_type, ai.asset_key + LIMIT 5 + """), + {"rid": run_id}, + ) + assets = rows.fetchall() + + for asset in assets: + asset_key = str(asset.asset_key or "") + asset_type = str(asset.asset_type or "") + name = str(asset.name or "") + host = str(asset.host or "") + namespace = str(asset.namespace or "") + internal_ip = str(asset.internal_ip or "") + + # Redis 24h 冷卻 + cooldown_key = f"coverage_rule_created:{asset_key}" + try: + from src.core.redis_client import get_redis + redis = get_redis() + already = await redis.get(cooldown_key) + if already: + logger.debug("coverage_auto_rule_cooldown", asset_key=asset_key) + continue + except Exception: + pass # Redis 不可用,繼續建規則 + + # 建立 PromQL 規則 + safe_key = re.sub(r"[^a-zA-Z0-9]", "_", asset_key)[:60] + if asset_type == "host": + ip_for_match = internal_ip or host + if not ip_for_match: + continue + rule_name = f"CoverageAuto_HostDown_{safe_key}" + expr = f'up{{instance=~"{ip_for_match}:.*"}} == 0' + severity = "warning" + labels = {"host": host or ip_for_match, "layer": "infrastructure", "source": "coverage_auto"} + annotations = { + "summary": f"主機 {host or ip_for_match} 無 Prometheus 探測響應", + "description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve", + } + duration_seconds = 120 + elif asset_type == "k8s_workload": + if not name: + continue + rule_name = f"CoverageAuto_WorkloadDown_{safe_key}" + ns_selector = f',namespace="{namespace}"' if namespace else "" + expr = f'kube_deployment_status_replicas_available{{deployment="{name}"{ns_selector}}} == 0' + severity = "warning" + labels = {"namespace": namespace or "default", "deployment": name, "source": "coverage_auto"} + annotations = { + "summary": f"{name} 在 {namespace or 'default'} 無可用副本", + "description": f"Coverage 缺口自動建規則 — asset_key={asset_key},請 SRE 複核 expr 後 approve", + } + duration_seconds = 180 + else: + continue + + # UPSERT 進 alert_rule_catalog(source='ai_generated') + try: + async with get_db_context() as db: + await db.execute( + _sql(""" + INSERT INTO alert_rule_catalog ( + rule_name, source, expr, duration_seconds, + severity, labels, annotations, + created_by_agent, review_status, + created_at, updated_at + ) VALUES ( + :rname, 'ai_generated', :expr, :dur, + :sev, CAST(:labels AS jsonb), CAST(:ann AS jsonb), + 'coverage_evaluator', 'pending_review', + NOW(), NOW() + ) + ON CONFLICT (rule_name) DO NOTHING + """), + { + "rname": rule_name[:200], + "expr": expr[:4000], + "dur": duration_seconds, + "sev": severity, + "labels": _j.dumps(labels, ensure_ascii=False), + "ann": _j.dumps(annotations, ensure_ascii=False), + }, + ) + created += 1 + logger.info( + "coverage_auto_rule_created", + rule_name=rule_name, + asset_key=asset_key, + asset_type=asset_type, + ) + + # 設置 Redis 冷卻 + try: + from src.core.redis_client import get_redis + redis = get_redis() + await redis.set(cooldown_key, "1", ex=_COVERAGE_RULE_COOLDOWN_SEC) + except Exception: + pass + + except Exception as e: + logger.warning("coverage_auto_rule_upsert_failed", asset_key=asset_key, error=str(e)) + + except Exception as e: + logger.warning("coverage_auto_create_rules_failed", error=str(e)) + + if created > 0: + logger.info("coverage_auto_rules_summary", created=created) + + return created