Files
awoooi/docs/evaluations/host_runaway_aiops_loop_readiness_2026-06-18.json
Your Name 0e72a6f428
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
feat(aiops): expose host runaway loop readiness
2026-06-18 15:28:15 +08:00

291 lines
9.7 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"schema_version": "host_runaway_aiops_loop_readiness_v1",
"generated_at": "2026-06-18T15:08:00+08:00",
"program_status": {
"overall_completion_percent": 100,
"current_priority": "P3",
"current_task_id": "P3-009",
"next_task_id": "P3-010",
"read_only_mode": true,
"runtime_authority": "host_runaway_aiops_loop_readiness_only_no_host_write",
"status_note": "110 CPU runaway AIOps loop 已完成監控、告警、AI event packet、PlayBook / KM contract 與 production readbackruntime remediation 仍需 owner approval / maintenance window / evidence ref / dry-run / post-check。"
},
"source_refs": [
"scripts/ops/host-runaway-process-exporter.py",
"scripts/ops/host-runaway-process-remediation.py",
"ops/monitoring/alerts-unified.yml",
"apps/api/src/services/telegram_gateway.py",
"docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md",
"docs/runbooks/FULL-STACK-COLD-START-SOP.md",
"docs/LOGBOOK.md"
],
"rollups": {
"loop_stage_count": 6,
"alert_lane_count": 2,
"asset_writeback_contract_count": 5,
"source_ref_count": 7,
"live_readback_metric_count": 8,
"blocked_runtime_action_count": 12,
"runtime_remediation_authorized_count": 0,
"telegram_send_count": 0,
"gateway_queue_write_count": 0,
"bot_api_call_count": 0,
"host_write_count": 0,
"process_termination_count": 0,
"docker_restart_count": 0,
"systemd_restart_count": 0,
"nginx_reload_count": 0,
"firewall_change_count": 0,
"kubectl_action_count": 0,
"production_write_count": 0
},
"loop_stages": [
{
"stage_id": "read_only_host_textfile_exporter",
"display_name": "110 textfile exporter",
"owner_agent": "openclaw",
"status": "production_readback_complete",
"completion_percent": 100,
"evidence_refs": [
"scripts/ops/host-runaway-process-exporter.py",
"/home/wooo/node_exporter_textfiles/host_runaway_process.prom"
],
"next_action": "保持每 2 分鐘 cron scrape若 monitor missing / stale 則先修 observability不宣稱 AI Ops 綠燈。",
"blocked_runtime_actions": [
"process_termination",
"docker_restart"
]
},
{
"stage_id": "prometheus_alert_rules",
"display_name": "Prometheus alert lanes",
"owner_agent": "hermes",
"status": "deployed",
"completion_percent": 100,
"evidence_refs": [
"ops/monitoring/alerts-unified.yml",
"deploy-alerts #3147"
],
"next_action": "用 HostOrphanBrowserSmokeHighCpu 分流 orphan smoke用 HostCiRunnerLoadSaturation 分流合法 CI load。",
"blocked_runtime_actions": [
"auto_silence",
"auto_restart"
]
},
{
"stage_id": "telegram_ai_event_packet",
"display_name": "Telegram / AI event packet",
"owner_agent": "hermes",
"status": "production_deployed",
"completion_percent": 100,
"evidence_refs": [
"apps/api/src/services/telegram_gateway.py",
"f358a0f6",
"2d278568"
],
"next_action": "告警實際 firing 時產生專屬 lane 與 dry-run / owner / evidence gate不只輸出泛用 CPU 文本。",
"blocked_runtime_actions": [
"telegram_direct_send_without_gate",
"bot_api_call_without_receipt"
]
},
{
"stage_id": "playbook_contract",
"display_name": "Runaway PlayBook",
"owner_agent": "openclaw",
"status": "ready",
"completion_percent": 100,
"evidence_refs": [
"docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md",
"docs/runbooks/FULL-STACK-COLD-START-SOP.md"
],
"next_action": "下一次事故依照 monitor -> alert -> triage -> dry-run -> approval -> post-check 順序執行。",
"blocked_runtime_actions": [
"skip_dry_run",
"skip_post_check"
]
},
{
"stage_id": "km_verifier_writeback_contract",
"display_name": "KM / Verifier writeback contract",
"owner_agent": "nemotron",
"status": "contract_ready",
"completion_percent": 100,
"evidence_refs": [
"docs/LOGBOOK.md",
"docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md"
],
"next_action": "真實事故需回寫 KM 條目、PlayBook 信任證據、Verifier post-check 與 recurrence guard。",
"blocked_runtime_actions": [
"claim_closed_without_km",
"claim_closed_without_verifier"
]
},
{
"stage_id": "gated_remediation_helper",
"display_name": "Gated remediation helper",
"owner_agent": "openclaw",
"status": "dry_run_ready_apply_blocked",
"completion_percent": 100,
"evidence_refs": [
"scripts/ops/host-runaway-process-remediation.py",
"scripts/ops/tests/test_host_runaway_process_exporter.py"
],
"next_action": "只有在 owner approval、maintenance window、evidence ref、rule、dry-run 與 post-check 都齊備後,才允許最小 SIGTERM。",
"blocked_runtime_actions": [
"sigkill",
"restart_service"
]
}
],
"alert_lanes": [
{
"alertname": "HostOrphanBrowserSmokeHighCpu",
"lane_id": "orphan_browser_smoke_runaway_process",
"classification": "host_resource_runaway_process",
"action_policy": "triage_packet_then_dry_run_then_gated_sigterm",
"dry_run_allowed": true,
"apply_allowed_without_owner_gate": false,
"runtime_write_gate": 0,
"next_action": "建立 triage packet列出 process group、CPU、age、source command、dry-run target 與 post-check。"
},
{
"alertname": "HostCiRunnerLoadSaturation",
"lane_id": "ci_runner_load_saturation",
"classification": "host_resource_capacity",
"action_policy": "capacity_triage_no_process_remediation",
"dry_run_allowed": false,
"apply_allowed_without_owner_gate": false,
"runtime_write_gate": 0,
"next_action": "關聯 Gitea Actions run、runner queue、load/core、swap trend合法 CI 不走 process kill。"
}
],
"asset_writeback_contract": [
{
"asset_id": "knowledge_base_incident_summary",
"display_name": "Knowledge Base incident summary",
"required_on_real_incident": true,
"live_write_enabled": false,
"required_fields": [
"incident_id",
"root_cause",
"evidence_refs",
"post_check_result"
]
},
{
"asset_id": "playbook_trust_evidence",
"display_name": "PlayBook trust evidence",
"required_on_real_incident": true,
"live_write_enabled": false,
"required_fields": [
"playbook_id",
"matched_rule",
"success_or_failure",
"negative_reinforcement_note"
]
},
{
"asset_id": "awooop_work_item_truth_chain",
"display_name": "AwoooP Work Item truth-chain",
"required_on_real_incident": true,
"live_write_enabled": false,
"required_fields": [
"owner",
"status",
"approval_ref",
"maintenance_window"
]
},
{
"asset_id": "verifier_post_check",
"display_name": "Verifier post-check",
"required_on_real_incident": true,
"live_write_enabled": false,
"required_fields": [
"before_metrics",
"after_metrics",
"alert_state",
"service_health"
]
},
{
"asset_id": "recurrence_guard",
"display_name": "Recurrence guard",
"required_on_real_incident": true,
"live_write_enabled": false,
"required_fields": [
"prometheus_metric",
"alert_lane",
"owner_followup",
"next_review_at"
]
}
],
"live_readback": {
"host_label": "110",
"monitor_up": 1,
"orphan_browser_group_count": 0,
"active_ci_container_count": 2,
"load5_per_core_upper_observed": 0.81,
"swap_used_ratio_upper_observed": 1.0,
"remediation_authorized_count": 0,
"alerts_firing_count": 0,
"deploy_marker": "2d278568",
"runtime_revision": "f358a0f6c3e614e407dedb6eee89bf10b2bc8173",
"argocd_sync": "Synced",
"argocd_health": "Healthy",
"production_route_count": 3,
"forbidden_public_hit_count": 0
},
"remediation_gate": {
"dry_run_required": true,
"owner_approval_required": true,
"maintenance_window_required": true,
"evidence_ref_required": true,
"post_check_required": true,
"allowed_signal_after_gate": "SIGTERM",
"process_termination_authorized": false,
"disallowed_actions": [
"SIGKILL",
"docker restart",
"systemctl restart",
"nginx reload",
"firewall change",
"kubectl action",
"secret read",
"production write"
]
},
"activation_boundaries": {
"read_only_readback_allowed": true,
"ai_triage_packet_allowed": true,
"dry_run_generation_allowed": true,
"runtime_remediation_enabled": false,
"process_termination_authorized": false,
"telegram_send_enabled": false,
"gateway_queue_write_enabled": false,
"bot_api_call_enabled": false,
"host_write_enabled": false,
"docker_restart_enabled": false,
"systemd_restart_enabled": false,
"nginx_reload_enabled": false,
"firewall_change_enabled": false,
"kubectl_action_enabled": false,
"production_write_enabled": false,
"secret_read_enabled": false
},
"next_steps": [
{
"step_id": "real_alert_receipt_fixture",
"description": "下一次真實 firing 或脫敏 fixture 應驗證 Telegram card、AwoooP Work Item、KM / PlayBook / Verifier 欄位一致。",
"runtime_write_allowed": false
},
{
"step_id": "owner_approved_apply_drill",
"description": "只有在人工批准維護窗口後,才可用非生產 fixture 演練 gated SIGTERM 與 post-check。",
"runtime_write_allowed": false
}
]
}