Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
291 lines
9.7 KiB
JSON
{
  "schema_version": "host_runaway_aiops_loop_readiness_v1",
  "generated_at": "2026-06-18T15:08:00+08:00",
  "program_status": {
    "overall_completion_percent": 100,
    "current_priority": "P3",
    "current_task_id": "P3-009",
    "next_task_id": "P3-010",
    "read_only_mode": true,
    "runtime_authority": "host_runaway_aiops_loop_readiness_only_no_host_write",
    "status_note": "110 CPU runaway AIOps loop 已完成監控、告警、AI event packet、PlayBook / KM contract 與 production readback;runtime remediation 仍需 owner approval / maintenance window / evidence ref / dry-run / post-check。"
  },
  "source_refs": [
    "scripts/ops/host-runaway-process-exporter.py",
    "scripts/ops/host-runaway-process-remediation.py",
    "ops/monitoring/alerts-unified.yml",
    "apps/api/src/services/telegram_gateway.py",
    "docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md",
    "docs/runbooks/FULL-STACK-COLD-START-SOP.md",
    "docs/LOGBOOK.md"
  ],
  "rollups": {
    "loop_stage_count": 6,
    "alert_lane_count": 2,
    "asset_writeback_contract_count": 5,
    "source_ref_count": 7,
    "live_readback_metric_count": 8,
    "blocked_runtime_action_count": 12,
    "runtime_remediation_authorized_count": 0,
    "telegram_send_count": 0,
    "gateway_queue_write_count": 0,
    "bot_api_call_count": 0,
    "host_write_count": 0,
    "process_termination_count": 0,
    "docker_restart_count": 0,
    "systemd_restart_count": 0,
    "nginx_reload_count": 0,
    "firewall_change_count": 0,
    "kubectl_action_count": 0,
    "production_write_count": 0
  },
  "loop_stages": [
    {
      "stage_id": "read_only_host_textfile_exporter",
      "display_name": "110 textfile exporter",
      "owner_agent": "openclaw",
      "status": "production_readback_complete",
      "completion_percent": 100,
      "evidence_refs": [
        "scripts/ops/host-runaway-process-exporter.py",
        "/home/wooo/node_exporter_textfiles/host_runaway_process.prom"
      ],
      "next_action": "保持每 2 分鐘 cron scrape,若 monitor missing / stale 則先修 observability,不宣稱 AI Ops 綠燈。",
      "blocked_runtime_actions": [
        "process_termination",
        "docker_restart"
      ]
    },
    {
      "stage_id": "prometheus_alert_rules",
      "display_name": "Prometheus alert lanes",
      "owner_agent": "hermes",
      "status": "deployed",
      "completion_percent": 100,
      "evidence_refs": [
        "ops/monitoring/alerts-unified.yml",
        "deploy-alerts #3147"
      ],
      "next_action": "用 HostOrphanBrowserSmokeHighCpu 分流 orphan smoke,用 HostCiRunnerLoadSaturation 分流合法 CI load。",
      "blocked_runtime_actions": [
        "auto_silence",
        "auto_restart"
      ]
    },
    {
      "stage_id": "telegram_ai_event_packet",
      "display_name": "Telegram / AI event packet",
      "owner_agent": "hermes",
      "status": "production_deployed",
      "completion_percent": 100,
      "evidence_refs": [
        "apps/api/src/services/telegram_gateway.py",
        "f358a0f6",
        "2d278568"
      ],
      "next_action": "告警實際 firing 時產生專屬 lane 與 dry-run / owner / evidence gate,不只輸出泛用 CPU 文本。",
      "blocked_runtime_actions": [
        "telegram_direct_send_without_gate",
        "bot_api_call_without_receipt"
      ]
    },
    {
      "stage_id": "playbook_contract",
      "display_name": "Runaway PlayBook",
      "owner_agent": "openclaw",
      "status": "ready",
      "completion_percent": 100,
      "evidence_refs": [
        "docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md",
        "docs/runbooks/FULL-STACK-COLD-START-SOP.md"
      ],
      "next_action": "下一次事故依照 monitor -> alert -> triage -> dry-run -> approval -> post-check 順序執行。",
      "blocked_runtime_actions": [
        "skip_dry_run",
        "skip_post_check"
      ]
    },
    {
      "stage_id": "km_verifier_writeback_contract",
      "display_name": "KM / Verifier writeback contract",
      "owner_agent": "nemotron",
      "status": "contract_ready",
      "completion_percent": 100,
      "evidence_refs": [
        "docs/LOGBOOK.md",
        "docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md"
      ],
      "next_action": "真實事故需回寫 KM 條目、PlayBook 信任證據、Verifier post-check 與 recurrence guard。",
      "blocked_runtime_actions": [
        "claim_closed_without_km",
        "claim_closed_without_verifier"
      ]
    },
    {
      "stage_id": "gated_remediation_helper",
      "display_name": "Gated remediation helper",
      "owner_agent": "openclaw",
      "status": "dry_run_ready_apply_blocked",
      "completion_percent": 100,
      "evidence_refs": [
        "scripts/ops/host-runaway-process-remediation.py",
        "scripts/ops/tests/test_host_runaway_process_exporter.py"
      ],
      "next_action": "只有在 owner approval、maintenance window、evidence ref、rule、dry-run 與 post-check 都齊備後,才允許最小 SIGTERM。",
      "blocked_runtime_actions": [
        "sigkill",
        "restart_service"
      ]
    }
  ],
  "alert_lanes": [
    {
      "alertname": "HostOrphanBrowserSmokeHighCpu",
      "lane_id": "orphan_browser_smoke_runaway_process",
      "classification": "host_resource_runaway_process",
      "action_policy": "triage_packet_then_dry_run_then_gated_sigterm",
      "dry_run_allowed": true,
      "apply_allowed_without_owner_gate": false,
      "runtime_write_gate": 0,
      "next_action": "建立 triage packet,列出 process group、CPU、age、source command、dry-run target 與 post-check。"
    },
    {
      "alertname": "HostCiRunnerLoadSaturation",
      "lane_id": "ci_runner_load_saturation",
      "classification": "host_resource_capacity",
      "action_policy": "capacity_triage_no_process_remediation",
      "dry_run_allowed": false,
      "apply_allowed_without_owner_gate": false,
      "runtime_write_gate": 0,
      "next_action": "關聯 Gitea Actions run、runner queue、load/core、swap trend;合法 CI 不走 process kill。"
    }
  ],
  "asset_writeback_contract": [
    {
      "asset_id": "knowledge_base_incident_summary",
      "display_name": "Knowledge Base incident summary",
      "required_on_real_incident": true,
      "live_write_enabled": false,
      "required_fields": [
        "incident_id",
        "root_cause",
        "evidence_refs",
        "post_check_result"
      ]
    },
    {
      "asset_id": "playbook_trust_evidence",
      "display_name": "PlayBook trust evidence",
      "required_on_real_incident": true,
      "live_write_enabled": false,
      "required_fields": [
        "playbook_id",
        "matched_rule",
        "success_or_failure",
        "negative_reinforcement_note"
      ]
    },
    {
      "asset_id": "awooop_work_item_truth_chain",
      "display_name": "AwoooP Work Item truth-chain",
      "required_on_real_incident": true,
      "live_write_enabled": false,
      "required_fields": [
        "owner",
        "status",
        "approval_ref",
        "maintenance_window"
      ]
    },
    {
      "asset_id": "verifier_post_check",
      "display_name": "Verifier post-check",
      "required_on_real_incident": true,
      "live_write_enabled": false,
      "required_fields": [
        "before_metrics",
        "after_metrics",
        "alert_state",
        "service_health"
      ]
    },
    {
      "asset_id": "recurrence_guard",
      "display_name": "Recurrence guard",
      "required_on_real_incident": true,
      "live_write_enabled": false,
      "required_fields": [
        "prometheus_metric",
        "alert_lane",
        "owner_followup",
        "next_review_at"
      ]
    }
  ],
  "live_readback": {
    "host_label": "110",
    "monitor_up": 1,
    "orphan_browser_group_count": 0,
    "active_ci_container_count": 2,
    "load5_per_core_upper_observed": 0.81,
    "swap_used_ratio_upper_observed": 1.0,
    "remediation_authorized_count": 0,
    "alerts_firing_count": 0,
    "deploy_marker": "2d278568",
    "runtime_revision": "f358a0f6c3e614e407dedb6eee89bf10b2bc8173",
    "argocd_sync": "Synced",
    "argocd_health": "Healthy",
    "production_route_count": 3,
    "forbidden_public_hit_count": 0
  },
  "remediation_gate": {
    "dry_run_required": true,
    "owner_approval_required": true,
    "maintenance_window_required": true,
    "evidence_ref_required": true,
    "post_check_required": true,
    "allowed_signal_after_gate": "SIGTERM",
    "process_termination_authorized": false,
    "disallowed_actions": [
      "SIGKILL",
      "docker restart",
      "systemctl restart",
      "nginx reload",
      "firewall change",
      "kubectl action",
      "secret read",
      "production write"
    ]
  },
  "activation_boundaries": {
    "read_only_readback_allowed": true,
    "ai_triage_packet_allowed": true,
    "dry_run_generation_allowed": true,
    "runtime_remediation_enabled": false,
    "process_termination_authorized": false,
    "telegram_send_enabled": false,
    "gateway_queue_write_enabled": false,
    "bot_api_call_enabled": false,
    "host_write_enabled": false,
    "docker_restart_enabled": false,
    "systemd_restart_enabled": false,
    "nginx_reload_enabled": false,
    "firewall_change_enabled": false,
    "kubectl_action_enabled": false,
    "production_write_enabled": false,
    "secret_read_enabled": false
  },
  "next_steps": [
    {
      "step_id": "real_alert_receipt_fixture",
      "description": "下一次真實 firing 或脫敏 fixture 應驗證 Telegram card、AwoooP Work Item、KM / PlayBook / Verifier 欄位一致。",
      "runtime_write_allowed": false
    },
    {
      "step_id": "owner_approved_apply_drill",
      "description": "只有在人工批准維護窗口後,才可用非生產 fixture 演練 gated SIGTERM 與 post-check。",
      "runtime_write_allowed": false
    }
  ]
}