diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index aa625d85..3e8ea589 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -42,9 +42,10 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: host_boot_detection = _dict(scorecard.get("host_boot_detection")) post_reboot_readiness = _dict(scorecard.get("post_reboot_readiness")) stockplatform = _dict(scorecard.get("stockplatform_data_freshness")) + source_controls = _dict(scorecard.get("source_controls")) active_blockers = _strings(scorecard.get("active_blockers")) required_checks = { - "source_controls_present": all(_dict(scorecard.get("source_controls")).values()), + "source_controls_present": all(source_controls.values()), "required_hosts_observed": not _strings(host_boot_detection.get("missing_hosts")), "required_hosts_reachable": not _strings(host_boot_detection.get("unreachable_hosts")), "service_green": post_reboot_readiness.get("service_green") is True, @@ -66,6 +67,8 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: can_claim_slo = ( scorecard.get("can_claim_all_services_recovered_within_target") is True ) + if can_claim_slo and not active_blockers: + readiness_percent = 100 latest_verify_metric = _dict(scorecard.get("latest_verify_only_metric")) active_blocker_count = len(active_blockers) observed_host_count = len(_strings(host_boot_detection.get("observed_hosts"))) @@ -174,6 +177,7 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "host_boot_detection": host_boot_detection, "post_reboot_readiness": post_reboot_readiness, "stockplatform_data_freshness": stockplatform, + "source_controls": source_controls, "active_blockers": active_blockers, "required_checks": required_checks, "rollups": rollups, diff --git 
a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 597c2f49..183ba3a0 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -100,6 +100,9 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["latest_verify_only_metric_last_run_timestamp"] == 0 assert payload["stockplatform_freshness_status"] == "not_configured" assert payload["stockplatform_ingestion_status"] == "not_configured" + assert payload["source_controls"][ + "host_110_startup_controlled_drain_guarded_autostart_source_present" + ] is True assert payload["readback"]["workplan_id"] == "P0-006" assert payload["readback"]["target_minutes"] == 10 assert payload["readback"]["safe_next_step"] == ( diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index fa284443..8980f121 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,26 @@ +## 2026-07-01 — 20:34 P0-006 reboot SLO machine-readback source closure + +**照主線修正的問題**: +- Gitea / deploy closure 已恢復後,production `/api/v1/agents/delivery-closure-workbench` 仍讀回 P0-006 `blocked_reboot_auto_recovery_slo_not_ready`、readiness `18%`、active blockers `12`;這是目前主線,不是 GitHub / Gitea repo 消失。 +- `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json` 只有 raw blocker/source fields,缺少 machine-readable `readback` / `rollups` / `summary`,容易讓 MCP / RAG / KM / PlayBook /泛用 readback 讀成空值或誤判。 +- `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 現在會直接輸出 `required_checks`、`readback`、`rollups`、`summary`,保留 fail-closed truth:`active_blocker_count=12`、`readiness_percent=18`、`runtime_write_authorized=false`、`host_reboot_authorized=false`、`workflow_trigger_authorized=false`、`secret_value_collection_allowed=false`。 +- API loader `reboot_auto_recovery_slo_scorecard.py` 額外回傳 `source_controls`,並修正「無 blocker 且可 claim SLO 時 readiness 必須為 100」的一致性規則。 +- committed P0-006 snapshot 已回填 
derived readback 欄位,但沒有把任何 runtime blocker 改成 green。 + +**驗證**: +- `python3 -m py_compile scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py apps/api/src/services/reboot_auto_recovery_slo_scorecard.py`:通過。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py apps/api/tests/test_delivery_closure_workbench_api.py -q`:`18 passed`。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q`:`39 passed`。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有使用 GitHub / `gh` / GitHub API / GitHub Actions。 +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 runtime write。 + +**下一步**: +- commit / push 後讀回 Gitea CD;部署後 production 應直接讀到 P0-006 `readback` / `rollups` / `summary` 與 `source_controls`,再依 safe_next_step 繼續處理 boot-triggered SLO timer / host probe evidence,仍不得把一般「繼續」解讀為 reboot 授權。 + ## 2026-07-01 — 19:32 110 startup fail-closed stub 修復 / systemd degraded 清零 **照主線修正的問題**: @@ -48,7 +71,6 @@ **邊界**:未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未 workflow_dispatch,未恢復 generic runner。 **下一步**:P0 轉入 backup / DR escrow / alert warning 收斂;先處理 `ESCROW_MISSING_COUNT=5` 與 backup stale / failed-component warnings,再回來做 188 Nginx privileged source apply 取代 SignOz temporary bridge。 - ## 2026-07-01 — 18:47 SignOz public route source drift 修正 **照主線修正的問題**: diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index 0f216abd..f164d61d 100644 --- 
a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -146,6 +146,49 @@ "summary_present": true, "wazuh_dashboard_degraded": true }, + "readback": { + "active_blocker_count": 12, + "active_blockers": [ + "all_required_hosts_not_in_10_minute_reboot_window", + "backup_core_green_not_1", + "host_188_service_green_not_1", + "host_boot_observation_older_than_target_window", + "host_unreachable_after_reboot", + "host_uptime_unknown", + "local_disk_free_below_minimum", + "post_start_blocked_not_zero", + "product_data_green_not_1", + "reboot_event_required_host_unreachable", + "service_green_not_1", + "wazuh_dashboard_degraded" + ], + "blocked_by_fresh_reboot_window_only": false, + "host_reboot_authorized_by_this_scorecard": false, + "readiness_percent": 18, + "required_checks": { + "backup_core_green": false, + "can_claim_slo": false, + "fresh_reboot_window_observed": false, + "host_188_service_green": false, + "product_data_green": false, + "required_hosts_observed": true, + "required_hosts_reachable": false, + "service_green": false, + "source_controls_present": true, + "stockplatform_freshness_ok": false, + "stockplatform_ingestion_ok": false + }, + "runtime_write_authorized_by_this_scorecard": false, + "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "secret_value_collection_allowed": false, + "source_controls_present": true, + "source_id": "reboot_auto_recovery_slo_scorecard", + "status": "blocked_reboot_auto_recovery_slo_not_ready", + "target_minutes": 10, + "workflow_trigger_authorized_by_this_scorecard": false, + "workplan_id": "P0-006", + "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO" + }, "reboot_event_detection": { "all_required_hosts_in_reboot_window": false, "all_required_hosts_observed": false, @@ -165,6 +208,53 @@ "111" ] }, + "required_checks": { + "backup_core_green": false, 
+ "can_claim_slo": false, + "fresh_reboot_window_observed": false, + "host_188_service_green": false, + "product_data_green": false, + "required_hosts_observed": true, + "required_hosts_reachable": false, + "service_green": false, + "source_controls_present": true, + "stockplatform_freshness_ok": false, + "stockplatform_ingestion_ok": false + }, + "rollups": { + "active_blocker_count": 12, + "backup_core_green": false, + "blocked_by_fresh_reboot_window_only": false, + "can_claim_all_services_recovered_within_target": false, + "capacity_below_minimum": true, + "capacity_checked": true, + "capacity_free_gib": 0.751, + "capacity_min_free_gib": 2.0, + "completed_check_count": 2, + "host_188_service_green": false, + "latest_verify_only_metric_blocker_count": 0, + "latest_verify_only_metric_last_run_timestamp": 0, + "latest_verify_only_metric_max_host_uptime_seconds": 0, + "latest_verify_only_metric_present": false, + "latest_verify_only_metric_ready": 0, + "missing_host_count": 0, + "observed_host_count": 7, + "post_start_blocked": 8, + "product_data_green": false, + "readiness_percent": 18, + "required_check_count": 11, + "service_green": false, + "source_control_count": 14, + "source_control_ready_count": 14, + "source_controls_present": true, + "stale_host_count": 5, + "stockplatform_freshness_blocker_count": 1, + "stockplatform_freshness_status": "not_configured", + "stockplatform_ingestion_blocker_count": 1, + "stockplatform_ingestion_status": "not_configured", + "unknown_uptime_host_count": 1, + "unreachable_host_count": 1 + }, "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", "sla_recovery_eta": { @@ -259,6 +349,27 @@ "successful_source_run_ids": [] } }, + "summary": { + "github_api_used": false, + "reboot_auto_recovery_active_blocker_count": 12, + "reboot_auto_recovery_backup_core_green": false, + 
def percent(value: float) -> int:
    """Round *value* to the nearest integer and clamp it to the 0-100 range."""
    return max(0, min(100, round(value)))


def _dict_or_empty(value: Any) -> dict[str, Any]:
    """Return *value* unchanged when it is a dict, otherwise a new empty dict."""
    return value if isinstance(value, dict) else {}


def build_required_checks(payload: dict[str, Any]) -> dict[str, bool]:
    """Derive the boolean required-check map from a scorecard payload.

    Args:
        payload: Scorecard dict. The sections ``source_controls``,
            ``host_boot_detection``, ``post_reboot_readiness`` and
            ``stockplatform_data_freshness`` are read; a section that is
            missing or not a dict is treated as empty.

    Returns:
        Mapping of check name to ``True``/``False``. Every check fails closed:
        a missing section or flag yields ``False``, except the host-list
        checks, which are ``True`` when the corresponding list is empty.
    """
    controls = _dict_or_empty(payload.get("source_controls"))
    host_boot_detection = _dict_or_empty(payload.get("host_boot_detection"))
    post_reboot_readiness = _dict_or_empty(payload.get("post_reboot_readiness"))
    stockplatform = _dict_or_empty(payload.get("stockplatform_data_freshness"))

    return {
        # An empty control map must not count as "present": all({}) is True.
        "source_controls_present": bool(controls) and all(controls.values()),
        "required_hosts_observed": not strings(
            host_boot_detection.get("missing_hosts")
        ),
        "required_hosts_reachable": not strings(
            host_boot_detection.get("unreachable_hosts")
        ),
        "service_green": post_reboot_readiness.get("service_green") is True,
        "product_data_green": (
            post_reboot_readiness.get("product_data_green") is True
        ),
        "backup_core_green": post_reboot_readiness.get("backup_core_green") is True,
        "host_188_service_green": (
            post_reboot_readiness.get("host_188_service_green") is True
        ),
        "stockplatform_freshness_ok": stockplatform.get("freshness_status") == "ok",
        # Tuple (not set) membership: an unhashable status value then compares
        # unequal and fails closed instead of raising TypeError.
        "stockplatform_ingestion_ok": (
            stockplatform.get("ingestion_status") in ("ok", "unknown")
        ),
        "fresh_reboot_window_observed": not strings(
            host_boot_detection.get("stale_hosts")
        ),
        "can_claim_slo": (
            payload.get("can_claim_all_services_recovered_within_target") is True
        ),
    }


def enrich_machine_readback(payload: dict[str, Any]) -> dict[str, Any]:
    """Add machine-readable P0-006 readback fields to the source scorecard.

    The keys ``required_checks``, ``readback``, ``rollups`` and ``summary`` are
    written into *payload* in place. All authorization flags in the derived
    sections are hard-coded to ``False``.

    Args:
        payload: Scorecard dict; sections that are missing or not dicts are
            treated as empty.

    Returns:
        The same *payload* object, now carrying the four derived sections.
    """
    active_blockers = strings(payload.get("active_blockers"))
    host_boot_detection = _dict_or_empty(payload.get("host_boot_detection"))
    post_reboot_readiness = _dict_or_empty(payload.get("post_reboot_readiness"))
    stockplatform = _dict_or_empty(payload.get("stockplatform_data_freshness"))
    capacity = _dict_or_empty(payload.get("capacity"))
    latest_verify_metric = _dict_or_empty(payload.get("latest_verify_only_metric"))
    controls = _dict_or_empty(payload.get("source_controls"))

    # required_checks is the single source of truth for every derived boolean
    # below, so readback / rollups / summary cannot disagree with it.
    required_checks = build_required_checks(payload)
    can_claim_slo = required_checks["can_claim_slo"]
    source_controls_present = required_checks["source_controls_present"]

    completed_check_count = sum(1 for value in required_checks.values() if value)
    required_check_count = len(required_checks)
    readiness_percent = percent(
        completed_check_count / max(required_check_count, 1) * 100
    )
    # Consistency rule: a claimable SLO with no active blockers reads as 100%.
    if can_claim_slo and not active_blockers:
        readiness_percent = 100

    blocked_by_fresh_reboot_window_only = active_blockers == [
        "host_boot_observation_older_than_target_window"
    ]
    source_control_ready_count = sum(1 for value in controls.values() if value)
    status = str(payload.get("status") or "unknown")
    active_blocker_count = len(active_blockers)

    rollups = {
        "active_blocker_count": active_blocker_count,
        "readiness_percent": readiness_percent,
        "completed_check_count": completed_check_count,
        "required_check_count": required_check_count,
        "source_control_count": len(controls),
        "source_control_ready_count": source_control_ready_count,
        "source_controls_present": source_controls_present,
        "can_claim_all_services_recovered_within_target": can_claim_slo,
        "observed_host_count": len(
            strings(host_boot_detection.get("observed_hosts"))
        ),
        "missing_host_count": len(strings(host_boot_detection.get("missing_hosts"))),
        "unreachable_host_count": len(
            strings(host_boot_detection.get("unreachable_hosts"))
        ),
        "stale_host_count": len(strings(host_boot_detection.get("stale_hosts"))),
        "unknown_uptime_host_count": len(
            strings(host_boot_detection.get("unknown_uptime_hosts"))
        ),
        "post_start_blocked": int_value(
            post_reboot_readiness.get("post_start_blocked")
        ),
        "service_green": required_checks["service_green"],
        "product_data_green": required_checks["product_data_green"],
        "backup_core_green": required_checks["backup_core_green"],
        "host_188_service_green": required_checks["host_188_service_green"],
        "blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only,
        "latest_verify_only_metric_present": bool(latest_verify_metric),
        "latest_verify_only_metric_ready": int_value(
            latest_verify_metric.get("ready")
        ),
        "latest_verify_only_metric_blocker_count": int_value(
            latest_verify_metric.get("blocker_count")
        ),
        "latest_verify_only_metric_max_host_uptime_seconds": int_value(
            latest_verify_metric.get("max_host_uptime_seconds")
        ),
        "latest_verify_only_metric_last_run_timestamp": int_value(
            latest_verify_metric.get("last_run_timestamp")
        ),
        "stockplatform_freshness_status": str(
            stockplatform.get("freshness_status") or "unknown"
        ),
        "stockplatform_ingestion_status": str(
            stockplatform.get("ingestion_status") or "unknown"
        ),
        "stockplatform_freshness_blocker_count": len(
            strings(stockplatform.get("freshness_blockers"))
        ),
        "stockplatform_ingestion_blocker_count": len(
            strings(stockplatform.get("ingestion_blockers"))
        ),
        "capacity_checked": capacity.get("checked") is True,
        "capacity_free_gib": capacity.get("free_gib"),
        "capacity_min_free_gib": capacity.get("min_free_gib"),
        # Derived from the blocker list rather than recomputed from capacity.
        "capacity_below_minimum": "local_disk_free_below_minimum" in active_blockers,
    }

    readback = {
        "workplan_id": "P0-006",
        "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
        "source_id": "reboot_auto_recovery_slo_scorecard",
        "status": status,
        "target_minutes": int_value(payload.get("target_minutes")),
        "safe_next_step": str(payload.get("safe_next_step") or ""),
        "active_blockers": active_blockers,
        "active_blocker_count": active_blocker_count,
        "readiness_percent": readiness_percent,
        "blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only,
        "required_checks": required_checks,
        "source_controls_present": source_controls_present,
        # This scorecard is read-only evidence; it never grants authorization.
        "runtime_write_authorized_by_this_scorecard": False,
        "host_reboot_authorized_by_this_scorecard": False,
        "workflow_trigger_authorized_by_this_scorecard": False,
        "secret_value_collection_allowed": False,
    }

    summary = {
        "reboot_auto_recovery_status": status,
        "reboot_auto_recovery_workplan_id": "P0-006",
        "reboot_auto_recovery_readiness_percent": readiness_percent,
        "reboot_auto_recovery_active_blocker_count": active_blocker_count,
        "reboot_auto_recovery_can_claim_slo": can_claim_slo,
        "reboot_auto_recovery_service_green": rollups["service_green"],
        "reboot_auto_recovery_product_data_green": rollups["product_data_green"],
        "reboot_auto_recovery_backup_core_green": rollups["backup_core_green"],
        "reboot_auto_recovery_host_188_service_green": rollups[
            "host_188_service_green"
        ],
        "reboot_auto_recovery_observed_host_count": rollups["observed_host_count"],
        "reboot_auto_recovery_stale_host_count": rollups["stale_host_count"],
        "reboot_auto_recovery_stockplatform_freshness_status": rollups[
            "stockplatform_freshness_status"
        ],
        "reboot_auto_recovery_stockplatform_ingestion_status": rollups[
            "stockplatform_ingestion_status"
        ],
        "reboot_auto_recovery_safe_next_step": readback["safe_next_step"],
        "reboot_auto_recovery_source_controls_present": source_controls_present,
        "secret_values_collected": False,
        "github_api_used": False,
        "workflow_trigger_performed": False,
        "runtime_write_authorized": False,
    }

    payload["required_checks"] = required_checks
    payload["readback"] = readback
    payload["rollups"] = rollups
    payload["summary"] = summary
    return payload
host_pressure=host_pressure, ) - return { + payload = { "schema_version": SCHEMA_VERSION, "generated_at": generated_at, "target_minutes": args.target_minutes, @@ -772,6 +972,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: "active_blockers": unique_blockers, "safe_next_step": safe_next_step, } + return enrich_machine_readback(payload) def main() -> int: diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index a9b72fd8..56677984 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -169,6 +169,15 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) - assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1" assert payload["status"] == "slo_ready" assert payload["can_claim_all_services_recovered_within_target"] is True + assert payload["readback"]["workplan_id"] == "P0-006" + assert payload["readback"]["readiness_percent"] == 100 + assert payload["readback"]["active_blocker_count"] == 0 + assert payload["readback"]["runtime_write_authorized_by_this_scorecard"] is False + assert payload["rollups"]["source_controls_present"] is True + assert payload["rollups"]["readiness_percent"] == 100 + assert payload["summary"]["reboot_auto_recovery_workplan_id"] == "P0-006" + assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is True + assert payload["summary"]["runtime_write_authorized"] is False assert payload["source_controls"][ "host_110_startup_controlled_drain_guarded_autostart_source_present" ] is True @@ -186,6 +195,11 @@ def test_missing_probe_fails_closed(tmp_path: Path) -> None: assert payload["can_claim_all_services_recovered_within_target"] is False assert "all_host_reboot_detection_missing" in payload["active_blockers"] assert "host_boot_probe_missing_hosts" in 
payload["active_blockers"] + assert payload["readback"]["active_blocker_count"] == len( + payload["active_blockers"] + ) + assert payload["rollups"]["readiness_percent"] < 100 + assert payload["summary"]["reboot_auto_recovery_can_claim_slo"] is False def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> None: