diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index ea7e4071..599d1356 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,16 @@ +## 2026-06-29 — 10:56 credential escrow reviewer acceptance deadlock fix + +**完成內容**: +- `post-reboot-credential-escrow-intake-scorecard.py` 修正 preflight ready 判定:當 redacted non-secret evidence refs 已通過 preflight,即使 marker 仍 missing,也進入 `ready_for_independent_reviewer_acceptance`,下一步才是 marker dry-run。 +- 保留 runtime gate、secret collection、credential marker write authorization 全部 `0`;不要求先寫 marker 才能進 reviewer acceptance,避免流程死鎖。 + +**驗證結果**: +- `python3.11 -m pytest scripts/reboot-recovery/tests/test_post_reboot_credential_escrow_intake_scorecard.py scripts/reboot-recovery/tests/test_post_reboot_owner_response_template.py scripts/reboot-recovery/tests/test_post_start_smoke_process_classifier.py scripts/reboot-recovery/tests/test_momo_source_arrival_gate.py ops/runner/test_verify_awoooi_non110_cd_closure.py -q`:`18 passed`。 +- `python3.11 -m py_compile scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py scripts/reboot-recovery/post-reboot-owner-response-template.py scripts/reboot-recovery/post-reboot-owner-response-preflight.py`:通過。 +- `git diff --check`:通過。 + +**邊界**:未讀 password / token / `.runner` / raw session / SQLite / auth / `.env`,未寫 credential marker,未操作 host / Docker / K8s / runner service,未使用 GitHub。 + ## 2026-06-29 — 10:44 non-110 CD closure verifier snapshot fallback **完成內容**: diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index d224785e..a8f33be0 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.80 +> Version: v1.81 > Last updated: 2026-06-29 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -16,7 +16,7 @@ v1.76 owner gate replay rule:同一輪 summary 產生後,owner packet 與 ow v1.79 active owner response template rule:同一輪 owner packet 產生後,placeholder response 必須由 `scripts/reboot-recovery/post-reboot-owner-response-template.py --owner-packet-file ` 生成,讓 `responses[].gate_id` 等於 active `owner_packets[].packet_id`。目前 2026-06-29 09:13 readback 只剩 `credential_escrow_evidence`,因此 generated template 不得帶入 `wazuh_manager_registry_export`。placeholder template 必須被 preflight 擋在 `blocked_waiting_owner_response_content`、`received=0`、`accepted=0`、`runtime_gate=0`;它是 no-secret intake aid,不是 owner accepted 或 marker-write 授權。 -v1.80 credential escrow intake scorecard rule:同一輪 owner response preflight 後,必須用 `scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py --summary-file "$ARTIFACT_DIR/summary.txt" --owner-packet-file --response-file --offsite-report-file --escrow-status-file ` 收斂 DR escrow gate。scorecard 只讀 sanitized artifacts;不得讀 secret value、不得寫 marker、不得送 owner request、不得開 runtime gate。2026-06-29 09:36 readback 期望 `STATUS=blocked_waiting_non_secret_credential_escrow_evidence`、`EFFECTIVE_ESCROW_MISSING_COUNT=5`、`OWNER_RESPONSE_RECEIVED_COUNT=0`、`OWNER_RESPONSE_ACCEPTED_COUNT=0`、`RUNTIME_GATE_COUNT=0`、`CREDENTIAL_MARKER_WRITE_AUTHORIZED_COUNT=0`。 +v1.80 / v1.81 credential escrow intake scorecard rule:同一輪 owner response preflight 後,必須用 `scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py --summary-file "$ARTIFACT_DIR/summary.txt" --owner-packet-file --response-file --offsite-report-file --escrow-status-file ` 收斂 DR escrow gate。scorecard 只讀 sanitized artifacts;不得讀 secret value、不得寫 marker、不得送 owner request、不得開 runtime gate。placeholder readback 期望 `STATUS=blocked_waiting_non_secret_credential_escrow_evidence`、`EFFECTIVE_ESCROW_MISSING_COUNT=5`、`OWNER_RESPONSE_RECEIVED_COUNT=0`、`OWNER_RESPONSE_ACCEPTED_COUNT=0`、`RUNTIME_GATE_COUNT=0`、`CREDENTIAL_MARKER_WRITE_AUTHORIZED_COUNT=0`。若未來收到合格 redacted owner response 並由 preflight 回 `ready_for_independent_reviewer_acceptance`,scorecard 應轉為 `STATUS=ready_for_independent_reviewer_acceptance`;即使 marker 尚未寫入,也只能進 `independent_reviewer_acceptance_then_marker_dry_run`,不得直接寫 marker 或宣稱 `DR_COMPLETE`。 2026-06-29 09:13 latest live summary:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260629-091918/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_SERVICE_WARNINGS=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=6`、`RUNTIME_ACTION_AUTHORIZED=0`、`NEXT_REQUIRED_GATES=credential_escrow_evidence`。目前仍不可宣稱 `DR_COMPLETE`,因為 `ESCROW_MISSING_COUNT=5`;owner packet contract guard 期望 `gates=1`。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index 272664d3..fd674697 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -15,7 +15,7 @@ | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-26 07:19 readback shows 120 and 121 reachable, K3s active, `mon` and `mon1` both `Ready control-plane`, AWOOOI API/Web replicas split across both nodes, ArgoCD `awoooi-prod Synced / Healthy` at revision `1fd5e2a8b0f18d24eed16aa2a44286bcbf230603`, and `km-vectorize` official 03:00 台北時間 run succeeded with `lastSuccess=2026-06-25T19:00:14Z`. | | P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 98% | 2026-06-27 00:56 backup readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `configured_missing_188=0`, `escrow_missing=5`, last aggregate `2026-06-26 02:31:02`。188 MOMO backup crontab drift 已修復並保留 rollback crontab。DR remains blocked on real non-secret credential escrow evidence IDs; do not write placeholder markers or paste secret values. | | P2 service / data truth | DONE | 100% | Public routes 與 service health 為綠燈,MOMO health `V10.719`,current-month parity 為 `15383|15383|2026-06-01|2026-06-24|2026-06-01|2026-06-24`。StockPlatform `/api/v1/system/freshness` 為 `ok`,latest trading date `2026-06-26`,blockers `none`;先前 Stock EOD blocker 已由官方來源與正式 cron 自然收斂。 | -| P3 docs / automation contracts | DONE_WITH_BACKUP_CORE_RECOVERY_V180 | 100% | Workplan, SOP v1.80, post-reboot declaration guard, machine-readable post-reboot readiness summary with Wazuh registry detail fields and auto-persisted `summary.txt`, post-reboot next-gate dispatch checklist, owner-packet JSON generator, dynamic owner-packet contract guard, post-reboot owner response preflight, active-gate owner response template generator, credential escrow intake scorecard, one-page post-start quick check v1.18, route retry gate, delegated cold-start public-route / AWOOOI API warmup classifier, backup-status core-blocker readback, PyYAML-optional recovery-scorecard contract check, 188 MOMO backup crontab host-owned rollback evidence, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, 188 PostgreSQL runtime-ready source-of-truth, 188 ACME route/timer hygiene, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Declaration guard now machine-checks allowed / forbidden recovery statements from the same `summary.txt`: service/data/backup/188 host hygiene green may be declared when live summary says so, while `DR_COMPLETE`、`WAZUH_REGISTRY_RECOVERED` and `RUNTIME_ACTION_AUTHORIZED` remain forbidden until evidence gates close. | +| P3 docs / automation contracts | DONE_WITH_BACKUP_CORE_RECOVERY_V181 | 100% | Workplan, SOP v1.81, post-reboot declaration guard, machine-readable post-reboot readiness summary with Wazuh registry detail fields and auto-persisted `summary.txt`, post-reboot next-gate dispatch checklist, owner-packet JSON generator, dynamic owner-packet contract guard, post-reboot owner response preflight, active-gate owner response template generator, credential escrow intake scorecard, one-page post-start quick check v1.18, route retry gate, delegated cold-start public-route / AWOOOI API warmup classifier, backup-status core-blocker readback, PyYAML-optional recovery-scorecard contract check, 188 MOMO backup crontab host-owned rollback evidence, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, 188 PostgreSQL runtime-ready source-of-truth, 188 ACME route/timer hygiene, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Declaration guard now machine-checks allowed / forbidden recovery statements from the same `summary.txt`: service/data/backup/188 host hygiene green may be declared when live summary says so, while `DR_COMPLETE`、`WAZUH_REGISTRY_RECOVERED` and `RUNTIME_ACTION_AUTHORIZED` remain forbidden until evidence gates close. | 2026-06-26 12:13 machine-readable summary baseline supersedes the 07:47 / 08:59 gate set: `scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` stores delegated logs under `/tmp/awoooi-post-reboot-readiness-20260626-121303` and returns `SERVICE_GREEN=1`, `PRODUCT_DATA_GREEN=1`, `BACKUP_CORE_GREEN=1`, `DR_ESCROW_BLOCKED=1`, `ESCROW_MISSING_COUNT=5`, `HOST_188_SERVICE_GREEN=1`, `HOST_188_HYGIENE_BLOCKED=0`, `HOST_188_CHECK_RC=0`, `HOST_188_RESULT=HOST_188_HYGIENE_GREEN.`, `WAZUH_ROUTE_CODE=200`, `WAZUH_TRANSPORT_COUNT=6`, `WAZUH_COVERAGE_SCOPE=6`, `WAZUH_DIRECT_ACTIVE=2`, `WAZUH_NO_TRANSPORT=1`, `WAZUH_SSH_BLOCKED=3`, `WAZUH_DASHBOARD_API_CONNECTION=pending_or_spinning`, `WAZUH_DASHBOARD_INDEX_OK=3`, `WAZUH_MANAGER_REGISTRY_ACCEPTED=0`, `WAZUH_RUNTIME_GATE=0`, `RUNTIME_ACTION_AUTHORIZED=0`, `OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`, and `NEXT_REQUIRED_GATES=credential_escrow_evidence,wazuh_manager_registry_export`. This is now the preferred first operator/AI-agent entrypoint after reboot because it separates service health from DR and security registry evidence; 188 host hygiene is no longer a next gate unless the live checklist regresses. diff --git a/scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py b/scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py index 85821e60..c7918bfd 100755 --- a/scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py +++ b/scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py @@ -310,9 +310,12 @@ def evaluate(args: argparse.Namespace) -> dict[str, Any]: elif unexpected_packet_gates or unexpected_response_gates: status = "blocked_owner_packet_or_response_gate_mismatch" next_step = "regenerate_owner_packet_and_response_template_from_same_summary" - elif preflight_status == "ready_for_independent_reviewer_acceptance" and effective_missing == 0: + elif preflight_status == "ready_for_independent_reviewer_acceptance": status = "ready_for_independent_reviewer_acceptance" - next_step = "independent_reviewer_acceptance_then_marker_dry_run" + if effective_missing == 0: + next_step = "rerun_post_reboot_summary_to_close_dr_gate" + else: + next_step = "independent_reviewer_acceptance_then_marker_dry_run" else: status = "blocked_waiting_non_secret_credential_escrow_evidence" next_step = "collect_redacted_non_secret_evidence_refs_then_rerun_preflight" diff --git a/scripts/reboot-recovery/tests/test_post_reboot_credential_escrow_intake_scorecard.py b/scripts/reboot-recovery/tests/test_post_reboot_credential_escrow_intake_scorecard.py index 784d0204..9bc45863 100644 --- a/scripts/reboot-recovery/tests/test_post_reboot_credential_escrow_intake_scorecard.py +++ b/scripts/reboot-recovery/tests/test_post_reboot_credential_escrow_intake_scorecard.py @@ -71,6 +71,34 @@ def generate_template(packet_path: Path, tmp_path: Path) -> Path: return response_path +def generate_valid_redacted_response(packet_path: Path, tmp_path: Path) -> Path: + response_path = generate_template(packet_path, tmp_path) + response = json.loads(response_path.read_text(encoding="utf-8")) + item = response["responses"][0] + item.update( + { + "owner_role": "backup_dr_owner", + "owner_team": "platform_security", + "decision": "accepted", + "decision_reason": "redacted_non_secret_evidence_refs_supplied", + "redacted_evidence_refs": ["escrow-review-ticket-20260629-001"], + "followup_owner": "backup_dr_owner", + } + ) + for index, escrow_item in enumerate(item["escrow_items"], start=1): + escrow_item.update( + { + "non_secret_evidence_ref": f"escrow-evidence-ref-20260629-{index:03d}", + "recovery_owner": "backup_dr_owner", + "reviewer": "security_reviewer", + "last_reviewed_at": "2026-06-29", + "contains_secret_value": False, + } + ) + response_path.write_text(json.dumps(response, indent=2) + "\n", encoding="utf-8") + return response_path + + def run_scorecard( summary_path: Path, packet_path: Path, @@ -187,6 +215,34 @@ def test_scorecard_blocks_forbidden_runtime_or_marker_requests(tmp_path: Path) - assert scorecard["runtime_gate_count"] == 0 +def test_valid_redacted_response_reaches_independent_reviewer_even_before_markers(tmp_path: Path) -> None: + summary_path, offsite_path, escrow_status_path = write_common_artifacts(tmp_path) + packet_path = write_packet(tmp_path, ["credential_escrow_evidence"]) + response_path = generate_valid_redacted_response(packet_path, tmp_path) + + scorecard = run_scorecard( + summary_path, + packet_path, + response_path, + offsite_path, + escrow_status_path, + ) + + assert scorecard["status"] == "ready_for_independent_reviewer_acceptance" + assert scorecard["next_step"] == "independent_reviewer_acceptance_then_marker_dry_run" + assert scorecard["preflight_status"] == "ready_for_independent_reviewer_acceptance" + assert scorecard["effective_escrow_missing_count"] == 5 + assert scorecard["owner_response_received_count"] == 1 + assert scorecard["owner_response_accepted_count"] == 1 + assert scorecard["runtime_gate_count"] == 0 + assert scorecard["runtime_action_authorized"] == 0 + assert scorecard["host_write_authorized"] == 0 + assert scorecard["secret_value_collection_allowed"] == 0 + assert scorecard["credential_marker_write_requested_count"] == 0 + assert scorecard["credential_marker_write_authorized_count"] == 0 + assert scorecard["forbidden_true_field_count"] == 0 + + def test_scorecard_rejects_stale_extra_wazuh_response_gate(tmp_path: Path) -> None: summary_path, offsite_path, escrow_status_path = write_common_artifacts(tmp_path) packet_path = write_packet(tmp_path, ["credential_escrow_evidence"])