diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index 8695ef765..79d9a7694 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -664,20 +664,20 @@ _COMMANDER_INSERTED_REQUIREMENT_WORK_ITEMS: list[dict[str, Any]] = [ "id": "CIR-P0-GIT-001", "priority": "P0", "order": 35, - "status": "in_progress", + "status": "done", "lane": "gitea_backup_restore", "request": "Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?", "normalized_work_item": "Gitea repository identity、backup proof、restore drill 需比對 SSH heads、repo path、bundle backup、restore sample。", "current_state": ( - "已有 Gitea bundle readback API / sample restore dry-run verifier;" - "live 188 metrics 顯示 9 expected repos present/ok、missing=0、" - "failed=0、checksum_missing=0;當前 blocker 是 bundle freshness " - "stale 與 restore dry-run metric missing,不是 repo missing。" + "2026-07-02 production Gitea bundle readback 已 ready:" + "9 expected repos present/ok、missing=0、failed=0、checksum_missing=0、" + "bundle_fresh=true、all_expected_ok=true、sample_restore_dry_run_ok=true。" + "本項關閉 repo bundle / restore dry-run 層;Gitea DB/settings/issues/packages/LFS " + "全量 dump 仍歸 backup contract 另列追蹤。" ), "acceptance": "Gitea repo bundle backup readback 與 sample restore dry-run verifier 可讀回;禁止刪 repo / 改 visibility。", "next_action": ( - "補跑 bundle backup / 188 sync / sample restore dry-run textfile metric," - "再讀 production gitea-repo-bundle-backup-readback。" + "維持每日 bundle backup + restore dry-run monitoring;另補 Gitea full dump / DB / issues / packages / LFS 備份 readback。" ), "mapped_workplan_id": "P0-003", }, diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b4bfc5873..ab003e3cf 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -53448,3 +53448,20 @@ production browser smoke: **下一步**: - commit / push 到 Gitea main 並讀回 CD;deploy 後讀 production `/api/v1/agents/gitea-repo-bundle-backup-readback`。接續 P0 action 是補跑 bundle backup / 188 sync / sample restore dry-run textfile metric,讓 `bundle_fresh=true`、`all_expected_ok=true`、`sample_restore_dry_run_ok=true`。 + +## 2026-07-02 — P0 Gitea bundle backup runtime closure + +**完成內容**: +- `2f3db555e fix(gitea): expose bundle backup restore readback` 已推 Gitea main;CD `#4465` success,tests 讀回 `621 passed`、`AWOOOI_CD_TEST_PROFILE=controlled-runtime`、B5 skipped,deploy marker `7517b130d chore(cd): deploy 2f3db55 [skip ci]`。 +- production `/api/v1/agents/gitea-repo-bundle-backup-readback` 已先讀回 blocker,接著在 188 補 runtime evidence:本機使用既有 Gitea SSH transport 產生 9 個 repo bundle + `.sha256` + manifest,scp/rsync 到 188 `/home/ollama/backup/110/gitea/git-bundles/20260702-2330-codex-private-complete`,更新 `latest-private-complete`。 +- 188 `backup-health-textfile-exporter` 已刷新;node-exporter 讀回 `awoooi_gitea_bundle_fresh=1`、`awoooi_gitea_bundle_all_expected_ok=1`、`expected_repo_missing_count=0`、`failed_repo_count=0`、`checksum_missing_count=0`、`sample_restore_dry_run_ok=1`。 +- production API 最終讀回 `status=gitea_repo_bundle_backup_readback_ready`、`ready=true`、`active_blockers=[]`、`bundle_age_seconds=42`、`repo_row_count=9`、`sample_restore_dry_run_ok=true`。 +- priority work-order `CIR-P0-GIT-001` 更新為 repo bundle / restore dry-run 層 done;Gitea DB / settings / issues / packages / LFS full dump 仍歸 backup contract 下一層追蹤。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 private key 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune,沒有 restore 到 production,沒有刪 repo 或改 visibility。 + +**下一步**: +- 轉回 P0 reboot SLO 主線:99 Windows / VMware autostart readback、111 reachability、backup full-dump coverage、service green 與 Wazuh degraded 依 scorecard blockers 順序收斂。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index 30eccf4b8..4d4b57a1e 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -20,7 +20,7 @@ | P0-3 | BLOCKED | StockPlatform data freshness | 23:27 public route / health 為 200,但 `/api/v1/system/freshness` 與 `/api/v1/system/ingestion` 回 `status=not_configured`、`blockers=["postgres_not_ready"]`、`latest_trading_date=null`;網站可達不代表資料最新。 | 恢復 110 control path 後,read-only 查 `/home/wooo/stockplatform-v2` compose / DB schema / migration status;禁止 fake freshness、manual DB rows、restore/prune。 | | P0-4 | BLOCKED | AWOOOI production 版本最新性 | Gitea SSH `main` 最新已到 `68084470 feat(agent): expose deploy marker receipt input`,且包含 `3de828f97 fix(agent): surface harbor cd retry receipt blocker`。Public Gitea queue 23:27 讀到 latest CD `#4117 Running` for `68084470`;production 尚不能證明已部署最新 source。 | 補 deploy marker / runtime SHA / endpoint readback 一致;Harbor `/v2` 恢復前 CD 無法把最新 source 發到 production,未一致前不可宣稱 AWOOOI 最新。 | | P0-5 | BLOCKED_SOURCE_ADVANCED | 110 control path / Harbor registry `/v2` | 23:27 queue readback 回 `status=blocked_harbor_110_repair_no_matching_runner`;Harbor repair workflow `#4115` 仍 `Waiting`,no-matching label `awoooi-host`,jobs API 仍 stale/mismatched。Live route probe 同步回 public registry `/v2=502`、internal `192.168.0.110:5000/v2=502`、Harbor health `502`、SignOz `502`。Harbor receipt validator 對 live queue 回 `status=blocked_waiting_harbor_controlled_recovery_receipt`,active blockers 含 `gitea_queue_harbor_110_repair_no_matching_runner`、`gitea_queue_harbor_110_repair_jobs_stale_or_mismatched`、`public_registry_v2_verifier_not_green`、`internal_registry_v2_verifier_not_green`。23:27 cold-start 同步證明 110 registry `/v2` blocked、110 SSH read-only check blocked、K3s pull refused by `110:5000`。23:30 source 修正讓 `awoooi-startup-110.sh` 預設 guard-on 啟動 controlled drain lane;legacy runner / generic label 仍 fail-closed。 | 部署 / 讀回 110 startup source 後,確認 `awoooi-cd-lane-drain.service` 只在 guardrails 通過時 active,讓 110-local repair workflow 或 110 console/local script 真正執行 `recover-110-control-path-and-harbor-local.sh --check` / `--apply-all`,並讀回 public/internal `/v2` 為 `200/401`。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 | -| P0-6 | BLOCKED_BACKUP_COMPLETENESS | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 public repo,不代表 private/internal repo 消失;internal refs / bundle / restore confidence 必須分層讀回。2026-07-02 live 188 metrics 顯示 `expected_repo_count=9`、`expected_repo_missing_count=0`、`failed_repo_count=0`、`checksum_missing_count=0`、9 個 repo `repo_ok=1`;目前 blocker 不是 repo missing,而是 `awoooi_gitea_bundle_fresh=0`、`all_expected_ok=0`、缺 `awoooi_gitea_bundle_sample_restore_dry_run_ok`。已新增 `/api/v1/agents/gitea-repo-bundle-backup-readback` 與 `scripts/backup/gitea-bundle-sample-restore-dry-run.sh`,讓 repo present / checksum / freshness / restore dry-run 分開判讀。 | 補跑 Gitea bundle backup 並同步到 188 後,執行 sample restore dry-run 寫入 textfile metric;再讀 production API readback 到 `bundle_fresh=true`、`all_expected_ok=true`、`sample_restore_dry_run_ok=true`。不得刪 repo、改 visibility、讀 token、restore 到 production 或用 public search 當完整備份證據。 | +| P0-6 | PARTIAL_GREEN_REPO_BUNDLE_BACKUP | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 public repo,不代表 private/internal repo 消失;internal refs / bundle / restore confidence 必須分層讀回。2026-07-02 production `/api/v1/agents/gitea-repo-bundle-backup-readback` 已讀回 `gitea_repo_bundle_backup_readback_ready`:`expected_repo_count=9`、`repo_row_count=9`、`expected_repo_missing_count=0`、`failed_repo_count=0`、`checksum_missing_count=0`、`bundle_fresh=true`、`all_expected_ok=true`、`sample_restore_dry_run_ok=true`;repo bundle / restore dry-run 層已綠,不是 repo missing。 | 維持每日 bundle backup + restore dry-run monitoring;下一層補 Gitea full dump / DB / settings / issues / packages / LFS 備份 readback。不得刪 repo、改 visibility、讀 token、restore 到 production 或用 public search 當完整備份證據。 | | P0-7 | SOURCE_READY_RUNTIME_BLOCKED | 99 VMware / VM autostart | repo 已有 `windows99-vmware-autostart.ps1`;22:05 host probe 讀到 99 ping reachable 但 `boot_id=reachable_unknown_boot` / uptime unknown,111 不可達,112/120/121/188 可讀,188 startup unit failed/degraded。先前只讀 readback 顯示 99 RDP 3389 / SSH 22 可達、WinRM 5985 fail,`administrator@192.168.0.99` SSH publickey denied。 | 恢復 99 可控通道或由 console 套用腳本;完成後讀回 111/188/120/121/112 boot evidence,要求 all-host required observed/reachable 且 99 不再是 unknown uptime。 | | P0-8 | SOURCE_READY_RUNTIME_BLOCKED | 502 maintenance fallback / Telegram / backup alert | L0/L1 fallback runbook、Nginx snippet、reboot / backup alert rules 已在 source;runtime 尚需部署與外部 L1 provider readback。 | L0 以測試 vhost 驗證 `X-AWOOOI-Fallback`;L1 需外部雲端/CDN probe;Telegram 以脫敏 alert receipt 驗證。 | diff --git a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md index 05b3facc3..1a592f152 100644 --- a/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md +++ b/docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md @@ -66,7 +66,7 @@ | 7 | CIR-P0-RBT-007 | P0 | 「所有備份包含主機、DB、網站、服務、套件、工具、日誌都沒有監控告警」 | Backup observability coverage:backup job inventory、last success、freshness、offsite、restore drill、Telegram receipt | 部分已有 backup health exporter / alert rules;全域 coverage 與 restore drill 未全綠 | 建立 backup coverage matrix:host / DB / website / service config / package list / tool scripts / logs,每列有 metric、alert、last_success、restore_verifier | | 8 | CIR-P0-RBT-008 | P0 | 「每次重啟排查都不一樣,也不知道多久恢復,不符合 SLA」 | 固定化 reboot runbook:fixed triage order、ETA、active blocker、remaining seconds、owner lane、next command | Production scorecard readback 已固定 `status=blocked_reboot_auto_recovery_slo_not_ready`、readiness `47%`、active blockers `11`;本輪再補 per-blocker Prometheus metric / Telegram alert 投影,避免只剩總 blocker_count;`next_safe_action=restore_windows99_no_secret_management_channel_or_collect_local_console_verify_readback_then_rerun_reboot_scorecard_no_reboot` | 優先收斂 99 no-secret Verify / 111 reachability / backup_core / service_green / Wazuh degraded;不得用不同排查路徑繞過 scorecard | | 9 | CIR-P0-RBT-009 | P0 | 「所有產品、網站都要是最新版本;版本和數據是否最新要驗證」 | Product freshness/version matrix:source commit、deploy marker、runtime image、public health、data freshness、latest source availability | AWOOOI Gitea main 已到 deploy marker `c68b74686`,production source readback `5d5bc86fed` verified;StockPlatform public freshness / ingestion 讀回 `ok`,latest trading date `2026-07-02`,core price/chips/margin/AI recommendations 都是 `2026-07-02` | 建立全產品 readback 表:product、canonical repo、main SHA、deploy marker、public URL、data freshness、blocked reason | -| 10 | CIR-P0-GIT-001 | P0 | 「Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?」 | Gitea repository identity + backup proof + restore drill:不能只看 UI visible,要比對 SSH heads、repo path、bundle backup、restore sample | 2026-07-02 live 188 metrics 顯示 9 expected repos present/ok、missing=0、failed=0、checksum_missing=0;新增 `/api/v1/agents/gitea-repo-bundle-backup-readback`、`gitea-bundle-sample-restore-dry-run.sh` 與 `GiteaPrivateBundleRestoreDryRunMissing` alert。當前 blocker 是 bundle freshness stale + restore dry-run metric missing,不是 repo missing | 補跑 Gitea repo bundle backup / 188 sync / sample restore dry-run metric;禁止刪 repo / 改 visibility / 讀 token / restore 到 production | +| 10 | CIR-P0-GIT-001 | P0 | 「Gitea 儲存庫都不見了?Gitea 沒完整備份嗎?」 | Gitea repository identity + backup proof + restore drill:不能只看 UI visible,要比對 SSH heads、repo path、bundle backup、restore sample | 2026-07-02 production `/api/v1/agents/gitea-repo-bundle-backup-readback` 已 ready:9 expected repos present/ok、missing=0、failed=0、checksum_missing=0、bundle_fresh=true、all_expected_ok=true、sample_restore_dry_run_ok=true;repo bundle / restore dry-run 層已關閉,不是 repo missing。 | 維持每日 bundle backup + restore dry-run monitoring;另補 Gitea full dump / DB / settings / issues / packages / LFS 備份 readback。禁止刪 repo / 改 visibility / 讀 token / restore 到 production | | 11 | CIR-P0-CPU-001 | P0 | 「110 / 188 CPU 負載持續過高,為什麼沒監控告警、沒主動修復」 | Sustained CPU pressure automation:Alertmanager → controller → evidence → service playbook → verifier → KM writeback | 110 已有 `Host110SustainedModeratePressure`、Gitea playbook、Stock/Postgres evidence;188 仍需同級 controller/alerts readback | 下一步接 `postgres_hot_query_or_backup_export_playbook`;並補 188 equivalent readback,不以單次下降結案 | | 12 | CIR-P0-CPU-002 | P0 | 「噪音會影響真問題,要整合一起做」 | Alert noise / real issue correlation:backup aggregate noise、CPU pressure、Gitea queue、Stock freshness 要分清主因與次因 | 部分已在 SOP 註記;仍需統一 correlation scorecard | 建立 incident correlation readback:primary_blocker、secondary_noise、ignored_noise_reason、evidence_ref | | 13 | CIR-P0-CD-001 | P0 | 「所有專案都不能推版 / 要看到實作結果」 | Gitea-only CD baseline:每次 main push 要有 visible run、deploy marker、production readback;GitHub 不作解法 | AWOOOI 最新 main 可推,CD success/deploy marker 已多次證明;全產品未全綠 | 將 product governance matrix 接入各產品 Gitea CD readiness,不再只報 AWOOOI |