From 95f442adab4f061cb99d039d6368d56ceb6469ad Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Wed, 24 Jun 2026 06:37:44 +0800
Subject: [PATCH] fix(ops): harden 188 backup exporter recovery [skip ci]

---
 docs/LOGBOOK.md                               | 43 ++++++++++
 docs/runbooks/BACKUP-STATUS.md                | 43 ++++++++++
 docs/runbooks/FULL-STACK-COLD-START-SOP.md    | 74 +++++++++++++++---
 ...oot-cold-start-backup-recovery-workplan.md | 11 +--
 ops/monitoring/docker-compose.exporters.yaml  | 10 +--
 scripts/ops/188-db-exporters-restore.sh       | 78 +++++++++++++++++++
 scripts/ops/188-minio-velero-restore.sh       | 76 ++++++++++++++++++
 7 files changed, 315 insertions(+), 20 deletions(-)
 create mode 100755 scripts/ops/188-db-exporters-restore.sh
 create mode 100755 scripts/ops/188-minio-velero-restore.sh

diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 9d40be9f..8fd8d4dc 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,46 @@
+## 2026-06-24｜188 MinIO / Velero、DB exporter 與 110 磁碟壓力恢復
+
+**背景**：02:44 cold-start 已證明主機、K3s、public routes 與 MOMO DB parity 多數恢復，但後續 Alertmanager 仍暴露多個真實紅燈：188 PostgreSQL / Redis exporters down、110 disk pressure、Velero backup freshness 過期。這些都不能靠消音處理，必須恢復監控與備份鏈路。
+
+**188 exporter 修復**：
+- live `/home/ollama/monitoring/docker-compose.exporters.yaml` 原先使用 `network_mode: host`，在 Docker user namespace 下無法啟動；Redis live port 也應走 `6380`。
+- 已改為 bridge port mapping，PostgreSQL DSN 從 `/home/ollama/monitoring/.env.exporters` 注入，Redis 預設 `192.168.0.188:6380`，不在 repo 放任何密碼。
+- live helper `/home/ollama/bin/188-db-exporters-restore.sh` 已部署並驗證：`pg_up=1`、`redis_up=1`；Prometheus `up{job="postgres-exporter"}=1`、`pg_up=1`、`up{job="redis-exporter"}=1`、`redis_up=1`。
+- `PostgreSQLDown` / `RedisDown` 告警已解除。
+
+**110 磁碟壓力修復**：
+- 110 `/` 從 `92%` 使用率降到 `73%`。只做 Docker image / build cache 類安全清理；沒有執行 volume prune，避免刪除 stateful data。
+- `HostDiskUsageHigh`、`HostOutOfDiskSpace`、`HostDiskUsageCritical` 已解除。
+
+**188 MinIO / Velero 備份鏈路修復**：
+- `VeleroBackupNotRun` 根因是 188 `192.168.0.188:9000` MinIO endpoint connection refused；Velero `BackupStorageLocation default` 因 S3 list 失敗不可用。
+- 188 MinIO compose source 位於 `/home/ollama/minio/docker-compose.yml`；container 起不來的根因是 user namespace 對 root-owned data 目錄無寫入權限。
+- live 補 `docker-compose.override.yml` 使用 `userns_mode: host` 讓 MinIO 先恢復服務；這是恢復用 source-of-truth debt，後續應正式整理 data ownership 或 compose baseline。
+- MinIO health `/minio/health/live` 已通過，120 上 Velero `BackupStorageLocation/default` phase 回 `Available`。
+- 建立一次性備份 `reboot-recovery-202606240456`，phase `Completed`，start `2026-06-23T21:22:32Z`，complete `2026-06-23T21:22:51Z`（皆為 UTC；即 Asia/Taipei 2026-06-24 05:22:32 至 05:22:51）。
+- 110 `backup-health-textfile-exporter.py` 已刷新：`awoooi_velero_monitor_up=1`、`awoooi_velero_latest_completed_backup_fresh=1`、restore-test cron present、last_success_fresh `1`、failed_jobs `0`。
+- Prometheus `ALERTS{alertname="VeleroBackupNotRun",alertstate="firing"}` 回空集合，告警已解除。
+
+**SOP / automation 補強**：
+- 新增 `scripts/ops/188-db-exporters-restore.sh`，作為 188 PostgreSQL / Redis exporter 重啟後恢復 helper。
+- 新增 `scripts/ops/188-minio-velero-restore.sh`，作為 188 MinIO / 120 Velero storage / one-off backup / 110 backup-health refresh 的分段 helper；預設只恢復 MinIO 與檢查 BSL，必須明確設定 `CREATE_VELERO_BACKUP=true` 才會建立一次性備份。
+- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 升級到 v1.30，加入 06:35 readback、MinIO/Velero 恢復流程、DB exporter 恢復流程與 110 Docker disk cleanup 邊界。
+- `docs/runbooks/BACKUP-STATUS.md` 與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md` 已同步最新備份鏈路狀態。
+
+**06:35 Live 判定**：
+- `full-stack-cold-start-check.sh --monitor-read-only --no-color --watch --interval 1 --max-attempts 1`：`PASS=86 WARN=0 BLOCKED=1`。
+- 110 / 120 / 121 / 188 reachable；K3s `mon` / `mon1` Ready；public routes / TLS OK；110 / 188 backup health fresh；188 node-exporter、PostgreSQL exporter、Redis exporter、MinIO / Velero BSL 均恢復。
+- 110 `/` disk usage 維持 `73%`；188 exporter local readback `pg_up=1`、`redis_up=1`、MinIO health `ok`；120 Velero `BackupStorageLocation/default` phase `Available`。
+- 目前不能宣稱 full-stack green，因唯一 service blocker 仍是 `MOMO_DAILY_FRESHNESS 6|2026-06-17`。
+- DR 仍不能宣稱完成：credential escrow evidence missing 維持 `5`，不得偽造 marker，也不得把任何 secret value 放進 repo 或聊天。
+
+**剩餘告警分類**：
+- `ColdStartRecoveryBlocked`：正確反映 MOMO business data source stale，不可消音。
+- `BackupCredentialEscrowEvidenceMissing` x5：正確反映 DR escrow 缺口，不可偽造或消音。
+- `DockerContainerMemoryLimitPressure` / `DockerContainerMissingResourceLimit`：屬於下一階段 resource tuning / owner evidence 工作，不是本輪重啟恢復 blocker。
+- `ColdStartLastGreenTooOld`：因 MOMO data freshness blocker 讓 full green 不能更新，屬於正確 warning。
+- 正常 healthy heartbeat 已由 `a84a5a0b` 降噪，不應再每 30 分鐘洗 Telegram；warning / failure / recovery 類事件仍必須告警。
+
 ## 2026-06-24｜MOMO Google Drive 權限修復與資料新鮮度 Gate 補強
 
 **背景**：`mo.wooo.work` / `/health` 已恢復 200，MOMO containers 也 healthy，但頁面資料仍停在舊版本。只看 route 200 / container healthy 會誤判，因此重新查 DB、import jobs、Google Drive 來源與 scheduler logs。
diff --git a/docs/runbooks/BACKUP-STATUS.md b/docs/runbooks/BACKUP-STATUS.md
index a8a2f13f..c42612c1 100644
--- a/docs/runbooks/BACKUP-STATUS.md
+++ b/docs/runbooks/BACKUP-STATUS.md
@@ -9,6 +9,49 @@
 > 2026-06-13 Codex post-CD refresh: backup/offsite/alert contracts remain green after deploy marker `e4a349bc`; global SSH trust guardrail held; DR still blocked only by credential escrow evidence.
 > 2026-06-13 Codex escrow refresh: 13:10 live report confirms offsite/rclone/script readiness is green and only five non-secret credential escrow evidence markers remain missing.
 > 2026-06-18 Codex cold-start refresh: full-stack service readiness is green after stale failed Job classification; backup core remains green; DR still blocked only by five credential escrow evidence markers.
+> 2026-06-24 Codex Velero/exporter refresh: 188 MinIO / Velero backup freshness, 188 PostgreSQL / Redis exporters, 188 node-exporter, and 110 disk pressure are recovered; DR still blocked only by five credential escrow evidence markers and service full-green is blocked by MOMO data freshness.
+
+---
+
+## 2026-06-24 Velero / Exporter / Disk-Pressure Live Status
+
+2026-06-24 06:35 refresh:
+
+- 110 backup health remains fresh: 13 configured jobs, stale `0`, failed count `0`, config failed `0`。
+- 188 backup health remains fresh: 2 configured jobs, stale `0`, missing cron/script `0`。
+- 188 `node-exporter` textfile scrape is restored: Prometheus `up{job="node-exporter-188"}=1` and `awoooi_backup_health_monitor_up{host="188"}=1`。
+- 188 PostgreSQL exporter and Redis exporter are restored: local metrics `pg_up=1` / `redis_up=1`; Prometheus sees `up{job="postgres-exporter"}=1`, `pg_up=1`, `up{job="redis-exporter"}=1`, `redis_up=1`。
+- 188 MinIO endpoint is healthy on `192.168.0.188:9000`; 120 Velero `BackupStorageLocation/default` is `Available`。
+- One-off Velero backup `reboot-recovery-202606240456` completed successfully; 110 backup-health textfile reports `awoooi_velero_latest_completed_backup_fresh=1`。
+- `VeleroBackupNotRun`、`BackupHealthMonitorMissing188`、`PostgreSQLDown`、`RedisDown` and 110 disk-pressure alerts are resolved.
+- 110 `/` disk use is reduced from `92%` to `73%` after Docker image/build-cache cleanup only. Docker volume prune remains forbidden without explicit owner approval.
+- Credential escrow readback remains blocked: `ESCROW_MISSING_COUNT=5`。
+- Full service green is still blocked by MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`。
+
+| Gate | Status | Evidence |
+|------|--------|----------|
+| 110 backup freshness | VERIFIED | 13/13 fresh, failed count 0, config failed 0. |
+| 188 backup freshness | VERIFIED | 2/2 fresh, node-exporter scrape and textfile metrics restored. |
+| Velero / MinIO storage | VERIFIED | MinIO health OK, BSL `Available`, backup `reboot-recovery-202606240456` `Completed`, freshness metric `1`. |
+| PostgreSQL / Redis exporters | VERIFIED | `pg_up=1`, `redis_up=1`, Prometheus scrape `up=1` for both exporters. |
+| Alert chain | VERIFIED_WITH_EXPECTED_REDLIGHTS | Exporter/Velero/disk alerts resolved; escrow missing and MOMO freshness blocker remain visible. |
+| Credential escrow | BLOCKED | Five non-secret evidence markers still missing. |
+| DR closeout | NO-GO | Must not be declared complete until real owner-provided non-secret evidence IDs are validated and markers are written. |
+
+Operational helpers:
+
+```bash
+# 188 PostgreSQL / Redis exporters
+ssh ollama@192.168.0.188 'bash /home/ollama/bin/188-db-exporters-restore.sh'
+
+# 188 MinIO / 120 Velero BSL readback
+ssh wooo@192.168.0.110 '/home/wooo/scripts/188-minio-velero-restore.sh'
+
+# Maintenance-window one-off Velero backup + backup-health textfile refresh
+ssh wooo@192.168.0.110 'CREATE_VELERO_BACKUP=true REFRESH_BACKUP_HEALTH=true /home/wooo/scripts/188-minio-velero-restore.sh'
+```
+
+Current policy: fix the underlying backup and monitoring failures behind each red light first; do not silence `VeleroBackupNotRun` or exporter-down alerts. Healthy-heartbeat success messages are suppressed separately and must not be confused with real backup/data/escrow alerts.
 
 ---
 
diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
index 54cb5765..1a6d6c41 100644
--- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md
+++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
@@ -1,6 +1,6 @@
 # AWOOOI 全棧冷啟動與主機重啟 SOP
 
-> Version: v1.29
+> Version: v1.30
 > Last updated: 2026-06-24 Asia/Taipei
 > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
 
@@ -10,14 +10,15 @@
 
 本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天，必須先重跑 live check，再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。
 
-2026-06-24 02:44 live readback supersedes the earlier 02:08 green wording:
+2026-06-24 06:35 live readback supersedes the earlier 02:44 wording:
 
 ```text
 Repo-side reboot SOP / Plan B / automation contracts: COMPLETE, 100%.
 Live cold-start read-only check: PASS=86 WARN=0 BLOCKED=1, Result=BLOCKED.
-Service state: SERVICE_AVAILABLE_DATA_STALE_BLOCKED; 110/120/121/188 reachable, K3s mon/mon1 Ready, public routes/TLS green, 110/188 backup health fresh, 188 node-exporter textfile scrape restored.
+Service state: SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED; 110/120/121/188 reachable, K3s mon/mon1 Ready, public routes/TLS green, 110/188 backup health fresh, 188 node-exporter / PostgreSQL exporter / Redis exporter restored, 188 MinIO endpoint and Velero BackupStorageLocation restored, 110 disk pressure cleared.
 MOMO state: current-month daily_sales_snapshot and realtime_sales_monthly match, but both stop at 2026-06-17. MOMO_DAILY_FRESHNESS is 6 days, which is a hard blocker because business data is not current.
 Google Drive state: momo scheduler token ownership is fixed for Docker userns, Drive listing works, but folder 當日業績匯入 currently has no matching 即時業績_當日 Excel source file. Archive latest matching file is 2026-06-18 and was already imported.
+Backup / monitoring state: 188 MinIO is healthy, Velero BackupStorageLocation default is Available, one-off backup reboot-recovery-202606240456 completed, backup-health textfile reports Velero freshness green, and VeleroBackupNotRun / PostgreSQLDown / RedisDown / 110 disk-pressure alerts are resolved.
 Allowed declaration: core hosts, routes, K3s, backup/exporter surfaces are recovered; MOMO data pipeline is blocked waiting for a newer source file or owner-provided source evidence.
 Forbidden declaration: full-stack green, MOMO data current, DR complete, or runtime/security acceptance. Credential escrow evidence is still missing and must not be forged.
 ```
@@ -81,13 +82,13 @@ Allowed declaration: monitoring, alert rules, AI event packet, PlayBook / KM con
 Forbidden declaration: AI runtime remediation is enabled. Process termination, Docker/systemd restart, Nginx reload, firewall/K8s action, Telegram live send, Gateway queue write, Bot API call, production write, and secret read remain forbidden without owner approval, maintenance window, evidence ref, dry-run, and post-check.
 ```
 
-| 項目 | 2026-06-24 02:44 Asia/Taipei live result | 判定 |
+| 項目 | 2026-06-24 06:35 Asia/Taipei live result | 判定 |
 |------|-------------------------------------------|------|
-| Overall recovery readiness | `97%` | `SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED` |
+| Overall recovery readiness | `98%` | `SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED` |
 | P0 host / K3s recovery | `100%` | `DONE` |
-| P1 backup / alert / escrow | `93%` | `BLOCKED_DR_ESCROW` |
+| P1 backup / alert / escrow | `96%` | `BLOCKED_DR_ESCROW` |
 | P2 service / data truth | `96%` | `BLOCKED_MOMO_DATA_FRESHNESS` |
-| P3 docs / automation contracts | `100%` | `DONE_WITH_MOMO_FRESHNESS_GATE` |
+| P3 docs / automation contracts | `100%` | `DONE_WITH_VELERO_AND_EXPORTER_RECOVERY_GATE` |
 | 110 host runtime | `fwupd-refresh.timer` intentionally disabled/inactive after non-runtime firmware metadata refresh failed units were classified; `systemctl --failed` returns `0 loaded units listed`; rollback is `sudo systemctl enable --now fwupd-refresh.timer` | `GREEN_WITH_FWUPD_TIMER_DISABLED` |
 | 110 host runaway process guard | 14:31-14:32 live scrape confirms `monitor_up=1`, orphan browser groups `0`, active Gitea Actions containers `2`, `load5_per_core≈0.79-0.81`, `swap_used_ratio≈1.0`, and `remediation_authorized=0`; exporter/helper also remain in Ansible textfile exporter source-of-truth. | `LIVE_SCRAPED_RUNTIME_GATE_0` |
 | 120 reachability | ping OK, SSH OK, boot around `2026-06-14 02:23`, K3s active, node `mon Ready` | `GREEN` |
@@ -95,10 +96,10 @@ Forbidden declaration: AI runtime remediation is enabled. Process termination, D
 | 188 host runtime | production services green; host degraded only by `certbot.service` and `snap.certbot.renew.service` | `GREEN_WITH_CERTBOT_DEBT` |
 | K3s node state | `mon Ready control-plane`, `mon1 Ready control-plane`; bad pods `0`; `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0` | `GREEN_WITH_RETAINED_EVIDENCE` |
 | 110 -> 120 / 188 SSH trust | 00:33 cold-start exposed stale `known_hosts`; backup `/home/wooo/.ssh/known_hosts.before-120-188-refresh.20260613-003416`; final repair backup `/home/wooo/.ssh/known_hosts.before-120-188-final-refresh.20260613-011949`; CD fix `80e6ec1a` moves deploy trust to `/home/wooo/.ssh/deploy_known_hosts`; 01:28 global `known_hosts` still contains 120 / 188 and was not clobbered by deploy marker `e4a349bc` | `GREEN_WITH_GUARDRAIL` |
-| Backup status | 13:43 status: 110 backup health `13/13 fresh failed=0`, 188 backup health `2/2 fresh failed=0`; escrow readback still shows `ESCROW_MISSING_COUNT=5` | `GREEN_WITH_DR_ESCROW_WARNING` |
+| Backup status | 06:35 status: 110 backup health fresh, 188 backup health fresh, MinIO / Velero storage restored, latest Velero backup fresh, exporter scrape green; escrow readback still shows `ESCROW_MISSING_COUNT=5` | `GREEN_WITH_DR_ESCROW_WARNING` |
 | Offsite sync / verify | 01:28 textfile: `awoooi_backup_offsite_remote_verify_ok=1`, `full_verify_fresh=1`, all 13 repos have `snapshot_count=1` and `snapshot_latest_only=1`; latest scheduled verifier log is 2026-06-12 07:20 | `GREEN` |
 | Backup / cold-start alerts | 01:27 live visibility check confirms Prometheus and Alertmanager expose the 5 required credential escrow gap alerts; Prometheus rules API has all five required alert names healthy; label contract check loads 24 baseline backup alert rules | `GREEN_WITH_EXPECTED_REDLIGHTS` |
-| Cold-start scorecard | 13:43 stale Job classification 後只讀 scorecard：`PASS=84 WARN=0 BLOCKED=0`。Public routes / TLS、momo DB parity、backup exporters、120/121 K3s 與 AWOOOI API/Web 皆通過。 | `GREEN_FOR_SERVICE` |
+| Cold-start scorecard | 06:35 read-only scorecard：`PASS=86 WARN=0 BLOCKED=1`。Public routes / TLS、momo DB parity、backup exporters、120/121 K3s、MinIO / Velero、AWOOOI API/Web 皆通過；only blocker is MOMO data freshness. | `BLOCKED_MOMO_DATA_FRESHNESS` |
 | momo DB parity | `4571|4571|2026-06-01|2026-06-07|2026-06-01|2026-06-07` | `GREEN` |
 | momo scheduler | container healthy; scorecard reads `SCHEDULER_RECENT_ACTIVITY 1009`; detector widened and deployed to 110 | `GREEN` |
 | ArgoCD app health | This 13:43 cold-start pass does not reassert ArgoCD app health directly. Service release uses K8s node / route / workload / CronJob active-failure evidence. Any ArgoCD release claim still requires a fresh `awoooi-prod` app readback. | `NOT_REASSERTED_BY_THIS_GATE` |
@@ -1747,6 +1748,61 @@ SQL
 
 若 Drive pending folder 無新來源檔，不可手動 truncate、不可以舊 archive 檔重複匯入來製造「最新」，也不可把 DB parity 當 data freshness。下一個解除 blocker 的證據必須是：新的 `即時業績_當日` source file 可見、import job 成功、`sync_success=true`、`daily_sales_snapshot` 與 `realtime_sales_monthly` 日期上下界一致，且 `MOMO_DAILY_FRESHNESS <= 2`。
 
+### 14.29 2026-06-24 188 MinIO / Velero、DB exporter 與 110 disk pressure recovery
+
+2026-06-24 的第四段變更是恢復真正的備份與監控鏈路，而不是消音告警。`VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk pressure 都是有效紅燈；修復順序必須是 source-of-truth → service → exporter → Prometheus → Alertmanager → cold-start scorecard。
+
+| 項目 | 2026-06-24 06:35 recovery baseline |
+|------|-------------------------------------|
+| SOP version | `v1.30` |
+| 188 DB exporter root cause | Docker user namespace 下 exporter compose 不能使用 `network_mode: host`；Redis live port 是 `6380` |
+| 188 DB exporter source-of-truth | `ops/monitoring/docker-compose.exporters.yaml` 改為 bridge port mapping；PostgreSQL DSN 只從 host `.env.exporters` 注入，repo 不放密碼 |
+| 188 DB exporter helper | `scripts/ops/188-db-exporters-restore.sh`；live path `/home/ollama/bin/188-db-exporters-restore.sh` |
+| 188 DB exporter readback | local metrics `pg_up=1`、`redis_up=1`；Prometheus `up{job="postgres-exporter"}=1`、`pg_up=1`、`up{job="redis-exporter"}=1`、`redis_up=1` |
+| 110 disk pressure | `/` from `92%` used to `73%` used after Docker image / build cache cleanup only; no Docker volume prune |
+| MinIO / Velero root cause | 188 MinIO endpoint `192.168.0.188:9000` was down; Velero BSL S3 list failed; MinIO data path had userns write denial |
+| MinIO restore | live `/home/ollama/minio/docker-compose.override.yml` adds `userns_mode: host` for the `minio` service; MinIO health endpoint is OK |
+| Velero restore | 120 `BackupStorageLocation/default` phase is `Available`; one-off backup `reboot-recovery-202606240456` is `Completed` |
+| Backup-health textfile | 110 exporter refresh reports `awoooi_velero_monitor_up=1`, `awoooi_velero_latest_completed_backup_fresh=1`, restore-test cron present, failed jobs `0` |
+| Alert readback | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts resolved |
+| Live cold-start readback | `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`; only blocker remains `MOMO_DAILY_FRESHNESS 6|2026-06-17` |
+| Declaration limit | 可宣稱 backup / exporter / MinIO / Velero chain recovered；不可宣稱 full-stack green、MOMO data current、DR complete 或 runtime/security acceptance |
+
+188 PostgreSQL / Redis exporter post-reboot recovery:
+
+```bash
+ssh ollama@192.168.0.188 'bash /home/ollama/bin/188-db-exporters-restore.sh'
+curl -fsS http://192.168.0.188:9187/metrics | grep '^pg_up '
+curl -fsS http://192.168.0.188:9121/metrics | grep '^redis_up '
+```
+
+188 MinIO / 120 Velero recovery from 110:
+
+```bash
+ssh wooo@192.168.0.110 '/home/wooo/scripts/188-minio-velero-restore.sh'
+```
+
+如果需要在維護窗口中建立一次性 reboot-recovery 備份並刷新 110 backup-health textfile，必須明確設定：
+
+```bash
+ssh wooo@192.168.0.110 'CREATE_VELERO_BACKUP=true REFRESH_BACKUP_HEALTH=true /home/wooo/scripts/188-minio-velero-restore.sh'
+```
+
+本地 repo helper 可同步 live script：
+
+```bash
+scp -q scripts/ops/188-db-exporters-restore.sh ollama@192.168.0.188:/home/ollama/bin/188-db-exporters-restore.sh
+scp -q scripts/ops/188-minio-velero-restore.sh wooo@192.168.0.110:/home/wooo/scripts/188-minio-velero-restore.sh
+```
+
+110 disk pressure cleanup rule:
+
+```text
+Allowed in incident recovery: Docker image / build cache cleanup after checking `docker system df`.
+Forbidden without explicit owner approval: `docker volume prune`, deleting database / registry / MinIO / ClickHouse / Sentry / PostgreSQL volumes, or removing unknown bind-mounted state.
+Done gate: filesystem use below 85%, no active disk-pressure alerts, and no service regression in cold-start scorecard.
+```
+
 ### 14.22 重啟後時間軸驗證
 
 每次重啟後照時間軸推進，不要等到最後才一次判定。
diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
index 6326e5ce..13f7313d 100644
--- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
+++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
@@ -11,13 +11,13 @@
 
 | Area | Status | Completion | Evidence |
 |------|--------|------------|----------|
-| Overall recovery readiness | SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED | 97% | 2026-06-24 02:44 live cold-start read-only gate returned `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`。110 / 120 / 121 / 188 ping and SSH port are OK, K3s `mon` / `mon1` are Ready, `NODE_FS_ERROR_EVENTS=0`, `NODE_READONLY_FILESYSTEM_TRUE=0`, `NODE_DISK_PRESSURE_TRUE=0`, public routes/TLS are green, 110 / 188 runtime and backup checks are green。188 `node-exporter` textfile scrape is restored. Remaining service blocker is MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`; Drive listing works after token owner repair, but `當日業績匯入` has no newer `即時業績_當日` Excel source file. DR remains blocked because credential escrow evidence markers are still missing and must not be forged. |
+| Overall recovery readiness | SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED | 98% | 2026-06-24 06:35 live cold-start read-only gate returned `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`。110 / 120 / 121 / 188 ping and SSH port are OK, K3s `mon` / `mon1` are Ready, `NODE_FS_ERROR_EVENTS=0`, `NODE_READONLY_FILESYSTEM_TRUE=0`, `NODE_DISK_PRESSURE_TRUE=0`, public routes/TLS are green, 110 / 188 runtime and backup checks are green。188 `node-exporter`、PostgreSQL exporter、Redis exporter、MinIO / Velero BSL are restored; 110 disk pressure cleared to 73%。Remaining service blocker is MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`; Drive listing works after token owner repair, but `當日業績匯入` has no newer `即時業績_當日` Excel source file. DR remains blocked because credential escrow evidence markers are still missing and must not be forged. |
 | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. |
-| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 93% | 2026-06-24 02:20 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-23 20:53:42`。02:24 restored 188 `node-exporter` textfile scrape; Prometheus now has `up{job="node-exporter-188"}=1` and `awoooi_backup_health_monitor_up{host="188"}=1`; `BackupHealthMonitorMissing188` resolved. DR remains blocked on real non-secret credential escrow evidence IDs. |
+| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 96% | 2026-06-24 06:35 backup / alert readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`。188 `node-exporter` textfile scrape、PostgreSQL exporter、Redis exporter、MinIO endpoint、Velero BSL and latest completed backup freshness are restored; `BackupHealthMonitorMissing188`、`PostgreSQLDown`、`RedisDown`、`VeleroBackupNotRun` and 110 disk-pressure alerts resolved. DR remains blocked on real non-secret credential escrow evidence IDs. |
 | P2 service / data truth | BLOCKED_MOMO_DATA_FRESHNESS | 96% | Public route/TLS, API/Web route, momo health, current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness/storage conditions, VIP, and 110 / 188 runtime health are green. However MOMO latest business date is `2026-06-17`; stale age is `6` days. Drive pending folder has `0` matching files and archive latest is the already-imported 2026-06-18 file, so there is no safe newer source to import. |
-| P3 docs / automation contracts | DONE_WITH_MOMO_FRESHNESS_GATE | 100% | Workplan, SOP v1.29, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, 188 node-exporter restore helper, MOMO Google Drive token userns readback, MOMO daily freshness blocker, and 2026-06-24 live readback are updated. Production image `a84a5a0b` is live with API `2/2`, Web `2/2`, Worker `1/1`; CD `#3289` is a known false-negative caused by worker startup / rollout timeout after deploy marker `4a7b5329`. |
+| P3 docs / automation contracts | DONE_WITH_VELERO_AND_EXPORTER_RECOVERY_GATE | 100% | Workplan, SOP v1.30, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, 188 node-exporter restore helper, 188 DB/Redis exporter restore helper, 188 MinIO/Velero restore helper, 110 Docker disk pressure cleanup boundary, MOMO Google Drive token userns readback, MOMO daily freshness blocker, and 2026-06-24 06:35 live readback are updated. Production image `a84a5a0b` is live with API `2/2`, Web `2/2`, Worker `1/1`; CD `#3289` is a known false-negative caused by worker startup / rollout timeout after deploy marker `4a7b5329`. |
 
-Full cold-start service readiness may not be declared green for the latest verified evidence set. As of 2026-06-24 02:44, routes/hosts/K3s/backups are available, but the scorecard is `PASS=86 WARN=0 BLOCKED=1` because MOMO business data freshness is stale beyond 3 days. Do not declare DR scorecard complete while credential escrow evidence remains blocked.
+Full cold-start service readiness may not be declared green for the latest verified evidence set. As of 2026-06-24 06:35, routes/hosts/K3s/backups/exporters/Velero are available, but the scorecard is `PASS=86 WARN=0 BLOCKED=1` because MOMO business data freshness is stale beyond 3 days. Do not declare DR scorecard complete while credential escrow evidence remains blocked.
 
 2026-06-13 01:26 refresh: full cold-start is again green for the current evidence set. AWOOOI API/Web workload balancing survived the next normal CD deploy: Gitea main `e4a349bc`, ArgoCD revision `e4a349bc`, images from `414413a5`, API/Web split across `mon` / `mon1`, and global `known_hosts` retained 120 / 188 after CD fix `80e6ec1a`. Do not declare DR complete while credential escrow is missing. `km-vectorize` remediation is `90%`: schedule/label fix is live, and the remaining gate is the next official 03:00 CronJob success readback.
 
@@ -147,6 +147,7 @@ Next: <single next action>
 | P1-012 | DONE | 100 | Audit credential escrow marker write safety | 2026-06-12 15:02 `mark-credential-escrow-verified.sh --status` reports all five allowed items missing; `offsite-escrow-evidence-report.sh --no-color` reports rclone/offsite configured and `ESCROW_MISSING_COUNT=5`; repo search found only runbooks/placeholders/rules, not real evidence IDs. | Write markers only after a real non-secret evidence ID exists for each item; never write placeholder or secret. | The marker blocker is narrowed to missing external evidence IDs, not missing script/config/offsite readiness. |
 | P1-014 | DONE | 100 | Publish credential escrow owner request package | 2026-06-13 13:10 live report confirms `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`, `PASS=8 WARN=5 BLOCKED=0`. New owner request package defines allowed evidence-id types, forbidden secret values, safe dry-run flow, write flow, and closeout gates. | Dispatch to the credential owners without collecting secret values; keep marker write gated until owner gives real non-secret evidence IDs. | `docs/security/CREDENTIAL-ESCROW-EVIDENCE-OWNER-REQUEST.md` and snapshot exist and validate. |
 | P1-013 | DONE_FOR_SERVICE_READINESS | 100 | Remediate `km-vectorize` CronJob health debt | The retained `km-vectorize-29689620` failed Job is now classified as stale evidence, not an active blocker, because later official `km-vectorize` Jobs completed successfully. 2026-06-18 13:43 cold-start reads `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0`, `BAD_PODS=0`, and returns `PASS=84 WARN=0 BLOCKED=0`. | Keep retained failed Job as evidence unless an explicit maintenance window authorizes cleanup. Reassert ArgoCD app health only with a fresh ArgoCD app readback, not from the cold-start scorecard alone. | Service readiness no longer warns on stale failed Job evidence; active failed Job detection remains guarded. |
+| P1-015 | DONE | 100 | Restore 188 MinIO / Velero backup freshness and DB exporters | 2026-06-24 06:35 resolved real backup / exporter red lights: 188 PostgreSQL exporter and Redis exporter now expose `pg_up=1` / `redis_up=1`; 188 MinIO health is live; 120 Velero BSL is `Available`; one-off backup `reboot-recovery-202606240456` completed; 110 backup-health textfile reports latest Velero backup fresh. 110 disk pressure was reduced from 92% to 73% by Docker image/build-cache cleanup only. | Reconcile MinIO `userns_mode: host` override into formal source-of-truth or data ownership fix; keep Docker volume prune forbidden without explicit owner approval. | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts are resolved, and SOP includes restore helpers. |
 
 ---
 
@@ -175,7 +176,7 @@ Next: <single next action>
 | P3-005 | DONE | 100 | Update cold-start SOP | SOP now includes start, shutdown, reboot, record, comparison, and 120 blocker handling. | Increment SOP version after each process change. | SOP has controlled power-operation sections and ledger template. |
 | P3-006 | DONE | 100 | Update backup status | Backup status now reflects current cron, rclone latest-only, failure-only alert posture, and escrow blocker. | Refresh after 120 backup rerun. | Backup status no longer claims noisy success Telegram notifications. |
 | P3-007 | DONE | 100 | Harden Gitea backup stale dump handling | 2026-06-05 manual Gitea backup failed because the container retained `/tmp/gitea-dump.zip` from the 02:00 failure. `scripts/backup/backup-gitea.sh` now renames stale container dump files to timestamped evidence before running a new dump, and the live 110 script is updated. | Watch the next 02:00 Gitea backup. | `bash -n` passes locally and on 110; manual Gitea backup completed after stale evidence rename. |
-| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.29 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, K3s filesystem event blocker, stale-vs-active K8s failed Job classification, post-reboot / post-CD recovery anchors, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note, 110 runaway browser / CI load 分流 PlayBook, healthy-heartbeat suppression, 188 node-exporter restore, MOMO Google Drive token userns readback, and MOMO data freshness hard blocker. | Use v1.29 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, failed/stale/active Job counters, runaway-process metrics, CI load attribution, MOMO source availability, data freshness, and blockers against §1.4 plus §11.1 / §14.8 through §14.28. Before any real reboot, rerun same-day live cold-start / backup / offsite / alert / escrow / runaway-process checks. | SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; live cold-start now returns `PASS=86 WARN=0 BLOCKED=1` when MOMO data freshness is stale, preventing false green. |
+| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.30 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, K3s filesystem event blocker, stale-vs-active K8s failed Job classification, post-reboot / post-CD recovery anchors, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note, 110 runaway browser / CI load 分流 PlayBook, healthy-heartbeat suppression, 188 node-exporter restore, 188 DB/Redis exporter restore, 188 MinIO/Velero restore, 110 Docker disk cleanup boundary, MOMO Google Drive token userns readback, and MOMO data freshness hard blocker. | Use v1.30 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, failed/stale/active Job counters, runaway-process metrics, CI load attribution, MOMO source availability, data freshness, Velero freshness, exporter scrape, disk usage, and blockers against §1.4 plus §11.1 / §14.8 through §14.29. Before any real reboot, rerun same-day live cold-start / backup / offsite / alert / escrow / runaway-process checks. | SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; live cold-start now returns `PASS=86 WARN=0 BLOCKED=1` when MOMO data freshness is stale, preventing false green. |
 | P3-009 | DONE | 100 | Assess 120/121 AA/AS role and host load balancing | 2026-06-12 15:19 live check confirms 120 and 121 are both `Ready control-plane`, `k3s active`, `k3s-agent inactive`, with no taints; however most AWOOOI / ArgoCD / Velero workload remains on 121 after 120 fsck recovery. New assessment defines control-plane AA vs workload AA, migration candidates from 110/188, and stateful migration blockers. | After P0 backup/offsite/cold-start green, implement topology spread for AWOOOI API/Web before moving additional services. | `docs/runbooks/HOST-ROLE-LOAD-BALANCING-ASSESSMENT.md` exists; SOP v1.6 links AA/AS and load-balancing checks; migration implementation remains explicitly `0%`. |
 | P3-010 | DONE | 100 | Update workload balancing docs with 2026-06-13 live truth | Host role assessment, workplan, SOP, backup status, and LOGBOOK are refreshed with current cold-start, backup, 188 certbot degraded, ArgoCD `km-vectorize` degraded, Gitea main `acaae999`, ArgoCD sync, and final pod placement evidence. | Keep updating this file after the next reboot or deploy. | Docs separate service-green status from DR escrow, workload rollout, and non-service governance debt. |
 | P3-011 | DONE | 100 | Record `km-vectorize` remediation status | LOGBOOK, this workplan, and SOP now state the schedule/label fix, ArgoCD sync evidence, the invalid manual Job boundary, and the 90% waiting-for-next-schedule gate. | After next 03:00 run, update this row and the top verdict with `lastSuccessfulTime` / ArgoCD health evidence. | No document claims ArgoCD green before official CronJob success evidence exists. |
diff --git a/ops/monitoring/docker-compose.exporters.yaml b/ops/monitoring/docker-compose.exporters.yaml
index e12904b7..7659154f 100644
--- a/ops/monitoring/docker-compose.exporters.yaml
+++ b/ops/monitoring/docker-compose.exporters.yaml
@@ -27,15 +27,14 @@ services:
     environment:
       # 連線字串 (使用環境變數注入密碼)
       # 2026-04-08 Claude Sonnet 4.6: 修正用戶名/資料庫名 (awoooi user, awoooi_prod db)
-      DATA_SOURCE_NAME: "postgresql://awoooi:${POSTGRES_PASSWORD:-awoooi_prod_2026}@localhost:5432/awoooi_prod?sslmode=disable"
+      DATA_SOURCE_NAME: "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:?POSTGRES_EXPORTER_DATA_SOURCE_NAME is required}"
       # 自訂查詢配置
       PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
       # 日誌等級
       PG_EXPORTER_LOG_LEVEL: "info"
     volumes:
       - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro
-    # 直接使用 host network 連接本地 PostgreSQL
-    network_mode: host
+    # Docker userns cannot share the host network namespace. Connect through the host LAN address.
     labels:
       - "prometheus.scrape=true"
       - "prometheus.port=9187"
@@ -55,13 +54,12 @@ services:
       - "9121:9121"
     environment:
       # Redis 連線 (192.168.0.188:6380 是 AWOOOI Redis)
-      REDIS_ADDR: "redis://localhost:6380"
+      REDIS_ADDR: "${REDIS_EXPORTER_ADDR:-192.168.0.188:6380}"
       REDIS_PASSWORD: "${REDIS_PASSWORD:-}"
       # 啟用額外指標
       REDIS_EXPORTER_CHECK_KEYS: "awoooi:*"
       REDIS_EXPORTER_INCL_SYSTEM_METRICS: "true"
-    # 直接使用 host network 連接本地 Redis
-    network_mode: host
+    # Docker userns cannot share the host network namespace. Connect through the host LAN address.
     labels:
       - "prometheus.scrape=true"
       - "prometheus.port=9121"
diff --git a/scripts/ops/188-db-exporters-restore.sh b/scripts/ops/188-db-exporters-restore.sh
new file mode 100755
index 00000000..536c37da
--- /dev/null
+++ b/scripts/ops/188-db-exporters-restore.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# Restore PostgreSQL / Redis exporters on 192.168.0.188 without host networking.
+#
+# Required on the host:
+#   /home/ollama/monitoring/.env.exporters
+#
+# Required variables:
+#   POSTGRES_EXPORTER_DATA_SOURCE_NAME=postgresql://...@192.168.0.188:5432/...?...sslmode=disable
+#
+# Optional variables:
+#   REDIS_EXPORTER_ADDR=192.168.0.188:6380
+#   REDIS_PASSWORD=
+#   EXPORTER_READY_ATTEMPTS=15       metrics probes per exporter before giving up
+#   EXPORTER_READY_DELAY_SECONDS=2   pause between probes
+#
+# Output: POSTGRES_EXPORTER_UP / REDIS_EXPORTER_UP followed by the scraped gauge
+# value, or "missing". Exits 0 only when both gauges read 1; exits 2 when the
+# env file, the PostgreSQL DSN, or the queries file is absent.
+
+set -euo pipefail
+
+ENV_FILE="${EXPORTER_ENV_FILE:-/home/ollama/monitoring/.env.exporters}"
+QUERIES_FILE="${POSTGRES_EXPORTER_QUERIES_FILE:-/home/ollama/monitoring/postgres-exporter-queries.yaml}"
+POSTGRES_IMAGE="${POSTGRES_EXPORTER_IMAGE:-prometheuscommunity/postgres-exporter:v0.15.0}"
+REDIS_IMAGE="${REDIS_EXPORTER_IMAGE:-oliver006/redis_exporter:v1.58.0}"
+
+# Print the value of gauge $2 served at metrics URL $1; print nothing (and note
+# it on stderr) when the gauge never shows up within the probe budget.
+read_gauge() {
+  local url=$1 gauge=$2 body value attempt
+  for ((attempt = 1; attempt <= EXPORTER_READY_ATTEMPTS; attempt++)); do
+    # A freshly started container may not be listening yet, so a refused
+    # connection is retried instead of aborting the script under set -e.
+    # The body is captured before parsing: piping curl into an early-exiting
+    # awk can fail curl on a closed pipe and trip pipefail.
+    if body="$(curl -fsS --max-time 5 "$url" 2>/dev/null)"; then
+      value="$(awk -v gauge="$gauge" '$1 == gauge {print $2; exit}' <<<"$body")"
+      if [ -n "$value" ]; then
+        printf '%s\n' "$value"
+        return 0
+      fi
+    fi
+    sleep "$EXPORTER_READY_DELAY_SECONDS"
+  done
+  echo "EXPORTER_METRIC_UNAVAILABLE $gauge $url" >&2
+}
+
+if [ ! -f "$ENV_FILE" ]; then
+  echo "EXPORTER_ENV_FILE_MISSING $ENV_FILE" >&2
+  exit 2
+fi
+
+set -a
+# shellcheck disable=SC1090
+. "$ENV_FILE"
+set +a
+
+if [ -z "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:-}" ]; then
+  echo "POSTGRES_EXPORTER_DATA_SOURCE_NAME_MISSING" >&2
+  exit 2
+fi
+
+if [ ! -f "$QUERIES_FILE" ]; then
+  echo "POSTGRES_EXPORTER_QUERIES_FILE_MISSING $QUERIES_FILE" >&2
+  exit 2
+fi
+
+REDIS_EXPORTER_ADDR="${REDIS_EXPORTER_ADDR:-192.168.0.188:6380}"
+REDIS_PASSWORD="${REDIS_PASSWORD:-}"
+EXPORTER_READY_ATTEMPTS="${EXPORTER_READY_ATTEMPTS:-15}"
+EXPORTER_READY_DELAY_SECONDS="${EXPORTER_READY_DELAY_SECONDS:-2}"
+
+docker rm -f postgres-exporter redis-exporter >/dev/null 2>&1 || true
+
+# Secrets reach docker through the client's environment (-e NAME without a
+# value copies NAME from it), keeping them out of the process argument list.
+DATA_SOURCE_NAME="$POSTGRES_EXPORTER_DATA_SOURCE_NAME" docker run -d \
+  --name postgres-exporter \
+  --restart unless-stopped \
+  -p 9187:9187 \
+  -e DATA_SOURCE_NAME \
+  -e PG_EXPORTER_EXTEND_QUERY_PATH=/etc/postgres_exporter/queries.yaml \
+  -e PG_EXPORTER_LOG_LEVEL=info \
+  -v "$QUERIES_FILE:/etc/postgres_exporter/queries.yaml:ro" \
+  "$POSTGRES_IMAGE" >/dev/null
+
+redis_args=(
+  docker run -d
+  --name redis-exporter
+  --restart unless-stopped
+  -p 9121:9121
+  -e "REDIS_ADDR=$REDIS_EXPORTER_ADDR"
+  -e "REDIS_EXPORTER_CHECK_KEYS=awoooi:*"
+  -e REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
+)
+if [ -n "$REDIS_PASSWORD" ]; then
+  export REDIS_PASSWORD
+  redis_args+=(-e REDIS_PASSWORD)
+fi
+redis_args+=("$REDIS_IMAGE")
+"${redis_args[@]}" >/dev/null
+
+pg_up="$(read_gauge http://127.0.0.1:9187/metrics pg_up)"
+redis_up="$(read_gauge http://127.0.0.1:9121/metrics redis_up)"
+
+echo "POSTGRES_EXPORTER_UP ${pg_up:-missing}"
+echo "REDIS_EXPORTER_UP ${redis_up:-missing}"
+
+test "${pg_up:-0}" = "1"
+test "${redis_up:-0}" = "1"
diff --git a/scripts/ops/188-minio-velero-restore.sh b/scripts/ops/188-minio-velero-restore.sh
new file mode 100755
index 00000000..28889661
--- /dev/null
+++ b/scripts/ops/188-minio-velero-restore.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Restore the 188 MinIO endpoint used by Velero, then optionally create a
+# one-off Velero backup to re-establish backup freshness evidence.
+#
+# Steps:
+#   1. Write a compose override on MINIO_HOST that sets userns_mode: host for
+#      the minio service, then run docker compose up -d with both files.
+#   2. Poll the MinIO liveness endpoint on MINIO_HOST (30 tries, 2 s apart).
+#   3. Poll the Velero "default" BackupStorageLocation on K3S_HOST until its
+#      phase is Available (BSL_READY_ATTEMPTS tries, BSL_READY_DELAY_SECONDS
+#      apart; defaults 18 and 5).
+#   4. CREATE_VELERO_BACKUP=true: apply a Backup of VELERO_TARGET_NAMESPACE and
+#      poll it (61 polls, 5 s apart) until it is Completed.
+#   5. REFRESH_BACKUP_HEALTH=true: rerun the backup-health textfile exporter on
+#      BACKUP_HEALTH_HOST.
+#
+# Every other setting is an environment override of the defaults below.
+# NOTE: step 1 overwrites MINIO_OVERRIDE_FILE unconditionally.
+
+set -euo pipefail
+
+MINIO_HOST="${MINIO_HOST:-ollama@192.168.0.188}"
+K3S_HOST="${K3S_HOST:-wooo@192.168.0.120}"
+BACKUP_HEALTH_HOST="${BACKUP_HEALTH_HOST:-wooo@192.168.0.110}"
+MINIO_COMPOSE_FILE="${MINIO_COMPOSE_FILE:-/home/ollama/minio/docker-compose.yml}"
+MINIO_OVERRIDE_FILE="${MINIO_OVERRIDE_FILE:-/home/ollama/minio/docker-compose.override.yml}"
+VELERO_NAMESPACE="${VELERO_NAMESPACE:-velero}"
+VELERO_TARGET_NAMESPACE="${VELERO_TARGET_NAMESPACE:-awoooi-prod}"
+CREATE_VELERO_BACKUP="${CREATE_VELERO_BACKUP:-false}"
+REFRESH_BACKUP_HEALTH="${REFRESH_BACKUP_HEALTH:-false}"
+BACKUP_NAME="${VELERO_BACKUP_NAME:-reboot-recovery-$(date -u +%Y%m%d%H%M)}"
+BSL_READY_ATTEMPTS="${BSL_READY_ATTEMPTS:-18}"
+BSL_READY_DELAY_SECONDS="${BSL_READY_DELAY_SECONDS:-5}"
+
+ssh "$MINIO_HOST" "test -f '$MINIO_COMPOSE_FILE'"
+
+ssh "$MINIO_HOST" "cat > '$MINIO_OVERRIDE_FILE' <<'EOF'
+services:
+  minio:
+    userns_mode: host
+EOF"
+
+ssh "$MINIO_HOST" "docker compose -f '$MINIO_COMPOSE_FILE' -f '$MINIO_OVERRIDE_FILE' up -d"
+
+ssh "$MINIO_HOST" "for i in \$(seq 1 30); do curl -fsS --max-time 3 http://127.0.0.1:9000/minio/health/live >/dev/null && exit 0; sleep 2; done; docker logs --tail=80 minio >&2; exit 1"
+echo "MINIO_188_HEALTHY endpoint=192.168.0.188:9000"
+
+# Velero revalidates the storage location on an interval, so right after MinIO
+# returns the phase can still be stale. Poll rather than fail on one reading.
+bsl_phase=""
+for ((attempt = 1; attempt <= BSL_READY_ATTEMPTS; attempt++)); do
+  bsl_phase="$(ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backupstoragelocations.velero.io default -o jsonpath='{.status.phase}'" || true)"
+  if [ "$bsl_phase" = "Available" ]; then
+    break
+  fi
+  sleep "$BSL_READY_DELAY_SECONDS"
+done
+if [ "$bsl_phase" != "Available" ]; then
+  echo "VELERO_BACKUP_STORAGE_LOCATION_UNAVAILABLE namespace=$VELERO_NAMESPACE phase=${bsl_phase:-unknown}" >&2
+  exit 1
+fi
+echo "VELERO_BACKUP_STORAGE_LOCATION_AVAILABLE namespace=$VELERO_NAMESPACE"
+
+if [ "$CREATE_VELERO_BACKUP" = "true" ]; then
+  ssh "$K3S_HOST" "printf '%s\n' \
+    'apiVersion: velero.io/v1' \
+    'kind: Backup' \
+    'metadata:' \
+    '  name: $BACKUP_NAME' \
+    '  namespace: $VELERO_NAMESPACE' \
+    '  labels:' \
+    '    awoooi.wooo.work/source: reboot-recovery' \
+    'spec:' \
+    '  includedNamespaces:' \
+    '  - $VELERO_TARGET_NAMESPACE' \
+    '  storageLocation: default' \
+    '  ttl: 720h0m0s' \
+    | sudo -n k3s kubectl apply -f -"
+
+  # 61 polls, 5 s apart. The verdict uses the phase seen by the last poll: a
+  # separate re-query after the loop could fail transiently and report a
+  # timeout for a backup that had already completed.
+  phase=""
+  for ((poll = 0; poll <= 60; poll++)); do
+    if ((poll > 0)); then
+      sleep 5
+    fi
+    phase="$(ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o jsonpath='{.status.phase}'" || true)"
+    case "$phase" in
+      Completed)
+        echo "VELERO_BACKUP_COMPLETED name=$BACKUP_NAME"
+        break
+        ;;
+      Failed|PartiallyFailed)
+        ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o yaml" >&2 || true
+        echo "VELERO_BACKUP_FAILED name=$BACKUP_NAME phase=$phase" >&2
+        exit 1
+        ;;
+    esac
+  done
+
+  if [ "$phase" != "Completed" ]; then
+    echo "VELERO_BACKUP_TIMEOUT name=$BACKUP_NAME phase=${phase:-unknown}" >&2
+    exit 1
+  fi
+fi
+
+if [ "$REFRESH_BACKUP_HEALTH" = "true" ]; then
+  ssh "$BACKUP_HEALTH_HOST" "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin AIOPS_HOST_LABEL=110 NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles /home/wooo/scripts/backup-health-textfile-exporter.py"
+  echo "BACKUP_HEALTH_TEXTFILE_REFRESHED host=110"
+fi