From 95f442adab4f061cb99d039d6368d56ceb6469ad Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 24 Jun 2026 06:37:44 +0800 Subject: [PATCH] fix(ops): harden 188 backup exporter recovery [skip ci] --- docs/LOGBOOK.md | 43 ++++++++++ docs/runbooks/BACKUP-STATUS.md | 43 ++++++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 74 +++++++++++++++--- ...oot-cold-start-backup-recovery-workplan.md | 11 +-- ops/monitoring/docker-compose.exporters.yaml | 10 +-- scripts/ops/188-db-exporters-restore.sh | 78 +++++++++++++++++++ scripts/ops/188-minio-velero-restore.sh | 76 ++++++++++++++++++ 7 files changed, 315 insertions(+), 20 deletions(-) create mode 100755 scripts/ops/188-db-exporters-restore.sh create mode 100755 scripts/ops/188-minio-velero-restore.sh diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 9d40be9f..8fd8d4dc 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,46 @@ +## 2026-06-24|188 MinIO / Velero、DB exporter 與 110 磁碟壓力恢復 + +**背景**:02:44 cold-start 已證明主機、K3s、public routes 與 MOMO DB parity 多數恢復,但後續 Alertmanager 仍暴露多個真實紅燈:188 PostgreSQL / Redis exporters down、110 disk pressure、Velero backup freshness 過期。這些都不能靠消音處理,必須恢復監控與備份鏈路。 + +**188 exporter 修復**: +- live `/home/ollama/monitoring/docker-compose.exporters.yaml` 原先使用 `network_mode: host`,在 Docker user namespace 下無法啟動;Redis live port 也應走 `6380`。 +- 已改為 bridge port mapping,PostgreSQL DSN 從 `/home/ollama/monitoring/.env.exporters` 注入,Redis 預設 `192.168.0.188:6380`,不在 repo 放任何密碼。 +- live helper `/home/ollama/bin/188-db-exporters-restore.sh` 已部署並驗證:`pg_up=1`、`redis_up=1`;Prometheus `up{job="postgres-exporter"}=1`、`pg_up=1`、`up{job="redis-exporter"}=1`、`redis_up=1`。 +- `PostgreSQLDown` / `RedisDown` 告警已解除。 + +**110 磁碟壓力修復**: +- 110 `/` 從 `92%` 使用率降到 `73%`。只做 Docker image / build cache 類安全清理;沒有執行 volume prune,避免刪除 stateful data。 +- `HostDiskUsageHigh`、`HostOutOfDiskSpace`、`HostDiskUsageCritical` 已解除。 + +**188 MinIO / Velero 備份鏈路修復**: +- `VeleroBackupNotRun` 根因是 188 `192.168.0.188:9000` MinIO endpoint 
connection refused;Velero `BackupStorageLocation default` 因 S3 list 失敗不可用。 +- 188 MinIO compose source 位於 `/home/ollama/minio/docker-compose.yml`;container 起不來的根因是 user namespace 對 root-owned data 目錄無寫入權限。 +- live 補 `docker-compose.override.yml` 使用 `userns_mode: host` 讓 MinIO 先恢復服務;這是恢復用 source-of-truth debt,後續應正式整理 data ownership 或 compose baseline。 +- MinIO health `/minio/health/live` 已通過,120 上 Velero `BackupStorageLocation/default` phase 回 `Available`。 +- 建立一次性備份 `reboot-recovery-202606240456`,phase `Completed`,start `2026-06-23T21:22:32Z`,complete `2026-06-23T21:22:51Z`。 +- 110 `backup-health-textfile-exporter.py` 已刷新:`awoooi_velero_monitor_up=1`、`awoooi_velero_latest_completed_backup_fresh=1`、restore-test cron present、last_success_fresh `1`、failed_jobs `0`。 +- Prometheus `ALERTS{alertname="VeleroBackupNotRun",alertstate="firing"}` 回空集合,告警已解除。 + +**SOP / automation 補強**: +- 新增 `scripts/ops/188-db-exporters-restore.sh`,作為 188 PostgreSQL / Redis exporter 重啟後恢復 helper。 +- 新增 `scripts/ops/188-minio-velero-restore.sh`,作為 188 MinIO / 120 Velero storage / one-off backup / 110 backup-health refresh 的分段 helper;預設只恢復 MinIO 與檢查 BSL,必須明確設定 `CREATE_VELERO_BACKUP=true` 才會建立一次性備份。 +- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 升級到 v1.30,加入 06:35 readback、MinIO/Velero 恢復流程、DB exporter 恢復流程與 110 Docker disk cleanup 邊界。 +- `docs/runbooks/BACKUP-STATUS.md` 與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md` 已同步最新備份鏈路狀態。 + +**06:35 Live 判定**: +- `full-stack-cold-start-check.sh --monitor-read-only --no-color --watch --interval 1 --max-attempts 1`:`PASS=86 WARN=0 BLOCKED=1`。 +- 110 / 120 / 121 / 188 reachable;K3s `mon` / `mon1` Ready;public routes / TLS OK;110 / 188 backup health fresh;188 node-exporter、PostgreSQL exporter、Redis exporter、MinIO / Velero BSL 均恢復。 +- 110 `/` disk usage 維持 `73%`;188 exporter local readback `pg_up=1`、`redis_up=1`、MinIO health `ok`;120 Velero `BackupStorageLocation/default` phase `Available`。 +- 目前不能宣稱 full-stack green,因唯一 service 
blocker 仍是 `MOMO_DAILY_FRESHNESS 6|2026-06-17`。 +- DR 仍不能宣稱完成:credential escrow evidence missing 維持 `5`,不得偽造 marker,也不得把任何 secret value 放進 repo 或聊天。 + +**剩餘告警分類**: +- `ColdStartRecoveryBlocked`:正確反映 MOMO business data source stale,不可消音。 +- `BackupCredentialEscrowEvidenceMissing` x5:正確反映 DR escrow 缺口,不可偽造或消音。 +- `DockerContainerMemoryLimitPressure` / `DockerContainerMissingResourceLimit`:屬於下一階段 resource tuning / owner evidence 工作,不是本輪重啟恢復 blocker。 +- `ColdStartLastGreenTooOld`:因 MOMO data freshness blocker 讓 full green 不能更新,屬於正確 warning。 +- 正常 healthy heartbeat 已由 `a84a5a0b` 降噪,不應再每 30 分鐘洗 Telegram;warning / failure / recovery 類事件仍必須告警。 + ## 2026-06-24|MOMO Google Drive 權限修復與資料新鮮度 Gate 補強 **背景**:`mo.wooo.work` / `/health` 已恢復 200,MOMO containers 也 healthy,但頁面資料仍停在舊版本。只看 route 200 / container healthy 會誤判,因此重新查 DB、import jobs、Google Drive 來源與 scheduler logs。 diff --git a/docs/runbooks/BACKUP-STATUS.md b/docs/runbooks/BACKUP-STATUS.md index a8a2f13f..c42612c1 100644 --- a/docs/runbooks/BACKUP-STATUS.md +++ b/docs/runbooks/BACKUP-STATUS.md @@ -9,6 +9,49 @@ > 2026-06-13 Codex post-CD refresh: backup/offsite/alert contracts remain green after deploy marker `e4a349bc`; global SSH trust guardrail held; DR still blocked only by credential escrow evidence. > 2026-06-13 Codex escrow refresh: 13:10 live report confirms offsite/rclone/script readiness is green and only five non-secret credential escrow evidence markers remain missing. > 2026-06-18 Codex cold-start refresh: full-stack service readiness is green after stale failed Job classification; backup core remains green; DR still blocked only by five credential escrow evidence markers. +> 2026-06-24 Codex Velero/exporter refresh: 188 MinIO / Velero backup freshness, 188 PostgreSQL / Redis exporters, 188 node-exporter, and 110 disk pressure are recovered; DR still blocked only by five credential escrow evidence markers and service full-green is blocked by MOMO data freshness. 
+ +--- + +## 2026-06-24 Velero / Exporter / Disk-Pressure Live Status + +2026-06-24 06:35 refresh: + +- 110 backup health remains fresh: 13 configured jobs, stale `0`, failed count `0`, config failed `0`。 +- 188 backup health remains fresh: 2 configured jobs, stale `0`, missing cron/script `0`。 +- 188 `node-exporter` textfile scrape is restored: Prometheus `up{job="node-exporter-188"}=1` and `awoooi_backup_health_monitor_up{host="188"}=1`。 +- 188 PostgreSQL exporter and Redis exporter are restored: local metrics `pg_up=1` / `redis_up=1`; Prometheus sees `up{job="postgres-exporter"}=1`, `pg_up=1`, `up{job="redis-exporter"}=1`, `redis_up=1`。 +- 188 MinIO endpoint is healthy on `192.168.0.188:9000`; 120 Velero `BackupStorageLocation/default` is `Available`。 +- One-off Velero backup `reboot-recovery-202606240456` completed successfully; 110 backup-health textfile reports `awoooi_velero_latest_completed_backup_fresh=1`。 +- `VeleroBackupNotRun`、`BackupHealthMonitorMissing188`、`PostgreSQLDown`、`RedisDown` and 110 disk-pressure alerts are resolved. +- 110 `/` disk use is reduced from `92%` to `73%` after Docker image/build-cache cleanup only. Docker volume prune remains forbidden without explicit owner approval. +- Credential escrow readback remains blocked: `ESCROW_MISSING_COUNT=5`。 +- Full service green is still blocked by MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`。 + +| Gate | Status | Evidence | +|------|--------|----------| +| 110 backup freshness | VERIFIED | 13/13 fresh, failed count 0, config failed 0. | +| 188 backup freshness | VERIFIED | 2/2 fresh, node-exporter scrape and textfile metrics restored. | +| Velero / MinIO storage | VERIFIED | MinIO health OK, BSL `Available`, backup `reboot-recovery-202606240456` `Completed`, freshness metric `1`. | +| PostgreSQL / Redis exporters | VERIFIED | `pg_up=1`, `redis_up=1`, Prometheus scrape `up=1` for both exporters. 
| +| Alert chain | VERIFIED_WITH_EXPECTED_REDLIGHTS | Exporter/Velero/disk alerts resolved; escrow missing and MOMO freshness blocker remain visible. | +| Credential escrow | BLOCKED | Five non-secret evidence markers still missing. | +| DR closeout | NO-GO | Must not be declared complete until real owner-provided non-secret evidence IDs are validated and markers are written. | + +Operational helpers: + +```bash +# 188 PostgreSQL / Redis exporters +ssh ollama@192.168.0.188 'bash /home/ollama/bin/188-db-exporters-restore.sh' + +# 188 MinIO / 120 Velero BSL readback +ssh wooo@192.168.0.110 '/home/wooo/scripts/188-minio-velero-restore.sh' + +# Maintenance-window one-off Velero backup + backup-health textfile refresh +ssh wooo@192.168.0.110 'CREATE_VELERO_BACKUP=true REFRESH_BACKUP_HEALTH=true /home/wooo/scripts/188-minio-velero-restore.sh' +``` + +Current policy: restore backup and monitoring red lights first; do not silence `VeleroBackupNotRun` or exporter-down alerts. Healthy heartbeat success messages are suppressed separately and should not be confused with real backup/data/escrow alerts. --- diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 54cb5765..1a6d6c41 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.29 +> Version: v1.30 > Last updated: 2026-06-24 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -10,14 +10,15 @@ 本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天,必須先重跑 live check,再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。 -2026-06-24 02:44 live readback supersedes the earlier 02:08 green wording: +2026-06-24 06:35 live readback supersedes the earlier 02:44 wording: ```text Repo-side reboot SOP / Plan B / automation contracts: COMPLETE, 100%. 
Live cold-start read-only check: PASS=86 WARN=0 BLOCKED=1, Result=BLOCKED. -Service state: SERVICE_AVAILABLE_DATA_STALE_BLOCKED; 110/120/121/188 reachable, K3s mon/mon1 Ready, public routes/TLS green, 110/188 backup health fresh, 188 node-exporter textfile scrape restored. +Service state: SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED; 110/120/121/188 reachable, K3s mon/mon1 Ready, public routes/TLS green, 110/188 backup health fresh, 188 node-exporter / PostgreSQL exporter / Redis exporter restored, 188 MinIO endpoint and Velero BackupStorageLocation restored, 110 disk pressure cleared. MOMO state: current-month daily_sales_snapshot and realtime_sales_monthly match, but both stop at 2026-06-17. MOMO_DAILY_FRESHNESS is 6 days, which is a hard blocker because business data is not current. Google Drive state: momo scheduler token ownership is fixed for Docker userns, Drive listing works, but folder 當日業績匯入 currently has no matching 即時業績_當日 Excel source file. Archive latest matching file is 2026-06-18 and was already imported. +Backup / monitoring state: 188 MinIO is healthy, Velero BackupStorageLocation default is Available, one-off backup reboot-recovery-202606240456 completed, backup-health textfile reports Velero freshness green, and VeleroBackupNotRun / PostgreSQLDown / RedisDown / disk-pressure alerts resolved. Allowed declaration: core hosts, routes, K3s, backup/exporter surfaces are recovered; MOMO data pipeline is blocked waiting for a newer source file or owner-provided source evidence. Forbidden declaration: full-stack green, MOMO data current, DR complete, or runtime/security acceptance. Credential escrow evidence is still missing and must not be forged. ``` @@ -81,13 +82,13 @@ Allowed declaration: monitoring, alert rules, AI event packet, PlayBook / KM con Forbidden declaration: AI runtime remediation is enabled. 
Process termination, Docker/systemd restart, Nginx reload, firewall/K8s action, Telegram live send, Gateway queue write, Bot API call, production write, and secret read remain forbidden without owner approval, maintenance window, evidence ref, dry-run, and post-check. ``` -| 項目 | 2026-06-24 02:44 Asia/Taipei live result | 判定 | +| 項目 | 2026-06-24 06:35 Asia/Taipei live result | 判定 | |------|-------------------------------------------|------| -| Overall recovery readiness | `97%` | `SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED` | +| Overall recovery readiness | `98%` | `SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED` | | P0 host / K3s recovery | `100%` | `DONE` | -| P1 backup / alert / escrow | `93%` | `BLOCKED_DR_ESCROW` | +| P1 backup / alert / escrow | `96%` | `BLOCKED_DR_ESCROW` | | P2 service / data truth | `96%` | `BLOCKED_MOMO_DATA_FRESHNESS` | -| P3 docs / automation contracts | `100%` | `DONE_WITH_MOMO_FRESHNESS_GATE` | +| P3 docs / automation contracts | `100%` | `DONE_WITH_VELERO_AND_EXPORTER_RECOVERY_GATE` | | 110 host runtime | `fwupd-refresh.timer` intentionally disabled/inactive after non-runtime firmware metadata refresh failed units were classified; `systemctl --failed` returns `0 loaded units listed`; rollback is `sudo systemctl enable --now fwupd-refresh.timer` | `GREEN_WITH_FWUPD_TIMER_DISABLED` | | 110 host runaway process guard | 14:31-14:32 live scrape confirms `monitor_up=1`, orphan browser groups `0`, active Gitea Actions containers `2`, `load5_per_core≈0.79-0.81`, `swap_used_ratio≈1.0`, and `remediation_authorized=0`; exporter/helper also remain in Ansible textfile exporter source-of-truth. | `LIVE_SCRAPED_RUNTIME_GATE_0` | | 120 reachability | ping OK, SSH OK, boot around `2026-06-14 02:23`, K3s active, node `mon Ready` | `GREEN` | @@ -95,10 +96,10 @@ Forbidden declaration: AI runtime remediation is enabled. 
Process termination, D | 188 host runtime | production services green; host degraded only by `certbot.service` and `snap.certbot.renew.service` | `GREEN_WITH_CERTBOT_DEBT` | | K3s node state | `mon Ready control-plane`, `mon1 Ready control-plane`; bad pods `0`; `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0` | `GREEN_WITH_RETAINED_EVIDENCE` | | 110 -> 120 / 188 SSH trust | 00:33 cold-start exposed stale `known_hosts`; backup `/home/wooo/.ssh/known_hosts.before-120-188-refresh.20260613-003416`; final repair backup `/home/wooo/.ssh/known_hosts.before-120-188-final-refresh.20260613-011949`; CD fix `80e6ec1a` moves deploy trust to `/home/wooo/.ssh/deploy_known_hosts`; 01:28 global `known_hosts` still contains 120 / 188 and was not clobbered by deploy marker `e4a349bc` | `GREEN_WITH_GUARDRAIL` | -| Backup status | 13:43 status: 110 backup health `13/13 fresh failed=0`, 188 backup health `2/2 fresh failed=0`; escrow readback still shows `ESCROW_MISSING_COUNT=5` | `GREEN_WITH_DR_ESCROW_WARNING` | +| Backup status | 06:35 status: 110 backup health fresh, 188 backup health fresh, MinIO / Velero storage restored, latest Velero backup fresh, exporter scrape green; escrow readback still shows `ESCROW_MISSING_COUNT=5` | `GREEN_WITH_DR_ESCROW_WARNING` | | Offsite sync / verify | 01:28 textfile: `awoooi_backup_offsite_remote_verify_ok=1`, `full_verify_fresh=1`, all 13 repos have `snapshot_count=1` and `snapshot_latest_only=1`; latest scheduled verifier log is 2026-06-12 07:20 | `GREEN` | | Backup / cold-start alerts | 01:27 live visibility check confirms Prometheus and Alertmanager expose the 5 required credential escrow gap alerts; Prometheus rules API has all five required alert names healthy; label contract check loads 24 baseline backup alert rules | `GREEN_WITH_EXPECTED_REDLIGHTS` | -| Cold-start scorecard | 13:43 stale Job classification 後只讀 scorecard:`PASS=84 WARN=0 BLOCKED=0`。Public routes / TLS、momo DB parity、backup exporters、120/121 K3s 與 AWOOOI API/Web 
皆通過。 | `GREEN_FOR_SERVICE` | +| Cold-start scorecard | 06:35 read-only scorecard:`PASS=86 WARN=0 BLOCKED=1`。Public routes / TLS、momo DB parity、backup exporters、120/121 K3s、MinIO / Velero、AWOOOI API/Web 皆通過;only blocker is MOMO data freshness. | `BLOCKED_MOMO_DATA_FRESHNESS` | | momo DB parity | `4571|4571|2026-06-01|2026-06-07|2026-06-01|2026-06-07` | `GREEN` | | momo scheduler | container healthy; scorecard reads `SCHEDULER_RECENT_ACTIVITY 1009`; detector widened and deployed to 110 | `GREEN` | | ArgoCD app health | This 13:43 cold-start pass does not reassert ArgoCD app health directly. Service release uses K8s node / route / workload / CronJob active-failure evidence. Any ArgoCD release claim still requires a fresh `awoooi-prod` app readback. | `NOT_REASSERTED_BY_THIS_GATE` | @@ -1747,6 +1748,61 @@ SQL 若 Drive pending folder 無新來源檔,不可手動 truncate、不可以舊 archive 檔重複匯入來製造「最新」,也不可把 DB parity 當 data freshness。下一個解除 blocker 的證據必須是:新的 `即時業績_當日` source file 可見、import job 成功、`sync_success=true`、`daily_sales_snapshot` 與 `realtime_sales_monthly` 日期上下界一致,且 `MOMO_DAILY_FRESHNESS <= 2`。 +### 14.29 2026-06-24 188 MinIO / Velero、DB exporter 與 110 disk pressure recovery + +2026-06-24 的第四段變更是恢復真正的備份與監控鏈路,而不是消音告警。`VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk pressure 都是有效紅燈;修復順序必須是 source-of-truth / service / exporter / Prometheus / Alertmanager / cold-start scorecard。 + +| 項目 | 2026-06-24 06:35 recovery baseline | +|------|-------------------------------------| +| SOP version | `v1.30` | +| 188 DB exporter root cause | Docker user namespace 下 exporter compose 不能使用 `network_mode: host`;Redis live port 是 `6380` | +| 188 DB exporter source-of-truth | `ops/monitoring/docker-compose.exporters.yaml` 改為 bridge port mapping;PostgreSQL DSN 只從 host `.env.exporters` 注入,repo 不放密碼 | +| 188 DB exporter helper | `scripts/ops/188-db-exporters-restore.sh`;live path `/home/ollama/bin/188-db-exporters-restore.sh` | +| 188 DB exporter readback | local metrics 
`pg_up=1`、`redis_up=1`;Prometheus `up{job="postgres-exporter"}=1`、`pg_up=1`、`up{job="redis-exporter"}=1`、`redis_up=1` | +| 110 disk pressure | `/` from `92%` used to `73%` used after Docker image / build cache cleanup only; no Docker volume prune | +| MinIO / Velero root cause | 188 MinIO endpoint `192.168.0.188:9000` was down; Velero BSL S3 list failed; MinIO data path had userns write denial | +| MinIO restore | live `/home/ollama/minio/docker-compose.override.yml` adds `userns_mode: host` for the `minio` service; MinIO health endpoint is OK | +| Velero restore | 120 `BackupStorageLocation/default` phase is `Available`; one-off backup `reboot-recovery-202606240456` is `Completed` | +| Backup-health textfile | 110 exporter refresh reports `awoooi_velero_monitor_up=1`, `awoooi_velero_latest_completed_backup_fresh=1`, restore-test cron present, failed jobs `0` | +| Alert readback | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts resolved | +| Live cold-start readback | `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`; only blocker remains `MOMO_DAILY_FRESHNESS 6|2026-06-17` | +| Declaration limit | 可宣稱 backup / exporter / MinIO / Velero chain recovered;不可宣稱 full-stack green、MOMO data current、DR complete 或 runtime/security acceptance | + +188 PostgreSQL / Redis exporter post-reboot recovery: + +```bash +ssh ollama@192.168.0.188 'bash /home/ollama/bin/188-db-exporters-restore.sh' +curl -fsS http://192.168.0.188:9187/metrics | grep '^pg_up ' +curl -fsS http://192.168.0.188:9121/metrics | grep '^redis_up ' +``` + +188 MinIO / 120 Velero recovery from 110: + +```bash +ssh wooo@192.168.0.110 '/home/wooo/scripts/188-minio-velero-restore.sh' +``` + +如果需要在維護窗口中建立一次性 reboot-recovery 備份並刷新 110 backup-health textfile,必須明確設定: + +```bash +ssh wooo@192.168.0.110 'CREATE_VELERO_BACKUP=true REFRESH_BACKUP_HEALTH=true /home/wooo/scripts/188-minio-velero-restore.sh' +``` + +本地 repo helper 可同步 live script: + +```bash +scp -q 
scripts/ops/188-db-exporters-restore.sh ollama@192.168.0.188:/home/ollama/bin/188-db-exporters-restore.sh +scp -q scripts/ops/188-minio-velero-restore.sh wooo@192.168.0.110:/home/wooo/scripts/188-minio-velero-restore.sh +``` + +110 disk pressure cleanup rule: + +```text +Allowed in incident recovery: Docker image / build cache cleanup after checking `docker system df`. +Forbidden without explicit owner approval: `docker volume prune`, deleting database / registry / MinIO / ClickHouse / Sentry / PostgreSQL volumes, or removing unknown bind-mounted state. +Done gate: filesystem use below 85%, no active disk-pressure alerts, and no service regression in cold-start scorecard. +``` + ### 14.22 重啟後時間軸驗證 每次重啟後照時間軸推進,不要等到最後才一次判定。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index 6326e5ce..13f7313d 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -11,13 +11,13 @@ | Area | Status | Completion | Evidence | |------|--------|------------|----------| -| Overall recovery readiness | SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED | 97% | 2026-06-24 02:44 live cold-start read-only gate returned `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`。110 / 120 / 121 / 188 ping and SSH port are OK, K3s `mon` / `mon1` are Ready, `NODE_FS_ERROR_EVENTS=0`, `NODE_READONLY_FILESYSTEM_TRUE=0`, `NODE_DISK_PRESSURE_TRUE=0`, public routes/TLS are green, 110 / 188 runtime and backup checks are green。188 `node-exporter` textfile scrape is restored. Remaining service blocker is MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`; Drive listing works after token owner repair, but `當日業績匯入` has no newer `即時業績_當日` Excel source file. DR remains blocked because credential escrow evidence markers are still missing and must not be forged. 
| +| Overall recovery readiness | SERVICE_AVAILABLE_MOMO_SOURCE_BLOCKED_DR_ESCROW_BLOCKED | 98% | 2026-06-24 06:35 live cold-start read-only gate returned `PASS=86 WARN=0 BLOCKED=1`, result `BLOCKED`。110 / 120 / 121 / 188 ping and SSH port are OK, K3s `mon` / `mon1` are Ready, `NODE_FS_ERROR_EVENTS=0`, `NODE_READONLY_FILESYSTEM_TRUE=0`, `NODE_DISK_PRESSURE_TRUE=0`, public routes/TLS are green, 110 / 188 runtime and backup checks are green。188 `node-exporter`、PostgreSQL exporter、Redis exporter、MinIO / Velero BSL are restored; 110 disk pressure cleared to 73%。Remaining service blocker is MOMO business data freshness: `MOMO_DAILY_FRESHNESS 6|2026-06-17`; Drive listing works after token owner repair, but `當日業績匯入` has no newer `即時業績_當日` Excel source file. DR remains blocked because credential escrow evidence markers are still missing and must not be forged. | | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. | -| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 93% | 2026-06-24 02:20 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-23 20:53:42`。02:24 restored 188 `node-exporter` textfile scrape; Prometheus now has `up{job="node-exporter-188"}=1` and `awoooi_backup_health_monitor_up{host="188"}=1`; `BackupHealthMonitorMissing188` resolved. DR remains blocked on real non-secret credential escrow evidence IDs. 
| +| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 96% | 2026-06-24 06:35 backup / alert readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`。188 `node-exporter` textfile scrape、PostgreSQL exporter、Redis exporter、MinIO endpoint、Velero BSL and latest completed backup freshness are restored; `BackupHealthMonitorMissing188`、`PostgreSQLDown`、`RedisDown`、`VeleroBackupNotRun` and 110 disk-pressure alerts are resolved. DR remains blocked on real non-secret credential escrow evidence IDs. | | P2 service / data truth | BLOCKED_MOMO_DATA_FRESHNESS | 96% | Public route/TLS, API/Web route, momo health, current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness/storage conditions, VIP, and 110 / 188 runtime health are green. However MOMO latest business date is `2026-06-17`; stale age is `6` days. Drive pending folder has `0` matching files and archive latest is the already-imported 2026-06-18 file, so there is no safe newer source to import. 
| -| P3 docs / automation contracts | DONE_WITH_MOMO_FRESHNESS_GATE | 100% | Workplan, SOP v1.29, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, 188 node-exporter restore helper, MOMO Google Drive token userns readback, MOMO daily freshness blocker, and 2026-06-24 live readback are updated. Production image `a84a5a0b` is live with API `2/2`, Web `2/2`, Worker `1/1`; CD `#3289` is a known false-negative caused by worker startup / rollout timeout after deploy marker `4a7b5329`. 
| +| P3 docs / automation contracts | DONE_WITH_VELERO_AND_EXPORTER_RECOVERY_GATE | 100% | Workplan, SOP v1.30, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, 188 node-exporter restore helper, 188 DB/Redis exporter restore helper, 188 MinIO/Velero restore helper, 110 Docker disk pressure cleanup boundary, MOMO Google Drive token userns readback, MOMO daily freshness blocker, and 2026-06-24 06:35 live readback are updated. Production image `a84a5a0b` is live with API `2/2`, Web `2/2`, Worker `1/1`; CD `#3289` is a known false-negative caused by worker startup / rollout timeout after deploy marker `4a7b5329`. | -Full cold-start service readiness may not be declared green for the latest verified evidence set. As of 2026-06-24 02:44, routes/hosts/K3s/backups are available, but the scorecard is `PASS=86 WARN=0 BLOCKED=1` because MOMO business data freshness is stale beyond 3 days. Do not declare DR scorecard complete while credential escrow evidence remains blocked. 
+Full cold-start service readiness must not be declared green for the latest verified evidence set. As of 2026-06-24 06:35, routes/hosts/K3s/backups/exporters/Velero are available, but the scorecard is `PASS=86 WARN=0 BLOCKED=1` because MOMO business data freshness is stale beyond 3 days. Do not declare the DR scorecard complete while credential escrow evidence remains blocked. 2026-06-13 01:26 refresh: full cold-start is again green for the current evidence set. AWOOOI API/Web workload balancing survived the next normal CD deploy: Gitea main `e4a349bc`, ArgoCD revision `e4a349bc`, images from `414413a5`, API/Web split across `mon` / `mon1`, and global `known_hosts` retained 120 / 188 after CD fix `80e6ec1a`. Do not declare DR complete while credential escrow is missing. `km-vectorize` remediation is `90%`: schedule/label fix is live, and the remaining gate is the next official 03:00 CronJob success readback. @@ -147,6 +147,7 @@ Next: | P1-012 | DONE | 100 | Audit credential escrow marker write safety | 2026-06-12 15:02 `mark-credential-escrow-verified.sh --status` reports all five allowed items missing; `offsite-escrow-evidence-report.sh --no-color` reports rclone/offsite configured and `ESCROW_MISSING_COUNT=5`; repo search found only runbooks/placeholders/rules, not real evidence IDs. | Write markers only after a real non-secret evidence ID exists for each item; never write placeholder or secret. | The marker blocker is narrowed to missing external evidence IDs, not missing script/config/offsite readiness. | | P1-014 | DONE | 100 | Publish credential escrow owner request package | 2026-06-13 13:10 live report confirms `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`, `PASS=8 WARN=5 BLOCKED=0`. New owner request package defines allowed evidence-id types, forbidden secret values, safe dry-run flow, write flow, and closeout gates. 
| Dispatch to the credential owners without collecting secret values; keep marker write gated until owner gives real non-secret evidence IDs. | `docs/security/CREDENTIAL-ESCROW-EVIDENCE-OWNER-REQUEST.md` and snapshot exist and validate. | | P1-013 | DONE_FOR_SERVICE_READINESS | 100 | Remediate `km-vectorize` CronJob health debt | The retained `km-vectorize-29689620` failed Job is now classified as stale evidence, not an active blocker, because later official `km-vectorize` Jobs completed successfully. 2026-06-18 13:43 cold-start reads `FAILED_JOBS=1`, `STALE_FAILED_JOBS=1`, `ACTIVE_FAILED_JOBS=0`, `BAD_PODS=0`, and returns `PASS=84 WARN=0 BLOCKED=0`. | Keep retained failed Job as evidence unless an explicit maintenance window authorizes cleanup. Reassert ArgoCD app health only with a fresh ArgoCD app readback, not from the cold-start scorecard alone. | Service readiness no longer warns on stale failed Job evidence; active failed Job detection remains guarded. | +| P1-015 | DONE | 100 | Restore 188 MinIO / Velero backup freshness and DB exporters | 2026-06-24 06:35 resolved real backup / exporter red lights: 188 PostgreSQL exporter and Redis exporter now expose `pg_up=1` / `redis_up=1`; 188 MinIO health is live; 120 Velero BSL is `Available`; one-off backup `reboot-recovery-202606240456` completed; 110 backup-health textfile reports latest Velero backup fresh. 110 disk pressure was reduced from 92% to 73% by Docker image/build-cache cleanup only. | Reconcile MinIO `userns_mode: host` override into formal source-of-truth or data ownership fix; keep Docker volume prune forbidden without explicit owner approval. | `VeleroBackupNotRun`、`PostgreSQLDown`、`RedisDown`、110 disk-pressure alerts are resolved, and SOP includes restore helpers. | --- @@ -175,7 +176,7 @@ Next: | P3-005 | DONE | 100 | Update cold-start SOP | SOP now includes start, shutdown, reboot, record, comparison, and 120 blocker handling. | Increment SOP version after each process change. 
| SOP has controlled power-operation sections and ledger template. | | P3-006 | DONE | 100 | Update backup status | Backup status now reflects current cron, rclone latest-only, failure-only alert posture, and escrow blocker. | Refresh after 120 backup rerun. | Backup status no longer claims noisy success Telegram notifications. | | P3-007 | DONE | 100 | Harden Gitea backup stale dump handling | 2026-06-05 manual Gitea backup failed because the container retained `/tmp/gitea-dump.zip` from the 02:00 failure. `scripts/backup/backup-gitea.sh` now renames stale container dump files to timestamped evidence before running a new dump, and the live 110 script is updated. | Watch the next 02:00 Gitea backup. | `bash -n` passes locally and on 110; manual Gitea backup completed after stale evidence rename. | -| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.29 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, K3s filesystem event blocker, stale-vs-active K8s failed Job classification, post-reboot / post-CD recovery anchors, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note, 110 runaway browser / CI load 分流 PlayBook, healthy-heartbeat suppression, 188 node-exporter restore, MOMO Google Drive token userns readback, and MOMO data freshness hard blocker. | Use v1.29 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, failed/stale/active Job counters, runaway-process metrics, CI load attribution, MOMO source availability, data freshness, and blockers against §1.4 plus §11.1 / §14.8 through §14.28. 
Before any real reboot, rerun same-day live cold-start / backup / offsite / alert / escrow / runaway-process checks. | SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; live cold-start now returns `PASS=86 WARN=0 BLOCKED=1` when MOMO data freshness is stale, preventing false green. | +| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.30 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, K3s filesystem event blocker, stale-vs-active K8s failed Job classification, post-reboot / post-CD recovery anchors, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note, 110 runaway browser / CI load 分流 PlayBook, healthy-heartbeat suppression, 188 node-exporter restore, 188 DB/Redis exporter restore, 188 MinIO/Velero restore, 110 Docker disk cleanup boundary, MOMO Google Drive token userns readback, and MOMO data freshness hard blocker. | Use v1.30 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, failed/stale/active Job counters, runaway-process metrics, CI load attribution, MOMO source availability, data freshness, Velero freshness, exporter scrape, disk usage, and blockers against §1.4 plus §11.1 / §14.8 through §14.29. Before any real reboot, rerun same-day live cold-start / backup / offsite / alert / escrow / runaway-process checks. 
| SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; live cold-start now returns `PASS=86 WARN=0 BLOCKED=1` when MOMO data freshness is stale, preventing false green. | | P3-009 | DONE | 100 | Assess 120/121 AA/AS role and host load balancing | 2026-06-12 15:19 live check confirms 120 and 121 are both `Ready control-plane`, `k3s active`, `k3s-agent inactive`, with no taints; however most AWOOOI / ArgoCD / Velero workload remains on 121 after 120 fsck recovery. New assessment defines control-plane AA vs workload AA, migration candidates from 110/188, and stateful migration blockers. | After P0 backup/offsite/cold-start green, implement topology spread for AWOOOI API/Web before moving additional services. | `docs/runbooks/HOST-ROLE-LOAD-BALANCING-ASSESSMENT.md` exists; SOP v1.6 links AA/AS and load-balancing checks; migration implementation remains explicitly `0%`. | | P3-010 | DONE | 100 | Update workload balancing docs with 2026-06-13 live truth | Host role assessment, workplan, SOP, backup status, and LOGBOOK are refreshed with current cold-start, backup, 188 certbot degraded, ArgoCD `km-vectorize` degraded, Gitea main `acaae999`, ArgoCD sync, and final pod placement evidence. | Keep updating this file after the next reboot or deploy. | Docs separate service-green status from DR escrow, workload rollout, and non-service governance debt. | | P3-011 | DONE | 100 | Record `km-vectorize` remediation status | LOGBOOK, this workplan, and SOP now state the schedule/label fix, ArgoCD sync evidence, the invalid manual Job boundary, and the 90% waiting-for-next-schedule gate. | After next 03:00 run, update this row and the top verdict with `lastSuccessfulTime` / ArgoCD health evidence. 
| No document claims ArgoCD green before official CronJob success evidence exists. | diff --git a/ops/monitoring/docker-compose.exporters.yaml b/ops/monitoring/docker-compose.exporters.yaml index e12904b7..7659154f 100644 --- a/ops/monitoring/docker-compose.exporters.yaml +++ b/ops/monitoring/docker-compose.exporters.yaml @@ -27,15 +27,14 @@ services: environment: # 連線字串 (使用環境變數注入密碼) # 2026-04-08 Claude Sonnet 4.6: 修正用戶名/資料庫名 (awoooi user, awoooi_prod db) - DATA_SOURCE_NAME: "postgresql://awoooi:${POSTGRES_PASSWORD:-awoooi_prod_2026}@localhost:5432/awoooi_prod?sslmode=disable" + DATA_SOURCE_NAME: "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:?POSTGRES_EXPORTER_DATA_SOURCE_NAME is required}" # 自訂查詢配置 PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml" # 日誌等級 PG_EXPORTER_LOG_LEVEL: "info" volumes: - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro - # 直接使用 host network 連接本地 PostgreSQL - network_mode: host + # Docker userns cannot share the host network namespace. Connect through the host LAN address. labels: - "prometheus.scrape=true" - "prometheus.port=9187" @@ -55,13 +54,12 @@ services: - "9121:9121" environment: # Redis 連線 (192.168.0.188:6380 是 AWOOOI Redis) - REDIS_ADDR: "redis://localhost:6380" + REDIS_ADDR: "${REDIS_EXPORTER_ADDR:-192.168.0.188:6380}" REDIS_PASSWORD: "${REDIS_PASSWORD:-}" # 啟用額外指標 REDIS_EXPORTER_CHECK_KEYS: "awoooi:*" REDIS_EXPORTER_INCL_SYSTEM_METRICS: "true" - # 直接使用 host network 連接本地 Redis - network_mode: host + # Docker userns cannot share the host network namespace. Connect through the host LAN address. labels: - "prometheus.scrape=true" - "prometheus.port=9121"
diff --git a/scripts/ops/188-db-exporters-restore.sh b/scripts/ops/188-db-exporters-restore.sh
new file mode 100755
index 00000000..536c37da
--- /dev/null
+++ b/scripts/ops/188-db-exporters-restore.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# Restore PostgreSQL / Redis exporters on 192.168.0.188 without host networking.
+#
+# Required on the host:
+#   /home/ollama/monitoring/.env.exporters
+#
+# Required variables:
+#   POSTGRES_EXPORTER_DATA_SOURCE_NAME=postgresql://...@192.168.0.188:5432/...?...sslmode=disable
+#
+# Optional variables:
+#   REDIS_EXPORTER_ADDR=192.168.0.188:6380
+#   REDIS_PASSWORD=
+#   EXPORTER_READBACK_ATTEMPTS=15  metrics readback attempts per exporter
+#   EXPORTER_READBACK_DELAY=2      seconds to sleep between attempts
+#
+# Exit status: 0 when pg_up=1 and redis_up=1; 2 when required configuration
+# is missing; non-zero otherwise.
+
+set -euo pipefail
+
+ENV_FILE="${EXPORTER_ENV_FILE:-/home/ollama/monitoring/.env.exporters}"
+QUERIES_FILE="${POSTGRES_EXPORTER_QUERIES_FILE:-/home/ollama/monitoring/postgres-exporter-queries.yaml}"
+POSTGRES_IMAGE="${POSTGRES_EXPORTER_IMAGE:-prometheuscommunity/postgres-exporter:v0.15.0}"
+REDIS_IMAGE="${REDIS_EXPORTER_IMAGE:-oliver006/redis_exporter:v1.58.0}"
+READBACK_ATTEMPTS="${EXPORTER_READBACK_ATTEMPTS:-15}"
+READBACK_DELAY="${EXPORTER_READBACK_DELAY:-2}"
+
+# Print the first sample of metric $2 served at URL $1, or nothing when it
+# never becomes readable. A container started a moment ago may not be serving
+# yet, so the scrape is retried instead of aborting on the first failure.
+read_metric() {
+  local url=$1 metric=$2 attempt value
+  for ((attempt = 1; attempt <= READBACK_ATTEMPTS; attempt++)); do
+    # awk reads to EOF (no early exit) so curl never writes into a closed
+    # pipe, which pipefail would report as a failed scrape.
+    value="$(curl -fsS --max-time 5 "$url" 2>/dev/null \
+      | awk -v name="$metric" '$1 == name && !seen {print $2; seen = 1}')" || value=""
+    if [ -n "$value" ]; then
+      printf '%s\n' "$value"
+      return 0
+    fi
+    sleep "$READBACK_DELAY"
+  done
+  return 0
+}
+
+if [ ! -f "$ENV_FILE" ]; then
+  echo "EXPORTER_ENV_FILE_MISSING $ENV_FILE" >&2
+  exit 2
+fi
+
+set -a
+# shellcheck disable=SC1090
+. "$ENV_FILE"
+set +a
+
+if [ -z "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:-}" ]; then
+  echo "POSTGRES_EXPORTER_DATA_SOURCE_NAME_MISSING" >&2
+  exit 2
+fi
+
+if [ ! -f "$QUERIES_FILE" ]; then
+  echo "POSTGRES_EXPORTER_QUERIES_FILE_MISSING $QUERIES_FILE" >&2
+  exit 2
+fi
+
+REDIS_EXPORTER_ADDR="${REDIS_EXPORTER_ADDR:-192.168.0.188:6380}"
+REDIS_PASSWORD="${REDIS_PASSWORD:-}"
+
+# Best effort: the containers may not exist yet on a fresh host.
+docker rm -f postgres-exporter redis-exporter >/dev/null 2>&1 || true
+
+# Secrets are passed by name (-e VAR): docker copies the value from its own
+# environment, so the DSN / password never show up in the process list.
+DATA_SOURCE_NAME="$POSTGRES_EXPORTER_DATA_SOURCE_NAME" docker run -d \
+  --name postgres-exporter \
+  --restart unless-stopped \
+  -p 9187:9187 \
+  -e DATA_SOURCE_NAME \
+  -e PG_EXPORTER_EXTEND_QUERY_PATH=/etc/postgres_exporter/queries.yaml \
+  -e PG_EXPORTER_LOG_LEVEL=info \
+  -v "$QUERIES_FILE:/etc/postgres_exporter/queries.yaml:ro" \
+  "$POSTGRES_IMAGE" >/dev/null
+
+redis_args=(
+  docker run -d
+  --name redis-exporter
+  --restart unless-stopped
+  -p 9121:9121
+  -e "REDIS_ADDR=$REDIS_EXPORTER_ADDR"
+  -e "REDIS_EXPORTER_CHECK_KEYS=awoooi:*"
+  -e REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
+)
+if [ -n "$REDIS_PASSWORD" ]; then
+  redis_args+=(-e REDIS_PASSWORD)
+fi
+redis_args+=("$REDIS_IMAGE")
+REDIS_PASSWORD="$REDIS_PASSWORD" "${redis_args[@]}" >/dev/null
+
+pg_up="$(read_metric http://127.0.0.1:9187/metrics pg_up)"
+redis_up="$(read_metric http://127.0.0.1:9121/metrics redis_up)"
+
+echo "POSTGRES_EXPORTER_UP ${pg_up:-missing}"
+echo "REDIS_EXPORTER_UP ${redis_up:-missing}"
+
+test "${pg_up:-0}" = "1"
+test "${redis_up:-0}" = "1"
diff --git a/scripts/ops/188-minio-velero-restore.sh b/scripts/ops/188-minio-velero-restore.sh
new file mode 100755
index 00000000..28889661
--- /dev/null
+++ b/scripts/ops/188-minio-velero-restore.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+# Restore the 188 MinIO endpoint used by Velero, then optionally create a
+# one-off Velero backup to re-establish backup freshness evidence.
+#
+# Stages, each gating the next:
+#   1. Write the userns override beside the MinIO compose file and start MinIO.
+#   2. Wait for MinIO /minio/health/live on the MinIO host.
+#   3. Wait for Velero BackupStorageLocation "default" to report Available.
+#   4. CREATE_VELERO_BACKUP=true only: create one Backup and wait for it.
+#   5. REFRESH_BACKUP_HEALTH=true only: rerun the backup-health exporter.
+#
+# Every setting below can be overridden through the environment.
+
+set -euo pipefail
+
+MINIO_HOST="${MINIO_HOST:-ollama@192.168.0.188}"
+K3S_HOST="${K3S_HOST:-wooo@192.168.0.120}"
+BACKUP_HEALTH_HOST="${BACKUP_HEALTH_HOST:-wooo@192.168.0.110}"
+MINIO_COMPOSE_FILE="${MINIO_COMPOSE_FILE:-/home/ollama/minio/docker-compose.yml}"
+MINIO_OVERRIDE_FILE="${MINIO_OVERRIDE_FILE:-/home/ollama/minio/docker-compose.override.yml}"
+VELERO_NAMESPACE="${VELERO_NAMESPACE:-velero}"
+VELERO_TARGET_NAMESPACE="${VELERO_TARGET_NAMESPACE:-awoooi-prod}"
+CREATE_VELERO_BACKUP="${CREATE_VELERO_BACKUP:-false}"
+REFRESH_BACKUP_HEALTH="${REFRESH_BACKUP_HEALTH:-false}"
+BACKUP_NAME="${VELERO_BACKUP_NAME:-reboot-recovery-$(date -u +%Y%m%d%H%M)}"
+BSL_WAIT_ATTEMPTS="${VELERO_BSL_WAIT_ATTEMPTS:-24}"
+
+# Print the phase of BackupStorageLocation "default"; empty when unreadable.
+bsl_phase() {
+  ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backupstoragelocations.velero.io default -o jsonpath='{.status.phase}'" || true
+}
+
+# Print the phase of Backup $BACKUP_NAME; empty when unreadable.
+backup_phase() {
+  ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o jsonpath='{.status.phase}'" || true
+}
+
+if ! ssh "$MINIO_HOST" "test -f '$MINIO_COMPOSE_FILE'"; then
+  echo "MINIO_COMPOSE_FILE_UNAVAILABLE host=$MINIO_HOST file=$MINIO_COMPOSE_FILE" >&2
+  exit 2
+fi
+
+# NOTE: this replaces any existing override file. The content travels over
+# ssh stdin, so no remote quoting of the YAML is involved.
+ssh "$MINIO_HOST" "cat > '$MINIO_OVERRIDE_FILE'" <<'EOF'
+services:
+  minio:
+    userns_mode: host
+EOF
+
+ssh "$MINIO_HOST" "docker compose -f '$MINIO_COMPOSE_FILE' -f '$MINIO_OVERRIDE_FILE' up -d"
+
+ssh "$MINIO_HOST" "for i in \$(seq 1 30); do curl -fsS --max-time 3 http://127.0.0.1:9000/minio/health/live >/dev/null && exit 0; sleep 2; done; docker logs --tail=80 minio >&2; exit 1"
+echo "MINIO_188_HEALTHY endpoint=192.168.0.188:9000"
+
+# Velero re-validates the location on its own schedule, so the phase can lag
+# behind MinIO coming back; poll instead of failing on the first stale read.
+bsl=""
+for ((attempt = 1; attempt <= BSL_WAIT_ATTEMPTS; attempt++)); do
+  bsl="$(bsl_phase)"
+  if [ "$bsl" = "Available" ]; then
+    break
+  fi
+  sleep 5
+done
+if [ "$bsl" != "Available" ]; then
+  echo "VELERO_BACKUP_STORAGE_LOCATION_UNAVAILABLE namespace=$VELERO_NAMESPACE phase=${bsl:-unknown}" >&2
+  exit 1
+fi
+echo "VELERO_BACKUP_STORAGE_LOCATION_AVAILABLE namespace=$VELERO_NAMESPACE"
+
+if [ "$CREATE_VELERO_BACKUP" = "true" ]; then
+  ssh "$K3S_HOST" "sudo -n k3s kubectl apply -f -" <<EOF
+apiVersion: velero.io/v1
+kind: Backup
+metadata:
+  name: $BACKUP_NAME
+  namespace: $VELERO_NAMESPACE
+  labels:
+    awoooi.wooo.work/source: reboot-recovery
+spec:
+  includedNamespaces:
+    - $VELERO_TARGET_NAMESPACE
+  storageLocation: default
+  ttl: 720h0m0s
+EOF
+
+  phase=""
+  for ((attempt = 1; attempt <= 60; attempt++)); do
+    phase="$(backup_phase)"
+    case "$phase" in
+      Completed)
+        break
+        ;;
+      Failed | PartiallyFailed | FailedValidation)
+        ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o yaml" >&2 || true
+        echo "VELERO_BACKUP_FAILED name=$BACKUP_NAME phase=$phase" >&2
+        exit 1
+        ;;
+    esac
+    sleep 5
+  done
+
+  if [ "$phase" != "Completed" ]; then
+    echo "VELERO_BACKUP_TIMEOUT name=$BACKUP_NAME phase=${phase:-unknown}" >&2
+    exit 1
+  fi
+  echo "VELERO_BACKUP_COMPLETED name=$BACKUP_NAME"
+fi
+
+if [ "$REFRESH_BACKUP_HEALTH" = "true" ]; then
+  ssh "$BACKUP_HEALTH_HOST" "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin AIOPS_HOST_LABEL=110 NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles /home/wooo/scripts/backup-health-textfile-exporter.py"
+  echo "BACKUP_HEALTH_TEXTFILE_REFRESHED host=110"
+fi