From 216b7d78e202ead365c9320be69e91bf6bb3826a Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 12 May 2026 15:50:44 +0800 Subject: [PATCH] =?UTF-8?q?fix(backup):=20=E6=8E=A5=E5=85=A5=20MOMO=20PG?= =?UTF-8?q?=20=E5=82=99=E4=BB=BD=E5=A4=B1=E6=95=97=E9=80=9A=E7=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/LOGBOOK.md | 43 ++++++ infra/ansible/playbooks/188-ai-web.yml | 66 +++++++++ scripts/backup/backup-momo-188-pg.sh | 184 +++++++++++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100755 scripts/backup/backup-momo-188-pg.sh diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 84d88d35..387cac6f 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,46 @@ +## 2026-05-12 | MOMO PostgreSQL 備份失敗通知接入 AwoooP + +**背景**:前一輪已把 188 `backup-from-110.sh` 收斂成 AWOOI API / AwoooP 優先、Telegram 只作 fallback;MOMO PostgreSQL daily backup 仍需要獨立腳本與 IaC 落地。最後決策是「成功不即時通知,避免洗版;失敗才送 AWOOI/AwoooP/TG」。 + +**本次修補**: +- 新增 `scripts/backup/backup-momo-188-pg.sh`: + - 部署目標為 `/home/ollama/momo-pro/scripts/pg_backup.sh`。 + - PostgreSQL 憑證只從 `momo-db` 容器環境讀取,禁止輸出或落地憑證值。 + - `pg_dump | gzip` 先寫 `.tmp`,檔案小於 `MIN_SIZE_BYTES` 視為失敗。 + - 成功後寫入 momo `backup_log`,保留 7 天備份。 + - `AWOOI_BACKUP_NOTIFY_SUCCESS` 預設為 `0`,成功路徑只寫 log;失敗路徑才呼叫 `notify-awoooi-ops.sh`。 +- `infra/ansible/playbooks/188-ai-web.yml`: + - 建立 `/home/ollama/momo-pro/scripts` 與 `/home/ollama/momo_backups`。 + - 部署 `notify-awoooi-ops.sh` 與 momo PG backup 腳本。 + - 安裝每日 02:00 cron。 + - 先移除現場未受 Ansible 管理的舊 momo PG cron 精確行,避免未來套 playbook 時重複排程。 + +**驗證與部署**: +- 本地檢查: + - `bash -n scripts/backup/backup-momo-188-pg.sh scripts/ops/notify-awoooi-ops.sh` → passed。 + - `ruby -e 'require "yaml"; YAML.load_file("infra/ansible/playbooks/188-ai-web.yml"); puts "yaml ok"'` → `yaml ok`。 + - `git diff --check` → clean。 + - `AWOOI_OPS_DRY_RUN=1 ... scripts/ops/notify-awoooi-ops.sh | python3 -m json.tool` → failure / success payload 皆可解析。 + - `AWOOI_OPS_DRY_RUN=1 DB_CONTAINER=definitely-missing-momo-db ... backup-momo-188-pg.sh` → exit `1`,失敗路徑可觸發通知 helper dry-run。 +- 已重新同步到 188: + - `/home/ollama/momo-pro/scripts/pg_backup.sh` + - `/home/ollama/momo-pro/scripts/notify-awoooi-ops.sh` + - 權限皆為 executable;`bash -n` passed。 +- 188 遠端 dry-run: + - `AWOOI_OPS_DRY_RUN=1 ... /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh | python3 -m json.tool` → failure payload 可解析,`alertname=Backup.MomoPostgres`、`status=failed`。 +- 188 實際備份驗證: + - `AWOOI_BACKUP_LOG_STDOUT=1 AWOOI_BACKUP_NOTIFY_SUCCESS=0 /home/ollama/momo-pro/scripts/pg_backup.sh` → success。 + - 產出 `/home/ollama/momo_backups/momo_analytics_20260512_153807.sql.gz`,大小 `137M`。 + - log 顯示 `Backup success ... (137M, 26s)`、`backup_log insert success`、`Deleted old backups: 0`。 + - momo `backup_log` 最新列:`momo_analytics_20260512_153807.sql.gz|143502744|26|success`。 + - 成功路徑 log 顯示 `略過 AwoooP 成功通知;backup-health exporter 作為健康狀態來源`。 +- AwoooP 降噪確認: + - 實際成功備份前後 `/api/v1/platform/runs/list?per_page=1` total 維持 `42`。 + - 判讀:成功備份未新增 outbound/run,不會洗版;失敗路徑仍會走 AWOOI API / TelegramGateway / AwoooP。 +- 現場 cron: + - 188 目前已有 `0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1`。 + - 本次 playbook 已加舊行清理,下一次套 Ansible 不會和 managed cron 重複。 + ## 2026-05-12 | Ops 通知旁路收斂到 AWOOI API / AwoooP **背景**:CI/CD 通知已改成先走 AWOOI Alertmanager 入口,並由 TelegramGateway 鏡像到 AwoooP Run Timeline;但 188 ops 腳本仍有直接 Telegram 發送路徑。這會讓備份、DR Drill、host backup 等營運事件繞過 AwoooP 的治理與稽核,只在 Telegram 群組出現。 diff --git a/infra/ansible/playbooks/188-ai-web.yml b/infra/ansible/playbooks/188-ai-web.yml index 9ae44bbe..b42ac5a2 100644 --- a/infra/ansible/playbooks/188-ai-web.yml +++ b/infra/ansible/playbooks/188-ai-web.yml @@ -45,6 +45,72 @@ - litellm tags: docker + # ======================================================================== + # 備份排程 + # ======================================================================== + - name: "Backup | 確認 momo backup 目錄存在" + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: ollama + group: ollama + mode: "0755" + loop: + - /home/ollama/momo-pro/scripts + - /home/ollama/momo_backups + tags: backup_jobs + + - name: "Backup | 安裝 AwoooP ops 通知 helper" + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../scripts/ops/notify-awoooi-ops.sh" + dest: /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh + owner: ollama + group: ollama + mode: "0755" + tags: backup_jobs + + - name: "Backup | 安裝 momo PostgreSQL 備份腳本" + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../scripts/backup/backup-momo-188-pg.sh" + dest: /home/ollama/momo-pro/scripts/pg_backup.sh + owner: ollama + group: ollama + mode: "0755" + tags: backup_jobs + + - name: "Backup | 讀取既有 momo PostgreSQL crontab" + ansible.builtin.command: + cmd: "crontab -l -u ollama" + register: momo_pg_crontab + changed_when: false + failed_when: momo_pg_crontab.rc not in [0, 1] + tags: backup_jobs + + - name: "Backup | 移除未受 Ansible 管理的舊 momo PostgreSQL cron" + ansible.builtin.shell: | + set -euo pipefail + crontab -l -u ollama 2>/dev/null \ + | awk '$0 != "0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1"' \ + | crontab -u ollama - + args: + executable: /bin/bash + when: >- + '0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1' + in momo_pg_crontab.stdout_lines + tags: backup_jobs + + - name: "Backup | 安裝 momo PostgreSQL daily cron" + ansible.builtin.cron: + name: "AWOOOI momo PostgreSQL daily backup" + user: ollama + minute: "0" + hour: "2" + job: >- + PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + /home/ollama/momo-pro/scripts/pg_backup.sh + >> /home/ollama/momo_backups/backup.log 2>&1 + tags: backup_jobs + # ======================================================================== # n8n / open-webui (Sprint A 新啟動) # ======================================================================== diff --git a/scripts/backup/backup-momo-188-pg.sh b/scripts/backup/backup-momo-188-pg.sh new file mode 100755 index 00000000..9321d886 --- /dev/null +++ b/scripts/backup/backup-momo-188-pg.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +# 192.168.0.188 主機層 momo PostgreSQL 備份控制器。 +# +# 由 Ansible 部署到: +# /home/ollama/momo-pro/scripts/pg_backup.sh +# +# PostgreSQL 憑證只從 momo-db 容器環境讀取;禁止輸出或落地憑證值。 + +set -Eeuo pipefail + +BACKUP_DIR="${BACKUP_DIR:-/home/ollama/momo_backups}" +DB_CONTAINER="${DB_CONTAINER:-momo-db}" +DB_USER="${DB_USER:-momo}" +DB_NAME="${DB_NAME:-momo_analytics}" +KEEP_DAYS="${KEEP_DAYS:-7}" +MIN_SIZE_BYTES="${MIN_SIZE_BYTES:-1048576}" +LOG_FILE="${LOG_FILE:-${BACKUP_DIR}/backup.log}" +TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +FILENAME="momo_analytics_${TIMESTAMP}.sql.gz" +FILEPATH="${BACKUP_DIR}/${FILENAME}" +TMP_FILE="${FILEPATH}.tmp" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +START_TS="$(date +%s)" + +log() { + local line + line="$(printf '[%s] %s' "$(date '+%Y-%m-%d %H:%M:%S')" "$*")" + printf '%s\n' "$line" >> "${LOG_FILE}" + if [[ "${AWOOI_BACKUP_LOG_STDOUT:-0}" == "1" || -t 1 ]]; then + printf '%s\n' "$line" + fi +} + +cleanup_tmp() { + rm -f "${TMP_FILE}" +} + +elapsed_seconds() { + local now + now="$(date +%s)" + echo "$((now - START_TS))" +} + +notify_awoooi_ops() { + local status="$1" + local summary="$2" + local detail="$3" + local helper="${SCRIPT_DIR}/notify-awoooi-ops.sh" + + if [[ "${AWOOI_BACKUP_NOTIFY_ENABLED:-1}" != "1" ]]; then + return 0 + fi + [[ -x "$helper" ]] || return 1 + + AWOOI_OPS_ALERTNAME="Backup.MomoPostgres" \ + AWOOI_OPS_JOB_NAME="MOMO PostgreSQL 備份" \ + AWOOI_OPS_STATUS="$status" \ + AWOOI_OPS_SEVERITY="info" \ + AWOOI_OPS_SOURCE="momo-pg-backup" \ + AWOOI_OPS_COMPONENT="momo-postgres-backup" \ + AWOOI_OPS_SUMMARY="$summary" \ + AWOOI_OPS_DETAIL="$detail" \ + AWOOI_OPS_DURATION_SECONDS="$(elapsed_seconds)" \ + "$helper" >/dev/null +} + +notify_best_effort() { + local status="$1" + local summary="$2" + local detail="$3" + if [[ "$status" == "success" && "${AWOOI_BACKUP_NOTIFY_SUCCESS:-0}" != "1" ]]; then + log "略過 AwoooP 成功通知;backup-health exporter 作為健康狀態來源" + return 0 + fi + notify_awoooi_ops "$status" "$summary" "$detail" || log "WARN AwoooP notification failed" +} + +on_error() { + local exit_code="$?" + local line_no="${1:-unknown}" + set +e + notify_awoooi_ops \ + "failed" \ + "MOMO PostgreSQL 備份失敗" \ + "line=${line_no}; container=${DB_CONTAINER}; db=${DB_NAME}; file=${FILENAME}; log=${LOG_FILE}" \ + || true + exit "$exit_code" +} + +fail_backup() { + local message="$1" + log "ERROR ${message}" + notify_awoooi_ops \ + "failed" \ + "MOMO PostgreSQL 備份失敗" \ + "${message}; container=${DB_CONTAINER}; db=${DB_NAME}; file=${FILENAME}; log=${LOG_FILE}" \ + || true + exit 1 +} + +container_running() { + docker inspect -f '{{.State.Running}}' "${DB_CONTAINER}" 2>/dev/null | grep -qx true +} + +run_pg_dump() { + docker exec "${DB_CONTAINER}" sh -eu -c ' + : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}" + PGPASSWORD="${POSTGRES_PASSWORD}" exec pg_dump \ + -U "${POSTGRES_USER:-momo}" \ + -d "${POSTGRES_DB:-momo_analytics}" \ + --no-password \ + --no-owner \ + --no-acl + ' +} + +insert_backup_log() { + local size_bytes="$1" + local duration="$2" + docker exec \ + -e BACKUP_FILENAME="${FILENAME}" \ + -e BACKUP_SIZE_BYTES="${size_bytes}" \ + -e BACKUP_DURATION_SECONDS="${duration}" \ + -e BACKUP_HOST="$(hostname)" \ + -e BACKUP_STORAGE_PATH="${FILEPATH}" \ + "${DB_CONTAINER}" sh -eu -c ' + : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}" + PGPASSWORD="${POSTGRES_PASSWORD}" psql \ + -U "${POSTGRES_USER:-momo}" \ + -d "${POSTGRES_DB:-momo_analytics}" \ + --no-password \ + -v ON_ERROR_STOP=1 \ + -c "INSERT INTO backup_log (filename, file_size_bytes, duration_seconds, status, host, storage_path, completed_at) + VALUES (E'\''${BACKUP_FILENAME}'\'', ${BACKUP_SIZE_BYTES}, ${BACKUP_DURATION_SECONDS}, '\''success'\'', E'\''${BACKUP_HOST}'\'', E'\''${BACKUP_STORAGE_PATH}'\'', CURRENT_TIMESTAMP);" + ' >/dev/null 2>&1 +} + +main() { + mkdir -p "${BACKUP_DIR}" + trap cleanup_tmp EXIT + trap 'on_error ${LINENO}' ERR + + log "===== momo PostgreSQL backup start =====" + if ! container_running; then + fail_backup "${DB_CONTAINER} is not running" + fi + + local start_ts end_ts duration size_bytes size_human deleted backup_log_status + start_ts="$(date +%s)" + if run_pg_dump | gzip >"${TMP_FILE}"; then + size_bytes="$(stat -c%s "${TMP_FILE}" 2>/dev/null || stat -f%z "${TMP_FILE}" 2>/dev/null || echo 0)" + if [ "${size_bytes}" -lt "${MIN_SIZE_BYTES}" ]; then + fail_backup "backup file too small: ${size_bytes} bytes" + fi + mv "${TMP_FILE}" "${FILEPATH}" + chmod 0640 "${FILEPATH}" + else + fail_backup "pg_dump failed" + fi + + end_ts="$(date +%s)" + duration="$((end_ts - start_ts))" + size_human="$(du -h "${FILEPATH}" | awk '{print $1}')" + log "Backup success: ${FILENAME} (${size_human}, ${duration}s)" + + backup_log_status="success" + if insert_backup_log "${size_bytes}" "${duration}"; then + log "backup_log insert success" + else + backup_log_status="warning" + log "WARN backup_log insert failed; backup file is still valid" + fi + + deleted="$(find "${BACKUP_DIR}" -name 'momo_analytics_*.sql.gz' -mtime "+${KEEP_DAYS}" -print -delete | wc -l | tr -d ' ')" + log "Deleted old backups: ${deleted}" + log "===== momo PostgreSQL backup complete =====" + + notify_best_effort \ + "success" \ + "MOMO PostgreSQL 備份完成" \ + "file=${FILENAME}; size=${size_human}; duration=${duration}s; backup_log=${backup_log_status}; deleted_old=${deleted}" +} + +main "$@"