fix(backup): 接入 MOMO PG 備份失敗通知
This commit is contained in:
@@ -1,3 +1,46 @@
|
||||
## 2026-05-12 | MOMO PostgreSQL 備份失敗通知接入 AwoooP
|
||||
|
||||
**背景**:前一輪已把 188 `backup-from-110.sh` 收斂成 AWOOI API / AwoooP 優先、Telegram 只作 fallback;MOMO PostgreSQL daily backup 仍需要獨立腳本與 IaC 落地。最後決策是「成功不即時通知,避免洗版;失敗才送 AWOOI/AwoooP/TG」。
|
||||
|
||||
**本次修補**:
|
||||
- 新增 `scripts/backup/backup-momo-188-pg.sh`:
|
||||
- 部署目標為 `/home/ollama/momo-pro/scripts/pg_backup.sh`。
|
||||
- PostgreSQL 憑證只從 `momo-db` 容器環境讀取,禁止輸出或落地憑證值。
|
||||
- `pg_dump | gzip` 先寫 `.tmp`,檔案小於 `MIN_SIZE_BYTES` 視為失敗。
|
||||
- 成功後寫入 momo `backup_log`,保留 7 天備份。
|
||||
- `AWOOI_BACKUP_NOTIFY_SUCCESS` 預設為 `0`,成功路徑只寫 log;失敗路徑才呼叫 `notify-awoooi-ops.sh`。
|
||||
- `infra/ansible/playbooks/188-ai-web.yml`:
|
||||
- 建立 `/home/ollama/momo-pro/scripts` 與 `/home/ollama/momo_backups`。
|
||||
- 部署 `notify-awoooi-ops.sh` 與 momo PG backup 腳本。
|
||||
- 安裝每日 02:00 cron。
|
||||
- 先移除現場未受 Ansible 管理的舊 momo PG cron 精確行,避免未來套 playbook 時重複排程。
|
||||
|
||||
**驗證與部署**:
|
||||
- 本地檢查:
|
||||
- `bash -n scripts/backup/backup-momo-188-pg.sh scripts/ops/notify-awoooi-ops.sh` → passed。
|
||||
- `ruby -e 'require "yaml"; YAML.load_file("infra/ansible/playbooks/188-ai-web.yml"); puts "yaml ok"'` → `yaml ok`。
|
||||
- `git diff --check` → clean。
|
||||
- `AWOOI_OPS_DRY_RUN=1 ... scripts/ops/notify-awoooi-ops.sh | python3 -m json.tool` → failure / success payload 皆可解析。
|
||||
- `AWOOI_OPS_DRY_RUN=1 DB_CONTAINER=definitely-missing-momo-db ... backup-momo-188-pg.sh` → exit `1`,失敗路徑可觸發通知 helper dry-run。
|
||||
- 已重新同步到 188:
|
||||
- `/home/ollama/momo-pro/scripts/pg_backup.sh`
|
||||
- `/home/ollama/momo-pro/scripts/notify-awoooi-ops.sh`
|
||||
- 權限皆為 executable;`bash -n` passed。
|
||||
- 188 遠端 dry-run:
|
||||
- `AWOOI_OPS_DRY_RUN=1 ... /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh | python3 -m json.tool` → failure payload 可解析,`alertname=Backup.MomoPostgres`、`status=failed`。
|
||||
- 188 實際備份驗證:
|
||||
- `AWOOI_BACKUP_LOG_STDOUT=1 AWOOI_BACKUP_NOTIFY_SUCCESS=0 /home/ollama/momo-pro/scripts/pg_backup.sh` → success。
|
||||
- 產出 `/home/ollama/momo_backups/momo_analytics_20260512_153807.sql.gz`,大小 `137M`。
|
||||
- log 顯示 `Backup success ... (137M, 26s)`、`backup_log insert success`、`Deleted old backups: 0`。
|
||||
- momo `backup_log` 最新列:`momo_analytics_20260512_153807.sql.gz|143502744|26|success`。
|
||||
- 成功路徑 log 顯示 `略過 AwoooP 成功通知;backup-health exporter 作為健康狀態來源`。
|
||||
- AwoooP 降噪確認:
|
||||
- 實際成功備份前後 `/api/v1/platform/runs/list?per_page=1` total 維持 `42`。
|
||||
- 判讀:成功備份未新增 outbound/run,不會洗版;失敗路徑仍會走 AWOOI API / TelegramGateway / AwoooP。
|
||||
- 現場 cron:
|
||||
- 188 目前已有 `0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1`。
|
||||
- 本次 playbook 已加舊行清理,下一次套 Ansible 不會和 managed cron 重複。
|
||||
|
||||
## 2026-05-12 | Ops 通知旁路收斂到 AWOOI API / AwoooP
|
||||
|
||||
**背景**:CI/CD 通知已改成先走 AWOOI Alertmanager 入口,並由 TelegramGateway 鏡像到 AwoooP Run Timeline;但 188 ops 腳本仍有直接 Telegram 發送路徑。這會讓備份、DR Drill、host backup 等營運事件繞過 AwoooP 的治理與稽核,只在 Telegram 群組出現。
|
||||
|
||||
@@ -45,6 +45,72 @@
|
||||
- litellm
|
||||
tags: docker
|
||||
|
||||
# ========================================================================
|
||||
# 備份排程
|
||||
# ========================================================================
|
||||
# Ensure the script directory and the backup output directory exist,
# owned by the ollama user, before any file is copied into them.
- name: "Backup | 確認 momo backup 目錄存在"
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: ollama
    group: ollama
    mode: "0755"
  loop:
    # Holds pg_backup.sh and notify-awoooi-ops.sh (installed by later tasks).
    - /home/ollama/momo-pro/scripts
    # Holds the dump files and backup.log written by the cron job.
    - /home/ollama/momo_backups
  tags: backup_jobs
|
||||
|
||||
# Install the ops notification helper next to the backup script; the backup
# script looks for notify-awoooi-ops.sh in its own directory.
- name: "Backup | 安裝 AwoooP ops 通知 helper"
  ansible.builtin.copy:
    # Resolved relative to the playbook directory, three levels up.
    src: "{{ playbook_dir }}/../../../scripts/ops/notify-awoooi-ops.sh"
    dest: /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh
    owner: ollama
    group: ollama
    mode: "0755"
  tags: backup_jobs
|
||||
|
||||
# Install the momo PostgreSQL backup script under the name the cron job runs
# (backup-momo-188-pg.sh in the repository, pg_backup.sh on the host).
- name: "Backup | 安裝 momo PostgreSQL 備份腳本"
  ansible.builtin.copy:
    src: "{{ playbook_dir }}/../../../scripts/backup/backup-momo-188-pg.sh"
    dest: /home/ollama/momo-pro/scripts/pg_backup.sh
    owner: ollama
    group: ollama
    mode: "0755"
  tags: backup_jobs
|
||||
|
||||
# Snapshot the ollama user's current crontab so the next task can decide
# whether the legacy, unmanaged backup line is present.
# - rc 1 is tolerated alongside rc 0 (the read may fail when the user has no
#   crontab yet).
# - check_mode: false makes this read-only command run under --check as well;
#   otherwise the task is skipped, the registered result has no stdout_lines,
#   and the following task's `when` fails on an undefined attribute.
- name: "Backup | 讀取既有 momo PostgreSQL crontab"
  ansible.builtin.command:
    cmd: "crontab -l -u ollama"
  register: momo_pg_crontab
  changed_when: false
  failed_when: momo_pg_crontab.rc not in [0, 1]
  check_mode: false
  tags: backup_jobs
|
||||
|
||||
# Remove the legacy, hand-installed cron entry (exact line match only) so it
# does not run alongside the Ansible-managed entry installed afterwards.
# - The legacy line is defined once as a task var and reused by both the awk
#   filter and the `when` guard, so the two cannot drift apart.
# - `default([])` keeps the guard evaluable when the registered crontab
#   snapshot has no stdout_lines (for example when the read task was skipped).
- name: "Backup | 移除未受 Ansible 管理的舊 momo PostgreSQL cron"
  vars:
    momo_pg_legacy_cron_line: "0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1"
  ansible.builtin.shell: |
    set -euo pipefail
    crontab -l -u ollama 2>/dev/null \
      | awk -v legacy={{ momo_pg_legacy_cron_line | quote }} '$0 != legacy' \
      | crontab -u ollama -
  args:
    executable: /bin/bash
  when: momo_pg_legacy_cron_line in (momo_pg_crontab.stdout_lines | default([]))
  tags: backup_jobs
|
||||
|
||||
# Install the Ansible-managed daily backup entry for the ollama user at 02:00.
# The folded scalar below is joined into a single cron command line: an
# explicit PATH assignment, the backup script, and a redirect that appends
# stdout and stderr to backup.log.
- name: "Backup | 安裝 momo PostgreSQL daily cron"
  ansible.builtin.cron:
    name: "AWOOOI momo PostgreSQL daily backup"
    user: ollama
    minute: "0"
    hour: "2"
    job: >-
      PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
      /home/ollama/momo-pro/scripts/pg_backup.sh
      >> /home/ollama/momo_backups/backup.log 2>&1
  tags: backup_jobs
|
||||
|
||||
# ========================================================================
|
||||
# n8n / open-webui (Sprint A 新啟動)
|
||||
# ========================================================================
|
||||
|
||||
184
scripts/backup/backup-momo-188-pg.sh
Executable file
184
scripts/backup/backup-momo-188-pg.sh
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env bash
# Host-level momo PostgreSQL backup controller for 192.168.0.188.
#
# Deployed by Ansible to:
#   /home/ollama/momo-pro/scripts/pg_backup.sh
#
# PostgreSQL credentials are read only from the momo-db container
# environment; credential values must never be printed or written to disk.

# -E lets the ERR trap fire inside functions; -e/-u/pipefail abort on
# unhandled failures, unset variables and failing pipeline stages.
set -Eeuo pipefail

# Every setting below can be overridden from the environment.
# Directory that receives the dump files (and, by default, the log file).
BACKUP_DIR="${BACKUP_DIR:-/home/ollama/momo_backups}"
# Docker container that runs PostgreSQL.
DB_CONTAINER="${DB_CONTAINER:-momo-db}"
# NOTE(review): DB_USER is not referenced elsewhere in the visible script —
# the dump uses the container's POSTGRES_USER. Confirm whether it is needed.
DB_USER="${DB_USER:-momo}"
# Database name used in failure notification details.
DB_NAME="${DB_NAME:-momo_analytics}"
# Age threshold in days passed to `find -mtime` when pruning old dumps.
KEEP_DAYS="${KEEP_DAYS:-7}"
# A compressed dump smaller than this many bytes is treated as a failure.
MIN_SIZE_BYTES="${MIN_SIZE_BYTES:-1048576}"
LOG_FILE="${LOG_FILE:-${BACKUP_DIR}/backup.log}"
# One timestamp per run; it names the dump file.
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
FILENAME="momo_analytics_${TIMESTAMP}.sql.gz"
FILEPATH="${BACKUP_DIR}/${FILENAME}"
# The dump is written here first and renamed once it passes the size check.
TMP_FILE="${FILEPATH}.tmp"
# Directory of this script; the notification helper is looked up here.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Epoch seconds at script start, used to report elapsed time.
START_TS="$(date +%s)"
|
||||
|
||||
#######################################
# Append a timestamped line to the backup log.
# Globals:   LOG_FILE (appended to), AWOOI_BACKUP_LOG_STDOUT (read)
# Arguments: message words, joined into one line
# Outputs:   echoes the same line to stdout when AWOOI_BACKUP_LOG_STDOUT=1
#            or when stdout is a terminal
#######################################
log() {
  local stamp entry
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  entry="$(printf '[%s] %s' "${stamp}" "$*")"
  printf '%s\n' "${entry}" >> "${LOG_FILE}"

  # Nothing more to do unless console echo was requested or we are interactive.
  [[ "${AWOOI_BACKUP_LOG_STDOUT:-0}" == "1" || -t 1 ]] || return 0
  printf '%s\n' "${entry}"
}
|
||||
|
||||
#######################################
# Remove the in-progress dump file, if any. Installed as the EXIT trap.
# Globals: TMP_FILE (read)
#######################################
cleanup_tmp() {
  local partial_dump="${TMP_FILE}"
  rm -f "${partial_dump}"
}
|
||||
|
||||
#######################################
# Print the number of whole seconds since the script started.
# Globals: START_TS (read; epoch seconds)
# Outputs: elapsed seconds on stdout
#######################################
elapsed_seconds() {
  printf '%s\n' "$(( $(date +%s) - START_TS ))"
}
|
||||
|
||||
#######################################
# Send one ops event through the notify-awoooi-ops.sh helper that sits next to
# this script. The event is handed over via AWOOI_OPS_* environment variables.
# Globals:
#   SCRIPT_DIR                    (read) directory containing the helper
#   AWOOI_BACKUP_NOTIFY_ENABLED   (read) any value other than "1" disables
#                                 notifications; defaults to "1"
#   AWOOI_BACKUP_NOTIFY_SEVERITY  (read) severity passed to the helper;
#                                 defaults to "info", the previously
#                                 hard-coded value, so failure alerts can be
#                                 raised above "info" without editing the script
# Arguments:
#   $1 - status (e.g. "success" or "failed")
#   $2 - one-line summary
#   $3 - detail string
# Returns:
#   0 when notifications are disabled or the helper succeeds; 1 when the
#   helper is missing or not executable; otherwise the helper's exit status.
#######################################
notify_awoooi_ops() {
  local status="$1"
  local summary="$2"
  local detail="$3"
  local helper="${SCRIPT_DIR}/notify-awoooi-ops.sh"
  local severity="${AWOOI_BACKUP_NOTIFY_SEVERITY:-info}"

  if [[ "${AWOOI_BACKUP_NOTIFY_ENABLED:-1}" != "1" ]]; then
    return 0
  fi
  [[ -x "$helper" ]] || return 1

  # The helper's stdout is discarded; its stderr is left untouched.
  AWOOI_OPS_ALERTNAME="Backup.MomoPostgres" \
  AWOOI_OPS_JOB_NAME="MOMO PostgreSQL 備份" \
  AWOOI_OPS_STATUS="$status" \
  AWOOI_OPS_SEVERITY="$severity" \
  AWOOI_OPS_SOURCE="momo-pg-backup" \
  AWOOI_OPS_COMPONENT="momo-postgres-backup" \
  AWOOI_OPS_SUMMARY="$summary" \
  AWOOI_OPS_DETAIL="$detail" \
  AWOOI_OPS_DURATION_SECONDS="$(elapsed_seconds)" \
    "$helper" >/dev/null
}
|
||||
|
||||
#######################################
# Send a notification without letting a delivery failure abort the script.
# Success events are only logged unless AWOOI_BACKUP_NOTIFY_SUCCESS=1.
# Globals:   AWOOI_BACKUP_NOTIFY_SUCCESS (read; defaults to "0")
# Arguments: $1 - status, $2 - summary, $3 - detail
#######################################
notify_best_effort() {
  local status="$1" summary="$2" detail="$3"

  if [[ "$status" != "success" || "${AWOOI_BACKUP_NOTIFY_SUCCESS:-0}" == "1" ]]; then
    notify_awoooi_ops "$status" "$summary" "$detail" || log "WARN AwoooP notification failed"
    return
  fi

  # Quiet path: success notifications are suppressed to avoid noise.
  log "略過 AwoooP 成功通知;backup-health exporter 作為健康狀態來源"
  return 0
}
|
||||
|
||||
#######################################
# ERR trap handler: record the unexpected failure, send a failure
# notification, then exit with the failing command's status.
# Globals:   DB_CONTAINER, DB_NAME, FILENAME, LOG_FILE (read)
# Arguments: $1 - line number of the failing command (default "unknown")
# Returns:   never returns; exits with the status that triggered the trap
#######################################
on_error() {
  # Must be the first statement so $? is still the failing command's status.
  local exit_code="$?"
  local line_no="${1:-unknown}"

  # From here on everything is best effort: stop aborting on errors and make
  # sure a failure inside this handler cannot re-enter it.
  trap - ERR
  set +e

  # Leave a trace in the log, as fail_backup does for expected failures.
  log "ERROR unexpected failure at line ${line_no} (exit ${exit_code})"

  # A notification that cannot be delivered is logged rather than dropped.
  notify_awoooi_ops \
    "failed" \
    "MOMO PostgreSQL 備份失敗" \
    "line=${line_no}; container=${DB_CONTAINER}; db=${DB_NAME}; file=${FILENAME}; log=${LOG_FILE}" \
    || log "WARN AwoooP notification failed"

  exit "$exit_code"
}
|
||||
|
||||
#######################################
# Handle an expected backup failure: log it, send a failure notification,
# then exit 1.
# Globals:   DB_CONTAINER, DB_NAME, FILENAME, LOG_FILE (read)
# Arguments: $1 - human-readable failure reason
# Returns:   never returns; always exits with status 1
#######################################
fail_backup() {
  local message="$1"
  log "ERROR ${message}"

  # Delivery stays best effort, but a failed delivery is now logged (as
  # notify_best_effort already does) instead of being silently dropped.
  # The trailing `|| true` keeps the exit status at 1 even if logging fails.
  notify_awoooi_ops \
    "failed" \
    "MOMO PostgreSQL 備份失敗" \
    "${message}; container=${DB_CONTAINER}; db=${DB_NAME}; file=${FILENAME}; log=${LOG_FILE}" \
    || log "WARN AwoooP notification failed" \
    || true

  exit 1
}
|
||||
|
||||
#######################################
# Check whether the database container is currently running.
# Globals: DB_CONTAINER (read)
# Returns: 0 when `docker inspect` reports State.Running as "true";
#          non-zero when it reports anything else or the inspect fails.
#######################################
container_running() {
  local running_state
  running_state="$(docker inspect -f '{{.State.Running}}' "${DB_CONTAINER}" 2>/dev/null)" \
    || return 1
  grep -qx true <<<"${running_state}"
}
|
||||
|
||||
#######################################
# Stream a plain-SQL dump of the database to stdout by running pg_dump
# inside the database container.
# Globals: DB_CONTAINER (read)
# Outputs: the dump on stdout
# Returns: exit status of `docker exec`
#######################################
run_pg_dump() {
  # This script runs inside the container. It takes the password, user and
  # database from the container's own POSTGRES_* environment, so no
  # credential value ever appears on the host side.
  local in_container_script='
    : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
    PGPASSWORD="${POSTGRES_PASSWORD}" exec pg_dump \
      -U "${POSTGRES_USER:-momo}" \
      -d "${POSTGRES_DB:-momo_analytics}" \
      --no-password \
      --no-owner \
      --no-acl
  '
  docker exec "${DB_CONTAINER}" sh -eu -c "${in_container_script}"
}
|
||||
|
||||
#######################################
# Record a successful backup as a row in the `backup_log` table by running
# psql inside the database container.
# Globals:   FILENAME, FILEPATH, DB_CONTAINER (read)
# Arguments:
#   $1 - backup size in bytes
#   $2 - backup duration in seconds
# Outputs:   none; stdout and stderr of the whole command are discarded
# Returns:   exit status of `docker exec`
#######################################
insert_backup_log() {
  local size_bytes="$1"
  local duration="$2"
  # The row values are handed to the container as BACKUP_* environment
  # variables (-e); the container shell expands them inside the
  # double-quoted -c argument. The '\'' sequences close and reopen the outer
  # single-quoted script to emit a literal single quote for the SQL literals.
  # Credentials come from the container's own POSTGRES_* environment.
  # NOTE(review): values are spliced into the SQL text as E'' literals, so a
  # quote or backslash in the path or hostname would alter the statement —
  # confirm these values can never contain such characters.
  docker exec \
    -e BACKUP_FILENAME="${FILENAME}" \
    -e BACKUP_SIZE_BYTES="${size_bytes}" \
    -e BACKUP_DURATION_SECONDS="${duration}" \
    -e BACKUP_HOST="$(hostname)" \
    -e BACKUP_STORAGE_PATH="${FILEPATH}" \
    "${DB_CONTAINER}" sh -eu -c '
      : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
      PGPASSWORD="${POSTGRES_PASSWORD}" psql \
        -U "${POSTGRES_USER:-momo}" \
        -d "${POSTGRES_DB:-momo_analytics}" \
        --no-password \
        -v ON_ERROR_STOP=1 \
        -c "INSERT INTO backup_log (filename, file_size_bytes, duration_seconds, status, host, storage_path, completed_at)
            VALUES (E'\''${BACKUP_FILENAME}'\'', ${BACKUP_SIZE_BYTES}, ${BACKUP_DURATION_SECONDS}, '\''success'\'', E'\''${BACKUP_HOST}'\'', E'\''${BACKUP_STORAGE_PATH}'\'', CURRENT_TIMESTAMP);"
    ' >/dev/null 2>&1
}
|
||||
|
||||
#######################################
# Run one backup: dump, validate, publish, record, prune, notify.
# Globals:
#   BACKUP_DIR, DB_CONTAINER, TMP_FILE, FILEPATH, FILENAME,
#   MIN_SIZE_BYTES, KEEP_DAYS (read)
# Returns:
#   0 on success. Failures exit through fail_backup (status 1) or through
#   the ERR trap (status of the failing command).
#######################################
main() {
  mkdir -p "${BACKUP_DIR}"
  # Always remove a partial dump; report any unhandled failure.
  trap cleanup_tmp EXIT
  trap 'on_error ${LINENO}' ERR

  log "===== momo PostgreSQL backup start ====="
  if ! container_running; then
    fail_backup "${DB_CONTAINER} is not running"
  fi

  local start_ts end_ts duration size_bytes size_human deleted backup_log_status
  start_ts="$(date +%s)"

  # The dump is created under umask 027 so it is never readable by other
  # local users, neither while it is being written nor between `mv` and
  # `chmod`. The subshell keeps the umask change away from the log file and
  # the rest of the script.
  if (umask 027 && run_pg_dump | gzip >"${TMP_FILE}"); then
    # GNU stat first, BSD stat as fallback; 0 forces the size check to fail.
    size_bytes="$(stat -c%s "${TMP_FILE}" 2>/dev/null || stat -f%z "${TMP_FILE}" 2>/dev/null || echo 0)"
    if [ "${size_bytes}" -lt "${MIN_SIZE_BYTES}" ]; then
      fail_backup "backup file too small: ${size_bytes} bytes"
    fi
    # Publish under the final name only after the dump passed validation.
    mv "${TMP_FILE}" "${FILEPATH}"
    chmod 0640 "${FILEPATH}"
  else
    fail_backup "pg_dump failed"
  fi

  end_ts="$(date +%s)"
  duration="$((end_ts - start_ts))"
  size_human="$(du -h "${FILEPATH}" | awk '{print $1}')"
  log "Backup success: ${FILENAME} (${size_human}, ${duration}s)"

  # A failed bookkeeping insert downgrades to a warning: the dump is valid.
  backup_log_status="success"
  if insert_backup_log "${size_bytes}" "${duration}"; then
    log "backup_log insert success"
  else
    backup_log_status="warning"
    log "WARN backup_log insert failed; backup file is still valid"
  fi

  # Prune dumps past the retention threshold and count how many were removed.
  deleted="$(find "${BACKUP_DIR}" -name 'momo_analytics_*.sql.gz' -mtime "+${KEEP_DAYS}" -print -delete | wc -l | tr -d ' ')"
  log "Deleted old backups: ${deleted}"
  log "===== momo PostgreSQL backup complete ====="

  notify_best_effort \
    "success" \
    "MOMO PostgreSQL 備份完成" \
    "file=${FILENAME}; size=${size_human}; duration=${duration}s; backup_log=${backup_log_status}; deleted_old=${deleted}"
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user