fix(backup): restore reboot freshness readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m1s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 21:09:29 +08:00
parent 130480207a
commit d4c513a022
5 changed files with 261 additions and 20 deletions

View File

@@ -15,37 +15,223 @@ source "$(dirname "$0")/common.sh"
# ---------------------------------------------------------------------------
# Static configuration for the AWOOOI high-frequency backup job.
# ---------------------------------------------------------------------------
SERVICE="awoooi-frequent"
AWOOOI_HOST="192.168.0.188"
AWOOOI_DB_USER="awoooi"
# No password literal lives in this file. The value comes from the environment
# (if provided) and is overwritten by load_database_config once the database
# URL has been resolved.
AWOOOI_DB_PASS="${AWOOOI_DB_PASS:-}"
AWOOOI_DB_HOST="localhost"
AWOOOI_DB_PORT="5432"
# NOTE(review): BACKUP_BASE is presumably exported by common.sh - confirm.
LOCAL_REPO="${BACKUP_BASE}/awoooi"
# NOTE(review): PID-based name under /tmp is predictable; consider mktemp -d.
DUMP_DIR="/tmp/awoooi-freq-backup-$$"

# Kubernetes nodes and secret coordinates used by resolve_database_url.
AWOOOI_K8S_HOST="${AWOOOI_K8S_HOST:-192.168.0.120}"
AWOOOI_K8S_HOSTS="${AWOOOI_K8S_HOSTS:-${AWOOOI_K8S_HOST} 192.168.0.121 192.168.0.125}"
AWOOOI_K8S_SECRET_NAME="${AWOOOI_K8S_SECRET_NAME:-awoooi-secrets}"
AWOOOI_K8S_NAMESPACE="${AWOOOI_K8S_NAMESPACE:-awoooi-prod}"
# Secret keys are tried in this order; the first one that decodes wins.
AWOOOI_K8S_DATABASE_URL_KEYS="${AWOOOI_K8S_DATABASE_URL_KEYS:-AWOOOI_BACKUP_DATABASE_URL BACKUP_DATABASE_URL DATABASE_URL}"

# Pending "re-enable FORCE ROW LEVEL SECURITY" work, consumed by
# restore_force_rls. Both are empty when nothing needs to be restored.
FORCE_RLS_RESTORE_SQL=""
FORCE_RLS_RESTORE_DB=""

# High-frequency backup retention policy.
KEEP_HOURLY=28 # keep 7 days of 6-hourly snapshots (7*4=28)
KEEP_DAILY=30
KEEP_WEEKLY=12
KEEP_MONTHLY=24
# 2026-05-19 ogt + Codex: retention policy is delegated to common.sh.
# The default is latest-only (keep-last=1) so that high-frequency DB snapshots
# do not pile up.
#######################################
# Print the AWOOOI database URL on stdout.
# Lookup order:
#   1. AWOOOI_DATABASE_URL from the environment
#   2. DATABASE_URL from the environment
#   3. the Kubernetes secret, asking each node in AWOOOI_K8S_HOSTS for each
#      key in AWOOOI_K8S_DATABASE_URL_KEYS (dedicated backup keys are listed
#      before the runtime DATABASE_URL key by default)
# 2026-07-01 ogt + Codex: the secret value is only ever written to stdout for
# the caller; it is never passed to a log function.
# Globals:  AWOOOI_DATABASE_URL, DATABASE_URL, AWOOOI_K8S_HOSTS,
#           AWOOOI_K8S_DATABASE_URL_KEYS, AWOOOI_K8S_SECRET_NAME,
#           AWOOOI_K8S_NAMESPACE (all read)
# Returns:  0 when a URL was printed, 1 when every source came up empty.
#######################################
resolve_database_url() {
  local override
  for override in "${AWOOOI_DATABASE_URL:-}" "${DATABASE_URL:-}"; do
    if [[ -n "${override}" ]]; then
      printf '%s\n' "${override}"
      return 0
    fi
  done

  local node secret_key get_secret payload_b64 url
  # Both lists are whitespace-separated on purpose, hence the unquoted use.
  for node in ${AWOOOI_K8S_HOSTS}; do
    for secret_key in ${AWOOOI_K8S_DATABASE_URL_KEYS}; do
      get_secret="kubectl get secret ${AWOOOI_K8S_SECRET_NAME} -n ${AWOOOI_K8S_NAMESPACE} -o jsonpath='{.data.${secret_key}}'"
      # Try passwordless sudo first, then plain kubectl. Any ssh/kubectl
      # failure simply yields an empty payload and the next candidate is tried.
      payload_b64="$(
        ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout=8 \
          "wooo@${node}" \
          "sudo -n ${get_secret} 2>/dev/null || ${get_secret}" \
          2>/dev/null || true
      )"
      url="$(printf '%s' "${payload_b64}" | base64 -d 2>/dev/null || true)"
      if [[ -n "${url}" ]]; then
        printf '%s\n' "${url}"
        return 0
      fi
    done
  done
  return 1
}
#######################################
# Resolve the database URL and load its parts into the AWOOOI_DB_* globals.
# The URL is handed to python3 on file descriptor 3 (here-string), so it never
# appears in the process argument list; the parser script itself arrives on
# stdin. The parser prints KEY=value lines whose values are shlex-quoted,
# which is what makes the final eval safe.
# Globals:  AWOOOI_DB_USER, AWOOOI_DB_PASS, AWOOOI_DB_HOST, AWOOOI_DB_PORT
#           (written)
# Returns:  0 when all four globals were set; 1 when no URL could be resolved
#           or the URL could not be parsed (e.g. a non-numeric port). Nothing
#           is evaluated in the failure case.
#######################################
load_database_config() {
  local database_url assignments
  database_url="$(resolve_database_url || true)"
  if [[ -z "${database_url}" ]]; then
    log_error "無法解析 AWOOOI DATABASE_URL拒絕使用舊硬編密碼"
    return 1
  fi

  # Capture the parser output first: evaluating "$(...)" directly would hide a
  # parser failure behind a successful 'eval ""'.
  assignments="$(
    python3 - 3<<< "${database_url}" <<'PY'
import shlex
from urllib.parse import unquote, urlparse

with open(3) as source:
    url = source.read().strip()
parsed = urlparse(url)
values = {
    "AWOOOI_DB_USER": unquote(parsed.username or "awoooi"),
    "AWOOOI_DB_PASS": unquote(parsed.password or ""),
    "AWOOOI_DB_HOST": parsed.hostname or "localhost",
    "AWOOOI_DB_PORT": str(parsed.port or 5432),
}
for key, value in values.items():
    print(f"{key}={shlex.quote(value)}")
PY
  )" || {
    log_error "AWOOOI DATABASE_URL 格式無效,無法載入資料庫連線設定"
    return 1
  }
  if [[ -z "${assignments}" ]]; then
    log_error "AWOOOI DATABASE_URL 格式無效,無法載入資料庫連線設定"
    return 1
  fi

  # Safe: every value was produced by shlex.quote and every key is fixed.
  eval "${assignments}"
}
#######################################
# Print $1 quoted with bash's printf %q so it can be embedded as a single
# word in a command line that another shell will parse.
# Arguments: $1 - raw value
# Outputs:   the quoted value on stdout, without a trailing newline
#######################################
quote_remote() {
  local quoted
  printf -v quoted '%q' "$1"
  printf '%s' "${quoted}"
}
#######################################
# Escape one field for a pgpass-style line: every backslash and every colon
# gets a backslash in front of it; all other characters pass through.
# Arguments: $1 - raw field value
# Outputs:   the escaped field on stdout, without a trailing newline
#######################################
pgpass_escape() {
  local raw="$1"
  local escaped=""
  local ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch="${raw:i:1}"
    case "${ch}" in
      '\' | ':') escaped+="\\${ch}" ;;
      *) escaped+="${ch}" ;;
    esac
  done
  printf '%s' "${escaped}"
}
#######################################
# Print one colon-separated credentials line in the order
# host:port:database:user:password, each field passed through pgpass_escape.
# Globals:   AWOOOI_DB_HOST, AWOOOI_DB_PORT, AWOOOI_DB_USER, AWOOOI_DB_PASS
#            (read)
# Arguments: $1 - database name
# Outputs:   the line on stdout, newline-terminated
#######################################
pgpass_line() {
  local database="$1"
  local field
  local -a fields=()
  for field in \
    "${AWOOOI_DB_HOST}" \
    "${AWOOOI_DB_PORT}" \
    "${database}" \
    "${AWOOOI_DB_USER}" \
    "${AWOOOI_DB_PASS}"; do
    fields+=("$(pgpass_escape "${field}")")
  done
  # Join the escaped fields with ':' via the first character of IFS.
  local IFS=':'
  printf '%s\n' "${fields[*]}"
}
#######################################
# Print the psql invocation prefix used on the database host: no password
# prompt, connection parameters from the AWOOOI_DB_* globals, and
# ON_ERROR_STOP=1. Every interpolated value goes through quote_remote.
# Globals:   AWOOOI_DB_USER, AWOOOI_DB_HOST, AWOOOI_DB_PORT (read)
# Arguments: $1 - database name
# Outputs:   the command prefix on stdout, without a trailing newline
#######################################
remote_psql_command() {
  local database="$1"
  local user_q host_q port_q db_q
  user_q="$(quote_remote "${AWOOOI_DB_USER}")"
  host_q="$(quote_remote "${AWOOOI_DB_HOST}")"
  port_q="$(quote_remote "${AWOOOI_DB_PORT}")"
  db_q="$(quote_remote "${database}")"
  printf 'psql --no-password -U %s -h %s -p %s -d %s -v ON_ERROR_STOP=1' \
    "${user_q}" "${host_q}" "${port_q}" "${db_q}"
}
remote_pgpass_wrapper() {
local command="$1"
printf 'umask 077; pgpass=$(mktemp "${TMPDIR:-/tmp}/awoooi-pgpass.XXXXXX") || exit 1; cleanup() { rm -f "$pgpass"; }; trap cleanup EXIT HUP INT TERM; cat > "$pgpass"; PGOPTIONS="-c statement_timeout=0 -c max_parallel_workers_per_gather=0" PGPASSFILE="$pgpass" %s' "${command}"
}
#######################################
# Run a command on the database host over ssh with credentials supplied
# through a temporary password file. The credentials line travels on ssh's
# stdin; the command line carries only the wrapper script.
# Globals:   AWOOOI_HOST (read)
# Arguments: $1 - database name (used for the credentials line)
#            $2 - command text to run inside the wrapper
# Outputs:   whatever the remote command writes
# Returns:   the exit status of the pipeline
#######################################
run_remote_pgpass_command() {
  local database="$1"
  local command="$2"
  local remote_script
  remote_script="$(remote_pgpass_wrapper "${command}")"
  pgpass_line "${database}" \
    | ssh "ollama@${AWOOOI_HOST}" "${remote_script}"
}
#######################################
# Print the short id of the most recent snapshot in LOCAL_REPO.
# The JSON from "restic snapshots --latest 1" is fed to a small Python
# program that picks the entry with the greatest "time" value. "unknown" is
# printed when the list is empty or has no short_id, and also when the
# pipeline fails (for example because the JSON cannot be parsed).
# Globals:   LOCAL_REPO, RESTIC_PASSWORD_FILE (read)
# Outputs:   the snapshot short id, or "unknown", on stdout
#######################################
latest_restic_snapshot_id() {
  local picker='import json,sys; rows=json.load(sys.stdin); row=max(rows,key=lambda r: r.get("time","")) if rows else {}; print(row.get("short_id","unknown"))'
  if ! restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
    | python3 -c "${picker}" 2>/dev/null; then
    printf '%s\n' "unknown"
  fi
}
#######################################
# Print one "ALTER TABLE <schema>.<table> <mode> ROW LEVEL SECURITY;" line
# for each table or partitioned table (relkind 'r' or 'p') that has
# relforcerowsecurity set and is owned by the connecting role.
# The statement text is built by the server with format('%I.%I'); psql runs
# with -At so only the bare statements are returned.
# Arguments: $1 - database name
#            $2 - text placed between the table name and "ROW LEVEL SECURITY"
#                 (the visible caller passes "NO FORCE")
# Outputs:   the generated statements on stdout, sorted
#######################################
collect_force_rls_sql() {
  local database="$1"
  local mode="$2"
  local psql_prefix catalog_sql
  # The SQL text is kept flush-left so the exact bytes sent to the server do
  # not depend on how this function is indented.
  catalog_sql="
select format('ALTER TABLE %I.%I ${mode} ROW LEVEL SECURITY;', n.nspname, c.relname)
from pg_class c
join pg_namespace n on n.oid = c.relnamespace
where c.relkind in ('r', 'p')
and c.relforcerowsecurity
and pg_get_userbyid(c.relowner) = current_user
order by 1;
"
  psql_prefix="$(remote_psql_command "${database}")"
  run_remote_pgpass_command "${database}" \
    "${psql_prefix} -At -c $(quote_remote "${catalog_sql}")"
}
#######################################
# Execute a SQL string against a database on the database host via psql -c,
# discarding psql's stdout. An empty SQL string is a successful no-op.
# Arguments: $1 - database name
#            $2 - SQL text (may be empty)
# Returns:   0 for empty SQL, otherwise the status of the remote command
#######################################
apply_remote_sql() {
  local database="$1"
  local sql="$2"
  if [[ -z "${sql}" ]]; then
    return 0
  fi
  local remote_cmd
  remote_cmd="$(remote_psql_command "${database}") -c $(quote_remote "${sql}") >/dev/null"
  run_remote_pgpass_command "${database}" "${remote_cmd}"
}
#######################################
# Re-apply the pending FORCE ROW LEVEL SECURITY statements, if any.
# Work is pending only while both FORCE_RLS_RESTORE_DB and
# FORCE_RLS_RESTORE_SQL are non-empty. On success both are cleared; on
# failure they are left untouched, so a later call can try again.
# Globals:  FORCE_RLS_RESTORE_DB, FORCE_RLS_RESTORE_SQL (read, cleared)
# Returns:  0 when nothing was pending or the statements were applied,
#           1 when applying them failed.
#######################################
restore_force_rls() {
  if [[ -z "${FORCE_RLS_RESTORE_DB}" || -z "${FORCE_RLS_RESTORE_SQL}" ]]; then
    return 0
  fi
  if ! apply_remote_sql "${FORCE_RLS_RESTORE_DB}" "${FORCE_RLS_RESTORE_SQL}"; then
    log_error "FORCE ROW LEVEL SECURITY 恢復失敗 (${FORCE_RLS_RESTORE_DB})"
    return 1
  fi
  log_info "FORCE ROW LEVEL SECURITY 已恢復 (${FORCE_RLS_RESTORE_DB})"
  FORCE_RLS_RESTORE_DB=""
  FORCE_RLS_RESTORE_SQL=""
}
# Safety net: whenever the script exits, attempt any restore still pending.
trap restore_force_rls EXIT
#######################################
# Dump one database with pg_dump on the database host, temporarily switching
# every FORCE ROW LEVEL SECURITY table owned by the connecting role to
# NO FORCE for the duration of the dump.
#
# Steps:
#   1. collect the "NO FORCE" statements and derive the matching "FORCE"
#      statements from them
#   2. record the FORCE statements in FORCE_RLS_RESTORE_* before changing
#      anything, so restore_force_rls (also installed as an EXIT trap) can
#      undo the change
#   3. apply NO FORCE, run pg_dump, then restore FORCE
#
# Every step's status is checked explicitly rather than relying on errexit,
# because errexit is suppressed when this function is called from an `if`
# condition. In particular the dump is only reported as successful when FORCE
# RLS was restored as well. The caller's errexit setting is left untouched.
#
# Globals:   AWOOOI_DB_USER, AWOOOI_DB_HOST, AWOOOI_DB_PORT (read)
#            FORCE_RLS_RESTORE_DB, FORCE_RLS_RESTORE_SQL (written)
# Arguments: $1 - database name
#            $2 - output file for the dump; "<output>.stderr" is used as
#                 scratch space for pg_dump's stderr
# Outputs:   on dump failure, the last 40 stderr lines with password=...
#            values redacted
# Returns:   0 on success; pg_dump's status when the dump failed; non-zero
#            when collecting, relaxing or restoring FORCE RLS failed.
#######################################
dump_database_with_rls_guard() {
  local database="$1"
  local output_file="$2"
  local stderr_file="${output_file}.stderr"
  local noforce_sql force_sql table_count dump_cmd
  local dump_rc=0
  local restore_rc=0

  if ! noforce_sql="$(collect_force_rls_sql "${database}" "NO FORCE")"; then
    log_error "無法查詢 FORCE RLS 資料表 (${database})"
    return 1
  fi

  if [[ -n "${noforce_sql}" ]]; then
    # Anchor on the fixed statement tail so an identifier that happens to
    # contain "NO FORCE" is never rewritten.
    force_sql="$(printf '%s\n' "${noforce_sql}" \
      | sed 's/ NO FORCE ROW LEVEL SECURITY;$/ FORCE ROW LEVEL SECURITY;/')"
    FORCE_RLS_RESTORE_DB="${database}"
    FORCE_RLS_RESTORE_SQL="${force_sql}"
    table_count="$(printf '%s\n' "${noforce_sql}" | awk 'NF {count++} END {print count+0}')"
    log_info "暫時解除 FORCE RLS 以完成完整 pg_dump (${database}, tables=${table_count})"
    if ! apply_remote_sql "${database}" "${noforce_sql}"; then
      log_error "暫時解除 FORCE RLS 失敗 (${database})"
      # Best effort: put FORCE back in case part of the change went through.
      # If this fails too, the pending state stays set for the EXIT trap.
      restore_force_rls || true
      return 1
    fi
  fi

  dump_cmd="$(printf 'pg_dump --no-password -U %s -h %s -p %s %s' \
    "$(quote_remote "${AWOOOI_DB_USER}")" \
    "$(quote_remote "${AWOOOI_DB_HOST}")" \
    "$(quote_remote "${AWOOOI_DB_PORT}")" \
    "$(quote_remote "${database}")")"
  run_remote_pgpass_command "${database}" "${dump_cmd}" \
    > "${output_file}" 2> "${stderr_file}" || dump_rc=$?

  # Always try to restore, whatever happened to the dump.
  restore_force_rls || restore_rc=$?

  if (( dump_rc != 0 )); then
    log_error "${database} dump 失敗pg_dump stderr 尾端如下(已避免輸出 credential"
    tail -40 "${stderr_file}" | sed -E 's/(password=)[^ ]+/\1REDACTED/g' || true
    return "${dump_rc}"
  fi
  rm -f -- "${stderr_file}"

  if (( restore_rc != 0 )); then
    # restore_force_rls has already logged the failure.
    return "${restore_rc}"
  fi
  return 0
}
main() {
local start_time=$(date +%s)
log_info "========== AWOOOI 高頻備份 ($(date '+%H:%M')) =========="
mkdir -p "${DUMP_DIR}"
load_database_config || {
notify_clawbot "failed" "${SERVICE}" "AWOOOI 高頻備份失敗DATABASE_URL 不可用"
rm -rf "${DUMP_DIR}"
exit 1
}
local timestamp=$(date "+%Y%m%d_%H%M%S")
# 只備份 awoooi_prod高頻核心
if ssh ollama@${AWOOOI_HOST} "PGPASSWORD='${AWOOOI_DB_PASS}' pg_dump \
-U ${AWOOOI_DB_USER} -h ${AWOOOI_DB_HOST} -p ${AWOOOI_DB_PORT} \
awoooi_prod" > "${DUMP_DIR}/awoooi_prod_${timestamp}.sql" 2>&1; then
if dump_database_with_rls_guard "awoooi_prod" "${DUMP_DIR}/awoooi_prod_${timestamp}.sql"; then
local size=$(du -h "${DUMP_DIR}/awoooi_prod_${timestamp}.sql" | cut -f1)
log_success "awoooi_prod dump 完成 (${size})"
else
local status=$?
log_error "awoooi_prod dump 失敗"
notify_clawbot "failed" "${SERVICE}" "AWOOOI 高頻備份失敗"
rm -rf "${DUMP_DIR}"
exit 1
exit "${status}"
fi
# Restic 備份(同一倉庫,頻率不同)
@@ -54,18 +240,11 @@ main() {
--tag "service:awoooi" --tag "freq:6h" \
--tag "timestamp:${timestamp}" 2>&1
local snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
--password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null | \
grep -oP '"short_id":"\K[^"]+' | head -1)
local snapshot_id
snapshot_id="$(latest_restic_snapshot_id)"
log_success "快照: ${snapshot_id}"
# GFS 清理(加入 hourly 保留)
restic -r "${LOCAL_REPO}" forget --prune \
--password-file "${RESTIC_PASSWORD_FILE}" \
--keep-hourly ${KEEP_HOURLY} \
--keep-daily ${KEEP_DAILY} \
--keep-weekly ${KEEP_WEEKLY} \
--keep-monthly ${KEEP_MONTHLY} 2>&1
cleanup_old_backups "${LOCAL_REPO}"
rm -rf "${DUMP_DIR}"

View File

@@ -643,6 +643,9 @@ def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
if not offsite_configured:
next_step = "configure_google_drive_rclone_on_110_tty"
phase = 1
elif escrow_missing_count > 0 and full_fresh:
next_step = "complete_credential_escrow_review"
phase = 3
elif not any_partial_fresh:
next_step = "run_small_dry_run_then_partial_sync"
phase = 2

View File

@@ -84,3 +84,33 @@ def test_gitea_bundle_metrics_fail_when_checksum_missing(tmp_path: Path, monkeyp
assert all_ok == 0
assert 'awoooi_gitea_bundle_checksum_missing_count{host="188"' in rendered
assert rendered.rstrip().endswith(" 0")
def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_stale(
    tmp_path: Path, monkeypatch
) -> None:
    """A fresh full offsite sync keeps the DR phase at 3 despite a stale partial sync.

    Setup: rclone is reported as configured and B2 as not configured; the full
    rclone success marker is one hour old and the partial marker is 72 hours
    old; the escrow evidence directory exists but is empty.

    Expected: the full sync is reported fresh, the partial sync stale, five
    escrow items missing, and the phase metric points at
    ``complete_credential_escrow_review`` with value 3.
    """
    exporter = load_exporter()
    frozen_now = 1_782_900_000
    seconds_per_hour = 3600

    status_dir = tmp_path / "offsite"
    evidence_dir = tmp_path / "escrow"
    for directory in (status_dir, evidence_dir):
        directory.mkdir()

    # Point the exporter at the temporary directories and freeze its clock.
    monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", status_dir)
    monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", evidence_dir)
    monkeypatch.setattr(exporter.time, "time", lambda: frozen_now)
    monkeypatch.setattr(exporter, "_b2_configured", lambda: False)
    monkeypatch.setattr(exporter, "_rclone_configured", lambda: True)

    # Marker file name -> age of the recorded success, in hours.
    marker_age_hours = {
        "rclone-last-success": 1,
        "rclone-partial-last-success": 72,
    }
    for marker_name, age_hours in marker_age_hours.items():
        recorded_at = frozen_now - age_hours * seconds_per_hour
        (status_dir / marker_name).write_text(str(recorded_at), encoding="utf-8")

    rendered = "\n".join(exporter._offsite_and_escrow_metric_lines("110"))

    expected_lines = (
        'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1',
        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0',
        'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5',
        'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3',
    )
    for expected in expected_lines:
        assert expected in rendered