fix(ops): harden 188 backup exporter recovery [skip ci]
This commit is contained in:
78
scripts/ops/188-db-exporters-restore.sh
Executable file
78
scripts/ops/188-db-exporters-restore.sh
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env bash
|
||||
# Restore PostgreSQL / Redis exporters on 192.168.0.188 without host networking.
|
||||
#
|
||||
# Required on the host:
|
||||
# /home/ollama/monitoring/.env.exporters
|
||||
#
|
||||
# Required variables:
|
||||
# POSTGRES_EXPORTER_DATA_SOURCE_NAME=postgresql://...@192.168.0.188:5432/...?...sslmode=disable
|
||||
#
|
||||
# Optional variables:
|
||||
# REDIS_EXPORTER_ADDR=192.168.0.188:6380
|
||||
# REDIS_PASSWORD=
|
||||
|
||||
# Strict mode: stop on command failure, on unset variables, and on a failing
# stage anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Locations and image tags. Each one may be overridden from the caller's
# environment; the values after ":-" are the defaults.
ENV_FILE="${EXPORTER_ENV_FILE:-/home/ollama/monitoring/.env.exporters}"
QUERIES_FILE="${POSTGRES_EXPORTER_QUERIES_FILE:-/home/ollama/monitoring/postgres-exporter-queries.yaml}"
POSTGRES_IMAGE="${POSTGRES_EXPORTER_IMAGE:-prometheuscommunity/postgres-exporter:v0.15.0}"
REDIS_IMAGE="${REDIS_EXPORTER_IMAGE:-oliver006/redis_exporter:v1.58.0}"

# Print a marker line on stderr and terminate with status 2, the exit code this
# script uses for unmet preconditions.
abort_precondition() {
  echo "$*" >&2
  exit 2
}

[[ -f "$ENV_FILE" ]] || abort_precondition "EXPORTER_ENV_FILE_MISSING $ENV_FILE"

# Source the env file with allexport switched on so that every variable it
# assigns is exported to the commands started later in this script.
set -o allexport
# shellcheck disable=SC1090
source "$ENV_FILE"
set +o allexport

[[ -n "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:-}" ]] ||
  abort_precondition "POSTGRES_EXPORTER_DATA_SOURCE_NAME_MISSING"

[[ -f "$QUERIES_FILE" ]] ||
  abort_precondition "POSTGRES_EXPORTER_QUERIES_FILE_MISSING $QUERIES_FILE"

# Optional settings: applied only after the env file has had its say, so a
# value from the file (or the caller's environment) wins over the default.
: "${REDIS_EXPORTER_ADDR:=192.168.0.188:6380}"
: "${REDIS_PASSWORD:=}"
|
||||
|
||||
# Remove any previous exporter containers so their names and published ports
# are free. Deliberately best-effort: on a fresh host there is nothing to
# remove, so output and errors are discarded.
docker rm -f postgres-exporter redis-exporter >/dev/null 2>&1 || true

# The DSN embeds the database password. It is placed in the docker CLI's own
# environment and forwarded with a bare `-e DATA_SOURCE_NAME`, so the secret is
# not part of the CLI's argument list (which other local users can read via
# `ps`). The container receives exactly the same variable as before.
DATA_SOURCE_NAME="$POSTGRES_EXPORTER_DATA_SOURCE_NAME" docker run -d \
  --name postgres-exporter \
  --restart unless-stopped \
  -p 9187:9187 \
  -e DATA_SOURCE_NAME \
  -e PG_EXPORTER_EXTEND_QUERY_PATH=/etc/postgres_exporter/queries.yaml \
  -e PG_EXPORTER_LOG_LEVEL=info \
  -v "$QUERIES_FILE:/etc/postgres_exporter/queries.yaml:ro" \
  "$POSTGRES_IMAGE" >/dev/null

# The Redis command line is assembled as an array because the password option
# is only added when a password is configured.
redis_args=(
  docker run -d
  --name redis-exporter
  --restart unless-stopped
  -p 9121:9121
  -e "REDIS_ADDR=$REDIS_EXPORTER_ADDR"
  -e "REDIS_EXPORTER_CHECK_KEYS=awoooi:*"
  -e REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
)
if [[ -n "$REDIS_PASSWORD" ]]; then
  # Bare name: the value is taken from the CLI's environment (set on the
  # invocation below) instead of being spelled out in the argument list.
  redis_args+=(-e REDIS_PASSWORD)
fi
redis_args+=("$REDIS_IMAGE")
REDIS_PASSWORD="$REDIS_PASSWORD" "${redis_args[@]}" >/dev/null
|
||||
|
||||
# How often, and how far apart, an exporter is scraped while it starts up.
# Both may be overridden from the environment.
EXPORTER_READY_ATTEMPTS="${EXPORTER_READY_ATTEMPTS:-15}"
EXPORTER_READY_DELAY="${EXPORTER_READY_DELAY:-2}"

#######################################
# Print the value of one un-labelled metric from a local exporter.
# Arguments: $1 - TCP port on 127.0.0.1
#            $2 - metric name (first field of the sample line)
# Outputs:   the metric value on stdout; nothing when the scrape fails or the
#            metric is absent
# Returns:   always 0, so callers running under `set -e` can apply their own
#            fallback instead of being aborted
#######################################
scrape_metric() {
  local port=$1 metric=$2 body
  # Buffer the whole response before parsing. Piping curl straight into an awk
  # that exits on the first match can make curl fail with a write error, which
  # `pipefail` would turn into a script abort even though the metric was found.
  body="$(curl -fsS --max-time 5 "http://127.0.0.1:${port}/metrics" 2>/dev/null)" || return 0
  awk -v name="$metric" '$1 == name { print $2; exit }' <<<"$body"
}

#######################################
# Scrape a metric repeatedly until it reads 1 or the attempts run out.
# The containers were started detached just before this point, so the first
# scrape can easily arrive before the exporter is listening.
# Globals:   EXPORTER_READY_ATTEMPTS, EXPORTER_READY_DELAY (read)
# Arguments: $1 - TCP port on 127.0.0.1
#            $2 - metric name
# Outputs:   the last value seen (possibly empty) on stdout
#######################################
wait_for_up() {
  local port=$1 metric=$2 value="" attempt
  for ((attempt = 1; attempt <= EXPORTER_READY_ATTEMPTS; attempt++)); do
    value="$(scrape_metric "$port" "$metric")"
    if [[ "$value" == "1" ]]; then
      break
    fi
    if ((attempt < EXPORTER_READY_ATTEMPTS)); then
      sleep "$EXPORTER_READY_DELAY"
    fi
  done
  printf '%s' "$value"
}

pg_up="$(wait_for_up 9187 pg_up)"
redis_up="$(wait_for_up 9121 redis_up)"

# Always report both results, including the "missing" case that a failed scrape
# produces, before deciding the exit status.
echo "POSTGRES_EXPORTER_UP ${pg_up:-missing}"
echo "REDIS_EXPORTER_UP ${redis_up:-missing}"

# On failure, show the tail of the container log to make the cause visible.
if [[ "$pg_up" != "1" ]]; then
  docker logs --tail=40 postgres-exporter >&2 || true
fi
if [[ "$redis_up" != "1" ]]; then
  docker logs --tail=40 redis-exporter >&2 || true
fi

# Final gate: the script succeeds only when both exporters report up == 1.
[[ "$pg_up" == "1" && "$redis_up" == "1" ]]
|
||||
76
scripts/ops/188-minio-velero-restore.sh
Executable file
76
scripts/ops/188-minio-velero-restore.sh
Executable file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env bash
|
||||
# Restore the 188 MinIO endpoint used by Velero, then optionally create a
|
||||
# one-off Velero backup to re-establish backup freshness evidence.
|
||||
|
||||
# Strict mode: stop on command failure, on unset variables, and on a failing
# stage anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# SSH targets (user@host) for the three machines this script talks to.
: "${MINIO_HOST:=ollama@192.168.0.188}"
: "${K3S_HOST:=wooo@192.168.0.120}"
: "${BACKUP_HEALTH_HOST:=wooo@192.168.0.110}"

# Compose files on the MinIO host: the base file and the override written by
# this script.
: "${MINIO_COMPOSE_FILE:=/home/ollama/minio/docker-compose.yml}"
: "${MINIO_OVERRIDE_FILE:=/home/ollama/minio/docker-compose.override.yml}"

# Namespace Velero runs in, and the namespace a one-off backup should cover.
: "${VELERO_NAMESPACE:=velero}"
: "${VELERO_TARGET_NAMESPACE:=awoooi-prod}"

# Opt-in switches; a step runs only when its switch is exactly "true".
: "${CREATE_VELERO_BACKUP:=false}"
: "${REFRESH_BACKUP_HEALTH:=false}"

# Backup name: taken from VELERO_BACKUP_NAME when provided, otherwise derived
# from the current UTC time at minute resolution.
if [[ -n "${VELERO_BACKUP_NAME:-}" ]]; then
  BACKUP_NAME="$VELERO_BACKUP_NAME"
else
  BACKUP_NAME="reboot-recovery-$(date -u +%Y%m%d%H%M)"
fi
|
||||
|
||||
# Fail early (via set -e) when the base compose file is absent on the MinIO host.
ssh "$MINIO_HOST" "test -f '$MINIO_COMPOSE_FILE'"

# (Re)write the compose override that sets `userns_mode: host` for the minio
# service. The YAML is streamed over ssh's stdin from a quoted here-document,
# so its nesting is preserved byte-for-byte and nothing in it is expanded or
# re-quoted by either shell. The indentation is significant: without it
# `minio` and `userns_mode` would become top-level keys.
ssh "$MINIO_HOST" "cat > '$MINIO_OVERRIDE_FILE'" <<'EOF'
services:
  minio:
    userns_mode: host
EOF

# Bring the service up with the base file plus the override.
ssh "$MINIO_HOST" "docker compose -f '$MINIO_COMPOSE_FILE' -f '$MINIO_OVERRIDE_FILE' up -d"

# Poll the liveness endpoint on the MinIO host: up to 30 probes, 2 s apart.
# If none succeeds, print the last 80 lines of the minio container log to
# stderr and fail, which aborts this script under set -e.
ssh "$MINIO_HOST" "for i in \$(seq 1 30); do curl -fsS --max-time 3 http://127.0.0.1:9000/minio/health/live >/dev/null && exit 0; sleep 2; done; docker logs --tail=80 minio >&2; exit 1"
echo "MINIO_188_HEALTHY endpoint=192.168.0.188:9000"
|
||||
|
||||
# Wait for Velero's default BackupStorageLocation to report "Available".
# Velero re-validates storage locations periodically rather than instantly, so
# right after MinIO comes back the recorded phase can still be stale; a single
# check at this point would fail spuriously. Poll instead: by default 30 probes
# 5 s apart (override with VELERO_BSL_WAIT_ATTEMPTS / VELERO_BSL_WAIT_DELAY).
bsl_attempts="${VELERO_BSL_WAIT_ATTEMPTS:-30}"
bsl_delay="${VELERO_BSL_WAIT_DELAY:-5}"
bsl_phase=""
for ((bsl_try = 1; bsl_try <= bsl_attempts; bsl_try++)); do
  # A failed ssh/kubectl call is treated as "phase unknown" and retried.
  bsl_phase="$(ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backupstoragelocations.velero.io default -o jsonpath='{.status.phase}'" || true)"
  if [[ "$bsl_phase" == "Available" ]]; then
    break
  fi
  if ((bsl_try < bsl_attempts)); then
    sleep "$bsl_delay"
  fi
done

if [[ "$bsl_phase" != "Available" ]]; then
  # Explicit marker on stderr instead of a silent non-zero exit.
  echo "VELERO_BACKUP_STORAGE_LOCATION_UNAVAILABLE namespace=$VELERO_NAMESPACE phase=${bsl_phase:-unknown}" >&2
  exit 1
fi
echo "VELERO_BACKUP_STORAGE_LOCATION_AVAILABLE namespace=$VELERO_NAMESPACE"
|
||||
|
||||
# Optional step: create a one-off Velero Backup of the target namespace and
# wait for it to finish. Runs only when CREATE_VELERO_BACKUP is exactly "true".
if [[ "$CREATE_VELERO_BACKUP" == "true" ]]; then
  # Print the current .status.phase of the backup (empty when the query fails
  # or the phase has not been set yet). Never fails, so set -e is not triggered
  # by a transient ssh/kubectl error.
  velero_backup_phase() {
    ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o jsonpath='{.status.phase}'" || true
  }

  # The manifest is streamed to kubectl over ssh's stdin. Variables are
  # expanded locally; the YAML nesting is significant (the label must sit
  # under `labels:`, the namespace entry under `includedNamespaces:`).
  ssh "$K3S_HOST" "sudo -n k3s kubectl apply -f -" <<EOF
apiVersion: velero.io/v1
kind: Backup
metadata:
  name: $BACKUP_NAME
  namespace: $VELERO_NAMESPACE
  labels:
    awoooi.wooo.work/source: reboot-recovery
spec:
  includedNamespaces:
    - $VELERO_TARGET_NAMESPACE
  storageLocation: default
  ttl: 720h0m0s
EOF

  # Poll the backup phase: up to 60 probes, 5 s apart.
  phase=""
  for ((attempt = 1; attempt <= 60; attempt++)); do
    phase="$(velero_backup_phase)"
    case "$phase" in
      Completed)
        echo "VELERO_BACKUP_COMPLETED name=$BACKUP_NAME"
        break
        ;;
      Failed | PartiallyFailed | FailedValidation)
        # Terminal failure phases: dump the object for diagnosis and stop now
        # rather than waiting out the full timeout.
        ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o yaml" >&2 || true
        echo "VELERO_BACKUP_FAILED name=$BACKUP_NAME phase=$phase" >&2
        exit 1
        ;;
    esac
    sleep 5
  done

  # Loop ran out without success: take one last reading after the final sleep
  # before declaring a timeout.
  if [[ "$phase" != "Completed" ]]; then
    phase="$(velero_backup_phase)"
  fi
  if [[ "$phase" != "Completed" ]]; then
    echo "VELERO_BACKUP_TIMEOUT name=$BACKUP_NAME phase=${phase:-unknown}" >&2
    exit 1
  fi
fi
|
||||
|
||||
# Optional step: re-run the backup-health textfile exporter on the health host.
# Runs only when REFRESH_BACKUP_HEALTH is exactly "true".
if [[ "$REFRESH_BACKUP_HEALTH" == "true" ]]; then
  # Environment assignments placed in front of the remote command. ssh joins
  # its trailing arguments with single spaces, so the remote shell sees them as
  # one command line: VAR=... VAR=... VAR=... /path/to/exporter.
  health_env=(
    "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
    "AIOPS_HOST_LABEL=110"
    "NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles"
  )
  health_script="/home/wooo/scripts/backup-health-textfile-exporter.py"

  ssh "$BACKUP_HEALTH_HOST" "${health_env[@]}" "$health_script"
  echo "BACKUP_HEALTH_TEXTFILE_REFRESHED host=110"
fi
|
||||
Reference in New Issue
Block a user