diff --git a/apps/api/src/main.py b/apps/api/src/main.py index dd1166bf..8bcc64e4 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -229,6 +229,50 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: ) logger.info("telegram_heartbeat_monitor_started") + # Reboot Recovery: Warm-up Redis Working Memory from PostgreSQL + # 2026-04-05 ogt: 重開機後 Redis 清空,從 DB restore 未解決的 incidents + # 統帥批准: 數據必須長久記錄,重開機後自動恢復 Working Memory + try: + from src.services.incident_service import get_incident_service + from src.db.base import get_db_context + from src.db.models import IncidentRecord + from sqlalchemy import select + + incident_service = get_incident_service() + async with get_db_context() as db: + result = await db.execute( + select(IncidentRecord).where( + IncidentRecord.status.in_(["investigating", "mitigating"]) + ) + ) + records = result.scalars().all() + + restored = 0 + for record in records: + from src.models.incident import Incident + incident = Incident( + incident_id=record.incident_id, + status=record.status, + severity=record.severity, + signals=record.signals or [], + affected_services=record.affected_services or [], + decision_chain=record.decision_chain, + proposal_ids=record.proposal_ids or [], + outcome=record.outcome, + created_at=record.created_at, + updated_at=record.updated_at, + resolved_at=record.resolved_at, + closed_at=record.closed_at, + ttl_days=record.ttl_days, + vectorized=record.vectorized, + ) + if await incident_service.save_to_working_memory(incident): + restored += 1 + + logger.info("working_memory_warmed_up", restored=restored, total=len(records)) + except Exception as e: + logger.warning("working_memory_warmup_failed", error=str(e)) + # Phase 6.1: 啟動 Signal Worker (Redis Streams Consumer) # 統帥鐵律: Event Bus 解耦告警接收與處理 await init_signal_worker() diff --git a/docs/runbooks/REBOOT-RECOVERY-SOP.md b/docs/runbooks/REBOOT-RECOVERY-SOP.md new file mode 100644 index 00000000..f7b7d2f9 --- /dev/null +++ 
b/docs/runbooks/REBOOT-RECOVERY-SOP.md @@ -0,0 +1,498 @@ +# 重開機恢復 SOP + +> 最後更新:2026-04-05 ogt — 第二次重開機事故後完整修訂,加入自動化腳本 +> 適用環境:AWOOOI 五主機架構 + +--- + +## 🤖 自動化狀態(最優先確認) + +| 主機 | systemd service | 狀態 | 說明 | +|------|----------------|------|------| +| 192.168.0.188 | `awoooi-startup.service` | ✅ enabled | 自動修復 BoltDB + 啟動所有服務 | +| 192.168.0.110 | `awoooi-startup-110.service` | ✅ enabled | 自動修復 BoltDB + 啟動所有服務 | +| 192.168.0.120 | `k3s.service` | systemd 管理 | 依賴 PG 就緒,自動啟動 | +| 192.168.0.121 | `k3s.service` | systemd 管理 | 自動啟動 | + +**正常情況下,重開機後等待 3-5 分鐘,所有服務應自動恢復。** + +確認方式(重開機後執行): +```bash +# 確認自動化腳本執行結果 +ssh ollama@192.168.0.188 "sudo journalctl -u awoooi-startup.service -n 30 --no-pager" +ssh wooo@192.168.0.110 "echo '0936223270' | sudo -S journalctl -u awoooi-startup-110.service -n 30 --no-pager" +``` + +--- + +## ⚡ 全系統啟動順序(依賴關係) + +``` +重開機後啟動順序(強制順序,不可逆轉): + +┌─────────────────────────────────────────┐ +│ Phase 1: 192.168.0.188 基礎設施 │ ← 最先 +│ ├─ containerd (BoltDB 修復) │ +│ ├─ Docker (BoltDB 修復) │ +│ ├─ PostgreSQL (WAL 修復 + kine VACUUM) │ ← K3s Kine Datastore +│ ├─ Redis (0.0.0.0:6380) │ ← API/Worker 依賴 +│ ├─ Ollama │ +│ ├─ Nginx │ +│ ├─ SignOz (docker compose) │ +│ └─ ClawBot (docker compose) │ +└─────────────────────────────────────────┘ + ↓ 必須先完成 +┌─────────────────────────────────────────┐ +│ Phase 2: 192.168.0.110 DevOps 金庫 │ ← 同步進行 or 稍後 +│ ├─ Docker (BoltDB 修復) │ +│ ├─ 清除孤兒容器 (network 不存在問題) │ +│ ├─ harbor-log (先 healthy!) │ ← 其他 Harbor 依賴它 +│ ├─ Harbor 其他元件 (nginx/core/db/...) 
│ ← K3s imagePull 依賴 +│ ├─ Gitea │ ← CI/CD 依賴 +│ ├─ Langfuse │ +│ ├─ Monitoring (Prometheus/Grafana/AM) │ +│ └─ SignOz │ +└─────────────────────────────────────────┘ + ↓ 必須先完成 (PostgreSQL + Harbor) +┌─────────────────────────────────────────┐ +│ Phase 3: K3s Control-Plane │ ← 最後 +│ ├─ 120 k3s.service → Ready │ +│ ├─ 121 k3s.service → Ready │ +│ └─ awoooi-prod Pods Running │ +└─────────────────────────────────────────┘ +``` + +--- + +## 🤖 自動化腳本說明 + +### 腳本位置 + +``` +scripts/reboot-recovery/ +├── awoooi-startup.sh # 188 啟動腳本(部署到 /usr/local/bin/) +├── awoooi-startup.service # 188 systemd unit +├── awoooi-startup-110.sh # 110 啟動腳本(部署到 /usr/local/bin/) +├── awoooi-startup-110.service # 110 systemd unit +├── deploy-to-188.sh # 一鍵部署到 188 +└── deploy-to-110.sh # 一鍵部署到 110 +``` + +### 188 腳本(`awoooi-startup.sh`)步驟 + +| 步驟 | 說明 | 故障處理 | +|------|------|---------| +| 1/7 | containerd 健康檢查 | BoltDB 損壞 → 自動刪除 `meta.db` | +| 2/7 | Docker 健康檢查 | BoltDB 損壞 → 自動刪除 `local-kv.db` | +| 3/7 | PostgreSQL 健康檢查 | WAL 損壞 → 自動執行 `pg_resetwal` + VACUUM ANALYZE kine | +| 4/7 | Redis 啟動 | — | +| 5/7 | Ollama 啟動 | — | +| 6/7 | Nginx 啟動 | — | +| 7/7 | SignOz + ClawBot compose up | ClawBot 失敗 → 嘗試 rebuild,失敗也繼續 | + +### 110 腳本(`awoooi-startup-110.sh`)步驟 + +| 步驟 | 說明 | 故障處理 | +|------|------|---------| +| 1/5 | Docker 健康檢查 | BoltDB 損壞 → 自動刪除所有損壞 `.db` | +| 2/5 | 清除孤兒容器 | `Exited (128)/(137)` → docker rm + network prune | +| 3/5 | Harbor 啟動 | 等 harbor-log healthy (max 60s) 才啟動其他元件 | +| 4/5 | Gitea/Langfuse/Monitoring compose up | — | +| 5/5 | SignOz compose up | — | + +### 重新部署腳本 + +```bash +# 從 Mac 執行(awoooi repo 目錄) +cd scripts/reboot-recovery + +bash deploy-to-188.sh # 更新 188 的腳本 +bash deploy-to-110.sh # 更新 110 的腳本 +``` + +--- + +## 手動恢復流程(自動化失敗時) + +### Phase 1:192.168.0.188 + +```bash +ssh ollama@192.168.0.188 +``` + +#### 1.1 containerd(若未起) + +```bash +sudo systemctl status containerd +# 若 BoltDB 損壞 (panic: freepages): +sudo systemctl stop containerd +sudo rm -f 
/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db +sudo systemctl start containerd +systemctl is-active containerd +``` + +#### 1.2 Docker(若未起) + +```bash +sudo systemctl status docker +# 若 BoltDB 損壞 (panic: page already freed / invalid freelist page): +sudo systemctl stop docker +sudo rm -f /var/lib/docker/network/files/local-kv.db +sudo rm -f /var/lib/docker/volumes/metadata.db +find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true +sudo systemctl start docker +systemctl is-active docker +``` + +#### 1.3 PostgreSQL(最關鍵) + +```bash +sudo systemctl start postgresql@14-main +sleep 8 +systemctl is-active postgresql@14-main # 若非 active → 見故障排除 A +pg_isready -h localhost -p 5432 # 應為 accepting connections + +# 清理 kine 孤立連線(WAL 重置後必做) +sudo -u postgres psql -d k3s_datastore -c " + SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname='k3s_datastore' AND pid!=pg_backend_pid() + AND query_start < now() - interval '5 minutes'; +" +sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;" +``` + +#### 1.4 Redis + +```bash +sudo systemctl start redis-server +redis-cli -p 6380 ping # 應回 PONG +# Redis 設定: 0.0.0.0:6380 (bind 0.0.0.0, port 6380) +``` + +#### 1.5 Ollama + +```bash +sudo systemctl start ollama +sleep 5 +curl -sf http://localhost:11434/ | grep running +``` + +#### 1.6 Nginx + SignOz + ClawBot + +```bash +sudo systemctl start nginx + +cd /home/ollama/signoz/deploy/docker && docker compose up -d +cd /home/ollama/clawbot-v5 && docker compose up -d +``` + +#### 1.7 Phase 1 驗收 + +```bash +pg_isready -h localhost -p 5432 && echo "✅ PostgreSQL" +redis-cli -p 6380 ping | grep -q PONG && echo "✅ Redis :6380" +curl -sf http://localhost:11434/ | grep -q running && echo "✅ Ollama" +systemctl is-active nginx | grep -q active && echo "✅ Nginx" +``` + +--- + +### Phase 2:192.168.0.110 + +```bash +ssh wooo@192.168.0.110 +``` + +#### 2.1 Docker 修復 + +```bash +systemctl is-active docker || { + echo "0936223270" | sudo -S bash -c " 
+ rm -f /var/lib/docker/network/files/local-kv.db + rm -f /var/lib/docker/volumes/metadata.db + find /var/lib/docker/buildkit -name '*.db' -delete 2>/dev/null + systemctl start docker + " +} +``` + +#### 2.2 清除孤兒容器(關鍵!) + +```bash +# 重開機後舊容器使用的 Docker network 已不存在,必須清除 +docker rm -f $(docker ps -aq 2>/dev/null) 2>/dev/null || true +docker network prune -f 2>/dev/null || true +``` + +#### 2.3 Harbor(必須先等 harbor-log healthy) + +```bash +cd /home/wooo/harbor/harbor +docker compose up -d + +# 等 harbor-log healthy(最多 60 秒) +for i in $(seq 1 12); do + STATUS=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null) + echo "[$i] harbor-log: $STATUS" + [ "$STATUS" = "healthy" ] && break + sleep 5 +done + +# harbor-log healthy 後,重啟其他元件(它們依賴 :1514 syslog) +docker compose up -d +``` + +#### 2.4 其他服務 + +```bash +cd /home/wooo/gitea && docker compose up -d +cd /home/wooo/langfuse && docker compose up -d +cd /home/wooo/monitoring && docker compose up -d +cd /home/wooo/signoz/deploy/docker && docker compose up -d +``` + +#### 2.5 Phase 2 驗收 + +```bash +curl -s -o /dev/null -w "%{http_code}" http://localhost:5000/v2/ | grep -q 401 && echo "✅ Harbor :5000" +curl -s -o /dev/null -w "%{http_code}" http://localhost:3001/ | grep -q 200 && echo "✅ Gitea :3001" +curl -s -o /dev/null -w "%{http_code}" http://localhost:3100/ | grep -q 200 && echo "✅ Langfuse :3100" +curl -s -o /dev/null -w "%{http_code}" http://localhost:9093/ | grep -q 200 && echo "✅ Alertmanager :9093" +curl -s -o /dev/null -w "%{http_code}" http://localhost:3002/ | grep -q 302 && echo "✅ Grafana :3002" +``` + +--- + +### Phase 3:K3s Control-Plane + +> ⚠️ 必須確認 Phase 1 PostgreSQL + Phase 2 Harbor 完全就緒才執行! 
+ +```bash +# 先確認前置條件 +ssh ollama@192.168.0.188 "pg_isready -h localhost -p 5432" +ssh wooo@192.168.0.110 "curl -s -o /dev/null -w '%{http_code}' http://localhost:5000/v2/" +# PostgreSQL: accepting connections +# Harbor: 401 (需認證,正常) +``` + +#### 3.1 K3s 節點(通常自動啟動) + +```bash +# 若 k3s 未啟動 +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S systemctl start k3s" +ssh wooo@192.168.0.121 "echo '0936223270' | sudo -S systemctl start k3s" + +# 確認節點狀態 +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S k3s kubectl get nodes" +``` + +#### 3.2 若 Pods ImagePullBackOff + +```bash +# Harbor 剛起來,強制 rollout +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S bash -c ' + k3s kubectl delete pod -l app=awoooi-api -n awoooi-prod + k3s kubectl delete pod -l app=awoooi-web -n awoooi-prod + k3s kubectl delete pod -l app=awoooi-worker -n awoooi-prod +'" +``` + +#### 3.3 Phase 3 驗收 + +```bash +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S k3s kubectl get pods -n awoooi-prod" +# 所有 Pod 應為 Running + +curl http://192.168.0.125:32334/api/v1/health +# 預期: status=healthy 或 degraded (openclaw down 可接受) +``` + +--- + +## 完整自動化驗收腳本 + +```bash +#!/bin/bash +# 執行位置: Mac (awoooi repo) +echo "=== AWOOOI 重開機完整驗收 $(date '+%Y-%m-%d %H:%M:%S') ===" + +# 188 基礎服務 +echo "--- 192.168.0.188 ---" +ssh ollama@192.168.0.188 " +pg_isready -h localhost -p 5432 >/dev/null 2>&1 && echo '✅ PostgreSQL :5432' || echo '❌ PostgreSQL DOWN' +redis-cli -p 6380 ping 2>/dev/null | grep -q PONG && echo '✅ Redis :6380' || echo '❌ Redis DOWN' +curl -sf http://localhost:11434/ 2>/dev/null | grep -q running && echo '✅ Ollama :11434' || echo '❌ Ollama DOWN' +systemctl is-active nginx 2>/dev/null | grep -q active && echo '✅ Nginx' || echo '❌ Nginx DOWN' +docker ps --format '{{.Names}}\t{{.Status}}' 2>/dev/null | grep -E '^signoz|^clawbot' | head -5 +" + +# 110 DevOps 金庫 +echo "--- 192.168.0.110 ---" +ssh wooo@192.168.0.110 " +curl -s -o /dev/null -w '%{http_code}' http://localhost:5000/v2/ 2>/dev/null | grep -q 401 && echo 
'✅ Harbor :5000' || echo '❌ Harbor DOWN' +curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/ 2>/dev/null | grep -q 200 && echo '✅ Gitea :3001' || echo '❌ Gitea DOWN' +curl -s -o /dev/null -w '%{http_code}' http://localhost:3100/ 2>/dev/null | grep -q 200 && echo '✅ Langfuse :3100' || echo '❌ Langfuse DOWN' +curl -s -o /dev/null -w '%{http_code}' http://localhost:9093/ 2>/dev/null | grep -q 200 && echo '✅ Alertmanager :9093' || echo '❌ Alertmanager DOWN' +curl -s -o /dev/null -w '%{http_code}' http://localhost:3002/ 2>/dev/null | grep -qE '200|302' && echo '✅ Grafana :3002' || echo '❌ Grafana DOWN' +" + +# K3s 和 Pods +echo "--- K3s (via 120) ---" +ssh wooo@192.168.0.120 " +echo '0936223270' | sudo -S bash -c ' +k3s kubectl get nodes 2>/dev/null +k3s kubectl get pods -n awoooi-prod 2>/dev/null +' +" + +# API E2E +echo "--- API E2E ---" +curl -s http://192.168.0.125:32334/api/v1/health 2>/dev/null | \ + python3 -c "import sys,json; d=json.load(sys.stdin); [print('✅' if v['status']=='up' else '⚠️', k, v['status']) for k,v in d['components'].items()]" || \ + echo '❌ API E2E FAILED' + +echo "=== 驗收完成 ===" +``` + +--- + +## 故障排除 + +### A. PostgreSQL WAL 損壞 + +**症狀**: +``` +PANIC: could not locate a valid checkpoint record +``` + +**修復**(需統帥授權): + +```bash +ssh ollama@192.168.0.188 + +# 1. 確認錯誤 +sudo journalctl -u postgresql@14-main -n 20 | grep -E 'PANIC|checkpoint' + +# 2. 強制重置 WAL(會丟失最後幾個 transaction,不可逆;pg_resetwal 拒絕以 root 執行,必須用 postgres 身分) +sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main + +# 3. 重啟並驗證 +sudo systemctl start postgresql@14-main +sleep 8 +pg_isready -h localhost -p 5432 + +# 4. 殺掉 stale 連線 + 重建 kine(必做!) +sudo -u postgres psql -d k3s_datastore -c " + SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname='k3s_datastore' AND pid!=pg_backend_pid(); +" +sudo -u postgres psql -d k3s_datastore -c "REINDEX TABLE kine; VACUUM ANALYZE kine;" +# ⚠️ 若 stale 連線殺不掉,用 OS kill: sudo kill -9 <PID> +``` + +--- + +### B. 
Docker Daemon 損壞 (BoltDB) + +**症狀**: +``` +panic: freepages: failed to get all reachable pages (containerd) +panic: page already freed (Docker network) +failed to create task: failed to initialize logging (Harbor 容器) +``` + +**修復(188)**: +```bash +sudo systemctl stop docker containerd +sudo rm -f /var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db +sudo rm -f /var/lib/docker/network/files/local-kv.db +sudo systemctl start containerd && sleep 5 +sudo systemctl start docker +``` + +**修復(110,額外需清除容器狀態)**: +```bash +sudo systemctl stop docker +sudo rm -f /var/lib/docker/network/files/local-kv.db +sudo rm -f /var/lib/docker/volumes/metadata.db +find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null +sudo rm -rf /var/lib/docker/containers/* # 清除孤兒容器記錄 +sudo systemctl start docker +sleep 5 +docker rm -f $(docker ps -aq) 2>/dev/null || true # 清除殘留 +docker network prune -f +``` + +--- + +### C. K3s Kine 慢查詢 + +**症狀**:K3s `activating` 超過 3-5 分鐘,log: +``` +Slow SQL (total time: 1m3.889s): SELECT ... FROM kine AS kv WHERE kv.name LIKE $1 ... +``` + +**修復**: +```bash +# 1. 停 K3s(釋放 PG 連線) +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S systemctl stop k3s" +ssh wooo@192.168.0.121 "echo '0936223270' | sudo -S systemctl stop k3s" + +# 2. 殺掉 stale 連線(若 pg_terminate 無效,直接 OS kill) +ssh ollama@192.168.0.188 "echo '0936223270' | sudo -S -u postgres psql -d k3s_datastore \ + -c \"SELECT pid, query_start, state FROM pg_stat_activity WHERE datname='k3s_datastore';\"" +# 對 stale PID: sudo kill -9 <PID> + +# 3. 重建索引和統計 +ssh ollama@192.168.0.188 "echo '0936223270' | sudo -S -u postgres psql -d k3s_datastore \ + -c 'REINDEX TABLE kine; VACUUM ANALYZE kine;'" + +# 4. 重啟 K3s +ssh wooo@192.168.0.120 "echo '0936223270' | sudo -S systemctl start k3s" +ssh wooo@192.168.0.121 "echo '0936223270' | sudo -S systemctl start k3s" +``` + +--- + +### D. 
Harbor 容器全部 Exited (128)
+
+**症狀**:
+```
+Error: failed to create task for container: failed to initialize logging driver: dial tcp 127.0.0.1:1514: connect: connection refused
+```
+
+**原因**:Harbor 所有容器的 log driver 指向 harbor-log 的 syslog (:1514),若 harbor-log 未健康就啟動其他容器,全部失敗。
+
+**修復**:
+```bash
+# 1. 清除所有失敗的容器
+docker rm -f $(docker ps -aq) 2>/dev/null
+docker network prune -f
+
+# 2. 重啟 Harbor(harbor-log 會先起)
+cd /home/wooo/harbor/harbor && docker compose up -d
+
+# 3. 等 harbor-log healthy(約 30 秒)
+until [ "$(docker inspect --format='{{.State.Health.Status}}' harbor-log)" = "healthy" ]; do
+  echo "waiting harbor-log..."; sleep 5
+done
+
+# 4. 現在重啟其他元件
+docker compose up -d
+```
+
+---
+
+## 已知限制
+
+| 項目 | 說明 | 建議後續 |
+|------|------|---------|
+| ClawBot build | pip wheels 損壞,STANDBY_MODE=true 下非關鍵 | 定期清除 wheel cache |
+| K3s 非自動依賴 PG | k3s.service 沒有 After=postgresql@14-main.service | 考慮加 systemd 依賴 |
+| Redis 需手動設定 | /etc/redis/redis.conf bind 0.0.0.0 port 6380 已設定,重裝後需重設 | 加入 awoooi-startup.sh 自我驗證 |
+
+---
+
+*文件由 Claude Code 於 2026-04-05 第二次重開機事故後完整修訂*
diff --git a/scripts/reboot-recovery/awoooi-startup-110.service b/scripts/reboot-recovery/awoooi-startup-110.service
new file mode 100644
index 00000000..5500d241
--- /dev/null
+++ b/scripts/reboot-recovery/awoooi-startup-110.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=AWOOOI 192.168.0.110 startup sequence
+After=network-online.target docker.service
+Wants=network-online.target
+# Wants= (not Requires=): with Requires= + After=, a docker.service start failure cancels this
+# unit, so the BoltDB repair in awoooi-startup-110.sh would never run when it is actually needed.
+Wants=docker.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/bin/awoooi-startup-110.sh
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=multi-user.target
diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh
new file mode 100644
index 00000000..25b7e38b
--- /dev/null
+++ b/scripts/reboot-recovery/awoooi-startup-110.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+# 192.168.0.110 重開機自動恢復腳本
+# 2026-04-05 ogt: 根據第二次重開機事故建立
+# 
部署位置: /usr/local/bin/awoooi-startup-110.sh (on 192.168.0.110) +# systemd unit: /etc/systemd/system/awoooi-startup-110.service +# +# 已知問題處理: +# - Docker BoltDB 損壞 (network/files/local-kv.db, volumes/metadata.db) +# - 舊容器使用已不存在的 Docker network (需要 docker rm -f 全部) +# - Harbor nginx 依賴 harbor-log (需要等 harbor-log healthy 後才 compose up) + +set -uo pipefail +LOG="/var/log/awoooi-startup-110.log" +exec > >(tee -a "$LOG") 2>&1 + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + +log "=== 192.168.0.110 啟動序列開始 ===" + +# ────────────────────────────────────────────── +# STEP 1: Docker 修復(若 BoltDB 損壞) +# ────────────────────────────────────────────── +log "[1/5] 檢查 Docker..." + +if ! systemctl is-active docker >/dev/null 2>&1; then + log "Docker 未啟動,嘗試啟動..." + systemctl start docker || true + sleep 8 +fi + +if ! systemctl is-active docker >/dev/null 2>&1; then + log "Docker 啟動失敗,修復 BoltDB..." + # 清除所有已知 BoltDB 損壞點 + NETWORK_DB="/var/lib/docker/network/files/local-kv.db" + VOLUMES_DB="/var/lib/docker/volumes/metadata.db" + + if [ -f "$NETWORK_DB" ]; then + cp "$NETWORK_DB" "${NETWORK_DB}.bak.$(date +%Y%m%d%H%M%S)" + rm -f "$NETWORK_DB" + log "清除損壞的 local-kv.db" + fi + if [ -f "$VOLUMES_DB" ]; then + cp "$VOLUMES_DB" "${VOLUMES_DB}.bak.$(date +%Y%m%d%H%M%S)" + rm -f "$VOLUMES_DB" + log "清除損壞的 metadata.db" + fi + # 清除 buildkit 快取(也可能損壞) + find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true + + systemctl start docker + sleep 8 + systemctl is-active docker && log "✅ Docker 修復成功" || { log "❌ Docker 修復失敗"; exit 1; } +else + log "✅ Docker 已 active" +fi + +# ────────────────────────────────────────────── +# STEP 2: 清除孤兒容器(舊容器的 network 已不存在) +# ────────────────────────────────────────────── +log "[2/5] 清除孤兒容器..." 
+STALE=$(docker ps -a --format "{{.Names}}\t{{.Status}}" | grep "Exited (128)\|Exited (137)" | awk '{print $1}') +if [ -n "$STALE" ]; then + log "發現孤兒容器: $(echo $STALE | tr '\n' ' ')" + echo "$STALE" | xargs docker rm -f 2>/dev/null || true + docker network prune -f 2>/dev/null || true + log "✅ 孤兒容器清除完成" +else + log "✅ 無孤兒容器" +fi + +# ────────────────────────────────────────────── +# STEP 3: Harbor(注意:先等 harbor-log healthy) +# ────────────────────────────────────────────── +log "[3/5] 啟動 Harbor..." +HARBOR_DIR="/home/wooo/harbor/harbor" +if [ -f "$HARBOR_DIR/docker-compose.yml" ]; then + cd "$HARBOR_DIR" + docker compose up -d 2>&1 | tail -5 + + # 等待 harbor-log 變 healthy(最多 60 秒) + log "等待 harbor-log healthy..." + for i in $(seq 1 12); do + STATUS=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "missing") + [ "$STATUS" = "healthy" ] && break + sleep 5 + done + + STATUS=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "unknown") + if [ "$STATUS" = "healthy" ]; then + # harbor-log healthy 後重啟其他組件(它們依賴 1514 port) + docker compose up -d 2>&1 | tail -5 + log "✅ Harbor 啟動完成" + else + log "⚠️ harbor-log 未 healthy,Harbor 可能需要手動檢查" + fi +else + log "⚠️ 找不到 Harbor compose 檔案" +fi + +# ────────────────────────────────────────────── +# STEP 4: 其他服務(Gitea, Langfuse, Monitoring) +# ────────────────────────────────────────────── +log "[4/5] 啟動其他服務..." 
+ +GITEA_DIR="/home/wooo/gitea" +if [ -f "$GITEA_DIR/docker-compose.yml" ]; then + cd "$GITEA_DIR" + docker compose up -d 2>&1 | tail -3 + log "✅ Gitea 啟動指令已發送" +fi + +LANGFUSE_DIR="/home/wooo/langfuse" +if [ -f "$LANGFUSE_DIR/docker-compose.yml" ]; then + cd "$LANGFUSE_DIR" + docker compose up -d 2>&1 | tail -3 + log "✅ Langfuse 啟動指令已發送" +fi + +MONITORING_DIR="/home/wooo/monitoring" +if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then + cd "$MONITORING_DIR" + docker compose up -d 2>&1 | tail -3 + log "✅ Monitoring 啟動指令已發送" +fi + +# ────────────────────────────────────────────── +# STEP 5: SignOz +# ────────────────────────────────────────────── +log "[5/5] 啟動 SignOz..." +SIGNOZ_DIR="/home/wooo/signoz/deploy/docker" +if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then + cd "$SIGNOZ_DIR" + docker compose up -d 2>&1 | tail -5 + log "✅ SignOz 啟動指令已發送" +fi + +# ────────────────────────────────────────────── +# 完成 +# ────────────────────────────────────────────── +log "=== 192.168.0.110 啟動序列完成 ===" +log "Harbor: http://192.168.0.110:5000" +log "Gitea: http://192.168.0.110:3001" +log "Grafana: http://192.168.0.110:3002" + +exit 0 diff --git a/scripts/reboot-recovery/awoooi-startup.service b/scripts/reboot-recovery/awoooi-startup.service new file mode 100644 index 00000000..65a30d44 --- /dev/null +++ b/scripts/reboot-recovery/awoooi-startup.service @@ -0,0 +1,33 @@ +# systemd unit for AWOOOI auto-startup on reboot +# 2026-04-04 ogt: 根據實際重開機事故建立 +# +# 部署方式 (on 192.168.0.188): +# sudo cp awoooi-startup.sh /usr/local/bin/awoooi-startup.sh +# sudo chmod +x /usr/local/bin/awoooi-startup.sh +# sudo cp awoooi-startup.service /etc/systemd/system/awoooi-startup.service +# sudo systemctl daemon-reload +# sudo systemctl enable awoooi-startup.service + +[Unit] +Description=AWOOOI Auto-Startup Recovery Sequence +# 在 network 就緒後才執行 +After=network-online.target containerd.service docker.service +Wants=network-online.target + +# 確保 PostgreSQL 盡早嘗試啟動 +Wants=postgresql@14-main.service 
redis-server.service ollama.service nginx.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/local/bin/awoooi-startup.sh
+# Allow a long start-up window (the repair steps can take a while)
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
+
+# Run as root (the script drives systemctl)
+User=root
+
+[Install]
+WantedBy=multi-user.target
diff --git a/scripts/reboot-recovery/awoooi-startup.sh b/scripts/reboot-recovery/awoooi-startup.sh
new file mode 100644
index 00000000..5528e703
--- /dev/null
+++ b/scripts/reboot-recovery/awoooi-startup.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+# AWOOOI reboot auto-recovery script (host 192.168.0.188)
+# 2026-04-04 ogt: written after a real incident; handles PostgreSQL WAL corruption + Docker BoltDB corruption + slow K3s Kine queries
+# Deployed to: /usr/local/bin/awoooi-startup.sh (on 192.168.0.188)
+# systemd unit: /etc/systemd/system/awoooi-startup.service
+
+set -uo pipefail
+LOG="/var/log/awoooi-startup.log"
+exec > >(tee -a "$LOG") 2>&1
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+
+log "=== AWOOOI 啟動序列開始 ==="
+
+# ──────────────────────────────────────────────
+# STEP 1: containerd repair (only if it fails to start)
+# ──────────────────────────────────────────────
+log "[1/7] 檢查 containerd..."
+
+if ! systemctl is-active containerd >/dev/null 2>&1; then
+  log "containerd 未啟動,嘗試啟動..."
+  systemctl start containerd || true
+  sleep 5
+fi
+
+if ! systemctl is-active containerd >/dev/null 2>&1; then
+  log "containerd 啟動失敗,檢查 BoltDB 損壞..."
+  BOLT_DB="/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db"
+  if [ -f "$BOLT_DB" ]; then
+    log "備份並刪除損壞的 meta.db..."
+    cp "$BOLT_DB" "${BOLT_DB}.bak.$(date +%Y%m%d%H%M%S)"
+    rm -f "$BOLT_DB"
+  fi
+  systemctl start containerd
+  sleep 5
+  systemctl is-active containerd && log "✅ containerd 修復成功" || { log "❌ containerd 修復失敗"; exit 1; }
+else
+  log "✅ containerd 已 active"
+fi
+
+# ──────────────────────────────────────────────
+# STEP 2: Docker repair (only if it fails to start)
+# ──────────────────────────────────────────────
+log "[2/7] 檢查 Docker..."
+
+if ! systemctl is-active docker >/dev/null 2>&1; then
+  log "Docker 未啟動,嘗試啟動..."
+  systemctl start docker || true
+  sleep 8
+fi
+
+if ! systemctl is-active docker >/dev/null 2>&1; then
+  log "Docker 啟動失敗,修復 network BoltDB..."
+  NETWORK_DB="/var/lib/docker/network/files/local-kv.db"
+  if [ -f "$NETWORK_DB" ]; then
+    log "備份並刪除損壞的 local-kv.db..."
+    cp "$NETWORK_DB" "${NETWORK_DB}.bak.$(date +%Y%m%d%H%M%S)"
+    rm -f "$NETWORK_DB"
+  fi
+  systemctl restart containerd
+  sleep 5
+  systemctl start docker
+  sleep 8
+  systemctl is-active docker && log "✅ Docker 修復成功" || { log "❌ Docker 修復失敗"; exit 1; }
+else
+  log "✅ Docker 已 active"
+fi
+
+# ──────────────────────────────────────────────
+# STEP 3: PostgreSQL repair (only if it fails to start)
+# ──────────────────────────────────────────────
+log "[3/7] 檢查 PostgreSQL..."
+
+if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
+  log "PostgreSQL 未啟動,嘗試啟動..."
+  systemctl start postgresql@14-main || true
+  sleep 8
+fi
+
+if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
+  log "PostgreSQL 啟動失敗,檢查是否 WAL 損壞..."
+  if journalctl -u postgresql@14-main -n 20 | grep "could not locate a valid checkpoint" >/dev/null; then  # no -q: an early grep exit can SIGPIPE journalctl and, with pipefail, hide the match
+    log "⚠️ WAL 損壞!執行 pg_resetwal..."
+    sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main  # pg_resetwal refuses to run as root
+    log "WAL 重置完成,重啟 PostgreSQL..."
+    systemctl start postgresql@14-main
+    sleep 8
+  fi
+  systemctl is-active postgresql@14-main && log "✅ PostgreSQL 修復成功" || { log "❌ PostgreSQL 修復失敗"; exit 1; }
+fi
+
+# Wait (up to ~60s) for PostgreSQL to accept connections
+log "等待 PostgreSQL 就緒..."
+for i in $(seq 1 30); do
+  pg_isready -h localhost -p 5432 >/dev/null 2>&1 && break
+  sleep 2
+done
+pg_isready -h localhost -p 5432 && log "✅ PostgreSQL accepting connections" || { log "❌ PostgreSQL 無法接受連線"; exit 1; }
+
+# kine table maintenance (stale connections may linger, or the WAL was just reset)
+log "維護 k3s_datastore kine 表..."
+sudo -u postgres psql -d k3s_datastore -c "
+  SELECT pg_terminate_backend(pid)
+  FROM pg_stat_activity
+  WHERE datname='k3s_datastore' AND pid!=pg_backend_pid() AND state='active'
+  AND query_start < now() - interval '5 minutes';
+" 2>/dev/null || true
+
+sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;" 2>/dev/null && log "✅ kine VACUUM ANALYZE 完成" || log "⚠️ kine VACUUM 跳過(可能無法連線)"
+
+# ──────────────────────────────────────────────
+# STEP 4: Redis (listens on port 6380 on this host, not the default 6379)
+# ──────────────────────────────────────────────
+log "[4/7] 啟動 Redis..."
+systemctl start redis-server || true
+sleep 3
+redis-cli -p 6380 ping 2>/dev/null | grep -q PONG && log "✅ Redis UP" || log "⚠️ Redis 可能未就緒"
+
+# ──────────────────────────────────────────────
+# STEP 5: Ollama
+# ──────────────────────────────────────────────
+log "[5/7] 啟動 Ollama..."
+systemctl start ollama || true
+# Ollama needs time to load models, so it is not verified immediately
+
+# ──────────────────────────────────────────────
+# STEP 6: Nginx
+# ──────────────────────────────────────────────
+log "[6/7] 啟動 Nginx..."
+systemctl start nginx || true
+systemctl is-active nginx >/dev/null 2>&1 && log "✅ Nginx UP" || log "⚠️ Nginx 未就緒"
+
+# ──────────────────────────────────────────────
+# STEP 7: Docker Compose services
+# ──────────────────────────────────────────────
+log "[7/7] 啟動 Docker Compose 服務..."
+
+# SignOz
+SIGNOZ_DIR="/home/ollama/signoz/deploy/docker"
+if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then
+  log "啟動 SignOz..."
+  cd "$SIGNOZ_DIR"
+  docker compose up -d 2>&1 | tail -3
+  log "✅ SignOz 啟動指令已發送"
+else
+  log "⚠️ 找不到 SignOz compose 檔案"
+fi
+
+# ClawBot depends on the external aiops-network, which must exist before compose up.
+# Use `network inspect` (exact name, no pipe) instead of `ls | grep -q`, which can misfire under pipefail.
+if ! docker network inspect aiops-network >/dev/null 2>&1; then
+  log "建立 aiops-network..."
+  docker network create aiops-network && log "✅ aiops-network 建立" || log "⚠️ aiops-network 建立失敗"
+fi
+
+CLAWBOT_DIR="/home/ollama/clawbot-v5"
+if [ -f "$CLAWBOT_DIR/docker-compose.yml" ]; then
+  log "啟動 ClawBot..."
+  cd "$CLAWBOT_DIR"
+  # Rebuild only if `up` really failed; plain grep (not -q) drains the pipe so tee is never SIGPIPE'd into a pipefail false failure
+  if ! docker compose up -d 2>&1 | tee /tmp/clawbot-up.log | grep "Started\|Running\|healthy" >/dev/null; then
+    log "ClawBot 啟動失敗,嘗試 rebuild..."
+    docker compose build --no-cache 2>&1 | tail -5 || true
+    docker compose up -d 2>&1 | tail -3 || log "⚠️ ClawBot rebuild 也失敗,跳過"
+  fi
+  log "✅ ClawBot 啟動指令已發送"
+else
+  log "⚠️ 找不到 ClawBot compose 檔案"
+fi
+
+# ──────────────────────────────────────────────
+# Done
+# ──────────────────────────────────────────────
+log "=== AWOOOI 啟動序列完成 ==="
+# Reaching this point always exits 0 (optional services never fail the unit); containerd/Docker/PostgreSQL failures above exit 1 on purpose
+log "K3s 需在 120/121 手動確認啟動(或由 k3s.service 自動啟動)"
+log "詳細 SOP: docs/runbooks/REBOOT-RECOVERY-SOP.md"
+
+exit 0
diff --git a/scripts/reboot-recovery/deploy-to-110.sh b/scripts/reboot-recovery/deploy-to-110.sh
new file mode 100644
index 00000000..2583b4d4
--- /dev/null
+++ b/scripts/reboot-recovery/deploy-to-110.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Deploy awoooi-startup-110 to 192.168.0.110
+# Usage: bash deploy-to-110.sh   (sudo password comes from AWOOOI_SUDO_PASS, or is prompted for; never commit it)
+
+set -euo pipefail
+HOST="wooo@192.168.0.110"
+PASS="${AWOOOI_SUDO_PASS:-}"; [ -n "$PASS" ] || { read -rsp "sudo password for $HOST: " PASS; echo; }
+
+echo "=== 部署 awoooi-startup-110 到 192.168.0.110 ==="
+
+# 1. Upload the startup script
+echo "[1/4] 上傳啟動腳本..."
+scp awoooi-startup-110.sh "$HOST:/tmp/awoooi-startup-110.sh"
+
+# 2. Upload the systemd unit
+echo "[2/4] 上傳 systemd unit..."
+scp awoooi-startup-110.service "$HOST:/tmp/awoooi-startup-110.service"
+
+# 3. Install
+echo "[3/4] 安裝..."
+ssh "$HOST" "
+echo '$PASS' | sudo -S bash -c '
+  cp /tmp/awoooi-startup-110.sh /usr/local/bin/awoooi-startup-110.sh
+  chmod +x /usr/local/bin/awoooi-startup-110.sh
+  cp /tmp/awoooi-startup-110.service /etc/systemd/system/awoooi-startup-110.service
+  systemctl daemon-reload
+  systemctl enable awoooi-startup-110.service
+  echo done
+'
+"
+
+# 4. Verify
+echo "[4/4] 驗證安裝..."
+ssh "$HOST" "echo '$PASS' | sudo -S systemctl is-enabled awoooi-startup-110.service && echo '✅ 已啟用' || echo '❌ 啟用失敗'"
+
+echo ""
+echo "✅ 部署完成!"
+echo "下次重開機後,110 會自動執行修復並啟動所有服務。"
+echo ""
+echo "手動測試執行:"
+echo "  ssh -t $HOST 'sudo /usr/local/bin/awoooi-startup-110.sh'"
diff --git a/scripts/reboot-recovery/deploy-to-188.sh b/scripts/reboot-recovery/deploy-to-188.sh
new file mode 100644
index 00000000..5f3ac506
--- /dev/null
+++ b/scripts/reboot-recovery/deploy-to-188.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Deploy awoooi-startup to 192.168.0.188
+# Usage: bash deploy-to-188.sh   (sudo password comes from AWOOOI_SUDO_PASS, or is prompted for; never commit it)
+
+set -euo pipefail
+HOST="ollama@192.168.0.188"
+PASS="${AWOOOI_SUDO_PASS:-}"; [ -n "$PASS" ] || { read -rsp "sudo password for $HOST: " PASS; echo; }
+
+echo "=== 部署 awoooi-startup 到 192.168.0.188 ==="
+
+# 1. Upload the startup script
+echo "[1/4] 上傳啟動腳本..."
+scp awoooi-startup.sh "$HOST:/tmp/awoooi-startup.sh"
+
+# 2. Upload the systemd unit
+echo "[2/4] 上傳 systemd unit..."
+scp awoooi-startup.service "$HOST:/tmp/awoooi-startup.service"
+
+# 3. Install
+echo "[3/4] 安裝..."
+ssh "$HOST" "
+echo '$PASS' | sudo -S bash -c '
+  cp /tmp/awoooi-startup.sh /usr/local/bin/awoooi-startup.sh
+  chmod +x /usr/local/bin/awoooi-startup.sh
+  cp /tmp/awoooi-startup.service /etc/systemd/system/awoooi-startup.service
+  systemctl daemon-reload
+  systemctl enable awoooi-startup.service
+  echo done
+'
+"
+
+# 4. Verify (is-enabled needs no root)
+echo "[4/4] 驗證安裝..."
+ssh "$HOST" "systemctl is-enabled awoooi-startup.service && echo '✅ 已啟用' || echo '❌ 啟用失敗'"
+
+echo ""
+echo "✅ 部署完成!"
+echo "下次重開機後,188 會自動執行修復並啟動所有服務。"
+echo ""
+echo "手動測試執行:"
+echo "  ssh $HOST 'sudo /usr/local/bin/awoooi-startup.sh'"