diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index 58e55ecd..4206c111 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -810,35 +810,40 @@ jobs:
           mkdir -p ~/.ssh
           echo "$SSH_KEY_188" > "${HOME}/.ssh/deploy_key_188"
           chmod 600 "${HOME}/.ssh/deploy_key_188"
-          ssh-keyscan 192.168.0.188 >> ~/.ssh/known_hosts 2>/dev/null
+          timeout -k 5s 10s ssh-keyscan 192.168.0.188 >> ~/.ssh/known_hosts 2>/dev/null \
+            || echo "⚠️ 188 host key scan 失敗,改用 StrictHostKeyChecking=accept-new"
+          # SSH_188_OPTS is expanded into both ssh and scp. scp has no -n option and
+          # exits with a usage error if given one, so -n is passed on the ssh calls only.
           SSH_188_OPTS=(
             -i "${HOME}/.ssh/deploy_key_188"
             -o BatchMode=yes
+            -o StrictHostKeyChecking=accept-new
             -o ConnectTimeout=10
             -o ServerAliveInterval=10
             -o ServerAliveCountMax=3
+            -o LogLevel=ERROR
           )
 
-          timeout 30s ssh "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
+          timeout -k 5s 30s ssh -n "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
             "mkdir -p ~/awoooi-ops" \
             || echo "⚠️ 188 ops 目錄確認失敗"
 
           # 同步 docker-health-monitor.sh
-          timeout 60s scp "${SSH_188_OPTS[@]}" \
+          timeout -k 5s 60s scp "${SSH_188_OPTS[@]}" \
             scripts/ops/docker-health-monitor.sh \
             ollama@192.168.0.188:~/awoooi-ops/docker-health-monitor.sh \
             && echo "✅ docker-health-monitor.sh 已同步" \
             || echo "⚠️ docker-health-monitor.sh 同步失敗"
 
           # 同步 pg-backup.sh
-          timeout 60s scp "${SSH_188_OPTS[@]}" \
+          timeout -k 5s 60s scp "${SSH_188_OPTS[@]}" \
             scripts/ops/pg-backup.sh \
             ollama@192.168.0.188:~/awoooi-ops/pg-backup.sh \
             && echo "✅ pg-backup.sh 已同步" \
             || echo "⚠️ pg-backup.sh 同步失敗"
 
           # 確保執行權限
-          timeout 30s ssh "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
+          timeout -k 5s 30s ssh -n "${SSH_188_OPTS[@]}" ollama@192.168.0.188 \
             "chmod +x ~/awoooi-ops/docker-health-monitor.sh ~/awoooi-ops/pg-backup.sh && echo '✅ 權限設定完成'" \
             || echo "⚠️ 權限設定失敗"
 
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 5356c79c..9319ea77 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,15 @@
+## 2026-05-06 | CD 188 ops sync 防止 SSH 子程序停住
+
+**背景**:`22453161` 的完整 CD 已完成 tests、API/Web image build、K8s GitOps deploy,但 `Sync Ops Scripts to 188` 卡住。現場 process 顯示 `timeout 30s ssh ... 192.168.0.188` 與子 `ssh` 進入 stopped 狀態,導致 job 無法前進到 post-deploy checks。
+
+**本次修補**:
+- `ssh-keyscan 192.168.0.188` 補 `timeout -k 5s 10s`,避免 host key 掃描無限等待。
+- 188 SSH options 補 `StrictHostKeyChecking=accept-new`、`LogLevel=ERROR`;`-n` 只加在 `ssh` 指令上(`scp` 沒有 `-n` 選項,放進共用 options 會讓同步直接失敗),避免非互動 runner 被 SSH stdin / host key prompt 卡住。
+- 所有 188 ops sync 的 `ssh/scp` timeout 改為 `timeout -k 5s ...`,確保超時後會強制清理子程序。
+
+**注意**:
+- 188 ops sync 是 `continue-on-error: true`,不應阻塞主部署;若 188 不可達,只能警告並讓 post-deploy checks 繼續。
+
 ## 2026-05-06 | 告警路徑 Ollama 實證與動態基線 statsmodels 相容修正
 
 **背景**:188 Ollama 退場後,需確認告警主鏈是否仍實際 fallback 到 Gemini;同時 production log 持續出現 `holt_winters_failed_fallback_to_stats`,讓動態基線訓練一直降級成滑動統計。