diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml index 4c82c521..9cf2df67 100644 --- a/.github/workflows/cd.yaml +++ b/.github/workflows/cd.yaml @@ -1,11 +1,11 @@ # ============================================================================= -# AWOOOI CD Pipeline v2.0 (完整沿用 AIOPS 最佳實踐) +# AWOOOI CD Pipeline v2.1 (序列建構修復 Runner 衝突) # ============================================================================= # 優化項目: # 1. Pre-flight Check (10s Fail-Fast) # 2. Runner 標籤 [self-hosted, harbor, k8s] # 3. dorny/paths-filter 精確路徑偵測 -# 4. API + Web 並行建構 +# 4. Web → API 序列建構 (2026-03-29 修復 _runner_file_commands 衝突) # 5. timeout-minutes 防止卡死 # 6. Telegram + OpenClaw 通知 # 7. force_deploy 強制重建選項 @@ -57,11 +57,6 @@ jobs: name: "Pre-flight Check" runs-on: [self-hosted, harbor, k8s] timeout-minutes: 1 - # 2026-03-29: Runner 全局 mutex,確保同一 Runner 不會並行執行任何 CD Job - # 使用固定 group 名稱 (非 run_id),所有 CD Jobs 共用同一把鎖 - concurrency: - group: runner-awoooi-cd-mutex - cancel-in-progress: false steps: # ======================================================================= # 2026-03-29: Runner _diag/pages 檔案衝突修復 (v3) @@ -135,9 +130,6 @@ jobs: runs-on: [self-hosted, harbor, k8s] needs: pre-flight-check timeout-minutes: 1 - concurrency: - group: runner-awoooi-cd-mutex - cancel-in-progress: false outputs: api: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.api }} web: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.web }} @@ -171,15 +163,17 @@ jobs: k3s-system: - 'k8s/k3s-system/**' - # ==================== 並行建構 API ==================== + # ==================== 序列建構 API (必須等 Web 完成) ==================== + # 2026-03-29 Claude Code: 改為序列執行,修復 Runner _runner_file_commands 衝突 + # 根因: 並行 Job 的 "Set up job" 階段會互相覆寫 RUNNER_TEMP 檔案 + # 參考: ops/runner/README.md + # NOTE: the `if` starts with !cancelled() && !failure() so build-api still runs when build-web is skipped (API-only change or skip_web) build-api: name: "Build API" runs-on: [self-hosted, harbor, k8s] - needs: detect-changes + needs: [detect-changes, build-web] timeout-minutes: 20 - concurrency: - group: 
runner-awoooi-cd-mutex - cancel-in-progress: false if: | + !cancelled() && !failure() && !inputs.skip_api && ( needs.detect-changes.outputs.api == 'true' || @@ -214,15 +208,13 @@ jobs: --file apps/api/Dockerfile . echo "✅ API: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-api:${{ steps.tag.outputs.tag }}" - # ==================== 並行建構 Web ==================== + # ==================== 建構 Web (先執行) ==================== + # 2026-03-29 Claude Code: API 依賴 Web 完成,確保序列執行 build-web: name: "Build Web" runs-on: [self-hosted, harbor, k8s] needs: detect-changes timeout-minutes: 20 - concurrency: - group: runner-awoooi-cd-mutex - cancel-in-progress: false if: | !inputs.skip_web && ( needs.detect-changes.outputs.web == 'true' || diff --git a/ops/runner/README.md b/ops/runner/README.md index 4807b43a..19cd6861 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -17,19 +17,29 @@ Error: The file '/home/wooo/actions-runner-awoooi/_diag/pages/xxx.log' already e - `_work/_temp/_runner_file_commands` 在所有 Jobs 之間共享 - 清理此目錄會導致 "Missing file at path" 錯誤 -### 解決方案 (v3 - 最終版) +### 解決方案 (v4 - 最終版 2026-03-29) -#### 1. Workflow Concurrency (核心修復) +#### 1. 序列建構 (核心修復) +```yaml +# build-api 必須等 build-web 完成 +build-api: + needs: [detect-changes, build-web] # 關鍵: 依賴 build-web +``` + +**根因**: Job 並行時,"Set up job" 階段會同時寫入 `_runner_file_commands`,導致衝突 +**解法**: 改為序列執行,確保同一時間只有一個 Job 在 Runner 上 + +#### 2. Workflow Concurrency (輔助) ```yaml concurrency: group: cd-${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true # 必須為 true! + cancel-in-progress: true ``` -**關鍵**: `cancel-in-progress: true` 確保同一時間只有一個 workflow 在執行 +確保同一時間只有一個 workflow 在執行 -#### 2. Job 層清理 (輔助) -每個 Job 開始時只清理 `_diag/pages`,**不碰** `RUNNER_TEMP`: +#### 3. Job 層清理 (防禦性) +每個 Job 開始時清理 `_diag/pages`: ```yaml - name: "Clean Runner Diagnostics" @@ -39,7 +49,7 @@ concurrency: mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true ``` -**警告**: 絕對不要清理 `$RUNNER_TEMP/*`,會破壞其他 Job 的內部通訊 +**警告**: 絕對不要清理 `$RUNNER_TEMP/*`,會破壞 `_runner_file_commands` -#### 2. Systemd Timer (背景清理) +#### 4. 
Systemd Timer (背景清理) 每 5 分鐘自動清理過期的診斷檔案: @@ -78,4 +88,5 @@ sudo systemctl start runner-diag-cleanup.service - ADR: 待建立 (如果問題持續) --- -版本: v1.0 | 建立: 2026-03-29 | 作者: Claude Code +版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code +變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups