diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml index 9a8dba9b..7fc21929 100644 --- a/.github/workflows/cd.yaml +++ b/.github/workflows/cd.yaml @@ -56,13 +56,51 @@ jobs: runs-on: [self-hosted, harbor, k8s] timeout-minutes: 1 steps: - # 2026-03-26: 清理暫存目錄,避免 file conflict (pages + temp) - - name: "Clean Runner temp" + # ======================================================================= + # 2026-03-29: Permanent fix for Runner _diag/pages file conflicts + # Problem: parallel Jobs writing the same diagnostic file cause "file already exists" + # Fix: forced cleanup, flock locking, and directory rebuild + # ======================================================================= + - name: "Clean Runner Diagnostics (Anti-Collision)" run: | + set +e # do not abort the step when a cleanup command fails + RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") - rm -rf "$RUNNER_TEMP"/* 2>/dev/null || true - rm -rf "$RUNNER_ROOT/_diag/pages"/* 2>/dev/null || true - rm -rf .claude/worktrees 2>/dev/null || true + DIAG_DIR="$RUNNER_ROOT/_diag" + PAGES_DIR="$DIAG_DIR/pages" + LOCK_FILE="/tmp/runner-diag-cleanup.lock" + + echo "🧹 Cleaning Runner diagnostics..." + echo " RUNNER_ROOT: $RUNNER_ROOT" + echo " PAGES_DIR: $PAGES_DIR" + + # Use flock so that only one cleanup process runs at a time (after a 10 s wait the cleanup proceeds without the lock) + ( + flock -w 10 200 || { echo "⚠️ Lock timeout, proceeding anyway"; } + + # 1. Clean _diag/pages (the most critical step) + if [ -d "$PAGES_DIR" ]; then + # Delete all .log files + find "$PAGES_DIR" -name "*.log" -type f -delete 2>/dev/null + # Recreate the directory so it starts clean + rm -rf "$PAGES_DIR" 2>/dev/null + mkdir -p "$PAGES_DIR" 2>/dev/null + echo " ✅ Cleaned _diag/pages" + fi + + # 2. Clean RUNNER_TEMP + rm -rf "$RUNNER_TEMP"/* 2>/dev/null + echo " ✅ Cleaned RUNNER_TEMP" + + # 3. Clean Claude worktrees + rm -rf .claude/worktrees 2>/dev/null + + # 4. 
Clean up stale temp files under _work + find "$RUNNER_ROOT/_work" -name "*.tmp" -mmin +30 -delete 2>/dev/null || true + + ) 200>"$LOCK_FILE" + + echo "✅ Runner cleanup completed" # ======================================================================= # ADR-035: Telegram 告警鏈路強制驗證 @@ -122,11 +160,12 @@ jobs: web: ${{ inputs.force_deploy == true && 'true' || steps.filter.outputs.web }} k3s-system: ${{ steps.filter.outputs.k3s-system }} steps: - # 2026-03-26: 清理暫存目錄 (temp + pages) - - name: "Clean Runner temp" + # 2026-03-29: Runner diagnostic file cleanup (prevents parallel-job conflicts) + - name: "Clean Runner Diagnostics" run: | RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") - rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages"/* .claude/worktrees 2>/dev/null || true + rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true + mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true - uses: actions/checkout@v4 with: @@ -162,11 +201,12 @@ jobs: outputs: image_tag: ${{ steps.tag.outputs.tag }} steps: - # 2026-03-26: 清理暫存目錄 (temp + pages) - - name: "Clean Runner temp" + # 2026-03-29: Runner diagnostic file cleanup (prevents parallel-job conflicts) + - name: "Clean Runner Diagnostics" run: | RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") - rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages"/* .claude/worktrees 2>/dev/null || true + rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true + mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true - uses: actions/checkout@v4 @@ -200,11 +240,12 @@ jobs: outputs: image_tag: ${{ steps.tag.outputs.tag }} steps: - # 2026-03-26: 清理暫存目錄 (temp + pages) - - name: "Clean Runner temp" + # 2026-03-29: Runner diagnostic file cleanup (prevents parallel-job conflicts) + - name: "Clean Runner Diagnostics" run: | RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") - rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages"/* .claude/worktrees 2>/dev/null || true + rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true + mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true - uses: 
actions/checkout@v4 @@ -245,11 +286,12 @@ jobs: if: always() && (needs.build-api.result == 'success' || needs.build-api.result == 'skipped') && (needs.build-web.result == 'success' || needs.build-web.result == 'skipped') environment: production steps: - # 2026-03-26: 清理暫存目錄 (temp + pages) - - name: "Clean Runner temp" + # 2026-03-29: Runner diagnostic file cleanup (prevents parallel-job conflicts) + - name: "Clean Runner Diagnostics" run: | RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") - rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages"/* .claude/worktrees 2>/dev/null || true + rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true + mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true - uses: actions/checkout@v4 with: diff --git a/ops/runner/README.md b/ops/runner/README.md new file mode 100644 index 00000000..59f649f0 --- /dev/null +++ b/ops/runner/README.md @@ -0,0 +1,66 @@ +# GitHub Actions Runner 穩定性修復 + +## 問題: `_diag/pages` 檔案衝突 + +``` +Error: The file '/home/wooo/actions-runner-awoooi/_diag/pages/xxx.log' already exists. +``` + +### 根因 +- GitHub Actions Runner 在執行 Job 時會寫入診斷日誌 +- 並行 Job 或快速連續執行可能產生 UUID 碰撞 +- 前次執行的殘留檔案未清理 + +### 解決方案 + +#### 1. CD Workflow 修復 (即時生效) +每個 Job 開始前強制清理並重建 `_diag/pages` 目錄: + +```yaml +- name: "Clean Runner Diagnostics" + run: | + RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")") + rm -rf "$RUNNER_TEMP"/* "$RUNNER_ROOT/_diag/pages" .claude/worktrees 2>/dev/null || true + mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true +``` + +**關鍵**: 刪除整個目錄再重建,而非 `rm -rf _diag/pages/*` + +#### 2. 
Systemd Timer (背景清理) +每 5 分鐘自動清理過期的診斷檔案: + +```bash +# 部署 +ssh wooo@192.168.0.110 +cd /path/to/awoooi/ops/runner +bash deploy-runner-cleanup.sh +``` + +### 檔案說明 + +| 檔案 | 用途 | +|------|------| +| `cleanup-runner-diag.sh` | 清理腳本 (安裝到 Runner 目錄) | +| `runner-diag-cleanup.service` | Systemd service 定義 | +| `runner-diag-cleanup.timer` | Systemd timer (每 5 分鐘) | +| `deploy-runner-cleanup.sh` | 一鍵部署腳本 | + +### 監控 + +```bash +# 查看 timer 狀態 +sudo systemctl status runner-diag-cleanup.timer + +# 查看清理日誌 +journalctl -u runner-diag-cleanup.service -f + +# 手動觸發清理 +sudo systemctl start runner-diag-cleanup.service +``` + +### 相關文件 +- Memory: `feedback_runner_zombie_process.md` +- ADR: 待建立 (如果問題持續) + +--- +版本: v1.0 | 建立: 2026-03-29 | 作者: Claude Code diff --git a/ops/runner/cleanup-runner-diag.sh b/ops/runner/cleanup-runner-diag.sh new file mode 100644 index 00000000..b4d33f3f --- /dev/null +++ b/ops/runner/cleanup-runner-diag.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# ============================================================================= +# Runner Diagnostic Cleanup Script +# ============================================================================= +# Resolves the _diag/pages file conflict problem +# +# Deployed on: 192.168.0.110 (awoooi-runner) +# Invocation: run by a systemd timer every 5 minutes +# +# Version: v1.0 +# Created: 2026-03-29 (Taipei time zone) +# Author: Claude Code (Runner stability fix) +# ============================================================================= + +set -euo pipefail + +RUNNER_DIR="/home/wooo/actions-runner-awoooi" +DIAG_PAGES_DIR="${RUNNER_DIR}/_diag/pages" +LOG_FILE="/var/log/runner-diag-cleanup.log" +MAX_AGE_MINUTES=10 + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" 2>/dev/null || true # best-effort: if LOG_FILE is not writable tee exits non-zero, which under pipefail + errexit would abort the script; stdout is still written +} + +# Only clean up while the Runner is not executing a Job +check_runner_idle() { + # Check whether a Job is currently running (a process matching "Runner.Worker") + if pgrep -f "Runner.Worker" > /dev/null 2>&1; then + return 1 # Runner is busy + fi + return 0 # Runner is idle +} + +cleanup_diag_pages() { + if [[ ! 
-d "$DIAG_PAGES_DIR" ]]; then + log "DIAG_PAGES_DIR not found, skipping" + return 0 + fi + + # Count the diagnostic files (`local x=$(...)` returns local's own status, so a failing pipeline here does not trip set -e) + local count=$(find "$DIAG_PAGES_DIR" -type f -name "*.log" 2>/dev/null | wc -l) + + if [[ $count -eq 0 ]]; then + return 0 + fi + + log "Found $count diagnostic files" + + # Delete files older than MAX_AGE_MINUTES + local deleted=$(find "$DIAG_PAGES_DIR" -type f -name "*.log" -mmin +${MAX_AGE_MINUTES} -delete -print 2>/dev/null | wc -l) + + if [[ $deleted -gt 0 ]]; then + log "Deleted $deleted stale files (older than ${MAX_AGE_MINUTES}m)" + fi +} + +cleanup_work_temp() { + # Clean up leftover files (older than 30 minutes) in the _work/_temp directory + local temp_dir="${RUNNER_DIR}/_work/_temp" + if [[ -d "$temp_dir" ]]; then + local deleted=$(find "$temp_dir" -type f -mmin +30 -delete -print 2>/dev/null | wc -l) + if [[ $deleted -gt 0 ]]; then + log "Deleted $deleted temp files from _work/_temp" + fi + fi +} + +main() { + # Check whether the Runner is idle; skip the whole run when it is busy + if ! check_runner_idle; then + log "Runner is busy, skipping cleanup" + exit 0 + fi + + cleanup_diag_pages + cleanup_work_temp + + log "Cleanup completed" +} + +main "$@" diff --git a/ops/runner/deploy-runner-cleanup.sh b/ops/runner/deploy-runner-cleanup.sh new file mode 100644 index 00000000..fd848363 --- /dev/null +++ b/ops/runner/deploy-runner-cleanup.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# ============================================================================= +# Deploy Runner Diagnostic Cleanup Service +# ============================================================================= +# Run this script on 192.168.0.110 (the Runner host) +# +# Usage: +# ssh wooo@192.168.0.110 +# bash /path/to/deploy-runner-cleanup.sh +# +# Version: v1.0 +# Created: 2026-03-29 (Taipei time zone) +# ============================================================================= + +set -euo pipefail + +RUNNER_DIR="/home/wooo/actions-runner-awoooi" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +echo "🚀 Deploying Runner Diagnostic Cleanup Service..." + +# 1. Copy the cleanup script into the Runner directory +echo "📋 Copying cleanup script..." 
+cp "$SCRIPT_DIR/cleanup-runner-diag.sh" "$RUNNER_DIR/" +chmod +x "$RUNNER_DIR/cleanup-runner-diag.sh" + +# 2. Install the systemd service and timer units +echo "📋 Installing systemd service..." +sudo cp "$SCRIPT_DIR/runner-diag-cleanup.service" /etc/systemd/system/ +sudo cp "$SCRIPT_DIR/runner-diag-cleanup.timer" /etc/systemd/system/ + +# 3. Reload systemd +echo "🔄 Reloading systemd..." +sudo systemctl daemon-reload + +# 4. Enable and start the timer +echo "⏰ Enabling cleanup timer..." +sudo systemctl enable --now runner-diag-cleanup.timer + +# 5. Verify (status queries are allowed to fail without aborting the script) +echo "" +echo "✅ Deployment complete!" +echo "" +echo "📊 Timer status:" +sudo systemctl status runner-diag-cleanup.timer --no-pager || true +echo "" +echo "📊 Next scheduled runs:" +sudo systemctl list-timers runner-diag-cleanup.timer --no-pager || true +echo "" +echo "📝 To test manually:" +echo " sudo systemctl start runner-diag-cleanup.service" +echo " journalctl -u runner-diag-cleanup.service -f" diff --git a/ops/runner/runner-diag-cleanup.service b/ops/runner/runner-diag-cleanup.service new file mode 100644 index 00000000..ef78cc42 --- /dev/null +++ b/ops/runner/runner-diag-cleanup.service @@ -0,0 +1,21 @@ +# ============================================================================= +# Runner Diagnostic Cleanup Service +# ============================================================================= +# Deploy: sudo cp runner-diag-cleanup.service /etc/systemd/system/ +# Enable: sudo systemctl enable --now runner-diag-cleanup.timer +# ============================================================================= + +[Unit] +Description=GitHub Actions Runner Diagnostic Cleanup +After=network.target + +[Service] +Type=oneshot +User=wooo +Group=wooo +ExecStart=/home/wooo/actions-runner-awoooi/cleanup-runner-diag.sh +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target diff --git a/ops/runner/runner-diag-cleanup.timer b/ops/runner/runner-diag-cleanup.timer new file mode 100644 index 00000000..d1169717 --- /dev/null +++ 
b/ops/runner/runner-diag-cleanup.timer @@ -0,0 +1,18 @@ +# ============================================================================= +# Runner Diagnostic Cleanup Timer +# ============================================================================= +# Runs the cleanup every 5 minutes so _diag/pages files do not pile up; activates the same-named runner-diag-cleanup.service +# ============================================================================= + +[Unit] +Description=Runner Diagnostic Cleanup Timer +# No Requires= on the service: it would also start the service every time this timer unit itself is started, ignoring OnBootSec= + +[Timer] +OnBootSec=1min +OnUnitActiveSec=5min +AccuracySec=30s +# Persistent= is intentionally omitted: it only has an effect on OnCalendar= timers, which this unit does not use + +[Install] +WantedBy=timers.target