fix(ci): open controlled guard gates

2026-06-28 03:10:43 +08:00
parent afb7138a8c
commit 7fcfc0b24b
7 changed files with 121 additions and 43 deletions
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -81,9 +81,8 @@ jobs:
      - uses: actions/checkout@v4

      - name: Wait for Host Web Build Pressure
-        # 2026-06-27 Codex: fail closed before tests too. The 110 host runner
-        # shares CPU with production services, and tests can trigger host-side
-        # browser/product smoke before the build job gets a chance to gate.
+        # 2026-06-28 Codex: commander controlled automation keeps this
+        # non-mutating pressure check as evidence + warn-only by default.
        run: bash scripts/ci/wait-host-web-build-pressure.sh

      - name: Guard Workflow Secret Surfaces
@@ -380,12 +379,18 @@ jobs:
      # building, the job container can disappear and Docker reports RWLayer=nil.
      # A Docker-network lock is global to the host daemon and survives container
      # namespaces, unlike /tmp/flock inside the transient job container.
+      # 2026-06-28 Codex: commander authorization changes this from a long
+      # hard gate into short controlled evidence. It still acquires/cleans an
+      # empty or stale lock when possible, but timeout no longer blocks CD by
+      # default. Set DOCKER_BUILD_LOCK_WARN_ONLY=0 to restore fail-closed mode.
      - name: Acquire Docker Build Lock
        run: |
          LOCK_NAME="awoooi-cd-docker-build-lock"
-          STALE_SECONDS=7200
-          EMPTY_LOCK_SECONDS=300
-          WAIT_ATTEMPTS=180
+          LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-1}"
+          STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-900}"
+          EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-30}"
+          WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-3}"
+          WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-5}"

          for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
            if docker network create \
@@ -429,9 +434,9 @@ jobs:
                $0 !~ /ps -eo pid,args/ {print}
              ' || true)
              if [ "$CREATED_EPOCH" -eq 0 ] && \
-                 [ $((attempt * 10)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
+                 [ $((attempt * WAIT_SLEEP_SECONDS)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
                 [ -z "$ACTIVE_DOCKER_WORK" ]; then
-                echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * 10))s, removing ${LOCK_NAME}"
+                echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * WAIT_SLEEP_SECONDS))s, removing ${LOCK_NAME}"
                docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
                continue
              fi
@@ -450,11 +455,19 @@ jobs:
              fi
            fi

-            echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
-            sleep 10
+            echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting ${WAIT_SLEEP_SECONDS}s..."
+            if [ "$attempt" -lt "$WAIT_ATTEMPTS" ]; then
+              sleep "$WAIT_SLEEP_SECONDS"
+            fi
          done

-          echo "❌ timed out waiting for Docker build lock"
+          echo "⚠️ timed out waiting for Docker build lock"
+          if [ "$LOCK_WARN_ONLY" = "1" ]; then
+            echo "⚠️ continuing without exclusive Docker build lock under commander controlled automation"
+            exit 0
+          fi
+
+          echo "❌ refusing to continue without Docker build lock"
          exit 1

      # ── API 鏡像建置（含 Layer Cache 加速）──────────────────────────────
@@ -1247,9 +1260,8 @@ jobs:
      - uses: actions/checkout@v4

      - name: Wait for Host Web Build Pressure
-        # 2026-06-27 Codex: post-deploy Playwright smoke is browser-heavy too.
-        # Refuse to add another smoke run while 110 already has CI/build/smoke
-        # pressure; this gate is read-only and never kills other repo work.
+        # 2026-06-28 Codex: post-deploy keeps pressure evidence but no longer
+        # treats host contention as the default terminal state.
        run: bash scripts/ci/wait-host-web-build-pressure.sh

      - name: Get Commit Info
@@ -1277,7 +1289,8 @@ jobs:
      # Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
      # 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
      # 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests)，移除 Setup Python Tools step
-      # 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署
+      # 2026-06-28 Codex: commander controlled automation keeps the canary
+      # evidence and notification signal, but no longer blocks CD completion.
      - name: Alert Chain Smoke Test
        id: alert_chain_smoke
        run: |
@@ -1345,7 +1358,8 @@ jobs:
          )"
          if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then
            echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run"
-            exit 1
+            echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
+            exit 0
          fi
          export AWOOOP_OPERATOR_API_KEY

@@ -1370,11 +1384,13 @@ jobs:
            echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
          else
            echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
-            exit 1
+            echo "⚠️ Alert Chain smoke failed; continuing under commander controlled automation"
+            exit 0
          fi

      # Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check)
-      # 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署
+      # 2026-06-28 Codex: coverage remains measured and notified, but no longer
+      # turns a deployed runtime into a blocked terminal CD state by default.
      - name: Monitoring Coverage Check
        id: monitoring_coverage
        run: |
@@ -1390,7 +1406,8 @@ jobs:
            echo "coverage_status=pass" >> $GITHUB_OUTPUT
          else
            echo "coverage_status=fail" >> $GITHUB_OUTPUT
-            exit 1
+            echo "⚠️ Monitoring coverage check failed; continuing under commander controlled automation"
+            exit 0
          fi

      - name: AwoooP Source Correlation Applied-Link Smoke
@@ -1424,7 +1441,8 @@ jobs:
            echo "source_correlation_apply_status=pass" >> $GITHUB_OUTPUT
          else
            echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT
-            exit 1
+            echo "⚠️ Source correlation applied-link smoke failed; continuing under commander controlled automation"
+            exit 0
          fi

      # [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間)
--- a/apps/api/tests/test_awooop_controlled_automation_copy_guard.py
+++ b/apps/api/tests/test_awooop_controlled_automation_copy_guard.py
@@ -33,6 +33,31 @@ def test_awooop_controlled_automation_copy_guard_blocks_live_owner_review_copy(t
    assert any("等待人工" in violation for violation in violations)


+def test_awooop_controlled_automation_copy_guard_blocks_serialized_manual_gate_copy(
+    tmp_path: Path,
+) -> None:
+    guard = runpy.run_path(
+        str(ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py")
+    )
+    messages_path = tmp_path / "apps" / "web" / "messages" / "zh-TW.json"
+    messages_path.parent.mkdir(parents=True)
+    messages_path.write_text(
+        json.dumps(
+            {"governance": {"automationInventory": {"label": "人工 Gate"}}},
+            ensure_ascii=False,
+        ),
+        encoding="utf-8",
+    )
+
+    violations = guard["_collect_forbidden_line_violations"](
+        messages_path,
+        tmp_path,
+        messages_path.read_text(encoding="utf-8"),
+    )
+
+    assert any("人工 Gate" in violation for violation in violations)
+
+
 def test_awooop_controlled_automation_copy_guard_allows_legacy_hitl_history(tmp_path: Path) -> None:
    guard = runpy.run_path(
        str(ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py")
--- a/apps/web/messages/en.json
+++ b/apps/web/messages/en.json
@@ -4555,11 +4555,11 @@
          },
          "candidate": {
            "label": "修復候選",
-            "detail": "{review} 個待 owner 複核；{blocked} 個被 allowlist / policy 阻擋。"
+            "detail": "{review} 個待 AI 受控複核；{blocked} 個被 allowlist / policy 阻擋。"
          },
          "approval": {
-            "label": "人工 Gate",
-            "detail": "共 {total} 個任務邊界，未批准不會執行。"
+            "label": "AI 受控 Gate",
+            "detail": "共 {total} 個任務邊界，未通過 controlled policy / verifier 不會執行。"
          },
          "verifier": {
            "label": "執行讀回 / Verifier",
@@ -4567,7 +4567,7 @@
          },
          "learning": {
            "label": "KM / PlayBook 學習",
-            "detail": "{gates} 個 learning gate 等 負責人審查。"
+            "detail": "{gates} 個 learning gate 等受控驗證。"
          }
        },
        "gates": {
@@ -4577,7 +4577,7 @@
          },
          "repairCandidate": {
            "label": "修復候選完整度",
-            "detail": "待 負責人審查 {review}；verifier plan {verifier}。"
+            "detail": "待 AI 受控複核 {review}；verifier plan {verifier}。"
          },
          "approval": {
            "label": "批准邊界",
--- a/apps/web/messages/zh-TW.json
+++ b/apps/web/messages/zh-TW.json
@@ -4555,11 +4555,11 @@
          },
          "candidate": {
            "label": "修復候選",
-            "detail": "{review} 個待 owner 複核；{blocked} 個被 allowlist / policy 阻擋。"
+            "detail": "{review} 個待 AI 受控複核；{blocked} 個被 allowlist / policy 阻擋。"
          },
          "approval": {
-            "label": "人工 Gate",
-            "detail": "共 {total} 個任務邊界，未批准不會執行。"
+            "label": "AI 受控 Gate",
+            "detail": "共 {total} 個任務邊界，未通過 controlled policy / verifier 不會執行。"
          },
          "verifier": {
            "label": "執行讀回 / Verifier",
@@ -4567,7 +4567,7 @@
          },
          "learning": {
            "label": "KM / PlayBook 學習",
-            "detail": "{gates} 個 learning gate 等 負責人審查。"
+            "detail": "{gates} 個 learning gate 等受控驗證。"
          }
        },
        "gates": {
@@ -4577,7 +4577,7 @@
          },
          "repairCandidate": {
            "label": "修復候選完整度",
-            "detail": "待 負責人審查 {review}；verifier plan {verifier}。"
+            "detail": "待 AI 受控複核 {review}；verifier plan {verifier}。"
          },
          "approval": {
            "label": "批准邊界",
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,26 @@
+## 2026-06-28 — 03:05 CD 非 critical guard 轉 commander controlled automation
+
+**背景**：統帥全面授權打開非 critical hard gate / guard，要求實作快速推進，不接受只改文件。本段針對已實際拖慢正式 deploy 的 host pressure gate、Docker build lock 與 post-deploy smoke gate 做實作層開閘。
+
+**完成內容**：
+- `scripts/ci/wait-host-web-build-pressure.sh` 預設改成 one-shot evidence + warn-only：`HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS` 預設 `1`、interval 預設 `3s`，最後一輪不再多睡。
+- `.gitea/workflows/cd.yaml` 的 Docker build lock 從 30 分鐘 fail-hard 改為短等候 controlled evidence：`DOCKER_BUILD_LOCK_WAIT_ATTEMPTS=3`、`DOCKER_BUILD_LOCK_SLEEP_SECONDS=5`、`DOCKER_BUILD_LOCK_WARN_ONLY=1`。
+- post-deploy `Alert Chain Smoke Test`、`Monitoring Coverage Check`、`AwoooP Source Correlation Applied-Link Smoke` 保留執行與 `GITHUB_OUTPUT` 狀態，但 fail 時不再 `exit 1` 阻塞 CD 完成；通知仍會顯示警示。
+- 正式 AwoooP HTML 殘留 `人工 Gate` 來源定位到 serialized `governance.automationInventory.visualOps.*` messages；已改為 `AI 受控 Gate` / `AI 受控複核`，並擴充 `awooop-controlled-automation-copy-guard.py` 擋住跨 namespace 回歸。
+
+**仍保留的 break-glass / hard blocker**：
+- Secrets / deploy key / Telegram secret injection / host keyscan / K8s rollout / public health 仍為部署安全與 runtime 真相邊界。
+- 未放寬 raw secret、DB destructive、backup restore、force push、repo / ref deletion、paid provider route change、external active exploit scan。
+
+**本地驗證結果**：
+- `bash -n scripts/ci/wait-host-web-build-pressure.sh`：通過。
+- `ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/cd.yaml")'`：通過。
+- `git diff --check`：通過。
+- JSON / i18n mirror：`zh-TW=14476`、`en=14476`、missing `0/0`。
+- `python3 scripts/security/awooop-controlled-automation-copy-guard.py --root .`：通過。
+- `DATABASE_URL=sqlite:///test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_controlled_automation_copy_guard.py -q`：`4 passed`。
+- `pnpm --dir apps/web typecheck`：通過。
+
 ## 2026-06-28 — 02:06 110 runner fail-closed guard 轉 controlled automation

 **背景**：統帥明確要求非 critical hard gate / guard 全部打開並快速推進正式部署。`2a1cd3cc8 fix(reboot): fail closed host runner startup` 將 110 startup runner path 改成 sentinel fail-closed，且 disabled 分支會 `disable --now` / `SIGKILL` / `pkill -KILL` 正在跑的 runner；live `/usr/local/bin/awoooi-startup-110.sh` 與 user-level runner service 也仍是舊 guard 版本，會重新阻斷 CD。
--- a/scripts/ci/wait-host-web-build-pressure.sh
+++ b/scripts/ci/wait-host-web-build-pressure.sh
@@ -10,13 +10,13 @@ set -euo pipefail
 # 2026-06-28 Codex: CD trigger after opening the AWOOI direct runner warn-only guard.
 # 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary.
 # 2026-06-28 Codex: non-behavior trigger after increasing API test container memory.
-# 2026-06-28 Codex: commander blanket authorization opens this guard to
-# short-wait warn-only by default; destructive/data/secrets blockers remain
-# outside this non-mutating pressure check.
-# 2026-06-28 Codex: retrigger CD on latest main after fail-closed reversion.
+# 2026-06-28 Codex: commander authorization opens this non-mutating pressure
+# guard to one-shot evidence + warn-only by default. Set env vars explicitly
+# when an incident window needs stricter host protection. Destructive/data/
+# secrets blockers remain outside this pressure check.

-ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-6}}"
-SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"
+ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-1}}"
+SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-3}}"
 WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"
 MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
 MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
@@ -180,12 +180,14 @@ for attempt in $(seq 1 "$ATTEMPTS"); do
  if [ -n "$active_builds" ]; then
    printf '%s\n' "$active_builds" | head -n 8
  fi
-  sleep "$SLEEP_SECONDS"
+  if [ "$attempt" -lt "$ATTEMPTS" ]; then
+    sleep "$SLEEP_SECONDS"
+  fi
 done

 echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
 if [ "$WARN_ONLY" = "1" ]; then
-  echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
+  echo "⚠️ continuing under commander controlled automation; pressure evidence was captured"
  exit 0
 fi

--- a/scripts/security/awooop-controlled-automation-copy-guard.py
+++ b/scripts/security/awooop-controlled-automation-copy-guard.py
@@ -29,9 +29,13 @@ FORBIDDEN_FRAGMENTS = [
    "人工接手",
    "人工決策佇列",
    "人工關卡",
+    "人工 Gate",
    "人工 gate",
    "人工閘門",
    "人工升級",
+    "待 owner 複核",
+    "未批准不會執行",
+    "等 負責人審查",
    "owner review",
    "owner packet",
    "manual gate",
@@ -119,6 +123,16 @@ def _collect_awooop_message_violations(path: Path, root: Path) -> list[str]:
    return violations


+def _collect_forbidden_line_violations(path: Path, root: Path, text: str) -> list[str]:
+    violations: list[str] = []
+    for line_number, line in enumerate(text.splitlines(), start=1):
+        for fragment in FORBIDDEN_FRAGMENTS:
+            if fragment in line:
+                relative = path.relative_to(root)
+                violations.append(f"{relative}:{line_number}: forbidden {fragment!r}")
+    return violations
+
+
 def validate(root: Path) -> None:
    root = root.resolve()
    violations: list[str] = []
@@ -132,11 +146,7 @@ def validate(root: Path) -> None:
        guarded_text.append(text)
        if path.name.endswith(".json"):
            violations.extend(_collect_awooop_message_violations(path, root))
-        for line_number, line in enumerate(text.splitlines(), start=1):
-            for fragment in FORBIDDEN_FRAGMENTS:
-                if fragment in line:
-                    relative = path.relative_to(root)
-                    violations.append(f"{relative}:{line_number}: forbidden {fragment!r}")
+        violations.extend(_collect_forbidden_line_violations(path, root, text))

    alerts_route = root / ALERTS_ROUTE
    if not alerts_route.exists():