Files
awoooi/.gitea/workflows/cd.yaml
Your Name aaa617f00f
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 41s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
fix(reboot): expose windows99 management channel readback
2026-07-02 15:23:08 +08:00

2713 lines
140 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# AWOOOI CD Pipeline (Gitea Actions - 方案 B)
# =============================================================================
# Flow: Build → Push to Harbor → Deploy to K8s
# Speed-ups:
# 1. Docker layer cache → Harbor registry cache
# 2. Internal mirror → 192.168.0.110:5001 (Harbor proxy cache for DockerHub)
# 3. Docker pull/push on non-110 runners goes through the registry.wooo.work
#    HTTPS alias, so the runner's root user does not have to configure an
#    insecure registry; K8s image pulls still use the 192.168.0.110:5000
#    intranet Harbor route.
# 2026-03-29 Claude Code (ADR-039) - Retry after creating Harbor project
name: CD Pipeline

on:
  # 2026-06-29 Codex: push-to-main CD was restored only after the non-110
  # awoooi-non110-* runner lane read back registration metadata, active
  # service, capacity=1, pressure OK, rollback unit, and label target-match.
  # 110 incident runner labels and generic labels remain fail-closed via
  # ops/runner/guard-gitea-runner-pressure.py.
  push:
    branches:
      - main
  # Manual trigger is always available (re-runs and emergency deploys).
  workflow_dispatch:

# 2026-04-02 Claude Code: pre-emptive mode — a new push immediately cancels the
# older build, so only the latest commit is deployed.
# How: the concurrency group allows one run at a time per ref, and
#      cancel-in-progress: true lets the newer run replace the older one.
# Fixes: rapid successive pushes no longer pile up in the queue, and a hung
#        docker build no longer blocks later deploys.
# Safety: the deploy step itself is protected by kubectl rollout status, so a
#         half-deployed state does not occur.
concurrency:
  group: "cd-deploy-${{ github.ref }}"
  cancel-in-progress: true

env:
  HARBOR: "registry.wooo.work"
  SRE_GROUP_CHAT_ID: "-1003711974679"
  # Harbor proxy cache (internal mirror in front of DockerHub; avoids pull
  # rate limits).
  HARBOR_MIRROR: "192.168.0.110:5001"
  # OTEL CI/CD monitoring (2026-03-31 #46c - migrated to Gitea).
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24318"
  OTEL_SERVICE_NAME: "awoooi-cd"
  OTEL_RESOURCE_ATTRIBUTES: "service.version=${{ github.sha }},deployment.environment=production"
  CI_IMAGE: "registry.wooo.work/awoooi/ci-runner:act-22.04"
  # 2026-06-28 Codex: 110 runner pressure is an incident-grade capacity guard.
  # Do not flip this to warn-only until non-110 readiness is verified.
  HOST_WEB_BUILD_PRESSURE_WARN_ONLY: "0"
  # 2026-06-30 Codex: CD is now pinned to the dedicated awoooi-non110-host
  # lane. Keep the pressure guard fail-hard, but allow normal
  # below-saturation load on that dedicated lane so P0 deploys are not stuck
  # behind load5/core 0.85-1.05.
  HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE: "1.05"
  # Docker lock contention is also fail-hard during the same incident window.
  DOCKER_BUILD_LOCK_WARN_ONLY: "0"
  # 2026-05-24 Codex: deploy through the currently Ready control-plane node.
  # 120 is NotReady/SchedulingDisabled and its SSH/API endpoints are currently
  # unreachable; pinning CD to it blocks secret injection before GitOps deploy.
  K8S_SSH_HOST: "192.168.0.121"
  K8S_API_SERVER: "https://192.168.0.121:6443"
  # 2026-06-01 Codex: post-deploy health/smoke probes use the production
  # public API. The old 192.168.0.125 NodePort VIP can be absent while the
  # public route and in-cluster service are healthy, causing false failures.
  API_HEALTH_URL: "https://awoooi.wooo.work/api/v1/health"
  ALERT_CHAIN_API_URL: "https://awoooi.wooo.work"
jobs:
workflow-shape:
  # 2026-06-28 Codex: Gitea 1.25 may mark a workflow invalid when every root
  # job carries a job-level `if`. This no-op root job deliberately has no `if`
  # so cd.yaml stays parseable while the deploy jobs below remain guarded.
  timeout-minutes: 1
  runs-on: awoooi-non110-host
  steps:
    - name: Confirm CD Workflow Shape
      run: echo "cd.yaml root job present; deploy jobs remain guarded."
cancel-stale-cd:
  # 2026-06-28 Codex: visible no-op run used for controlled queue
  # cancellation. If every job is skipped, Gitea may not create a run at all,
  # and the stale pre-guard CD queue is then not superseded by concurrency.
  # Runs only for pushes whose head commit message contains the
  # `cancel-stale-cd` marker.
  if: ${{ github.event_name == 'push' && contains(github.event.head_commit.message, 'cancel-stale-cd') }}
  timeout-minutes: 3
  runs-on: awoooi-non110-host
  steps:
    - name: Confirm Stale CD Queue Cancellation
      run: |
        echo "cancel-stale-cd marker accepted; deploy jobs are intentionally skipped."
tests:
# 2026-06-28 Codex: Gitea does not consistently short-circuit `[skip ci]`
# on CD-generated deploy commits. Skip jobs explicitly so marker commits
# do not trigger a self-feeding CD loop; `cancel-stale-cd` is a
# controlled no-op trigger used only to cancel stale pre-guard runs.
if: ${{ github.event_name != 'push' || (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'cancel-stale-cd')) }}
# 2026-04-30 Codex: run the tests job on the host runner and launch the
# CI image explicitly. The act-managed job container can disappear mid-test
# with Docker RWLayer=nil on the shared 110 daemon.
timeout-minutes: 30
runs-on: awoooi-non110-host
# 2026-04-10 ogt: B5 改用 docker run 本地啟動,移除 services: 宣告
# Gitea act runner 的 services: container name 為空,導致 CI 失敗
steps:
- name: Bootstrap Host Runner Tools
  # 2026-06-28 Codex: awoooi-non110-host maps to the dedicated non-110 runner
  # lane. Tools are bootstrapped defensively because host runners can start
  # without the CI toolchain preinstalled. The install only happens when the
  # `apk` package manager is present; otherwise the step is a no-op.
  run: |
    if command -v apk >/dev/null 2>&1; then
      apk add --no-cache \
        nodejs npm git curl bash coreutils python3 \
        openssh-client docker-cli docker-cli-buildx
    fi
# Check out the repository; the following steps invoke files from the
# working tree (scripts/ci/*.sh, scripts/ci/*.js) and run git commands.
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
  # 2026-06-28 Codex: 110 runner pressure remains incident-grade and
  # fail-hard until runner work is moved or hard-limited. The guard logic
  # itself lives in the script invoked below.
  run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Guard Workflow Secret Surfaces
  # Runs the repository's Node-based secret-surface check before any step
  # that receives secrets.
  run: node scripts/ci/check-gitea-step-env-secrets.js
# 2026-03-31 ogt: tidied the alert format for readability.
- name: Get Commit Info
  id: commit
  # Step outputs:
  #   short_sha  - first 7 characters of GITHUB_SHA
  #   message    - commit subject, capped at 50 bytes
  #   start_time - epoch seconds when this step ran
  run: |
    # `head -c 50` counts bytes, so a subject containing multi-byte UTF-8
    # (e.g. CJK text) could be cut in the middle of a character and leave an
    # invalid byte sequence in the output consumed by the notify step.
    # Decoding the 50-byte prefix with errors ignored drops such a dangling
    # partial character. ASCII subjects produce exactly the same value as
    # before. Falls back to the plain byte cut when python3 is unavailable.
    if command -v python3 >/dev/null 2>&1; then
      subject="$(git log -1 --pretty=%s | python3 -c 'import sys; sys.stdout.buffer.write(sys.stdin.buffer.read()[:50].decode("utf-8", "ignore").encode("utf-8"))')"
    else
      subject="$(git log -1 --pretty=%s | head -c 50)"
    fi
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=${subject}"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
- name: Notify Pipeline Start
  # Mirrors a "deployment started" event through scripts/ci/notify-awoooi-cicd.sh.
  #
  # Security: the commit subject and actor are handed to the script through
  # `env:` instead of being spliced into the shell source with `${{ ... }}`.
  # A commit subject is author-controlled text; expanding it inside the script
  # body would let a subject such as `"; cmd; "` or `$(cmd)` execute on the
  # runner. Environment variables are passed as data and never re-parsed.
  #
  # 2026-04-16 ogt + Claude Sonnet 4.6 introduced a structured HTML message
  # here. That message (and its HTML escaping) was removed because nothing
  # consumed it once the direct Telegram fallback was disabled.
  env:
    # NOTE(review): not referenced in this step's script; presumably read by
    # scripts/ci/notify-awoooi-cicd.sh — confirm before removing.
    TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    NOTIFY_COMMIT_MSG: ${{ steps.commit.outputs.message }}
    NOTIFY_ACTOR: ${{ github.actor }}
  run: |
    # 2026-05-02 Claude Opus 4.7 + commander ogt: a notify failure must not
    # block the whole CI. Evidence: curl 400 broke build-and-deploy for 14
    # consecutive commits starting 5/1 — aligned with the existing pattern at
    # line 922. Hence the if/else: both branches exit 0.
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${NOTIFY_ACTOR}" \
      AWOOI_CICD_SUMMARY="${NOTIFY_COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD start notification mirrored through AWOOI API"
    else
      echo "AWOOI API notify failed; direct Telegram fallback disabled to preserve AwoooP receipt chain"
    fi
# 2026-03-31 ogt: Phase 22.0 CI 測試 (禁止 Mock - feedback_no_mock_testing.md)
# 2026-04-01 ogt: 持久化 venv 加速 - /opt/api-venv 跨 run 保留
# pyproject.toml hash 變才重裝,其餘直接 activate (節省 ~6-7 min)
- name: Run API Tests
run: |
# Resolve the files touched by this run into CHANGED_FILES (one path per
# line). Three sources are tried in order; each later source is consulted
# only when the previous ones produced nothing.
CHANGED_FILES=""

# Source 1: the event payload — every added/modified/removed path across all
# commits listed in it, de-duplicated in first-seen order.
if [ -r "${GITHUB_EVENT_PATH:-}" ]; then
  CHANGED_FILES="$(python3 - <<'PY'
import json
import os

with open(os.environ.get("GITHUB_EVENT_PATH"), "r", encoding="utf-8") as fh:
    event = json.load(fh)
ordered = {}
for entry in event.get("commits", []) or []:
    for field in ("added", "modified", "removed"):
        for name in entry.get(field, []) or []:
            ordered.setdefault(name, None)
for name in ordered:
    print(name)
PY
)"
fi

# Source 2: git diff between the pre-push commit and the current commit.
# Skipped when the "before" SHA is empty or all zeros, or when that commit
# is still not present locally after a best-effort fetch.
if [ -z "$CHANGED_FILES" ]; then
  BASE_SHA="${{ github.event.before }}"
  if [ -z "$BASE_SHA" ] || printf '%s' "$BASE_SHA" | grep -Eq '^0+$'; then
    : # no usable base commit
  else
    git fetch --no-tags --depth=50 origin "${GITHUB_REF_NAME:-main}" >/dev/null 2>&1 || true
    if git cat-file -e "${BASE_SHA}^{commit}" 2>/dev/null; then
      CHANGED_FILES="$(git diff --name-only "$BASE_SHA" "${GITHUB_SHA:-HEAD}")"
    fi
  fi
fi

# Source 3: the files recorded by the HEAD commit itself.
if [ -z "$CHANGED_FILES" ]; then
  CHANGED_FILES="$(git show --format= --name-only --no-renames HEAD)"
fi

printf 'CD changed files:\n%s\n' "$CHANGED_FILES"
# Decide whether this change set may use the narrow "controlled-runtime"
# profile. The flag starts at 1 and is cleared by the first changed path that
# matches none of the allow-listed patterns below (the final `*)` arm).
# Blank lines in CHANGED_FILES are ignored.
#
# Maintenance notes:
# - `case` takes the FIRST matching arm, so a pattern listed twice, or listed
#   after a broader glob that already matches it, can never be reached.
#   Four exact duplicate arms were removed from this list:
#     .gitea/workflows/awoooi-onboarding-warning-step.yaml
#     docs/operations/templates/awoooi-gitea-onboarding-warning-step.workflow.yaml
#     scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh
#     scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py
# - One arm per line keeps additions to a single-line diff.
CONTROLLED_RUNTIME_TEST_PROFILE=1
while IFS= read -r changed_file; do
  [ -z "$changed_file" ] && continue
  case "$changed_file" in
    # 2026-06-29 Codex: UI-only changes are verified by the frontend build in
    # build-and-deploy. Keep them on the narrow profile so non-110 CD does
    # not run B5's Docker/socket DB integration for copy/layout fixes.
    apps/web/*) ;;
    .gitea/workflows/cd.yaml) ;;
    # 2026-06-30 Codex: workflow secret-transport and guard-only hardening
    # must stay on the narrow profile. These changes are validated by
    # workflow-shape, the secret-surface guard, and the runner
    # pressure/profile tests; sending them to full/B5 would reintroduce the
    # heavy runner path while not increasing coverage.
    .gitea/workflows/cd-dev.yaml) ;;
    .gitea/workflows/code-review.yaml) ;;
    .gitea/workflows/deploy-alerts.yaml) ;;
    .gitea/workflows/e2e-health.yaml) ;;
    .gitea/workflows/ansible-lint.yml) ;;
    .gitea/workflows/harbor-110-local-repair.yaml) ;;
    .gitea/workflows/run-migration.yml) ;;
    scripts/ci/check-gitea-step-env-secrets.js) ;;
    # 2026-06-29 Codex: the onboarding warning-step workflow is copied in a
    # disabled workflow_dispatch-only state. Treat the source and template
    # files as controlled-runtime sources so the CD lane does not fall into
    # full/B5 just for placing the inert guarded workflow shell.
    .gitea/workflows/awoooi-onboarding-warning-step.yaml) ;;
    docs/operations/templates/awoooi-gitea-onboarding-warning-step.workflow.yaml) ;;
    # 2026-06-29 Codex: build-and-deploy writes only these GitOps deploy
    # marker files after image push. A later merge commit can carry them back
    # through CD; keep that marker merge on the controlled profile so non-110
    # CD does not fall into B5's Docker socket path just because the previous
    # deploy recorded image truth.
    k8s/awoooi-prod/04-configmap.yaml) ;;
    k8s/awoooi-prod/06-deployment-api.yaml) ;;
    k8s/awoooi-prod/08-deployment-worker.yaml) ;;
    k8s/awoooi-prod/10-deployment-auto-repair-canary.yaml) ;;
    k8s/awoooi-prod/kustomization.yaml) ;;
    product.awoooi.yaml) ;;
    docs/LOGBOOK.md) ;;
    docs/runbooks/REBOOT-RECOVERY-SOP.md) ;;
    docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md) ;;
    docs/runbooks/FULL-STACK-COLD-START-SOP.md) ;;
    docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md) ;;
    docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md) ;;
    docs/workplans/2026-07-02-commander-inserted-requirements-priority-ledger.md) ;;
    docs/schemas/product_awoooi_manifest_v1.schema.json) ;;
    docs/operations/product-awoooi-manifest-standard.snapshot.json) ;;
    docs/operations/awoooi-priority-work-order-readback.snapshot.json) ;;
    docs/operations/awooop-conversation-event-hot-path-index-apply-receipt-*.snapshot.json) ;;
    docs/operations/awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json) ;;
    docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json) ;;
    docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json) ;;
    docs/operations/awoooi-gitea-private-inventory-controlled-closeout-receipt.snapshot.json) ;;
    docs/operations/awoooi-gitea-authenticated-inventory-payload-validation.snapshot.json) ;;
    docs/security/GITEA-REPO-INVENTORY-SNAPSHOT.md) ;;
    docs/security/gitea-repo-inventory.snapshot.json) ;;
    docs/operations/p0-cicd-baseline-source-readiness.snapshot.json) ;;
    docs/operations/awoooi-gitea-onboarding-warning-step-template-copy-receipt.snapshot.json) ;;
    docs/operations/awoooi-production-deploy-readback-blocker.snapshot.json) ;;
    docs/evaluations/backup_dr_target_inventory_2026-06-04.json) ;;
    docs/evaluations/backup_dr_readiness_matrix_2026-06-04.json) ;;
    docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json) ;;
    apps/api/src/api/v1/agents.py) ;;
    apps/api/src/api/v1/iwooos.py) ;;
    apps/api/src/api/v1/webhooks.py) ;;
    apps/api/src/core/config.py) ;;
    apps/api/src/db/base.py) ;;
    apps/api/src/services/agent_replay_normalizer.py) ;;
    apps/api/src/services/ai_agent_log_intelligence_integration_readback.py) ;;
    apps/api/src/services/ai_agent_log_feedback_receipt_dry_run.py) ;;
    apps/api/src/services/ai_agent_log_post_write_verifier_dry_run.py) ;;
    apps/api/src/services/ai_agent_log_controlled_writeback_plan_readback.py) ;;
    apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py) ;;
    apps/api/src/services/ai_agent_log_controlled_writeback_dispatch.py) ;;
    apps/api/src/services/ai_agent_log_controlled_writeback_consumer_readback.py) ;;
    apps/api/src/services/ai_agent_log_controlled_writeback_consumer_apply.py) ;;
    apps/api/src/services/ai_agent_autonomous_runtime_control.py) ;;
    apps/api/src/services/ai_agent_report_truth_actionability_review.py) ;;
    apps/api/src/services/awooop_ansible_audit_service.py) ;;
    apps/api/src/services/awooop_ansible_check_mode_service.py) ;;
    apps/api/migrations/adr090e_ansible_learning_writeback_operation_type.sql) ;;
    apps/api/migrations/adr090e_ansible_learning_writeback_operation_type_down.sql) ;;
    apps/api/migrations/adr090f_log_controlled_writeback_dispatch_operation_type.sql) ;;
    apps/api/migrations/adr090f_log_controlled_writeback_dispatch_operation_type_down.sql) ;;
    apps/api/src/services/auto_approve.py) ;;
    apps/api/src/services/decision_fusion.py) ;;
    apps/api/src/services/heartbeat_report_service.py) ;;
    apps/api/src/services/credential_escrow_evidence_intake_readiness.py) ;;
    apps/api/src/services/gitea_authenticated_inventory_payload_validation.py) ;;
    apps/api/src/services/gitea_owner_coverage_attestation_validation.py) ;;
    apps/api/src/services/gitea_private_inventory_closeout_validation.py) ;;
    apps/api/src/services/gitea_private_inventory_p0_scorecard.py) ;;
    apps/api/src/services/gitea_workflow_runner_owner_attestation_request.py) ;;
    apps/api/src/services/reboot_auto_recovery_slo_scorecard.py) ;;
    apps/api/src/services/reboot_auto_recovery_drill_preflight.py) ;;
    apps/api/src/services/stockplatform_public_api_runtime_readback.py) ;;
    apps/api/src/services/stockplatform_public_api_controlled_recovery_preflight.py) ;;
    apps/api/src/services/harbor_registry_controlled_recovery_preflight.py) ;;
    apps/api/src/services/harbor_registry_controlled_recovery_receipt.py) ;;
    apps/api/src/services/iwooos_security_operating_system.py) ;;
    apps/api/Dockerfile) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_dashboard.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_owner_package.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_owner_response_preflight.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_template_copy_apply_gate.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_template_copy_execution_plan.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_template_copy_receipt.py) ;;
    apps/api/src/services/awoooi_gitea_onboarding_warning_step_runtime_enablement_gate.py) ;;
    apps/api/src/services/awoooi_new_product_onboarding_page_model.py) ;;
    apps/api/src/services/awoooi_onboarding_reminder_contract.py) ;;
    apps/api/src/services/awoooi_onboarding_source_contracts.py) ;;
    apps/api/src/services/awoooi_priority_work_order_readback.py) ;;
    apps/api/src/services/awoooi_product_onboarding_guard.py) ;;
    apps/api/src/services/p0_cicd_baseline_source_readiness.py) ;;
    apps/api/src/services/product_awoooi_manifest_standard.py) ;;
    apps/api/src/api/v1/platform/events.py) ;;
    apps/api/src/jobs/ai_slo_watchdog_job.py) ;;
    apps/api/src/models/knowledge.py) ;;
    apps/api/src/models/playbook.py) ;;
    apps/api/src/services/auto_repair_service.py) ;;
    apps/api/src/services/awoooi_production_deploy_readback_blocker.py) ;;
    apps/api/src/services/backup_dr_target_inventory.py) ;;
    apps/api/src/services/backup_dr_readiness_matrix.py) ;;
    apps/api/src/services/decision_manager.py) ;;
    apps/api/src/services/delivery_closure_workbench.py) ;;
    apps/api/src/services/platform_operator_service.py) ;;
    apps/api/src/services/telegram_gateway.py) ;;
    apps/api/tests/test_agent_replay_normalizer.py) ;;
    apps/api/tests/test_ai_agent_log_intelligence_integration_readback_api.py) ;;
    apps/api/tests/test_ai_agent_log_feedback_receipt_dry_run_api.py) ;;
    apps/api/tests/test_ai_agent_log_post_write_verifier_dry_run_api.py) ;;
    apps/api/tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py) ;;
    apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py) ;;
    apps/api/tests/test_ai_agent_log_controlled_writeback_dispatch_api.py) ;;
    apps/api/tests/test_ai_agent_log_controlled_writeback_consumer_readback_api.py) ;;
    apps/api/tests/test_ai_agent_log_controlled_writeback_consumer_apply_api.py) ;;
    apps/api/tests/test_ai_agent_autonomous_runtime_control.py) ;;
    apps/api/tests/test_ai_agent_report_truth_actionability_review.py) ;;
    apps/api/tests/test_ai_agent_report_truth_actionability_review_api.py) ;;
    apps/api/tests/test_awooop_truth_chain_service.py) ;;
    apps/api/tests/test_shadow_auto_approve.py) ;;
    apps/api/tests/test_destructive_patterns.py) ;;
    apps/api/tests/test_approval_pending_visibility.py) ;;
    apps/api/tests/test_awooop_operator_timeline_labels.py) ;;
    apps/api/tests/test_config_url_validation.py) ;;
    apps/api/tests/test_delivery_closure_workbench_api.py) ;;
    apps/api/tests/test_runtime_bootstrap_guards.py) ;;
    apps/api/tests/test_backup_dr_target_inventory.py) ;;
    apps/api/tests/test_backup_dr_target_inventory_api.py) ;;
    apps/api/tests/test_backup_dr_readiness_matrix.py) ;;
    apps/api/tests/test_backup_dr_readiness_matrix_api.py) ;;
    apps/api/tests/test_credential_escrow_evidence_intake_readiness_api.py) ;;
    apps/api/tests/test_gitea_private_inventory_p0_scorecard_api.py) ;;
    apps/api/tests/test_gitea_workflow_runner_owner_attestation_request_api.py) ;;
    apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py) ;;
    apps/api/tests/test_stockplatform_public_api_runtime_readback.py) ;;
    apps/api/tests/test_stockplatform_public_api_controlled_recovery_preflight.py) ;;
    apps/api/tests/test_harbor_registry_controlled_recovery_preflight.py) ;;
    apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py) ;;
    apps/api/tests/test_iwooos_security_operating_system.py) ;;
    apps/api/tests/test_iwooos_wazuh_prod_manifest.py) ;;
    apps/api/tests/test_awoooi_production_deploy_readback_blocker.py) ;;
    apps/api/tests/test_awoooi_priority_work_order_readback_api.py) ;;
    apps/api/tests/e2e_network_test.py) ;;
    apps/api/tests/test_p0_cicd_baseline_source_readiness_api.py) ;;
    apps/api/tests/test_product_awoooi_manifest_standard_api.py) ;;
    apps/api/tests/test_trust_drift_watchdog.py) ;;
    # NOTE(review): the next three arms are unreachable — `apps/web/*` at the
    # top of this list already matches them (in `case`, `*` also matches
    # `/`). Kept verbatim in case other tooling greps this file for them;
    # confirm and delete.
    apps/web/src/app/\[locale\]/governance/tabs/events-tab.tsx) ;;
    apps/web/src/app/\[locale\]/governance/tabs/queue-tab.tsx) ;;
    apps/web/src/app/\[locale\]/governance/tabs/slo-tab.tsx) ;;
    ops/runner/read-public-gitea-actions-queue.py) ;;
    ops/runner/README.md) ;;
    ops/runner/check-awoooi-non110-runner-readiness.sh) ;;
    ops/runner/install-awoooi-non110-runner-user-service.sh) ;;
    ops/runner/register-awoooi-110-controlled-cd-lane-drain.sh) ;;
    ops/runner/test_read_public_gitea_actions_queue.py) ;;
    ops/runner/test_cd_controlled_runtime_profile.py) ;;
    ops/runner/test_check_awoooi_non110_runner_readiness.py) ;;
    ops/runner/test_install_awoooi_non110_runner_user_service.py) ;;
    ops/runner/test_register_awoooi_110_controlled_cd_lane_drain.py) ;;
    ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py) ;;
    ops/runner/test_verify_awoooi_non110_cd_closure.py) ;;
    ops/runner/awoooi-cd-lane-drain.service) ;;
    ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh) ;;
    ops/runner/verify-awoooi-non110-cd-closure.py) ;;
    docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json) ;;
    docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json) ;;
    ops/reboot-recovery/full-stack-cold-start-baseline.yml) ;;
    ops/monitoring/alerts-unified.yml) ;;
    ops/monitoring/alerts.yml) ;;
    scripts/dev/awoooi-navigation-coverage-guard.py) ;;
    scripts/ci/wait-host-web-build-pressure.sh) ;;
    # 2026-07-01 Codex: backup freshness/readback scripts are covered by
    # shell syntax checks and focused exporter tests. Keep them off B5 so a
    # metadata/script recovery patch does not require the host Docker socket.
    scripts/backup/backup-awoooi-frequent.sh) ;;
    scripts/backup/backup-status.sh) ;;
    scripts/backup/gitea-repo-bundle-backup.sh) ;;
    scripts/backup/tests/test_backup_status_contract.py) ;;
    scripts/ops/backup-alert-label-contract-check.py) ;;
    scripts/ops/backup-health-textfile-exporter.py) ;;
    scripts/ops/docker-disk-pressure-retention-cleanup.py) ;;
    scripts/ops/gitea-queue-hook-backlog-playbook.py) ;;
    scripts/ops/host-runaway-process-exporter.py) ;;
    scripts/ops/host-sustained-load-controller.py) ;;
    scripts/ops/host-sustained-load-evidence.py) ;;
    scripts/ops/tests/test_backup_health_textfile_exporter.py) ;;
    scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py) ;;
    scripts/ops/tests/test_gitea_queue_hook_backlog_playbook.py) ;;
    scripts/ops/tests/test_host_runaway_process_exporter.py) ;;
    scripts/ops/tests/test_host_pressure_alert_contract.py) ;;
    scripts/reboot-recovery/deploy-to-110.sh) ;;
    scripts/reboot-recovery/deploy-to-188.sh) ;;
    scripts/reboot-recovery/enforce-110-runner-failclosed.sh) ;;
    scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh) ;;
    scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh) ;;
    scripts/reboot-recovery/awoooi-startup.sh) ;;
    scripts/reboot-recovery/awoooi-startup.service) ;;
    scripts/reboot-recovery/dr-escrow-evidence-checklist.py) ;;
    scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py) ;;
    scripts/reboot-recovery/post-reboot-owner-response-preflight.py) ;;
    scripts/reboot-recovery/post-start-quick-check.sh) ;;
    scripts/reboot-recovery/reboot-recovery-readiness-audit.sh) ;;
    scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh) ;;
    scripts/reboot-recovery/full-stack-cold-start-check.sh) ;;
    scripts/reboot-recovery/cold-start-textfile-exporter.sh) ;;
    scripts/reboot-recovery/install-cold-start-monitor-110.sh) ;;
    scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh) ;;
    scripts/reboot-recovery/momo-source-arrival-gate.py) ;;
    scripts/reboot-recovery/full-stack-recovery-scorecard.sh) ;;
    scripts/reboot-recovery/harbor-watchdog.sh) ;;
    scripts/reboot-recovery/awoooi-startup-110.sh) ;;
    scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh) ;;
    scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh) ;;
    scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh) ;;
    scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py) ;;
    scripts/reboot-recovery/tests/test_momo_source_arrival_gate.py) ;;
    scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service) ;;
    scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer) ;;
    scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh) ;;
    scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh) ;;
    scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh) ;;
    scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py) ;;
    scripts/reboot-recovery/windows99-vmware-autostart.ps1) ;;
    scripts/reboot-recovery/windows99-management-channel-probe.py) ;;
    scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py) ;;
    scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py) ;;
    scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py) ;;
    scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py) ;;
    scripts/reboot-recovery/tests/test_reboot_p0_operational_contract.py) ;;
    scripts/reboot-recovery/tests/test_harbor_watchdog_contract.py) ;;
    scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py) ;;
    scripts/security/gitea-private-inventory-p0-scorecard.py) ;;
    scripts/security/gitea-authenticated-inventory-payload-validator.py) ;;
    scripts/security/tests/test_gitea_private_inventory_p0_scorecard.py) ;;
    scripts/security/tests/test_gitea_authenticated_inventory_payload_validator.py) ;;
    # Anything not allow-listed above forces the full profile.
    *) CONTROLLED_RUNTIME_TEST_PROFILE=0 ;;
  esac
done <<EOF
$CHANGED_FILES
EOF
# Publish the selected API test profile in three places: the current shell
# (export), later steps ($GITHUB_ENV), and the .awoooi-cd-test-profile file
# in the working directory.
#
# Fail closed: when no changed file could be resolved at all, the allow-list
# loop never sees a path and its flag stays at its initial value of 1. An
# unknown change set is not evidence that the change is safe for the narrow
# profile, so fall back to the full profile in that case.
if [ -z "$CHANGED_FILES" ]; then
  echo "⚠️ no changed files resolved; falling back to the full API test profile"
  CONTROLLED_RUNTIME_TEST_PROFILE=0
fi
if [ "$CONTROLLED_RUNTIME_TEST_PROFILE" = "1" ]; then
  AWOOOI_CD_TEST_PROFILE=controlled-runtime
else
  AWOOOI_CD_TEST_PROFILE=full
fi
export AWOOOI_CD_TEST_PROFILE
echo "AWOOOI_CD_TEST_PROFILE=${AWOOOI_CD_TEST_PROFILE}" >> "$GITHUB_ENV"
echo "✅ ${AWOOOI_CD_TEST_PROFILE} API test profile selected"
printf '%s\n' "$AWOOOI_CD_TEST_PROFILE" > .awoooi-cd-test-profile
cat > /tmp/awoooi-api-tests.sh <<'CI_SCRIPT'
VENV=/opt/api-venv
HASH_FILE=/opt/api-venv/.deps_hash
CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')
# python3.11 是 runner 層級持久安裝,只在首次或版本消失時才 apt-get
# 2026-04-05 Claude Code: 分離 apt-get 與 venv hash-guard避免每次 deps 變更都重跑 apt
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 apt index 失敗 → 改用 --fix-missing + retry
if ! command -v python3.11 &>/dev/null; then
echo "📦 安裝 python3.11..."
apt-get clean && rm -rf /var/lib/apt/lists/*
apt-get update -q --fix-missing || apt-get update -q || true
apt-get install -y -q python3.11-venv python3.11 || \
(add-apt-repository ppa:deadsnakes/python -y 2>/dev/null && apt-get update -q && apt-get install -y -q python3.11-venv python3.11) || true
else
echo "⚡ python3.11 已安裝,跳過 apt-get"
fi
# 確保 python3.11 存在,否則 fallback 到系統 python3
if ! command -v python3.11 &>/dev/null; then
echo "⚠️ python3.11 安裝失敗,使用 python3 fallback"
ln -sf "$(which python3)" /usr/local/bin/python3.11 || true
fi
# Rebuild the cached virtualenv when it is missing or when the md5 of
# apps/api/pyproject.toml differs from the hash stored inside the venv volume.
if [ ! -d "$VENV/bin" ] || [ "$(cat "$HASH_FILE" 2>/dev/null)" != "$CURRENT_HASH" ]; then
  echo "📦 deps 已變更,重建 venv..."
  # 2026-04-17 ogt: /opt/api-venv is a volume mount, so the directory itself
  # cannot be removed; empty it with find and keep the mount point.
  find "$VENV" -mindepth 1 -delete 2>/dev/null || true
  python3.11 -m venv "$VENV"
  source "$VENV/bin/activate"
  pip install -q uv
  # Install inside a subshell so the working directory is restored even when
  # the install fails, and record the hash only after a successful install.
  # Writing the hash unconditionally would let a failed install be treated as
  # an up-to-date cache by every later run.
  if (cd apps/api && uv pip install -q -e ".[dev]"); then
    echo "$CURRENT_HASH" > "$HASH_FILE"
  else
    echo "⚠️ dependency install failed; venv hash not recorded so the next run rebuilds" >&2
  fi
else
  echo "⚡ 使用快取 venv (deps 未變更)"
  source "$VENV/bin/activate"
fi
cd apps/api
cleanup_pytest_workspace_cache() {
# Delete Python bytecode directories under tests/ and src/ plus the
# .pytest_cache directory, relative to the current working directory.
# Both commands are best-effort: stderr is discarded and a failure never
# changes the caller's exit status.
# 2026-05-19 Codex: CI image runs as root against a bind-mounted
# checkout. Remove Python cache artifacts before act-runner cleanup
# so successful jobs do not end with root-owned __pycache__ noise.
find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
rm -rf .pytest_cache 2>/dev/null || true
}
# CI 排除需外部服務的測試 (Redis pool / Ollama — 2026-04-01 Claude Code)
# 2026-04-05 Claude Code: 修正 exit code — | tail 會吃掉 segfault (exit 139)
# 改用 tee + PIPESTATUS[0] 正確捕捉 pytest 本身的 exit code
# 2026-04-05 Claude Code: 加 --ignore=tests/integration 排除需 asyncpg 連線的 DB 測試
# integration tests 在 prod K8s 部署後由 E2E Smoke Test 覆蓋
# PYTHONFAULTHANDLER=1: 若 C extension segfault輸出完整 Python stacktrace
# 2026-04-05 Claude Code: test_github_webhook.py 已根治
# 原問題: import src.main → asyncpg C ext segfault (exit 139)
# 修復: 改用最小化 app只掛載 github_webhook router不走 DB import chain
# 現在可安全加入 CI 測試
# 2026-04-22 ogt: DATABASE_URL 改為必填後,單元測試需要此 env var 讓 Settings 通過驗證
# 單元測試不連 DB此 CI placeholder 僅供 Pydantic 驗證,不產生真實連線
if [ "${AWOOOI_CD_TEST_PROFILE:-full}" = "controlled-runtime" ]; then
echo "✅ controlled-runtime profile: running focused replay/auto-approve/copy tests"
python3.11 -m py_compile \
src/core/config.py \
src/db/base.py \
src/api/v1/platform/events.py \
src/api/v1/agents.py \
src/api/v1/iwooos.py \
src/api/v1/webhooks.py \
src/jobs/ai_slo_watchdog_job.py \
src/models/knowledge.py \
src/models/playbook.py \
src/services/awoooi_production_deploy_readback_blocker.py \
src/services/agent_replay_normalizer.py \
src/services/ai_agent_log_intelligence_integration_readback.py \
src/services/ai_agent_log_feedback_receipt_dry_run.py \
src/services/ai_agent_log_post_write_verifier_dry_run.py \
src/services/ai_agent_log_controlled_writeback_plan_readback.py \
src/services/ai_agent_log_controlled_writeback_executor_readback.py \
src/services/ai_agent_log_controlled_writeback_dispatch.py \
src/services/ai_agent_log_controlled_writeback_consumer_readback.py \
src/services/ai_agent_log_controlled_writeback_consumer_apply.py \
src/services/ai_agent_autonomous_runtime_control.py \
src/services/awooop_ansible_audit_service.py \
src/services/awooop_ansible_check_mode_service.py \
src/services/auto_repair_service.py \
src/services/auto_approve.py \
src/services/backup_dr_target_inventory.py \
src/services/backup_dr_readiness_matrix.py \
src/services/decision_fusion.py \
src/services/delivery_closure_workbench.py \
src/services/heartbeat_report_service.py \
src/services/credential_escrow_evidence_intake_readiness.py \
src/services/gitea_authenticated_inventory_payload_validation.py \
src/services/gitea_owner_coverage_attestation_validation.py \
src/services/gitea_private_inventory_closeout_validation.py \
src/services/gitea_private_inventory_p0_scorecard.py \
src/services/gitea_workflow_runner_owner_attestation_request.py \
src/services/reboot_auto_recovery_slo_scorecard.py \
src/services/reboot_auto_recovery_drill_preflight.py \
src/services/stockplatform_public_api_runtime_readback.py \
src/services/stockplatform_public_api_controlled_recovery_preflight.py \
src/services/harbor_registry_controlled_recovery_preflight.py \
src/services/harbor_registry_controlled_recovery_receipt.py \
src/services/iwooos_security_operating_system.py \
src/services/awoooi_gitea_onboarding_warning_step_dashboard.py \
src/services/awoooi_gitea_onboarding_warning_step_owner_package.py \
src/services/awoooi_gitea_onboarding_warning_step_owner_response_preflight.py \
src/services/awoooi_gitea_onboarding_warning_step_template_copy_apply_gate.py \
src/services/awoooi_gitea_onboarding_warning_step_template_copy_execution_plan.py \
src/services/awoooi_gitea_onboarding_warning_step_template_copy_receipt.py \
src/services/awoooi_gitea_onboarding_warning_step_runtime_enablement_gate.py \
src/services/awoooi_new_product_onboarding_page_model.py \
src/services/awoooi_onboarding_reminder_contract.py \
src/services/awoooi_onboarding_source_contracts.py \
src/services/awoooi_priority_work_order_readback.py \
src/services/awoooi_product_onboarding_guard.py \
src/services/p0_cicd_baseline_source_readiness.py \
src/services/product_awoooi_manifest_standard.py \
src/services/platform_operator_service.py \
src/services/telegram_gateway.py
python3.11 -m py_compile \
../../scripts/reboot-recovery/dr-escrow-evidence-checklist.py \
../../scripts/reboot-recovery/post-reboot-owner-response-preflight.py \
../../scripts/reboot-recovery/momo-source-arrival-gate.py \
../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py \
../../scripts/reboot-recovery/windows99-management-channel-probe.py \
../../scripts/ops/backup-alert-label-contract-check.py \
../../scripts/ops/backup-health-textfile-exporter.py \
../../scripts/ops/docker-disk-pressure-retention-cleanup.py \
../../scripts/ops/gitea-queue-hook-backlog-playbook.py \
../../scripts/ops/host-runaway-process-exporter.py \
../../scripts/ops/host-sustained-load-controller.py \
../../scripts/ops/host-sustained-load-evidence.py \
../../scripts/security/gitea-private-inventory-p0-scorecard.py \
../../scripts/security/gitea-authenticated-inventory-payload-validator.py
python3.11 -c "import yaml; yaml.safe_load(open('../../ops/monitoring/alerts-unified.yml')); print('alerts-unified YAML OK')"
python3.11 -c "import yaml; yaml.safe_load(open('../../ops/monitoring/alerts.yml')); print('alerts YAML OK')"
python3.11 -c "import yaml; yaml.safe_load(open('../../ops/reboot-recovery/full-stack-cold-start-baseline.yml')); print('full-stack-cold-start-baseline YAML OK')"
bash -n \
../../ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh \
../../ops/runner/check-awoooi-non110-runner-readiness.sh \
../../ops/runner/install-awoooi-non110-runner-user-service.sh \
../../ops/runner/register-awoooi-110-controlled-cd-lane-drain.sh \
../../scripts/reboot-recovery/deploy-to-110.sh \
../../scripts/reboot-recovery/deploy-to-188.sh \
../../scripts/reboot-recovery/enforce-110-runner-failclosed.sh \
../../scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh \
../../scripts/reboot-recovery/awoooi-startup.sh \
../../scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh \
../../scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh \
../../scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh \
../../scripts/reboot-recovery/post-start-quick-check.sh \
../../scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh \
../../scripts/reboot-recovery/full-stack-cold-start-check.sh \
../../scripts/reboot-recovery/cold-start-textfile-exporter.sh \
../../scripts/reboot-recovery/install-cold-start-monitor-110.sh \
../../scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh \
../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh \
../../scripts/reboot-recovery/harbor-watchdog.sh \
../../scripts/reboot-recovery/awoooi-startup-110.sh \
../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh \
../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh \
../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh \
../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh \
../../scripts/backup/backup-awoooi-frequent.sh \
../../scripts/backup/backup-status.sh \
../../scripts/backup/gitea-repo-bundle-backup.sh
bash -n ../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest \
tests/test_agent_replay_normalizer.py \
tests/test_ai_agent_log_intelligence_integration_readback_api.py \
tests/test_ai_agent_log_feedback_receipt_dry_run_api.py \
tests/test_ai_agent_log_post_write_verifier_dry_run_api.py \
tests/test_ai_agent_log_controlled_writeback_plan_readback_api.py \
tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py \
tests/test_ai_agent_log_controlled_writeback_dispatch_api.py \
tests/test_ai_agent_log_controlled_writeback_consumer_readback_api.py \
tests/test_ai_agent_log_controlled_writeback_consumer_apply_api.py \
tests/test_ai_agent_autonomous_runtime_control.py \
tests/test_awooop_truth_chain_service.py \
tests/test_shadow_auto_approve.py \
tests/test_destructive_patterns.py \
tests/test_approval_pending_visibility.py \
tests/test_awooop_operator_timeline_labels.py::test_outbound_timeline_title_labels_runbook_review \
tests/test_config_url_validation.py \
tests/test_delivery_closure_workbench_api.py \
tests/test_runtime_bootstrap_guards.py \
tests/test_backup_dr_target_inventory.py \
tests/test_backup_dr_target_inventory_api.py \
tests/test_backup_dr_readiness_matrix.py \
tests/test_backup_dr_readiness_matrix_api.py \
tests/test_credential_escrow_evidence_intake_readiness_api.py \
tests/test_gitea_private_inventory_p0_scorecard_api.py \
tests/test_gitea_workflow_runner_owner_attestation_request_api.py \
tests/test_reboot_auto_recovery_slo_scorecard_api.py \
tests/test_stockplatform_public_api_runtime_readback.py \
tests/test_stockplatform_public_api_controlled_recovery_preflight.py \
tests/test_harbor_registry_controlled_recovery_preflight.py \
tests/test_harbor_registry_controlled_recovery_receipt.py \
tests/test_iwooos_security_operating_system.py \
tests/test_awoooi_production_deploy_readback_blocker.py \
tests/test_awoooi_priority_work_order_readback_api.py \
tests/e2e_network_test.py::TestHMACVerification::test_valid_hmac_signature \
tests/test_p0_cicd_baseline_source_readiness_api.py \
tests/test_product_awoooi_manifest_standard_api.py \
tests/test_trust_drift_watchdog.py \
../../ops/runner/test_read_public_gitea_actions_queue.py \
../../ops/runner/test_cd_controlled_runtime_profile.py \
../../ops/runner/test_check_awoooi_non110_runner_readiness.py \
../../ops/runner/test_install_awoooi_non110_runner_user_service.py \
../../ops/runner/test_register_awoooi_110_controlled_cd_lane_drain.py \
../../ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py \
../../ops/runner/test_verify_awoooi_non110_cd_closure.py \
../../scripts/backup/tests/test_backup_status_contract.py \
../../scripts/ops/tests/test_backup_health_textfile_exporter.py \
../../scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py \
../../scripts/ops/tests/test_gitea_queue_hook_backlog_playbook.py \
../../scripts/ops/tests/test_host_runaway_process_exporter.py \
../../scripts/ops/tests/test_host_pressure_alert_contract.py \
../../scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py \
../../scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py \
../../scripts/reboot-recovery/tests/test_momo_source_arrival_gate.py \
../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py \
../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py \
../../scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py \
../../scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py \
../../scripts/reboot-recovery/tests/test_reboot_p0_operational_contract.py \
../../scripts/reboot-recovery/tests/test_harbor_watchdog_contract.py \
../../scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py \
../../scripts/security/tests/test_gitea_private_inventory_p0_scorecard.py \
../../scripts/security/tests/test_gitea_authenticated_inventory_payload_validator.py \
-v --tb=short -x -p no:cacheprovider \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
else
DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \
PYTHONFAULTHANDLER=1 python3.11 -m pytest tests/ -v --tb=short -x -p no:cacheprovider \
--ignore=tests/integration \
--ignore=tests/test_anomaly_counter.py \
--ignore=tests/test_global_repair_cooldown.py \
--ignore=tests/test_redis_multisig.py \
--ignore=tests/test_model_regression.py \
--ignore=tests/test_prompt_validation.py \
--ignore=tests/e2e_network_test.py \
2>&1 | tee /tmp/pytest-output.txt; PYTEST_EXIT=${PIPESTATUS[0]}
fi
tail -60 /tmp/pytest-output.txt
cleanup_pytest_workspace_cache
exit $PYTEST_EXIT
CI_SCRIPT
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-api-tests" \
-e AWOOOI_CD_TEST_PROFILE="${AWOOOI_CD_TEST_PROFILE:-full}" \
--cpus "2.0" \
--memory "6g" \
--memory-swap "8g" \
-v "$PWD:/workspace" \
-v /tmp/awoooi-api-tests.sh:/tmp/awoooi-api-tests.sh:ro \
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash /tmp/awoooi-api-tests.sh
# ── 整合測試 B5 (2026-04-10) ──────────────────────────────────────────
# B5 整合測試 — postgres-test 由 services: 提供localhost:15432 直連
# 2026-04-10 Claude Sonnet 4.6: 用 psql 直連 localhost:15432 初始化 schema
# (docker exec 在 act runner 內無法取得 service container name)
# B5: Gitea act runner 的 services: 實作與 GitHub Actions 不同
# service container 啟動後需直連,但 act 的 container name 可能為空
# 2026-04-10 ogt: 改用 docker run 本地啟動取代 services: 宣告
# 2026-04-19 ogt + Claude Opus 4.7: cd 連續 2 次 fail (run 984/985)
# 真因: act runner 把 ci-runner 跑在獨立 user-defined network,
# pg-test-b5 預設用 host bridge → 兩邊隔離無法連 (172.17.0.2 timeout)
# 修法: 把 pg-test-b5 加入 act task 的 network,用 container name 連線
- name: Integration Tests (B5 — 真實 DB)
  run: |
    # Fall back to the .awoooi-cd-test-profile marker file when the profile
    # variable is not set in this step's environment.
    if [ -z "${AWOOOI_CD_TEST_PROFILE:-}" ] && [ -f .awoooi-cd-test-profile ]; then
      AWOOOI_CD_TEST_PROFILE="$(tr -d '\r\n' < .awoooi-cd-test-profile)"
      export AWOOOI_CD_TEST_PROFILE
    fi
    echo "B5 effective test profile=${AWOOOI_CD_TEST_PROFILE:-full}"
    # The controlled-runtime lane skips the DB integration suite entirely.
    if [ "${AWOOOI_CD_TEST_PROFILE:-full}" = "controlled-runtime" ]; then
      echo "✅ controlled-runtime profile: B5 DB integration unchanged; skipping B5 for this narrow release lane"
      exit 0
    fi
    # The script below runs inside the CI image (see docker run at the end of
    # this step). The quoted heredoc delimiter keeps it literal: nothing is
    # expanded by the outer shell.
    cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
    set -euo pipefail
    cd apps/api
    # Install the psql client when the image does not already provide it.
    if ! command -v psql &>/dev/null; then
      apt-get install -y -q postgresql-client
    fi
    if ! docker info >/dev/null 2>&1; then
      echo "BLOCKER b5_docker_socket_unavailable"
      echo "NEXT_ACTION ensure_b5_ci_container_runs_with_docker_socket_permission_then_retry_cd"
      exit 65
    fi
    # Always remove the test database container and Python cache artifacts,
    # including when set -e aborts the script (schema init failure) or the
    # readiness check exits 66. Without the trap those paths skipped cleanup
    # and left pg-test-b5 running. The trap does not alter the exit status.
    # 2026-05-20 Codex: B5 imports shared tests helpers, so cleanup the
    # whole tests tree to avoid root-owned __pycache__ act-runner noise.
    cleanup_b5() {
      docker rm -f pg-test-b5 2>/dev/null || true
      find tests src -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true
      rm -rf .pytest_cache 2>/dev/null || true
    }
    trap cleanup_b5 EXIT
    # 2026-04-19 ogt + Claude Opus 4.7 v3: proactively create a shared network.
    # Earlier, the ACT_NET grep did not match in the c0f3509 run, so the job
    # fell back to the default bridge and container-name DNS stopped working.
    # Root cause: the default bridge has no container-name DNS; a user-defined
    # network is required. Fix: create 'b5-test-net' (idempotent) and attach
    # both this CI container and pg-test-b5 to it.
    B5_NET="b5-test-net"
    docker network create "$B5_NET" 2>/dev/null || true
    # Attach the current CI container (hostname == short container id) to the
    # network. When it is already attached, docker network connect fails and
    # the error is swallowed by || true.
    docker network connect "$B5_NET" "$HOSTNAME" 2>/dev/null || true
    echo "B5 shared network: $B5_NET (ci-runner hostname: $HOSTNAME)"
    # Start the test DB on the shared network; it is reached by the container
    # name 'pg-test-b5'. Any leftover container from an earlier run is removed.
    docker rm -f pg-test-b5 2>/dev/null || true
    docker run -d --name pg-test-b5 \
      --network="$B5_NET" \
      -e POSTGRES_DB=awoooi_test \
      -e POSTGRES_USER=awoooi \
      -e POSTGRES_PASSWORD=awoooi_test_2026 \
      pgvector/pgvector:pg16
    # Wait for readiness via the container name: up to 30 probes, 2s apart.
    B5_DB_READY=0
    for i in $(seq 1 30); do
      if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then
        B5_DB_READY=1
        break
      fi
      sleep 2
    done
    if [ "$B5_DB_READY" != "1" ]; then
      echo "BLOCKER b5_pg_test_container_not_ready"
      echo "NEXT_ACTION inspect_b5_test_network_and_docker_socket_then_retry_cd"
      docker ps --filter name=pg-test-b5 --format 'b5_container={{.Names}} status={{.Status}}' || true
      exit 66
    fi
    # Initialize the schema.
    # NOTE(review): psql runs without ON_ERROR_STOP, so SQL errors inside the
    # file do not fail this command — confirm whether that is intended.
    PGPASSWORD=awoooi_test_2026 psql \
      -h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
      -f tests/integration/setup_test_schema.sql
    # Run the tests.
    # B5 integration tests, strict mode (2026-04-13 ogt: Break-Glass removed).
    # -m integration overrides the pyproject.toml addopts "-m 'not integration'"
    # so the marked tests are collected.
    # 2026-04-22 ogt: DATABASE_URL became required, so the import chain needs
    # it for Settings validation.
    # The pytest exit code is captured instead of aborting under set -e.
    DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
    TEST_DATABASE_URL="postgresql+asyncpg://awoooi:awoooi_test_2026@pg-test-b5:5432/awoooi_test?ssl=disable" \
      /opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py -v --tb=short -m integration -p no:cacheprovider || PYTEST_EXIT=$?
    # Cleanup happens in the EXIT trap installed above.
    exit "${PYTEST_EXIT:-0}"
    CI_SCRIPT
    # Run the script in the CI image as root with the host Docker socket
    # mounted, so it can start the sibling pg-test-b5 container, and with the
    # shared venv cache volume mounted at /opt/api-venv.
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
      --user 0:0 \
      --cpus "2.0" \
      --memory "2g" \
      -v "$PWD:/workspace" \
      -v /tmp/awoooi-b5-tests.sh:/tmp/awoooi-b5-tests.sh:ro \
      -v /var/run/docker.sock:/var/run/docker.sock \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash /tmp/awoooi-b5-tests.sh
# Workspace cleanup step; the condition below makes it run regardless of the
# outcome of the earlier steps in this job.
- name: Clean Test Workspace Artifacts
if: always()
env:
# The CI image reference is handed to the cleanup script through this variable.
HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
run: bash scripts/ci/cleanup-host-runner-workspace.sh
- name: Notify Pipeline Failure
  # 2026-04-30 Codex: tests job failure notifier; no jq dependency for host parity.
  if: failure()
  env:
    TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    # Workflow expressions are passed through the environment instead of being
    # interpolated into the script text: a commit subject containing quotes,
    # backticks or $(...) would otherwise be executed by the shell.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    ACTOR: ${{ github.actor }}
  run: |
    # Mirror the failure through the AWOOI notify script. A notify failure is
    # only logged; the direct Telegram fallback is intentionally disabled, so
    # no Telegram message body is built in this step.
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=tests \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD tests failure notification mirrored through AWOOI API"
    else
      echo "AWOOI API notify failed; direct Telegram fallback disabled to preserve AwoooP receipt chain"
    fi
# Job header for the image build and deploy stage. It waits for the tests job
# (needs), is capped at 60 minutes, and targets the awoooi-non110-host runner
# label.
build-and-deploy:
# 2026-06-28 Codex: keep CD-generated `[skip ci]` deploy commits and
# `cancel-stale-cd` queue-cleaning commits from re-entering build/deploy.
# 2026-07-01 Codex: metadata-only controlled-runtime fixes already run the
# focused tests above; do not spend the Docker build lock or deploy marker
# when no app image can change.
# The condition is always true for non-push events; for push events it is true
# only when the head commit message contains none of `[skip ci]`,
# `cancel-stale-cd`, or `[metadata-only]`.
if: ${{ github.event_name != 'push' || (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'cancel-stale-cd') && !contains(github.event.head_commit.message, '[metadata-only]')) }}
# 2026-04-30 Codex: Docker builds run on the host runner. Long docker build
# steps were killing the transient act job container with RWLayer=nil.
needs: [tests]
timeout-minutes: 60
runs-on: awoooi-non110-host
steps:
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: keep the host-mode runner self-healing before
  # actions/checkout@v4 and Telegram failure notifications run.
  run: |
    # Nothing to install when the runner has no apk package manager.
    if ! command -v apk >/dev/null 2>&1; then
      exit 0
    fi
    # Install the tools the following steps rely on (node for checkout,
    # docker CLI and buildx for the image builds, plus common utilities).
    apk add --no-cache nodejs npm git curl bash coreutils python3 openssh-client docker-cli docker-cli-buildx
# Check out the repository so the scripts/ci/* helpers invoked by the
# following steps exist in the workspace.
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
# 2026-06-27 Codex: post-deploy smoke is also browser-heavy. Refuse to
# add another smoke run while active CI/build/smoke pressure is present.
# NOTE(review): the comment above talks about post-deploy smoke, but this step
# sits at the start of build-and-deploy, and a step with the same name and
# command runs again right before "Acquire Docker Build Lock" — confirm
# whether both gates are intended.
run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Get Commit Info
  id: commit
  run: |
    # Publish step outputs consumed by later steps via steps.commit.outputs.*:
    #   short_sha  - first 7 characters of GITHUB_SHA
    #   message    - commit subject, truncated to 50 bytes
    #   start_time - epoch seconds when this step ran
    # The output file path is quoted and written through one grouped redirect
    # so a path containing spaces cannot split into several words.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "$GITHUB_OUTPUT"
- name: Notify Build Deploy Start
  env:
    # Workflow expressions are passed through the environment instead of being
    # interpolated into the script text: a commit subject containing quotes,
    # backticks or $(...) would otherwise be executed by the shell.
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    ACTOR: ${{ github.actor }}
  run: |
    # Announce the start of build-and-deploy through the AWOOI notify script.
    # A notification failure is logged and never fails the job.
    if AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 建置部署開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy start notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy start notification failed (non-fatal)"
    fi
- name: Login to Harbor
  env:
    HARBOR_PASSWORD: ${{ secrets.HARBOR_PASSWORD }}
    HARBOR_USERNAME: ${{ secrets.HARBOR_USERNAME }}
  run: |
    # Log in to the Harbor registry with bounded retries:
    #   1. refuse to start when the 110 host metrics report high pressure;
    #   2. probe https://<registry>/v2/ and run docker login on 200/401;
    #   3. on any other status, try one controlled watchdog repair
    #      (only possible when this runner is the 110 host itself);
    #   4. after LOGIN_ATTEMPTS failures, print operator hints and fail.
    HARBOR_REGISTRY="${{ env.HARBOR }}"
    LOGIN_ATTEMPTS="${HARBOR_LOGIN_ATTEMPTS:-3}"
    LOGIN_SLEEP_SECONDS="${HARBOR_LOGIN_SLEEP_SECONDS:-5}"
    WATCHDOG="/usr/local/bin/harbor-watchdog.sh"
    HARBOR_110_NODE_EXPORTER_URL="${HARBOR_110_NODE_EXPORTER_URL:-http://192.168.0.110:9100/metrics}"
    HARBOR_110_MAX_LOAD5_PER_CORE="${HARBOR_110_MAX_LOAD5_PER_CORE:-1.25}"
    HARBOR_110_MAX_GITEA_CPU_CORES="${HARBOR_110_MAX_GITEA_CPU_CORES:-3.0}"

    # Print the HTTP status code of the registry /v2/ endpoint. curl errors
    # are tolerated so the caller can normalize the output to "000".
    probe_registry_v2() {
      curl --silent --show-error \
        --output /dev/null \
        --write-out "%{http_code}" \
        --max-time 10 \
        "https://${HARBOR_REGISTRY}/v2/" || true
    }

    # Succeed only when a local IPv4 address is 192.168.0.110.
    host_has_110_ip() {
      command -v ip >/dev/null 2>&1 || return 1
      ip -o -4 addr show 2>/dev/null | grep -q " 192.168.0.110/"
    }

    # Run the watchdog directly as root, otherwise through non-interactive sudo.
    run_watchdog() {
      if [ "$(id -u)" = "0" ]; then
        "${WATCHDOG}" "$@"
      else
        sudo -n "${WATCHDOG}" "$@"
      fi
    }

    # Numeric comparison: succeed when $1 > $2 (awk handles decimals).
    greater_than() {
      awk -v left="$1" -v right="$2" 'BEGIN { exit !(left > right) }'
    }

    # Read the 110 metrics endpoint and fail when the 5-minute load per core
    # or the gitea container CPU usage exceeds its threshold. Unavailable
    # metrics or missing series are not treated as a blocker.
    preflight_110_capacity() {
      metrics="$(curl --silent --show-error --max-time 5 "${HARBOR_110_NODE_EXPORTER_URL}" || true)"
      if [ -z "${metrics}" ]; then
        echo "harbor_110_capacity_metrics_unavailable=1"
        return 0
      fi
      # The opening brace of the label set is matched with the bracket
      # expression [{]. The previous pattern "\\{" sat inside single quotes,
      # so awk received an escaped backslash and required a literal "\" before
      # "{"; metric lines never contain one, so both values were always empty
      # and this preflight could never block.
      load5_per_core="$(printf '%s\n' "${metrics}" | awk '$1 ~ /^awoooi_host_load5_per_core[{]/ && $0 ~ /host=\"110\"/ { value = $NF } END { print value }')"
      gitea_cpu_cores="$(printf '%s\n' "${metrics}" | awk '$1 ~ /^docker_container_cpu_cores[{]/ && $0 ~ /container_name=\"gitea\"/ { value = $NF } END { print value }')"
      if [ -n "${load5_per_core}" ]; then
        echo "harbor_110_load5_per_core=${load5_per_core}"
        if greater_than "${load5_per_core}" "${HARBOR_110_MAX_LOAD5_PER_CORE}"; then
          echo "BLOCKER harbor_110_host_pressure_high load5_per_core=${load5_per_core} threshold=${HARBOR_110_MAX_LOAD5_PER_CORE}"
          return 1
        fi
      fi
      if [ -n "${gitea_cpu_cores}" ]; then
        echo "harbor_110_gitea_cpu_cores=${gitea_cpu_cores}"
        if greater_than "${gitea_cpu_cores}" "${HARBOR_110_MAX_GITEA_CPU_CORES}"; then
          echo "BLOCKER harbor_110_gitea_cpu_saturated cpu_cores=${gitea_cpu_cores} threshold=${HARBOR_110_MAX_GITEA_CPU_CORES}"
          return 1
        fi
      fi
      return 0
    }

    # Run the watchdog --check and, only when it does not already report
    # harbor_ready=true, a single --repair-once. Returns 0 when Harbor is
    # ready or the repair succeeded; returns 1 when the repair is disabled,
    # this is not the 110 host, the watchdog is missing, the check fails or
    # lacks its marker, or the repair fails.
    controlled_harbor_repair_once() {
      if [ "${AWOOOI_CD_HARBOR_CONTROLLED_REPAIR:-1}" != "1" ]; then
        echo "harbor_controlled_repair_skipped=disabled"
        return 1
      fi
      if ! host_has_110_ip; then
        echo "harbor_controlled_repair_skipped=not_110_host"
        return 1
      fi
      if [ ! -x "${WATCHDOG}" ]; then
        echo "harbor_controlled_repair_skipped=watchdog_missing"
        return 1
      fi
      echo "harbor_controlled_repair_check_start=1"
      # errexit is suspended around the watchdog calls so their exit codes
      # can be captured and reported.
      set +e
      check_output="$(run_watchdog --check 2>&1)"
      check_rc=$?
      set -e
      printf '%s\n' "${check_output}" | sed -n '1,80p'
      echo "harbor_controlled_repair_check_rc=${check_rc}"
      if [ "${check_rc}" -ne 0 ]; then
        return 1
      fi
      if ! printf '%s\n' "${check_output}" | grep -q "AWOOOI_HARBOR_WATCHDOG_CHECK"; then
        echo "harbor_controlled_repair_blocked=missing_watchdog_check_marker"
        return 1
      fi
      if printf '%s\n' "${check_output}" | grep -q "harbor_ready=true"; then
        echo "harbor_controlled_repair_skipped=already_ready"
        return 0
      fi
      echo "harbor_controlled_repair_once_start=1"
      set +e
      repair_output="$(run_watchdog --repair-once 2>&1)"
      repair_rc=$?
      set -e
      printf '%s\n' "${repair_output}" | sed -n '1,140p'
      echo "harbor_controlled_repair_once_rc=${repair_rc}"
      [ "${repair_rc}" -eq 0 ]
    }

    attempt=1
    repair_attempted=0
    if ! preflight_110_capacity; then
      echo "NEXT_ACTION wait_for_110_load_to_normalize_then_rerun_harbor_watchdog_check"
      echo "NEXT_ACTION if_110_load_stays_high_use_local_console: sudo /usr/local/bin/recover-110-control-path-and-harbor-local.sh --check"
      exit 1
    fi
    while [ "${attempt}" -le "${LOGIN_ATTEMPTS}" ]; do
      registry_status="$(probe_registry_v2)"
      if [ -z "${registry_status}" ]; then
        registry_status="000"
      fi
      # 200 and 401 both mean the registry API answered; 401 is the normal
      # unauthenticated response.
      if [ "${registry_status}" = "200" ] || [ "${registry_status}" = "401" ]; then
        # The password is supplied on stdin so it never appears in argv.
        if printf '%s\n' "${HARBOR_PASSWORD}" | \
          docker login "${HARBOR_REGISTRY}" \
            -u "${HARBOR_USERNAME}" \
            --password-stdin; then
          echo "harbor_login_ready=1"
          exit 0
        fi
        echo "harbor_login_attempt=${attempt} docker_login_failed"
      else
        echo "harbor_login_attempt=${attempt} registry_v2_status=${registry_status}"
        # The controlled repair is attempted at most once per step run. After
        # a successful repair the loop re-probes immediately without
        # consuming an attempt.
        if [ "${repair_attempted}" = "0" ]; then
          repair_attempted=1
          if controlled_harbor_repair_once; then
            registry_status="$(probe_registry_v2)"
            if [ -z "${registry_status}" ]; then
              registry_status="000"
            fi
            echo "harbor_controlled_repair_public_registry_v2_status=${registry_status}"
            continue
          fi
        fi
      fi
      if [ "${attempt}" -ge "${LOGIN_ATTEMPTS}" ]; then
        echo "BLOCKER harbor_registry_public_route_unavailable registry_v2_status=${registry_status}"
        echo "NEXT_ACTION run_on_110_local_console_or_restored_ssh: sudo /usr/local/bin/harbor-watchdog.sh --check"
        echo "NEXT_ACTION if_check_confirms_unhealthy_on_110: sudo /usr/local/bin/harbor-watchdog.sh --repair-once"
        echo "NEXT_ACTION combined_110_control_path_then_harbor: sudo /usr/local/bin/recover-110-control-path-and-harbor-local.sh --apply-all"
        echo "NEXT_ACTION controlled_workflow_dispatch: .gitea/workflows/harbor-110-local-repair.yaml"
        exit 1
      fi
      sleep "${LOGIN_SLEEP_SECONDS}"
      attempt=$((attempt + 1))
    done
# 2026-05-21 Codex: AWOOI workflow concurrency and the Docker network
# lock only protect AWOOI/Docker work. Other repos can still run
# host-side Next/Turbo builds on the same 110 runner and starve this
# deploy. Wait for those foreign web builds before starting our image
# build; the gate is read-only and never kills another process.
# NOTE(review): this job declares runs-on: awoooi-non110-host, so the
# "same 110 runner" wording above may be stale — confirm which host this
# gate is meant to observe.
- name: Wait for Host Web Build Pressure
run: bash scripts/ci/wait-host-web-build-pressure.sh
# 2026-04-30 Codex: Gitea act-runner shares one Docker daemon across repos.
# When another repo starts a heavy docker build while AWOOOI Web is still
# building, the job container can disappear and Docker reports RWLayer=nil.
# A Docker-network lock is global to the host daemon and survives container
# namespaces, unlike /tmp/flock inside the transient job container.
# 2026-06-28 Codex: 110 runner pressure remains incident-grade; Docker
# build lock contention is fail-hard until non-110 readiness is verified.
- name: Acquire Docker Build Lock
run: |
# Acquire a host-wide build lock represented by a Docker network named
# LOCK_NAME: successfully creating the network is treated as taking the lock.
# With the defaults the step polls 180 times, 10s apart (about 30 minutes).
# While waiting it removes the lock network in three cases:
#   - its creation time cannot be parsed, no docker build/push process is
#     visible, and more than twice EMPTY_LOCK_SECONDS has been spent waiting;
#   - it is older than EMPTY_LOCK_SECONDS and no docker build/push process
#     is visible;
#   - it is older than STALE_SECONDS, regardless of visible processes.
# On timeout the step fails unless DOCKER_BUILD_LOCK_WARN_ONLY=1.
LOCK_NAME="awoooi-cd-docker-build-lock"
LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-0}"
STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-7200}"
EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-300}"
WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-180}"
WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-10}"
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
# Try to take the lock; on success export its name through GITHUB_ENV so the
# release step can remove it.
if docker network create \
--label awoooi.ci-lock=docker-build \
--label awoooi.owner=cd-pipeline \
"$LOCK_NAME" >/dev/null 2>&1; then
echo "DOCKER_BUILD_LOCK=${LOCK_NAME}" >> "$GITHUB_ENV"
echo "✅ Docker build lock acquired: ${LOCK_NAME}"
exit 0
fi
# The lock exists (or creation failed): read its creation timestamp.
CREATED_AT=$(docker network inspect "$LOCK_NAME" \
--format '{{.Created}}' 2>/dev/null || true)
if [ -n "$CREATED_AT" ]; then
# 2026-05-03 ogt: fix stale detection — Docker returns "2006-01-02 15:04:05.999999999 -0700 MST".
# date -d rejects the nanosecond fraction and the trailing timezone abbreviation
# (CST/MST, ...), which left CREATED_EPOCH=0 so stale detection never fired.
# 2026-06-18 Codex: the act-runner container may lack GNU date / python3;
# node is installed by bootstrap and serves as a stable fallback parser for
# Docker's CreatedAt.
# 2026-06-19 Codex: Docker / Gitea runner may return ISO
# `2026-06-18T16:20:00.123456789Z`; with CREATED_EPOCH=0 an
# empty lock never self-clears and the next deploy stalls for the full 30 minutes.
# Parsing order: date -d on the cleaned string, then node, then python3;
# CREATED_EPOCH falls back to 0 when every parser fails.
CREATED_CLEAN=$(echo "$CREATED_AT" | sed 's/\.[0-9]*//' | sed 's/ [A-Z][A-Z]*$//')
CREATED_EPOCH=$(date -d "$CREATED_CLEAN" +%s 2>/dev/null || \
node -e 'const raw = process.argv[1] || ""; const base = raw.replace(/\.\d+/, "").replace(/\s+[A-Z]{2,4}$/, ""); const spaced = base.replace(/^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2})\s+([+-]\d{2})(\d{2})$/, "$1T$2$3:$4"); const iso = base.replace(/^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}:\d{2})(Z|[+-]\d{2}:?\d{2})$/, "$1T$2$3"); const candidates = [raw, base, spaced, iso]; for (const candidate of candidates) { const ms = Date.parse(candidate); if (Number.isFinite(ms)) { console.log(Math.floor(ms / 1000)); process.exit(0); } } process.exit(1);' \
"$CREATED_AT" 2>/dev/null || \
python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \
"$CREATED_AT" 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s)
LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH))
# 2026-05-05 Codex: dirty reboot / cancelled Actions can leave
# the Docker-network lock behind with no active build or push.
# Waiting the full 30m CD timeout keeps deploys queued even
# though no job is protected, so clear empty locks after 5m.
# 2026-06-18 Codex: the bracket pattern alone still matches the lock-check
# bash/awk command lines themselves; the checker must be excluded so an
# empty lock network left behind by a cancelled run can self-clear after 5 minutes.
ACTIVE_DOCKER_WORK=$(ps -eo pid,args | awk '
$0 ~ /[d]ocker (build|push)|[b]uildx build/ &&
$0 !~ /ACTIVE_DOCKER_WORK/ &&
$0 !~ /awk/ &&
$0 !~ /ps -eo pid,args/ {print}
' || true)
# Case 1: unparsable creation time, no visible docker work, and the time
# spent waiting exceeds twice EMPTY_LOCK_SECONDS.
if [ "$CREATED_EPOCH" -eq 0 ] && \
[ $((attempt * WAIT_SLEEP_SECONDS)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
[ -z "$ACTIVE_DOCKER_WORK" ]; then
echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * WAIT_SLEEP_SECONDS))s, removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
# Case 2: lock older than EMPTY_LOCK_SECONDS with no visible docker work.
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \
[ -z "$ACTIVE_DOCKER_WORK" ]; then
echo "⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
# Case 3: lock older than STALE_SECONDS, removed even if docker work is visible.
if [ "$CREATED_EPOCH" -gt 0 ] && \
[ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then
echo "⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}"
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
continue
fi
fi
echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting ${WAIT_SLEEP_SECONDS}s..."
# No sleep after the final attempt; fall through to the timeout handling.
if [ "$attempt" -lt "$WAIT_ATTEMPTS" ]; then
sleep "$WAIT_SLEEP_SECONDS"
fi
done
echo "⚠️ timed out waiting for Docker build lock"
# Warn-only mode continues without the lock; the default is to fail the step.
if [ "$LOCK_WARN_ONLY" = "1" ]; then
echo "⚠️ continuing without exclusive Docker build lock under commander controlled automation"
exit 0
fi
echo "❌ refusing to continue without Docker build lock"
exit 1
# ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
# 2026-04-01 ogt: CACHE_BUST=git_sha 確保 src/ 和 models.json 層每次重建
# deps 層 (pip install) 仍可 cache → 加速;代碼/配置層強制失效
# 2026-05-05 Codex: host runner bootstrap installs docker-cli-buildx;
# keep BuildKit enabled because the web Dockerfile uses RUN --mount.
- name: Build and Push API
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    # Build the API image from the repository root, reusing layers from the
    # previously pushed :latest image, then push both the commit-SHA tag and
    # :latest. CACHE_BUST carries the commit SHA as a build argument.
    API_IMAGE="${{ env.HARBOR }}/awoooi/api"
    docker build \
      --file apps/api/Dockerfile \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${API_IMAGE}:latest" \
      --build-arg CACHE_BUST=${{ github.sha }} \
      --tag "${API_IMAGE}:${{ github.sha }}" \
      --tag "${API_IMAGE}:latest" \
      .
    for image_tag in "${{ github.sha }}" latest; do
      docker push "${API_IMAGE}:${image_tag}"
    done
# 2026-03-31 ogt: 移除中間通知,減少訊息雜訊
# ── Web 鏡像建置(精準快取失效)──────────────────────────────
# 2026-03-30 ogt: NEXT_PUBLIC_* 必須用公網域名 (build-time 寫死)
# 2026-04-01 Claude Code: CACHE_BUST=git_sha 取代 --no-cache
# - deps 層 (pnpm install) 仍可 cache → 節省 ~2-3 min
# - COPY . . 以下由 CACHE_BUST 強制失效 → 業務邏輯/CSRF 等變更正確進入 bundle
# 2026-05-05 Codex: mirror API build mode; BuildKit required for cache mounts.
- name: Build and Push Web
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    # Build the web image from the repository root with the public API URL
    # passed as a build argument, reusing layers from the previously pushed
    # :latest image, then push both the commit-SHA tag and :latest.
    WEB_IMAGE="${{ env.HARBOR }}/awoooi/web"
    docker build \
      --file apps/web/Dockerfile \
      --build-arg NEXT_PUBLIC_API_URL=https://awoooi.wooo.work \
      --build-arg CACHE_BUST=${{ github.sha }} \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --cache-from "${WEB_IMAGE}:latest" \
      --tag "${WEB_IMAGE}:${{ github.sha }}" \
      --tag "${WEB_IMAGE}:latest" \
      .
    for image_tag in "${{ github.sha }}" latest; do
      docker push "${WEB_IMAGE}:${image_tag}"
    done
- name: Release Docker Build Lock
  if: always()
  run: |
    # DOCKER_BUILD_LOCK is only set when the acquire step took the lock, so
    # an empty value means there is nothing for this job to release.
    if [ -z "${DOCKER_BUILD_LOCK:-}" ]; then
      echo "⚡ no Docker build lock to release"
      exit 0
    fi
    # Removal is best-effort: a lock network that is already gone is not an error.
    docker network rm "$DOCKER_BUILD_LOCK" >/dev/null 2>&1 || true
    echo "✅ Docker build lock released: ${DOCKER_BUILD_LOCK}"
# 2026-03-31 ogt: 移除中間通知
# 2026-03-31 ogt: P0-1 Secrets 自動注入 (ADR-035 強制)
# 2026-03-31 ogt: 加入 AI API Keys (修復 mock_fallback 問題)
- name: Inject K8s Secrets
env:
ARGOCD_API_TOKEN: ${{ secrets.ARGOCD_API_TOKEN }}
AWOOOI_GITEA_API_TOKEN: ${{ secrets.AWOOOI_GITEA_API_TOKEN }}
AWOOOI_GITEA_WEBHOOK_SECRET: ${{ secrets.AWOOOI_GITEA_WEBHOOK_SECRET }}
AWOOOP_OPERATOR_API_KEY: ${{ secrets.AWOOOP_OPERATOR_API_KEY }}
CLAUDE_API_KEY: ${{ secrets.CLAUDE_API_KEY }}
DATABASE_URL: ${{ secrets.DATABASE_URL }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
JWT_ALGORITHM: ${{ secrets.JWT_ALGORITHM }}
JWT_SECRET: ${{ secrets.JWT_SECRET }}
LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
MIGRATION_DATABASE_URL: ${{ secrets.MIGRATION_DATABASE_URL }}
NEMOTRON_BOT_TOKEN: ${{ secrets.NEMOTRON_BOT_TOKEN }}
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
OPENCLAW_BOT_TOKEN: ${{ secrets.OPENCLAW_BOT_TOKEN }}
OPENCLAW_TG_USER_WHITELIST: ${{ secrets.OPENCLAW_TG_USER_WHITELIST }}
REDIS_URL: ${{ secrets.REDIS_URL }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
SMTP_HOST: ${{ secrets.SMTP_HOST }}
SRE_GROUP_CHAT_ID: ${{ secrets.SRE_GROUP_CHAT_ID }}
TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
WEBHOOK_HMAC_SECRET: ${{ secrets.WEBHOOK_HMAC_SECRET }}
run: |
# 2026-06-30 Codex: do not inline action secret expressions in run scripts.
# Gitea renders the shell body into job logs before masking. Keep
# secret values in process env, then read by variable name only.
secret_b64_env() {
  # Print the base64 encoding (single line, no trailing newline) of the
  # environment variable whose NAME is given as $1. The value is read from
  # the process environment by name and is never placed on a command line.
  # An unset variable encodes as the empty string.
  local env_name="$1"
  local python_bin=""
  local candidate
  # Prefer python3.11, then python3.
  for candidate in python3.11 python3; do
    if command -v "${candidate}" >/dev/null 2>&1; then
      python_bin="${candidate}"
      break
    fi
  done
  if [ -z "${python_bin}" ]; then
    # No Python available: fall back to printenv piped through base64.
    secret_value="$(printenv "${env_name}" || true)"
    printf '%s' "${secret_value}" | base64 | tr -d '\n'
    return
  fi
  # The heredoc body and its terminator must stay at column 0 of the script:
  # Python rejects an indented first statement.
  SECRET_ENV_NAME="${env_name}" "${python_bin}" - <<'PY'
import base64
import os
data = os.environ.get(os.environ["SECRET_ENV_NAME"], "").encode()
data = data[:-1] if data.endswith(b"\n") else data
print(base64.b64encode(data).decode(), end="")
PY
}
prepare_deploy_key() {
  # Install the deploy SSH private key at ${HOME}/.ssh/deploy_key with mode 600.
  # Source: the file named by AWOOOI_DEPLOY_SSH_KEY_PATH when set, otherwise the
  # destination path itself (the key is then expected to be in place already).
  # Exits the whole script with status 1 when the source key is not readable.
  # Side effect: the umask stays at 077 for the rest of the calling script.
  local target_key="${HOME}/.ssh/deploy_key"
  local source_key="${AWOOOI_DEPLOY_SSH_KEY_PATH:-${target_key}}"
  # Tighten the umask BEFORE creating anything, so a freshly created ~/.ssh
  # directory is not group/world accessible either.
  umask 077
  mkdir -p "${HOME}/.ssh"
  if [ ! -r "${source_key}" ]; then
    echo "❌ deploy ssh key file missing: ${source_key}" >&2
    exit 1
  fi
  if [ "${source_key}" != "${target_key}" ]; then
    cp "${source_key}" "${target_key}"
  fi
  chmod 600 "${target_key}"
}
TG_BOT_TOKEN_B64="$(secret_b64_env TELEGRAM_BOT_TOKEN)"
TG_CHAT_ID_B64="$(secret_b64_env SRE_GROUP_CHAT_ID)"
NVIDIA_API_KEY_B64="$(secret_b64_env NVIDIA_API_KEY)"
GEMINI_API_KEY_B64="$(secret_b64_env GEMINI_API_KEY)"
LANGFUSE_PUBLIC_KEY_B64="$(secret_b64_env LANGFUSE_PUBLIC_KEY)"
LANGFUSE_SECRET_KEY_B64="$(secret_b64_env LANGFUSE_SECRET_KEY)"
TG_USER_WHITELIST_B64="$(secret_b64_env OPENCLAW_TG_USER_WHITELIST)"
SENTRY_AUTH_TOKEN_B64="$(secret_b64_env SENTRY_AUTH_TOKEN)"
GITEA_WEBHOOK_SECRET_B64="$(secret_b64_env AWOOOI_GITEA_WEBHOOK_SECRET)"
ARGOCD_API_TOKEN_B64="$(secret_b64_env ARGOCD_API_TOKEN)"
DATABASE_URL_B64="$(secret_b64_env DATABASE_URL)"
MIGRATION_DATABASE_URL_B64="$(secret_b64_env MIGRATION_DATABASE_URL)"
REDIS_URL_B64="$(secret_b64_env REDIS_URL)"
JWT_SECRET_B64="$(secret_b64_env JWT_SECRET)"
JWT_ALGORITHM_B64="$(secret_b64_env JWT_ALGORITHM)"
WEBHOOK_HMAC_SECRET_B64="$(secret_b64_env WEBHOOK_HMAC_SECRET)"
AWOOOP_OPERATOR_API_KEY_B64="$(secret_b64_env AWOOOP_OPERATOR_API_KEY)"
SENTRY_DSN_B64="$(secret_b64_env SENTRY_DSN)"
CLAUDE_API_KEY_B64="$(secret_b64_env CLAUDE_API_KEY)"
GITEA_API_TOKEN_B64="$(secret_b64_env AWOOOI_GITEA_API_TOKEN)"
NEMOTRON_BOT_TOKEN_B64="$(secret_b64_env NEMOTRON_BOT_TOKEN)"
OPENCLAW_BOT_TOKEN_B64="$(secret_b64_env OPENCLAW_BOT_TOKEN)"
SMTP_HOST_B64="$(secret_b64_env SMTP_HOST)"
SRE_GROUP_CHAT_ID_B64="$(secret_b64_env SRE_GROUP_CHAT_ID)"
# S1/S2: 統一命名 deploy_key改用 ssh-keyscan 與強制 host key 驗證。
prepare_deploy_key
# 2026-05-13 Codex: keyscan must include ED25519 explicitly. Some
# OpenSSH builds otherwise record only RSA/ECDSA, then strict deploy
# SSH fails with "No ED25519 host key is known" after image push.
# 2026-06-13 Codex: keep deploy-time host keys in a dedicated file.
# The runner user's global known_hosts is shared by cold-start and
# backup checks for 120/188; overwriting it here caused strict SSH
# recovery gates to flap after every CD run.
# Deploy-only known_hosts file, rebuilt from a fresh keyscan of the K8s SSH host.
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
# ssh-keyscan may exit non-zero when it collects no keys. Ignore that status so
# the emptiness check below reports the failure with a clear message instead of
# the step aborting silently when errexit is active.
ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null || true
test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; }
# Strict host key checking against the deploy-only file; BatchMode disables prompts.
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" << SECRETS
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=\${K8S_API_SERVER}"
# Inject Telegram secrets (ADR-035 hard rule).
# 2026-06-12 Codex: OPENCLAW_TG_CHAT_ID is kept only for legacy-field
# compatibility; its value must match SRE_GROUP_CHAT_ID so production alerts
# cannot be routed to a different group.
# Fail closed when either value is empty: this patch is unconditional, so an
# unset action secret would otherwise overwrite the production token and chat
# id with empty strings.
if [ -z "${TG_BOT_TOKEN_B64}" ] || [ -z "${TG_CHAT_ID_B64}" ]; then
  echo "❌ TELEGRAM_BOT_TOKEN / SRE_GROUP_CHAT_ID is empty; refusing to overwrite Telegram Secrets — ADR-035"
  exit 1
fi
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"${TG_BOT_TOKEN_B64}"},
{"op":"add","path":"/data/OPENCLAW_TG_CHAT_ID","value":"${TG_CHAT_ID_B64}"}
]' || { echo "❌ Telegram Secrets patch 失敗 — ADR-035 鐵律"; exit 1; }
# 2026-03-31 ogt: 注入 AI API Keys (修復 NVIDIA/Gemini mock_fallback)
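# 2026-04-01 Claude Code: secret values must be single-line base64 so a wrapped long key cannot break the JSON patch; the B64 values used below are produced by secret_b64_env before the SSH session starts.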
# NVIDIA NIM (免費 tier)
if [ -n "${NVIDIA_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NVIDIA_API_KEY","value":"${NVIDIA_API_KEY_B64}"}
]' && echo "✅ NVIDIA_API_KEY 已注入" || echo "⚠️ NVIDIA_API_KEY patch 失敗"
else
echo "⚠️ NVIDIA_API_KEY 未設定,跳過"
fi
# Gemini (備援)
if [ -n "${GEMINI_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GEMINI_API_KEY","value":"${GEMINI_API_KEY_B64}"}
]' && echo "✅ GEMINI_API_KEY 已注入" || echo "⚠️ GEMINI_API_KEY patch 失敗"
else
echo "⚠️ GEMINI_API_KEY 未設定,跳過"
fi
# 2026-04-01 Claude Code: Langfuse LLMOps keys (補齊 CD 注入,之前只有手動設定)
if [ -n "${LANGFUSE_PUBLIC_KEY_B64}" ] && [ -n "${LANGFUSE_SECRET_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/LANGFUSE_PUBLIC_KEY","value":"${LANGFUSE_PUBLIC_KEY_B64}"},
{"op":"add","path":"/data/LANGFUSE_SECRET_KEY","value":"${LANGFUSE_SECRET_KEY_B64}"}
]' && echo "✅ LANGFUSE keys 已注入" || echo "⚠️ LANGFUSE keys patch 失敗"
else
echo "⚠️ LANGFUSE_PUBLIC_KEY/SECRET_KEY 未設定,跳過 (現有 K8s secret 值維持不變)"
fi
# 2026-04-02 Claude Code: Telegram Whitelist (授權簽核用戶 ID)
if [ -n "${TG_USER_WHITELIST_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_TG_USER_WHITELIST","value":"${TG_USER_WHITELIST_B64}"}
]' && echo "✅ TG_USER_WHITELIST 已注入" || echo "⚠️ TG_USER_WHITELIST patch 失敗"
fi
# Phase O-4.1 2026-04-02: Sentry Auth Token (Wave A.1 ADR-037)
if [ -n "${SENTRY_AUTH_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_AUTH_TOKEN","value":"${SENTRY_AUTH_TOKEN_B64}"}
]' && echo "✅ SENTRY_AUTH_TOKEN 已注入" || echo "⚠️ SENTRY_AUTH_TOKEN patch 失敗"
else
echo "⚠️ SENTRY_AUTH_TOKEN 未設定Sentry Comment API 將跳過"
fi
# ADR-059 2026-04-05 Claude Code: Gitea Webhook Secret
if [ -n "${GITEA_WEBHOOK_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_WEBHOOK_SECRET","value":"${GITEA_WEBHOOK_SECRET_B64}"}
]' && echo "✅ GITEA_WEBHOOK_SECRET 已注入" || echo "⚠️ GITEA_WEBHOOK_SECRET patch 失敗"
else
echo "⚠️ GITEA_WEBHOOK_SECRET 未設定Gitea Webhook 簽章驗證將在 prod 失效"
fi
# MCP Phase 3: ArgoCD API Token (2026-04-11 Claude Sonnet 4.6)
if [ -n "${ARGOCD_API_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/ARGOCD_API_TOKEN","value":"${ARGOCD_API_TOKEN_B64}"}
]' && echo "✅ ARGOCD_API_TOKEN 已注入" || echo "⚠️ ARGOCD_API_TOKEN patch 失敗"
else
echo "⚠️ ARGOCD_API_TOKEN 未設定ArgoCD MCP 將使用空 token"
fi
# ============================================================================
# ADR-090-B 2026-04-18 ogt + Claude Opus 4.7: L3-only 升級 L213 個 key
# ============================================================================
# 目的: 消滅「只存 K8s etcd 單點」的災難盲區Gitea Secret 成為正式真相來源
# Note: every block below keeps the same structure as the ones above (if guard + single-line base64 value + JSON patch).
# DATABASE_URL — PG 應用連線串2026-04-18 輪替)
if [ -n "${DATABASE_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/DATABASE_URL","value":"${DATABASE_URL_B64}"}
]' && echo "✅ DATABASE_URL 已注入" || echo "⚠️ DATABASE_URL patch 失敗"
else
echo "⚠️ DATABASE_URL 未設定awoooi-api 將無法連 PG"
fi
# MIGRATION_DATABASE_URL — CI migration 用 awoooi_migrator 限權帳號ADR-090-B
if [ -n "${MIGRATION_DATABASE_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/MIGRATION_DATABASE_URL","value":"${MIGRATION_DATABASE_URL_B64}"}
]' && echo "✅ MIGRATION_DATABASE_URL 已注入" || echo "⚠️ MIGRATION_DATABASE_URL patch 失敗"
fi
# REDIS_URL — Redis 連線6380 on 188
if [ -n "${REDIS_URL_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/REDIS_URL","value":"${REDIS_URL_B64}"}
]' && echo "✅ REDIS_URL 已注入" || echo "⚠️ REDIS_URL patch 失敗"
else
echo "⚠️ REDIS_URL 未設定"
fi
# JWT_SECRET / JWT_ALGORITHM — API 認證
if [ -n "${JWT_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_SECRET","value":"${JWT_SECRET_B64}"}
]' && echo "✅ JWT_SECRET 已注入" || echo "⚠️ JWT_SECRET patch 失敗"
fi
if [ -n "${JWT_ALGORITHM_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/JWT_ALGORITHM","value":"${JWT_ALGORITHM_B64}"}
]' && echo "✅ JWT_ALGORITHM 已注入" || echo "⚠️ JWT_ALGORITHM patch 失敗"
fi
# WEBHOOK_HMAC_SECRET — Alertmanager webhook HMAC 簽章
if [ -n "${WEBHOOK_HMAC_SECRET_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/WEBHOOK_HMAC_SECRET","value":"${WEBHOOK_HMAC_SECRET_B64}"}
]' && echo "✅ WEBHOOK_HMAC_SECRET 已注入" || echo "⚠️ WEBHOOK_HMAC_SECRET patch 失敗"
fi
# AWOOOP_OPERATOR_API_KEY — AwoooP Operator mutation endpoints
if [ -n "${AWOOOP_OPERATOR_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/AWOOOP_OPERATOR_API_KEY","value":"${AWOOOP_OPERATOR_API_KEY_B64}"}
]' && echo "✅ AWOOOP_OPERATOR_API_KEY 已注入" || echo "⚠️ AWOOOP_OPERATOR_API_KEY patch 失敗"
fi
# SENTRY_DSN — Sentry 錯誤追蹤(不是 auth token
if [ -n "${SENTRY_DSN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SENTRY_DSN","value":"${SENTRY_DSN_B64}"}
]' && echo "✅ SENTRY_DSN 已注入" || echo "⚠️ SENTRY_DSN patch 失敗"
fi
# CLAUDE_API_KEY — Claude 備援 LLM
if [ -n "${CLAUDE_API_KEY_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/CLAUDE_API_KEY","value":"${CLAUDE_API_KEY_B64}"}
]' && echo "✅ CLAUDE_API_KEY 已注入" || echo "⚠️ CLAUDE_API_KEY patch 失敗"
fi
# GITEA_API_TOKEN — Gitea API Token從 AWOOOI_GITEA_API_TOKEN 映射)
if [ -n "${GITEA_API_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/GITEA_API_TOKEN","value":"${GITEA_API_TOKEN_B64}"}
]' && echo "✅ GITEA_API_TOKEN 已注入" || echo "⚠️ GITEA_API_TOKEN patch 失敗"
fi
# NEMOTRON_BOT_TOKEN / OPENCLAW_BOT_TOKEN — 多 Bot 架構
if [ -n "${NEMOTRON_BOT_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/NEMOTRON_BOT_TOKEN","value":"${NEMOTRON_BOT_TOKEN_B64}"}
]' && echo "✅ NEMOTRON_BOT_TOKEN 已注入" || echo "⚠️ NEMOTRON_BOT_TOKEN patch 失敗"
fi
if [ -n "${OPENCLAW_BOT_TOKEN_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/OPENCLAW_BOT_TOKEN","value":"${OPENCLAW_BOT_TOKEN_B64}"}
]' && echo "✅ OPENCLAW_BOT_TOKEN 已注入" || echo "⚠️ OPENCLAW_BOT_TOKEN patch 失敗"
fi
# SMTP_HOST / SRE_GROUP_CHAT_ID
if [ -n "${SMTP_HOST_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SMTP_HOST","value":"${SMTP_HOST_B64}"}
]' && echo "✅ SMTP_HOST 已注入" || echo "⚠️ SMTP_HOST patch 失敗"
fi
if [ -n "${SRE_GROUP_CHAT_ID_B64}" ]; then
\$KUBECTL patch secret awoooi-secrets -n awoooi-prod --type='json' -p='[
{"op":"add","path":"/data/SRE_GROUP_CHAT_ID","value":"${SRE_GROUP_CHAT_ID_B64}"}
]' && echo "✅ SRE_GROUP_CHAT_ID 已注入" || echo "⚠️ SRE_GROUP_CHAT_ID patch 失敗"
fi
# 2026-04-06 Claude Code: Sprint 3 T2 — known_hosts Secret (Security Fix A1)
# Replaces the old approach of disabling host key verification, so the SSH
# repair path uses known host fingerprints.
# asyncssh reads /etc/ssh-mcp/known_hosts and requires a non-empty
# OpenSSH known_hosts file. Keep hosts unhashed so both asyncssh and
# CLI diagnostics can trust the same secret.
# 2026-05-02 ogt + Claude Sonnet 4.6: completeness check for every repair host.
# Root cause: a partial scan (for example 110 timing out while the others
# succeed) still passed a plain non-empty-file check, so an incomplete
# known_hosts was pushed and asyncssh then rejected every SSH connection.
# Fix: after the scan, verify that every host has an entry; if any host is
# missing, skip the update and keep the existing secret so the production SSH
# auto-repair path is not paralysed.
# Scan output goes to mktemp files instead of fixed /tmp names, so concurrent or
# leftover runs cannot clobber or pre-create them.
REPAIR_HOSTS="192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188"
KNOWN_HOSTS_TMP=\$(mktemp)
KNOWN_HOSTS_ERR=\$(mktemp)
ssh-keyscan \$REPAIR_HOSTS > "\$KNOWN_HOSTS_TMP" 2>"\$KNOWN_HOSTS_ERR" || true
EXPECTED_HOSTS=0
PRESENT=0
for ip in \$REPAIR_HOSTS; do
  EXPECTED_HOSTS=\$((EXPECTED_HOSTS + 1))
  if grep -qE "^\${ip}[[:space:]]" "\$KNOWN_HOSTS_TMP" 2>/dev/null; then
    PRESENT=\$((PRESENT + 1))
  else
    echo "⚠️ ssh-keyscan 缺主機 \${ip}"
  fi
done
if [ "\$PRESENT" -eq "\$EXPECTED_HOSTS" ]; then
  # Both updates are best-effort: a failure is reported but does not abort.
  \$KUBECTL create secret generic awoooi-repair-known-hosts \
    -n awoooi-prod \
    --from-file=known_hosts="\$KNOWN_HOSTS_TMP" \
    --dry-run=client -o yaml | \$KUBECTL apply -f - \
    && echo "✅ awoooi-repair-known-hosts Secret 已建立/更新" \
    || echo "⚠️ awoooi-repair-known-hosts Secret 建立失敗 (非致命)"
  KNOWN_HOSTS_B64=\$(base64 -w 0 "\$KNOWN_HOSTS_TMP")
  \$KUBECTL patch secret ssh-mcp-key -n awoooi-prod --type=merge \
    -p="{\"data\":{\"known_hosts\":\"\${KNOWN_HOSTS_B64}\"}}" \
    && echo "✅ ssh-mcp-key known_hosts 已更新4 台主機完整)" \
    || echo "⚠️ ssh-mcp-key known_hosts 更新失敗 (非致命)"
  rm -f "\$KNOWN_HOSTS_TMP" "\$KNOWN_HOSTS_ERR"
else
  echo "❌ ssh-keyscan 只抓到 \${PRESENT}/\${EXPECTED_HOSTS} 台主機,跳過 patch保留現有 secret"
  # Show the first lines of the scan errors to explain which host failed.
  cat "\$KNOWN_HOSTS_ERR" 2>/dev/null | head -10
  rm -f "\$KNOWN_HOSTS_TMP" "\$KNOWN_HOSTS_ERR"
fi
echo "✅ 所有 Secrets 注入完成"
SECRETS
# 2026-04-11 Claude Sonnet 4.6 (Sprint B-3 ADR-069):
# Deploy 改為 ArgoCD GitOps 模式:更新 kustomization.yaml → git push [skip ci] → ArgoCD sync
# 舊做法 (kubectl set image) 與 ArgoCD selfHeal 衝突 — ArgoCD 會 revert 任何直接 kubectl 操作
# 新做法流程:
# 1. 更新 kustomization.yaml image tag用 kustomize edit set image
# 2. Apply ConfigMap/ServiceRegistry不含 Deployment由 ArgoCD 管)
# 3. git commit [skip ci] + push → 觸發 ArgoCD automated sync
# 4. 等待 ArgoCD sync + rollout 完成
# 5. Health Check
- name: Deploy to K8s (ArgoCD GitOps)
env:
CD_PUSH_TOKEN: ${{ secrets.CD_PUSH_TOKEN }}
run: |
prepare_deploy_key() {
  # Install the deploy SSH private key at ${HOME}/.ssh/deploy_key with mode 600.
  # Source: the file named by AWOOOI_DEPLOY_SSH_KEY_PATH when set, otherwise the
  # destination path itself (the key is then expected to be in place already).
  # Exits the whole script with status 1 when the source key is not readable.
  # Side effect: the umask stays at 077 for the rest of the calling script.
  local target_key="${HOME}/.ssh/deploy_key"
  local source_key="${AWOOOI_DEPLOY_SSH_KEY_PATH:-${target_key}}"
  # Tighten the umask BEFORE creating anything, so a freshly created ~/.ssh
  # directory is not group/world accessible either.
  umask 077
  mkdir -p "${HOME}/.ssh"
  if [ ! -r "${source_key}" ]; then
    echo "❌ deploy ssh key file missing: ${source_key}" >&2
    exit 1
  fi
  if [ "${source_key}" != "${target_key}" ]; then
    cp "${source_key}" "${target_key}"
  fi
  chmod 600 "${target_key}"
}
# prepare_deploy_key creates ~/.ssh itself (under the tightened umask), so no
# separate mkdir is needed here.
prepare_deploy_key
# 2026-05-13 Codex: mirror Inject K8s Secrets host-key handling so the
# deploy job never reaches SSH with a known_hosts file missing ED25519.
# 2026-06-13 Codex: use the deploy-only known_hosts file so this
# stage cannot wipe cold-start/backup host trust for 120/188.
DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
# ssh-keyscan may exit non-zero when it collects no keys. Ignore that status so
# the emptiness check below reports the failure with a clear message instead of
# the step aborting silently when errexit is active.
ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null || true
test -s "${DEPLOY_KNOWN_HOSTS}" || { echo "❌ K8S host keyscan failed: ${K8S_SSH_HOST}"; exit 1; }
SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
# IMAGE_TAG is exported because the Python heredocs later in this step read it
# from the environment.
export IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD manages the Deployment; ConfigMaps are still applied directly) ───
apply_manifest_over_ssh() {
  # Stream one local manifest to kubectl apply -f - on the K8s SSH host.
  local manifest="$1"
  ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
    "KUBECTL='sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${{ env.K8S_API_SERVER }}'; \$KUBECTL apply -f -" \
    < "${manifest}"
}
apply_manifest_over_ssh k8s/awoooi-prod/04-configmap.yaml
echo "✅ ConfigMap 已更新"
apply_manifest_over_ssh k8s/awoooi-prod/15-service-registry-configmap.yaml
echo "✅ Service Registry ConfigMap 已更新"
# ─── Step 2: update the image tags in kustomization.yaml ───
# The host runner is not guaranteed root access, so kustomize is installed
# under the user's home directory.
export PATH="${HOME}/.local/bin:${PATH}"
if ! command -v kustomize &>/dev/null; then
  mkdir -p "${HOME}/.local/bin"
  # -f makes curl fail on an HTTP error instead of piping an error page into
  # tar; -S keeps the error message visible despite -s.
  curl -fsSL https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.3.0/kustomize_v5.3.0_linux_amd64.tar.gz \
    | tar xz -C "${HOME}/.local/bin"
  chmod +x "${HOME}/.local/bin/kustomize"
fi
# Run the edits in a subshell so the working directory of the rest of the
# script is left untouched even if an edit fails.
(
  cd k8s/awoooi-prod
  # kustomize edit set image rewrites the image name and tag in place.
  kustomize edit set image \
    192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/api:${IMAGE_TAG}
  kustomize edit set image \
    192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER=${HARBOR}/awoooi/web:${IMAGE_TAG}
)
# 2026-06-29 Codex: API deploy readback uses AWOOOI_BUILD_COMMIT_SHA
# to compare runtime image/source truth. Keep it in the same deploy
# marker commit as kustomization.yaml so the production Workbench does
# not depend on stale committed snapshots.
python3 - <<'PY'
"""Stamp the current IMAGE_TAG into two env values of the API deployment manifest."""
import os
import re
from pathlib import Path

MANIFEST = Path("k8s/awoooi-prod/06-deployment-api.yaml")
image_tag = os.environ["IMAGE_TAG"]


def replace_env_value(source: str, env_name: str) -> str:
    """Return ``source`` with the quoted value of env entry ``env_name`` set to image_tag.

    The entry is a ``- name: <env_name>`` line, optionally followed by comment
    lines, then a ``value: "..."`` line. Only the first occurrence is rewritten.
    Raises SystemExit when no such entry exists.
    """
    matcher = re.compile(
        rf'(\n\s+- name: {re.escape(env_name)}\n'
        r'(?:\s+# [^\n]*\n)*'
        r'\s+value: ")[^"]*(")'
    )
    # A callable replacement keeps backslashes in the tag from being
    # interpreted as group references.
    updated, hits = matcher.subn(
        lambda found: f"{found.group(1)}{image_tag}{found.group(2)}",
        source,
        count=1,
    )
    if hits == 0:
        raise SystemExit(f"{env_name} env block not found")
    return updated


manifest_text = MANIFEST.read_text(encoding="utf-8")
for name in ("AWOOOI_BUILD_COMMIT_SHA", "AWOOOI_DESIRED_API_IMAGE_TAG"):
    manifest_text = replace_env_value(manifest_text, name)
MANIFEST.write_text(manifest_text, encoding="utf-8")
PY
# ─── Step 3: git commit [skip ci] + push → triggers ArgoCD sync ───
git config user.email "cd@awoooi.internal"
git config user.name "AWOOOI CD"
git add k8s/awoooi-prod/kustomization.yaml k8s/awoooi-prod/06-deployment-api.yaml
# Stays empty when nothing is pushed; the wait stage then accepts any revision.
DEPLOY_REVISION=""
if git diff --cached --quiet; then
  echo "⚡ kustomization.yaml 無變化,跳過 push"
else
  git commit -m "chore(cd): deploy ${IMAGE_TAG::7} [skip ci]"
  # Push with a token (avoids needing push permission on the SSH key).
  git remote remove gitea 2>/dev/null || true
  git remote add gitea "http://wooo:${CD_PUSH_TOKEN}@192.168.0.110:3001/wooo/awoooi.git"
  # Rebase first to avoid a non-fast-forward push (other commits may have
  # landed while CI was running).
  # 2026-04-17 ogt: -X theirs — on a kustomization.yaml conflict, keep this
  # deployment's image tag.
  PUSH_STATUS=0
  {
    git fetch gitea main &&
    git rebase -X theirs gitea/main &&
    DEPLOY_REVISION=$(git rev-parse HEAD) &&
    git push gitea main
  } || PUSH_STATUS=$?
  # Always drop the token-bearing remote afterwards, on success and on failure,
  # so the push token is not left behind in .git/config on the runner.
  git remote remove gitea 2>/dev/null || true
  if [ "${PUSH_STATUS}" -ne 0 ]; then
    echo "❌ deploy marker fetch/rebase/push failed (exit ${PUSH_STATUS})" >&2
    exit "${PUSH_STATUS}"
  fi
  echo "✅ kustomization.yaml 已 push等待 ArgoCD sync 到 ${DEPLOY_REVISION:0:8}..."
fi
# ─── Step 4: 等待 ArgoCD sync + rollout ───
ROLLOUT_LOG="$(mktemp)"
set +e
ssh $SSH_OPTS "wooo@${{ env.K8S_SSH_HOST }}" \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" 2>&1 << 'ARGOCD_WAIT' | tee "$ROLLOUT_LOG"
set -e
K8S_API_SERVER="${{ env.K8S_API_SERVER }}"
KUBECTL="sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER}"
RISK_FILE="$(mktemp)"
UNKNOWN_STATUS_COUNT=0
HEALTH_FAILURE_COUNT=0
record_rollout_risk() {
local message="$1"
printf '%s\n' "$message" >> "$RISK_FILE"
echo "⚠️ Rollout risk observed: $message" >&2
}
emit_rollout_evidence() {
  # EXIT-trap handler: print machine-readable rollout evidence on stdout, then
  # delete RISK_FILE.
  #   AWOOOI_ROLLOUT_RISK=1 plus an AWOOOI_ROLLOUT_SUMMARY=... line when
  #   RISK_FILE is non-empty, otherwise AWOOOI_ROLLOUT_RISK=0.
  # Reads the globals RISK_FILE, UNKNOWN_STATUS_COUNT and HEALTH_FAILURE_COUNT.
  if [ -s "$RISK_FILE" ]; then
    local summary
    local kubectl_count
    # grep -c exits 1 on zero matches while still printing 0; || true keeps
    # that from counting as a failure.
    kubectl_count=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
    # Join the recorded risk lines with "; ". tr cannot do this: it maps one
    # character to one character, so a two-character SET2 yields only ";".
    # Control characters are stripped per line; the result is capped at 700.
    summary=$(sed 's/[[:cntrl:]]//g' "$RISK_FILE" \
      | awk 'NR > 1 { printf "; " } { printf "%s", $0 }' \
      | cut -c1-700)
    echo "AWOOOI_ROLLOUT_RISK=1"
    echo "AWOOOI_ROLLOUT_SUMMARY=unknown_status_count=${UNKNOWN_STATUS_COUNT}; health_failure_count=${HEALTH_FAILURE_COUNT}; kubectl_failure_count=${kubectl_count}; ${summary}"
  else
    echo "AWOOOI_ROLLOUT_RISK=0"
  fi
  rm -f "$RISK_FILE"
}
trap emit_rollout_evidence EXIT
# app_field JSONPATH LABEL
# Print one field of the awoooi-prod ArgoCD Application (namespace argocd),
# selected by the kubectl jsonpath expression JSONPATH.
# When kubectl fails, print the literal string Unknown and return 0 so the
# caller can keep going; the failure is recorded through record_rollout_risk
# as argocd_<LABEL>_query_failed=<first 180 bytes of kubectl output>, but only
# while fewer than 3 such entries already exist in RISK_FILE.
app_field() {
local jsonpath="$1"
local label="$2"
local output
local status
local kubectl_seen
# Suspend errexit around kubectl so its exit status can be inspected.
set +e
output=$($KUBECTL get application awoooi-prod -n argocd -o jsonpath="$jsonpath" 2>&1)
status=$?
set -e
if [ "$status" -ne 0 ]; then
# Count the query failures already recorded to cap the evidence volume.
kubectl_seen=$(grep -c '^argocd_.*_query_failed=' "$RISK_FILE" 2>/dev/null || true)
if [ "$kubectl_seen" -lt 3 ]; then
record_rollout_risk "argocd_${label}_query_failed=$(echo "$output" | head -c 180)"
fi
printf 'Unknown'
return 0
fi
printf '%s' "$output"
}
probe_public_health() {
local phase="$1"
local http_code
local status
set +e
http_code=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 3 --max-time 8 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
status=$?
set -e
if [ "$status" -ne 0 ]; then
http_code="curl_error_${status}"
fi
if [ "$http_code" != "200" ]; then
HEALTH_FAILURE_COUNT=$((HEALTH_FAILURE_COUNT + 1))
if [ "$HEALTH_FAILURE_COUNT" -le 3 ]; then
record_rollout_risk "public_health_${phase}_http=${http_code}"
fi
fi
}
# collect_argocd_resource_evidence
# Print a one-line, ';'-separated list of resources of the awoooi-prod ArgoCD
# Application that look unhealthy: resources whose sync status is not "Synced",
# and resources that report a health status other than "Healthy" (with the
# health message when present). Duplicate lines are removed and only the first
# 5 are kept. Prints nothing when no resource matches.
# When kubectl fails, prints resource_query_failed=<first 180 bytes of output>
# and returns 0.
collect_argocd_resource_evidence() {
local template
local output
local status
# Go template evaluated by kubectl; it emits one line per offending resource.
template='{{range .status.resources}}{{if ne .status "Synced"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}}{{if .health.status}} health={{.health.status}}{{end}}{{"\n"}}{{end}}{{if .health.status}}{{if ne .health.status "Healthy"}}{{.kind}}/{{.name}}{{if .namespace}} ns={{.namespace}}{{end}} sync={{.status}} health={{.health.status}}{{if .health.message}} msg={{.health.message}}{{end}}{{"\n"}}{{end}}{{end}}{{end}}'
# Suspend errexit around kubectl so its exit status can be inspected.
set +e
output=$($KUBECTL get application awoooi-prod -n argocd -o "go-template=${template}" 2>&1)
status=$?
set -e
if [ "$status" -ne 0 ]; then
local output_snippet
output_snippet=$(printf '%s' "$output" | head -c 180)
echo "resource_query_failed=${output_snippet}"
return 0
fi
# Drop blank and duplicate lines, keep 5, join with ';', strip control
# characters and any trailing ';'.
echo "$output" \
| awk 'NF && !seen[$0]++ {print}' \
| head -5 \
| tr '\n' ';' \
| sed 's/[[:cntrl:]]//g; s/;*$//'
}
validate_argocd_source_contract() {
  # Fail closed (exit 1) unless the awoooi-prod ArgoCD Application source
  # satisfies the GitOps contract:
  #   - spec.source.targetRevision is exactly "main"
  #   - spec.source.kustomize.images is empty (image truth lives in
  #     k8s/awoooi-prod/kustomization.yaml)
  # Every rejection is also recorded through record_rollout_risk.
  local target_revision
  local image_override
  target_revision=$(app_field '{.spec.source.targetRevision}' source_target_revision)
  image_override=$(app_field '{.spec.source.kustomize.images}' source_kustomize_images)
  # app_field prints the literal Unknown when the kubectl query itself failed.
  # Report that as an unreadable contract rather than as a contract violation,
  # so the error does not claim an image override or a wrong revision exists.
  if [ "$target_revision" = "Unknown" ] || [ "$image_override" = "Unknown" ]; then
    record_rollout_risk "argocd_source_contract_unreadable targetRevision=$target_revision"
    echo "❌ ArgoCD source contract could not be read (kubectl query failed); refusing to continue" >&2
    exit 1
  fi
  if [ "$target_revision" != "main" ]; then
    record_rollout_risk "argocd_source_target_revision_not_main targetRevision=$target_revision"
    echo "❌ ArgoCD source targetRevision must be main, got: $target_revision" >&2
    exit 1
  fi
  if [ -n "$image_override" ]; then
    local image_override_snippet
    image_override_snippet=$(printf '%s' "$image_override" | head -c 180)
    record_rollout_risk "argocd_source_image_override_present images=${image_override_snippet}"
    echo "❌ ArgoCD source kustomize.images override must be empty; image truth belongs in k8s/awoooi-prod/kustomization.yaml" >&2
    exit 1
  fi
}
# Wait for the ArgoCD Application to sync to the target revision: at most 36
# polls with a 5 s sleep between them (180 s of sleeping, plus query time).
# 2026-05-24 Codex: top-level Application health can stay Degraded
# without per-resource health detail. Treat that as rollout evidence,
# then let kubectl rollout status and API health decide pass/fail.
echo "⏳ 等待 ArgoCD sync..."
validate_argocd_source_contract
# Request a hard refresh of the Application; failures are ignored.
$KUBECTL annotate application awoooi-prod -n argocd \
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
for i in $(seq 1 36); do
SYNC=$(app_field '{.status.sync.status}' sync)
HEALTH=$(app_field '{.status.health.status}' health)
REVISION=$(app_field '{.status.sync.revision}' revision)
SHORT_REVISION=$(echo "$REVISION" | cut -c1-8)
SHORT_EXPECTED=$(echo "$EXPECTED_REVISION" | cut -c1-8)
echo " ArgoCD: sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
# Record public API health while waiting (evidence only; never fails the loop).
probe_public_health "argocd_wait"
# app_field yields Unknown when kubectl failed; record at most 3 such polls.
if [ "$SYNC" = "Unknown" ] || [ "$HEALTH" = "Unknown" ] || [ "$REVISION" = "Unknown" ]; then
UNKNOWN_STATUS_COUNT=$((UNKNOWN_STATUS_COUNT + 1))
if [ "$UNKNOWN_STATUS_COUNT" -le 3 ]; then
record_rollout_risk "argocd_status_unknown sync=$SYNC health=$HEALTH revision=$SHORT_REVISION expected=${SHORT_EXPECTED:-any}"
fi
fi
if [ "$SYNC" = "Synced" ]; then
# An empty EXPECTED_REVISION (nothing was pushed) accepts any synced revision.
if [ -z "$EXPECTED_REVISION" ] || [ "$REVISION" = "$EXPECTED_REVISION" ]; then
# Non-Healthy is recorded as risk evidence but does not block the rollout.
if [ "$HEALTH" != "Healthy" ]; then
RESOURCE_EVIDENCE=$(collect_argocd_resource_evidence)
if [ -n "$RESOURCE_EVIDENCE" ]; then
record_rollout_risk "argocd_health_not_healthy health=$HEALTH revision=$SHORT_REVISION resources=$RESOURCE_EVIDENCE"
else
record_rollout_risk "argocd_health_not_healthy health=$HEALTH revision=$SHORT_REVISION resources=none_visible"
fi
fi
echo "✅ ArgoCD Synced to target revision (health=$HEALTH)"
break
fi
fi
# Last poll without a match: give up.
if [ "$i" = "36" ]; then
echo "❌ ArgoCD 未在期限內同步到目標 revision"
exit 1
fi
sleep 5
done
# 確認 rollout 完成
$KUBECTL rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s
$KUBECTL rollout status deployment/awoooi-web -n awoooi-prod --timeout=120s
$KUBECTL rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s
echo "✅ 部署完成"
# Health Check
HEALTH_PASS=0
for i in 1 2 3; do
set +e
HTTP_CODE=$(curl -sS -w "%{http_code}" -o /dev/null --connect-timeout 10 --max-time 20 "${{ env.API_HEALTH_URL }}" 2>/dev/null)
CURL_STATUS=$?
set -e
if [ "$CURL_STATUS" -ne 0 ]; then
HTTP_CODE="curl_error_${CURL_STATUS}"
fi
if [ "$HTTP_CODE" = "200" ]; then
echo "✅ API 健康檢查通過"
HEALTH_PASS=1
break
fi
echo "⏳ 嘗試 #$i: HTTP $HTTP_CODE等待 10s..."
sleep 10
done
if [ "$HEALTH_PASS" = "0" ]; then
record_rollout_risk "public_health_final_failed"
echo "❌ API 健康檢查失敗"
exit 1
fi
ARGOCD_WAIT
ROLLOUT_EXIT=${PIPESTATUS[0]}
set -e
ROLLOUT_RISK="0"
ROLLOUT_SUMMARY=""
if grep -q '^AWOOOI_ROLLOUT_RISK=1$' "$ROLLOUT_LOG"; then
ROLLOUT_RISK="1"
ROLLOUT_SUMMARY=$(grep '^AWOOOI_ROLLOUT_SUMMARY=' "$ROLLOUT_LOG" | tail -1 | sed 's/^AWOOOI_ROLLOUT_SUMMARY=//' | cut -c1-700)
fi
if [ -n "${GITHUB_ENV:-}" ]; then
{
echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
} >> "$GITHUB_ENV"
fi
rm -f "$ROLLOUT_LOG"
DEPLOY_READBACK_EXIT=0
python3 - <<'PY' || DEPLOY_READBACK_EXIT=$?
"""Poll the production Workbench API until it reports this build's image tag.

Exits 0 as soon as the runtime build commit and the GitOps desired image tag
both equal the first 10 characters of IMAGE_TAG and the desired-tag readback
status is "ok". Exits 1 after DEPLOY_READBACK_ATTEMPTS polls (default 36),
sleeping DEPLOY_READBACK_SLEEP_SECONDS (default 10) between polls.
"""
import json
import os
import sys
import time
import urllib.request

expected = os.environ["IMAGE_TAG"].strip().lower()
expected_short = expected[:10]
url = "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench"
attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "36"))
sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))


def summary_text(summary, key):
    """Return summary[key] as a string; missing or falsy values become ""."""
    return str(summary.get(key) or "")


def report_attempt(attempt, detail):
    """Write one progress line for this poll to stderr."""
    print(
        f"production_deploy_readback_attempt={attempt}/{attempts};{detail}",
        file=sys.stderr,
    )


last_error = ""
for attempt in range(1, attempts + 1):
    payload = None
    fetch_error = ""
    try:
        with urllib.request.urlopen(url, timeout=20) as response:
            payload = json.load(response)
    except Exception as exc:
        # Only the exception type is reported, never its message.
        fetch_error = f"fetch_failed={type(exc).__name__}"

    if fetch_error:
        last_error = fetch_error
    else:
        # Tolerate a non-dict payload or a non-dict "summary".
        summary = payload.get("summary") if isinstance(payload, dict) else {}
        if not isinstance(summary, dict):
            summary = {}
        runtime_short = summary_text(
            summary, "production_deploy_runtime_build_commit_short_sha"
        )
        desired_short = summary_text(
            summary, "production_deploy_desired_main_api_image_tag_short_sha"
        )
        desired_status = summary_text(
            summary, "production_deploy_desired_main_api_image_tag_readback_status"
        )
        # Informational only: reported in the output, not part of the gate.
        matches_main = (
            summary.get("production_deploy_image_tag_matches_main") is True
        )
        readback_ok = (
            runtime_short == expected_short
            and desired_short == expected_short
            and desired_status == "ok"
        )
        if readback_ok:
            print(
                "✅ Production deploy readback matches this build and "
                "GitOps desired image tag "
                f"({expected_short}) on attempt {attempt}/{attempts};"
                f"matches_main={matches_main}"
            )
            raise SystemExit(0)
        last_error = (
            f"expected={expected_short};runtime={runtime_short};"
            f"desired={desired_short};desired_status={desired_status};"
            f"matches_main={matches_main}"
        )

    report_attempt(attempt, last_error)
    # No sleep after the final poll.
    if attempt < attempts:
        time.sleep(sleep_seconds)

print(
    "production_deploy_readback_mismatch=" + last_error,
    file=sys.stderr,
)
raise SystemExit(1)
PY
# Reconcile the two verdicts of this step: the remote ArgoCD/rollout wait
# (ROLLOUT_EXIT, ROLLOUT_RISK, ROLLOUT_SUMMARY) and the production deploy
# readback (DEPLOY_READBACK_EXIT).
# A failed readback fails the step with the readback's exit code.
if [ "$DEPLOY_READBACK_EXIT" -ne 0 ]; then
exit "$DEPLOY_READBACK_EXIT"
fi
# From here on the readback matched. A non-zero rollout wait is then downgraded
# to rollout risk: the summary is extended (or created) and re-published.
if [ "$ROLLOUT_EXIT" -ne 0 ]; then
if [ "$ROLLOUT_RISK" = "1" ]; then
ROLLOUT_SUMMARY="${ROLLOUT_SUMMARY}; rollout_exit=${ROLLOUT_EXIT}; production_deploy_readback_matched=true"
else
ROLLOUT_RISK="1"
ROLLOUT_SUMMARY="rollout_exit=${ROLLOUT_EXIT}; production_deploy_readback_matched=true"
fi
# Keep the summary within 700 characters.
ROLLOUT_SUMMARY=$(printf '%s' "$ROLLOUT_SUMMARY" | cut -c1-700)
# Append the updated values to GITHUB_ENV (when set) for later steps.
if [ -n "${GITHUB_ENV:-}" ]; then
{
echo "AWOOI_ROLLOUT_RISK=${ROLLOUT_RISK}"
echo "AWOOI_ROLLOUT_SUMMARY=${ROLLOUT_SUMMARY}"
} >> "$GITHUB_ENV"
fi
echo "⚠️ ArgoCD/rollout wait exited ${ROLLOUT_EXIT}, but production deploy readback matched; treating as rollout risk, not deploy failure."
fi
# Any recorded risk is mirrored as a "pending" rollout-risk notification; a
# failed notification is reported but never fails the step.
if [ "$ROLLOUT_RISK" = "1" ]; then
ACTOR="${GITHUB_ACTOR:-${{ github.actor }}}"
if AWOOI_CICD_STATUS=pending \
AWOOI_CICD_STAGE=rollout-risk \
AWOOI_CICD_JOB_NAME="AWOOOI 部署完成但仍有風險證據" \
AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
AWOOI_CICD_SUMMARY="${ROLLOUT_SUMMARY}" \
scripts/ci/notify-awoooi-cicd.sh; then
echo "✅ CI/CD rollout risk notification mirrored through AWOOI API"
else
echo "⚠️ CI/CD rollout risk notification failed (non-fatal)"
fi
fi
# Reaching this point means the readback matched: the step succeeds.
exit 0
- name: Notify Build Deploy Success
  run: |
    END_TIME=$(date +%s)
    # start_time comes from the "commit" step output. If it is empty or not a
    # plain integer, fall back to END_TIME (duration 0): otherwise the
    # arithmetic below is a syntax error that fails this step, and the
    # failure notification would fire after a successful deploy.
    START_TIME="${{ steps.commit.outputs.start_time }}"
    case "${START_TIME}" in
      ''|*[!0-9]*) START_TIME="${END_TIME}" ;;
    esac
    DURATION=$((END_TIME - START_TIME))
    ACTOR="${{ github.actor }}"
    # The notification is best-effort: a failure is logged but never fatal.
    if AWOOI_CICD_STATUS=success \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 建置部署完成" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
      AWOOI_CICD_SUMMARY="Image build/push + ArgoCD rollout + API health passed" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build-deploy success notification mirrored through AWOOI API"
    else
      echo "⚠️ CI/CD build-deploy success notification failed (non-fatal)"
    fi
# 2026-04-09 Claude Sonnet 4.6: Sprint 5.2 — 同步 ops 腳本到 188 (ollama user)
# 188 deploy key is rotated and must not be read by this disabled step.
# 腳本: docker-health-monitor.sh + pg-backup.sh + notify-awoooi-ops.sh
# 感知層與備份通知都先走 AWOOI API/AwoooPTelegram 直發只保留 API 離線 fallback。
- name: Sync Ops Scripts to 188
  # 2026-05-13 Codex T14e/P0:
  # Disabled until the 188 ops sync path is moved to a file-secret or
  # Ansible-controlled channel. Gitea Actions logs step env values, and
  # multiline SSH secrets must not be exposed through CD logs.
  # The constant-false condition below means this step never runs.
  if: ${{ false }}
  continue-on-error: true
  run: echo "188 ops script sync disabled pending secure key rotation path"
- name: Notify Pipeline Failure
  if: failure()
  env:
    # NOTE(review): not referenced by the script below; kept in case
    # scripts/ci/notify-awoooi-cicd.sh reads it — confirm and drop if unused.
    TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    # The commit message is attacker-influenced text. Pass it through the
    # environment instead of inlining the expression into the script, where
    # quotes or $(...) in the message would be executed by the shell.
    CD_COMMIT_MESSAGE: ${{ steps.commit.outputs.message }}
    CD_ACTOR: ${{ github.actor }}
  run: |
    ACTOR="${CD_ACTOR:-}"
    # Prefer the rollout summary exported by the deploy step; fall back to the
    # commit message when no rollout evidence was recorded.
    FAILURE_SUMMARY="${AWOOI_ROLLOUT_SUMMARY:-${CD_COMMIT_MESSAGE:-}}"
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=build-and-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${FAILURE_SUMMARY}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD build failure notification mirrored through AWOOI API"
    else
      echo "AWOOI API notify failed; direct Telegram fallback disabled to preserve AwoooP receipt chain"
    fi
post-deploy-checks:
# 2026-06-28 Codex: post-deploy checks belong to real deploy runs; skip
# marker/no-op commits already accounted for by the previous deploy run.
# 2026-07-01 Codex: `[metadata-only]` commits do not roll a new image, so
# post-deploy smokes would only retest the previous production artifact.
if: ${{ github.event_name != 'push' || (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'cancel-stale-cd') && !contains(github.event.head_commit.message, '[metadata-only]')) }}
needs: [build-and-deploy]
timeout-minutes: 30
# 2026-04-30 Codex: keep post-deploy on the host runner too. Playwright
# install-deps can also kill the act-managed job container with RWLayer=nil.
runs-on: awoooi-non110-host
steps:
- name: Bootstrap Host Runner Tools
  # 2026-05-05 Codex: post-deploy also uses checkout and curl-based
  # notifications, so it needs the same runner bootstrap as earlier jobs.
  run: |
    # Packages are only installed where apk exists; elsewhere this is a no-op.
    command -v apk >/dev/null 2>&1 || exit 0
    apk add --no-cache nodejs npm git curl bash coreutils python3 openssh-client docker-cli docker-cli-buildx
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
# 2026-06-28 Codex: post-deploy is browser-heavy; fail closed on host
# pressure until runner load is isolated from production.
run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Get Commit Info
  id: commit
  run: |
    # Publish step outputs: the 7-character short SHA, the commit subject
    # truncated to 50 bytes, and the start time in epoch seconds.
    # The output path is quoted so a path containing spaces or glob characters
    # cannot be split or expanded by the shell.
    {
      echo "short_sha=${GITHUB_SHA::7}"
      echo "message=$(git log -1 --pretty=%s | head -c 50)"
      echo "start_time=$(date +%s)"
    } >> "${GITHUB_OUTPUT}"
- name: Notify Post Deploy Checks Start
  # Best-effort "running" notification through scripts/ci/notify-awoooi-cicd.sh;
  # a notifier failure is logged and never fails the step.
  run: |
    START_SUMMARY="Alert Chain / Source Link / Monitoring / Smoke gates started"
    if ! AWOOI_CICD_STATUS=running \
      AWOOI_CICD_STAGE=post-deploy-checks \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署後驗證開始" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${{ github.actor }}" \
      AWOOI_CICD_SUMMARY="${START_SUMMARY}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "⚠️ CI/CD post-deploy start notification failed (non-fatal)"
    else
      echo "✅ CI/CD post-deploy start notification mirrored through AWOOI API"
    fi
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
# Verifies the alert chain end to end: API Health + Webhook + OTEL + Event Exporter.
# 2026-04-05 Claude Code cache optimisation: uses /opt/api-venv (requests is
# already installed there); the Setup Python Tools step was removed.
# 2026-06-28 Codex: commander controlled automation keeps the canary
# evidence and notification signal, but no longer blocks CD completion.
- name: Alert Chain Smoke Test
  id: alert_chain_smoke
  run: |
    # Stage the deploy SSH key at ~/.ssh/deploy_key with 0600 permissions.
    # A missing or unreadable key file is a hard failure of this step.
    prepare_deploy_key() {
      mkdir -p "${HOME}/.ssh"
      umask 077
      local source_key="${AWOOOI_DEPLOY_SSH_KEY_PATH:-${HOME}/.ssh/deploy_key}"
      if [ ! -r "${source_key}" ]; then
        echo "❌ deploy ssh key file missing: ${source_key}" >&2
        exit 1
      fi
      if [ "${source_key}" != "${HOME}/.ssh/deploy_key" ]; then
        cp "${source_key}" "${HOME}/.ssh/deploy_key"
      fi
      chmod 600 "${HOME}/.ssh/deploy_key"
    }
    # Print the pod phases of one observability component ($1 is the
    # app.kubernetes.io/name label value), read over SSH from the K8s host.
    collect_observability_statuses() {
      local component="$1"
      ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
        "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get pods -n observability -l app.kubernetes.io/name=${component} --no-headers -o custom-columns=STATUS:.status.phase"
    }
    # Same as collect_observability_statuses, but stderr is merged into the
    # captured text so the caller can report the error; the return code
    # tells the caller whether the text is statuses (0) or an error (1).
    capture_observability_statuses() {
      local component="$1"
      local output
      if output="$(collect_observability_statuses "${component}" 2>&1)"; then
        printf '%s' "${output}"
        return 0
      fi
      printf '%s' "${output}"
      return 1
    }
    # 2026-05-19 Codex: the smoke test runs inside CI_IMAGE, but the
    # observability pod checks need the K3s host kubectl context. Capture
    # those read-only statuses on the host and pass them into the
    # container, instead of making the container own kube credentials.
    OBSERVABILITY_PREFLIGHT_ERROR=""
    OTEL_COLLECTOR_ERROR=""
    EVENT_EXPORTER_ERROR=""
    OTEL_COLLECTOR_STATUSES=""
    EVENT_EXPORTER_STATUSES=""
    # SSH_OPTS stays empty unless the host key scan succeeds; every later
    # ssh call is gated on it so nothing connects with unpinned host keys.
    SSH_OPTS=""
    prepare_deploy_key
    DEPLOY_KNOWN_HOSTS="${HOME}/.ssh/deploy_known_hosts"
    if ssh-keyscan -T 5 -t ed25519,rsa,ecdsa "${K8S_SSH_HOST}" > "${DEPLOY_KNOWN_HOSTS}" 2>/dev/null && test -s "${DEPLOY_KNOWN_HOSTS}"; then
      SSH_OPTS="-i ${HOME}/.ssh/deploy_key -o BatchMode=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=${DEPLOY_KNOWN_HOSTS} -o ConnectTimeout=10"
      if ! OTEL_COLLECTOR_STATUSES="$(capture_observability_statuses otel-collector)"; then
        OTEL_COLLECTOR_ERROR="$(printf '%s' "${OTEL_COLLECTOR_STATUSES}" | tail -1 | head -c 200)"
        OTEL_COLLECTOR_STATUSES=""
      fi
      if ! EVENT_EXPORTER_STATUSES="$(capture_observability_statuses event-exporter)"; then
        EVENT_EXPORTER_ERROR="$(printf '%s' "${EVENT_EXPORTER_STATUSES}" | tail -1 | head -c 200)"
        EVENT_EXPORTER_STATUSES=""
      fi
    else
      OBSERVABILITY_PREFLIGHT_ERROR="K8s host keyscan failed"
      OTEL_COLLECTOR_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
      EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}"
    fi
    # The canary identifiers are published before anything below can bail
    # out, because the source-correlation step reads them as step outputs.
    SOURCE_LINK_RUN_REF="gitea-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}"
    SOURCE_LINK_CANARY_WORK_ITEM_ID="source-evidence:sentry:upstream_canary:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}"
    SOURCE_LINK_CANARY_EVENT_ID="sentry:source_correlation_linked:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}"
    echo "source_link_canary_work_item_id=${SOURCE_LINK_CANARY_WORK_ITEM_ID}" >> "$GITHUB_OUTPUT"
    echo "source_link_canary_event_id=${SOURCE_LINK_CANARY_EVENT_ID}" >> "$GITHUB_OUTPUT"
    # Read the operator API key from the production secret. The fetch is
    # wrapped in a condition so that an ssh/kubectl failure cannot abort the
    # step when the shell runs with errexit (a bare assignment from a
    # failing command substitution would); it falls through to the
    # "record fail, exit 0" path below instead.
    AWOOOP_OPERATOR_API_KEY=""
    if [ -z "${SSH_OPTS}" ]; then
      echo "⚠️ Skipping operator API key fetch: ${OBSERVABILITY_PREFLIGHT_ERROR}"
    elif ! AWOOOP_OPERATOR_API_KEY="$(
      ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \
        "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.AWOOOP_OPERATOR_API_KEY}' | base64 -d"
    )"; then
      echo "⚠️ Operator API key fetch over SSH failed"
      AWOOOP_OPERATOR_API_KEY=""
    fi
    if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then
      echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run"
      echo "alert_chain_status=fail" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    # Exported so `-e AWOOOP_OPERATOR_API_KEY` can pass it by name and the
    # value never appears on the docker command line.
    export AWOOOP_OPERATOR_API_KEY
    # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
    # Host runner launches the CI image explicitly to avoid act RWLayer=nil.
    if docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
      --cpus "1.0" \
      --memory "1g" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      -e AWOOOI_OTEL_COLLECTOR_STATUSES="${OTEL_COLLECTOR_STATUSES}" \
      -e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \
      -e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \
      -e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \
      -e AWOOOP_OPERATOR_API_KEY \
      -e AWOOOP_OPERATOR_ID="gitea-cd-post-deploy" \
      -e SOURCE_LINK_RUN_REF="${SOURCE_LINK_RUN_REF}" \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --source-link-canary-target-incident-id INC-20260505-25E744 --run-ref "${SOURCE_LINK_RUN_REF}" --json | tee /tmp/alert_chain_result.json'; then
      echo "alert_chain_status=pass" >> "$GITHUB_OUTPUT"
    else
      echo "alert_chain_status=fail" >> "$GITHUB_OUTPUT"
      echo "⚠️ Alert Chain smoke failed; continuing under commander controlled automation"
      exit 0
    fi
# Phase O-5 Wave C.2 2026-04-02 ogt: monitoring coverage verification
# (generate_monitoring.py --check).
# 2026-06-28 Codex: coverage is still measured and reported, but by default a
# failed check no longer turns a deployed runtime into a blocked CD state.
- name: Monitoring Coverage Check
  id: monitoring_coverage
  run: |
    # Assume success and downgrade only when the check container fails, so
    # the step output is written in exactly one place.
    COVERAGE_STATUS=pass
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-coverage" \
      --cpus "1.0" \
      --memory "1g" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/generate_monitoring.py --check' || COVERAGE_STATUS=fail
    echo "coverage_status=${COVERAGE_STATUS}" >> $GITHUB_OUTPUT
    if [ "${COVERAGE_STATUS}" = "fail" ]; then
      echo "⚠️ Monitoring coverage check failed; continuing under commander controlled automation"
    fi
- name: AwoooP Source Correlation Applied-Link Smoke
  id: source_correlation_apply_smoke
  # Runs scripts/awooop_source_correlation_apply_smoke.py in the CI image against
  # the canary identifiers published by the alert_chain_smoke step. A failure
  # is recorded in the step output and logged, but does not fail the step.
  run: |
    # Exported so `docker run -e NAME` can forward them by name.
    export SOURCE_LINK_CANARY_WORK_ITEM_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_work_item_id }}"
    export SOURCE_LINK_CANARY_EVENT_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_event_id }}"
    APPLY_STATUS=pass
    docker run --rm \
      --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-source-link-smoke" \
      --cpus "0.5" \
      --memory "512m" \
      -v "$PWD:/workspace" \
      -v awoooi-api-venv-cache:/opt/api-venv \
      -w /workspace \
      -e SOURCE_LINK_CANARY_WORK_ITEM_ID \
      -e SOURCE_LINK_CANARY_EVENT_ID \
      "${{ env.CI_IMAGE }}" \
      bash -lc 'set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py \
      --api-url ${{ env.ALERT_CHAIN_API_URL }} \
      --target-incident-id INC-20260505-25E744 \
      --work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
      --expected-source-event-provider-event-id "${SOURCE_LINK_CANARY_EVENT_ID}" \
      --allow-existing-apply \
      --refresh-if-stale-days 6 \
      --refresh-work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \
      --verify-refresh-candidate \
      --reviewer-id gitea_cd_source_link_canary \
      --operator-note "CD dedicated source-link canary; append-only status-chain proof" \
      | tee /tmp/source_correlation_apply_smoke.json' || APPLY_STATUS=fail
    echo "source_correlation_apply_status=${APPLY_STATUS}" >> $GITHUB_OUTPUT
    if [ "${APPLY_STATUS}" = "fail" ]; then
      echo "⚠️ Source correlation applied-link smoke failed; continuing under commander controlled automation"
    fi
# [Chief architect] Added the Playwright E2E Smoke Test step v1.0.0 2026-04-01 (Taipei time).
# continue-on-error: true - a smoke failure does not block the deployment, but
# the result is reflected in the completion notification.
- name: E2E Smoke Test
  id: smoke
  continue-on-error: true
  run: |
    # Write the in-container smoke script. The quoted heredoc delimiter keeps
    # every $VAR literal so it is expanded inside the container, not here.
    cat > /tmp/awoooi-smoke.sh <<'CI_SCRIPT'
    set -euo pipefail
    # Chief architect review I4 + 2026-04-05 Claude Code cache optimisation:
    # playwright.config.ts imports @playwright/test, so the pnpm node_modules
    # must be installed first. The pnpm store is persisted at /opt/pnpm-store;
    # when the pnpm-lock.yaml hash is unchanged, install with --prefer-offline.
    SOURCE_WORKDIR=/source
    SMOKE_WORKDIR=/tmp/awoooi-smoke-workspace
    cleanup_smoke_workspace_artifacts() {
      rm -rf "$SMOKE_WORKDIR" \
        /tmp/pnpm-install.log \
        /tmp/playwright-install-deps.log \
        2>/dev/null || true
    }
    trap cleanup_smoke_workspace_artifacts EXIT
    # /source is mounted read-only, so copy it (minus VCS data, dependencies
    # and old reports) into a writable scratch workspace.
    rm -rf "$SMOKE_WORKDIR"
    mkdir -p "$SMOKE_WORKDIR"
    if command -v tar >/dev/null 2>&1; then
      tar \
        --exclude='./.git' \
        --exclude='./node_modules' \
        --exclude='./apps/web/node_modules' \
        --exclude='./apps/web/test-results' \
        --exclude='./apps/web/playwright-report' \
        --exclude='./packages/*/node_modules' \
        -cf - -C "$SOURCE_WORKDIR" . | tar -xf - -C "$SMOKE_WORKDIR"
    else
      cp -a "$SOURCE_WORKDIR/." "$SMOKE_WORKDIR/"
      rm -rf "$SMOKE_WORKDIR/.git" \
        "$SMOKE_WORKDIR/node_modules" \
        "$SMOKE_WORKDIR/apps/web/node_modules" \
        "$SMOKE_WORKDIR/apps/web/test-results" \
        "$SMOKE_WORKDIR/apps/web/playwright-report" \
        2>/dev/null || true
    fi
    cd "$SMOKE_WORKDIR"
    PNPM_STORE=/opt/pnpm-store
    PNPM_HASH_FILE=/opt/pnpm-store/.lock_hash
    CURRENT_PNPM_HASH=$(md5sum pnpm-lock.yaml | awk '{print $1}')
    corepack enable 2>/dev/null || npm install -g pnpm@9 -q
    mkdir -p "$PNPM_STORE"
    pnpm config set store-dir "$PNPM_STORE"
    if [ "$(cat "$PNPM_HASH_FILE" 2>/dev/null)" != "$CURRENT_PNPM_HASH" ]; then
      echo "📦 pnpm lock 已變更,重裝 node_modules..."
      pnpm install --frozen-lockfile 2>&1 | tee /tmp/pnpm-install.log | tail -20
      echo "$CURRENT_PNPM_HASH" > "$PNPM_HASH_FILE"
    else
      echo "⚡ 使用快取 pnpm store (lock 未變更)prefer-offline..."
      pnpm install --frozen-lockfile --prefer-offline 2>&1 | tee /tmp/pnpm-install.log | tail -20
    fi
    # Fail early if @playwright/test cannot be resolved from apps/web.
    pnpm --dir apps/web exec node -e "require.resolve('@playwright/test')"
    cd apps/web
    # Playwright Chromium is persisted at /opt/playwright-browsers, guarded by
    # a marker file holding the @playwright/test version from package.json.
    export PLAYWRIGHT_BROWSERS_PATH=/opt/playwright-browsers
    PLAYWRIGHT_VER=$(node -e "console.log(require('./package.json').devDependencies['@playwright/test'] || '')" 2>/dev/null || echo "unknown")
    PLAYWRIGHT_HASH_FILE=/opt/playwright-browsers/.version_hash
    BROWSER_PATH=$(node -e "const { chromium } = require('@playwright/test'); console.log(chromium.executablePath())")
    if [ "$(cat "$PLAYWRIGHT_HASH_FILE" 2>/dev/null)" != "$PLAYWRIGHT_VER" ] || [ ! -x "$BROWSER_PATH" ]; then
      echo "📦 Playwright browser cache missing/stale ($PLAYWRIGHT_VER): $BROWSER_PATH"
      pnpm exec playwright install chromium --with-deps 2>&1 | tail -20
      BROWSER_PATH=$(node -e "const { chromium } = require('@playwright/test'); console.log(chromium.executablePath())")
      test -x "$BROWSER_PATH" || { echo "❌ Playwright browser executable missing after install: $BROWSER_PATH"; exit 1; }
      echo "$PLAYWRIGHT_VER" > "$PLAYWRIGHT_HASH_FILE"
    else
      echo "⚡ 使用快取 Playwright Chromium ($PLAYWRIGHT_VER): $BROWSER_PATH"
    fi
    # Even on a browser cache hit, confirm the OS shared libraries exist;
    # otherwise the smoke run only exercises a Chromium launch failure
    # (for example libnspr4.so missing).
    if ! ldconfig -p 2>/dev/null | grep -q 'libnspr4'; then
      echo "📦 Playwright system deps missing補安裝 Chromium deps..."
      pnpm exec playwright install-deps chromium > /tmp/playwright-install-deps.log 2>&1 || {
        tail -40 /tmp/playwright-install-deps.log
        exit 1
      }
      tail -20 /tmp/playwright-install-deps.log
    fi
    # Run the smoke test against the deployed production environment. A test
    # failure is recorded as smoke_status=fail rather than a non-zero exit.
    SMOKE_STATUS=pass
    pnpm exec playwright test tests/e2e/smoke.spec.ts --reporter=line || SMOKE_STATUS=fail
    echo "smoke_status=${SMOKE_STATUS}" >> "$GITHUB_OUTPUT"
    CI_SCRIPT
    # The container writes its result to this scratch file (mounted as
    # /github-output); it is copied into the real step output afterwards.
    SMOKE_OUTPUT="$PWD/.awoooi-smoke-output"
    rm -f "$SMOKE_OUTPUT"
    touch "$SMOKE_OUTPUT"
    chmod 666 "$SMOKE_OUTPUT"
    SMOKE_DOCKER_STATUS=0
    SMOKE_CONTAINER="awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-e2e-smoke"
    # Launch the smoke container. "$@" is an optional command prefix (used
    # for `timeout ...`), so the docker invocation is written only once.
    run_smoke_container() {
      "$@" docker run --rm \
        --name "$SMOKE_CONTAINER" \
        --cpus "1.5" \
        --memory "2g" \
        -v "$PWD:/source:ro" \
        -v "$SMOKE_OUTPUT:/github-output" \
        -v /tmp/awoooi-smoke.sh:/tmp/awoooi-smoke.sh:ro \
        -v awoooi-pnpm-store:/opt/pnpm-store \
        -v awoooi-playwright-browsers:/opt/playwright-browsers \
        -w /tmp \
        -e GITHUB_OUTPUT=/github-output \
        -e CI=true \
        -e PLAYWRIGHT_BASE_URL=https://awoooi.wooo.work \
        "${{ env.CI_IMAGE }}" \
        bash /tmp/awoooi-smoke.sh
    }
    # 2026-06-01 Codex: post-deploy smoke can pass, then hang in
    # runner cleanup and incorrectly mark the deploy failed. Bound only
    # the smoke container; preserve pass evidence if it was written.
    if command -v timeout >/dev/null 2>&1; then
      # 2026-06-14 Codex: act-runner host may provide BusyBox timeout,
      # which rejects GNU-only --kill-after. The short -k form works
      # with BusyBox and GNU timeout.
      run_smoke_container timeout -k 20s 300s || SMOKE_DOCKER_STATUS=$?
    else
      run_smoke_container || SMOKE_DOCKER_STATUS=$?
    fi
    if [ "$SMOKE_DOCKER_STATUS" != "0" ]; then
      # When timeout kills the docker client the container itself may keep
      # running; force-remove it (best effort) so it cannot linger on the host.
      docker rm -f "$SMOKE_CONTAINER" >/dev/null 2>&1 || true
    fi
    if [ "$SMOKE_DOCKER_STATUS" != "0" ] && ! grep -q '^smoke_status=pass$' "$SMOKE_OUTPUT"; then
      # Record the failure in the real step output before exiting, so
      # steps.smoke.outputs.smoke_status is "fail" rather than empty.
      echo "smoke_status=fail" > "$SMOKE_OUTPUT"
      cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
      echo "E2E smoke container failed before pass evidence: ${SMOKE_DOCKER_STATUS}"
      exit "$SMOKE_DOCKER_STATUS"
    fi
    if [ "$SMOKE_DOCKER_STATUS" != "0" ]; then
      echo "E2E smoke pass evidence was written; treating container exit ${SMOKE_DOCKER_STATUS} as cleanup timeout"
    fi
    cat "$SMOKE_OUTPUT" >> "$GITHUB_OUTPUT"
  env:
    CI: "true"
    # Test the deployed production environment directly; no local dev server
    # is started.
    PLAYWRIGHT_BASE_URL: "https://awoooi.wooo.work"
- name: Notify Health Check Success
  # Sends the "deploy complete" notification with one pass/warn marker per gate.
  # Step outputs are passed in through `env:` and read as shell variables; they
  # are never spliced into the script text with ${{ }}, so a crafted commit
  # subject cannot inject shell code here.
  env:
    # NOTE(review): TELEGRAM_BOT_TOKEN is not referenced by this script; confirm
    # whether scripts/ci/notify-awoooi-cicd.sh reads it before removing it.
    TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    SMOKE_RESULT: ${{ steps.smoke.outputs.smoke_status == 'pass' && '✅' || '⚠️' }}
    ALERT_CHAIN_RESULT: ${{ steps.alert_chain_smoke.outputs.alert_chain_status == 'pass' && '✅' || '⚠️' }}
    MONITORING_RESULT: ${{ steps.monitoring_coverage.outputs.coverage_status == 'pass' && '✅' || '⚠️' }}
    SOURCE_LINK_RESULT: ${{ steps.source_correlation_apply_smoke.outputs.source_correlation_apply_status == 'pass' && '✅' || '⚠️' }}
    START_TIME: ${{ steps.commit.outputs.start_time }}
  run: |
    END_TIME=$(date +%s)
    # Fall back to END_TIME so a missing start_time yields a duration of 0
    # instead of an arithmetic syntax error.
    DURATION=$((END_TIME - ${START_TIME:-$END_TIME}))
    # The Telegram-formatted message that used to be assembled here was never
    # sent (the direct Telegram fallback is disabled), so it is no longer built.
    if AWOOI_CICD_STATUS=success \
      AWOOI_CICD_STAGE=post-deploy \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署完成" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_DURATION_SECONDS="${DURATION}" \
      AWOOI_CICD_SUMMARY="API=✅; Web=✅; AlertChain=${ALERT_CHAIN_RESULT}; SourceLink=${SOURCE_LINK_RESULT}; Monitoring=${MONITORING_RESULT}; Smoke=${SMOKE_RESULT}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD success notification mirrored through AWOOI API"
    else
      echo "AWOOI API notify failed; direct Telegram fallback disabled to preserve AwoooP receipt chain"
    fi
- name: Notify Pipeline Failure
  # Runs only when an earlier step of this job failed. Reports the failure
  # through scripts/ci/notify-awoooi-cicd.sh with the commit subject as summary.
  # The commit subject and actor arrive through `env:` rather than ${{ }}
  # interpolation inside the script, so quotes or $(...) in a commit subject
  # are treated as data, not as shell code.
  if: failure()
  env:
    # NOTE(review): TELEGRAM_BOT_TOKEN is not referenced by this script; confirm
    # whether scripts/ci/notify-awoooi-cicd.sh reads it before removing it.
    TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    COMMIT_MSG: ${{ steps.commit.outputs.message }}
    ACTOR: ${{ github.actor }}
  run: |
    # The HTML Telegram message that used to be assembled here was never sent
    # (the direct Telegram fallback is disabled), so it is no longer built.
    if AWOOI_CICD_STATUS=failed \
      AWOOI_CICD_STAGE=post-deploy-checks \
      AWOOI_CICD_JOB_NAME="AWOOOI 部署失敗" \
      AWOOI_CICD_COMMIT_SHA="${GITHUB_SHA}" \
      AWOOI_CICD_TRIGGERED_BY="${ACTOR}" \
      AWOOI_CICD_SUMMARY="${COMMIT_MSG}" \
      scripts/ci/notify-awoooi-cicd.sh; then
      echo "✅ CI/CD post-deploy failure notification mirrored through AWOOI API"
    else
      echo "AWOOI API notify failed; direct Telegram fallback disabled to preserve AwoooP receipt chain"
    fi
- name: Clean Post-Deploy Workspace Artifacts
  # Runs whether or not earlier steps succeeded. The cleanup script receives
  # the CI image reference through HOST_RUNNER_CLEANUP_IMAGE.
  if: always()
  env:
    HOST_RUNNER_CLEANUP_IMAGE: ${{ env.CI_IMAGE }}
  run: |
    bash scripts/ci/cleanup-host-runner-workspace.sh