From 9f4f1b417cf8d9ea46252131c30ba3b0aa46519f Mon Sep 17 00:00:00 2001 From: ogt Date: Wed, 1 Jul 2026 23:40:18 +0800 Subject: [PATCH] fix(cd): keep ops recovery checks on controlled profile --- .gitea/workflows/cd.yaml | 9 +- docs/LOGBOOK.md | 18 ++++ ops/runner/read-public-gitea-actions-queue.py | 91 +++++++++++++++---- .../test_cd_controlled_runtime_profile.py | 3 + .../test_read_public_gitea_actions_queue.py | 57 ++++++++++++ 5 files changed, 158 insertions(+), 20 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 7057b48d..5e52917a 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -532,6 +532,8 @@ jobs: ;; ops/runner/verify-awoooi-non110-cd-closure.py) ;; + docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json) + ;; ops/monitoring/alerts-unified.yml) ;; ops/monitoring/alerts.yml) @@ -574,6 +576,10 @@ jobs: ;; scripts/reboot-recovery/dr-escrow-evidence-checklist.py) ;; + scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh) + ;; + scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py) + ;; scripts/reboot-recovery/post-reboot-owner-response-preflight.py) ;; scripts/reboot-recovery/post-start-quick-check.sh) @@ -785,8 +791,8 @@ jobs: ../../scripts/reboot-recovery/post-reboot-owner-response-preflight.py \ ../../scripts/reboot-recovery/momo-source-arrival-gate.py \ ../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py \ - ../../scripts/ops/backup-health-textfile-exporter.py \ ../../scripts/ops/backup-alert-label-contract-check.py \ + ../../scripts/ops/backup-health-textfile-exporter.py \ ../../scripts/security/gitea-private-inventory-p0-scorecard.py \ ../../scripts/security/gitea-authenticated-inventory-payload-validator.py python3.11 -c "import yaml; yaml.safe_load(open('../../ops/monitoring/alerts-unified.yml')); print('alerts-unified YAML OK')" @@ -814,6 +820,7 @@ jobs: ../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh \ ../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh \ ../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh \ + ../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh \ ../../scripts/backup/backup-awoooi-frequent.sh \ ../../scripts/backup/backup-status.sh \ ../../scripts/backup/gitea-repo-bundle-backup.sh diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 16643f6b..d2ab15a6 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,21 @@ +## 2026-07-01 — 23:52 Gitea CD #4315 B5 socket / queue historical blocker 修正 + +**照主線修正的問題**: +- Gitea CD `#4315` 對 `d658f03a` 失敗於 tests job:`BLOCKER b5_docker_socket_unavailable`,build / deploy / post-deploy 因 tests failed 被跳過;根因是 ops / backup / alert / escrow 類 source patch 未完整列入 controlled-runtime profile,掉回需要 Docker socket 的 B5 lane。 +- `.gitea/workflows/cd.yaml` 補齊 `d658f03a` 涉及的 ops / backup / alert / escrow source 與 focused checks:backup status、backup alert label contract、host pressure alert contract、credential escrow closeout script、alerts YAML。 +- `read-public-gitea-actions-queue.py` 修正 latest CD Success 時舊 scheduled `harbor-110-local-repair` Failure 只能作 historical evidence,不得蓋過最新 main CD 成功;raw stalled evidence 保留,但 active queue status 不再被舊 run 拉回。 + +**驗證**: +- `python3.11 -m py_compile ops/runner/read-public-gitea-actions-queue.py scripts/ops/backup-alert-label-contract-check.py scripts/ops/backup-health-textfile-exporter.py`:通過。 +- `.gitea/workflows/cd.yaml`、`ops/monitoring/alerts-unified.yml`、`ops/monitoring/alerts.yml` YAML parse:通過。 +- `bash -n scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh scripts/backup/backup-status.sh`:通過。 +- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_cd_controlled_runtime_profile.py scripts/backup/tests/test_backup_status_contract.py scripts/ops/tests/test_backup_health_textfile_exporter.py scripts/ops/tests/test_host_pressure_alert_contract.py scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py -q`:`93 passed`。 +- `git diff --check`:通過。 + +**邊界**:未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未 workflow_dispatch,未 force push。 + +**下一步**:正常 push Gitea `main` 觸發下一個 CD;預期 B5 socket blocker 不再重現,latest CD Success 後 queue readback 不再被舊 harbor repair Failure 覆蓋。 + ## 2026-07-01 — 23:00 core cold-start GREEN / MOMO source-arrival gate 拆分 **照主線修正的問題**: diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index 4d7f6c6e..87fbf6a6 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -400,6 +400,7 @@ def build_readback( latest_harbor_110_repair_log_text ) latest_cd_status = latest_cd_run.get("status", "") + latest_cd_success = latest_cd_status == "Success" latest_cd_visible_blocked = latest_cd_status == "Blocked" latest_cd_waiting = latest_cd_status == "Waiting" host_pressure_waiting_from_stale_jobs = ( @@ -512,7 +513,41 @@ def build_readback( current_cd_waiting_behind_harbor_110_repair_running = ( latest_cd_waiting and harbor_110_repair_running ) - harbor_110_repair_blocked = ( + harbor_110_repair_historical_after_latest_cd_success = bool( + latest_cd_success + and latest_cd_run_id + and harbor_110_repair_run_id + and harbor_110_repair_run_id != latest_cd_run_id + ) + effective_remote_ssh_publickey_auth_stalled = bool( + harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_remote_control_channel_unavailable = bool( + harbor_110_repair_log_classifier["remote_control_channel_unavailable"] + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_remote_local_registry_v2_unavailable = bool( + harbor_110_repair_log_classifier["local_registry_v2_unavailable"] + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_remote_public_registry_v2_unavailable = bool( + harbor_110_repair_log_classifier["public_registry_v2_unavailable"] + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_harbor_110_repair_failed = bool( + harbor_110_repair_failed + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_harbor_110_repair_jobs_stale_or_mismatched = bool( + harbor_110_repair_jobs_stale_or_mismatched + and not harbor_110_repair_historical_after_latest_cd_success + ) + effective_harbor_110_repair_visible_failure_jobs_api_stale = bool( + harbor_110_repair_visible_failure_jobs_api_stale + and not harbor_110_repair_historical_after_latest_cd_success + ) + harbor_110_repair_blocked_raw = ( harbor_110_repair_status_blocked or harbor_110_repair_failed or bool(harbor_110_repair_no_matching_runner_label) @@ -520,11 +555,17 @@ def build_readback( or harbor_110_repair_visible_running_jobs_api_stale or bool(harbor_110_repair_log_classifier["failure_classifier"]) ) + harbor_110_repair_blocked = bool( + harbor_110_repair_blocked_raw + and not harbor_110_repair_historical_after_latest_cd_success + ) safe_next_action = _queue_safe_next_action( latest_cd_waiting=latest_cd_waiting, latest_cd_status=latest_cd_status, latest_cd_no_matching_runner_label=latest_cd_no_matching_runner_label, - cd_jobs_stale_or_mismatched=cd_jobs_stale_or_mismatched, + cd_jobs_stale_or_mismatched=( + cd_jobs_stale_or_mismatched and not latest_cd_success + ), cd_jobs_payload_classifier=cd_jobs_payload_classifier, effective_host_pressure_classifier=effective_tests_log_classifier[ "host_pressure_classifier" @@ -540,34 +581,36 @@ def build_readback( ], harbor_110_repair_no_matching_runner_label=( harbor_110_repair_no_matching_runner_label + if not harbor_110_repair_historical_after_latest_cd_success + else "" ), harbor_110_repair_waiting=harbor_110_repair_waiting, harbor_110_repair_running=harbor_110_repair_running, - harbor_110_repair_failed=harbor_110_repair_failed, + harbor_110_repair_failed=effective_harbor_110_repair_failed, harbor_110_repair_waiting_after_cd_harbor_blocker=( harbor_110_repair_waiting_after_cd_harbor_blocker + and not harbor_110_repair_historical_after_latest_cd_success ), harbor_110_repair_jobs_stale_or_mismatched=( - harbor_110_repair_jobs_stale_or_mismatched + effective_harbor_110_repair_jobs_stale_or_mismatched ), harbor_110_repair_jobs_payload_classifier=( harbor_110_repair_jobs_payload_classifier ), harbor_110_repair_visible_running_jobs_api_stale=( harbor_110_repair_visible_running_jobs_api_stale + and not harbor_110_repair_historical_after_latest_cd_success ), harbor_110_repair_visible_failure_jobs_api_stale=( - harbor_110_repair_visible_failure_jobs_api_stale + effective_harbor_110_repair_visible_failure_jobs_api_stale ), current_cd_waiting_behind_harbor_110_repair_running=( current_cd_waiting_behind_harbor_110_repair_running ), - remote_control_channel_unavailable=harbor_110_repair_log_classifier[ - "remote_control_channel_unavailable" - ], - remote_ssh_publickey_auth_stalled=harbor_110_repair_log_classifier[ - "remote_ssh_publickey_auth_stalled" - ], + remote_control_channel_unavailable=( + effective_remote_control_channel_unavailable + ), + remote_ssh_publickey_auth_stalled=effective_remote_ssh_publickey_auth_stalled, remote_ssh_publickey_offer_timeout=harbor_110_repair_log_classifier[ "remote_ssh_publickey_offer_timeout" ], @@ -684,10 +727,16 @@ def build_readback( "latest_visible_harbor_110_repair_waiting": harbor_110_repair_waiting, "latest_visible_harbor_110_repair_running": harbor_110_repair_running, "latest_visible_harbor_110_repair_failed": harbor_110_repair_failed, + "latest_visible_harbor_110_repair_historical_after_latest_cd_success": ( + harbor_110_repair_historical_after_latest_cd_success + ), "latest_visible_harbor_110_repair_status_blocked": ( harbor_110_repair_status_blocked ), "latest_visible_harbor_110_repair_blocked": harbor_110_repair_blocked, + "latest_visible_harbor_110_repair_blocked_raw": ( + harbor_110_repair_blocked_raw + ), "latest_visible_harbor_110_repair_log_http_status": ( latest_harbor_110_repair_log_http_status ), @@ -872,13 +921,13 @@ def build_readback( else "blocked_latest_visible_cd_run" if latest_cd_visible_blocked else "blocked_harbor_110_remote_ssh_publickey_auth_stalled" - if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] + if effective_remote_ssh_publickey_auth_stalled else "blocked_harbor_110_remote_control_channel_unavailable" - if harbor_110_repair_log_classifier["remote_control_channel_unavailable"] + if effective_remote_control_channel_unavailable else "blocked_harbor_110_remote_local_registry_v2_unavailable" - if harbor_110_repair_log_classifier["local_registry_v2_unavailable"] + if effective_remote_local_registry_v2_unavailable else "blocked_harbor_public_registry_v2_unavailable_after_remote_repair" - if harbor_110_repair_log_classifier["public_registry_v2_unavailable"] + if effective_remote_public_registry_v2_unavailable else "blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback" if ( latest_cd_waiting @@ -891,7 +940,7 @@ def build_readback( else "blocked_harbor_110_repair_failed" if ( build_log_classifier["harbor_public_route_blocked_or_retrying"] - and harbor_110_repair_failed + and effective_harbor_110_repair_failed ) else ( "blocked_harbor_public_route_unavailable_after_harbor_110_repair_success" @@ -925,13 +974,13 @@ def build_readback( else "harbor_110_repair_running" if harbor_110_repair_running else "blocked_harbor_110_repair_failed" - if harbor_110_repair_failed + if effective_harbor_110_repair_failed else "blocked_harbor_110_repair_run" if harbor_110_repair_blocked else "harbor_110_repair_jobs_stale_or_mismatched" - if harbor_110_repair_jobs_stale_or_mismatched + if effective_harbor_110_repair_jobs_stale_or_mismatched else "cd_jobs_stale_or_mismatched" - if cd_jobs_stale_or_mismatched + if cd_jobs_stale_or_mismatched and not latest_cd_success else "no_matching_runner_not_visible" ), "readback": readback, @@ -1014,7 +1063,11 @@ def build_readback( "harbor_110_repair_waiting": harbor_110_repair_waiting, "harbor_110_repair_running": harbor_110_repair_running, "harbor_110_repair_failed": harbor_110_repair_failed, + "harbor_110_repair_historical_after_latest_cd_success": ( + harbor_110_repair_historical_after_latest_cd_success + ), "harbor_110_repair_blocked": harbor_110_repair_blocked, + "harbor_110_repair_blocked_raw": harbor_110_repair_blocked_raw, "harbor_110_repair_waiting_after_cd_harbor_blocker": ( harbor_110_repair_waiting_after_cd_harbor_blocker ), diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index 385ea9d1..67bb7b12 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -703,6 +703,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N expected_sources = [ "docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md)", "docs/runbooks/FULL-STACK-COLD-START-SOP.md)", + "docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json)", "docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md)", "ops/monitoring/alerts-unified.yml)", "ops/monitoring/alerts.yml)", @@ -725,6 +726,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N "scripts/reboot-recovery/full-stack-recovery-scorecard.sh)", "scripts/reboot-recovery/awoooi-startup-110.sh)", "scripts/reboot-recovery/harbor-watchdog.sh)", + "scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh)", "scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)", "scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)", "scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)", @@ -750,6 +752,7 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N "../../scripts/reboot-recovery/momo-source-arrival-gate.py", "../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh", "../../scripts/reboot-recovery/harbor-watchdog.sh", + "../../scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh", "../../scripts/reboot-recovery/awoooi-startup-110.sh", "../../scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh", "../../scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh", diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index 0c95f454..3ca7aba9 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -142,6 +142,20 @@ def _actions_html_cd_failed_harbor_repair_failed() -> str: ) +def _actions_html_cd_success_harbor_repair_failed() -> str: + return ( + _actions_html_cd_running_harbor_repair_waiting() + .replace('data-tooltip-content="Running"', 'data-tooltip-content="Success"', 1) + .replace('data-tooltip-content="Waiting"', 'data-tooltip-content="Failure"', 1) + .replace("4061", "4314") + .replace("4060", "4307") + .replace( + "fix(cd): keep harbor repair workflow on controlled profile", + "feat(web): surface AI automation production proof", + ) + ) + + def _actions_html_harbor_repair_waiting_with_workflow_no_matching() -> str: return """