diff --git a/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py b/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py index 9d97d345..e3a14c3a 100644 --- a/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py +++ b/apps/api/src/services/ai_agent_log_controlled_writeback_executor_readback.py @@ -614,6 +614,14 @@ def _queue_readback_normalizer_contract() -> list[dict[str, Any]]: ], "learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"], }, + { + "field_id": "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout", + "purpose": "classify whether 110 times out after the client offers a public key but before key acceptance", + "writes_blockers": [ + "gitea_queue_harbor_110_remote_ssh_publickey_offer_timeout", + ], + "learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"], + }, { "field_id": "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", "purpose": "classify whether 110 accepts the SSH key and then times out during session, PAM, account, or shell setup", diff --git a/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py b/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py index 6210e8f3..c8635b0d 100644 --- a/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py +++ b/apps/api/tests/test_ai_agent_log_controlled_writeback_executor_readback_api.py @@ -187,7 +187,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): ] for item in current_queue[0]["harbor_recovery_receipt_output_contract"] ) - assert current_queue[0]["queue_readback_normalizer_contract_count"] == 8 + assert current_queue[0]["queue_readback_normalizer_contract_count"] == 9 assert [ item["field_id"] for item in current_queue[0]["queue_readback_normalizer_contract"] @@ -197,6 +197,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False): "latest_visible_harbor_110_repair_no_matching_runner_label", "latest_visible_harbor_110_repair_remote_control_channel_unavailable", "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled", + "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout", "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", "current_cd_workflow_runner_readiness", "controlled_profile_no_matching_runner_labels", diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index c1180681..13457d13 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -365,6 +365,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu "latest_visible_harbor_110_repair_no_matching_runner_label", "latest_visible_harbor_110_repair_remote_control_channel_unavailable", "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled", + "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout", "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout", "current_cd_workflow_runner_readiness", "controlled_profile_no_matching_runner_labels", @@ -424,7 +425,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu payload["summary"][ "ai_loop_current_blocker_queue_readback_normalizer_contract_count" ] - == 8 + == 9 ) assert payload["summary"][ "ai_loop_current_blocker_queue_readback_normalizer_field_ids" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b98ab023..aef8ec27 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -18,10 +18,12 @@ - `ce5bcab8b` 已讓 queue parser / AI Loop 能讀 `remote_ssh_server_accepts_key_then_session_timeout`,但 Harbor repair workflow 的 SSH 診斷尚未輸出該 marker,導致最新 queue 仍只能從舊 repair log 看到泛稱 `remote_ssh_publickey_auth_stalled`。 - `.gitea/workflows/harbor-110-local-repair.yaml` 的 `diagnose_ssh_control_channel` 現在在 OpenSSH `Server accepts key` + timeout 時輸出 `harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true` 與 metadata-only `SSH_AUTH user=wooo mode=publickey rc=... classification=server_accepts_key_then_timeout`。 - publickey auth stalled 判定同步接受 `Server accepts key` + timeout,不再只依賴 `we sent a publickey packet, wait for reply`。 +- `ops/runner/read-public-gitea-actions-queue.py` 同步新增 `remote_ssh_publickey_offer_timeout` readback / rollup,讓 110 診斷在 `publickey_offer_timeout` 與 `server_accepts_key_then_timeout` 間波動時都會歸入同一 SSH auth-stall blocker。 - workflow 仍維持 hourly / workflow_dispatch、bounded SSH、no secret、no raw SSH log、no reboot / Docker restart / node drain。 **驗證**: -- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:通過。 +- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`68 passed`。 +- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`33 passed`。 - `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js .gitea/workflows/harbor-110-local-repair.yaml`、`git diff --check`:通過。 **邊界**:只改 Harbor repair workflow 診斷 marker、workflow profile test 與 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 110 runtime apply。 diff --git a/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json b/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json index c59a7ea8..e192c9d6 100644 --- a/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json +++ b/docs/operations/ai-agent-log-intelligence-runtime-sample-readback.snapshot.json @@ -85,7 +85,7 @@ "harbor_110_remote_control_channel_unavailable", "harbor_110_remote_ssh_reachable", "harbor_110_remote_ssh_publickey_auth_stalled", - "harbor_110_remote_ssh_server_accepts_key_then_session_timeout", + "harbor_110_remote_ssh_publickey_offer_timeout", "harbor_110_remote_ssh_auth_permission_denied_false", "harbor_110_repair_jobs_payload_stale_or_cross_workflow", "bounded_ssh_timeout_seen", @@ -102,7 +102,7 @@ "ssh_auth_classification", "remote_control_channel", "remote_ssh_publickey_auth_stalled", - "remote_ssh_server_accepts_key_then_session_timeout", + "remote_ssh_publickey_offer_timeout", "remote_ssh_auth_permission_denied", "harbor_110_repair_failure_classifier", "harbor_110_repair_jobs_payload_classifier", @@ -123,6 +123,7 @@ "bounded_ssh_timeout_seen": true, "remote_ssh_reachable": true, "remote_ssh_publickey_auth_stalled": true, + "remote_ssh_publickey_offer_timeout": true, "remote_ssh_server_accepts_key_then_session_timeout": true, "remote_ssh_auth_permission_denied": false, "harbor_110_repair_failure_classifier": "harbor_110_remote_ssh_publickey_auth_stalled", diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index 4ff8fe91..0e6631f2 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -652,6 +652,11 @@ def build_readback( "remote_ssh_publickey_reply_timeout_seen" ] ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_offer_timeout" + ] + ), "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), @@ -928,6 +933,11 @@ def build_readback( "remote_ssh_publickey_reply_timeout_seen" ] ), + "harbor_110_repair_remote_ssh_publickey_offer_timeout": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_offer_timeout" + ] + ), "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), @@ -1077,6 +1087,13 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: "harbor_110_remote_ssh_publickey_reply_timeout_seen", text, ) + remote_ssh_publickey_offer_timeout = ( + "classification=publickey_offer_timeout" in text + or ( + "we sent a publickey packet, wait for reply" in text + and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None + ) + ) remote_ssh_publickey_auth_stalled = ( "harbor_110_remote_ssh_publickey_auth_stalled=true" in text or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text @@ -1085,6 +1102,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: and remote_ssh_publickey_offered is True and remote_ssh_publickey_reply_timeout_seen is True ) + or remote_ssh_publickey_offer_timeout ) remote_ssh_server_accepts_key_then_session_timeout = ( "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text @@ -1153,6 +1171,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: "remote_ssh_publickey_reply_timeout_seen": ( remote_ssh_publickey_reply_timeout_seen ), + "remote_ssh_publickey_offer_timeout": remote_ssh_publickey_offer_timeout, "remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled, "remote_ssh_server_accepts_key_then_session_timeout": ( remote_ssh_server_accepts_key_then_session_timeout diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index 68daabab..c0afd069 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -296,6 +296,7 @@ harbor_110_remote_ssh_userauth_service_accept_seen=true harbor_110_remote_ssh_publickey_offered=true harbor_110_remote_ssh_publickey_reply_timeout_seen=true harbor_110_remote_ssh_publickey_auth_stalled=true +SSH_AUTH user=wooo mode=publickey rc=255 classification=publickey_offer_timeout harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110 @@ -765,6 +766,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non ] is True ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout" + ] + is True + ) assert ( payload["readback"][ "latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen" @@ -781,6 +788,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"] is True ) + assert ( + payload["rollups"][ + "harbor_110_repair_remote_ssh_publickey_offer_timeout" + ] + is True + ) assert ( payload["rollups"][ "harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"