fix(agent): classify ssh publickey offer timeout
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / build-and-deploy (push) Failing after 27s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 12:58:23 +08:00
parent 5285bcd8aa
commit 3c13608055
7 changed files with 50 additions and 5 deletions

View File

@@ -614,6 +614,14 @@ def _queue_readback_normalizer_contract() -> list[dict[str, Any]]:
],
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
},
{
"field_id": "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
"purpose": "classify whether 110 times out after the client offers a public key but before key acceptance",
"writes_blockers": [
"gitea_queue_harbor_110_remote_ssh_publickey_offer_timeout",
],
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
},
{
"field_id": "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"purpose": "classify whether 110 accepts the SSH key and then times out during session, PAM, account, or shell setup",

View File

@@ -187,7 +187,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
]
for item in current_queue[0]["harbor_recovery_receipt_output_contract"]
)
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 8
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 9
assert [
item["field_id"]
for item in current_queue[0]["queue_readback_normalizer_contract"]
@@ -197,6 +197,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
"latest_visible_harbor_110_repair_no_matching_runner_label",
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"current_cd_workflow_runner_readiness",
"controlled_profile_no_matching_runner_labels",

View File

@@ -365,6 +365,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
"latest_visible_harbor_110_repair_no_matching_runner_label",
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
"current_cd_workflow_runner_readiness",
"controlled_profile_no_matching_runner_labels",
@@ -424,7 +425,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
payload["summary"][
"ai_loop_current_blocker_queue_readback_normalizer_contract_count"
]
== 8
== 9
)
assert payload["summary"][
"ai_loop_current_blocker_queue_readback_normalizer_field_ids"

View File

@@ -18,10 +18,12 @@
- `ce5bcab8b` 已讓 queue parser / AI Loop 能讀 `remote_ssh_server_accepts_key_then_session_timeout`，但 Harbor repair workflow 的 SSH 診斷尚未輸出該 marker，導致最新 queue 仍只能從舊 repair log 看到泛稱 `remote_ssh_publickey_auth_stalled`。
- `.gitea/workflows/harbor-110-local-repair.yaml` 的 `diagnose_ssh_control_channel` 現在在 OpenSSH `Server accepts key` + timeout 時輸出 `harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true` 與 metadata-only `SSH_AUTH user=wooo mode=publickey rc=... classification=server_accepts_key_then_timeout`。
- publickey auth stalled 判定同步接受 `Server accepts key` + timeout，不再只依賴 `we sent a publickey packet, wait for reply`。
- `ops/runner/read-public-gitea-actions-queue.py` 同步新增 `remote_ssh_publickey_offer_timeout` readback / rollup，讓 110 診斷在 `publickey_offer_timeout` 與 `server_accepts_key_then_timeout` 間波動時都會歸入同一 SSH auth-stall blocker。
- workflow 仍維持 hourly / workflow_dispatch、bounded SSH、no secret、no raw SSH log、no reboot / Docker restart / node drain。
**驗證**
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`：通過。
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`：`68 passed`。
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`：`33 passed`。
- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js .gitea/workflows/harbor-110-local-repair.yaml`、`git diff --check`：通過。
**邊界**：只改 Harbor repair workflow 診斷 marker、workflow profile test 與 LOGBOOK；未讀 secret / token / `.env` / raw sessions / SQLite / auth；未讀 authorized_keys 內容或 `.runner` 內容；未使用 GitHub / `gh` / GitHub API；未 workflow_dispatch；未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall；未執行 110 runtime apply。

View File

@@ -85,7 +85,7 @@
"harbor_110_remote_control_channel_unavailable",
"harbor_110_remote_ssh_reachable",
"harbor_110_remote_ssh_publickey_auth_stalled",
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout",
"harbor_110_remote_ssh_publickey_offer_timeout",
"harbor_110_remote_ssh_auth_permission_denied_false",
"harbor_110_repair_jobs_payload_stale_or_cross_workflow",
"bounded_ssh_timeout_seen",
@@ -102,7 +102,7 @@
"ssh_auth_classification",
"remote_control_channel",
"remote_ssh_publickey_auth_stalled",
"remote_ssh_server_accepts_key_then_session_timeout",
"remote_ssh_publickey_offer_timeout",
"remote_ssh_auth_permission_denied",
"harbor_110_repair_failure_classifier",
"harbor_110_repair_jobs_payload_classifier",
@@ -123,6 +123,7 @@
"bounded_ssh_timeout_seen": true,
"remote_ssh_reachable": true,
"remote_ssh_publickey_auth_stalled": true,
"remote_ssh_publickey_offer_timeout": true,
"remote_ssh_server_accepts_key_then_session_timeout": true,
"remote_ssh_auth_permission_denied": false,
"harbor_110_repair_failure_classifier": "harbor_110_remote_ssh_publickey_auth_stalled",

View File

@@ -652,6 +652,11 @@ def build_readback(
"remote_ssh_publickey_reply_timeout_seen"
]
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout": (
harbor_110_repair_log_classifier[
"remote_ssh_publickey_offer_timeout"
]
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
@@ -928,6 +933,11 @@ def build_readback(
"remote_ssh_publickey_reply_timeout_seen"
]
),
"harbor_110_repair_remote_ssh_publickey_offer_timeout": (
harbor_110_repair_log_classifier[
"remote_ssh_publickey_offer_timeout"
]
),
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
@@ -1077,6 +1087,13 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
"harbor_110_remote_ssh_publickey_reply_timeout_seen",
text,
)
remote_ssh_publickey_offer_timeout = (
"classification=publickey_offer_timeout" in text
or (
"we sent a publickey packet, wait for reply" in text
and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None
)
)
remote_ssh_publickey_auth_stalled = (
"harbor_110_remote_ssh_publickey_auth_stalled=true" in text
or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text
@@ -1085,6 +1102,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
and remote_ssh_publickey_offered is True
and remote_ssh_publickey_reply_timeout_seen is True
)
or remote_ssh_publickey_offer_timeout
)
remote_ssh_server_accepts_key_then_session_timeout = (
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text
@@ -1153,6 +1171,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
"remote_ssh_publickey_reply_timeout_seen": (
remote_ssh_publickey_reply_timeout_seen
),
"remote_ssh_publickey_offer_timeout": remote_ssh_publickey_offer_timeout,
"remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled,
"remote_ssh_server_accepts_key_then_session_timeout": (
remote_ssh_server_accepts_key_then_session_timeout

View File

@@ -296,6 +296,7 @@ harbor_110_remote_ssh_userauth_service_accept_seen=true
harbor_110_remote_ssh_publickey_offered=true
harbor_110_remote_ssh_publickey_reply_timeout_seen=true
harbor_110_remote_ssh_publickey_auth_stalled=true
SSH_AUTH user=wooo mode=publickey rc=255 classification=publickey_offer_timeout
harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true
SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout
BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110
@@ -765,6 +766,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
]
is True
)
assert (
payload["readback"][
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout"
]
is True
)
assert (
payload["readback"][
"latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen"
@@ -781,6 +788,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
is True
)
assert (
payload["rollups"][
"harbor_110_repair_remote_ssh_publickey_offer_timeout"
]
is True
)
assert (
payload["rollups"][
"harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"