fix(agent): classify ssh publickey offer timeout
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / build-and-deploy (push) Failing after 27s
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / build-and-deploy (push) Failing after 27s
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -614,6 +614,14 @@ def _queue_readback_normalizer_contract() -> list[dict[str, Any]]:
|
||||
],
|
||||
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
|
||||
},
|
||||
{
|
||||
"field_id": "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
|
||||
"purpose": "classify whether 110 times out after the client offers a public key but before key acceptance",
|
||||
"writes_blockers": [
|
||||
"gitea_queue_harbor_110_remote_ssh_publickey_offer_timeout",
|
||||
],
|
||||
"learning_targets": ["km", "rag", "playbook", "mcp", "verifier", "ai_agent"],
|
||||
},
|
||||
{
|
||||
"field_id": "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
|
||||
"purpose": "classify whether 110 accepts the SSH key and then times out during session, PAM, account, or shell setup",
|
||||
|
||||
@@ -187,7 +187,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
|
||||
]
|
||||
for item in current_queue[0]["harbor_recovery_receipt_output_contract"]
|
||||
)
|
||||
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 8
|
||||
assert current_queue[0]["queue_readback_normalizer_contract_count"] == 9
|
||||
assert [
|
||||
item["field_id"]
|
||||
for item in current_queue[0]["queue_readback_normalizer_contract"]
|
||||
@@ -197,6 +197,7 @@ def _assert_executor_readback(payload: dict, *, public_endpoint: bool = False):
|
||||
"latest_visible_harbor_110_repair_no_matching_runner_label",
|
||||
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
|
||||
"current_cd_workflow_runner_readiness",
|
||||
"controlled_profile_no_matching_runner_labels",
|
||||
|
||||
@@ -365,6 +365,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
|
||||
"latest_visible_harbor_110_repair_no_matching_runner_label",
|
||||
"latest_visible_harbor_110_repair_remote_control_channel_unavailable",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout",
|
||||
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout",
|
||||
"current_cd_workflow_runner_readiness",
|
||||
"controlled_profile_no_matching_runner_labels",
|
||||
@@ -424,7 +425,7 @@ def test_awoooi_priority_work_order_readback_overlays_ai_loop_current_blocker_qu
|
||||
payload["summary"][
|
||||
"ai_loop_current_blocker_queue_readback_normalizer_contract_count"
|
||||
]
|
||||
== 8
|
||||
== 9
|
||||
)
|
||||
assert payload["summary"][
|
||||
"ai_loop_current_blocker_queue_readback_normalizer_field_ids"
|
||||
|
||||
@@ -18,10 +18,12 @@
|
||||
- `ce5bcab8b` 已讓 queue parser / AI Loop 能讀 `remote_ssh_server_accepts_key_then_session_timeout`,但 Harbor repair workflow 的 SSH 診斷尚未輸出該 marker,導致最新 queue 仍只能從舊 repair log 看到泛稱 `remote_ssh_publickey_auth_stalled`。
|
||||
- `.gitea/workflows/harbor-110-local-repair.yaml` 的 `diagnose_ssh_control_channel` 現在在 OpenSSH `Server accepts key` + timeout 時輸出 `harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true` 與 metadata-only `SSH_AUTH user=wooo mode=publickey rc=... classification=server_accepts_key_then_timeout`。
|
||||
- publickey auth stalled 判定同步接受 `Server accepts key` + timeout,不再只依賴 `we sent a publickey packet, wait for reply`。
|
||||
- `ops/runner/read-public-gitea-actions-queue.py` 同步新增 `remote_ssh_publickey_offer_timeout` readback / rollup,讓 110 診斷在 `publickey_offer_timeout` 與 `server_accepts_key_then_timeout` 間波動時都會歸入同一 SSH auth-stall blocker。
|
||||
- workflow 仍維持 hourly / workflow_dispatch、bounded SSH、no secret、no raw SSH log、no reboot / Docker restart / node drain。
|
||||
|
||||
**驗證**:
|
||||
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:通過。
|
||||
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`68 passed`。
|
||||
- `python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:`33 passed`。
|
||||
- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js .gitea/workflows/harbor-110-local-repair.yaml`、`git diff --check`:通過。
|
||||
|
||||
**邊界**:只改 Harbor repair workflow 診斷 marker、workflow profile test、`ops/runner/read-public-gitea-actions-queue.py` 的 `remote_ssh_publickey_offer_timeout` readback / rollup 與其對應測試,以及 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 110 runtime apply。
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
"harbor_110_remote_control_channel_unavailable",
|
||||
"harbor_110_remote_ssh_reachable",
|
||||
"harbor_110_remote_ssh_publickey_auth_stalled",
|
||||
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout",
|
||||
"harbor_110_remote_ssh_publickey_offer_timeout",
|
||||
"harbor_110_remote_ssh_auth_permission_denied_false",
|
||||
"harbor_110_repair_jobs_payload_stale_or_cross_workflow",
|
||||
"bounded_ssh_timeout_seen",
|
||||
@@ -102,7 +102,7 @@
|
||||
"ssh_auth_classification",
|
||||
"remote_control_channel",
|
||||
"remote_ssh_publickey_auth_stalled",
|
||||
"remote_ssh_server_accepts_key_then_session_timeout",
|
||||
"remote_ssh_publickey_offer_timeout",
|
||||
"remote_ssh_auth_permission_denied",
|
||||
"harbor_110_repair_failure_classifier",
|
||||
"harbor_110_repair_jobs_payload_classifier",
|
||||
@@ -123,6 +123,7 @@
|
||||
"bounded_ssh_timeout_seen": true,
|
||||
"remote_ssh_reachable": true,
|
||||
"remote_ssh_publickey_auth_stalled": true,
|
||||
"remote_ssh_publickey_offer_timeout": true,
|
||||
"remote_ssh_server_accepts_key_then_session_timeout": true,
|
||||
"remote_ssh_auth_permission_denied": false,
|
||||
"harbor_110_repair_failure_classifier": "harbor_110_remote_ssh_publickey_auth_stalled",
|
||||
|
||||
@@ -652,6 +652,11 @@ def build_readback(
|
||||
"remote_ssh_publickey_reply_timeout_seen"
|
||||
]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
),
|
||||
@@ -928,6 +933,11 @@ def build_readback(
|
||||
"remote_ssh_publickey_reply_timeout_seen"
|
||||
]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_publickey_offer_timeout": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_offer_timeout"
|
||||
]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
),
|
||||
@@ -1077,6 +1087,13 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
"harbor_110_remote_ssh_publickey_reply_timeout_seen",
|
||||
text,
|
||||
)
|
||||
remote_ssh_publickey_offer_timeout = (
|
||||
"classification=publickey_offer_timeout" in text
|
||||
or (
|
||||
"we sent a publickey packet, wait for reply" in text
|
||||
and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None
|
||||
)
|
||||
)
|
||||
remote_ssh_publickey_auth_stalled = (
|
||||
"harbor_110_remote_ssh_publickey_auth_stalled=true" in text
|
||||
or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text
|
||||
@@ -1085,6 +1102,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
and remote_ssh_publickey_offered is True
|
||||
and remote_ssh_publickey_reply_timeout_seen is True
|
||||
)
|
||||
or remote_ssh_publickey_offer_timeout
|
||||
)
|
||||
remote_ssh_server_accepts_key_then_session_timeout = (
|
||||
"harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text
|
||||
@@ -1153,6 +1171,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
"remote_ssh_publickey_reply_timeout_seen": (
|
||||
remote_ssh_publickey_reply_timeout_seen
|
||||
),
|
||||
"remote_ssh_publickey_offer_timeout": remote_ssh_publickey_offer_timeout,
|
||||
"remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled,
|
||||
"remote_ssh_server_accepts_key_then_session_timeout": (
|
||||
remote_ssh_server_accepts_key_then_session_timeout
|
||||
|
||||
@@ -296,6 +296,7 @@ harbor_110_remote_ssh_userauth_service_accept_seen=true
|
||||
harbor_110_remote_ssh_publickey_offered=true
|
||||
harbor_110_remote_ssh_publickey_reply_timeout_seen=true
|
||||
harbor_110_remote_ssh_publickey_auth_stalled=true
|
||||
SSH_AUTH user=wooo mode=publickey rc=255 classification=publickey_offer_timeout
|
||||
harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true
|
||||
SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout
|
||||
BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110
|
||||
@@ -765,6 +766,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
|
||||
]
|
||||
is True
|
||||
)
|
||||
assert (
|
||||
payload["readback"][
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout"
|
||||
]
|
||||
is True
|
||||
)
|
||||
assert (
|
||||
payload["readback"][
|
||||
"latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen"
|
||||
@@ -781,6 +788,12 @@ def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> Non
|
||||
payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
|
||||
is True
|
||||
)
|
||||
assert (
|
||||
payload["rollups"][
|
||||
"harbor_110_repair_remote_ssh_publickey_offer_timeout"
|
||||
]
|
||||
is True
|
||||
)
|
||||
assert (
|
||||
payload["rollups"][
|
||||
"harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout"
|
||||
|
||||
Reference in New Issue
Block a user