From b09b103a5936a23adbd0f098f0623d368dcff3f9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 23:39:53 +0800 Subject: [PATCH] fix(recovery): classify ssh key accept timeout --- ...or_registry_controlled_recovery_receipt.py | 18 ++++++++ ...or_registry_controlled_recovery_receipt.py | 43 ++++++++++++++++--- .../diagnose-110-ssh-publickey-auth.sh | 6 ++- .../test_cold_start_monitor_bounded_probes.py | 1 + 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py index 7c427542..85499f80 100644 --- a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py @@ -145,6 +145,9 @@ def validate_harbor_registry_controlled_recovery_receipt( "ssh_publickey_offer_timeout_seen": ssh_diagnosis[ "publickey_offer_timeout_seen" ], + "ssh_publickey_server_accepts_key_then_timeout_seen": ( + ssh_diagnosis["server_accepts_key_then_timeout_seen"] + ), "ssh_publickey_node_exporter_ok": ssh_diagnosis["node_exporter_ok"], "ssh_publickey_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"], "ssh_local_repair_receipt_seen": ssh_local["receipt_seen"], @@ -307,6 +310,8 @@ def _ssh_metadata_phase_status( return "blocked_ssh_metadata_repair_receipt_not_ready" if ssh_diagnosis["publickey_offer_timeout_seen"]: return "blocked_waiting_ssh_metadata_repair_receipt_after_publickey_timeout" + if ssh_diagnosis["server_accepts_key_then_timeout_seen"]: + return "blocked_waiting_110_local_session_path_diagnosis_after_key_accept_timeout" if watchdog_check["receipt_seen"]: return "skipped_not_required" return "blocked_waiting_ssh_metadata_or_harbor_preflight_receipt" @@ -342,6 +347,11 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: and item["classification"] == "publickey_offer_timeout" for item in auth_attempts ) + server_accepts_key_then_timeout_seen = any( + item["mode"] == "publickey" + and item["classification"] == "server_accepts_key_then_timeout" + for item in auth_attempts + ) preauth_timeout_count = sum( 1 for item in auth_attempts if item["classification"] == "preauth_timeout" ) @@ -362,6 +372,9 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: "auth_classifications": auth_attempts, "wooo_publickey_classification": wooo_publickey, "publickey_offer_timeout_seen": publickey_offer_timeout_seen, + "server_accepts_key_then_timeout_seen": ( + server_accepts_key_then_timeout_seen + ), "preauth_timeout_count": preauth_timeout_count, "permission_denied_count": permission_denied_count, "diagnosis_ready": bool(marker_seen and ssh_port_tcp_open and auth_attempts), @@ -774,6 +787,11 @@ def _active_blockers( and not ssh_local["control_channel_metadata_ready"] ): blockers.append("ssh_publickey_offer_timeout_on_wooo") + if ( + ssh_diagnosis["server_accepts_key_then_timeout_seen"] + and not ssh_local["control_channel_metadata_ready"] + ): + blockers.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo") if ssh_local["receipt_seen"] and not ssh_local["control_channel_metadata_ready"]: blockers.append("ssh_local_repair_receipt_metadata_not_ready") if not watchdog_check["receipt_seen"]: diff --git a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py index 5743d0c3..1af68bd6 100644 --- a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py @@ -116,14 +116,18 @@ def test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: assert payload["safe_next_step"] == ( "run_110_local_ssh_metadata_check_then_harbor_watchdog_check_mode" ) - assert "ssh_publickey_offer_timeout_on_wooo" in payload["active_blockers"] + assert "ssh_publickey_server_accepts_key_then_timeout_on_wooo" in payload[ + "active_blockers" + ] diagnosis = payload["readback"]["ssh_publickey_diagnosis"] assert diagnosis["diagnosis_ready"] is True assert diagnosis["node_exporter_ok"] is True assert diagnosis["ssh_port_tcp_open"] is True assert diagnosis["ssh_banner_seen"] is True - assert diagnosis["wooo_publickey_classification"] == "publickey_offer_timeout" - assert diagnosis["publickey_offer_timeout_seen"] is True + assert diagnosis["wooo_publickey_classification"] == ( + "server_accepts_key_then_timeout" + ) + assert diagnosis["server_accepts_key_then_timeout_seen"] is True assert diagnosis["raw_output_returned"] is False phases = { phase["phase_id"]: phase @@ -132,16 +136,41 @@ def test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: assert phases["diagnose_ssh_publickey"]["status"] == "ready" assert phases["repair_ssh_metadata_if_check_confirms_metadata_drift"][ "status" - ] == "blocked_waiting_ssh_metadata_repair_receipt_after_publickey_timeout" + ] == "blocked_waiting_110_local_session_path_diagnosis_after_key_accept_timeout" assert payload["rollups"]["ssh_publickey_diagnosis_receipt_seen"] is True assert payload["rollups"]["ssh_publickey_wooo_publickey_classification"] == ( - "publickey_offer_timeout" + "server_accepts_key_then_timeout" ) assert payload["input_redaction"]["ssh_publickey_diagnosis_output"][ "line_count" ] > 0 +def test_harbor_recovery_receipt_classifies_publickey_offer_timeout() -> None: + diagnosis_output = _ssh_publickey_diagnosis_output().replace( + "rc=124 classification=server_accepts_key_then_timeout", + "rc=255 classification=publickey_offer_timeout", + ) + + payload = validate_harbor_registry_controlled_recovery_receipt( + { + "ssh_publickey_diagnosis_output": diagnosis_output, + } + ) + + assert "ssh_publickey_offer_timeout_on_wooo" in payload["active_blockers"] + diagnosis = payload["readback"]["ssh_publickey_diagnosis"] + assert diagnosis["wooo_publickey_classification"] == "publickey_offer_timeout" + assert diagnosis["publickey_offer_timeout_seen"] is True + phases = { + phase["phase_id"]: phase + for phase in payload["local_console_phase_readback"]["phases"] + } + assert phases["repair_ssh_metadata_if_check_confirms_metadata_drift"][ + "status" + ] == "blocked_waiting_ssh_metadata_repair_receipt_after_publickey_timeout" + + def test_harbor_recovery_receipt_surfaces_gitea_queue_blockers() -> None: payload = validate_harbor_registry_controlled_recovery_receipt( { @@ -351,12 +380,12 @@ NODE_LOAD1_PER_CPU=0.93 NODE_LOAD_CLASSIFIER=load_not_high SSH_PORT=tcp_open SSH_BANNER=SSH-2.0-OpenSSH_8.9p1 -SSH_AUTH user=wooo mode=publickey rc=255 classification=publickey_offer_timeout +SSH_AUTH user=wooo mode=publickey rc=124 classification=server_accepts_key_then_timeout SSH_AUTH user=root mode=publickey rc=255 classification=permission_denied SSH_AUTH user=git mode=publickey rc=255 classification=preauth_timeout SSH_AUTH user=ollama mode=publickey rc=255 classification=preauth_timeout SSH_AUTH user=wooo mode=password_disabled rc=255 classification=permission_denied -INTERPRETATION=publickey_offer_timeout_on_wooo_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path +INTERPRETATION=server_accepts_key_then_timeout_on_wooo_means_check_110_session_pam_account_or_shell_path """ diff --git a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh index f4b70e69..0e014a00 100755 --- a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh +++ b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh @@ -40,7 +40,9 @@ run_timeout() { classify_log() { local path="$1" - if grep -q 'Server accepts key' "$path"; then + if grep -q 'Server accepts key' "$path" && grep -Eiq 'timed out|not responding|Timeout' "$path"; then + echo "server_accepts_key_then_timeout" + elif grep -q 'Server accepts key' "$path"; then echo "server_accepts_key" elif grep -q 'Offering public key' "$path" && grep -Eiq 'timed out|not responding|Timeout' "$path"; then echo "publickey_offer_timeout" @@ -134,4 +136,4 @@ for user in "${USERS[@]}"; do probe_user "$user" "password_disabled" done -echo "INTERPRETATION=publickey_offer_timeout_on_wooo_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path" +echo "INTERPRETATION=server_accepts_key_then_timeout_means_check_110_session_pam_account_or_shell_path;publickey_offer_timeout_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path" diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 98cbbb0a..619c23b9 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -113,6 +113,7 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None: assert "PasswordAuthentication=no" in text assert "PubkeyAuthentication=no" in text assert "NumberOfPasswordPrompts=0" in text + assert "server_accepts_key_then_timeout" in text assert "publickey_offer_timeout" in text assert "NODE_EXPORTER=ok" in text assert "NODE_LOAD1_PER_CPU" in text