From fa42099d85d5dd196441206bcf7ecb09398071dc Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 09:45:46 +0800 Subject: [PATCH] fix(runner): diagnose harbor ssh auth stalls --- .gitea/workflows/harbor-110-local-repair.yaml | 59 +++++++++ ops/runner/read-public-gitea-actions-queue.py | 122 +++++++++++++++++- .../test_read_public_gitea_actions_queue.py | 80 ++++++++++++ 3 files changed, 260 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/harbor-110-local-repair.yaml b/.gitea/workflows/harbor-110-local-repair.yaml index 1c244107..23dc5e99 100644 --- a/.gitea/workflows/harbor-110-local-repair.yaml +++ b/.gitea/workflows/harbor-110-local-repair.yaml @@ -70,7 +70,66 @@ jobs: timeout 30 "${ssh_base[@]}" "$@" } + diagnose_ssh_control_channel() { + set +e + diag_output="$( + timeout 20 ssh -vvv -4 \ + -o BatchMode=yes \ + -o PreferredAuthentications=publickey \ + -o PasswordAuthentication=no \ + -o KbdInteractiveAuthentication=no \ + -o GSSAPIAuthentication=no \ + -o NumberOfPasswordPrompts=0 \ + -o ConnectTimeout=8 \ + -o ConnectionAttempts=1 \ + -o ServerAliveInterval=3 \ + -o ServerAliveCountMax=1 \ + "${AWOOOI_110_SSH_TARGET}" \ + 'true' 2>&1 + )" + diag_rc=$? 
+ set -e + + echo "harbor_110_remote_ssh_diag_rc=${diag_rc}" + if printf '%s\n' "${diag_output}" | grep -q "Connection established."; then + echo "harbor_110_remote_ssh_tcp_connected=true" + else + echo "harbor_110_remote_ssh_tcp_connected=false" + fi + if printf '%s\n' "${diag_output}" | grep -q "Remote protocol version"; then + echo "harbor_110_remote_ssh_banner_seen=true" + else + echo "harbor_110_remote_ssh_banner_seen=false" + fi + if printf '%s\n' "${diag_output}" | grep -q "SSH2_MSG_SERVICE_ACCEPT received"; then + echo "harbor_110_remote_ssh_userauth_service_accept_seen=true" + else + echo "harbor_110_remote_ssh_userauth_service_accept_seen=false" + fi + if printf '%s\n' "${diag_output}" | grep -q "Offering public key:"; then + echo "harbor_110_remote_ssh_publickey_offered=true" + else + echo "harbor_110_remote_ssh_publickey_offered=false" + fi + if printf '%s\n' "${diag_output}" | grep -q "we sent a publickey packet, wait for reply" \ + && printf '%s\n' "${diag_output}" | grep -Eq "timed out|Timeout, server"; then + echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=true" + echo "harbor_110_remote_ssh_publickey_auth_stalled=true" + echo "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=${AWOOOI_110_SSH_TARGET}" + else + echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=false" + echo "harbor_110_remote_ssh_publickey_auth_stalled=false" + fi + if printf '%s\n' "${diag_output}" | grep -q "Permission denied"; then + echo "harbor_110_remote_ssh_auth_permission_denied=true" + else + echo "harbor_110_remote_ssh_auth_permission_denied=false" + fi + echo "harbor_110_remote_ssh_diag_raw_log_printed=false" + } + if ! 
run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then + diagnose_ssh_control_channel || true echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}" echo "harbor_110_remote_ssh_reachable=false" exit 65 diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index 1161fc89..a8cd799f 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -85,6 +85,9 @@ _HARBOR_110_REMOTE_SSH_TIMEOUT_RE = re.compile( r"(Connection to 192\.168\.0\.110 port 22 timed out|" r"ssh: connect to host 192\.168\.0\.110 port 22: Operation timed out)" ) +_HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE = ( + r"{name}=(?P<value>true|false)" +) _HARBOR_110_REMOTE_LOCAL_V2_STATUS_RE = re.compile( r"harbor_110_remote_local_v2_http_status=(?P\d{3})" ) @@ -630,6 +633,31 @@ def build_readback( "latest_visible_harbor_110_repair_bounded_ssh_timeout_seen": ( harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"] ), + "latest_visible_harbor_110_repair_remote_ssh_tcp_connected": ( + harbor_110_repair_log_classifier["remote_ssh_tcp_connected"] + ), + "latest_visible_harbor_110_repair_remote_ssh_banner_seen": ( + harbor_110_repair_log_classifier["remote_ssh_banner_seen"] + ), + "latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen": ( + harbor_110_repair_log_classifier[ + "remote_ssh_userauth_service_accept_seen" + ] + ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_offered": ( + harbor_110_repair_log_classifier["remote_ssh_publickey_offered"] + ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_reply_timeout_seen" + ] + ), + 
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( + harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] + ), + "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": ( + harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] + ), "latest_visible_harbor_110_repair_local_registry_v2_status": ( harbor_110_repair_log_classifier["local_registry_v2_status"] ), @@ -727,6 +755,8 @@ def build_readback( if latest_cd_visible_blocked else "blocked_current_cd_workflow_waiting_for_runner_or_queue" if latest_cd_waiting + else "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] else "blocked_harbor_110_remote_control_channel_unavailable" if harbor_110_repair_log_classifier["remote_control_channel_unavailable"] else "blocked_harbor_110_remote_local_registry_v2_unavailable" @@ -874,6 +904,31 @@ def build_readback( "harbor_110_repair_bounded_ssh_timeout_seen": ( harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"] ), + "harbor_110_repair_remote_ssh_tcp_connected": ( + harbor_110_repair_log_classifier["remote_ssh_tcp_connected"] + ), + "harbor_110_repair_remote_ssh_banner_seen": ( + harbor_110_repair_log_classifier["remote_ssh_banner_seen"] + ), + "harbor_110_repair_remote_ssh_userauth_service_accept_seen": ( + harbor_110_repair_log_classifier[ + "remote_ssh_userauth_service_accept_seen" + ] + ), + "harbor_110_repair_remote_ssh_publickey_offered": ( + harbor_110_repair_log_classifier["remote_ssh_publickey_offered"] + ), + "harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_reply_timeout_seen" + ] + ), + "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( + harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] + ), + "harbor_110_repair_remote_ssh_auth_permission_denied": ( + 
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] + ), "harbor_110_repair_local_registry_v2_status": ( harbor_110_repair_log_classifier["local_registry_v2_status"] ), @@ -992,9 +1047,43 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: ) bounded_ssh_timeout_seen = _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None + remote_ssh_tcp_connected = _last_bool_marker( + "harbor_110_remote_ssh_tcp_connected", + text, + ) + remote_ssh_banner_seen = _last_bool_marker( + "harbor_110_remote_ssh_banner_seen", + text, + ) + remote_ssh_userauth_service_accept_seen = _last_bool_marker( + "harbor_110_remote_ssh_userauth_service_accept_seen", + text, + ) + remote_ssh_publickey_offered = _last_bool_marker( + "harbor_110_remote_ssh_publickey_offered", + text, + ) + remote_ssh_publickey_reply_timeout_seen = _last_bool_marker( + "harbor_110_remote_ssh_publickey_reply_timeout_seen", + text, + ) + remote_ssh_publickey_auth_stalled = ( + "harbor_110_remote_ssh_publickey_auth_stalled=true" in text + or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text + or ( + remote_ssh_userauth_service_accept_seen is True + and remote_ssh_publickey_offered is True + and remote_ssh_publickey_reply_timeout_seen is True + ) + ) + remote_ssh_auth_permission_denied = _last_bool_marker( + "harbor_110_remote_ssh_auth_permission_denied", + text, + ) remote_control_channel_unavailable = ( "harbor_110_remote_control_channel_unavailable" in text or (bounded_ssh_timeout_seen and remote_ssh_reachable is False) + or remote_ssh_publickey_auth_stalled ) local_registry_v2_unavailable = ( _HARBOR_110_REMOTE_LOCAL_V2_BLOCKER_RE.search(text) is not None @@ -1024,7 +1113,9 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: return { "failure_classifier": ( - "harbor_110_remote_control_channel_unavailable" + "harbor_110_remote_ssh_publickey_auth_stalled" + if remote_ssh_publickey_auth_stalled + else "harbor_110_remote_control_channel_unavailable" if 
remote_control_channel_unavailable else "harbor_110_remote_local_registry_v2_unavailable" if local_registry_v2_unavailable @@ -1035,6 +1126,17 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: "remote_control_channel_unavailable": remote_control_channel_unavailable, "remote_ssh_reachable": remote_ssh_reachable, "bounded_ssh_timeout_seen": bounded_ssh_timeout_seen, + "remote_ssh_tcp_connected": remote_ssh_tcp_connected, + "remote_ssh_banner_seen": remote_ssh_banner_seen, + "remote_ssh_userauth_service_accept_seen": ( + remote_ssh_userauth_service_accept_seen + ), + "remote_ssh_publickey_offered": remote_ssh_publickey_offered, + "remote_ssh_publickey_reply_timeout_seen": ( + remote_ssh_publickey_reply_timeout_seen + ), + "remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled, + "remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied, "local_registry_v2_status": local_status, "public_registry_v2_status": public_status, "local_registry_v2_unavailable": local_registry_v2_unavailable, @@ -1127,6 +1229,16 @@ def _last_named_match_group(pattern: re.Pattern[str], text: str, group: str) -> return matches[-1].group(group) if matches else "" +def _last_bool_marker(name: str, text: str) -> bool | None: + pattern = re.compile( + _HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE.format(name=re.escape(name)) + ) + matches = list(pattern.finditer(text)) + if not matches: + return None + return matches[-1].group("value") == "true" + + def _read_text_file(path: Path) -> str: return path.read_text(encoding="utf-8") @@ -1194,6 +1306,14 @@ def _human_summary(payload: dict[str, Any]) -> str: "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_REACHABLE=" f"{readback['latest_visible_harbor_110_repair_remote_ssh_reachable']}" ), + ( + "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_AUTH_STALLED=" + f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled'])}" + ), + ( + 
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_REPLY_TIMEOUT_SEEN=" + f"{readback['latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen']}" + ), ( "HARBOR_110_REPAIR_WAITING_AFTER_CD_HARBOR_BLOCKER=" f"{int(readback['harbor_110_repair_waiting_after_cd_harbor_blocker'])}" diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index 0a1922bc..bfcf5f5b 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -282,6 +282,28 @@ harbor_110_remote_ssh_reachable=false """ +def _harbor_110_repair_publickey_auth_stalled_log() -> str: + return """ +operation_boundary_secret_value_read=false +operation_boundary_docker_daemon_restart_performed=false +operation_boundary_host_reboot_performed=false +operation_boundary_node_drain_performed=false +operation_boundary_remote_ssh_bounded=true +harbor_110_remote_ssh_diag_rc=255 +harbor_110_remote_ssh_tcp_connected=true +harbor_110_remote_ssh_banner_seen=true +harbor_110_remote_ssh_userauth_service_accept_seen=true +harbor_110_remote_ssh_publickey_offered=true +harbor_110_remote_ssh_publickey_reply_timeout_seen=true +harbor_110_remote_ssh_publickey_auth_stalled=true +BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110 +harbor_110_remote_ssh_auth_permission_denied=false +harbor_110_remote_ssh_diag_raw_log_printed=false +BLOCKED harbor_110_remote_control_channel_unavailable target=wooo@192.168.0.110 +harbor_110_remote_ssh_reachable=false +""" + + def _harbor_110_repair_success_jobs() -> dict: return { "total_count": 2, @@ -697,6 +719,64 @@ def test_build_readback_classifies_harbor_repair_remote_control_unavailable() -> assert payload["operation_boundaries"]["host_write_performed"] is False +def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> None: + module = _load_module() + payload = module.build_readback( + 
actions_html=_actions_html_cd_running_harbor_repair_waiting().replace( + 'data-tooltip-content="Waiting"', + 'data-tooltip-content="Failure"', + 1, + ), + actions_list_http_status=401, + actions_list_payload={"message": "token is required"}, + cd_jobs_http_status=200, + cd_jobs_payload={"jobs": [], "total_count": 0}, + harbor_110_repair_jobs_http_status=200, + harbor_110_repair_jobs_payload=_harbor_110_repair_stale_code_review_jobs(), + latest_cd_build_log_http_status=200, + latest_cd_build_log_text=_harbor_blocked_log(), + latest_harbor_110_repair_log_http_status=200, + latest_harbor_110_repair_log_text=( + _harbor_110_repair_publickey_auth_stalled_log() + ), + ) + + assert payload["status"] == "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + assert payload["readback"]["latest_visible_harbor_110_repair_failure_classifier"] == ( + "harbor_110_remote_ssh_publickey_auth_stalled" + ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled" + ] + is True + ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen" + ] + is True + ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen" + ] + is True + ) + assert ( + payload["readback"][ + "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied" + ] + is False + ) + assert ( + payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"] + is True + ) + assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False + assert payload["operation_boundaries"]["host_write_performed"] is False + + def test_build_readback_surfaces_harbor_110_repair_no_matching_runner() -> None: module = _load_module() payload = module.build_readback(