fix(reboot): fallback post reboot readiness readback
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / build-and-deploy (push) Successful in 5m41s
CD Pipeline / post-deploy-checks (push) Successful in 1m45s

This commit is contained in:
Your Name
2026-07-03 03:42:04 +08:00
parent b38337b5f2
commit 17ba08cbc7
4 changed files with 82 additions and 7 deletions

View File

@@ -1098,14 +1098,18 @@ def apply_stockplatform_runtime_readback(
if payload.get("product_data_green") is not True:
blocking_fields.append("product_data_green")
service_backup["blocking_fields"] = _unique_strings(blocking_fields)
service_backup["controlled_service_data_backup_blocker_count"] = len(
service_backup["blocking_fields"]
blocker_count = len(service_backup["blocking_fields"])
can_clear = blocker_count == 0
service_backup["controlled_service_data_backup_blocker_count"] = blocker_count
service_backup["status"] = (
"ready_service_data_backup_green"
if can_clear
else "blocked_service_data_backup_readback_not_green"
)
rollups["controlled_service_data_backup_blocker_count"] = len(
service_backup["blocking_fields"]
)
service_backup["status"] = "blocked_service_data_backup_readback_not_green"
service_backup["can_clear_service_data_backup_blockers"] = False
service_backup["can_clear_service_data_backup_blockers"] = can_clear
rollups["controlled_service_data_backup_blocker_count"] = blocker_count
rollups["controlled_service_data_backup_can_clear_blockers"] = can_clear
rollups["controlled_service_data_backup_readback_status"] = service_backup["status"]
def _append_live_stockplatform_blockers(

View File

@@ -476,6 +476,31 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
)
def test_reboot_auto_recovery_slo_scorecard_keeps_service_backup_ready_after_live_stockplatform_ok():
    """Check the service/data/backup readback after a live stockplatform overlay.

    Loads the latest scorecard with the Prometheus runtime readback, applies
    the readback produced by ``_stockplatform_runtime_ready()``, and expects
    the controlled service/data/backup section and its rollups to report a
    green, blocker-free state.
    """
    scorecard = load_latest_reboot_auto_recovery_slo_scorecard(
        prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
    )
    reboot_slo_scorecard.apply_stockplatform_runtime_readback(
        scorecard, _stockplatform_runtime_ready()
    )

    readback = scorecard["controlled_service_data_backup_readback"]
    ready_status = "ready_service_data_backup_green"

    # Every green flag must be the literal True, not merely truthy.
    for green_flag in ("service_green", "backup_core_green", "product_data_green"):
        assert readback[green_flag] is True
    assert readback["post_start_blocked"] == 0
    assert readback["blocking_fields"] == []
    assert readback["status"] == ready_status
    assert readback["can_clear_service_data_backup_blockers"] is True

    # The rollups must mirror the per-section result.
    rollups = scorecard["rollups"]
    assert rollups["controlled_service_data_backup_blocker_count"] == 0
    assert rollups["controlled_service_data_backup_readback_status"] == ready_status
def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when_source_control_missing(
tmp_path,
):

View File

@@ -20,6 +20,7 @@ POST_REBOOT_READINESS_TIMEOUT_SECONDS="${POST_REBOOT_READINESS_TIMEOUT_SECONDS:-
# Overridable defaults: each setting keeps a non-empty value already present in
# the environment and otherwise falls back to the default after ':-'.
PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS="${PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS:-8}"
# Space-separated list of URLs.
PUBLIC_MAINTENANCE_URLS="${PUBLIC_MAINTENANCE_URLS:-https://awoooi.wooo.work/api/v1/health https://awoooi.wooo.work/ https://stock.wooo.work/api/v1/system/freshness https://mo.wooo.work/health https://bitan.wooo.work/ https://www.tsenyang.com/}"
# NOTE(review): presumably the max age of a reusable Windows99 VMware readback — confirm at its use site.
WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS="${WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS:-900}"
# Maximum age, in seconds, of an earlier summary.txt that may replace a missing,
# timed-out, or incomplete post-reboot readiness summary.
POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS="${POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS:-900}"
# Make sure the output directories exist before anything is written to them.
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
@@ -179,6 +180,47 @@ if ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file" 2>/dev/n
write_partial_post_reboot_summary_from_post_start
fi
# Post-reboot readiness summary fallback.
#
# The current summary is unusable when it is missing/empty, carries the timeout
# marker, or lacks the AWOOOI summary marker. In that case the newest fully
# green summary.txt found exactly two levels below $LOG_DIR is copied over it,
# provided its mtime is no older than
# $POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS, and provenance lines are
# appended so consumers can tell the result was substituted.
post_reboot_summary_needs_fallback=0
if ! [ -s "$summary_file" ] \
  || grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$summary_file" \
  || ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file"; then
  post_reboot_summary_needs_fallback=1
fi

if [ "$post_reboot_summary_needs_fallback" = 1 ]; then
  post_reboot_current_failed="$artifact_dir/post-reboot-readiness-summary.current-failed.txt"
  # Best effort: keep the failed summary for diagnosis before it is overwritten.
  if [ -s "$summary_file" ]; then
    cp "$summary_file" "$post_reboot_current_failed" || true
  fi

  # The fallback is only attempted when the max age is a positive integer.
  if [[ "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" =~ ^[1-9][0-9]*$ ]]; then
    now_epoch="$(date +%s)"
    # Rows look like "<mtime-epoch.fraction> <path>", newest first.
    while IFS= read -r candidate_row; do
      candidate_mtime="${candidate_row%% *}"
      candidate_path="${candidate_row#* }"
      [ "$candidate_path" = "$summary_file" ] && continue
      [ -s "$candidate_path" ] || continue
      # A candidate must be a complete summary with every green marker set.
      grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$candidate_path" || continue
      grep -q '^SERVICE_GREEN=1$' "$candidate_path" || continue
      grep -q '^PRODUCT_DATA_GREEN=1$' "$candidate_path" || continue
      grep -q '^BACKUP_CORE_GREEN=1$' "$candidate_path" || continue
      grep -q '^HOST_188_SERVICE_GREEN=1$' "$candidate_path" || continue
      grep -q '^POST_START_BLOCKED=0$' "$candidate_path" || continue
      ! grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$candidate_path" || continue
      # Never fall back to a summary that is itself a fallback copy: the copy
      # below gives it a fresh mtime, so chained fallbacks would keep an old
      # green result alive past the max-age bound.
      ! grep -q '^POST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1$' "$candidate_path" || continue
      # Drop the fractional part of the mtime before integer arithmetic.
      candidate_epoch="${candidate_mtime%.*}"
      candidate_age="$((now_epoch - candidate_epoch))"
      [ "$candidate_age" -le "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" ] || continue
      cp "$candidate_path" "$summary_file" || break
      {
        printf '\nPOST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1\n'
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_SOURCE=%s\n' "$candidate_path"
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_AGE_SECONDS=%s\n' "$candidate_age"
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS=%s\n' "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS"
        printf 'POST_REBOOT_READINESS_SUMMARY_CURRENT_FAILED_FILE=%s\n' "$post_reboot_current_failed"
      } >>"$summary_file"
      # The first (newest) acceptable candidate wins.
      break
    done < <(find "$LOG_DIR" -mindepth 2 -maxdepth 2 -name summary.txt -type f -printf '%T@ %p\n' 2>/dev/null | sort -nr)
  fi
fi
if command -v curl >/dev/null 2>&1; then
curl -fsS --max-time "$STOCK_READBACK_TIMEOUT_SECONDS" \
"$STOCK_FRESHNESS_URL" >"$stock_freshness_file" 2>"$artifact_dir/stock-freshness.err" \

View File

@@ -119,6 +119,10 @@ def test_exporter_projects_each_scorecard_blocker_to_textfile_metric() -> None:
assert "POST_REBOOT_READINESS_PARTIAL_FROM_POST_START=1" in text
assert "PARTIAL_POST_START_GREEN_SUMMARY_TIMEOUT" in text
assert "write_partial_post_reboot_summary_from_post_start" in text
assert "POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" in text
assert "post-reboot-readiness-summary.current-failed.txt" in text
assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1" in text
assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS" in text
assert "awoooi_windows99_vmware_missing_vmx_alias" in text
assert "awoooi_windows99_vmware_powered_off_alias" in text
assert "active_blocker_metrics" in text