diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 6975138f6..36ab2cd8b 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -1098,14 +1098,18 @@ def apply_stockplatform_runtime_readback( if payload.get("product_data_green") is not True: blocking_fields.append("product_data_green") service_backup["blocking_fields"] = _unique_strings(blocking_fields) - service_backup["controlled_service_data_backup_blocker_count"] = len( - service_backup["blocking_fields"] + blocker_count = len(service_backup["blocking_fields"]) + can_clear = blocker_count == 0 + service_backup["controlled_service_data_backup_blocker_count"] = blocker_count + service_backup["status"] = ( + "ready_service_data_backup_green" + if can_clear + else "blocked_service_data_backup_readback_not_green" ) - rollups["controlled_service_data_backup_blocker_count"] = len( - service_backup["blocking_fields"] - ) - service_backup["status"] = "blocked_service_data_backup_readback_not_green" - service_backup["can_clear_service_data_backup_blockers"] = False + service_backup["can_clear_service_data_backup_blockers"] = can_clear + rollups["controlled_service_data_backup_blocker_count"] = blocker_count + rollups["controlled_service_data_backup_can_clear_blockers"] = can_clear + rollups["controlled_service_data_backup_readback_status"] = service_backup["status"] def _append_live_stockplatform_blockers( diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 7734ff8b1..f84dd7b96 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -476,6 +476,31 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics( ) +def test_reboot_auto_recovery_slo_scorecard_keeps_service_backup_ready_after_live_stockplatform_ok(): + payload = load_latest_reboot_auto_recovery_slo_scorecard( + prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK + ) + + reboot_slo_scorecard.apply_stockplatform_runtime_readback( + payload, + _stockplatform_runtime_ready(), + ) + + service_data_backup = payload["controlled_service_data_backup_readback"] + assert service_data_backup["service_green"] is True + assert service_data_backup["backup_core_green"] is True + assert service_data_backup["product_data_green"] is True + assert service_data_backup["post_start_blocked"] == 0 + assert service_data_backup["blocking_fields"] == [] + assert service_data_backup["status"] == "ready_service_data_backup_green" + assert service_data_backup["can_clear_service_data_backup_blockers"] is True + assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 0 + assert ( + payload["rollups"]["controlled_service_data_backup_readback_status"] + == "ready_service_data_backup_green" + ) + + def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when_source_control_missing( tmp_path, ): diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh index b849fe796..cbbed7af1 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -20,6 +20,7 @@ POST_REBOOT_READINESS_TIMEOUT_SECONDS="${POST_REBOOT_READINESS_TIMEOUT_SECONDS:- PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS="${PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS:-8}" PUBLIC_MAINTENANCE_URLS="${PUBLIC_MAINTENANCE_URLS:-https://awoooi.wooo.work/api/v1/health https://awoooi.wooo.work/ https://stock.wooo.work/api/v1/system/freshness https://mo.wooo.work/health https://bitan.wooo.work/ https://www.tsenyang.com/}" WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS="${WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS:-900}" +POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS="${POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS:-900}" mkdir -p "$TEXTFILE_DIR" "$LOG_DIR" @@ -179,6 +180,47 @@ if ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file" 2>/dev/n write_partial_post_reboot_summary_from_post_start fi +post_reboot_summary_needs_fallback=0 +if ! [ -s "$summary_file" ] \ + || grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$summary_file" \ + || ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file"; then + post_reboot_summary_needs_fallback=1 +fi +if [ "$post_reboot_summary_needs_fallback" = 1 ]; then + post_reboot_current_failed="$artifact_dir/post-reboot-readiness-summary.current-failed.txt" + if [ -s "$summary_file" ]; then + cp "$summary_file" "$post_reboot_current_failed" || true + fi + if [[ "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" =~ ^[1-9][0-9]*$ ]]; then + now_epoch="$(date +%s)" + while IFS= read -r candidate_row; do + candidate_mtime="${candidate_row%% *}" + candidate_path="${candidate_row#* }" + [ "$candidate_path" = "$summary_file" ] && continue + [ -s "$candidate_path" ] || continue + grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$candidate_path" || continue + grep -q '^SERVICE_GREEN=1$' "$candidate_path" || continue + grep -q '^PRODUCT_DATA_GREEN=1$' "$candidate_path" || continue + grep -q '^BACKUP_CORE_GREEN=1$' "$candidate_path" || continue + grep -q '^HOST_188_SERVICE_GREEN=1$' "$candidate_path" || continue + grep -q '^POST_START_BLOCKED=0$' "$candidate_path" || continue + ! grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$candidate_path" || continue + candidate_epoch="${candidate_mtime%.*}" + candidate_age="$((now_epoch - candidate_epoch))" + [ "$candidate_age" -le "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" ] || continue + cp "$candidate_path" "$summary_file" || break + { + printf '\nPOST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1\n' + printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_SOURCE=%s\n' "$candidate_path" + printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_AGE_SECONDS=%s\n' "$candidate_age" + printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS=%s\n' "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" + printf 'POST_REBOOT_READINESS_SUMMARY_CURRENT_FAILED_FILE=%s\n' "$post_reboot_current_failed" + } >>"$summary_file" + break + done < <(find "$LOG_DIR" -mindepth 2 -maxdepth 2 -name summary.txt -type f -printf '%T@ %p\n' 2>/dev/null | sort -nr) + fi +fi + if command -v curl >/dev/null 2>&1; then curl -fsS --max-time "$STOCK_READBACK_TIMEOUT_SECONDS" \ "$STOCK_FRESHNESS_URL" >"$stock_freshness_file" 2>"$artifact_dir/stock-freshness.err" \ diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py index 30f193967..223152a83 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py @@ -119,6 +119,10 @@ def test_exporter_projects_each_scorecard_blocker_to_textfile_metric() -> None: assert "POST_REBOOT_READINESS_PARTIAL_FROM_POST_START=1" in text assert "PARTIAL_POST_START_GREEN_SUMMARY_TIMEOUT" in text assert "write_partial_post_reboot_summary_from_post_start" in text + assert "POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" in text + assert "post-reboot-readiness-summary.current-failed.txt" in text + assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1" in text + assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS" in text assert "awoooi_windows99_vmware_missing_vmx_alias" in text assert "awoooi_windows99_vmware_powered_off_alias" in text assert "active_blocker_metrics" in text