fix(momo): block EC404 auto-open with end-to-end URL guard

- normalize URLs at write time (scheduler crawlers, routes) to drop javascript:, EC404, and placeholder-i_code (momo_/manual_/pchome_) links
- add a global click+auxclick guard in base.html and ewoooc_base.html that intercepts blocked MOMO URLs and redirects to the safe i_code URL
- per-page dashboards reuse the same isLikelyMomoIcode validation
- /api/track_momo_link records blocked events for diagnosis
- ship sanitize_momo_urls.py to clean existing polluted DB rows

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 12:00:34 +08:00
parent 026d0e7539
commit 75de76ac12
17 changed files with 1570 additions and 48 deletions
--- a/utils/momo_url_utils.py
+++ b/utils/momo_url_utils.py
@@ -0,0 +1,146 @@
+"""Utilities for MOMO product URL normalization and fallback."""
+
+import re
+from typing import Optional
+from urllib.parse import parse_qs, urlparse, urlunparse
+
+# Hostnames accepted as belonging to the MOMO shop.
+MOMO_BASE_DOMAINS = {
+    'm.momoshop.com.tw',
+    'www.momoshop.com.tw',
+}
+
+# Lower-case path of the EC404 error page; compared against lower-cased paths.
+ERR404_PATH = '/ecm/js/err404/ec404.html'
+# Shortest string still accepted as a plausible i_code.
+MOMO_ICODE_FALLBACK_MIN_LEN = 4
+# An i_code may contain only ASCII letters, digits, underscores and hyphens.
+MOMO_ICODE_RE = re.compile(r'^[A-Za-z0-9_-]+$')
+
+
+def is_probable_momo_icode(i_code: Optional[object]) -> bool:
+    """Return whether the value looks like a plausible MOMO product code.
+
+    The value is stringified and stripped, then rejected when it is empty,
+    is a null-like token ('nan', 'none', 'null', 'undefined'), starts with a
+    placeholder prefix ('momo_', 'manual_', 'pchome_'), is shorter than
+    MOMO_ICODE_FALLBACK_MIN_LEN, or contains characters outside MOMO_ICODE_RE.
+    Token and prefix comparisons are case-insensitive.
+    """
+    candidate = str(i_code or '').strip()
+    folded = candidate.lower()
+
+    looks_bogus = (
+        not candidate
+        or folded in {'nan', 'none', 'null', 'undefined'}
+        or folded.startswith(('momo_', 'manual_', 'pchome_'))
+        or len(candidate) < MOMO_ICODE_FALLBACK_MIN_LEN
+    )
+    if looks_bogus:
+        return False
+
+    return MOMO_ICODE_RE.fullmatch(candidate) is not None
+
+
+def build_momo_product_url(i_code: Optional[object]) -> Optional[str]:
+    """Return the GoodsDetail.jsp product URL for i_code, or None.
+
+    None is returned when is_probable_momo_icode() rejects the value;
+    otherwise the stripped string form of i_code is appended to the URL.
+    """
+    if is_probable_momo_icode(i_code):
+        code = str(i_code).strip()
+        return 'https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=' + code
+    return None
+
+
+def extract_momo_i_code(url: Optional[object]) -> Optional[str]:
+    """Extract the i_code product identifier from a URL-like value.
+
+    Lookup order:
+      1. the ``i_code`` query parameter of an http(s) or scheme-relative URL;
+      2. the path segment following ``/goodsDetail/`` (case-insensitive);
+      3. a raw ``?i_code=`` / ``&i_code=`` pattern match, which also covers
+         relative links and strings that cannot be parsed as a URL.
+
+    The host is not checked and the returned code is not validated.
+
+    Args:
+        url: Any value; it is stringified and stripped before parsing.
+
+    Returns:
+        The stripped code, or None when nothing resembling an i_code is found.
+    """
+    if not url:
+        return None
+
+    raw = str(url).strip()
+    if not raw:
+        return None
+
+    # Scheme-relative links ("//host/path") need a scheme before parsing.
+    candidate = f'https:{raw}' if raw.startswith('//') else raw
+    try:
+        parsed = urlparse(candidate)
+    except ValueError:
+        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets);
+        # fall through to the raw pattern match below.
+        parsed = None
+
+    if parsed is not None and parsed.scheme in ('http', 'https'):
+        query = parse_qs(parsed.query or '')
+        i_code = (query.get('i_code') or [''])[0]
+        if i_code:
+            return i_code.strip()
+
+        match = re.search(r'/goodsdetail/([^/?#]+)', parsed.path or '', re.I)
+        if match:
+            return match.group(1).strip()
+
+    # Last resort: pattern match on the raw string.
+    match = re.search(r'[?&]i_code=([^&#]+)', raw, re.I)
+    if match:
+        return match.group(1).strip()
+
+    return None
+
+
+def _normalize_quoted_url(url: str) -> str:
+    """Expand scheme-relative and path-relative links into absolute URLs.
+
+    ``//host/path`` gains an ``https:`` scheme, ``/path`` is anchored on
+    ``https://www.momoshop.com.tw``, and anything else is returned stripped.
+    """
+    stripped = (url or '').strip()
+    if not stripped.startswith('/'):
+        return stripped
+    # A double slash means the host is already present; only add the scheme.
+    prefix = 'https:' if stripped.startswith('//') else 'https://www.momoshop.com.tw'
+    return f'{prefix}{stripped}'
+
+
+def is_valid_momo_product_url(url: str) -> bool:
+    """Return whether url looks like a valid MOMO product page.
+
+    A URL is accepted only when all of the following hold:
+      * it parses, and its scheme is http or https;
+      * its host is listed in MOMO_BASE_DOMAINS;
+      * its path is not the EC404 error page and contains ``goodsdetail``
+        (case-insensitive);
+      * it carries a plausible product code, taken from the ``i_code`` query
+        parameter when present, otherwise from the path segment following
+        ``/goodsDetail/``.
+
+    Args:
+        url: Absolute URL string to check.
+
+    Returns:
+        True for an acceptable product URL, False otherwise. Malformed URLs
+        return False instead of raising.
+    """
+    if not url:
+        return False
+
+    try:
+        parsed = urlparse(url)
+        host = (parsed.hostname or '').lower()
+    except ValueError:
+        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
+        return False
+
+    if parsed.scheme not in ('http', 'https') or host not in MOMO_BASE_DOMAINS:
+        return False
+
+    path = (parsed.path or '').lower()
+    # Product pages are GoodsDetail.jsp or goodsDetail/<i_code>.
+    if ERR404_PATH in path or 'goodsdetail' not in path:
+        return False
+
+    # parse_qs drops blank values, so an empty "i_code=" is treated as absent.
+    query_codes = parse_qs(parsed.query or '').get('i_code')
+    if query_codes:
+        # Placeholder codes (momo_/manual_/pchome_) do not identify a product.
+        return is_probable_momo_icode(query_codes[0])
+
+    # /goodsDetail/<i_code> may come without any query string.
+    path_match = re.search(r'/goodsdetail/([^/]+)', path)
+    return path_match is not None and is_probable_momo_icode(path_match.group(1))
+
+
+def normalize_momo_product_url(url: Optional[object], i_code: Optional[object]) -> Optional[str]:
+    """
+    Normalize a MOMO URL and fall back to the i_code product detail URL when invalid.
+
+    The fallback URL is built from the code embedded in url when that code is
+    plausible, otherwise from the i_code argument. An implausible code found
+    in url (for example a placeholder) never overrides a valid i_code.
+
+    Args:
+        url: Original link.
+        i_code: Product code for fallback URL.
+
+    Returns:
+        The normalized url when it is a valid MOMO product URL carrying a
+        plausible code; otherwise the fallback URL, or None when no plausible
+        code is available.
+    """
+    url_code = extract_momo_i_code(url)
+    url_code_ok = is_probable_momo_icode(url_code)
+    fallback = build_momo_product_url(url_code if url_code_ok else i_code)
+
+    if not url:
+        return fallback
+
+    normalized = _normalize_quoted_url(str(url).strip())
+    if not normalized:
+        return fallback
+
+    # Script pseudo-links can never be a product page.
+    if normalized.lower().startswith(('javascript:', 'void(')):
+        return fallback
+
+    try:
+        looks_valid = is_valid_momo_product_url(normalized)
+    except ValueError:
+        # A malformed URL that cannot be parsed is treated as corrupted data.
+        looks_valid = False
+
+    # Keep the original link only when it also carries a plausible code;
+    # every other case is treated as corrupted data.
+    if looks_valid and url_code_ok:
+        return normalized
+
+    return fallback