fix(momo): block EC404 auto-open with end-to-end URL guard
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
- normalize URLs at write time (scheduler crawlers, routes) to drop javascript:/EC404/placeholder i_code (momo_/manual_/pchome_)
- add global click+auxclick guard in base.html and ewoooc_base.html that intercepts blocked MOMO URLs and redirects to safe i_code URL
- per-page dashboards reuse the same isLikelyMomoIcode validation
- /api/track_momo_link records blocked events for diagnosis
- ship sanitize_momo_urls.py to clean existing polluted DB rows

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
146
utils/momo_url_utils.py
Normal file
146
utils/momo_url_utils.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Utilities for MOMO product URL normalization and fallback."""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
from urllib.parse import parse_qs, urlparse, urlunparse
|
||||
|
||||
# Hostnames accepted as MOMO storefront hosts when validating product URLs.
MOMO_BASE_DOMAINS = {'www.momoshop.com.tw', 'm.momoshop.com.tw'}

# Path of MOMO's "EC404" error page; a URL whose lower-cased path contains
# this fragment is never treated as a valid product link.
ERR404_PATH = '/ecm/js/err404/ec404.html'

# Values shorter than this are not accepted as product codes.
MOMO_ICODE_FALLBACK_MIN_LEN = 4

# Character set a product code may consist of (letters, digits, "_" and "-").
MOMO_ICODE_RE = re.compile(r'^[A-Za-z0-9_-]+$')
|
||||
|
||||
|
||||
def is_probable_momo_icode(i_code: Optional[object]) -> bool:
    """Return whether a value looks like a plausible MOMO product code.

    The value is converted to text and stripped first. It is rejected when it
    is empty, is a stringified null marker (``nan``/``none``/``null``/
    ``undefined``), starts with a placeholder prefix (``momo_``, ``manual_``,
    ``pchome_``), is shorter than ``MOMO_ICODE_FALLBACK_MIN_LEN``, or contains
    characters outside ``MOMO_ICODE_RE``.

    Args:
        i_code: Candidate product code of any type; falsy values count as
            empty.

    Returns:
        True if every check passes, otherwise False.
    """
    candidate = str(i_code or '').strip()
    folded = candidate.lower()

    rejected = (
        not candidate
        # Null-like markers that appear once missing values are stringified.
        or folded in {'nan', 'none', 'null', 'undefined'}
        # Placeholder identifiers, not real product codes.
        or folded.startswith(('momo_', 'manual_', 'pchome_'))
        or len(candidate) < MOMO_ICODE_FALLBACK_MIN_LEN
    )
    if rejected:
        return False

    return MOMO_ICODE_RE.fullmatch(candidate) is not None
|
||||
|
||||
|
||||
def build_momo_product_url(i_code: Optional[object]) -> Optional[str]:
    """Build the canonical MOMO product-detail URL for a product code.

    Args:
        i_code: Candidate product code; it is validated with
            ``is_probable_momo_icode`` before being used.

    Returns:
        The ``GoodsDetail.jsp?i_code=...`` URL on www.momoshop.com.tw, or
        None when the code does not look like a real product code.
    """
    if is_probable_momo_icode(i_code):
        code = str(i_code).strip()
        return f"https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code={code}"
    return None
|
||||
|
||||
|
||||
def extract_momo_i_code(url: Optional[object]) -> Optional[str]:
    """Extract the product code (``i_code``) from a URL-like value.

    Lookup order:

    1. For http(s) URLs (scheme-relative ``//host/...`` links are treated as
       https): the ``i_code`` query parameter.
    2. For the same URLs: the path segment following ``/goodsDetail/``
       (case-insensitive).
    3. For anything else, or when the steps above find nothing: a regex scan
       of the raw text for ``?i_code=...`` / ``&i_code=...``.

    Args:
        url: The link to inspect; any object is accepted and converted with
            ``str()``.

    Returns:
        The stripped, non-empty code, or None when no code can be found.
    """
    # Local import: only the regex fallback needs it, and it keeps this
    # function independent of the module's import block.
    from urllib.parse import unquote_plus

    if not url:
        return None

    raw = str(url).strip()
    if not raw:
        return None

    # Scheme-relative links need a scheme before urlparse sees a host/query.
    normalized = f'https:{raw}' if raw.startswith('//') else raw

    try:
        parsed = urlparse(normalized)
        is_http = parsed.scheme in ('http', 'https')
        query = parse_qs(parsed.query or '') if is_http else {}
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. an unbalanced "[");
        # such input can still be handled by the regex fallback below.
        parsed = None
        is_http = False
        query = {}

    if is_http:
        # Strip before testing so a whitespace-only value does not
        # short-circuit the remaining lookups with an empty result.
        from_query = (query.get('i_code') or [''])[0].strip()
        if from_query:
            return from_query

        match = re.search(r'/goodsdetail/([^/?#]+)', parsed.path or '', re.I)
        if match:
            from_path = match.group(1).strip()
            if from_path:
                return from_path

    # Fallback: scan the raw text. Decode it the same way parse_qs does so
    # both routes return the same representation of the code.
    match = re.search(r'[?&]i_code=([^&#]+)', raw, re.I)
    if match:
        return unquote_plus(match.group(1)).strip() or None

    return None
|
||||
|
||||
|
||||
def _normalize_quoted_url(url: str) -> str:
|
||||
"""Normalize scheme-relative and path-relative URLs."""
|
||||
cleaned = (url or '').strip()
|
||||
if cleaned.startswith('//'):
|
||||
return f'https:{cleaned}'
|
||||
if cleaned.startswith('/'):
|
||||
return f'https://www.momoshop.com.tw{cleaned}'
|
||||
return cleaned
|
||||
|
||||
|
||||
def is_valid_momo_product_url(url: str) -> bool:
    """Return whether a URL looks like a valid MOMO product page.

    A URL is accepted only when all of the following hold:

    * it parses as an http(s) URL,
    * its host is one of ``MOMO_BASE_DOMAINS``,
    * its path does not contain ``ERR404_PATH``,
    * its path contains ``goodsdetail`` (case-insensitive) and the URL either
      has an ``i_code`` query parameter or uses the ``/goodsDetail/<code>``
      path form.

    Args:
        url: Absolute URL to check.

    Returns:
        True when the URL passes every check; False otherwise, including for
        empty or unparseable input.
    """
    if not url:
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. an unbalanced "[");
        # a link that cannot be parsed is simply not a valid product URL.
        return False

    if parsed.scheme not in ('http', 'https'):
        return False
    if (parsed.hostname or '').lower() not in MOMO_BASE_DOMAINS:
        return False

    path = (parsed.path or '').lower()
    if ERR404_PATH in path:
        return False

    # Product pages are GoodsDetail.jsp?i_code=... or /goodsDetail/<code>.
    if 'goodsdetail' not in path:
        return False

    if 'i_code' in parse_qs(parsed.query or ''):
        return True

    # The /goodsDetail/<code> form does not necessarily carry a query string.
    return bool(re.search(r'/goodsdetail/[^/]+', path))
|
||||
|
||||
|
||||
def normalize_momo_product_url(url: Optional[object], i_code: Optional[object]) -> Optional[str]:
    """Normalize a MOMO URL, falling back to the i_code product URL if invalid.

    The fallback URL is built from the code embedded in ``url`` when that code
    looks like a real product code, otherwise from ``i_code``. The original
    link (made absolute) is returned only when it passes
    ``is_valid_momo_product_url``; empty links, ``javascript:`` / ``void(``
    pseudo-links and every other non-product link yield the fallback.

    Args:
        url: Original link.
        i_code: Product code used for the fallback URL.

    Returns:
        The normalized original URL, the fallback product URL, or None when
        the link is unusable and no plausible product code is available.
    """
    # Prefer the code carried by the URL, but only if it is usable: an
    # unusable embedded code (e.g. a "momo_..." placeholder) must not mask a
    # valid i_code argument.
    embedded_code = extract_momo_i_code(url)
    if is_probable_momo_icode(embedded_code):
        fallback_code = embedded_code
    elif is_probable_momo_icode(i_code):
        fallback_code = str(i_code).strip()
    else:
        fallback_code = None
    fallback = build_momo_product_url(fallback_code)

    if not url:
        return fallback

    normalized = _normalize_quoted_url(str(url).strip())
    if not normalized:
        return fallback

    lowered = normalized.lower()
    if lowered.startswith(('javascript:', 'void(')):
        return fallback

    try:
        looks_valid = is_valid_momo_product_url(normalized)
    except ValueError:
        # A link that cannot even be parsed is treated as corrupted data.
        looks_valid = False
    if looks_valid:
        return normalized

    # Anything else (foreign host, unexpected path, EC404 page, unparseable
    # text) is considered corrupted data and replaced by the fallback.
    return fallback
|
||||
Reference in New Issue
Block a user