571 lines
21 KiB
Python
571 lines
21 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""外部市場來源與報價的正規化服務。
|
||
|
||
第一版只做資料規格、來源狀態與只讀統計,不主動抓資料、不寫 DB。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import logging
|
||
import csv
|
||
import io
|
||
from dataclasses import dataclass, field
|
||
from datetime import datetime
|
||
from typing import Any
|
||
|
||
from sqlalchemy import inspect, text
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# Registry of the external market sources this module knows about.
# Each entry is a plain dict; "status_code" ("active" / "paused") is what the
# source-code sets and the readiness report elsewhere in this module key on.
SOURCE_CONTRACTS = [
    dict(
        code="momo_reference",
        display_name="MOMO 外部價格參考",
        platform_code="momo",
        status_code="active",
        status_label="正在使用",
        source_kind="legacy_bridge",
        input_methods=["既有比價快取", "手動 CSV", "供應商 API"],
        data_quality_label="只採用已確認同款",
        plain_note="目前用已確認同款的 MOMO 參考價,協助判斷 PChome 商品是否需要調整售價或曝光。",
    ),
    dict(
        code="shopee",
        display_name="蝦皮",
        platform_code="shopee",
        status_code="paused",
        status_label="先暫停",
        source_kind="connector_contract",
        input_methods=["官方 API", "供應商 API", "手動 CSV"],
        data_quality_label="暫不進告警",
        plain_note="先保留資料接口,等有穩定合法來源後再啟用,不會影響目前作戰清單。",
    ),
    dict(
        code="coupang",
        display_name="酷澎",
        platform_code="coupang",
        status_code="paused",
        status_label="先暫停",
        source_kind="connector_contract",
        input_methods=["官方 API", "供應商 API", "手動 CSV"],
        data_quality_label="暫不進告警",
        plain_note="先保留資料接口,等有穩定合法來源後再啟用,不會影響目前作戰清單。",
    ),
]
|
||
|
||
# Field specification of a normalized offer, in display order.
# Each row is (name, label, required, plain_note) and is expanded into the
# dict shape the contract builder publishes.
NORMALIZED_OFFER_FIELDS = [
    {"name": name, "label": label, "required": required, "plain_note": plain_note}
    for name, label, required, plain_note in (
        ("source_code", "資料來源", True, "例如 momo_reference、shopee、coupang。"),
        ("source_product_id", "外部商品 ID", True, "外部平台或資料供應商給的商品編號。"),
        ("title", "商品名稱", True, "用來做人工確認與名稱比對。"),
        ("price", "售價", True, "只填可直接比較的成交或頁面售價。"),
        ("observed_at", "資料時間", True, "這筆價格看到的時間。"),
        (
            "ingestion_method",
            "取得方式",
            True,
            "official_api、provider_api、manual_csv 或 legacy_competitor_cache。",
        ),
        ("pchome_product_id", "PChome 商品 ID", False, "若已確認同款才填,未確認就留空。"),
        ("quality_score", "資料可信度", False, "0 到 100,低於 76 不進自動告警。"),
    )
]
|
||
|
||
# Accepted CSV header spellings, keyed by the canonical field name they map to.
# Matching is exact (case- and whitespace-sensitive), so every accepted variant
# must be listed explicitly; each set also contains the canonical name itself.
CSV_HEADER_ALIASES = {
    "source_code": {"source_code", "資料來源", "來源", "平台來源"},
    "platform_code": {"platform_code", "平台", "平台代碼"},
    "source_product_id": {"source_product_id", "外部商品 ID", "外部商品ID", "商品ID", "商品編號"},
    "title": {"title", "商品名稱", "品名", "name"},
    "price": {"price", "售價", "價格", "成交價"},
    "observed_at": {"observed_at", "資料時間", "抓取時間", "看到時間", "時間"},
    "ingestion_method": {"ingestion_method", "取得方式", "匯入方式", "來源方式"},
    "currency": {"currency", "幣別"},
    "original_price": {"original_price", "原價", "牌價"},
    "product_url": {"product_url", "商品網址", "網址", "url"},
    "brand": {"brand", "品牌"},
    "category_text": {"category_text", "分類", "類別"},
    "pchome_product_id": {"pchome_product_id", "PChome 商品 ID", "PChome商品ID", "pchome_id"},
    # The original literal listed "momo_sku" twice; a set keeps only one copy,
    # so dropping the duplicate changes nothing at runtime.
    "momo_sku": {"momo_sku", "MOMO SKU", "momo_i_code"},
    "match_status": {"match_status", "同款狀態", "比對狀態"},
    "quality_score": {"quality_score", "資料可信度", "可信度", "品質分數"},
    "data_quality_status": {"data_quality_status", "資料狀態", "品質狀態"},
    "quality_note": {"quality_note", "備註", "品質備註"},
}
|
||
|
||
def _source_codes(status_code: str | None = None) -> set[str]:
    """Collect source codes from SOURCE_CONTRACTS, optionally filtered by status_code."""
    codes: set[str] = set()
    for contract in SOURCE_CONTRACTS:
        if status_code is None or contract["status_code"] == status_code:
            codes.add(contract["code"])
    return codes


# Every registered source, then the subsets by status.
ALLOWED_SOURCE_CODES = _source_codes()
PAUSED_SOURCE_CODES = _source_codes("paused")
ACTIVE_SOURCE_CODES = _source_codes("active")
|
||
|
||
|
||
@dataclass(frozen=True)
class ExternalOfferPayload:
    """Immutable, normalized representation of a single external price offer."""

    # Required identification and pricing fields.
    source_code: str
    platform_code: str
    source_product_id: str
    title: str
    price: float | None
    observed_at: str
    ingestion_method: str
    # Optional descriptive fields.
    currency: str = "TWD"
    original_price: float | None = None
    product_url: str | None = None
    brand: str | None = None
    category_text: str | None = None
    # Optional matching and quality fields.
    pchome_product_id: str | None = None
    momo_sku: str | None = None
    match_status: str = "unmatched"
    quality_score: float = 0.0
    data_quality_status: str = "needs_review"
    quality_notes: list[str] = field(default_factory=list)

    def to_record(self) -> dict[str, Any]:
        """Return the offer as a flat dict.

        Two derived entries are added: ``source_offer_key``
        (``"<source_code>:<source_product_id>"``) and ``quality_notes_json``
        (the notes list serialized as JSON without ASCII escaping). An empty
        ``currency`` falls back to ``"TWD"``.
        """
        record: dict[str, Any] = {}
        for attr in ("source_code", "platform_code", "source_product_id"):
            record[attr] = getattr(self, attr)
        record["source_offer_key"] = f"{self.source_code}:{self.source_product_id}"
        record["title"] = self.title
        record["price"] = self.price
        record["currency"] = self.currency or "TWD"
        for attr in (
            "original_price",
            "product_url",
            "brand",
            "category_text",
            "observed_at",
            "ingestion_method",
            "pchome_product_id",
            "momo_sku",
            "match_status",
            "quality_score",
            "data_quality_status",
        ):
            record[attr] = getattr(self, attr)
        record["quality_notes_json"] = json.dumps(self.quality_notes, ensure_ascii=False)
        return record
|
||
|
||
|
||
def _to_float(value: Any) -> float | None:
    """Convert *value* to a finite float, or return ``None`` if that is not possible.

    ``None``, the empty string, values ``float()`` cannot parse, integers too
    large for a float, and non-finite results (NaN, +/-infinity) all yield
    ``None``.

    Args:
        value: Raw number-like input, e.g. a CSV cell string or a JSON number.

    Returns:
        The parsed finite float, or ``None``.
    """
    import math  # function-scope import keeps this fix independent of the file's import block

    if value is None or value == "":
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() of an int beyond the float range.
        return None
    # float() happily parses "nan" / "inf". NaN defeats every ordering
    # comparison (so range checks and min/max clamps silently misbehave),
    # which is why it is rejected here instead of being passed downstream.
    return number if math.isfinite(number) else None
|
||
|
||
|
||
def _load_json_list(value: Any) -> list[Any]:
    """Return *value* as a list, decoding it from JSON when necessary.

    A non-empty list is returned unchanged. Any other truthy value is handed
    to ``json.loads`` and kept only when the decoded result is a list. Falsy
    input, undecodable input and non-list JSON all produce an empty list.
    """
    result: list[Any] = []
    if value and isinstance(value, list):
        result = value
    elif value:
        try:
            decoded = json.loads(value)
        except Exception:
            # Best-effort parse: anything json.loads rejects counts as "no list".
            decoded = None
        if isinstance(decoded, list):
            result = decoded
    return result
|
||
|
||
|
||
def _has_table(conn, table_name: str) -> bool:
    """Report whether *table_name* exists according to the SQLAlchemy inspector for *conn*.

    Any exception raised while probing is logged as a warning (with traceback)
    and reported as ``False`` rather than propagated.
    """
    try:
        inspector = inspect(conn)
        found = inspector.has_table(table_name)
    except Exception:
        logger.warning("[ExternalMarket] table probe failed: %s", table_name, exc_info=True)
        return False
    return found
|
||
|
||
|
||
def normalize_external_offer_payload(payload: dict[str, Any]) -> tuple[ExternalOfferPayload | None, list[str]]:
    """Normalize one raw offer (official API, provider API or manual CSV row).

    Required values are source_code, source_product_id, title (falling back to
    the ``name`` key), price, observed_at and ingestion_method. ``platform_code``
    falls back to ``source_code``. ``observed_at`` must be accepted by
    ``datetime.fromisoformat`` after "Z" is replaced with "+00:00". Price must
    be a finite number greater than 0.

    Optional numbers (``original_price``, ``quality_score``) that are missing,
    unparsable or non-finite become ``None`` / ``0.0`` respectively;
    ``quality_score`` is clamped to the 0-100 range. Quality notes come from
    ``quality_notes`` (a list or a JSON list) or, failing that, from the single
    ``quality_note`` value. Other optional fields are passed through unchanged.

    Args:
        payload: Raw offer mapping keyed by canonical field names.

    Returns:
        ``(record, [])`` on success, or ``(None, errors)`` where *errors* is a
        list of user-facing validation messages.
    """
    import math  # function-scope import keeps this fix independent of the file's import block

    def finite_number(raw: Any) -> float | None:
        # float() parses "nan"/"inf". NaN would slip past the ``price <= 0``
        # check, and ``max(0.0, min(100.0, nan))`` evaluates to 100.0, which
        # would turn a garbage quality score into a perfect one. Treat every
        # non-finite number as missing instead.
        number = _to_float(raw)
        if number is None or not math.isfinite(number):
            return None
        return number

    errors: list[str] = []
    source_code = str(payload.get("source_code") or "").strip()
    platform_code = str(payload.get("platform_code") or source_code or "").strip()
    source_product_id = str(payload.get("source_product_id") or "").strip()
    title = str(payload.get("title") or payload.get("name") or "").strip()
    ingestion_method = str(payload.get("ingestion_method") or "").strip()
    observed_at = str(payload.get("observed_at") or "").strip()
    price = finite_number(payload.get("price"))

    required_values = {
        "資料來源": source_code,
        "外部商品 ID": source_product_id,
        "商品名稱": title,
        "售價": price,
        "資料時間": observed_at,
        "取得方式": ingestion_method,
    }
    for label, value in required_values.items():
        if value is None or value == "":
            errors.append(f"缺少{label}")

    if price is not None and price <= 0:
        errors.append("售價必須大於 0")

    if observed_at:
        try:
            # fromisoformat() on older Python versions rejects a "Z" suffix.
            datetime.fromisoformat(observed_at.replace("Z", "+00:00"))
        except ValueError:
            errors.append("資料時間格式需為 ISO 格式,例如 2026-06-15T10:00:00")

    if errors:
        return None, errors

    quality_score = finite_number(payload.get("quality_score"))
    if quality_score is None:
        quality_score = 0.0

    quality_notes = _load_json_list(payload.get("quality_notes"))
    if not quality_notes and payload.get("quality_note"):
        quality_notes = [str(payload.get("quality_note"))]

    record = ExternalOfferPayload(
        source_code=source_code,
        platform_code=platform_code,
        source_product_id=source_product_id,
        title=title,
        price=price,
        observed_at=observed_at,
        ingestion_method=ingestion_method,
        currency=str(payload.get("currency") or "TWD").strip() or "TWD",
        original_price=finite_number(payload.get("original_price")),
        product_url=payload.get("product_url"),
        brand=payload.get("brand"),
        category_text=payload.get("category_text"),
        pchome_product_id=payload.get("pchome_product_id"),
        momo_sku=payload.get("momo_sku"),
        match_status=str(payload.get("match_status") or "unmatched"),
        quality_score=max(0.0, min(100.0, quality_score)),
        data_quality_status=str(payload.get("data_quality_status") or "needs_review"),
        quality_notes=[str(item) for item in quality_notes],
    )
    return record, []
|
||
|
||
|
||
def _normalize_header(header: str) -> str:
    """Map a CSV header to its canonical field name.

    The header is stripped of surrounding whitespace and BOM characters, then
    looked up (exact match) in CSV_HEADER_ALIASES. Unknown headers are
    returned in their cleaned form.
    """
    label = str(header or "").strip().replace("\ufeff", "")
    matches = (
        canonical
        for canonical, aliases in CSV_HEADER_ALIASES.items()
        if label in aliases
    )
    # First canonical name (in mapping order) whose alias set contains the label.
    return next(matches, label)
|
||
|
||
|
||
def _read_csv_rows(csv_text: str, limit: int) -> tuple[list[dict[str, Any]], list[str]]:
    """Parse CSV text into rows keyed by canonical field names.

    The delimiter (comma, tab or semicolon) is sniffed from the first 4096
    characters, falling back to the ``csv.excel`` dialect. Headers are mapped
    through ``_normalize_header``. Rows in which every cell is blank are
    skipped; kept rows carry a ``_row_number`` entry (the header counts as
    row 1). At most *limit* rows are returned.

    Args:
        csv_text: Complete CSV content, including the header row.
        limit: Maximum number of data rows to return.

    Returns:
        ``(rows, [])`` on success, or ``([], errors)`` when the text is empty,
        has no header row, or two headers map to the same canonical name.
    """
    text_value = (csv_text or "").strip("\ufeff\n\r ")
    if not text_value:
        return [], ["CSV 內容是空的"]

    try:
        dialect = csv.Sniffer().sniff(text_value[:4096], delimiters=",\t;")
    except csv.Error:
        dialect = csv.excel

    reader = csv.DictReader(io.StringIO(text_value), dialect=dialect)
    if not reader.fieldnames:
        return [], ["找不到表頭列"]

    # DictReader keys every row by the header text exactly as it appears in
    # the file. Keep those original keys for the lookup below: looking rows up
    # by the *cleaned* header silently drops the whole column whenever a
    # header carries stray whitespace or a BOM.
    source_keys = list(reader.fieldnames)
    cleaned_headers = [str(key or "").strip().replace("\ufeff", "") for key in source_keys]
    normalized_headers = [_normalize_header(header) for header in cleaned_headers]
    if len(set(normalized_headers)) != len(normalized_headers):
        return [], ["表頭有重複欄位,請先合併或重新命名"]

    columns = list(zip(source_keys, normalized_headers))
    rows: list[dict[str, Any]] = []
    for index, raw_row in enumerate(reader, start=2):
        if len(rows) >= limit:
            break
        normalized: dict[str, Any] = {}
        has_value = False
        for source_key, normalized_header in columns:
            value = raw_row.get(source_key)
            if value is not None and str(value).strip():
                has_value = True
            normalized[normalized_header] = str(value or "").strip()
        if has_value:
            normalized["_row_number"] = index
            rows.append(normalized)

    return rows, []
|
||
|
||
|
||
def _classify_offer_record(record: ExternalOfferPayload | None, errors: list[str]) -> dict[str, Any]:
    """Decide whether a normalized offer is blocked, needs review, or is ready.

    * ``blocked`` - normalization produced errors or no record.
    * ``ready``   - the source is active and no check failed.
    * ``review``  - everything else; ``reasons`` lists the failed checks.

    Only ``ready`` offers get ``can_enter_alerts`` set to ``True``.
    """
    if errors or record is None:
        return {
            "status_code": "blocked",
            "status_label": "不能使用",
            "can_enter_alerts": False,
            "reasons": errors or ["資料格式需要修正"],
        }

    source_code = record.source_code
    verified_statuses = {"verified", "usable", "reviewed", "exact", "confirmed"}
    is_verified_match = (record.match_status or "").strip().lower() in verified_statuses
    has_pchome_id = bool(str(record.pchome_product_id or "").strip())
    has_good_quality = record.quality_score >= 76

    # (check failed?, user-facing reason) in the order the reasons are reported.
    checks = (
        (source_code not in ALLOWED_SOURCE_CODES, "資料來源不在允許清單"),
        (source_code in PAUSED_SOURCE_CODES, "這個來源目前先暫停,不進告警"),
        (not is_verified_match, "尚未確認同款"),
        (not has_pchome_id, "缺少 PChome 商品 ID,無法連到業績"),
        (not has_good_quality, "資料可信度低於 76"),
    )
    reasons = [message for failed, message in checks if failed]

    # An empty reasons list already implies a verified match, a PChome ID and
    # sufficient quality, so only the active-source condition remains.
    if not reasons and source_code in ACTIVE_SOURCE_CODES:
        return {
            "status_code": "ready",
            "status_label": "可使用",
            "can_enter_alerts": True,
            "reasons": ["可進作戰清單"],
        }

    return {
        "status_code": "review",
        "status_label": "需人工確認",
        "can_enter_alerts": False,
        "reasons": reasons or ["需要人工確認"],
    }
|
||
|
||
|
||
def dry_run_external_offer_csv(csv_text: str, *, limit: int = 200) -> dict[str, Any]:
    """Pre-check a manual CSV against the external offer format (read-only, no DB writes).

    *limit* is clamped to 1..1000 (a falsy value means 200). Each parsed row
    is normalized and classified; the result carries per-status counts and a
    per-row preview with at most four reasons each. When the CSV itself cannot
    be parsed, ``success`` is ``False`` and ``errors`` holds the parse errors.
    """
    capped_limit = max(1, min(int(limit or 200), 1000))
    rows, parse_errors = _read_csv_rows(csv_text, limit=capped_limit)

    summary = {
        "total_rows": 0,
        "ready_count": 0,
        "review_count": 0,
        "blocked_count": 0,
    }
    if parse_errors:
        return {
            "success": False,
            "message": "CSV 預檢失敗,請先修正檔案格式。",
            "summary": summary,
            "errors": parse_errors,
            "rows": [],
        }

    summary["total_rows"] = len(rows)
    checked_rows = []
    for row in rows:
        record, errors = normalize_external_offer_payload(row)
        verdict = _classify_offer_record(record, errors)
        status_code = verdict["status_code"]
        summary[f"{status_code}_count"] += 1

        # Rows that failed normalization have no record; fall back to the raw
        # cell text so the preview still identifies the row.
        preview = record.to_record() if record else {}
        if preview:
            quality_score = preview.get("quality_score")
        else:
            quality_score = row.get("quality_score")

        checked_rows.append({
            "row_number": row.get("_row_number"),
            "status_code": status_code,
            "status_label": verdict["status_label"],
            "can_enter_alerts": verdict["can_enter_alerts"],
            "reasons": verdict["reasons"][:4],
            "source_code": preview.get("source_code") or row.get("source_code") or "",
            "source_product_id": preview.get("source_product_id") or row.get("source_product_id") or "",
            "title": preview.get("title") or row.get("title") or "",
            "price": preview.get("price"),
            "pchome_product_id": preview.get("pchome_product_id") or "",
            "quality_score": quality_score,
        })

    manual_csv_contract = build_connector_contracts()["manual_csv"]
    return {
        "success": True,
        "message": "CSV 預檢完成,尚未寫入資料。",
        "summary": summary,
        "errors": [],
        "rows": checked_rows,
        "manual_csv": manual_csv_contract,
    }
|
||
|
||
|
||
def _legacy_momo_reference_stats(conn) -> dict[str, Any]:
    """Count usable rows in the legacy ``competitor_prices`` table.

    Returns ``usable_offer_count`` and ``last_seen_at`` (the latest
    ``crawled_at`` as a string, or ``None``). Zero / ``None`` are returned
    when the table does not exist.
    """
    stats: dict[str, Any] = {"usable_offer_count": 0, "last_seen_at": None}
    if not _has_table(conn, "competitor_prices"):
        return stats

    # NOTE(review): only the PostgreSQL query filters on expires_at; the
    # fallback query counts expired rows too. Confirm whether that is intended.
    if conn.dialect.name != "postgresql":
        # Fallback: tags is matched as text.
        sql = """
            SELECT COUNT(*) AS usable_offer_count, MAX(crawled_at) AS last_seen_at
            FROM competitor_prices
            WHERE source = 'pchome'
            AND competitor_product_id IS NOT NULL
            AND price IS NOT NULL
            AND price > 0
            AND COALESCE(match_score, 0) >= 0.76
            AND COALESCE(tags, '') LIKE '%identity_v2%'
        """
    else:
        # PostgreSQL: tags is tested with the jsonb "?" operator.
        sql = """
            SELECT COUNT(*) AS usable_offer_count, MAX(crawled_at) AS last_seen_at
            FROM competitor_prices
            WHERE source = 'pchome'
            AND competitor_product_id IS NOT NULL
            AND price IS NOT NULL
            AND price > 0
            AND COALESCE(match_score, 0) >= 0.76
            AND (expires_at IS NULL OR expires_at > CURRENT_TIMESTAMP)
            AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
        """

    row = conn.execute(text(sql)).mappings().first() or {}
    stats["usable_offer_count"] = int(row.get("usable_offer_count") or 0)
    stats["last_seen_at"] = str(row.get("last_seen_at") or "") or None
    return stats
|
||
|
||
|
||
def _normalized_offer_stats(conn) -> dict[str, dict[str, Any]]:
    """Aggregate per-source offer statistics from the normalized tables.

    Returns a mapping of source code to its status, enabled flag, total offer
    count, usable offer count and latest ``observed_at``. An empty dict is
    returned when either ``external_market_sources`` or ``external_offers``
    is missing.
    """
    for table in ("external_market_sources", "external_offers"):
        if not _has_table(conn, table):
            return {}

    query = text("""
        SELECT
        s.code,
        s.status,
        s.enabled,
        COUNT(o.id) AS offer_count,
        SUM(CASE
        WHEN o.data_quality_status IN ('verified', 'usable', 'reviewed')
        AND COALESCE(o.quality_score, 0) >= 76
        THEN 1 ELSE 0 END
        ) AS usable_offer_count,
        MAX(o.observed_at) AS last_seen_at
        FROM external_market_sources s
        LEFT JOIN external_offers o ON o.source_code = s.code
        GROUP BY s.code, s.status, s.enabled
    """)
    rows = conn.execute(query).mappings().all()

    stats: dict[str, dict[str, Any]] = {}
    for row in rows:
        stats[str(row["code"])] = {
            "status": row.get("status"),
            "enabled": bool(row.get("enabled")),
            "offer_count": int(row.get("offer_count") or 0),
            "usable_offer_count": int(row.get("usable_offer_count") or 0),
            "last_seen_at": str(row.get("last_seen_at") or "") or None,
        }
    return stats
|
||
|
||
|
||
def build_connector_contracts() -> dict[str, Any]:
    """Return the field contract shared by connectors and the manual CSV import.

    The result exposes the source registry, the normalized field
    specification, and the manual-CSV rules (encoding plus required and
    optional header names derived from the field specification).
    """
    required_headers: list[str] = []
    optional_headers: list[str] = []
    for spec in NORMALIZED_OFFER_FIELDS:
        bucket = required_headers if spec["required"] else optional_headers
        bucket.append(spec["name"])

    manual_csv = {
        "encoding": "utf-8-sig",
        "required_headers": required_headers,
        "optional_headers": optional_headers,
        "plain_rule": "低可信度或未確認同款的資料只進待補資料清單,不自動發告警。",
    }
    return {
        "success": True,
        "version": "2026-06-15",
        "plain_summary": "所有外部市場資料都先轉成同一份商品報價格式,再進作戰清單。",
        "sources": SOURCE_CONTRACTS,
        "normalized_offer_fields": NORMALIZED_OFFER_FIELDS,
        "manual_csv": manual_csv,
    }
|
||
|
||
|
||
def build_external_source_readiness(engine=None) -> dict[str, Any]:
    """Build the UI-facing readiness report for every external source.

    With an *engine*, the normalized tables and the legacy competitor cache
    are probed on one connection; any failure is logged as a warning and the
    values gathered so far are kept. Without an engine all counts stay zero.
    For ``momo_reference`` the usable count is the larger of the normalized
    and legacy counts, and ``last_seen_at`` prefers the normalized value.
    """
    normalized_stats: dict[str, dict[str, Any]] = {}
    legacy_stats: dict[str, Any] = {"usable_offer_count": 0, "last_seen_at": None}
    schema_ready = False

    if engine is not None:
        try:
            with engine.connect() as conn:
                schema_ready = all(
                    _has_table(conn, table)
                    for table in {"external_market_sources", "external_offers"}
                )
                normalized_stats = _normalized_offer_stats(conn)
                legacy_stats = _legacy_momo_reference_stats(conn)
        except Exception:
            logger.warning("[ExternalMarket] source readiness failed", exc_info=True)

    sources = []
    active_count = 0
    paused_count = 0
    usable_count = 0
    for contract in SOURCE_CONTRACTS:
        # Shallow copy so the module-level registry entry is not annotated.
        source = dict(contract)
        stats = normalized_stats.get(source["code"], {})
        usable = int(stats.get("usable_offer_count") or 0)
        last_seen_at = stats.get("last_seen_at")
        if source["code"] == "momo_reference":
            usable = max(usable, int(legacy_stats.get("usable_offer_count") or 0))
            last_seen_at = last_seen_at or legacy_stats.get("last_seen_at")

        is_active = source["status_code"] == "active"
        source["schema_ready"] = schema_ready
        source["usable_offer_count"] = usable
        source["last_seen_at"] = last_seen_at
        source["can_alert"] = is_active and usable > 0
        if not is_active:
            source["plain_state"] = "先保留接口,不進告警"
        elif usable:
            source["plain_state"] = "已接入,可進作戰清單"
        else:
            source["plain_state"] = "已接入,等待可用資料"

        if is_active:
            active_count += 1
        if source["status_code"] == "paused":
            paused_count += 1
        usable_count += int(source["usable_offer_count"])
        sources.append(source)

    return {
        "success": True,
        "schema_ready": schema_ready,
        "active_count": active_count,
        "paused_count": paused_count,
        "usable_offer_count": usable_count,
        "sources": sources,
        "connector_contract": build_connector_contracts(),
        "plain_summary": "MOMO 先用;蝦皮與酷澎先保留接口,暫不進告警。",
    }
|