zov-tech/backend-py/app/parsers/ozon.py

"""Парсер OZON — через composer-api (внутренний JSON API сайта).

OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`.
JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`.

Без прокси возвращает 307/403. Через резидентный РФ-IP проходит.
"""
from __future__ import annotations
import logging
import re
from typing import Any
from urllib.parse import quote_plus

import httpx
from bs4 import BeautifulSoup

from .. import proxy_pool
from . import playwright_engine

log = logging.getLogger("zov.parser.ozon")

_BASE_URL = "https://www.ozon.ru"
_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2"

_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Accept-Language": "ru-RU,ru;q=0.9",
    "x-o3-app-name": "dweb_client",
    "x-o3-app-version": "release_18.04",
    "x-o3-page-type": "search",
    "Referer": "https://www.ozon.ru/",
}

_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")


def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
                max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]:
    """Поиск товара в OZON.

    Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно).
    """
    # Путь 1: быстрый composer-api
    url_param = f"/search/?text={quote_plus(query)}&from_global=true"
    params = {"url": url_param}
    for attempt in range(max_retries + 1):
        try:
            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
                                            follow_redirects=False) as client:
                resp = client.get(_API_URL, params=params)
            if resp.status_code == 200:
                try:
                    return _extract_products(resp.json(), limit=limit)
                except Exception:
                    pass
            log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code)
        except httpx.HTTPError as e:
            log.debug("OZON composer-api err: %s", e)

    # Путь 2: Playwright (рендерим обычную HTML-страницу поиска)
    if not use_playwright:
        return []
    log.info("OZON falling back to Playwright for query=%r", query)
    page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
    html = playwright_engine.fetch_page(
        page_url,
        wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']",
        wait_ms=3500,
        timeout_ms=int(timeout * 1000),
    )
    if not html:
        return []
    return _parse_html_via_dom(html, limit=limit)


def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]:
    """Fallback: парсим товары из отрендеренного Chrome HTML."""
    soup = BeautifulSoup(html, "html.parser")
    seen = set()
    results: list[dict[str, Any]] = []

    for link in soup.select("a[href*='/product/']"):
        if len(results) >= limit:
            break
        href = link.get("href") or ""
        if href in seen:
            continue
        seen.add(href)

        # Поднимаемся до карточки
        card = link.find_parent("div") or link
        title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else ""
        if not title or len(title) < 5:
            continue

        url = href if href.startswith("http") else f"{_BASE_URL}{href}"
        url = url.split("?")[0]

        # Цена в ближайшем родителе
        price = None
        price_card = link.find_parent("div", recursive=True)
        if price_card:
            txt = price_card.get_text(" ", strip=True)
            m = _PRICE_RE.search(txt)
            if m:
                price = _try_int(m.group(1).replace(" ", ""))

        # Картинка в карточке
        img = None
        img_el = card.find("img") if card else None
        if img_el:
            src = img_el.get("src") or ""
            if src.startswith("//"):
                src = "https:" + src
            if src and "data:image" not in src:
                img = src

        results.append({
            "title": title[:200],
            "url": url,
            "image_url": img,
            "price_min_rub": price,
            "price_max_rub": None,
            "rating": None,
            "reviews_count": None,
            "stores_count": None,
            "specs": {},
            "source": "ozon",
        })

    return results


def _try_int(v: Any) -> int | None:
    if v is None:
        return None
    try:
        return int(float(str(v).replace(" ", "").replace(",", ".")))
    except (ValueError, TypeError):
        return None


def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
    """OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'."""
    widget_states = data.get("widgetStates") or {}
    products: list[dict[str, Any]] = []

    for key, raw in widget_states.items():
        if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key:
            continue
        try:
            import json as _j
            w = _j.loads(raw) if isinstance(raw, str) else raw
        except Exception:
            continue

        items = w.get("items") or w.get("products") or []
        for it in items:
            if len(products) >= limit:
                break
            item = _build_item(it)
            if item:
                products.append(item)
        if len(products) >= limit:
            break

    return products


def _build_item(it: dict[str, Any]) -> dict[str, Any] | None:
    """Парсит карточку товара из OZON widget items[]."""
    # Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] }
    sku = it.get("sku") or it.get("id")
    if not sku:
        return None

    link = (it.get("action") or {}).get("link") or ""
    url = f"{_BASE_URL}{link}" if link.startswith("/") else link

    # Картинка
    image_url = None
    imgs = it.get("images") or it.get("tileImage") or []
    if isinstance(imgs, list) and imgs:
        first = imgs[0]
        image_url = first if isinstance(first, str) else (first.get("image") or first.get("src"))
    if not image_url:
        ti = it.get("tileImage") or {}
        if isinstance(ti, dict):
            items = ti.get("items") or []
            for x in items:
                if isinstance(x, dict) and x.get("image"):
                    image_url = x["image"].get("link") or x["image"].get("src")
                    break

    # Цена и название — берём из mainState текстовых атомов
    title = ""
    price_min = None
    price_max = None
    rating = None
    reviews = None

    for atom in (it.get("mainState") or []):
        atom_id = atom.get("id") or ""
        atom_type = atom.get("type") or ""

        if atom_type == "textAtom":
            text = ((atom.get("textAtom") or {}).get("text") or "").strip()
            if "name" in atom_id.lower() and not title:
                title = re.sub(r"<[^>]+>", "", text)
            elif "price" in atom_id.lower():
                m = _PRICE_RE.search(text)
                if m and not price_min:
                    price_min = int(m.group(1).replace(" ", "").replace(" ", ""))

        elif atom_type == "priceV2":
            pv = atom.get("priceV2") or {}
            for price_obj in (pv.get("price") or []):
                t = (price_obj.get("text") or "").strip()
                m = _PRICE_RE.search(t)
                if m:
                    val = int(m.group(1).replace(" ", "").replace(" ", ""))
                    if price_min is None or val < price_min:
                        price_min = val
                    if price_max is None or val > price_max:
                        price_max = val

        elif atom_type == "labelList":
            for lbl in ((atom.get("labelList") or {}).get("items") or []):
                t = (lbl.get("title") or "").strip()
                # Рейтинг типа "4.7"
                if re.fullmatch(r"\d\.\d", t):
                    rating = float(t)
                # Отзывы типа "1242 отзыва"
                m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t)
                if m:
                    reviews = int(m.group(1).replace(" ", ""))

    if not title:
        # Резервный фолбэк — могут быть атомы в otherState
        for atom in (it.get("otherState") or []):
            text = ((atom.get("textAtom") or {}).get("text") or "").strip()
            if text and len(text) > 5:
                title = re.sub(r"<[^>]+>", "", text)
                break

    if not title:
        return None

    return {
        "title": title,
        "url": url,
        "image_url": image_url,
        "price_min_rub": price_min,
        "price_max_rub": price_max if price_max and price_max != price_min else None,
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": None,
        "specs": {},
        "source": "ozon",
    }