zov-tech/backend-py/app/parsers/ozon.py
wasrusgen d5f290bd0a backend: Playwright + Chromium for JS-rendered sites (Я.Маркет, OZON fallback)
DOCKERFILE:
- + Chromium system deps (libnss3, libxkbcommon0, libgbm1, libgtk-3-0, etc.)
- + RUN python -m playwright install chromium (~150MB)
- + ENV PLAYWRIGHT_BROWSERS_PATH

REQUIREMENTS:
- + playwright >= 1.45

PARSERS:
- new playwright_engine.py — singleton browser, isolated context per request,
  blocks images/fonts/CSS to save memory, waits for selector + JS hydration
- yamarket.py — rewritten to use Playwright (Я.Маркет is React SPA)
- ozon.py — Playwright fallback when composer-api returns challenge (403)
- wb.py — exponential backoff on 429, still uses direct HTTP (JSON API, no JS needed)

STRATEGY (Hybrid Path C):
- Я.Маркет: Playwright (rendering JS)
- OZON: composer-api first, Playwright fallback
- WB: direct HTTP with backoff (JSON API, fast)
- DNS: kept but lower priority (Qrator hard to crack)
- No more proxy needed for primary path

DEPLOY: removed PROXY_STATIC_LIST from .env, expect ~5min for first build (Chromium download)
2026-05-11 13:25:05 +03:00

263 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Парсер OZON — через composer-api (внутренний JSON API сайта).
OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`.
JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`.
Без прокси возвращает 307/403. Через резидентный РФ-IP проходит.
"""
from __future__ import annotations
import logging
import re
from typing import Any
from urllib.parse import quote_plus
import httpx
from bs4 import BeautifulSoup
from .. import proxy_pool
from . import playwright_engine
log = logging.getLogger("zov.parser.ozon")
_BASE_URL = "https://www.ozon.ru"
_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2"
_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
"Accept": "application/json",
"Accept-Language": "ru-RU,ru;q=0.9",
"x-o3-app-name": "dweb_client",
"x-o3-app-version": "release_18.04",
"x-o3-page-type": "search",
"Referer": "https://www.ozon.ru/",
}
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]:
"""Поиск товара в OZON.
Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно).
"""
# Путь 1: быстрый composer-api
url_param = f"/search/?text={quote_plus(query)}&from_global=true"
params = {"url": url_param}
for attempt in range(max_retries + 1):
try:
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
follow_redirects=False) as client:
resp = client.get(_API_URL, params=params)
if resp.status_code == 200:
try:
return _extract_products(resp.json(), limit=limit)
except Exception:
pass
log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code)
except httpx.HTTPError as e:
log.debug("OZON composer-api err: %s", e)
# Путь 2: Playwright (рендерим обычную HTML-страницу поиска)
if not use_playwright:
return []
log.info("OZON falling back to Playwright for query=%r", query)
page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
html = playwright_engine.fetch_page(
page_url,
wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']",
wait_ms=3500,
timeout_ms=int(timeout * 1000),
)
if not html:
return []
return _parse_html_via_dom(html, limit=limit)
def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]:
"""Fallback: парсим товары из отрендеренного Chrome HTML."""
soup = BeautifulSoup(html, "html.parser")
seen = set()
results: list[dict[str, Any]] = []
for link in soup.select("a[href*='/product/']"):
if len(results) >= limit:
break
href = link.get("href") or ""
if href in seen:
continue
seen.add(href)
# Поднимаемся до карточки
card = link.find_parent("div") or link
title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else ""
if not title or len(title) < 5:
continue
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
url = url.split("?")[0]
# Цена в ближайшем родителе
price = None
price_card = link.find_parent("div", recursive=True)
if price_card:
txt = price_card.get_text(" ", strip=True)
m = _PRICE_RE.search(txt)
if m:
price = _try_int(m.group(1).replace(" ", ""))
# Картинка в карточке
img = None
img_el = card.find("img") if card else None
if img_el:
src = img_el.get("src") or ""
if src.startswith("//"):
src = "https:" + src
if src and "data:image" not in src:
img = src
results.append({
"title": title[:200],
"url": url,
"image_url": img,
"price_min_rub": price,
"price_max_rub": None,
"rating": None,
"reviews_count": None,
"stores_count": None,
"specs": {},
"source": "ozon",
})
return results
def _try_int(v: Any) -> int | None:
if v is None:
return None
try:
return int(float(str(v).replace(" ", "").replace(",", ".")))
except (ValueError, TypeError):
return None
def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
"""OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'."""
widget_states = data.get("widgetStates") or {}
products: list[dict[str, Any]] = []
for key, raw in widget_states.items():
if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key:
continue
try:
import json as _j
w = _j.loads(raw) if isinstance(raw, str) else raw
except Exception:
continue
items = w.get("items") or w.get("products") or []
for it in items:
if len(products) >= limit:
break
item = _build_item(it)
if item:
products.append(item)
if len(products) >= limit:
break
return products
def _build_item(it: dict[str, Any]) -> dict[str, Any] | None:
"""Парсит карточку товара из OZON widget items[]."""
# Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] }
sku = it.get("sku") or it.get("id")
if not sku:
return None
link = (it.get("action") or {}).get("link") or ""
url = f"{_BASE_URL}{link}" if link.startswith("/") else link
# Картинка
image_url = None
imgs = it.get("images") or it.get("tileImage") or []
if isinstance(imgs, list) and imgs:
first = imgs[0]
image_url = first if isinstance(first, str) else (first.get("image") or first.get("src"))
if not image_url:
ti = it.get("tileImage") or {}
if isinstance(ti, dict):
items = ti.get("items") or []
for x in items:
if isinstance(x, dict) and x.get("image"):
image_url = x["image"].get("link") or x["image"].get("src")
break
# Цена и название — берём из mainState текстовых атомов
title = ""
price_min = None
price_max = None
rating = None
reviews = None
for atom in (it.get("mainState") or []):
atom_id = atom.get("id") or ""
atom_type = atom.get("type") or ""
if atom_type == "textAtom":
text = ((atom.get("textAtom") or {}).get("text") or "").strip()
if "name" in atom_id.lower() and not title:
title = re.sub(r"<[^>]+>", "", text)
elif "price" in atom_id.lower():
m = _PRICE_RE.search(text)
if m and not price_min:
price_min = int(m.group(1).replace(" ", "").replace(" ", ""))
elif atom_type == "priceV2":
pv = atom.get("priceV2") or {}
for price_obj in (pv.get("price") or []):
t = (price_obj.get("text") or "").strip()
m = _PRICE_RE.search(t)
if m:
val = int(m.group(1).replace(" ", "").replace(" ", ""))
if price_min is None or val < price_min:
price_min = val
if price_max is None or val > price_max:
price_max = val
elif atom_type == "labelList":
for lbl in ((atom.get("labelList") or {}).get("items") or []):
t = (lbl.get("title") or "").strip()
# Рейтинг типа "4.7"
if re.fullmatch(r"\d\.\d", t):
rating = float(t)
# Отзывы типа "1242 отзыва"
m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t)
if m:
reviews = int(m.group(1).replace(" ", ""))
if not title:
# Резервный фолбэк — могут быть атомы в otherState
for atom in (it.get("otherState") or []):
text = ((atom.get("textAtom") or {}).get("text") or "").strip()
if text and len(text) > 5:
title = re.sub(r"<[^>]+>", "", text)
break
if not title:
return None
return {
"title": title,
"url": url,
"image_url": image_url,
"price_min_rub": price_min,
"price_max_rub": price_max if price_max and price_max != price_min else None,
"rating": rating,
"reviews_count": reviews,
"stores_count": None,
"specs": {},
"source": "ozon",
}