zov-tech/backend-py/app/parsers/yamarket.py
wasrusgen d5f290bd0a backend: Playwright + Chromium for JS-rendered sites (Я.Маркет, OZON fallback)
DOCKERFILE:
- + Chromium system deps (libnss3, libxkbcommon0, libgbm1, libgtk-3-0, etc.)
- + RUN python -m playwright install chromium (~150MB)
- + ENV PLAYWRIGHT_BROWSERS_PATH

REQUIREMENTS:
- + playwright >= 1.45

PARSERS:
- new playwright_engine.py — singleton browser, isolated context per request,
  blocks images/fonts/CSS to save memory, waits for selector + JS hydration
- yamarket.py — rewritten to use Playwright (Я.Маркет is React SPA)
- ozon.py — Playwright fallback when composer-api returns challenge (403)
- wb.py — exponential backoff on 429, still uses direct HTTP (JSON API, no JS needed)

STRATEGY (Hybrid Path C):
- Я.Маркет: Playwright (rendering JS)
- OZON: composer-api first, Playwright fallback
- WB: direct HTTP with backoff (JSON API, fast)
- DNS: kept but lower priority (Qrator hard to crack)
- No more proxy needed for primary path

DEPLOY: removed PROXY_STATIC_LIST from .env, expect ~5min for first build (Chromium download)
2026-05-11 13:25:05 +03:00

195 lines
6.5 KiB
Python

"""Парсер Я.Маркета — через Playwright (рендер JS).
Я.Маркет — SPA на React, товары подгружаются через XHR после первой загрузки.
Простой HTTP-запрос не вернёт каталог. Поэтому используем headless Chromium.
Ждём пока в DOM появятся карточки `[data-zone-name="snippet-card"]` или
`a[href*="/product--"]`, потом извлекаем данные.
"""
from __future__ import annotations
import logging
import re
from typing import Any
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from . import playwright_engine
# Module-level logger; records are emitted under the "zov.parser.yamarket" name.
log = logging.getLogger("zov.parser.yamarket")
# Site origin: used to build the search URL and to absolutize relative product links.
_BASE_URL = "https://market.yandex.ru"
# Captures the run of digits/whitespace that directly precedes the ruble sign.
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
def search_yamarket(query: str, limit: int = 3, timeout: float = 30.0,
                    max_retries: int = 1) -> list[dict[str, Any]]:
    """Search Yandex Market for *query* through headless Chromium.

    Args:
        query: Free-text search phrase.
        limit: Maximum number of products to return.
        timeout: Per-attempt page budget in seconds (passed on in milliseconds).
        max_retries: Extra fetch attempts made when an attempt yields no HTML.
            Negative values are treated as zero, so one attempt is always made.

    Returns:
        Up to ``limit`` product dicts as built by ``_parse_html``. An empty
        list is returned when there is nothing to search for, when no HTML
        could be fetched, or when the page looks like a captcha/anti-bot page.
    """
    # Rendering a page in a browser is expensive; bail out before doing so
    # when the call cannot produce any result anyway.
    if limit <= 0 or not query or not query.strip():
        return []

    url = f"{_BASE_URL}/search?text={quote_plus(query)}"
    total_attempts = max(0, max_retries) + 1
    html = None
    for attempt in range(1, total_attempts + 1):
        html = playwright_engine.fetch_page(
            url,
            # Wait for product links or one of the result-list containers.
            wait_selector="a[href*='/product--'], [data-auto='SerpItem'], [data-zone-name='snippet-card']",
            wait_ms=3500,
            timeout_ms=int(timeout * 1000),
        )
        if html:
            break
        log.debug("YaMarket: empty response for query=%r (attempt %d/%d)",
                  query, attempt, total_attempts)

    if not html:
        log.warning("YaMarket: no HTML for query=%r", query)
        return []

    # Lower-case once; the "qrator" marker is only looked for near the top
    # of the document, "showcaptcha" anywhere in it.
    lowered = html.lower()
    if "showcaptcha" in lowered or "qrator" in lowered[:5000]:
        log.warning("YaMarket: Qrator/captcha for query=%r", query)
        return []

    return _parse_html(html, limit=limit)
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
soup = BeautifulSoup(html, "html.parser")
results: list[dict[str, Any]] = []
# Основной селектор — товарные карточки на странице поиска
candidates = (
soup.select("[data-auto='SerpItem']")
or soup.select("[data-zone-name='snippet-card']")
or soup.select("article[data-baobab-name='card']")
or soup.select("article:has(a[href*='/product--'])")
)
for card in candidates:
if len(results) >= limit:
break
item = _extract_card(card)
if item:
results.append(item)
# Резерв — собрать по найденным ссылкам product--
if not results:
seen = set()
for a in soup.select("a[href*='/product--']")[:limit * 2]:
href = a.get("href") or ""
if href in seen:
continue
seen.add(href)
# Берём родительский article как карточку
card = a.find_parent("article") or a.find_parent("div")
if card:
item = _extract_card(card)
if item:
results.append(item)
if len(results) >= limit:
break
return results
def _extract_card(card) -> dict[str, Any] | None:
    """Build a product dict from a single search-result card.

    Pulls the title, link, price, rating, review count, image and offer
    count out of *card*.

    Args:
        card: Parsed element wrapping one product snippet; it must support
            ``select_one`` (as BeautifulSoup elements do).

    Returns:
        A dict with the keys ``title``, ``url``, ``image_url``,
        ``price_min_rub``, ``price_max_rub``, ``rating``, ``reviews_count``,
        ``stores_count``, ``specs`` and ``source``; or ``None`` when the card
        has no usable product link or no title.
    """
    link_el = (
        card.select_one("a[href*='/product--']")
        or card.select_one("a[data-baobab-name='title']")
    )
    if link_el is None:
        return None

    href = (link_el.get("href") or "").strip()
    if not href:
        # Without a target the "product URL" would just be the site root.
        return None
    if href.startswith("//"):
        # Protocol-relative link.
        url = "https:" + href
    elif href.startswith("http"):
        url = href
    else:
        # Site-relative link, with or without a leading slash.
        url = f"{_BASE_URL}/{href.lstrip('/')}"

    # Title: first selector that yields non-empty text, else the link text.
    title = ""
    for selector in (
        "[data-zone-name='title'] span",
        "h3 span",
        "[data-auto='snippet-title']",
    ):
        title_el = card.select_one(selector)
        if title_el is not None:
            title = title_el.get_text(strip=True)
            if title:
                break
    if not title:
        title = link_el.get_text(strip=True)
    if not title:
        return None

    # Price. The captured group may contain any Unicode whitespace used as a
    # thousands separator (NBSP, thin space, ...); str.split() drops them all.
    price_min = None
    price_el = (
        card.select_one("[data-auto='snippet-price-current']")
        or card.select_one("[data-auto='price-value']")
        or card.select_one("[class*='Price']")
    )
    if price_el:
        m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
        if m:
            price_min = _try_int("".join(m.group(1).split()))

    # Image: take the first source that is not an inline data: placeholder,
    # looking at src, then data-src, then the first srcset candidate.
    img_url = None
    img_el = card.select_one("img[src], img[srcset]")
    if img_el:
        sources = [img_el.get("src"), img_el.get("data-src")]
        srcset = img_el.get("srcset") or ""
        if srcset:
            sources.append(srcset.split(",")[0].strip().split(" ")[0])
        for src in sources:
            if not src or "data:image" in src:
                continue
            img_url = "https:" + src if src.startswith("//") else src
            break

    # Rating, written as "4.7" or "4,7".
    rating = None
    rating_el = card.select_one("[data-auto='snippet-rating'], [class*='Rating'] span")
    if rating_el:
        m = re.search(r"\d[.,]\d", rating_el.get_text(strip=True))
        if m:
            rating = _try_float(m.group(0))

    # Review count; digits may be grouped with (possibly non-ASCII) spaces.
    reviews = None
    reviews_el = card.select_one("[data-auto='snippet-feedback'], a[href*='/reviews']")
    if reviews_el:
        m = re.search(r"\d[\d\s]*", reviews_el.get_text(" ", strip=True))
        if m:
            reviews = _try_int("".join(m.group(0).split()))

    # Number of shops / offers.
    stores = None
    stores_el = card.select_one("[data-auto='offer-count'], a[href*='/offers']")
    if stores_el:
        m = re.search(r"\d+", stores_el.get_text(" ", strip=True))
        if m:
            stores = int(m.group(0))

    return {
        "title": title,
        "url": url,
        "image_url": img_url,
        "price_min_rub": price_min,
        # Only one price is extracted per card, so no upper bound is reported.
        "price_max_rub": None,
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": stores,
        "specs": {},
        "source": "yamarket",
    }
def _try_int(v: Any) -> int | None:
if v is None:
return None
try:
return int(float(str(v).replace(" ", "").replace(",", ".")))
except (ValueError, TypeError):
return None
def _try_float(v: Any) -> float | None:
if v is None:
return None
try:
return float(str(v).replace(" ", "").replace(",", "."))
except (ValueError, TypeError):
return None