mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:04:50 +00:00
backend: Playwright + Chromium for JS-rendered sites (Я.Маркет, OZON fallback)
DOCKERFILE: - + Chromium system deps (libnss3, libxkbcommon0, libgbm1, libgtk-3-0, etc.) - + RUN python -m playwright install chromium (~150MB) - + ENV PLAYWRIGHT_BROWSERS_PATH REQUIREMENTS: - + playwright >= 1.45 PARSERS: - new playwright_engine.py — singleton browser, isolated context per request, blocks images/fonts/CSS to save memory, waits for selector + JS hydration - yamarket.py — rewritten to use Playwright (Я.Маркет is React SPA) - ozon.py — Playwright fallback when composer-api returns challenge (403) - wb.py — exponential backoff on 429, still uses direct HTTP (JSON API, no JS needed) STRATEGY (Hybrid Path C): - Я.Маркет: Playwright (rendering JS) - OZON: composer-api first, Playwright fallback - WB: direct HTTP with backoff (JSON API, fast) - DNS: kept but lower priority (Qrator hard to crack) - No more proxy needed for primary path DEPLOY: removed PROXY_STATIC_LIST from .env, expect ~5min for first build (Chromium download)
This commit is contained in:
parent
3ee5275ea0
commit
d5f290bd0a
@ -1,9 +1,16 @@
|
|||||||
FROM python:3.12-slim
|
FROM python:3.12-slim
|
||||||
|
|
||||||
# НУЦ Минцифры root CA — для GigaChat SSL.
|
# НУЦ Минцифры root CA — для GigaChat SSL.
|
||||||
# Скачиваем актуальный bundle на этапе сборки и добавляем в системный trust store.
|
# + системные пакеты для Playwright/Chromium (рендеринг JS-сайтов).
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
&& apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates curl \
|
||||||
|
# Chromium dependencies for Playwright
|
||||||
|
libnss3 libnspr4 libatk-bridge2.0-0 libatk1.0-0 libcups2 \
|
||||||
|
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
|
||||||
|
libgbm1 libgtk-3-0 libasound2 libpango-1.0-0 libcairo2 \
|
||||||
|
libdbus-1-3 libdrm2 libxshmfence1 \
|
||||||
|
fonts-liberation fonts-noto-color-emoji \
|
||||||
&& curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_root_ca.crt \
|
&& curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_root_ca.crt \
|
||||||
https://gu-st.ru/content/Other/doc/russian_trusted_root_ca.cer \
|
https://gu-st.ru/content/Other/doc/russian_trusted_root_ca.cer \
|
||||||
&& curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_sub_ca.crt \
|
&& curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_sub_ca.crt \
|
||||||
@ -16,6 +23,9 @@ WORKDIR /app
|
|||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Скачиваем только Chromium (без firefox/webkit) — ~150MB
|
||||||
|
RUN python -m playwright install chromium
|
||||||
|
|
||||||
COPY app /app/app
|
COPY app /app/app
|
||||||
|
|
||||||
# httpx по умолчанию использует certifi → принудительно указываем системный bundle,
|
# httpx по умолчанию использует certifi → принудительно указываем системный bundle,
|
||||||
@ -23,6 +33,9 @@ COPY app /app/app
|
|||||||
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
|
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
|
||||||
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
|
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
|
||||||
|
|
||||||
|
# Playwright кэш-каталог браузеров
|
||||||
|
ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright
|
||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips=*"]
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips=*"]
|
||||||
|
|||||||
@ -12,8 +12,10 @@ from typing import Any
|
|||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import proxy_pool
|
from .. import proxy_pool
|
||||||
|
from . import playwright_engine
|
||||||
|
|
||||||
log = logging.getLogger("zov.parser.ozon")
|
log = logging.getLogger("zov.parser.ozon")
|
||||||
|
|
||||||
@ -34,38 +36,110 @@ _HEADERS = {
|
|||||||
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
|
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
|
||||||
|
|
||||||
|
|
||||||
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
                max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]:
    """Search OZON for ``query`` and return up to ``limit`` product dicts.

    Two paths are tried in order:

    1. The composer-api JSON endpoint (fast). It is requested up to
       ``max_retries + 1`` times; the first 200 response that parses
       successfully is returned.
    2. If every composer-api attempt fails and ``use_playwright`` is true,
       the regular search page is rendered through ``playwright_engine``
       and parsed from the DOM (slow).

    Args:
        query: free-text search string.
        limit: maximum number of products to return.
        timeout: timeout in seconds, used for each HTTP request and (converted
            to milliseconds) for the Playwright navigation.
        max_retries: extra composer-api attempts after the first one.
        use_playwright: allow the headless-browser fallback.

    Returns:
        A list of product dicts; empty when both paths yield nothing.
    """
    # Path 1: fast composer-api.
    url_param = f"/search/?text={quote_plus(query)}&from_global=true"
    params = {"url": url_param}

    for attempt in range(max_retries + 1):
        try:
            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
                                           follow_redirects=False) as client:
                resp = client.get(_API_URL, params=params)
        except httpx.HTTPError as e:
            log.debug("OZON composer-api err: %s", e)
            continue

        if resp.status_code == 200:
            try:
                return _extract_products(resp.json(), limit=limit)
            except Exception as e:
                # Best effort: a challenge page or changed schema must not
                # abort the search, but record why the fast path was rejected.
                log.debug("OZON composer-api parse failed (attempt %d): %s",
                          attempt + 1, e)
        log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code)

    # Path 2: Playwright (render the ordinary HTML search page).
    if not use_playwright:
        return []
    log.info("OZON falling back to Playwright for query=%r", query)
    page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
    html = playwright_engine.fetch_page(
        page_url,
        wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']",
        wait_ms=3500,
        timeout_ms=int(timeout * 1000),
    )
    if not html:
        return []
    return _parse_html_via_dom(html, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]:
    """Fallback parser: extract products from Chromium-rendered search HTML.

    Every ``<a>`` whose href contains ``/product/`` is treated as a product
    link; its nearest enclosing ``<div>`` is used as the product card from
    which the price and image are read.

    Args:
        html: rendered page markup.
        limit: maximum number of products to return.

    Returns:
        Up to ``limit`` product dicts with ``source`` set to ``"ozon"``.
        Fields this path cannot determine are ``None`` (or ``{}`` for specs).
    """
    soup = BeautifulSoup(html, "html.parser")
    seen: set[str] = set()
    results: list[dict[str, Any]] = []

    for link in soup.select("a[href*='/product/']"):
        if len(results) >= limit:
            break
        href = link.get("href") or ""

        # Normalise first so that the same product reached through links with
        # different query strings is reported only once.
        url = href if href.startswith("http") else f"{_BASE_URL}{href}"
        url = url.split("?")[0]
        if url in seen:
            continue

        # Climb to the card: the nearest enclosing <div>, if there is one.
        parent_div = link.find_parent("div")
        card = parent_div or link

        # Prefer the link's own text; fall back to the first <span> of the card.
        title = link.get_text(strip=True)
        if not title:
            span = card.select_one("span")
            title = span.get_text(strip=True) if span is not None else ""
        if len(title) < 5:
            # Not marked as seen: a later link to the same product (e.g. the
            # title link following an image link) may still carry the title.
            continue
        seen.add(url)

        # Price: first "<digits> ₽" run in the card text.
        # NOTE(review): the pattern also swallows digits that directly precede
        # the price (e.g. a model number) — confirm against real markup.
        price = None
        if parent_div is not None:
            m = _PRICE_RE.search(parent_div.get_text(" ", strip=True))
            if m:
                # split() drops every Unicode whitespace, including NBSP.
                price = _try_int("".join(m.group(1).split()))

        # Image inside the card; inline data-URI placeholders are ignored.
        img = None
        img_el = card.find("img")
        if img_el:
            src = img_el.get("src") or ""
            if src.startswith("//"):
                src = "https:" + src
            if src and "data:image" not in src:
                img = src

        results.append({
            "title": title[:200],
            "url": url,
            "image_url": img,
            "price_min_rub": price,
            "price_max_rub": None,
            "rating": None,
            "reviews_count": None,
            "stores_count": None,
            "specs": {},
            "source": "ozon",
        })

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _try_int(v: Any) -> int | None:
|
||||||
|
if v is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return int(float(str(v).replace(" ", "").replace(",", ".")))
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
|
def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
|
||||||
|
|||||||
129
backend-py/app/parsers/playwright_engine.py
Normal file
129
backend-py/app/parsers/playwright_engine.py
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
"""Singleton Playwright + Chromium для парсинга JS-сайтов.
|
||||||
|
|
||||||
|
Использование:
|
||||||
|
from .playwright_engine import fetch_page
|
||||||
|
html = fetch_page("https://market.yandex.ru/search?text=Bosch+KGN39")
|
||||||
|
|
||||||
|
Зачем синглтон: запуск Chromium ~2-3 сек. Держим один экземпляр, открываем
|
||||||
|
изолированный контекст (cookies/storage) на каждый запрос.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
log = logging.getLogger("zov.parser.playwright")
|
||||||
|
|
||||||
|
_lock = threading.Lock()
|
||||||
|
_playwright = None
|
||||||
|
_browser = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_browser():
    """Return the shared Chromium browser, launching it on first use.

    The browser is cached in the module-level ``_browser``; a cached instance
    is reused while ``is_connected()`` is true. Otherwise any stale browser and
    driver handles are disposed of and a fresh headless Chromium is launched.

    NOTE(review): Playwright documents its sync API as not thread-safe; the
    lock here only serialises creation of the singleton — confirm that all
    callers use the returned browser from the thread that created it.

    Returns:
        The browser object, or ``None`` when Playwright is not installed or
        Chromium fails to start (the failure is logged).
    """
    global _playwright, _browser
    with _lock:
        if _browser is not None and _browser.is_connected():
            return _browser

        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            log.error("Playwright not installed: %s", e)
            return None

        # A previous browser may have crashed or disconnected: release it and
        # its driver before starting new ones so they are not leaked.
        _dispose_locked()

        try:
            _playwright = sync_playwright().start()
            _browser = _playwright.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",  # friendlier on low-memory hosts
                    "--disable-gpu",
                ],
            )
            log.info("Playwright Chromium started")
            return _browser
        except Exception as e:
            log.error("Failed to start Playwright: %s", e)
            # The driver may already be running even though launch() failed.
            _dispose_locked()
            return None


def _dispose_locked() -> None:
    """Close and forget the cached browser and driver; caller holds ``_lock``."""
    global _playwright, _browser
    if _browser is not None:
        try:
            _browser.close()
        except Exception as e:
            log.debug("Ignoring error while closing stale browser: %s", e)
        _browser = None
    if _playwright is not None:
        try:
            _playwright.stop()
        except Exception as e:
            log.debug("Ignoring error while stopping Playwright driver: %s", e)
        _playwright = None
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url: str, wait_selector: Optional[str] = None,
               wait_ms: int = 3000, timeout_ms: int = 25000,
               user_agent: Optional[str] = None) -> Optional[str]:
    """Load ``url`` in headless Chromium and return the rendered HTML.

    A fresh browser context is created for the call and closed afterwards.
    Requests for images, fonts, media and stylesheets are aborted to save
    time and memory.

    Args:
        url: target URL.
        wait_selector: CSS selector to wait for after navigation. The wait
            lasts at most ``wait_ms + 5000`` milliseconds; if the selector
            never appears the page content is returned anyway.
        wait_ms: when ``wait_selector`` is not given, a fixed delay in
            milliseconds after navigation (time for JS hydration); otherwise
            only part of the selector timeout above.
        timeout_ms: navigation timeout in milliseconds.
        user_agent: User-Agent override; defaults to a desktop Chrome string.

    Returns:
        The page HTML, or ``None`` if the browser is unavailable or anything
        fails while loading the page (the failure is logged).
    """
    browser = _get_browser()
    if not browser:
        return None

    blocked_types = ("image", "font", "media", "stylesheet")

    def _route(route):
        # Abort heavy resources; let everything else through.
        if route.request.resource_type in blocked_types:
            return route.abort()
        return route.continue_()

    ctx = None
    page = None
    try:
        ctx = browser.new_context(
            user_agent=user_agent or
                       "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
            locale="ru-RU",
            extra_http_headers={
                "Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
            },
        )
        page = ctx.new_page()
        page.route("**/*", _route)

        page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")

        if wait_selector:
            try:
                page.wait_for_selector(wait_selector, timeout=wait_ms + 5000)
            except Exception:
                # Deliberately best effort: return whatever has rendered.
                log.debug("wait_selector %s not found, continuing", wait_selector)
        else:
            page.wait_for_timeout(wait_ms)

        return page.content()
    except Exception as e:
        log.warning("fetch_page failed for %s: %s", url, e)
        return None
    finally:
        # Cleanup must never mask the result, but it must not swallow
        # KeyboardInterrupt/SystemExit either — hence ``except Exception``.
        for label, handle in (("page", page), ("context", ctx)):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception as e:
                log.debug("Ignoring error while closing %s: %s", label, e)
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown():
    """Close the shared browser and stop Playwright on application shutdown.

    Safe to call when nothing was started. Errors raised while closing are
    logged at debug level and otherwise ignored; the cached handles are always
    reset to ``None``.
    """
    global _playwright, _browser
    with _lock:
        if _browser:
            try:
                _browser.close()
            except Exception as e:
                log.debug("Ignoring error while closing browser: %s", e)
            _browser = None
        if _playwright:
            try:
                _playwright.stop()
            except Exception as e:
                log.debug("Ignoring error while stopping Playwright: %s", e)
            _playwright = None
|
||||||
@ -40,38 +40,44 @@ _HEADERS = {
|
|||||||
|
|
||||||
def search_wb(query: str, limit: int = 3, timeout: float = 12.0,
              max_retries: int = 2) -> list[dict[str, Any]]:
    """Search Wildberries through its JSON API with exponential backoff.

    Transport errors and HTTP 429 responses are retried, waiting 2 s, 4 s, …
    between attempts. Any other non-200 status, an unparseable body or an
    empty product list ends the search immediately.

    Args:
        query: free-text search string.
        limit: maximum number of products to return.
        timeout: per-request timeout in seconds.
        max_retries: extra attempts after the first one.

    Returns:
        Up to ``limit`` items built by ``_build_item``; empty list on failure.
    """
    import time
    params = {**_DEFAULT_PARAMS, "query": query}

    backoff = 2.0
    for attempt in range(max_retries + 1):
        # No point sleeping after the final attempt: we give up right away.
        delay = backoff if attempt < max_retries else 0.0
        try:
            # NOTE(review): the original comment says a direct (proxy-less)
            # connection is used because WB rate-limits per IP, yet the
            # request still goes through proxy_pool.proxied_client — confirm
            # it connects directly when no proxies are configured.
            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS) as client:
                resp = client.get(_SEARCH_URL, params=params)
        except httpx.HTTPError as e:
            log.warning("WB request failed (attempt %d): %s", attempt + 1, e)
            if delay:
                time.sleep(delay)
            backoff *= 2
            continue

        if resp.status_code == 429:
            log.warning("WB rate-limited on attempt %d, sleeping %.1fs", attempt + 1, delay)
            if delay:
                time.sleep(delay)
            backoff *= 2
            continue
        if resp.status_code != 200:
            log.warning("WB returned status=%s", resp.status_code)
            return []

        try:
            data = resp.json()
        except Exception as e:
            log.warning("WB JSON parse failed: %s", e)
            return []

        # Guard against a payload that is valid JSON but not an object.
        payload = data.get("data") if isinstance(data, dict) else None
        products = (payload.get("products") if isinstance(payload, dict) else None) or []
        if not products:
            log.info("WB no products for query=%r", query)
            return []

        return [_build_item(p) for p in products[:limit]]

    log.warning("WB gave up after %d attempts for query=%r", max_retries + 1, query)
    return []
|
||||||
|
|||||||
@ -1,162 +1,171 @@
|
|||||||
"""Парсер Я.Маркета — HTML страница поиска.
|
"""Парсер Я.Маркета — через Playwright (рендер JS).
|
||||||
|
|
||||||
Я.Маркет защищён Qrator. Через резидентный РФ-IP + правильные заголовки
|
Я.Маркет — SPA на React, товары подгружаются через XHR после первой загрузки.
|
||||||
+ cookies на сессию обычно проходит. Без прокси — 401.
|
Простой HTTP-запрос не вернёт каталог. Поэтому используем headless Chromium.
|
||||||
|
|
||||||
Из HTML вытаскиваем JSON, который Я.Маркет встраивает в <script type="application/json">.
|
Ждём пока в DOM появятся карточки `[data-zone-name="snippet-card"]` или
|
||||||
|
`a[href*="/product--"]`, потом извлекаем данные.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
import httpx
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import proxy_pool
|
from . import playwright_engine
|
||||||
|
|
||||||
log = logging.getLogger("zov.parser.yamarket")
|
log = logging.getLogger("zov.parser.yamarket")
|
||||||
|
|
||||||
_BASE_URL = "https://market.yandex.ru"
|
_BASE_URL = "https://market.yandex.ru"
|
||||||
_SEARCH_URL = "https://market.yandex.ru/search"
|
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
|
||||||
|
|
||||||
_HEADERS = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
||||||
"Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
|
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
"Upgrade-Insecure-Requests": "1",
|
|
||||||
}
|
|
||||||
|
|
||||||
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
|
||||||
|
|
||||||
|
|
||||||
def search_yamarket(query: str, limit: int = 3, timeout: float = 30.0,
                    max_retries: int = 1) -> list[dict[str, Any]]:
    """Search Yandex Market for ``query`` via headless Chromium.

    The search page is fetched through ``playwright_engine.fetch_page`` up to
    ``max_retries + 1`` times until HTML is obtained. A page that looks like
    a captcha/Qrator challenge yields an empty result.

    Args:
        query: free-text search string.
        limit: maximum number of products to return.
        timeout: navigation timeout in seconds.
        max_retries: extra fetch attempts after the first one.

    Returns:
        Product dicts produced by ``_parse_html``; empty list on failure.
    """
    url = f"{_BASE_URL}/search?text={quote_plus(query)}"
    # Wait for product links or a results container to show up.
    ready_selector = "a[href*='/product--'], [data-auto='SerpItem'], [data-zone-name='snippet-card']"

    html = None
    remaining = max_retries + 1
    while remaining > 0 and not html:
        remaining -= 1
        html = playwright_engine.fetch_page(
            url,
            wait_selector=ready_selector,
            wait_ms=3500,
            timeout_ms=int(timeout * 1000),
        )

    if not html:
        log.warning("YaMarket: no HTML for query=%r", query)
        return []

    lowered = html.lower()
    # "qrator" is only looked for near the top of the document.
    blocked = "showcaptcha" in lowered or "qrator" in lowered[:5000]
    if blocked:
        log.warning("YaMarket: Qrator/captcha for query=%r", query)
        return []

    return _parse_html(html, limit=limit)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Extract up to ``limit`` products from rendered Yandex Market HTML.

    Card containers are located with a series of selectors, using the first
    one that matches anything. If no card produces an item, product links
    (``/product--``) are used instead and their enclosing ``<article>`` or
    ``<div>`` is treated as the card.

    Args:
        html: rendered page markup.
        limit: maximum number of products to return.

    Returns:
        A list of items as produced by ``_extract_card``.
    """
    soup = BeautifulSoup(html, "html.parser")
    results: list[dict[str, Any]] = []

    # Primary path: product cards on the search page, first matching selector wins.
    card_selectors = (
        "[data-auto='SerpItem']",
        "[data-zone-name='snippet-card']",
        "article[data-baobab-name='card']",
        "article:has(a[href*='/product--'])",
    )
    candidates = next(
        (found for sel in card_selectors if (found := soup.select(sel))),
        [],
    )
    for card in candidates:
        if len(results) >= limit:
            break
        item = _extract_card(card)
        if item:
            results.append(item)

    if results:
        return results

    # Fallback: build cards from the product links themselves.
    visited = set()
    for anchor in soup.select("a[href*='/product--']")[:limit * 2]:
        href = anchor.get("href") or ""
        if href in visited:
            continue
        visited.add(href)
        # Use the enclosing article (or, failing that, div) as the card.
        card = anchor.find_parent("article") or anchor.find_parent("div")
        if card:
            item = _extract_card(card)
            if item:
                results.append(item)
        if len(results) >= limit:
            break

    return results
|
||||||
|
|
||||||
|
|
||||||
def _find_products(data: Any, _depth: int = 0) -> list[dict]:
|
def _extract_card(card) -> dict[str, Any] | None:
|
||||||
"""Рекурсивно ищем массив товаров в JSON Я.Маркета."""
|
"""Достаём заголовок, ссылку, цену, рейтинг, отзывы, фото, кол-во магазинов."""
|
||||||
if _depth > 8:
|
link_el = (
|
||||||
return []
|
card.select_one("a[href*='/product--']")
|
||||||
if isinstance(data, list):
|
or card.select_one("a[data-baobab-name='title']")
|
||||||
# Эвристика: список объектов с offers/price/title
|
)
|
||||||
if data and isinstance(data[0], dict) and (
|
if not link_el:
|
||||||
data[0].get("offers") or data[0].get("prices") or data[0].get("titles")
|
return None
|
||||||
):
|
href = link_el.get("href") or ""
|
||||||
return data
|
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
||||||
for item in data:
|
|
||||||
found = _find_products(item, _depth + 1)
|
|
||||||
if found:
|
|
||||||
return found
|
|
||||||
elif isinstance(data, dict):
|
|
||||||
for v in data.values():
|
|
||||||
found = _find_products(v, _depth + 1)
|
|
||||||
if found:
|
|
||||||
return found
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
title_el = (
|
||||||
def _build_item(p: dict) -> dict[str, Any] | None:
|
card.select_one("[data-zone-name='title'] span")
|
||||||
title_obj = p.get("titles") or {}
|
or card.select_one("h3 span")
|
||||||
title = (title_obj.get("raw") if isinstance(title_obj, dict) else "") or p.get("title", "")
|
or card.select_one("[data-auto='snippet-title']")
|
||||||
|
or link_el
|
||||||
|
)
|
||||||
|
title = title_el.get_text(strip=True) if title_el else (link_el.get_text(strip=True))
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
url_obj = p.get("url") or p.get("urls", {}).get("encrypted", "")
|
# Цена
|
||||||
url = url_obj if isinstance(url_obj, str) else ""
|
|
||||||
if url and url.startswith("/"):
|
|
||||||
url = f"{_BASE_URL}{url}"
|
|
||||||
|
|
||||||
pic = ""
|
|
||||||
pictures = p.get("pictures") or []
|
|
||||||
if pictures and isinstance(pictures, list):
|
|
||||||
pic_obj = pictures[0]
|
|
||||||
if isinstance(pic_obj, dict):
|
|
||||||
pic = pic_obj.get("original", {}).get("url") or pic_obj.get("url") or ""
|
|
||||||
|
|
||||||
# Цена + кол-во магазинов
|
|
||||||
prices = p.get("prices") or p.get("offers") or {}
|
|
||||||
price_min = price_max = None
|
price_min = price_max = None
|
||||||
stores = None
|
price_el = (
|
||||||
if isinstance(prices, dict):
|
card.select_one("[data-auto='snippet-price-current']")
|
||||||
price_min = _try_int(prices.get("min", {}).get("value") if isinstance(prices.get("min"), dict) else prices.get("min"))
|
or card.select_one("[data-auto='price-value']")
|
||||||
price_max = _try_int(prices.get("max", {}).get("value") if isinstance(prices.get("max"), dict) else prices.get("max"))
|
or card.select_one("[class*='Price']")
|
||||||
stores = _try_int(prices.get("count") or prices.get("offersCount"))
|
)
|
||||||
|
if price_el:
|
||||||
|
m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
|
||||||
|
if m:
|
||||||
|
price_min = _try_int(m.group(1).replace(" ", "").replace(" ", ""))
|
||||||
|
|
||||||
rating = _try_float((p.get("rating") or {}).get("value") if isinstance(p.get("rating"), dict) else p.get("rating"))
|
# Картинка
|
||||||
reviews = _try_int((p.get("reviews") or {}).get("count") if isinstance(p.get("reviews"), dict) else p.get("reviews"))
|
img_url = None
|
||||||
|
img_el = card.select_one("img[src], img[srcset]")
|
||||||
|
if img_el:
|
||||||
|
src = img_el.get("src") or img_el.get("data-src") or ""
|
||||||
|
# Иногда src — заглушка 1x1px, основное в srcset
|
||||||
|
if "data:image" in src or not src:
|
||||||
|
srcset = img_el.get("srcset") or ""
|
||||||
|
if srcset:
|
||||||
|
src = srcset.split(",")[0].strip().split(" ")[0]
|
||||||
|
if src.startswith("//"):
|
||||||
|
src = "https:" + src
|
||||||
|
if src:
|
||||||
|
img_url = src
|
||||||
|
|
||||||
|
# Рейтинг
|
||||||
|
rating = None
|
||||||
|
rating_el = card.select_one("[data-auto='snippet-rating'], [class*='Rating'] span")
|
||||||
|
if rating_el:
|
||||||
|
rt = rating_el.get_text(strip=True)
|
||||||
|
m = re.search(r"\d[.,]\d", rt)
|
||||||
|
if m:
|
||||||
|
rating = _try_float(m.group(0))
|
||||||
|
|
||||||
|
# Отзывы
|
||||||
|
reviews = None
|
||||||
|
reviews_el = card.select_one("[data-auto='snippet-feedback'], a[href*='/reviews']")
|
||||||
|
if reviews_el:
|
||||||
|
m = re.search(r"\d[\d\s]*", reviews_el.get_text(" ", strip=True))
|
||||||
|
if m:
|
||||||
|
reviews = _try_int(m.group(0).replace(" ", ""))
|
||||||
|
|
||||||
|
# Кол-во магазинов / предложений
|
||||||
|
stores = None
|
||||||
|
stores_el = card.select_one("[data-auto='offer-count'], a[href*='/offers']")
|
||||||
|
if stores_el:
|
||||||
|
m = re.search(r"\d+", stores_el.get_text(" ", strip=True))
|
||||||
|
if m:
|
||||||
|
stores = int(m.group(0))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": re.sub(r"<[^>]+>", "", title).strip(),
|
"title": title,
|
||||||
"url": url,
|
"url": url,
|
||||||
"image_url": pic,
|
"image_url": img_url,
|
||||||
"price_min_rub": price_min,
|
"price_min_rub": price_min,
|
||||||
"price_max_rub": price_max if price_max and price_max != price_min else None,
|
"price_max_rub": price_max if price_max and price_max != price_min else None,
|
||||||
"rating": rating,
|
"rating": rating,
|
||||||
@ -167,50 +176,6 @@ def _build_item(p: dict) -> dict[str, Any] | None:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _extract_html_card(card) -> dict[str, Any] | None:
|
|
||||||
"""Резервный парсинг HTML-карточки если JSON не нашёлся."""
|
|
||||||
title_el = card.select_one("[data-zone-name='title'] span, h3, [class*='Title']")
|
|
||||||
if not title_el:
|
|
||||||
return None
|
|
||||||
title = title_el.get_text(strip=True)
|
|
||||||
|
|
||||||
price_el = card.select_one("[data-auto='snippet-price-current'], [class*='Price']")
|
|
||||||
price = None
|
|
||||||
if price_el:
|
|
||||||
m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
|
|
||||||
if m:
|
|
||||||
price = _try_int(m.group(1).replace(" ", ""))
|
|
||||||
|
|
||||||
img_el = card.select_one("img[srcset], img[src]")
|
|
||||||
img_url = ""
|
|
||||||
if img_el:
|
|
||||||
src = img_el.get("src") or img_el.get("data-src") or ""
|
|
||||||
if src.startswith("//"):
|
|
||||||
src = "https:" + src
|
|
||||||
img_url = src
|
|
||||||
|
|
||||||
link_el = card.select_one("a[href*='/product--'], a[data-baobab-name='title']")
|
|
||||||
url = ""
|
|
||||||
if link_el:
|
|
||||||
href = link_el.get("href") or ""
|
|
||||||
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
|
||||||
|
|
||||||
if not title:
|
|
||||||
return None
|
|
||||||
return {
|
|
||||||
"title": title,
|
|
||||||
"url": url,
|
|
||||||
"image_url": img_url,
|
|
||||||
"price_min_rub": price,
|
|
||||||
"price_max_rub": None,
|
|
||||||
"rating": None,
|
|
||||||
"reviews_count": None,
|
|
||||||
"stores_count": None,
|
|
||||||
"specs": {},
|
|
||||||
"source": "yamarket",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _try_int(v: Any) -> int | None:
|
def _try_int(v: Any) -> int | None:
|
||||||
if v is None:
|
if v is None:
|
||||||
return None
|
return None
|
||||||
@ -227,10 +192,3 @@ def _try_float(v: Any) -> float | None:
|
|||||||
return float(str(v).replace(" ", "").replace(",", "."))
|
return float(str(v).replace(" ", "").replace(",", "."))
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _try_json(s: str) -> Any:
|
|
||||||
try:
|
|
||||||
return json.loads(s)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
return None
|
|
||||||
|
|||||||
@ -7,3 +7,4 @@ google-auth>=2.30.0
|
|||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
beautifulsoup4>=4.12.0
|
beautifulsoup4>=4.12.0
|
||||||
lxml>=5.2.0
|
lxml>=5.2.0
|
||||||
|
playwright>=1.45.0
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user