From d5f290bd0ac3e0b400d57a8f4e07b47d8df7cda4 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 13:25:05 +0300 Subject: [PATCH] =?UTF-8?q?backend:=20Playwright=20+=20Chromium=20for=20JS?= =?UTF-8?q?-rendered=20sites=20(=D0=AF.=D0=9C=D0=B0=D1=80=D0=BA=D0=B5?= =?UTF-8?q?=D1=82,=20OZON=20fallback)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DOCKERFILE: - + Chromium system deps (libnss3, libxkbcommon0, libgbm1, libgtk-3-0, etc.) - + RUN python -m playwright install chromium (~150MB) - + ENV PLAYWRIGHT_BROWSERS_PATH REQUIREMENTS: - + playwright >= 1.45 PARSERS: - new playwright_engine.py — singleton browser, isolated context per request, blocks images/fonts/CSS to save memory, waits for selector + JS hydration - yamarket.py — rewritten to use Playwright (Я.Маркет is React SPA) - ozon.py — Playwright fallback when composer-api returns challenge (403) - wb.py — exponential backoff on 429, still uses direct HTTP (JSON API, no JS needed) STRATEGY (Hybrid Path C): - Я.Маркет: Playwright (rendering JS) - OZON: composer-api first, Playwright fallback - WB: direct HTTP with backoff (JSON API, fast) - DNS: kept but lower priority (Qrator hard to crack) - No more proxy needed for primary path DEPLOY: removed PROXY_STATIC_LIST from .env, expect ~5min for first build (Chromium download) --- backend-py/Dockerfile | 17 +- backend-py/app/parsers/ozon.py | 112 ++++++-- backend-py/app/parsers/playwright_engine.py | 129 +++++++++ backend-py/app/parsers/wb.py | 20 +- backend-py/app/parsers/yamarket.py | 288 +++++++++----------- backend-py/requirements.txt | 1 + 6 files changed, 374 insertions(+), 193 deletions(-) create mode 100644 backend-py/app/parsers/playwright_engine.py diff --git a/backend-py/Dockerfile b/backend-py/Dockerfile index ffdf71e..6610661 100644 --- a/backend-py/Dockerfile +++ b/backend-py/Dockerfile @@ -1,9 +1,16 @@ FROM python:3.12-slim # НУЦ Минцифры root CA — для GigaChat SSL. -# Скачиваем актуальный bundle на этапе сборки и добавляем в системный trust store. +# + системные пакеты для Playwright/Chromium (рендеринг JS-сайтов). RUN apt-get update \ - && apt-get install -y --no-install-recommends ca-certificates curl \ + && apt-get install -y --no-install-recommends \ + ca-certificates curl \ + # Chromium dependencies for Playwright + libnss3 libnspr4 libatk-bridge2.0-0 libatk1.0-0 libcups2 \ + libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \ + libgbm1 libgtk-3-0 libasound2 libpango-1.0-0 libcairo2 \ + libdbus-1-3 libdrm2 libxshmfence1 \ + fonts-liberation fonts-noto-color-emoji \ && curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_root_ca.crt \ https://gu-st.ru/content/Other/doc/russian_trusted_root_ca.cer \ && curl -fsSL -o /usr/local/share/ca-certificates/russian_trusted_sub_ca.crt \ @@ -16,6 +23,9 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +# Скачиваем только Chromium (без firefox/webkit) — ~150MB +RUN python -m playwright install chromium + COPY app /app/app # httpx по умолчанию использует certifi → принудительно указываем системный bundle, @@ -23,6 +33,9 @@ COPY app /app/app ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt +# Playwright кэш-каталог браузеров +ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright + EXPOSE 8000 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips=*"] diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py index 0bfd177..36887dc 100644 --- a/backend-py/app/parsers/ozon.py +++ b/backend-py/app/parsers/ozon.py @@ -12,8 +12,10 @@ from typing import Any from urllib.parse import quote_plus import httpx +from bs4 import BeautifulSoup from .. import proxy_pool +from . import playwright_engine log = logging.getLogger("zov.parser.ozon") @@ -34,38 +36,110 @@ _HEADERS = { _PRICE_RE = re.compile(r"([\d\s]+)\s*₽") -def search_ozon(query: str, limit: int = 3, timeout: float = 15.0, - max_retries: int = 2) -> list[dict[str, Any]]: - """Поиск товара в OZON через composer-api.""" +def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, + max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]: + """Поиск товара в OZON. + + Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно). + """ + # Путь 1: быстрый composer-api url_param = f"/search/?text={quote_plus(query)}&from_global=true" params = {"url": url_param} - for attempt in range(max_retries + 1): try: with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS, follow_redirects=False) as client: resp = client.get(_API_URL, params=params) + if resp.status_code == 200: + try: + return _extract_products(resp.json(), limit=limit) + except Exception: + pass + log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code) except httpx.HTTPError as e: - log.warning("OZON request failed (attempt %d): %s", attempt + 1, e) + log.debug("OZON composer-api err: %s", e) + + # Путь 2: Playwright (рендерим обычную HTML-страницу поиска) + if not use_playwright: + return [] + log.info("OZON falling back to Playwright for query=%r", query) + page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}" + html = playwright_engine.fetch_page( + page_url, + wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']", + wait_ms=3500, + timeout_ms=int(timeout * 1000), + ) + if not html: + return [] + return _parse_html_via_dom(html, limit=limit) + + +def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]: + """Fallback: парсим товары из отрендеренного Chrome HTML.""" + soup = BeautifulSoup(html, "html.parser") + seen = set() + results: list[dict[str, Any]] = [] + + for link in soup.select("a[href*='/product/']"): + if len(results) >= limit: + break + href = link.get("href") or "" + if href in seen: + continue + seen.add(href) + + # Поднимаемся до карточки + card = link.find_parent("div") or link + title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else "" + if not title or len(title) < 5: continue - if resp.status_code in (301, 302, 307, 308): - log.info("OZON redirect %s, rotating proxy", resp.status_code) - continue - if resp.status_code != 200: - log.warning("OZON returned status=%s", resp.status_code) - continue + url = href if href.startswith("http") else f"{_BASE_URL}{href}" + url = url.split("?")[0] - try: - data = resp.json() - except Exception as e: - log.warning("OZON JSON parse failed: %s", e) - continue + # Цена в ближайшем родителе + price = None + price_card = link.find_parent("div", recursive=True) + if price_card: + txt = price_card.get_text(" ", strip=True) + m = _PRICE_RE.search(txt) + if m: + price = _try_int(m.group(1).replace(" ", "")) - return _extract_products(data, limit=limit) + # Картинка в карточке + img = None + img_el = card.find("img") if card else None + if img_el: + src = img_el.get("src") or "" + if src.startswith("//"): + src = "https:" + src + if src and "data:image" not in src: + img = src - log.warning("OZON gave up after %d attempts for query=%r", max_retries + 1, query) - return [] + results.append({ + "title": title[:200], + "url": url, + "image_url": img, + "price_min_rub": price, + "price_max_rub": None, + "rating": None, + "reviews_count": None, + "stores_count": None, + "specs": {}, + "source": "ozon", + }) + + return results + + +def _try_int(v: Any) -> int | None: + if v is None: + return None + try: + return int(float(str(v).replace(" ", "").replace(",", "."))) + except (ValueError, TypeError): + return None def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]: diff --git a/backend-py/app/parsers/playwright_engine.py b/backend-py/app/parsers/playwright_engine.py new file mode 100644 index 0000000..d36643b --- /dev/null +++ b/backend-py/app/parsers/playwright_engine.py @@ -0,0 +1,129 @@ +"""Singleton Playwright + Chromium для парсинга JS-сайтов. + +Использование: + from .playwright_engine import fetch_page + html = fetch_page("https://market.yandex.ru/search?text=Bosch+KGN39") + +Зачем синглтон: запуск Chromium ~2-3 сек. Держим один экземпляр, открываем +изолированный контекст (cookies/storage) на каждый запрос. +""" +from __future__ import annotations +import logging +import threading +from typing import Optional + +log = logging.getLogger("zov.parser.playwright") + +_lock = threading.Lock() +_playwright = None +_browser = None + + +def _get_browser(): + """Возвращает singleton Chromium browser. Инициализирует при первом обращении.""" + global _playwright, _browser + with _lock: + if _browser is not None and _browser.is_connected(): + return _browser + try: + from playwright.sync_api import sync_playwright + except ImportError as e: + log.error("Playwright not installed: %s", e) + return None + + try: + _playwright = sync_playwright().start() + _browser = _playwright.chromium.launch( + headless=True, + args=[ + "--no-sandbox", + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", # snug на маленькой памяти + "--disable-gpu", + ], + ) + log.info("Playwright Chromium started") + return _browser + except Exception as e: + log.error("Failed to start Playwright: %s", e) + _playwright = None + _browser = None + return None + + +def fetch_page(url: str, wait_selector: Optional[str] = None, + wait_ms: int = 3000, timeout_ms: int = 25000, + user_agent: Optional[str] = None) -> Optional[str]: + """Открывает страницу через headless Chromium, ждёт пока JS отрендерит, + возвращает текущий HTML. + + Args: + url: целевой URL + wait_selector: если задан — ждём пока этот CSS-селектор появится + wait_ms: фиксированная задержка после загрузки (для JS-hydration) + timeout_ms: общий таймаут навигации + user_agent: переопределить UA (по умолчанию используется playwright-овский) + """ + browser = _get_browser() + if not browser: + return None + + ctx = None + page = None + try: + ctx = browser.new_context( + user_agent=user_agent or + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + viewport={"width": 1280, "height": 800}, + locale="ru-RU", + extra_http_headers={ + "Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8", + }, + ) + page = ctx.new_page() + + # Блокируем тяжёлые ресурсы — экономим время/память + def _route(route): + rt = route.request.resource_type + if rt in ("image", "font", "media", "stylesheet"): + return route.abort() + return route.continue_() + page.route("**/*", _route) + + page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded") + + if wait_selector: + try: + page.wait_for_selector(wait_selector, timeout=wait_ms + 5000) + except Exception: + log.debug("wait_selector %s not found, continuing", wait_selector) + else: + page.wait_for_timeout(wait_ms) + + html = page.content() + return html + except Exception as e: + log.warning("fetch_page failed for %s: %s", url, e) + return None + finally: + if page: + try: page.close() + except: pass + if ctx: + try: ctx.close() + except: pass + + +def shutdown(): + """Закрывает браузер при остановке приложения.""" + global _playwright, _browser + with _lock: + if _browser: + try: _browser.close() + except: pass + _browser = None + if _playwright: + try: _playwright.stop() + except: pass + _playwright = None diff --git a/backend-py/app/parsers/wb.py b/backend-py/app/parsers/wb.py index c485c41..20be0fa 100644 --- a/backend-py/app/parsers/wb.py +++ b/backend-py/app/parsers/wb.py @@ -40,38 +40,44 @@ _HEADERS = { def search_wb(query: str, limit: int = 3, timeout: float = 12.0, max_retries: int = 2) -> list[dict[str, Any]]: + """WB через прямой JSON API. Делает экспоненциальный backoff при 429.""" + import time params = {**_DEFAULT_PARAMS, "query": query} + backoff = 2.0 for attempt in range(max_retries + 1): try: + # Используем прямое подключение (без прокси) — WB лимитирует per-IP, + # но 1 запрос/несколько секунд проходит with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS) as client: resp = client.get(_SEARCH_URL, params=params) except httpx.HTTPError as e: log.warning("WB request failed (attempt %d): %s", attempt + 1, e) + time.sleep(backoff) + backoff *= 2 continue if resp.status_code == 429: - log.warning("WB rate-limited on attempt %d, rotating proxy", attempt + 1) + log.warning("WB rate-limited on attempt %d, sleeping %.1fs", attempt + 1, backoff) + time.sleep(backoff) + backoff *= 2 continue if resp.status_code != 200: log.warning("WB returned status=%s", resp.status_code) - continue + return [] try: data = resp.json() except Exception as e: log.warning("WB JSON parse failed: %s", e) - continue + return [] products = (data.get("data") or {}).get("products") or [] if not products: log.info("WB no products for query=%r", query) return [] - results: list[dict[str, Any]] = [] - for p in products[:limit]: - results.append(_build_item(p)) - return results + return [_build_item(p) for p in products[:limit]] log.warning("WB gave up after %d attempts for query=%r", max_retries + 1, query) return [] diff --git a/backend-py/app/parsers/yamarket.py b/backend-py/app/parsers/yamarket.py index d3a772b..62aab7e 100644 --- a/backend-py/app/parsers/yamarket.py +++ b/backend-py/app/parsers/yamarket.py @@ -1,162 +1,171 @@ -"""Парсер Я.Маркета — HTML страница поиска. +"""Парсер Я.Маркета — через Playwright (рендер JS). -Я.Маркет защищён Qrator. Через резидентный РФ-IP + правильные заголовки -+ cookies на сессию обычно проходит. Без прокси — 401. +Я.Маркет — SPA на React, товары подгружаются через XHR после первой загрузки. +Простой HTTP-запрос не вернёт каталог. Поэтому используем headless Chromium. -Из HTML вытаскиваем JSON, который Я.Маркет встраивает в