mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 18:44:47 +00:00
DOCKERFILE: - + Chromium system deps (libnss3, libxkbcommon0, libgbm1, libgtk-3-0, etc.) - + RUN python -m playwright install chromium (~150MB) - + ENV PLAYWRIGHT_BROWSERS_PATH REQUIREMENTS: - + playwright >= 1.45 PARSERS: - new playwright_engine.py — singleton browser, isolated context per request, blocks images/fonts/CSS to save memory, waits for selector + JS hydration - yamarket.py — rewritten to use Playwright (Я.Маркет is React SPA) - ozon.py — Playwright fallback when composer-api returns challenge (403) - wb.py — exponential backoff on 429, still uses direct HTTP (JSON API, no JS needed) STRATEGY (Hybrid Path C): - Я.Маркет: Playwright (rendering JS) - OZON: composer-api first, Playwright fallback - WB: direct HTTP with backoff (JSON API, fast) - DNS: kept but lower priority (Qrator hard to crack) - No more proxy needed for primary path DEPLOY: removed PROXY_STATIC_LIST from .env, expect ~5min for first build (Chromium download)
130 lines
4.5 KiB
Python
130 lines
4.5 KiB
Python
"""Singleton Playwright + Chromium for scraping JavaScript-rendered sites.

Usage:
    from .playwright_engine import fetch_page
    html = fetch_page("https://market.yandex.ru/search?text=Bosch+KGN39")

Why a singleton: launching Chromium takes ~2-3 seconds. We keep a single
browser instance and open an isolated context (cookies/storage) for every
request.
"""
|
|
from __future__ import annotations
|
|
import logging
|
|
import threading
|
|
from typing import Optional
|
|
|
|
# Module logger; the dotted name places it under the "zov.parser" hierarchy.
log = logging.getLogger("zov.parser.playwright")

# Serializes creation and teardown of the shared handles below.
_lock = threading.Lock()

# Lazily created singletons: both stay None until the first successful launch
# and are reset to None again on shutdown or launch failure.
_playwright = None  # Playwright driver handle returned by sync_playwright().start()
_browser = None  # Chromium browser launched from that driver
|
|
|
|
|
|
def _get_browser():
    """Return the shared Chromium browser, launching it on first use.

    The browser is cached in the module-level ``_browser`` and reused for as
    long as it reports ``is_connected()``. If the cached browser has
    disconnected (for example after a Chromium crash), the stale browser and
    its Playwright driver are released before a fresh pair is started, so
    handles are never overwritten without being closed.

    Returns:
        The connected browser, or ``None`` when Playwright is not installed
        or Chromium could not be started. Failures are logged, not raised.
    """
    # NOTE(review): the lock only serializes start-up/teardown. Playwright's
    # sync API objects are presumably bound to the thread that created them;
    # confirm that all callers of this module run on a single thread.
    global _playwright, _browser
    with _lock:
        if _browser is not None and _browser.is_connected():
            return _browser

        # Imported lazily so the rest of the package still works (and merely
        # logs an error here) when Playwright is not installed.
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            log.error("Playwright not installed: %s", e)
            return None

        # Release whatever is left from a previous launch before starting a
        # new one; otherwise the old driver/browser would simply be dropped.
        stale_browser, stale_driver = _browser, _playwright
        _browser = None
        _playwright = None
        if stale_browser is not None:
            try:
                stale_browser.close()
            except Exception as e:
                log.debug("Closing stale browser failed: %s", e)
        if stale_driver is not None:
            try:
                stale_driver.stop()
            except Exception as e:
                log.debug("Stopping stale Playwright driver failed: %s", e)

        driver = None
        try:
            driver = sync_playwright().start()
            browser = driver.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",  # friendlier to low-memory hosts
                    "--disable-gpu",
                ],
            )
        except Exception as e:
            log.error("Failed to start Playwright: %s", e)
            # The driver may have started even though the launch failed;
            # stop it so it is not left running without a reference.
            if driver is not None:
                try:
                    driver.stop()
                except Exception as stop_err:
                    log.debug("Stopping Playwright driver failed: %s", stop_err)
            return None

        # Publish the handles only after both steps succeeded.
        _playwright = driver
        _browser = browser
        log.info("Playwright Chromium started")
        return _browser
|
|
|
|
|
|
def fetch_page(url: str, wait_selector: Optional[str] = None,
               wait_ms: int = 3000, timeout_ms: int = 25000,
               user_agent: Optional[str] = None) -> Optional[str]:
    """Load ``url`` in headless Chromium and return the rendered HTML.

    A fresh browser context is created for every call and closed afterwards.
    Requests for images, fonts, media and stylesheets are aborted to save
    time and memory.

    Args:
        url: Target URL.
        wait_selector: If given, wait until this CSS selector appears (for up
            to ``wait_ms + 5000`` ms). A selector that never appears is not
            an error: the current HTML is returned anyway.
        wait_ms: When ``wait_selector`` is not given, a fixed delay after
            ``domcontentloaded`` that gives JS hydration time to finish.
            Otherwise it only contributes to the selector timeout above.
        timeout_ms: Navigation timeout passed to ``page.goto``.
        user_agent: User-Agent override. Defaults to a desktop Chrome 130 on
            Windows string.

    Returns:
        The page HTML, or ``None`` if the browser is unavailable or any step
        of the load fails (the failure is logged as a warning).
    """
    browser = _get_browser()
    if not browser:
        return None

    default_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    )
    blocked_resource_types = ("image", "font", "media", "stylesheet")

    def _route(route):
        # Drop heavy resources; let everything else through untouched.
        if route.request.resource_type in blocked_resource_types:
            return route.abort()
        return route.continue_()

    ctx = None
    page = None
    try:
        ctx = browser.new_context(
            user_agent=user_agent or default_user_agent,
            viewport={"width": 1280, "height": 800},
            locale="ru-RU",
            extra_http_headers={
                "Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
            },
        )
        page = ctx.new_page()
        page.route("**/*", _route)

        page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")

        if wait_selector:
            try:
                page.wait_for_selector(wait_selector, timeout=wait_ms + 5000)
            except Exception:
                # Best effort: return whatever has rendered so far.
                log.debug("wait_selector %s not found, continuing", wait_selector)
        else:
            page.wait_for_timeout(wait_ms)

        return page.content()
    except Exception as e:
        log.warning("fetch_page failed for %s: %s", url, e)
        return None
    finally:
        # Close the page first, then its context. Only ``Exception`` is
        # swallowed so KeyboardInterrupt/SystemExit still propagate.
        for resource in (page, ctx):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as close_err:
                log.debug("fetch_page cleanup failed for %s: %s", url, close_err)
|
|
|
|
|
|
def shutdown():
    """Close the shared browser and stop Playwright at application shutdown.

    Safe to call when nothing was ever started. Teardown is best effort:
    errors from ``close()``/``stop()`` are logged at debug level and not
    raised. The module-level handles are always reset to ``None``.
    """
    global _playwright, _browser
    with _lock:
        # Detach the handles first so module state is consistent even if
        # teardown below is interrupted.
        browser, driver = _browser, _playwright
        _browser = None
        _playwright = None

        if browser is not None:
            try:
                browser.close()
            except Exception as e:
                log.debug("Closing browser failed: %s", e)
        if driver is not None:
            try:
                driver.stop()
            except Exception as e:
                log.debug("Stopping Playwright failed: %s", e)
|