"""Singleton Playwright + Chromium для парсинга JS-сайтов. Использование: from .playwright_engine import fetch_page html = fetch_page("https://market.yandex.ru/search?text=Bosch+KGN39") Зачем синглтон: запуск Chromium ~2-3 сек. Держим один экземпляр, открываем изолированный контекст (cookies/storage) на каждый запрос. """ from __future__ import annotations import logging import threading from typing import Optional log = logging.getLogger("zov.parser.playwright") _lock = threading.Lock() _playwright = None _browser = None def _get_browser(): """Возвращает singleton Chromium browser. Инициализирует при первом обращении.""" global _playwright, _browser with _lock: if _browser is not None and _browser.is_connected(): return _browser try: from playwright.sync_api import sync_playwright except ImportError as e: log.error("Playwright not installed: %s", e) return None try: _playwright = sync_playwright().start() _browser = _playwright.chromium.launch( headless=True, args=[ "--no-sandbox", "--disable-blink-features=AutomationControlled", "--disable-dev-shm-usage", # snug на маленькой памяти "--disable-gpu", ], ) log.info("Playwright Chromium started") return _browser except Exception as e: log.error("Failed to start Playwright: %s", e) _playwright = None _browser = None return None def fetch_page(url: str, wait_selector: Optional[str] = None, wait_ms: int = 3000, timeout_ms: int = 25000, user_agent: Optional[str] = None) -> Optional[str]: """Открывает страницу через headless Chromium, ждёт пока JS отрендерит, возвращает текущий HTML. Args: url: целевой URL wait_selector: если задан — ждём пока этот CSS-селектор появится wait_ms: фиксированная задержка после загрузки (для JS-hydration) timeout_ms: общий таймаут навигации user_agent: переопределить UA (по умолчанию используется playwright-овский) """ browser = _get_browser() if not browser: return None ctx = None page = None try: ctx = browser.new_context( user_agent=user_agent or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", viewport={"width": 1280, "height": 800}, locale="ru-RU", extra_http_headers={ "Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8", }, ) page = ctx.new_page() # Блокируем тяжёлые ресурсы — экономим время/память def _route(route): rt = route.request.resource_type if rt in ("image", "font", "media", "stylesheet"): return route.abort() return route.continue_() page.route("**/*", _route) page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded") if wait_selector: try: page.wait_for_selector(wait_selector, timeout=wait_ms + 5000) except Exception: log.debug("wait_selector %s not found, continuing", wait_selector) else: page.wait_for_timeout(wait_ms) html = page.content() return html except Exception as e: log.warning("fetch_page failed for %s: %s", url, e) return None finally: if page: try: page.close() except: pass if ctx: try: ctx.close() except: pass def shutdown(): """Закрывает браузер при остановке приложения.""" global _playwright, _browser with _lock: if _browser: try: _browser.close() except: pass _browser = None if _playwright: try: _playwright.stop() except: pass _playwright = None