zov-tech/backend-py/app/parsers/playwright_engine.py
wasrusgen e7f6e64e38 playwright_engine: route through proxy_pool — random residential IP per request
- New use_proxy param (default True)
- Per-request random proxy from pool
- _parse_proxy_url_for_playwright converts http://user:pass@host:port to playwright.proxy dict
2026-05-11 16:05:36 +03:00

157 lines
5.4 KiB
Python

"""Singleton Playwright + Chromium для парсинга JS-сайтов.
Использование:
from .playwright_engine import fetch_page
html = fetch_page("https://market.yandex.ru/search?text=Bosch+KGN39")
Зачем синглтон: запуск Chromium ~2-3 сек. Держим один экземпляр, открываем
изолированный контекст (cookies/storage) на каждый запрос.
"""
from __future__ import annotations
import logging
import threading
from typing import Optional
log = logging.getLogger("zov.parser.playwright")
_lock = threading.Lock()
_playwright = None
_browser = None
def _get_browser():
"""Возвращает singleton Chromium browser. Инициализирует при первом обращении."""
global _playwright, _browser
with _lock:
if _browser is not None and _browser.is_connected():
return _browser
try:
from playwright.sync_api import sync_playwright
except ImportError as e:
log.error("Playwright not installed: %s", e)
return None
try:
_playwright = sync_playwright().start()
_browser = _playwright.chromium.launch(
headless=True,
args=[
"--no-sandbox",
"--disable-blink-features=AutomationControlled",
"--disable-dev-shm-usage", # snug на маленькой памяти
"--disable-gpu",
],
)
log.info("Playwright Chromium started")
return _browser
except Exception as e:
log.error("Failed to start Playwright: %s", e)
_playwright = None
_browser = None
return None
def fetch_page(url: str, wait_selector: Optional[str] = None,
wait_ms: int = 3000, timeout_ms: int = 25000,
user_agent: Optional[str] = None, use_proxy: bool = True) -> Optional[str]:
"""Открывает страницу через headless Chromium, ждёт пока JS отрендерит,
возвращает текущий HTML.
Args:
url: целевой URL
wait_selector: если задан — ждём пока этот CSS-селектор появится
wait_ms: фиксированная задержка после загрузки (для JS-hydration)
timeout_ms: общий таймаут навигации
user_agent: переопределить UA
use_proxy: использовать residential proxy из proxy_pool (default True)
"""
browser = _get_browser()
if not browser:
return None
# Достаём случайный прокси из пула — для каждого запроса свой IP
proxy_cfg = None
if use_proxy:
from .. import proxy_pool
proxy_url = proxy_pool.get_random_proxy()
if proxy_url:
proxy_cfg = _parse_proxy_url_for_playwright(proxy_url)
ctx_kwargs = {
"user_agent": user_agent or
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
"viewport": {"width": 1280, "height": 800},
"locale": "ru-RU",
"extra_http_headers": {
"Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
},
}
if proxy_cfg:
ctx_kwargs["proxy"] = proxy_cfg
ctx = None
page = None
try:
ctx = browser.new_context(**ctx_kwargs)
page = ctx.new_page()
# Блокируем тяжёлые ресурсы — экономим время/память
def _route(route):
rt = route.request.resource_type
if rt in ("image", "font", "media", "stylesheet"):
return route.abort()
return route.continue_()
page.route("**/*", _route)
page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
if wait_selector:
try:
page.wait_for_selector(wait_selector, timeout=wait_ms + 5000)
except Exception:
log.debug("wait_selector %s not found, continuing", wait_selector)
else:
page.wait_for_timeout(wait_ms)
html = page.content()
return html
except Exception as e:
log.warning("fetch_page failed for %s: %s", url, e)
return None
finally:
if page:
try: page.close()
except: pass
if ctx:
try: ctx.close()
except: pass
def _parse_proxy_url_for_playwright(url: str) -> dict | None:
"""Конвертирует http://user:pass@host:port в формат для playwright.proxy."""
import re
m = re.match(r"^(https?|socks5)://(?:([^:]+):([^@]+)@)?([^:/]+):(\d+)/?$", url)
if not m:
return None
scheme, user, pwd, host, port = m.groups()
server = f"{scheme}://{host}:{port}"
cfg = {"server": server}
if user: cfg["username"] = user
if pwd: cfg["password"] = pwd
return cfg
def shutdown():
"""Закрывает браузер при остановке приложения."""
global _playwright, _browser
with _lock:
if _browser:
try: _browser.close()
except: pass
_browser = None
if _playwright:
try: _playwright.stop()
except: pass
_playwright = None