dns: switch to Playwright (Qrator JS challenge); ozon: fix false-positive antibot detector

- DNS: раньше использовали httpx + proxy_pool, но Qrator возвращал 401 даже через residential-прокси
  → теперь Playwright + residential-прокси — браузер сам проходит JS challenge
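- OZON: теперь антибот определяется по <title>, содержащему 'Доступ ограничен' (без учёта регистра), либо по '/antibot/' в первых 5000 символах страницы, а не по подстроке '/robotcheck/' во всём HTML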
This commit is contained in:
wasrusgen 2026-05-11 16:34:04 +03:00
parent b27cf02aa2
commit aa569a8ed1
3 changed files with 28 additions and 30 deletions

View File

@ -17,6 +17,7 @@ import httpx
from bs4 import BeautifulSoup
from .. import proxy_pool
from . import playwright_engine
log = logging.getLogger("zov.parser.dns")
@ -44,41 +45,35 @@ _HEADERS = {
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
def search_dns(query: str, limit: int = 1, timeout: float = 12.0,
max_retries: int = 2) -> list[dict[str, Any]]:
"""Поиск товара на DNS по строке запроса.
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
max_retries: int = 1) -> list[dict[str, Any]]:
"""Поиск на DNS через Playwright + residential proxy.
Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую.
DNS защищён Qrator без прокси скорее всего 401.
Возвращает список результатов (топ-N) или пустой при ошибке.
DNS защищён Qrator (JS challenge) обычный HTTP не пройдёт даже с прокси.
Playwright решает challenge автоматически (как реальный браузер).
"""
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
log.info("DNS search: %s", url)
last_err = None
html = None
for attempt in range(max_retries + 1):
try:
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
follow_redirects=True) as client:
resp = client.get(url)
except httpx.HTTPError as e:
last_err = e
log.warning("DNS request failed (attempt %d): %s", attempt + 1, e)
continue
html = playwright_engine.fetch_page(
url,
wait_selector="a[href*='/product/']",
wait_ms=5000,
timeout_ms=int(timeout * 1000),
)
if html:
break
if resp.status_code == 200:
text = resp.text
if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower():
log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1)
continue
return _parse_search_html(text, limit=limit)
log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1)
log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)",
max_retries + 1, query, last_err)
if not html:
log.warning("DNS: no HTML for query=%r", query)
return []
if "qrator" in html.lower()[:5000]:
log.warning("DNS: Qrator block for query=%r", query)
return []
return _parse_search_html(html, limit=limit)
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:

View File

@ -40,9 +40,12 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
if not html:
log.warning("OZON: no HTML for query=%r", query)
return []
# Реальный anti-bot — это редирект на /robotcheck/ или специальная страница
if "/robotcheck/" in html or "Доступ ограничен" in html[:5000]:
log.warning("OZON: anti-bot block for query=%r", query)
# OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
import re as _re
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
page_title = title_m.group(1) if title_m else ""
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
log.warning("OZON: anti-bot page (title=%r)", page_title)
return []
return _parse_html(html, limit=limit)

0
r.json Normal file
View File