mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:44:47 +00:00
dns: switch to Playwright (Qrator JS challenge); ozon: fix false-positive antibot detector
- DNS: раньше использовали httpx + proxy_pool, но Qrator отдавал 401 даже через residential-прокси → теперь Playwright + residential-прокси: браузер сам проходит JS challenge
- OZON: anti-bot теперь определяется по <title> «Доступ ограничен» (точная проверка заголовка) или по '/antibot/' в начале HTML, а не по подстроке '/robotcheck/'
This commit is contained in:
parent
b27cf02aa2
commit
aa569a8ed1
@ -17,6 +17,7 @@ import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .. import proxy_pool
|
||||
from . import playwright_engine
|
||||
|
||||
log = logging.getLogger("zov.parser.dns")
|
||||
|
||||
@ -44,41 +45,35 @@ _HEADERS = {
|
||||
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
||||
|
||||
|
||||
def search_dns(query: str, limit: int = 1, timeout: float = 12.0,
|
||||
max_retries: int = 2) -> list[dict[str, Any]]:
|
||||
"""Поиск товара на DNS по строке запроса.
|
||||
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
|
||||
max_retries: int = 1) -> list[dict[str, Any]]:
|
||||
"""Поиск на DNS через Playwright + residential proxy.
|
||||
|
||||
Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую.
|
||||
DNS защищён Qrator — без прокси скорее всего 401.
|
||||
|
||||
Возвращает список результатов (топ-N) или пустой при ошибке.
|
||||
DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси.
|
||||
Playwright решает challenge автоматически (как реальный браузер).
|
||||
"""
|
||||
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
||||
log.info("DNS search: %s", url)
|
||||
|
||||
last_err = None
|
||||
html = None
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
|
||||
follow_redirects=True) as client:
|
||||
resp = client.get(url)
|
||||
except httpx.HTTPError as e:
|
||||
last_err = e
|
||||
log.warning("DNS request failed (attempt %d): %s", attempt + 1, e)
|
||||
continue
|
||||
html = playwright_engine.fetch_page(
|
||||
url,
|
||||
wait_selector="a[href*='/product/']",
|
||||
wait_ms=5000,
|
||||
timeout_ms=int(timeout * 1000),
|
||||
)
|
||||
if html:
|
||||
break
|
||||
|
||||
if resp.status_code == 200:
|
||||
text = resp.text
|
||||
if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower():
|
||||
log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1)
|
||||
continue
|
||||
return _parse_search_html(text, limit=limit)
|
||||
|
||||
log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1)
|
||||
|
||||
log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)",
|
||||
max_retries + 1, query, last_err)
|
||||
if not html:
|
||||
log.warning("DNS: no HTML for query=%r", query)
|
||||
return []
|
||||
if "qrator" in html.lower()[:5000]:
|
||||
log.warning("DNS: Qrator block for query=%r", query)
|
||||
return []
|
||||
|
||||
return _parse_search_html(html, limit=limit)
|
||||
|
||||
|
||||
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||
|
||||
@ -40,9 +40,12 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
||||
if not html:
|
||||
log.warning("OZON: no HTML for query=%r", query)
|
||||
return []
|
||||
# Реальный anti-bot — это редирект на /robotcheck/ или специальная страница
|
||||
if "/robotcheck/" in html or "Доступ ограничен" in html[:5000]:
|
||||
log.warning("OZON: anti-bot block for query=%r", query)
|
||||
# OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
|
||||
import re as _re
|
||||
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
|
||||
page_title = title_m.group(1) if title_m else ""
|
||||
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
|
||||
log.warning("OZON: anti-bot page (title=%r)", page_title)
|
||||
return []
|
||||
|
||||
return _parse_html(html, limit=limit)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user