mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:44:47 +00:00
dns: switch to Playwright (Qrator JS challenge); ozon: fix false-positive antibot detector
- DNS: раньше использовали httpx + proxy_pool, но Qrator возвращал 401 даже через residential-прокси → теперь Playwright + residential-прокси: браузер сам решает JS challenge
- OZON: теперь проверяем только <title> на 'Доступ ограничен' (точная проверка заголовка) и '/antibot/' в начале страницы, а не подстроку '/robotcheck/' во всём HTML
This commit is contained in:
parent
b27cf02aa2
commit
aa569a8ed1
@ -17,6 +17,7 @@ import httpx
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import proxy_pool
|
from .. import proxy_pool
|
||||||
|
from . import playwright_engine
|
||||||
|
|
||||||
log = logging.getLogger("zov.parser.dns")
|
log = logging.getLogger("zov.parser.dns")
|
||||||
|
|
||||||
@ -44,41 +45,35 @@ _HEADERS = {
|
|||||||
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
||||||
|
|
||||||
|
|
||||||
def search_dns(query: str, limit: int = 1, timeout: float = 12.0,
|
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
|
||||||
max_retries: int = 2) -> list[dict[str, Any]]:
|
max_retries: int = 1) -> list[dict[str, Any]]:
|
||||||
"""Поиск товара на DNS по строке запроса.
|
"""Поиск на DNS через Playwright + residential proxy.
|
||||||
|
|
||||||
Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую.
|
DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси.
|
||||||
DNS защищён Qrator — без прокси скорее всего 401.
|
Playwright решает challenge автоматически (как реальный браузер).
|
||||||
|
|
||||||
Возвращает список результатов (топ-N) или пустой при ошибке.
|
|
||||||
"""
|
"""
|
||||||
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
||||||
log.info("DNS search: %s", url)
|
log.info("DNS search: %s", url)
|
||||||
|
|
||||||
last_err = None
|
html = None
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
try:
|
html = playwright_engine.fetch_page(
|
||||||
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
|
url,
|
||||||
follow_redirects=True) as client:
|
wait_selector="a[href*='/product/']",
|
||||||
resp = client.get(url)
|
wait_ms=5000,
|
||||||
except httpx.HTTPError as e:
|
timeout_ms=int(timeout * 1000),
|
||||||
last_err = e
|
)
|
||||||
log.warning("DNS request failed (attempt %d): %s", attempt + 1, e)
|
if html:
|
||||||
continue
|
break
|
||||||
|
|
||||||
if resp.status_code == 200:
|
if not html:
|
||||||
text = resp.text
|
log.warning("DNS: no HTML for query=%r", query)
|
||||||
if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower():
|
return []
|
||||||
log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1)
|
if "qrator" in html.lower()[:5000]:
|
||||||
continue
|
log.warning("DNS: Qrator block for query=%r", query)
|
||||||
return _parse_search_html(text, limit=limit)
|
return []
|
||||||
|
|
||||||
log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1)
|
return _parse_search_html(html, limit=limit)
|
||||||
|
|
||||||
log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)",
|
|
||||||
max_retries + 1, query, last_err)
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||||
|
|||||||
@ -40,9 +40,12 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
|||||||
if not html:
|
if not html:
|
||||||
log.warning("OZON: no HTML for query=%r", query)
|
log.warning("OZON: no HTML for query=%r", query)
|
||||||
return []
|
return []
|
||||||
# Реальный anti-bot — это редирект на /robotcheck/ или специальная страница
|
# OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
|
||||||
if "/robotcheck/" in html or "Доступ ограничен" in html[:5000]:
|
import re as _re
|
||||||
log.warning("OZON: anti-bot block for query=%r", query)
|
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
|
||||||
|
page_title = title_m.group(1) if title_m else ""
|
||||||
|
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
|
||||||
|
log.warning("OZON: anti-bot page (title=%r)", page_title)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
return _parse_html(html, limit=limit)
|
return _parse_html(html, limit=limit)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user