From aa569a8ed13c0db16c88a2533d360281b2161ab8 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 16:34:04 +0300 Subject: [PATCH] dns: switch to Playwright (Qrator JS challenge); ozon: fix false-positive antibot detector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DNS: использовали httpx + proxy_pool но Qrator кидал 401 даже с residential → теперь Playwright + residential — браузер сам решает JS challenge - OZON: теперь проверяем только <title>='Доступ ограничен' (точная), а не подстроку '/robotcheck/' --- backend-py/app/parsers/dns.py | 49 +++++++++++++++------------------- backend-py/app/parsers/ozon.py | 9 ++++--- r.json | 0 3 files changed, 28 insertions(+), 30 deletions(-) create mode 100644 r.json diff --git a/backend-py/app/parsers/dns.py b/backend-py/app/parsers/dns.py index 50fd6c4..a512f0a 100644 --- a/backend-py/app/parsers/dns.py +++ b/backend-py/app/parsers/dns.py @@ -17,6 +17,7 @@ import httpx from bs4 import BeautifulSoup from .. import proxy_pool +from . import playwright_engine log = logging.getLogger("zov.parser.dns") @@ -44,41 +45,35 @@ _HEADERS = { _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽") -def search_dns(query: str, limit: int = 1, timeout: float = 12.0, - max_retries: int = 2) -> list[dict[str, Any]]: - """Поиск товара на DNS по строке запроса. +def search_dns(query: str, limit: int = 1, timeout: float = 30.0, + max_retries: int = 1) -> list[dict[str, Any]]: + """Поиск на DNS через Playwright + residential proxy. - Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую. - DNS защищён Qrator — без прокси скорее всего 401. - - Возвращает список результатов (топ-N) или пустой при ошибке. + DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси. + Playwright решает challenge автоматически (как реальный браузер). 
""" url = f"{_SEARCH_URL}?q={quote_plus(query)}" log.info("DNS search: %s", url) - last_err = None + html = None for attempt in range(max_retries + 1): - try: - with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS, - follow_redirects=True) as client: - resp = client.get(url) - except httpx.HTTPError as e: - last_err = e - log.warning("DNS request failed (attempt %d): %s", attempt + 1, e) - continue + html = playwright_engine.fetch_page( + url, + wait_selector="a[href*='/product/']", + wait_ms=5000, + timeout_ms=int(timeout * 1000), + ) + if html: + break - if resp.status_code == 200: - text = resp.text - if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower(): - log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1) - continue - return _parse_search_html(text, limit=limit) + if not html: + log.warning("DNS: no HTML for query=%r", query) + return [] + if "qrator" in html.lower()[:5000]: + log.warning("DNS: Qrator block for query=%r", query) + return [] - log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1) - - log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)", - max_retries + 1, query, last_err) - return [] + return _parse_search_html(html, limit=limit) def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]: diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py index 8defe65..34a2fbc 100644 --- a/backend-py/app/parsers/ozon.py +++ b/backend-py/app/parsers/ozon.py @@ -40,9 +40,12 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, if not html: log.warning("OZON: no HTML for query=%r", query) return [] - # Реальный anti-bot — это редирект на /robotcheck/ или специальная страница - if "/robotcheck/" in html or "Доступ ограничен" in html[:5000]: - log.warning("OZON: anti-bot block for query=%r", query) + # OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/ + import re as _re + 
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE) + page_title = title_m.group(1) if title_m else "" + if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]: + log.warning("OZON: anti-bot page (title=%r)", page_title) return [] return _parse_html(html, limit=limit) diff --git a/r.json b/r.json new file mode 100644 index 0000000..e69de29