From 0f2635d5f815042c1996ca0e6d8315380ee15be2 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 16:37:28 +0300 Subject: [PATCH] dns+ozon: 4 retries with proxy rotation (residential pool has dirty IPs) --- backend-py/app/parsers/dns.py | 35 ++++++++++++++++++---------------- backend-py/app/parsers/ozon.py | 34 ++++++++++++++++----------------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/backend-py/app/parsers/dns.py b/backend-py/app/parsers/dns.py index a512f0a..1b6c57d 100644 --- a/backend-py/app/parsers/dns.py +++ b/backend-py/app/parsers/dns.py @@ -46,34 +46,37 @@ _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽") def search_dns(query: str, limit: int = 1, timeout: float = 30.0, - max_retries: int = 1) -> list[dict[str, Any]]: - """Поиск на DNS через Playwright + residential proxy. + max_retries: int = 4) -> list[dict[str, Any]]: + """Поиск на DNS через Playwright + ротация residential прокси. - DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси. - Playwright решает challenge автоматически (как реальный браузер). + DNS защищён Qrator. Многие residential IP уже в чёрных списках, поэтому + делаем несколько попыток — на каждой берём новый прокси из пула. 
""" url = f"{_SEARCH_URL}?q={quote_plus(query)}" log.info("DNS search: %s", url) - html = None for attempt in range(max_retries + 1): html = playwright_engine.fetch_page( url, wait_selector="a[href*='/product/']", - wait_ms=5000, + wait_ms=4000, timeout_ms=int(timeout * 1000), ) - if html: - break + if not html: + log.warning("DNS attempt %d: no HTML", attempt + 1) + continue + # 403 от Qrator = IP в их чёрном списке, пробуем другой + if "HTTP 403" in html[:500] or "qrator" in html.lower()[:5000]: + log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt + 1) + continue + # Успех + results = _parse_search_html(html, limit=limit) + if results: + return results + log.info("DNS attempt %d: 0 results, retry", attempt + 1) - if not html: - log.warning("DNS: no HTML for query=%r", query) - return [] - if "qrator" in html.lower()[:5000]: - log.warning("DNS: Qrator block for query=%r", query) - return [] - - return _parse_search_html(html, limit=limit) + log.warning("DNS gave up after %d attempts for query=%r", max_retries + 1, query) + return [] def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]: diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py index 34a2fbc..396c226 100644 --- a/backend-py/app/parsers/ozon.py +++ b/backend-py/app/parsers/ozon.py @@ -22,11 +22,11 @@ _PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽") def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, - max_retries: int = 1) -> list[dict[str, Any]]: - """Поиск товара в OZON через Playwright.""" + max_retries: int = 4) -> list[dict[str, Any]]: + """Поиск товара в OZON через Playwright + ротация residential прокси.""" url = f"{_BASE_URL}/search/?text={quote_plus(query)}" - html = None + import re as _re for attempt in range(max_retries + 1): html = playwright_engine.fetch_page( url, @@ -34,21 +34,21 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, wait_ms=4000, timeout_ms=int(timeout * 1000), ) - if html: - break + if 
not html: + log.warning("OZON attempt %d: no HTML", attempt + 1) + continue + title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE) + page_title = title_m.group(1) if title_m else "" + if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]: + log.info("OZON attempt %d: anti-bot, retry with new proxy", attempt + 1) + continue + results = _parse_html(html, limit=limit) + if results: + return results + log.info("OZON attempt %d: 0 results, retry", attempt + 1) - if not html: - log.warning("OZON: no HTML for query=%r", query) - return [] - # OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/ - import re as _re - title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE) - page_title = title_m.group(1) if title_m else "" - if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]: - log.warning("OZON: anti-bot page (title=%r)", page_title) - return [] - - return _parse_html(html, limit=limit) + log.warning("OZON gave up after %d attempts", max_retries + 1) + return [] def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: