dns+ozon: 4 retries with proxy rotation (residential pool has dirty IPs)

This commit is contained in:
wasrusgen 2026-05-11 16:37:28 +03:00
parent 796e20bc73
commit 0f2635d5f8
2 changed files with 36 additions and 33 deletions

View File

@@ -46,34 +46,37 @@ _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
def search_dns(query: str, limit: int = 1, timeout: float = 30.0, def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
max_retries: int = 1) -> list[dict[str, Any]]: max_retries: int = 4) -> list[dict[str, Any]]:
"""Поиск на DNS через Playwright + residential proxy. """Поиск на DNS через Playwright + ротация residential прокси.
DNS защищён Qrator (JS challenge) обычный HTTP не пройдёт даже с прокси. DNS защищён Qrator. Многие residential IP уже в чёрных списках, поэтому
Playwright решает challenge автоматически (как реальный браузер). делаем несколько попыток на каждой берём новый прокси из пула.
""" """
url = f"{_SEARCH_URL}?q={quote_plus(query)}" url = f"{_SEARCH_URL}?q={quote_plus(query)}"
log.info("DNS search: %s", url) log.info("DNS search: %s", url)
html = None
for attempt in range(max_retries + 1): for attempt in range(max_retries + 1):
html = playwright_engine.fetch_page( html = playwright_engine.fetch_page(
url, url,
wait_selector="a[href*='/product/']", wait_selector="a[href*='/product/']",
wait_ms=5000, wait_ms=4000,
timeout_ms=int(timeout * 1000), timeout_ms=int(timeout * 1000),
) )
if html:
break
if not html: if not html:
log.warning("DNS: no HTML for query=%r", query) log.warning("DNS attempt %d: no HTML", attempt + 1)
return [] continue
if "qrator" in html.lower()[:5000]: # 403 от Qrator = IP в их чёрном списке, пробуем другой
log.warning("DNS: Qrator block for query=%r", query) if "HTTP 403" in html[:500] or "qrator" in html.lower()[:5000]:
return [] log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt + 1)
continue
# Успех
results = _parse_search_html(html, limit=limit)
if results:
return results
log.info("DNS attempt %d: 0 results, retry", attempt + 1)
return _parse_search_html(html, limit=limit) log.warning("DNS gave up after %d attempts for query=%r", max_retries + 1, query)
return []
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]: def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:

View File

@ -22,11 +22,11 @@ _PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽")
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
max_retries: int = 1) -> list[dict[str, Any]]: max_retries: int = 4) -> list[dict[str, Any]]:
"""Поиск товара в OZON через Playwright.""" """Поиск товара в OZON через Playwright + ротация residential прокси."""
url = f"{_BASE_URL}/search/?text={quote_plus(query)}" url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
html = None import re as _re
for attempt in range(max_retries + 1): for attempt in range(max_retries + 1):
html = playwright_engine.fetch_page( html = playwright_engine.fetch_page(
url, url,
@@ -34,21 +34,21 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
wait_ms=4000, wait_ms=4000,
timeout_ms=int(timeout * 1000), timeout_ms=int(timeout * 1000),
) )
if html:
break
if not html: if not html:
log.warning("OZON: no HTML for query=%r", query) log.warning("OZON attempt %d: no HTML", attempt + 1)
return [] continue
# OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
import re as _re
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE) title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
page_title = title_m.group(1) if title_m else "" page_title = title_m.group(1) if title_m else ""
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]: if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
log.warning("OZON: anti-bot page (title=%r)", page_title) log.info("OZON attempt %d: anti-bot, retry with new proxy", attempt + 1)
return [] continue
results = _parse_html(html, limit=limit)
if results:
return results
log.info("OZON attempt %d: 0 results, retry", attempt + 1)
return _parse_html(html, limit=limit) log.warning("OZON gave up after %d attempts", max_retries + 1)
return []
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: