mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 18:24:49 +00:00
dns+ozon: 4 retries with proxy rotation (residential pool has dirty IPs)
This commit is contained in:
parent
796e20bc73
commit
0f2635d5f8
@ -46,34 +46,37 @@ _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
|||||||
|
|
||||||
|
|
||||||
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
|
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
|
||||||
max_retries: int = 1) -> list[dict[str, Any]]:
|
max_retries: int = 4) -> list[dict[str, Any]]:
|
||||||
"""Поиск на DNS через Playwright + residential proxy.
|
"""Поиск на DNS через Playwright + ротация residential прокси.
|
||||||
|
|
||||||
DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси.
|
DNS защищён Qrator. Многие residential IP уже в чёрных списках, поэтому
|
||||||
Playwright решает challenge автоматически (как реальный браузер).
|
делаем несколько попыток — на каждой берём новый прокси из пула.
|
||||||
"""
|
"""
|
||||||
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
||||||
log.info("DNS search: %s", url)
|
log.info("DNS search: %s", url)
|
||||||
|
|
||||||
html = None
|
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
html = playwright_engine.fetch_page(
|
html = playwright_engine.fetch_page(
|
||||||
url,
|
url,
|
||||||
wait_selector="a[href*='/product/']",
|
wait_selector="a[href*='/product/']",
|
||||||
wait_ms=5000,
|
wait_ms=4000,
|
||||||
timeout_ms=int(timeout * 1000),
|
timeout_ms=int(timeout * 1000),
|
||||||
)
|
)
|
||||||
if html:
|
|
||||||
break
|
|
||||||
|
|
||||||
if not html:
|
if not html:
|
||||||
log.warning("DNS: no HTML for query=%r", query)
|
log.warning("DNS attempt %d: no HTML", attempt + 1)
|
||||||
return []
|
continue
|
||||||
if "qrator" in html.lower()[:5000]:
|
# 403 от Qrator = IP в их чёрном списке, пробуем другой
|
||||||
log.warning("DNS: Qrator block for query=%r", query)
|
if "HTTP 403" in html[:500] or "qrator" in html.lower()[:5000]:
|
||||||
return []
|
log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt + 1)
|
||||||
|
continue
|
||||||
|
# Успех
|
||||||
|
results = _parse_search_html(html, limit=limit)
|
||||||
|
if results:
|
||||||
|
return results
|
||||||
|
log.info("DNS attempt %d: 0 results, retry", attempt + 1)
|
||||||
|
|
||||||
return _parse_search_html(html, limit=limit)
|
log.warning("DNS gave up after %d attempts for query=%r", max_retries + 1, query)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||||
|
|||||||
@ -22,11 +22,11 @@ _PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽")
|
|||||||
|
|
||||||
|
|
||||||
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
||||||
max_retries: int = 1) -> list[dict[str, Any]]:
|
max_retries: int = 4) -> list[dict[str, Any]]:
|
||||||
"""Поиск товара в OZON через Playwright."""
|
"""Поиск товара в OZON через Playwright + ротация residential прокси."""
|
||||||
url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
|
url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
|
||||||
|
|
||||||
html = None
|
import re as _re
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
html = playwright_engine.fetch_page(
|
html = playwright_engine.fetch_page(
|
||||||
url,
|
url,
|
||||||
@ -34,21 +34,21 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
|||||||
wait_ms=4000,
|
wait_ms=4000,
|
||||||
timeout_ms=int(timeout * 1000),
|
timeout_ms=int(timeout * 1000),
|
||||||
)
|
)
|
||||||
if html:
|
|
||||||
break
|
|
||||||
|
|
||||||
if not html:
|
if not html:
|
||||||
log.warning("OZON: no HTML for query=%r", query)
|
log.warning("OZON attempt %d: no HTML", attempt + 1)
|
||||||
return []
|
continue
|
||||||
# OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
|
|
||||||
import re as _re
|
|
||||||
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
|
title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
|
||||||
page_title = title_m.group(1) if title_m else ""
|
page_title = title_m.group(1) if title_m else ""
|
||||||
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
|
if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
|
||||||
log.warning("OZON: anti-bot page (title=%r)", page_title)
|
log.info("OZON attempt %d: anti-bot, retry with new proxy", attempt + 1)
|
||||||
return []
|
continue
|
||||||
|
results = _parse_html(html, limit=limit)
|
||||||
|
if results:
|
||||||
|
return results
|
||||||
|
log.info("OZON attempt %d: 0 results, retry", attempt + 1)
|
||||||
|
|
||||||
return _parse_html(html, limit=limit)
|
log.warning("OZON gave up after %d attempts", max_retries + 1)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user