From 0f2635d5f815042c1996ca0e6d8315380ee15be2 Mon Sep 17 00:00:00 2001
From: wasrusgen <vasrusgen@gmail.com>
Date: Mon, 11 May 2026 16:37:28 +0300
Subject: [PATCH] dns+ozon: 4 retries with proxy rotation (residential pool has
 dirty IPs)

---
 backend-py/app/parsers/dns.py  | 35 ++++++++++++++++++----------------
 backend-py/app/parsers/ozon.py | 34 ++++++++++++++++-----------------
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/backend-py/app/parsers/dns.py b/backend-py/app/parsers/dns.py
index a512f0a..1b6c57d 100644
--- a/backend-py/app/parsers/dns.py
+++ b/backend-py/app/parsers/dns.py
@@ -46,34 +46,37 @@ _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
 
 
 def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
-               max_retries: int = 1) -> list[dict[str, Any]]:
-    """Поиск на DNS через Playwright + residential proxy.
+               max_retries: int = 4) -> list[dict[str, Any]]:
+    """Search DNS via Playwright with residential proxy rotation.
 
-    DNS защищён Qrator (JS challenge) — обычный HTTP не пройдёт даже с прокси.
-    Playwright решает challenge автоматически (как реальный браузер).
+    DNS is protected by Qrator. Many residential IPs are already blacklisted,
+    so we make several attempts, taking a new proxy from the pool on each one.
     """
     url = f"{_SEARCH_URL}?q={quote_plus(query)}"
     log.info("DNS search: %s", url)
 
-    html = None
     for attempt in range(max_retries + 1):
         html = playwright_engine.fetch_page(
             url,
             wait_selector="a[href*='/product/']",
-            wait_ms=5000,
+            wait_ms=4000,
             timeout_ms=int(timeout * 1000),
         )
-        if html:
-            break
+        if not html:
+            log.warning("DNS attempt %d: no HTML", attempt + 1)
+            continue
+        # A 403 from Qrator means this IP is on their blocklist; try another one
+        if "HTTP 403" in html[:500] or "qrator" in html.lower()[:5000]:
+            log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt + 1)
+            continue
+        # Page fetched and not blocked: parse it (empty results trigger a retry)
+        results = _parse_search_html(html, limit=limit)
+        if results:
+            return results
+        log.info("DNS attempt %d: 0 results, retry", attempt + 1)
 
-    if not html:
-        log.warning("DNS: no HTML for query=%r", query)
-        return []
-    if "qrator" in html.lower()[:5000]:
-        log.warning("DNS: Qrator block for query=%r", query)
-        return []
-
-    return _parse_search_html(html, limit=limit)
+    log.warning("DNS gave up after %d attempts for query=%r", max_retries + 1, query)
+    return []
 
 
 def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py
index 34a2fbc..396c226 100644
--- a/backend-py/app/parsers/ozon.py
+++ b/backend-py/app/parsers/ozon.py
@@ -22,11 +22,11 @@ _PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽")
 
 
 def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
-                max_retries: int = 1) -> list[dict[str, Any]]:
-    """Поиск товара в OZON через Playwright."""
+                max_retries: int = 4) -> list[dict[str, Any]]:
+    """Search for a product on OZON via Playwright with residential proxy rotation."""
     url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
 
-    html = None
+    import re as _re
     for attempt in range(max_retries + 1):
         html = playwright_engine.fetch_page(
             url,
@@ -34,21 +34,21 @@ def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
             wait_ms=4000,
             timeout_ms=int(timeout * 1000),
         )
-        if html:
-            break
+        if not html:
+            log.warning("OZON attempt %d: no HTML", attempt + 1)
+            continue
+        title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
+        page_title = title_m.group(1) if title_m else ""
+        if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
+            log.info("OZON attempt %d: anti-bot, retry with new proxy", attempt + 1)
+            continue
+        results = _parse_html(html, limit=limit)
+        if results:
+            return results
+        log.info("OZON attempt %d: 0 results, retry", attempt + 1)
 
-    if not html:
-        log.warning("OZON: no HTML for query=%r", query)
-        return []
-    # OZON показывает "Доступ ограничен" как <title>, либо редирект на /antibot/
-    import re as _re
-    title_m = _re.search(r"<title>(.*?)</title>", html, _re.IGNORECASE)
-    page_title = title_m.group(1) if title_m else ""
-    if "доступ ограничен" in page_title.lower() or "/antibot/" in html[:5000]:
-        log.warning("OZON: anti-bot page (title=%r)", page_title)
-        return []
-
-    return _parse_html(html, limit=limit)
+    log.warning("OZON gave up after %d attempts", max_retries + 1)
+    return []
 
 
 def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: