From c5f662f53dbbee8e201edb492017ce9dbe17ed59 Mon Sep 17 00:00:00 2001
From: wasrusgen <vasrusgen@gmail.com>
Date: Mon, 11 May 2026 13:57:18 +0300
Subject: [PATCH] citilink: rewrite parser to walk up from a[href*=/product/]
 (CSS-in-JS resistant)

---
 backend-py/app/parsers/citilink.py | 117 ++++++++++++++++-------------
 1 file changed, 65 insertions(+), 52 deletions(-)

diff --git a/backend-py/app/parsers/citilink.py b/backend-py/app/parsers/citilink.py
index 8f9ee23..0560d2c 100644
--- a/backend-py/app/parsers/citilink.py
+++ b/backend-py/app/parsers/citilink.py
@@ -1,7 +1,7 @@
 """Парсер Citilink (citilink.ru) — через Playwright.
 
 Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует
-прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`.
+прокси. Товары — `a[href*='/product/']`, ближайший родительский div — карточка.
 """
 from __future__ import annotations
 import logging
@@ -17,10 +17,10 @@ log = logging.getLogger("zov.parser.citilink")
 
 _BASE_URL = "https://www.citilink.ru"
 _SEARCH_URL = "https://www.citilink.ru/search/"
-_PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽|(\d[\d\s  ]+)\s*руб")
+_PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽")
 
 
-def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
+def search_citilink(query: str, limit: int = 3, timeout: float = 35.0,
                     max_retries: int = 1) -> list[dict[str, Any]]:
     """Поиск товара на Citilink через Playwright."""
     url = f"{_SEARCH_URL}?text={quote_plus(query)}"
@@ -29,8 +29,8 @@ def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
     for attempt in range(max_retries + 1):
         html = playwright_engine.fetch_page(
             url,
-            wait_selector="[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']",
-            wait_ms=4000,
+            wait_selector="a[href*='/product/']",
+            wait_ms=8000,  # товары грузятся через XHR, нужна пауза
             timeout_ms=int(timeout * 1000),
         )
         if html:
@@ -46,73 +46,86 @@ def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
 def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
     soup = BeautifulSoup(html, "html.parser")
     results: list[dict[str, Any]] = []
+    seen_urls = set()
 
-    # Карточки товаров
-    cards = (
-        soup.select("[data-meta-name*='Snippet']")
-        or soup.select("[data-meta-name*='ProductCard']")
-        or soup.select("div.ProductCardHorizontal")
-    )
-
-    for card in cards:
+    for link in soup.select("a[href*='/product/']"):
         if len(results) >= limit:
             break
-        item = _extract_card(card)
+        href = link.get("href") or ""
+        url_clean = href.split("?")[0]
+        if url_clean in seen_urls:
+            continue
+        seen_urls.add(url_clean)
+
+        full_url = href if href.startswith("http") else f"{_BASE_URL}{href}"
+
+        # Walk up from the link to its card. Citilink uses CSS-in-JS, so no class names are used:
+        # start at the nearest parent div and treat the "₽" sign in its text as the card marker.
+        card = link.find_parent("div")
+        if not card:
+            continue
+        # If this div's text has no "₽", climb to ancestor divs (at most 3 levels up).
+        for _ in range(3):
+            if "₽" in card.get_text():
+                break
+            parent = card.find_parent("div")
+            if not parent:
+                break
+            card = parent
+
+        item = _extract_card(card, full_url)
         if item:
             results.append(item)
 
     return results
 
 
-def _extract_card(card) -> dict[str, Any] | None:
-    """Достаём title, url, цену, картинку, рейтинг, отзывы."""
-    # Ссылка на товар
-    link = card.select_one("a[href*='/product/']") or card.find("a", href=True)
-    if not link:
-        return None
-    href = link.get("href") or ""
-    if "/product/" not in href and "/promo/" not in href:
-        return None
-    url = href if href.startswith("http") else f"{_BASE_URL}{href}"
-
-    # Название
-    title = ""
-    # Citilink использует разные классы — пробуем несколько
-    for sel in [
-        "[data-meta-name*='Snippet__title']",
-        "[data-meta-name*='ProductCardHorizontal__title']",
-        "a[href*='/product/'] span",
-        "a[title]",
-    ]:
-        el = card.select_one(sel)
-        if el:
-            title = (el.get("title") or el.get_text(strip=True)).strip()
-            if title and len(title) > 5:
-                break
-    if not title:
-        # Резерв — длинный текст в карточке
-        for s in card.find_all(["span", "div"]):
-            t = s.get_text(strip=True)
-            if t and 15 < len(t) < 200 and "₽" not in t and "%" not in t:
-                title = t
-                break
-    if not title or len(title) < 5:
-        return None
-
+def _extract_card(card, url: str) -> dict[str, Any] | None:
+    """Из карточки достаём название, цену, картинку."""
     full_text = card.get_text(" ", strip=True)
 
     # Цена
     price = None
     for m in _PRICE_RE.finditer(full_text):
-        raw = (m.group(1) or m.group(2) or "").replace(" ", "").replace(" ", "").replace(" ", "")
+        raw = m.group(1).replace(" ", "").replace(" ", "").replace(" ", "")
         try:
             v = int(raw)
-            if 100 < v < 10_000_000:  # разумные пределы
+            if 1000 < v < 10_000_000:
                 price = v
                 break
         except ValueError:
             pass
 
+    # Название — ищем по типу «Холодильник Bosch KGN…»
+    # Citilink обычно выделяет название в отдельном span внутри карточки
+    title = ""
+    # Сначала пробуем явные селекторы
+    for sel in [
+        "[data-meta-name*='Snippet__title']",
+        "[data-meta-name*='title']",
+        "a[href*='/product/']",
+        "h2", "h3",
+    ]:
+        el = card.select_one(sel)
+        if el:
+            t = (el.get("title") or el.get_text(strip=True)).strip()
+            if t and len(t) > 10:
+                title = t
+                break
+    # Fallback: scan span/div/a texts, skipping ones with price, percent, installment or view-count wording
+    if not title:
+        candidates = []
+        for s in card.find_all(["span", "div", "a"]):
+            t = s.get_text(" ", strip=True)
+            if 15 < len(t) < 200 and "₽" not in t and "%" not in t and "Рассрочка" not in t and "просмотр" not in t.lower():
+                candidates.append(t)
+        if candidates:
+            # Heuristic: take the longest candidate text; no keyword matching is performed here.
+            candidates.sort(key=len, reverse=True)
+            title = candidates[0]
+    if not title or len(title) < 10:
+        return None
+
     # Картинка
     img_url = None
     img_el = card.find("img")
@@ -125,7 +138,7 @@ def _extract_card(card) -> dict[str, Any] | None:
 
     # Рейтинг
     rating = None
-    m = re.search(r"(\d[.,]\d)\s*[\\(\\d]", full_text)
+    m = re.search(r"(\d[.,]\d)\s*[\\(\d]", full_text)
     if m:
         try:
             r = float(m.group(1).replace(",", "."))