backend: working parsers — OZON + Citilink (DOM via Playwright) + WB

DIAGNOSTIC RESULTS:
- OZON: 19 product links via Playwright from a bare VPS IP ✓
- Citilink: 112 data-meta-name Snippets ✓
- Wildberries: JSON API works with delays ✓
- Я.Маркет, DNS: blocked by ASN (need residential proxy)

OZON PARSER:
- Pure Playwright DOM (composer-api dropped — it was blocked)
- Selects a[href*='/product/'], walks up to the card div, extracts title/price/img
- Filters fake 'titles' like Распродажа, Скидка

CITILINK PARSER (new):
- Selects [data-meta-name*='Snippet'] or ProductCard markers
- Fallback chain of multiple title selectors
- Filters out non-product hits

PARSERS/__init__.py:
- DEFAULT_SOURCES = (ozon, citilink, wb) — all work without a proxy
- Я.Маркет, DNS kept but not default — usable once a residential proxy is added

NEW ENDPOINT:
- GET /api/parse_citilink?q=...&limit=N
2026-06-03 16:24:50 +00:00 · 2026-05-11 13:53:07 +03:00 · 2026-05-11 13:53:07 +03:00 · e8b487891f
commit e8b487891f
parent 5fdae262ef
4 changed files with 280 additions and 211 deletions
--- a/backend-py/app/main.py
+++ b/backend-py/app/main.py
@ -13,7 +13,7 @@ from .config import get_config
 from .auth import verify_init_data
 from . import sheets, ai, telegram as tg, proxy_pool
 from . import parsers
-from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym
+from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym, citilink as parser_cl

 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
 log = logging.getLogger("zov.backend")
@ -195,6 +195,17 @@ def api_parse_yamarket(q: str = "", limit: int = 3):
        return {"ok": False, "error": str(e), "query": q}


+@app.get("/api/parse_citilink")
+def api_parse_citilink(q: str = "", limit: int = 3):
+    """Search Citilink for ``q`` and return the parsed products.
+
+    ``limit`` is clamped to the range 1..10 before it is passed to the parser.
+
+    Returns ``{"error": "missing_query"}`` when ``q`` is empty, a dict with
+    ``ok=True``, ``query``, ``count`` and ``results`` on success, and a dict
+    with ``ok=False``, ``error`` and ``query`` when the parser raises.
+    """
+    if not q:
+        return {"error": "missing_query"}
+    try:
+        results = parser_cl.search_citilink(q, limit=min(max(1, limit), 10))
+        return {"ok": True, "query": q, "count": len(results), "results": results}
+    except Exception as e:
+        # Best-effort endpoint: the failure is reported in the response body,
+        # but the traceback must not be lost — keep it in the server log.
+        log.exception("Citilink parse failed for query=%r", q)
+        return {"ok": False, "error": str(e), "query": q}
+
+
@app.get("/api/parse_all")
 def api_parse_all(q: str = ""):
    """Спрашивает все источники и возвращает агрегированный результат."""
--- a/backend-py/app/parsers/__init__.py
+++ b/backend-py/app/parsers/__init__.py
@ -32,14 +32,22 @@ from .dns import search_dns
 from .wb import search_wb
 from .ozon import search_ozon
 from .yamarket import search_yamarket
+from .citilink import search_citilink

 log = logging.getLogger("zov.parser")

-__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket",
+__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket", "search_citilink",
           "enrich_one", "enrich_models"]

+# Источники по умолчанию (работают с DC-IP без прокси):
+# - ozon, citilink: Playwright
+# - wb: прямой JSON API (с задержкой)
+# Опциональные (требуют residential proxy):
+# - yamarket, dns
+DEFAULT_SOURCES = ("ozon", "citilink", "wb")

-def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> dict[str, Any]:
+
+def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
    """Спрашивает все указанные источники и объединяет лучшее в единый отчёт.

    Возвращает:
@ -59,8 +67,9 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -
    """
    fetchers = {
        "wb":       lambda: _safe_first(search_wb, query),
-        "yamarket": lambda: _safe_first(search_yamarket, query),
        "ozon":     lambda: _safe_first(search_ozon, query),
+        "citilink": lambda: _safe_first(search_citilink, query),
+        "yamarket": lambda: _safe_first(search_yamarket, query),
        "dns":      lambda: _safe_first(search_dns, query),
    }

@ -87,8 +96,8 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -
        stores = items["yamarket"]["stores_count"]

    best_url = None
-    # Приоритет: yamarket (агрегатор) → wb → ozon → dns
-    for src in ("yamarket", "wb", "ozon", "dns"):
+    # Приоритет: ozon → citilink → wb → yamarket → dns
+    for src in ("ozon", "citilink", "wb", "yamarket", "dns"):
        i = items.get(src)
        if i and i.get("url"):
            best_url = i["url"]
@ -107,7 +116,7 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -


 def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5,
-                  sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> list[dict[str, Any]]:
+                  sources: tuple = DEFAULT_SOURCES) -> list[dict[str, Any]]:
    """Обогащает список моделей от AI данными со всех источников."""
    enriched: list[dict[str, Any]] = []
    for i, m in enumerate(models):
--- a/backend-py/app/parsers/citilink.py
+++ b/backend-py/app/parsers/citilink.py
@ -0,0 +1,157 @@
+"""Парсер Citilink (citilink.ru) — через Playwright.
+
+Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует
+прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`.
+"""
+from __future__ import annotations
+import logging
+import re
+from typing import Any
+from urllib.parse import quote_plus
+
+from bs4 import BeautifulSoup
+
+from . import playwright_engine
+
+log = logging.getLogger("zov.parser.citilink")
+
+_BASE_URL = "https://www.citilink.ru"
+_SEARCH_URL = "https://www.citilink.ru/search/"
+_PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽|(\d[\d\s  ]+)\s*руб")
+
+
+def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
+                    max_retries: int = 1) -> list[dict[str, Any]]:
+    """Search Citilink for *query* by rendering the results page with Playwright.
+
+    The search page is requested up to ``max_retries + 1`` times; the first
+    truthy result of ``playwright_engine.fetch_page`` is parsed into at most
+    ``limit`` product dicts. When no attempt yields HTML, a warning is logged
+    and an empty list is returned.
+    """
+    search_url = f"{_SEARCH_URL}?text={quote_plus(query)}"
+    card_selector = "[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']"
+    timeout_ms = int(timeout * 1000)
+
+    for _ in range(max_retries + 1):
+        page_html = playwright_engine.fetch_page(
+            search_url,
+            wait_selector=card_selector,
+            wait_ms=4000,
+            timeout_ms=timeout_ms,
+        )
+        if page_html:
+            # First successful fetch wins; no further attempts are made.
+            return _parse_html(page_html, limit=limit)
+
+    log.warning("Citilink: no HTML for query=%r", query)
+    return []
+
+
+def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
+    """Parse a rendered Citilink search page into at most ``limit`` product dicts.
+
+    Cards are located with the first selector that matches anything; each
+    candidate is passed to ``_extract_card`` and candidates it rejects are
+    skipped. A product URL is reported only once.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    results: list[dict[str, Any]] = []
+    seen_urls: set[str] = set()
+
+    # Product cards: try the known markers in order, first non-empty match wins.
+    cards = (
+        soup.select("[data-meta-name*='Snippet']")
+        or soup.select("[data-meta-name*='ProductCard']")
+        or soup.select("div.ProductCardHorizontal")
+    )
+
+    for card in cards:
+        if len(results) >= limit:
+            break
+        item = _extract_card(card)
+        if not item:
+            continue
+        # The substring selectors also match elements nested inside a card
+        # (e.g. Snippet__title), so the same product can be extracted more
+        # than once. Keep the first hit per URL, ignoring the query string.
+        url_key = item["url"].split("?")[0]
+        if url_key in seen_urls:
+            continue
+        seen_urls.add(url_key)
+        results.append(item)
+
+    return results
+
+
+def _extract_card(card) -> dict[str, Any] | None:
+    """Build a product dict from one Citilink search-result card.
+
+    Args:
+        card: parsed HTML element for the card; it must support
+            ``select_one``, ``find``, ``find_all`` and ``get_text``.
+
+    Returns:
+        A dict with ``title`` (cut to 250 chars), ``url``, ``image_url``,
+        ``price_min_rub``, ``rating``, ``reviews_count`` and
+        ``source="citilink"``; ``price_max_rub`` and ``stores_count`` are
+        always ``None`` and ``specs`` is always empty. ``None`` is returned
+        when the card has no product/promo link or no usable title.
+    """
+    # Product link: prefer an explicit /product/ anchor, else the first link.
+    link = card.select_one("a[href*='/product/']") or card.find("a", href=True)
+    if not link:
+        return None
+    href = link.get("href") or ""
+    if "/product/" not in href and "/promo/" not in href:
+        return None
+    url = href if href.startswith("http") else f"{_BASE_URL}{href}"
+
+    # Title: Citilink uses several markups, so try the selectors in order and
+    # take the first candidate long enough to be a real product name.
+    title = ""
+    short_title = ""
+    for sel in (
+        "[data-meta-name*='Snippet__title']",
+        "[data-meta-name*='ProductCardHorizontal__title']",
+        "a[href*='/product/'] span",
+        "a[title]",
+    ):
+        el = card.select_one(sel)
+        if el is None:
+            continue
+        candidate = (el.get("title") or el.get_text(strip=True)).strip()
+        if len(candidate) > 5:
+            title = candidate
+            break
+        # Remember a short hit, but do not let it block the fallback scan below.
+        short_title = candidate or short_title
+    if not title:
+        # Fallback: first reasonably long text that is neither a price nor a discount.
+        for node in card.find_all(["span", "div"]):
+            text = node.get_text(strip=True)
+            if 15 < len(text) < 200 and "₽" not in text and "%" not in text:
+                title = text
+                break
+    if not title:
+        # Last resort: the short selector hit (still subject to the length check).
+        title = short_title
+    if len(title) < 5:
+        return None
+
+    full_text = card.get_text(" ", strip=True)
+
+    # Price: first amount in the card text that falls within sane bounds.
+    price = None
+    for m in _PRICE_RE.finditer(full_text):
+        # split()/join drops every kind of whitespace used as a thousands
+        # separator (plain, no-break, narrow no-break, thin space, ...).
+        digits = "".join((m.group(1) or m.group(2) or "").split())
+        try:
+            value = int(digits)
+        except ValueError:
+            continue
+        if 100 < value < 10_000_000:
+            price = value
+            break
+
+    # Image: first <img> in the card, skipping inline data: placeholders.
+    img_url = None
+    img_el = card.find("img")
+    if img_el:
+        src = img_el.get("src") or img_el.get("data-src") or ""
+        if src and "data:image" not in src:
+            if src.startswith("//"):
+                src = "https:" + src
+            img_url = src
+
+    # Rating: a one-decimal number followed by "(" or a digit, e.g. "4.7 (1242".
+    rating = None
+    m = re.search(r"(\d[.,]\d)\s*[(\d]", full_text)
+    if m:
+        score = float(m.group(1).replace(",", "."))
+        if 0 < score <= 5.0:
+            rating = score
+
+    # Reviews: the lookbehind stops the count from starting at the fractional
+    # digit of a preceding rating ("4.8 12 отзывов" must give 12, not 812).
+    reviews = None
+    m = re.search(r"(?<![\d.,])(\d[\d\s]*)\s*(?:отзыв|оценок)", full_text)
+    if m:
+        try:
+            reviews = int("".join(m.group(1).split()))
+        except ValueError:
+            pass
+
+    return {
+        "title": title[:250],
+        "url": url,
+        "image_url": img_url,
+        "price_min_rub": price,
+        "price_max_rub": None,
+        "rating": rating,
+        "reviews_count": reviews,
+        "stores_count": None,
+        "specs": {},
+        "source": "citilink",
+    }
--- a/backend-py/app/parsers/ozon.py
+++ b/backend-py/app/parsers/ozon.py
@ -1,9 +1,9 @@
-"""Парсер OZON — через composer-api (внутренний JSON API сайта).
+"""Парсер OZON — через Playwright (рендер JS).

-OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`.
-JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`.
+OZON блокирует прямой HTTP с DC-IP (403/307). С Playwright рендерит обычную
+HTML-страницу `/search/?text=…`, в которой есть карточки `a[href*='/product/']`.

-Без прокси возвращает 307/403. Через резидентный РФ-IP проходит.
+В карточке: название, цена, картинка, рейтинг, отзывы — в DOM рядом со ссылкой.
 """
 from __future__ import annotations
 import logging
@ -11,249 +11,141 @@ import re
 from typing import Any
 from urllib.parse import quote_plus

-import httpx
 from bs4 import BeautifulSoup

-from .. import proxy_pool
 from . import playwright_engine

 log = logging.getLogger("zov.parser.ozon")

 _BASE_URL = "https://www.ozon.ru"
-_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2"
-
-_HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-                  "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
-    "Accept": "application/json",
-    "Accept-Language": "ru-RU,ru;q=0.9",
-    "x-o3-app-name": "dweb_client",
-    "x-o3-app-version": "release_18.04",
-    "x-o3-page-type": "search",
-    "Referer": "https://www.ozon.ru/",
-}
-
-_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
+_PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽")


 def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
-                max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]:
-    """Поиск товара в OZON.
+                max_retries: int = 1) -> list[dict[str, Any]]:
+    """Поиск товара в OZON через Playwright."""
+    url = f"{_BASE_URL}/search/?text={quote_plus(query)}"

-    Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно).
-    """
-    # Путь 1: быстрый composer-api
-    url_param = f"/search/?text={quote_plus(query)}&from_global=true"
-    params = {"url": url_param}
+    html = None
    for attempt in range(max_retries + 1):
-        try:
-            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
-                                            follow_redirects=False) as client:
-                resp = client.get(_API_URL, params=params)
-            if resp.status_code == 200:
-                try:
-                    return _extract_products(resp.json(), limit=limit)
-                except Exception:
-                    pass
-            log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code)
-        except httpx.HTTPError as e:
-            log.debug("OZON composer-api err: %s", e)
+        html = playwright_engine.fetch_page(
+            url,
+            wait_selector="a[href*='/product/']",
+            wait_ms=4000,
+            timeout_ms=int(timeout * 1000),
+        )
+        if html:
+            break

-    # Путь 2: Playwright (рендерим обычную HTML-страницу поиска)
-    if not use_playwright:
-        return []
-    log.info("OZON falling back to Playwright for query=%r", query)
-    page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
-    html = playwright_engine.fetch_page(
-        page_url,
-        wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']",
-        wait_ms=3500,
-        timeout_ms=int(timeout * 1000),
-    )
    if not html:
+        log.warning("OZON: no HTML for query=%r", query)
        return []
-    return _parse_html_via_dom(html, limit=limit)
+    if "robotcheck" in html.lower() or "challenge" in html.lower()[:5000]:
+        log.warning("OZON: anti-bot challenge for query=%r", query)
+        return []
+
+    return _parse_html(html, limit=limit)


-def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]:
-    """Fallback: парсим товары из отрендеренного Chrome HTML."""
+def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
    soup = BeautifulSoup(html, "html.parser")
-    seen = set()
    results: list[dict[str, Any]] = []
+    seen_urls = set()

-    for link in soup.select("a[href*='/product/']"):
+    # Находим все ссылки на товары
+    product_links = soup.select("a[href*='/product/']")
+
+    for link in product_links:
        if len(results) >= limit:
            break
+
        href = link.get("href") or ""
-        if href in seen:
+        # Нормализация URL — убираем query params для дедупа
+        url_clean = href.split("?")[0]
+        if url_clean in seen_urls:
            continue
-        seen.add(href)
+        seen_urls.add(url_clean)

-        # Поднимаемся до карточки
-        card = link.find_parent("div") or link
-        title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else ""
-        if not title or len(title) < 5:
+        full_url = href if href.startswith("http") else f"{_BASE_URL}{href}"
+
+        # Поднимаемся до карточки — у OZON это обычно ближайший div с tile-* классом
+        card = (
+            link.find_parent("div", class_=re.compile("tile|search-item|product"))
+            or link.find_parent("div")
+        )
+        if not card:
            continue

-        url = href if href.startswith("http") else f"{_BASE_URL}{href}"
-        url = url.split("?")[0]
-
-        # Цена в ближайшем родителе
-        price = None
-        price_card = link.find_parent("div", recursive=True)
-        if price_card:
-            txt = price_card.get_text(" ", strip=True)
-            m = _PRICE_RE.search(txt)
-            if m:
-                price = _try_int(m.group(1).replace(" ", ""))
-
-        # Картинка в карточке
-        img = None
-        img_el = card.find("img") if card else None
-        if img_el:
-            src = img_el.get("src") or ""
-            if src.startswith("//"):
-                src = "https:" + src
-            if src and "data:image" not in src:
-                img = src
-
-        results.append({
-            "title": title[:200],
-            "url": url,
-            "image_url": img,
-            "price_min_rub": price,
-            "price_max_rub": None,
-            "rating": None,
-            "reviews_count": None,
-            "stores_count": None,
-            "specs": {},
-            "source": "ozon",
-        })
+        item = _extract_from_card(card, full_url, link)
+        if item and item.get("title") and len(item["title"]) > 5:
+            results.append(item)

    return results


-def _try_int(v: Any) -> int | None:
-    if v is None:
-        return None
-    try:
-        return int(float(str(v).replace(" ", "").replace(",", ".")))
-    except (ValueError, TypeError):
-        return None
+def _extract_from_card(card, url: str, link_el) -> dict[str, Any] | None:
+    """Достаём данные из карточки OZON: title, price, image, rating, reviews."""
+    full_text = card.get_text(" ", strip=True)

-
-def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
-    """OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'."""
-    widget_states = data.get("widgetStates") or {}
-    products: list[dict[str, Any]] = []
-
-    for key, raw in widget_states.items():
-        if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key:
-            continue
-        try:
-            import json as _j
-            w = _j.loads(raw) if isinstance(raw, str) else raw
-        except Exception:
-            continue
-
-        items = w.get("items") or w.get("products") or []
-        for it in items:
-            if len(products) >= limit:
+    # Название — может быть прямо в ссылке, либо в соседнем span
+    title = link_el.get("title") or link_el.get_text(strip=True) or ""
+    if not title or len(title) < 5 or title in ("Распродажа", "Скидка", "Топ"):
+        # Ищем во вложенных span — обычно длинные строки = название
+        spans = card.find_all("span")
+        for s in spans:
+            t = s.get_text(strip=True)
+            if t and len(t) > 15 and len(t) < 200 and "₽" not in t and "%" not in t:
+                title = t
                break
-            item = _build_item(it)
-            if item:
-                products.append(item)
-        if len(products) >= limit:
-            break
-
-    return products
-
-
-def _build_item(it: dict[str, Any]) -> dict[str, Any] | None:
-    """Парсит карточку товара из OZON widget items[]."""
-    # Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] }
-    sku = it.get("sku") or it.get("id")
-    if not sku:
+    title = title.strip()
+    if not title or len(title) < 5:
        return None

-    link = (it.get("action") or {}).get("link") or ""
-    url = f"{_BASE_URL}{link}" if link.startswith("/") else link
+    # Цена — первое число с ₽ в карточке (минимальная)
+    price = None
+    m = _PRICE_RE.search(full_text)
+    if m:
+        raw = m.group(1).replace(" ", "").replace(" ", "").replace(" ", "")
+        try:
+            price = int(raw)
+        except ValueError:
+            pass

    # Картинка
-    image_url = None
-    imgs = it.get("images") or it.get("tileImage") or []
-    if isinstance(imgs, list) and imgs:
-        first = imgs[0]
-        image_url = first if isinstance(first, str) else (first.get("image") or first.get("src"))
-    if not image_url:
-        ti = it.get("tileImage") or {}
-        if isinstance(ti, dict):
-            items = ti.get("items") or []
-            for x in items:
-                if isinstance(x, dict) and x.get("image"):
-                    image_url = x["image"].get("link") or x["image"].get("src")
-                    break
+    img_url = None
+    img_el = card.find("img")
+    if img_el:
+        src = img_el.get("src") or img_el.get("data-src") or ""
+        if src and "data:image" not in src:
+            if src.startswith("//"):
+                src = "https:" + src
+            img_url = src

-    # Цена и название — берём из mainState текстовых атомов
-    title = ""
-    price_min = None
-    price_max = None
+    # Рейтинг (если есть на карточке — иногда показывают)
    rating = None
+    m = re.search(r"(\d[.,]\d)\s*\(?\d", full_text)  # "4.7 (1242 отзыва)"
+    if m:
+        try:
+            rating = float(m.group(1).replace(",", "."))
+            if rating > 5.0:
+                rating = None  # видимо не рейтинг
+        except ValueError:
+            pass
+
    reviews = None
-
-    for atom in (it.get("mainState") or []):
-        atom_id = atom.get("id") or ""
-        atom_type = atom.get("type") or ""
-
-        if atom_type == "textAtom":
-            text = ((atom.get("textAtom") or {}).get("text") or "").strip()
-            if "name" in atom_id.lower() and not title:
-                title = re.sub(r"<[^>]+>", "", text)
-            elif "price" in atom_id.lower():
-                m = _PRICE_RE.search(text)
-                if m and not price_min:
-                    price_min = int(m.group(1).replace(" ", "").replace(" ", ""))
-
-        elif atom_type == "priceV2":
-            pv = atom.get("priceV2") or {}
-            for price_obj in (pv.get("price") or []):
-                t = (price_obj.get("text") or "").strip()
-                m = _PRICE_RE.search(t)
-                if m:
-                    val = int(m.group(1).replace(" ", "").replace(" ", ""))
-                    if price_min is None or val < price_min:
-                        price_min = val
-                    if price_max is None or val > price_max:
-                        price_max = val
-
-        elif atom_type == "labelList":
-            for lbl in ((atom.get("labelList") or {}).get("items") or []):
-                t = (lbl.get("title") or "").strip()
-                # Рейтинг типа "4.7"
-                if re.fullmatch(r"\d\.\d", t):
-                    rating = float(t)
-                # Отзывы типа "1242 отзыва"
-                m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t)
-                if m:
-                    reviews = int(m.group(1).replace(" ", ""))
-
-    if not title:
-        # Резервный фолбэк — могут быть атомы в otherState
-        for atom in (it.get("otherState") or []):
-            text = ((atom.get("textAtom") or {}).get("text") or "").strip()
-            if text and len(text) > 5:
-                title = re.sub(r"<[^>]+>", "", text)
-                break
-
-    if not title:
-        return None
+    m = re.search(r"(\d[\d\s ]*)\s*(?:отзыв|оценок|review)", full_text, re.I)
+    if m:
+        try:
+            reviews = int(m.group(1).replace(" ", "").replace(" ", "").replace(" ", ""))
+        except ValueError:
+            pass

    return {
-        "title": title,
+        "title": title[:250],
        "url": url,
-        "image_url": image_url,
-        "price_min_rub": price_min,
-        "price_max_rub": price_max if price_max and price_max != price_min else None,
+        "image_url": img_url,
+        "price_min_rub": price,
+        "price_max_rub": None,
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": None,