diff --git a/backend-py/app/main.py b/backend-py/app/main.py index 265b360..a5d82f7 100644 --- a/backend-py/app/main.py +++ b/backend-py/app/main.py @@ -13,7 +13,7 @@ from .config import get_config from .auth import verify_init_data from . import sheets, ai, telegram as tg, proxy_pool from . import parsers -from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym +from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym, citilink as parser_cl logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s") log = logging.getLogger("zov.backend") @@ -195,6 +195,17 @@ def api_parse_yamarket(q: str = "", limit: int = 3): return {"ok": False, "error": str(e), "query": q} +@app.get("/api/parse_citilink") +def api_parse_citilink(q: str = "", limit: int = 3): + if not q: + return {"error": "missing_query"} + try: + results = parser_cl.search_citilink(q, limit=min(max(1, limit), 10)) + return {"ok": True, "query": q, "count": len(results), "results": results} + except Exception as e: + return {"ok": False, "error": str(e), "query": q} + + @app.get("/api/parse_all") def api_parse_all(q: str = ""): """Спрашивает все источники и возвращает агрегированный результат.""" diff --git a/backend-py/app/parsers/__init__.py b/backend-py/app/parsers/__init__.py index ea72dfd..1e659df 100644 --- a/backend-py/app/parsers/__init__.py +++ b/backend-py/app/parsers/__init__.py @@ -32,14 +32,22 @@ from .dns import search_dns from .wb import search_wb from .ozon import search_ozon from .yamarket import search_yamarket +from .citilink import search_citilink log = logging.getLogger("zov.parser") -__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket", +__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket", "search_citilink", "enrich_one", "enrich_models"] +# Источники по умолчанию (работают с DC-IP без прокси): +# - ozon, citilink: Playwright +# - wb: прямой JSON API (с задержкой) +# Опциональные (требуют residential proxy): +# - yamarket, dns +DEFAULT_SOURCES = ("ozon", "citilink", "wb") -def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> dict[str, Any]: + +def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]: """Спрашивает все указанные источники и объединяет лучшее в единый отчёт. Возвращает: @@ -59,8 +67,9 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) - """ fetchers = { "wb": lambda: _safe_first(search_wb, query), - "yamarket": lambda: _safe_first(search_yamarket, query), "ozon": lambda: _safe_first(search_ozon, query), + "citilink": lambda: _safe_first(search_citilink, query), + "yamarket": lambda: _safe_first(search_yamarket, query), "dns": lambda: _safe_first(search_dns, query), } @@ -87,8 +96,8 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) - stores = items["yamarket"]["stores_count"] best_url = None - # Приоритет: yamarket (агрегатор) → wb → ozon → dns - for src in ("yamarket", "wb", "ozon", "dns"): + # Приоритет: ozon → citilink → wb → yamarket → dns + for src in ("ozon", "citilink", "wb", "yamarket", "dns"): i = items.get(src) if i and i.get("url"): best_url = i["url"] @@ -107,7 +116,7 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) - def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5, - sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> list[dict[str, Any]]: + sources: tuple = DEFAULT_SOURCES) -> list[dict[str, Any]]: """Обогащает список моделей от AI данными со всех источников.""" enriched: list[dict[str, Any]] = [] for i, m in enumerate(models): diff --git a/backend-py/app/parsers/citilink.py b/backend-py/app/parsers/citilink.py new file mode 100644 index 0000000..8f9ee23 --- /dev/null +++ b/backend-py/app/parsers/citilink.py @@ -0,0 +1,157 @@ +"""Парсер Citilink (citilink.ru) — через Playwright. + +Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует +прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`. +""" +from __future__ import annotations +import logging +import re +from typing import Any +from urllib.parse import quote_plus + +from bs4 import BeautifulSoup + +from . import playwright_engine + +log = logging.getLogger("zov.parser.citilink") + +_BASE_URL = "https://www.citilink.ru" +_SEARCH_URL = "https://www.citilink.ru/search/" +_PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽|(\d[\d\s ]+)\s*руб") + + +def search_citilink(query: str, limit: int = 3, timeout: float = 30.0, + max_retries: int = 1) -> list[dict[str, Any]]: + """Поиск товара на Citilink через Playwright.""" + url = f"{_SEARCH_URL}?text={quote_plus(query)}" + + html = None + for attempt in range(max_retries + 1): + html = playwright_engine.fetch_page( + url, + wait_selector="[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']", + wait_ms=4000, + timeout_ms=int(timeout * 1000), + ) + if html: + break + + if not html: + log.warning("Citilink: no HTML for query=%r", query) + return [] + + return _parse_html(html, limit=limit) + + +def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: + soup = BeautifulSoup(html, "html.parser") + results: list[dict[str, Any]] = [] + + # Карточки товаров + cards = ( + soup.select("[data-meta-name*='Snippet']") + or soup.select("[data-meta-name*='ProductCard']") + or soup.select("div.ProductCardHorizontal") + ) + + for card in cards: + if len(results) >= limit: + break + item = _extract_card(card) + if item: + results.append(item) + + return results + + +def _extract_card(card) -> dict[str, Any] | None: + """Достаём title, url, цену, картинку, рейтинг, отзывы.""" + # Ссылка на товар + link = card.select_one("a[href*='/product/']") or card.find("a", href=True) + if not link: + return None + href = link.get("href") or "" + if "/product/" not in href and "/promo/" not in href: + return None + url = href if href.startswith("http") else f"{_BASE_URL}{href}" + + # Название + title = "" + # Citilink использует разные классы — пробуем несколько + for sel in [ + "[data-meta-name*='Snippet__title']", + "[data-meta-name*='ProductCardHorizontal__title']", + "a[href*='/product/'] span", + "a[title]", + ]: + el = card.select_one(sel) + if el: + title = (el.get("title") or el.get_text(strip=True)).strip() + if title and len(title) > 5: + break + if not title: + # Резерв — длинный текст в карточке + for s in card.find_all(["span", "div"]): + t = s.get_text(strip=True) + if t and 15 < len(t) < 200 and "₽" not in t and "%" not in t: + title = t + break + if not title or len(title) < 5: + return None + + full_text = card.get_text(" ", strip=True) + + # Цена + price = None + for m in _PRICE_RE.finditer(full_text): + raw = (m.group(1) or m.group(2) or "").replace(" ", "").replace(" ", "").replace(" ", "") + try: + v = int(raw) + if 100 < v < 10_000_000: # разумные пределы + price = v + break + except ValueError: + pass + + # Картинка + img_url = None + img_el = card.find("img") + if img_el: + src = img_el.get("src") or img_el.get("data-src") or "" + if src and "data:image" not in src: + if src.startswith("//"): + src = "https:" + src + img_url = src + + # Рейтинг + rating = None + m = re.search(r"(\d[.,]\d)\s*[\\(\\d]", full_text) + if m: + try: + r = float(m.group(1).replace(",", ".")) + if 0 < r <= 5.0: + rating = r + except ValueError: + pass + + # Отзывы + reviews = None + m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", full_text) + if m: + try: + reviews = int(m.group(1).replace(" ", "").replace(" ", "")) + except ValueError: + pass + + return { + "title": title[:250], + "url": url, + "image_url": img_url, + "price_min_rub": price, + "price_max_rub": None, + "rating": rating, + "reviews_count": reviews, + "stores_count": None, + "specs": {}, + "source": "citilink", + } diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py index 36887dc..5bf1128 100644 --- a/backend-py/app/parsers/ozon.py +++ b/backend-py/app/parsers/ozon.py @@ -1,9 +1,9 @@ -"""Парсер OZON — через composer-api (внутренний JSON API сайта). +"""Парсер OZON — через Playwright (рендер JS). -OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`. -JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`. +OZON блокирует прямой HTTP с DC-IP (403/307). С Playwright рендерит обычную +HTML-страницу `/search/?text=…`, в которой есть карточки `a[href*='/product/']`. -Без прокси возвращает 307/403. Через резидентный РФ-IP проходит. +В карточке: название, цена, картинка, рейтинг, отзывы — в DOM рядом со ссылкой. """ from __future__ import annotations import logging @@ -11,249 +11,141 @@ import re from typing import Any from urllib.parse import quote_plus -import httpx from bs4 import BeautifulSoup -from .. import proxy_pool from . import playwright_engine log = logging.getLogger("zov.parser.ozon") _BASE_URL = "https://www.ozon.ru" -_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2" - -_HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", - "Accept": "application/json", - "Accept-Language": "ru-RU,ru;q=0.9", - "x-o3-app-name": "dweb_client", - "x-o3-app-version": "release_18.04", - "x-o3-page-type": "search", - "Referer": "https://www.ozon.ru/", -} - -_PRICE_RE = re.compile(r"([\d\s]+)\s*₽") +_PRICE_RE = re.compile(r"(\d[\d\s  ]+)\s*₽") def search_ozon(query: str, limit: int = 3, timeout: float = 30.0, - max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]: - """Поиск товара в OZON. + max_retries: int = 1) -> list[dict[str, Any]]: + """Поиск товара в OZON через Playwright.""" + url = f"{_BASE_URL}/search/?text={quote_plus(query)}" - Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно). - """ - # Путь 1: быстрый composer-api - url_param = f"/search/?text={quote_plus(query)}&from_global=true" - params = {"url": url_param} + html = None for attempt in range(max_retries + 1): - try: - with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS, - follow_redirects=False) as client: - resp = client.get(_API_URL, params=params) - if resp.status_code == 200: - try: - return _extract_products(resp.json(), limit=limit) - except Exception: - pass - log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code) - except httpx.HTTPError as e: - log.debug("OZON composer-api err: %s", e) + html = playwright_engine.fetch_page( + url, + wait_selector="a[href*='/product/']", + wait_ms=4000, + timeout_ms=int(timeout * 1000), + ) + if html: + break - # Путь 2: Playwright (рендерим обычную HTML-страницу поиска) - if not use_playwright: - return [] - log.info("OZON falling back to Playwright for query=%r", query) - page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}" - html = playwright_engine.fetch_page( - page_url, - wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']", - wait_ms=3500, - timeout_ms=int(timeout * 1000), - ) if not html: + log.warning("OZON: no HTML for query=%r", query) return [] - return _parse_html_via_dom(html, limit=limit) + if "robotcheck" in html.lower() or "challenge" in html.lower()[:5000]: + log.warning("OZON: anti-bot challenge for query=%r", query) + return [] + + return _parse_html(html, limit=limit) -def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]: - """Fallback: парсим товары из отрендеренного Chrome HTML.""" +def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: soup = BeautifulSoup(html, "html.parser") - seen = set() results: list[dict[str, Any]] = [] + seen_urls = set() - for link in soup.select("a[href*='/product/']"): + # Находим все ссылки на товары + product_links = soup.select("a[href*='/product/']") + + for link in product_links: if len(results) >= limit: break + href = link.get("href") or "" - if href in seen: + # Нормализация URL — убираем query params для дедупа + url_clean = href.split("?")[0] + if url_clean in seen_urls: continue - seen.add(href) + seen_urls.add(url_clean) - # Поднимаемся до карточки - card = link.find_parent("div") or link - title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else "" - if not title or len(title) < 5: + full_url = href if href.startswith("http") else f"{_BASE_URL}{href}" + + # Поднимаемся до карточки — у OZON это обычно ближайший div с tile-* классом + card = ( + link.find_parent("div", class_=re.compile("tile|search-item|product")) + or link.find_parent("div") + ) + if not card: continue - url = href if href.startswith("http") else f"{_BASE_URL}{href}" - url = url.split("?")[0] - - # Цена в ближайшем родителе - price = None - price_card = link.find_parent("div", recursive=True) - if price_card: - txt = price_card.get_text(" ", strip=True) - m = _PRICE_RE.search(txt) - if m: - price = _try_int(m.group(1).replace(" ", "")) - - # Картинка в карточке - img = None - img_el = card.find("img") if card else None - if img_el: - src = img_el.get("src") or "" - if src.startswith("//"): - src = "https:" + src - if src and "data:image" not in src: - img = src - - results.append({ - "title": title[:200], - "url": url, - "image_url": img, - "price_min_rub": price, - "price_max_rub": None, - "rating": None, - "reviews_count": None, - "stores_count": None, - "specs": {}, - "source": "ozon", - }) + item = _extract_from_card(card, full_url, link) + if item and item.get("title") and len(item["title"]) > 5: + results.append(item) return results -def _try_int(v: Any) -> int | None: - if v is None: - return None - try: - return int(float(str(v).replace(" ", "").replace(",", "."))) - except (ValueError, TypeError): - return None +def _extract_from_card(card, url: str, link_el) -> dict[str, Any] | None: + """Достаём данные из карточки OZON: title, price, image, rating, reviews.""" + full_text = card.get_text(" ", strip=True) - -def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]: - """OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'.""" - widget_states = data.get("widgetStates") or {} - products: list[dict[str, Any]] = [] - - for key, raw in widget_states.items(): - if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key: - continue - try: - import json as _j - w = _j.loads(raw) if isinstance(raw, str) else raw - except Exception: - continue - - items = w.get("items") or w.get("products") or [] - for it in items: - if len(products) >= limit: + # Название — может быть прямо в ссылке, либо в соседнем span + title = link_el.get("title") or link_el.get_text(strip=True) or "" + if not title or len(title) < 5 or title in ("Распродажа", "Скидка", "Топ"): + # Ищем во вложенных span — обычно длинные строки = название + spans = card.find_all("span") + for s in spans: + t = s.get_text(strip=True) + if t and len(t) > 15 and len(t) < 200 and "₽" not in t and "%" not in t: + title = t break - item = _build_item(it) - if item: - products.append(item) - if len(products) >= limit: - break - - return products - - -def _build_item(it: dict[str, Any]) -> dict[str, Any] | None: - """Парсит карточку товара из OZON widget items[].""" - # Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] } - sku = it.get("sku") or it.get("id") - if not sku: + title = title.strip() + if not title or len(title) < 5: return None - link = (it.get("action") or {}).get("link") or "" - url = f"{_BASE_URL}{link}" if link.startswith("/") else link + # Цена — первое число с ₽ в карточке (минимальная) + price = None + m = _PRICE_RE.search(full_text) + if m: + raw = m.group(1).replace(" ", "").replace(" ", "").replace(" ", "") + try: + price = int(raw) + except ValueError: + pass # Картинка - image_url = None - imgs = it.get("images") or it.get("tileImage") or [] - if isinstance(imgs, list) and imgs: - first = imgs[0] - image_url = first if isinstance(first, str) else (first.get("image") or first.get("src")) - if not image_url: - ti = it.get("tileImage") or {} - if isinstance(ti, dict): - items = ti.get("items") or [] - for x in items: - if isinstance(x, dict) and x.get("image"): - image_url = x["image"].get("link") or x["image"].get("src") - break + img_url = None + img_el = card.find("img") + if img_el: + src = img_el.get("src") or img_el.get("data-src") or "" + if src and "data:image" not in src: + if src.startswith("//"): + src = "https:" + src + img_url = src - # Цена и название — берём из mainState текстовых атомов - title = "" - price_min = None - price_max = None + # Рейтинг (если есть на карточке — иногда показывают) rating = None + m = re.search(r"(\d[.,]\d)\s*\(?\d", full_text) # "4.7 (1242 отзыва)" + if m: + try: + rating = float(m.group(1).replace(",", ".")) + if rating > 5.0: + rating = None # видимо не рейтинг + except ValueError: + pass + reviews = None - - for atom in (it.get("mainState") or []): - atom_id = atom.get("id") or "" - atom_type = atom.get("type") or "" - - if atom_type == "textAtom": - text = ((atom.get("textAtom") or {}).get("text") or "").strip() - if "name" in atom_id.lower() and not title: - title = re.sub(r"<[^>]+>", "", text) - elif "price" in atom_id.lower(): - m = _PRICE_RE.search(text) - if m and not price_min: - price_min = int(m.group(1).replace(" ", "").replace(" ", "")) - - elif atom_type == "priceV2": - pv = atom.get("priceV2") or {} - for price_obj in (pv.get("price") or []): - t = (price_obj.get("text") or "").strip() - m = _PRICE_RE.search(t) - if m: - val = int(m.group(1).replace(" ", "").replace(" ", "")) - if price_min is None or val < price_min: - price_min = val - if price_max is None or val > price_max: - price_max = val - - elif atom_type == "labelList": - for lbl in ((atom.get("labelList") or {}).get("items") or []): - t = (lbl.get("title") or "").strip() - # Рейтинг типа "4.7" - if re.fullmatch(r"\d\.\d", t): - rating = float(t) - # Отзывы типа "1242 отзыва" - m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t) - if m: - reviews = int(m.group(1).replace(" ", "")) - - if not title: - # Резервный фолбэк — могут быть атомы в otherState - for atom in (it.get("otherState") or []): - text = ((atom.get("textAtom") or {}).get("text") or "").strip() - if text and len(text) > 5: - title = re.sub(r"<[^>]+>", "", text) - break - - if not title: - return None + m = re.search(r"(\d[\d\s ]*)\s*(?:отзыв|оценок|review)", full_text, re.I) + if m: + try: + reviews = int(m.group(1).replace(" ", "").replace(" ", "").replace(" ", "")) + except ValueError: + pass return { - "title": title, + "title": title[:250], "url": url, - "image_url": image_url, - "price_min_rub": price_min, - "price_max_rub": price_max if price_max and price_max != price_min else None, + "image_url": img_url, + "price_min_rub": price, + "price_max_rub": None, "rating": rating, "reviews_count": reviews, "stores_count": None,