From 82425dbd887c2948f78840054a10ba57f6763232 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 12:18:04 +0300 Subject: [PATCH] =?UTF-8?q?backend:=20Proxy6=20pool=20+=20parsers=20WB=20/?= =?UTF-8?q?=20OZON=20/=20=D0=AF.=D0=9C=D0=B0=D1=80=D0=BA=D0=B5=D1=82=20/?= =?UTF-8?q?=20DNS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PROXY POOL (app/proxy_pool.py): - Loads active proxies from Proxy6.net API every 10 min - Random rotation per request via proxied_client(timeout, headers) - Graceful fallback to direct HTTP if PROXY6_TOKEN not set - Config: PROXY6_TOKEN env var PARSERS (app/parsers/): - dns.py — refactored to use proxy_pool with retry+rotation on Qrator block - wb.py — Wildberries JSON API (search.wb.ru), retries on 429 - ozon.py — OZON composer-api JSON (widgetStates extraction) - yamarket.py — Я.Маркет HTML + embedded JSON parser - __init__.py — enrich_one() fans out to all sources, aggregates min/max prices, max rating, sum reviews - enrich_models() — batch enrich for AI by_category output NEW DIAGNOSTIC ENDPOINTS (main.py): - GET /api/parse_wb?q=...&limit=N - GET /api/parse_ozon?q=...&limit=N - GET /api/parse_yamarket?q=...&limit=N - GET /api/parse_all?q=... — fan-out + aggregate - GET /api/proxy_status — pool diagnostics (count, token configured, age) PODBOR (main.py): - _enrich_ai_with_dns -> _enrich_ai_marketplaces (uses all sources) DEPLOY: needs PROXY6_TOKEN in /opt/zov-tech/deploy/.env on VPS, then docker compose build + up -d backend --- backend-py/app/config.py | 3 + backend-py/app/main.py | 76 ++++++++-- backend-py/app/parsers/__init__.py | 133 ++++++++++++++-- backend-py/app/parsers/dns.py | 44 ++++-- backend-py/app/parsers/ozon.py | 188 +++++++++++++++++++++++ backend-py/app/parsers/wb.py | 135 +++++++++++++++++ backend-py/app/parsers/yamarket.py | 236 +++++++++++++++++++++++++++++ backend-py/app/proxy_pool.py | 102 +++++++++++++ 8 files changed, 876 insertions(+), 41 deletions(-) create mode 100644 backend-py/app/parsers/ozon.py create mode 100644 backend-py/app/parsers/wb.py create mode 100644 backend-py/app/parsers/yamarket.py create mode 100644 backend-py/app/proxy_pool.py diff --git a/backend-py/app/config.py b/backend-py/app/config.py index f31f4e6..8238168 100644 --- a/backend-py/app/config.py +++ b/backend-py/app/config.py @@ -19,6 +19,8 @@ class Config: active_period_days: int grace_period_days: int + proxy6_token: str # пусто = без прокси (прямой HTTP) + def _required(name: str) -> str: val = os.getenv(name) @@ -39,4 +41,5 @@ def get_config() -> Config: gigachat_scope=os.getenv("GIGACHAT_SCOPE", "GIGACHAT_API_PERS"), active_period_days=int(os.getenv("ACTIVE_PERIOD_DAYS", "90")), grace_period_days=int(os.getenv("GRACE_PERIOD_DAYS", "14")), + proxy6_token=os.getenv("PROXY6_TOKEN", ""), ) diff --git a/backend-py/app/main.py b/backend-py/app/main.py index 921e91c..17d8b60 100644 --- a/backend-py/app/main.py +++ b/backend-py/app/main.py @@ -11,8 +11,9 @@ from fastapi.responses import JSONResponse from .config import get_config from .auth import verify_init_data -from . import sheets, ai, telegram as tg -from .parsers import dns as parser_dns +from . import sheets, ai, telegram as tg, proxy_pool +from . import parsers +from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s") log = logging.getLogger("zov.backend") @@ -143,7 +144,7 @@ async def api_seed_admin(): @app.get("/api/parse_dns") async def api_parse_dns(q: str = "", limit: int = 1): - """Тестовый эндпоинт парсера DNS. Пример: /api/parse_dns?q=Bosch+KGN39&limit=3""" + """Тест парсера DNS.""" if not q: return {"error": "missing_query", "hint": "use ?q="} try: @@ -153,6 +154,57 @@ async def api_parse_dns(q: str = "", limit: int = 1): return {"ok": False, "error": str(e), "query": q} +@app.get("/api/parse_wb") +async def api_parse_wb(q: str = "", limit: int = 3): + if not q: + return {"error": "missing_query"} + try: + results = parser_wb.search_wb(q, limit=min(max(1, limit), 10)) + return {"ok": True, "query": q, "count": len(results), "results": results} + except Exception as e: + return {"ok": False, "error": str(e), "query": q} + + +@app.get("/api/parse_ozon") +async def api_parse_ozon(q: str = "", limit: int = 3): + if not q: + return {"error": "missing_query"} + try: + results = parser_ozon.search_ozon(q, limit=min(max(1, limit), 10)) + return {"ok": True, "query": q, "count": len(results), "results": results} + except Exception as e: + return {"ok": False, "error": str(e), "query": q} + + +@app.get("/api/parse_yamarket") +async def api_parse_yamarket(q: str = "", limit: int = 3): + if not q: + return {"error": "missing_query"} + try: + results = parser_ym.search_yamarket(q, limit=min(max(1, limit), 10)) + return {"ok": True, "query": q, "count": len(results), "results": results} + except Exception as e: + return {"ok": False, "error": str(e), "query": q} + + +@app.get("/api/parse_all") +async def api_parse_all(q: str = ""): + """Спрашивает все источники и возвращает агрегированный результат.""" + if not q: + return {"error": "missing_query"} + try: + data = parsers.enrich_one(q) + return {"ok": True, "query": q, "data": data} + except Exception as e: + return {"ok": False, "error": str(e), "query": q} + + +@app.get("/api/proxy_status") +async def api_proxy_status(): + """Диагностика: показывает текущее состояние proxy-пула.""" + return proxy_pool.pool_status() + + # ================================================================= # Handlers # ================================================================= @@ -289,13 +341,13 @@ def _handle_podbor(body: dict[str, Any]) -> dict[str, Any]: ) ai_result = ai.call_ai(user_prompt) - # Обогащение моделей DNS-парсингом - enrich_dns = body.get("enrich", True) - if enrich_dns: + # Обогащение моделей данными с маркетплейсов (WB / Я.Маркет / OZON / DNS) + enrich_enabled = body.get("enrich", True) + if enrich_enabled: try: - _enrich_ai_with_dns(ai_result) + _enrich_ai_marketplaces(ai_result) except Exception as e: - log.warning("DNS enrich failed: %s", e) + log.warning("Marketplace enrich failed: %s", e) # Update lead row with AI response sheets.update_cell_by_key("Leads", "id", lead_id, "ai_response", @@ -311,8 +363,10 @@ def _handle_podbor(body: dict[str, Any]) -> dict[str, Any]: return {"ok": True, "id": lead_id, "summary": summary_text, "ai": ai_result.get("json")} -def _enrich_ai_with_dns(ai_result: dict[str, Any]) -> None: - """Берёт ai_result['json']['by_category'][cat]['models'] и обогащает каждую DNS-данными.""" +def _enrich_ai_marketplaces(ai_result: dict[str, Any]) -> None: + """Обогащает каждую модель из ai_result['json']['by_category'] данными + с маркетплейсов (WB / Я.Маркет / OZON / DNS). Если PROXY6_TOKEN не задан — + скорее всего вернёт пустые данные (Qrator блокирует прямые HTTP).""" j = ai_result.get("json") if not j or not isinstance(j, dict): return @@ -321,7 +375,7 @@ def _enrich_ai_with_dns(ai_result: dict[str, Any]) -> None: if not isinstance(cat_data, dict): continue models = cat_data.get("models") or [] - cat_data["models"] = parser_dns.enrich_models(models, delay_sec=0.4) + cat_data["models"] = parsers.enrich_models(models, delay_sec=0.4) def _handle_test_ai() -> dict[str, Any]: diff --git a/backend-py/app/parsers/__init__.py b/backend-py/app/parsers/__init__.py index 30c3b74..ea72dfd 100644 --- a/backend-py/app/parsers/__init__.py +++ b/backend-py/app/parsers/__init__.py @@ -1,27 +1,132 @@ """Парсеры маркетплейсов для обогащения карточек моделей. -Подход MVP: парсим публичные HTML-страницы напрямую с VPS (без прокси). -При обнаружении anti-bot блокировок — переходим на резидентные прокси (Proxy6). +Все парсеры используют общий proxy_pool (Proxy6.net), если PROXY6_TOKEN задан. +Без прокси крупные маркетплейсы РФ (DNS, OZON, Я.Маркет) возвращают 401/307. Источники: -- dns.py — DNS Shop (dns-shop.ru) — самый простой anti-bot, основной источник характеристик -- yamarket.py — Я.Маркет (market.yandex.ru) — для сравнения цен между магазинами -- wildberries.py — Wildberries (wildberries.ru) — для отзывов и рейтингов +- dns.py — DNS Shop (характеристики, цена одного магазина) +- wb.py — Wildberries (JSON API, цена + отзывы + рейтинг) +- ozon.py — OZON (composer-api JSON) +- yamarket.py — Я.Маркет (HTML + встроенный JSON, сравнение цен) -Унифицированный формат результата: +Унифицированный формат результата (item): { "title": str, # Название как на странице "url": str, # Ссылка на товар "image_url": str | None, # URL основного фото - "price_min_rub": int | None, # Минимальная найденная цена - "price_max_rub": int | None, # Максимальная (если есть данные по нескольким магазинам) + "price_min_rub": int | None, + "price_max_rub": int | None, "rating": float | None, # 0.0 - 5.0 - "reviews_count": int | None, # Кол-во отзывов - "stores_count": int | None, # На скольких сайтах найдено (Я.Маркет) - "specs": dict[str, str], # Ключевые характеристики - "source": str, # "dns" / "yamarket" / "wildberries" + "reviews_count": int | None, + "stores_count": int | None, # Только Я.Маркет (сравнение) + "specs": dict[str, str], + "source": str, # 'dns' | 'wb' | 'ozon' | 'yamarket' } """ -from .dns import search_dns +from __future__ import annotations +import logging +import time +from typing import Any -__all__ = ["search_dns"] +from .dns import search_dns +from .wb import search_wb +from .ozon import search_ozon +from .yamarket import search_yamarket + +log = logging.getLogger("zov.parser") + +__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket", + "enrich_one", "enrich_models"] + + +def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> dict[str, Any]: + """Спрашивает все указанные источники и объединяет лучшее в единый отчёт. + + Возвращает: + { + "wb": {item dict} или None, + "ozon": {item dict} или None, + "yamarket": {item dict} или None, + "dns": {item dict} или None, + "price_min_rub": int | None, # минимум по всем источникам + "price_max_rub": int | None, + "image_url": str | None, + "rating_max": float | None, + "reviews_total": int | None, + "stores_count": int | None, # макс. из yamarket + "best_url": str | None, + } + """ + fetchers = { + "wb": lambda: _safe_first(search_wb, query), + "yamarket": lambda: _safe_first(search_yamarket, query), + "ozon": lambda: _safe_first(search_ozon, query), + "dns": lambda: _safe_first(search_dns, query), + } + + items: dict[str, dict] = {} + for src in sources: + fn = fetchers.get(src) + if not fn: + continue + try: + items[src] = fn() + except Exception as e: + log.warning("Source %s failed for %r: %s", src, query, e) + items[src] = None + + # Агрегация + prices = [i["price_min_rub"] for i in items.values() if i and i.get("price_min_rub")] + images = [i["image_url"] for i in items.values() if i and i.get("image_url")] + ratings = [i["rating"] for i in items.values() if i and i.get("rating")] + reviews = [i["reviews_count"] for i in items.values() if i and i.get("reviews_count")] + + # Я.Маркет даёт количество магазинов + stores = None + if items.get("yamarket") and items["yamarket"].get("stores_count"): + stores = items["yamarket"]["stores_count"] + + best_url = None + # Приоритет: yamarket (агрегатор) → wb → ozon → dns + for src in ("yamarket", "wb", "ozon", "dns"): + i = items.get(src) + if i and i.get("url"): + best_url = i["url"] + break + + return { + **{src: items.get(src) for src in fetchers.keys()}, + "price_min_rub": min(prices) if prices else None, + "price_max_rub": max(prices) if prices else None, + "image_url": images[0] if images else None, + "rating_max": max(ratings) if ratings else None, + "reviews_total": sum(reviews) if reviews else None, + "stores_count": stores, + "best_url": best_url, + } + + +def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5, + sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> list[dict[str, Any]]: + """Обогащает список моделей от AI данными со всех источников.""" + enriched: list[dict[str, Any]] = [] + for i, m in enumerate(models): + q = m.get("search_query") or f"{m.get('brand', '')} {m.get('model', '')}".strip() + if not q: + enriched.append({**m, "enriched": None}) + continue + try: + data = enrich_one(q, sources=sources) + except Exception as e: + log.warning("Enrich failed for %r: %s", q, e) + data = None + enriched.append({**m, "enriched": data}) + if i < len(models) - 1 and delay_sec > 0: + time.sleep(delay_sec) + return enriched + + +def _safe_first(search_fn, query: str) -> dict[str, Any] | None: + """Вызывает поиск и возвращает первый результат или None.""" + results = search_fn(query, limit=1) + return results[0] if results else None diff --git a/backend-py/app/parsers/dns.py b/backend-py/app/parsers/dns.py index 81b59bf..50fd6c4 100644 --- a/backend-py/app/parsers/dns.py +++ b/backend-py/app/parsers/dns.py @@ -16,6 +16,8 @@ from urllib.parse import quote_plus import httpx from bs4 import BeautifulSoup +from .. import proxy_pool + log = logging.getLogger("zov.parser.dns") _BASE_URL = "https://www.dns-shop.ru" @@ -42,31 +44,41 @@ _HEADERS = { _PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽") -def search_dns(query: str, limit: int = 1, timeout: float = 12.0) -> list[dict[str, Any]]: +def search_dns(query: str, limit: int = 1, timeout: float = 12.0, + max_retries: int = 2) -> list[dict[str, Any]]: """Поиск товара на DNS по строке запроса. - Возвращает список результатов (топ-N). Каждый элемент — унифицированный - формат (см. parsers/__init__.py). Пустой список при ошибке. + Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую. + DNS защищён Qrator — без прокси скорее всего 401. + + Возвращает список результатов (топ-N) или пустой при ошибке. """ url = f"{_SEARCH_URL}?q={quote_plus(query)}" log.info("DNS search: %s", url) - try: - with httpx.Client(headers=_HEADERS, timeout=timeout, follow_redirects=True) as client: - resp = client.get(url) - except httpx.HTTPError as e: - log.warning("DNS request failed: %s", e) - return [] + last_err = None + for attempt in range(max_retries + 1): + try: + with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS, + follow_redirects=True) as client: + resp = client.get(url) + except httpx.HTTPError as e: + last_err = e + log.warning("DNS request failed (attempt %d): %s", attempt + 1, e) + continue - if resp.status_code != 200: - log.warning("DNS returned %s for query=%r", resp.status_code, query) - return [] + if resp.status_code == 200: + text = resp.text + if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower(): + log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1) + continue + return _parse_search_html(text, limit=limit) - if "challenge" in resp.text.lower() or "captcha" in resp.text.lower(): - log.warning("DNS anti-bot challenge detected for query=%r", query) - return [] + log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1) - return _parse_search_html(resp.text, limit=limit) + log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)", + max_retries + 1, query, last_err) + return [] def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]: diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py new file mode 100644 index 0000000..0bfd177 --- /dev/null +++ b/backend-py/app/parsers/ozon.py @@ -0,0 +1,188 @@ +"""Парсер OZON — через composer-api (внутренний JSON API сайта). + +OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`. +JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`. + +Без прокси возвращает 307/403. Через резидентный РФ-IP проходит. +""" +from __future__ import annotations +import logging +import re +from typing import Any +from urllib.parse import quote_plus + +import httpx + +from .. import proxy_pool + +log = logging.getLogger("zov.parser.ozon") + +_BASE_URL = "https://www.ozon.ru" +_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2" + +_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + "Accept": "application/json", + "Accept-Language": "ru-RU,ru;q=0.9", + "x-o3-app-name": "dweb_client", + "x-o3-app-version": "release_18.04", + "x-o3-page-type": "search", + "Referer": "https://www.ozon.ru/", +} + +_PRICE_RE = re.compile(r"([\d\s]+)\s*₽") + + +def search_ozon(query: str, limit: int = 3, timeout: float = 15.0, + max_retries: int = 2) -> list[dict[str, Any]]: + """Поиск товара в OZON через composer-api.""" + url_param = f"/search/?text={quote_plus(query)}&from_global=true" + params = {"url": url_param} + + for attempt in range(max_retries + 1): + try: + with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS, + follow_redirects=False) as client: + resp = client.get(_API_URL, params=params) + except httpx.HTTPError as e: + log.warning("OZON request failed (attempt %d): %s", attempt + 1, e) + continue + + if resp.status_code in (301, 302, 307, 308): + log.info("OZON redirect %s, rotating proxy", resp.status_code) + continue + if resp.status_code != 200: + log.warning("OZON returned status=%s", resp.status_code) + continue + + try: + data = resp.json() + except Exception as e: + log.warning("OZON JSON parse failed: %s", e) + continue + + return _extract_products(data, limit=limit) + + log.warning("OZON gave up after %d attempts for query=%r", max_retries + 1, query) + return [] + + +def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]: + """OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'.""" + widget_states = data.get("widgetStates") or {} + products: list[dict[str, Any]] = [] + + for key, raw in widget_states.items(): + if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key: + continue + try: + import json as _j + w = _j.loads(raw) if isinstance(raw, str) else raw + except Exception: + continue + + items = w.get("items") or w.get("products") or [] + for it in items: + if len(products) >= limit: + break + item = _build_item(it) + if item: + products.append(item) + if len(products) >= limit: + break + + return products + + +def _build_item(it: dict[str, Any]) -> dict[str, Any] | None: + """Парсит карточку товара из OZON widget items[].""" + # Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] } + sku = it.get("sku") or it.get("id") + if not sku: + return None + + link = (it.get("action") or {}).get("link") or "" + url = f"{_BASE_URL}{link}" if link.startswith("/") else link + + # Картинка + image_url = None + imgs = it.get("images") or it.get("tileImage") or [] + if isinstance(imgs, list) and imgs: + first = imgs[0] + image_url = first if isinstance(first, str) else (first.get("image") or first.get("src")) + if not image_url: + ti = it.get("tileImage") or {} + if isinstance(ti, dict): + items = ti.get("items") or [] + for x in items: + if isinstance(x, dict) and x.get("image"): + image_url = x["image"].get("link") or x["image"].get("src") + break + + # Цена и название — берём из mainState текстовых атомов + title = "" + price_min = None + price_max = None + rating = None + reviews = None + + for atom in (it.get("mainState") or []): + atom_id = atom.get("id") or "" + atom_type = atom.get("type") or "" + + if atom_type == "textAtom": + text = ((atom.get("textAtom") or {}).get("text") or "").strip() + if "name" in atom_id.lower() and not title: + title = re.sub(r"<[^>]+>", "", text) + elif "price" in atom_id.lower(): + m = _PRICE_RE.search(text) + if m and not price_min: + price_min = int(m.group(1).replace(" ", "").replace(" ", "")) + + elif atom_type == "priceV2": + pv = atom.get("priceV2") or {} + for price_obj in (pv.get("price") or []): + t = (price_obj.get("text") or "").strip() + m = _PRICE_RE.search(t) + if m: + val = int(m.group(1).replace(" ", "").replace(" ", "")) + if price_min is None or val < price_min: + price_min = val + if price_max is None or val > price_max: + price_max = val + + elif atom_type == "labelList": + for lbl in ((atom.get("labelList") or {}).get("items") or []): + t = (lbl.get("title") or "").strip() + # Рейтинг типа "4.7" + if re.fullmatch(r"\d\.\d", t): + rating = float(t) + # Отзывы типа "1242 отзыва" + m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t) + if m: + reviews = int(m.group(1).replace(" ", "")) + + if not title: + # Резервный фолбэк — могут быть атомы в otherState + for atom in (it.get("otherState") or []): + text = ((atom.get("textAtom") or {}).get("text") or "").strip() + if text and len(text) > 5: + title = re.sub(r"<[^>]+>", "", text) + break + + if not title: + return None + + return { + "title": title, + "url": url, + "image_url": image_url, + "price_min_rub": price_min, + "price_max_rub": price_max if price_max and price_max != price_min else None, + "rating": rating, + "reviews_count": reviews, + "stores_count": None, + "specs": {}, + "source": "ozon", + } diff --git a/backend-py/app/parsers/wb.py b/backend-py/app/parsers/wb.py new file mode 100644 index 0000000..c485c41 --- /dev/null +++ b/backend-py/app/parsers/wb.py @@ -0,0 +1,135 @@ +"""Парсер Wildberries — через их JSON API. + +Endpoint search.wb.ru отдаёт чистый JSON с товарами. Цены в копейках/u +(делим на 100). У товаров есть rating, feedbacks (отзывы), brand. + +Цена /salePriceU/ — итоговая со скидкой, /priceU/ — RRP. +""" +from __future__ import annotations +import logging +from typing import Any +from urllib.parse import quote_plus + +import httpx + +from .. import proxy_pool + +log = logging.getLogger("zov.parser.wb") + +_SEARCH_URL = "https://search.wb.ru/exactmatch/ru/common/v9/search" +_DEFAULT_PARAMS = { + "TestGroup": "no_test", + "TestID": "no_test", + "appType": "1", + "curr": "rub", + "dest": "-1257786", # Москва, можно поменять + "resultset": "catalog", + "sort": "popular", + "spp": "30", + "suppressSpellcheck": "false", +} +_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + "Accept": "*/*", + "Accept-Language": "ru-RU,ru;q=0.9", + "Origin": "https://www.wildberries.ru", + "Referer": "https://www.wildberries.ru/", +} + + +def search_wb(query: str, limit: int = 3, timeout: float = 12.0, + max_retries: int = 2) -> list[dict[str, Any]]: + params = {**_DEFAULT_PARAMS, "query": query} + + for attempt in range(max_retries + 1): + try: + with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS) as client: + resp = client.get(_SEARCH_URL, params=params) + except httpx.HTTPError as e: + log.warning("WB request failed (attempt %d): %s", attempt + 1, e) + continue + + if resp.status_code == 429: + log.warning("WB rate-limited on attempt %d, rotating proxy", attempt + 1) + continue + if resp.status_code != 200: + log.warning("WB returned status=%s", resp.status_code) + continue + + try: + data = resp.json() + except Exception as e: + log.warning("WB JSON parse failed: %s", e) + continue + + products = (data.get("data") or {}).get("products") or [] + if not products: + log.info("WB no products for query=%r", query) + return [] + + results: list[dict[str, Any]] = [] + for p in products[:limit]: + results.append(_build_item(p)) + return results + + log.warning("WB gave up after %d attempts for query=%r", max_retries + 1, query) + return [] + + +def _build_item(p: dict[str, Any]) -> dict[str, Any]: + sale_u = p.get("salePriceU") or 0 + price_u = p.get("priceU") or 0 + # WB цена в копейках (или /100). Старое поле было в копейках, иногда в условных единицах. + # Делим на 100 — стандартный паттерн. + price_min = (sale_u // 100) if sale_u else (price_u // 100 if price_u else None) + price_max = (price_u // 100) if price_u and price_u != sale_u else None + + # Если у товара есть варианты sizes — берём минимальную цену оттуда + sizes = p.get("sizes") or [] + if sizes: + size_prices = [] + for s in sizes: + sp = (s.get("price") or {}).get("product") or 0 + if sp: + size_prices.append(sp // 100) + if size_prices: + price_min = min(size_prices) + if len(size_prices) > 1: + price_max = max(size_prices) + + pid = p.get("id") + image_url = _build_image_url(pid) if pid else None + + return { + "title": p.get("name") or "", + "url": f"https://www.wildberries.ru/catalog/{pid}/detail.aspx" if pid else "", + "image_url": image_url, + "price_min_rub": price_min, + "price_max_rub": price_max if price_max and price_max != price_min else None, + "rating": p.get("reviewRating") or p.get("rating"), + "reviews_count": p.get("feedbacks"), + "stores_count": None, + "specs": { + "brand": p.get("brand", ""), + "supplier": p.get("supplier", ""), + }, + "source": "wb", + } + + +def _build_image_url(product_id: int) -> str: + """WB хранит фото на nm-1..20.wbbasket.ru. URL зависит от диапазона id.""" + pid = int(product_id) + short = pid // 100000 + # Маппинг WB корзин (упрощённый) + if pid < 144_000_000: basket = (short // 1431) + 1 + elif pid < 287_000_000: basket = (short // 1431) + 1 + else: basket = (short // 1431) + 1 + # Безопасный fallback — basket 10 покрывает почти все ID + if basket < 1 or basket > 25: + basket = 10 + bn = str(basket).zfill(2) + vol = pid // 100000 + part = pid // 1000 + return f"https://basket-{bn}.wbbasket.ru/vol{vol}/part{part}/{pid}/images/big/1.webp" diff --git a/backend-py/app/parsers/yamarket.py b/backend-py/app/parsers/yamarket.py new file mode 100644 index 0000000..d3a772b --- /dev/null +++ b/backend-py/app/parsers/yamarket.py @@ -0,0 +1,236 @@ +"""Парсер Я.Маркета — HTML страница поиска. + +Я.Маркет защищён Qrator. Через резидентный РФ-IP + правильные заголовки ++ cookies на сессию обычно проходит. Без прокси — 401. + +Из HTML вытаскиваем JSON, который Я.Маркет встраивает в