mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 22:24:48 +00:00
PROXY POOL (app/proxy_pool.py): - Loads active proxies from Proxy6.net API every 10 min - Random rotation per request via proxied_client(timeout, headers) - Graceful fallback to direct HTTP if PROXY6_TOKEN not set - Config: PROXY6_TOKEN env var PARSERS (app/parsers/): - dns.py — refactored to use proxy_pool with retry+rotation on Qrator block - wb.py — Wildberries JSON API (search.wb.ru), retries on 429 - ozon.py — OZON composer-api JSON (widgetStates extraction) - yamarket.py — Я.Маркет HTML + embedded JSON parser - __init__.py — enrich_one() fans out to all sources, aggregates min/max prices, max rating, sum reviews - enrich_models() — batch enrich for AI by_category output NEW DIAGNOSTIC ENDPOINTS (main.py): - GET /api/parse_wb?q=...&limit=N - GET /api/parse_ozon?q=...&limit=N - GET /api/parse_yamarket?q=...&limit=N - GET /api/parse_all?q=... — fan-out + aggregate - GET /api/proxy_status — pool diagnostics (count, token configured, age) PODBOR (main.py): - _enrich_ai_with_dns -> _enrich_ai_marketplaces (uses all sources) DEPLOY: needs PROXY6_TOKEN in /opt/zov-tech/deploy/.env on VPS, then docker compose build + up -d backend
136 lines
4.9 KiB
Python
136 lines
4.9 KiB
Python
"""Wildberries parser built on the marketplace's JSON API.

The search.wb.ru endpoint returns plain JSON with products. Prices are in
kopecks (divide by 100 to get rubles). Products carry rating, feedbacks
(review count) and brand.

``salePriceU`` is the final discounted price; ``priceU`` is the RRP.
"""
|
||
from __future__ import annotations
|
||
import logging
|
||
from typing import Any
|
||
from urllib.parse import quote_plus
|
||
|
||
import httpx
|
||
|
||
from .. import proxy_pool
|
||
|
||
log = logging.getLogger("zov.parser.wb")

# Wildberries search endpoint queried by this module.
_SEARCH_URL = "https://search.wb.ru/exactmatch/ru/common/v9/search"

# Query-string parameters sent with every search request; the search
# function adds the "query" key on top of these.
_DEFAULT_PARAMS = {
    "TestGroup": "no_test",
    "TestID": "no_test",
    "appType": "1",
    "curr": "rub",
    "dest": "-1257786",  # Moscow; can be changed
    "resultset": "catalog",
    "sort": "popular",
    "spp": "30",
    "suppressSpellcheck": "false",
}

# Browser-like headers attached to every request.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Accept-Language": "ru-RU,ru;q=0.9",
    "Origin": "https://www.wildberries.ru",
    "Referer": "https://www.wildberries.ru/",
}
|
||
|
||
|
||
def search_wb(query: str, limit: int = 3, timeout: float = 12.0,
              max_retries: int = 2) -> list[dict[str, Any]]:
    """Search Wildberries for *query* and return up to *limit* normalized items.

    Every attempt opens a fresh client through ``proxy_pool.proxied_client``
    and sends a single GET to the search endpoint. Transport errors, non-200
    statuses (429 included), undecodable JSON and a non-object JSON root are
    logged and retried. A well-formed answer without products ends the search
    immediately with an empty list.

    Args:
        query: Search text, sent as the ``query`` request parameter.
        limit: Maximum number of items to return; values below 1 yield ``[]``
            without any network request.
        timeout: Timeout handed to the client factory.
        max_retries: Number of extra attempts after the first one; negative
            values are treated as 0.

    Returns:
        A list of dicts built by ``_build_item``; empty when nothing was
        found or every attempt failed.
    """
    # A negative limit would slice from the end of the product list and
    # silently return the wrong items; zero can never produce a result.
    if limit <= 0:
        return []

    params = {**_DEFAULT_PARAMS, "query": query}
    attempts = max(max_retries, 0) + 1

    for attempt in range(1, attempts + 1):
        try:
            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS) as client:
                resp = client.get(_SEARCH_URL, params=params)
        except httpx.HTTPError as e:
            log.warning("WB request failed (attempt %d): %s", attempt, e)
            continue

        if resp.status_code == 429:
            log.warning("WB rate-limited on attempt %d, rotating proxy", attempt)
            continue
        if resp.status_code != 200:
            log.warning("WB returned status=%s", resp.status_code)
            continue

        try:
            data = resp.json()
        except ValueError as e:  # JSON and unicode decode errors are ValueErrors
            log.warning("WB JSON parse failed: %s", e)
            continue

        # A 200 response whose root is not an object (e.g. a block page that
        # happens to be valid JSON) is treated like a failed attempt.
        if not isinstance(data, dict):
            log.warning("WB returned unexpected JSON root: %s", type(data).__name__)
            continue

        payload = data.get("data")
        products = payload.get("products") if isinstance(payload, dict) else None
        if not isinstance(products, list) or not products:
            log.info("WB no products for query=%r", query)
            return []

        # Skip malformed entries so one bad record cannot break the search.
        usable = [p for p in products if isinstance(p, dict)]
        return [_build_item(p) for p in usable[:limit]]

    log.warning("WB gave up after %d attempts for query=%r", attempts, query)
    return []
|
||
|
||
|
||
def _build_item(p: dict[str, Any]) -> dict[str, Any]:
    """Convert one raw WB product dict into the normalized item dict.

    ``salePriceU`` / ``priceU`` are integer-divided by 100 to obtain rubles.
    When entries of ``sizes`` carry a ``price.product`` value, the cheapest
    one replaces the minimum price, and the most expensive one replaces the
    maximum price if more than one priced size exists.

    Returns:
        A dict with ``title``, ``url``, ``image_url``, ``price_min_rub``,
        ``price_max_rub``, ``rating``, ``reviews_count``, ``stores_count``,
        ``specs`` and ``source`` keys.
    """
    discounted = p.get("salePriceU") or 0
    listed = p.get("priceU") or 0

    # Prefer the discounted price; fall back to the list price.
    if discounted:
        lowest = discounted // 100
    elif listed:
        lowest = listed // 100
    else:
        lowest = None

    # The list price is reported as the maximum only when it differs.
    highest = None
    if listed and listed != discounted:
        highest = listed // 100

    # Per-size prices, when present, take precedence over top-level fields.
    raw_size_prices = (
        (variant.get("price") or {}).get("product") or 0
        for variant in (p.get("sizes") or [])
    )
    size_rubles = [value // 100 for value in raw_size_prices if value]
    if size_rubles:
        lowest = min(size_rubles)
        if len(size_rubles) > 1:
            highest = max(size_rubles)

    # A maximum that is missing, zero, or equal to the minimum adds nothing.
    if not highest or highest == lowest:
        highest = None

    pid = p.get("id")
    if pid:
        picture = _build_image_url(pid)
        link = f"https://www.wildberries.ru/catalog/{pid}/detail.aspx"
    else:
        picture = None
        link = ""

    specs = {
        "brand": p.get("brand", ""),
        "supplier": p.get("supplier", ""),
    }
    return {
        "title": p.get("name") or "",
        "url": link,
        "image_url": picture,
        "price_min_rub": lowest,
        "price_max_rub": highest,
        "rating": p.get("reviewRating") or p.get("rating"),
        "reviews_count": p.get("feedbacks"),
        "stores_count": None,
        "specs": specs,
        "source": "wb",
    }
|
||
|
||
|
||
def _build_image_url(product_id: int) -> str:
    """Return the URL of the first "big" image for a WB product id.

    The URL has the form
    ``https://basket-NN.wbbasket.ru/vol<id // 100000>/part<id // 1000>/<id>/images/big/1.webp``
    where ``NN`` is a zero-padded basket number derived from the id.
    """
    nm_id = int(product_id)
    volume = nm_id // 100000

    # Simplified basket mapping: the same formula is applied to every id range.
    # NOTE(review): the real WB basket boundaries may differ; confirm.
    shard = volume // 1431 + 1

    # Out-of-range results fall back to basket 10.
    if not 1 <= shard <= 25:
        shard = 10

    host_no = f"{shard:02d}"
    part = nm_id // 1000
    return f"https://basket-{host_no}.wbbasket.ru/vol{volume}/part{part}/{nm_id}/images/big/1.webp"
|