mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 22:24:48 +00:00
PROXY POOL (app/proxy_pool.py):
- Loads active proxies from the Proxy6.net API every 10 min
- Random rotation per request via proxied_client(timeout, headers)
- Graceful fallback to direct HTTP if PROXY6_TOKEN is not set
- Config: PROXY6_TOKEN env var

PARSERS (app/parsers/):
- dns.py — refactored to use proxy_pool with retry + rotation on Qrator block
- wb.py — Wildberries JSON API (search.wb.ru), retries on 429
- ozon.py — OZON composer-api JSON (widgetStates extraction)
- yamarket.py — Я.Маркет HTML + embedded JSON parser
- __init__.py — enrich_one() fans out to all sources, aggregates min/max prices, max rating, sum of reviews
- enrich_models() — batch enrichment for the AI by_category output

NEW DIAGNOSTIC ENDPOINTS (main.py):
- GET /api/parse_wb?q=...&limit=N
- GET /api/parse_ozon?q=...&limit=N
- GET /api/parse_yamarket?q=...&limit=N
- GET /api/parse_all?q=... — fan-out + aggregate
- GET /api/proxy_status — pool diagnostics (count, token configured, age)

PODBOR (main.py):
- _enrich_ai_with_dns -> _enrich_ai_marketplaces (uses all sources)

DEPLOY: needs PROXY6_TOKEN in /opt/zov-tech/deploy/.env on the VPS, then docker compose build + up -d backend
133 lines
5.3 KiB
Python
133 lines
5.3 KiB
Python
"""Marketplace parsers used to enrich model cards.

All parsers use the shared proxy_pool (Proxy6.net) when PROXY6_TOKEN is set.
Without a proxy the large Russian marketplaces (DNS, OZON, Yandex Market)
respond with 401/307.

Sources:
  - dns.py      — DNS Shop (specs, single-store price)
  - wb.py       — Wildberries (JSON API: price + reviews + rating)
  - ozon.py     — OZON (composer-api JSON)
  - yamarket.py — Yandex Market (HTML + embedded JSON, price comparison)

Unified result format (item):
  {
    "title": str,                 # Title as shown on the page
    "url": str,                   # Link to the product
    "image_url": str | None,      # URL of the main photo
    "price_min_rub": int | None,
    "price_max_rub": int | None,
    "rating": float | None,       # 0.0 - 5.0
    "reviews_count": int | None,
    "stores_count": int | None,   # Yandex Market only (price comparison)
    "specs": dict[str, str],
    "source": str,                # 'dns' | 'wb' | 'ozon' | 'yamarket'
  }
"""
|
||
from __future__ import annotations
|
||
import logging
|
||
import time
|
||
from typing import Any
|
||
|
||
from .dns import search_dns
|
||
from .wb import search_wb
|
||
from .ozon import search_ozon
|
||
from .yamarket import search_yamarket
|
||
|
||
# Module-level logger, registered under the name "zov.parser".
log = logging.getLogger("zov.parser")
|
||
|
||
# Names exported by a star-import of this module: the four per-source search
# functions imported above plus the two aggregation helpers defined below.
__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket",
           "enrich_one", "enrich_models"]
|
||
|
||
|
||
def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> dict[str, Any]:
    """Query every requested source and merge the best data into one report.

    Each source contributes at most one item (its first search hit). A source
    that raises is logged and recorded as ``None``, so a single failing
    marketplace never aborts the whole lookup. Names in *sources* other than
    ``"wb"``, ``"yamarket"``, ``"ozon"`` and ``"dns"`` are ignored, and a
    source listed more than once is queried only once.

    Args:
        query: Search text passed to every source.
        sources: Source names to query, in order. The order decides which
            image is reported (the first source that has one).

    Returns:
        {
            "wb":            item dict or None,
            "yamarket":      item dict or None,
            "ozon":          item dict or None,
            "dns":           item dict or None,
            "price_min_rub": int | None,    # lowest price_min_rub over all sources
            "price_max_rub": int | None,    # highest price over all sources
            "image_url":     str | None,
            "rating_max":    float | None,
            "reviews_total": int | None,
            "stores_count":  int | None,    # taken from yamarket only
            "best_url":      str | None,
        }
        All four source keys are always present, even for sources that were
        not requested.
    """
    fetchers = {
        "wb": lambda: _safe_first(search_wb, query),
        "yamarket": lambda: _safe_first(search_yamarket, query),
        "ozon": lambda: _safe_first(search_ozon, query),
        "dns": lambda: _safe_first(search_dns, query),
    }

    items: dict[str, dict | None] = {}
    for src in sources:
        fn = fetchers.get(src)
        # Skip unknown names, and do not hit the same marketplace twice.
        if fn is None or src in items:
            continue
        try:
            items[src] = fn()
        except Exception as e:
            # Best effort: one broken source must not fail the others.
            log.warning("Source %s failed for %r: %s", src, query, e)
            items[src] = None

    # Aggregation. Falsy values (None, 0, "") are treated as "no data".
    found = [item for item in items.values() if item]
    low_prices = [i["price_min_rub"] for i in found if i.get("price_min_rub")]
    # The ceiling must also consider each item's own price_max_rub; using only
    # the price_min_rub values would understate the real maximum.
    high_prices = low_prices + [i["price_max_rub"] for i in found if i.get("price_max_rub")]
    images = [i["image_url"] for i in found if i.get("image_url")]
    ratings = [i["rating"] for i in found if i.get("rating")]
    reviews = [i["reviews_count"] for i in found if i.get("reviews_count")]

    # Only Yandex Market reports the number of stores.
    yamarket_item = items.get("yamarket") or {}
    stores = yamarket_item.get("stores_count") or None

    # Link priority: yamarket (aggregator) -> wb -> ozon -> dns.
    best_url = next(
        (
            items[src]["url"]
            for src in ("yamarket", "wb", "ozon", "dns")
            if items.get(src) and items[src].get("url")
        ),
        None,
    )

    return {
        **{src: items.get(src) for src in fetchers},
        "price_min_rub": min(low_prices) if low_prices else None,
        "price_max_rub": max(high_prices) if high_prices else None,
        "image_url": images[0] if images else None,
        "rating_max": max(ratings) if ratings else None,
        "reviews_total": sum(reviews) if reviews else None,
        "stores_count": stores,
        "best_url": best_url,
    }
|
||
|
||
|
||
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5,
                  sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> list[dict[str, Any]]:
    """Enrich a list of AI-suggested models with data from every source.

    For each model the search text is ``search_query`` when it is non-blank,
    otherwise ``"<brand> <model>"``. Missing, null or blank parts are treated
    as empty, so a null brand never turns into the literal text "None".

    Args:
        models: Model dicts; the keys ``search_query``, ``brand`` and
            ``model`` are read, every key is copied to the output.
        delay_sec: Pause between consecutive lookups, in seconds. No pause
            follows the last model or a model that was skipped.
        sources: Source names forwarded to ``enrich_one``.

    Returns:
        A new list with one dict per input model: the original keys plus
        ``"enriched"``, which holds the ``enrich_one`` report, or ``None``
        when no search text could be built or the lookup raised. The input
        dicts are not modified.
    """
    enriched: list[dict[str, Any]] = []
    last_index = len(models) - 1
    for index, model in enumerate(models):
        # ``or ""`` also covers keys that are present but null.
        query = str(model.get("search_query") or "").strip()
        if not query:
            brand = str(model.get("brand") or "").strip()
            name = str(model.get("model") or "").strip()
            query = f"{brand} {name}".strip()
        if not query:
            # Nothing to search for: keep the model, mark it as not enriched.
            enriched.append({**model, "enriched": None})
            continue
        try:
            data = enrich_one(query, sources=sources)
        except Exception as e:
            # Best effort: a failed lookup must not abort the whole batch.
            log.warning("Enrich failed for %r: %s", query, e)
            data = None
        enriched.append({**model, "enriched": data})
        if index < last_index and delay_sec > 0:
            time.sleep(delay_sec)
    return enriched
|
||
|
||
|
||
def _safe_first(search_fn, query: str) -> dict[str, Any] | None:
|
||
"""Вызывает поиск и возвращает первый результат или None."""
|
||
results = search_fn(query, limit=1)
|
||
return results[0] if results else None
|