mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:44:47 +00:00
PROXY POOL (app/proxy_pool.py): - Loads active proxies from Proxy6.net API every 10 min - Random rotation per request via proxied_client(timeout, headers) - Graceful fallback to direct HTTP if PROXY6_TOKEN not set - Config: PROXY6_TOKEN env var PARSERS (app/parsers/): - dns.py — refactored to use proxy_pool with retry+rotation on Qrator block - wb.py — Wildberries JSON API (search.wb.ru), retries on 429 - ozon.py — OZON composer-api JSON (widgetStates extraction) - yamarket.py — Я.Маркет HTML + embedded JSON parser - __init__.py — enrich_one() fans out to all sources, aggregates min/max prices, max rating, sum reviews - enrich_models() — batch enrich for AI by_category output NEW DIAGNOSTIC ENDPOINTS (main.py): - GET /api/parse_wb?q=...&limit=N - GET /api/parse_ozon?q=...&limit=N - GET /api/parse_yamarket?q=...&limit=N - GET /api/parse_all?q=... — fan-out + aggregate - GET /api/proxy_status — pool diagnostics (count, token configured, age) PODBOR (main.py): - _enrich_ai_with_dns -> _enrich_ai_marketplaces (uses all sources) DEPLOY: needs PROXY6_TOKEN in /opt/zov-tech/deploy/.env on VPS, then docker compose build + up -d backend
249 lines
9.0 KiB
Python
249 lines
9.0 KiB
Python
"""Парсер DNS Shop (dns-shop.ru) — MVP без anti-bot защиты.
|
||
|
||
DNS отдаёт классический HTML с серверным рендерингом + AJAX-цены через
|
||
GraphQL. Для нашего MVP достаточно поисковой страницы — там есть title,
|
||
URL, картинка и цена в data-атрибутах карточки товара.
|
||
|
||
Если DNS изменит вёрстку — селекторы ниже придётся обновить.
|
||
"""
|
||
from __future__ import annotations
|
||
import logging
|
||
import re
|
||
import time
|
||
from typing import Any
|
||
from urllib.parse import quote_plus
|
||
|
||
import httpx
|
||
from bs4 import BeautifulSoup
|
||
|
||
from .. import proxy_pool
|
||
|
||
log = logging.getLogger("zov.parser.dns")
|
||
|
||
_BASE_URL = "https://www.dns-shop.ru"
|
||
_SEARCH_URL = "https://www.dns-shop.ru/search/"
|
||
|
||
# Реалистичный User-Agent (свежий Chrome on Windows)
|
||
_HEADERS = {
|
||
"User-Agent": (
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||
"Chrome/130.0.0.0 Safari/537.36"
|
||
),
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||
"Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
|
||
"Accept-Encoding": "gzip, deflate, br",
|
||
"Connection": "keep-alive",
|
||
"Upgrade-Insecure-Requests": "1",
|
||
"Sec-Fetch-Dest": "document",
|
||
"Sec-Fetch-Mode": "navigate",
|
||
"Sec-Fetch-Site": "none",
|
||
"Sec-Fetch-User": "?1",
|
||
}
|
||
|
||
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
||
|
||
|
||
def search_dns(query: str, limit: int = 1, timeout: float = 12.0,
|
||
max_retries: int = 2) -> list[dict[str, Any]]:
|
||
"""Поиск товара на DNS по строке запроса.
|
||
|
||
Использует Proxy6-пул если PROXY6_TOKEN задан, иначе ходит напрямую.
|
||
DNS защищён Qrator — без прокси скорее всего 401.
|
||
|
||
Возвращает список результатов (топ-N) или пустой при ошибке.
|
||
"""
|
||
url = f"{_SEARCH_URL}?q={quote_plus(query)}"
|
||
log.info("DNS search: %s", url)
|
||
|
||
last_err = None
|
||
for attempt in range(max_retries + 1):
|
||
try:
|
||
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
|
||
follow_redirects=True) as client:
|
||
resp = client.get(url)
|
||
except httpx.HTTPError as e:
|
||
last_err = e
|
||
log.warning("DNS request failed (attempt %d): %s", attempt + 1, e)
|
||
continue
|
||
|
||
if resp.status_code == 200:
|
||
text = resp.text
|
||
if "qrator" in text.lower() or "challenge" in text.lower() or "captcha" in text.lower():
|
||
log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt + 1)
|
||
continue
|
||
return _parse_search_html(text, limit=limit)
|
||
|
||
log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt + 1)
|
||
|
||
log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)",
|
||
max_retries + 1, query, last_err)
|
||
return []
|
||
|
||
|
||
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||
soup = BeautifulSoup(html, "html.parser")
|
||
results: list[dict[str, Any]] = []
|
||
|
||
# DNS использует разные шаблоны карточек. Пробуем несколько селекторов.
|
||
candidates = (
|
||
soup.select("div.catalog-product")
|
||
or soup.select("[data-product-card]")
|
||
or soup.select("div.product-buy")
|
||
)
|
||
|
||
for card in candidates:
|
||
if len(results) >= limit:
|
||
break
|
||
item = _extract_card(card)
|
||
if item:
|
||
results.append(item)
|
||
|
||
if not results:
|
||
# Резерв: попытаемся достать товар из JSON-LD
|
||
for script in soup.find_all("script", type="application/ld+json"):
|
||
data = _try_json(script.string or "")
|
||
if not data:
|
||
continue
|
||
items = data if isinstance(data, list) else [data]
|
||
for d in items:
|
||
if isinstance(d, dict) and d.get("@type") == "Product":
|
||
results.append({
|
||
"title": d.get("name") or "",
|
||
"url": d.get("url") or "",
|
||
"image_url": (d.get("image") or [None])[0] if isinstance(d.get("image"), list) else d.get("image"),
|
||
"price_min_rub": _try_int((d.get("offers") or {}).get("price")),
|
||
"price_max_rub": None,
|
||
"rating": _try_float((d.get("aggregateRating") or {}).get("ratingValue")),
|
||
"reviews_count": _try_int((d.get("aggregateRating") or {}).get("reviewCount")),
|
||
"stores_count": None,
|
||
"specs": {},
|
||
"source": "dns",
|
||
})
|
||
if len(results) >= limit:
|
||
break
|
||
if len(results) >= limit:
|
||
break
|
||
|
||
return results
|
||
|
||
|
||
def _extract_card(card) -> dict[str, Any] | None:
|
||
"""Извлекает данные карточки товара из произвольного блока."""
|
||
# Заголовок и ссылка
|
||
link_el = (
|
||
card.select_one("a.catalog-product__name")
|
||
or card.select_one("a.product-buy__title")
|
||
or card.select_one("a[href*='/product/']")
|
||
)
|
||
if not link_el:
|
||
return None
|
||
title = link_el.get_text(strip=True) or link_el.get("title") or ""
|
||
href = link_el.get("href") or ""
|
||
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
||
|
||
# Цена
|
||
price = None
|
||
price_el = (
|
||
card.select_one(".product-buy__price")
|
||
or card.select_one("[data-price]")
|
||
or card.select_one(".product-min-price__current")
|
||
)
|
||
if price_el:
|
||
# data-price атрибут — самый надёжный
|
||
dp = price_el.get("data-price") or price_el.get("data-product-price")
|
||
if dp:
|
||
price = _try_int(dp)
|
||
if not price:
|
||
m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
|
||
if m:
|
||
price = _try_int(m.group(1).replace(" ", ""))
|
||
|
||
# Изображение
|
||
img_url = None
|
||
img_el = card.select_one("img.catalog-product__image, img.loaded-product__image, img[data-src], img[src]")
|
||
if img_el:
|
||
img_url = img_el.get("data-src") or img_el.get("src") or img_el.get("data-original")
|
||
if img_url and img_url.startswith("//"):
|
||
img_url = "https:" + img_url
|
||
|
||
# Рейтинг и кол-во отзывов
|
||
rating = None
|
||
rating_el = card.select_one(".catalog-product__rating, [data-rating]")
|
||
if rating_el:
|
||
rating = _try_float(rating_el.get("data-rating") or rating_el.get_text(strip=True))
|
||
|
||
reviews = None
|
||
reviews_el = card.select_one(".catalog-product__reviews, [data-reviews]")
|
||
if reviews_el:
|
||
m = re.search(r"\d+", reviews_el.get_text(" ", strip=True))
|
||
if m:
|
||
reviews = int(m.group(0))
|
||
|
||
if not title:
|
||
return None
|
||
|
||
return {
|
||
"title": title,
|
||
"url": url,
|
||
"image_url": img_url,
|
||
"price_min_rub": price,
|
||
"price_max_rub": price, # DNS показывает одну цену
|
||
"rating": rating,
|
||
"reviews_count": reviews,
|
||
"stores_count": 1,
|
||
"specs": {},
|
||
"source": "dns",
|
||
}
|
||
|
||
|
||
def _try_int(v: Any) -> int | None:
|
||
if v is None:
|
||
return None
|
||
try:
|
||
s = str(v).strip().replace(" ", "").replace(" ", "").replace(",", ".")
|
||
# Цена может быть строкой "79990" или "79990.00"
|
||
return int(float(s))
|
||
except (ValueError, TypeError):
|
||
return None
|
||
|
||
|
||
def _try_float(v: Any) -> float | None:
|
||
if v is None:
|
||
return None
|
||
try:
|
||
return float(str(v).strip().replace(",", "."))
|
||
except (ValueError, TypeError):
|
||
return None
|
||
|
||
|
||
def _try_json(s: str) -> Any:
|
||
import json
|
||
try:
|
||
return json.loads(s)
|
||
except (ValueError, TypeError):
|
||
return None
|
||
|
||
|
||
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5) -> list[dict[str, Any]]:
|
||
"""Обогащает список моделей данными с DNS.
|
||
|
||
На входе: список моделей от AI с полем `search_query` (или brand+model).
|
||
На выходе: те же модели + ключи `dns: {...}` с парсингом.
|
||
"""
|
||
enriched: list[dict[str, Any]] = []
|
||
for i, m in enumerate(models):
|
||
q = m.get("search_query") or f"{m.get('brand', '')} {m.get('model', '')}".strip()
|
||
if not q:
|
||
enriched.append({**m, "dns": None})
|
||
continue
|
||
try:
|
||
results = search_dns(q, limit=1)
|
||
except Exception as e:
|
||
log.warning("DNS enrich failed for %r: %s", q, e)
|
||
results = []
|
||
enriched.append({**m, "dns": results[0] if results else None})
|
||
if i < len(models) - 1 and delay_sec > 0:
|
||
time.sleep(delay_sec) # вежливая задержка между запросами
|
||
return enriched
|