zov-tech/backend-py/app/parsers/dns.py
wasrusgen 82425dbd88 backend: Proxy6 pool + parsers WB / OZON / Я.Маркет / DNS
PROXY POOL (app/proxy_pool.py):
- Loads active proxies from Proxy6.net API every 10 min
- Random rotation per request via proxied_client(timeout, headers)
- Graceful fallback to direct HTTP if PROXY6_TOKEN not set
- Config: PROXY6_TOKEN env var

PARSERS (app/parsers/):
- dns.py — refactored to use proxy_pool with retry+rotation on Qrator block
- wb.py — Wildberries JSON API (search.wb.ru), retries on 429
- ozon.py — OZON composer-api JSON (widgetStates extraction)
- yamarket.py — Я.Маркет HTML + embedded JSON parser
- __init__.py — enrich_one() fans out to all sources, aggregates min/max prices, max rating, sum reviews
- enrich_models() — batch enrich for AI by_category output

NEW DIAGNOSTIC ENDPOINTS (main.py):
- GET /api/parse_wb?q=...&limit=N
- GET /api/parse_ozon?q=...&limit=N
- GET /api/parse_yamarket?q=...&limit=N
- GET /api/parse_all?q=... — fan-out + aggregate
- GET /api/proxy_status — pool diagnostics (count, token configured, age)

PODBOR (main.py):
- _enrich_ai_with_dns -> _enrich_ai_marketplaces (uses all sources)

DEPLOY: needs PROXY6_TOKEN in /opt/zov-tech/deploy/.env on VPS, then docker compose build + up -d backend
2026-05-11 12:18:04 +03:00

249 lines
9.0 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Парсер DNS Shop (dns-shop.ru) — MVP без anti-bot защиты.
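DNS serves classic server-rendered HTML plus AJAX prices via GraphQL. For this
MVP the search results page is enough: it carries the title, URL, image and
price (in data attributes of the product card).
Note: the site is protected by Qrator, so search_dns goes through the proxy
pool and retries when it receives a block page.
If DNS changes its markup, the selectors below will have to be updated.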
"""
from __future__ import annotations
import logging
import re
import time
from typing import Any
from urllib.parse import quote_plus
import httpx
from bs4 import BeautifulSoup
from .. import proxy_pool
log = logging.getLogger("zov.parser.dns")
_BASE_URL = "https://www.dns-shop.ru"
_SEARCH_URL = "https://www.dns-shop.ru/search/"
# Реалистичный User-Agent (свежий Chrome on Windows)
_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/130.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
}
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
def search_dns(query: str, limit: int = 1, timeout: float = 12.0,
               max_retries: int = 2) -> list[dict[str, Any]]:
    """Search DNS Shop for ``query`` and return up to ``limit`` parsed results.

    Every attempt opens a fresh client through ``proxy_pool.proxied_client``
    (which, per the project notes, uses the Proxy6 pool when PROXY6_TOKEN is
    configured and goes direct otherwise). DNS is protected by Qrator, so a
    request may fail with a non-200 status or return a block/captcha page;
    such attempts are retried.

    Args:
        query: Free-text search string; URL-encoded into the ``q`` parameter.
        limit: Maximum number of results handed to the HTML parser.
        timeout: Timeout passed to the HTTP client.
        max_retries: Number of additional attempts after the first one.

    Returns:
        The parsed results of the first usable response, or an empty list
        when every attempt failed.
    """
    url = f"{_SEARCH_URL}?q={quote_plus(query)}"
    log.info("DNS search: %s", url)

    attempts = max_retries + 1
    # Reason for the most recent failure of any kind (exception, HTTP status
    # or block page) so the final "gave up" message is never an empty "None"
    # after real failures.
    last_err: Any = None
    for attempt in range(1, attempts + 1):
        try:
            with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
                                           follow_redirects=True) as client:
                resp = client.get(url)
        except httpx.HTTPError as e:
            last_err = e
            log.warning("DNS request failed (attempt %d): %s", attempt, e)
            continue

        if resp.status_code != 200:
            last_err = f"HTTP {resp.status_code}"
            log.warning("DNS returned status=%s on attempt %d", resp.status_code, attempt)
            continue

        text = resp.text
        # Lower-case the page once instead of once per marker.
        lowered = text.lower()
        if any(marker in lowered for marker in ("qrator", "challenge", "captcha")):
            last_err = "Qrator/captcha page"
            log.warning("DNS Qrator/captcha on attempt %d, rotating proxy", attempt)
            continue

        return _parse_search_html(text, limit=limit)

    log.warning("DNS gave up after %d attempts for query=%r (last_err=%s)",
                attempts, query, last_err)
    return []
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Extract up to ``limit`` product records from a DNS search results page.

    Product cards are tried first, using several alternative selectors because
    DNS uses more than one card template. If no card yields a result, the
    page's JSON-LD ``Product`` entries are used as a fallback.

    Args:
        html: Raw HTML of the search page.
        limit: Maximum number of records to return; values <= 0 yield ``[]``.

    Returns:
        A list of result dicts (possibly empty).
    """
    if limit <= 0:
        return []

    soup = BeautifulSoup(html, "html.parser")
    cards = (
        soup.select("div.catalog-product")
        or soup.select("[data-product-card]")
        or soup.select("div.product-buy")
    )

    results: list[dict[str, Any]] = []
    for card in cards:
        item = _extract_card(card)
        if item:
            results.append(item)
            if len(results) >= limit:
                return results
    if results:
        return results

    # Fallback: try to recover products from embedded JSON-LD.
    for script in soup.find_all("script", type="application/ld+json"):
        data = _try_json(script.string or "")
        if not data:
            continue
        nodes = data if isinstance(data, list) else [data]
        for node in nodes:
            if isinstance(node, dict) and _jsonld_is_product(node):
                results.append(_jsonld_product_to_item(node))
                if len(results) >= limit:
                    return results
    return results


def _jsonld_is_product(node: dict[str, Any]) -> bool:
    """Tell whether a JSON-LD node declares ``@type`` Product (string or list form)."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"


def _jsonld_first_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, the first dict of a list, otherwise ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        for entry in value:
            if isinstance(entry, dict):
                return entry
    return {}


def _jsonld_product_to_item(product: dict[str, Any]) -> dict[str, Any]:
    """Map one JSON-LD Product node onto the parser's result dict."""
    # "offers" and "aggregateRating" may be a single object or a list of
    # objects; anything else is treated as absent instead of raising.
    offers = _jsonld_first_dict(product.get("offers"))
    rating = _jsonld_first_dict(product.get("aggregateRating"))

    image = product.get("image")
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        # ImageObject form: take the address rather than the whole object.
        image = image.get("url") or image.get("contentUrl")

    price = _try_int(offers.get("price"))
    if price is None:
        # AggregateOffer carries lowPrice/highPrice instead of price.
        price = _try_int(offers.get("lowPrice"))

    return {
        "title": product.get("name") or "",
        "url": product.get("url") or "",
        "image_url": image,
        "price_min_rub": price,
        "price_max_rub": None,
        "rating": _try_float(rating.get("ratingValue")),
        "reviews_count": _try_int(rating.get("reviewCount")),
        "stores_count": None,
        "specs": {},
        "source": "dns",
    }
def _extract_card(card) -> dict[str, Any] | None:
    """Extract product data from one card element of the search page.

    Args:
        card: A parsed HTML element (as produced by BeautifulSoup ``select``)
            that wraps a single product.

    Returns:
        A result dict with title, absolute URL, image URL, price, rating and
        review count, or ``None`` when the card has no product link or the
        link has no usable title.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Title and link
    link_el = (
        card.select_one("a.catalog-product__name")
        or card.select_one("a.product-buy__title")
        or card.select_one("a[href*='/product/']")
    )
    if not link_el:
        return None
    title = link_el.get_text(strip=True) or link_el.get("title") or ""
    if not title:
        return None
    # urljoin leaves absolute links untouched and also resolves
    # protocol-relative ("//host/...") and slash-less relative hrefs.
    url = urljoin(_BASE_URL, link_el.get("href") or "")

    # Price
    price = None
    price_el = (
        card.select_one(".product-buy__price")
        or card.select_one("[data-price]")
        or card.select_one(".product-min-price__current")
    )
    if price_el:
        # The data attribute, when present, is preferred over rendered text.
        price = _try_int(price_el.get("data-price") or price_el.get("data-product-price"))
        if not price:
            m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
            if m:
                # The pattern's \s matches any Unicode whitespace, so strip
                # all of it (NBSP, thin space, ...) and not only ASCII spaces.
                price = _try_int(re.sub(r"\s+", "", m.group(1)))

    # Image
    img_url = None
    img_el = card.select_one(
        "img.catalog-product__image, img.loaded-product__image, img[data-src], img[src]"
    )
    if img_el:
        img_url = img_el.get("data-src") or img_el.get("src") or img_el.get("data-original")
        if img_url and img_url.startswith("//"):
            img_url = "https:" + img_url

    # Rating and number of reviews
    rating = None
    rating_el = card.select_one(".catalog-product__rating, [data-rating]")
    if rating_el:
        rating = _try_float(rating_el.get("data-rating") or rating_el.get_text(strip=True))

    reviews = None
    reviews_el = card.select_one(".catalog-product__reviews, [data-reviews]")
    if reviews_el:
        # Read the data attribute first (mirrors the rating handling); an
        # element matched only by [data-reviews] may have no text at all.
        raw_reviews = reviews_el.get("data-reviews") or reviews_el.get_text(" ", strip=True)
        m = re.search(r"\d+", str(raw_reviews))
        if m:
            reviews = int(m.group(0))

    return {
        "title": title,
        "url": url,
        "image_url": img_url,
        "price_min_rub": price,
        "price_max_rub": price,  # DNS shows a single price
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": 1,
        "specs": {},
        "source": "dns",
    }
def _try_int(v: Any) -> int | None:
    """Convert a price-like value to ``int``, or return ``None`` if impossible.

    Accepts numbers and strings such as "79990", "79990.00" or "79 990";
    any whitespace (including non-breaking and narrow spaces used as
    thousands separators) is removed, a decimal comma is treated as a decimal
    point, and the fractional part is truncated.
    """
    if v is None:
        return None
    try:
        # str.split() with no argument splits on every Unicode whitespace
        # character, so joining the pieces drops NBSP / thin spaces as well.
        s = "".join(str(v).split()).replace(",", ".")
        # The price may be a string like "79990" or "79990.00".
        return int(float(s))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) for inputs such as "inf" or "1e999".
        return None
def _try_float(v: Any) -> float | None:
    """Best-effort conversion of ``v`` to ``float``.

    The value is stringified, trimmed, and a decimal comma is replaced by a
    decimal point before parsing. Returns ``None`` for ``None`` input or when
    the text is not a valid float.
    """
    if v is not None:
        try:
            normalized = str(v).strip().replace(",", ".")
            return float(normalized)
        except (ValueError, TypeError):
            pass
    return None
def _try_json(s: str) -> Any:
    """Decode ``s`` as JSON and return the result, or ``None`` if decoding fails."""
    import json

    try:
        decoded = json.loads(s)
    except (ValueError, TypeError):
        # Invalid JSON text or an unsupported input type.
        return None
    return decoded
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5) -> list[dict[str, Any]]:
    """Enrich a list of models with DNS search data.

    Args:
        models: Model dicts; each is searched by its ``search_query`` field
            or, when that is missing/empty, by ``brand`` + ``model``.
        delay_sec: Pause between consecutive DNS requests; no pause is made
            after the last model or when the value is not positive.

    Returns:
        New dicts (inputs are not mutated) with an extra ``dns`` key holding
        the first search result, or ``None`` when there is no query, no
        result, or the search raised.
    """
    enriched: list[dict[str, Any]] = []
    last_index = len(models) - 1
    for i, m in enumerate(models):
        q = m.get("search_query")
        if not q:
            # Skip missing/None parts so a null brand or model does not end
            # up in the query as the literal text "None".
            q = " ".join(
                str(part).strip() for part in (m.get("brand"), m.get("model")) if part
            ).strip()
        if not q:
            enriched.append({**m, "dns": None})
            continue
        try:
            results = search_dns(q, limit=1)
        except Exception as e:
            # Best-effort enrichment: one failing lookup must not abort the batch.
            log.warning("DNS enrich failed for %r: %s", q, e)
            results = []
        enriched.append({**m, "dns": results[0] if results else None})
        if i < last_index and delay_sec > 0:
            time.sleep(delay_sec)  # polite delay between requests
    return enriched