mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 17:44:48 +00:00
247 lines
8.9 KiB
Python
247 lines
8.9 KiB
Python
"""Парсер DNS Shop (dns-shop.ru) — MVP без anti-bot защиты.
|
||
|
||
DNS отдаёт классический HTML с серверным рендерингом + AJAX-цены через
|
||
GraphQL. Для нашего MVP достаточно поисковой страницы — там есть title,
|
||
URL, картинка и цена в data-атрибутах карточки товара.
|
||
|
||
Если DNS изменит вёрстку — селекторы ниже придётся обновить.
|
||
"""
|
||
from __future__ import annotations
|
||
import logging
|
||
import re
|
||
import time
|
||
from typing import Any
|
||
from urllib.parse import quote_plus
|
||
|
||
import httpx
|
||
from bs4 import BeautifulSoup
|
||
|
||
from .. import proxy_pool
|
||
from . import playwright_engine
|
||
|
||
log = logging.getLogger("zov.parser.dns")
|
||
|
||
_BASE_URL = "https://www.dns-shop.ru"
|
||
_SEARCH_URL = "https://www.dns-shop.ru/search/"
|
||
|
||
# Реалистичный User-Agent (свежий Chrome on Windows)
|
||
_HEADERS = {
|
||
"User-Agent": (
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||
"Chrome/130.0.0.0 Safari/537.36"
|
||
),
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||
"Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
|
||
"Accept-Encoding": "gzip, deflate, br",
|
||
"Connection": "keep-alive",
|
||
"Upgrade-Insecure-Requests": "1",
|
||
"Sec-Fetch-Dest": "document",
|
||
"Sec-Fetch-Mode": "navigate",
|
||
"Sec-Fetch-Site": "none",
|
||
"Sec-Fetch-User": "?1",
|
||
}
|
||
|
||
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")
|
||
|
||
|
||
def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
               max_retries: int = 4) -> list[dict[str, Any]]:
    """Search dns-shop.ru for *query* and return the parsed products.

    The search page is fetched with ``playwright_engine.fetch_page``. DNS is
    protected by Qrator and many residential IPs are already blacklisted, so
    the fetch is repeated until a page yields results or the attempts run out.
    NOTE(review): the original author states that every attempt uses a fresh
    proxy from the pool; that rotation is not visible in this function and
    presumably happens inside ``fetch_page`` — confirm.

    Args:
        query: Free-text search string; it is URL-encoded with ``quote_plus``.
        limit: Maximum number of products to return.
        timeout: Per-attempt page timeout in seconds (passed on in ms).
        max_retries: Number of retries after the first attempt, i.e. the page
            is fetched at most ``max_retries + 1`` times.

    Returns:
        The products parsed from the first usable page, or ``[]`` when every
        attempt produced no HTML, a Qrator block page, or zero products.
    """
    url = f"{_SEARCH_URL}?q={quote_plus(query)}"
    log.info("DNS search: %s", url)

    total_attempts = max_retries + 1
    for attempt in range(1, total_attempts + 1):
        html = playwright_engine.fetch_page(
            url,
            wait_selector="a[href*='/product/']",
            wait_ms=4000,
            timeout_ms=int(timeout * 1000),
        )
        if not html:
            log.warning("DNS attempt %d: no HTML", attempt)
            continue

        # A 403 from Qrator means the IP is on their blacklist — try again.
        # Only the head of the document is inspected, so slice before
        # lowercasing instead of lowercasing the whole page.
        if "HTTP 403" in html[:500] or "qrator" in html[:5000].lower():
            log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt)
            continue

        # Usable page: parse it and stop as soon as something was found.
        results = _parse_search_html(html, limit=limit)
        if results:
            return results
        log.info("DNS attempt %d: 0 results, retry", attempt)

    log.warning("DNS gave up after %d attempts for query=%r", total_attempts, query)
    return []
|
||
|
||
|
||
def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Extract up to *limit* product dicts from a DNS search results page.

    Product cards are tried first. DNS uses several card templates, so a few
    selectors are attempted in order and the first one that matches anything
    is used. Only when no card yields a product does the function fall back
    to ``Product`` objects embedded in the page as JSON-LD.

    Args:
        html: Raw HTML of the search page.
        limit: Maximum number of products to return; a value <= 0 yields ``[]``.

    Returns:
        A list of product dicts, possibly empty.
    """
    if limit <= 0:
        # Nothing was asked for; without this guard the JSON-LD fallback
        # would still hand back one product.
        return []

    soup = BeautifulSoup(html, "html.parser")
    results: list[dict[str, Any]] = []

    # DNS uses different card templates; try several selectors in turn.
    candidates = (
        soup.select("div.catalog-product")
        or soup.select("[data-product-card]")
        or soup.select("div.product-buy")
    )
    for card in candidates:
        item = _extract_card(card)
        if item:
            results.append(item)
            if len(results) >= limit:
                return results
    if results:
        return results

    # Fallback: try to pull products out of JSON-LD.
    for script in soup.find_all("script", type="application/ld+json"):
        data = _try_json(script.string or "")
        if not data:
            continue
        for entry in (data if isinstance(data, list) else [data]):
            if not (isinstance(entry, dict) and entry.get("@type") == "Product"):
                continue
            results.append(_product_from_jsonld(entry))
            if len(results) >= limit:
                return results
    return results


def _product_from_jsonld(node: dict[str, Any]) -> dict[str, Any]:
    """Map one JSON-LD ``Product`` object onto the parser's result dict."""
    image = node.get("image")
    if isinstance(image, list):
        image = image[0] if image else None
    offers = _first_mapping(node.get("offers"))
    rating = _first_mapping(node.get("aggregateRating"))
    return {
        "title": node.get("name") or "",
        "url": node.get("url") or "",
        "image_url": image,
        "price_min_rub": _try_int(offers.get("price")),
        "price_max_rub": None,
        "rating": _try_float(rating.get("ratingValue")),
        "reviews_count": _try_int(rating.get("reviewCount")),
        "stores_count": None,
        "specs": {},
        "source": "dns",
    }


def _first_mapping(value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, its first dict item if it is a list, else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        return next((item for item in value if isinstance(item, dict)), {})
    return {}
|
||
|
||
|
||
def _extract_card(card) -> dict[str, Any] | None:
    """Extract product data from a single search-result card.

    Args:
        card: The card element; it is only used through ``select_one``
            (presumably a BeautifulSoup tag — it comes from ``soup.select``
            in the caller).

    Returns:
        A dict with ``title``, ``url``, ``image_url``, ``price_min_rub``,
        ``price_max_rub``, ``rating``, ``reviews_count``, ``stores_count``,
        ``specs`` and ``source``; or ``None`` when the card has no product
        link or the link carries no title.
    """
    # Imported locally: the module imports only ``quote_plus`` at file level.
    from urllib.parse import urljoin

    # Title and link
    link_el = (
        card.select_one("a.catalog-product__name")
        or card.select_one("a.product-buy__title")
        or card.select_one("a[href*='/product/']")
    )
    if not link_el:
        return None
    title = link_el.get_text(strip=True) or link_el.get("title") or ""
    if not title:
        # A card without a title is useless; bail out before doing more work.
        return None
    # urljoin copes with absolute, protocol-relative ("//host/...") and
    # path-relative hrefs alike; plain concatenation mangles the last two.
    url = urljoin(_BASE_URL, link_el.get("href") or "")

    # Price
    price = None
    price_el = (
        card.select_one(".product-buy__price")
        or card.select_one("[data-price]")
        or card.select_one(".product-min-price__current")
    )
    if price_el:
        # The data-price attribute is the most reliable source.
        dp = price_el.get("data-price") or price_el.get("data-product-price")
        if dp:
            price = _try_int(dp)
        if not price:
            m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
            if m:
                # The pattern's \s also matches NBSP / thin spaces used as
                # thousands separators, so drop every kind of whitespace.
                price = _try_int("".join(m.group(1).split()))

    # Image
    img_url = None
    img_el = card.select_one(
        "img.catalog-product__image, img.loaded-product__image, img[data-src], img[src]"
    )
    if img_el:
        img_url = img_el.get("data-src") or img_el.get("src") or img_el.get("data-original")
        if img_url:
            img_url = urljoin(_BASE_URL, img_url)

    # Rating and number of reviews
    rating = None
    rating_el = card.select_one(".catalog-product__rating, [data-rating]")
    if rating_el:
        rating = _try_float(rating_el.get("data-rating") or rating_el.get_text(strip=True))

    reviews = None
    reviews_el = card.select_one(".catalog-product__reviews, [data-reviews]")
    if reviews_el:
        # Prefer the attribute the selector matched on, mirroring how the
        # rating is read; fall back to the first number in the text.
        reviews = _try_int(reviews_el.get("data-reviews"))
        if reviews is None:
            m = re.search(r"\d+", reviews_el.get_text(" ", strip=True))
            if m:
                reviews = int(m.group(0))

    return {
        "title": title,
        "url": url,
        "image_url": img_url,
        "price_min_rub": price,
        "price_max_rub": price,  # DNS shows a single price
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": 1,
        "specs": {},
        "source": "dns",
    }
|
||
|
||
|
||
def _try_int(v: Any) -> int | None:
|
||
if v is None:
|
||
return None
|
||
try:
|
||
s = str(v).strip().replace(" ", "").replace(" ", "").replace(",", ".")
|
||
# Цена может быть строкой "79990" или "79990.00"
|
||
return int(float(s))
|
||
except (ValueError, TypeError):
|
||
return None
|
||
|
||
|
||
def _try_float(v: Any) -> float | None:
|
||
if v is None:
|
||
return None
|
||
try:
|
||
return float(str(v).strip().replace(",", "."))
|
||
except (ValueError, TypeError):
|
||
return None
|
||
|
||
|
||
def _try_json(s: str) -> Any:
    """Decode *s* as JSON, returning ``None`` instead of raising on bad input."""
    import json

    try:
        decoded = json.loads(s)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON; TypeError covers non-string input.
        return None
    else:
        return decoded
|
||
|
||
|
||
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5) -> list[dict[str, Any]]:
    """Enrich a list of models with data scraped from DNS.

    For every model a search query is taken from its ``search_query`` field
    or, failing that, built from ``brand`` and ``model``. The first search
    hit is attached under the ``dns`` key.

    Args:
        models: Model dicts (produced by the AI step, per the original
            author) carrying ``search_query`` and/or ``brand`` + ``model``.
        delay_sec: Pause between consecutive searches, in seconds; no pause
            follows the last model, and a value <= 0 disables it.

    Returns:
        New dicts in input order: each is a shallow copy of the model plus
        ``dns`` set to the first search result, or ``None`` when there was
        no query, no result, or the search raised.
    """
    enriched: list[dict[str, Any]] = []
    last_index = len(models) - 1
    for i, m in enumerate(models):
        # ``or ''`` rather than a .get default: a key present with value None
        # would otherwise be formatted as the literal text "None".
        q = str(m.get("search_query") or "").strip()
        if not q:
            q = f"{m.get('brand') or ''} {m.get('model') or ''}".strip()
        if not q:
            enriched.append({**m, "dns": None})
            continue
        try:
            results = search_dns(q, limit=1)
        except Exception as e:
            # Enrichment is best-effort: one failed lookup must not abort
            # the whole batch, so log it and carry on without DNS data.
            log.warning("DNS enrich failed for %r: %s", q, e)
            results = []
        enriched.append({**m, "dns": results[0] if results else None})
        if i < last_index and delay_sec > 0:
            time.sleep(delay_sec)  # polite delay between requests
    return enriched
|