# zov-tech/backend-py/app/parsers/dns.py

"""DNS Shop (dns-shop.ru) parser.

DNS serves classic server-rendered HTML, with prices additionally loaded via
AJAX (GraphQL). The search results page is enough for our MVP: each product
card carries the title, URL, image and price (in data attributes).

The module started as an MVP without anti-bot handling; ``search_dns`` now
fetches pages through Playwright and retries when it detects a Qrator block.

If DNS changes its markup, the selectors below will need to be updated.
"""
from __future__ import annotations
import logging
import re
import time
from typing import Any
from urllib.parse import quote_plus

import httpx
from bs4 import BeautifulSoup

from .. import proxy_pool
from . import playwright_engine

# Module logger; the name is a fixed string rather than derived from __name__.
log = logging.getLogger("zov.parser.dns")

# Site root and the search endpoint that hangs off it.
_BASE_URL = "https://www.dns-shop.ru"
_SEARCH_URL = f"{_BASE_URL}/search/"

# Realistic User-Agent (recent Chrome on Windows).
_USER_AGENT = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/130.0.0.0 Safari/537.36",
))

# Browser-like headers for a top-level page navigation.
# NOTE(review): not referenced by the functions visible in this part of the
# file — presumably kept for a plain-HTTP code path; confirm before removing.
_HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ru-RU,ru;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}

# Captures the run of digits/whitespace that immediately precedes a rouble sign.
_PRICE_RE = re.compile(r"(\d[\d\s]*)\s*₽")


def search_dns(query: str, limit: int = 1, timeout: float = 30.0,
               max_retries: int = 4) -> list[dict[str, Any]]:
    """Search DNS Shop for ``query`` and return up to ``limit`` parsed products.

    The search page is fetched with ``playwright_engine.fetch_page`` and parsed
    by ``_parse_search_html``. DNS sits behind Qrator, and many residential IPs
    are already blacklisted, so the fetch is attempted ``max_retries + 1``
    times in total.
    NOTE(review): the original author states that every attempt takes a new
    proxy from the pool; that rotation is not visible in this function and
    presumably happens inside ``fetch_page`` — confirm.

    Args:
        query: Free-text search string; URL-encoded into the ``q`` parameter.
        limit: Maximum number of products to return.
        timeout: Per-attempt page timeout in seconds.
        max_retries: Number of extra attempts after the first one.

    Returns:
        A list of product dicts, or ``[]`` when every attempt failed, was
        blocked, or produced no parsable products.
    """
    # Nothing can be returned for a non-positive limit, so do not spend
    # browser sessions and proxies on it.
    if limit <= 0:
        return []

    url = f"{_SEARCH_URL}?q={quote_plus(query)}"
    log.info("DNS search: %s", url)

    total_attempts = max_retries + 1
    for attempt in range(1, total_attempts + 1):
        html = playwright_engine.fetch_page(
            url,
            wait_selector="a[href*='/product/']",
            wait_ms=4000,
            timeout_ms=int(timeout * 1000),
        )
        if not html:
            log.warning("DNS attempt %d: no HTML", attempt)
            continue
        # A 403 from Qrator means the IP is blacklisted; try again.
        # Only the head of the document is inspected, so slice first and
        # lowercase just that prefix instead of the whole page.
        if "HTTP 403" in html[:500] or "qrator" in html[:5000].lower():
            log.info("DNS attempt %d: Qrator block, retry with new proxy", attempt)
            continue
        results = _parse_search_html(html, limit=limit)
        if results:
            return results
        # An empty parse is treated like a failed fetch and retried.
        log.info("DNS attempt %d: 0 results, retry", attempt)

    log.warning("DNS gave up after %d attempts for query=%r", total_attempts, query)
    return []


def _parse_search_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Extract up to ``limit`` product dicts from a DNS search results page.

    Product cards are tried first. Only when no card yields a product does the
    function fall back to ``Product`` nodes found in JSON-LD ``<script>`` tags.

    Args:
        html: Raw HTML of the search page.
        limit: Maximum number of products to return; ``<= 0`` yields ``[]``.

    Returns:
        A list of product dicts (possibly empty).
    """
    results: list[dict[str, Any]] = []
    if limit <= 0:
        # Without this guard the JSON-LD fallback would append one item
        # before it ever compared against the limit.
        return results

    soup = BeautifulSoup(html, "html.parser")

    # DNS uses several card templates; the first selector that matches
    # anything decides which set of cards is parsed.
    candidates = (
        soup.select("div.catalog-product")
        or soup.select("[data-product-card]")
        or soup.select("div.product-buy")
    )
    for card in candidates:
        item = _extract_card(card)
        if item:
            results.append(item)
            if len(results) >= limit:
                return results
    if results:
        return results

    # Fallback: try to pull products out of JSON-LD.
    for script in soup.find_all("script", type="application/ld+json"):
        data = _try_json(script.string or "")
        if not data:
            continue
        for node in (data if isinstance(data, list) else [data]):
            if not _jsonld_is_product(node):
                continue
            results.append(_jsonld_product_item(node))
            if len(results) >= limit:
                return results

    return results


def _jsonld_is_product(node: Any) -> bool:
    """Tell whether a JSON-LD node is a dict whose ``@type`` is/contains "Product"."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Product" in node_type
    return node_type == "Product"


def _jsonld_first_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, the first dict inside a list, else ``{}``."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        return next((entry for entry in value if isinstance(entry, dict)), {})
    return {}


def _jsonld_product_item(node: dict[str, Any]) -> dict[str, Any]:
    """Convert one JSON-LD ``Product`` node into the parser's product dict."""
    # ``image`` may be a string, a list, or an ImageObject-style dict.
    image = node.get("image")
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get("url")

    # ``offers`` / ``aggregateRating`` may be a single object or a list of
    # them; normalise to a dict so ``.get`` cannot raise AttributeError.
    offer = _jsonld_first_dict(node.get("offers"))
    agg_rating = _jsonld_first_dict(node.get("aggregateRating"))

    return {
        "title": node.get("name") or "",
        "url": node.get("url") or "",
        "image_url": image,
        "price_min_rub": _try_int(offer.get("price")),
        "price_max_rub": None,
        "rating": _try_float(agg_rating.get("ratingValue")),
        "reviews_count": _try_int(agg_rating.get("reviewCount")),
        "stores_count": None,
        "specs": {},
        "source": "dns",
    }


def _extract_card(card) -> dict[str, Any] | None:
    """Build a product dict from one search-result card element.

    Args:
        card: A parsed HTML element exposing ``select_one`` (a BeautifulSoup
            tag in practice).

    Returns:
        The product dict, or ``None`` when the card has no product link or
        the link carries no title.
    """
    # Local import: the module header only imports ``quote_plus``.
    from urllib.parse import urljoin

    # Title and link.
    link_el = (
        card.select_one("a.catalog-product__name")
        or card.select_one("a.product-buy__title")
        or card.select_one("a[href*='/product/']")
    )
    if not link_el:
        return None
    title = link_el.get_text(strip=True) or link_el.get("title") or ""
    if not title:
        return None
    # urljoin leaves absolute URLs alone and correctly resolves root-relative,
    # path-relative and protocol-relative ("//host/...") hrefs.
    url = urljoin(_BASE_URL, link_el.get("href") or "")

    # Price.
    price = None
    price_el = (
        card.select_one(".product-buy__price")
        or card.select_one("[data-price]")
        or card.select_one(".product-min-price__current")
    )
    if price_el:
        # The data attribute is the most reliable source.
        raw_price = price_el.get("data-price") or price_el.get("data-product-price")
        if raw_price:
            price = _try_int(raw_price)
        if not price:
            match = _PRICE_RE.search(price_el.get_text(" ", strip=True))
            if match:
                # Drop every kind of whitespace used as a thousands separator
                # (plain, non-breaking, narrow), not only the ASCII space.
                price = _try_int("".join(match.group(1).split()))

    # Image.
    img_url = None
    img_el = card.select_one(
        "img.catalog-product__image, img.loaded-product__image, img[data-src], img[src]"
    )
    if img_el:
        raw_img = img_el.get("data-src") or img_el.get("src") or img_el.get("data-original")
        if raw_img:
            # Also makes "//cdn/..." and "/path" image URLs absolute.
            img_url = urljoin(_BASE_URL, raw_img)

    # Rating.
    rating = None
    rating_el = card.select_one(".catalog-product__rating, [data-rating]")
    if rating_el:
        rating = _try_float(rating_el.get("data-rating") or rating_el.get_text(strip=True))

    # Review count: prefer the data attribute the selector matches on, and
    # fall back to the first number in the element text.
    reviews = None
    reviews_el = card.select_one(".catalog-product__reviews, [data-reviews]")
    if reviews_el:
        reviews = _try_int(reviews_el.get("data-reviews"))
        if reviews is None:
            match = re.search(r"\d+", reviews_el.get_text(" ", strip=True))
            if match:
                reviews = int(match.group(0))

    return {
        "title": title,
        "url": url,
        "image_url": img_url,
        "price_min_rub": price,
        "price_max_rub": price,  # DNS shows a single price
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": 1,
        "specs": {},
        "source": "dns",
    }


def _try_int(v: Any) -> int | None:
    """Best-effort conversion of a price-like value to ``int``.

    All whitespace is removed (including non-breaking and narrow spaces used
    as thousands separators) and a decimal comma is treated as a decimal
    point; the fractional part is truncated.

    Args:
        v: Any value; it is converted with ``str()`` before parsing.

    Returns:
        The integer value, or ``None`` when ``v`` is ``None`` or cannot be
        parsed as a finite number.
    """
    if v is None:
        return None
    try:
        # str.split() with no argument splits on any Unicode whitespace.
        compact = "".join(str(v).split()).replace(",", ".")
        # The price may be a string such as "79990" or "79990.00".
        return int(float(compact))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) — e.g. "inf" or "1e999".
        return None


def _try_float(v: Any) -> float | None:
    """Best-effort conversion of a value to ``float``.

    The value is stringified, trimmed, and a decimal comma is replaced with a
    decimal point before parsing.

    Returns:
        The parsed float, or ``None`` when ``v`` is ``None`` or unparsable.
    """
    if v is not None:
        try:
            normalized = str(v).strip().replace(",", ".")
            return float(normalized)
        except (ValueError, TypeError):
            pass
    return None


def _try_json(s: str) -> Any:
    """Parse ``s`` as JSON, returning ``None`` instead of raising on bad input."""
    from json import loads  # local import, as in the rest of this helper's history

    try:
        parsed = loads(s)
    except (ValueError, TypeError):
        parsed = None
    return parsed


def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5) -> list[dict[str, Any]]:
    """Enrich a list of models with data scraped from DNS.

    Args:
        models: Model dicts (produced by the AI step) carrying either a
            ``search_query`` field or ``brand`` + ``model`` fields.
        delay_sec: Polite pause, in seconds, between two consecutive DNS
            requests; ``<= 0`` disables it.

    Returns:
        New dicts — the input models are not mutated — each with an extra
        ``dns`` key holding the first search result, or ``None`` when there
        is no usable query, no result, or the search raised.
    """
    enriched: list[dict[str, Any]] = []
    request_made = False
    for m in models:
        q = _model_query(m)
        if not q:
            enriched.append({**m, "dns": None})
            continue
        # Pause only between real requests, so query-less models and the end
        # of the list never cost a sleep.
        if request_made and delay_sec > 0:
            time.sleep(delay_sec)
        request_made = True
        try:
            results = search_dns(q, limit=1)
        except Exception as e:
            # Best-effort enrichment: one failed lookup must not abort the batch.
            log.warning("DNS enrich failed for %r: %s", q, e)
            results = []
        enriched.append({**m, "dns": results[0] if results else None})
    return enriched


def _model_query(model: dict[str, Any]) -> str:
    """Return the DNS search query for one model, or ``""`` if none can be built."""
    explicit = model.get("search_query")
    if explicit:
        explicit = str(explicit).strip()
        if explicit:
            return explicit
    # Skip missing/None parts so that None is never rendered as the text "None".
    parts = (
        str(part).strip()
        for part in (model.get("brand"), model.get("model"))
        if part is not None
    )
    return " ".join(part for part in parts if part)