mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 17:04:48 +00:00
backend: working parsers — OZON + Citilink (DOM via Playwright) + WB
DIAGNOSTIC RESULTS:
- OZON: 19 product links via Playwright on a bare VPS IP ✓
- Citilink: 112 data-meta-name Snippets ✓
- Wildberries: JSON API works with delays ✓
- Я.Маркет, DNS: blocked by ASN (need residential proxy)

OZON PARSER:
- Pure Playwright DOM (composer-api dropped — it was blocked)
- Selects a[href*='/product/'], walks up to the card div, extracts title/price/img
- Filters fake 'titles' like Распродажа, Скидка

CITILINK PARSER (new):
- Selects [data-meta-name*='Snippet'] or ProductCard markers
- Fallback chain of multiple title selectors
- Filters out non-product hits

PARSERS/__init__.py:
- DEFAULT_SOURCES = (ozon, citilink, wb) — all work without a proxy
- Я.Маркет, DNS kept but not default — usable once a residential proxy is added

NEW ENDPOINT:
- GET /api/parse_citilink?q=...&limit=N
This commit is contained in:
parent
5fdae262ef
commit
e8b487891f
@ -13,7 +13,7 @@ from .config import get_config
|
|||||||
from .auth import verify_init_data
|
from .auth import verify_init_data
|
||||||
from . import sheets, ai, telegram as tg, proxy_pool
|
from . import sheets, ai, telegram as tg, proxy_pool
|
||||||
from . import parsers
|
from . import parsers
|
||||||
from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym
|
from .parsers import dns as parser_dns, wb as parser_wb, ozon as parser_ozon, yamarket as parser_ym, citilink as parser_cl
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
||||||
log = logging.getLogger("zov.backend")
|
log = logging.getLogger("zov.backend")
|
||||||
@ -195,6 +195,17 @@ def api_parse_yamarket(q: str = "", limit: int = 3):
|
|||||||
return {"ok": False, "error": str(e), "query": q}
|
return {"ok": False, "error": str(e), "query": q}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/parse_citilink")
def api_parse_citilink(q: str = "", limit: int = 3):
    """Search Citilink for ``q`` and return the parsed product cards.

    Args:
        q: Search text. An empty value short-circuits with ``missing_query``.
        limit: Requested number of results; clamped to the range 1..10
            before it is passed to the parser.

    Returns:
        ``{"error": "missing_query"}`` when ``q`` is empty,
        ``{"ok": True, "query", "count", "results"}`` on success, or
        ``{"ok": False, "error", "query"}`` when the parser raises.
    """
    if not q:
        return {"error": "missing_query"}

    capped_limit = min(max(1, limit), 10)
    try:
        results = parser_cl.search_citilink(q, limit=capped_limit)
        return {"ok": True, "query": q, "count": len(results), "results": results}
    except Exception as e:
        # Endpoint boundary: the caller still gets a JSON error payload, but
        # the traceback is now recorded instead of being silently dropped.
        log.exception("parse_citilink failed for query=%r", q)
        return {"ok": False, "error": str(e), "query": q}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/parse_all")
|
@app.get("/api/parse_all")
|
||||||
def api_parse_all(q: str = ""):
|
def api_parse_all(q: str = ""):
|
||||||
"""Спрашивает все источники и возвращает агрегированный результат."""
|
"""Спрашивает все источники и возвращает агрегированный результат."""
|
||||||
|
|||||||
@ -32,14 +32,22 @@ from .dns import search_dns
|
|||||||
from .wb import search_wb
|
from .wb import search_wb
|
||||||
from .ozon import search_ozon
|
from .ozon import search_ozon
|
||||||
from .yamarket import search_yamarket
|
from .yamarket import search_yamarket
|
||||||
|
from .citilink import search_citilink
|
||||||
|
|
||||||
log = logging.getLogger("zov.parser")
|
log = logging.getLogger("zov.parser")
|
||||||
|
|
||||||
__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket",
|
__all__ = ["search_dns", "search_wb", "search_ozon", "search_yamarket", "search_citilink",
|
||||||
"enrich_one", "enrich_models"]
|
"enrich_one", "enrich_models"]
|
||||||
|
|
||||||
|
# Источники по умолчанию (работают с DC-IP без прокси):
|
||||||
|
# - ozon, citilink: Playwright
|
||||||
|
# - wb: прямой JSON API (с задержкой)
|
||||||
|
# Опциональные (требуют residential proxy):
|
||||||
|
# - yamarket, dns
|
||||||
|
DEFAULT_SOURCES = ("ozon", "citilink", "wb")
|
||||||
|
|
||||||
def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> dict[str, Any]:
|
|
||||||
|
def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
|
||||||
"""Спрашивает все указанные источники и объединяет лучшее в единый отчёт.
|
"""Спрашивает все указанные источники и объединяет лучшее в единый отчёт.
|
||||||
|
|
||||||
Возвращает:
|
Возвращает:
|
||||||
@ -59,8 +67,9 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -
|
|||||||
"""
|
"""
|
||||||
fetchers = {
|
fetchers = {
|
||||||
"wb": lambda: _safe_first(search_wb, query),
|
"wb": lambda: _safe_first(search_wb, query),
|
||||||
"yamarket": lambda: _safe_first(search_yamarket, query),
|
|
||||||
"ozon": lambda: _safe_first(search_ozon, query),
|
"ozon": lambda: _safe_first(search_ozon, query),
|
||||||
|
"citilink": lambda: _safe_first(search_citilink, query),
|
||||||
|
"yamarket": lambda: _safe_first(search_yamarket, query),
|
||||||
"dns": lambda: _safe_first(search_dns, query),
|
"dns": lambda: _safe_first(search_dns, query),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -87,8 +96,8 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -
|
|||||||
stores = items["yamarket"]["stores_count"]
|
stores = items["yamarket"]["stores_count"]
|
||||||
|
|
||||||
best_url = None
|
best_url = None
|
||||||
# Приоритет: yamarket (агрегатор) → wb → ozon → dns
|
# Приоритет: ozon → citilink → wb → yamarket → dns
|
||||||
for src in ("yamarket", "wb", "ozon", "dns"):
|
for src in ("ozon", "citilink", "wb", "yamarket", "dns"):
|
||||||
i = items.get(src)
|
i = items.get(src)
|
||||||
if i and i.get("url"):
|
if i and i.get("url"):
|
||||||
best_url = i["url"]
|
best_url = i["url"]
|
||||||
@ -107,7 +116,7 @@ def enrich_one(query: str, sources: tuple = ("wb", "yamarket", "ozon", "dns")) -
|
|||||||
|
|
||||||
|
|
||||||
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5,
|
def enrich_models(models: list[dict[str, Any]], delay_sec: float = 0.5,
|
||||||
sources: tuple = ("wb", "yamarket", "ozon", "dns")) -> list[dict[str, Any]]:
|
sources: tuple = DEFAULT_SOURCES) -> list[dict[str, Any]]:
|
||||||
"""Обогащает список моделей от AI данными со всех источников."""
|
"""Обогащает список моделей от AI данными со всех источников."""
|
||||||
enriched: list[dict[str, Any]] = []
|
enriched: list[dict[str, Any]] = []
|
||||||
for i, m in enumerate(models):
|
for i, m in enumerate(models):
|
||||||
|
|||||||
157
backend-py/app/parsers/citilink.py
Normal file
157
backend-py/app/parsers/citilink.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
"""Парсер Citilink (citilink.ru) — через Playwright.
|
||||||
|
|
||||||
|
Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует
|
||||||
|
прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from . import playwright_engine
|
||||||
|
|
||||||
|
log = logging.getLogger("zov.parser.citilink")
|
||||||
|
|
||||||
|
_BASE_URL = "https://www.citilink.ru"
|
||||||
|
_SEARCH_URL = "https://www.citilink.ru/search/"
|
||||||
|
_PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽|(\d[\d\s ]+)\s*руб")
|
||||||
|
|
||||||
|
|
||||||
|
def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
                    max_retries: int = 1) -> list[dict[str, Any]]:
    """Search Citilink for *query* via Playwright and parse the result page.

    The search page is fetched up to ``max_retries + 1`` times; the first
    non-empty HTML document is handed to ``_parse_html``.

    Args:
        query: Search text; URL-encoded into the ``text`` query parameter.
        limit: Maximum number of items to return.
        timeout: Per-attempt page timeout in seconds (converted to ms).
        max_retries: Number of additional attempts after the first one.

    Returns:
        The parsed items, or an empty list when no attempt produced HTML.
    """
    search_page = f"{_SEARCH_URL}?text={quote_plus(query)}"

    for _ in range(max_retries + 1):
        page_html = playwright_engine.fetch_page(
            search_page,
            wait_selector="[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']",
            wait_ms=4000,
            timeout_ms=int(timeout * 1000),
        )
        if page_html:
            # First usable document wins; no further attempts are made.
            return _parse_html(page_html, limit=limit)

    log.warning("Citilink: no HTML for query=%r", query)
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Parse a rendered Citilink search page into at most *limit* items.

    Card candidates are looked up with three selectors in order; the first
    selector that matches anything supplies the whole candidate list.

    Args:
        html: Rendered HTML of the search page.
        limit: Maximum number of items to return.

    Returns:
        Items produced by ``_extract_card``, one per distinct product URL.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Product cards
    cards = (
        soup.select("[data-meta-name*='Snippet']")
        or soup.select("[data-meta-name*='ProductCard']")
        or soup.select("div.ProductCardHorizontal")
    )

    results: list[dict[str, Any]] = []
    seen_urls: set[str] = set()

    for card in cards:
        if len(results) >= limit:
            break
        item = _extract_card(card)
        if not item:
            continue

        # The substring selectors also match elements nested inside a card
        # (e.g. "Snippet__title"), so the same product can be extracted more
        # than once. Keep only the first hit per URL, ignoring the query part.
        dedup_key = (item.get("url") or "").split("?", 1)[0]
        if dedup_key:
            if dedup_key in seen_urls:
                continue
            seen_urls.add(dedup_key)

        results.append(item)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _digits_to_int(raw: str) -> int | None:
    """Convert a digit run such as ``"12 990"`` to an int, or None on failure.

    All whitespace is removed first, so any separator that ``\\s`` matched in
    the calling regex (regular, non-breaking or thin spaces) is handled.
    """
    compact = re.sub(r"\s+", "", raw)
    try:
        return int(compact)
    except ValueError:
        return None


def _extract_card(card) -> dict[str, Any] | None:
    """Extract title, URL, price, image, rating and review count from a card.

    Args:
        card: A parsed element for one search-result card (as produced by
            BeautifulSoup's ``select``).

    Returns:
        A dict with the keys ``title``, ``url``, ``image_url``,
        ``price_min_rub``, ``price_max_rub``, ``rating``, ``reviews_count``,
        ``stores_count``, ``specs`` and ``source``; or None when the element
        has no product/promo link or no usable title.
    """
    # Local import: only this block needs urljoin.
    from urllib.parse import urljoin

    site_root = f"{_BASE_URL}/"

    # Product link
    link = card.select_one("a[href*='/product/']") or card.find("a", href=True)
    if not link:
        return None
    href = link.get("href") or ""
    if "/product/" not in href and "/promo/" not in href:
        return None
    # urljoin keeps absolute URLs untouched and also resolves
    # protocol-relative ("//host/...") and relative hrefs correctly.
    url = urljoin(site_root, href)

    # Title: Citilink uses different markup variants, so try several selectors.
    # A candidate is accepted only if it is longer than 5 characters; a short
    # candidate must not block the fallback scan below.
    title = ""
    for sel in (
        "[data-meta-name*='Snippet__title']",
        "[data-meta-name*='ProductCardHorizontal__title']",
        "a[href*='/product/'] span",
        "a[title]",
    ):
        el = card.select_one(sel)
        if el is None:
            continue
        candidate = (el.get("title") or el.get_text(strip=True)).strip()
        if len(candidate) > 5:
            title = candidate
            break
    if not title:
        # Fallback: the first reasonably long text in the card that does not
        # look like a price or a discount badge.
        for node in card.find_all(["span", "div"]):
            text = node.get_text(strip=True)
            if 15 < len(text) < 200 and "₽" not in text and "%" not in text:
                title = text
                break
    if not title:
        return None

    full_text = card.get_text(" ", strip=True)

    # Price: first amount within sane bounds.
    price = None
    for m in _PRICE_RE.finditer(full_text):
        value = _digits_to_int(m.group(1) or m.group(2) or "")
        if value is not None and 100 < value < 10_000_000:
            price = value
            break

    # Image: prefer the first attribute that holds a real URL, so a lazy-load
    # "data:" placeholder in src does not hide the actual data-src.
    img_url = None
    img_el = card.find("img")
    if img_el:
        for attr in ("src", "data-src"):
            src = img_el.get(attr) or ""
            if src and "data:image" not in src:
                img_url = urljoin(site_root, src)
                break

    # Rating: "4.7 (1242 ...)" or "4,7 1242 ...". The lookbehind stops a match
    # from starting inside a longer number ("15.6"); the lookahead requires an
    # opening parenthesis or a digit next without consuming it.
    rating = None
    for m in re.finditer(r"(?<![\d.,])(\d[.,]\d)(?=\s*[(\d])", full_text):
        value = float(m.group(1).replace(",", "."))
        if 0 < value <= 5.0:
            rating = value
            break

    # Reviews: the lookbehind prevents the fractional digit of a preceding
    # rating ("4.7 1242 отзыва") from being glued onto the count.
    reviews = None
    m = re.search(r"(?<![\d.,])(\d[\d\s]*)\s*(?:отзыв|оценок)", full_text)
    if m:
        reviews = _digits_to_int(m.group(1))

    return {
        "title": title[:250],
        "url": url,
        "image_url": img_url,
        "price_min_rub": price,
        "price_max_rub": None,
        "rating": rating,
        "reviews_count": reviews,
        "stores_count": None,
        "specs": {},
        "source": "citilink",
    }
|
||||||
@ -1,9 +1,9 @@
|
|||||||
"""Парсер OZON — через composer-api (внутренний JSON API сайта).
|
"""Парсер OZON — через Playwright (рендер JS).
|
||||||
|
|
||||||
OZON отдаёт JSON через `/api/composer-api.bx/page/json/v2?url=/search/?text=…`.
|
OZON блокирует прямой HTTP с DC-IP (403/307). С Playwright рендерит обычную
|
||||||
JSON содержит вложенные виджеты — нас интересует `widgetStates.searchResults...`.
|
HTML-страницу `/search/?text=…`, в которой есть карточки `a[href*='/product/']`.
|
||||||
|
|
||||||
Без прокси возвращает 307/403. Через резидентный РФ-IP проходит.
|
В карточке: название, цена, картинка, рейтинг, отзывы — в DOM рядом со ссылкой.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import logging
|
import logging
|
||||||
@ -11,249 +11,141 @@ import re
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
import httpx
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .. import proxy_pool
|
|
||||||
from . import playwright_engine
|
from . import playwright_engine
|
||||||
|
|
||||||
log = logging.getLogger("zov.parser.ozon")
|
log = logging.getLogger("zov.parser.ozon")
|
||||||
|
|
||||||
_BASE_URL = "https://www.ozon.ru"
|
_BASE_URL = "https://www.ozon.ru"
|
||||||
_API_URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2"
|
_PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽")
|
||||||
|
|
||||||
_HEADERS = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
|
||||||
"Accept": "application/json",
|
|
||||||
"Accept-Language": "ru-RU,ru;q=0.9",
|
|
||||||
"x-o3-app-name": "dweb_client",
|
|
||||||
"x-o3-app-version": "release_18.04",
|
|
||||||
"x-o3-page-type": "search",
|
|
||||||
"Referer": "https://www.ozon.ru/",
|
|
||||||
}
|
|
||||||
|
|
||||||
_PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
|
|
||||||
|
|
||||||
|
|
||||||
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
def search_ozon(query: str, limit: int = 3, timeout: float = 30.0,
|
||||||
max_retries: int = 1, use_playwright: bool = True) -> list[dict[str, Any]]:
|
max_retries: int = 1) -> list[dict[str, Any]]:
|
||||||
"""Поиск товара в OZON.
|
"""Поиск товара в OZON через Playwright."""
|
||||||
|
url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
|
||||||
|
|
||||||
Сначала пробуем composer-api JSON (быстро), при challenge — Playwright (медленно но точно).
|
html = None
|
||||||
"""
|
|
||||||
# Путь 1: быстрый composer-api
|
|
||||||
url_param = f"/search/?text={quote_plus(query)}&from_global=true"
|
|
||||||
params = {"url": url_param}
|
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
try:
|
|
||||||
with proxy_pool.proxied_client(timeout=timeout, headers=_HEADERS,
|
|
||||||
follow_redirects=False) as client:
|
|
||||||
resp = client.get(_API_URL, params=params)
|
|
||||||
if resp.status_code == 200:
|
|
||||||
try:
|
|
||||||
return _extract_products(resp.json(), limit=limit)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
log.debug("OZON composer-api attempt %d: status=%s", attempt + 1, resp.status_code)
|
|
||||||
except httpx.HTTPError as e:
|
|
||||||
log.debug("OZON composer-api err: %s", e)
|
|
||||||
|
|
||||||
# Путь 2: Playwright (рендерим обычную HTML-страницу поиска)
|
|
||||||
if not use_playwright:
|
|
||||||
return []
|
|
||||||
log.info("OZON falling back to Playwright for query=%r", query)
|
|
||||||
page_url = f"{_BASE_URL}/search/?text={quote_plus(query)}"
|
|
||||||
html = playwright_engine.fetch_page(
|
html = playwright_engine.fetch_page(
|
||||||
page_url,
|
url,
|
||||||
wait_selector="a[href*='/product/'], [data-widget='searchResultsV2']",
|
wait_selector="a[href*='/product/']",
|
||||||
wait_ms=3500,
|
wait_ms=4000,
|
||||||
timeout_ms=int(timeout * 1000),
|
timeout_ms=int(timeout * 1000),
|
||||||
)
|
)
|
||||||
|
if html:
|
||||||
|
break
|
||||||
|
|
||||||
if not html:
|
if not html:
|
||||||
|
log.warning("OZON: no HTML for query=%r", query)
|
||||||
return []
|
return []
|
||||||
return _parse_html_via_dom(html, limit=limit)
|
if "robotcheck" in html.lower() or "challenge" in html.lower()[:5000]:
|
||||||
|
log.warning("OZON: anti-bot challenge for query=%r", query)
|
||||||
|
return []
|
||||||
|
|
||||||
|
return _parse_html(html, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
def _parse_html_via_dom(html: str, limit: int) -> list[dict[str, Any]]:
|
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||||
"""Fallback: парсим товары из отрендеренного Chrome HTML."""
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
seen = set()
|
|
||||||
results: list[dict[str, Any]] = []
|
results: list[dict[str, Any]] = []
|
||||||
|
seen_urls = set()
|
||||||
|
|
||||||
for link in soup.select("a[href*='/product/']"):
|
# Находим все ссылки на товары
|
||||||
|
product_links = soup.select("a[href*='/product/']")
|
||||||
|
|
||||||
|
for link in product_links:
|
||||||
if len(results) >= limit:
|
if len(results) >= limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
href = link.get("href") or ""
|
href = link.get("href") or ""
|
||||||
if href in seen:
|
# Нормализация URL — убираем query params для дедупа
|
||||||
|
url_clean = href.split("?")[0]
|
||||||
|
if url_clean in seen_urls:
|
||||||
continue
|
continue
|
||||||
seen.add(href)
|
seen_urls.add(url_clean)
|
||||||
|
|
||||||
# Поднимаемся до карточки
|
full_url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
||||||
card = link.find_parent("div") or link
|
|
||||||
title = link.get_text(strip=True) or (card.select_one("span") or {}).get_text(strip=True) if hasattr(card.select_one("span"), "get_text") else ""
|
# Поднимаемся до карточки — у OZON это обычно ближайший div с tile-* классом
|
||||||
if not title or len(title) < 5:
|
card = (
|
||||||
|
link.find_parent("div", class_=re.compile("tile|search-item|product"))
|
||||||
|
or link.find_parent("div")
|
||||||
|
)
|
||||||
|
if not card:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
item = _extract_from_card(card, full_url, link)
|
||||||
url = url.split("?")[0]
|
if item and item.get("title") and len(item["title"]) > 5:
|
||||||
|
results.append(item)
|
||||||
# Цена в ближайшем родителе
|
|
||||||
price = None
|
|
||||||
price_card = link.find_parent("div", recursive=True)
|
|
||||||
if price_card:
|
|
||||||
txt = price_card.get_text(" ", strip=True)
|
|
||||||
m = _PRICE_RE.search(txt)
|
|
||||||
if m:
|
|
||||||
price = _try_int(m.group(1).replace(" ", ""))
|
|
||||||
|
|
||||||
# Картинка в карточке
|
|
||||||
img = None
|
|
||||||
img_el = card.find("img") if card else None
|
|
||||||
if img_el:
|
|
||||||
src = img_el.get("src") or ""
|
|
||||||
if src.startswith("//"):
|
|
||||||
src = "https:" + src
|
|
||||||
if src and "data:image" not in src:
|
|
||||||
img = src
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
"title": title[:200],
|
|
||||||
"url": url,
|
|
||||||
"image_url": img,
|
|
||||||
"price_min_rub": price,
|
|
||||||
"price_max_rub": None,
|
|
||||||
"rating": None,
|
|
||||||
"reviews_count": None,
|
|
||||||
"stores_count": None,
|
|
||||||
"specs": {},
|
|
||||||
"source": "ozon",
|
|
||||||
})
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _try_int(v: Any) -> int | None:
|
def _extract_from_card(card, url: str, link_el) -> dict[str, Any] | None:
|
||||||
if v is None:
|
"""Достаём данные из карточки OZON: title, price, image, rating, reviews."""
|
||||||
return None
|
full_text = card.get_text(" ", strip=True)
|
||||||
try:
|
|
||||||
return int(float(str(v).replace(" ", "").replace(",", ".")))
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
# Название — может быть прямо в ссылке, либо в соседнем span
|
||||||
def _extract_products(data: dict, limit: int) -> list[dict[str, Any]]:
|
title = link_el.get("title") or link_el.get_text(strip=True) or ""
|
||||||
"""OZON прячет данные в widgetStates — ищем все ключи с 'searchResultsV2'."""
|
if not title or len(title) < 5 or title in ("Распродажа", "Скидка", "Топ"):
|
||||||
widget_states = data.get("widgetStates") or {}
|
# Ищем во вложенных span — обычно длинные строки = название
|
||||||
products: list[dict[str, Any]] = []
|
spans = card.find_all("span")
|
||||||
|
for s in spans:
|
||||||
for key, raw in widget_states.items():
|
t = s.get_text(strip=True)
|
||||||
if "searchResultsV2" not in key and "skuGrid" not in key and "searchCategories" not in key:
|
if t and len(t) > 15 and len(t) < 200 and "₽" not in t and "%" not in t:
|
||||||
continue
|
title = t
|
||||||
try:
|
|
||||||
import json as _j
|
|
||||||
w = _j.loads(raw) if isinstance(raw, str) else raw
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
items = w.get("items") or w.get("products") or []
|
|
||||||
for it in items:
|
|
||||||
if len(products) >= limit:
|
|
||||||
break
|
break
|
||||||
item = _build_item(it)
|
title = title.strip()
|
||||||
if item:
|
if not title or len(title) < 5:
|
||||||
products.append(item)
|
|
||||||
if len(products) >= limit:
|
|
||||||
break
|
|
||||||
|
|
||||||
return products
|
|
||||||
|
|
||||||
|
|
||||||
def _build_item(it: dict[str, Any]) -> dict[str, Any] | None:
|
|
||||||
"""Парсит карточку товара из OZON widget items[]."""
|
|
||||||
# Структура: { mainState: [...], action: { link: '/product/...' }, images: [...] }
|
|
||||||
sku = it.get("sku") or it.get("id")
|
|
||||||
if not sku:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
link = (it.get("action") or {}).get("link") or ""
|
# Цена — первое число с ₽ в карточке (минимальная)
|
||||||
url = f"{_BASE_URL}{link}" if link.startswith("/") else link
|
price = None
|
||||||
|
m = _PRICE_RE.search(full_text)
|
||||||
|
if m:
|
||||||
|
raw = m.group(1).replace(" ", "").replace(" ", "").replace(" ", "")
|
||||||
|
try:
|
||||||
|
price = int(raw)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Картинка
|
# Картинка
|
||||||
image_url = None
|
img_url = None
|
||||||
imgs = it.get("images") or it.get("tileImage") or []
|
img_el = card.find("img")
|
||||||
if isinstance(imgs, list) and imgs:
|
if img_el:
|
||||||
first = imgs[0]
|
src = img_el.get("src") or img_el.get("data-src") or ""
|
||||||
image_url = first if isinstance(first, str) else (first.get("image") or first.get("src"))
|
if src and "data:image" not in src:
|
||||||
if not image_url:
|
if src.startswith("//"):
|
||||||
ti = it.get("tileImage") or {}
|
src = "https:" + src
|
||||||
if isinstance(ti, dict):
|
img_url = src
|
||||||
items = ti.get("items") or []
|
|
||||||
for x in items:
|
|
||||||
if isinstance(x, dict) and x.get("image"):
|
|
||||||
image_url = x["image"].get("link") or x["image"].get("src")
|
|
||||||
break
|
|
||||||
|
|
||||||
# Цена и название — берём из mainState текстовых атомов
|
# Рейтинг (если есть на карточке — иногда показывают)
|
||||||
title = ""
|
|
||||||
price_min = None
|
|
||||||
price_max = None
|
|
||||||
rating = None
|
rating = None
|
||||||
|
m = re.search(r"(\d[.,]\d)\s*\(?\d", full_text) # "4.7 (1242 отзыва)"
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
rating = float(m.group(1).replace(",", "."))
|
||||||
|
if rating > 5.0:
|
||||||
|
rating = None # видимо не рейтинг
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
reviews = None
|
reviews = None
|
||||||
|
m = re.search(r"(\d[\d\s ]*)\s*(?:отзыв|оценок|review)", full_text, re.I)
|
||||||
for atom in (it.get("mainState") or []):
|
|
||||||
atom_id = atom.get("id") or ""
|
|
||||||
atom_type = atom.get("type") or ""
|
|
||||||
|
|
||||||
if atom_type == "textAtom":
|
|
||||||
text = ((atom.get("textAtom") or {}).get("text") or "").strip()
|
|
||||||
if "name" in atom_id.lower() and not title:
|
|
||||||
title = re.sub(r"<[^>]+>", "", text)
|
|
||||||
elif "price" in atom_id.lower():
|
|
||||||
m = _PRICE_RE.search(text)
|
|
||||||
if m and not price_min:
|
|
||||||
price_min = int(m.group(1).replace(" ", "").replace(" ", ""))
|
|
||||||
|
|
||||||
elif atom_type == "priceV2":
|
|
||||||
pv = atom.get("priceV2") or {}
|
|
||||||
for price_obj in (pv.get("price") or []):
|
|
||||||
t = (price_obj.get("text") or "").strip()
|
|
||||||
m = _PRICE_RE.search(t)
|
|
||||||
if m:
|
if m:
|
||||||
val = int(m.group(1).replace(" ", "").replace(" ", ""))
|
try:
|
||||||
if price_min is None or val < price_min:
|
reviews = int(m.group(1).replace(" ", "").replace(" ", "").replace(" ", ""))
|
||||||
price_min = val
|
except ValueError:
|
||||||
if price_max is None or val > price_max:
|
pass
|
||||||
price_max = val
|
|
||||||
|
|
||||||
elif atom_type == "labelList":
|
|
||||||
for lbl in ((atom.get("labelList") or {}).get("items") or []):
|
|
||||||
t = (lbl.get("title") or "").strip()
|
|
||||||
# Рейтинг типа "4.7"
|
|
||||||
if re.fullmatch(r"\d\.\d", t):
|
|
||||||
rating = float(t)
|
|
||||||
# Отзывы типа "1242 отзыва"
|
|
||||||
m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", t)
|
|
||||||
if m:
|
|
||||||
reviews = int(m.group(1).replace(" ", ""))
|
|
||||||
|
|
||||||
if not title:
|
|
||||||
# Резервный фолбэк — могут быть атомы в otherState
|
|
||||||
for atom in (it.get("otherState") or []):
|
|
||||||
text = ((atom.get("textAtom") or {}).get("text") or "").strip()
|
|
||||||
if text and len(text) > 5:
|
|
||||||
title = re.sub(r"<[^>]+>", "", text)
|
|
||||||
break
|
|
||||||
|
|
||||||
if not title:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": title,
|
"title": title[:250],
|
||||||
"url": url,
|
"url": url,
|
||||||
"image_url": image_url,
|
"image_url": img_url,
|
||||||
"price_min_rub": price_min,
|
"price_min_rub": price,
|
||||||
"price_max_rub": price_max if price_max and price_max != price_min else None,
|
"price_max_rub": None,
|
||||||
"rating": rating,
|
"rating": rating,
|
||||||
"reviews_count": reviews,
|
"reviews_count": reviews,
|
||||||
"stores_count": None,
|
"stores_count": None,
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user