mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 18:24:49 +00:00
yamarket: rewrite for /card/{slug}/{id} URL pattern (Я.Маркет 2026)
- Old /product--{id} URLs deprecated
- Walks up from each a[href*='/card/'] link to the nearest article, div[data-zone-name], or (as a fallback) the nearest plain div
- Extracts title from link text or h2/h3/itemprop=name
- Price: minimum ₽ amount found in the card text (sanity bounds: 100 < price < 10M, both exclusive)
- Image: skips placeholder images whose URL contains yastatic or _next/static
- Rating: '4.7★' or '4.7 N оценок' pattern
- Reviews: 'N отзывов' / 'N оценок'
- Stores count: 'от N магазинов / предложений' (the preposition 'в' is accepted as well)
This commit is contained in:
parent
b1d8f3e38a
commit
839e775151
@ -24,16 +24,20 @@ _PRICE_RE = re.compile(r"([\d\s]+)\s*₽")
|
|||||||
|
|
||||||
def search_yamarket(query: str, limit: int = 3, timeout: float = 30.0,
|
def search_yamarket(query: str, limit: int = 3, timeout: float = 30.0,
|
||||||
max_retries: int = 1) -> list[dict[str, Any]]:
|
max_retries: int = 1) -> list[dict[str, Any]]:
|
||||||
"""Поиск товара в Я.Маркете через headless Chromium."""
|
"""Поиск товара в Я.Маркете через headless Chromium + residential proxy.
|
||||||
|
|
||||||
|
Я.Маркет (2025-2026) использует URL pattern `/card/{slug}/{productId}`.
|
||||||
|
Старые URL `/product--` больше не применяются.
|
||||||
|
"""
|
||||||
url = f"{_BASE_URL}/search?text={quote_plus(query)}"
|
url = f"{_BASE_URL}/search?text={quote_plus(query)}"
|
||||||
|
|
||||||
html = None
|
html = None
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
html = playwright_engine.fetch_page(
|
html = playwright_engine.fetch_page(
|
||||||
url,
|
url,
|
||||||
# Ждём появления товарных ссылок или контейнера выдачи
|
# Ждём появления товарных ссылок /card/...
|
||||||
wait_selector="a[href*='/product--'], [data-auto='SerpItem'], [data-zone-name='snippet-card']",
|
wait_selector="a[href*='/card/']",
|
||||||
wait_ms=3500,
|
wait_ms=5000,
|
||||||
timeout_ms=int(timeout * 1000),
|
timeout_ms=int(timeout * 1000),
|
||||||
)
|
)
|
||||||
if html:
|
if html:
|
||||||
@ -46,128 +50,135 @@ def search_yamarket(query: str, limit: int = 3, timeout: float = 30.0,
|
|||||||
if "showcaptcha" in html.lower() or "qrator" in html.lower()[:5000]:
|
if "showcaptcha" in html.lower() or "qrator" in html.lower()[:5000]:
|
||||||
log.warning("YaMarket: Qrator/captcha for query=%r", query)
|
log.warning("YaMarket: Qrator/captcha for query=%r", query)
|
||||||
return []
|
return []
|
||||||
|
if "Похоже, вы используете" in html[:30000] or "используете VPN" in html[:30000]:
|
||||||
|
log.warning("YaMarket: VPN warning page for query=%r", query)
|
||||||
|
return []
|
||||||
|
|
||||||
return _parse_html(html, limit=limit)
|
return _parse_html(html, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||||||
|
"""Парсим товары через URL pattern /card/{slug}/{productId} (Я.Маркет 2026)."""
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
results: list[dict[str, Any]] = []
|
results: list[dict[str, Any]] = []
|
||||||
|
seen_ids = set()
|
||||||
|
|
||||||
# Основной селектор — товарные карточки на странице поиска
|
for link in soup.select("a[href*='/card/']"):
|
||||||
candidates = (
|
|
||||||
soup.select("[data-auto='SerpItem']")
|
|
||||||
or soup.select("[data-zone-name='snippet-card']")
|
|
||||||
or soup.select("article[data-baobab-name='card']")
|
|
||||||
or soup.select("article:has(a[href*='/product--'])")
|
|
||||||
)
|
|
||||||
|
|
||||||
for card in candidates:
|
|
||||||
if len(results) >= limit:
|
if len(results) >= limit:
|
||||||
break
|
break
|
||||||
item = _extract_card(card)
|
href = link.get("href") or ""
|
||||||
if item:
|
m_id = re.search(r"/card/[^/]+/(\d+)", href)
|
||||||
results.append(item)
|
if not m_id:
|
||||||
|
|
||||||
# Резерв — собрать по найденным ссылкам product--
|
|
||||||
if not results:
|
|
||||||
seen = set()
|
|
||||||
for a in soup.select("a[href*='/product--']")[:limit * 2]:
|
|
||||||
href = a.get("href") or ""
|
|
||||||
if href in seen:
|
|
||||||
continue
|
continue
|
||||||
seen.add(href)
|
product_id = m_id.group(1)
|
||||||
# Берём родительский article как карточку
|
if product_id in seen_ids:
|
||||||
card = a.find_parent("article") or a.find_parent("div")
|
continue
|
||||||
if card:
|
seen_ids.add(product_id)
|
||||||
item = _extract_card(card)
|
|
||||||
|
full_url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
||||||
|
clean_url = full_url.split("?")[0]
|
||||||
|
|
||||||
|
# Карточка-родитель — article, div с data-zone-name или просто ближайший div
|
||||||
|
card = (
|
||||||
|
link.find_parent("article")
|
||||||
|
or link.find_parent("div", attrs={"data-zone-name": True})
|
||||||
|
or link.find_parent("div")
|
||||||
|
)
|
||||||
|
if not card:
|
||||||
|
continue
|
||||||
|
item = _extract_card(card, link, clean_url)
|
||||||
if item:
|
if item:
|
||||||
results.append(item)
|
results.append(item)
|
||||||
if len(results) >= limit:
|
|
||||||
break
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _extract_card(card) -> dict[str, Any] | None:
|
def _extract_card(card, link_el, url: str) -> dict[str, Any] | None:
|
||||||
"""Достаём заголовок, ссылку, цену, рейтинг, отзывы, фото, кол-во магазинов."""
|
"""Достаём title, price, image, rating, reviews, stores из карточки."""
|
||||||
link_el = (
|
full_text = card.get_text(" ", strip=True)
|
||||||
card.select_one("a[href*='/product--']")
|
|
||||||
or card.select_one("a[data-baobab-name='title']")
|
|
||||||
)
|
|
||||||
if not link_el:
|
|
||||||
return None
|
|
||||||
href = link_el.get("href") or ""
|
|
||||||
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
|
||||||
|
|
||||||
title_el = (
|
# Title — обычно в самой ссылке, либо в h3/h2/span внутри
|
||||||
card.select_one("[data-zone-name='title'] span")
|
title = (link_el.get("title") or link_el.get_text(strip=True) or "").strip()
|
||||||
or card.select_one("h3 span")
|
if not title or len(title) < 5:
|
||||||
or card.select_one("[data-auto='snippet-title']")
|
for sel in ["h3", "h2", "[data-auto='snippet-title']", "span[itemprop='name']"]:
|
||||||
or link_el
|
el = card.select_one(sel)
|
||||||
)
|
if el:
|
||||||
title = title_el.get_text(strip=True) if title_el else (link_el.get_text(strip=True))
|
t = (el.get("title") or el.get_text(strip=True)).strip()
|
||||||
|
if t and len(t) > 5:
|
||||||
|
title = t
|
||||||
|
break
|
||||||
if not title:
|
if not title:
|
||||||
|
# Резерв — длинный текст без цены/рейтинга
|
||||||
|
for s in card.find_all("span"):
|
||||||
|
t = s.get_text(strip=True)
|
||||||
|
if 15 < len(t) < 250 and "₽" not in t and "★" not in t and "отзыв" not in t.lower():
|
||||||
|
title = t
|
||||||
|
break
|
||||||
|
if not title or len(title) < 5:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Цена
|
# Цена — минимальная в карточке
|
||||||
price_min = price_max = None
|
price_min = None
|
||||||
price_el = (
|
for m in _PRICE_RE.finditer(full_text):
|
||||||
card.select_one("[data-auto='snippet-price-current']")
|
raw = m.group(1).replace(" ", "").replace(" ", "").replace(" ", "")
|
||||||
or card.select_one("[data-auto='price-value']")
|
try:
|
||||||
or card.select_one("[class*='Price']")
|
v = int(raw)
|
||||||
)
|
if 100 < v < 10_000_000:
|
||||||
if price_el:
|
if price_min is None or v < price_min:
|
||||||
m = _PRICE_RE.search(price_el.get_text(" ", strip=True))
|
price_min = v
|
||||||
if m:
|
except ValueError:
|
||||||
price_min = _try_int(m.group(1).replace(" ", "").replace(" ", ""))
|
pass
|
||||||
|
|
||||||
# Картинка
|
# Картинка (исключаем placeholder'ы)
|
||||||
img_url = None
|
img_url = None
|
||||||
img_el = card.select_one("img[src], img[srcset]")
|
for img_el in card.find_all("img"):
|
||||||
if img_el:
|
|
||||||
src = img_el.get("src") or img_el.get("data-src") or ""
|
src = img_el.get("src") or img_el.get("data-src") or ""
|
||||||
# Иногда src — заглушка 1x1px, основное в srcset
|
if not src or "data:image" in src:
|
||||||
if "data:image" in src or not src:
|
|
||||||
srcset = img_el.get("srcset") or ""
|
srcset = img_el.get("srcset") or ""
|
||||||
if srcset:
|
if srcset:
|
||||||
src = srcset.split(",")[0].strip().split(" ")[0]
|
src = srcset.split(",")[0].strip().split(" ")[0]
|
||||||
if src.startswith("//"):
|
if src.startswith("//"):
|
||||||
src = "https:" + src
|
src = "https:" + src
|
||||||
if src:
|
if not src or "yastatic" in src or "_next/static" in src:
|
||||||
|
continue
|
||||||
img_url = src
|
img_url = src
|
||||||
|
break
|
||||||
|
|
||||||
# Рейтинг
|
# Рейтинг
|
||||||
rating = None
|
rating = None
|
||||||
rating_el = card.select_one("[data-auto='snippet-rating'], [class*='Rating'] span")
|
m = re.search(r"(\d[.,]\d)(?:\s*★|\s*\(?\d+\s*оцен)", full_text)
|
||||||
if rating_el:
|
|
||||||
rt = rating_el.get_text(strip=True)
|
|
||||||
m = re.search(r"\d[.,]\d", rt)
|
|
||||||
if m:
|
if m:
|
||||||
rating = _try_float(m.group(0))
|
try:
|
||||||
|
r = float(m.group(1).replace(",", "."))
|
||||||
|
if 0 < r <= 5.0:
|
||||||
|
rating = r
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Отзывы
|
# Отзывы
|
||||||
reviews = None
|
reviews = None
|
||||||
reviews_el = card.select_one("[data-auto='snippet-feedback'], a[href*='/reviews']")
|
m = re.search(r"(\d[\d\s ]*)\s*(?:отзыв|оценок|review)", full_text, re.I)
|
||||||
if reviews_el:
|
|
||||||
m = re.search(r"\d[\d\s]*", reviews_el.get_text(" ", strip=True))
|
|
||||||
if m:
|
if m:
|
||||||
reviews = _try_int(m.group(0).replace(" ", ""))
|
try:
|
||||||
|
reviews = int(m.group(1).replace(" ", "").replace(" ", "").replace(" ", ""))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Кол-во магазинов / предложений
|
# Кол-во магазинов / предложений
|
||||||
stores = None
|
stores = None
|
||||||
stores_el = card.select_one("[data-auto='offer-count'], a[href*='/offers']")
|
m = re.search(r"(?:от|в)\s+(\d+)\s+(?:магазин|предложен)", full_text)
|
||||||
if stores_el:
|
|
||||||
m = re.search(r"\d+", stores_el.get_text(" ", strip=True))
|
|
||||||
if m:
|
if m:
|
||||||
stores = int(m.group(0))
|
try:
|
||||||
|
stores = int(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title": title,
|
"title": title[:250],
|
||||||
"url": url,
|
"url": url,
|
||||||
"image_url": img_url,
|
"image_url": img_url,
|
||||||
"price_min_rub": price_min,
|
"price_min_rub": price_min,
|
||||||
"price_max_rub": price_max if price_max and price_max != price_min else None,
|
"price_max_rub": None,
|
||||||
"rating": rating,
|
"rating": rating,
|
||||||
"reviews_count": reviews,
|
"reviews_count": reviews,
|
||||||
"stores_count": stores,
|
"stores_count": stores,
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user