mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:44:47 +00:00
DIAGNOSTIC RESULTS: - OZON: 19 product links via Playwright on naked VPS-IP ✓ - Citilink: 112 data-meta-name Snippets ✓ - Wildberries: JSON API works with delays ✓ - Я.Маркет, DNS: blocked by ASN (need residential proxy) OZON PARSER: - Pure Playwright DOM (composer-api dropped — was blocked) - Selects a[href*='/product/'], walks up to card div, extracts title/price/img - Filters fake 'titles' like Распродажа, Скидка CITILINK PARSER (new): - Selects [data-meta-name*='Snippet'] or ProductCard markers - Multiple title selectors fallback chain - Filters out non-product hits PARSERS/__init__.py: - DEFAULT_SOURCES = (ozon, citilink, wb) — all work without proxy - Я.Маркет, DNS kept but not default — usable when residential proxy added NEW ENDPOINT: - GET /api/parse_citilink?q=...&limit=N
158 lines
4.8 KiB
Python
158 lines
4.8 KiB
Python
"""Парсер Citilink (citilink.ru) — через Playwright.
|
||
|
||
Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует
|
||
прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`.
|
||
"""
|
||
from __future__ import annotations
|
||
import logging
|
||
import re
|
||
from typing import Any
|
||
from urllib.parse import quote_plus
|
||
|
||
from bs4 import BeautifulSoup
|
||
|
||
from . import playwright_engine
|
||
|
||
log = logging.getLogger("zov.parser.citilink")
|
||
|
||
_BASE_URL = "https://www.citilink.ru"
|
||
_SEARCH_URL = "https://www.citilink.ru/search/"
|
||
_PRICE_RE = re.compile(r"(\d[\d\s ]+)\s*₽|(\d[\d\s ]+)\s*руб")
|
||
|
||
|
||
def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
|
||
max_retries: int = 1) -> list[dict[str, Any]]:
|
||
"""Поиск товара на Citilink через Playwright."""
|
||
url = f"{_SEARCH_URL}?text={quote_plus(query)}"
|
||
|
||
html = None
|
||
for attempt in range(max_retries + 1):
|
||
html = playwright_engine.fetch_page(
|
||
url,
|
||
wait_selector="[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']",
|
||
wait_ms=4000,
|
||
timeout_ms=int(timeout * 1000),
|
||
)
|
||
if html:
|
||
break
|
||
|
||
if not html:
|
||
log.warning("Citilink: no HTML for query=%r", query)
|
||
return []
|
||
|
||
return _parse_html(html, limit=limit)
|
||
|
||
|
||
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
|
||
soup = BeautifulSoup(html, "html.parser")
|
||
results: list[dict[str, Any]] = []
|
||
|
||
# Карточки товаров
|
||
cards = (
|
||
soup.select("[data-meta-name*='Snippet']")
|
||
or soup.select("[data-meta-name*='ProductCard']")
|
||
or soup.select("div.ProductCardHorizontal")
|
||
)
|
||
|
||
for card in cards:
|
||
if len(results) >= limit:
|
||
break
|
||
item = _extract_card(card)
|
||
if item:
|
||
results.append(item)
|
||
|
||
return results
|
||
|
||
|
||
def _extract_card(card) -> dict[str, Any] | None:
|
||
"""Достаём title, url, цену, картинку, рейтинг, отзывы."""
|
||
# Ссылка на товар
|
||
link = card.select_one("a[href*='/product/']") or card.find("a", href=True)
|
||
if not link:
|
||
return None
|
||
href = link.get("href") or ""
|
||
if "/product/" not in href and "/promo/" not in href:
|
||
return None
|
||
url = href if href.startswith("http") else f"{_BASE_URL}{href}"
|
||
|
||
# Название
|
||
title = ""
|
||
# Citilink использует разные классы — пробуем несколько
|
||
for sel in [
|
||
"[data-meta-name*='Snippet__title']",
|
||
"[data-meta-name*='ProductCardHorizontal__title']",
|
||
"a[href*='/product/'] span",
|
||
"a[title]",
|
||
]:
|
||
el = card.select_one(sel)
|
||
if el:
|
||
title = (el.get("title") or el.get_text(strip=True)).strip()
|
||
if title and len(title) > 5:
|
||
break
|
||
if not title:
|
||
# Резерв — длинный текст в карточке
|
||
for s in card.find_all(["span", "div"]):
|
||
t = s.get_text(strip=True)
|
||
if t and 15 < len(t) < 200 and "₽" not in t and "%" not in t:
|
||
title = t
|
||
break
|
||
if not title or len(title) < 5:
|
||
return None
|
||
|
||
full_text = card.get_text(" ", strip=True)
|
||
|
||
# Цена
|
||
price = None
|
||
for m in _PRICE_RE.finditer(full_text):
|
||
raw = (m.group(1) or m.group(2) or "").replace(" ", "").replace(" ", "").replace(" ", "")
|
||
try:
|
||
v = int(raw)
|
||
if 100 < v < 10_000_000: # разумные пределы
|
||
price = v
|
||
break
|
||
except ValueError:
|
||
pass
|
||
|
||
# Картинка
|
||
img_url = None
|
||
img_el = card.find("img")
|
||
if img_el:
|
||
src = img_el.get("src") or img_el.get("data-src") or ""
|
||
if src and "data:image" not in src:
|
||
if src.startswith("//"):
|
||
src = "https:" + src
|
||
img_url = src
|
||
|
||
# Рейтинг
|
||
rating = None
|
||
m = re.search(r"(\d[.,]\d)\s*[\\(\\d]", full_text)
|
||
if m:
|
||
try:
|
||
r = float(m.group(1).replace(",", "."))
|
||
if 0 < r <= 5.0:
|
||
rating = r
|
||
except ValueError:
|
||
pass
|
||
|
||
# Отзывы
|
||
reviews = None
|
||
m = re.search(r"(\d[\d\s]*)\s*(?:отзыв|оценок)", full_text)
|
||
if m:
|
||
try:
|
||
reviews = int(m.group(1).replace(" ", "").replace(" ", ""))
|
||
except ValueError:
|
||
pass
|
||
|
||
return {
|
||
"title": title[:250],
|
||
"url": url,
|
||
"image_url": img_url,
|
||
"price_min_rub": price,
|
||
"price_max_rub": None,
|
||
"rating": rating,
|
||
"reviews_count": reviews,
|
||
"stores_count": None,
|
||
"specs": {},
|
||
"source": "citilink",
|
||
}
|