zov-tech/backend-py/app/parsers/citilink.py
wasrusgen e8b487891f backend: working parsers — OZON + Citilink (DOM via Playwright) + WB
DIAGNOSTIC RESULTS:
- OZON: 19 product links via Playwright on naked VPS-IP ✓
- Citilink: 112 data-meta-name Snippets ✓
- Wildberries: JSON API works with delays ✓
- Я.Маркет, DNS: blocked by ASN (need residential proxy)

OZON PARSER:
- Pure Playwright DOM (composer-api dropped — was blocked)
- Selects a[href*='/product/'], walks up to card div, extracts title/price/img
- Filters fake 'titles' like Распродажа, Скидка

CITILINK PARSER (new):
- Selects [data-meta-name*='Snippet'] or ProductCard markers
- Multiple title selectors fallback chain
- Filters out non-product hits

PARSERS/__init__.py:
- DEFAULT_SOURCES = (ozon, citilink, wb) — all work without proxy
- Я.Маркет, DNS kept but not default — usable when residential proxy added

NEW ENDPOINT:
- GET /api/parse_citilink?q=...&limit=N
2026-05-11 13:53:07 +03:00

158 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Парсер Citilink (citilink.ru) — через Playwright.
Citilink — крупный российский магазин электроники. Работает с DC-IP, не требует
прокси. Карточки помечены `data-meta-name=ProductCard...` или `data-meta-name=Snippet...`.
"""
from __future__ import annotations
import logging
import re
from typing import Any
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from . import playwright_engine
# Module-level logger for this parser.
log = logging.getLogger("zov.parser.citilink")

# Site root, used to build absolute URLs, and the search endpoint.
_BASE_URL = "https://www.citilink.ru"
_SEARCH_URL = f"{_BASE_URL}/search/"

# Price followed by a currency marker ("₽" -> group 1, "руб" -> group 2).
# The number is matched as thousands groups ("12 990", "12990", "1 299") so
# that an unrelated number standing right before the price (article code,
# review count) is not glued onto it.  `\s` already covers NBSP / thin /
# narrow no-break spaces on str patterns, so no literal space characters are
# needed.  The lookbehind stops a match from starting in the middle of another
# number such as the fractional digit of a rating ("4.8 12 990 ₽").
_PRICE_RE = re.compile(
    r"(?<![\d.,])(\d{1,3}(?:\s?\d{3})*)\s*₽"
    r"|(?<![\d.,])(\d{1,3}(?:\s?\d{3})*)\s*руб"
)
def search_citilink(query: str, limit: int = 3, timeout: float = 30.0,
                    max_retries: int = 1) -> list[dict[str, Any]]:
    """Search Citilink for *query* via Playwright and return parsed products.

    The search page is fetched up to ``max_retries + 1`` times; the first
    non-empty HTML response is handed to ``_parse_html``.

    Args:
        query: Free-text search string; URL-encoded into the ``text`` parameter.
        limit: Maximum number of products to return.
        timeout: Per-attempt page timeout in seconds (passed on in milliseconds).
        max_retries: Number of additional fetch attempts after the first one.

    Returns:
        A list of product dicts, or an empty list when no HTML was obtained.
    """
    search_url = f"{_SEARCH_URL}?text={quote_plus(query)}"

    page_html = None
    attempts_left = max_retries + 1
    while attempts_left > 0 and not page_html:
        attempts_left -= 1
        page_html = playwright_engine.fetch_page(
            search_url,
            wait_selector="[data-meta-name*='Snippet'], [data-meta-name*='ProductCard']",
            wait_ms=4000,
            timeout_ms=int(timeout * 1000),
        )

    if page_html:
        return _parse_html(page_html, limit=limit)

    log.warning("Citilink: no HTML for query=%r", query)
    return []
def _parse_html(html: str, limit: int) -> list[dict[str, Any]]:
    """Extract up to *limit* distinct products from a search results page.

    Args:
        html: Raw HTML of the search page.
        limit: Maximum number of products to return; values <= 0 yield ``[]``.

    Returns:
        Product dicts as produced by ``_extract_card``, in page order, with
        at most one entry per product URL.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Product cards: the first marker that matches anything wins.
    cards = (
        soup.select("[data-meta-name*='Snippet']")
        or soup.select("[data-meta-name*='ProductCard']")
        or soup.select("div.ProductCardHorizontal")
    )

    results: list[dict[str, Any]] = []
    seen_urls: set[str] = set()
    for card in cards:
        if len(results) >= limit:
            break
        item = _extract_card(card)
        if not item:
            continue
        # `*=` is a substring match, so a card wrapper and elements nested in
        # it can all be selected and resolve to the same product link.  Keep
        # only the first hit per URL (query string / fragment ignored) so
        # duplicates do not eat into the limit.
        key = str(item.get("url") or "").partition("#")[0].partition("?")[0]
        if key in seen_urls:
            continue
        seen_urls.add(key)
        results.append(item)
    return results
# CSS selectors for the product title, tried in order; Citilink uses several
# different markups, so this is a fallback chain.
_TITLE_SELECTORS = (
    "[data-meta-name*='Snippet__title']",
    "[data-meta-name*='ProductCardHorizontal__title']",
    "a[href*='/product/'] span",
    "a[title]",
)

# "4.8" / "4,8" followed by "(" or a digit.  The lookbehind prevents matching
# the tail of a longer number (e.g. "5.6" inside "15.6").
_RATING_RE = re.compile(r"(?<![\d.,])(\d[.,]\d)\s*[(\d]")

# Review/score count in front of "отзыв…" / "оценок".  The lookbehind keeps the
# fractional digit of a preceding rating ("4.8 123 отзыва") out of the count.
_REVIEWS_RE = re.compile(r"(?<![\d.,])(\d[\d\s]*)\s*(?:отзыв|оценок)")


def _absolute_url(ref: str) -> str:
    """Resolve *ref* (absolute, protocol-relative or relative) against the site root."""
    from urllib.parse import urljoin  # local import: keeps this block self-contained

    return urljoin(_BASE_URL, ref)


def _to_int(raw: str) -> int | None:
    """Parse an integer, ignoring any Unicode whitespace used as a digit separator."""
    try:
        # str.split() with no argument splits on every Unicode whitespace
        # character (regular, no-break, thin, narrow no-break space, ...).
        return int("".join(raw.split()))
    except ValueError:
        return None


def _extract_title(card) -> str:
    """Return the product title found in *card*, or ``""`` when none is usable."""
    for selector in _TITLE_SELECTORS:
        node = card.select_one(selector)
        if node is None:
            continue
        candidate = (node.get("title") or node.get_text(strip=True)).strip()
        if len(candidate) > 5:
            return candidate

    # Fallback: the first reasonably long text inside the card that does not
    # look like a price or a discount badge.
    # NOTE(review): the original literal tested here was lost (it was an empty
    # string in the checked-in source, which made the condition always false
    # and the fallback dead); "₽" is assumed to be the intended price marker.
    for node in card.find_all(["span", "div"]):
        text = node.get_text(strip=True)
        if 15 < len(text) < 200 and "₽" not in text and "%" not in text:
            return text
    return ""


def _extract_price(text: str) -> int | None:
    """Return the first price in *text* that lies within sane bounds, in rubles."""
    for match in _PRICE_RE.finditer(text):
        value = _to_int(match.group(1) or match.group(2) or "")
        if value is not None and 100 < value < 10_000_000:  # sanity bounds
            return value
    return None


def _extract_image(card) -> str | None:
    """Return the absolute URL of the card's first image, skipping inline placeholders."""
    img = card.find("img")
    if img is None:
        return None
    # If `src` is only a data: placeholder, fall through to `data-src`.
    for attr in ("src", "data-src"):
        src = (img.get(attr) or "").strip()
        if src and "data:image" not in src:
            return _absolute_url(src)
    return None


def _extract_rating(text: str) -> float | None:
    """Return the first rating-like number in *text* within (0, 5], if any."""
    for match in _RATING_RE.finditer(text):
        value = float(match.group(1).replace(",", "."))
        if 0 < value <= 5.0:
            return value
    return None


def _extract_reviews(text: str) -> int | None:
    """Return the review count found in *text*, if any."""
    match = _REVIEWS_RE.search(text)
    return _to_int(match.group(1)) if match else None


def _extract_card(card) -> dict[str, Any] | None:
    """Extract title, URL, price, image, rating and review count from one card.

    Args:
        card: A BeautifulSoup element selected as a product card.

    Returns:
        A product dict with the keys ``title``, ``url``, ``image_url``,
        ``price_min_rub``, ``price_max_rub``, ``rating``, ``reviews_count``,
        ``stores_count``, ``specs`` and ``source``; ``None`` when the card has
        no product/promo link or no usable title.  Fields that could not be
        determined are ``None``.
    """
    # Product link: prefer an explicit /product/ anchor, else the first anchor.
    link = card.select_one("a[href*='/product/']") or card.find("a", href=True)
    if link is None:
        return None
    href = (link.get("href") or "").strip()
    if "/product/" not in href and "/promo/" not in href:
        return None

    title = _extract_title(card)
    if not title:
        return None

    # Price, rating and review count are recovered from the card's flat text.
    full_text = card.get_text(" ", strip=True)

    return {
        "title": title[:250],
        "url": _absolute_url(href),
        "image_url": _extract_image(card),
        "price_min_rub": _extract_price(full_text),
        "price_max_rub": None,
        "rating": _extract_rating(full_text),
        "reviews_count": _extract_reviews(full_text),
        "stores_count": None,
        "specs": {},
        "source": "citilink",
    }