parsers: better image extraction — real product photos in report cards

CITILINK:
- Now reads data-src / data-original / data-srcset / srcset / src in priority order
- srcset / data-srcset → takes the last entry of the comma-separated list (assumed to be the largest size variant)
- Filters out only inline data:image URIs, _next/static/images (Next.js placeholders) and URLs containing 'placeholder' (case-insensitive)
- Accepts any remaining URL (no host allow-list); product photos usually live on cs.citilink.ru / c.citilink.ru / images.citilink.ru

ЯНДЕКС.МАРКЕТ:
- Collects all img attrs (data-src, data-original, srcset, data-srcset, src)
- Prefers avatars.mds.yandex.net (real product CDN), skips yastatic (icons/logos)
- Auto-appends /300x300 suffix to avatars.mds URLs without size

ENRICH_ONE (aggregator):
- Image picked by source priority: yamarket > wb > ozon > citilink > dns
- Yamarket photos are cleanest (avatars.mds.yandex.net)
- WB has product photos via basket-XX.wbbasket.ru
This commit is contained in:
wasrusgen 2026-05-11 23:43:25 +03:00
parent 0b48dd2371
commit 1a57374020
3 changed files with 67 additions and 22 deletions

View File

@ -86,10 +86,18 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
# Агрегация
prices = [i["price_min_rub"] for i in items.values() if i and i.get("price_min_rub")]
images = [i["image_url"] for i in items.values() if i and i.get("image_url")]
ratings = [i["rating"] for i in items.values() if i and i.get("rating")]
reviews = [i["reviews_count"] for i in items.values() if i and i.get("reviews_count")]
# Фото — выбираем по приоритету источника (качество фото различается)
image_priority = ("yamarket", "wb", "ozon", "citilink", "dns")
image_url = None
for src in image_priority:
i = items.get(src)
if i and i.get("image_url"):
image_url = i["image_url"]
break
# Я.Маркет даёт количество магазинов
stores = None
if items.get("yamarket") and items["yamarket"].get("stores_count"):
@ -107,7 +115,7 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
**{src: items.get(src) for src in fetchers.keys()},
"price_min_rub": min(prices) if prices else None,
"price_max_rub": max(prices) if prices else None,
"image_url": images[0] if images else None,
"image_url": image_url,
"rating_max": max(ratings) if ratings else None,
"reviews_total": sum(reviews) if reviews else None,
"stores_count": stores,

View File

@ -133,20 +133,37 @@ def _extract_card(card, url: str) -> dict[str, Any] | None:
if not title or len(title) < 10:
return None
# Картинка — ищем реальное товарное фото (НЕ placeholder/SSR-иконку)
# Картинка — пробуем разные источники: src, data-src, srcset
img_url = None
for img_el in card.find_all("img"):
src = img_el.get("src") or img_el.get("data-src") or ""
if not src or "data:image" in src:
continue
if src.startswith("//"):
src = "https:" + src
# Filter placeholders: rstatic.citilink.ru/_next/static/images/... всегда заглушка
if "_next/static/images" in src:
continue
# Реальные товарные фото — на c.citilink.ru или main.citilink.ru
img_url = src
break
# Источники в порядке приоритета
candidates = []
for attr in ("data-src", "data-original", "data-srcset", "srcset", "src"):
val = img_el.get(attr) or ""
if not val:
continue
if attr in ("srcset", "data-srcset"):
# Берём самый большой размер (последний в srcset)
parts = val.split(",")
if parts:
largest = parts[-1].strip().split(" ")[0]
candidates.append(largest)
else:
candidates.append(val)
for src in candidates:
if not src or "data:image" in src:
continue
if src.startswith("//"):
src = "https:" + src
# Отсеиваем placeholder'ы Next.js (всегда заглушки)
if "_next/static/images" in src or "placeholder" in src.lower():
continue
# Реальные товарные фото — обычно на cs.citilink.ru / c.citilink.ru / images.citilink.ru
img_url = src
break
if img_url:
break
# Рейтинг
rating = None

View File

@ -138,20 +138,40 @@ def _extract_card(card, link_el, url: str) -> dict[str, Any] | None:
except ValueError:
pass
# Картинка (исключаем placeholder'ы)
# Картинка — приоритет: avatars.mds.yandex.net (реальные товарные фото)
img_url = None
candidates = []
for img_el in card.find_all("img"):
src = img_el.get("src") or img_el.get("data-src") or ""
for attr in ("data-src", "data-original", "srcset", "data-srcset", "src"):
v = img_el.get(attr) or ""
if not v:
continue
if attr in ("srcset", "data-srcset"):
# Берём самый большой размер
parts = v.split(",")
if parts:
candidates.append(parts[-1].strip().split(" ")[0])
else:
candidates.append(v)
# Чистим и приоритизируем
for src in candidates:
if not src or "data:image" in src:
srcset = img_el.get("srcset") or ""
if srcset:
src = srcset.split(",")[0].strip().split(" ")[0]
continue
if src.startswith("//"):
src = "https:" + src
if not src or "yastatic" in src or "_next/static" in src:
if "yastatic" in src: # это иконки/логотипы
continue
img_url = src
break
# avatars.mds.yandex.net — высший приоритет
if "avatars.mds.yandex" in src:
img_url = src
break
# Любой полноценный http(s) URL — fallback
if src.startswith("http") and not img_url:
img_url = src
# Если URL не содержит размерного суффикса — добавим /300x300
if img_url and "avatars.mds.yandex" in img_url and not re.search(r"/\d+x\d+/?$|/orig/?$", img_url):
img_url = img_url.rstrip("/") + "/300x300"
# Рейтинг
rating = None