mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 15:04:50 +00:00
parsers: better image extraction — real product photos in report cards
CITILINK:
- Now reads data-src / data-original / data-srcset / srcset / src in priority order
- srcset → picks the largest size variant (last entry in the comma-separated list)
- Filters out only _next/static/images (placeholders) and URLs containing 'placeholder'
- Accepts cs.citilink.ru / c.citilink.ru / images.citilink.ru product photos

ЯНДЕКС.МАРКЕТ:
- Collects all img attrs (data-src, data-original, srcset, data-srcset, src)
- Prefers avatars.mds.yandex.net (real product CDN), skips yastatic (icons/logos)
- Auto-appends a /300x300 suffix to avatars.mds URLs that have no size suffix

ENRICH_ONE (aggregator):
- Image is picked by source priority: yamarket > wb > ozon > citilink > dns
- Yamarket photos are the cleanest (avatars.mds.yandex.net)
- WB has product photos via basket-XX.wbbasket.ru
This commit is contained in:
parent
0b48dd2371
commit
1a57374020
@ -86,10 +86,18 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
|
||||
|
||||
# Агрегация
|
||||
prices = [i["price_min_rub"] for i in items.values() if i and i.get("price_min_rub")]
|
||||
images = [i["image_url"] for i in items.values() if i and i.get("image_url")]
|
||||
ratings = [i["rating"] for i in items.values() if i and i.get("rating")]
|
||||
reviews = [i["reviews_count"] for i in items.values() if i and i.get("reviews_count")]
|
||||
|
||||
# Фото — выбираем по приоритету источника (качество фото различается)
|
||||
image_priority = ("yamarket", "wb", "ozon", "citilink", "dns")
|
||||
image_url = None
|
||||
for src in image_priority:
|
||||
i = items.get(src)
|
||||
if i and i.get("image_url"):
|
||||
image_url = i["image_url"]
|
||||
break
|
||||
|
||||
# Я.Маркет даёт количество магазинов
|
||||
stores = None
|
||||
if items.get("yamarket") and items["yamarket"].get("stores_count"):
|
||||
@ -107,7 +115,7 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]:
|
||||
**{src: items.get(src) for src in fetchers.keys()},
|
||||
"price_min_rub": min(prices) if prices else None,
|
||||
"price_max_rub": max(prices) if prices else None,
|
||||
"image_url": images[0] if images else None,
|
||||
"image_url": image_url,
|
||||
"rating_max": max(ratings) if ratings else None,
|
||||
"reviews_total": sum(reviews) if reviews else None,
|
||||
"stores_count": stores,
|
||||
|
||||
@ -133,20 +133,37 @@ def _extract_card(card, url: str) -> dict[str, Any] | None:
|
||||
if not title or len(title) < 10:
|
||||
return None
|
||||
|
||||
# Картинка — ищем реальное товарное фото (НЕ placeholder/SSR-иконку)
|
||||
# Картинка — пробуем разные источники: src, data-src, srcset
|
||||
img_url = None
|
||||
for img_el in card.find_all("img"):
|
||||
src = img_el.get("src") or img_el.get("data-src") or ""
|
||||
if not src or "data:image" in src:
|
||||
continue
|
||||
if src.startswith("//"):
|
||||
src = "https:" + src
|
||||
# Filter placeholders: rstatic.citilink.ru/_next/static/images/... всегда заглушка
|
||||
if "_next/static/images" in src:
|
||||
continue
|
||||
# Реальные товарные фото — на c.citilink.ru или main.citilink.ru
|
||||
img_url = src
|
||||
break
|
||||
# Источники в порядке приоритета
|
||||
candidates = []
|
||||
for attr in ("data-src", "data-original", "data-srcset", "srcset", "src"):
|
||||
val = img_el.get(attr) or ""
|
||||
if not val:
|
||||
continue
|
||||
if attr in ("srcset", "data-srcset"):
|
||||
# Берём самый большой размер (последний в srcset)
|
||||
parts = val.split(",")
|
||||
if parts:
|
||||
largest = parts[-1].strip().split(" ")[0]
|
||||
candidates.append(largest)
|
||||
else:
|
||||
candidates.append(val)
|
||||
|
||||
for src in candidates:
|
||||
if not src or "data:image" in src:
|
||||
continue
|
||||
if src.startswith("//"):
|
||||
src = "https:" + src
|
||||
# Отсеиваем placeholder'ы Next.js (всегда заглушки)
|
||||
if "_next/static/images" in src or "placeholder" in src.lower():
|
||||
continue
|
||||
# Реальные товарные фото — обычно на cs.citilink.ru / c.citilink.ru / images.citilink.ru
|
||||
img_url = src
|
||||
break
|
||||
if img_url:
|
||||
break
|
||||
|
||||
# Рейтинг
|
||||
rating = None
|
||||
|
||||
@ -138,20 +138,40 @@ def _extract_card(card, link_el, url: str) -> dict[str, Any] | None:
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Картинка (исключаем placeholder'ы)
|
||||
# Картинка — приоритет: avatars.mds.yandex.net (реальные товарные фото)
|
||||
img_url = None
|
||||
candidates = []
|
||||
for img_el in card.find_all("img"):
|
||||
src = img_el.get("src") or img_el.get("data-src") or ""
|
||||
for attr in ("data-src", "data-original", "srcset", "data-srcset", "src"):
|
||||
v = img_el.get(attr) or ""
|
||||
if not v:
|
||||
continue
|
||||
if attr in ("srcset", "data-srcset"):
|
||||
# Берём самый большой размер
|
||||
parts = v.split(",")
|
||||
if parts:
|
||||
candidates.append(parts[-1].strip().split(" ")[0])
|
||||
else:
|
||||
candidates.append(v)
|
||||
# Чистим и приоритизируем
|
||||
for src in candidates:
|
||||
if not src or "data:image" in src:
|
||||
srcset = img_el.get("srcset") or ""
|
||||
if srcset:
|
||||
src = srcset.split(",")[0].strip().split(" ")[0]
|
||||
continue
|
||||
if src.startswith("//"):
|
||||
src = "https:" + src
|
||||
if not src or "yastatic" in src or "_next/static" in src:
|
||||
if "yastatic" in src: # это иконки/логотипы
|
||||
continue
|
||||
img_url = src
|
||||
break
|
||||
# avatars.mds.yandex.net — высший приоритет
|
||||
if "avatars.mds.yandex" in src:
|
||||
img_url = src
|
||||
break
|
||||
# Любой полноценный http(s) URL — fallback
|
||||
if src.startswith("http") and not img_url:
|
||||
img_url = src
|
||||
|
||||
# Если URL не содержит размерного суффикса — добавим /300x300
|
||||
if img_url and "avatars.mds.yandex" in img_url and not re.search(r"/\d+x\d+/?$|/orig/?$", img_url):
|
||||
img_url = img_url.rstrip("/") + "/300x300"
|
||||
|
||||
# Рейтинг
|
||||
rating = None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user