From 1a57374020a0b5152e917d8073ebc7424aef4980 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 23:43:25 +0300 Subject: [PATCH] =?UTF-8?q?parsers:=20better=20image=20extraction=20?= =?UTF-8?q?=E2=80=94=20real=20product=20photos=20in=20report=20cards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CITILINK: - Now reads data-src / data-original / data-srcset / srcset / src in priority order - srcset / data-srcset → takes the last entry in the comma-list (assumed to be the largest size variant) - Skips data:image URIs, _next/static/images (placeholder) and any URL containing 'placeholder' - Accepts cs.citilink.ru / c.citilink.ru / images.citilink.ru product photos ЯНДЕКС.МАРКЕТ: - Collects all img attrs (data-src, data-original, srcset, data-srcset, src) - Prefers avatars.mds.yandex.net (real product CDN), skips yastatic (icons/logos) - Auto-appends /300x300 suffix to avatars.mds URLs without size ENRICH_ONE (aggregator): - Image picked by source priority: yamarket > wb > ozon > citilink > dns - Yamarket photos are cleanest (avatars.mds.yandex.net) - WB has product photos via basket-XX.wbbasket.ru --- backend-py/app/parsers/__init__.py | 12 +++++++-- backend-py/app/parsers/citilink.py | 41 +++++++++++++++++++++--------- backend-py/app/parsers/yamarket.py | 36 ++++++++++++++++++++------ 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/backend-py/app/parsers/__init__.py b/backend-py/app/parsers/__init__.py index 1e659df..b748626 100644 --- a/backend-py/app/parsers/__init__.py +++ b/backend-py/app/parsers/__init__.py @@ -86,10 +86,18 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]: # Агрегация prices = [i["price_min_rub"] for i in items.values() if i and i.get("price_min_rub")] - images = [i["image_url"] for i in items.values() if i and i.get("image_url")] ratings = [i["rating"] for i in items.values() if i and i.get("rating")] reviews = [i["reviews_count"] for i in items.values() if i and i.get("reviews_count")] + # Фото — выбираем по приоритету 
источника (качество фото различается) + image_priority = ("yamarket", "wb", "ozon", "citilink", "dns") + image_url = None + for src in image_priority: + i = items.get(src) + if i and i.get("image_url"): + image_url = i["image_url"] + break + # Я.Маркет даёт количество магазинов stores = None if items.get("yamarket") and items["yamarket"].get("stores_count"): @@ -107,7 +115,7 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]: **{src: items.get(src) for src in fetchers.keys()}, "price_min_rub": min(prices) if prices else None, "price_max_rub": max(prices) if prices else None, - "image_url": images[0] if images else None, + "image_url": image_url, "rating_max": max(ratings) if ratings else None, "reviews_total": sum(reviews) if reviews else None, "stores_count": stores, diff --git a/backend-py/app/parsers/citilink.py b/backend-py/app/parsers/citilink.py index 03d7b5a..942d6f4 100644 --- a/backend-py/app/parsers/citilink.py +++ b/backend-py/app/parsers/citilink.py @@ -133,20 +133,37 @@ def _extract_card(card, url: str) -> dict[str, Any] | None: if not title or len(title) < 10: return None - # Картинка — ищем реальное товарное фото (НЕ placeholder/SSR-иконку) + # Картинка — пробуем разные источники: src, data-src, srcset img_url = None for img_el in card.find_all("img"): - src = img_el.get("src") or img_el.get("data-src") or "" - if not src or "data:image" in src: - continue - if src.startswith("//"): - src = "https:" + src - # Filter placeholders: rstatic.citilink.ru/_next/static/images/... 
всегда заглушка - if "_next/static/images" in src: - continue - # Реальные товарные фото — на c.citilink.ru или main.citilink.ru - img_url = src - break + # Источники в порядке приоритета + candidates = [] + for attr in ("data-src", "data-original", "data-srcset", "srcset", "src"): + val = img_el.get(attr) or "" + if not val: + continue + if attr in ("srcset", "data-srcset"): + # Берём самый большой размер (последний в srcset) + parts = val.split(",") + if parts: + largest = parts[-1].strip().split(" ")[0] + candidates.append(largest) + else: + candidates.append(val) + + for src in candidates: + if not src or "data:image" in src: + continue + if src.startswith("//"): + src = "https:" + src + # Отсеиваем placeholder'ы Next.js (всегда заглушки) + if "_next/static/images" in src or "placeholder" in src.lower(): + continue + # Реальные товарные фото — обычно на cs.citilink.ru / c.citilink.ru / images.citilink.ru + img_url = src + break + if img_url: + break # Рейтинг rating = None diff --git a/backend-py/app/parsers/yamarket.py b/backend-py/app/parsers/yamarket.py index b1e653b..1555489 100644 --- a/backend-py/app/parsers/yamarket.py +++ b/backend-py/app/parsers/yamarket.py @@ -138,20 +138,40 @@ def _extract_card(card, link_el, url: str) -> dict[str, Any] | None: except ValueError: pass - # Картинка (исключаем placeholder'ы) + # Картинка — приоритет: avatars.mds.yandex.net (реальные товарные фото) img_url = None + candidates = [] for img_el in card.find_all("img"): - src = img_el.get("src") or img_el.get("data-src") or "" + for attr in ("data-src", "data-original", "srcset", "data-srcset", "src"): + v = img_el.get(attr) or "" + if not v: + continue + if attr in ("srcset", "data-srcset"): + # Берём самый большой размер + parts = v.split(",") + if parts: + candidates.append(parts[-1].strip().split(" ")[0]) + else: + candidates.append(v) + # Чистим и приоритизируем + for src in candidates: if not src or "data:image" in src: - srcset = img_el.get("srcset") or "" - if 
srcset: - src = srcset.split(",")[0].strip().split(" ")[0] + continue if src.startswith("//"): src = "https:" + src - if not src or "yastatic" in src or "_next/static" in src: + if "yastatic" in src: # это иконки/логотипы continue - img_url = src - break + # avatars.mds.yandex.net — высший приоритет + if "avatars.mds.yandex" in src: + img_url = src + break + # Любой полноценный http(s) URL — fallback + if src.startswith("http") and not img_url: + img_url = src + + # Если URL не содержит размерного суффикса — добавим /300x300 + if img_url and "avatars.mds.yandex" in img_url and not re.search(r"/\d+x\d+/?$|/orig/?$", img_url): + img_url = img_url.rstrip("/") + "/300x300" # Рейтинг rating = None