diff --git a/backend-py/app/parsers/__init__.py b/backend-py/app/parsers/__init__.py index 1e659df..b748626 100644 --- a/backend-py/app/parsers/__init__.py +++ b/backend-py/app/parsers/__init__.py @@ -86,10 +86,18 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]: # Агрегация prices = [i["price_min_rub"] for i in items.values() if i and i.get("price_min_rub")] - images = [i["image_url"] for i in items.values() if i and i.get("image_url")] ratings = [i["rating"] for i in items.values() if i and i.get("rating")] reviews = [i["reviews_count"] for i in items.values() if i and i.get("reviews_count")] + # Фото — выбираем по приоритету источника (качество фото различается) + image_priority = ("yamarket", "wb", "ozon", "citilink", "dns") + image_url = None + for src in image_priority: + i = items.get(src) + if i and i.get("image_url"): + image_url = i["image_url"] + break + # Я.Маркет даёт количество магазинов stores = None if items.get("yamarket") and items["yamarket"].get("stores_count"): @@ -107,7 +115,7 @@ def enrich_one(query: str, sources: tuple = DEFAULT_SOURCES) -> dict[str, Any]: **{src: items.get(src) for src in fetchers.keys()}, "price_min_rub": min(prices) if prices else None, "price_max_rub": max(prices) if prices else None, - "image_url": images[0] if images else None, + "image_url": image_url, "rating_max": max(ratings) if ratings else None, "reviews_total": sum(reviews) if reviews else None, "stores_count": stores, diff --git a/backend-py/app/parsers/citilink.py b/backend-py/app/parsers/citilink.py index 03d7b5a..942d6f4 100644 --- a/backend-py/app/parsers/citilink.py +++ b/backend-py/app/parsers/citilink.py @@ -133,20 +133,37 @@ def _extract_card(card, url: str) -> dict[str, Any] | None: if not title or len(title) < 10: return None - # Картинка — ищем реальное товарное фото (НЕ placeholder/SSR-иконку) + # Картинка — пробуем разные источники: src, data-src, srcset img_url = None for img_el in card.find_all("img"): - src = img_el.get("src") or img_el.get("data-src") or "" - if not src or "data:image" in src: - continue - if src.startswith("//"): - src = "https:" + src - # Filter placeholders: rstatic.citilink.ru/_next/static/images/... всегда заглушка - if "_next/static/images" in src: - continue - # Реальные товарные фото — на c.citilink.ru или main.citilink.ru - img_url = src - break + # Источники в порядке приоритета + candidates = [] + for attr in ("data-src", "data-original", "data-srcset", "srcset", "src"): + val = img_el.get(attr) or "" + if not val: + continue + if attr in ("srcset", "data-srcset"): + # Берём самый большой размер (последний в srcset) + parts = val.split(",") + if parts: + largest = parts[-1].strip().split(" ")[0] + candidates.append(largest) + else: + candidates.append(val) + + for src in candidates: + if not src or "data:image" in src: + continue + if src.startswith("//"): + src = "https:" + src + # Отсеиваем placeholder'ы Next.js (всегда заглушки) + if "_next/static/images" in src or "placeholder" in src.lower(): + continue + # Реальные товарные фото — обычно на cs.citilink.ru / c.citilink.ru / images.citilink.ru + img_url = src + break + if img_url: + break # Рейтинг rating = None diff --git a/backend-py/app/parsers/yamarket.py b/backend-py/app/parsers/yamarket.py index b1e653b..1555489 100644 --- a/backend-py/app/parsers/yamarket.py +++ b/backend-py/app/parsers/yamarket.py @@ -138,20 +138,40 @@ def _extract_card(card, link_el, url: str) -> dict[str, Any] | None: except ValueError: pass - # Картинка (исключаем placeholder'ы) + # Картинка — приоритет: avatars.mds.yandex.net (реальные товарные фото) img_url = None + candidates = [] for img_el in card.find_all("img"): - src = img_el.get("src") or img_el.get("data-src") or "" + for attr in ("data-src", "data-original", "srcset", "data-srcset", "src"): + v = img_el.get(attr) or "" + if not v: + continue + if attr in ("srcset", "data-srcset"): + # Берём самый большой размер + parts = v.split(",") + if parts: + candidates.append(parts[-1].strip().split(" ")[0]) + else: + candidates.append(v) + # Чистим и приоритизируем + for src in candidates: if not src or "data:image" in src: - srcset = img_el.get("srcset") or "" - if srcset: - src = srcset.split(",")[0].strip().split(" ")[0] + continue if src.startswith("//"): src = "https:" + src - if not src or "yastatic" in src or "_next/static" in src: + if "yastatic" in src: # это иконки/логотипы continue - img_url = src - break + # avatars.mds.yandex.net — высший приоритет + if "avatars.mds.yandex" in src: + img_url = src + break + # Любой полноценный http(s) URL — fallback + if src.startswith("http") and not img_url: + img_url = src + + # Если URL не содержит размерного суффикса — добавим /300x300 + if img_url and "avatars.mds.yandex" in img_url and not re.search(r"/\d+x\d+/?$|/orig/?$", img_url): + img_url = img_url.rstrip("/") + "/300x300" # Рейтинг rating = None