From c97b8dce3cf3a6efabc00645b924ca4dde593085 Mon Sep 17 00:00:00 2001 From: wasrusgen Date: Mon, 11 May 2026 17:20:59 +0300 Subject: [PATCH] =?UTF-8?q?parsers:=20skip=20sponsored/ad=20URLs=20(cpc/sp?= =?UTF-8?q?onsored=3D1)=20=E2=80=94=20they=20expire=20in=202-3=20hours?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user reported that clicking matrix prices led to 'Произошла ошибка!' on the OZON home page. Cause: parsers captured /product/?sponsored=1&cpc=Jtiito95... links that died after a few hours. Fix: - ozon.py: skip href with 'sponsored=1', '/promo/', 'cpc='. Strip query string from final URL. - yamarket.py: skip 'sponsored=1', 'cpc=', 'advUuid' (Я.Маркет sponsored marker) - citilink.py: strip query string from final URL (defensive) Now matrix links go to canonical product pages that don't expire. --- backend-py/app/parsers/citilink.py | 4 +++- backend-py/app/parsers/ozon.py | 8 ++++++-- backend-py/app/parsers/yamarket.py | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/backend-py/app/parsers/citilink.py b/backend-py/app/parsers/citilink.py index 53842d1..03d7b5a 100644 --- a/backend-py/app/parsers/citilink.py +++ b/backend-py/app/parsers/citilink.py @@ -62,7 +62,9 @@ def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: continue seen_urls.add(product_id) - full_url = href if href.startswith("http") else f"{_BASE_URL}{href}" + # Финальный URL — БЕЗ query params (sponsored / cpc / tracking) + href_clean = href.split("?")[0] + full_url = href_clean if href_clean.startswith("http") else f"{_BASE_URL}{href_clean}" # Поднимаемся к родительской карточке — у Citilink CSS-in-JS, поэтому # ищем ближайший div, в котором есть и цена и название diff --git a/backend-py/app/parsers/ozon.py b/backend-py/app/parsers/ozon.py index 396c226..ff4d0c3 100644 --- a/backend-py/app/parsers/ozon.py +++ b/backend-py/app/parsers/ozon.py @@ -64,13 +64,17 @@ def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: 
break href = link.get("href") or "" - # Нормализация URL — убираем query params для дедупа + # Пропускаем спонсорные ссылки — они истекают через 2-3 часа + if "sponsored=1" in href or "/promo/" in href or "cpc=" in href: + continue + # Чистим URL — убираем все query-параметры url_clean = href.split("?")[0] if url_clean in seen_urls: continue seen_urls.add(url_clean) - full_url = href if href.startswith("http") else f"{_BASE_URL}{href}" + # Финальный URL — БЕЗ query params (sponsored ссылки иначе через 2-3ч 404) + full_url = url_clean if url_clean.startswith("http") else f"{_BASE_URL}{url_clean}" # Поднимаемся до карточки — у OZON это обычно ближайший div с tile-* классом card = ( diff --git a/backend-py/app/parsers/yamarket.py b/backend-py/app/parsers/yamarket.py index 00d7269..b1e653b 100644 --- a/backend-py/app/parsers/yamarket.py +++ b/backend-py/app/parsers/yamarket.py @@ -67,6 +67,9 @@ def _parse_html(html: str, limit: int) -> list[dict[str, Any]]: if len(results) >= limit: break href = link.get("href") or "" + # Пропускаем sponsored — их URL'ы с CPC-токенами истекают через несколько часов + if "sponsored=1" in href or "cpc=" in href or "advUuid" in href: + continue m_id = re.search(r"/card/[^/]+/(\d+)", href) if not m_id: continue