catalog: filter junk + background refresh + clear endpoint

FILTERING (catalog.py _save_results):
- CATEGORY_KEYWORDS: must contain category word ('холодильник', 'варочн', 'духов', etc.)
- CATEGORY_MIN_PRICE: filters parts/accessories (fridge >20k, hood >5k, etc.)
- PART_BLACKLIST: 'фильтр', 'лампочк', 'термодатчик', 'шланг', 'тэн', 'компрессор', etc.
- Previously had Asko light bulb (155₽), Miele dryer filter (376₽), Siemens cooktop in fridge category — all now filtered out

ASYNC REFRESH (main.py):
- POST /api/catalog/refresh queues background task, returns immediately
  (was sync, taking 3+ min → Cloudflare tunnel was killing connection)
- New GET /api/catalog/refresh_status for progress polling
- Concurrent refresh blocked (one at a time)

CLEAR ENDPOINT:
- POST /api/catalog/clear?cat=fridge clears one category
- POST /api/catalog/clear clears entire catalog (start over)

NEXT: clear current dirty data, re-seed fridge with filters
This commit is contained in:
wasrusgen 2026-05-12 07:09:33 +03:00
parent 9e652c4a34
commit fe472b0827
3 changed files with 290 additions and 21 deletions

View File

@ -49,6 +49,40 @@ CATEGORY_QUERIES = {
"washer": "стиральная машина",
}
# Keywords that MUST appear in a product title for it to count as belonging
# to the category. A title qualifies if it contains any one of the variants;
# entries are lowercase substrings (mostly word stems).
CATEGORY_KEYWORDS = {
    "fridge": [
        "холодильник",
        "морозильник",
        "морозильная камера",
        "холодильная камера",
    ],
    "hob": [
        "варочн",
        "варочная",
        "плита",
        "конфорк",
        "индукцион",
    ],
    "oven": [
        "духов",
        "духовка",
        "духовой",
    ],
    "dw": [
        "посудомоеч",
        "посудомойк",
    ],
    "hood": [
        "вытяжк",
    ],
    "microwave": [
        "микроволнов",
        "свч",
    ],
    "coffee": [
        "кофемашин",
        "кофеварк",
        "эспрессо",
    ],
    "washer": [
        "стиральн",
        "стиралк",
    ],
}
# Per-category minimum price, in rubles. Items priced below the threshold are
# treated as spare parts, accessories or mini-appliances and are dropped.
CATEGORY_MIN_PRICE = {
    "fridge": 20_000,
    "hob": 8_000,
    "oven": 15_000,
    "dw": 15_000,
    "hood": 5_000,
    "microwave": 3_000,
    "coffee": 5_000,
    "washer": 15_000,
}
# Blacklist of lowercase substrings that mark a title as a spare part or an
# accessory rather than a whole appliance.
# NOTE(review): "тэн " / "тен " carry a trailing space — presumably so they do
# not match inside longer words; as a consequence they will not match when the
# word is the very last token of a title. Confirm this is intended.
PART_BLACKLIST = [
    "фильтр",
    "лампочк",
    "запчаст",
    "ручка двер",
    "термодатчик",
    "термостат",
    "уплотнит",
    "наклейк",
    "ткань",
    "чехол",
    "коврик",
    "пылесборник",
    "тэн ",
    "тен ",
    "шланг",
    "шкив",
    "конденсатор",
    "магнетрон",
    "мешок",
    "пакет для",
    "вантуз",
    "колба",
    "лед-фильтр",
    "ремень привода",
    "помпа",
    "помп для",
    "сальник",
]
def refresh_catalog(categories: list[str] | None = None,
sources: tuple = ("yamarket", "wb", "citilink"),
@ -116,13 +150,22 @@ def refresh_catalog(categories: list[str] | None = None,
def _save_results(cat: str, brand: str, tier: str, query: str,
enriched: dict, max_items: int) -> int:
"""Сохраняет до max_items релевантных результатов из enriched."""
"""Сохраняет до max_items РЕЛЕВАНТНЫХ результатов из enriched.
Фильтры:
- бренд должен упоминаться в названии
- название должно содержать слово категории (холодильник / варочн / духов и т.п.)
- цена должна быть выше минимума для категории (отсекает запчасти)
- чёрный список слов: фильтр, лампочка, термодатчик и т.п.
"""
if not enriched:
return 0
saved = 0
seen_titles = set()
sources_priority = ["yamarket", "wb", "citilink", "ozon", "dns"]
cat_keywords = CATEGORY_KEYWORDS.get(cat, [])
min_price = CATEGORY_MIN_PRICE.get(cat, 0)
for src in sources_priority:
if saved >= max_items:
@ -131,14 +174,33 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
if not item or not item.get("title"):
continue
# Фильтр релевантности: бренд должен упоминаться в названии или specs.brand
title = (item.get("title") or "").lower()
title_raw = item.get("title", "")
title_lower = title_raw.lower()
# 1. Бренд должен упоминаться
item_brand = (item.get("specs") or {}).get("brand", "").lower()
if brand.lower() not in title and brand.lower() not in item_brand:
if brand.lower() not in title_lower and brand.lower() not in item_brand:
log.debug("Skip (no brand): %s", title_raw[:80])
continue
# 2. Слово категории должно быть в названии
if cat_keywords and not any(kw in title_lower for kw in cat_keywords):
log.debug("Skip (wrong category): %s", title_raw[:80])
continue
# 3. Не запчасть/аксессуар
if any(bad in title_lower for bad in PART_BLACKLIST):
log.debug("Skip (part/accessory): %s", title_raw[:80])
continue
# 4. Цена выше минимума
price = item.get("price_min_rub")
if price and isinstance(price, (int, float)) and price < min_price:
log.debug("Skip (price too low %d < %d): %s", price, min_price, title_raw[:80])
continue
# Дедуп по title в рамках одного (cat, brand)
title_key = item["title"][:100].lower().strip()
title_key = title_raw[:100].lower().strip()
if title_key in seen_titles:
continue
seen_titles.add(title_key)
@ -149,7 +211,7 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
cat,
brand,
tier,
item["title"][:250],
title_raw[:250],
query,
item.get("price_min_rub") or "",
item.get("price_max_rub") or "",
@ -165,6 +227,39 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
return saved
def clear_catalog(category: str | None = None) -> int:
    """Delete catalog rows from the sheet and return how many were removed.

    Args:
        category: When given (non-empty), only rows whose ``category`` column
            equals this value are removed. When ``None`` or empty, every data
            row is removed and only the header row is kept.

    Returns:
        Number of data rows removed. ``0`` is returned when the sheet has no
        data rows, when nothing matched, when a category was requested but the
        sheet has no ``category`` column, or when any error occurred (errors
        are logged, not raised).
    """
    try:
        ws = sheets.ensure_sheet(SHEET_NAME, HEADERS)
        all_rows = ws.get_all_values()
        if len(all_rows) <= 1:
            # Header only (or an empty sheet): nothing to delete.
            return 0

        headers = all_rows[0]

        if not category:
            # Full reset: drop every data row, keep the header row.
            removed = len(all_rows) - 1
            ws.clear()
            ws.update("A1", [headers])
            return removed

        if "category" not in headers:
            # A single category was requested but the rows cannot be told
            # apart. Refuse instead of falling through to a full wipe.
            log.warning(
                "clear_catalog: sheet has no 'category' column, refusing to clear %r",
                category,
            )
            return 0

        cat_idx = headers.index("category")
        kept = [headers]
        kept.extend(
            row for row in all_rows[1:]
            if not (len(row) > cat_idx and row[cat_idx] == category)
        )
        removed = len(all_rows) - len(kept)
        if removed == 0:
            # Nothing matched: skip the destructive clear + rewrite round trip.
            return 0

        # The sheet is rewritten wholesale: wipe it, then put back kept rows.
        ws.clear()
        ws.update("A1", kept)
        return removed
    except Exception as e:
        # Best effort by design: callers get 0 instead of an exception.
        log.warning("clear_catalog failed: %s", e)
        return 0
def list_catalog(category: str | None = None, tier: str | None = None,
brand: str | None = None, limit: int = 200) -> list[dict[str, Any]]:
"""Читает каталог из Sheets с опциональными фильтрами."""

View File

@ -224,27 +224,70 @@ async def api_proxy_status():
return proxy_pool.pool_status()
@app.post("/api/catalog/refresh")
def api_catalog_refresh(cat: str = "", per_brand: int = 2, delay: float = 1.0):
"""Запускает парсинг каталога (медленно — несколько минут на категорию).
from fastapi import BackgroundTasks
Параметры:
cat: одна категория (fridge|hob|oven|dw|hood|microwave|coffee|washer)
или пусто = все 8 (очень долго)
per_brand: сколько моделей сохранять на (brand × category) default 2
delay: задержка между запросами к парсерам, сек default 1.0
"""
categories = [cat] if cat else None
_CATALOG_REFRESH_STATUS = {"running": False, "last_result": None, "started_at": None}
def _bg_refresh(categories, per_brand, delay):
"""Фоновая задача обновления каталога — пишет статус в глобал."""
import datetime as _dt
_CATALOG_REFRESH_STATUS["running"] = True
_CATALOG_REFRESH_STATUS["started_at"] = _dt.datetime.now(_dt.timezone.utc).isoformat()
try:
result = catalog.refresh_catalog(
categories=categories,
per_brand=max(1, min(per_brand, 5)),
delay_sec=max(0.0, min(delay, 10.0)),
per_brand=per_brand,
delay_sec=delay,
)
return result
_CATALOG_REFRESH_STATUS["last_result"] = result
except Exception as e:
log.exception("catalog refresh failed")
return {"ok": False, "error": str(e)}
log.exception("bg catalog refresh failed")
_CATALOG_REFRESH_STATUS["last_result"] = {"ok": False, "error": str(e)}
finally:
_CATALOG_REFRESH_STATUS["running"] = False
import threading

# Guards the check-and-set of the "running" flag so that two requests arriving
# at the same time cannot both pass the "already running" check.
_CATALOG_REFRESH_LOCK = threading.Lock()


@app.post("/api/catalog/refresh")
def api_catalog_refresh(background: BackgroundTasks,
                        cat: str = "", per_brand: int = 2, delay: float = 1.0):
    """Queue a catalog refresh as a background task and return immediately.

    Progress and the final result are available from
    GET /api/catalog/refresh_status. Only one refresh may be active at a time.

    Args:
        background: FastAPI background task queue for this request.
        cat: A single category to refresh; empty means all categories
            (very slow).
        per_brand: How many models to keep per (brand x category);
            clamped to 1..5. Default 2.
        delay: Delay between parser requests in seconds; clamped to
            0.0..10.0. Default 1.0.

    Returns:
        ``{"ok": True, "queued": True, ...}`` when the task was queued, or
        ``{"ok": False, "error": "already running", ...}`` when a refresh is
        already in progress.
    """
    import datetime as _dt

    with _CATALOG_REFRESH_LOCK:
        if _CATALOG_REFRESH_STATUS["running"]:
            return {"ok": False, "error": "already running", "started_at": _CATALOG_REFRESH_STATUS["started_at"]}
        # Claim the slot now. The background task only starts after the
        # response has been sent, so waiting for it to set the flag would let
        # a second request slip through in the meantime.
        _CATALOG_REFRESH_STATUS["running"] = True
        _CATALOG_REFRESH_STATUS["started_at"] = _dt.datetime.now(_dt.timezone.utc).isoformat()

    categories = [cat] if cat else None
    try:
        background.add_task(
            _bg_refresh,
            categories,
            max(1, min(per_brand, 5)),
            max(0.0, min(delay, 10.0)),
        )
    except Exception:
        # Queueing failed: release the slot so later requests are not blocked.
        _CATALOG_REFRESH_STATUS["running"] = False
        raise
    return {
        "ok": True,
        "queued": True,
        "categories": categories or "all",
        "hint": "GET /api/catalog/refresh_status — узнать прогресс",
    }
@app.get("/api/catalog/refresh_status")
def api_catalog_refresh_status():
    """Report the state of the current or most recent catalog refresh.

    Returns the module-level status dict, which holds the keys ``running``,
    ``last_result`` and ``started_at``.
    """
    status = _CATALOG_REFRESH_STATUS
    return status
@app.post("/api/catalog/clear")
def api_catalog_clear(cat: str = ""):
    """Delete the whole catalog, or just one category when ``cat`` is given.

    Returns a dict with ``ok``, the number of ``removed`` rows as reported by
    ``catalog.clear_catalog``, and the affected ``category`` (``"all"`` when
    no category was passed).
    """
    # An empty query parameter means "no category filter".
    target = cat or None
    removed_count = catalog.clear_catalog(category=target)
    return {"ok": True, "removed": removed_count, "category": cat or "all"}
@app.get("/api/catalog/list")

131
cat_refresh.json Normal file
View File

@ -0,0 +1,131 @@
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>prepared-alfred-story-dale.trycloudflare.com | 524: A timeout occurred</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />
</head>
<body>
<div id="cf-wrapper">
<div id="cf-error-details" class="p-0">
<header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
<h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">
<span class="inline-block">A timeout occurred</span>
<span class="code-label">Error code 524</span>
</h1>
<div>
Visit <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">cloudflare.com</a> for more information.
</div>
<div class="mt-3">2026-05-12 03:36:05 UTC</div>
</header>
<div class="my-8 bg-gradient-gray">
<div class="w-240 lg:w-full mx-auto">
<div class="clearfix md:px-8">
<div id="cf-browser-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
<div class="relative mb-10 md:m-0">
<span class="cf-icon-browser block md:hidden h-20 bg-center bg-no-repeat"></span>
<span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
</div>
<span class="md:block w-full truncate">You</span>
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
Browser
</h3>
<span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>
<div id="cf-cloudflare-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
<div class="relative mb-10 md:m-0">
<a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">
<span class="cf-icon-cloud block md:hidden h-20 bg-center bg-no-repeat"></span>
<span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
</a>
</div>
<span class="md:block w-full truncate">Stockholm</span>
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
<a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">
Cloudflare
</a>
</h3>
<span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>
<div id="cf-host-status" class="cf-error-source relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
<div class="relative mb-10 md:m-0">
<span class="cf-icon-server block md:hidden h-20 bg-center bg-no-repeat"></span>
<span class="cf-icon-error w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
</div>
<span class="md:block w-full truncate">prepared-alfred-story-dale.trycloudflare.com</span>
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
Host
</h3>
<span class="leading-1.3 text-2xl text-red-error">Error</span>
</div>
</div>
</div>
</div>
<div class="w-240 lg:w-full mx-auto mb-8 lg:px-8">
<div class="clearfix">
<div class="w-1/2 md:w-full float-left pr-6 md:pb-10 md:pr-0 leading-relaxed">
<h2 class="text-3xl font-normal leading-1.3 mb-4">What happened?</h2>
<p class="mb-2">The origin web server timed out responding to this request.</p><p>The likely cause is an overloaded background task, database or application, stressing the resources on the host web server.</p>
</div>
<div class="w-1/2 md:w-full float-left leading-relaxed">
<h2 class="text-3xl font-normal leading-1.3 mb-4">What can I do?</h2>
<h3 class="text-15 font-semibold mb-2">If you're a visitor of this website:</h3>
<p class="mb-6">Please try again in a few minutes.</p>
<h3 class="text-15 font-semibold mb-2">If you're the owner of this website:</h3>
<p>Please refer to the <a rel="noopener noreferrer" href="https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-524/">Error 524</a> article:</p>
<ul class="ml-4">
<li>Contact your hosting provider; check for long-running processes or an overloaded web server.</li>
<li>Use status polling of large HTTP processes to avoid this error.</li>
<li>Run the long-running scripts on a <a rel="noopener noreferrer" href="https://developers.cloudflare.com/dns/proxy-status/#dns-only-records">grey-clouded subdomain</a>.</li>
<li>Enterprise customers can <a rel="noopener noreferrer" href="https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-524/#resolution-on-cloudflare">increase the timeout setting</a> globally or for specific requests using Cache Rules.</li>
</ul>
</div>
</div>
</div>
<div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
<p class="text-13">
<span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">9fa65d7a366a3e67</strong></span>
<span class="cf-footer-separator sm:hidden">&bull;</span>
<span id="cf-footer-item-ip" class="cf-footer-item hidden sm:block sm:mb-1">
Your IP:
<button type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>
<span class="hidden" id="cf-footer-ip">193.233.23.50</span>
<span class="cf-footer-separator sm:hidden">&bull;</span>
</span>
<span class="cf-footer-item sm:block sm:mb-1"><span>Performance &amp; security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" id="brand_link" target="_blank">Cloudflare</a></span>
</p>
<script>(function(){function d(){var b=a.getElementById("cf-footer-item-ip"),c=a.getElementById("cf-footer-ip-reveal");b&&"classList"in b&&(b.classList.remove("hidden"),c.addEventListener("click",function(){c.classList.add("hidden");a.getElementById("cf-footer-ip").classList.remove("hidden")}))}var a=document;document.addEventListener&&a.addEventListener("DOMContentLoaded",d)})();</script>
</div><!-- /.error-footer -->
</div>
</div>
</body>
</html>