mirror of
https://github.com/wasrusgen/zov-tech.git
synced 2026-06-03 17:44:48 +00:00
catalog: filter junk + background refresh + clear endpoint
FILTERING (catalog.py _save_results):
- CATEGORY_KEYWORDS: must contain category word ('холодильник', 'варочн', 'духов', etc.)
- CATEGORY_MIN_PRICE: filters parts/accessories (fridge >20k, hood >5k, etc.)
- PART_BLACKLIST: 'фильтр', 'лампочк', 'термодатчик', 'шланг', 'тэн', 'магнетрон', etc.
- Previously had Asko light bulb (155₽), Miele dryer filter (376₽), Siemens cooktop in fridge category — all now filtered out
ASYNC REFRESH (main.py):
- POST /api/catalog/refresh queues background task, returns immediately
(was sync, taking 3+ min → Cloudflare tunnel was killing connection)
- New GET /api/catalog/refresh_status for progress polling
- Concurrent refresh blocked (one at a time)
CLEAR ENDPOINT:
- POST /api/catalog/clear?cat=fridge clears one category
- POST /api/catalog/clear clears entire catalog (start over)
NEXT: clear current dirty data, re-seed fridge with filters
This commit is contained in:
parent
9e652c4a34
commit
fe472b0827
@ -49,6 +49,40 @@ CATEGORY_QUERIES = {
|
||||
"washer": "стиральная машина",
|
||||
}
|
||||
|
||||
# Keywords that MUST appear in a product title for the item to be accepted
# into the given category (any single variant is enough).
# Entries are lowercase stems; _save_results checks them as substrings of the
# lowercased title, so a stem such as "варочн" covers all inflected forms.
CATEGORY_KEYWORDS = {
    "fridge": ["холодильник", "морозильник", "морозильная камера", "холодильная камера"],
    "hob": ["варочн", "варочная", "плита", "конфорк", "индукцион"],
    "oven": ["духов", "духовка", "духовой"],
    "dw": ["посудомоеч", "посудомойк"],
    "hood": ["вытяжк"],
    "microwave": ["микроволнов", "свч"],
    "coffee": ["кофемашин", "кофеварк", "эспрессо"],
    "washer": ["стиральн", "стиралк"],
}
|
||||
|
||||
# Minimum prices per category — they cut off spare parts, accessories and
# mini appliances.  _save_results compares them against an item's
# "price_min_rub" and skips anything cheaper.
CATEGORY_MIN_PRICE = {
    "fridge": 20000,
    "hob": 8000,
    "oven": 15000,
    "dw": 15000,
    "hood": 5000,
    "microwave": 3000,
    "coffee": 5000,
    "washer": 15000,
}
|
||||
|
||||
# Blacklist — spare parts and accessories.  A title containing any of these
# lowercase substrings is rejected by _save_results.
# NOTE(review): "тэн " / "тен " carry a trailing space, presumably so the short
# stem does not match inside longer words; as a consequence they will not match
# a title that ends with the word — confirm this is intended.
PART_BLACKLIST = [
    "фильтр", "лампочк", "запчаст", "ручка двер", "термодатчик", "термостат",
    "уплотнит", "наклейк", "ткань", "чехол", "коврик", "пылесборник",
    "тэн ", "тен ", "шланг", "шкив", "конденсатор", "магнетрон",
    "мешок", "пакет для", "вантуз", "колба", "лед-фильтр",
    "ремень привода", "помпа", "помп для", "сальник",
]
|
||||
|
||||
|
||||
def refresh_catalog(categories: list[str] | None = None,
|
||||
sources: tuple = ("yamarket", "wb", "citilink"),
|
||||
@ -116,13 +150,22 @@ def refresh_catalog(categories: list[str] | None = None,
|
||||
|
||||
def _save_results(cat: str, brand: str, tier: str, query: str,
|
||||
enriched: dict, max_items: int) -> int:
|
||||
"""Сохраняет до max_items релевантных результатов из enriched."""
|
||||
"""Сохраняет до max_items РЕЛЕВАНТНЫХ результатов из enriched.
|
||||
|
||||
Фильтры:
|
||||
- бренд должен упоминаться в названии
|
||||
- название должно содержать слово категории (холодильник / варочн / духов и т.п.)
|
||||
- цена должна быть выше минимума для категории (отсекает запчасти)
|
||||
- чёрный список слов: фильтр, лампочка, термодатчик и т.п.
|
||||
"""
|
||||
if not enriched:
|
||||
return 0
|
||||
|
||||
saved = 0
|
||||
seen_titles = set()
|
||||
sources_priority = ["yamarket", "wb", "citilink", "ozon", "dns"]
|
||||
cat_keywords = CATEGORY_KEYWORDS.get(cat, [])
|
||||
min_price = CATEGORY_MIN_PRICE.get(cat, 0)
|
||||
|
||||
for src in sources_priority:
|
||||
if saved >= max_items:
|
||||
@ -131,14 +174,33 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
|
||||
if not item or not item.get("title"):
|
||||
continue
|
||||
|
||||
# Фильтр релевантности: бренд должен упоминаться в названии или specs.brand
|
||||
title = (item.get("title") or "").lower()
|
||||
title_raw = item.get("title", "")
|
||||
title_lower = title_raw.lower()
|
||||
|
||||
# 1. Бренд должен упоминаться
|
||||
item_brand = (item.get("specs") or {}).get("brand", "").lower()
|
||||
if brand.lower() not in title and brand.lower() not in item_brand:
|
||||
if brand.lower() not in title_lower and brand.lower() not in item_brand:
|
||||
log.debug("Skip (no brand): %s", title_raw[:80])
|
||||
continue
|
||||
|
||||
# 2. Слово категории должно быть в названии
|
||||
if cat_keywords and not any(kw in title_lower for kw in cat_keywords):
|
||||
log.debug("Skip (wrong category): %s", title_raw[:80])
|
||||
continue
|
||||
|
||||
# 3. Не запчасть/аксессуар
|
||||
if any(bad in title_lower for bad in PART_BLACKLIST):
|
||||
log.debug("Skip (part/accessory): %s", title_raw[:80])
|
||||
continue
|
||||
|
||||
# 4. Цена выше минимума
|
||||
price = item.get("price_min_rub")
|
||||
if price and isinstance(price, (int, float)) and price < min_price:
|
||||
log.debug("Skip (price too low %d < %d): %s", price, min_price, title_raw[:80])
|
||||
continue
|
||||
|
||||
# Дедуп по title в рамках одного (cat, brand)
|
||||
title_key = item["title"][:100].lower().strip()
|
||||
title_key = title_raw[:100].lower().strip()
|
||||
if title_key in seen_titles:
|
||||
continue
|
||||
seen_titles.add(title_key)
|
||||
@ -149,7 +211,7 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
|
||||
cat,
|
||||
brand,
|
||||
tier,
|
||||
item["title"][:250],
|
||||
title_raw[:250],
|
||||
query,
|
||||
item.get("price_min_rub") or "",
|
||||
item.get("price_max_rub") or "",
|
||||
@ -165,6 +227,39 @@ def _save_results(cat: str, brand: str, tier: str, query: str,
|
||||
return saved
|
||||
|
||||
|
||||
def clear_catalog(category: str | None = None) -> int:
    """Delete rows from the catalog sheet and return how many were removed.

    Args:
        category: When given, only rows whose ``category`` column equals this
            value are removed.  When ``None`` or empty, every data row is
            removed and only the header row is kept.

    Returns:
        The number of data rows removed.  ``0`` is returned when the sheet has
        no data rows, when nothing matches, when a category was requested but
        the sheet has no ``category`` column, or when any exception occurs
        (the exception is logged as a warning, not raised).
    """
    try:
        ws = sheets.ensure_sheet(SHEET_NAME, HEADERS)
        all_rows = ws.get_all_values()
        if len(all_rows) <= 1:
            return 0
        headers = all_rows[0]

        if not category:
            # Wipe everything, keeping only the header row.
            removed = len(all_rows) - 1
            ws.clear()
            ws.update("A1", [headers])
            return removed

        if "category" not in headers:
            # Without the column we cannot tell which rows belong to the
            # requested category.  Do NOT fall back to clearing the whole
            # catalog: the caller asked for a partial delete only.
            log.warning(
                "clear_catalog: no 'category' column, nothing removed for %r",
                category,
            )
            return 0

        cat_idx = headers.index("category")
        kept = [headers]
        kept.extend(
            row for row in all_rows[1:]
            if not (len(row) > cat_idx and row[cat_idx] == category)
        )
        removed = len(all_rows) - len(kept)
        if removed == 0:
            # Nothing matched — leave the sheet untouched.
            return 0

        # Rewrite the sheet with the surviving rows.
        ws.clear()
        ws.update("A1", kept)
        return removed
    except Exception as e:
        log.warning("clear_catalog failed: %s", e)
        return 0
|
||||
|
||||
|
||||
def list_catalog(category: str | None = None, tier: str | None = None,
|
||||
brand: str | None = None, limit: int = 200) -> list[dict[str, Any]]:
|
||||
"""Читает каталог из Sheets с опциональными фильтрами."""
|
||||
|
||||
@ -224,27 +224,70 @@ async def api_proxy_status():
|
||||
return proxy_pool.pool_status()
|
||||
|
||||
|
||||
@app.post("/api/catalog/refresh")
|
||||
def api_catalog_refresh(cat: str = "", per_brand: int = 2, delay: float = 1.0):
|
||||
"""Запускает парсинг каталога (медленно — несколько минут на категорию).
|
||||
from fastapi import BackgroundTasks
|
||||
|
||||
Параметры:
|
||||
cat: одна категория (fridge|hob|oven|dw|hood|microwave|coffee|washer)
|
||||
или пусто = все 8 (очень долго)
|
||||
per_brand: сколько моделей сохранять на (brand × category) — default 2
|
||||
delay: задержка между запросами к парсерам, сек — default 1.0
|
||||
"""
|
||||
categories = [cat] if cat else None
|
||||
|
||||
_CATALOG_REFRESH_STATUS = {"running": False, "last_result": None, "started_at": None}
|
||||
|
||||
|
||||
def _bg_refresh(categories, per_brand, delay):
    """Background catalog refresh that records its state in a module global.

    Sets ``_CATALOG_REFRESH_STATUS["running"]`` and ``["started_at"]`` (UTC,
    ISO 8601) on entry, stores the outcome of ``catalog.refresh_catalog`` in
    ``["last_result"]``, and always resets ``["running"]`` on exit.

    Args:
        categories: List of category keys to refresh, or ``None``; passed
            through to ``catalog.refresh_catalog``.
        per_brand: Models to keep per (brand x category); clamped to 1..5.
        delay: Delay between parser requests in seconds; clamped to 0..10.

    Returns:
        None.  The result (or ``{"ok": False, "error": ...}`` on failure) is
        written to ``_CATALOG_REFRESH_STATUS["last_result"]`` instead.
    """
    import datetime as _dt

    _CATALOG_REFRESH_STATUS["running"] = True
    _CATALOG_REFRESH_STATUS["started_at"] = _dt.datetime.now(_dt.timezone.utc).isoformat()
    try:
        result = catalog.refresh_catalog(
            categories=categories,
            # Clamping is idempotent, so it is safe even if the caller has
            # already clamped these values.
            per_brand=max(1, min(per_brand, 5)),
            delay_sec=max(0.0, min(delay, 10.0)),
        )
        _CATALOG_REFRESH_STATUS["last_result"] = result
    except Exception as e:
        # Top-level boundary of the background task: log with traceback and
        # expose the error through the status endpoint.
        log.exception("bg catalog refresh failed")
        _CATALOG_REFRESH_STATUS["last_result"] = {"ok": False, "error": str(e)}
    finally:
        _CATALOG_REFRESH_STATUS["running"] = False
|
||||
|
||||
|
||||
@app.post("/api/catalog/refresh")
def api_catalog_refresh(background: BackgroundTasks,
                        cat: str = "", per_brand: int = 2, delay: float = 1.0):
    """Queue a catalog refresh in the BACKGROUND and return immediately.

    Progress is available from GET /api/catalog/refresh_status.

    Parameters:
      cat:       a single category, or empty = all 8 (very slow)
      per_brand: models to keep per (brand x category), default 2; clamped to 1..5
      delay:     delay between requests in seconds, default 1.0; clamped to 0..10

    Returns a dict with ``ok: False`` and ``error: "already running"`` when a
    refresh is already in progress, otherwise ``ok: True, queued: True``.
    """
    if _CATALOG_REFRESH_STATUS["running"]:
        return {"ok": False, "error": "already running", "started_at": _CATALOG_REFRESH_STATUS["started_at"]}

    # Claim the slot right away.  The background task only starts after this
    # response has been sent, so leaving the flag to the task alone would let a
    # second request slip past the guard above in the meantime.
    import datetime as _dt
    _CATALOG_REFRESH_STATUS["running"] = True
    _CATALOG_REFRESH_STATUS["started_at"] = _dt.datetime.now(_dt.timezone.utc).isoformat()

    categories = [cat] if cat else None
    background.add_task(
        _bg_refresh,
        categories,
        max(1, min(per_brand, 5)),
        max(0.0, min(delay, 10.0)),
    )
    return {
        "ok": True,
        "queued": True,
        "categories": categories or "all",
        "hint": "GET /api/catalog/refresh_status — узнать прогресс",
    }
|
||||
|
||||
|
||||
@app.get("/api/catalog/refresh_status")
def api_catalog_refresh_status():
    """Return the status of the last/current catalog refresh.

    The payload is the module-level ``_CATALOG_REFRESH_STATUS`` dict with the
    keys ``running``, ``last_result`` and ``started_at``.
    """
    return _CATALOG_REFRESH_STATUS
|
||||
|
||||
|
||||
@app.post("/api/catalog/clear")
def api_catalog_clear(cat: str = ""):
    """Delete the whole catalog, or a single category when ``cat`` is given.

    Returns a dict with ``ok``, the number of ``removed`` rows as reported by
    ``catalog.clear_catalog``, and the ``category`` that was targeted
    (``"all"`` when ``cat`` is empty).
    """
    removed = catalog.clear_catalog(category=cat or None)
    return {"ok": True, "removed": removed, "category": cat or "all"}
|
||||
|
||||
|
||||
@app.get("/api/catalog/list")
|
||||
|
||||
131
cat_refresh.json
Normal file
131
cat_refresh.json
Normal file
@ -0,0 +1,131 @@
|
||||
<!DOCTYPE html>
|
||||
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
|
||||
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
|
||||
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
|
||||
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
|
||||
<head>
|
||||
|
||||
|
||||
<title>prepared-alfred-story-dale.trycloudflare.com | 524: A timeout occurred</title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
|
||||
<meta name="robots" content="noindex, nofollow" />
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1" />
|
||||
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />
|
||||
|
||||
|
||||
</head>
|
||||
<body>
|
||||
<div id="cf-wrapper">
|
||||
<div id="cf-error-details" class="p-0">
|
||||
<header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
|
||||
<h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">
|
||||
<span class="inline-block">A timeout occurred</span>
|
||||
<span class="code-label">Error code 524</span>
|
||||
</h1>
|
||||
<div>
|
||||
Visit <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">cloudflare.com</a> for more information.
|
||||
</div>
|
||||
<div class="mt-3">2026-05-12 03:36:05 UTC</div>
|
||||
</header>
|
||||
<div class="my-8 bg-gradient-gray">
|
||||
<div class="w-240 lg:w-full mx-auto">
|
||||
<div class="clearfix md:px-8">
|
||||
|
||||
<div id="cf-browser-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
|
||||
<div class="relative mb-10 md:m-0">
|
||||
|
||||
<span class="cf-icon-browser block md:hidden h-20 bg-center bg-no-repeat"></span>
|
||||
<span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
|
||||
|
||||
</div>
|
||||
<span class="md:block w-full truncate">You</span>
|
||||
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
|
||||
|
||||
Browser
|
||||
|
||||
</h3>
|
||||
<span class="leading-1.3 text-2xl text-green-success">Working</span>
|
||||
</div>
|
||||
|
||||
<div id="cf-cloudflare-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
|
||||
<div class="relative mb-10 md:m-0">
|
||||
<a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">
|
||||
<span class="cf-icon-cloud block md:hidden h-20 bg-center bg-no-repeat"></span>
|
||||
<span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
|
||||
</a>
|
||||
</div>
|
||||
<span class="md:block w-full truncate">Stockholm</span>
|
||||
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
|
||||
<a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" target="_blank" rel="noopener noreferrer">
|
||||
Cloudflare
|
||||
</a>
|
||||
</h3>
|
||||
<span class="leading-1.3 text-2xl text-green-success">Working</span>
|
||||
</div>
|
||||
|
||||
<div id="cf-host-status" class="cf-error-source relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
|
||||
<div class="relative mb-10 md:m-0">
|
||||
|
||||
<span class="cf-icon-server block md:hidden h-20 bg-center bg-no-repeat"></span>
|
||||
<span class="cf-icon-error w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
|
||||
|
||||
</div>
|
||||
<span class="md:block w-full truncate">prepared-alfred-story-dale.trycloudflare.com</span>
|
||||
<h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
|
||||
|
||||
Host
|
||||
|
||||
</h3>
|
||||
<span class="leading-1.3 text-2xl text-red-error">Error</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="w-240 lg:w-full mx-auto mb-8 lg:px-8">
|
||||
<div class="clearfix">
|
||||
<div class="w-1/2 md:w-full float-left pr-6 md:pb-10 md:pr-0 leading-relaxed">
|
||||
<h2 class="text-3xl font-normal leading-1.3 mb-4">What happened?</h2>
|
||||
<p class="mb-2">The origin web server timed out responding to this request.</p><p>The likely cause is an overloaded background task, database or application, stressing the resources on the host web server.</p>
|
||||
</div>
|
||||
<div class="w-1/2 md:w-full float-left leading-relaxed">
|
||||
<h2 class="text-3xl font-normal leading-1.3 mb-4">What can I do?</h2>
|
||||
<h3 class="text-15 font-semibold mb-2">If you're a visitor of this website:</h3>
|
||||
<p class="mb-6">Please try again in a few minutes.</p>
|
||||
|
||||
<h3 class="text-15 font-semibold mb-2">If you're the owner of this website:</h3>
|
||||
<p>Please refer to the <a rel="noopener noreferrer" href="https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-524/">Error 524</a> article:</p>
|
||||
<ul class="ml-4">
|
||||
<li>Contact your hosting provider; check for long-running processes or an overloaded web server.</li>
|
||||
<li>Use status polling of large HTTP processes to avoid this error.</li>
|
||||
<li>Run the long-running scripts on a <a rel="noopener noreferrer" href="https://developers.cloudflare.com/dns/proxy-status/#dns-only-records">grey-clouded subdomain</a>.</li>
|
||||
<li>Enterprise customers can <a rel="noopener noreferrer" href="https://developers.cloudflare.com/support/troubleshooting/http-status-codes/cloudflare-5xx-errors/error-524/#resolution-on-cloudflare">increase the timeout setting</a> globally or for specific requests using Cache Rules.</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
|
||||
<p class="text-13">
|
||||
<span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">9fa65d7a366a3e67</strong></span>
|
||||
<span class="cf-footer-separator sm:hidden">•</span>
|
||||
<span id="cf-footer-item-ip" class="cf-footer-item hidden sm:block sm:mb-1">
|
||||
Your IP:
|
||||
<button type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>
|
||||
<span class="hidden" id="cf-footer-ip">193.233.23.50</span>
|
||||
<span class="cf-footer-separator sm:hidden">•</span>
|
||||
</span>
|
||||
<span class="cf-footer-item sm:block sm:mb-1"><span>Performance & security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=prepared-alfred-story-dale.trycloudflare.com" id="brand_link" target="_blank">Cloudflare</a></span>
|
||||
|
||||
</p>
|
||||
<script>(function(){function d(){var b=a.getElementById("cf-footer-item-ip"),c=a.getElementById("cf-footer-ip-reveal");b&&"classList"in b&&(b.classList.remove("hidden"),c.addEventListener("click",function(){c.classList.add("hidden");a.getElementById("cf-footer-ip").classList.remove("hidden")}))}var a=document;document.addEventListener&&a.addEventListener("DOMContentLoaded",d)})();</script>
|
||||
</div><!-- /.error-footer -->
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Reference in New Issue
Block a user