admin first step
This commit is contained in:
108
backend/app/workers/vehicle/vehicle_master_cleaner.py
Normal file
108
backend/app/workers/vehicle/vehicle_master_cleaner.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
from sqlalchemy import text, update
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# Route all log output to stdout, every line tagged with [MASTER-CLEANER].
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s [MASTER-CLEANER] %(message)s',
    level=logging.INFO,
)

# Logger used by everything in this module.
logger = logging.getLogger("Master-Cleaner")
|
||||
|
||||
# --- REGEX PATTERNS (the "code" used to locate the data inside free text) ---
#
# Both patterns capture the numeric value in group(1) and are case-insensitive.
# Guards shared by both:
#   (?<!\d)       - do not start in the middle of a longer number
#                   ("1100 kW" must not be read as 100 kW);
#   (?<!\d[.,])   - do not pick up the fraction / thousands group of a number
#                   ("110.75 kW" must not be read as 75 kW);
#   (?![a-z])     - the unit must end here, so energy units such as "kWh"
#                   are not mistaken for power.

# Engine power in kilowatts: 2-3 digits followed by "kW" or "kilowatt(s)".
KW_PATTERN = re.compile(
    r'(?<!\d)(?<!\d[.,])(\d{2,3})\s*(?:kW|kilowatts?)(?![a-z])',
    re.IGNORECASE,
)

# Engine displacement in cubic centimetres: 3-4 digits followed by
# "ccm", "cm3" / "cm³", "cc" or "cubic" (but not "cubic inch...").
CCM_PATTERN = re.compile(
    r'(?<!\d)(?<!\d[.,])(\d{3,4})\s*(?:ccm|cm[3³]|cc|cubic(?!\s+inch))(?![a-z])',
    re.IGNORECASE,
)
|
||||
|
||||
class MasterCleaner:
    """AI-free cleanup pass over ``vehicle.vehicle_model_definitions``.

    Author's original plan ("Thought Process"), translated:

    1. The robot's goal is to clean the 126k records without using AI.
    2. First pass: find the rows that are already technically complete
       (Auto-Gold).
    3. Second pass: use regular expressions to pull the missing kW / ccm
       values out of the ``raw_search_context`` text.
    4. Third pass: merge duplicates (per the author, based on
       ``uix_vmd_precision_v2``).
    """

    async def run_audit(self):
        """Run the three cleanup passes inside a single transaction.

        The passes run in order: promote complete rows, regex extraction,
        duplicate flagging. The transaction is committed only if all three
        succeed. On any exception the transaction is rolled back and the
        error is logged with its traceback; the exception is not re-raised.
        """
        async with AsyncSessionLocal() as db:
            try:
                logger.info("🔍 Audit indítása a teljes állományon...")

                await self._promote_complete_rows(db)
                await self._extract_from_context(db)
                await self._merge_duplicates(db)

                await db.commit()
                logger.info("🏆 A 126k rekord átvizsgálása befejeződött!")

            except Exception as e:
                await db.rollback()
                # logger.exception keeps the traceback, which a plain
                # logger.error(f"...{e}") would drop.
                logger.exception("❌ Kritikus hiba az audit során: %s", e)

    async def _promote_complete_rows(self, db):
        """Pass 1 (AUTO-GOLD): mark already-complete rows as 'gold_enriched'."""
        # A row counts as complete when kW, ccm, fuel and body are all set.
        # COALESCE guards against a NULL source: in PostgreSQL NULL || 'x'
        # is NULL, which would silently lose the audit tag.
        gold_query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = 'gold_enriched',
                updated_at = NOW(),
                source = COALESCE(source, '') || ' + AUDITOR_FIX'
            WHERE status IN ('awaiting_ai_synthesis', 'unverified')
              AND power_kw > 0 AND engine_capacity > 0
              AND fuel_type != 'Unknown' AND body_type IS NOT NULL
            RETURNING id;
        """)
        res_gold = await db.execute(gold_query)
        logger.info(f"✨ {len(res_gold.fetchall())} járművet találtam, ami már eleve 'Arany' volt.")

    @staticmethod
    def _extract_specs(context, power_kw, engine_capacity):
        """Return the values recoverable from *context* for fields that are 0.

        The result maps 'power_kw' and/or 'engine_capacity' to the integer
        found by KW_PATTERN / CCM_PATTERN; it is empty when nothing matched
        or nothing was missing.
        """
        found = {}
        haystack = context or ""

        if power_kw == 0:
            kw_match = KW_PATTERN.search(haystack)
            if kw_match:
                found["power_kw"] = int(kw_match.group(1))

        if engine_capacity == 0:
            ccm_match = CCM_PATTERN.search(haystack)
            if ccm_match:
                found["engine_capacity"] = int(ccm_match.group(1))

        return found

    async def _extract_from_context(self, db):
        """Pass 2 (REGEX EXTRACTION): fill missing kW / ccm from raw text."""
        logger.info("🧪 Regex extrakció indítása a szöveges kontextusból...")

        # Candidates: non-gold rows where power_kw or engine_capacity is
        # still 0 and some search context is available (at most 10000 per run).
        fetch_query = text("""
            SELECT id, raw_search_context, power_kw, engine_capacity
            FROM vehicle.vehicle_model_definitions
            WHERE (power_kw = 0 OR engine_capacity = 0)
              AND raw_search_context != ''
              AND status != 'gold_enriched'
            LIMIT 10000;
        """)
        rows = (await db.execute(fetch_query)).fetchall()

        # Collect one parameter set per row that yielded at least one value.
        # A missing key becomes NULL, so COALESCE keeps the stored value.
        pending = []
        for r_id, context, p_kw, e_ccm in rows:
            updates = self._extract_specs(context, p_kw, e_ccm)
            if updates:
                pending.append({
                    "kw": updates.get("power_kw"),
                    "ccm": updates.get("engine_capacity"),
                    "id": r_id,
                })

        if pending:
            # The statement is built once and sent as a single batch rather
            # than being rebuilt and executed once per row.
            update_stmt = text("""
                UPDATE vehicle.vehicle_model_definitions
                SET power_kw = COALESCE(:kw, power_kw),
                    engine_capacity = COALESCE(:ccm, engine_capacity),
                    source = COALESCE(source, '') || ' + REGEX_EXTRACT'
                WHERE id = :id
            """)
            await db.execute(update_stmt, pending)

        extracted_count = len(pending)
        logger.info(f"📝 {extracted_count} járműnél találtam meg az adatokat a szöveges kontextusban.")

    async def _merge_duplicates(self, db):
        """Pass 3 (DEDUPLICATION): flag non-gold copies of gold rows.

        A row is flagged when a *different* gold row shares its make,
        normalized name, fuel type and starting year.
        """
        logger.info("✂️ Duplikációk összeolvasztása...")

        # fuel_type is part of the match key so that e.g. petrol and diesel
        # variants of the same model/year are not collapsed into each other.
        # Rows already flagged are skipped so re-runs report only new merges.
        dedup_query = text("""
            UPDATE vehicle.vehicle_model_definitions AS p
            SET status = 'merged_duplicate'
            FROM vehicle.vehicle_model_definitions AS g
            WHERE p.status NOT IN ('gold_enriched', 'merged_duplicate')
              AND g.status = 'gold_enriched'
              AND p.make = g.make AND p.normalized_name = g.normalized_name
              AND p.fuel_type = g.fuel_type
              AND p.year_from = g.year_from AND p.id != g.id
            RETURNING p.id;
        """)
        res_dedup = await db.execute(dedup_query)
        logger.info(f"🗑️ {len(res_dedup.fetchall())} duplikációt távolítottam el.")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one complete audit pass and exit.
    asyncio.run(MasterCleaner().run_audit())
|
||||
Reference in New Issue
Block a user