service-finder/backend/app/services/deduplication_service.py

"""
DeduplicationService - Explicit deduplikáció a márka, technikai kód és jármű típus alapján.
Integrálja a mapping_rules.py és mapping_dictionary.py fájlokat.
"""
import logging
from typing import Optional, Dict, Any
from sqlalchemy import select, and_, or_
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.vehicle_definitions import VehicleModelDefinition
from app.workers.vehicle.mapping_rules import SOURCE_MAPPINGS, unify_data

logger = logging.getLogger(__name__)

# Fallback synonym dictionary used when no mapping_dictionary module is available.
# Each group maps a canonical name to the list of spellings that resolve to it.
# The normalizers upper-case their input before comparing, so every synonym
# must be stored upper-cased or it can never match.
MAPPING_DICTIONARY = {
    "make_synonyms": {
        "BMW": ["BMW", "BAYERISCHE MOTOREN WERKE"],
        "MERCEDES": ["MERCEDES", "MERCEDES-BENZ", "MERCEDES BENZ"],
        "VOLKSWAGEN": ["VOLKSWAGEN", "VW"],
        "AUDI": ["AUDI"],
        "TOYOTA": ["TOYOTA"],
        "FORD": ["FORD"],
        # More makes...
    },
    "technical_code_synonyms": {
        # Example: "1.8 TSI" -> ["1.8 TSI", "1.8TSI", "1.8 TSI 180"]
    },
    "vehicle_class_synonyms": {
        "SUV": ["SUV", "SPORT UTILITY VEHICLE"],
        "SEDAN": ["SEDAN", "SALOON"],
        "HATCHBACK": ["HATCHBACK", "HATCH"],
        "COUPE": ["COUPE", "COUPÉ"],
    },
}

class DeduplicationService:
    """Identify and handle duplicate vehicle model records.

    Make, technical code and vehicle class are normalized (upper-cased,
    synonyms collapsed to a canonical name) before they are compared against
    ``VehicleModelDefinition`` rows.
    """

    # Escape character for the ILIKE patterns built by ``_contains_pattern``.
    _LIKE_ESCAPE = "/"

    @staticmethod
    def _resolve_synonym(value_upper: str, synonym_table: Dict[str, Any]) -> str:
        """Return the canonical key whose synonym list contains ``value_upper``.

        Synonyms are upper-cased for the comparison, so mixed-case entries in
        the table still match the already upper-cased input. Falls back to
        ``value_upper`` itself when no entry matches.
        """
        for canonical, synonyms in synonym_table.items():
            if value_upper == canonical.upper() or any(
                value_upper == synonym.upper() for synonym in synonyms
            ):
                return canonical
        return value_upper

    @classmethod
    def _contains_pattern(cls, value: str) -> str:
        """Build a ``%value%`` ILIKE pattern with LIKE wildcards in ``value`` escaped."""
        esc = cls._LIKE_ESCAPE
        escaped = (
            value.replace(esc, esc + esc)
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )
        return f"%{escaped}%"

    @staticmethod
    def normalize_make(make: Optional[str]) -> str:
        """Normalize a make (brand) name using the make synonym table.

        Args:
            make: raw make name, e.g. ``"vw"`` or ``" Mercedes-Benz "``.

        Returns:
            The canonical key from ``MAPPING_DICTIONARY["make_synonyms"]`` when
            the make is a known synonym, otherwise the stripped, upper-cased
            input. ``None`` or blank input yields ``""``.
        """
        make_upper = (make or "").strip().upper()
        if not make_upper:
            return ""
        return DeduplicationService._resolve_synonym(
            make_upper, MAPPING_DICTIONARY["make_synonyms"]
        )

    @staticmethod
    def normalize_technical_code(technical_code: Optional[str]) -> str:
        """Normalize a technical code (e.g. an engine code).

        Upper-cases the code and drops every character outside ``A-Z0-9``,
        so ``" n-47 "`` becomes ``"N47"``. ``None`` or empty input yields ``""``.
        """
        if not technical_code:
            return ""
        # Local import: ``re`` is only needed by this method.
        import re

        # Whitespace is removed by the substitution as well, so no strip() is needed.
        return re.sub(r"[^A-Z0-9]", "", technical_code.upper())

    @staticmethod
    def normalize_vehicle_class(vehicle_class: Optional[str]) -> str:
        """Normalize a vehicle class using the vehicle class synonym table.

        Returns:
            The canonical key from
            ``MAPPING_DICTIONARY["vehicle_class_synonyms"]`` for a known
            synonym, otherwise the stripped, upper-cased input. ``None`` or
            blank input yields ``""``.
        """
        class_upper = (vehicle_class or "").strip().upper()
        if not class_upper:
            return ""
        return DeduplicationService._resolve_synonym(
            class_upper, MAPPING_DICTIONARY["vehicle_class_synonyms"]
        )

    @classmethod
    async def find_duplicate(
        cls,
        session: "AsyncSession",
        make: str,
        technical_code: str,
        vehicle_class: str,
        exclude_id: Optional[int] = None
    ) -> Optional["VehicleModelDefinition"]:
        """
        Look for an existing ``VehicleModelDefinition`` row matching the normalized values.

        Each normalized value is matched as a case-insensitive substring of the
        corresponding column. Several rows may match; the one with the lowest
        id is returned.

        Args:
            session: SQLAlchemy async session
            make: make (e.g. "BMW")
            technical_code: technical code (e.g. "N47")
            vehicle_class: vehicle class (e.g. "SEDAN")
            exclude_id: record ID to leave out of the search (e.g. on update)

        Returns:
            The matching VehicleModelDefinition instance, or None when there is
            no match or when all three values normalize to an empty string.
        """
        norm_make = cls.normalize_make(make)
        norm_technical_code = cls.normalize_technical_code(technical_code)
        norm_vehicle_class = cls.normalize_vehicle_class(vehicle_class)

        # With no usable key every pattern would be "%%" and match any row,
        # which would report an arbitrary record as the duplicate.
        if not (norm_make or norm_technical_code or norm_vehicle_class):
            return None

        # NOTE(review): the technical code is compared with separators removed
        # ("1.8 TSI" -> "18TSI"); this only finds rows whose stored code
        # contains that stripped form - confirm how technical_code is persisted.
        stmt = select(VehicleModelDefinition).where(
            and_(
                VehicleModelDefinition.make.ilike(
                    cls._contains_pattern(norm_make), escape=cls._LIKE_ESCAPE
                ),
                VehicleModelDefinition.technical_code.ilike(
                    cls._contains_pattern(norm_technical_code), escape=cls._LIKE_ESCAPE
                ),
                VehicleModelDefinition.vehicle_class.ilike(
                    cls._contains_pattern(norm_vehicle_class), escape=cls._LIKE_ESCAPE
                ),
            )
        )
        # Compare against None so that an id of 0 is still excluded.
        if exclude_id is not None:
            stmt = stmt.where(VehicleModelDefinition.id != exclude_id)

        # Substring matching can hit more than one row; scalar_one_or_none()
        # would raise MultipleResultsFound, so take the first row in id order.
        stmt = stmt.order_by(VehicleModelDefinition.id).limit(1)

        result = await session.execute(stmt)
        duplicate = result.scalars().first()

        if duplicate:
            logger.info(
                "Duplikátum találva: ID %s - %s %s %s",
                duplicate.id,
                duplicate.make,
                duplicate.technical_code,
                duplicate.vehicle_class,
            )
        return duplicate

    @classmethod
    async def ensure_no_duplicate(
        cls,
        session: "AsyncSession",
        make: str,
        technical_code: str,
        vehicle_class: str,
        exclude_id: Optional[int] = None
    ) -> bool:
        """
        Check that no duplicate exists.

        Returns:
            True when ``find_duplicate`` finds nothing, False when it finds a record.
        """
        duplicate = await cls.find_duplicate(session, make, technical_code, vehicle_class, exclude_id)
        return duplicate is None

    @classmethod
    async def deduplicate_and_merge(
        cls,
        session: "AsyncSession",
        new_record: Dict[str, Any],
        source_name: str = "manual"
    ) -> Dict[str, Any]:
        """
        Check a new record for duplicates and return the data to use.

        If a duplicate is found, the existing record's values are returned;
        otherwise the normalized values of the new record are returned.

        Args:
            session: SQLAlchemy async session
            new_record: data of the new record (make, technical_code, vehicle_class, ...)
            source_name: data source name passed to ``unify_data``

        Returns:
            Dict with keys:
                - is_duplicate: bool
                - existing_id: int if duplicate else None
                - normalized_data: dict with make, technical_code, vehicle_class
        """
        # Run the source-specific mapping first; tolerate an empty/None result.
        unified = unify_data(new_record, source_name) or {}

        # Prefer the mapped make; fall back to the raw one. ``or`` also covers
        # keys that are present but hold None, which would otherwise reach
        # the normalizers as None.
        make = unified.get("normalized_make") or new_record.get("make") or ""
        technical_code = new_record.get("technical_code") or ""
        vehicle_class = new_record.get("vehicle_class") or ""

        duplicate = await cls.find_duplicate(session, make, technical_code, vehicle_class)

        if duplicate:
            return {
                "is_duplicate": True,
                "existing_id": duplicate.id,
                "normalized_data": {
                    "make": duplicate.make,
                    "technical_code": duplicate.technical_code,
                    "vehicle_class": duplicate.vehicle_class,
                }
            }

        # No duplicate: hand back the normalized values of the new record.
        return {
            "is_duplicate": False,
            "existing_id": None,
            "normalized_data": {
                "make": cls.normalize_make(make),
                "technical_code": cls.normalize_technical_code(technical_code),
                "vehicle_class": cls.normalize_vehicle_class(vehicle_class),
            }
        }