átlagos kiegészítések, jó sok

This commit is contained in:
Roo
2026-03-22 11:02:05 +00:00
parent f53e0b53df
commit 5d44339f21
249 changed files with 20922 additions and 2253 deletions

View File

@@ -0,0 +1,425 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r0_spider
Producer-Consumer lánc első eleme. Kivesz egy autót a vehicle.vehicle_model_definitions táblából,
keres az UltimateSpecs oldalán, és a talált .html linkeket beszúrja a vehicle.auto_data_crawler_queue táblába.
"""
import asyncio
import logging
import random
import sys
import signal
import urllib.parse
from datetime import datetime
from typing import Optional, Dict, Any, List
from playwright.async_api import async_playwright, Page, Browser, BrowserContext
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
from app.models.vehicle.vehicle_definitions import VehicleModelDefinition
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R0-SPIDER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R0-SPIDER")
# Configuration
# NOTE(review): random.uniform runs once at import time, so the delay is a
# single fixed value for the process lifetime, not re-sampled per wait —
# confirm this matches the intent of "wait 3-6 s between iterations".
SLEEP_INTERVAL = random.uniform(3, 6)  # wait between 3 and 6 seconds
MAX_RETRIES = 3
# Search endpoint; {query} is filled with the URL-encoded search string.
BASE_URL = "https://www.ultimatespecs.com/index.php?q={query}"
class UltimateSpecsSpider:
    """Stage R0 spider: finds UltimateSpecs spec-page links for pending vehicles
    and feeds them into the crawler queue."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
        # Playwright objects are created lazily in init_browser().
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Fixed desktop Chrome UA so requests look like a regular browser.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
async def init_browser(self):
    """Initialise Playwright: launch headless Chromium and open a context.

    Raises:
        Exception: re-raises any Playwright startup failure after logging it.
    """
    try:
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                # Reduce headless-automation fingerprinting.
                '--disable-blink-features=AutomationControlled',
                # Avoid /dev/shm exhaustion and sandbox issues in containers.
                '--disable-dev-shm-usage',
                '--no-sandbox',
            ]
        )
        self.context = await self.browser.new_context(
            user_agent=self.user_agent,
            viewport={'width': 1920, 'height': 1080},
            java_script_enabled=True
        )
        logger.info("Playwright böngésző inicializálva")
    except Exception as e:
        logger.error(f"Hiba a böngésző inicializálásakor: {e}")
        raise
async def close_browser(self):
    """Tear down Playwright objects in reverse order of creation."""
    if self.context:
        await self.context.close()
    if self.browser:
        await self.browser.close()
    if self.playwright:
        await self.playwright.stop()
    logger.info("Playwright böngésző lezárva")
async def fetch_next_vehicle(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
    """Claim one pending vehicle row (SKIP LOCKED) and return it as a dict.

    Highest priority_score first, oldest update first. Returns None when
    nothing is pending or the query fails.
    """
    claim_sql = text("""
        SELECT id, make, marketing_name, year_from, vehicle_class
        FROM vehicle.vehicle_model_definitions
        WHERE status IN ('pending', 'manual_review_needed')
        AND vehicle_class IN ('car', 'motorcycle')
        ORDER BY priority_score DESC, updated_at ASC
        LIMIT 1
        FOR UPDATE SKIP LOCKED
    """)
    try:
        row = (await session.execute(claim_sql)).fetchone()
    except Exception as e:
        logger.error(f"Hiba a következő jármű lekérdezésekor: {e}")
        return None
    if row is None:
        return None
    # Map the positional row onto named keys.
    columns = ('id', 'make', 'marketing_name', 'year_from', 'vehicle_class')
    return dict(zip(columns, row))
def build_search_query(self, make: str, marketing_name: str, year_from: Optional[int]) -> str:
    """Compose the UltimateSpecs search string: slugged make, model, optional year."""
    def slugify(value: str) -> str:
        # Lowercase, hyphenate spaces, drop dots.
        return value.lower().replace(' ', '-').replace('.', '')

    make_slug = slugify(make)
    model_slug = slugify(marketing_name)
    # Drop generic trailing words so e.g. '3-series' searches as '3'.
    for tail in ('-', 'series', 'class', 'model'):
        if model_slug.endswith(tail):
            model_slug = model_slug[:-len(tail)].rstrip('-')
    parts = [make_slug, model_slug]
    if year_from:
        parts.append(str(year_from))
    return ' '.join(parts)
async def extract_links_with_js(self, page: Page, make_url: str, model_word: str) -> List[Dict[str, str]]:
    """Collect spec-page links from the current page via an in-page JS filter.

    The JS keeps only anchors that (a) point under /car-specs/ or
    /motorcycles-specs/, (b) contain the make slug in the URL (filters out
    ads), (c) match the model word in the text or URL, and (d) end in .html.

    Returns:
        List of {'name': link text, 'url': href} dicts (URLs may be relative).
    """
    js_code = """
    (args) => {
        let targetMakeUrl = args.makeUrl; // pl. 'honda' vagy 'alfa-romeo'
        let targetModel = args.modelWord; // pl. 'civic'
        let specs = [];
        document.querySelectorAll('a').forEach(a => {
            let href = a.getAttribute('href') || '';
            let text = a.innerText.trim();
            let hrefLow = href.toLowerCase();
            let textLow = text.toLowerCase();
            if (hrefLow.includes('/car-specs/') || hrefLow.includes('/motorcycles-specs/')) {
                // SZIGORÚ MÁRKA SZŰRŐ AZ URL-BEN (Reklámok ellen)
                if (hrefLow.includes('/' + targetMakeUrl + '/') || hrefLow.includes(targetMakeUrl + '-models')) {
                    // MODELL SZŰRŐ A SZÖVEGBEN VAGY URL-BEN
                    if (targetModel === '' || textLow.includes(targetModel) || hrefLow.includes(targetModel)) {
                        if (hrefLow.endsWith('.html') && text.length > 1) {
                            specs.push({ name: text, url: href });
                        }
                    }
                }
            }
        });
        return specs;
    }
    """
    try:
        # Arguments passed into the JS function above.
        args = {
            'makeUrl': make_url.lower(),
            'modelWord': model_word.lower()
        }
        # Run the filter in the page context.
        specs = await page.evaluate(js_code, args)
        return specs
    except Exception as e:
        logger.error(f"Hiba a JS szűrő futtatásakor: {e}")
        return []
async def search_and_extract_links(self, vehicle: Dict[str, Any]) -> List[Dict[str, str]]:
    """Search UltimateSpecs for a vehicle and return matching spec-page links.

    Two-step drill-down: load the search URL and run the JS link filter; if
    the first page yields nothing, click through the first spec-category
    link and filter again. Relative URLs are made absolute before returning.

    Returns:
        List of {'name', 'url'} dicts; empty list on any failure.
    """
    search_query = self.build_search_query(
        vehicle['make'],
        vehicle['marketing_name'],
        vehicle['year_from']
    )
    # Slug the make the same way UltimateSpecs builds its URL paths.
    make_url = vehicle['make'].lower().replace(' ', '-').replace('.', '')
    # First word of the model is enough for the JS-side substring filter.
    model_word = vehicle['marketing_name'].lower().split()[0] if vehicle['marketing_name'] else ''
    encoded_query = urllib.parse.quote(search_query)
    search_url = BASE_URL.format(query=encoded_query)
    logger.info(f"Keresés: {search_query} | URL: {search_url}")
    page = None
    try:
        page = await self.context.new_page()
        # Step 1: load the search results page.
        await page.goto(search_url, wait_until='networkidle', timeout=30000)
        current_url = page.url
        # Step 2: extract .html spec links with the JS filter.
        all_links = await self.extract_links_with_js(page, make_url, model_word)
        # Fallback: still on the search page with no hits — follow the first
        # spec link and re-run the filter on the target page.
        if not all_links and 'index.php' in current_url:
            first_link = await page.query_selector('a[href*="/car-specs/"], a[href*="/motorcycles-specs/"]')
            if first_link:
                await first_link.click()
                await page.wait_for_load_state('networkidle')
                all_links = await self.extract_links_with_js(page, make_url, model_word)
        # Normalise relative hrefs to absolute URLs.
        for link in all_links:
            if not link['url'].startswith('http'):
                link['url'] = f"https://www.ultimatespecs.com{link['url']}"
    logger.info(f"{len(all_links)} link találva")
        return all_links
    except Exception as e:
        logger.error(f"Hiba a keresés során: {e}")
        return []
    finally:
        if page:
            await page.close()
async def save_links_to_queue(self, session: AsyncSession, links: List[Dict[str, str]],
                              vehicle: Dict[str, Any]) -> int:
    """Insert new spec-page links into the external reference queue.

    Deduplicates by URL (pre-check plus an IntegrityError fallback for
    concurrent inserts) and commits per link so one bad row cannot roll
    back the rest.

    Returns:
        int: number of rows actually inserted.
    """
    saved_count = 0
    for link in links:
        try:
            # Cheap pre-check; the IntegrityError handler below still covers
            # races with other workers inserting the same URL.
            existing_query = select(ExternalReferenceQueue).where(
                ExternalReferenceQueue.url == link['url']
            )
            existing_result = await session.execute(existing_query)
            if existing_result.scalar_one_or_none():
                logger.debug(f"URL már létezik: {link['url']}")
                continue
            # New queue entry; name truncated to the 255-char column width.
            queue_entry = ExternalReferenceQueue(
                url=link['url'],
                level='engine',
                category=vehicle['vehicle_class'] or 'car',
                name=link['name'][:255],
                parent_id=vehicle['id'],
                status='pending'
            )
            session.add(queue_entry)
            await session.commit()
            saved_count += 1
            logger.debug(f"URL mentve: {link['url']}")
        except IntegrityError:
            # Unique constraint hit in a race — treat as "already exists".
            await session.rollback()
            logger.debug(f"URL már létezik (integrity): {link['url']}")
        except Exception as e:
            await session.rollback()
            logger.error(f"Hiba a URL mentésekor: {e}")
    return saved_count
async def update_vehicle_status(self, session: AsyncSession, vehicle_id: int,
                                status: str, error_msg: Optional[str] = None):
    """Set a vehicle's pipeline status, bump its attempt counter, and commit.

    error_msg is stored in last_error (None clears it). Failures are logged
    and rolled back, not re-raised.
    """
    try:
        query = text("""
            UPDATE vehicle.vehicle_model_definitions
            SET status = :status,
                last_error = :error_msg,
                updated_at = NOW(),
                attempts = attempts + 1
            WHERE id = :id
        """)
        await session.execute(
            query,
            {'status': status, 'error_msg': error_msg, 'id': vehicle_id}
        )
        await session.commit()
        logger.info(f"Jármű státusz frissítve: {vehicle_id} -> {status}")
    except Exception as e:
        await session.rollback()
        logger.error(f"Hiba a státusz frissítésekor: {e}")
async def process_single_vehicle(self):
    """Run one pipeline step: claim a vehicle, search, queue links, set status.

    Returns:
        bool: False when there was no pending vehicle (the caller backs off
        longer); True otherwise, including handled failures.
    """
    async with AsyncSessionLocal() as session:
        # Explicit None instead of the original fragile `'vehicle' in locals()`
        # probe in the error path.
        vehicle: Optional[Dict[str, Any]] = None
        try:
            # 1. Claim the next pending vehicle (row locked until commit).
            vehicle = await self.fetch_next_vehicle(session)
            if not vehicle:
                logger.info("Nincs feldolgozandó jármű")
                return False
            logger.info(f"Feldolgozás: {vehicle['make']} {vehicle['marketing_name']} "
                        f"(ID: {vehicle['id']})")
            # 2. Search UltimateSpecs and collect spec-page links.
            links = await self.search_and_extract_links(vehicle)
            if not links:
                await self.update_vehicle_status(
                    session, vehicle['id'],
                    'research_failed_empty',
                    'No links found on UltimateSpecs'
                )
                logger.warning(f"Nem található link: {vehicle['make']} {vehicle['marketing_name']}")
                return True
            # 3. Persist new links into the crawler queue.
            saved_count = await self.save_links_to_queue(session, links, vehicle)
            # 4. Mark the vehicle dispatched either way; the note records
            # whether anything new was actually queued.
            if saved_count > 0:
                await self.update_vehicle_status(
                    session, vehicle['id'],
                    'spider_dispatched',
                    f'{saved_count} links added to queue'
                )
                logger.info(f"{saved_count} link mentve a queue-ba")
            else:
                await self.update_vehicle_status(
                    session, vehicle['id'],
                    'spider_dispatched',
                    'All links already in queue'
                )
                logger.info("Minden link már szerepel a queue-ban")
            return True
        except Exception as e:
            logger.error(f"Hiba a jármű feldolgozása során: {e}")
            # Best-effort status update; vehicle is None when the failure
            # happened before a row was claimed. The original used a bare
            # `except: pass` here, which silently swallowed every error.
            if vehicle is not None:
                try:
                    await self.update_vehicle_status(
                        session, vehicle['id'],
                        'research_failed_network',
                        str(e)[:500]
                    )
                except Exception as status_err:
                    logger.error(f"Hiba a hibastátusz mentésekor: {status_err}")
            return True
async def run(self):
    """Main loop: start the browser, then process vehicles until stopped.

    Backs off twice as long when the queue is empty; the browser is always
    closed on exit via the finally block.
    """
    logger.info("UltimateSpecs R0 Spider indítása...")
    try:
        await self.init_browser()
        while self.running:
            try:
                processed = await self.process_single_vehicle()
                if not processed:
                    # Queue empty — wait twice the normal interval.
                    await asyncio.sleep(SLEEP_INTERVAL * 2)
                else:
                    # Normal pause between iterations.
                    await asyncio.sleep(SLEEP_INTERVAL)
            except KeyboardInterrupt:
                logger.info("Keyboard interrupt, leállítás...")
                self.running = False
                break
            except Exception as e:
                logger.error(f"Hiba a fő ciklusban: {e}")
                await asyncio.sleep(SLEEP_INTERVAL)
    finally:
        # Always release browser resources, even on unexpected errors.
        await self.close_browser()
        logger.info("UltimateSpecs R0 Spider leállt")
def stop(self):
    """Request a graceful shutdown; run() exits after the current iteration."""
    self.running = False
    logger.info("Leállítás kérése érkezett")
async def main():
    """Entry point: wire SIGINT/SIGTERM to a graceful stop, then run the spider."""
    spider = UltimateSpecsSpider()

    # Signal handling for graceful shutdown: only flips the running flag, so
    # the current iteration finishes before the loop exits.
    def signal_handler(signum, frame):
        logger.info(f"Signal {signum} received, stopping...")
        spider.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    try:
        await spider.run()
    except Exception as e:
        logger.error(f"Váratlan hiba: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r1_scraper
Producer-Consumer lánc második eleme (A Nyers Letöltő).
Kivesz egy feldolgozandó linket a vehicle.auto_data_crawler_queue táblából (level='engine'),
letölti a HTML tartalmat Playwright böngészővel, kinyeri a specifikációkat JS parserrel,
és elmenti a vehicle.external_reference_library táblába.
"""
import asyncio
import logging
import random
import sys
import signal
import json
from datetime import datetime
from typing import Optional, Dict, Any, List
from playwright.async_api import async_playwright, Page, Browser, BrowserContext, TimeoutError as PlaywrightTimeoutError
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal, ensure_models_loaded
from app.models.vehicle.external_reference_queue import ExternalReferenceQueue
from app.models.vehicle.external_reference import ExternalReferenceLibrary
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R1-SCRAPER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R1-SCRAPER")
# Configuration
# NOTE(review): sampled once at import time — the delay is fixed for the
# process lifetime, not re-drawn per wait; confirm that is intended.
SLEEP_INTERVAL = random.uniform(3, 6)  # wait between 3 and 6 seconds
MAX_RETRIES = 3
# Page-title fragments that indicate a Cloudflare challenge page.
CLOUDFLARE_KEYWORDS = ["just a moment", "cloudflare", "checking your browser"]
class UltimateSpecsScraper:
    """Stage R1: downloads queued UltimateSpecs pages and stores the raw specs
    in the external reference library."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
        # Playwright objects are created lazily in init_browser().
        self.playwright = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # Fixed desktop Chrome UA so requests look like a regular browser.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
async def init_browser(self):
    """Initialise Playwright: launch headless Chromium and open a context.

    Raises:
        Exception: re-raises any Playwright startup failure after logging it.
    """
    try:
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                # Reduce headless-automation fingerprinting.
                '--disable-blink-features=AutomationControlled',
                # Avoid /dev/shm exhaustion and sandbox issues in containers.
                '--disable-dev-shm-usage',
                '--no-sandbox',
            ]
        )
        self.context = await self.browser.new_context(
            user_agent=self.user_agent,
            viewport={'width': 1920, 'height': 1080},
            java_script_enabled=True
        )
        logger.info("Playwright böngésző inicializálva")
    except Exception as e:
        logger.error(f"Hiba a böngésző inicializálásakor: {e}")
        raise
async def close_browser(self):
    """Tear down Playwright objects in reverse order of creation."""
    if self.context:
        await self.context.close()
    if self.browser:
        await self.browser.close()
    if self.playwright:
        await self.playwright.stop()
    logger.info("Playwright böngésző lezárva")
async def fetch_next_queue_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
    """Claim one pending 'engine'-level crawler-queue row (SKIP LOCKED).

    Returns the row as a dict, or None when the queue is empty or the
    query fails; the row lock is held until the transaction ends.
    """
    # LIMIT placed before the locking clause — the order shown in the
    # PostgreSQL SELECT grammar and already used by the R0 worker's query.
    query = text("""
        SELECT id, url, category, parent_id
        FROM vehicle.auto_data_crawler_queue
        WHERE level = 'engine' AND status = 'pending'
        LIMIT 1
        FOR UPDATE SKIP LOCKED
    """)
    try:
        result = await session.execute(query)
        row = result.fetchone()
        if row:
            return {
                "id": row[0],
                "url": row[1],
                "category": row[2],
                "parent_id": row[3]
            }
        return None
    except Exception as e:
        logger.error(f"Hiba a queue lekérdezésekor: {e}")
        return None
async def scrape_with_retry(self, url: str, max_retries: int = MAX_RETRIES) -> Optional[Dict[str, Any]]:
    """Download a spec page and extract raw key/value specs via in-page JS.

    Retries up to max_retries times with a random 2-5 s backoff. A
    Cloudflare challenge (detected by page title) or an empty result counts
    as a failed attempt.

    Returns:
        dict of extracted specs (plus a '_sections' sub-dict), or None when
        every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        page = None
        try:
            logger.info(f"Próbálkozás {attempt}/{max_retries}: {url}")
            page = await self.context.new_page()
            # DOM content is enough; no need to wait for every subresource.
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Give the spec tables a short grace period, then proceed anyway.
            try:
                await page.wait_for_selector('table', timeout=5000)
            except PlaywrightTimeoutError:
                logger.warning("Nem található táblázat 5 másodpercen belül, de folytatjuk")
            # Detect Cloudflare interstitials by page title.
            title = await page.title()
            title_lower = title.lower()
            if any(keyword in title_lower for keyword in CLOUDFLARE_KEYWORDS):
                raise Exception(f"Cloudflare blokkolás észlelve: {title}")
            # Run the JS parser: scans every table row plus headed sections.
            specs = await page.evaluate("""() => {
                let results = {};
                // 1. ÖSSZES táblázat letapogatása
                document.querySelectorAll('table').forEach(table => {
                    table.querySelectorAll('tr').forEach(row => {
                        let t = row.querySelector('.table_specs_title, .td_title, td:first-child, th:first-child');
                        let v = row.querySelector('.table_specs_value, .td_value, td:last-child');
                        if(t && v) {
                            let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
                            let val = v.innerText.trim();
                            if(k && val && val !== "-") { results[k] = val; }
                        }
                    });
                });
                // 2. Extra szekciók és dimenziók mentése
                const sections = {};
                document.querySelectorAll('h2, h3, h4, .section-title, .specs-header').forEach(header => {
                    const title = header.innerText.trim();
                    if (title && title.length > 0) {
                        let nextElement = header.nextElementSibling;
                        let sectionData = {};
                        for (let i = 0; i < 5 && nextElement; i++) {
                            if (nextElement.tagName === 'TABLE') {
                                nextElement.querySelectorAll('tr').forEach(row => {
                                    let t = row.querySelector('td:first-child, th:first-child');
                                    let v = row.querySelector('td:last-child');
                                    if(t && v) {
                                        let k = t.innerText.replace(/:/g,'').trim().toLowerCase();
                                        let val = v.innerText.trim();
                                        if(k && val && val !== "-") {
                                            sectionData[k] = val;
                                            results[`${title.toLowerCase().replace(/ /g, '_')}_${k}`] = val;
                                        }
                                    }
                                });
                            }
                            nextElement = nextElement.nextElementSibling;
                        }
                        sections[title.toLowerCase().replace(/ /g, '_')] = sectionData;
                    }
                });
                results['_sections'] = sections;
                return results;
            }""")
            if specs and len(specs) > 0:
                logger.info(f"Sikeres letöltés, {len(specs)} specifikáció kinyerve")
                return specs
            # Empty result counts as a failure and triggers a retry.
            logger.warning("Üres specifikációk, újrapróbálkozás")
            raise Exception("Üres specifikációk")
        except Exception as e:
            logger.error(f"Hiba a {attempt}. próbálkozásnál: {e}")
            if attempt < max_retries:
                backoff = random.uniform(2, 5)
                logger.info(f"Várakozás {backoff:.1f} másodpercet...")
                await asyncio.sleep(backoff)
            else:
                logger.error(f"Összes próbálkozás sikertelen: {e}")
                return None
        finally:
            # BUG FIX: the original only closed the page on the success path,
            # leaking one browser page per failed attempt. Close it
            # unconditionally after every attempt.
            if page is not None:
                await page.close()
    return None
async def process_queue_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
    """Process one queue row: download, parse, store, and close it out.

    On success, inserts the raw specs into external_reference_library with
    pipeline_status='pending_enrich' and marks the queue row 'completed';
    on failure, marks the queue row 'error' and bumps retry_count.

    Returns:
        bool: True when the row was fully processed and committed.
    """
    queue_id = item["id"]
    url = item["url"]
    category = item["category"]
    try:
        # 1. Download and parse the page (with retries).
        specs = await self.scrape_with_retry(url)
        if not specs:
            # Scrape failed — record the error on the queue row.
            await session.execute(
                text("""
                    UPDATE vehicle.auto_data_crawler_queue
                    SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
                    WHERE id = :id
                """),
                {"error_msg": "Sikertelen letöltés (üres specifikációk vagy Cloudflare)", "id": queue_id}
            )
            await session.commit()
            logger.error(f"Queue {queue_id} sikertelen, státusz: error")
            return False
        # 2. Insert the raw specs into the library via raw SQL with a jsonb
        # cast. Uses the module-level json import; the original re-imported
        # json locally, which was redundant.
        specs_json = json.dumps(specs)
        insert_query = text("""
            INSERT INTO vehicle.external_reference_library
            (source_name, source_url, category, specifications, pipeline_status, created_at, last_scraped_at)
            VALUES (:source_name, :source_url, :category, CAST(:specifications AS jsonb), :pipeline_status, NOW(), NOW())
            RETURNING id
        """)
        result = await session.execute(
            insert_query,
            {
                "source_name": "ultimatespecs",
                "source_url": url,
                "category": category,
                "specifications": specs_json,
                "pipeline_status": "pending_enrich"
            }
        )
        new_id = result.scalar()
        # 3. Close out the queue row.
        await session.execute(
            text("""
                UPDATE vehicle.auto_data_crawler_queue
                SET status = 'completed', updated_at = NOW()
                WHERE id = :id
            """),
            {"id": queue_id}
        )
        await session.commit()
        logger.info(f"Queue {queue_id} sikeresen feldolgozva, library ID: {new_id}")
        return True
    except Exception as e:
        logger.error(f"Hiba a queue {queue_id} feldolgozásakor: {e}")
        await session.rollback()
        # Best-effort error status on the queue row.
        try:
            await session.execute(
                text("""
                    UPDATE vehicle.auto_data_crawler_queue
                    SET status = 'error', error_msg = :error_msg, retry_count = retry_count + 1
                    WHERE id = :id
                """),
                {"error_msg": str(e)[:500], "id": queue_id}
            )
            await session.commit()
        except Exception as update_err:
            logger.error(f"Hiba a queue frissítésekor: {update_err}")
        return False
async def run_once(self):
    """Single processing cycle: claim one queue item and process it.

    Returns:
        bool: False when the queue is empty or an error occurred (caller
        backs off), True on a successful item.
    """
    # Make sure ORM models are registered before opening a session.
    ensure_models_loaded()
    async with AsyncSessionLocal() as session:
        try:
            # NOTE(review): process_queue_item() commits/rolls back inside
            # this session.begin() scope — confirm the outer transaction
            # context tolerates the inner commit on exit.
            async with session.begin():
                item = await self.fetch_next_queue_item(session)
                if not item:
                    logger.info("Nincs feldolgozandó queue tétel")
                    return False
                logger.info(f"Feldolgozás: {item['url']}")
                success = await self.process_queue_item(session, item)
                return success
        except Exception as e:
            logger.error(f"Hiba a run_once-ban: {e}")
            return False
async def run_loop(self):
    """Main loop: init the browser, then process queue items until stopped.

    Waits SLEEP_INTERVAL when idle/failed, 1-2 s after a success; the
    browser is always closed on exit.
    """
    await self.init_browser()
    try:
        while self.running:
            success = await self.run_once()
            if not success:
                # Queue empty or failed — wait the standard interval.
                sleep_time = SLEEP_INTERVAL
                logger.debug(f"Várakozás {sleep_time:.1f} másodpercet...")
                await asyncio.sleep(sleep_time)
            else:
                # Short pause after a successful item.
                await asyncio.sleep(random.uniform(1, 2))
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt, leállítás...")
    except Exception as e:
        logger.error(f"Váratlan hiba a fő ciklusban: {e}")
    finally:
        # Always release browser resources.
        await self.close_browser()
def stop(self):
    """Request a graceful shutdown; the loop exits after the current item."""
    self.running = False
    logger.info("Leállítási jelzés küldve")
async def main():
    """Entry point: wire SIGINT/SIGTERM to a graceful stop, then run the loop."""
    scraper = UltimateSpecsScraper()

    # Signal handling: only flips the running flag so the current item
    # finishes before the loop exits.
    def signal_handler(signum, frame):
        scraper.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    try:
        await scraper.run_loop()
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r2_enricher
Producer-Consumer lánc harmadik eleme (Az Elemző). Offline adattisztítást és strukturálást végez.
Kivesz egy feldolgozandó sort a vehicle.external_reference_library táblából (pipeline_status='pending_enrich'),
hozzácsatolja a vehicle.auto_data_crawler_queue adatait, kinyeri a standard értékeket a nyers JSON-ből,
és strukturált JSON-be csomagolja (standardized + _raw).
"""
import asyncio
import logging
import random
import sys
import signal
import json
import re
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R2-ENRICHER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R2-ENRICHER")
# Configuration
# NOTE(review): sampled once at import — fixed per process, not per wait.
SLEEP_INTERVAL = random.uniform(1, 3)  # wait between 1 and 3 seconds
# Fuzzy mapping for numeric metrics: target field -> substrings searched
# (in order) against the raw spec keys; first key yielding a number wins.
FUZZY_MAPPING = {
    "power_kw": ["horsepower", "total electric power", "engine power", "maximum power", "power"],
    "engine_capacity": ["engine displacement", "displacement", "capacity", "cm3", "cu-in"],
    "torque_nm": ["maximum torque", "total electric torque", "torque"],
    "max_speed": ["top speed", "maximum speed"],
    "curb_weight": ["curb weight", "weight"],
    "wheelbase": ["wheelbase"],
    "seats": ["num. of seats", "seats"]
}
# Keyword lists used to locate free-text fields in the raw spec keys.
TEXT_FIELD_KEYWORDS = {
    "fuel_type": ["fuel type", "fuel", "engine fuel", "fuel system"],
    "transmission_type": ["transmission", "gear", "gearbox"],
    "drive_type": ["drive type", "drive", "drivetrain"],
    "body_type": ["body type", "body", "car body"]
}
class UltimateSpecsEnricher:
    """Stage R2: offline cleaner that turns raw scraped specs into a
    standardized + _raw structure for the matcher."""

    def __init__(self):
        # Main-loop flag; cleared by stop() / the signal handler.
        self.running = True
async def fetch_next_library_item(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
    """Claim one 'pending_enrich' library row (SKIP LOCKED) as a dict.

    Returns None when nothing is pending or the query fails; the row lock
    is held until the session's transaction ends.
    """
    # LIMIT placed before the locking clause — the order shown in the
    # PostgreSQL SELECT grammar and already used by the R0 worker's query.
    query = text("""
        SELECT id, specifications, make, model, year_from
        FROM vehicle.external_reference_library
        WHERE pipeline_status = 'pending_enrich'
        LIMIT 1
        FOR UPDATE SKIP LOCKED
    """)
    try:
        result = await session.execute(query)
        row = result.fetchone()
        if row:
            return {
                "id": row[0],
                # Defensive: jsonb normally arrives as a dict; anything else
                # is treated as empty.
                "specifications": row[1] if isinstance(row[1], dict) else {},
                "make": row[2],
                "model": row[3],
                "year_from": row[4]
            }
        return None
    except SQLAlchemyError as e:
        logger.error(f"SQL hiba a lekérdezés során: {e}")
        return None
def extract_fuzzy_metric(self, specifications: Dict[str, Any], target_key: str, keywords: List[str]) -> Optional[float]:
    """Find a numeric metric in the raw specs by fuzzy key matching.

    Tries each keyword in order as a case-insensitive substring of the raw
    keys; the first matching key whose value yields a number wins.

    Returns:
        The extracted float (hp converted to kW for power_kw), or None.
    """
    if not specifications:
        return None
    # Case-insensitive view of the raw keys.
    spec_lower = {k.lower(): v for k, v in specifications.items()}
    for keyword in keywords:
        for key, value in spec_lower.items():
            if keyword.lower() in key:
                # Try to pull a number out of the matched value.
                num = self.clean_number(value)
                if num is not None:
                    # NOTE(review): assumes any key containing "hp" or
                    # "horsepower" holds horsepower values — confirm against
                    # real pages; unrelated keys containing "hp" would be
                    # converted too.
                    if target_key == "power_kw" and ("hp" in key or "horsepower" in key):
                        # hp -> kW conversion (1 hp = 0.7457 kW)
                        num = num * 0.7457
                    return num
    return None
def clean_number(self, value: Any) -> Optional[float]:
    """Extract a float from a raw spec value.

    Handles ints/floats directly and strings such as "120 kW" or "120kW".
    Thousands separators of the form "1,500" are stripped before matching,
    so "1,500 kg" parses as 1500.0 (the original returned 1.0).

    Returns:
        The extracted float, or None when no number is found.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        # BUG FIX: drop commas used as thousands separators (digit,3-digits)
        # so the regex below sees the full number; "1,5"-style decimals are
        # deliberately left untouched.
        cleaned = re.sub(r'(?<=\d),(?=\d{3}\b)', '', value)
        # Prefer a number optionally followed by a known unit.
        match = re.search(r'([-+]?\d*\.?\d+)\s*(?:kW|hp|cc|Nm|kg|km/h|mph)?', cleaned, re.IGNORECASE)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                pass
        # Fall back to the first bare number anywhere in the string.
        matches = re.findall(r'[-+]?\d*\.?\d+', cleaned)
        if matches:
            try:
                return float(matches[0])
            except ValueError:
                pass
    return None
def extract_text_field(self, specifications: Dict[str, Any], keywords: List[str]) -> Optional[str]:
    """Return the first textual value whose raw key contains one of *keywords*.

    Matching is case-insensitive substring matching on the raw spec keys;
    numeric values are stringified. Returns None when nothing matches.
    """
    if not specifications:
        return None
    # Case-insensitive view of the raw keys.
    lowered = {key.lower(): val for key, val in specifications.items()}
    for needle in keywords:
        needle_low = needle.lower()
        for raw_key, raw_val in lowered.items():
            if needle_low not in raw_key:
                continue
            if isinstance(raw_val, str):
                return raw_val.strip()
            if isinstance(raw_val, (int, float)):
                return str(raw_val)
    return None
def enrich_specifications(self, raw_specs: Dict[str, Any], make: str, model: str, year_from: int) -> Dict[str, Any]:
    """Structure the raw specs: fuzzy-extract standard fields, keep the raw copy.

    Returns:
        {'standardized': {field: value-or-None, ...}, '_raw': raw_specs} —
        the original R1 payload is preserved untouched under '_raw'.
    """
    standardized: Dict[str, Any] = {}
    # Numeric metrics via fuzzy key matching.
    for metric, metric_keywords in FUZZY_MAPPING.items():
        standardized[metric] = self.extract_fuzzy_metric(raw_specs, metric, metric_keywords)
    # Free-text fields via keyword lookup.
    for text_field, field_keywords in TEXT_FIELD_KEYWORDS.items():
        standardized[text_field] = self.extract_text_field(raw_specs, field_keywords)
    return {
        "standardized": standardized,
        "_raw": raw_specs,
    }
async def process_item(self, session: AsyncSession, item: Dict[str, Any]) -> bool:
    """Enrich one library row and persist the result.

    Promotes power_kw / engine_cc into physical columns, stores the
    restructured specifications JSON, and advances pipeline_status to
    'pending_match'.

    Returns:
        bool: True after a successful commit, False after rollback.
    """
    try:
        logger.info(f"Feldolgozás: ID={item['id']}, {item['make']} {item['model']} ({item['year_from']})")
        # Build the {standardized, _raw} structure from the raw specs.
        updated_specs = self.enrich_specifications(
            item['specifications'],
            item['make'],
            item['model'],
            item['year_from']
        )
        # Values promoted to physical columns for indexed querying.
        power_kw = updated_specs['standardized'].get('power_kw')
        engine_cc = updated_specs['standardized'].get('engine_capacity')
        update_query = text("""
            UPDATE vehicle.external_reference_library
            SET power_kw = :power_kw,
                engine_cc = :engine_cc,
                make = :make,
                model = :model,
                year_from = :year_from,
                specifications = :updated_specifications,
                pipeline_status = 'pending_match'
            WHERE id = :id
        """)
        params = {
            # int() truncates fractional extracted values toward zero.
            "power_kw": int(power_kw) if power_kw is not None else None,
            "engine_cc": int(engine_cc) if engine_cc is not None else None,
            "make": item['make'],
            "model": item['model'],
            "year_from": item['year_from'],
            "updated_specifications": json.dumps(updated_specs),
            "id": item['id']
        }
        await session.execute(update_query, params)
        await session.commit()
        logger.info(f"Sikeres frissítés: ID={item['id']}, power_kw={power_kw}, engine_cc={engine_cc}")
        return True
    except Exception as e:
        logger.error(f"Hiba a feldolgozás során ID={item['id']}: {e}")
        await session.rollback()
        return False
async def run_once(self):
    """Single cycle: claim one 'pending_enrich' row and process it.

    Returns:
        bool: False when nothing was pending or the DB failed, True on success.
    """
    async with AsyncSessionLocal() as session:
        try:
            # NOTE(review): process_item() commits/rolls back inside this
            # session.begin() scope — confirm the outer transaction context
            # tolerates the inner commit on exit.
            async with session.begin():
                item = await self.fetch_next_library_item(session)
                if not item:
                    logger.debug("Nincs feldolgozandó elem")
                    return False
                success = await self.process_item(session, item)
                return success
        except SQLAlchemyError as e:
            logger.error(f"Adatbázis hiba: {e}")
            return False
async def run_loop(self):
    """Main loop: process rows until stopped, sleeping when idle or on error."""
    logger.info("R2 Enricher indítva...")
    while self.running:
        try:
            success = await self.run_once()
            if not success:
                # Nothing to do — back off briefly.
                await asyncio.sleep(SLEEP_INTERVAL)
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt, leállítás...")
            self.running = False
            break
        except Exception as e:
            logger.error(f"Váratlan hiba a ciklusban: {e}")
            await asyncio.sleep(SLEEP_INTERVAL)
    logger.info("R2 Enricher leállt")
def stop(self):
    """Request a graceful shutdown; the loop exits after the current item."""
    self.running = False
async def main():
    """Entry point: wire SIGINT/SIGTERM to a graceful stop, then run the loop."""
    enricher = UltimateSpecsEnricher()

    # Signal handling: only flips the running flag so the current item
    # finishes before the loop exits.
    def signal_handler(signum, frame):
        logger.info(f"Signal {signum} fogadva, leállítás...")
        enricher.stop()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    try:
        await enricher.run_loop()
    except asyncio.CancelledError:
        logger.info("Task cancelled")
    finally:
        logger.info("R2 Enricher befejezte a munkát.")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,400 @@
#!/usr/bin/env python3
"""
Worker: vehicle_ultimate_r3_finalizer
Producer-Consumer lánc negyedik, utolsó eleme (Az Összevezető).
Offline dolgozik egy végtelen while ciklusban (1-3 mp delay), és a meglévő adatbázis-táblákat szinkronizálja.
1. Lekérdezés (JOIN a Queue-val): Kivesz egy `pending_match` sort a Library-ből, és a Queue-ból lekéri az eredeti `parent_id`-t és a link nevét.
2. Szülő (Base VMD) ellenőrzése: Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
3. Összevezetés (UPDATE vagy INSERT): A letisztított adatok a lib.specifications['standardized'] dict-ből jönnek.
- A ÁG: Ha a szülő status értéke IN ('pending', 'manual_review_needed'): UPDATE a szülő (VMD) rekordon
- B ÁG: Ha a szülő status MÁR NEM 'pending': INSERT új variációként a VMD táblába
4. Library lezárása: Frissíti a Library táblát pipeline_status = 'completed', matched_vmd_id beállítása.
"""
import asyncio
import logging
import random
import sys
import signal
import json
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
from sqlalchemy import text, select, and_, or_
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import AsyncSessionLocal
# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [R3-FINALIZER] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("R3-FINALIZER")
# Configuration
# NOTE(review): sampled once at import — fixed per process, not per wait.
SLEEP_INTERVAL = random.uniform(1, 3)  # wait between 1 and 3 seconds
class UltimateSpecsFinalizer:
    """Stage R3: merges matched library rows back into the VMD table."""

    def __init__(self):
        # Main-loop control flag.
        self.running = True
async def fetch_pending_match(self, session: AsyncSession) -> Optional[Dict[str, Any]]:
    """Claim one 'pending_match' library row joined with its queue entry.

    Locks only the library row (FOR UPDATE OF lib SKIP LOCKED) so parallel
    finalizers never pick the same row. Returns None when nothing is
    pending. Unlike the other fetchers, DB errors propagate to the caller.
    """
    # LIMIT placed before the locking clause — the order shown in the
    # PostgreSQL SELECT grammar and already used by the R0 worker's query.
    query = text("""
        SELECT lib.id, lib.source_url, lib.make, lib.model, lib.year_from,
               lib.power_kw, lib.engine_cc, lib.specifications, lib.category,
               q.parent_id, q.name AS variant_name
        FROM vehicle.external_reference_library lib
        JOIN vehicle.auto_data_crawler_queue q ON lib.source_url = q.url
        WHERE lib.pipeline_status = 'pending_match'
        LIMIT 1
        FOR UPDATE OF lib SKIP LOCKED
    """)
    result = await session.execute(query)
    row = result.fetchone()
    if not row:
        return None
    return {
        "lib_id": row[0],
        "source_url": row[1],
        "make": row[2],
        "model": row[3],
        "year_from": row[4],
        "power_kw": row[5],
        "engine_cc": row[6],
        # Defensive: treat a NULL jsonb as an empty dict.
        "specifications": row[7] if row[7] else {},
        "category": row[8],
        "parent_id": row[9],
        "variant_name": row[10]
    }
async def get_parent_vmd(self, session: AsyncSession, parent_id: int) -> Optional[Dict[str, Any]]:
"""
Lekérdezi az eredeti szülő rekordot a VMD táblából a parent_id alapján.
FOR UPDATE (zárolás a konkurrens feldolgozás elkerülésére)
"""
query = text("""
SELECT id, status FROM vehicle.vehicle_model_definitions
WHERE id = :parent_id FOR UPDATE
""")
result = await session.execute(query, {"parent_id": parent_id})
row = result.fetchone()
if not row:
return None
return {
"id": row[0],
"status": row[1]
}
def extract_standardized_data(self, specifications: Dict[str, Any]) -> Dict[str, Any]:
"""
Kinyeri a standardizált adatokat a specifications['standardized'] dict-ből.
Csonkolja a szöveges mezőket a VMD tábla korlátaihoz (50 karakter).
"""
standardized = specifications.get('standardized', {})
# Alapvető numerikus mezők
extracted = {
"power_kw": standardized.get("power_kw"),
"engine_capacity": standardized.get("engine_capacity"),
"torque_nm": standardized.get("torque_nm"),
"max_speed": standardized.get("max_speed"),
"curb_weight": standardized.get("curb_weight"),
"wheelbase": standardized.get("wheelbase"),
"seats": standardized.get("seats"),
"fuel_type": standardized.get("fuel_type"),
"transmission_type": standardized.get("transmission_type"),
"drive_type": standardized.get("drive_type"),
"body_type": standardized.get("body_type"),
}
# Csonkolás a VMD mezőhosszokhoz
def truncate(value: Any, max_len: int = 50) -> Any:
if isinstance(value, str) and len(value) > max_len:
return value[:max_len]
return value
# Alkalmazza a csonkolást a szöveges mezőkre
for field in ["fuel_type", "transmission_type", "drive_type", "body_type"]:
if extracted.get(field):
extracted[field] = truncate(extracted[field], 50)
# Tisztítás: None értékek eltávolítása
return {k: v for k, v in extracted.items() if v is not None}
async def update_parent_vmd(self, session: AsyncSession, parent_id: int,
lib_data: Dict[str, Any], standardized: Dict[str, Any]) -> int:
"""
A ÁG: Frissíti a szülő VMD rekordot a kinyert standardizált adatokkal.
Állítja a VMD status-át 'awaiting_ai_synthesis'-re.
Visszaadja a parent_id-t (matched_vmd_id).
"""
# Build update fields
update_fields = {
"power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
"engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
"torque_nm": standardized.get("torque_nm"),
"max_speed": standardized.get("max_speed"),
"curb_weight": standardized.get("curb_weight"),
"wheelbase": standardized.get("wheelbase"),
"seats": standardized.get("seats"),
"fuel_type": standardized.get("fuel_type"),
"transmission_type": standardized.get("transmission_type"),
"drive_type": standardized.get("drive_type"),
"body_type": standardized.get("body_type"),
"status": "awaiting_ai_synthesis",
"updated_at": datetime.utcnow(),
"source": "ultimatespecs",
"priority_score": 30,
}
# Remove None values
update_fields = {k: v for k, v in update_fields.items() if v is not None}
# Build SET clause
set_clause = ", ".join([f"{k} = :{k}" for k in update_fields.keys()])
query = text(f"""
UPDATE vehicle.vehicle_model_definitions
SET {set_clause}
WHERE id = :parent_id
RETURNING id
""")
params = {"parent_id": parent_id, **update_fields}
result = await session.execute(query, params)
updated_id = result.scalar()
logger.info(f"UPDATE parent VMD {parent_id} with {len(update_fields)} fields")
return updated_id
async def insert_variant_vmd(self, session: AsyncSession, lib_data: Dict[str, Any],
standardized: Dict[str, Any], variant_name: str) -> int:
"""
B ÁG: Beszúr egy új variációt a VMD táblába.
make = lib.make, marketing_name = variant_name, year_from = lib.year_from.
status = 'awaiting_ai_synthesis', source = 'ultimatespecs', priority_score = 30.
Visszaadja az új ID-t (matched_vmd_id).
Ha már létezik a rekord (duplicate key), visszaadja a meglévő ID-t.
"""
# Build insert data
insert_data = {
"make": lib_data["make"],
"marketing_name": variant_name,
"official_marketing_name": variant_name,
"year_from": lib_data["year_from"],
"power_kw": standardized.get("power_kw") or lib_data.get("power_kw"),
"engine_capacity": standardized.get("engine_capacity") or lib_data.get("engine_cc"),
"torque_nm": standardized.get("torque_nm"),
"max_speed": standardized.get("max_speed"),
"curb_weight": standardized.get("curb_weight"),
"wheelbase": standardized.get("wheelbase"),
"seats": standardized.get("seats"),
"fuel_type": standardized.get("fuel_type"),
"transmission_type": standardized.get("transmission_type"),
"drive_type": standardized.get("drive_type"),
"body_type": standardized.get("body_type"),
"status": "awaiting_ai_synthesis",
"vehicle_class": lib_data.get("category"),
"source": "ultimatespecs",
"priority_score": 30,
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow(),
"market": "EU",
"normalized_name": f"{lib_data['make']} {variant_name}",
"technical_code": "UNKNOWN",
"variant_code": "UNKNOWN",
"version_code": "UNKNOWN",
"specifications": json.dumps({}), # Üres JSON, mert NOT NULL
"raw_api_data": json.dumps({}), # Üres JSON
"research_metadata": json.dumps({}), # Üres JSON
"raw_search_context": "", # Üres string
}
# Remove None values
insert_data = {k: v for k, v in insert_data.items() if v is not None}
# Build columns and values
columns = ", ".join(insert_data.keys())
placeholders = ", ".join([f":{k}" for k in insert_data.keys()])
try:
# Próbáljuk meg beszúrni
query = text(f"""
INSERT INTO vehicle.vehicle_model_definitions ({columns})
VALUES ({placeholders})
RETURNING id
""")
result = await session.execute(query, insert_data)
new_id = result.scalar()
logger.info(f"INSERT new variant VMD {new_id} for {lib_data['make']} {variant_name}")
return new_id
except IntegrityError as e:
# Duplicate key violation - rollback és új lekérdezés
logger.warning(f"Duplicate key violation for {lib_data['make']} {variant_name}: {e}. Rolling back and looking for existing record...")
# Rollback a megszakított tranzakciót
await session.rollback()
# Keresés a meglévő rekordra új tranzakcióban
find_query = text("""
SELECT id FROM vehicle.vehicle_model_definitions
WHERE make = :make
AND marketing_name = :marketing_name
AND year_from = :year_from
LIMIT 1
""")
find_params = {
"make": lib_data["make"],
"marketing_name": variant_name,
"year_from": lib_data["year_from"]
}
result = await session.execute(find_query, find_params)
existing_id = result.scalar()
if existing_id:
logger.info(f"Found existing VMD {existing_id} for {lib_data['make']} {variant_name}")
return existing_id
else:
# Ha nem találjuk, dobjuk tovább a hibát
logger.error(f"Duplicate key but could not find existing record for {lib_data['make']} {variant_name}")
raise
async def close_library_entry(self, session: AsyncSession, lib_id: int, matched_vmd_id: int):
"""
Frissíti a Library táblát: pipeline_status = 'completed', matched_vmd_id beállítása.
"""
query = text("""
UPDATE vehicle.external_reference_library
SET pipeline_status = 'completed',
matched_vmd_id = :matched_vmd_id
WHERE id = :lib_id
""")
await session.execute(query, {"lib_id": lib_id, "matched_vmd_id": matched_vmd_id})
logger.info(f"Library {lib_id} closed with matched_vmd_id {matched_vmd_id}")
async def process_one(self):
"""
Feldolgoz egyetlen pending_match rekordot.
"""
async with AsyncSessionLocal() as session:
try:
# 1. Lekérdezés a Library-ből
lib_data = await self.fetch_pending_match(session)
if not lib_data:
return False
logger.info(f"Processing library ID {lib_data['lib_id']} for {lib_data['make']} {lib_data['model']}")
# 2. Szülő VMD ellenőrzése
parent_vmd = None
if lib_data['parent_id']:
parent_vmd = await self.get_parent_vmd(session, lib_data['parent_id'])
# 3. Standardizált adatok kinyerése
standardized = self.extract_standardized_data(lib_data['specifications'])
# 4. Döntés: UPDATE vagy INSERT
matched_vmd_id = None
if parent_vmd and parent_vmd['status'] in ('pending', 'manual_review_needed'):
# A ÁG: Szülő frissítése
matched_vmd_id = await self.update_parent_vmd(
session, parent_vmd['id'], lib_data, standardized
)
else:
# B ÁG: Új variáció beszúrása
matched_vmd_id = await self.insert_variant_vmd(
session, lib_data, standardized, lib_data['variant_name']
)
# 5. Library lezárása
await self.close_library_entry(session, lib_data['lib_id'], matched_vmd_id)
# Commit
await session.commit()
logger.info(f"Successfully finalized library {lib_data['lib_id']} -> VMD {matched_vmd_id}")
return True
except Exception as e:
await session.rollback()
logger.error(f"Error processing library {lib_data.get('lib_id', 'unknown')}: {e}")
return False
async def run(self, max_iterations: int = 10):
"""
Fő futási ciklus: korlátozott számú iteráció, 1-3 mp várakozással.
Args:
max_iterations: Maximum number of processing cycles (default: 10)
"""
logger.info(f"R3 Finalizer started. Max iterations: {max_iterations}. Waiting for pending_match entries...")
iteration = 0
while self.running and iteration < max_iterations:
try:
processed = await self.process_one()
if not processed:
# Nincs munka vagy hiba történt, várakozás
await asyncio.sleep(SLEEP_INTERVAL)
else:
# Sikeres feldolgozás után rövid várakozás
await asyncio.sleep(0.5)
# Minden esetben növeljük az iterációt (akár sikeres, akár sikertelen volt)
iteration += 1
logger.info(f"Iteration {iteration}/{max_iterations} completed.")
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Unexpected error in main loop: {e}")
await asyncio.sleep(5)
# Hiba esetén is növeljük az iterációt
iteration += 1
logger.info(f"Iteration {iteration}/{max_iterations} completed after error.")
logger.info(f"R3 Finalizer completed {iteration} iterations. Stopping.")
self.stop()
def stop(self):
self.running = False
logger.info("R3 Finalizer stopping...")
def main():
    """Entry point: install signal handlers and drive the finalizer loop."""
    finalizer = UltimateSpecsFinalizer()

    def _handle_shutdown(signum, frame):
        # Graceful stop: flip the running flag; the loop exits on its own.
        logger.info(f"Received signal {signum}, shutting down...")
        finalizer.stop()

    # Same handler for both interrupt and terminate.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, _handle_shutdown)

    # Start the main loop - iteration-capped for testing.
    try:
        # For testing: at most 5 iterations.
        asyncio.run(finalizer.run(max_iterations=5))
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt received, shutting down...")
    finally:
        logger.info("R3 Finalizer stopped.")


if __name__ == "__main__":
    main()