# Source file: service-finder/backend/app/workers/vehicle/R4_final_extractor.py
# Snapshot: 2026-03-22 11:02:05 +00:00 (132 lines, 5.9 KiB, Python)
import asyncio
import logging
import random
import json
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from sqlalchemy import text
from app.database import AsyncSessionLocal
# Root logging config: timestamped INFO-level messages tagged with the worker name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R4-EXTRACTOR] %(message)s')
logger = logging.getLogger("R4")
class FinalExtractor:
    """Harvests engine-level spec pages from auto-data.net.

    Pulls pending 'engine' URLs from vehicle.auto_data_crawler_queue
    (claimed with FOR UPDATE SKIP LOCKED so several worker processes can
    coexist), scrapes each spec table with Playwright + BeautifulSoup and
    upserts the parsed record into vehicle.external_reference_library.
    """

    def __init__(self):
        # Cap concurrent page scrapes — keeps the crawler polite (anti-bot caution).
        self.semaphore = asyncio.Semaphore(2)

    def clean_key(self, key):
        """Normalize a raw spec-table header into a compact dictionary key.

        Keeps only the part after the last comma, strips question phrasing
        ("What is the ...", "How much ...", "How many ...") and anything
        after a '?', then capitalizes the result.
        """
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        key = key.split("?")[0].strip()
        return key.capitalize()

    async def scrape_engine(self, context, url):
        """Fetch one engine spec page and parse its <th>/<td> table rows.

        Returns a dict with make/model/generation/modification, production
        years, power (kW), displacement (cc) plus the full raw spec table,
        or None when scraping fails for any reason. The page is always
        closed, even on error.
        """
        page = await context.new_page()
        try:
            # Randomized delay between page loads (anti-bot measure).
            await asyncio.sleep(random.uniform(3, 6))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            data = {
                "make": "", "model": "", "generation": "", "modification": "",
                "year_from": None, "year_to": None, "power_kw": 0, "engine_cc": 0,
                "specifications": {}, "source_url": url
            }
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td:
                    continue
                raw_k, val = th.get_text(strip=True), td.get_text(strip=True)
                k_low = raw_k.lower()
                if k_low == "brand":
                    data["make"] = val
                elif k_low == "model":
                    data["model"] = val
                elif k_low == "generation":
                    data["generation"] = val
                elif k_low == "modification":
                    data["modification"] = val
                elif "start of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m:
                        data["year_from"] = int(m.group(1))
                elif "end of production" in k_low:
                    m = re.search(r'(\d{4})', val)
                    if m:
                        data["year_to"] = int(m.group(1))
                elif k_low == "power":
                    hp_m = re.search(r'(\d+)\s*Hp', val, re.I)
                    if hp_m:
                        # Metric horsepower -> kW (1 kW ~= 1.36 hp), truncated to int.
                        data["power_kw"] = int(int(hp_m.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc_m = re.search(r'(\d+)\s*cm3', val)
                    if cc_m:
                        data["engine_cc"] = int(cc_m.group(1))
                # Every row also goes into the raw specifications blob.
                clean_k = self.clean_key(raw_k)
                if clean_k and val:
                    data["specifications"][clean_k] = val
            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()

    async def save_to_library(self, data):
        """Upsert one parsed engine record into the reference library.

        Records without a make are silently skipped. Rows conflicting on
        source_url get their specifications refreshed in place.
        """
        if not data or not data["make"]:
            return
        async with AsyncSessionLocal() as db:
            try:
                await db.execute(text("""
                    INSERT INTO vehicle.external_reference_library
                    (source_name, make, model, generation, modification, year_from, year_to, power_kw, engine_cc, specifications, source_url)
                    VALUES ('auto-data.net', :make, :model, :gen, :mod, :y_f, :y_t, :p_kw, :e_cc, :specs, :url)
                    ON CONFLICT (source_url) DO UPDATE SET specifications = EXCLUDED.specifications, last_scraped_at = NOW();
                """), {
                    "make": data["make"], "model": data["model"], "gen": data["generation"],
                    "mod": data["modification"], "y_f": data["year_from"], "y_t": data["year_to"],
                    "p_kw": data["power_kw"], "e_cc": data["engine_cc"],
                    "specs": json.dumps(data["specifications"]), "url": data["source_url"]
                })
                await db.commit()
                logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} ({data['power_kw']} kW)")
            except Exception as e:
                # Roll back so the session is not stuck in a failed transaction.
                await db.rollback()
                logger.error(f"DB Hiba: {e}")

    async def run(self):
        """Main worker loop: claim queue items forever and process them."""
        logger.info("🚀 R4 Adatbányász indítása...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent="Mozilla/5.0...")
            while True:
                # Atomically claim one pending engine URL; SKIP LOCKED lets
                # multiple worker processes share the queue safely.
                async with AsyncSessionLocal() as db:
                    res = await db.execute(text("""
                        UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                        WHERE id = (
                            SELECT id FROM vehicle.auto_data_crawler_queue
                            WHERE level = 'engine' AND status = 'pending'
                            ORDER BY id ASC LIMIT 1 FOR UPDATE SKIP LOCKED
                        ) RETURNING id, url, name
                    """))
                    target = res.fetchone()
                    await db.commit()
                if not target:
                    logger.info("🏁 Nincs több feldolgozandó motoradat. Alvás 60mp...")
                    await asyncio.sleep(60)
                    continue
                t_id, t_url, t_name = target
                async with self.semaphore:
                    data = await self.scrape_engine(context, t_url)
                if data:
                    await self.save_to_library(data)
                    new_status = 'completed'
                else:
                    new_status = 'error'
                # Record the final outcome on the claimed queue row.
                async with AsyncSessionLocal() as db:
                    await db.execute(
                        text("UPDATE vehicle.auto_data_crawler_queue SET status = :s WHERE id = :id"),
                        {"s": new_status, "id": t_id})
                    await db.commit()
# Script entry point: run the extractor's infinite worker loop.
if __name__ == "__main__":
    asyncio.run(FinalExtractor().run())