átlagos kiegészítések, jó sok
This commit is contained in:
159
backend/app/workers/vehicle/R3_engine_scout.py
Normal file
159
backend/app/workers/vehicle/R3_engine_scout.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from sqlalchemy import text
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
# Every log line is tagged with the worker version so multiple R3 variants
# can be told apart in aggregated logs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [R3-EXTRACTOR-v1.2] %(message)s')
logger = logging.getLogger("R3")

# --- CONFIGURATION PARAMETERS ---
MAX_RETRY_LIMIT = 3  # at most 3 attempts per vehicle before giving up
|
||||
|
||||
class R3DataMiner:
    """Worker that scrapes vehicle specification pages (auto-data.net) and
    persists the extracted data.

    Flow per item: claim one 'engine'-level row from the crawler queue
    (SELECT ... FOR UPDATE SKIP LOCKED so concurrent workers never grab the
    same row), scrape the page with Playwright, parse the spec table with
    BeautifulSoup, then upsert into ``vehicle.external_reference_library``.
    Failures increment ``retry_count``; after MAX_RETRY_LIMIT attempts the
    row is parked as ``manual_review_needed``.
    """

    def clean_key(self, key):
        """Normalize a raw spec-table header into a clean dictionary key.

        The source site phrases headers as questions ("What is the fuel
        consumption?"); strip the question scaffolding and capitalize.
        """
        # Headers sometimes carry a prefix before a comma; keep the last part.
        if "," in key:
            key = key.split(",")[-1]
        key = key.replace("What is the ", "").replace("How much ", "").replace("How many ", "")
        return key.split("?")[0].strip().capitalize()

    async def scrape_specs(self, context, url):
        """Fetch one spec page and parse it into a flat dict.

        Returns the parsed dict, or None when the page could not be loaded
        or yielded no usable data (caller treats None as a failed attempt).
        """
        page = await context.new_page()
        try:
            # Random delay to avoid triggering bot protection.
            await asyncio.sleep(random.uniform(4, 8))
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            data = {"make": "", "model": "", "generation": "", "modification": "",
                    "year_from": None, "power_kw": 0, "engine_cc": 0,
                    "specifications": {}, "source_url": url}

            # Original parser logic: the page lays specs out as <tr><th><td> rows.
            for row in soup.find_all('tr'):
                th, td = row.find('th'), row.find('td')
                if not th or not td:
                    continue
                k_raw, v = th.get_text(strip=True), td.get_text(strip=True)
                k_low = k_raw.lower()

                if "brand" == k_low:
                    data["make"] = v
                elif "model" == k_low:
                    data["model"] = v
                elif "generation" == k_low:
                    data["generation"] = v
                elif "modification" == k_low:
                    data["modification"] = v
                elif "start of production" in k_low:
                    # First 4-digit group is the production start year.
                    m = re.search(r'(\d{4})', v)
                    data["year_from"] = int(m.group(1)) if m else None
                elif "power" == k_low:
                    # Site reports horsepower; convert to kW (1 kW ≈ 1.36 Hp).
                    hp = re.search(r'(\d+)\s*Hp', v, re.I)
                    if hp:
                        data["power_kw"] = int(int(hp.group(1)) / 1.36)
                elif "displacement" in k_low:
                    cc = re.search(r'(\d+)\s*cm3', v)
                    if cc:
                        data["engine_cc"] = int(cc.group(1))

                # Every row also lands in the raw specifications blob.
                data["specifications"][self.clean_key(k_raw)] = v

            if not data["make"] or not data["specifications"]:
                return None

            return data
        except Exception as e:
            logger.error(f"Hiba az adatlapon ({url}): {e}")
            return None
        finally:
            await page.close()

    async def run(self):
        """Main worker loop: claim queue rows one at a time until none remain."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )

            while True:
                target = None
                async with AsyncSessionLocal() as db:
                    try:
                        # NOTE: priority_score was removed earlier because the
                        # column does not exist in the crawler_queue table.
                        # FIX: COALESCE(retry_count, 0) — rows with a NULL
                        # retry_count (treated as 0 below) were previously
                        # skipped forever, since NULL < 3 is NULL in SQL.
                        # The limit now comes from MAX_RETRY_LIMIT instead of
                        # a hard-coded 3 duplicated across the file.
                        res = await db.execute(text("""
                            UPDATE vehicle.auto_data_crawler_queue SET status = 'processing'
                            WHERE id = (
                                SELECT id FROM vehicle.auto_data_crawler_queue
                                WHERE level = 'engine'
                                  AND status IN ('pending', 'error')
                                  AND COALESCE(retry_count, 0) < :limit
                                ORDER BY id ASC
                                LIMIT 1 FOR UPDATE SKIP LOCKED
                            ) RETURNING id, url, name, retry_count
                        """), {"limit": MAX_RETRY_LIMIT})
                        target = res.fetchone()
                        await db.commit()
                    except Exception as e:
                        logger.error(f"❌ DB Hiba a feladatfelvételnél: {e}")
                        await asyncio.sleep(5)
                        continue

                if not target:
                    logger.info("🏁 Minden feladat elvégezve. Leállás.")
                    break

                t_id, t_url, t_name, t_retry = target
                if t_retry is None:
                    t_retry = 0

                logger.info(f"🚀 [{t_retry + 1}/{MAX_RETRY_LIMIT}] Dolgozom: {t_name}")
                data = await self.scrape_specs(context, t_url)

                async with AsyncSessionLocal() as db:
                    if data and data["make"]:
                        # Upsert keyed on source_url: re-scrapes refresh the
                        # specifications blob instead of duplicating rows.
                        await db.execute(text("""
                            INSERT INTO vehicle.external_reference_library
                            (source_name, make, model, generation, modification, year_from, power_kw, engine_cc, specifications, source_url)
                            VALUES ('auto-data.net', :make, :model, :gen, :mod, :y, :p, :e, :s, :u)
                            ON CONFLICT (source_url) DO UPDATE SET
                                specifications = EXCLUDED.specifications,
                                last_scraped_at = NOW();
                        """), {
                            "make": data["make"], "model": data["model"], "gen": data["generation"],
                            "mod": data["modification"], "y": data["year_from"], "p": data["power_kw"],
                            "e": data["engine_cc"], "s": json.dumps(data["specifications"]), "u": data["source_url"]
                        })

                        await db.execute(text("UPDATE vehicle.auto_data_crawler_queue SET status = 'completed', updated_at = NOW() WHERE id = :id"), {"id": t_id})
                        logger.info(f"✅ ARANYMENTÉS: {data['make']} {data['model']} {data['modification']}")
                    else:
                        new_retry = t_retry + 1
                        if new_retry >= MAX_RETRY_LIMIT:
                            # Give up: park the row for a human to inspect.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'manual_review_needed',
                                    retry_count = :rc,
                                    error_msg = 'Sikertelen adatgyűjtés 3 próbálkozás után',
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.error(f"🚨 LIMIT ELÉRVE: {t_name} -> manual_review_needed")
                        else:
                            # Back to 'error' so the claim query picks it up again.
                            await db.execute(text("""
                                UPDATE vehicle.auto_data_crawler_queue
                                SET status = 'error',
                                    retry_count = :rc,
                                    updated_at = NOW()
                                WHERE id = :id
                            """), {"rc": new_retry, "id": t_id})
                            logger.warning(f"⚠️ Sikertelen próbálkozás ({new_retry}/{MAX_RETRY_LIMIT}): {t_name}")

                    await db.commit()

            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: drive the miner until the queue is drained, or exit
    # cleanly on Ctrl+C instead of dumping a traceback.
    try:
        asyncio.run(R3DataMiner().run())
    except KeyboardInterrupt:
        logger.info("🛑 Felhasználói leállítás.")
|
||||
Reference in New Issue
Block a user