teljes backend_mentés
This commit is contained in:
113
backend/app/workers/vehicle/bike/test_aprilia.py.old
Normal file
113
backend/app/workers/vehicle/bike/test_aprilia.py.old
Normal file
@@ -0,0 +1,113 @@
|
||||
import asyncio
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def test_scraper():
    """Scrape the spec sheets of two autoevolution.com bike pages and print them.

    Launches headless Chromium through Playwright, opens each test URL, runs a
    DOM-walking JavaScript extractor inside the page and prints the collected
    key/value pairs as JSON. Keys that contain none of the known spec names
    are filtered out; if the filter would leave nothing, the unfiltered result
    is printed instead.

    Returns:
        None. All results are written to stdout.
    """
    # Two problem-focused URLs: the modern Aprilia page and the old BMW page
    # whose HTML is faulty.
    test_urls = [
        "https://www.autoevolution.com/moto/aprilia-rs-660-factory-2025.html",
        "https://www.autoevolution.com/moto/bmw-f-650-gs-2011.html"
    ]

    # The refined autoevolution parser. It is loop-invariant, so it is built
    # once instead of once per URL. Extraction strategies, in order:
    #   1. td.left / td.right cell pairs (older pages),
    #   2. dt / dd pairs (modern pages; overwrites keys found by step 1),
    #   3. .spec-row containers with .label / .value children (only fills
    #      keys that are still missing),
    #   4. fallback on <b>/<strong> labels, used only when steps 1-3 found
    #      nothing at all.
    script = """
        () => {
            let results = {};

            // 1. MÓDSZER: Régi motorok (pl. BMW F650GS) -> td.left és td.right
            let leftCells = document.querySelectorAll('td.left');
            leftCells.forEach(cell => {
                let key = cell.innerText.replace(/:$/, '').trim();
                let rightCell = cell.nextElementSibling;
                if(rightCell && rightCell.classList.contains('right')) {
                    results[key] = rightCell.innerText.trim();
                }
            });

            // 2. MÓDSZER: Modern motorok (pl. Aprilia) -> dt és dd
            let dts = document.querySelectorAll('dt');
            dts.forEach(dt => {
                let key = dt.innerText.replace(/:$/, '').trim();
                let dd = dt.nextElementSibling;
                if(dd && dd.tagName.toLowerCase() === 'dd') {
                    results[key] = dd.innerText.trim();
                }
            });

            // 3. MÓDSZER: Alternatív modern layout -> span.label és span.value
            let specRows = document.querySelectorAll('.spec-row');
            specRows.forEach(row => {
                let label = row.querySelector('.label');
                let value = row.querySelector('.value');
                if(label && value) {
                    let key = label.innerText.replace(/:$/, '').trim();
                    if (!results[key]) {
                        results[key] = value.innerText.trim();
                    }
                }
            });

            // 4. MÓDSZER: "Adler" típusú elavult leírások fallbackje -> Vastagított szöveg
            if (Object.keys(results).length === 0) {
                document.querySelectorAll('b, strong').forEach(b => {
                    let key = b.innerText.replace(/:$/, '').trim();
                    if(key.length > 2 && key.length < 30) {
                        let val = "";
                        // Ha a szöveg közvetlenül a tag után van (Text Node)
                        if(b.nextSibling && b.nextSibling.nodeType === 3) {
                            val = b.nextSibling.textContent.trim();
                        }
                        // Ha egy másik elemben van
                        else if (b.nextElementSibling && b.nextElementSibling.tagName !== 'B') {
                            val = b.nextElementSibling.innerText.trim();
                        }
                        if(val && !results[key]) {
                            results[key] = val;
                        }
                    }
                });
            }

            return results;
        }
    """

    # Spec names considered relevant; a scraped key is kept when it contains
    # any of these as a case-insensitive substring.
    relevant_keys = ["Type", "Displacement", "Bore X Stroke", "Compression Ratio",
                     "Horsepower", "Torque", "Fuel System", "Gearbox", "Clutch",
                     "Final Drive", "Frame", "Front Suspension", "Rear Suspension",
                     "Front Brake", "Rear Brake", "Overall Length", "Overall Width",
                     "Seat Height", "Wheelbase", "Fuel Capacity", "Weight", "Dry Weight",
                     "Wet Weight", "Front", "Rear"]
    # Lower-case once here rather than for every (key, spec name) comparison.
    relevant_terms = tuple(rk.lower() for rk in relevant_keys)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
            page = await context.new_page()

            for url in test_urls:
                print(f"\n{'='*60}")
                print(f"🌍 MEGNYITÁS: {url}")
                print(f"{'='*60}")

                # Wait until the DOM has been loaded.
                await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                await asyncio.sleep(2)  # Give the page's JS a moment to run

                data = await page.evaluate(script)

                if data:
                    # Filter out the noise and keep only the relevant technical data.
                    filtered_data = {
                        k: v for k, v in data.items()
                        if any(term in k.lower() for term in relevant_terms)
                    }
                    # Fall back to the raw result when the filter removed everything.
                    shown = filtered_data or data

                    print("\n🟢 KINYERT ADATOK (DOM PARSZOLÓ):")
                    print(json.dumps(shown, indent=2, ensure_ascii=False))
                    print(f"\n✅ Összesen {len(shown)} műszaki paramétert találtam.")
                else:
                    print("\n🔴 NULLA ADAT - A DOM parszoló nem talált egyezést.")
        finally:
            # Close the browser even if navigation or evaluation raised.
            await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the scraper coroutine to completion on a fresh
    # event loop when this file is executed directly.
    asyncio.run(test_scraper())
|
||||
Reference in New Issue
Block a user