-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathhahu_processor.py
73 lines (56 loc) · 2.42 KB
/
hahu_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from lxml import html
from lxml import etree
from lxml.etree import tostring
import requests
from typing import List
from cache import loadToCache
from datamodels import car
# HTTP request headers; the user-agent value presents the client as a
# desktop Firefox browser.
header = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) "
        "Gecko/20100101 Firefox/32.0"
    ),
}
def page(num, base):
    """Return *base* with a ``/page<num>`` suffix appended.

    *num* is converted with ``str``; *base* must already be a string.
    """
    pieces = (base, "/page", str(num))
    return "".join(pieces)
# Seconds to wait on the server before a request is abandoned, so a stalled
# connection cannot hang the whole scrape.
_REQUEST_TIMEOUT = 30


def _count_pages(tree) -> int:
    """Return the number of result pages advertised by the parsed page *tree*.

    The count is read from the href of ``<link rel="last">``, taking the text
    after the final "page" marker. Falls back to 1 when the link or the marker
    is missing. A non-numeric suffix still raises ``ValueError``.
    """
    try:
        last_href = tree.xpath('//link[@rel="last"]/@href')[0]
        # rsplit: only the trailing "page<N>" matters, even if "page" also
        # occurs earlier in the URL.
        return int(last_href.rsplit("page", 1)[1])
    except IndexError:
        return 1


def _parse_listing(listing) -> car:
    """Build a ``car`` from one result-row element of the listing page."""
    image_cell = listing.find('.//div[@class="talalatisor-kep"]')
    data_cell = listing.find('.//div[@class="talalatisor-adatok"]')
    info = data_cell.find('.//div[@class="talalatisor-info adatok"]')

    link = image_cell.find(".//a")
    title = link.get("title")
    url = link.get("href")

    # Rows without a thumbnail are cached under the "NotFound" placeholder.
    thumbnail = image_cell.find('.//img[@class="img-responsive"]')
    img = loadToCache(
        thumbnail.get("src") if thumbnail is not None else "NotFound"
    )

    price = data_cell.find('.//div[@class="vetelar"]').text
    listing_id = listing.find(".//*[@data-hirkod]").get("data-hirkod")

    span_texts = [span.text for span in info.findall(".//span")]
    if None in span_texts:
        # A span with no text: the value may instead be held by the <abbr>
        # whose title is "Kilométeróra állása" (the odometer reading).
        odometer = info.find('.//abbr[@title="Kilométeróra állása"]')
        filler = "? km"
        if odometer is not None and odometer.text is not None:
            filler = odometer.text
        span_texts = [
            filler if text is None else text for text in span_texts
        ]

    return car(listing_id, title, url, price, img, " ".join(span_texts))


def fetch_results_from_query(query) -> List[car]:
    """Download every result page of *query* and return the parsed listings.

    Args:
        query: object exposing ``url`` (address of the first result page) and
            ``name`` (used only in the progress message).

    Returns:
        One ``car`` per result row found, in page order.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the timeout.
        AttributeError: if a result row lacks one of the expected elements.
    """
    first_response = requests.get(
        query.url, headers=header, timeout=_REQUEST_TIMEOUT
    )
    num_of_pages = _count_pages(html.fromstring(first_response.content))

    results: List[car] = []
    for pagenum in range(1, num_of_pages + 1):
        print(
            f"\rProcessing page {pagenum} out of {num_of_pages} for query {query.name}",
            end="",
        )
        # Send the same headers as the initial request; previously the page
        # requests went out with the default user-agent.
        response = requests.get(
            page(pagenum, query.url),
            headers=header,
            timeout=_REQUEST_TIMEOUT,
        )
        tree = html.fromstring(response.content)
        listings = tree.xpath('.//div[contains(@class, "row talalati-sor")]')
        results.extend(_parse_listing(listing) for listing in listings)
    return results