Refactored code, fixed km ambiguity

nthd · Nov 30, 2018 · affea91 · affea91
1 parent cbaa4c0
commit affea91
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 265 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,7 @@
+cache/
+data_*/
+menu.html
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/cache.py b/cache.py
@@ -0,0 +1,25 @@
+import urllib
+import os
+import hashlib
+
+# The HTML generation uses a non-absolute path to the cache, so changing
+# only this absolute location would be unwise.
+cachedir = "{}/cache/".format(os.getcwd())
+
+# Make sure the cache directory exists as soon as the module is imported.
+if not os.path.exists(cachedir):
+	os.makedirs(cachedir)
+
+def loadToCache(imgurl):
+	"""Download an image into the local cache and return its relative path.
+
+	imgurl is the image URL, or the sentinel string "NotFound".
+
+	Returns "../cache/<md5 of url>.<extension>" when the image is (or
+	already was) cached, and "../resources/notfound.png" for the sentinel
+	or when the download fails.
+	"""
+	# A bare "import urllib" does not load the urllib.request submodule, so
+	# import it explicitly where it is used.
+	import urllib.request
+
+	placeholder = "../resources/notfound.png"
+	if imgurl == "NotFound":
+		return placeholder
+
+	extension = imgurl.split(".")[-1]
+	digest = hashlib.md5(imgurl.encode('utf-8')).hexdigest()
+	filename = digest + "." + extension
+	cacheFile = cachedir + filename
+
+	if not os.path.isfile(cacheFile):
+		try:
+			urllib.request.urlretrieve(imgurl, cacheFile)
+		except (OSError, ValueError):
+			# URLError/HTTPError/ContentTooShortError are OSError subclasses;
+			# ValueError covers malformed URLs. Drop any partially written
+			# file so it is not mistaken for a cache hit on the next run.
+			try:
+				os.remove(cacheFile)
+			except OSError:
+				pass
+			return placeholder
+
+	return "../cache/" + filename
diff --git a/dao.py b/dao.py
@@ -0,0 +1,59 @@
+import sqlite3
+from datamodels import *
+
+import os
+
+def setupNewDB(dirpath):
+	"""Create an empty newdata.db inside dirpath and return its connection.
+
+	dirpath is created if missing, and an existing newdata.db is deleted
+	first. The database gets a single "cars" table with the columns
+	id, title, url, price, img and cdata (all TEXT).
+
+	Raises SystemExit (status 1) if the table cannot be created.
+	"""
+	if not os.path.exists(dirpath):
+		os.makedirs(dirpath)
+
+	dbpath = dirpath+"/newdata.db"
+
+	# Always start from an empty file so rows from a previous run cannot leak in.
+	if os.path.isfile(dbpath):
+		os.remove(dbpath)
+
+	newdb = sqlite3.connect(dbpath)
+
+	try:
+		newdb.execute("CREATE TABLE cars(id TEXT, title TEXT, url TEXT, price TEXT, img TEXT, cdata TEXT)")
+	except sqlite3.OperationalError:
+		print("Error setting up the database")
+		newdb.close()
+		# quit() is an interactive helper injected by the site module and
+		# exits with status 0; raise SystemExit directly with a failure code.
+		raise SystemExit(1)
+
+	return newdb
+
+def insertResults(db, results):
+	"""Insert every result into the "cars" table of db and commit.
+
+	db is an open sqlite3 connection; each item of results must expose
+	the attributes id, title, url, price, img and data. All rows are
+	written in one transaction, so a failing insert leaves nothing
+	half-committed and the commit is not repeated per row.
+	"""
+	rows = [(res.id, res.title, res.url, res.price, res.img, res.data) for res in results]
+	db.executemany("INSERT INTO cars VALUES (?,?,?,?,?,?)", rows)
+	db.commit()
+
+def findChanges(dirpath, results):
+	"""Compare results with the archived data.db in dirpath.
+
+	Returns a list of change objects:
+	- "new" for results whose id is not in the old database (or for every
+	  result when no data.db exists yet),
+	- "changed" for results that differ from the stored row, with the
+	  reason taken from diffFromOld,
+	- "deleted" for stored rows whose id is absent from results.
+	"""
+	dbpath = dirpath+"/data.db"
+
+	# Without an archived database everything counts as new.
+	if not os.path.isfile(dbpath):
+		return [change(item, "new", "") for item in results]
+
+	changes = []
+	# A set keeps the "deleted" membership test O(1) per stored row.
+	newIDs = {newresult.id for newresult in results}
+
+	olddb = sqlite3.connect(dbpath)
+	try:
+		for currentCar in results:
+			oldres = olddb.execute("SELECT * from cars WHERE id=?", [currentCar.id]).fetchone()
+			if oldres is None:
+				changes.append(change(currentCar, "new", ""))
+				continue
+			oldcar = car(*oldres)
+			if oldcar != currentCar:
+				changes.append(change(currentCar, "changed", currentCar.diffFromOld(oldcar)))
+
+		for oldrow in olddb.execute("SELECT * from cars").fetchall():
+			oldCar = car(*oldrow)
+			if oldCar.id not in newIDs:
+				changes.append(change(oldCar, "deleted", "deleted"))
+	finally:
+		# Close the connection even when a query or row conversion fails.
+		olddb.close()
+
+	return changes
+
+def archiveDatabase(dirpath):
+	"""Promote newdata.db to data.db inside dirpath, replacing the old one.
+
+	Raises OSError (e.g. FileNotFoundError) if newdata.db does not exist;
+	in that case the existing data.db is left untouched.
+	"""
+	# os.replace overwrites the destination in a single step, so the old
+	# archive is never deleted before the new one is known to be in place.
+	os.replace(dirpath+"/newdata.db", dirpath+"/data.db")
diff --git a/datamodels.py b/datamodels.py
@@ -0,0 +1,53 @@
+class query:
+	"""A named search: a display name plus the URL to fetch."""
+
+	def __init__(self, name, url):
+		self.name, self.url = name, url
+
+class change:
+	"""A detected change: the affected car, a short summary and a detailed reason."""
+
+	def __init__(self, car, summary, reason):
+		self.car, self.summary, self.reason = car, summary, reason
+
+	def __str__(self):
+		# The reason on the first line, the car's own text form below it.
+		return "\n".join((self.reason, str(self.car)))
+
+	def toListItem(self, template):
+		"""Return template with every %...% placeholder filled from this change."""
+		listing = self.car
+		# Applied strictly in this order, one placeholder after the other.
+		substitutions = (
+			("%LISTING_REASON%", self.summary),
+			("%LISTING_ID%", listing.id),
+			("%LISTING_PRICE%", listing.price),
+			("%LISTING_LINK%", listing.url),
+			("%LISTING_TITLE%", listing.title),
+			("%LISTING_IMAGE%", listing.img),
+			("%DETAILED_REASON%", self.reason),
+			("%LISTING_DATA%", listing.data),
+		)
+		filled = template
+		for placeholder, value in substitutions:
+			filled = filled.replace(placeholder, value)
+		return filled
+
+class car:
+	"""One listing: id, title, url, price, image path and a free-text data string.
+
+	Non-breaking spaces in the price are stored as the HTML entity &nbsp;.
+	"""
+
+	def __init__(self, id, title, url, price, img, data):
+		self.id, self.title, self.url = id, title, url
+		self.price = str(price.replace('\xa0', '&nbsp;'))
+		self.img, self.data = img, data
+
+	def __str__(self):
+		return "\n".join((self.id, self.title, self.price, "___________"))
+
+	def __eq__(self, other):
+		# Only another car can be equal; all six fields must match.
+		if not isinstance(other, car):
+			return False
+		compared = ("id", "title", "url", "price", "img", "data")
+		return all(getattr(self, name) == getattr(other, name) for name in compared)
+
+	def diffFromOld(self, other):
+		"""Describe, as HTML lines, which fields differ from the older car other."""
+		notes = []
+		if other.title != self.title:
+			notes.append("title changed<br>\n")
+		if other.price != self.price:
+			notes.append("price changed from " + other.price + "<br>\n")
+		if other.img != self.img:
+			notes.append("image changed<br>\n")
+		if other.data != self.data:
+			notes.append("data changed from: " + other.data + "<br>\n")
+		return "".join(notes)
diff --git a/hahu_processor.py b/hahu_processor.py
@@ -0,0 +1,65 @@
+import requests
+from lxml import html
+from lxml import etree
+from lxml.etree import tostring
+
+from cache import *
+from datamodels import *
+
+# Browser-like User-Agent header for HTTP requests.
+header = {
+	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',
+}
+
+def page(num, base):
+	"""Return the URL of result page num, i.e. base followed by "/page<num>"."""
+	prefix = base + "/page"
+	return prefix + str(num)
+
+def fetch_results_from_query(query):
+	"""Fetch every result page of query and return the listings as car objects.
+
+	query must expose url (first result page) and name (used in the
+	progress output). The page count is read from the rel="last" link of
+	the first page; when there is none, a single page is assumed. Each
+	listing's image is stored through loadToCache.
+	"""
+	initReq = requests.get(query.url, headers=header)
+	initTree = html.fromstring(initReq.content)
+
+	try:
+		num_of_pages = int(initTree.xpath('//link[@rel="last"]/@href')[0].split("page")[1])
+	except IndexError:
+		# No rel="last" link, or its href contains no "page" part.
+		num_of_pages = 1
+
+	results = []
+
+	for pagenum in range(1,num_of_pages+1):
+		print("\rProcessing page " + str(pagenum) + " out of " + str(num_of_pages) + " for query " + query.name, end='')
+		# Send the same headers as the initial request; previously the
+		# per-page requests went out without the browser User-Agent.
+		request = requests.get(page(pagenum, query.url), headers=header)
+		tree = html.fromstring(request.content)
+		listings = tree.xpath('.//div[contains(@class, "row talalati-sor")]')
+
+		for listing in listings:
+			kepsor = listing.find('.//div[@class="talalatisor-kep"]')
+			adatsor = listing.find('.//div[@class="talalatisor-adatok"]')
+			info = adatsor.find('.//div[@class="talalatisor-info adatok"]')
+
+			link = kepsor.find('.//a')
+			title = link.get("title")
+			url = link.get("href")
+
+			try:
+				img = kepsor.find('.//img[@class="img-responsive lazy"]').get('data-lazyurl')
+			except AttributeError:
+				# No lazy-loaded image element in this listing.
+				img = "NotFound"
+
+			img = loadToCache(img)
+			price = adatsor.find('.//div[@class="vetelar"]').text
+			listing_id = listing.find('.//*[@data-hirkod]').get('data-hirkod')
+			databoxes = info.findall('.//span')
+			maybeData = [databox.text for databox in databoxes]
+
+			if None in maybeData:
+				# A span without text: the km value may be in a tooltip (abbr).
+				km = info.find('.//abbr[@title="Kilométeróra állása"]')
+				if km is not None and km.text is not None:
+					fallback = km.text
+				else:
+					# Also covers an abbr without text, which would otherwise
+					# put None into the join below.
+					fallback = "? km"
+				finalData = [x if x is not None else fallback for x in maybeData]
+			else:
+				finalData = maybeData
+
+			thiscar = car(listing_id, title, url, price, img, " ".join(finalData))
+			results.append(thiscar)
+
+	return results