Skip to content

Commit

Permalink
Refactored code, fixed km ambiguity
Browse files Browse the repository at this point in the history
  • Loading branch information
Aresius423 committed Nov 30, 2018
1 parent cbaa4c0 commit affea91
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 265 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
cache/
data_*/
menu.html

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
25 changes: 25 additions & 0 deletions cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import urllib
import os
import hashlib

# Absolute location of the image cache. The generated HTML refers to the same
# directory through the relative path "../cache/", so changing only this
# value would break the image links.
cachedir = os.getcwd() + "/cache/"

# exist_ok=True creates the directory when needed without a separate
# existence check, so nothing can go wrong between the check and the creation.
os.makedirs(cachedir, exist_ok=True)

def loadToCache(imgurl):
    """Download an image into the local cache and return its relative path.

    imgurl is either the URL of an image or the sentinel string "NotFound".

    Returns "../cache/<md5 of the URL>.<extension>" when the image is (or
    already was) stored in the cache directory, and
    "../resources/notfound.png" for the sentinel or when the download fails.
    """
    # Imported here because a bare "import urllib" does not load the
    # urllib.request submodule.
    import urllib.request

    placeholder = "../resources/notfound.png"
    if imgurl == "NotFound":
        return placeholder

    # The cache file is named after the MD5 of the URL and keeps whatever
    # follows the last dot of the URL as its extension.
    extension = imgurl.split(".")[-1]
    digest = hashlib.md5(imgurl.encode('utf-8')).hexdigest()
    filename = digest + "." + extension
    cacheFile = cachedir + filename

    if not os.path.isfile(cacheFile):
        try:
            urllib.request.urlretrieve(imgurl, cacheFile)
        except (OSError, ValueError):
            # URLError/HTTPError and file-system errors are OSError
            # subclasses; ValueError is raised for URLs urllib cannot parse.
            try:
                # Drop a partially written file so that it is not mistaken
                # for a cached image on the next run.
                os.remove(cacheFile)
            except OSError:
                # Nothing was written, so there is nothing to clean up.
                pass
            return placeholder

    return "../cache/" + filename
59 changes: 59 additions & 0 deletions dao.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sqlite3
from datamodels import *

import os

def setupNewDB(dirpath):
    """Create an empty "newdata.db" inside dirpath and return its connection.

    The directory is created when it does not exist, and a leftover
    "newdata.db" from an earlier run is deleted first. The new database gets
    a single table, cars(id, title, url, price, img, cdata).

    If the table cannot be created, an error is printed, the connection is
    closed and the interpreter exits.
    """
    directory_missing = not os.path.exists(dirpath)
    if directory_missing:
        os.makedirs(dirpath)

    dbfile = dirpath + "/newdata.db"
    if os.path.isfile(dbfile):
        os.remove(dbfile)

    connection = sqlite3.connect(dbfile)
    schema = "CREATE TABLE cars(id TEXT, title TEXT, url TEXT, price TEXT, img TEXT, cdata TEXT)"

    try:
        connection.execute(schema)
    except sqlite3.OperationalError:
        print("Error setting up the database")
        connection.close()
        quit()

    return connection

def insertResults(db, results):
    """Insert every result into the cars table of db and commit.

    db is an open sqlite3 connection; each result must provide the
    attributes id, title, url, price, img and data.
    """
    rows = (
        (item.id, item.title, item.url, item.price, item.img, item.data)
        for item in results
    )
    db.executemany("INSERT INTO cars VALUES (?,?,?,?,?,?)", rows)
    db.commit()

def findChanges(dirpath, results):
    """Compare freshly scraped results with the archived "data.db" in dirpath.

    results is the list of current car objects.

    Returns a list of change objects:
      * "new"     - the car's id is not in the archived database (every car
                    is "new" when no archive exists yet),
      * "changed" - the id exists but the stored car differs; the reason is
                    the text produced by car.diffFromOld,
      * "deleted" - the car is archived but its id is not among results.
    Cars that are identical to their archived version produce no entry.
    """
    dbpath = dirpath + "/data.db"
    if not os.path.isfile(dbpath):
        return [change(item, "new", "") for item in results]

    changes = []
    # A set keeps the "deleted" membership test below constant-time.
    newIDs = {newresult.id for newresult in results}

    olddb = sqlite3.connect(dbpath)
    try:
        for currentCar in results:
            oldres = olddb.execute("SELECT * from cars WHERE id=?", [currentCar.id]).fetchone()
            if oldres is None:
                changes.append(change(currentCar, "new", ""))
                continue
            oldcar = car(*oldres)
            if oldcar != currentCar:
                changes.append(change(currentCar, "changed", currentCar.diffFromOld(oldcar)))

        for row in olddb.execute("SELECT * from cars").fetchall():
            oldCar = car(*row)
            if oldCar.id not in newIDs:
                changes.append(change(oldCar, "deleted", "deleted"))
    finally:
        # Close the connection even when a query or a row conversion fails.
        olddb.close()

    return changes

def archiveDatabase(dirpath):
    """Promote "newdata.db" in dirpath to "data.db", replacing the old archive.

    Raises FileNotFoundError when "newdata.db" does not exist; in that case
    an existing "data.db" is left untouched.
    """
    # os.replace overwrites the destination in one step, so the previous
    # archive is never deleted before the new one is known to be in place.
    os.replace(dirpath + "/newdata.db", dirpath + "/data.db")
53 changes: 53 additions & 0 deletions datamodels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
class query:
    """A named search: a display name together with the URL to fetch."""

    def __init__(self, name, url):
        self.name, self.url = name, url

class change:
    """A single difference between two runs, tied to the affected car.

    summary is the short category inserted for %LISTING_REASON%; reason is
    the detailed text inserted for %DETAILED_REASON%.
    """

    def __init__(self, car, summary, reason):
        self.car, self.summary, self.reason = car, summary, reason

    def __str__(self):
        # The reason on the first line, the car's own text below it.
        return "\n".join((self.reason, str(self.car)))

    def toListItem(self, template):
        """Return template with all listing placeholders filled in.

        The placeholders are substituted one after another in the order
        listed below.
        """
        listing = self.car
        substitutions = (
            ("%LISTING_REASON%", self.summary),
            ("%LISTING_ID%", listing.id),
            ("%LISTING_PRICE%", listing.price),
            ("%LISTING_LINK%", listing.url),
            ("%LISTING_TITLE%", listing.title),
            ("%LISTING_IMAGE%", listing.img),
            ("%DETAILED_REASON%", self.reason),
            ("%LISTING_DATA%", listing.data),
        )

        filled = template
        for placeholder, value in substitutions:
            filled = filled.replace(placeholder, value)
        return filled

class car:
    """One listing: id, title, url, price, image path and free-text data.

    The constructor takes the fields in the same order as the columns of the
    cars table, so a database row can be passed as car(*row).
    """

    def __init__(self, id, title, url, price, img, data):
        self.id = id
        self.title = title
        self.url = url
        # Non-breaking spaces are normalised to plain spaces so that equal
        # prices compare equal.
        self.price = price.replace('\xa0', ' ')
        self.img = img
        self.data = data

    def __str__(self):
        return self.id + "\n" + self.title + "\n" + self.price + "\n___________"

    # NOTE: defining __eq__ without __hash__ leaves instances unhashable.
    def __eq__(self, other):
        """Two cars are equal when all six fields match."""
        if not isinstance(other, car):
            return False
        return (
            (self.id, self.title, self.url, self.price, self.img, self.data)
            == (other.id, other.title, other.url, other.price, other.img, other.data)
        )

    def diffFromOld(self, other):
        """Describe, as HTML lines, how this car differs from the older one.

        Covers every field compared by __eq__ except id, so two cars with
        the same id that are not equal never produce an empty description.
        Returns an empty string when nothing differs.
        """
        differences = []
        if self.title != other.title:
            differences.append("title changed<br>\n")
        if self.url != other.url:
            differences.append("url changed<br>\n")
        if self.price != other.price:
            differences.append("price changed from " + other.price + "<br>\n")
        if self.img != other.img:
            differences.append("image changed<br>\n")
        if self.data != other.data:
            differences.append("data changed from: " + other.data + "<br>\n")
        return "".join(differences)
65 changes: 65 additions & 0 deletions hahu_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import requests
from lxml import html
from lxml import etree
from lxml.etree import tostring

from cache import *
from datamodels import *

# HTTP request headers used by fetch_results_from_query; the user-agent
# string is that of a desktop Firefox browser.
header = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',
}

def page(num, base):
    """Return the URL of result page number num below the base URL."""
    return "".join((base, "/page", str(num)))

def fetch_results_from_query(query):
    """Scrape all result pages of a search and return the listings found.

    query must provide url (the address that is fetched first and that the
    page URLs are derived from) and name (used only in the progress output).

    Returns a list of car objects in the order the listings appear. Images
    are stored through loadToCache, so car.img holds a relative cache path.
    Network and parsing errors are not handled here and reach the caller.
    """
    initReq = requests.get(query.url, headers=header)
    initTree = html.fromstring(initReq.content)

    # The page count is the number after "page" in the <link rel="last">
    # href; when that link or the "page" part is absent, use a single page.
    try:
        num_of_pages = int(initTree.xpath('//link[@rel="last"]/@href')[0].split("page")[1])
    except IndexError:
        num_of_pages = 1

    results = []

    for pagenum in range(1, num_of_pages + 1):
        print("\rProcessing page " + str(pagenum) + " out of " + str(num_of_pages) + " for query " + query.name, end='')
        # Send the same headers as the initial request; headers passed to
        # requests.get apply to that single call only.
        response = requests.get(page(pagenum, query.url), headers=header)
        tree = html.fromstring(response.content)

        for listing in tree.xpath('.//div[contains(@class, "row talalati-sor")]'):
            kepsor = listing.find('.//div[@class="talalatisor-kep"]')
            adatsor = listing.find('.//div[@class="talalatisor-adatok"]')
            info = adatsor.find('.//div[@class="talalatisor-info adatok"]')

            link = kepsor.find('.//a')
            title = link.get("title")
            url = link.get("href")

            # Fall back to the "NotFound" sentinel when the image element or
            # its data-lazyurl attribute is missing or empty.
            img_element = kepsor.find('.//img[@class="img-responsive lazy"]')
            img_url = img_element.get('data-lazyurl') if img_element is not None else None
            img = loadToCache(img_url or "NotFound")

            price = adatsor.find('.//div[@class="vetelar"]').text
            listing_id = listing.find('.//*[@data-hirkod]').get('data-hirkod')
            maybeData = [databox.text for databox in info.findall('.//span')]

            if None in maybeData:
                # A span without direct text: presumably the mileage, which
                # may sit in an <abbr> tooltip ("Kilométeróra állása" is
                # Hungarian for "odometer reading"). Use that text when it
                # exists, otherwise a "? km" marker.
                km = info.find('.//abbr[@title="Kilométeróra állása"]')
                filler = km.text if km is not None and km.text is not None else "? km"
                finalData = [x if x is not None else filler for x in maybeData]
            else:
                finalData = maybeData

            results.append(car(listing_id, title, url, price, img, " ".join(finalData)))

    return results
Loading

0 comments on commit affea91

Please sign in to comment.