forked from syniuhin/kpi-databases-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_estore.py
81 lines (55 loc) · 1.94 KB
/
parse_estore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import urllib2
from lxml import etree
from gen_xml import process_text
# Root URL of the scraped store. Page paths and image paths found in the
# markup are appended to it by plain string concatenation.
BASE_URL = 'http://www.odissey.kiev.ua/'
class Product:
    """Plain record describing one scraped store item.

    Attributes are stored exactly as passed to the constructor:
    ``name``, ``price``, ``description`` and ``image``.

    Both ``__str__`` and ``__repr__`` render the product as its name
    encoded to UTF-8, so ``name`` must provide ``.encode('utf-8')``.
    """

    def __init__(self, name, price, description, image):
        self.name, self.price = name, price
        self.description, self.image = description, image

    def _encoded_name(self):
        # Single place that produces the UTF-8 form of the name.
        return self.name.encode('utf-8')

    def __str__(self):
        return self._encoded_name()

    def __repr__(self):
        return self._encoded_name()
def parse_html(url):
    """Fetch one product page and scrape it into a ``Product``.

    Args:
        url: Path of the product page, appended to ``BASE_URL``.

    Returns:
        A ``Product`` holding the name, price and description text found
        on the page, plus the image ``src`` prefixed with ``BASE_URL``.
        The description is an empty string when the page has no matching
        description text nodes.
    """
    response = urllib2.urlopen(BASE_URL + url)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    # The raw bytes are decoded as cp1251 and handed to lxml as UTF-8 bytes.
    # NOTE(review): if the page declares its own charset in a <meta> tag,
    # the parser may honour that instead - confirm against the live markup.
    tree = etree.HTML(page.decode("cp1251").encode('utf-8'))

    # Set of hardcode hacks: these XPath expressions are tied to the
    # store's current markup.
    name = tree.xpath("string(//div[@itemprop='name']/h1/text())")
    price = tree.xpath("string(//div[@id='optionPrice']/text())")
    image = tree.xpath("string(//img[@class='thumbnail']/@src)")
    desc = tree.xpath(
        "//div[@style='overflow-x: auto']/span[@itemprop='description']//text()")

    # NOTE(review): the return value of process_text is discarded, so it
    # presumably works on the list in place - confirm against gen_xml.
    process_text(desc)

    # join() instead of reduce(): reduce() without an initial value raises
    # TypeError when no description text node was found.
    desc = ''.join(desc)
    return Product(name, price, desc, BASE_URL + image)
def generate_xml(filename, limit=20):
    """Scrape product pages linked from the store front page into an XML file.

    The front page at ``BASE_URL`` is fetched, every distinct link whose
    ``href`` starts with ``'product'`` is collected, and up to ``limit`` of
    them are scraped with ``parse_html``.  The result is written as a
    ``<data>`` document containing one ``<product>`` element per item with
    ``<name>``, ``<price>``, ``<description>`` and ``<image>`` children.

    Args:
        filename: Destination passed to ``ElementTree.write``.
        limit: Maximum number of product pages to scrape.  Defaults to 20;
            ``None`` scrapes every product link found.
    """
    response = urllib2.urlopen(BASE_URL)
    try:
        page = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    tree = etree.HTML(page)

    # Remove duplicates while keeping document order, so the same front
    # page always yields the same selection of products (slicing a set
    # picks an arbitrary subset).
    seen = set()
    urls = []
    for href in tree.xpath('//a/@href'):
        if href.startswith('product') and href not in seen:
            seen.add(href)
            urls.append(href)
    urls = urls[:limit]

    # Scrape everything first, then serialise.
    products = [parse_html(url) for url in urls]

    root = etree.Element("data")
    for product in products:
        product_el = etree.SubElement(root, "product")
        # Child order matches the output format: name, price,
        # description, image.
        for tag in ("name", "price", "description", "image"):
            etree.SubElement(product_el, tag).text = getattr(product, tag)

    et = etree.ElementTree(root)
    et.write(filename, encoding='utf-8', xml_declaration=True, pretty_print=True)
# Script entry point: scrape the store and write the result to products.xml.
# NOTE(review): there is no ``if __name__ == '__main__'`` guard, so this call
# (network access and file write included) also runs whenever the module is
# imported.
generate_xml('products.xml')