forked from backslash112/book_scraper_python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbook_price_scraper.py
70 lines (57 loc) · 1.86 KB
/
book_price_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup
import re
from urllib.request import urlopen
import urllib.request
import csv
import time
import requests
import queue
import threading
from multiprocessing.pool import ThreadPool
def get_price_amazon(isbn, q):
    """Search amazon.com for ``isbn`` and publish the first price found.

    The search results page is downloaded with a browser-like User-Agent and
    scanned for the first text node that looks like a dollar amount.

    Args:
        isbn: Value appended (via ``str``) to the search URL as the keyword.
        q: Queue-like object. Exactly one ``[isbn, price]`` item is put on
            it, where ``price`` is the matching text node, or ``None`` when
            nothing matched or the lookup failed.

    Raises:
        Any error from the download or the parse is re-raised, but only
        after ``[isbn, None]`` has been queued.
    """
    base_url = "https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
    url = base_url + str(isbn)
    # Raw string: "\$" in a plain literal is an invalid escape sequence.
    # (For amazon.cn the currency sign would be ¥ instead of $.)
    price_regexp = re.compile(r"\$[0-9]+(\.[0-9]{2})?")

    price = None
    try:
        # Amazon doesn't allow automated access to its data, so the
        # User-Agent has to look like a browser.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        # The timeout keeps a stalled connection from blocking a pool worker
        # forever; the with-block closes the response once it has been read.
        with opener.open(url, timeout=30) as response:
            html_content = response.read()
        soup = BeautifulSoup(html_content, 'lxml')
        price = soup.find(text=price_regexp)
    finally:
        # Always publish a result: the consumer blocks on this queue, so a
        # failed lookup must not leave it waiting.
        q.put([isbn, price])
def get_all_isbn():
    """Return the first field of every non-empty row in ``isbn.csv``.

    The file is read from the current working directory as a
    space-delimited CSV that uses ``|`` as its quote character.

    Returns:
        list[str]: The first-column values, in file order.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with open('isbn.csv', 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        # Blank lines come back as empty rows; indexing them would raise
        # IndexError, so they are skipped.
        return [row[0] for row in reader if row]
def run():
    """Look up the price of every ISBN concurrently and save the results.

    One ``get_price_amazon`` task per ISBN is submitted to a pool of 10
    threads. Results are collected in submission order, printed, and then
    handed to ``save_to_csv``. An ISBN whose task finished without
    publishing a result is recorded as ``[isbn, None]``.
    """
    pending = []
    book_price_list = []
    # The context manager shuts the pool down when the block is left, so the
    # worker threads are not leaked.
    with ThreadPool(processes=10) as pool:
        for isbn in get_all_isbn():
            # Each task reports back through its own single-item queue.
            result_queue = queue.Queue()
            task = pool.apply_async(get_price_amazon, args=(isbn, result_queue))
            pending.append((isbn, result_queue, task))

        for isbn, result_queue, task in pending:
            # Wait for the task itself rather than blocking on the queue: a
            # task that raised before publishing would otherwise hang here.
            task.wait()
            try:
                row = result_queue.get_nowait()
            except queue.Empty:
                row = [isbn, None]
            print(row)
            book_price_list.append(row)

    print(len(book_price_list))
    save_to_csv(book_price_list)
def save_to_csv(list):
    """Write the collected rows to ``prices.csv`` under an isbn/price header.

    Args:
        list: Iterable of rows (e.g. ``[isbn, price]`` pairs) to write after
            the header line. The name shadows the builtin but is kept for
            backward compatibility.
    """
    print('save')
    header = ['isbn', 'price']
    # newline='' leaves line termination to the csv writer.
    with open('prices.csv', 'w', newline='') as out_file:
        writer = csv.writer(out_file, delimiter=',')
        writer.writerow(header)
        for record in list:
            writer.writerow(record)
# Script entry point: start the scrape as soon as this file is executed.
# NOTE(review): this call also fires when the module is imported; consider
# wrapping it in an `if __name__ == "__main__":` guard.
run()