from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import JavascriptException, TimeoutException

import os
import re
import itertools
import argparse
import time


def stealthify_browser(browser):
    """
    Run stealth scripts to hide that we're headless

    From https://github.com/MeiK2333/pyppeteer_stealth

    :param browser:  Selenium browser to stealthify
    """
    browser.execute_script(open("stealthify.js").read())


def get_recommendations(url, browser, max_carousel_items=50):
    """
    Get Amazon recommendations for a given URL

    See carousels.js for the actual scraping code. Note that the given seed
    URLs are expected to be product page URLs; behaviour when they point to
    another type of page is undefined.

    :param str url:  Amazon item page URL to scrape
    :param browser:  Selenium WebDriver instance to scrape with
    :param int max_carousel_items:  Max items to scrape per recommendation carousel
    :return dict:  A dictionary with one item per recommendation list found,
        each item being a dictionary with two keys: "sponsored" (a boolean)
        and "items" (a list of the items in that list)
    """
    stealthify_browser(browser)

    try:
        browser.get(url)
        carousel_scrape = open("carousels.js").read()
        browser.execute_script("window.max_carousel_items = %i;" % max_carousel_items)
        recommendations = browser.execute_script("return %s" % carousel_scrape)
        return recommendations
    except JavascriptException as e:
        print("Javascript error: %s" % e)
        return {}
    except TimeoutException:
        print("Timeout while scraping, returning empty result set")
        return {}
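
# For reference, get_recommendations() returns a structure along these lines
# (hypothetical values; the exact list titles depend on whatever carousels
# Amazon happens to show on the page, and the item fields follow what the
# GDF writer below consumes):
#
# {
#     "Customers who viewed this item also viewed": {
#         "sponsored": False,
#         "items": [
#             {"asin": "B000000000", "label": "Some Product", "author": "...",
#              "link": "/dp/B000000000", "price": "$9.99", "thumbnail": "..."},
#         ],
#     },
# }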


def gdf_escape(string):
    """
    Escape a string for use in a GDF file

    :param str string:  String to escape
    :return str:  Escaped string, wrapped in quotes
    """
    if not string:
        return '""'

    # note: '\"' is just '"' in Python, so the replacement needs an explicit
    # double backslash to put a literal backslash before each inner quote
    return '"' + string.replace('"', '\\"').strip() + '"'


def generate_recommendation_network(seeds, depth=0, prefix="", max_carousel_items=50):
    """
    Generate GDF files for the recommendations for the seed URLs

    Scrapes recommendations for a given list of Amazon product page URLs and
    stores the scraped values as Gephi-compatible GDF files, one per type of
    recommendation (e.g. 'customers also viewed', 'customers also bought',
    and so on).

    If a depth greater than 0 is used, items are scraped breadth-first, i.e.
    first depth 0 will be scraped, then depth 1, and so on. This balloons
    extremely quickly, so be very careful at depths greater than 0.

    :todo: determine how (and if) to weigh nodes and edges

    :param list seeds:  A list of Amazon product page URLs to scrape
    :param int depth:  Depth for the scrape, defaults to 0. If the depth is 1,
        all recommendations found in the first iteration will also be scraped,
        and so on. Careful!
    :param str prefix:  File name prefix for the output GDF files
    :param int max_carousel_items:  Max items to scrape per recommendation carousel
    """
    items = {}
    links = {}

    # set up selenium-driven browser
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:84.0) Gecko/20100101 Firefox/84.0"
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", user_agent)
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Firefox(firefox_profile=profile, options=options)

    # these are kind of arbitrary, but seem to work
    browser.set_page_load_timeout(15)
    browser.set_script_timeout(120)
    browser.implicitly_wait(5)

    seed_asins = set()
    current_depth = 0
    initial_asins = None
    while current_depth <= depth:
        # we use ASINs as unique identifiers - they can be found at a
        # predictable place in the product page URL, after "/dp/" or "/gp/"
        # (note the non-capturing group: with a capturing group, re.split()
        # would return the matched "d" or "g" at index 1 instead of the ASIN)
        seed_asins |= set(re.split("/(?:d|g)p/", seed)[1].split("/")[0] for seed in seeds)
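        # e.g. a seed like "https://www.amazon.com/dp/B000000000/ref=..."
        # (hypothetical ASIN) yields "B000000000"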
        if not initial_asins:
            initial_asins = seed_asins

        # this will store ASINs for a next round of scraping, if we end up
        # doing one
        new_seeds = []

        progress = 1
        for seed in seeds:
            # give recommendation URLs (which are relative) the same host as
            # the seed URL
            amazon_host = "/".join(seed.split("/")[:3])
            print("Scraping %s... (depth %i, %i/%i)" % (seed, current_depth, progress, len(seeds)))
            progress += 1

            # get the actual recommendations - this will typically take a while
            recommendations = get_recommendations(seed, browser, max_carousel_items)
            if not recommendations:
                print("- no results, link may be invalid")
                continue

            seed_asin = re.split("/(?:d|g)p/", seed)[1].split("/")[0]

            # process recommendations
            for list_title, list_items in recommendations.items():
                if list_title not in links:
                    links[list_title] = set()

                for item in list_items["items"]:
                    metadata = item.copy()
                    if metadata["link"][0:4] != "http":
                        metadata["link"] = amazon_host + metadata["link"]

                    asin = item["asin"]
                    if asin not in items:
                        items[asin] = metadata

                    # if the item has not been scraped earlier, it is a
                    # candidate for scraping in a next iteration. Note that
                    # this does not account for different Amazon domains - but
                    # mixing them would be inadvisable anyway
                    if asin not in seed_asins:
                        new_seeds.append(metadata["link"])

                    # store the pair as a simple a-b string. using a hashable
                    # type here (i.e. a string) allows using a set to store
                    # them, which automatically eliminates duplicates
                    if seed_asin != asin:
                        pair = "-".join([seed_asin, asin])
                        links[list_title].add(pair)

        # the new seeds become the seeds for the next iteration (if there is
        # one)
        seeds = new_seeds
        current_depth += 1

    # quit() rather than close(), so the driver process is shut down as well
    browser.quit()

    # write GDF files, one per recommendation list type
    print("Generating networks for the following recommendation lists:")
    for list_title, list_pairs in links.items():
        if not list_pairs:
            print("- %s (empty, skipping)" % list_title)
            continue

        print("- %s (%i recommendations)" % (list_title, len(list_pairs)))

        # only include items that actually appear in this list
        asins = set(itertools.chain(*[pair.split("-") for pair in list_pairs]))

        # remove characters invalid in windows filenames (as a character
        # class - without the brackets, only the literal sequence would match)
        filename = re.sub(r'[<>:"/\\|?*]', '', list_title.replace(" ", "-")) + ".gdf"

        # prefix filename if requested
        if prefix:
            filename = prefix + "-" + filename
        filename = filename.replace("#", str(int(time.time())))

        # if the output directory doesn't exist, create it
        output_dir = os.path.dirname(filename)
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        with open(filename, "w", encoding="utf-8") as output:
            # node definitions: one node per unique product (ASIN)
            output.write(
                "nodedef>name VARCHAR,title VARCHAR,author VARCHAR,url VARCHAR,price VARCHAR,thumbnail VARCHAR,is_seed BOOLEAN\n")
            for asin, item in items.items():
                if asin not in asins:
                    continue
                is_seed = "true" if asin in initial_asins else "false"
                output.write("%s,%s,%s,%s,%s,%s,%s\n" % (
                    gdf_escape(asin), gdf_escape(item["label"]), gdf_escape(item["author"]),
                    gdf_escape(item["link"]), gdf_escape(item["price"]),
                    gdf_escape(item["thumbnail"]), is_seed))

            # edge definitions: one directed edge per recommendation pair
            output.write("edgedef>from VARCHAR,to VARCHAR,directed BOOLEAN\n")
            for pair in list_pairs:
                pair = pair.split("-")
                output.write("%s,%s,true\n" % tuple([gdf_escape(bit) for bit in pair]))
if __name__ == "__main__":
cli = argparse.ArgumentParser()
cli.add_argument("-i", "--input", help="File with product page URLs to scrape, one per line", required=True)
cli.add_argument("-d", "--depth", default=0, help="Crawl depth, default 0")
cli.add_argument("-c", "--carousel-items", default=50, help="Max amount of products to scrape per carousel")
cli.add_argument("-p", "--prefix", default="results/#", help="File name prefix for the output GDF files (# is replaced with current time)")
args = cli.parse_args()
seeds = open(args.input).readlines()
carousel_items = int(args.carousel_items)
generate_recommendation_network(seeds, int(args.depth), args.prefix, carousel_items)
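
# Example invocation (hypothetical seed file name):
#
#   python scrape.py -i seeds.txt -d 0 -c 25 -p results/#
#
# This writes one GDF file per recommendation carousel type into results/,
# with "#" in the prefix replaced by the current Unix timestamp.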