# mercury.py: a fast and efficient eBay product web scraper
# Import CLI Utility Libraries
import argparse
from colorama import init
# Import Web Scraping Libraries
import lxml  # Imported explicitly so a missing parser fails fast at startup
import requests
from bs4 import BeautifulSoup
# Import Utility Libraries
import re
from sys import exit
from os.path import exists
from multiprocessing import Pool, cpu_count
from dataclasses import dataclass

# Number of worker processes; defaults to the machine's core count, override manually if desired
NUM_OF_CORES = cpu_count()

# Define ANSI Escape Codes
BOLD_RED = "\x1B[1;31m"  # Changes the text style to bold and the color to red
WHITE = "\x1B[37m"       # Changes the text color to white
RESET = "\x1B[0m"        # Resets all text styles and colors
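# Example of how these escape codes are used below (output text is illustrative):
#   print(f"{BOLD_RED}Error: {WHITE}details...{RESET}")
# renders "Error: " in bold red and "details..." in white, then resets the terminal style.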
# Defines the "Product" dataclass
@dataclass
class Product:
    URL: str    # Product URL
    NUM: str    # Product number (sliced from the URL, so it is kept as a string)
    NAME: str   # Product name
    COND: str   # Product condition
    PRICE: str  # Product price

    # Formatted representation of Product
    def __str__(self):
        """ Returns a formatted string containing the product's components """
        return f"Product #{self.NUM}\n Name: '{self.NAME}',\n Condition: {self.COND},\n Price: {self.PRICE}\n URL: {self.URL}"
# Downloads a given eBay product URL
def download(URL: str) -> tuple[str, str]:
    """ Downloads the raw HTML of a given website and returns it as a tuple along with the URL """
    # GET the product URL (the timeout keeps a stalled request from hanging the scraper)
    response = requests.get(URL, headers = {'Accept-Encoding': 'identity'}, timeout = 30)
    # Verify that the request was successful (a Response is falsy for 4xx/5xx status codes)
    if not response:
        print(f"{BOLD_RED}Error: {WHITE}GET request for '{URL}' returned a status code of {response.status_code}...{RESET}")
        exit()
    # Return a tuple containing the URL and HTML
    return URL, response.text
# Parses the HTML string of an eBay product for wanted components
def parse(data: tuple[str, str]) -> Product:
    """ Parses a raw HTML string and returns a 'Product' dataclass based on the components """
    # Create a Beautiful Soup instance
    soup = BeautifulSoup(data[1], "lxml")
    # Verify that the creation of the Beautiful Soup instance was successful
    if not soup:
        print(f"{BOLD_RED}Error: {WHITE}Unable to create Beautiful Soup instance from '{data[0]}'...{RESET}")
        exit()
    # Parse the product components
    name = soup.find("h1").get_text()[1:]  # Remove the extra space in front of the item name
    number = data[0][25:37]  # Slice the 12-digit product number from the URL (assumes the 'https://www.ebay.com/itm/' prefix)
    condition = soup.find("div", {"class": "d-item-condition-text"}).find("span", {"class": "clipped"}).get_text()  # Ignore duplicate condition text
    price = soup.find("span", {"itemprop": "price"}).get_text()  # Find the specific span using its constant attribute
    # Return the constructed product
    return Product(data[0], number, name, condition, price)
# Cleans the data before it is written into a CSV file
def clean(data: str) -> str:
    """ Uses regex to remove characters that would break the CSV file format """
    # Clean the string
    cleaned = re.sub(",", " ", data)       # Replace commas in the data string with spaces
    cleaned = re.sub("\"", "\'", cleaned)  # Replace double quotes with apostrophes
    # Return the cleaned string
    return cleaned
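# Example (illustrative input): clean('Widget, "New"') returns "Widget  'New'",
# which is safe to embed as a single unquoted CSV field.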
# Write the Product data into a file
def write(filename: str, product: Product) -> bool:
    """ Attempts to write a product into a file, returns True if successful """
    # Write Product data to file
    try:
        new_file = not exists(filename)  # Check whether the CSV header still needs to be written
        with open(filename, "a") as output:  # Append mode creates the file if it doesn't exist
            if new_file:  # If the file is new, write the header before the first row
                output.write("ITEM_NUMBER,NAME,CONDITION,PRICE,URL\n")
            output.write(f"{clean(product.NUM)},{clean(product.NAME)},{clean(product.COND)},{clean(product.PRICE)},{clean(product.URL)}\n")  # Write the product data as one CSV row
    except OSError as error:
        print(f"{BOLD_RED}{type(error).__name__} Error: {WHITE}Unable to create/write to file '{filename}'...{RESET}")
        exit()
    return True
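# Example resulting CSV (all values illustrative):
#   ITEM_NUMBER,NAME,CONDITION,PRICE,URL
#   123456789012,Example Item,Used,US $19.99,https://www.ebay.com/itm/123456789012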
# Main method of application
def main() -> None:
    """ Reads command line arguments, parses the data, then calls the appropriate functions """
    # Initialize the argument parser
    parser = argparse.ArgumentParser(description = "A fast and efficient eBay product webscraper")
    parser.add_argument("--url", action = "store", help = "URL of an eBay product ('https://www.ebay.com/itm/...')")
    parser.add_argument("--file", action = "store", help = "Path to a file containing ONLY eBay product URLs")
    parser.add_argument("--output", action = "store", help = "CSV filename where parsed data should be saved")
    args = parser.parse_args()
    # Verify that command line arguments were passed
    if (args.url is None) and (args.file is None):
        print(f"{BOLD_RED}Error: {WHITE}Neither a URL nor a filename was provided...{RESET}")
        exit()
    # Verify that only one command line argument was passed
    if (args.url is not None) and (args.file is not None):
        print(f"{BOLD_RED}Error: {WHITE}Both a URL and a filename were provided...{RESET}")
        exit()
    # Based on argument type (URL or file), parse the product(s)
    if args.url is not None:
        # Verify that the URL is an eBay product
        if not args.url.startswith("https://www.ebay.com/itm/"):
            print(f"{BOLD_RED}Error: {WHITE}URL '{args.url}' is not a valid eBay product...{RESET}")
            exit()
        # Download the product HTML
        data = download(args.url)
        # Parse the HTML for components
        product = parse(data)
        # Write output to file if prompted
        if (args.output is not None) and (args.output.endswith(".csv")):
            write(args.output, product)
        # Print parsed data
        print(product)
    else:
        # Verify that the file containing the product URLs exists
        if not exists(args.file):
            print(f"{BOLD_RED}Error: {WHITE}File '{args.file}' does not exist...{RESET}")
            exit()
        # Define URL, tuple, and product lists
        alpha = list()  # List to contain product URLs
        gamma = list()  # List to contain tuples (product URL and product HTML)
        omega = list()  # List to contain Products
        # Save the product URLs from the data file to a list
        try:
            with open(args.file, "r") as file:
                alpha = [line.strip() for line in file if line.strip()]  # Strip newlines and skip blank lines
        except OSError as error:
            print(f"{BOLD_RED}{type(error).__name__} Error: {WHITE}Unable to parse data from file '{args.file}'...{RESET}")
            exit()
        # Determine the number of processes to run
        processes = min(NUM_OF_CORES, len(alpha))
        # Parse all the product URLs
        with Pool(processes = processes) as pool:
            gamma += pool.map(download, alpha)  # Download every product HTML
            omega += pool.map(parse, gamma)     # Parse every HTML string
        # Write output to file if prompted
        if (args.output is not None) and (args.output.endswith(".csv")):
            for product in omega:
                write(args.output, product)
        # Print parsed data
        for product in omega:
            print(product)

if __name__ == "__main__":
    init()  # Initialize Colorama
    main()  # Call the main method
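# Example invocations (URLs and filenames are illustrative):
#   python mercury.py --url "https://www.ebay.com/itm/123456789012"
#   python mercury.py --file urls.txt --output products.csv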