-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathis-it-on-hfqpdb.py
executable file
·177 lines (142 loc) · 7.07 KB
/
is-it-on-hfqpdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/python3
# Copyright 2023 - 2024 Benjamin Steenkamer
import http.client
import os
import re
import shutil
import urllib.error
import urllib.request
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import cv2
import numpy as np
from tqdm import tqdm
# Harbor Freight page that is scanned for the regular coupon images
HF = "https://www.harborfreight.com/coupons"
# Matches coupon image URLs under .../weblanding/coupon-deals/images/, lazily up to the first "png"
HF_RE = re.compile(r"https://images\.harborfreight\.com\/hftweb\/weblanding\/coupon-deals\/images\/(.+?)png")
HF_PROMO = "https://www.harborfreight.com/promotions" # % off coupons
# Matches promotion image URLs under .../hftweb/promotions, lazily up to the first "png" or "jpg"
HF_PROMO_RE = re.compile(r"https:\/\/images\.harborfreight\.com\/hftweb\/promotions(.+?)(png|jpg)")
# Base URL of the coupon database site; "/browse", "/coupons/" and "/mass_coupon_submit" are appended to it by this script
HFQPDB = "https://www.hfqpdb.com"
HFQPDB_RE = re.compile(r'\/coupons\/[^"]+') # Coupons sometimes don't end with a file extension, so this matches to the end of <img src="">
# Folder (relative to the working directory) where HF coupons missing from HFQPDB are written; it is deleted at the start of every run
SAVE_DIR = "coupons/"
SIMILAR_THRESHOLD = 0.8 # How similar two images have to be to be considered the same (minimum cv2.TM_CCOEFF_NORMED template-match score)
def download_coupons(url, re_obj, desc, npos, replace="", replace_with=""):
    """
    Scan a web page for coupon image URLs and download every image that is found

    url          -- page to scan for coupon image links
    re_obj       -- compiled regex; every full match on a page line is treated as one image URL
    desc         -- label for the tqdm progress bar
    npos         -- tqdm bar position, so several downloads can show progress at the same time
    replace      -- substring of each matched URL to swap out (default: nothing is replaced)
    replace_with -- text that takes the place of `replace`

    Returns (coupons, failed_urls):
    coupons is a list of (image_bytes, image_digest, image_name) tuples, one per downloaded image
    failed_urls is a list of every URL (the page itself or an image) that could not be downloaded
    """
    import hashlib  # Local import so the function does not depend on a new module-level import

    page_url = url
    failed_urls = []
    found_urls = {}  # Harbor Freight HTML can have duplicate URLs; a dict keeps first-seen order and de-duplicates in O(1)
    try:
        with urllib.request.urlopen(page_url) as web_page:
            for line in web_page.readlines():
                for coupon in re_obj.finditer(line.decode("unicode-escape")):
                    found_urls.setdefault(coupon.group().replace(replace, replace_with))
    except urllib.error.URLError:
        failed_urls.append(page_url)

    def _fetch(image_url):
        image_name = image_url[image_url.rfind("/") + 1:]
        try:
            # Context manager closes the HTTP response even if read() fails
            with urllib.request.urlopen(image_url) as response:
                image_bytes = response.read()
        except (urllib.error.URLError, http.client.InvalidURL, ValueError):
            # URLError = image doesn't actually exist on HF website
            # InvalidURL = bugged file path on HF website
            # ValueError = URL has no usable scheme or contains characters that can't be sent
            return None, None, image_name, image_url
        # The built-in hash() of bytes is salted per interpreter, so values produced in
        # different worker processes cannot be compared; a SHA-256 digest is stable everywhere
        return image_bytes, hashlib.sha256(image_bytes).hexdigest(), image_name, image_url

    coupons = []
    with ThreadPoolExecutor() as tpool:
        requests = [tpool.submit(_fetch, image_url) for image_url in found_urls]
        if requests:
            pbar = tqdm(total=len(requests), position=npos, desc=desc)
            for request in as_completed(requests): # yields futures as they complete
                image_bytes, image_digest, image_name, image_url = request.result()
                if image_bytes is not None:
                    coupons.append((image_bytes, image_digest, image_name))
                else:
                    failed_urls.append(image_url)
                pbar.update(1)
        elif not failed_urls: # Only prints if no failed URLs and no coupon downloaded
            print("No coupons found :", page_url)
    return coupons, failed_urls
def coupons_are_similar(coupon_a, coupon_b):
    """
    Decide whether two encoded coupon images show (roughly) the same coupon

    coupon_a, coupon_b -- raw encoded image data (bytes) of the two coupons

    Returns True when a grayscale template match between the two images reaches SIMILAR_THRESHOLD
    Returns False otherwise, including when either image cannot be decoded or the two images
    cannot be compared in either order
    """
    def decode_gray(image_bytes):
        """
        Decodes encoded image bytes into a grayscale CV2 image; returns None if the data is not a readable image
        """
        try:
            raw = np.frombuffer(image_bytes, np.uint8) # Convert binary string to ndarray
            decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR) # Convert ndarray to CV2 image
            if decoded is None: # imdecode signals invalid image data by returning None
                return None
            return cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY) # Convert to grayscale
        except cv2.error: # Happens for an empty buffer
            return None

    def template_cmp(image, template_image):
        """
        Slides template_image over image, checking for similarities; template_image must not be greater than the image dimensions
        Returns True/False for a completed comparison, None when the comparison could not be run
        """
        try:
            res = cv2.matchTemplate(image, template_image, cv2.TM_CCOEFF_NORMED)
        except cv2.error: # Happens when the template is larger than input image
            return None
        # If there are similarities greater than threshold, they are probably the same coupon
        return bool((res >= SIMILAR_THRESHOLD).any())

    coupon_a_gray = decode_gray(coupon_a)
    coupon_b_gray = decode_gray(coupon_b)
    if coupon_a_gray is None or coupon_b_gray is None:
        # An unreadable image can't be matched against anything
        return False

    # Try to use coupon A as input, coupon b as the template
    # If that fails, switch the two around and try again
    are_similar = template_cmp(coupon_a_gray, coupon_b_gray)
    if are_similar is None:
        are_similar = template_cmp(coupon_b_gray, coupon_a_gray)
    return bool(are_similar) # None (neither order worked) counts as not similar
def process_coupon(hf_coupon, database):
    """
    Check one Harbor Freight coupon against the database coupons and save it to SAVE_DIR if it is missing

    hf_coupon -- (image_bytes, image_hash, image_name) tuple for the Harbor Freight coupon
    database  -- iterable of (image_bytes, image_hash, image_name) tuples for the database coupons

    Returns the coupon's image name when it was not found in the database (the image is then
    written to SAVE_DIR under that name), otherwise None
    """
    hf_image, hf_image_hash, hf_name = hf_coupon
    if hf_image_hash is None:
        return None

    db_entries = list(database) # Scanned twice below, so make sure it can be re-iterated

    # Cheap pass first: an identical hash anywhere in the database settles it without any image processing
    if any(hf_image_hash == db_image_hash for _db_image, db_image_hash, _db_name in db_entries):
        return None

    # Expensive pass: coupon images are fairly similar (CV template match)
    if any(coupons_are_similar(db_image, hf_image) for db_image, _db_image_hash, _db_name in db_entries):
        return None

    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(f"{SAVE_DIR}{hf_name}", "wb") as fp:
        fp.write(hf_image)
    return hf_name
if __name__ == "__main__":
    # Script entry point: download the HF and HFQPDB coupons, compare them, then report
    # which HF coupons are missing from HFQPDB (those are saved in SAVE_DIR)

    # TODO coupons are sometimes hidden on mobile coupon site: https://go.harborfreight.com/coupons/

    # Delete old coupon folder, if it exists
    if os.path.exists(SAVE_DIR):
        shutil.rmtree(SAVE_DIR)

    # The with-block shuts the worker processes down once all results are gathered,
    # so they are not left running while waiting for the user at the final prompt
    with ProcessPoolExecutor() as p_executor:
        # Download coupons from HF and DB; each download runs in its own process and uses many threads
        db_requests = p_executor.submit(download_coupons, f"{HFQPDB}/browse", HFQPDB_RE, "Downloading HFQPDB ", 0, "/coupons/thumbs/tn_", f"{HFQPDB}/coupons/")
        main_requests = p_executor.submit(download_coupons, HF, HF_RE, "Downloading HF ", 1)
        promo_request = p_executor.submit(download_coupons, HF_PROMO, HF_PROMO_RE, "Downloading HF Promo", 2)

        # Gather downloaded coupons
        db_coupons, db_failed = db_requests.result()
        main_coupons, main_failed = main_requests.result()
        promo_coupons, promo_failed = promo_request.result()
        hf_coupons = main_coupons + promo_coupons
        failed_urls = db_failed + main_failed + promo_failed

        # Process coupons
        process_reqs = [p_executor.submit(process_coupon, hf_coupon, db_coupons) for hf_coupon in hf_coupons]

        # Gather processed coupon results
        not_found = []
        with tqdm(total=len(process_reqs), desc="Processing coupons ") as pbar:
            for request in as_completed(process_reqs): # yields futures as they complete
                coupon_not_found = request.result()
                if coupon_not_found is not None:
                    not_found.append(coupon_not_found)
                pbar.update(1)

    # Print out image URLs that failed to download; all web request are completed at this point
    if failed_urls:
        print("\nFAILED TO DOWNLOAD:")
        for url in failed_urls:
            print(url)

    # Print out image names that were not found on HFQPDB
    if not_found:
        print("\nNot found on HFQPDB:")
        for name in not_found:
            print(name)

    # Expect the DB size to be larger than the current HF coupon page; DB contains never expire coupons that HF doesn't advertise
    print(f"\n{len(hf_coupons) - len(not_found)}/{len(hf_coupons)} Harbor Freight coupons found on HFQPDB (DB coupon count={len(db_coupons)})")
    if not not_found:
        print("HFQPDB IS UP TO DATE")
    else:
        print(f"Consider uploading the {len(not_found)} missing coupon(s) to {HFQPDB}/mass_coupon_submit\nCoupon save location: {os.getcwd()}{os.sep}{SAVE_DIR}")
    input("Press ENTER key to exit")