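"""Hotel price scraper (summary inferred from the code below).

Pulls nightly rates for a fixed set of hotels from Booking.com and Expedia.ie,
driving Chromium via pyppeteer and extracting listings with selectorlib YAML
templates, then writes a styled Excel report with one sheet per search filter.
"""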
import asyncio
import copy
from datetime import timedelta, datetime as dt
from re import sub

import nest_asyncio
import pandas as pd
from pyppeteer import launch
from selectorlib import Extractor

from the_channels import get_channels

# Allow run_until_complete() inside an already-running event loop (e.g. Jupyter).
nest_asyncio.apply()
# Test-mode toggles limit the run to a small sample of channels and periods.
test_mode_channels, test_mode_periods = True, True
end_page_sleep_time = 0  # seconds to pause after each scroll
stay = 7                 # length of stay in nights
filters2sheets = {}
exception_keys = ['']    # extracted keys to skip, e.g. ['room_type', 'beds', 'number_of_ratings']
fieldnames = ["name", "location", "price", "channel", "period", "rating"]  # report columns, in order
# Our hotels, mapped to the channel each is tracked on ('ALL CHANNELS' = every configured channel).
our = {
"Stamos All Inclusive":"Booking.com",
"Castellum Suites - All Inclusive":"ALL CHANNELS",
"Bivalvia Studios":"Booking.com",
"Kamari Beach Hotel - All inclusive":"Expedia.ie",
"Aloe Hotel – Adults Only":"Expedia.ie",
"Stamos Hotel - All Inclusive":"Expedia.ie",
"Lido Star Beach":"Expedia.ie",
"Bivalvia Beach Plus":"Expedia.ie",
"Kamari Beach Hotel":"Booking.com",
"Aloe Plus Hotel":"Booking.com",
"Kouros Home Hotel":"ALL CHANNELS"
}
our_hotels = list(our.keys())
competitors = []  # e.g. ["AFANDOU BLU", "Lymberia Hotel"]
def get_periods(stay=7, test_mode=True):
    """Return a list of (check-in, check-out) tuples of date-part dicts, each `stay` nights apart."""
    def to_parts(d):
        return {'day': '{:02d}'.format(d.day), 'month': '{:02d}'.format(d.month), 'year': str(d.year)}
    if test_mode:
        init_periods = ['2025-05-04']  # e.g. '2025-08-28', '2025-09-28', '2025-10-28'
        def generate_period(period):
            start = dt.strptime(period, '%Y-%m-%d').date()
            return (to_parts(start), to_parts(start + timedelta(days=stay)))
        return [generate_period(period) for period in init_periods]
    else:
        # Full season: back-to-back stays from 2025-05-01 until 2025-11-01.
        start = dt.strptime('2025-05-01', '%Y-%m-%d').date()
        season_end = dt.strptime('2025-11-01', '%Y-%m-%d').date()
        res, end = [], start + timedelta(days=stay)
        while season_end > end:
            res.append((to_parts(start), to_parts(end)))
            start, end = end, end + timedelta(days=stay)
        return res
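# Illustrative output: get_periods(stay=7, test_mode=True) returns one period such as
#   ({'day': '04', 'month': '05', 'year': '2025'}, {'day': '11', 'month': '05', 'year': '2025'})
# i.e. a 7-night stay checking in on 2025-05-04 and out on 2025-05-11.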
channels, periods = get_channels(test_mode=test_mode_channels), get_periods(stay=stay, test_mode=test_mode_periods)
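# `channels` (from the_channels.py) is assumed, from how it is accessed below, to map
# channel name -> {'url': {filter_name: search_url}, 'referer': ..., 'yaml_file': ...}.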
def channel_url(channel, p, url):
    """Rewrite the check-in/check-out query parameters of `url` for period `p`."""
    if channel == 'Booking.com':
        url = sub(r"checkin_year=\w+&", "checkin_year={}&".format(p[0]['year']), url)
        url = sub(r"checkin_month=\w+&", "checkin_month={}&".format(p[0]['month']), url)
        url = sub(r"checkin_monthday=\w+&", "checkin_monthday={}&".format(p[0]['day']), url)
        url = sub(r"checkout_year=\w+&", "checkout_year={}&".format(p[1]['year']), url)
        url = sub(r"checkout_month=\w+&", "checkout_month={}&".format(p[1]['month']), url)
        url = sub(r"checkout_monthday=\w+&", "checkout_monthday={}&".format(p[1]['day']), url)
    elif channel == 'Expedia.ie':
        chin = '{}-{}-{}'.format(p[0]['year'], p[0]['month'], p[0]['day'])
        chout = '{}-{}-{}'.format(p[1]['year'], p[1]['month'], p[1]['day'])
        url = sub(r"d1=.*?&", "d1={}&".format(chin), url)
        url = sub(r"d2=.*?&", "d2={}&".format(chout), url)
        url = sub(r"startDate=.*?&", "startDate={}&".format(chin), url)
        url = sub(r"endDate=.*?&", "endDate={}&".format(chout), url)
    return url
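# Illustrative example (the URL shape here is assumed, not a real saved search):
#   channel_url('Expedia.ie', p, 'https://www.expedia.ie/Hotel-Search?d1=2025-01-01&d2=2025-01-08&sort=PRICE')
# returns the same URL with d1/d2 (and startDate/endDate, when present) set to p's dates.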
async def scrape(channel, url, period, filter_name):
    """Open `url` in Chromium via pyppeteer and extract listings with the channel's selectorlib YAML."""
    print("Downloading from Channel: {channel} > Period: {period} | filter_name: {filter_name} | Stay {stay}.".format(channel=channel, period=period, filter_name=filter_name, stay=str(stay)))
    print("Launching browser...")
browser = await launch(
headless=False, # Set to False to see the browser
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--window-size=1920,1080'
],
dumpio=True # Print browser console logs
)
print("Browser launched successfully")
page = await browser.newPage()
print("New page created")
# Set user agent
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36')
print("User agent set")
# Set headers
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
'Referer': channels[channel]['referer']
})
print("Headers set")
try:
print(f"Navigating to URL: {url}")
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})  # 60-second timeout
print("Page loaded")
# Scroll to bottom three times
for i in range(3):
print(f"Scrolling page (attempt {i+1}/3)")
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
await asyncio.sleep(end_page_sleep_time)
print("Page Load Completed!")
content = await page.content()
e = Extractor.from_yaml_file(channels[channel]['yaml_file'])
res = e.extract(content, base_url=channels[channel]['referer'])
return res
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        raise
finally:
print("Closing browser")
await browser.close()
print("Browser closed")
async def process_filter(writer_i, filter_name_i):
    """Scrape every channel x period combination for one filter, appending rows column-wise."""
    writer, filter_name = copy.deepcopy(writer_i), filter_name_i  # deep copy keeps the caller's dict pristine
    try:
for channel in channels:
for p in periods:
period = "{s_year}-{s_month}-{s_day}".format(s_year=p[0]['year'], s_month=p[0]['month'], s_day=p[0]['day'])
url = channel_url(channel, p, channels[channel]['url'][filter_name])
data = await scrape(channel, url, period, filter_name)
                count = 0  # row index into any list-valued extractor fields
if data and 'hotels' in data and data['hotels'] is not None:
                    for h in data['hotels']:
                        record = {'period': dt.strptime(period, '%Y-%m-%d').date(), 'channel': channel}
                        skip_row = False
                        for k, v in h.items():
                            if k == 'name' and str(v).strip() in ('NoneType', 'None', ''):
                                skip_row = True  # listing without a usable name: drop the row
                            elif not skip_row and k not in exception_keys:
                                if isinstance(v, list):
                                    v = v[count] if count < len(v) else None
                                if k == 'price':
                                    # e.g. "€1,234.56" -> 1234.56; strips currency symbol, separators, and (non-breaking) spaces
                                    record[k] = float(v.replace("€", '').replace(',', '').replace('\xa0', '').replace(' ', '')) if v is not None else None
                                elif k != 'period':
                                    record[k] = v
                        if not skip_row:
                            count += 1
                            for key in fieldnames:
                                writer[key].append(record.get(key))  # .get() leaves missing fields blank
    except Exception:
        raise  # re-raise unchanged; explicit hook for future per-filter error handling
    return {filter_name: writer}
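# The returned mapping is column-oriented, ready for pd.DataFrame (filter name shown is hypothetical):
#   {'cheapest': {'name': [...], 'location': [...], 'price': [...], 'channel': [...], 'period': [...], 'rating': [...]}}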
async def main():
    """Scrape all filters and return {filter_name: column-oriented data}, one sheet's worth per filter."""
    filters = next(iter(channels.values()))['url'].keys()  # the code assumes every channel defines the same filter set
filters2sheets = {}
for filter_name in filters:
print("************ filter_name =", filter_name)
writer = {k:[] for k in fieldnames}
result = await process_filter(writer, filter_name)
filters2sheets.update(result)
print("************ Processing complete!")
return filters2sheets
if __name__ == '__main__':
    # Run the async main function (nest_asyncio lets this work inside notebooks too).
    filters2sheets = asyncio.get_event_loop().run_until_complete(main())
    # Generate the Excel report: one sheet per filter.
    prod_date = str(dt.now().date())
    env_filename = 'Test Mode Scraping Report' if (test_mode_periods or test_mode_channels) else 'Full Price Scraping Report'
    xlsx_writer = pd.ExcelWriter('{} {}.xlsx'.format(env_filename, prod_date), engine='xlsxwriter')
    for a_filter, writer_data in filters2sheets.items():
        df = pd.DataFrame(writer_data)
        def custom_style(s):
            # Row-wise Styler: green for our hotels, red for tracked competitors.
            name = str(s.iloc[0]).strip()  # first column is 'name'
            if name in our_hotels: return ['background-color: green'] * len(s)
            elif name in competitors: return ['background-color: red'] * len(s)
            return [''] * len(s)
        channel_names = list(channels.keys())
        df.style.apply(custom_style, axis=1).to_excel(xlsx_writer, sheet_name=a_filter, index=False, header=True, startrow=5)
workbook = xlsx_writer.book
worksheet = xlsx_writer.sheets[a_filter]
worksheet.set_column('A:Z', 15)
header_format = workbook.add_format({'bold': True, 'font_color': 'black'})
text_wrap_format = workbook.add_format({'text_wrap': True})
red_format = workbook.add_format({'bold': True, 'font_color': 'red'})
for idx, channel in enumerate(channel_names):
worksheet.write(0, idx, channel+" Filter URL", header_format)
worksheet.write(1, idx, channels[channel]['url'][a_filter], text_wrap_format)
        worksheet.write(2, 0, "Channel Name", red_format)
        worksheet.write(3, 0, "Found (ours):", red_format)
        worksheet.write(4, 0, "^ Found Periods:", red_format)
        # set(A) - (set(A) - set(B)) is the intersection: our hotels that actually appear in the results.
        found_our_hotels = sorted(set(our_hotels) & set(writer_data['name']))
        for idx, hotel_name in enumerate(found_our_hotels):
            worksheet.write(2, idx+1, our[hotel_name])
            worksheet.write(3, idx+1, hotel_name)
            # A hotel tracked on ALL CHANNELS should appear once per channel per period.
            weight = len(channels) if our[hotel_name] == 'ALL CHANNELS' else 1
            period_completeness = writer_data['name'].count(hotel_name) / len(periods) / weight
            completeness_color = 'green' if period_completeness == 1 else 'red'
            found_periods_format = workbook.add_format({'bold': True, 'font_color': completeness_color})
            found_periods_format.set_num_format(10)  # built-in format 10 = '0.00%'
            worksheet.write(4, idx+1, period_completeness, found_periods_format)
xlsx_writer.close()
print("Report generated successfully!")