# This script combs through the pages database to index, rank and search its contents
import os
import site_saver
from multiprocessing import Pool
from collections import Counter
def main_folders_manager():
# Creates and manages all necessary folders
# DATABASE INFO
global DATA_FILENAME
global LINK_LIST_FILENAME
global TEXT_LIST_FILENAME
global META_LIST_FILENAME
(DATA_FILENAME,
LINK_LIST_FILENAME,
TEXT_LIST_FILENAME,
META_LIST_FILENAME) = site_saver.get_filenames()
# MAIN FOLDER
global INDEXER_FOLDER
INDEXER_FOLDER = os.path.join(os.path.dirname(__file__), INDEXER_FOLDER)
if not os.path.exists(INDEXER_FOLDER):
os.mkdir(INDEXER_FOLDER)
# main_page_path_list FILE
global main_page_path_list_file
main_page_path_list_file = os.path.join(INDEXER_FOLDER, main_page_path_list_file)
# ranked_url_list FILE
global ranked_url_list_file
ranked_url_list_file = os.path.join(INDEXER_FOLDER, ranked_url_list_file)
def file_comber(curr_dir, file_name):
# Recursively finds and returns paths for a file name in a directory
    # Get full paths for every item (files and folders) in the current dir
    item_list = [os.path.join(curr_dir, x) for x in os.listdir(curr_dir)]
folder_list = [x for x in item_list if os.path.isdir(x)]
file_list = [x for x in item_list if os.path.isfile(x)]
path_list = []
if os.path.join(curr_dir, file_name) in file_list:
path_list.append(os.path.join(curr_dir, file_name))
    if folder_list:  # If there are subfolders, search them recursively
        for folder in folder_list:
            path_list += file_comber(folder, file_name)
return path_list
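# Usage sketch (hypothetical directory layout and file names; the real names come
# from site_saver.get_filenames()): searching for 'meta.txt' under a 'PAGES' folder
# returns every copy found anywhere in that tree, e.g.
#   file_comber('PAGES', 'meta.txt')
#   -> ['PAGES/site_a/meta.txt', 'PAGES/site_a/news/meta.txt', 'PAGES/site_b/meta.txt']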
def get_all_of_a_file_in_database(file_name):
# Finds recursively all instances of a file name in the database
return file_comber(PAGES_DATABASE_FOLDER, file_name)
def get_file_name_in_folder(file_name, folder):
    # Returns the path of 'file_name' in the directory that contains 'folder'
    # (any file in a page's folder works as the reference), or '' if absent
    folder = os.path.dirname(folder)
    if file_name in os.listdir(folder):
        return os.path.join(folder, file_name)
    return ''
# LIST CONVERSION
def list_of_lists_to_saveable_list(input_list):
    # Some lists, such as 'main_page_path_list' and 'ranked_url_list',
    # can't be written to a file line by line.
    # This function concatenates the contents of each sublist, separated by
    # a specific character, and turns the input into a flat list.
    # NOTE: This only works with a simple list of lists, in the format:
    # [['a', 'b', 'c'], ['d', 'e', 'f']]
    # Result: ['a|b|c', 'd|e|f']
    # If an int is present in a sublist, it is marked with an identifier.
    separation_char = '|'
    int_identifier = '´'
simple_list = []
    for item in input_list:
        parts = []
        for path in item:
            # If the subitem is an int, mark it with the identifier
            if isinstance(path, int):
                path = int_identifier + str(path)
            parts.append(path)
        simple_list.append(separation_char.join(parts))
return simple_list
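# Example of the flattened form (hypothetical values): [['a', 'b', 3]] -> ['a|b|´3'],
# where the '´' prefix marks the value that should be restored as an int on load.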
def unpack_list_of_lists(input_list):
    # Does the opposite of 'list_of_lists_to_saveable_list()':
    # returns the layered list in its original state
    separation_char = '|'
    int_identifier = '´'
    output_list = []
    for page in input_list:
        data = page.split(separation_char)
        for index, subitem in enumerate(data):
            if int_identifier in subitem:
                data[index] = int(subitem.split(int_identifier)[1])
        output_list.append(data)
return output_list
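# Minimal round-trip sketch for the two helpers above. It is only defined, never
# called at import time, and the values in it are hypothetical:
def _conversion_round_trip_example():
    packed = list_of_lists_to_saveable_list([['url', 'path', 7]])  # ['url|path|´7']
    return unpack_list_of_lists(packed)                            # [['url', 'path', 7]]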
# MAIN_PAGE_PATH_LIST
def gather_all_paths_in_database():
# Returns a list of all file paths for all pages
global DATA_FILENAME
global LINK_LIST_FILENAME
global TEXT_LIST_FILENAME
global META_LIST_FILENAME
# Through meta files, we can locate every page without directory wizardry.
file_name_list = [DATA_FILENAME, LINK_LIST_FILENAME, TEXT_LIST_FILENAME]
meta_list = get_all_of_a_file_in_database(META_LIST_FILENAME)
main_page_path_list = [[x] for x in meta_list]
# Make elements of 'main_page_path_list' lists containing all four file paths
    for name in file_name_list:
        for page in main_page_path_list:
            # Append the path of each companion file next to the page's meta file
            page.append(get_file_name_in_folder(name, page[0]))
return main_page_path_list
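# Each resulting entry holds the four file paths of one page, in the order
# [meta, data, link_list, text_list]; index 2 (the link file) is what the URL
# ranking below reads. With hypothetical file names, one entry looks like:
#   ['PAGES/x/meta.txt', 'PAGES/x/data.txt', 'PAGES/x/links.txt', 'PAGES/x/text.txt']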
def save_main_page_path_list():
global main_page_path_list
global main_page_path_list_file
data = list_of_lists_to_saveable_list(main_page_path_list)
site_saver.save_list_to_file(data, main_page_path_list_file)
def load_main_page_path_list():
global main_page_path_list
global main_page_path_list_file
    data = site_saver.load_list_from_file(main_page_path_list_file)
    main_page_path_list = unpack_list_of_lists(data)
def update_main_page_path_list():
global main_page_path_list
print('Update MPL')
main_page_path_list = gather_all_paths_in_database()
save_main_page_path_list()
# TERM SEARCHING
def search_term_in_list(search_term, in_list):
# Returns items in 'in_list' that contain the 'search_term'
return [x for x in in_list if search_term in x]
def search_list_in_page(search_term, path):
# Returns lines in a file that contain the 'search_term'
data = site_saver.load_list_from_file(path)
return search_term_in_list(search_term, data)
def full_search_in_page(search_term, path):
    # Returns more complete data for a search in a page:
    # [matching text lines, page URL, meta file path].
    # Any of the page's files is accepted as a valid 'path',
    # since only its directory is used to locate the meta and text files.
# Get info from page
page_base_dir = os.path.dirname(path)
meta_path = os.path.join(page_base_dir, META_LIST_FILENAME)
text_path = os.path.join(page_base_dir, TEXT_LIST_FILENAME)
url = (site_saver.load_list_from_file(meta_path)[0]).split('\\')[1]
# Search for term in text file
aux = [search_list_in_page(search_term, text_path), url, meta_path]
return aux
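# Shape of the returned value (values hypothetical; this assumes the first line of
# the meta file carries the page URL after a backslash, as the split above implies):
#   [['a line containing the term', '...'], 'http://example.com', 'PAGES/x/meta.txt']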
def search_term_in_ranked_database(search_term):
# Search depending on relevance
global ranked_url_list
results = []
sample = ranked_url_list
    # Prepare the sample list with the correct arguments:
    # each work item carries its own copy of 'search_term', because
    # 'pool.starmap()' unpacks one argument tuple per call
sample = [[search_term, x[1]] for x in sample]
# Search all pages with multiprocessing.
data_pack_bundle = []
with Pool() as pool:
data_pack_bundle = pool.starmap(full_search_in_page, sample, chunksize=5)
    # Drop pages that produced no matching lines
    data_pack_bundle = [x for x in data_pack_bundle if x[0] != []]
return data_pack_bundle
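# Sketch of the work items handed to pool.starmap() above (paths hypothetical):
#   [['plane', 'PAGES/a/meta.txt'], ['plane', 'PAGES/b/meta.txt'], ...]
# starmap() then runs full_search_in_page('plane', 'PAGES/a/meta.txt'), and so on,
# one call per sublist, spread across the worker processes.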
# RANKED_URL_LIST
def get_url_ranking_from_database():
    # Ranks pages by the number of times they are mentioned in the database
# Returns [URL, meta file path, times cited] in descending order.
global main_page_path_list
global main_page_path_list_file
print('Ranking URLs')
# Get all links possible in all indexed pages
all_links = []
    sample = main_page_path_list  # Work set: the current page index
link_list_paths = [x[2] for x in sample] # Get all link files' paths
pages_work_set = []
# Reduce sample list to [URL, meta_file_path]
for page in sample:
meta_path = page[0]
page_link = (site_saver.load_list_from_file(meta_path)[0]).split('\\')[1]
pages_work_set.append([page_link, meta_path])
# Collect the contents of all link files with multiprocessing
data_pack_bundle = []
with Pool() as pool:
data_pack_bundle = pool.map(site_saver.load_list_from_file, link_list_paths, chunksize=5)
    # Put all links from the bundle into a single flat list
for page in data_pack_bundle:
for link in page:
all_links.append(link)
# Count all obtained links
count_list = Counter(all_links)
    # A de-duplicated set of 'all_links' is not used by the current ranking;
    # 'pages_work_set' already holds one entry per indexed page
# Assemble list of every link and their score
full_mention_list = [[x[0], x[1], count_list[x[0]]] for x in pages_work_set]
    full_mention_list.sort(key=lambda entry: entry[2], reverse=True)
    # Keep the score in the returned list; uncomment below to strip it
    # page_link_rank = [x[0] for x in full_mention_list]
    page_link_rank = full_mention_list
print('URLs ranked!')
return page_link_rank
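# The scoring above relies on collections.Counter, e.g. (hypothetical links):
#   Counter(['http://a', 'http://b', 'http://a'])['http://a'] -> 2
# so a page's score is how many times its URL appears across all collected link lists.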
def save_ranked_url_list():
global ranked_url_list
global ranked_url_list_file
data = list_of_lists_to_saveable_list(ranked_url_list)
site_saver.save_list_to_file(data, ranked_url_list_file)
def load_ranked_url_list():
global ranked_url_list
global ranked_url_list_file
    data = site_saver.load_list_from_file(ranked_url_list_file)
    ranked_url_list = unpack_list_of_lists(data)
def update_ranked_url_list():
# The rank of the currently indexed pages
global ranked_url_list
    update_main_page_path_list()  # This already saves the refreshed path list
print('Update RUL')
ranked_url_list = get_url_ranking_from_database()
save_ranked_url_list()
# MAIN
def main():
global main_page_path_list
global main_page_path_list_file
global ranked_url_list
global ranked_url_list_file
    load_main_page_path_list()
    load_ranked_url_list()
    rebuild_index = False  # Set to True to rebuild the path index and URL ranking
    if rebuild_index:
        update_ranked_url_list()
    # Example query against the ranked database
    search_term = 'plane'
    result = search_term_in_ranked_database(search_term)
    print('Pages with results for', repr(search_term) + ':', len(result))
# DATABASE INFO
PAGES_DATABASE_FOLDER = site_saver.get_pages_database_path()
DATA_FILENAME = ''  # Filled in by main_folders_manager()
LINK_LIST_FILENAME = ''
TEXT_LIST_FILENAME = ''
META_LIST_FILENAME = ''
INDEXER_FOLDER = 'INDEXER'
main_page_path_list_file = 'INDEXER_MPL.txt'
main_page_path_list = []
ranked_url_list = []
ranked_url_list_file = 'INDEXER_URL_rank.txt'
main_folders_manager()
if __name__ == '__main__':
main()