-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_data.py
100 lines (67 loc) · 2.4 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
@author: Sriram Veturi
@title: SmartSearch - An Intelligent Search Engine.
@date: 05/06/2019
"""
import os
import json
import collections
from clean_html import extract_text_from_page
from clean_html import preprocess_text
# Relative path of the directory that holds the generated "<index>.json" documents.
DATA_DIRECTORY = "./documents"
def create_data_directory():
    """
    Function to create a directory to store the documents.

    The directory path is taken from the module-level DATA_DIRECTORY.
    Missing parent directories are created as well.

    :return True/False: Creation Successful Flag. True if the directory
        already existed or was created, False if it could not be created
        (the error is printed).
    """
    # If it already exists, return True.
    if os.path.isdir(DATA_DIRECTORY):
        print("Directory to store the documents already exists. Moving on.")
        return True
    try:
        # exist_ok=True: if something else creates the directory between the
        # check above and this call, that still counts as success.
        os.makedirs(DATA_DIRECTORY, exist_ok=True)
    except OSError as e:
        # Covers permission problems and a regular file occupying the path.
        print(e)
        return False
    print("Directory created to store the documents.")
    return True
def create_documents(sites_list, parent_children_url_map):
    """
    Function to create documents in the directory.

    For every url in sites_list the page text is extracted and preprocessed,
    and a "<index>.json" file is written to DATA_DIRECTORY with the keys
    "INDEX", "URL", "OUTGOING_LINKS" and "WORD_COUNT_MAP". A url that fails
    at any step is reported and skipped; the document index only advances
    after a successful write, so document names stay contiguous.

    :param sites_list: The list of sites crawled.
    :param parent_children_url_map: Mapping from a url to the list of its
        outgoing urls. Urls without an entry get an empty list.
    :return True/False: True once all urls have been attempted, False if an
        error escapes the per-url handling (e.g. sites_list is not iterable).
    """
    # Index to store the document name.
    document_name_index = 1
    try:
        # Traverse through the urls, preprocess them and create document.
        for page_url in sites_list:
            try:
                print("Processing document {doc_number}.".format(doc_number=document_name_index))
                raw_text_data = extract_text_from_page(page_url)
                preprocessed_text = preprocess_text(raw_text_data)
                # Repeated spaces (or empty text) make split(" ") yield empty
                # strings; those are not words and must not be counted.
                preprocessed_text_words = [
                    word for word in preprocessed_text.split(" ") if word
                ]
                preprocessed_text_words_count_map = dict(collections.Counter(preprocessed_text_words))
                # Some urls in the parent_children_url_map would not have children.
                try:
                    outgoing_links = parent_children_url_map[page_url]
                except (LookupError, TypeError):
                    # No entry (or no usable map): empty list of outgoing urls.
                    outgoing_links = []
                document_contents_map = {
                    "INDEX": document_name_index,
                    "URL": page_url,
                    "OUTGOING_LINKS": outgoing_links,
                    "WORD_COUNT_MAP": preprocessed_text_words_count_map,
                }
                # Serialise before opening the file so that a serialisation
                # failure cannot leave a truncated document on disk.
                serialized_document = json.dumps(document_contents_map)
                # Create a json file which would store the web graph information of the url.
                document_name = str(document_name_index) + ".json"
                with open(os.path.join(DATA_DIRECTORY, document_name), 'w') as doc_file:
                    doc_file.write(serialized_document)
                document_name_index += 1
            except Exception as e:
                # Best effort: report the failing url and move on to the next one.
                print("Could not extract text from {url} because of the error below.\n".format(url=page_url))
                print(e)
        return True
    except Exception as e:
        print(e)
        return False