-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfindPrivacyContent.py
198 lines (170 loc) · 7.74 KB
/
findPrivacyContent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import re
# Location of the scraped course-listing HTML files.
path = './courseListings/'
# Anchored, case-sensitive e-mail matcher (local part @ dotted domain); used
# later to discard tag strings that are nothing but an e-mail address.
pattern = r"^[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?$"
# Every directory entry is a candidate input file (filtered to .html later).
fileList = os.listdir(path)
# Row-aligned accumulators for the main privacy-content report.
uniName, content, contentType, titlePage = [], [], [], []
# Row-aligned accumulators for pages whose <title> already names a privacy policy.
privacypolicyTitleUniName, privacypolicyTitleContent = [], []
privacypolicyTitleContentType, privacypolicyTitlePage = [], []
# Universities where no privacy-related text was found anywhere.
universitiesWithNoPrivacyRelatedContent = []
universitieClassificationWithNoPrivacyRelatedContent = []  # NOTE(review): not populated anywhere in this file
count = 0  # NOTE(review): never read after this point
# --- Helpers -----------------------------------------------------------------

def _clean_soup(soup):
    """Strip page chrome (header/footer/nav/style, GDPR/cookie banners) in place.

    NOTE(review): the original sub-link pass skipped the "[class*=Foot]"
    selector that the main-page pass applied; both passes now clean identically.
    """
    for tag_name in ('footer', 'header', 'nav', 'style'):
        for node in soup.find_all(tag_name):
            node.extract()
    for selector in ("[class*=foot]", "[class*=Foot]", "[class*=nav]",
                     "[class*=gdpr]", "[class*=GDPR]", "[class*=Gdpr]"):
        for node in soup.select(selector):
            node.extract()


def _privacy_tags(soup):
    """Yield non-<script> tags that mention 'privacy'/'Privacy' and hold a single string."""
    def mentions_privacy(tag):
        text = tag.get_text(strip=True, separator=' ')
        return 'privacy' in text or 'Privacy' in text
    for tag in soup.find_all(mentions_privacy):
        if tag.name == "script" or tag.string is None:
            continue
        yield tag


def _is_privacy_policy_title(title):
    """Return True when the <title> tag exists and names a privacy/GDPR page."""
    if title is None or title.string is None:
        return False
    lowered = title.string.lower()
    return ('privacy statement' in lowered or 'gdpr' in lowered
            or 'privacy notice' in lowered)


# Boilerplate phrases (FERPA notices, terms-of-use links, ...) that are not
# course-specific privacy content and must not be collected.
_SKIP_PHRASES = (
    "FERPA",
    "Family Educational Rights and Privacy act",
    "Privacy Statement",
    "Terms of Use",
    "Family Education Rights and Privacy Acts Policy",
    "Family Education Rights and Privacy Act",
    "The Family Educational Rights & Privacy Act",
)

# --- Main extraction pass ----------------------------------------------------
# For every university HTML file: clean the markup, then either (a) record the
# page under the dedicated "privacy policy page" report when its <title>
# already identifies it as one, or (b) harvest individual privacy-mentioning
# strings from the main page and from every saved sub-link page.
for file in fileList:
    # Per-university accumulators, merged into the global lists at the end.
    universityLevelUniName = []
    universityLevelContent = []
    universityLevelContentType = []
    universityLevelTitlePage = []
    if ".html" not in file:
        continue
    university = file.split(".")[0]
    print("------------------------ " + university + " ------------------------")
    # `with` closes the handle promptly (the original leaked every open file).
    with open(path + file, "r") as html_page:
        soup = BeautifulSoup(html_page, "html.parser")
    # Capture the title before cleanup, matching the original's ordering.
    title = soup.find('title')
    ct = 0  # becomes 1 once the university name has been emitted for this run
    _clean_soup(soup)
    # A page whose title marks it as a privacy policy is reported separately
    # and skipped entirely (including its sub-links), as in the original.
    if _is_privacy_policy_title(title):
        for tag in _privacy_tags(soup):
            privacypolicyTitleUniName.append(university)
            privacypolicyTitleContent.append(tag.string)
            privacypolicyTitleContentType.append("Main Link")
            privacypolicyTitlePage.append(title.string)
        continue
    # Harvest privacy-related strings from the main page.
    for tag in _privacy_tags(soup):
        if any(phrase in tag.string for phrase in _SKIP_PHRASES):
            continue
        # Drop duplicates and strings that are just an e-mail address.
        if tag.string in universityLevelContent or re.match(pattern, tag.string):
            continue
        print(tag.string, title)
        if ct == 0:
            universityLevelUniName.append(university)
            ct += 1
        else:
            universityLevelUniName.append(' ')
        universityLevelContent.append(tag.string)
        universityLevelContentType.append('Main Page')
        universityLevelTitlePage.append(title.string if title is not None else ' ')
    # Process saved sub-link pages when a sub-directory was crawled.
    # BUG FIX: the original `continue`d here when the directory was missing,
    # which also skipped the aggregation below and silently discarded the
    # main-page results (and the "no content" bookkeeping) for that university.
    subDir = path + university + "/"
    if os.path.exists(subDir):
        for subLink in os.listdir(subDir):
            with open(subDir + subLink, "r") as subHtml_page:
                soup = BeautifulSoup(subHtml_page, "html.parser")
            subLinkTitle = soup.find('title')
            _clean_soup(soup)
            # Sub-link titled as a privacy policy: report separately, skip harvesting.
            if _is_privacy_policy_title(subLinkTitle):
                for tag in _privacy_tags(soup):
                    privacypolicyTitleUniName.append(university)
                    privacypolicyTitleContent.append(tag.string)
                    privacypolicyTitleContentType.append("Sub Link")
                    privacypolicyTitlePage.append(subLinkTitle.string)
                continue
            # Harvest privacy-related strings from the sub-link page; dedupe
            # shares the same per-university list as the main-page pass.
            for tag in _privacy_tags(soup):
                if any(phrase in tag.string for phrase in _SKIP_PHRASES):
                    continue
                if tag.string in universityLevelContent or re.match(pattern, tag.string):
                    continue
                print(tag.string, subLink, tag.name, subLinkTitle)
                if ct == 0:
                    universityLevelUniName.append(university)
                    ct += 1
                else:
                    universityLevelUniName.append(' ')
                universityLevelContent.append(tag.string)
                universityLevelContentType.append('Sub link Page')
                universityLevelTitlePage.append(subLinkTitle.string if subLinkTitle is not None else ' ')
    # Record universities with nothing found, then merge into the global lists.
    if len(universityLevelContent) == 0:
        universitiesWithNoPrivacyRelatedContent.append(university)
    uniName += universityLevelUniName
    content += universityLevelContent
    contentType += universityLevelContentType
    titlePage += universityLevelTitlePage
# Write the three CSV reports: harvested privacy content, content from pages
# whose <title> already identified a privacy policy, and the universities
# where nothing privacy-related was found at all.
main_report = {
    "University Name ": uniName,
    "Page ": contentType,
    'title': titlePage,
    "Privacy related content ": content,
}
df = pd.DataFrame(main_report)
df.to_csv("./privacyContent.csv", index=False)

title_report = {
    "University Name ": privacypolicyTitleUniName,
    "Page ": privacypolicyTitleContentType,
    'title': privacypolicyTitlePage,
    "Privacy related content ": privacypolicyTitleContent,
}
df = pd.DataFrame(title_report)
df.to_csv("./privacyRelatedTitleContent.csv", index=False)

df = pd.DataFrame({"University Name ": universitiesWithNoPrivacyRelatedContent})
df.to_csv("./universitiesWithNoPrivacyRelatedContent.csv", index=False)