nlp.py
import urllib.request
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
import operator
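
# NOTE: stopwords.words('english') relies on the NLTK stopword corpus;
# if it is not already installed, run nltk.download('stopwords') once.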

# Get HTML document from URL
# Returns the raw HTML (as bytes)
def getPage(url: str):
    response = urllib.request.urlopen(url)
    return response.read()

# Cleanup HTML to just get the text
# pip install beautifulsoup4
# pip install html5lib
def toSoup(html):
    soup = BeautifulSoup(html, 'html5lib')
    return soup.get_text(strip=True)
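
# Note: get_text() also returns the contents of <script> and <style> tags;
# removing those tags from the soup first would give cleaner text.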

# Count the frequency of tokens within the page
# Returns the num_to_get most frequent tokens as (token, count) pairs
# pip install nltk
def countWords(tokens, num_to_get):
    # Remove 'stopwords' from the page - words such as 'and', 'the', etc.
    sr = stopwords.words('english')
    clean_tokens = tokens[:]
    for token in tokens:
        if token in sr:
            clean_tokens.remove(token)
    # Use NLTK to map each token to its number of occurrences
    freq = nltk.FreqDist(clean_tokens)
    # Sort the (token, count) pairs by count, in descending order
    sorted_items = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    # Return the num_to_get most common pairs
    return sorted_items[:num_to_get]
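
# Note: nltk.FreqDist also provides most_common(n), which would return the same
# (token, count) pairs as the manual sort in countWords above.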

if __name__ == '__main__':
    # Fetch the raw HTML of the page
    html = getPage('https://en.wikipedia.org/wiki/Tesla,_Inc.')
    # Cleanup HTML to just get the text
    text = toSoup(html)
    # Convert text into tokens
    tokens = text.split()
    # Count the frequency of words and print the 10 most common
    words = countWords(tokens, 10)
    for key, val in words:
        print(str(key) + ': ' + str(val))
    # Print a statement about what the page is about
    print("\nI'm thinking that this page is about " + str(words[0][0]) + ".")