-
Notifications
You must be signed in to change notification settings - Fork 0
/
II_sentimentCode.py
executable file
·91 lines (83 loc) · 4.13 KB
/
II_sentimentCode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
========================================================= PART I =======================================================
=Uses the receiveEmail_V2.py code to receive email contents. The contents are also stored in a database, but for the =
=sake of clarity this script does not fetch them from the database; it still retrieves the email contents directly =
=from the email server. =
========================================================================================================================
"""
# Importing II_receiveEmail makes its module-level `allEmailsList` available;
# it is bound here under the same name and later passed to sentimentCode.
# NOTE(review): presumably the import itself triggers the email retrieval
# described in the banner above -- confirm in II_receiveEmail.
import II_receiveEmail as RE
allEmailsList = RE.allEmailsList
"""
======================================================== PART II =======================================================
=Uses a Baidu ernie module to run sentiment analysis on the email contents. It is a ready-to-use pre-trained module =
=included in paddlehub. Although the ernie module within the paddlehub package has a few drawbacks, including =
=occupying a huge amount of memory, it works fine with English text. NOTE: the code below loads 'senta_bilstm'. =
========================================================================================================================
"""
import paddlehub as hub
# Significance thresholds for positive / negative sentiment results. Both are
# passed to the sentimentCode constructor at the bottom of this file.
positive_significantLevel = 0.6 # Set a significant level for sentiment results
negative_significantLevel = 0.6
import re
from bs4 import BeautifulSoup
import nltk
import os
from tqdm import tqdm
# Limit CUDA to device index 0.
# NOTE(review): this assignment runs after paddlehub has already been imported;
# confirm the variable is still read early enough to take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Project-local text normaliser; its tn_preProcessing.pre_process_document is
# used to prepare each email's plain text for classification.
from BK import textNormalizer_V2 as tn
class sentimentCode:
    """Classify the sentiment of email bodies with paddlehub's senta_bilstm.

    ``sentimentData`` is a list of per-email mappings. The keys read by this
    class are 'PLAINTEXT' (the body to classify) and 'EMAILDATE'.
    """

    # One or more consecutive carriage-return / line-feed characters.
    # The previous pattern was 'r[\r|\n|\r\n]+' -- the raw-string prefix had
    # slipped inside the quotes, so it only matched a literal "r" followed by
    # line breaks and never collapsed ordinary blank lines.
    _LINE_BREAK_RUN = re.compile(r'[\r\n]+')

    def __init__(self, sentimentData, posi_sigLevel, nega_sigLevel):
        """Load the sentiment module and remember the inputs.

        Args:
            sentimentData: list of email records (mappings).
            posi_sigLevel: significance threshold for positive results.
            nega_sigLevel: significance threshold for negative results.

        The two thresholds are stored on the instance; no method of this
        class reads them.
        """
        self.senta = hub.Module(name="senta_bilstm")
        self.sentimentData = sentimentData
        self.posi_sigLevel = posi_sigLevel
        self.nega_sigLevel = nega_sigLevel

    @staticmethod
    def _collapse_newlines(text):
        """Replace every run of CR/LF characters in *text* with a single '\\n'."""
        return sentimentCode._LINE_BREAK_RUN.sub('\n', text)

    def strip_html_tags(self, text):
        """Return the text content of an HTML string.

        <iframe> and <script> elements are removed before the text is
        extracted, and runs of line breaks are collapsed to one newline.
        """
        soup = BeautifulSoup(text, 'html.parser')
        for element in soup(['iframe', 'script']):
            element.extract()
        return self._collapse_newlines(soup.get_text())

    def testText(self):  # Preparing the test text for sentiment test
        """Build the classifier input from ``self.sentimentData``.

        Records without a 'PLAINTEXT' key are skipped. For every other record
        the plain text is run through
        ``tn.tn_preProcessing.pre_process_document`` and stored with the
        record's 'EMAILDATE'.

        Returns:
            list of ``{"DATE": ..., "TEXT": ...}`` dicts, in input order.

        Raises:
            KeyError: if a record has 'PLAINTEXT' but no 'EMAILDATE'.
        """
        prepared = []
        for record in self.sentimentData:
            try:
                plain_text = record['PLAINTEXT']
            except KeyError:
                continue
            sentences = tn.tn_preProcessing.pre_process_document(plain_text)
            prepared.append({"DATE": record['EMAILDATE'], "TEXT": sentences})
            # NOTE: the 'HTMLTEXT' body is intentionally not processed here.
            # It used to be parsed and then discarded, which cost time without
            # affecting the result; strip_html_tags() remains available for
            # when HTML bodies are actually added to the output.
        return prepared

    def totalResult(self):
        """Classify every prepared email and return the results.

        Shows a tqdm progress bar while iterating and prints a completion
        banner at the end.

        Returns:
            list of ``{"DATE": ..., "RESULTS": ...}`` dicts, where "RESULTS"
            is whatever ``senta.sentiment_classify`` returned for that email.
        """
        classified = []
        progress = tqdm(self.testText(), ncols=150, desc="Emails Sentiment", colour="green")
        for entry in progress:
            results = self.senta.sentiment_classify(texts=entry['TEXT'], use_gpu=True, batch_size=1)
            classified.append({"DATE": entry['DATE'], "RESULTS": results})
        print("\n", "Email contents sentiment is DONE!", "."*200, "\n")
        return classified
# Runs at import time: build the classifier over all retrieved emails and
# classify them. `totalResults` is the list returned by totalResult(), one
# {"DATE", "RESULTS"} dict per email that had plain text.
senti = sentimentCode(allEmailsList, positive_significantLevel, negative_significantLevel)
totalResults = senti.totalResult()
#print(totalResults)