-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetText.py
75 lines (62 loc) · 2.46 KB
/
getText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from bs4 import BeautifulSoup
from requests import get
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer
import nltk
from pattern.web import Newsfeed, plaintext
url = 'https://wearechange.org/obama-ordered-cia-to-train-isis-jihadists-declassified-documents/?fbclid=IwAR20r8AwISfyoyudE7U7owT7Y-vLSwraN4cpkNzQeFo7pYfTuBiTMObHplQ'
htmlString = get(url).text
html = BeautifulSoup(htmlString, 'lxml')
# here we have to do "Inspect Element" to see the class the post belongs to
# For this particular website, the posts are like this
# <div class="entry-content"> <p> The Text </p> </div>
# So for each website, we have to change this
entries = html.find_all( {'class':'post-content entry-content', 'p':True}) # gets html code with
text = [e.get_text() for e in entries] # seperates text from html elements such as <div> etc.
print('{} posts were found.'.format(len(text)))
if len(text) > 1:
text = [' '.join(text)]
# url = 'http://feeds.feedblitz.com/SethsBlog'
# postText = []
# for post in Newsfeed().search(url):
# postText.append(plaintext(post.text))
# print('{} posts were found.'.format(len(postText)))
# pprint(postText[-1])
# rssKeys = {'language':post.language,
# 'author':post.author,
# 'url':post.url,
# 'title':post.title,
# 'id':post.id}
# pprint(rssKeys)
text = [x.replace('\n', '') for x in text]
text = [x.replace('\t', '') for x in text]
# Now that we got the text, we have to split it into Tokens and remove unnecessary words
# get english stopwords
sw = nltk.corpus.stopwords.words('english')
clean_text = []
# keep all the words that aren't stopwords
for word in text:
if word not in sw:
clean_text.append(word)
print(clean_text)
max_features = 4500
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
tokenizer.fit_on_texts(texts=clean_text)
X = tokenizer.texts_to_sequences(texts=clean_text)
# print(X)
X = pad_sequences(sequences=X, maxlen=max_features, padding='pre')
# Prediction
fakeNews = False
rf_model = tf.keras.models.load_model('model.h5')
rf_prediction = rf_model.predict_classes(X)
# gru_prediction[0][1]=0.3 # Just to check the if
print("scores of the regression:", rf_prediction)
# print(np.argmax(rf_prediction, axis=1)[0])
if rf_prediction == 1:
fakeNews = True
print("Fake News!")
else:
fakeNews=False
print("Reliable source!")