-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFake News.py
65 lines (48 loc) · 1.96 KB
/
Fake News.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
"""
Fake-news classifier.

Created on Tue Dec 24 13:02:36 2019
@author: Hans

Loads a labelled news dataset (expected columns: ``text`` with the article
body and ``label`` with values 'FAKE'/'REAL' -- 6335 articles in the
original CSV), vectorises the text with TF-IDF, trains an online
Passive-Aggressive classifier, and reports accuracy plus a confusion matrix.
"""
# NOTE(review): the original file started with the IPython magic ``%reset -f``,
# which is a SyntaxError in a plain .py script; it has been removed so the
# file actually runs under the regular interpreter.
import numpy as np
import pandas as pd
import itertools  # kept from the original file; currently unused
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Path to the training data -- adjust for your machine.
DATA_PATH = 'D:\\Git_Projects\\Fake News\\news.csv'


def _train_and_evaluate(text, label, test_size=0.2, random_state=7):
    """Train a TF-IDF + Passive-Aggressive pipeline and evaluate it.

    Parameters
    ----------
    text : pandas.Series
        Article bodies.
    label : pandas.Series
        'FAKE'/'REAL' labels aligned with *text*.
    test_size : float, optional
        Fraction of the data held out for testing (default 0.2).
    random_state : int, optional
        Seed for the train/test split. The original script left the split
        unseeded, so every run produced a different accuracy; seeding makes
        results reproducible.

    Returns
    -------
    tuple
        ``(score, cm)``: accuracy in [0, 1] and the 2x2 confusion matrix
        with rows/columns ordered ['FAKE', 'REAL'].
    """
    x_train, x_test, y_train, y_test = train_test_split(
        text, label, test_size=test_size, random_state=random_state)

    # Stop words carry no fake-vs-real signal, so drop them; max_df=0.8
    # additionally discards terms appearing in more than 80% of documents
    # (TF-IDF weighs term frequency against inverse document frequency).
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
    # Fit on the training set only; the test set is merely transformed,
    # which avoids leaking test vocabulary statistics into training.
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # Online Passive-Aggressive algorithm: the weights are left unchanged
    # on a correct prediction and, on a mistake, corrected aggressively --
    # just enough (minimal change) to make that example classified correctly.
    pac = PassiveAggressiveClassifier(C=0.01, max_iter=500)
    pac.fit(tfidf_train, y_train)

    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
    return score, cm


if __name__ == '__main__':
    # Read in the data and show its dimensions and first rows.
    # (The original called df.head() without printing, which shows nothing
    # when run as a script.)
    df = pd.read_csv(DATA_PATH)
    print(df.shape)
    print(df.head())

    score, cm = _train_and_evaluate(df.text, df.label)
    print(f'Accuracy: {round(score * 100, 2)}%')
    # Rows are true labels, columns are predictions (order FAKE, REAL),
    # so the off-diagonal cells are the false positives/negatives.
    print(cm)