-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBagOfWords.py
42 lines (27 loc) · 1.98 KB
/
BagOfWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
def Vectorizer(realstring,fakestring,BoW_gram,stopWords=None):
    """Fit separate bag-of-words count models on the real and fake corpora.

    Two independent ``CountVectorizer`` instances are created with the same
    settings; one is fitted on ``realstring`` and the other on ``fakestring``,
    so each corpus gets its own vocabulary and its own matrix.

    Args:
        realstring: Documents of the "real" corpus, handed unchanged to
            ``CountVectorizer.fit_transform`` (which expects an iterable of
            text documents rather than one bare string).
        fakestring: Documents of the "fake" corpus, handled the same way.
        BoW_gram: N-gram size. It is used as both the lower and the upper
            bound of ``ngram_range``, so only n-grams of exactly this length
            are extracted.
        stopWords: Forwarded as ``stop_words`` to both vectorizers; ``None``
            (the default) disables stop-word filtering.

    Returns:
        A 6-tuple ``(real_vectorizer, real_word_matrix, real_vocabulary,
        fake_vectorizer, fake_word_matrix, fake_vocabulary)`` where each
        ``*_word_matrix`` is the result of ``fit_transform`` and each
        ``*_vocabulary`` is the fitted vectorizer's ``vocabulary_`` attribute.
    """
    # Both models share one configuration: word-level tokens, fixed n-gram size.
    settings = {
        'ngram_range': (BoW_gram, BoW_gram),
        'stop_words': stopWords,
        'analyzer': 'word',
    }

    real_model = CountVectorizer(**settings)
    real_counts = real_model.fit_transform(realstring)

    fake_model = CountVectorizer(**settings)
    fake_counts = fake_model.fit_transform(fakestring)

    return (
        real_model,
        real_counts,
        real_model.vocabulary_,
        fake_model,
        fake_counts,
        fake_model.vocabulary_,
    )
############################################################################################################
###########################################################################################################
def tfidf_Vectorizer(realstring,fakestring,BoW_gram,stopWords=None):
    """Fit separate TF-IDF models on the real and fake corpora.

    Two independent ``TfidfVectorizer`` instances are created with the same
    settings; one is fitted on ``realstring`` and the other on ``fakestring``,
    so each corpus gets its own vocabulary and its own weighted matrix.

    Args:
        realstring: Documents of the "real" corpus, handed unchanged to
            ``TfidfVectorizer.fit_transform`` (which expects an iterable of
            text documents rather than one bare string).
        fakestring: Documents of the "fake" corpus, handled the same way.
        BoW_gram: N-gram size. It is used as both the lower and the upper
            bound of ``ngram_range``, so only n-grams of exactly this length
            are extracted.
        stopWords: Forwarded as ``stop_words`` to both vectorizers; ``None``
            (the default) disables stop-word filtering.

    Returns:
        A 6-tuple ``(real_vectorizer, real_word_matrix, real_vocabulary,
        fake_vectorizer, fake_word_matrix, fake_vocabulary)`` where each
        ``*_word_matrix`` is the result of ``fit_transform`` and each
        ``*_vocabulary`` is the fitted vectorizer's ``vocabulary_`` attribute.
    """
    # Both models share one configuration: word-level tokens, fixed n-gram size.
    settings = {
        'ngram_range': (BoW_gram, BoW_gram),
        'stop_words': stopWords,
        'analyzer': 'word',
    }

    real_model = TfidfVectorizer(**settings)
    real_weights = real_model.fit_transform(realstring)

    fake_model = TfidfVectorizer(**settings)
    fake_weights = fake_model.fit_transform(fakestring)

    return (
        real_model,
        real_weights,
        real_model.vocabulary_,
        fake_model,
        fake_weights,
        fake_model.vocabulary_,
    )
############################################################################################################