-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathTFIDF.py
138 lines (98 loc) · 2.45 KB
/
TFIDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
stop_words = set(stopwords.words('english'))
import math
# Input locations and limits for the TF-IDF run.
CASE_INDEX_PATH = "subject_to_case.json"
# NOTE(review): machine-specific absolute path kept from the original script.
CASE_TEXT_DIR = "H:\\Downloads\\OpenSoft-Data\\OpenSoft-Data\\All_FT\\"
SUBJECT = "criminal"
# The original loop (`if cnt > 100: break` before incrementing) read 101 files;
# that count is preserved here. TODO confirm whether 100 was intended.
MAX_DOCUMENTS = 101
TOP_TERMS = 1000


def normalize_token(word):
    """Return the lowercase ASCII letters of *word*, or "" if it is rejected.

    A token containing any ASCII digit is rejected outright; every other
    non-letter character (punctuation, accents, ...) is simply dropped.
    """
    lowered = word.lower()
    if any("0" <= ch <= "9" for ch in lowered):
        return ""
    return "".join(ch for ch in lowered if "a" <= ch <= "z")


def count_terms(lines, excluded_words):
    """Count normalized terms over an iterable of text lines.

    Args:
        lines: iterable of strings (e.g. an open text file).
        excluded_words: container of terms to ignore (stop words).

    Returns:
        dict mapping each kept term to its number of occurrences.
    """
    counts = {}
    for line in lines:
        for raw in line.split():
            term = normalize_token(raw)
            if term and term not in excluded_words:
                counts[term] = counts.get(term, 0) + 1
    return counts


def max_tfidf_scores(term_counts_per_document):
    """Return, for every term, its highest TF-IDF score over all documents.

    TF is the term's share of the kept terms in a document; IDF is
    ``log(N / df)`` where N is the number of documents actually supplied
    and df the number of those documents containing the term.

    Args:
        term_counts_per_document: list of ``{term: count}`` dicts, one per
            document.

    Returns:
        dict mapping term to its maximum TF-IDF score.
    """
    # N must be the number of documents that produced the document
    # frequencies, otherwise the IDF ratio is meaningless.
    total_documents = len(term_counts_per_document)

    document_frequency = {}
    for counts in term_counts_per_document:
        for term in counts:
            document_frequency[term] = document_frequency.get(term, 0) + 1

    best = {}
    for counts in term_counts_per_document:
        total_terms = sum(counts.values())
        for term, occurrences in counts.items():
            tf = occurrences / total_terms
            idf = math.log(total_documents / document_frequency[term])
            score = tf * idf
            if term not in best or score > best[term]:
                best[term] = score
    return best


def top_terms(scores, limit=TOP_TERMS):
    """Return up to *limit* ``(score, term)`` pairs, highest score first.

    Ties are broken by term in descending order, as with a reverse sort of
    the tuples. Fewer than *limit* pairs are returned when the vocabulary
    is smaller.
    """
    ranked = sorted(((score, term) for term, score in scores.items()), reverse=True)
    return ranked[:limit]


def main(excluded_words):
    """Print the top TF-IDF terms for the configured subject's case files.

    Reads the subject-to-case index, tokenizes up to MAX_DOCUMENTS case
    files once each, and prints the subject, every file path read, and the
    top-scoring ``(score, term)`` tuples.

    Raises:
        KeyError: if SUBJECT is missing from the index.
        OSError: if the index or a case file cannot be opened.
    """
    with open(CASE_INDEX_PATH, "r") as index_file:
        data = json.load(index_file)

    print(SUBJECT)
    per_document_counts = []
    for position, case in enumerate(data[SUBJECT]):
        if position >= MAX_DOCUMENTS:
            break
        location = CASE_TEXT_DIR + str(case) + ".txt"
        print(location)
        # `with` guarantees the handle is closed even if tokenizing fails.
        with open(location, "r") as case_file:
            per_document_counts.append(count_terms(case_file, excluded_words))

    for entry in top_terms(max_tfidf_scores(per_document_counts)):
        print(entry)


if __name__ == "__main__":
    main(stop_words)