-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
executable file
·110 lines (91 loc) · 3.83 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
import nltk
from nltk import word_tokenize
from wordSimilarity import *
from summarization import *
from createIndex import *
import MySQLdb
def _canonical_words(plain_text):
    """Tokenise *plain_text* and replace each token by its cluster key.

    ``checkWordSimilarity`` is used as a mapping whose values are tested
    with ``in``; a token found in a value is replaced by that entry's key
    (first matching entry wins), other tokens are kept unchanged.
    NOTE(review): the exact shape of the mapping is not visible in this
    file - confirm against ``wordSimilarity``.
    """
    word_cluster = checkWordSimilarity(plain_text)
    canonical = []
    for word in word_tokenize(plain_text):
        for key, members in word_cluster.items():
            if word in members:
                # str.replace() returns a new string; the original code
                # discarded it, so no substitution ever took place.
                word = key
                break
        canonical.append(word)
    return canonical


def _store_paragraph_summaries(db, cursor, bookid, chapter_id, chapter_html,
                               tag_pattern):
    """Summarise every ``<p>...</p>`` of one chapter and insert the results."""
    # Both counters are derived from the chapter id by appending zeros and
    # then advance by one per paragraph.
    summary_id = int(bookid + str(chapter_id) + "00")
    paragraph_id = int(bookid + str(chapter_id) + "000")

    # '.' does not match a newline here, so only paragraphs that do not
    # span a line break are found.
    for paragraph in re.findall(r'<p>(.*?)</p>', chapter_html):
        print(paragraph)
        words = _canonical_words(tag_pattern.sub('', paragraph))
        paragraph_summary = paragraphsummarycreation(bookid, paragraph, words)
        print(paragraph_summary)
        print(">......>>>>>" + str(summary_id))

        # Placeholders let the driver quote the values, so a quote character
        # inside the summary can no longer break (or alter) the statement.
        # Column order follows the table definition, which is not visible here.
        cursor.execute(
            "INSERT INTO ParagraphSummary VALUES (%s, %s, %s, %s, %s)",
            (summary_id, bookid, chapter_id, paragraph_id,
             "<p>" + paragraph_summary + "</p>"),
        )
        # Commit per row so already stored paragraphs survive a later failure.
        db.commit()
        summary_id += 1
        paragraph_id += 1


def createSummary(bookid):
    """Summarise one book and store a summary for each of its paragraphs.

    The book's file path is read from ``bookDetails`` and the file is read
    as UTF-16 text. Then:

    * ``summarycreation`` is called on the whole text;
    * the text is split on ``<h4>`` and ``chaptersummarycreation`` is called
      on every piece that contains no ``<h2>...</h2>`` element;
    * for every ``<p>...</p>`` in such a piece,
      ``paragraphsummarycreation`` is called and its result is inserted
      into ``ParagraphSummary``.

    Only paragraph summaries are written here. The INSERT statements for
    the book and chapter summaries were disabled in the original code and
    are still not issued. The book and chapter summariser calls are kept.
    NOTE(review): whether they have side effects of their own cannot be
    seen from this file.

    Args:
        bookid: ``bookDetails.bookdetailID`` of the book, as a string of
            digits (an int is accepted and converted). Numeric chapter and
            paragraph ids are derived from it by appending zeros.

    Raises:
        ValueError: if ``bookDetails`` has no row for ``bookid``.
    """
    bookid = str(bookid)  # ids below are built by string concatenation
    tag_pattern = re.compile('<.*?>')  # compiled once, reused for every strip

    db = MySQLdb.connect("localhost", "root", "root", "sparks",
                         charset='utf8', use_unicode=True)
    try:
        cursor = db.cursor()
        cursor.execute(
            "SELECT bookDetails.bookPath FROM bookDetails "
            "WHERE bookDetails.bookdetailID = %s",
            (bookid,),
        )
        row = cursor.fetchone()
        if row is None:
            raise ValueError("no bookDetails row for bookdetailID " + bookid)

        with open(row[0], 'r', encoding="utf16") as book_file:
            book_html = book_file.read()

        # Book-level summary: computed, not persisted (see docstring).
        summarycreation(bookid, book_html,
                        _canonical_words(tag_pattern.sub('', book_html)))

        chapter_id = int(bookid + "00")
        for piece in re.split(r'<h4>', book_html):
            # Pieces holding an <h2> element are skipped entirely.
            if re.search(r'<h2>(.*?)</h2>', piece):
                continue

            # Chapter-level summary: computed, not persisted (see docstring).
            chaptersummarycreation(bookid, piece,
                                   _canonical_words(tag_pattern.sub('', piece)))
            print(">......>>>>>" + str(chapter_id))

            _store_paragraph_summaries(db, cursor, bookid, chapter_id, piece,
                                       tag_pattern)
            chapter_id += 1
    finally:
        # Release the connection even when a step above fails.
        db.close()
# Script entry: build the summaries for the book whose id is "5".
# NOTE(review): this call sits at module top level without an
# `if __name__ == "__main__":` guard, so it also runs whenever the module
# is imported - confirm that is intended.
createSummary("5")