-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdivide_file.py
executable file
·71 lines (57 loc) · 1.63 KB
/
divide_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#! /usr/bin/python
import glob
import os
import sys

import numpy as np
# Directory that is globbed for the per-author input files and that also
# receives the guten_train/dev/test output files.
totFileDir = 'clean_books_div/'

# Fraction of each book's lines sent to the train and dev splits.  The test
# split is simply the remainder (1 - trainSize - devSize), so it needs no
# constant of its own.
trainSize = 0.8
devSize = 0.1

# Output format selector: 'gen', 'gen_auth', or 'class'.
model = 'class'

# Glob pattern (relative to totFileDir) selecting the input files.
globSearch = '*_all.txt'

# Author name -> integer class label; labels follow the listing order below.
authorLabelDict = {
    author: index
    for index, author in enumerate((
        "Charles Darwin",
        "Edgar Allan Poe",
        "Edward Stratemeyer",
        "Jacob Abbott",
        "Lewis Carroll",
        "Mark Twain",
        "Michael Faraday",
        "Ralph Waldo Emerson",
        "Rudyard Kipling",
        "Winston Churchill",
    ))
}
def _tag_line(line, label, mode):
    """Return `line` formatted for the selected output mode.

    mode == 'class':    every whitespace-separated token is prefixed with
                        `label` (no separator), tokens are re-joined with
                        single spaces.
    mode == 'gen_auth': `label` is inserted as its own first token.
    any other mode:     the line is passed through unchanged.

    The returned string always ends with a newline so that consecutive
    writes never run two lines together.
    """
    if mode == 'class':
        return ' '.join(label + token for token in line.split()) + '\n'
    if mode == 'gen_auth':
        # The newline must be re-added here: split() discards it, and without
        # it every tagged line would be glued to the next one in the output.
        return ' '.join([label] + line.split()) + '\n'
    # A file's last line may lack a terminator; add one so it does not merge
    # with the first line of the next book.
    return line if line.endswith('\n') else line + '\n'


# Split every matching book into train/dev/test by line position and append
# the (optionally author-tagged) lines to the three shared output files.
# `with` guarantees the outputs are flushed and closed even if a book fails
# (e.g. KeyError for an author missing from authorLabelDict).
with open(totFileDir + 'guten_train.txt', 'w') as fidTrain, \
        open(totFileDir + 'guten_dev.txt', 'w') as fidDev, \
        open(totFileDir + 'guten_test.txt', 'w') as fidTest:
    # glob order depends on the file system; sort so the generated split
    # files are identical from run to run and machine to machine.
    for currBook in sorted(glob.glob(totFileDir + globSearch)):
        # The author name is the file name up to the first underscore,
        # e.g. ".../Mark Twain_all.txt" -> "Mark Twain".  basename() keeps
        # this correct however many directory levels totFileDir contains.
        currBookName = os.path.basename(currBook).split('_')[0]
        label = str(authorLabelDict[currBookName])

        with open(currBook, 'r') as fidOrig:
            allLines = fidOrig.readlines()

        numLines = len(allLines)
        # Line indices below trainCutoff go to train, those below devCutoff
        # go to dev, and everything else goes to test.
        trainCutoff = np.floor(numLines * trainSize)
        devCutoff = np.floor(numLines * (trainSize + devSize))
        # Single pre-formatted argument: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s has %d lines.' % (currBook, numLines))

        for i, line in enumerate(allLines):
            line2Write = _tag_line(line, label, model)
            if i < trainCutoff:
                fidTrain.write(line2Write)
            elif i < devCutoff:
                fidDev.write(line2Write)
            else:
                fidTest.write(line2Write)