-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_matrices.py
114 lines (99 loc) · 3.91 KB
/
create_matrices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from io_ops import read
import os
import glob
import numpy as np
# Keys that are not numeric measures: get_all_dicts skips float conversion for
# them ("jury" is mapped to 1/0 instead) and normalize_values leaves them as-is.
non_float_fields = ["per", "ref", "jury"]
# Keys whose values get_all_dicts converts with a plain float(); every other
# numeric key is mapped to 1.0 / (1.0001 + float(value)) there.
similarities = ["timing", "cosine", "dtw"]
def get_all_dicts():
    """Load every ``*.txt`` file in ``./results`` and convert fields to numbers.

    Each file is parsed with ``io_ops.read`` (expected to return a list of
    dicts -- TODO confirm against io_ops). Every dict is converted in place:

    * keys listed in ``similarities`` become ``float(value)``;
    * any other key not in ``non_float_fields`` becomes
      ``1.0 / (1.0001 + float(value))``;
    * ``"jury"`` becomes ``1`` for ``'pass'`` and ``0`` for ``'fail'``;
    * the remaining ``non_float_fields`` keys are left untouched.

    The working directory is switched to ``results`` only while reading and is
    restored afterwards, so the function can be called more than once.

    Returns:
        list: the converted dicts of all files, in sorted file-name order.

    Raises:
        ValueError: if a ``"jury"`` value is neither ``'pass'`` nor ``'fail'``.
        OSError: if the ``results`` directory does not exist.
    """
    original_dir = os.getcwd()
    source_path = "{}/results".format(original_dir)
    os.chdir(source_path)
    all_dicts = []
    try:
        # glob returns names in arbitrary order; sort for reproducible output.
        for inputfile in sorted(glob.glob("*.txt")):
            dicts = read(inputfile)
            for dic in dicts:
                # Snapshot the items so values can be reassigned while looping
                # (also keeps the code valid on both Python 2 and 3).
                for key, value in list(dic.items()):
                    if key not in similarities and key not in non_float_fields:
                        # NOTE(review): presumably turns a distance-like score
                        # into a similarity-like one (larger input -> smaller
                        # output) -- confirm the intent of the 1.0001 offset.
                        dic[key] = 1.0 / (1.0001 + float(value))
                    elif key not in non_float_fields:
                        dic[key] = float(value)
                    elif key == 'jury':
                        if value == 'pass':
                            dic[key] = 1
                        elif value == 'fail':
                            dic[key] = 0
                        else:
                            raise ValueError("Key jury should be pass or fail.")
            all_dicts.extend(dicts)
    finally:
        # Do not leave the process stranded inside the results directory.
        os.chdir(original_dir)
    return all_dicts
def normalize_values(all_dicts):
    """Scale each numeric field by its largest absolute value over all dicts.

    Keys listed in ``non_float_fields`` are skipped. The dicts are modified in
    place and the same list is returned.

    Args:
        all_dicts: list of dicts whose non-excluded values are numbers.

    Returns:
        list: ``all_dicts`` itself, with every numeric field divided by the
        maximum absolute value seen for that key. A field whose values are
        all zero is left unchanged instead of dividing by zero.
    """
    max_dict = {}
    for dic in all_dicts:
        for key, value in dic.items():
            if key not in non_float_fields:
                # abs() is never negative, so 0.0 is a neutral starting point.
                max_dict[key] = max(max_dict.get(key, 0.0), abs(value))
    for dic in all_dicts:
        # Iterate over a snapshot of the keys because values are reassigned.
        for key in list(dic):
            if key in non_float_fields:
                continue
            largest = max_dict[key]
            if largest:
                dic[key] = dic[key] / float(largest)
            # else: every value for this key is 0; nothing to scale.
    return all_dicts
def create_train_from_all_data(normalized_dicts):
    """Turn every dict into one training sample, keeping all of them.

    The measure names are the keys of the first dict minus the entries of
    ``non_float_fields``; each sample lists the dict's values in that order.

    Args:
        normalized_dicts: non-empty list of dicts that share the same keys,
            including ``"jury"`` and every key in ``non_float_fields``.

    Returns:
        tuple: ``(inputVector, outputVector, measures)`` where ``inputVector``
        holds one list of measure values per dict, ``outputVector`` the
        matching ``"jury"`` values, and ``measures`` the measure names.
    """
    measures = list(normalized_dicts[0])
    for excluded in non_float_fields:
        measures.remove(excluded)
    print("All measures: {}".format(measures))

    # Build (features, label) pairs in a single pass over the data.
    pairs = [
        ([entry[name] for name in measures], entry['jury'])
        for entry in normalized_dicts
    ]
    inputVector = [features for features, _ in pairs]
    outputVector = [label for _, label in pairs]

    print("Total vectors: {}".format(len(inputVector)))
    return inputVector, outputVector, measures
def create_train_with_equal_pass_fail(normalized_dicts):
    """Build a training set with the same number of pass and fail samples.

    The measure names are the keys of the first dict minus ``"per"``,
    ``"ref"`` and ``"jury"``. Samples are split by their ``"jury"`` value and
    the larger class is randomly down-sampled (via ``np.random``) to the size
    of the smaller one. The random subset is drawn from the whole larger
    class, and every sample stays a plain list.

    Args:
        normalized_dicts: non-empty list of dicts that share the same keys,
            including ``"per"``, ``"ref"`` and ``"jury"`` (1 = pass,
            0 = fail).

    Returns:
        tuple: ``(inputVector, outputVector, measures)``; pass samples come
        first, followed by fail samples, with ``outputVector`` holding the
        matching ``"jury"`` values.

    Raises:
        ValueError: if a ``"jury"`` value is neither 1 nor 0.
    """
    measures = list(normalized_dicts[0].keys())
    # Same keys as the module-level non_float_fields list.
    non_float_fields = ["per", "ref", "jury"]
    for field in non_float_fields:
        measures.remove(field)
    print("All measures: {}".format(measures))

    passInputVector = []
    passOutputVector = []
    failInputVector = []
    failOutputVector = []
    for dic in normalized_dicts:
        sample = [dic[measure] for measure in measures]
        if dic['jury'] == 1:
            passInputVector.append(sample)
            passOutputVector.append(dic['jury'])
        elif dic['jury'] == 0:
            failInputVector.append(sample)
            failOutputVector.append(dic['jury'])
        else:
            raise ValueError("Key jury should be pass or fail.")
    print("Total passes: {}".format(len(passInputVector)))
    print("Total fails: {}".format(len(failInputVector)))

    n_pass = len(passInputVector)
    n_fail = len(failInputVector)
    if n_pass > n_fail:
        # Permute the indices of the WHOLE majority class, then keep as many
        # as the minority class has, so every sample can be selected.
        keep = np.random.permutation(n_pass)[:n_fail]
        passInputVector = [passInputVector[i] for i in keep]
        passOutputVector = passOutputVector[:n_fail]
    elif n_pass < n_fail:
        keep = np.random.permutation(n_fail)[:n_pass]
        failInputVector = [failInputVector[i] for i in keep]
        failOutputVector = failOutputVector[:n_pass]

    inputVector = passInputVector + failInputVector
    outputVector = passOutputVector + failOutputVector
    return inputVector, outputVector, measures
def get_data(method):
    """Load, normalize and vectorize the result files.

    Args:
        method: ``'all'`` to use every sample
            (``create_train_from_all_data``) or ``'equal'`` to balance pass
            and fail samples (``create_train_with_equal_pass_fail``).

    Returns:
        tuple: ``(inputVector, outputVector, measures)`` as returned by the
        selected builder.

    Raises:
        ValueError: if ``method`` is neither ``'all'`` nor ``'equal'``.
    """
    # Validate before doing any file I/O; previously an unknown method only
    # failed at the end with an UnboundLocalError on ``vectors``.
    if method not in ('all', 'equal'):
        raise ValueError(
            "Unknown method {!r}; expected 'all' or 'equal'.".format(method))
    all_dicts = get_all_dicts()
    normalized_dicts = normalize_values(all_dicts)
    if method == 'all':
        vectors = create_train_from_all_data(normalized_dicts)
    else:
        vectors = create_train_with_equal_pass_fail(normalized_dicts)
    return vectors