-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_loader.py
132 lines (103 loc) · 5.21 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import numpy as np
import sys
import utils as utl
def create_dataset(stress, not_stress, path=True, reshape=True, train_test=True, balance_classes=False, oversampling_method=None):
    """Create a binary stress / not-stress dataset from data arrays or file paths.

    stress -- path or data of stress class (labelled 1)
    not_stress -- path or data of not-stress class (labelled 0)
    path -- whether the first and second arguments are paths to load with
            utl.read_data rather than the data itself (default True)
    reshape -- whether to reshape the data: 2-D input gains a trailing axis of
               size 1, 3-D input has its last two axes swapped (default True)
    train_test -- whether to split the data into train and test sets (default True)
    balance_classes -- whether to undersample the not-stress class down to the
                       number of stress samples (default False)
    oversampling_method -- initialized object exposing fit_resample, e.g. from
                           imblearn.over_sampling (RandomOverSampler, SMOTE, ADASYN);
                           applied to the training set only and ignored when
                           balance_classes is set (default None)

    Return x_tr, y_tr, x_ts, y_ts. When train_test is false, x_tr / y_tr hold
    the whole dataset and x_ts / y_ts are empty lists.
    """
    # Load the segments from disk when paths were given.
    if path:
        stress_segments = utl.read_data(stress)
        not_stress_segments = utl.read_data(not_stress)
    else:
        stress_segments = stress
        not_stress_segments = not_stress

    # Majority class undersampling: randomly drop not-stress samples so both
    # classes have the same number of samples.
    if balance_classes:
        not_stress_segments = utl.select_random_samples(not_stress_segments, stress_segments.shape[0])

    # Stack both classes; stress samples come first and are labelled 1.
    x = np.concatenate([stress_segments, not_stress_segments], axis=0)
    y = np.concatenate([
        np.ones(len(stress_segments), dtype=int),
        np.zeros(len(not_stress_segments), dtype=int),
    ], axis=0)

    # Drop the per-class references so only the combined arrays stay alive.
    del stress_segments
    del not_stress_segments

    print(f"X: {x.shape}, Y: {y.shape}")

    x_tr = x
    y_tr = y
    x_ts, y_ts = [], []

    # Split into train and test sets.
    if train_test:
        x_tr, x_ts, y_tr, y_ts = utl.split_into_train_val_test(x, y, test_split=0.3)

    # Oversample to balance the classes. Do this only for the training set,
    # and only when undersampling has not already balanced them.
    if oversampling_method is not None and not balance_classes:
        if len(x_tr.shape) == 2:
            x_tr, y_tr = oversampling_method.fit_resample(x_tr, y_tr)
        elif len(x_tr.shape) == 3:
            # fit_resample is fed 2-D input, so flatten the last two axes and
            # restore them afterwards.
            org_shape = x_tr.shape
            x_tr, y_tr = oversampling_method.fit_resample(x_tr.reshape(-1, org_shape[1] * org_shape[2]), y_tr)
            x_tr = x_tr.reshape(-1, org_shape[1], org_shape[2])

    # Reshape if instructed. The test set is an ndarray only after a split;
    # otherwise it is the empty-list placeholder and must be left alone.
    if reshape:
        has_test_array = isinstance(x_ts, np.ndarray)
        if len(x_tr.shape) == 2:
            x_tr = x_tr.reshape(-1, x_tr.shape[1], 1)
            if has_test_array:
                x_ts = x_ts.reshape(-1, x_ts.shape[1], 1)
        elif len(x_tr.shape) == 3:
            # The case for acceleration data with 3 channels.
            x_tr = x_tr.transpose([0, 2, 1])
            if has_test_array:
                x_ts = x_ts.transpose([0, 2, 1])

    return x_tr, y_tr, x_ts, y_ts
def load_wesad_data(baseline_path, amusement_path, stressed_path, combine_amusement=False, reshape=True, train_test=True):
    """Load the WESAD data and optionally reshape and split it.

    baseline_path: path to the baseline data
    amusement_path: path to the amusement data
    stressed_path: path to the stressed data
    combine_amusement: whether to combine the amusement class into the data
        (forwarded to combine_class_data); default false
    reshape: whether to add a trailing axis of size 1 to the features;
        default true
    train_test: whether to split the data into training and testing sets;
        default true

    When train_test is true, return
    (x_train, x_test, y_train, y_test, hot_y_train, hot_y_test), where the
    first four values come from utl.split_into_train_test with a 0.25 test
    split and the last two from utl.get_hot_labels.
    Otherwise return the features and labels as (X, Y).
    """
    features, labels = combine_class_data(baseline_path, amusement_path, stressed_path, combine_amusement)

    if reshape:
        second_dim = features.shape[1]
        features = features.reshape(-1, second_dim, 1)

    # Without a split there is nothing more to do.
    if not train_test:
        return features, labels

    split = utl.split_into_train_test(features, labels, test_split=0.25)
    x_train, x_test, y_train, y_test = split
    hot_train = utl.get_hot_labels(y_train)
    hot_test = utl.get_hot_labels(y_test)
    return x_train, x_test, y_train, y_test, hot_train, hot_test
def combine_class_data(baseline_path, amusement_path, stressed_path, include_amusement = False):
    """
    Read the WESAD class segments from disk and assemble them into X, Y.

    Baseline segments are labelled 0 and stressed segments are labelled 1.
    When amusement is included, its segments are appended after the other
    two and labelled 0, the same as the baseline class.

    baseline_path (string): path to baseline data,
    amusement_path (string): path to the amusement data; only read when
                             include_amusement is true,
    stressed_path (string): path to stressed data
    include_amusement (Boolean): whether to include amusement data into baseline or not.
                                 By default amusement data is not included into baseline.

    X, Y : NumPy arrays.
    """
    # Load the two classes that are always present.
    baseline = utl.read_data(baseline_path)
    stressed = utl.read_data(stressed_path)

    n_baseline = baseline.shape[0]
    n_stressed = stressed.shape[0]

    # Baseline first, then stressed, with matching 0 / 1 labels.
    X = np.concatenate([baseline, stressed], axis=0)
    Y = np.concatenate([
        np.zeros(n_baseline, dtype=int),
        np.ones(n_stressed, dtype=int),
    ])

    if not include_amusement:
        return X, Y

    # Append the amusement data, sharing the baseline label.
    amusement = utl.read_data(amusement_path)
    amusement_labels = np.zeros(amusement.shape[0], dtype=int)
    X = np.concatenate([X, amusement])
    Y = np.concatenate([Y, amusement_labels])
    return X, Y