-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdf.py
62 lines (41 loc) · 1.81 KB
/
df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import copy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class DataFrame(object):
    """Minimal column-oriented container over equally long arrays.

    Each column name in ``columns`` is paired with the array at the same
    position in ``data``. All arrays must share the same first dimension,
    which is the number of rows. Row order for positional access and for
    batching is controlled by ``self.idx``, an index array that ``shuffle``
    permutes in place; the underlying arrays are never reordered.
    """

    def __init__(self, columns, data):
        """Build a frame from parallel ``columns`` and ``data`` sequences.

        Args:
            columns: Sequence of column names, one per entry of ``data``.
            data: Sequence of array-likes exposing ``.shape[0]`` and
                supporting NumPy-style indexing.

        Raises:
            AssertionError: If ``columns`` and ``data`` differ in length, or
                the arrays do not all share the same first dimension.
        """
        # Raised explicitly (rather than via ``assert``) so the validation is
        # not stripped under ``python -O``; the exception type is kept as
        # AssertionError for backward compatibility with existing callers.
        if len(columns) != len(data):
            raise AssertionError('columns length does not match data length')

        lengths = [mat.shape[0] for mat in data]
        if len(set(lengths)) != 1:
            raise AssertionError('all matrices in data must have same first dimension')

        self.length = lengths[0]  # number of rows
        self.columns = columns
        self.data = data
        self.dict = dict(zip(self.columns, self.data))  # column name -> array
        self.idx = np.arange(self.length)  # current row ordering

    def shuffle(self):
        """Randomly permute the row ordering (``self.idx``) in place."""
        np.random.shuffle(self.idx)

    def batch_generator(self, batch_size, shuffle=True, num_epochs=10000, allow_smaller_final_batch=False):
        """Yield successive batches as new ``DataFrame`` objects.

        Args:
            batch_size: Number of rows per batch.
            shuffle: If true, reshuffle the row ordering before every epoch.
            num_epochs: Number of passes over the data.
            allow_smaller_final_batch: If false, a trailing batch with fewer
                than ``batch_size`` rows is dropped.

        Yields:
            DataFrame: A frame holding copies of the selected rows.
        """
        epoch_num = 0
        # A ``while`` comparison (not ``range``) keeps non-integer values such
        # as ``float('inf')`` usable for ``num_epochs``.
        while epoch_num < num_epochs:
            if shuffle:
                self.shuffle()

            for start in range(0, self.length, batch_size):
                batch_idx = self.idx[start:start + batch_size]
                # Only the last slice of an epoch can be short.
                if not allow_smaller_final_batch and len(batch_idx) != batch_size:
                    break
                yield DataFrame(
                    columns=copy.copy(self.columns),
                    data=[mat[batch_idx].copy() for mat in self.data],
                )

            epoch_num += 1

    def iterrows(self):
        """Yield every row once, as a ``pd.Series``, in the current ordering.

        Rows are addressed by position so that ``__getitem__`` applies
        ``self.idx`` exactly once.
        """
        for position in range(self.length):
            yield self[position]

    def mask(self, mask):
        """Return a new frame with ``mask`` applied to every column array.

        The mask indexes the underlying arrays directly (``mat[mask]``); it
        does not go through ``self.idx``.
        """
        return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data])

    def __iter__(self):
        """Iterate over ``(column_name, array)`` pairs."""
        return iter(self.dict.items())

    def __len__(self):
        """Return the number of rows."""
        return self.length

    def __getitem__(self, key):
        """Look up a column by name or a row by position.

        Args:
            key: A ``str`` column name, or an integer position (builtin
                ``int`` or NumPy integer scalar) into the current ordering.

        Returns:
            The column array for a ``str`` key, or a ``pd.Series`` mapping
            each column name to that row's value for an integer key.

        Raises:
            TypeError: If ``key`` is neither a string nor an integer.
        """
        if isinstance(key, str):
            return self.dict[key]

        # NumPy integer scalars are not ``int`` instances on Python 3, so they
        # must be accepted explicitly.
        if isinstance(key, (int, np.integer)):
            row = self.idx[key]
            return pd.Series(dict(zip(self.columns, [mat[row] for mat in self.data])))

        raise TypeError(
            'DataFrame keys must be str or int, not {}'.format(type(key).__name__)
        )