Skip to content

Commit

Permalink
improved dataset for ML
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaron committed Aug 9, 2024
1 parent 096ccba commit 4195820
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 18 deletions.
59 changes: 44 additions & 15 deletions roboquant/ml/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


class FeatureStrategy(Strategy):
"""Abstract base class for strategies wanting to use features
"""Abstract base class for strategies wanting to use event features
for their input.
"""

Expand All @@ -45,26 +45,55 @@ def predict(self, x: NDArray, time: datetime) -> Signal | None: ...


class SequenceDataset(Dataset):
    """Dataset that creates an input sequence and a target sequence, useful for recurrent networks.

    Every sample is a pair of windows taken from two equally long arrays. The target
    window always lies after the input window (prediction) and there can optionally
    be a gap between the two:

    ```
    [...input...][...gap...][...target...]
    ```

    Args:
        input_data: array the input windows are sliced from (along its first axis).
        target_data: array the target windows are sliced from; it must have the
            same length as `input_data`.
        input_sequences: number of rows in each input window.
        target_sequences: number of rows in each target window.
        gap: number of rows skipped between the end of the input window and the
            start of the target window.
        transform: optional callable applied to each input window.
        target_transform: optional callable applied to each target window.
        target_squeeze: when true and `target_sequences` is 1, the first axis of
            the target is squeezed out so a single row is returned.

    Raises:
        ValueError: if `input_data` and `target_data` differ in length.
    """

    def __init__(
        self,
        input_data: NDArray,
        target_data: NDArray,
        input_sequences=20,
        target_sequences=1,
        gap=0,
        transform=None,
        target_transform=None,
        target_squeeze=True
    ):
        # Explicit raise instead of `assert`: asserts are stripped when running with -O.
        if len(input_data) != len(target_data):
            raise ValueError("input_data and target_data need to have the same length")

        self.input_data = input_data
        self.target_data = target_data
        self.input_sequences = input_sequences
        self.output_sequences = target_sequences
        self.gap = gap
        self.transform = transform
        self.target_squeeze = target_squeeze
        self.target_transform = target_transform

        if len(self) == 0:
            logger.warning("this dataset won't produce any data")

    def __len__(self):
        """Return the number of complete (input, gap, target) windows that fit in the data."""
        calc_l = len(self.target_data) - self.input_sequences - self.output_sequences - self.gap + 1
        return max(0, calc_l)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair for the window starting at `idx`.

        Negative indices count from the end. An `IndexError` is raised for an index
        outside the dataset; without it, slicing would silently return truncated or
        empty windows and plain iteration over the dataset would never terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError("SequenceDataset index out of range")

        end = idx + self.input_sequences
        features = self.input_data[idx:end]

        # The target window starts `gap` rows after the input window ends.
        start = end + self.gap
        target = self.target_data[start: start + self.output_sequences]

        if self.transform:
            features = self.transform(features)
        if self.target_transform:
            target = self.target_transform(target)

        # A target window of a single row is returned as that row itself.
        if self.output_sequences == 1 and self.target_squeeze:
            target = np.squeeze(target, 0)
        return features, target


Expand Down Expand Up @@ -111,17 +140,17 @@ def _get_dataloaders(self, x, y, prediction: int, validation_split: float, batch
# what is the border between train- and validation-data
border = round(len(y) * (1.0 - validation_split))

x_train = x[: border - prediction]
y_train = y[prediction:border]
x_train = x[:border]
y_train = y[:border]

train_dataset = SequenceDataset(x_train, y_train, self.history)
train_dataset = SequenceDataset(x_train, y_train, self.history, gap=prediction)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataloader = None
if validation_split > 0.0:
x_valid = x[border - prediction: -prediction]
x_valid = x[border:]
y_valid = y[border:]
valid_dataset = SequenceDataset(x_valid, y_valid, self.history)
valid_dataset = SequenceDataset(x_valid, y_valid, self.history, gap=prediction)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

return train_dataloader, valid_dataloader
Expand Down
18 changes: 15 additions & 3 deletions tests/unit/test_rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

from torch import nn
import torch.nn.functional as F
import numpy as np

import roboquant as rq
from roboquant.asset import Stock
from roboquant.ml.features import BarFeature, CombinedFeature, PriceFeature, SMAFeature
from roboquant.ml.strategies import RNNStrategy
from roboquant.ml.strategies import RNNStrategy, SequenceDataset
from tests.common import get_feed


Expand All @@ -26,12 +27,23 @@ def forward(self, inputs):

class TestRNN(unittest.TestCase):

def test_ds(self):
    """Check the sample count and window lengths of a SequenceDataset.

    With 100 rows, an input window of 20, a target window of 10 and a gap of 1,
    the test expects 70 samples, each with 20 input rows and 10 target rows.
    """
    features = np.ones((100, 10))
    targets = np.ones((100, 5))
    dataset = SequenceDataset(features, targets, 20, 10, 1)

    n_samples = len(dataset)
    self.assertEqual(70, n_samples)

    # Index explicitly so that __getitem__ is exercised for every valid position.
    for position in range(n_samples):
        inputs, outputs = dataset[position]
        self.assertEqual(20, len(inputs))
        self.assertEqual(10, len(outputs))

def test_lstm_model(self):
# logging.basicConfig()
# logging.getLogger("roboquant.strategies").setLevel(level=logging.INFO)
# Setup
apple = Stock("AAPL")
prediction = 10
prediction = 5
feed = get_feed()
model = _MyModel()

Expand All @@ -46,7 +58,7 @@ def test_lstm_model(self):

# Train the model with 10 years of data
tf = rq.Timeframe.fromisoformat("2010-01-01", "2020-01-01")
strategy.fit(feed, timeframe=tf, epochs=2, validation_split=0.25, prediction=prediction)
strategy.fit(feed, timeframe=tf, epochs=2, validation_split=0.50, prediction=prediction)

# Run the trained model with the last 4 years of data
tf = rq.Timeframe.fromisoformat("2020-01-01", "2024-01-01")
Expand Down

0 comments on commit 4195820

Please sign in to comment.