improved dataset for ML

neurallayer · Aug 9, 2024 · 4195820 · 4195820
1 parent 096ccba
commit 4195820
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 18 deletions.
diff --git a/roboquant/ml/strategies.py b/roboquant/ml/strategies.py
@@ -19,7 +19,7 @@
 
 
 class FeatureStrategy(Strategy):
-    """Abstract base class for strategies wanting to use features
+    """Abstract base class for strategies wanting to use event features
     for their input.
     """
 
@@ -45,26 +45,55 @@ def predict(self, x: NDArray, time: datetime) -> Signal | None: ...
 
 
 class SequenceDataset(Dataset):
-    """Sequence Dataset"""
+    """Dataset that creates an input sequence and an output sequence useful for recurrent networks.
+    The output sequence always comes after the input sequence (prediction), optionally
+    separated by a gap.
 
-    def __init__(self, x_data: NDArray, y_data: NDArray, sequences=20, transform=None, target_transform=None):
-        self.sequences = sequences
-        self.x_data = x_data
-        self.y_data = y_data
+    ```
+    [...input...][...gap...][...target...]
+    ```
+    """
+
+    def __init__(
+        self,
+        input_data: NDArray,
+        target_data: NDArray,
+        input_sequences=20,
+        target_sequences=1,
+        gap=0,
+        transform=None,
+        target_transform=None,
+        target_squeeze=True
+    ):
+        assert len(input_data) == len(target_data), "x_data and y_data need to have the same length"
+        self.input_data = input_data
+        self.target_data = target_data
+        self.input_sequences = input_sequences
+        self.output_sequences = target_sequences
+        self.gap = gap
         self.transform = transform
+        self.target_squeeze = target_squeeze
         self.target_transform = target_transform
 
+        if len(self) == 0:
+            logger.warning("this dataset won't produce any data")
+
     def __len__(self):
-        return len(self.y_data) - self.sequences
+        calc_l = len(self.target_data) - self.input_sequences - self.output_sequences - self.gap + 1
+        return max(0, calc_l)
 
     def __getitem__(self, idx):
-        end = idx + self.sequences
-        features = self.x_data[idx:end]
-        target = self.y_data[end - 1]
+        end = idx + self.input_sequences
+        features = self.input_data[idx:end]
+        start = end + self.gap
+        target = self.target_data[start: start + self.output_sequences]
         if self.transform:
             features = self.transform(features)
         if self.target_transform:
             target = self.target_transform(target)
+
+        if self.output_sequences == 1 and self.target_squeeze:
+            target = np.squeeze(target, 0)
         return features, target
 
 
@@ -111,17 +140,17 @@ def _get_dataloaders(self, x, y, prediction: int, validation_split: float, batch
         # what is the border between train- and validation-data
         border = round(len(y) * (1.0 - validation_split))
 
-        x_train = x[: border - prediction]
-        y_train = y[prediction:border]
+        x_train = x[:border]
+        y_train = y[:border]
 
-        train_dataset = SequenceDataset(x_train, y_train, self.history)
+        train_dataset = SequenceDataset(x_train, y_train, self.history, gap=prediction)
         train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
 
         valid_dataloader = None
         if validation_split > 0.0:
-            x_valid = x[border - prediction: -prediction]
+            x_valid = x[border:]
             y_valid = y[border:]
-            valid_dataset = SequenceDataset(x_valid, y_valid, self.history)
+            valid_dataset = SequenceDataset(x_valid, y_valid, self.history, gap=prediction)
             valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
 
         return train_dataloader, valid_dataloader

diff --git a/tests/unit/test_rnn.py b/tests/unit/test_rnn.py
@@ -2,11 +2,12 @@
 
 from torch import nn
 import torch.nn.functional as F
+import numpy as np
 
 import roboquant as rq
 from roboquant.asset import Stock
 from roboquant.ml.features import BarFeature, CombinedFeature, PriceFeature, SMAFeature
-from roboquant.ml.strategies import RNNStrategy
+from roboquant.ml.strategies import RNNStrategy, SequenceDataset
 from tests.common import get_feed
 
 
@@ -26,12 +27,23 @@ def forward(self, inputs):
 
 class TestRNN(unittest.TestCase):
 
+    def test_ds(self):
+        x = np.ones((100, 10))
+        y = np.ones((100, 5))
+        ds = SequenceDataset(x, y, 20, 10, 1)
+        size = len(ds)
+        self.assertEqual(70, size)
+        for idx in range(size):
+            a, b = ds[idx]
+            self.assertEqual(20, len(a))
+            self.assertEqual(10, len(b))
+
     def test_lstm_model(self):
         # logging.basicConfig()
         # logging.getLogger("roboquant.strategies").setLevel(level=logging.INFO)
         # Setup
         apple = Stock("AAPL")
-        prediction = 10
+        prediction = 5
         feed = get_feed()
         model = _MyModel()
 
@@ -46,7 +58,7 @@ def test_lstm_model(self):
 
         # Train the model with 10 years of data
         tf = rq.Timeframe.fromisoformat("2010-01-01", "2020-01-01")
-        strategy.fit(feed, timeframe=tf, epochs=2, validation_split=0.25, prediction=prediction)
+        strategy.fit(feed, timeframe=tf, epochs=2, validation_split=0.50, prediction=prediction)
 
         # Run the trained model with the last 4 years of data
         tf = rq.Timeframe.fromisoformat("2020-01-01", "2024-01-01")