Merge pull request #11 from dirac-institute/stamp_dataloader
Getting Stamp dataset, CNN, and resnet50 working
Showing 10 changed files with 330 additions and 152 deletions.
New file (+93 lines): the KbmodStamps data set.
import os

import numpy as np
import torch
from fibad.data_sets.data_set_registry import fibad_data_set
from torch.utils.data.sampler import SubsetRandomSampler


@fibad_data_set
class KbmodStamps:
    """TODO: what is the actual shape of the data that we're going to want to import?
    My initial thought is that we'll have a single numpy array stitched together
    from the two datasets (adding a column with a classification based on which set
    they come from). We should also have the option to select which stamp type we
    are using (mean, median, sum, or var weighted), so all of those can be stored
    as individual rows. We could have an "active columns" variable holding the
    indices of the columns we want to grab (corresponding to which coadd type we
    want to use), which would be reflected in the `shape` function.
    """

    def __init__(self, config, split: str):
        # Map each coadd stamp type to its plane index in the stacked array.
        coadd_type_to_column = {
            "median": 0,
            "mean": 1,
            "sum": 2,
            "var_weighted": 3,
        }

        cols = []

        for c in ["mean"]:
            cols.append(coadd_type_to_column[c])

        self.active_columns = np.array(cols)

        data_dir = config["general"]["data_dir"]
        true_positive_file_name = config["kbmod_ml"]["true_positive_file_name"]
        false_positive_file_name = config["kbmod_ml"]["false_positive_file_name"]

        true_data_path = os.path.join(data_dir, true_positive_file_name)
        false_data_path = os.path.join(data_dir, false_positive_file_name)

        if not os.path.isfile(true_data_path):
            raise ValueError(f"Could not find {true_positive_file_name} in provided {data_dir}")
        if not os.path.isfile(false_data_path):
            raise ValueError(f"Could not find {false_positive_file_name} in provided {data_dir}")

        true_positive_samples = np.load(true_data_path)
        false_positive_samples = np.load(false_data_path)

        # Label true positives 1 and false positives 0, in the same order as the data.
        self._labels = np.concatenate(
            [
                np.ones(len(true_positive_samples), dtype=np.int8),
                np.zeros(len(false_positive_samples), dtype=np.int8),
            ]
        )
        # Keep only the first three coadd planes of the true-positive stamps so
        # both arrays share a shape before concatenation.
        self._data = np.concatenate([true_positive_samples[:, :3, :, :], false_positive_samples])

        if split != "test":
            num_train = len(self)
            indices = list(range(num_train))
            split_idx = 0
            if config["data_set"]["validate_size"]:
                split_idx = int(np.floor(config["data_set"]["validate_size"] * num_train))

            random_seed = None
            if config["data_set"]["seed"]:
                random_seed = config["data_set"]["seed"]
            np.random.seed(random_seed)
            np.random.shuffle(indices)

            train_idx, valid_idx = indices[split_idx:], indices[:split_idx]

            # These samplers are used by PyTorch's DataLoader to split the dataset
            # into training and validation sets.
            self.train_sampler = SubsetRandomSampler(train_idx)
            self.validation_sampler = SubsetRandomSampler(valid_idx)

    def shape(self):
        """Data shape, including currently enabled columns."""
        cols = len(self.active_columns)
        width, height = self._data[0][0].shape

        return (cols, width, height)

    def __getitem__(self, idx):
        row = self._data[idx][self.active_columns]
        label = self._labels[idx]

        return torch.tensor(row), torch.tensor(label, dtype=torch.int8)

    def __len__(self):
        return len(self._data)
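The class builds `train_sampler` and `validation_sampler` but nothing in the file consumes them, so the caller is expected to hand them to PyTorch's `DataLoader`. A minimal sketch of that wiring, with a hypothetical `config` dict mirroring the keys read above and an illustrative batch size (neither is taken from the repository):

from torch.utils.data import DataLoader

# Hypothetical config; data_dir and batch size are illustrative. The file
# names match the defaults shipped in the config file below.
config = {
    "general": {"data_dir": "./data"},
    "kbmod_ml": {
        "true_positive_file_name": "true_positive_stamps_full.npy",
        "false_positive_file_name": "false_positive_stamps_trimmed.npy",
    },
    "data_set": {"validate_size": 0.2, "seed": 42},
}

dataset = KbmodStamps(config, split="train")

# The samplers already hold disjoint shuffled index sets, so the DataLoader's
# default shuffle=False must be kept when a sampler is supplied.
train_loader = DataLoader(dataset, batch_size=32, sampler=dataset.train_sampler)
valid_loader = DataLoader(dataset, batch_size=32, sampler=dataset.validation_sampler)

for stamps, labels in train_loader:
    # stamps: (batch, len(active_columns), H, W); labels: (batch,) int8
    break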
New config file (+13 lines):
[data_set]
name = "kbmod_ml.data_sets.kbmod_stamps.KbmodStamps"

train_size = 0.8

validate_size = 0.2

[kbmod_ml]
# The file name of the true positive samples
true_positive_file_name = 'true_positive_stamps_full.npy'

# The file name of the false positive samples
false_positive_file_name = 'false_positive_stamps_trimmed.npy'
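The slice `true_positive_samples[:, :3, :, :]` in `__init__` suggests the full true-positive file carries one more coadd plane than the trimmed false-positive file. A smoke-test sketch under that assumption; the 4-vs-3 plane counts, 21x21 stamp size, sample counts, and data_dir are all guesses for illustration:

import os

import numpy as np

# Assumed shapes: (N, planes, H, W). The plane counts are inferred from the
# [:, :3, :, :] slice in KbmodStamps.__init__ and are not confirmed upstream.
data_dir = "./data"
os.makedirs(data_dir, exist_ok=True)

rng = np.random.default_rng(0)
true_stamps = rng.normal(size=(100, 4, 21, 21)).astype(np.float32)
false_stamps = rng.normal(size=(120, 3, 21, 21)).astype(np.float32)

np.save(os.path.join(data_dir, "true_positive_stamps_full.npy"), true_stamps)
np.save(os.path.join(data_dir, "false_positive_stamps_trimmed.npy"), false_stamps)

With those two files in place, constructing KbmodStamps(config, split="train") should report shape() == (1, 21, 21), since only the "mean" plane is in active_columns.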