diff --git a/example_config.toml b/example_config.toml
deleted file mode 100644
index ffc332f..0000000
--- a/example_config.toml
+++ /dev/null
@@ -1,93 +0,0 @@
-[general]
-use_gpu = true
-
-# Destination of log messages
-# 'stderr' and 'stdout' specify the console.
-log_destination = "stderr"
-# A path name specifies a file e.g.
-# log = "fibad_log.txt"
-
-# Lowest log level to emit.
-# As you go down the list, fibad will become more verbose in the log.
-#
-# log_level = "critical" # Only emit the most severe of errors
-# log_level = "error" # Emit all errors
-# log_level = "warning" # Emit warnings and all errors
-log_level = "info" # Emit informational messages, warnings and all errors
-# log_level = "debug" # Very verbose, emit all log messages.
-
-data_dir = "/home/drew/code/fibad/data/"
-results_dir = "./results" # Results get named - under this directory
-
-[download]
-sw = "22asec"
-sh = "22asec"
-filter = ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"]
-type = "coadd"
-rerun = "pdr3_wide"
-username = false
-password = false
-num_sources = -1 # Values below 1 here indicate all sources in the catalog will be downloaded
-offset = 0
-concurrent_connections = 4
-stats_print_interval = 60
-fits_file = "./catalog.fits"
-
-# These control the downloader's HTTP requests and retries
-# `retry_wait` How long to wait before retrying a failed HTTP request in seconds. Default 30s
-retry_wait = 30
-# `retries` How many times to retry a failed HTTP request before moving on to the next one. Default 3 times
-retries = 3
-# `timepout` How long should we wait to get a full HTTP response from the server. Default 3600s (1hr)
-timeout = 3600
-# `chunksize` How many sky location rectangles should we request in a single request. Default is 990
-chunk_size = 990
-
-# Whether to retrieve the image layer
-image = true
-# Whether to retrieve the variance layer
-variance = false
-# Whether to retrieve the mask layer
-mask = false
-
-[model]
-# The name of the built-in model to use or the libpath to an external model
-# e.g. "user_package.submodule.ExternalModel" or "ExampleAutoencoder"
-name = "kbmod_ml.models.resnet50.RESNET50"
-
-weights_filepath = "resnet50.pth"
-epochs = 10
-
-num_classes = 10
-
-
-[data_set]
-# Name of the built-in data loader to use or the libpath to an external data loader
-# e.g. "user_package.submodule.ExternalDataLoader" or "HSCDataLoader"
-name = "CifarDataLoader"
-
-[data_loader]
-# Pixel dimensions used to crop all images prior to loading. Will prune any images that are too small.
-#
-# If not provided by user, the default of 'false' scans the directory for the smallest dimensioned files, and
-# uses those pixel dimensions as the crop size.
-#
-#crop_to = [100,100]
-crop_to = false
-
-# Limit data loader to only particular filters when there are more in the data set.
-#
-# When not provided by the user, the number of filters will be automatically gleaned from the data set.
-# Defaults behavior is produced by the false value.
-#
-#
-#filters = ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"]
-filters = false
-
-# Default PyTorch DataLoader parameters
-batch_size = 10
-shuffle = true
-num_workers = 10
-
-[predict]
-model_weights_file = false
-batch_size = 32
diff --git a/src/kbmod_ml/data_sets/__init__.py b/src/kbmod_ml/data_sets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kbmod_ml/data_sets/kbmod_stamps.py b/src/kbmod_ml/data_sets/kbmod_stamps.py
new file mode 100644
index 0000000..65aac08
--- /dev/null
+++ b/src/kbmod_ml/data_sets/kbmod_stamps.py
@@ -0,0 +1,93 @@
+import os
+
+import numpy as np
+import torch
+from fibad.data_sets.data_set_registry import fibad_data_set
+from torch.utils.data.sampler import SubsetRandomSampler
+
+
+@fibad_data_set
+class KbmodStamps:
+    """TODO: what is the actual shape of the data that we're going to want to import?
+    My initial thought is that we'll have a single numpy array that we stitch together
+    from the two datasets (adding a column with a classification based on which set
+    they come from). We should also have the option to select which stamp type we are
+    using (mean, median, sum, and var weighted), so we can have all of those stored as
+    individual rows.
+
+    We could have an "active columns" variable, with the indices of the columns we want to grab
+    (corresponding to which coadd type we want to use), which could be reflected in the `shape`
+    function.
+    """
+
+    def __init__(self, config, split: str):
+        coadd_type_to_column = {
+            "median": 0,
+            "mean": 1,
+            "sum": 2,
+            "var_weighted": 3,
+        }
+
+        cols = []
+
+        for c in ["mean"]:
+            cols.append(coadd_type_to_column[c])
+
+        self.active_columns = np.array(cols)
+
+        data_dir = config["general"]["data_dir"]
+        true_positive_file_name = config["kbmod_ml"]["true_positive_file_name"]
+        false_positive_file_name = config["kbmod_ml"]["false_positive_file_name"]
+
+        true_data_path = os.path.join(data_dir, true_positive_file_name)
+        false_data_path = os.path.join(data_dir, false_positive_file_name)
+
+        if not os.path.isfile(true_data_path):
+            raise ValueError(f"Could not find {true_positive_file_name} in provided {data_dir}")
+        if not os.path.isfile(false_data_path):
+            raise ValueError(f"Could not find {false_positive_file_name} in provided {data_dir}")
+
+        true_positive_samples = np.load(true_data_path)
+        false_positive_samples = np.load(false_data_path)
+
+        self._labels = np.concatenate(
+            [
+                np.ones(len(true_positive_samples), dtype=np.int8),
+                np.zeros(len(false_positive_samples), dtype=np.int8),
+            ]
+        )
+        self._data = np.concatenate([true_positive_samples[:, :3, :, :], false_positive_samples])
+
+        if split != "test":
+            num_train = len(self)
+            indices = list(range(num_train))
+            split_idx = 0
+            if config["data_set"]["validate_size"]:
+                split_idx = int(np.floor(config["data_set"]["validate_size"] * num_train))
+
+            random_seed = None
+            if config["data_set"]["seed"]:
+                random_seed = config["data_set"]["seed"]
+            np.random.seed(random_seed)
+            np.random.shuffle(indices)
+
+            train_idx, valid_idx = indices[split_idx:], indices[:split_idx]
+
+            # These samplers are used by PyTorch's DataLoader to split the dataset
+            # into training and validation sets.
+            self.train_sampler = SubsetRandomSampler(train_idx)
+            self.validation_sampler = SubsetRandomSampler(valid_idx)
+
+    def shape(self):
+        """Data shape, including currently enabled columns."""
+        cols = len(self.active_columns)
+        width, height = self._data[0][0].shape
+
+        return (cols, width, height)
+
+    def __getitem__(self, idx):
+        row = self._data[idx][self.active_columns]
+        label = self._labels[idx]
+
+        return torch.tensor(row), torch.tensor(label, dtype=torch.int8)
+
+    def __len__(self):
+        return len(self._data)
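
For orientation (not part of the patch): a minimal sketch of how the two samplers defined above are typically consumed, assuming `config` is an already-parsed fibad configuration containing the `[general]`, `[data_set]`, and `[kbmod_ml]` keys read in `__init__`; fibad's internal wiring may differ.

```python
# Hypothetical usage sketch: feed KbmodStamps' samplers to PyTorch DataLoaders.
# `config` is an assumed, already-parsed fibad configuration dict.
from torch.utils.data import DataLoader

from kbmod_ml.data_sets.kbmod_stamps import KbmodStamps

dataset = KbmodStamps(config, split="train")

# Passing a sampler replaces shuffle=True and restricts each loader
# to its own subset of indices.
train_loader = DataLoader(dataset, batch_size=32, sampler=dataset.train_sampler)
valid_loader = DataLoader(dataset, batch_size=32, sampler=dataset.validation_sampler)
```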
diff --git a/src/kbmod_ml/default_config.toml b/src/kbmod_ml/default_config.toml
new file mode 100644
index 0000000..996e140
--- /dev/null
+++ b/src/kbmod_ml/default_config.toml
@@ -0,0 +1,13 @@
+[data_set]
+name = "kbmod_ml.data_sets.kbmod_stamps.KbmodStamps"
+
+train_size = 0.8
+
+validate_size = 0.2
+
+[kbmod_ml]
+# The file name of the true positive samples
+true_positive_file_name = 'true_positive_stamps_full.npy'
+
+# The file name of the false positive samples
+false_positive_file_name = 'false_positive_stamps_trimmed.npy'
diff --git a/src/kbmod_ml/example_benchmarks.py b/src/kbmod_ml/example_benchmarks.py
deleted file mode 100644
index 5a77b06..0000000
--- a/src/kbmod_ml/example_benchmarks.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""An example module containing simplistic methods under benchmarking."""
-
-import random
-import time
-
-
-def runtime_computation():
-    """Runtime computation consuming between 0 and 5 seconds."""
-    time.sleep(random.uniform(0, 5))
-
-
-def memory_computation():
-    """Memory computation for a random list up to 512 samples."""
-    return [0] * random.randint(0, 512)
diff --git a/src/kbmod_ml/models/__init__.py b/src/kbmod_ml/models/__init__.py
new file mode 100644
index 0000000..e69de29
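
A small worked example of the split arithmetic that `validate_size` above drives in `KbmodStamps.__init__` (the sample count is assumed, purely for illustration). Note that `KbmodStamps` itself only reads `validate_size` and `seed` from `[data_set]`; `train_size` is presumably consumed elsewhere in fibad.

```python
# Worked example of the validation split in KbmodStamps.__init__,
# using an assumed dataset size of 1000 samples.
import numpy as np

num_train = 1000      # assumed, for illustration only
validate_size = 0.2   # from [data_set] in default_config.toml
split_idx = int(np.floor(validate_size * num_train))  # -> 200

indices = list(range(num_train))
np.random.shuffle(indices)

# The first 200 shuffled indices become validation, the remaining 800 training.
train_idx, valid_idx = indices[split_idx:], indices[:split_idx]
assert len(valid_idx) == 200 and len(train_idx) == 800
```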
diff --git a/src/kbmod_ml/models/cnn.py b/src/kbmod_ml/models/cnn.py
index 9d6d39e..15641bd 100644
--- a/src/kbmod_ml/models/cnn.py
+++ b/src/kbmod_ml/models/cnn.py
@@ -7,7 +7,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F # noqa N812
-import torch.optim as optim
 from fibad.models.model_registry import fibad_model
 
 logger = logging.getLogger(__name__)
@@ -16,29 +15,22 @@
 @fibad_model
 class CNN(nn.Module):
     def __init__(self, config, shape):
-        logger.info("This is an external model, not in FIBAD!!!")
         super().__init__()
-        self.conv1 = nn.Conv2d(3, 6, 5)
-        self.pool = nn.MaxPool2d(2, 2)
-        self.conv2 = nn.Conv2d(6, 16, 5)
-        self.fc1 = nn.Linear(16 * 5 * 5, 120)
-        self.fc2 = nn.Linear(120, 84)
-        self.fc3 = nn.Linear(84, 10)
+        self.conv1 = nn.Conv2d(shape[0], 16, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
+        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
+        self.fc1 = nn.Linear(64 * shape[1] * shape[2], 1)
 
         self.config = config
 
-        # Optimizer and criterion could be set directly, i.e. `self.optimizer = optim.SGD(...)`
-        # but we define them as methods as a way to allow for more flexibility in the future.
-        self.optimizer = self._optimizer()
-        self.criterion = self._criterion()
-
     def forward(self, x):
-        x = self.pool(F.relu(self.conv1(x)))
-        x = self.pool(F.relu(self.conv2(x)))
+        if isinstance(x, tuple):
+            x, _ = x
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
         x = torch.flatten(x, 1)
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        x = self.fc3(x)
+        x = torch.sigmoid(self.fc1(x))
         return x
 
     def train_step(self, batch):
@@ -59,16 +51,10 @@
         self.optimizer.zero_grad()
 
         outputs = self(inputs)
-        loss = self.criterion(outputs, labels)
+        loss = self.criterion(outputs, labels.unsqueeze(1).float())
         loss.backward()
         self.optimizer.step()
         return {"loss": loss.item()}
 
     def _criterion(self):
-        return nn.CrossEntropyLoss()
-
-    def _optimizer(self):
-        return optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
-
-    def save(self):
-        torch.save(self.state_dict(), self.config.get("weights_filepath", "example_cnn.pth"))
+        return nn.BCELoss()
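
A quick shape sanity check for the reworked CNN (a sketch, not part of the patch; it assumes the `fibad_model` decorator leaves the `CNN(config, shape)` constructor callable as written and that a plain dict suffices for `config`). The shape `(1, 21, 21)` mirrors what `KbmodStamps.shape()` reports with only the "mean" coadd column active:

```python
# Sanity-check input/output shapes of the CNN defined above.
import torch

from kbmod_ml.models.cnn import CNN

model = CNN(config={}, shape=(1, 21, 21))  # config assumed to be a plain dict
batch = torch.randn(4, 1, 21, 21)  # batch of four single-channel 21x21 stamps

probs = model(batch)   # sigmoid output, one probability per stamp
print(probs.shape)     # torch.Size([4, 1]); values lie in [0, 1]
```

Since `forward` already applies `torch.sigmoid`, pairing it with `nn.BCELoss` in `_criterion` is consistent; the usual alternative is returning raw logits and using `nn.BCEWithLogitsLoss`.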
diff --git a/src/kbmod_ml/models/resnet50.py b/src/kbmod_ml/models/resnet50.py
index 83ff146..cab59a9 100644
--- a/src/kbmod_ml/models/resnet50.py
+++ b/src/kbmod_ml/models/resnet50.py
@@ -1,13 +1,8 @@
 # ruff: noqa: D101, D102
-# This example model is taken from the PyTorch CIFAR10 tutorial:
-# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#define-a-convolutional-neural-network
 
 import logging
 
-import torch
 import torch.nn as nn
-import torch.nn.functional as F # noqa N812
-import torch.optim as optim
 from fibad.models.model_registry import fibad_model
 from torchvision.models import resnet50
 
@@ -16,20 +11,19 @@
 @fibad_model
 class RESNET50(nn.Module):
-    def __init__(self, model_config, shape):
-        logger.info("This is an external model, not in FIBAD!!!")
+    def __init__(self, config, shape):
         super().__init__()
 
-        self.config = model_config
+        self.config = config
 
-        self.model = resnet50(pretrained=False, num_classes=self.config["model"]["num_classes"])
+        self.model = resnet50(num_classes=2)
 
-        # Optimizer and criterion could be set directly, i.e. `self.optimizer = optim.SGD(...)`
-        # but we define them as methods as a way to allow for more flexibility in the future.
-        self.optimizer = self._optimizer()
-        self.criterion = self._criterion()
+        # Modify the number of input channels to match the data (e.g., 1 for grayscale images)
+        self.model = self.modify_resnet_input_channels(self.model, num_channels=shape[0])
 
     def forward(self, x):
+        if isinstance(x, tuple):
+            x, _ = x
         return self.model(x)
 
     def train_step(self, batch):
@@ -55,11 +49,21 @@
         self.optimizer.step()
         return {"loss": loss.item()}
 
-    def _criterion(self):
-        return nn.CrossEntropyLoss()
+    def modify_resnet_input_channels(self, model, num_channels):
+        # Get the first convolutional layer
+        first_conv_layer = model.conv1
 
-    def _optimizer(self):
-        return optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
+        # Create a new convolutional layer with the desired number of input channels
+        new_conv_layer = nn.Conv2d(
+            in_channels=num_channels,
+            out_channels=first_conv_layer.out_channels,
+            kernel_size=first_conv_layer.kernel_size,
+            stride=first_conv_layer.stride,
+            padding=first_conv_layer.padding,
+            bias=first_conv_layer.bias is not None,
+        )
 
-    def save(self):
-        torch.save(self.state_dict(), self.config.get("weights_filepath"))
+        # Replace the first convolutional layer in the model
+        model.conv1 = new_conv_layer
+
+        return model
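
The channel surgery in `modify_resnet_input_channels` can be exercised directly against torchvision to confirm that the replaced `conv1` accepts single-channel stamps (an illustrative sketch; `RESNET50` above performs the equivalent substitution internally):

```python
# Rebuild torchvision resnet50's first conv for 1-channel input and run a
# batch of 21x21 stamps through it; batch and stamp sizes are illustrative.
import torch
import torch.nn as nn
from torchvision.models import resnet50

model = resnet50(num_classes=2)
old = model.conv1
model.conv1 = nn.Conv2d(
    in_channels=1,
    out_channels=old.out_channels,
    kernel_size=old.kernel_size,
    stride=old.stride,
    padding=old.padding,
    bias=old.bias is not None,  # resnet's conv1 has no bias
)

out = model(torch.randn(4, 1, 21, 21))
print(out.shape)  # torch.Size([4, 2])
```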
{}, + "source": [ + "## Below here isn't working yet - something is wrong with RESNET50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fibad import Fibad\n", + "\n", + "fibad_instance = Fibad(config_file=\"./user_config.toml\")\n", + "\n", + "# Change the model to the resnet50 model and attempt to train\n", + "fibad_instance.config[\"model\"][\"name\"] = \"kbmod_ml.models.resnet50.RESNET50\"\n", + "fibad_instance.config[\"train\"][\"epochs\"] = 2\n", + "fibad_instance.config[\"data_set\"][\"train_size\"] = 0.95\n", + "fibad_instance.config[\"data_set\"][\"validate_size\"] = 0.05\n", + "fibad_instance.config[\"data_set\"][\"test_size\"] = 0.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fibad_instance.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load an instance of the dataset\n", + "from kbmod_ml.data_sets.kbmod_stamps import KbmodStamps\n", + "\n", + "ds = KbmodStamps(config=fibad_instance.config, split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create an instance of the resnet50 model and attempt to pass a single data sample through it.\n", + "from kbmod_ml.models.resnet50 import RESNET50\n", + "\n", + "m = RESNET50(config=fibad_instance.config, shape=(2, 21, 21))\n", + "input_tensor = ds[0][0].unsqueeze(0)\n", + "print(input_tensor.shape)\n", + "\n", + "output_tensor = m(ds[0][0].unsqueeze(0))\n", + "print(output_tensor)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kbmod-ml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/user_config.toml b/user_config.toml new file mode 100644 index 0000000..f496e28 --- /dev/null +++ b/user_config.toml @@ -0,0 +1,14 @@ +[general] +dev_mode = true + +data_dir = "/Users/drew/code/kbmod-ml/data" + +results_dir = "./results" + +[model] +name = "kbmod_ml.models.cnn.CNN" + +weights_filepath = "resnet50.pth" + +[infer] +chromadb = false \ No newline at end of file