From 0e96d6a70fac9d6d08b94c9742f9f77178f70595 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Wed, 25 Dec 2024 13:57:37 +0900 Subject: [PATCH 01/12] Add single_datamodule.py. --- .../datamodules/single_datamodule.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 aiaccel/torch/lightning/datamodules/single_datamodule.py diff --git a/aiaccel/torch/lightning/datamodules/single_datamodule.py b/aiaccel/torch/lightning/datamodules/single_datamodule.py new file mode 100644 index 00000000..08abea9d --- /dev/null +++ b/aiaccel/torch/lightning/datamodules/single_datamodule.py @@ -0,0 +1,58 @@ +from typing import Any, Callable + +from torch.utils.data import DataLoader, Dataset + +import lightning as lt + +from aiaccel.torch.datasets import scatter_dataset + + +class SingleDataModule(lt.LightningDataModule): + def __init__( + self, + train_dataset_fn: Callable[..., Dataset[str]], + val_dataset_fn: Callable[..., Dataset[str]], + batch_size: int, + num_workers: int = 10, + wrap_scatter_dataset: bool = True, + ): + super().__init__() + + self.train_dataset_fn = train_dataset_fn + self.val_dataset_fn = val_dataset_fn + + self.default_dataloader_kwargs = dict[str, Any]( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + shuffle=True, + ) + + self.wrap_scatter_dataset = wrap_scatter_dataset + + def setup(self, stage: str | None): + if stage == "fit": + if self.wrap_scatter_dataset: + self.train_dataset = scatter_dataset(self.train_dataset_fn()) + self.val_dataset = scatter_dataset(self.val_dataset_fn()) + else: + self.train_dataset = self.train_dataset_fn() + self.val_dataset = self.val_dataset_fn() + + print(f"Dataset size: {len(self.train_dataset)=}, {len(self.val_dataset)=}") + else: + raise ValueError("`stage` is not 'fit'.") + + def train_dataloader(self): + return DataLoader( + self.train_dataset, + drop_last=True, + **self.default_dataloader_kwargs, + ) + + def val_dataloader(self): + return DataLoader( + self.val_dataset, + drop_last=False, + **self.default_dataloader_kwargs, + ) From 08f363710a33be871b67bd1be6098b81e6c5b6b5 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Wed, 25 Dec 2024 13:58:20 +0900 Subject: [PATCH 02/12] Add examples/torch/MNIST . 
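Note on how this example is launched: train.sh only calls
python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd, and the train
app is expected to build the trainer, task, and datamodule sections from config.yaml. The
sketch below is a rough illustration of that flow, assuming Hydra-style instantiation; it is
not the actual implementation of aiaccel.torch.apps.train.

    # Rough sketch only; assumes hydra.utils.instantiate is used to build the config sections.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    import lightning as lt

    config = OmegaConf.load("examples/torch/MNIST/config.yaml")

    task = instantiate(config.task)              # -> mnist_resnet50.MNISTResNet50
    datamodule = instantiate(config.datamodule)  # -> SingleDataModule
    trainer = lt.Trainer(
        max_epochs=config.trainer.max_epochs,
        callbacks=[instantiate(cb) for cb in config.trainer.callbacks],
    )
    trainer.fit(task, datamodule=datamodule)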
--- examples/torch/MNIST/config.yaml | 21 ++++++++ .../torch/MNIST/mnist_dataset_function.py | 22 ++++++++ examples/torch/MNIST/mnist_resnet50.py | 50 +++++++++++++++++++ examples/torch/MNIST/train.sh | 12 +++++ 4 files changed, 105 insertions(+) create mode 100644 examples/torch/MNIST/config.yaml create mode 100644 examples/torch/MNIST/mnist_dataset_function.py create mode 100644 examples/torch/MNIST/mnist_resnet50.py create mode 100644 examples/torch/MNIST/train.sh diff --git a/examples/torch/MNIST/config.yaml b/examples/torch/MNIST/config.yaml new file mode 100644 index 00000000..002cd7e2 --- /dev/null +++ b/examples/torch/MNIST/config.yaml @@ -0,0 +1,21 @@ +trainer: + max_epochs: 10 + callbacks: + - _target_: lightning.pytorch.callbacks.ModelCheckpoint + filename: "{epoch:04d}" + save_last: True + save_top_k: -1 + +task: + _target_: mnist_resnet50.MNISTResNet50 + +datamodule: + _target_: aiaccel.torch.lightning.datamodules.single_datamodule.SingleDataModule + train_dataset_fn: + _partial_: true + _target_: mnist_dataset_function.train_dataset + val_dataset_fn: + _partial_: true + _target_: mnist_dataset_function.val_dataset + batch_size: 2 + wrap_scatter_dataset: False diff --git a/examples/torch/MNIST/mnist_dataset_function.py b/examples/torch/MNIST/mnist_dataset_function.py new file mode 100644 index 00000000..fe52f0e9 --- /dev/null +++ b/examples/torch/MNIST/mnist_dataset_function.py @@ -0,0 +1,22 @@ +import torchvision +from torchvision import transforms + + +def train_dataset(): + transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.Grayscale(num_output_channels=3), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)) + ]) + return torchvision.datasets.MNIST("./dataset", train=True, download=True, transform=transform) + + +def val_dataset(): + transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.Grayscale(num_output_channels=3), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)) + ]) + return torchvision.datasets.MNIST("./dataset", train=False, download=True, transform=transform) diff --git a/examples/torch/MNIST/mnist_resnet50.py b/examples/torch/MNIST/mnist_resnet50.py new file mode 100644 index 00000000..51dca82c --- /dev/null +++ b/examples/torch/MNIST/mnist_resnet50.py @@ -0,0 +1,50 @@ +import lightning as pl +import torch +from torch import nn +from torch.nn import functional as F +from torchvision import models + + +# モデル定義 +class MNISTResNet50(pl.LightningModule): + def __init__(self, num_classes=10): + super().__init__() + self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT) + + # 入力層を1チャネル対応に置き換え + self.model.conv1 = nn.Conv2d( + in_channels=3, # Grayscale → 3チャンネルに変換済み + out_channels=64, + kernel_size=7, + stride=2, + padding=3, + bias=False + ) + + # 出力層をMNISTの10クラス用に置き換え + self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('train_loss', loss) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('val_loss', loss) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('test_loss', loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-4) diff --git a/examples/torch/MNIST/train.sh 
b/examples/torch/MNIST/train.sh new file mode 100644 index 00000000..bf024c5d --- /dev/null +++ b/examples/torch/MNIST/train.sh @@ -0,0 +1,12 @@ +#! /bin/bash + +#$-l rt_F=1 +#$-l h_rt=1:00:00 +#$-j y +#$-cwd + +source /etc/profile.d/modules.sh +module load singularitypro +module load hpcx/2.12 + +python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd From 054d6179676ccc53bada392e9c30be4b701665c2 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Tue, 7 Jan 2025 16:40:43 +0900 Subject: [PATCH 03/12] Remove mnist_dataset_function.py and mnist_resnet50.py. --- examples/torch/MNIST/config.yaml | 54 +++++++++++++++++-- .../torch/MNIST/mnist_dataset_function.py | 22 -------- examples/torch/MNIST/mnist_resnet50.py | 50 ----------------- examples/torch/MNIST/torchvision_task.py | 33 ++++++++++++ 4 files changed, 84 insertions(+), 75 deletions(-) delete mode 100644 examples/torch/MNIST/mnist_dataset_function.py delete mode 100644 examples/torch/MNIST/mnist_resnet50.py create mode 100644 examples/torch/MNIST/torchvision_task.py diff --git a/examples/torch/MNIST/config.yaml b/examples/torch/MNIST/config.yaml index 002cd7e2..481c6d79 100644 --- a/examples/torch/MNIST/config.yaml +++ b/examples/torch/MNIST/config.yaml @@ -7,15 +7,63 @@ trainer: save_top_k: -1 task: - _target_: mnist_resnet50.MNISTResNet50 + _target_: torchvision_task.Resnet50Task + model: + _target_: torchvision.models.resnet50 + weights: + _target_: hydra.utils.get_object + path: torchvision.models.ResNet50_Weights.DEFAULT + optimizer_config: + _target_: aiaccel.torch.lightning.OptimizerConfig + optimizer_generator: + _partial_: True + _target_: torch.optim.AdamW + lr: 1.e-4 + num_classes: 10 datamodule: _target_: aiaccel.torch.lightning.datamodules.single_datamodule.SingleDataModule train_dataset_fn: _partial_: true - _target_: mnist_dataset_function.train_dataset + _target_: torchvision.datasets.MNIST + root: "./dataset" + train: True + download: True + transform: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: + - 224 + - 224 + - _target_: torchvision.transforms.Grayscale + num_output_channels: 3 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: + - 0.5 + std: + - 0.5 val_dataset_fn: _partial_: true - _target_: mnist_dataset_function.val_dataset + _target_: torchvision.datasets.MNIST + root: "./dataset" + train: False + download: True + transform: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: + - 224 + - 224 + - _target_: torchvision.transforms.Grayscale + num_output_channels: 3 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: + - 0.5 + std: + - 0.5 batch_size: 2 wrap_scatter_dataset: False diff --git a/examples/torch/MNIST/mnist_dataset_function.py b/examples/torch/MNIST/mnist_dataset_function.py deleted file mode 100644 index fe52f0e9..00000000 --- a/examples/torch/MNIST/mnist_dataset_function.py +++ /dev/null @@ -1,22 +0,0 @@ -import torchvision -from torchvision import transforms - - -def train_dataset(): - transform = transforms.Compose([ - transforms.Resize((224, 224)), - transforms.Grayscale(num_output_channels=3), - transforms.ToTensor(), - transforms.Normalize((0.5,), (0.5,)) - ]) - return torchvision.datasets.MNIST("./dataset", train=True, download=True, transform=transform) - - -def val_dataset(): - transform = transforms.Compose([ - transforms.Resize((224, 224)), - 
transforms.Grayscale(num_output_channels=3), - transforms.ToTensor(), - transforms.Normalize((0.5,), (0.5,)) - ]) - return torchvision.datasets.MNIST("./dataset", train=False, download=True, transform=transform) diff --git a/examples/torch/MNIST/mnist_resnet50.py b/examples/torch/MNIST/mnist_resnet50.py deleted file mode 100644 index 51dca82c..00000000 --- a/examples/torch/MNIST/mnist_resnet50.py +++ /dev/null @@ -1,50 +0,0 @@ -import lightning as pl -import torch -from torch import nn -from torch.nn import functional as F -from torchvision import models - - -# モデル定義 -class MNISTResNet50(pl.LightningModule): - def __init__(self, num_classes=10): - super().__init__() - self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT) - - # 入力層を1チャネル対応に置き換え - self.model.conv1 = nn.Conv2d( - in_channels=3, # Grayscale → 3チャンネルに変換済み - out_channels=64, - kernel_size=7, - stride=2, - padding=3, - bias=False - ) - - # 出力層をMNISTの10クラス用に置き換え - self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('train_loss', loss) - return loss - - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('val_loss', loss) - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('test_loss', loss) - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=1e-4) diff --git a/examples/torch/MNIST/torchvision_task.py b/examples/torch/MNIST/torchvision_task.py new file mode 100644 index 00000000..cb762178 --- /dev/null +++ b/examples/torch/MNIST/torchvision_task.py @@ -0,0 +1,33 @@ +from torch import nn +from torch.nn import functional as F + +from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule + + +class Resnet50Task(OptimizerLightningModule): + def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10): + super().__init__(optimizer_config) + self.model = model + self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('train_loss', loss) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('val_loss', loss) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.cross_entropy(logits, y) + self.log('test_loss', loss) From df599dae75d99ca87674012f245465ca8d2cdd94 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Wed, 8 Jan 2025 16:59:37 +0900 Subject: [PATCH 04/12] Fix for lint. 
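The one functional tweak here is the isinstance(..., Sized) guard around the dataset-size
print: torch.utils.data.Dataset does not declare __len__, so calling len() on it does not
type-check until the type has been narrowed. Callable is also imported from collections.abc
instead of typing, as current lint rules prefer. A minimal, standalone illustration of the
Sized pattern (the describe helper exists only for this note; it is not part of the patch):

    from collections.abc import Sized

    from torch.utils.data import Dataset


    def describe(dataset: Dataset[str]) -> str:
        # len() is only valid once the dataset is known to implement __len__.
        if isinstance(dataset, Sized):
            return f"{len(dataset)} samples"
        return "unsized (iterable-style) dataset"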
--- .../datamodules/single_datamodule.py | 17 +++++++----- examples/torch/MNIST/torchvision_task.py | 27 ++++++++++--------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/aiaccel/torch/lightning/datamodules/single_datamodule.py b/aiaccel/torch/lightning/datamodules/single_datamodule.py index 08abea9d..fedbdc4c 100644 --- a/aiaccel/torch/lightning/datamodules/single_datamodule.py +++ b/aiaccel/torch/lightning/datamodules/single_datamodule.py @@ -1,6 +1,8 @@ -from typing import Any, Callable +from typing import Any -from torch.utils.data import DataLoader, Dataset +from collections.abc import Callable, Sized + +from torch.utils.data import DataLoader, Dataset, Subset import lightning as lt @@ -30,7 +32,9 @@ def __init__( self.wrap_scatter_dataset = wrap_scatter_dataset - def setup(self, stage: str | None): + def setup(self, stage: str | None) -> None: + self.train_dataset: Dataset[str] | Subset[str] + self.val_dataset: Dataset[str] | Subset[str] if stage == "fit": if self.wrap_scatter_dataset: self.train_dataset = scatter_dataset(self.train_dataset_fn()) @@ -39,18 +43,19 @@ def setup(self, stage: str | None): self.train_dataset = self.train_dataset_fn() self.val_dataset = self.val_dataset_fn() - print(f"Dataset size: {len(self.train_dataset)=}, {len(self.val_dataset)=}") + if isinstance(self.train_dataset, Sized) and isinstance(self.val_dataset, Sized): + print(f"Dataset size: {len(self.train_dataset)=}, {len(self.val_dataset)=}") else: raise ValueError("`stage` is not 'fit'.") - def train_dataloader(self): + def train_dataloader(self) -> DataLoader[Any]: return DataLoader( self.train_dataset, drop_last=True, **self.default_dataloader_kwargs, ) - def val_dataloader(self): + def val_dataloader(self) -> DataLoader[Any]: return DataLoader( self.val_dataset, drop_last=False, diff --git a/examples/torch/MNIST/torchvision_task.py b/examples/torch/MNIST/torchvision_task.py index cb762178..1020309d 100644 --- a/examples/torch/MNIST/torchvision_task.py +++ b/examples/torch/MNIST/torchvision_task.py @@ -1,5 +1,8 @@ -from torch import nn -from torch.nn import functional as F +from typing import Any + +from torch import Tensor, nn +from torch.nn import functional as func +from torch.utils.data import DataLoader from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule @@ -10,24 +13,24 @@ def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_clas self.model = model self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) - def forward(self, x): + def forward(self, x: Any) -> Any: return self.model(x) - def training_step(self, batch, batch_idx): + def training_step(self, batch: DataLoader[Any], batch_idx: int) -> Tensor: x, y = batch logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('train_loss', loss) + loss = func.cross_entropy(logits, y) + self.log("train_loss", loss) return loss - def validation_step(self, batch, batch_idx): + def validation_step(self, batch: DataLoader[Any], batch_idx: int) -> None: x, y = batch logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('val_loss', loss) + loss = func.cross_entropy(logits, y) + self.log("val_loss", loss) - def test_step(self, batch, batch_idx): + def test_step(self, batch: DataLoader[Any], batch_idx: int) -> None: x, y = batch logits = self(x) - loss = F.cross_entropy(logits, y) - self.log('test_loss', loss) + loss = func.cross_entropy(logits, y) + self.log("test_loss", loss) From 8b634df286fb9414e77e76963ed8129984d37413 Mon Sep 17 00:00:00 2001 From: KanaiYuma 
Date: Wed, 22 Jan 2025 16:59:46 +0900 Subject: [PATCH 05/12] Fix for ABCI. --- examples/torch/MNIST/config.yaml | 10 +++++----- examples/torch/MNIST/torchvision_task.py | 19 ++++++++++++------- examples/torch/MNIST/train.sh | 20 +++++++++++++------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/examples/torch/MNIST/config.yaml b/examples/torch/MNIST/config.yaml index 481c6d79..41c45f19 100644 --- a/examples/torch/MNIST/config.yaml +++ b/examples/torch/MNIST/config.yaml @@ -34,8 +34,8 @@ datamodule: transforms: - _target_: torchvision.transforms.Resize size: - - 224 - - 224 + - 256 + - 256 - _target_: torchvision.transforms.Grayscale num_output_channels: 3 - _target_: torchvision.transforms.ToTensor @@ -55,8 +55,8 @@ datamodule: transforms: - _target_: torchvision.transforms.Resize size: - - 224 - - 224 + - 256 + - 256 - _target_: torchvision.transforms.Grayscale num_output_channels: 3 - _target_: torchvision.transforms.ToTensor @@ -65,5 +65,5 @@ datamodule: - 0.5 std: - 0.5 - batch_size: 2 + batch_size: 128 wrap_scatter_dataset: False diff --git a/examples/torch/MNIST/torchvision_task.py b/examples/torch/MNIST/torchvision_task.py index 1020309d..b8eb4844 100644 --- a/examples/torch/MNIST/torchvision_task.py +++ b/examples/torch/MNIST/torchvision_task.py @@ -5,6 +5,7 @@ from torch.utils.data import DataLoader from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule +import torchmetrics class Resnet50Task(OptimizerLightningModule): @@ -13,6 +14,10 @@ def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_clas self.model = model self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) + self.train_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) + self.val_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) + self.test_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) + def forward(self, x: Any) -> Any: return self.model(x) @@ -20,17 +25,17 @@ def training_step(self, batch: DataLoader[Any], batch_idx: int) -> Tensor: x, y = batch logits = self(x) loss = func.cross_entropy(logits, y) - self.log("train_loss", loss) + + acc = self.train_accuracy(logits, y) + self.log('train_loss', loss, prog_bar=True) + self.log('train_acc', acc, prog_bar=True) return loss def validation_step(self, batch: DataLoader[Any], batch_idx: int) -> None: x, y = batch logits = self(x) loss = func.cross_entropy(logits, y) - self.log("val_loss", loss) - def test_step(self, batch: DataLoader[Any], batch_idx: int) -> None: - x, y = batch - logits = self(x) - loss = func.cross_entropy(logits, y) - self.log("test_loss", loss) + acc = self.val_accuracy(logits, y) + self.log('val_loss', loss, prog_bar=True) + self.log('val_acc', acc, prog_bar=True) diff --git a/examples/torch/MNIST/train.sh b/examples/torch/MNIST/train.sh index bf024c5d..5f8d5a7b 100644 --- a/examples/torch/MNIST/train.sh +++ b/examples/torch/MNIST/train.sh @@ -1,12 +1,18 @@ #! 
/bin/bash -#$-l rt_F=1 -#$-l h_rt=1:00:00 -#$-j y -#$-cwd +#PBS -q rt_HF +#PBS -l select=1 +#PBS -l walltime=1:00:00 +#PBS -P grpname +#PBS -j oe + +cd ${PBS_O_WORKDIR} source /etc/profile.d/modules.sh -module load singularitypro -module load hpcx/2.12 +module load cuda/12.6/12.6.1 + +source path_to_aiaccel_env/bin/activate + +wd=path_to_working_directory -python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd +singularity exec --nv path_to_python.sif python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd From 10670be1238cfa0b1db8f9ffa677e3c1fea59a42 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Wed, 22 Jan 2025 17:01:06 +0900 Subject: [PATCH 06/12] Fix for lint. --- examples/torch/MNIST/torchvision_task.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/torch/MNIST/torchvision_task.py b/examples/torch/MNIST/torchvision_task.py index b8eb4844..49737d27 100644 --- a/examples/torch/MNIST/torchvision_task.py +++ b/examples/torch/MNIST/torchvision_task.py @@ -4,9 +4,10 @@ from torch.nn import functional as func from torch.utils.data import DataLoader -from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule import torchmetrics +from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule + class Resnet50Task(OptimizerLightningModule): def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10): @@ -14,9 +15,9 @@ def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_clas self.model = model self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) - self.train_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) - self.val_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) - self.test_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10) + self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) + self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) + self.test_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) def forward(self, x: Any) -> Any: return self.model(x) @@ -27,8 +28,8 @@ def training_step(self, batch: DataLoader[Any], batch_idx: int) -> Tensor: loss = func.cross_entropy(logits, y) acc = self.train_accuracy(logits, y) - self.log('train_loss', loss, prog_bar=True) - self.log('train_acc', acc, prog_bar=True) + self.log("train_loss", loss, prog_bar=True) + self.log("train_acc", acc, prog_bar=True) return loss def validation_step(self, batch: DataLoader[Any], batch_idx: int) -> None: @@ -37,5 +38,5 @@ def validation_step(self, batch: DataLoader[Any], batch_idx: int) -> None: loss = func.cross_entropy(logits, y) acc = self.val_accuracy(logits, y) - self.log('val_loss', loss, prog_bar=True) - self.log('val_acc', acc, prog_bar=True) + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", acc, prog_bar=True) From 6ff0d7f0b67993ce92aa4ede532b0e71da843be1 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Tue, 28 Jan 2025 10:42:36 +0900 Subject: [PATCH 07/12] Change HF to HG. --- examples/torch/MNIST/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/torch/MNIST/train.sh b/examples/torch/MNIST/train.sh index 5f8d5a7b..1d656c73 100644 --- a/examples/torch/MNIST/train.sh +++ b/examples/torch/MNIST/train.sh @@ -1,6 +1,6 @@ #! 
/bin/bash -#PBS -q rt_HF +#PBS -q rt_HG #PBS -l select=1 #PBS -l walltime=1:00:00 #PBS -P grpname From dd0f2ade77daf1ba84c69deedd89702023f4c051 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Tue, 28 Jan 2025 13:32:02 +0900 Subject: [PATCH 08/12] Add ddp config and script. --- examples/torch/MNIST/config_ddp.yaml | 72 ++++++++++++++++++++++++++++ examples/torch/MNIST/train_ddp.sh | 18 +++++++ 2 files changed, 90 insertions(+) create mode 100644 examples/torch/MNIST/config_ddp.yaml create mode 100644 examples/torch/MNIST/train_ddp.sh diff --git a/examples/torch/MNIST/config_ddp.yaml b/examples/torch/MNIST/config_ddp.yaml new file mode 100644 index 00000000..a7061c2d --- /dev/null +++ b/examples/torch/MNIST/config_ddp.yaml @@ -0,0 +1,72 @@ +trainer: + accelerator: "gpu" + devices: 8 + strategy: "ddp" + max_epochs: 10 + callbacks: + - _target_: lightning.pytorch.callbacks.ModelCheckpoint + filename: "{epoch:04d}" + save_last: True + save_top_k: -1 + +task: + _target_: torchvision_task.Resnet50Task + model: + _target_: torchvision.models.resnet50 + weights: + _target_: hydra.utils.get_object + path: torchvision.models.ResNet50_Weights.DEFAULT + optimizer_config: + _target_: aiaccel.torch.lightning.OptimizerConfig + optimizer_generator: + _partial_: True + _target_: torch.optim.AdamW + lr: 1.e-4 + num_classes: 10 + +datamodule: + _target_: aiaccel.torch.lightning.datamodules.single_datamodule.SingleDataModule + train_dataset_fn: + _partial_: true + _target_: torchvision.datasets.MNIST + root: "./dataset" + train: True + download: True + transform: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: + - 256 + - 256 + - _target_: torchvision.transforms.Grayscale + num_output_channels: 3 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: + - 0.5 + std: + - 0.5 + val_dataset_fn: + _partial_: true + _target_: torchvision.datasets.MNIST + root: "./dataset" + train: False + download: True + transform: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: + - 256 + - 256 + - _target_: torchvision.transforms.Grayscale + num_output_channels: 3 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: + - 0.5 + std: + - 0.5 + batch_size: 128 + wrap_scatter_dataset: False diff --git a/examples/torch/MNIST/train_ddp.sh b/examples/torch/MNIST/train_ddp.sh new file mode 100644 index 00000000..544cad57 --- /dev/null +++ b/examples/torch/MNIST/train_ddp.sh @@ -0,0 +1,18 @@ +#! /bin/bash + +#PBS -q rt_HF +#PBS -l select=1 +#PBS -l walltime=1:00:00 +#PBS -P grpname +#PBS -j oe + +cd ${PBS_O_WORKDIR} + +source /etc/profile.d/modules.sh +module load cuda/12.6/12.6.1 + +source path_to_aiaccel_env/bin/activate + +wd=path_to_working_directory + +singularity exec --nv path_to_python.sif python -m aiaccel.torch.apps.train $wd/config_ddp.yaml --working_directory $wd From 6e4b9e063202dce26cd9200927a1f8554855b0b7 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Tue, 28 Jan 2025 16:54:20 +0900 Subject: [PATCH 09/12] Add logger. 
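logger: True matches what Lightning's Trainer already does by default (typically a
TensorBoard logger writing under lightning_logs/ in the working directory); adding it here
just makes that behaviour explicit in both configs. If an explicit logger were wanted
instead, it could look roughly like the sketch below; the save_dir and name values are
assumptions for illustration, not part of this change.

    # Sketch of an explicit logger; values are illustrative only.
    import lightning as lt
    from lightning.pytorch.loggers import TensorBoardLogger

    logger = TensorBoardLogger(save_dir=".", name="lightning_logs")
    trainer = lt.Trainer(max_epochs=10, logger=logger)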
--- examples/torch/MNIST/config.yaml | 1 + examples/torch/MNIST/config_ddp.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/torch/MNIST/config.yaml b/examples/torch/MNIST/config.yaml index 41c45f19..811de63d 100644 --- a/examples/torch/MNIST/config.yaml +++ b/examples/torch/MNIST/config.yaml @@ -1,4 +1,5 @@ trainer: + logger: True max_epochs: 10 callbacks: - _target_: lightning.pytorch.callbacks.ModelCheckpoint diff --git a/examples/torch/MNIST/config_ddp.yaml b/examples/torch/MNIST/config_ddp.yaml index a7061c2d..b42b3798 100644 --- a/examples/torch/MNIST/config_ddp.yaml +++ b/examples/torch/MNIST/config_ddp.yaml @@ -2,6 +2,7 @@ trainer: accelerator: "gpu" devices: 8 strategy: "ddp" + logger: True max_epochs: 10 callbacks: - _target_: lightning.pytorch.callbacks.ModelCheckpoint From d2f56dd0600315447d42d0ea41ebf6129619ed20 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Mon, 3 Feb 2025 14:35:52 +0900 Subject: [PATCH 10/12] Fix for mypy. --- examples/torch/MNIST/torchvision_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/torch/MNIST/torchvision_task.py b/examples/torch/MNIST/torchvision_task.py index 49737d27..77dc5409 100644 --- a/examples/torch/MNIST/torchvision_task.py +++ b/examples/torch/MNIST/torchvision_task.py @@ -13,11 +13,11 @@ class Resnet50Task(OptimizerLightningModule): def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10): super().__init__(optimizer_config) self.model = model - self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) + if hasattr(self.model.fc, "in_features") and isinstance(self.model.fc.in_features, int): + self.model.fc = nn.Linear(self.model.fc.in_features, num_classes) self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) - self.test_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10) def forward(self, x: Any) -> Any: return self.model(x) From 50b6013a049a7a12edf50a4d284bf2044a222a7c Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Mon, 3 Feb 2025 15:32:53 +0900 Subject: [PATCH 11/12] Add docs for ABCI3.0. --- docs/source/user_guide/torch.rst | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/source/user_guide/torch.rst b/docs/source/user_guide/torch.rst index 0c488e4a..1d38a365 100644 --- a/docs/source/user_guide/torch.rst +++ b/docs/source/user_guide/torch.rst @@ -7,6 +7,47 @@ Writing a simple training script Running inference ----------------- +To run aiaccel on ABCI3.0, you need an environment with Python 3.10. +This guide explains how to set up the environment using Singularity. + +.. note:: + + For details on how to use Singularity, please refer to the following documentation: + https://docs.abci.ai/v3/en/containers/ + +Create the following Singularity definition file: + +.. code-block:: bash + :caption: aiaccel_env.def + + BootStrap: docker + + From: python:3.10 + + %post + + pip install --upgrade pip + + # aiaccel env + pip install aiaccel[torch]@git+https://github.com/aistairc/aiaccel.git@develop/v2 + + # torch/MNIST example env + pip install torchvision + + +Use the Singularity definition file to build a Singularity image file: + +.. code-block:: bash + + singularity build aiaccel.sif aiaccel_env.def + +Use the Singularity image file to execute aiaccel: + +.. 
code-block:: bash + + singularity exec --nv aiaccel.sif python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd + + Writing a DDP training script ----------------------------- From 43941f0984f56fb10cdbcc5e99df5b8f7dff2937 Mon Sep 17 00:00:00 2001 From: KanaiYuma Date: Mon, 3 Feb 2025 16:40:15 +0900 Subject: [PATCH 12/12] Fix for lint. --- docs/source/user_guide/torch.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/user_guide/torch.rst b/docs/source/user_guide/torch.rst index 1d38a365..c54c577a 100644 --- a/docs/source/user_guide/torch.rst +++ b/docs/source/user_guide/torch.rst @@ -7,8 +7,8 @@ Writing a simple training script Running inference ----------------- -To run aiaccel on ABCI3.0, you need an environment with Python 3.10. -This guide explains how to set up the environment using Singularity. +To run aiaccel on ABCI3.0, you need an environment with Python 3.10. This guide explains +how to set up the environment using Singularity. .. note:: @@ -34,7 +34,6 @@ Create the following Singularity definition file: # torch/MNIST example env pip install torchvision - Use the Singularity definition file to build a Singularity image file: .. code-block:: bash @@ -47,7 +46,6 @@ Use the Singularity image file to execute aiaccel: singularity exec --nv aiaccel.sif python -m aiaccel.torch.apps.train $wd/config.yaml --working_directory $wd - Writing a DDP training script -----------------------------
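
For reference, the trainer block of config_ddp.yaml (added in PATCH 08/12) maps onto the
usual Lightning Trainer arguments. The sketch below is only an illustration of what that
section expresses; the actual run still goes through aiaccel.torch.apps.train with
config_ddp.yaml, as in train_ddp.sh.

    # Reference sketch of the settings declared in the config_ddp.yaml trainer section.
    import lightning as lt

    trainer = lt.Trainer(
        accelerator="gpu",
        devices=8,
        strategy="ddp",
        logger=True,
        max_epochs=10,
    )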