
Commit

Merge pull request #1 from pomonam/gpu_test
Add GPU tests for DDP, FSDP, and torch.compile
pomonam authored Mar 12, 2024
2 parents 05daad4 + 9b09521 commit 4031fdf
Showing 14 changed files with 293 additions and 226 deletions.
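The new test files are among the 14 changed files but are not expanded in this view. As a rough illustration of what a DDP smoke test for this library might look like — the test name, model, dataset, and Task implementation below are hypothetical, and only the `prepare_model`, `Analyzer`, `fit_all_factors`, and `compute_pairwise_scores` calls are taken from the diffs that follow:

```python
# Hypothetical sketch of a DDP smoke test; the actual tests added in this PR are in
# files not expanded here, and the Task/Analyzer signatures are assumed from the
# examples shown below.
import os

import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import TensorDataset

from kronfluence.analyzer import Analyzer, prepare_model
from kronfluence.arguments import FactorArguments
from kronfluence.task import Task


class DummyRegressionTask(Task):
    # Hypothetical Task: method signatures assumed from this repository's UCI example.
    def compute_train_loss(self, batch, model, sample=False):
        inputs, targets = batch
        return F.mse_loss(model(inputs), targets, reduction="sum")

    def compute_measurement(self, batch, model):
        return self.compute_train_loss(batch, model)


def test_ddp_smoke():
    # Single-process process group, just to exercise the DDP code path.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend, rank=0, world_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, 1)).to(device)
    dataset = TensorDataset(torch.randn(64, 8), torch.randn(64, 1))

    task = DummyRegressionTask()
    model = prepare_model(model, task)
    model = DistributedDataParallel(model, device_ids=[0] if device.type == "cuda" else None)

    analyzer = Analyzer(analysis_name="ddp_smoke", model=model, task=task)
    analyzer.fit_all_factors(
        factors_name="ekfac",
        dataset=dataset,
        factor_args=FactorArguments(strategy="ekfac"),
        per_device_batch_size=16,
        overwrite_output_dir=True,
    )
    scores = analyzer.compute_pairwise_scores(
        scores_name="pairwise",
        factors_name="ekfac",
        query_dataset=dataset,
        train_dataset=dataset,
        per_device_query_batch_size=16,
        per_device_train_batch_size=16,
        overwrite_output_dir=True,
    )
    assert scores is not None
    dist.destroy_process_group()
```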
3 changes: 2 additions & 1 deletion examples/_test_requirements.txt
@@ -1 +1,2 @@
scikit-learn
scikit-learn
jupyter
3 changes: 1 addition & 2 deletions examples/uci/README.md
@@ -4,7 +4,7 @@ This directory contains scripts designed for training a regression model and con

## Training

To initiate the training of a regression model using the Concrete dataset, execute the following command:
To train a regression model on the Concrete dataset, run the following command:
```bash
python train.py --dataset_name concrete \
--dataset_dir ./data \
@@ -16,7 +16,6 @@ python train.py --dataset_name concrete \
--num_train_epochs 20 \
--seed 1004
```
Alternatively, you can download the model checkpoint.

# Influence Analysis

124 changes: 20 additions & 104 deletions examples/uci/analyze.py
@@ -2,18 +2,16 @@
import logging
import math
import os
from typing import Dict, Tuple
from typing import Tuple

import torch
import torch.nn.functional as F
from analyzer import Analyzer, prepare_model
from arguments import FactorArguments, ScoreArguments
from module.utils import wrap_tracked_modules
from task import Task
from torch import nn
from torch.profiler import ProfilerActivity, profile, record_function

from examples.uci.pipeline import construct_regression_mlp, get_regression_dataset
from kronfluence.analyzer import Analyzer, prepare_model
from kronfluence.arguments import FactorArguments
from kronfluence.task import Task

BATCH_DTYPE = Tuple[torch.Tensor, torch.Tensor]

@@ -33,31 +31,18 @@ def parse_args():
default="./data",
help="A folder containing the UCI regression dataset.",
)

parser.add_argument(
"--factor_strategy",
type=str,
default="ekfac",
help="Strategy to compute preconditioning factors.",
)
parser.add_argument(
"--batch_size",
type=int,
default=256,
help="Batch size for compute factors and scores.",
)
parser.add_argument(
"--analysis_name",
"--checkpoint_dir",
type=str,
default="uci",
help="Name of the influence analysis.",
default="./checkpoints",
help="A path to store the final checkpoint.",
)

parser.add_argument(
"--checkpoint_dir",
"--factor_strategy",
type=str,
default="./checkpoints",
help="A path to store the final checkpoint.",
default="ekfac",
help="Strategy to compute preconditioning factors.",
)

args = parser.parse_args()
@@ -96,14 +81,12 @@ def compute_measurement(

def main():
args = parse_args()

logging.basicConfig(level=logging.INFO)

train_dataset = get_regression_dataset(data_name=args.dataset_name, split="train", data_path=args.dataset_dir)
eval_dataset = get_regression_dataset(data_name=args.dataset_name, split="valid", data_path=args.dataset_dir)
train_dataset = get_regression_dataset(data_name=args.dataset_name, split="train", dataset_dir=args.dataset_dir)
eval_dataset = get_regression_dataset(data_name=args.dataset_name, split="valid", dataset_dir=args.dataset_dir)

model = construct_regression_mlp()

checkpoint_path = os.path.join(args.checkpoint_dir, "model.pth")
if not os.path.isfile(checkpoint_path):
raise ValueError(f"No checkpoint found at {checkpoint_path}.")
@@ -113,98 +96,31 @@ def main():
model = prepare_model(model, task)

analyzer = Analyzer(
analysis_name=args.analysis_name,
analysis_name=args.dataset_name,
model=model,
task=task,
cpu=True,
)
factor_args = FactorArguments(
strategy=args.factor_strategy,
covariance_data_partition_size=5,
covariance_module_partition_size=4,
)
# with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
# with record_function("covariance"):
# analyzer.fit_covariance_matrices(
# factors_name=args.factor_strategy,
# dataset=train_dataset,
# factor_args=factor_args,
# per_device_batch_size=args.batch_size,
# overwrite_output_dir=True,
# )
#
# print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
# cov_factors = analyzer.fit_covariance_matrices(
# factors_name=args.factor_strategy,
# dataset=train_dataset,
# factor_args=factor_args,
# per_device_batch_size=args.batch_size,
# overwrite_output_dir=True,
# )
# print(cov_factors)

with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
with record_function("eigen"):
res = analyzer.perform_eigendecomposition(
factors_name=args.factor_strategy,
factor_args=factor_args,
overwrite_output_dir=True,
)
# print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))
# print(res)
res = analyzer.fit_lambda_matrices(
analyzer.fit_all_factors(
factors_name=args.factor_strategy,
dataset=train_dataset,
# factor_args=factor_args,
per_device_batch_size=None,
factor_args=factor_args,
overwrite_output_dir=True,
)
# print(res)
#
score_args = ScoreArguments(data_partition_size=2, module_partition_size=2)
analyzer.compute_pairwise_scores(
scores_name="hello",

scores = analyzer.compute_pairwise_scores(
scores_name="pairwise",
factors_name=args.factor_strategy,
query_dataset=eval_dataset,
train_dataset=train_dataset,
per_device_query_batch_size=16,
per_device_train_batch_size=8,
score_args=score_args,
per_device_query_batch_size=len(eval_dataset),
overwrite_output_dir=True,
)
# scores = analyzer.load_pairwise_scores(scores_name="hello")
# print(scores)
#
# analyzer.compute_self_scores(
# scores_name="hello",
# factors_name=args.factor_strategy,
# # query_dataset=eval_dataset,
# train_dataset=train_dataset,
# # per_device_query_batch_size=16,
# per_device_train_batch_size=8,
# overwrite_output_dir=True,
# )
# # scores = analyzer.load_self_scores(scores_name="hello")
# # print(scores)

# analyzer.fit_all_factors(
# factor_name=args.factor_strategy,
# dataset=train_dataset,
# factor_args=factor_args,
# per_device_batch_size=None,
# overwrite_output_dir=True,
# )
#
# score_name = "full_pairwise"
# analyzer.compute_pairwise_scores(
# score_name=score_name,
# query_dataset=eval_dataset,
# per_device_query_batch_size=len(eval_dataset),
# train_dataset=train_dataset,
# per_device_train_batch_size=len(train_dataset),
# )
# scores = analyzer.load_pairwise_scores(score_name=score_name)
# print(scores.shape)
logging.info(f"Scores: {scores}")


if __name__ == "__main__":
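The removed scratch code above references `analyzer.load_pairwise_scores`, which lets a later run reuse the scores that the rewritten `main()` saves instead of recomputing them. A minimal sketch of doing so — the `RegressionTask` import and the structure of the returned scores are assumptions, not confirmed by this diff:

```python
# Sketch: reload the pairwise scores saved by analyze.py instead of recomputing them.
# Assumes analyze.py already ran with --dataset_name concrete and the default "ekfac"
# strategy; RegressionTask is a hypothetical name for the Task defined in analyze.py.
import torch

from examples.uci.analyze import RegressionTask  # hypothetical import
from examples.uci.pipeline import construct_regression_mlp
from kronfluence.analyzer import Analyzer, prepare_model

model = construct_regression_mlp()
model.load_state_dict(torch.load("./checkpoints/model.pth"))
task = RegressionTask()
analyzer = Analyzer(analysis_name="concrete", model=prepare_model(model, task), task=task, cpu=True)

scores = analyzer.load_pairwise_scores(scores_name="pairwise")
# The scores are assumed to map module groups to (num_query, num_train) tensors.
for name, tensor in scores.items():
    print(name, tuple(tensor.shape), tensor[0].topk(k=5).indices.tolist())
```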
4 changes: 2 additions & 2 deletions examples/uci/pipeline.py
@@ -38,12 +38,12 @@ def get_regression_dataset(
data_name: str,
split: str,
indices: List[int] = None,
data_path: str = "data/",
dataset_dir: str = "data/",
) -> Dataset:
assert split in ["train", "eval_train", "valid"]

# Load the dataset from the `.data` file.
data = np.loadtxt(os.path.join(data_path, data_name + ".data"), delimiter=None)
data = np.loadtxt(os.path.join(dataset_dir, data_name + ".data"), delimiter=None)
data = data.astype(np.float32)

# Shuffle the dataset.
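Renaming `data_path` to `dataset_dir` keeps the pipeline keyword consistent with the `--dataset_dir` CLI flag used by train.py and analyze.py. A quick usage sketch with the renamed keyword (assuming the Concrete dataset file is already under `./data`):

```python
# Usage sketch for the renamed keyword (assumes ./data/concrete.data has been downloaded).
from examples.uci.pipeline import get_regression_dataset

train_dataset = get_regression_dataset(data_name="concrete", split="train", dataset_dir="./data")
valid_dataset = get_regression_dataset(data_name="concrete", split="valid", dataset_dir="./data")
print(len(train_dataset), len(valid_dataset))
```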
96 changes: 54 additions & 42 deletions examples/uci/train.py
@@ -5,7 +5,8 @@
import torch
import torch.nn.functional as F
from accelerate.utils import set_seed
from torch.utils.data import DataLoader
from torch import nn
from torch.utils import data
from tqdm import tqdm

from examples.uci.pipeline import construct_regression_mlp, get_regression_dataset
@@ -82,28 +83,25 @@ def parse_args():
return args


def main():
args = parse_args()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

if args.seed is not None:
set_seed(args.seed)

train_dataset = get_regression_dataset(data_name=args.dataset_name, split="train", data_path=args.dataset_dir)
train_dataloader = DataLoader(
dataset=train_dataset,
batch_size=args.train_batch_size,
def train(
dataset: data.Dataset,
batch_size: int,
num_train_epochs: int,
learning_rate: float,
weight_decay: float,
) -> nn.Module:
train_dataloader = data.DataLoader(
dataset=dataset,
batch_size=batch_size,
shuffle=True,
drop_last=True,
)

model = construct_regression_mlp()
optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

logger.info("Start training the model.")
model.train()
for epoch in range(args.num_train_epochs):
for epoch in range(num_train_epochs):
total_loss = 0
with tqdm(train_dataloader, unit="batch") as tepoch:
for batch in tepoch:
@@ -116,43 +114,57 @@ def main():
optimizer.step()
optimizer.zero_grad()
tepoch.set_postfix(loss=total_loss.item() / len(train_dataloader))
return model

logger.info("Start evaluating the model.")
model.eval()
train_eval_dataset = get_regression_dataset(
data_name=args.dataset_name, split="eval_train", data_path=args.dataset_dir
)
train_eval_dataloader = DataLoader(
dataset=train_eval_dataset,
batch_size=args.eval_batch_size,
shuffle=False,
drop_last=False,
)
eval_dataset = get_regression_dataset(data_name=args.dataset_name, split="valid", data_path=args.dataset_dir)
eval_dataloader = DataLoader(
dataset=eval_dataset,
batch_size=args.eval_batch_size,

def evaluate(model: nn.Module, dataset: data.Dataset, batch_size: int) -> float:
dataloader = data.DataLoader(
dataset=dataset,
batch_size=batch_size,
shuffle=False,
drop_last=False,
)

model.eval()
total_loss = 0
for batch in train_eval_dataloader:
for batch in dataloader:
with torch.no_grad():
inputs, targets = batch
outputs = model(inputs)
loss = F.mse_loss(outputs, targets, reduction="sum")
total_loss += loss.detach().float()
logger.info(f"Train loss {total_loss.item() / len(train_eval_dataloader.dataset)}")

total_loss = 0
for batch in eval_dataloader:
with torch.no_grad():
inputs, targets = batch
outputs = model(inputs)
loss = F.mse_loss(outputs, targets, reduction="sum")
total_loss += loss.detach().float()
logger.info(f"Evaluation loss {total_loss.item() / len(eval_dataloader.dataset)}")
return total_loss.item() / len(dataloader.dataset)


def main():
args = parse_args()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

if args.seed is not None:
set_seed(args.seed)

train_dataset = get_regression_dataset(data_name=args.dataset_name, split="train", dataset_dir=args.dataset_dir)

model = train(
dataset=train_dataset,
batch_size=args.train_batch_size,
num_train_epochs=args.num_train_epochs,
learning_rate=args.learning_rate,
weight_decay=args.weight_decay,
)

eval_train_dataset = get_regression_dataset(
data_name=args.dataset_name, split="eval_train", dataset_dir=args.dataset_dir
)
train_loss = evaluate(model=model, dataset=eval_train_dataset, batch_size=args.eval_batch_size)
logger.info(f"Train loss: {train_loss}")

eval_dataset = get_regression_dataset(data_name=args.dataset_name, split="valid", dataset_dir=args.dataset_dir)
eval_loss = evaluate(model=model, dataset=eval_dataset, batch_size=args.eval_batch_size)
logger.info(f"Evaluation loss: {eval_loss}")

if args.checkpoint_dir is not None:
torch.save(model.state_dict(), os.path.join(args.checkpoint_dir, "model.pth"))
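Splitting `main()` into `train()` and `evaluate()` makes the loops importable outside the CLI, for example from the new GPU tests or a notebook. A usage sketch under that assumption — the hyperparameter values below are placeholders, not the example's defaults:

```python
# Sketch: reuse the refactored helpers directly (hypothetical usage; hyperparameters
# are placeholders except num_train_epochs=20, which matches the README command).
from examples.uci.pipeline import get_regression_dataset
from examples.uci.train import evaluate, train

train_dataset = get_regression_dataset(data_name="concrete", split="train", dataset_dir="./data")
valid_dataset = get_regression_dataset(data_name="concrete", split="valid", dataset_dir="./data")

model = train(
    dataset=train_dataset,
    batch_size=32,
    num_train_epochs=20,
    learning_rate=0.03,
    weight_decay=1e-5,
)
print("validation loss:", evaluate(model=model, dataset=valid_dataset, batch_size=1024))
```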

