Linting fix

pomonam · Mar 20, 2024 · 5b78068 · 5b78068
1 parent 426c983
commit 5b78068
Show file tree

Hide file tree

Showing 8 changed files with 145 additions and 20 deletions.
diff --git a/examples/cifar/README.md b/examples/cifar/README.md
@@ -17,7 +17,7 @@ python train.py --dataset_dir ./data \
     --seed 1004
 ```
 
-# Computing Pairwise Influence Scores
+## Computing Pairwise Influence Scores
 
 To obtain a pairwise influence scores on 2000 query data points using `ekfac`, run the following command:
 ```bash
@@ -26,8 +26,20 @@ python analyze.py --query_batch_size 1000 \
     --checkpoint_dir ./checkpoints \
     --factor_strategy ekfac
 ```
-You can also use `identity`, `diagonal`, and `kfac`. On A100 (80GB), it takes roughly 2 minutes to compute the pairwise scores.
+You can also use `identity`, `diagonal`, and `kfac`. On an A100 (80GB) GPU, it takes roughly 2 minutes to compute the
+pairwise scores (including the time to compute the EKFAC factors).
 
-# Counterfactual Evaluation
+## Mislabeled Data Detection
 
-You can check the notebook `tutorial.ipynb` for running the counterfactual evaluation.
+First, train the model with 10% of the training dataset mislabeled by running the following command:
+```bash
+python train.py --dataset_dir ./data \
+    --corrupt_percentage 0.1 \
+    --checkpoint_dir ./checkpoints \
+    --train_batch_size 512 \
+    --eval_batch_size 1024 \
+    --learning_rate 0.4 \
+    --weight_decay 0.0001 \
+    --num_train_epochs 25 \
+    --seed 1004
+```
diff --git a/examples/cifar/analyze.py b/examples/cifar/analyze.py
@@ -5,11 +5,13 @@
 
 import torch
 import torch.nn.functional as F
-from arguments import FactorArguments
+from kronfluence.arguments import FactorArguments
 from torch import nn
+
 from examples.cifar.pipeline import construct_resnet9, get_cifar10_dataset
 from kronfluence.analyzer import Analyzer, prepare_model
 from kronfluence.task import Task
+from kronfluence.utils.dataset import DataLoaderKwargs
 
 BATCH_TYPE = Tuple[torch.Tensor, torch.Tensor]
 
@@ -125,6 +127,9 @@ def main():
         cpu=False,
     )
 
+    dataloader_kwargs = DataLoaderKwargs(num_workers=4)
+    analyzer.set_dataloader_kwargs(dataloader_kwargs)
+
     factor_args = FactorArguments(strategy=args.factor_strategy)
     analyzer.fit_all_factors(
         factors_name=args.factor_strategy,

diff --git a/examples/cifar/detect_mislabled_dataset.py b/examples/cifar/detect_mislabled_dataset.py
@@ -0,0 +1,109 @@
+import argparse
+import logging
+import os
+from typing import Tuple
+
+import torch
+from kronfluence.arguments import FactorArguments
+from examples.cifar.analyze import ClassificationTask
+from examples.cifar.pipeline import construct_resnet9, get_cifar10_dataset
+from kronfluence.analyzer import Analyzer, prepare_model
+
+
+
+def parse_args():
+    """Parse command-line options for mislabeled data detection on CIFAR-10.
+
+    Returns the parsed namespace; `checkpoint_dir` is created when it does not exist.
+    """
+    parser = argparse.ArgumentParser(description="Mislabeled data detection on CIFAR-10 dataset.")
+
+    parser.add_argument(
+        "--corrupt_percentage",
+        type=float,
+        default=0.1,
+        help="Percentage of the training dataset to corrupt.",
+    )
+    parser.add_argument(
+        "--dataset_dir",
+        type=str,
+        default="./data",
+        help="A folder to download or load CIFAR-10 dataset.",
+    )
+    parser.add_argument(
+        "--query_batch_size",
+        type=int,
+        default=1000,
+        help="Batch size for computing query gradients.",
+    )
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        default="./checkpoints",
+        help="A folder containing the final checkpoint of the trained model.",
+    )
+    parser.add_argument(
+        "--factor_strategy",
+        type=str,
+        default="ekfac",
+        help="Strategy to compute preconditioning factors.",
+    )
+
+    args = parser.parse_args()
+    if args.checkpoint_dir is not None:
+        os.makedirs(args.checkpoint_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    """Load the trained checkpoint, fit influence factors, and print pairwise influence scores."""
+    args = parse_args()
+    logging.basicConfig(level=logging.INFO)
+    strategy = args.factor_strategy
+
+    train_dataset = get_cifar10_dataset(
+        split="eval_train",
+        corrupt_percentage=args.corrupt_percentage,
+        dataset_dir=args.dataset_dir,
+    )
+    eval_dataset = get_cifar10_dataset(split="valid", dataset_dir=args.dataset_dir)
+
+    # Checkpoint file name is "model[_corrupt_<percentage>].pth" inside checkpoint_dir.
+    model = construct_resnet9()
+    suffix = "" if args.corrupt_percentage is None else f"_corrupt_{args.corrupt_percentage}"
+    checkpoint_path = os.path.join(args.checkpoint_dir, f"model{suffix}.pth")
+    if not os.path.isfile(checkpoint_path):
+        raise ValueError(f"No checkpoint found at {checkpoint_path}.")
+    model.load_state_dict(torch.load(checkpoint_path))
+
+    task = ClassificationTask()
+    analyzer = Analyzer(
+        analysis_name="cifar10",
+        model=prepare_model(model, task),
+        task=task,
+        cpu=False,
+    )
+
+    analyzer.fit_all_factors(
+        factors_name=strategy,
+        dataset=train_dataset,
+        per_device_batch_size=None,
+        factor_args=FactorArguments(strategy=strategy),
+        overwrite_output_dir=True,
+    )
+    # Queries are restricted to indices 0..1999 of the evaluation split.
+    analyzer.compute_pairwise_scores(
+        scores_name="pairwise",
+        factors_name=strategy,
+        query_dataset=eval_dataset,
+        query_indices=list(range(2000)),
+        train_dataset=train_dataset,
+        per_device_query_batch_size=args.query_batch_size,
+        overwrite_output_dir=True,
+    )
+    print(analyzer.load_pairwise_scores("pairwise"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/uci/README.md b/examples/uci/README.md
@@ -1,7 +1,7 @@
 # UCI Regression Example
 
 This directory contains scripts designed for training a regression model and conducting influence analysis with 
-datasets from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/datasets). Install all necessary packages:
+datasets from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/datasets). Please begin by installing the necessary packages:
 
 ```bash
 pip install -r requirements.txt
@@ -22,7 +22,7 @@ python train.py --dataset_name concrete \
     --seed 1004
 ```
 
-# Computing Pairwise Influence Scores
+## Computing Pairwise Influence Scores
 
 To obtain a pairwise influence scores using `ekfac`, run the following command:
 ```bash
@@ -33,6 +33,6 @@ python analyze.py --dataset_name concrete \
 ```
 You can also use `identity`, `diagonal`, and `kfac`.
 
-# Counterfactual Evaluation
+## Counterfactual Evaluation
 
-You can check the notebook `tutorial.ipynb` for running the counterfactual evaluation.
+You can check the notebook `tutorial.ipynb` to run the counterfactual evaluation.
diff --git a/examples/uci/analyze.py b/examples/uci/analyze.py
@@ -13,7 +13,7 @@
 from kronfluence.analyzer import Analyzer, prepare_model
 from kronfluence.task import Task
 
-BATCH_DTYPE = Tuple[torch.Tensor, torch.Tensor]
+BATCH_TYPE = Tuple[torch.Tensor, torch.Tensor]
 
 
 def parse_args():
@@ -58,7 +58,7 @@ def parse_args():
 class RegressionTask(Task):
     def compute_train_loss(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
         sample: bool = False,
     ) -> torch.Tensor:
@@ -72,7 +72,7 @@ def compute_train_loss(
 
     def compute_measurement(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
     ) -> torch.Tensor:
         # The measurement function is set as a training loss.

diff --git a/kronfluence/computer/computer.py b/kronfluence/computer/computer.py
@@ -36,7 +36,6 @@
 from kronfluence.utils.exceptions import (
     FactorsNotFoundError,
     TrackedModuleNotFoundError,
-    UnsupportableModuleError,
 )
 from kronfluence.utils.logger import PassThroughProfiler, Profiler, get_logger
 from kronfluence.utils.save import (
@@ -82,7 +81,7 @@ def __init__(
                 f"Analyzer."
             )
             self.logger.error(error_msg)
-            raise UnsupportableModuleError(error_msg)
+            raise TrackedModuleNotFoundError(error_msg)
         self.logger.info(f"Tracking modules with names: {tracked_module_names}.")
 
         if self.state.use_distributed and not isinstance(model, (DDP, FSDP)):

diff --git a/tests/gpu_tests/ddp_variation_test.py b/tests/gpu_tests/ddp_variation_test.py
@@ -14,7 +14,7 @@
 from kronfluence.arguments import FactorArguments, ScoreArguments
 from kronfluence.task import Task
 from tests.gpu_tests.ddp_test import OLD_FACTOR_NAME
-from tests.gpu_tests.pipeline import BATCH_DTYPE, construct_test_mlp, get_mnist_dataset
+from tests.gpu_tests.pipeline import BATCH_TYPE, construct_test_mlp, get_mnist_dataset
 
 LOCAL_RANK = int(os.environ["LOCAL_RANK"])
 WORLD_RANK = int(os.environ["RANK"])
@@ -27,7 +27,7 @@
 class GpuVariationTask(Task):
     def compute_train_loss(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
         sample: bool = False,
     ) -> torch.Tensor:
@@ -45,7 +45,7 @@ def compute_train_loss(
 
     def compute_measurement(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
     ) -> torch.Tensor:
         inputs, labels = batch

diff --git a/tests/gpu_tests/pipeline.py b/tests/gpu_tests/pipeline.py
@@ -10,13 +10,13 @@
 
 from kronfluence.task import Task
 
-BATCH_DTYPE = Tuple[torch.Tensor, torch.Tensor]
+BATCH_TYPE = Tuple[torch.Tensor, torch.Tensor]
 
 
 class GpuTestTask(Task):
     def compute_train_loss(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
         sample: bool = False,
     ) -> torch.Tensor:
@@ -34,7 +34,7 @@ def compute_train_loss(
 
     def compute_measurement(
         self,
-        batch: BATCH_DTYPE,
+        batch: BATCH_TYPE,
         model: nn.Module,
     ) -> torch.Tensor:
         inputs, labels = batch