From a99816455a42cc753ec8c3a85dfef36dc711a311 Mon Sep 17 00:00:00 2001
From: Juhan Bae
Date: Wed, 20 Mar 2024 00:35:05 -0400
Subject: [PATCH] Add self-influence computation

---
 examples/cifar/README.md                      | 15 +++++++++++-
 examples/cifar/analyze.py                     |  2 +-
 ...ataset.py => detect_mislabeled_dataset.py} | 23 ++++---------------
 3 files changed, 20 insertions(+), 20 deletions(-)
 rename examples/cifar/{detect_mislabled_dataset.py => detect_mislabeled_dataset.py} (78%)

diff --git a/examples/cifar/README.md b/examples/cifar/README.md
index a86283a..66ac654 100644
--- a/examples/cifar/README.md
+++ b/examples/cifar/README.md
@@ -26,7 +26,7 @@ python analyze.py --query_batch_size 1000 \
     --checkpoint_dir ./checkpoints \
     --factor_strategy ekfac
 ```
-You can also use `identity`, `diagonal`, and `kfac`. On A100 (80GB), it takes roughly 2 minutes to compute the
+You can also use `identity`, `diagonal`, and `kfac`. On A100 (80GB), it takes roughly 1.5 minutes to compute the
 pairwise scores (including computing EKFAC factors).
 
 ## Mislabeled Data Detection
@@ -43,3 +43,16 @@ python train.py --dataset_dir ./data \
     --num_train_epochs 25 \
     --seed 1004
 ```
+
+Then, compute self-influence scores with the following command:
+```bash
+python detect_mislabeled_dataset.py --dataset_dir ./data \
+    --corrupt_percentage 0.1 \
+    --checkpoint_dir ./checkpoints \
+    --train_batch_size 512 \
+    --eval_batch_size 1024 \
+    --learning_rate 0.4 \
+    --weight_decay 0.0001 \
+    --num_train_epochs 25 \
+    --seed 1004
+```
\ No newline at end of file
diff --git a/examples/cifar/analyze.py b/examples/cifar/analyze.py
index 6f7556f..292b102 100644
--- a/examples/cifar/analyze.py
+++ b/examples/cifar/analyze.py
@@ -17,7 +17,7 @@
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Influence analysis on UCI datasets.")
+    parser = argparse.ArgumentParser(description="Influence analysis on CIFAR-10 dataset.")
 
     parser.add_argument(
         "--corrupt_percentage",
diff --git a/examples/cifar/detect_mislabled_dataset.py b/examples/cifar/detect_mislabeled_dataset.py
similarity index 78%
rename from examples/cifar/detect_mislabled_dataset.py
rename to examples/cifar/detect_mislabeled_dataset.py
index 266ee1c..bccbd77 100644
--- a/examples/cifar/detect_mislabled_dataset.py
+++ b/examples/cifar/detect_mislabeled_dataset.py
@@ -1,7 +1,6 @@
 import argparse
 import logging
 import os
-from typing import Tuple
 
 import torch
 from arguments import FactorArguments
@@ -10,9 +9,8 @@
 from kronfluence.analyzer import Analyzer, prepare_model
 
 
-
 def parse_args():
-    parser = argparse.ArgumentParser(description="Influence analysis on UCI datasets.")
+    parser = argparse.ArgumentParser(description="Detecting mislabeled CIFAR-10 dataset.")
 
     parser.add_argument(
         "--corrupt_percentage",
@@ -27,13 +25,6 @@ def parse_args():
         help="A folder to download or load CIFAR-10 dataset.",
     )
 
-    parser.add_argument(
-        "--query_batch_size",
-        type=int,
-        default=1000,
-        help="Batch size for computing query gradients.",
-    )
-
     parser.add_argument(
         "--checkpoint_dir",
         type=str,
@@ -63,7 +54,6 @@ def main():
     train_dataset = get_cifar10_dataset(
         split="eval_train", corrupt_percentage=args.corrupt_percentage, dataset_dir=args.dataset_dir
     )
-    eval_dataset = get_cifar10_dataset(split="valid", dataset_dir=args.dataset_dir)
 
     model = construct_resnet9()
     model_name = "model"
@@ -90,18 +80,15 @@ def main():
         dataset=train_dataset,
         per_device_batch_size=None,
         factor_args=factor_args,
-        overwrite_output_dir=True,
+        overwrite_output_dir=False,
     )
 
-    analyzer.compute_pairwise_scores(
-        scores_name="pairwise",
+    analyzer.compute_self_scores(
+        scores_name="self",
         factors_name=args.factor_strategy,
-        query_dataset=eval_dataset,
-        query_indices=list(range(2000)),
         train_dataset=train_dataset,
-        per_device_query_batch_size=args.query_batch_size,
         overwrite_output_dir=True,
     )
 
-    scores = analyzer.load_pairwise_scores("pairwise")
+    scores = analyzer.load_self_scores("self")
     print(scores)
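
The self-influence scores computed by `detect_mislabeled_dataset.py` are meant to flag likely mislabeled examples: training points with unusually high self-influence are candidates for corrupted labels. The following is a minimal post-processing sketch, not part of the patch, assuming the scores have already been loaded as a single 1-D tensor aligned with the training set (e.g., from the `scores` dictionary returned above) and that the indices of the intentionally corrupted examples are known; the helper name and the toy values are hypothetical.

```python
# Hypothetical sketch: rank training examples by self-influence and measure how
# many of the known-corrupted labels are recovered among the top-scoring examples.
import torch


def detection_rate(self_scores: torch.Tensor, corrupted_indices: torch.Tensor, top_k: int) -> float:
    """Fraction of known-corrupted examples found among the top_k self-influence scores."""
    # Mislabeled examples are expected to receive large self-influence scores,
    # so take the indices of the top_k largest entries.
    top_indices = torch.topk(self_scores, k=top_k).indices
    hits = torch.isin(top_indices, corrupted_indices).sum().item()
    return hits / corrupted_indices.numel()


if __name__ == "__main__":
    # Toy stand-in values: in practice `scores` would be one self-influence score per
    # CIFAR-10 training example, and `corrupted` the indices flipped by the 10% label
    # corruption used in the README command.
    scores = torch.rand(50_000)
    corrupted = torch.randperm(50_000)[:5_000]
    print(f"Detection rate in top 10%: {detection_rate(scores, corrupted, top_k=5_000):.2%}")
```

If the self-influence ranking is informative, a large fraction of the corrupted examples should appear within the top 10% of scores, which is the comparison this sketch makes concrete.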