Added scaling experiments
srushti98 committed Sep 13, 2024
1 parent 1e54228 commit 6798b62
Showing 378 changed files with 7,861 additions and 0 deletions.
26 changes: 26 additions & 0 deletions scaling_experiments/Readme.md
@@ -0,0 +1,26 @@
This folder contains the scaling experiments.
It holds the results for all seeds from 1 to 14.

Each seed folder contains the .sbatch, .err, and .out files for each experiment.

We have three scripts, `finding_metrics_hid.py`, `finding_metrics_input_sizes.py`, and `finding_metrics_layers.py`, for analyzing the experiments.

Steps to analyze the outputs of these experiments:

1) `finding_metrics_input_sizes.py`
- Set the `base_directory` in the code.
- Run `python finding_metrics_input_sizes.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze (see the usage examples after this list).
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.

2) `finding_metrics_hid.py`
- Set the `base_directory` and the `layers` (hidden dims) you want to analyze in the code.
- Run `python finding_metrics_hid.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze.
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.

3) `finding_metrics_layers.py`
- Set the `base_directory` and the `layers` you want to analyze in the code.
- Run `python finding_metrics_layers.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze.
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.
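
Example invocations (the argument values here are illustrative; adjust them to the epochs, metric, and seeds of your runs):

```sh
python finding_metrics_input_sizes.py --epoch 1000 --metric test_loss --seeds 1:9
python finding_metrics_hid.py --epoch 500 --metric test_accuracy --seeds 1,3,5
```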


The outputs generated by these scripts can be fed directly to the Python notebooks for generating plots.
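
For instance, a notebook could parse the `epoch_<n>_avg` / `epoch_<n>_min` lines back into floats with a small helper along these lines (a sketch; the file name is a placeholder, and `float()` absorbs the `inf`/`-inf` placeholders the scripts write for missing runs):

```python
import re

def load_metric_lines(path, kind="avg"):
    """Parse lines like `epoch_12_avg = [0.1, inf, ...]` into {epoch: [floats]}."""
    results = {}
    pattern = re.compile(rf"epoch_(\d+)_{kind} = \[(.*)\]")
    with open(path) as f:
        for line in f:
            m = pattern.match(line.strip())
            if m:
                results[int(m.group(1))] = [float(x) for x in m.group(2).split(",")]
    return results

# Hypothetical usage; substitute the file name the script printed:
# averages = load_metric_lines("average_metrics_hidden_123456.txt", kind="avg")
# minima = load_metric_lines("average_metrics_hidden_123456.txt", kind="min")
```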
137 changes: 137 additions & 0 deletions scaling_experiments/finding_metrics_hid.py
@@ -0,0 +1,137 @@
import argparse
import os
import re
from collections import defaultdict
import logging
import numpy as np
import random
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

def parse_seeds(seeds_arg):
    if ':' in seeds_arg:
        start, end = map(int, seeds_arg.split(':'))
        return range(start, end + 1)
    else:
        return [int(seed) for seed in seeds_arg.split(',')]
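# For illustration: parse_seeds("1:3") returns range(1, 4) (seeds 1-3),
# while parse_seeds("1,5,9") returns the list [1, 5, 9].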

def extract_metric(file_path, epoch, metric):
    with open(file_path, 'r') as f:
        content = f.read()

    # Define regex patterns for each metric
    patterns = {
        'training_loss': r'epoch = {}, training loss = ([\d.]+)'.format(epoch),
        'training_accuracy': r'epoch = {}.*?training accuracy: ([\d.]+)'.format(epoch),
        'test_accuracy': r'epoch = {}.*?test accuracy = ([\d.]+)'.format(epoch),
        'test_loss': r'epoch = {}.*?test loss = ([\d.]+)'.format(epoch)
    }

    match = re.search(patterns[metric], content, re.DOTALL)
    if match:
        return float(match.group(1))
    else:
        return None
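
# The patterns above assume the .out logs contain lines of roughly this
# (assumed) shape:
#   epoch = 42, training loss = 0.123456, ... test accuracy = 0.951, test loss = 0.2
# re.DOTALL lets the lazy .*? cross line breaks, so the first value after the
# epoch marker is captured.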

def process_directories(base_dir, epoch, metric, seeds):
    values = defaultdict(list)
    missing_data = defaultdict(list)
    missing_files = defaultdict(list)
    present_files = defaultdict(list)

    expected_layers = [16, 64, 256, 1024]  # Define the expected hidden dims

    for seed in seeds:
        seed_dir = f"{base_dir}/seed_{seed}"
        if not os.path.exists(seed_dir):
            logger.warning(f"Directory not found: {seed_dir}")
            continue

        found_layers = set()

        for file in os.listdir(seed_dir):
            if file.startswith("scaling_n_c_1e-5_seed") and re.search(r"_hid_\d+\.out$", file):
                # The hidden dim is the trailing number in the file name
                parts = file.split('_')
                layers = int(parts[-1].split('.')[0])
                found_layers.add(layers)
                present_files[layers].append(seed)

                file_path = os.path.join(seed_dir, file)
                value = extract_metric(file_path, epoch, metric)

                if value is not None:
                    values[layers].append(value)
                else:
                    missing_data[layers].append(seed)

        # Check for missing layer files
        missing_layers = set(expected_layers) - found_layers
        for layer in missing_layers:
            missing_files[layer].append(seed)

    return values, missing_data, missing_files, present_files
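
# For illustration, a (hypothetical) file name
#   scaling_n_c_1e-5_seed_3_hid_256.out
# passes both filters above and is attributed to hidden dim 256, taken from
# the trailing "_hid_256" token.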

def calculate_average_values(values):
    avg_values = {}
    min_values = {}
    max_values = {}
    for layer, value_list in values.items():
        if value_list:
            avg_values[layer] = sum(value_list) / len(value_list)
            min_values[layer] = min(value_list)
            max_values[layer] = max(value_list)
        else:
            logger.warning(f"No valid values found for layer {layer}")
    return avg_values, min_values, max_values

def main():
    parser = argparse.ArgumentParser(description="Calculate average metrics across seeds for different layers.")
    parser.add_argument("--epoch", type=int, default=1000, help="Epoch number to analyze (default: 1000)")
    parser.add_argument("--metric", choices=['training_loss', 'training_accuracy', 'test_accuracy', 'test_loss'],
                        default='test_loss', help="Metric to average (default: test_loss)")
    parser.add_argument("--seeds", default="1:15", help="Seeds to process. Can be a range (e.g., '1:15') or a comma-separated list (e.g., '1,3,5,7,9')")

    args = parser.parse_args()

    ## Set your base directory containing the seed folders, e.g. seed_1, seed_2, ...
    base_directory = "/scratch/sxp8182/work_with_abu/learning_to_search/cirriculum_learning/learning_to_search/"

    ## Define the hidden dims you want to analyze. Even though the variable name
    ## says layers, it actually holds hidden dims.
    layers = [16, 64, 256, 1024]
    max_infinity = np.inf
    min_infinity = -np.inf

    # Add a random number to the output file name to avoid overwriting earlier runs
    output_file = "average_metrics_hidden_" + str(random.randint(1, 1000000)) + ".txt"
    print("saving in file:\n", output_file)

    seeds = parse_seeds(args.seeds)
    with open(output_file, 'w') as f:
        for epoch in tqdm(range(1, args.epoch + 1)):  # Loop through epochs from 1 to args.epoch
            values, missing_data, missing_files, present_files = process_directories(base_directory, epoch, args.metric, seeds)
            avg_values, min_values, max_values = calculate_average_values(values)

            for missing_file in missing_files:
                print(f"Missing files for layer {missing_file}: {missing_files[missing_file]}")
            for missing_data_no in missing_data:
                print(f"Missing data for layer {missing_data_no}: {missing_data[missing_data_no]}")

            # Store the averages and minimums, one line per epoch; layers with no
            # data fall back to inf (avg) and -inf (min), so a line looks like:
            #   epoch_1_avg = [0.693147, 0.684210, inf, 0.512345]
            f.write(f"epoch_{epoch}_avg = [{', '.join(f'{avg_values.get(l, max_infinity):.6f}' for l in layers)}]\n")
            f.write(f"epoch_{epoch}_min = [{', '.join(f'{min_values.get(l, min_infinity):.6f}' for l in layers)}]\n")

if __name__ == "__main__":
    main()
129 changes: 129 additions & 0 deletions scaling_experiments/finding_metrics_input_sizes.py
@@ -0,0 +1,129 @@
import argparse
import os
import re
from collections import defaultdict
import logging
import numpy as np
import random
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

def parse_seeds(seeds_arg):
    if ':' in seeds_arg:
        start, end = map(int, seeds_arg.split(':'))
        return range(start, end + 1)
    else:
        return [int(seed) for seed in seeds_arg.split(',')]

def extract_metric(file_path, epoch, metric):
    with open(file_path, 'r') as f:
        content = f.read()

    # Define regex patterns for each metric
    patterns = {
        'training_loss': r'epoch = {}, training loss = ([\d.]+)'.format(epoch),
        'training_accuracy': r'epoch = {}.*?training accuracy: ([\d.]+)'.format(epoch),
        'test_accuracy': r'epoch = {}.*?test accuracy = ([\d.]+)'.format(epoch),
        'test_loss': r'epoch = {}.*?test loss = ([\d.]+)'.format(epoch)
    }

    match = re.search(patterns[metric], content, re.DOTALL)
    if match:
        return float(match.group(1))
    else:
        # logger.warning(f"Epoch {epoch} {metric} not found in {file_path}")
        return None

def process_directories(base_dir, epoch, metric, seeds):
    values = defaultdict(list)
    missing_data = defaultdict(list)
    missing_files = defaultdict(list)
    present_files = defaultdict(list)

    expected_lookaheads = set(range(32, 129, 6))  # 32, 38, 44, ..., 128

    for seed in seeds:
        seed_dir = f"{base_dir}/seed_{seed}"
        if not os.path.exists(seed_dir):
            logger.warning(f"Directory not found: {seed_dir}")
            continue

        found_lookaheads = set()

        for file in os.listdir(seed_dir):
            if (file.startswith("scaling_n_c_1e-5_") and file.endswith("_max_8.out")) or (file.startswith("n_c_1e-5_") and file.endswith("_max_8.out")):
                parts = file.split('_')
                lookahead = int(parts[-3])
                found_lookaheads.add(lookahead)
                present_files[lookahead].append(seed)

                file_path = os.path.join(seed_dir, file)
                value = extract_metric(file_path, epoch, metric)

                if value is not None:
                    values[lookahead].append(value)
                else:
                    missing_data[lookahead].append(seed)

        # Check for missing lookahead files
        missing_lookaheads = expected_lookaheads - found_lookaheads
        for lookahead in missing_lookaheads:
            missing_files[lookahead].append(seed)

    return values, missing_data, missing_files, present_files
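
# For illustration, a (hypothetical) file name
#   n_c_1e-5_seed_3_lookahead_44_max_8.out
# passes the filters above and is attributed to lookahead 44: after splitting
# on '_', parts[-3] is the token just before "max_8.out".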

def calculate_average_values(values):
    avg_values = {}
    min_values = {}
    max_values = {}
    for lookahead, value_list in values.items():
        if value_list:
            avg_values[lookahead] = sum(value_list) / len(value_list)
            min_values[lookahead] = min(value_list)
            max_values[lookahead] = max(value_list)
        else:
            logger.warning(f"No valid values found for lookahead {lookahead}")
    return avg_values, min_values, max_values

def main():
    parser = argparse.ArgumentParser(description="Calculate average metrics across seeds for different lookaheads.")
    # parser.add_argument("base_directory", help="Path to the base directory containing seed folders")
    parser.add_argument("--epoch", type=int, default=1000, help="Epoch number to analyze (default: 1000)")
    parser.add_argument("--metric", choices=['training_loss', 'training_accuracy', 'test_accuracy', 'test_loss'],
                        default='test_loss', help="Metric to average (default: test_loss)")
    parser.add_argument("--seeds", default="1:9", help="Seeds to process. Can be a range (e.g., '1:9') or a comma-separated list (e.g., '1,3,5,7,9')")

    args = parser.parse_args()

    ## Set your base directory containing the seed folders, e.g. seed_1, seed_2, ...
    base_directory = "/scratch/sxp8182/work_with_abu/learning_to_search/cirriculum_learning/learning_to_search/"

    # Define the lookaheads to report. Note this list runs up to 152, beyond the
    # expected_lookaheads set (32..128); lookaheads without data fall back to
    # inf/-inf in the output below.
    lookaheads = list(range(32, 153, 6))
    max_infinity = np.inf
    min_infinity = -np.inf

    # Add a random number to the output file name to avoid overwriting earlier runs
    output_file = "average_metrics_" + str(random.randint(1, 1000000)) + ".txt"

    seeds = parse_seeds(args.seeds)
    with open(output_file, 'w') as f:
        for epoch in tqdm(range(1, args.epoch + 1)):  # Loop through epochs from 1 to args.epoch
            values, missing_data, missing_files, present_files = process_directories(base_directory, epoch, args.metric, seeds)
            avg_values, min_values, max_values = calculate_average_values(values)

            # Store the averages and minimums, one line per epoch; lookaheads
            # with no data fall back to inf (avg) and -inf (min).
            f.write(f"epoch_{epoch}_avg = [{', '.join(f'{avg_values.get(l, max_infinity):.6f}' for l in lookaheads)}]\n")
            f.write(f"epoch_{epoch}_min = [{', '.join(f'{min_values.get(l, min_infinity):.6f}' for l in lookaheads)}]\n")

if __name__ == "__main__":
    main()