Added scaling experiments
srushti98 committed Sep 13, 2024
1 parent 1e54228 commit 6798b62
Showing 378 changed files with 7,861 additions and 0 deletions.
26 changes: 26 additions & 0 deletions scaling_experiments/Readme.md
@@ -0,0 +1,26 @@
This folder contains the scaling experiments.
It holds the results for all seeds from 1 to 14.

Each seed folder contains the .sbatch, .err, and .out files for each experiment.

We have three scripts, `finding_metrics_hid.py`, `finding_metrics_input_sizes.py`, and `finding_metrics_layers.py`, for analyzing the experiments.

Steps to analyze the outputs of these experiments:

1) `finding_metrics_input_sizes.py`
- Set the `base_directory` in the code.
- Run `python finding_metrics_input_sizes.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze (see the usage examples after this list).
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.

2) `finding_metrics_hid.py`
- Set the `base_directory` and the `layers` (hidden dims) you want to analyze in the code.
- Run `python finding_metrics_hid.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze.
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.

3) `finding_metrics_layers.py`
- Set the `base_directory` and the `layers` you want to analyze in the code.
- Run `python finding_metrics_layers.py`; you can specify the `--epoch`, `--metric`, and `--seeds` arguments you want to analyze.
- This script calculates the minimum and the average of the specified metric across all seeds, for every epoch.
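
Example invocations (the argument values here are illustrative; adjust them to the epochs, metric, and seeds of your runs):

```sh
python finding_metrics_input_sizes.py --epoch 1000 --metric test_loss --seeds 1:9
python finding_metrics_hid.py --epoch 500 --metric test_accuracy --seeds 1,3,5
```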


The outputs generated by these scripts can be fed directly to the Python notebooks for generating plots.
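
For instance, a notebook could parse the `epoch_<n>_avg` / `epoch_<n>_min` lines back into floats with a small helper along these lines (a sketch; the file name is a placeholder, and `float()` absorbs the `inf`/`-inf` placeholders the scripts write for missing runs):

```python
import re

def load_metric_lines(path, kind="avg"):
    """Parse lines like `epoch_12_avg = [0.1, inf, ...]` into {epoch: [floats]}."""
    results = {}
    pattern = re.compile(rf"epoch_(\d+)_{kind} = \[(.*)\]")
    with open(path) as f:
        for line in f:
            m = pattern.match(line.strip())
            if m:
                results[int(m.group(1))] = [float(x) for x in m.group(2).split(",")]
    return results

# Hypothetical usage; substitute the file name the script printed:
# averages = load_metric_lines("average_metrics_hidden_123456.txt", kind="avg")
# minima = load_metric_lines("average_metrics_hidden_123456.txt", kind="min")
```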
137 changes: 137 additions & 0 deletions scaling_experiments/finding_metrics_hid.py
@@ -0,0 +1,137 @@
import argparse
import os
import re
from collections import defaultdict
import logging
import numpy as np
import random
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

def parse_seeds(seeds_arg):
    if ':' in seeds_arg:
        start, end = map(int, seeds_arg.split(':'))
        return range(start, end + 1)
    else:
        return [int(seed) for seed in seeds_arg.split(',')]
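# For illustration: parse_seeds("1:3") returns range(1, 4) (seeds 1-3),
# while parse_seeds("1,5,9") returns the list [1, 5, 9].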

def extract_metric(file_path, epoch, metric):
    with open(file_path, 'r') as f:
        content = f.read()

    # Define regex patterns for each metric
    patterns = {
        'training_loss': r'epoch = {}, training loss = ([\d.]+)'.format(epoch),
        'training_accuracy': r'epoch = {}.*?training accuracy: ([\d.]+)'.format(epoch),
        'test_accuracy': r'epoch = {}.*?test accuracy = ([\d.]+)'.format(epoch),
        'test_loss': r'epoch = {}.*?test loss = ([\d.]+)'.format(epoch)
    }

    match = re.search(patterns[metric], content, re.DOTALL)
    if match:
        return float(match.group(1))
    else:
        return None
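
# The patterns above assume the .out logs contain lines of roughly this
# (assumed) shape:
#   epoch = 42, training loss = 0.123456, ... test accuracy = 0.951, test loss = 0.2
# re.DOTALL lets the lazy .*? cross line breaks, so the first value after the
# epoch marker is captured.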

def process_directories(base_dir, epoch, metric, seeds):
    values = defaultdict(list)
    missing_data = defaultdict(list)
    missing_files = defaultdict(list)
    present_files = defaultdict(list)

    expected_layers = [16, 64, 256, 1024]  # Define the expected hidden dims

    for seed in seeds:
        seed_dir = f"{base_dir}/seed_{seed}"
        if not os.path.exists(seed_dir):
            logger.warning(f"Directory not found: {seed_dir}")
            continue

        found_layers = set()

        for file in os.listdir(seed_dir):
            if file.startswith("scaling_n_c_1e-5_seed") and re.search(r"_hid_\d+\.out$", file):
                # The hidden dim is the trailing number in the file name
                parts = file.split('_')
                layers = int(parts[-1].split('.')[0])
                found_layers.add(layers)
                present_files[layers].append(seed)

                file_path = os.path.join(seed_dir, file)
                value = extract_metric(file_path, epoch, metric)

                if value is not None:
                    values[layers].append(value)
                else:
                    missing_data[layers].append(seed)

        # Check for missing layer files
        missing_layers = set(expected_layers) - found_layers
        for layer in missing_layers:
            missing_files[layer].append(seed)

    return values, missing_data, missing_files, present_files
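
# For illustration, a (hypothetical) file name
#   scaling_n_c_1e-5_seed_3_hid_256.out
# passes both filters above and is attributed to hidden dim 256, taken from
# the trailing "_hid_256" token.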

def calculate_average_values(values):
    avg_values = {}
    min_values = {}
    max_values = {}
    for layer, value_list in values.items():
        if value_list:
            avg_values[layer] = sum(value_list) / len(value_list)
            min_values[layer] = min(value_list)
            max_values[layer] = max(value_list)
        else:
            logger.warning(f"No valid values found for layer {layer}")
    return avg_values, min_values, max_values

def main():
    parser = argparse.ArgumentParser(description="Calculate average metrics across seeds for different layers.")
    parser.add_argument("--epoch", type=int, default=1000, help="Epoch number to analyze (default: 1000)")
    parser.add_argument("--metric", choices=['training_loss', 'training_accuracy', 'test_accuracy', 'test_loss'],
                        default='test_loss', help="Metric to average (default: test_loss)")
    parser.add_argument("--seeds", default="1:15", help="Seeds to process. Can be a range (e.g., '1:15') or a comma-separated list (e.g., '1,3,5,7,9')")

    args = parser.parse_args()

    ## Set your base directory containing the seed folders, e.g. seed_1, seed_2, ...
    base_directory = "/scratch/sxp8182/work_with_abu/learning_to_search/cirriculum_learning/learning_to_search/"

    ## Define the hidden dims you want to analyze. Even though the variable name
    ## says layers, it actually holds hidden dims.
    layers = [16, 64, 256, 1024]
    max_infinity = np.inf
    min_infinity = -np.inf

    # Add a random number to the output file name to avoid overwriting earlier runs
    output_file = "average_metrics_hidden_" + str(random.randint(1, 1000000)) + ".txt"
    print("saving in file:\n", output_file)

    seeds = parse_seeds(args.seeds)
    with open(output_file, 'w') as f:
        for epoch in tqdm(range(1, args.epoch + 1)):  # Loop through epochs from 1 to args.epoch
            values, missing_data, missing_files, present_files = process_directories(base_directory, epoch, args.metric, seeds)
            avg_values, min_values, max_values = calculate_average_values(values)

            for missing_file in missing_files:
                print(f"Missing files for layer {missing_file}: {missing_files[missing_file]}")
            for missing_data_no in missing_data:
                print(f"Missing data for layer {missing_data_no}: {missing_data[missing_data_no]}")

            # Store the averages and minimums, one line per epoch; layers with no
            # data fall back to inf (avg) and -inf (min), so a line looks like:
            #   epoch_1_avg = [0.693147, 0.684210, inf, 0.512345]
            f.write(f"epoch_{epoch}_avg = [{', '.join(f'{avg_values.get(l, max_infinity):.6f}' for l in layers)}]\n")
            f.write(f"epoch_{epoch}_min = [{', '.join(f'{min_values.get(l, min_infinity):.6f}' for l in layers)}]\n")

if __name__ == "__main__":
    main()
129 changes: 129 additions & 0 deletions scaling_experiments/finding_metrics_input_sizes.py
@@ -0,0 +1,129 @@
import argparse
import os
import re
from collections import defaultdict
import logging
import numpy as np
import random
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

def parse_seeds(seeds_arg):
    if ':' in seeds_arg:
        start, end = map(int, seeds_arg.split(':'))
        return range(start, end + 1)
    else:
        return [int(seed) for seed in seeds_arg.split(',')]

def extract_metric(file_path, epoch, metric):
    with open(file_path, 'r') as f:
        content = f.read()

    # Define regex patterns for each metric
    patterns = {
        'training_loss': r'epoch = {}, training loss = ([\d.]+)'.format(epoch),
        'training_accuracy': r'epoch = {}.*?training accuracy: ([\d.]+)'.format(epoch),
        'test_accuracy': r'epoch = {}.*?test accuracy = ([\d.]+)'.format(epoch),
        'test_loss': r'epoch = {}.*?test loss = ([\d.]+)'.format(epoch)
    }

    match = re.search(patterns[metric], content, re.DOTALL)
    if match:
        return float(match.group(1))
    else:
        # logger.warning(f"Epoch {epoch} {metric} not found in {file_path}")
        return None

def process_directories(base_dir, epoch, metric, seeds):
    values = defaultdict(list)
    missing_data = defaultdict(list)
    missing_files = defaultdict(list)
    present_files = defaultdict(list)

    expected_lookaheads = set(range(32, 129, 6))  # 32, 38, 44, ..., 128

    for seed in seeds:
        seed_dir = f"{base_dir}/seed_{seed}"
        if not os.path.exists(seed_dir):
            logger.warning(f"Directory not found: {seed_dir}")
            continue

        found_lookaheads = set()

        for file in os.listdir(seed_dir):
            if (file.startswith("scaling_n_c_1e-5_") and file.endswith("_max_8.out")) or (file.startswith("n_c_1e-5_") and file.endswith("_max_8.out")):
                parts = file.split('_')
                lookahead = int(parts[-3])
                found_lookaheads.add(lookahead)
                present_files[lookahead].append(seed)

                file_path = os.path.join(seed_dir, file)
                value = extract_metric(file_path, epoch, metric)

                if value is not None:
                    values[lookahead].append(value)
                else:
                    missing_data[lookahead].append(seed)

        # Check for missing lookahead files
        missing_lookaheads = expected_lookaheads - found_lookaheads
        for lookahead in missing_lookaheads:
            missing_files[lookahead].append(seed)

    return values, missing_data, missing_files, present_files
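
# For illustration, a (hypothetical) file name
#   n_c_1e-5_seed_3_lookahead_44_max_8.out
# passes the filters above and is attributed to lookahead 44: after splitting
# on '_', parts[-3] is the token just before "max_8.out".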

def calculate_average_values(values):
    avg_values = {}
    min_values = {}
    max_values = {}
    for lookahead, value_list in values.items():
        if value_list:
            avg_values[lookahead] = sum(value_list) / len(value_list)
            min_values[lookahead] = min(value_list)
            max_values[lookahead] = max(value_list)
        else:
            logger.warning(f"No valid values found for lookahead {lookahead}")
    return avg_values, min_values, max_values

def main():
    parser = argparse.ArgumentParser(description="Calculate average metrics across seeds for different lookaheads.")
    # parser.add_argument("base_directory", help="Path to the base directory containing seed folders")
    parser.add_argument("--epoch", type=int, default=1000, help="Epoch number to analyze (default: 1000)")
    parser.add_argument("--metric", choices=['training_loss', 'training_accuracy', 'test_accuracy', 'test_loss'],
                        default='test_loss', help="Metric to average (default: test_loss)")
    parser.add_argument("--seeds", default="1:9", help="Seeds to process. Can be a range (e.g., '1:9') or a comma-separated list (e.g., '1,3,5,7,9')")

    args = parser.parse_args()

    ## Set your base directory containing the seed folders, e.g. seed_1, seed_2, ...
    base_directory = "/scratch/sxp8182/work_with_abu/learning_to_search/cirriculum_learning/learning_to_search/"

    # Define the lookaheads to report. Note this list runs up to 152, beyond the
    # expected_lookaheads set (32..128); lookaheads without data fall back to
    # inf/-inf in the output below.
    lookaheads = list(range(32, 153, 6))
    max_infinity = np.inf
    min_infinity = -np.inf

    # Add a random number to the output file name to avoid overwriting earlier runs
    output_file = "average_metrics_" + str(random.randint(1, 1000000)) + ".txt"

    seeds = parse_seeds(args.seeds)
    with open(output_file, 'w') as f:
        for epoch in tqdm(range(1, args.epoch + 1)):  # Loop through epochs from 1 to args.epoch
            values, missing_data, missing_files, present_files = process_directories(base_directory, epoch, args.metric, seeds)
            avg_values, min_values, max_values = calculate_average_values(values)

            # Store the averages and minimums, one line per epoch; lookaheads
            # with no data fall back to inf (avg) and -inf (min).
            f.write(f"epoch_{epoch}_avg = [{', '.join(f'{avg_values.get(l, max_infinity):.6f}' for l in lookaheads)}]\n")
            f.write(f"epoch_{epoch}_min = [{', '.join(f'{min_values.get(l, min_infinity):.6f}' for l in lookaheads)}]\n")

if __name__ == "__main__":
    main()