From 533626a992d4f4d8812e7debb6aa2d3f23914c8e Mon Sep 17 00:00:00 2001
From: Youssef Mansour
Date: Fri, 27 Sep 2024 18:20:40 +0200
Subject: [PATCH] inferring training proportions

---
 open_lm/extra_funcs.py       | 214 +++++++++++++++++++++++++++++++++++
 open_lm/infer_proportions.py |  57 ++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 open_lm/extra_funcs.py
 create mode 100644 open_lm/infer_proportions.py

diff --git a/open_lm/extra_funcs.py b/open_lm/extra_funcs.py
new file mode 100644
index 0000000..b746d5e
--- /dev/null
+++ b/open_lm/extra_funcs.py
@@ -0,0 +1,214 @@
+import os
+import shutil
+import random
+import json
+import torch
+import numpy as np
+import subprocess
+
+from open_lm.params import parse_args
+from open_lm.model import test_classif_model
+
+
+def inference():
+
+    args = parse_args([])
+    args.model = "open_lm_25m"
+    args.classif_model_path = "/workspace/youssef/lrz/logs/RedPajama/prop/checkpoints/epoch_1.pt"
+    args.num_classes = 2
+
+    test_data_path = '/workspace/youssef/lrz/datasets/prop/Llama1_gen.pt'
+    dataset = torch.load(test_data_path)
+
+    model = test_classif_model(args)
+    model = model.to('cuda:3')
+
+    pred = []
+    for sample in dataset:
+        sample = torch.LongTensor(sample).to('cuda:3')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred.append(torch.argmax(out, 2)[:, -1].item())
+
+    c1 = pred.count(0)
+    c2 = pred.count(1)
+
+    print(c1, c2)
+
+    if c2 > c1:
+        return 1
+    else:
+        return 0
+
+
+def train_classifier(cuda_devices="3", log_dir="/workspace/youssef/lrz/logs/RedPajama/prop"):
+    # Set the CUDA_VISIBLE_DEVICES environment variable
+    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices
+
+    # Generate a random master port between 10000 and 65000
+    master_port = random.randint(10000, 65000)
+
+    # Construct the torchrun command
+    command = [
+        "torchrun",
+        f"--master_port={master_port}",
+        "--nproc-per-node", "1",
+        "-m", "open_lm.main",
+        "--model", "open_lm_25m",
+        "--dataset-manifest", "/workspace/youssef/lrz/datasets/prop/train/manifest.jsonl",
+        "--train-num-samples", "200000000",
+        "--workers", "1",
+        "--precision", "amp_bfloat16",
+        "--grad-checkpointing",
+        "--log-every-n-steps", "100",
+        "--grad-clip-norm", "1",
+        "--global-batch-size", "16",
+        "--data-key", "txt",
+        "--lr", "3e-4",
+        "--warmup", "2000",
+        "--wd", "0.1",
+        "--beta2", "0.95",
+        "--epochs", "1",
+        "--resume", "latest",
+        "--logs", "/workspace/youssef/lrz/logs/RedPajama/",
+        "--name", "prop",
+        "--classification", "True",
+        "--num-classes", "2",
+        "--classif-model-path", "/workspace/youssef/lrz/logs/pretrain/25M_0.5BC4/checkpoint/epoch_1.pt",
+    ]
+
+    os.makedirs(log_dir, exist_ok=True)
+
+    # Create log file paths
+    stdout_log = os.path.join(log_dir, "output.log")
+    stderr_log = os.path.join(log_dir, "error.log")
+
+    # Run the torchrun command using subprocess
+    with open(stdout_log, "w") as out_file, open(stderr_log, "w") as err_file:
+        try:
+            result = subprocess.run(command, check=True, stdout=out_file, stderr=err_file)
+            print(f"torchrun finished with return code: {result.returncode}")
+        except subprocess.CalledProcessError as e:
+            print(f"An error occurred while running torchrun: {e}")
+
+
+def proj_simplex(y):
+    m = len(y)
+    bget = False
+    s = sorted(y, reverse=True)  # sorting in descending order
+    tmpsum = 0
+    for i in range(m - 1):
+        tmpsum = tmpsum + s[i]
+        tmax = (tmpsum - 1) / (i + 1)
+        if tmax >= s[i + 1]:
+            bget = True
+            break
+    if not bget:
+        tmax = (tmpsum + s[m - 1] - 1) / m
+    return np.maximum(y - tmax, 0)
+
+
+def del_dir(dir_path):
+    try:
+        # Remove the directory and all its contents
+        shutil.rmtree(dir_path)
+        print(f"Removed directory: {dir_path}")
+    except FileNotFoundError:
+        print(f"Directory not found: {dir_path}")
+    except PermissionError:
+        print(f"Permission denied: {dir_path}")
+    except Exception as e:
+        print(f"An error occurred while removing the directory: {e}")
+
+
+def round_preserving_sum(numbers):
+    """
+    Take a list of numbers that add up to 1, multiply each by 100, and round
+    them to integers while preserving a total of 100.
+    """
+    # Step 1: Multiply all numbers by 100
+    multiplied = np.array(numbers) * 100
+
+    # Step 2: Separate integer and decimal parts
+    integers = np.floor(multiplied).astype(int)  # Integer parts
+    decimals = multiplied - integers  # Decimal parts
+
+    # Step 3: Calculate the difference between the current sum and 100
+    current_sum = np.sum(integers)
+    difference = 100 - current_sum
+
+    # Step 4: Distribute the difference by rounding up the largest decimals
+    if difference > 0:
+        # Get indices of the largest decimals and round up those numbers
+        indices_to_round_up = np.argsort(-decimals)[:difference]
+        integers[indices_to_round_up] += 1
+
+    return integers.tolist()
+
+
+def sample_and_rename_files(sample_counts_list):
+
+    base_path = "/workspace/youssef/lrz/datasets/prop/original/"
+    output_folder = "/workspace/youssef/lrz/datasets/prop/train/"
+
+    # Define the folder names in order
+    file_names = ['arxiv', 'c4', 'cc', 'github', 'se', 'wiki']
+    folder_names = [os.path.join(base_path, folder) for folder in file_names]
+
+    # Check that sample_counts_list contains exactly two lists of six counts
+    if len(sample_counts_list) != 2 or any(len(sample_counts) != 6 for sample_counts in sample_counts_list):
+        raise ValueError("sample_counts_list must contain exactly two lists, each with 6 numbers.")
+
+    # Create the output folder if it doesn't exist
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    # List to store the manifest data
+    manifest_data = []
+
+    # Loop over the two lists of sample counts
+    for index, sample_counts in enumerate(sample_counts_list):
+        # Iterate through each folder and sample the required number of .tar files
+        for i, folder in enumerate(folder_names):
+            folder_path = os.path.join(folder)
+
+            if not os.path.exists(folder_path):
+                raise ValueError(f"Folder {folder_path} does not exist.")
+
+            # Get all .tar files from the current folder
+            all_files = [f for f in os.listdir(folder_path) if f.endswith('.tar')]
+
+            # Ensure the sample count is not more than the number of available files
+            sample_count = min(sample_counts[i], len(all_files))
+
+            # Randomly sample the required number of files from the folder
+            sampled_files = random.sample(all_files, sample_count)
+
+            # Copy each sampled file to the output folder under its new name
+            for file_name in sampled_files:
+                # Construct the source file path
+                source_file_path = os.path.join(folder_path, file_name)
+
+                # Create the new filename by prefixing the index (0 or 1) and a dash, dropping the .tar extension
+                new_file_name = f"{index}-{file_name[:-4]}"
+
+                # Destination path in the output folder (keep .tar in the destination)
+                dest_file_path = os.path.join(output_folder, new_file_name + '.tar')
+
+                # Copy the file to the output folder
+                shutil.copy2(source_file_path, dest_file_path)
+
+                # Add an entry to manifest_data using the shard name without the .tar extension
+                manifest_entry = {
+                    "shard": new_file_name,  # No .tar extension
+                    "num_sequences": 489  # Fixed number of sequences per shard
+                }
+                manifest_data.append(manifest_entry)
+
+    # Write the manifest.jsonl file
+    manifest_file_path = os.path.join(output_folder, "manifest.jsonl")
+    with open(manifest_file_path, 'w') as manifest_file:
+        # Write each entry as one JSON object per line
+        for entry in manifest_data:
+            manifest_file.write(json.dumps(entry) + '\n')
+
+    print(f"Files sampled and saved in {output_folder}. Manifest file created as {manifest_file_path}.")
\ No newline at end of file
diff --git a/open_lm/infer_proportions.py b/open_lm/infer_proportions.py
new file mode 100644
index 0000000..1dadd64
--- /dev/null
+++ b/open_lm/infer_proportions.py
@@ -0,0 +1,57 @@
+import torch
+import numpy as np
+
+from extra_funcs import train_classifier, proj_simplex, round_preserving_sum, sample_and_rename_files, inference, del_dir
+
+
+def comparison(x, xcandidate):
+
+    list1 = round_preserving_sum(x.tolist())
+    list2 = round_preserving_sum(xcandidate.tolist())
+    sample_counts = [list1, list2]
+
+    sample_and_rename_files(sample_counts)
+
+    train_classifier()
+
+    result = inference()
+
+    del_dir("/workspace/youssef/lrz/logs/RedPajama/prop")
+    del_dir("/workspace/youssef/lrz/datasets/prop/train")
+
+    return result
+
+
+def gradientless_descent(N=6, num_iter=200, radius=0.2, alpha=0.5):
+
+    # For measuring the error against the true mixture
+    xorig = np.array([0.0325, 0.1575, 0.6775, 0.0525, 0.0275, 0.0525])
+
+    # Initialize x with equal probability
+    x = np.ones(N) / N
+
+    error = []
+    prop = []
+
+    for i in range(num_iter):
+
+        stepsize = 1 / (i + 1) ** alpha
+        # Choose a random direction with radius R
+        direction = np.random.randn(N)
+        direction = direction / np.linalg.norm(direction) * radius * stepsize
+
+        xcandidate = proj_simplex(x + direction)
+
+        # Compare x with x + direction and update x
+        if comparison(x, xcandidate) == 1:
+            x = xcandidate
+
+        print(i, np.linalg.norm(x - xorig), x)
+        error.append(np.linalg.norm(x - xorig))
+        prop.append(x)
+
+    torch.save(error, "error.pt")
+    torch.save(prop, "prop.pt")
+    return x
+
+
+if __name__ == "__main__":
+    gradientless_descent()
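
Note: below is a minimal, self-contained sketch (not part of the patch) of how the pieces
above fit together: the simplex projection, the sum-preserving rounding, and the
comparison-driven update of gradientless descent. The toy_comparison oracle is a
hypothetical stand-in for the classifier-based comparison() in infer_proportions.py, which
needs the full training and inference pipeline; the target vector reuses xorig from the
patch, and only numpy is assumed.

    import numpy as np

    def proj_simplex(y):
        # Euclidean projection onto the probability simplex (same algorithm as in the patch)
        m = len(y)
        s = sorted(y, reverse=True)
        tmpsum, tmax, bget = 0.0, 0.0, False
        for i in range(m - 1):
            tmpsum += s[i]
            tmax = (tmpsum - 1) / (i + 1)
            if tmax >= s[i + 1]:
                bget = True
                break
        if not bget:
            tmax = (tmpsum + s[m - 1] - 1) / m
        return np.maximum(y - tmax, 0)

    def round_preserving_sum(numbers):
        # Round 100 * numbers to integers whose total stays exactly 100
        multiplied = np.array(numbers) * 100
        integers = np.floor(multiplied).astype(int)
        decimals = multiplied - integers
        difference = 100 - int(integers.sum())
        if difference > 0:
            integers[np.argsort(-decimals)[:difference]] += 1
        return integers.tolist()

    def toy_comparison(x, xcandidate, target):
        # Hypothetical oracle: prefer the candidate that is closer to a known target mixture.
        # The real comparison() instead trains a classifier on data sampled from both mixtures.
        return 1 if np.linalg.norm(xcandidate - target) < np.linalg.norm(x - target) else 0

    target = np.array([0.0325, 0.1575, 0.6775, 0.0525, 0.0275, 0.0525])  # xorig in the patch
    x = np.ones(6) / 6  # start from the uniform mixture
    rng = np.random.default_rng(0)
    for i in range(50):
        stepsize = 1 / (i + 1) ** 0.5  # alpha = 0.5
        direction = rng.standard_normal(6)
        direction = direction / np.linalg.norm(direction) * 0.2 * stepsize  # radius = 0.2
        xcandidate = proj_simplex(x + direction)
        if toy_comparison(x, xcandidate, target) == 1:  # keep the better candidate
            x = xcandidate
    print(round_preserving_sum(x.tolist()))  # integer percentages summing to 100

With the real comparison() oracle, each iteration samples shards according to the two
candidate mixtures, trains the classifier on them, and keeps whichever mixture the
generated test set is assigned to more often.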