From fe27b8db9ef639bbd8f9aa32cea17bcff2f09137 Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Tue, 10 Sep 2024 09:04:08 -0700
Subject: [PATCH 01/42] refactor hf download

---
 llmfoundry/data/finetuning/tasks.py | 115 +++++++++++++++++-----------
 1 file changed, 71 insertions(+), 44 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index e8f6484ef2..824b7b3bd6 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -702,7 +702,73 @@ def state_dict(self, num_samples: int,
             num_samples=num_samples,
             from_beginning=from_beginning,
         )
+
+def download_hf_dataset_if_needed(
+    dataset_name: str,
+    hf_kwargs: Optional[dict[str, Any]] = None
+) -> str:
+    """
+    Download a HuggingFace dataset locally if it does not already exist.
+
+    Args:
+        dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s)
+            directory or object store bucket containing the file {split}.jsonl.
+        safe_load (bool): Whether to enforce safe loading of the dataset.
+        hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`.
+
+    Returns:
+        str: The local path to the dataset.
+    """
+    if hf_kwargs is None:
+        hf_kwargs = {}
+
+    if not os.path.isdir(dataset_name):
+        local_dataset_dir = os.path.join(
+            DOWNLOADED_FT_DATASETS_DIRPATH,
+            dataset_name,
+        )
+
+        if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+            # Safely load the dataset from HF Hub with restricted file types.
+            hf_hub.snapshot_download(
+                dataset_name,
+                repo_type='dataset',
+                allow_patterns=[
+                    '*' + ext for ext in SUPPORTED_EXTENSIONS
+                ],
+                token=hf_kwargs.get('token', None),
+                revision=hf_kwargs.get('revision', None),
+                local_dir_use_symlinks=False,
+                local_dir=local_dataset_dir,
+            )
+            if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+                log.error("Failed to safely load the dataset from HF Hub.")
+                raise InvalidFileExtensionError(
+                    dataset_name,
+                    SUPPORTED_EXTENSIONS,
+                )
+        # Set dataset_name to the downloaded location.
+        dataset_name = local_dataset_dir
+
+    # Ensure dataset_name is a local directory path (using abspath to avoid confusion).
+    dataset_name = os.path.abspath(dataset_name)
+
+    # Check that the directory contains only allowed file types.
+    dataset_files = [
+        f for _, _, files in os.walk(dataset_name) for f in files
+    ]
+    if not all(
+        Path(f).suffix in SUPPORTED_EXTENSIONS +
+        HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore'
+        for f in dataset_files
+    ):
+        log.error("Invalid file extension found in dataset during safe load.")
+        raise InvalidFileExtensionError(
+            dataset_name,
+            SUPPORTED_EXTENSIONS,
+        )
+    return dataset_name
 
 
 class DatasetConstructor:
 
@@ -901,50 +967,11 @@ def build_from_hf(
         filtered_dataset = None
         try:
             if safe_load:
-                if not os.path.isdir(dataset_name):
-                    # dataset_name is not a local dir path, download if needed.
-                    local_dataset_dir = os.path.join(
-                        DOWNLOADED_FT_DATASETS_DIRPATH,
-                        dataset_name,
-                    )
-
-                    if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
-                        # Safely load a dataset from HF Hub with restricted file types.
-                        hf_hub.snapshot_download(
-                            dataset_name,
-                            repo_type='dataset',
-                            allow_patterns=[
-                                '*' + ext for ext in SUPPORTED_EXTENSIONS
-                            ],
-                            token=hf_kwargs.get('token', None),
-                            revision=hf_kwargs.get('revision', None),
-                            local_dir_use_symlinks=False,
-                            local_dir=local_dataset_dir,
-                        )
-                        if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
-                            raise InvalidFileExtensionError(
-                                dataset_name,
-                                SUPPORTED_EXTENSIONS,
-                            )
-                    # Set dataset_name to the downloaded location.
- dataset_name = local_dataset_dir - - # dataset_name is a local dir path. Use the abspath to prevent confusion. - dataset_name = os.path.abspath(dataset_name) - - # Ensure that the local dir contains only allowed file types. - dataset_files = [ - f for _, _, files in os.walk(dataset_name) for f in files - ] - if not all( - Path(f).suffix in SUPPORTED_EXTENSIONS + - HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' - for f in dataset_files - ): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) + dataset_name = download_hf_dataset_if_needed( + dataset_name, + safe_load, + hf_kwargs, + ) dataset = hf_datasets.load_dataset( dataset_name, From 18859b15df3cee80435583947f9cbf49880a358d Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Thu, 12 Sep 2024 13:34:59 -0700 Subject: [PATCH 02/42] split_eval_set skeleton --- llmfoundry/command_utils/__init__.py | 36 +++++++------ .../command_utils/data_prep/split_eval_set.py | 37 +++++++++++++ scripts/data_prep/split_eval_set.py | 54 +++++++++++++++++++ 3 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/split_eval_set.py create mode 100644 scripts/data_prep/split_eval_set.py diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 0226c4f408..5407b723cc 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -20,6 +20,7 @@ convert_text_to_mds, convert_text_to_mds_from_args, ) +from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -33,21 +34,22 @@ ) __all__ = [ - 'train', - 'train_from_yaml', - 'TrainConfig', - 'TRAIN_CONFIG_KEYS', - 'validate_config', - 'evaluate', - 'eval_from_yaml', - 'convert_dataset_hf', - 'convert_dataset_hf_from_args', - 'convert_dataset_json', - 'convert_dataset_json_from_args', - 'convert_finetuning_dataset_from_args', - 'convert_finetuning_dataset', - 'convert_text_to_mds', - 'convert_text_to_mds_from_args', - 'convert_delta_to_json_from_args', - 'fetch_DT', + "train", + "train_from_yaml", + "TrainConfig", + "TRAIN_CONFIG_KEYS", + "validate_config", + "evaluate", + "eval_from_yaml", + "convert_dataset_hf", + "convert_dataset_hf_from_args", + "convert_dataset_json", + "convert_dataset_json_from_args", + "convert_finetuning_dataset_from_args", + "convert_finetuning_dataset", + "convert_text_to_mds", + "convert_text_to_mds_from_args", + "convert_delta_to_json_from_args", + "fetch_DT", + "split_eval_set_from_args", ] diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py new file mode 100644 index 0000000000..01205cba15 --- /dev/null +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -0,0 +1,37 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import json +from enum import Enum + +import datasets +from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed + + +class SupportedDataFormats(Enum): + REMOTE_JSONL = "jsonl" # UC JSONL + DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL + HF = "huggingface" + + +def validate_data_path(data_path: str) -> None: + """ + Validates the data path and returns the format of the data. 
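+    The recognized formats, mirroring ``SupportedDataFormats`` above, are a
+    remote (UC) JSONL file, a Delta table preprocessed to JSONL, and a
+    Hugging Face dataset.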
+ + Args: + data_path (str): Path to the training dataset + """ + + +def split_eval_set_from_args() -> None: + """ + Args: + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + output_path (str): Directory to save the split dataset + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset + """ + pass diff --git a/scripts/data_prep/split_eval_set.py b/scripts/data_prep/split_eval_set.py new file mode 100644 index 0000000000..ee8bfee453 --- /dev/null +++ b/scripts/data_prep/split_eval_set.py @@ -0,0 +1,54 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from argparse import ArgumentParser + +from llmfoundry.command_utils import split_eval_set_from_args + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Split training dataset into train and eval sets", + ) + parser.add_argument( + "--data_path_folder", required=True, type=str, help="Path to the training dataset folder" + ) + parser.add_argument( + "--data_path_split", required=True, type=str, help="Path to the training dataset split" + ) + parser.add_argument( + "--output_path", + required=True, + type=str, + help="Path to save the split dataset", + ) + parser.add_argument( + "--eval_split_ratio", + required=False, + type=float, + default=0.1, + help="Ratio of the dataset to use for evaluation. The remainder will be used for training", + ) + parser.add_argument( + "--max_eval_samples", + required=False, + type=int, + default=None, + help="Maximum number of samples to include in the eval set", + ) + parser.add_argument( + "--seed", + required=False, + type=int, + default=42, + help="Random seed for splitting the dataset", + ) + args = parser.parse_args() + split_eval_set_from_args( + data_path_folder=args.data_path_folder, + data_path_split=args.data_path_split, + output_path=args.output_path, + eval_split_ratio=args.eval_split_ratio, + max_eval_samples=args.max_eval_samples, + seed=args.seed, + ) From f29ef67b306be24717e04053cb98890b1f33472e Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Sun, 15 Sep 2024 16:22:57 -0700 Subject: [PATCH 03/42] splitting script --- .../command_utils/data_prep/split_eval_set.py | 162 ++++++++++++++++-- llmfoundry/data/finetuning/tasks.py | 6 +- 2 files changed, 152 insertions(+), 16 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py index 01205cba15..f6afc8722d 100644 --- a/llmfoundry/command_utils/data_prep/split_eval_set.py +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -1,31 +1,167 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import logging import os +import re import json -from enum import Enum +import contextlib +import datasets as hf_datasets +import numpy as np +from typing import Optional -import datasets -from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed +from composer.utils import get_file +from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data -class SupportedDataFormats(Enum): - REMOTE_JSONL = "jsonl" # UC JSONL - DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL - HF = "huggingface" +DELTA_JSONL_REGEX = re.compile(r"^tmp-t$") 
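+# Matches only the literal staging folder name "tmp-t" that
+# convert_delta_to_json.py writes Delta-exported JSONL into.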
+REMOTE_OBJECT_STORE_FILE_REGEX = re.compile( + r"^((s3|oci|gs):\/\/|dbfs:\/Volumes\/)[/a-zA-Z0-9 ()_\-.]+$" +) +HF_REGEX = re.compile(r"^[/a-zA-Z0-9 ()_\-.]+$") +TEMP_DIR = "tmp-split" -def validate_data_path(data_path: str) -> None: +log = logging.getLogger(__name__) + +import sys + +log.setLevel(logging.DEBUG) +log.addHandler(logging.StreamHandler(sys.stdout)) + + +def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str: """ - Validates the data path and returns the format of the data. + Prepares dataset as a local JSONL file. Downloads from remote object store or HF if necessary. + + This function is intended to be invoked by DBX Finetuning. + Thus, it assumes the provided data is in one of three formats: + 1. A Delta table converted to JSONL at 'tmp-t/{data_path_split}-00000-of-00001.jsonl` + using the 'llmfoundry.scripts.convert_delta_to_json.py' script. + 2. A JSONL stored as a remote object store file (e.g. S3, OCI, GCS) + 3. A Hugging Face dataset Args: - data_path (str): Path to the training dataset + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + + Returns: + str: Path to the training dataset """ + os.makedirs(TEMP_DIR, exist_ok=True) + + if DELTA_JSONL_REGEX.match(data_path_folder): + data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl") + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl" + ) + remote_path = f"{data_path_folder}/{data_path_split}.jsonl" + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + try: + get_file(remote_path, data_path, overwrite=True) + except FileNotFoundError as e: + # TODO: error handling + raise e + + elif HF_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}" + ) + # TODO: maybe add support for HF kwargs + local_hf_path = maybe_safe_download_hf_data(data_path_folder) + # convert dataset split to JSONL + dataset = hf_datasets.load_dataset( + local_hf_path, + split=data_path_split, + ) + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + with open(data_path, "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") + + else: + # TODO: error handling + raise ValueError( + f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset." + ) + + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + return data_path + +@contextlib.contextmanager +def temp_seed(seed: int): + state = np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) -def split_eval_set_from_args() -> None: + +def _split_examples( + data_path: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int], + seed: Optional[int] = None, +) -> None: + """ + Splits the dataset into training and evaluation sets. + + Args: + data_path (str): Path to the training dataset (local jsonl file) + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. 
If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset """ + # first pass: count total number of lines and determine sample size + total_lines = 0 + with open(data_path, "r") as infile: + for _ in infile: + total_lines += 1 + sample_size = int(eval_split_ratio * total_lines) + if max_eval_samples is not None: + sample_size = min(sample_size, max_eval_samples) + + with temp_seed(seed) if seed is not None else contextlib.nullcontext(): + random_numbers = np.random.rand(total_lines) + sample_indices = set(np.argsort(random_numbers)[:sample_size]) + + # second pass: sample indices + with open(data_path, "r") as infile, open( + os.path.join(output_path, "train.jsonl"), "w" + ) as train_outfile, open(os.path.join(output_path, "eval.jsonl"), "w") as eval_outfile: + for idx, line in enumerate(infile): + if idx in sample_indices: + eval_outfile.write(line) + else: + train_outfile.write(line) + + log.info( + f"Split {data_path} into train set of size {total_lines - sample_size} and eval set of size {sample_size}." + ) + + +def split_eval_set_from_args( + data_path_folder: str, + data_path_split: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int] = None, + seed: Optional[int] = None, +) -> None: + """ + A wrapper for split_eval_set that parses arguments + Args: data_path_folder (str): Path to the training dataset folder data_path_split (str): Data split @@ -34,4 +170,6 @@ def split_eval_set_from_args() -> None: max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - pass + os.makedirs(output_path, exist_ok=True) + data_path = maybe_download_data_as_json(data_path_folder, data_path_split) + _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 824b7b3bd6..ea6004e01c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -703,7 +703,7 @@ def state_dict(self, num_samples: int, from_beginning=from_beginning, ) -def download_hf_dataset_if_needed( +def maybe_safe_download_hf_data( dataset_name: str, hf_kwargs: Optional[dict[str, Any]] = None ) -> str: @@ -713,7 +713,6 @@ def download_hf_dataset_if_needed( Args: dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s) directory or object store bucket containing the file {split}.jsonl. - safe_load (bool): Whether to enforce safe loading of the dataset. hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`. 
 
     Returns:

@@ -967,9 +966,8 @@ def build_from_hf(
         filtered_dataset = None
         try:
             if safe_load:
-                dataset_name = download_hf_dataset_if_needed(
+                dataset_name = maybe_safe_download_hf_data(
                     dataset_name,
-                    safe_load,
                     hf_kwargs,
                 )

From 3d9d51f2a66a1b3a8d4888b712b87101c6f1397b Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Mon, 16 Sep 2024 00:58:53 -0700
Subject: [PATCH 04/42] error handling and testing

---
 llmfoundry/command_utils/__init__.py          |   6 +-
 .../command_utils/data_prep/split_eval_set.py |  38 ++--
 .../data_prep/test_split_eval_set.py          | 163 ++++++++++++++++++
 3 files changed, 183 insertions(+), 24 deletions(-)
 create mode 100644 tests/a_scripts/data_prep/test_split_eval_set.py

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
index 5407b723cc..8757f3b1bc 100644
--- a/llmfoundry/command_utils/__init__.py
+++ b/llmfoundry/command_utils/__init__.py
@@ -20,7 +20,10 @@
     convert_text_to_mds,
     convert_text_to_mds_from_args,
 )
-from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args
+from llmfoundry.command_utils.data_prep.split_eval_set import (
+    split_eval_set_from_args,
+    split_examples,
+)
 from llmfoundry.command_utils.eval import (
     eval_from_yaml,
     evaluate,
@@ -52,4 +55,5 @@
     "convert_delta_to_json_from_args",
     "fetch_DT",
     "split_eval_set_from_args",
+    "split_examples",
 ]

diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py
index f6afc8722d..b4b150f81f 100644
--- a/llmfoundry/command_utils/data_prep/split_eval_set.py
+++ b/llmfoundry/command_utils/data_prep/split_eval_set.py
@@ -10,7 +10,7 @@
 import numpy as np
 from typing import Optional
 
-from composer.utils import get_file
+import composer.utils as utils
 from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data
 
 
@@ -24,11 +24,6 @@
 
 log = logging.getLogger(__name__)
 
-import sys
-
-log.setLevel(logging.DEBUG)
-log.addHandler(logging.StreamHandler(sys.stdout))
-
 
 def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str:
     """
 
     os.makedirs(TEMP_DIR, exist_ok=True)
 
     if DELTA_JSONL_REGEX.match(data_path_folder):
+        log.info(f"Dataset is converted from Delta table. Using local file {data_path_folder}")
         data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl")
-        if not os.path.exists(data_path):
-            # TODO: error handling
-            raise FileNotFoundError(f"File {data_path} does not exist.")
 
-    if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
+    elif REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
         log.info(
             f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl"
         )
         remote_path = f"{data_path_folder}/{data_path_split}.jsonl"
         data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
-        try:
-            get_file(remote_path, data_path, overwrite=True)
-        except FileNotFoundError as e:
-            # TODO: error handling
-            raise e
+        utils.get_file(remote_path, data_path, overwrite=True)
 
     elif HF_REGEX.match(data_path_folder):
         log.info(
             f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}"
         )
         # TODO: maybe add support for HF kwargs
         local_hf_path = maybe_safe_download_hf_data(data_path_folder)
         # convert dataset split to JSONL
         dataset = hf_datasets.load_dataset(
             local_hf_path,
             split=data_path_split,
         )
         data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
         with open(data_path, "w") as f:
             for example in dataset:
                 f.write(json.dumps(example) + "\n")
 
     else:
-        # TODO: error handling
         raise ValueError(
-            f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset."
+ f"Encountered unknown data path format when splitting dataset: {data_path_folder} with split {data_path_split}" ) if not os.path.exists(data_path): - # TODO: error handling - raise FileNotFoundError(f"File {data_path} does not exist.") + raise FileNotFoundError( + f"Expected dataset file at {data_path} for splitting, but it does not exist." + ) return data_path @contextlib.contextmanager def temp_seed(seed: int): + log.info(f"Setting random seed to {seed}") state = np.random.get_state() np.random.seed(seed) try: @@ -107,11 +97,11 @@ def temp_seed(seed: int): np.random.set_state(state) -def _split_examples( +def split_examples( data_path: str, output_path: str, eval_split_ratio: float, - max_eval_samples: Optional[int], + max_eval_samples: Optional[int] = None, seed: Optional[int] = None, ) -> None: """ @@ -119,10 +109,13 @@ def _split_examples( Args: data_path (str): Path to the training dataset (local jsonl file) + output_path (str): Directory to save the split dataset eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ + os.makedirs(output_path, exist_ok=True) + # first pass: count total number of lines and determine sample size total_lines = 0 with open(data_path, "r") as infile: @@ -170,6 +163,5 @@ def split_eval_set_from_args( max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - os.makedirs(output_path, exist_ok=True) data_path = maybe_download_data_as_json(data_path_folder, data_path_split) - _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) + split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/tests/a_scripts/data_prep/test_split_eval_set.py b/tests/a_scripts/data_prep/test_split_eval_set.py new file mode 100644 index 0000000000..a1b80b91cd --- /dev/null +++ b/tests/a_scripts/data_prep/test_split_eval_set.py @@ -0,0 +1,163 @@ +import os +import json +import pytest +import hashlib +from unittest.mock import patch + +from llmfoundry.command_utils import split_eval_set_from_args, split_examples + +# Default values +OUTPUT_DIR = "tmp-split" +TMPT_DIR = "tmp-t" +DATA_PATH_SPLIT = "train" +EVAL_SPLIT_RATIO = 0.1 +DEFAULT_FILE = TMPT_DIR + "/train-00000-of-00001.jsonl" + + +def calculate_file_hash(filepath: str) -> str: + with open(filepath, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + return file_hash + + +def count_lines(filepath: str) -> int: + with open(filepath, "r") as f: + return sum(1 for _ in f) + + +@pytest.fixture(scope="module", autouse=True) +def setup_and_teardown_module(): + # Setup: create local testing file + os.makedirs(TMPT_DIR, exist_ok=True) + with open(DEFAULT_FILE, "w") as f: + for i in range(1000): + f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + yield + + # Teardown: clean up output and tmp directories + os.system(f"rm -rf {OUTPUT_DIR}") + os.system(f"rm -rf {TMPT_DIR}") + + +def test_basic_split(): + """Test basic functionality on local file""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + assert 
os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + + +def test_basic_split_output_exists(): + """Test that split overwrites existing files in output directory""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + os.makedirs(output_path, exist_ok=True) + train_file = os.path.join(output_path, "train.jsonl") + eval_file = os.path.join(output_path, "eval.jsonl") + with open(train_file, "w") as f: + f.write("existing file train") + with open(eval_file, "w") as f: + f.write("existing file eval") + old_train_hash = calculate_file_hash(train_file) + old_eval_hash = calculate_file_hash(eval_file) + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + ) + assert calculate_file_hash(train_file) != old_train_hash + assert calculate_file_hash(eval_file) != old_eval_hash + + +def test_max_eval_samples(): + """Test case where max_eval_samples < eval_split_ratio * total samples""" + output_path = os.path.join(OUTPUT_DIR, "max-eval-test") + max_eval_samples = 50 + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + max_eval_samples, + ) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert eval_lines == max_eval_samples + + +def test_eval_split_ratio(): + """Test case where max_eval_samples is not used""" + output_path = os.path.join(OUTPUT_DIR, "eval-split-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + original_data_lines = count_lines(DEFAULT_FILE) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert abs(eval_lines - EVAL_SPLIT_RATIO * original_data_lines) < 1 # allow for rounding errors + + +def test_seed_consistency(): + """Test if the same seed generates consistent splits""" + output_path_1 = os.path.join(OUTPUT_DIR, "seed-test-1") + output_path_2 = os.path.join(OUTPUT_DIR, "seed-test-2") + split_examples(DEFAULT_FILE, output_path_1, EVAL_SPLIT_RATIO, seed=12345) + split_examples(DEFAULT_FILE, output_path_2, EVAL_SPLIT_RATIO, seed=12345) + train_hash_1 = calculate_file_hash(os.path.join(output_path_1, "train.jsonl")) + train_hash_2 = calculate_file_hash(os.path.join(output_path_2, "train.jsonl")) + eval_hash_1 = calculate_file_hash(os.path.join(output_path_1, "eval.jsonl")) + eval_hash_2 = calculate_file_hash(os.path.join(output_path_2, "eval.jsonl")) + + assert train_hash_1 == train_hash_2 + assert eval_hash_1 == eval_hash_2 + + output_path_3 = os.path.join(OUTPUT_DIR, "seed-test-3") + split_examples(DEFAULT_FILE, output_path_3, EVAL_SPLIT_RATIO, seed=54321) + train_hash_3 = calculate_file_hash(os.path.join(output_path_3, "train.jsonl")) + eval_hash_3 = calculate_file_hash(os.path.join(output_path_3, "eval.jsonl")) + + assert train_hash_1 != train_hash_3 + assert eval_hash_1 != eval_hash_3 + + +def test_hf_data_split(): + """Test splitting a dataset from Hugging Face""" + output_path = os.path.join(OUTPUT_DIR, "hf-split-test") + split_eval_set_from_args( + "databricks/databricks-dolly-15k", "train", output_path, EVAL_SPLIT_RATIO + ) + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def _mock_get_file(remote_path: str, data_path: str, overwrite: bool): + with open(data_path, "w") as f: + for i in range(1000): + 
f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + + +def test_remote_store_data_split(): + """Test splitting a dataset from a remote store""" + output_path = os.path.join(OUTPUT_DIR, "remote-split-test") + with patch("composer.utils.get_file", side_effect=_mock_get_file) as mock_get_file: + split_eval_set_from_args( + "dbfs:/Volumes/test/test/test.jsonl", + "unique-split-name", + output_path, + EVAL_SPLIT_RATIO, + ) + mock_get_file.assert_called() + + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def test_missing_delta_file_error(): + # expects file 'TMPT_DIR/missing-00000-of-00001.jsonl + with pytest.raises(FileNotFoundError): + split_eval_set_from_args(TMPT_DIR, "missing", OUTPUT_DIR, EVAL_SPLIT_RATIO) + + +def test_unknown_file_format_error(): + with pytest.raises(ValueError): + split_eval_set_from_args("s3:/path/to/file.jsonl", "train", OUTPUT_DIR, EVAL_SPLIT_RATIO) From 4e7b357079e4feeb0f0935d22dd3c5540e3a9d90 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Mon, 16 Sep 2024 01:08:53 -0700 Subject: [PATCH 05/42] undo autoformat --- llmfoundry/command_utils/__init__.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 8757f3b1bc..4f74fe6ec9 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -37,23 +37,23 @@ ) __all__ = [ - "train", - "train_from_yaml", - "TrainConfig", - "TRAIN_CONFIG_KEYS", - "validate_config", - "evaluate", - "eval_from_yaml", - "convert_dataset_hf", - "convert_dataset_hf_from_args", - "convert_dataset_json", - "convert_dataset_json_from_args", - "convert_finetuning_dataset_from_args", - "convert_finetuning_dataset", - "convert_text_to_mds", - "convert_text_to_mds_from_args", - "convert_delta_to_json_from_args", - "fetch_DT", - "split_eval_set_from_args", - "split_examples", + 'train', + 'train_from_yaml', + 'TrainConfig', + 'TRAIN_CONFIG_KEYS', + 'validate_config', + 'evaluate', + 'eval_from_yaml', + 'convert_dataset_hf', + 'convert_dataset_hf_from_args', + 'convert_dataset_json', + 'convert_dataset_json_from_args', + 'convert_finetuning_dataset_from_args', + 'convert_finetuning_dataset', + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', + 'convert_delta_to_json_from_args', + 'fetch_DT', + 'split_eval_set_from_args', + 'split_examples', ] From 83ab9c30e0a2432bcc6213e4cb8b55296b13e438 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 16 Sep 2024 13:54:10 -0700 Subject: [PATCH 06/42] Replace FSDP args (#1517) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 8 ++++++-- tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 +++-- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..eca16bd815 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], 
list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], + parallelism_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,6 +99,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -316,7 +320,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From 0114f33da83b5e2c43f6399f69acd8401525a9e8 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Mon, 16 Sep 2024 17:09:12 -0700 Subject: [PATCH 07/42] enable correct padding_idx for embedding layers (#1527) --- llmfoundry/models/mpt/modeling_mpt.py | 1 
+ llmfoundry/models/utils/param_init_fns.py | 3 +++ tests/models/utils/test_param_init_fns.py | 27 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..cfe1172634 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -396,6 +396,7 @@ def __init__(self, config: MPTConfig): self.wte = SharedEmbedding( config.vocab_size, config.d_model, + padding_idx=config.pad_token_id, device=config.init_device, ) if self.learned_pos_emb: diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index 180e7b894c..8ad6e77c57 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ -224,6 +224,9 @@ def embedding_init( emb_init_fn_ = init_fn_ emb_init_fn_(module.weight) + if module.padding_idx is not None: + with torch.no_grad(): + module.weight[module.padding_idx].fill_(0) return True diff --git a/tests/models/utils/test_param_init_fns.py b/tests/models/utils/test_param_init_fns.py index 0eaf60c869..11d9fba430 100644 --- a/tests/models/utils/test_param_init_fns.py +++ b/tests/models/utils/test_param_init_fns.py @@ -199,3 +199,30 @@ def test_emb_init(emb_init_cfg: Optional[tuple[str, Union[int, list[int]]]]): emb_init_uniform_lim, ) == 2 and emb_init_uniform_lim[0] == emb_init_uniform_lim[1]: assert (model.emb.weight == emb_init_uniform_lim[0]).all() + + +@pytest.mark.parametrize( + 'padding_idx', + [0, 2], +) +def test_emb_padding_init(padding_idx: int,): + cfg: dict[str, Union[int, list[int]]] = { + 'vocab_size': 64, + 'in_features': 16, + 'n_layers': 2, + 'padding_idx': padding_idx, + 'emb_init_std': 5, + } + dict_cfg = om.create(cfg) + + model = nn.Embedding( + dict_cfg.vocab_size, + dict_cfg.in_features, + dict_cfg.padding_idx, + ) + + model.apply(partial(param_init_fns.get('kaiming_normal_'), **dict_cfg)) + assert isinstance(model, torch.nn.Embedding) + + if dict_cfg.get('emb_init_std') is not None: + assert (model.weight[padding_idx] == 0).all() From 9a1b78b128a242590b00f364a99d2d2d735f9468 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 10:29:09 -0700 Subject: [PATCH 08/42] Revert "Replace FSDP args" (#1533) --- llmfoundry/command_utils/eval.py | 8 ++------ tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 ++--- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index eca16bd815..f622ca182d 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - parallelism_config: Optional[dict[str, Any]], + fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,10 +99,6 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) - fsdp_config = parallelism_config.get( - 'fsdp_config', - None, - ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ 
-320,7 +316,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,8 +1042,7 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - parallelism_config={'fsdp': fsdp_config} - if fsdp_state_dict_type is not None else None, + fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1470,7 +1469,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 8e6c113169..69ced673a1 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 01acc22a60..56cb36c8c1 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 366bcf7786..a41574538a 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, ) assert trainer.state.fsdp_enabled From 7a23f60ad5ce25e80c3d5f3ab3badfb413743daa Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Tue, 17 Sep 2024 12:54:28 -0700 Subject: [PATCH 09/42] Delete unneeded inner base model in PEFT HF Checkpointer (#1532) --- llmfoundry/callbacks/hf_checkpointer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 4e6a501f2f..65bdcb3b6c 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -585,6 +585,7 @@ def tensor_hook( new_base_model_instance, original_model.peft_config[active_adapter], ) + del new_base_model_instance else: new_model_instance = type(original_model)(new_config) new_model_instance.generation_config.update( From 
2e3d14f6130ebad5a149c1c52f53fd07628e1006 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 13:45:04 -0700 Subject: [PATCH 10/42] Add deprecation warning to fsdp_config (#1530) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 35 ++++- .../inference/test_convert_composer_to_hf.py | 5 +- tests/eval/test_eval_deprecation.py | 125 ++++++++++++++++++ tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 6 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 tests/eval/test_eval_deprecation.py diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..e644ad1f0f 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -4,6 +4,7 @@ import logging import os import time +import warnings from typing import Any, Optional, Union import pandas as pd @@ -11,7 +12,7 @@ from composer.core import Callback from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device, parallelism, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -36,6 +37,7 @@ process_init_device, ) from llmfoundry.utils.registry_utils import import_file +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -52,7 +54,6 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -62,9 +63,33 @@ def evaluate_model( callback_configs: Optional[dict[str, Any]], metadata: Optional[dict[str, str]], logged_config: dict[str, Any], + fsdp_config: Optional[dict[str, Any]] = None, + parallelism_config: Optional[dict[str, Any]] = None, should_log_config: bool = True, load_path: Optional[str] = None, ): + if parallelism_config: + deprecated_fsdp_args = list( + parallelism.FSDPConfig.__annotations__.keys(), + ) + for deprecated_arg in deprecated_fsdp_args: + if deprecated_arg in parallelism_config: + raise ValueError( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + ) + + if fsdp_config: + warnings.warn( + VersionedDeprecationWarning( + 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', + remove_version='0.13.0', + ), + ) + if fsdp_config and parallelism_config: + raise ValueError( + 'Both fsdp_config and parallelism_config cannot be provided at the same time. 
Please use parallelism_config.', + ) + log.info(f'Evaluating model: {model_name}') # Build tokenizer and model tokenizer_cfg = tokenizer @@ -99,6 +124,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else fsdp_config if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -146,7 +175,7 @@ def evaluate_model( callbacks=callbacks, loggers=loggers, precision=precision, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, load_path=load_path, load_weights_only=True, progress_bar=False, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/eval/test_eval_deprecation.py b/tests/eval/test_eval_deprecation.py new file mode 100644 index 0000000000..828186245a --- /dev/null +++ b/tests/eval/test_eval_deprecation.py @@ -0,0 +1,125 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import unittest +import warnings + +from llmfoundry.command_utils.eval import evaluate_model +from llmfoundry.utils.warnings import VersionedDeprecationWarning + + +class TestEvaluateModelDeprecation(unittest.TestCase): + + def setUp(self): + self.common_args = { # type: ignore + 'tokenizer': { + 'name': 'test_tokenizer', + }, + 'model': { + 'name': 'test_model', + }, + 'model_name': 'test', + 'dist_timeout': 60, + 'run_name': 'test_run', + 'seed': 42, + 'icl_tasks': [], + 'max_seq_len': 512, + 'device_eval_batch_size': 1, + 'eval_gauntlet_config': None, + 'eval_loader_config': None, + 'loggers': [], + 'python_log_level': None, + 'precision': 'fp32', + 'eval_gauntlet_df': None, + 'eval_subset_num_batches': 1, + 'icl_subset_num_batches': None, + 'callback_configs': None, + 'metadata': None, + 'logged_config': {}, + } + + def test_no_deprecation_warning(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + import composer.utils.parallelism + deprecated_fsdp_args = list( + composer.utils.parallelism.FSDPConfig.__annotations__.keys(), + ) + print(deprecated_fsdp_args) + + try: + parallelism_config = {'fsdp': {'verbose': True}} + evaluate_model( + **self.common_args, + parallelism_config=parallelism_config, + ) + except ValueError as ve: + if 'parallelism_config cannot contain deprecated fsdp_config arguments.' 
in str( + ve, + ): + self.fail( + 'Raised ValueError about deprecated fsdp_config arguments', + ) + elif 'Both fsdp_config and parallelism_config cannot be provided at the same time.' in str( + ve, + ): + self.fail( + 'Raised ValueError about both configs being provided', + ) + except Exception: + pass + + deprecation_warnings = [ + warning for warning in w + if isinstance(warning.message, VersionedDeprecationWarning) + ] + if deprecation_warnings: + self.fail('VersionedDeprecationWarning was raised') + + def test_deprecation_warning_with_deprecated_arg(self): + # Use assertRaises to catch the expected ValueError + with self.assertRaises(ValueError) as context: + # Directly call evaluate_model; do not use try-except here + evaluate_model( + **self.common_args, + parallelism_config={'activation_checkpointing': True}, + ) + + # Assert that the correct error message is in the exception + self.assertIn( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + str(context.exception), + ) + + def test_deprecation_warning_with_fsdp_config(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + try: + evaluate_model( + **self.common_args, + parallelism_config=None, + fsdp_config={'verbose': True}, + ) + except Exception: + pass + + self.assertTrue( + any( + issubclass(warning.category, VersionedDeprecationWarning) + for warning in w + ), + ) + + def test_error_with_both_fsdp_and_parallelism_config(self): + with self.assertRaises(ValueError) as context: + evaluate_model( + **self.common_args, + parallelism_config={'some_arg': True}, + fsdp_config={'some_arg': True}, + ) + + self.assertIn( + 'Both fsdp_config and parallelism_config cannot be provided at the same time.', + str(context.exception), + ) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From d7c78229e91129d4c35006209fabd5fb2f2252e9 Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Sun, 22 Sep 2024 14:03:42 -0400 Subject: [PATCH 11/42] Fix reuse kv cache for torch attention (#1539) --- llmfoundry/models/layers/attention.py | 3 +++ tests/models/layers/test_flash_torch.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index a1af2235cf..625327767e 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -656,6 +656,9 @@ def get_qkv( 'prev_layer_key_value is None, cannot reuse_prev_layer_kv.', ) key, value = prev_layer_key_value + if self.attn_impl == 'torch': + key = rearrange(key, 'b h d s -> b s (h d)') + value = rearrange(value, 'b h s d -> b s (h d)') query = self.Wq(x) if self.clip_qkv: diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 01a6a7576d..0a4b32a73a 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -188,7 +188,7 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) - if attn_impl != 'flash' and attn_uses_sequence_id and sequence_id is not None: + if attn_impl == 'torch' and attn_uses_sequence_id and sequence_id is not None: assert isinstance(attn_bias, torch.Tensor) # pyright attn_bias = apply_sequence_id( attn_bias, @@ -561,8 +561,10 @@ def test_grouped_query_invalid_heads(): }, }], ) +@pytest.mark.parametrize('attn_impl', ['flash', 'torch']) def test_reuse_prev_layer_kv_cache( pos_emb_config: dict, + attn_impl: str, device: str = 'cuda', ): """Checks reusing previous layer's kv cache.""" @@ -570,7 +572,7 @@ def test_reuse_prev_layer_kv_cache( rope = pos_emb_config['rope'] cfg = { - 'attn_impl': 'flash', + 'attn_impl': attn_impl, 'd_model': 64, 'n_heads': 4, 'attn_pdrop': 0, @@ -630,6 +632,13 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) + if attn_impl == 'torch': + assert isinstance(attn_bias, torch.Tensor) # pyright + attn_bias = apply_sequence_id( + attn_bias, + sequence_id, # type: ignore + s, + ) return attn_bias @@ -637,7 +646,7 @@ def gen_bias(attn_impl: str): sequence_id=sequence_id, S=s, attn_uses_sequence_id=True, - attn_impl='flash', + attn_impl=attn_impl, attention_mask=attention_mask, ) @@ -656,7 +665,7 @@ def gen_bias(attn_impl: str): x1.requires_grad = True with torch.autocast(x0.device.type): - attn_bias_0 = gen_bias('flash') + attn_bias_0 = gen_bias(attn_impl) alibi_slopes_0 = None if alibi: alibi_slopes_0 = gen_slopes( @@ -703,7 +712,7 @@ def gen_bias(attn_impl: str): flash_attn_padding_info=flash_attn_padding_info, alibi_slopes=alibi_slopes_0, ) - attn_bias_1 = gen_bias('flash') + attn_bias_1 = gen_bias(attn_impl) alibi_slopes_1 = None if alibi: alibi_slopes_1 = gen_slopes( From 14cff668750dc08eb4511ddee0d55b127e711dea Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 22 Sep 2024 19:49:21 -0400 Subject: [PATCH 12/42] Error on text dataset file not found (#1534) --- .../data_prep/convert_text_to_mds.py | 15 ++++++++++----- llmfoundry/utils/exceptions.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 9a1f8a912d..3ea5aeb5d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -32,6 +32,7 @@ CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, + InputFolderNotFound, OutputFolderNotEmptyError, ) @@ -125,11 +126,15 @@ def get_object_names(input_folder: str) -> list[str]: object_store = maybe_create_object_store_from_uri(input_folder) if object_store is not None: _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in 
object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - log.info(f'Found {len(names)} text files in remote storage') + try: + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + log.info(f'Found {len(names)} text files in remote storage') + except FileNotFoundError: + raise InputFolderNotFound(folder_prefix) + else: # input_folder is a local folder names = [ diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 11895564f2..900355dff5 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -348,6 +348,17 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) +class InputFolderNotFound(UserError): + """Error thrown when the a folder is not found.""" + + def __init__(self, folder_that_was_not_found: str) -> None: + message = f'{folder_that_was_not_found} not found.' + super().__init__( + message, + folder_that_was_not_found=folder_that_was_not_found, + ) + + class CannotUnicodeDecodeFile(UserError): """Error thrown when the input folder is missing data.""" From a2c0507795a887b6fb71d3ef975b714523fe2abb Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Sun, 22 Sep 2024 18:23:51 -0700 Subject: [PATCH 13/42] Make ICL tasks not required for eval (#1540) --- llmfoundry/command_utils/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index e644ad1f0f..70c4319ea8 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -262,7 +262,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: EvalConfig, EVAL_CONFIG_KEYS, transforms=[allow_toplevel_keys], - icl_tasks_required=True, + icl_tasks_required=False, ) model_configs = eval_config.models @@ -273,7 +273,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: # Mandatory Evaluation Parameters icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str if icl_tasks is None: - raise ValueError('icl_tasks must be specified in the config') + icl_tasks = [] # Optional Evaluation Parameters with default values eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders From 85403c086710bc0f62d03fc03c0fcbb2e5ffda1d Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:37:26 -0400 Subject: [PATCH 14/42] Bumping flash attention version to 2.6.3 and adding option for softcap in attention and lm_head logits. 
(#1374) --- llmfoundry/models/layers/attention.py | 24 +++++- llmfoundry/models/mpt/configuration_mpt.py | 14 +++ llmfoundry/models/mpt/modeling_mpt.py | 6 ++ llmfoundry/models/utils/config_defaults.py | 1 + setup.py | 2 +- tests/models/layers/test_flash_attn.py | 99 +++++++++++++++++++++- 6 files changed, 140 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 625327767e..612d6b9642 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -112,6 +112,7 @@ def scaled_multihead_dot_product_attention( dropout_p: float = 0.0, training: bool = False, needs_weights: bool = False, + attn_logit_softcapping: Optional[float] = None, sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: @@ -149,6 +150,11 @@ def scaled_multihead_dot_product_attention( attn_weight = q.matmul(k) * softmax_scale + if attn_logit_softcapping is not None: + attn_weight = attn_logit_softcapping * torch.tanh( + attn_weight / attn_logit_softcapping, + ) + if attn_bias is not None: # clamp to 0 necessary for torch 2.0 compile() _s_q = max(0, attn_bias.size(2) - s_q) @@ -264,6 +270,7 @@ def flash_attn_fn( sliding_window_size: int = -1, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, + attn_logit_softcapping: Optional[float] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: if key_padding_mask is not None: @@ -381,13 +388,17 @@ def flash_attn_fn( return_attn_probs=needs_weights, ) elif is_flash_v2_installed(): - alibi_kwargs = {} + extra_attn_kwargs = {} if check_alibi_support('flash'): - alibi_kwargs = {'alibi_slopes': alibi_slopes} + extra_attn_kwargs['alibi_slopes'] = alibi_slopes elif alibi_slopes is not None: raise ValueError( 'alibi_slopes is only supported for flash-attn>=2.4.2', ) + if is_flash_v2_installed( + v2_version='v2.6.2', + ) and attn_logit_softcapping is not None: + extra_attn_kwargs['softcap'] = attn_logit_softcapping output_unpad = flash_attn_interface.flash_attn_varlen_func( q=query_unpad, k=key_unpad, @@ -401,7 +412,7 @@ def flash_attn_fn( causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), - **alibi_kwargs, + **extra_attn_kwargs, ) else: raise RuntimeError( @@ -448,6 +459,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__() @@ -463,6 +475,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.sliding_window_size = sliding_window_size self.reuse_kv_layer_idx = reuse_kv_layer_idx + self.attn_logit_softcapping = attn_logit_softcapping self.kv_dim = kv_dim if kv_dim is not None else self.d_model self.head_dim = d_model // n_heads @@ -625,6 +638,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + attn_logit_softcapping=self.attn_logit_softcapping, sliding_window_size=self.sliding_window_size, **extra_attn_kwargs, ) @@ -853,6 +867,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -873,6 +888,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, 
reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) @@ -902,6 +918,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -922,6 +939,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 91b431e3b4..dbcabdf5f9 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -51,6 +51,7 @@ def __init__( tie_word_embeddings: bool = True, use_pad_tok_in_ffn: bool = True, block_overrides: Optional[dict[str, Any]] = None, + final_logit_softcapping: Optional[float] = None, **kwargs: Any, ): """The MPT configuration class. @@ -148,6 +149,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + final_logit_softcapping (float | None): Softcapping threshold for final logit. Set to None to disable (default value None). Please see https://arxiv.org/pdf/2403.08295 for more details. kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model @@ -181,6 +183,7 @@ def __init__( if block_overrides is not None: self._validate_block_overrides(block_overrides) self.block_overrides = block_overrides + self.final_logit_softcapping = final_logit_softcapping if isinstance(fc_type, str): fc_type = {'name': fc_type} @@ -325,6 +328,17 @@ def _validate_config(self) -> None: raise NotImplementedError( 'sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).', ) + if self.attn_config['attn_logit_softcapping'] is not None: + if self.attn_config['attn_logit_softcapping'] <= 0: + raise ValueError( + 'Attention attn_logit_softcapping should be positive.', + ) + if self.attn_config[ + 'attn_impl' + ] == 'flash' and not is_flash_v2_installed(v2_version='v2.6.2',): + raise NotImplementedError( + 'Attention attn_logit_softcapping is only implemented with torch attention or flash attention v2.6.2 (or higher).', + ) if self.attn_config['kv_dim'] is not None and self.attn_config[ 'fused_qkv']: raise ValueError( diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index cfe1172634..9212f5594d 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1071,6 +1071,7 @@ def __init__(self, config: MPTConfig): f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.", ) self.logit_scale = logit_scale + self.final_logit_softcapping = config.final_logit_softcapping @property def backbone_model_class(self) -> type[MPTModel]: @@ -1172,6 +1173,11 @@ def forward( ) logits *= self.logit_scale + if self.final_logit_softcapping is not None: + logits = self.final_logit_softcapping * torch.tanh( + logits / self.final_logit_softcapping, + ) + loss = None if labels is not None: _labels = torch.roll(labels, shifts=-1) diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index bd3b29a479..5550785149 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -18,6 +18,7 @@ 'softmax_scale': None, 'attn_uses_sequence_id': False, 
'sliding_window_size': -1, + 'attn_logit_softcapping': None, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, diff --git a/setup.py b/setup.py index 0a75c610b8..ebc66fdacf 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ # Flash 2 group kept for backwards compatibility extra_deps['gpu-flash2'] = [ - 'flash-attn>=2.5.8,<3', + 'flash-attn>=2.6.3,<3', ] extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py index 987ea7160a..666d93c9b4 100644 --- a/tests/models/layers/test_flash_attn.py +++ b/tests/models/layers/test_flash_attn.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import math +from typing import Optional import pytest import torch @@ -334,5 +335,99 @@ def gen_bias(): _assert_approx_equal(value_1.grad, value_2.grad) -def _assert_approx_equal(value1: torch.Tensor, value2: torch.Tensor): - assert torch.norm(value2 - value1) <= 1e-2 + 1e-2 * torch.norm(value2) +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.6.2'), + reason= + 'attn_logit_softcapping only supported by Flash Attention after v2.6.2.', +) +@pytest.mark.parametrize( + 'attn_logit_softcapping', + [None, 0.1, 1.0, 10.0, 100.0], +) +def test_attn_logit_softcapping(attn_logit_softcapping: Optional[float]): + # Test that attn_logit_softcapping in attention works as expected. + dtype = torch.bfloat16 + device = 'cuda' + d = 128 + seqlen_1 = 8 + bsz = 2 + n_heads = 4 + + query_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + value_1.requires_grad = True + output_1, _, _ = flash_attn_fn( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + flash_attn_padding_info=gen_flash_attn_padding_info( + bsz, + seqlen_1, + 0, + query_1.device, + None, + None, + ), + should_repeat_kv_for_gqa=True, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + output_2, _, _ = scaled_multihead_dot_product_attention( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_2.sum().backward() + + _assert_approx_equal(output_1, output_2) + assert (query_2.grad is not None) and (query_1.grad is not None) + _assert_approx_equal(query_1.grad, query_2.grad) + assert (key_2.grad is not None) and (key_1.grad is not None) + _assert_approx_equal(key_1.grad, key_2.grad) + assert (value_2.grad is not None) and (value_1.grad is not None) + _assert_approx_equal(value_1.grad, value_2.grad) + + +def _assert_approx_equal( + value1: torch.Tensor, + value2: torch.Tensor, + atol: float = 1e-2, + rtol: float = 1e-2, +): + actual_difference = torch.norm(value2 - value1) + allowed_difference = 
atol + rtol * torch.norm(value2) + assert actual_difference < allowed_difference, f'{actual_difference=}, {allowed_difference=}' From f377090dec102afc646fb29a4510ded6ae74ecf9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:00:07 -0700 Subject: [PATCH 15/42] Register mosaic logger (#1542) --- llmfoundry/loggers/__init__.py | 2 ++ tests/loggers/test_mosaic_ml_logger.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/loggers/test_mosaic_ml_logger.py diff --git a/llmfoundry/loggers/__init__.py b/llmfoundry/loggers/__init__.py index cd3f3fdc62..c60d9be2cd 100644 --- a/llmfoundry/loggers/__init__.py +++ b/llmfoundry/loggers/__init__.py @@ -4,6 +4,7 @@ from composer.loggers import ( InMemoryLogger, MLFlowLogger, + MosaicMLLogger, TensorboardLogger, WandBLogger, ) @@ -18,3 +19,4 @@ func=InMemoryLogger, ) # for backwards compatibility loggers.register('mlflow', func=MLFlowLogger) +loggers.register('mosaicml', func=MosaicMLLogger) diff --git a/tests/loggers/test_mosaic_ml_logger.py b/tests/loggers/test_mosaic_ml_logger.py new file mode 100644 index 0000000000..e9c003321b --- /dev/null +++ b/tests/loggers/test_mosaic_ml_logger.py @@ -0,0 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.loggers import MosaicMLLogger + +from llmfoundry.utils.builders import build_logger + + +def test_mosaic_ml_logger_constructs(): + mosaic_ml_logger = build_logger( + 'mosaicml', + kwargs={'ignore_exceptions': True}, + ) + + assert isinstance(mosaic_ml_logger, MosaicMLLogger) + assert mosaic_ml_logger.ignore_exceptions == True From d85c83b15d5b07a1b8cd00eaa7e400aaf7b22ea7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 23 Sep 2024 23:24:16 -0700 Subject: [PATCH 16/42] Hfcheckpointer optional generation config (#1543) Co-authored-by: v-chen_data --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++- .../inference/test_convert_composer_to_hf.py | 56 ++++++++++++++++++- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 65bdcb3b6c..4365a5b2e5 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -588,9 +588,10 @@ def tensor_hook( del new_base_model_instance else: new_model_instance = type(original_model)(new_config) - new_model_instance.generation_config.update( - **original_model.generation_config.to_dict(), - ) + if new_model_instance.generation_config is not None: + new_model_instance.generation_config.update( + **original_model.generation_config.to_dict(), + ) # Then load the state dict in with "assign" so that the state dict # is loaded properly even though the model is initially on meta device. 
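As context for the hunk above: some `transformers` model classes construct with `generation_config` set to `None`, so the checkpointer can only copy the original model's generation settings when the freshly built instance actually has a config object. A minimal sketch of that guard, assuming standard `transformers.GenerationConfig` objects (the helper name is illustrative, not part of this patch):

from typing import Optional

from transformers import GenerationConfig


def copy_generation_config(
    src: Optional[GenerationConfig],
    dst: Optional[GenerationConfig],
) -> Optional[GenerationConfig]:
    # Guard both sides: either model may have been built without a
    # generation config, in which case there is nothing to copy.
    if src is not None and dst is not None:
        dst.update(**src.to_dict())
    return dst

Calling code would pass `original_model.generation_config` as `src` and the new instance's config as `dst`, mirroring the `if new_model_instance.generation_config is not None` check above.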
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..bf5f2a970b 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -8,13 +8,14 @@ import pathlib import shutil from argparse import Namespace -from typing import Any, Callable, Optional, cast +from typing import Any, Callable, Optional, Union, cast from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue import pytest import torch +import torch.nn as nn import transformers from composer import ComposerModel, Trainer from composer.loggers import MLFlowLogger @@ -23,7 +24,13 @@ from omegaconf import OmegaConf as om from torch.distributed._tensor.api import DTensor from torch.utils.data import DataLoader -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import ( + AutoConfig, + GenerationConfig, + PretrainedConfig, + PreTrainedModel, + PreTrainedTokenizerBase, +) from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename @@ -1637,3 +1644,48 @@ def test_license_file_finder( found_path = _maybe_get_license_filename(str(tmp_path)) assert (found_path == license_file_name ) if license_file_name is not None else (found_path is None) + + +@pytest.mark.parametrize('generation_config', [None, {}, {'max_length': 200}]) +def test_generation_config_variants( + generation_config: Optional[Union[dict[str, Any], GenerationConfig]], +): + + class MockModel(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + # Ensure generation_config is always a GenerationConfig object + if isinstance(config.generation_config, dict): + self.generation_config = GenerationConfig( + **config.generation_config, + ) + else: + self.generation_config = config.generation_config + + config = AutoConfig.from_pretrained('gpt2') + # Convert dict to GenerationConfig if needed + if isinstance(generation_config, dict): + generation_config = GenerationConfig(**generation_config) + config.generation_config = generation_config + + mock_model = MockModel(config) + logger = MagicMock() + state = MagicMock() + state.timestamp.batch = 1 + state.is_model_ddp = False + state.model.model = mock_model + state.model.tokenizer = None + + checkpointer = HuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + ) + + checkpointer._save_checkpoint( + state=state, + logger=logger, + upload_to_save_folder=False, + register_to_mlflow=False, + ) From 275a2a40d86a36882cc7963e2677628e05aaaf01 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:57:21 -0700 Subject: [PATCH 17/42] Bump composer version to 0.25.0 (#1546) --- setup.py | 8 ++++---- tests/a_scripts/inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ebc66fdacf..48c1326b0d 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.24.1,<0.25', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.25.0,<0.26', 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.24.1,<0.25', + 'mosaicml[databricks]>=0.25.0,<0.26', 'numpy<2', 
'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.24.1,<0.25', + 'mosaicml[tensorboard]>=0.25.0,<0.26', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.24.1,<0.25', + 'mosaicml[peft]>=0.25.0,<0.26', ] extra_deps['openai'] = [ diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index bf5f2a970b..c25432dc48 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1563,6 +1563,8 @@ def test_mptmoe_huggingface_conversion_callback( # Check output equivalence loaded_model = loaded_model.cuda().bfloat16() # type: ignore + for k, v in batch.items(): + batch[k] = v.cuda() loaded_model_logits = loaded_model( input_ids=batch.get('input_ids', None), attention_mask=batch.get('attention_mask', None), From 151a2e297b603d84e1e4dfed389c3494990936e6 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:53:05 -0700 Subject: [PATCH 18/42] Bump streaming version to 0.9.0 (#1550) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 48c1326b0d..d1979faf63 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', - 'mosaicml-streaming>=0.8.1,<0.9', + 'mosaicml-streaming>=0.9.0,<0.10', 'torch>=2.4.0,<2.4.1', 'datasets>=2.19,<2.20', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 722526d420dab9adc5a5be18425d5e08c97ee0c8 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:25:27 -0700 Subject: [PATCH 19/42] Bump version to 0.13.0.dev0 (#1549) --- llmfoundry/_version.py | 2 +- llmfoundry/command_utils/eval.py | 2 +- llmfoundry/models/hf/model_wrapper.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py index 2f1f590b19..0cddcaf967 100644 --- a/llmfoundry/_version.py +++ b/llmfoundry/_version.py @@ -3,4 +3,4 @@ """The LLM Foundry Version.""" -__version__ = '0.12.0.dev0' +__version__ = '0.13.0.dev0' diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index 70c4319ea8..73127e8a07 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -82,7 +82,7 @@ def evaluate_model( warnings.warn( VersionedDeprecationWarning( 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) if fsdp_config and parallelism_config: diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index c8805e5d6d..f2b67db1ec 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -48,7 +48,7 @@ def __init__( warnings.warn( VersionedDeprecationWarning( '`HuggingFaceModelWithFSDP` is deprecated. 
In the future please use `BaseHuggingFaceModel`.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) super().__init__( From c786defb6b6175243cd9e4a1b69918488ba7e3b9 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 25 Sep 2024 14:34:40 -0700 Subject: [PATCH 20/42] Add proper user error for accessing schema (#1548) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 24 ++++++++++++- .../data_prep/test_convert_delta_to_json.py | 35 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 666d0278c6..d676fc2165 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -233,7 +233,27 @@ def run_query( elif method == 'dbconnect': if spark == None: raise ValueError(f'sparkSession is required for dbconnect') - df = spark.sql(query) + + try: + df = spark.sql(query) + except Exception as e: + from pyspark.errors import AnalysisException + if isinstance(e, AnalysisException): + if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore + match = re.search( + r"Schema\s+'([^']+)'", + e.message, # pyright: ignore + ) + if match: + schema_name = match.group(1) + action = f'using the schema {schema_name}' + else: + action = 'using the schema' + raise InsufficientPermissionsError(action=action,) from e + raise RuntimeError( + f'Error in querying into schema. Restart sparkSession and try again', + ) from e + if collect: return df.collect() return df @@ -461,6 +481,8 @@ def fetch( raise InsufficientPermissionsError( action=f'reading from {tablename}', ) from e + if isinstance(e, InsufficientPermissionsError): + raise e raise RuntimeError( f'Error in get rows from {tablename}. Restart sparkSession and try again', ) from e diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index e623467bf7..bbb03a26d9 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -1,12 +1,14 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import sys import unittest from argparse import Namespace from typing import Any from unittest.mock import MagicMock, mock_open, patch from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( + InsufficientPermissionsError, download, fetch_DT, format_tablename, @@ -17,6 +19,39 @@ class TestConvertDeltaToJsonl(unittest.TestCase): + def test_run_query_dbconnect_insufficient_permissions(self): + error_message = ( + '[INSUFFICIENT_PERMISSIONS] Insufficient privileges: User does not have USE SCHEMA ' + "on Schema 'main.oogabooga'. 
SQLSTATE: 42501"
+        )
+
+        class MockAnalysisException(Exception):
+
+            def __init__(self, message: str):
+                self.message = message
+
+        with patch.dict('sys.modules', {'pyspark.errors': MagicMock()}):
+            sys.modules[
+                'pyspark.errors'
+            ].AnalysisException = MockAnalysisException  # pyright: ignore
+
+            mock_spark = MagicMock()
+            mock_spark.sql.side_effect = MockAnalysisException(error_message)
+
+            with self.assertRaises(InsufficientPermissionsError) as context:
+                run_query(
+                    'SELECT * FROM table',
+                    method='dbconnect',
+                    cursor=None,
+                    spark=mock_spark,
+                )
+
+            self.assertIn(
+                'using the schema main.oogabooga',
+                str(context.exception),
+            )
+            mock_spark.sql.assert_called_once_with('SELECT * FROM table')
+
     @patch(
         'databricks.sql.connect',
     )
From e6b8d142c3c8133f21b9e1d7c05927201976b2e8 Mon Sep 17 00:00:00 2001
From: Vincent Chen
Date: Wed, 25 Sep 2024 15:47:48 -0700
Subject: [PATCH 21/42] Validate Cluster Access Mode (#1551)

Co-authored-by: v-chen_data
Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 .../data_prep/convert_delta_to_json.py        | 12 +++++++++++
 llmfoundry/utils/exceptions.py                | 13 ++++++++++++
 .../data_prep/test_convert_delta_to_json.py   | 20 +++++++++++++++----
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
index d676fc2165..fbbc5f2cd9 100644
--- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
+++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -20,6 +20,7 @@
 
 from llmfoundry.utils.exceptions import (
     ClusterDoesNotExistError,
+    ClusterInvalidAccessMode,
     FailedToConnectToDatabricksError,
     FailedToCreateSQLConnectionError,
     InsufficientPermissionsError,
@@ -568,6 +569,17 @@ def validate_and_get_cluster_info(
     if res is None:
         raise ClusterDoesNotExistError(cluster_id)
 
+    data_security_mode = str(
+        res.data_security_mode,
+    ).upper()[len('DATASECURITYMODE.'):]
+
+    # NONE stands for No Isolation Shared
+    if data_security_mode == 'NONE':
+        raise ClusterInvalidAccessMode(
+            cluster_id=cluster_id,
+            access_mode=data_security_mode,
+        )
+
     assert res.spark_version is not None
     stripped_runtime = re.sub(
         r'[a-zA-Z]',
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index 900355dff5..265b9bbe8f 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -318,6 +318,19 @@ def __init__(self, cluster_id: str) -> None:
         super().__init__(message, cluster_id=cluster_id)
 
 
+class ClusterInvalidAccessMode(NetworkError):
+    """Error thrown when the cluster has an invalid access mode."""
+
+    def __init__(self, cluster_id: str, access_mode: str) -> None:
+        message = f'Cluster with id {cluster_id} has access mode {access_mode}. ' + \
+            'Please make sure the cluster used has access mode Shared or Single User!'
+ super().__init__( + message, + cluster_id=cluster_id, + access_mode=access_mode, + ) + + class FailedToCreateSQLConnectionError( NetworkError, ): diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index bbb03a26d9..b1a9f1e878 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -264,7 +264,10 @@ def test_dbconnect_called( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.1.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.1.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response mock_remote = MagicMock() @@ -321,7 +324,10 @@ def test_sqlconnect_called_dbr13( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='13.0.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='13.0.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( @@ -373,7 +379,10 @@ def test_sqlconnect_called_dbr14( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.2.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( @@ -425,7 +434,10 @@ def test_sqlconnect_called_https( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.2.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( From dc58bb7eb95e52874774e1d5a7669a1a5f194429 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 26 Sep 2024 09:40:13 -0700 Subject: [PATCH 22/42] Update mcli yamls (#1552) --- mcli/mcli-1b-eval.yaml | 4 ++-- mcli/mcli-1b-max-seq-len-8k.yaml | 4 ++-- mcli/mcli-1b.yaml | 4 ++-- mcli/mcli-benchmark-mpt.yaml | 4 ++-- mcli/mcli-convert-composer-to-hf.yaml | 4 ++-- mcli/mcli-hf-eval.yaml | 4 ++-- mcli/mcli-hf-generate.yaml | 4 ++-- mcli/mcli-llama2-finetune.yaml | 4 ++-- mcli/mcli-openai-eval.yaml | 4 ++-- mcli/mcli-pretokenize-oci-upload.yaml | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index 4fcf8b3cb9..bd6a7b538a 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts/ composer eval/eval.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-eval compute: diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index fb96c576e0..b437bc5f0d 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: 
mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -17,7 +17,7 @@ command: | --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-ctx-8k-gpus-8 compute: diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 26255977f4..789fc4fc02 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -21,7 +21,7 @@ command: | eval_loader.dataset.split=val_small \ max_duration=100ba \ eval_interval=0 -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-gpus-8 compute: diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index 3995598fd3..0c023f9a83 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -6,12 +6,12 @@ compute: # cluster: TODO # Name of the cluster to use for this run # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index 7b715f6792..a211e3baeb 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo @@ -13,7 +13,7 @@ command: | --hf_output_path s3://bucket/folder/hf/ \ --output_precision bf16 \ -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: convert-composer-hf compute: diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index 27f5938d67..9bcebfbea0 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -16,7 +16,7 @@ gpu_num: 8 # gpu_type: # cluster: # replace with your cluster here! 
-image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index cb3040e4ee..85a0f6b0e4 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -35,7 +35,7 @@ command: | "Here's a quick recipe for baking chocolate chip cookies: Start by" \ "The best 5 cities to visit in Europe are" -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: hf-generate compute: diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 7134e6204c..210e8942b5 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: llama2-finetune compute: diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index cd04d89f4e..987fc829a9 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo @@ -16,7 +16,7 @@ gpu_num: # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 5425ce9897..49fbbb08d8 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -1,5 +1,5 @@ name: c4-2k-pre-tokenized -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest compute: gpus: 8 # Number of GPUs to use @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo From 3b1fc4ae5c205118901fcf1557260952fe844e2e Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:23:34 -0400 Subject: [PATCH 23/42] Use `allenai/c4` instead of `c4` dataset (#1554) Co-authored-by: Eitan Turok --- README.md | 2 +- TUTORIAL.md | 4 ++-- .../data_prep/convert_dataset_hf.py | 4 ++-- .../data_prep/convert_dataset_json.py | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- mcli/mcli-1b.yaml | 2 +- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- scripts/data_prep/README.md | 2 +- scripts/train/README.md | 6 ++--- .../train/benchmarking/submit_benchmarks.py | 2 +- .../data_prep/test_convert_dataset_hf.py | 2 +- tests/a_scripts/eval/test_eval.py | 11 +++++----- tests/a_scripts/train/test_train.py | 22 ++++++++++--------- tests/data/test_dataloader.py | 6 ++--- tests/data_utils.py | 2 +- 15 files changed, 37 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 0fabb98653..bc4eff48fd 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ cd scripts # Convert C4 dataset to StreamingDataset format python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' diff --git a/TUTORIAL.md b/TUTORIAL.md index 3be4910c4f..d1751f62e3 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su ```bash python scripts/data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-adaptation-data --splits train_small val_small \ --concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \ --compression zstd @@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared ```bash python scripts/data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer gpt2 \ --eos_text '<|endoftext|>' \ diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py index 0ea94ac687..2667407110 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -158,7 +158,7 @@ def __init__( truncated_samples=100, ) -CONSTS = {'c4': c4constants, 'the_pile': pileconstants} +CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants} def build_hf_dataset( @@ -335,7 +335,7 @@ def convert_dataset_hf( dataset_constants = CONSTS[dataset] except KeyError: raise ValueError( - f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.', + f'Constants for dataset "{dataset}" not found. 
Currently only "the_pile" and "allenai/c4" are supported.', ) if concat_tokens is not None and tokenizer is not None: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py index 35d7e637e6..c6f7d51c02 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -43,7 +43,7 @@ def build_hf_dataset( no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. - Typically "all" (The Pile) or "en" (c4). + Typically "all" (The Pile) or "en" (allenai/c4). Returns: An IterableDataset. diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index b437bc5f0d..1d48cd8105 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -13,7 +13,7 @@ integrations: command: | cd llm-foundry/scripts python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 789fc4fc02..71566d4c46 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -13,7 +13,7 @@ integrations: command: | cd llm-foundry/scripts python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py train/yamls/pretrain/mpt-1b.yaml \ diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 49fbbb08d8..a3e8c40b88 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -24,7 +24,7 @@ command: | # Run the dataset conversion python convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 \ --splits val_small val train_small train \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' diff --git a/scripts/data_prep/README.md b/scripts/data_prep/README.md index 3601cc865f..b72caeebc4 100644 --- a/scripts/data_prep/README.md +++ b/scripts/data_prep/README.md @@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`. ```bash # Convert C4 dataset to StreamingDataset format python convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \ --compression zstd diff --git a/scripts/train/README.md b/scripts/train/README.md index 6730cb793b..247814d782 100644 --- a/scripts/train/README.md +++ b/scripts/train/README.md @@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md# To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming. 
Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format. -As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here. +As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here. We first convert the dataset from its native format (a collection of zipped JSONs) to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files. @@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth. You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total. Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding ```bash -python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' +python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' ``` Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc. The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space! ```bash -python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' +python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' ``` For any of the above commands, you can also choose to compress the `.mds` files. 
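To sanity-check the converted output before training, the shards can be read back with StreamingDataset. A minimal sketch, assuming the local output path from the quickstart commands above and the converter's historical encoding of each sample as raw int64 token bytes under a 'tokens' key (adjust the field name and dtype if your version writes ndarrays instead):

import numpy as np
from streaming import StreamingDataset

# Read the converted val_small split straight from the local output directory.
dataset = StreamingDataset(local='./my-copy-c4', split='val_small', shuffle=False)

sample = dataset[0]
tokens = np.frombuffer(sample['tokens'], dtype=np.int64)
print(tokens.shape)  # expect (2048,) when converted with --concat_tokens 2048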
diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index fd7be1fc6d..27f5c26c7d 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -479,7 +479,7 @@ def run_config( if args.data_remote is None: command += f""" cd llm-foundry/scripts - python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>' + python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml """ else: diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py index e09c54ca70..da1e101ae7 100644 --- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py +++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py @@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path): # test calling it directly path = os.path.join(tmp_path, 'my-copy-c4-1') convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=['val_xsmall'], out_root=path, diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index fc0dc8a882..f1b76913d1 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -121,7 +121,7 @@ def test_loader_eval( # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' # Create second eval dataloader using the arxiv dataset. second_eval_loader = copy.deepcopy(first_eval_loader) second_eval_loader.label = 'arxiv' @@ -157,16 +157,17 @@ def test_loader_eval( print(inmemorylogger.data.keys()) # Checks for first eval dataloader - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 9af96f9868..b1bca9ebd0 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path): test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' # Create second eval dataloader using the arxiv dataset. 
second_eval_loader = copy.deepcopy(first_eval_loader) second_eval_loader.label = 'arxiv' @@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path): assert isinstance(inmemorylogger, InMemoryLogger) # Checks for first eval dataloader - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) @@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' test_cfg.eval_loader = om.create([first_eval_loader]) test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches test_cfg.max_duration = '1ba' @@ -226,15 +227,16 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): 0] # pyright: ignore [reportGeneralTypeIssues] assert isinstance(inmemorylogger, InMemoryLogger) - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index d215d93542..7239bfe958 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -204,7 +204,7 @@ def test_correct_padding( shutil.rmtree(path, ignore_errors=True) if pretokenize: convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[split], out_root=path, @@ -219,7 +219,7 @@ def test_correct_padding( ) else: convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[split], out_root=path, @@ -233,7 +233,7 @@ def test_correct_padding( num_workers=None, ) if not os.path.isdir(path): - raise RuntimeError(f'c4 dataset at {path} not set up as expected') + raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected') test_cfg = get_config( conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml', diff --git a/tests/data_utils.py b/tests/data_utils.py index 117310b0cf..1f6c26b72e 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str: # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[downloaded_split], out_root=c4_dir, From 
ee456002a1dd86f3d9102ac5ade9f7436be51d82 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:39:39 -0400 Subject: [PATCH 24/42] Tensor Parallelism (#1521) Co-authored-by: Eitan Turok Co-authored-by: Mihir Patel --- llmfoundry/__init__.py | 2 + llmfoundry/command_utils/train.py | 32 +++++-- llmfoundry/registry.py | 22 +++++ llmfoundry/tp/__init__.py | 11 +++ llmfoundry/tp/ffn_tp_strategy.py | 56 +++++++++++++ llmfoundry/utils/builders.py | 29 +++++-- llmfoundry/utils/config_utils.py | 14 +++- tests/test_registry.py | 1 + tests/tp/__init__.py | 2 + tests/tp/test_tp_strategies.py | 133 ++++++++++++++++++++++++++++++ 10 files changed, 289 insertions(+), 13 deletions(-) create mode 100644 llmfoundry/tp/__init__.py create mode 100644 llmfoundry/tp/ffn_tp_strategy.py create mode 100644 tests/tp/__init__.py create mode 100644 tests/tp/test_tp_strategies.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index b851aaa559..07e8f35747 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -48,6 +48,7 @@ models, optim, tokenizers, + tp, utils, ) from llmfoundry._version import __version__ @@ -87,5 +88,6 @@ 'models', 'optim', 'tokenizers', + 'tp', 'utils', ] diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 14b7980d57..29878714f6 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,6 +5,7 @@ import os import time import warnings +from copy import deepcopy from typing import Any, Optional, Union import torch @@ -43,6 +44,7 @@ build_save_planner, build_scheduler, build_tokenizer, + build_tp_strategies, ) from llmfoundry.utils.config_utils import ( TRAIN_CONFIG_KEYS, @@ -329,16 +331,27 @@ def train(cfg: DictConfig) -> Trainer: changing autoresume default to True...', ) - # Warn if fsdp is enabled but user only has 1 GPU - if dist.get_world_size() == 1 and fsdp_config is not None: + # Optional tp config + tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + + # Warn if FSDP or TP is enabled but user only has 1 GPU + if dist.get_world_size( + ) == 1 and (fsdp_config is not None or tp_config is not None): + parallelism = '' + if fsdp_config is not None: + parallelism += 'FSDP' + if tp_config is not None: + parallelism += '+TP' if fsdp_config is not None else 'TP' warnings.warn( - 'FSDP is not applicable for single-GPU training. Reverting to DDP.', + f'{parallelism} is not applicable for single-GPU training. Reverting to DDP.', ) fsdp_config = None + tp_config = None # Initialize context - init_context = process_init_device(model_config, fsdp_config) + init_context = process_init_device(model_config, fsdp_config, tp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) + logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) # Build tokenizer log.info('Building tokenizer...') @@ -502,6 +515,15 @@ def train(cfg: DictConfig) -> Trainer: _log_num_params(model, logged_cfg) + # TP config + if tp_config is not None: + strategy = tp_config.pop('strategy', None) + assert isinstance(strategy, str), '`strategy` must be in `tp_config`.' 
+ tp_config['layer_plan'] = build_tp_strategies(strategy, model) + + # Parallelism config + parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} + # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') optimizer_cfg = train_cfg.optimizer @@ -546,7 +568,7 @@ def train(cfg: DictConfig) -> Trainer: precision=train_cfg.precision, algorithms=algorithms, device_train_microbatch_size=train_cfg.device_train_microbatch_size, - parallelism_config={'fsdp': fsdp_config}, + parallelism_config=parallelism_config, save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index cb2455a760..850c4f3bbd 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -7,6 +7,7 @@ from composer.models import ComposerModel from composer.optim import ComposerScheduler from torch.distributed.checkpoint import LoadPlanner, SavePlanner +from torch.distributed.tensor.parallel.style import ParallelStyle from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader from torch.utils.data import Dataset @@ -389,6 +390,26 @@ description=_save_planners_description, ) +_tp_strategies_description = ( + """The tp_strategies registry is used to register strategies for tensor parallelism. + + Args: + model (ComposerModel): The model. + + Returns: + layer_plan (Dict[str, ParallelStyle]): The plan used to parallelize the model. + model (ComposerModel): The model. + """ +) + +tp_strategies = create_registry( + 'llmfoundry', + 'tp_strategies', + generic_type=Callable[[ComposerModel], dict[str, ParallelStyle]], + entry_points=True, + description=_tp_strategies_description, +) + __all__ = [ 'loggers', 'callbacks', @@ -416,4 +437,5 @@ 'config_transforms', 'load_planners', 'save_planners', + 'tp_strategies', ] diff --git a/llmfoundry/tp/__init__.py b/llmfoundry/tp/__init__.py new file mode 100644 index 0000000000..323ae23727 --- /dev/null +++ b/llmfoundry/tp/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.registry import tp_strategies +from llmfoundry.tp.ffn_tp_strategy import ffn_tp_strategy + +tp_strategies.register('ffn', func=ffn_tp_strategy) + +__all__ = [ + 'ffn_tp_strategy', +] diff --git a/llmfoundry/tp/ffn_tp_strategy.py b/llmfoundry/tp/ffn_tp_strategy.py new file mode 100644 index 0000000000..1de92ef6ae --- /dev/null +++ b/llmfoundry/tp/ffn_tp_strategy.py @@ -0,0 +1,56 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.models import ComposerModel +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) +from torch.distributed.tensor.parallel.style import ParallelStyle + + +def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: + TP_LAYERS = {'ffn', 'ffn.up_proj', 'ffn.down_proj'} + + # Validate that all TP_LAYERS are in model + tp_layers_in_model = { + layer for layer in TP_LAYERS for name, _ in model.named_modules() + if layer in name + } + if tp_layers_in_model != TP_LAYERS: + raise RuntimeError( + f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. 
But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.', + ) + + # Generate layer plan + layer_plan: dict[str, ParallelStyle] = {} + for name, _ in model.named_modules(): + # Before the ffn layer starts, distribute the input data for proper TP use + # Inputs are currently sharded across the batch dimension (dim 0) as is done in standard DDP + # Inputs will be replicated across hidden dimension (dim 1) via allgather + if name.split('.')[-1] == 'ffn': + layer_plan[name] = PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ) + # Shard the ffn.up_proj weight matrix across its columns + # Inputs are already replicated across each TP group + # Outputs will be sharded along the hidden dimension (dim 1) via allgather + elif name.split('.')[-2:] == ['ffn', 'up_proj']: + layer_plan[name] = ColwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(-1), + ) + # Shard the ffn.down_proj weight matrix across its rows + # Inputs are sharded along the hidden dimension (dim 1) + # Outputs will be sharded along batch dimension (dim 0) via allreduce + elif name.split('.')[-2:] == ['ffn', 'down_proj']: + layer_plan[name] = RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ) + + return layer_plan diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f2d5cfc0f7..687b21b46d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -7,14 +7,9 @@ import logging import os import re +import warnings from collections import OrderedDict -from typing import ( - Any, - ContextManager, - Iterable, - Optional, - Union, -) +from typing import Any, ContextManager, Iterable, Optional, Union import torch from composer.core import Algorithm, Callback, Evaluator @@ -25,6 +20,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.distributed.checkpoint import LoadPlanner, SavePlanner +from torch.distributed.tensor.parallel.style import ParallelStyle from torch.optim.optimizer import Optimizer from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -37,6 +33,7 @@ ) from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry +from llmfoundry.utils.warnings import experimental_function log = logging.getLogger(__name__) @@ -52,6 +49,7 @@ 'build_tokenizer', 'build_composer_model', 'build_metric', + 'build_tp_strategies', ] @@ -701,3 +699,20 @@ def _validate_cfg(icl_cfg: dict[str, Any]): ) return evaluators, logger_keys + + +@experimental_function('Tensor Parallelism') +def build_tp_strategies( + name: str, + model: ComposerModel, +) -> dict[str, ParallelStyle]: + + warnings.warn( + 'Checkpointing is not currently supported for tensor parallelism due to this pytorch bug: https://github.com/pytorch/pytorch/issues/134095#issuecomment-2345018244', + ) + return construct_from_registry( + name=name, + registry=registry.tp_strategies, + partial_function=False, + kwargs={'model': model}, + ) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index ba5c5941b8..c22495993c 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -120,6 +120,7 @@ class TrainConfig: # Distributed training parameters dist_timeout: Union[int, float] = 600.0 fsdp_config: Optional[dict[str, Any]] = None + tp_config: Optional[dict[str, Any]] = None # Evaluation parameters eval_interval: Union[int, str] = 
1 @@ -501,7 +502,11 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: return cfg -def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): +def process_init_device( + model_cfg: dict[str, Any], + fsdp_config: Optional[dict] = None, + tp_config: Optional[dict] = None, +): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. @@ -533,6 +538,13 @@ def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + # Check we are not using tensor parallelism with MoEs + if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ + 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + raise ValueError( + 'Tensor Parallelism is not currently supported for MoE models.', + ) + # Set ffn_config.device_mesh using fsdp_config if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: diff --git a/tests/test_registry.py b/tests/test_registry.py index 5108a7d46c..90ef3bfaac 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -47,6 +47,7 @@ def test_expected_registries_exist(): 'config_transforms', 'load_planners', 'save_planners', + 'tp_strategies', } assert existing_registries == expected_registry_names diff --git a/tests/tp/__init__.py b/tests/tp/__init__.py new file mode 100644 index 0000000000..80950cb7b4 --- /dev/null +++ b/tests/tp/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py new file mode 100644 index 0000000000..fd2fa384ce --- /dev/null +++ b/tests/tp/test_tp_strategies.py @@ -0,0 +1,133 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +from omegaconf import OmegaConf as om +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) + +from llmfoundry.command_utils.train import train +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_tp_strategies +from llmfoundry.utils.config_utils import process_init_device +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg + + +@pytest.mark.gpu +@pytest.mark.filterwarnings( + 'ignore:tp_strategies is experimental and may change with future versions.', +) +def test_ffn_tp_strategy(): + """Test the FFN tensor parallelism strategy is correct.""" + # Create layer plan from fnn tp_strategy + tp_config = { + 'strategy': 'ffn', + } + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = build_tp_strategies(tp_config['strategy'], model) + + # Expected layer plan + _expected_layer_plan = { + 'ffn': + PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ), + 'ffn.down_proj': + RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ), + 'ffn.up_proj': + ColwiseParallel( + input_layouts=Replicate(), + 
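+                # Shard(-1) splits the up_proj output along the hidden
+                # dimension, mirroring the ColwiseParallel entry that
+                # ffn_tp_strategy builds above.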
output_layouts=Shard(-1), + ), + } + expected_layer_plan = { + f'model.transformer.blocks.{layer_idx}.{name}': layer_plan + for name, layer_plan in _expected_layer_plan.items() + for layer_idx in range(model_cfg['n_layers']) + } + + # Compare expected and actual layer plans + for (n1, lp1), (n2, lp2) in zip( + sorted(expected_layer_plan.items()), + sorted(layer_plan.items()), + ): + assert n1 == n2 + assert type(lp1) == type(lp2) + if isinstance( + lp1, + PrepareModuleInput, + ) and isinstance(lp2, PrepareModuleInput): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.desired_input_layouts == lp2.desired_input_layouts + assert lp1.use_local_output == lp2.use_local_output + elif ( + isinstance(lp1, ColwiseParallel) and + isinstance(lp2, ColwiseParallel) + ) or ( + isinstance(lp1, RowwiseParallel) and + isinstance(lp2, RowwiseParallel) + ): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.output_layouts == lp2.output_layouts + assert lp1.use_local_output == lp2.use_local_output + else: + raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') + + +@pytest.mark.gpu +def test_no_tp_with_one_gpu(): + """Test that when we have one GPU, we use DDP and not FSDP-TP.""" + with TemporaryDirectory() as tmp_path: + # Make `train_cfg`` with a tensor parallelism strategy + dataset_name = create_c4_dataset_xxsmall(Path(tmp_path)) + train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') + train_cfg.tp_config = {'strategy': 'ffn'} + + # Expect a warning + with pytest.warns( + UserWarning, + match= + r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.', + ): + train(train_cfg) + + +@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies +def test_no_tp_with_moes(): + """Test that tensor parallelism is not compatible with MoEs.""" + # Make `cfg` for MoE model, fsdp, and tp + train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml' + with open(train_cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + model_cfg = train_cfg.model + fsdp_cfg = train_cfg.fsdp_config + tp_cfg = {'strategy': 'ffn'} + + # Expect an error + with pytest.raises( + ValueError, + match='Tensor Parallelism is not currently supported for MoE models.', + ): + process_init_device(model_cfg, fsdp_cfg, tp_cfg) From 107d246a4c9c04f0a906f8f0fafcca1297d9e68e Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 27 Sep 2024 13:12:00 -0700 Subject: [PATCH 25/42] Insufficient Permissions Error when trying to access table (#1555) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 127 +++++++----------- llmfoundry/utils/exceptions.py | 13 +- .../data_prep/test_convert_delta_to_json.py | 23 ++-- tests/utils/test_exceptions.py | 39 ++++-- 4 files changed, 103 insertions(+), 99 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index fbbc5f2cd9..44e8651cdf 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -234,27 +234,7 @@ def run_query( elif method == 'dbconnect': if spark == None: raise ValueError(f'sparkSession is required for dbconnect') - - try: - df = spark.sql(query) - except Exception as e: - from pyspark.errors import AnalysisException - if isinstance(e, AnalysisException): - if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore - match = re.search( - r"Schema\s+'([^']+)'", - e.message, # pyright: ignore - ) - if match: - 
schema_name = match.group(1) - action = f'using the schema {schema_name}' - else: - action = 'using the schema' - raise InsufficientPermissionsError(action=action,) from e - raise RuntimeError( - f'Error in querying into schema. Restart sparkSession and try again', - ) from e - + df = spark.sql(query) if collect: return df.collect() return df @@ -469,71 +449,66 @@ def fetch( """ cursor = dbsql.cursor() if dbsql is not None else None try: - nrows = get_total_rows( - tablename, - method, - cursor, - sparkSession, - ) - except Exception as e: - from pyspark.errors import AnalysisException - if isinstance(e, AnalysisException): - if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore - raise InsufficientPermissionsError( - action=f'reading from {tablename}', - ) from e - if isinstance(e, InsufficientPermissionsError): - raise e - raise RuntimeError( - f'Error in get rows from {tablename}. Restart sparkSession and try again', - ) from e + # Get total rows + nrows = get_total_rows(tablename, method, cursor, sparkSession) - try: + # Get columns info columns, order_by, columns_str = get_columns_info( tablename, method, cursor, sparkSession, ) + + if method == 'dbconnect' and sparkSession is not None: + log.info(f'{processes=}') + df = sparkSession.table(tablename) + + # Running the query and collecting the data as arrow or json. + signed, _, _ = df.collect_cf('arrow') # pyright: ignore + log.info(f'len(signed) = {len(signed)}') + + args = get_args(signed, json_output_folder, columns) + + # Stopping the SparkSession to avoid spilling connection state into the subprocesses. + sparkSession.stop() + + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_starargs, args)) + + elif method == 'dbsql' and cursor is not None: + for start in range(0, nrows, batch_size): + log.warning(f'batch {start}') + end = min(start + batch_size, nrows) + fetch_data( + method, + cursor, + sparkSession, + start, + end, + order_by, + tablename, + columns_str, + json_output_folder, + ) + except Exception as e: - raise RuntimeError( - f'Error in get columns from {tablename}. Restart sparkSession and try again', - ) from e + from databricks.sql.exc import ServerOperationError + from pyspark.errors import AnalysisException - if method == 'dbconnect' and sparkSession is not None: - log.info(f'{processes=}') - df = sparkSession.table(tablename) - - # Running the query and collecting the data as arrow or json. - signed, _, _ = df.collect_cf('arrow') # pyright: ignore - log.info(f'len(signed) = {len(signed)}') - - args = get_args(signed, json_output_folder, columns) - - # Stopping the SparkSession to avoid spilling connection state into the subprocesses. 
- sparkSession.stop() - - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_starargs, args)) - - elif method == 'dbsql' and cursor is not None: - for start in range(0, nrows, batch_size): - log.warning(f'batch {start}') - end = min(start + batch_size, nrows) - fetch_data( - method, - cursor, - sparkSession, - start, - end, - order_by, - tablename, - columns_str, - json_output_folder, - ) + if isinstance(e, (AnalysisException, ServerOperationError)): + if 'INSUFFICIENT_PERMISSIONS' in str(e): + raise InsufficientPermissionsError(str(e)) from e + + if isinstance(e, InsufficientPermissionsError): + raise + + # For any other exception, raise a general error + raise RuntimeError(f'Error processing {tablename}: {str(e)}') from e - if cursor is not None: - cursor.close() + finally: + if cursor is not None: + cursor.close() def validate_and_get_cluster_info( diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 265b9bbe8f..242ac4f32c 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -456,6 +456,13 @@ def __init__( class InsufficientPermissionsError(UserError): """Error thrown when the user does not have sufficient permissions.""" - def __init__(self, action: str) -> None: - message = f'Insufficient permissions when {action}. Please check your permissions.' - super().__init__(message, action=action) + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (InsufficientPermissionsError, (self.message,)) + + def __str__(self): + return self.message diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index b1a9f1e878..981f5c1ed6 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -10,6 +10,7 @@ from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( InsufficientPermissionsError, download, + fetch, fetch_DT, format_tablename, iterative_combine_jsons, @@ -30,27 +31,33 @@ class MockAnalysisException(Exception): def __init__(self, message: str): self.message = message + def __str__(self): + return self.message + with patch.dict('sys.modules', {'pyspark.errors': MagicMock()}): sys.modules[ 'pyspark.errors' - ].AnalysisException = MockAnalysisException # pyright: ignore + ].AnalysisException = MockAnalysisException # type: ignore mock_spark = MagicMock() mock_spark.sql.side_effect = MockAnalysisException(error_message) with self.assertRaises(InsufficientPermissionsError) as context: - run_query( - 'SELECT * FROM table', + fetch( method='dbconnect', - cursor=None, - spark=mock_spark, + tablename='main.oogabooga', + json_output_folder='/fake/path', + batch_size=1, + processes=1, + sparkSession=mock_spark, + dbsql=None, ) - self.assertIn( - 'using the schema main.oogabooga', + self.assertEqual( str(context.exception), + error_message, ) - mock_spark.sql.assert_called_once_with('SELECT * FROM table') + mock_spark.sql.assert_called() @patch( 'databricks.sql.connect', diff --git a/tests/utils/test_exceptions.py b/tests/utils/test_exceptions.py index 8bfc7287ab..564dfa2f14 100644 --- a/tests/utils/test_exceptions.py +++ b/tests/utils/test_exceptions.py @@ -4,7 +4,7 @@ import contextlib import inspect import pickle -from typing import Any, Optional +from typing import Any, Optional, get_type_hints 
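+# Unlike raw __annotations__, get_type_hints resolves string annotations
+# (e.g. those produced under `from __future__ import annotations`).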
import pytest @@ -14,16 +14,30 @@ def create_exception_object( exception_class: type[foundry_exceptions.BaseContextualError], ): - # get required arg types of exception class by inspecting its __init__ method - if hasattr(inspect, 'get_annotations'): - required_args = inspect.get_annotations( # type: ignore - exception_class.__init__, - ) # type: ignore - else: - required_args = exception_class.__init__.__annotations__ # python 3.9 and below - - # create a dictionary of required args with default values + def get_init_annotations(cls: type): + try: + return get_type_hints(cls.__init__) + except (AttributeError, TypeError): + # Handle cases where __init__ does not exist or has no annotations + return {} + + # First, try to get annotations from the class itself + required_args = get_init_annotations(exception_class) + + # If the annotations are empty, look at parent classes + if not required_args: + for parent in exception_class.__bases__: + if parent == object: + break + parent_args = get_init_annotations(parent) + if parent_args: + required_args = parent_args + break + + # Remove self, return, and kwargs + required_args.pop('self', None) + required_args.pop('return', None) required_args.pop('kwargs', None) def get_default_value(arg_type: Optional[type] = None): @@ -51,8 +65,6 @@ def get_default_value(arg_type: Optional[type] = None): return [{'key': 'value'}] raise ValueError(f'Unsupported arg type: {arg_type}') - required_args.pop('self', None) - required_args.pop('return', None) kwargs = { arg: get_default_value(arg_type) for arg, arg_type in required_args.items() @@ -80,6 +92,7 @@ def filter_exceptions(possible_exceptions: list[str]): def test_exception_serialization( exception_class: type[foundry_exceptions.BaseContextualError], ): + print(f'Testing serialization for {exception_class.__name__}') excluded_base_classes = [ foundry_exceptions.InternalError, foundry_exceptions.UserError, @@ -88,6 +101,7 @@ def test_exception_serialization( ] exception = create_exception_object(exception_class) + print(f'Created exception object: {exception}') expect_reduce_error = exception.__class__ in excluded_base_classes error_context = pytest.raises( @@ -95,6 +109,7 @@ def test_exception_serialization( ) if expect_reduce_error else contextlib.nullcontext() exc_str = str(exception) + print(f'Exception string: {exc_str}') with error_context: pkl = pickle.dumps(exception) unpickled_exc = pickle.loads(pkl) From 4202a063ea744f1713bba3c4aa52955913974418 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 30 Sep 2024 10:32:21 -0700 Subject: [PATCH 26/42] Add NoOp optimizer (#1560) --- llmfoundry/optim/__init__.py | 3 +++ llmfoundry/optim/no_op.py | 44 +++++++++++++++++++++++++++++++++ tests/optim/test_no_op.py | 48 ++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 llmfoundry/optim/no_op.py create mode 100644 tests/optim/test_no_op.py diff --git a/llmfoundry/optim/__init__.py b/llmfoundry/optim/__init__.py index 0b55944338..ce93487aef 100644 --- a/llmfoundry/optim/__init__.py +++ b/llmfoundry/optim/__init__.py @@ -10,6 +10,7 @@ from llmfoundry.optim.adaptive_lion import DecoupledAdaLRLion, DecoupledClipLion from llmfoundry.optim.lion import DecoupledLionW +from llmfoundry.optim.no_op import NoOp from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler from llmfoundry.registry import optimizers, schedulers @@ -17,6 +18,7 @@ optimizers.register('clip_lion', func=DecoupledClipLion) optimizers.register('decoupled_lionw', func=DecoupledLionW) 
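+# The names registered here are what YAML configs reference; e.g. a minimal
+# `optimizer: {name: no_op}` block (a sketch) resolves through this registry.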
optimizers.register('decoupled_adamw', func=DecoupledAdamW) +optimizers.register('no_op', func=NoOp) schedulers.register('constant_with_warmup', func=ConstantWithWarmupScheduler) schedulers.register( @@ -33,5 +35,6 @@ 'DecoupledLionW', 'DecoupledClipLion', 'DecoupledAdaLRLion', + 'NoOp', 'InverseSquareRootWithWarmupScheduler', ] diff --git a/llmfoundry/optim/no_op.py b/llmfoundry/optim/no_op.py new file mode 100644 index 0000000000..416363c261 --- /dev/null +++ b/llmfoundry/optim/no_op.py @@ -0,0 +1,44 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Iterable, Optional + +import torch + + +class NoOp(torch.optim.Optimizer): + """Optimizer that performs no optimization steps.""" + + def __init__( + self, + params: Iterable[torch.Tensor], + ): + """Initialize NoOp optimizer. + + Args: + params (Iterable[torch.Tensor]): Model parameters for the optimizer. + """ + # LR schedulers expect param groups to have LR. Unused. + defaults = {'lr': 0.0} + super().__init__(params, defaults) + + def __setstate__(self, state: dict[str, dict[Any, Any]]) -> None: + super().__setstate__(state) + + def state_dict(self): + return super().state_dict() + + @torch.no_grad() + def step(self, closure: Optional[Callable] = None): + """Perform no-op optimization step where no parameters are updated. + + Args: + closure (Callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + return loss diff --git a/tests/optim/test_no_op.py b/tests/optim/test_no_op.py new file mode 100644 index 0000000000..27766d6eaf --- /dev/null +++ b/tests/optim/test_no_op.py @@ -0,0 +1,48 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Callable + +import torch +from composer.trainer import Trainer +from torch.utils.data import DataLoader + +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_optimizer + + +def test_no_op_does_nothing( + build_tiny_mpt: Callable[..., ComposerMPTCausalLM], + tiny_ft_dataloader: DataLoader, +): + + # Build MPT model + model = build_tiny_mpt( + loss_fn='torch_crossentropy', + attn_config={ + 'attn_impl': 'torch', + }, + ) + + # Build NoOp optimizer + no_op_optim = build_optimizer(model, 'no_op', optimizer_config={}) + + orig_model = copy.deepcopy(model) + + # build trainer + trainer = Trainer( + model=model, + train_dataloader=tiny_ft_dataloader, + max_duration=f'2ba', + optimizers=no_op_optim, + ) + trainer.fit() + + # Check that the model has not changed + for ( + (orig_name, orig_param), + (new_name, new_param), + ) in zip(orig_model.named_parameters(), model.named_parameters()): + print(f'Checking {orig_name} and {new_name}') + assert torch.equal(orig_param, new_param) From 0ad6ab4757bc5bd232a85278d65e7399efcf44dc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 30 Sep 2024 11:16:44 -0700 Subject: [PATCH 27/42] Deterministic GCRP Errors (#1559) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 31 ++++++---- llmfoundry/utils/exceptions.py | 15 +++++ .../data_prep/test_convert_delta_to_json.py | 58 +++++++++++++++++++ 3 files changed, 94 insertions(+), 10 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 44e8651cdf..2321d306ff 100644 --- 
a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -23,6 +23,7 @@ ClusterInvalidAccessMode, FailedToConnectToDatabricksError, FailedToCreateSQLConnectionError, + FaultyDataPrepCluster, InsufficientPermissionsError, ) @@ -660,16 +661,26 @@ def fetch_DT( ) formatted_delta_table_name = format_tablename(delta_table_name) - - fetch( - method, - formatted_delta_table_name, - json_output_folder, - batch_size, - processes, - sparkSession, - dbsql, - ) + import grpc + try: + fetch( + method, + formatted_delta_table_name, + json_output_folder, + batch_size, + processes, + sparkSession, + dbsql, + ) + except grpc.RpcError as e: + if e.code( + ) == grpc.StatusCode.INTERNAL and 'Job aborted due to stage failure' in e.details( + ): + raise FaultyDataPrepCluster( + message= + f'Faulty data prep cluster, please try swapping data prep cluster: {e.details()}', + ) from e + raise e if dbsql is not None: dbsql.close() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 242ac4f32c..9cbea2cac8 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -466,3 +466,18 @@ def __reduce__(self): def __str__(self): return self.message + + +class FaultyDataPrepCluster(UserError): + """Error thrown when the user uses faulty data prep cluster.""" + + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (FaultyDataPrepCluster, (self.message,)) + + def __str__(self): + return self.message diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index 981f5c1ed6..34a5b5ca55 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -7,7 +7,10 @@ from typing import Any from unittest.mock import MagicMock, mock_open, patch +import grpc + from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( + FaultyDataPrepCluster, InsufficientPermissionsError, download, fetch, @@ -524,3 +527,58 @@ def test_format_tablename(self): format_tablename('hyphenated-catalog.schema.test_table'), '`hyphenated-catalog`.`schema`.`test_table`', ) + + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.validate_and_get_cluster_info', + ) + def test_fetch_DT_grpc_error_handling( + self, + mock_validate_cluster_info: MagicMock, + mock_fetch: MagicMock, + ): + # Arrange + # Mock the validate_and_get_cluster_info to return test values + mock_validate_cluster_info.return_value = ('dbconnect', None, None) + + # Create a grpc.RpcError with StatusCode.INTERNAL and specific details + grpc_error = grpc.RpcError() + grpc_error.code = lambda: grpc.StatusCode.INTERNAL + grpc_error.details = lambda: 'Job aborted due to stage failure: Task failed due to an error.' 
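+        # fetch_DT calls e.code() and e.details() on the error, so the mock
+        # supplies both as callables rather than plain attributes.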
+ + # Configure the fetch function to raise the grpc.RpcError + mock_fetch.side_effect = grpc_error + + # Test inputs + delta_table_name = 'test_table' + json_output_folder = '/tmp/to/jsonl' + http_path = None + cluster_id = None + use_serverless = False + DATABRICKS_HOST = 'https://test-host' + DATABRICKS_TOKEN = 'test-token' + + # Act & Assert + with self.assertRaises(FaultyDataPrepCluster) as context: + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + use_serverless=use_serverless, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + ) + + # Verify that the FaultyDataPrepCluster contains the expected message + self.assertIn( + 'Faulty data prep cluster, please try swapping data prep cluster: ', + str(context.exception), + ) + self.assertIn( + 'Job aborted due to stage failure', + str(context.exception), + ) + + # Verify that fetch was called + mock_fetch.assert_called_once() From bdc58b3c9279485ea3cf242d34260261fd1af4bc Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:13:54 -0400 Subject: [PATCH 28/42] Simplify CL API (#1510) --- .../callbacks/curriculum_learning_callback.py | 78 ++++++++++++------- .../test_curriculum_learning_callback.py | 48 ++++++++---- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/llmfoundry/callbacks/curriculum_learning_callback.py b/llmfoundry/callbacks/curriculum_learning_callback.py index 449ab338bc..70e996e494 100644 --- a/llmfoundry/callbacks/curriculum_learning_callback.py +++ b/llmfoundry/callbacks/curriculum_learning_callback.py @@ -9,7 +9,8 @@ import copy import logging -from typing import Any +import warnings +from typing import Any, Optional, Union from composer import DataSpec from composer.core import State, Time, TimeUnit, ensure_time @@ -23,6 +24,7 @@ BaseContextualError, TrainDataLoaderLocation, ) +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -32,19 +34,21 @@ class CurriculumLearning(CallbackWithConfig): """Starts an epoch with a different dataset when resuming from a checkpoint. + Example duration: + tok Example schedule: [ { 'duration': tok, - 'train_loader': , # matches top level train_loader + 'dataset': , }, { 'duration': tok, - 'train_loader': , + 'dataset': , }, { 'duration': tok, - 'train_loader': , + 'dataset': , ], ] @@ -53,48 +57,59 @@ class CurriculumLearning(CallbackWithConfig): being used. Note that this is the full train config and must contain the 'train_loader', 'device_train_batch_size', and 'tokenizer' keys. + duration (Union[Time, str, int], optional): The duration of the first datamix + (which corresponds to the train_loader). Defaults to None. schedule (list[dict[str, Any]]): The list of datamixes to use and their durations. Duration units must match max_duration and be in terms of a TimeUnit that is supported by Iteration. The duration values must be positive. There must be at least one datamix in the schedule. The - first datamix in the schedule must match the train_loader in the - train_config. On resumption, previously trained on datamixes and - durations cannot be changed. The duration of the current datamix - must be greater than the saved timestamp. The dataset must be a - StreamingDataset. + first datamix during training is not included in the schedule. On + resumption, previously trained on datamixes and durations cannot be + changed. 
The duration of the current datamix must be greater than + the saved timestamp. The dataset must be a StreamingDataset. """ def __init__( self, train_config: dict[str, Any], schedule: list[dict[str, Any]], + duration: Optional[Union[Time, str, int]] = None, ): + if duration is None: + warnings.warn( + VersionedDeprecationWarning( + 'Specifying the full schedule in the CurriculumLearning ' + + 'callback is deprecated. Please specify the duration of ' + + 'the first datamix separately and change the schedule ' + + 'use datasets instead of dataloaders.', + remove_version='0.15.0', + ), + ) + # Ensure all duration units are in epochs or tokens and values are positive self._schedule = schedule if len(self._schedule) == 0: raise ValueError('The schedule must have at least one datamix.') - for index, datamix in enumerate(self._schedule): + if duration is not None: + first_datamix = { + 'duration': duration, + 'dataset': train_config['train_loader']['dataset'], + } + self._schedule.insert(0, first_datamix) + for datamix in self._schedule: self._validate_datamix(datamix) - if ( - index == 0 and - train_config['train_loader'] != datamix['train_loader'] - ): - raise ValueError(( - 'The first datamix in the schedule must match the ' - 'train_loader in the train_config.' - )) - self._schedule_index = 0 - self.device_train_batch_size = train_config['device_train_batch_size'] - self.tokenizer = None + self._train_loader_config: dict[str, Any] = train_config['train_loader'] + self._device_train_batch_size = train_config['device_train_batch_size'] + self._tokenizer = None def init(self, state: State, logger: Logger): del logger # unused if not hasattr(state.model, 'tokenizer'): raise ValueError('state.model must have a tokenizer attribute.') - self.tokenizer = state.model.tokenizer + self._tokenizer = state.model.tokenizer def before_load(self, state: State, logger: Logger): del logger # unused @@ -151,8 +166,13 @@ def iteration_start(self, state: State, logger: Logger): # which is stale clean_stale_shared_memory() datamix = copy.deepcopy(self._schedule[self._schedule_index]) + train_loader_config = copy.deepcopy(self._train_loader_config) + if 'dataset' in datamix: + train_loader_config['dataset'].update(datamix['dataset']) + else: + train_loader_config = datamix['train_loader'] data_spec = self._build_train_loader( - train_loader_config=datamix['train_loader'], + train_loader_config=train_loader_config, logger=logger, ) state.set_dataloader( @@ -211,18 +231,20 @@ def _build_train_loader( train_loader_config: dict[str, Any], logger: Logger, ) -> DataSpec: + del logger # unused + from llmfoundry.data.dataloader import build_dataloader # Copied from scripts/train/train.py log.info( f'Building train loader in CurriculumLearning callback for dataset {self._schedule_index}', ) - assert self.tokenizer is not None + assert self._tokenizer is not None try: return build_dataloader( train_loader_config, - self.tokenizer, - self.device_train_batch_size, + self._tokenizer, + self._device_train_batch_size, ) except BaseContextualError as e: e.location = TrainDataLoaderLocation @@ -260,5 +282,5 @@ def _validate_datamix(self, datamix: dict[str, Any]): 'Schedules can only be defined in terms of epochs or tokens.', ) - if 'train_loader' not in datamix: - raise ValueError('Each datamix must have a train_loader.') + if 'train_loader' not in datamix and 'dataset' not in datamix: + raise ValueError('Each datamix must have a dataset.') diff --git a/tests/callbacks/test_curriculum_learning_callback.py 
b/tests/callbacks/test_curriculum_learning_callback.py index 075698a4c0..0e6a6c1efe 100644 --- a/tests/callbacks/test_curriculum_learning_callback.py +++ b/tests/callbacks/test_curriculum_learning_callback.py @@ -22,7 +22,7 @@ [ (None, '1ep'), ({ - 'dataset': 'some_dataset', + 'hf_name': 'some_dataset', }, '1ep'), (None, '10tok'), (None, ''), @@ -36,23 +36,29 @@ def test_curriculum_learning_callback_init( ): test_cfg = _get_test_cfg() test_cfg['train_loader'] = tiny_ft_dataloader_cfg - train_loader = test_cfg['train_loader'] if datamix is None else datamix + if datamix is None: + train_loader = test_cfg['train_loader']['dataset'] + else: + train_loader = datamix kwargs = { 'schedule': [{ 'duration': duration, - 'train_loader': train_loader, + 'dataset': train_loader, }, { 'duration': '2ep', - 'train_loader': {}, + 'dataset': {}, }], } + + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + if duration == '': del kwargs['schedule'][0]['duration'] if datamix is not None and len(datamix) == 0: - del kwargs['schedule'][0]['train_loader'] + del kwargs['schedule'][0]['dataset'] context = nullcontext() - if datamix is not None or duration == '': + if (datamix is not None and len(datamix) == 0) or duration == '': context = pytest.raises(ValueError) with context: callback = build_callback( @@ -85,13 +91,15 @@ def test_curriculum_learning_callback_before_load( kwargs = { 'schedule': [{ 'duration': duration, - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -123,13 +131,15 @@ def test_curriculum_learning_callback_after_load(build_tiny_mpt: Callable,): kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -168,13 +178,15 @@ def test_curriculum_learning_callback_iteration( kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -208,13 +220,15 @@ def test_curriculum_learning_callback_state_dict(build_tiny_mpt: Callable,): kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -249,13 +263,15 @@ def test_curriculum_learning_callback_load_state_dict( kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], 
}], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, From 30cdd67f54581143722cbfcf1b775c32ebc56730 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:36:36 -0700 Subject: [PATCH 29/42] Reapply #1389 (#1561) --- llmfoundry/data/finetuning/dataloader.py | 56 +++++++++--------------- llmfoundry/data/finetuning/tasks.py | 2 +- llmfoundry/models/hf/hf_base.py | 2 +- llmfoundry/utils/builders.py | 2 +- 4 files changed, 23 insertions(+), 39 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 69051a2d51..612b8d6385 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -575,42 +575,26 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join( - finetune_dir, - f'.node_{dist.get_node_rank()}_local_rank0_completed', - ) - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS - ] - raise FileNotFoundError( - f'Could not find a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue - - os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) - with open(signal_file_path, 'wb') as f: - f.write(b'local_rank0_completed_download') - - # Avoid the collective call until the local rank zero has finished trying to download the dataset - # so that we don't timeout for large downloads. This syncs all processes on the node - with dist.local_rank_zero_download_and_wait(signal_file_path): - # Then, wait to ensure every node has finished trying to download the dataset - dist.barrier() - - # clean up signal file - if dist.get_local_rank() == 0: - os.remove(signal_file_path) + with dist.busy_wait_for_local_rank_zero(finetune_dir): + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' + for ext in SUPPORTED_EXTENSIONS + ] + raise FileNotFoundError( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}', + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue dist.barrier() break return finetune_dir diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e8f6484ef2..e099ffe14a 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -877,7 +877,7 @@ def build_from_hf( if tokenizer is None: raise ValueError('A tokenizer must be provided.') - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' + signal_file_path = dist.get_node_signal_file_name() # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. 
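    # (the pattern, sketched: local rank 0 writes the file named by
    # dist.get_node_signal_file_name() when it finishes, and the other ranks
    # block until that file appears)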
# Once local rank 0 is done, the datasets are all cached on disk, and all other ranks diff --git a/llmfoundry/models/hf/hf_base.py b/llmfoundry/models/hf/hf_base.py index d193e1067f..2ec9bbaa98 100644 --- a/llmfoundry/models/hf/hf_base.py +++ b/llmfoundry/models/hf/hf_base.py @@ -356,7 +356,7 @@ def build_inner_model( f'init_device="{init_device}" must be either "cpu" or "meta".', ) - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' + signal_file_path = dist.get_node_signal_file_name() if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 687b21b46d..ae04b68ee5 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -494,7 +494,7 @@ def build_tokenizer( os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' + signal_file_path = dist.get_node_signal_file_name() if dist.is_available() and dist.is_initialized( ) and dist.get_world_size() > 1: From ec4cafd4faa417b370bd189811aef85cc9506cc9 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:14:49 -0400 Subject: [PATCH 30/42] Add dataset swap callback (#1536) --- llmfoundry/callbacks/__init__.py | 2 + llmfoundry/callbacks/dataset_swap_callback.py | 114 ++++++++++++++++++ tests/callbacks/test_dataset_swap_callback.py | 14 +++ 3 files changed, 130 insertions(+) create mode 100644 llmfoundry/callbacks/dataset_swap_callback.py create mode 100644 tests/callbacks/test_dataset_swap_callback.py diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py index fe84efa316..8a7e1312eb 100644 --- a/llmfoundry/callbacks/__init__.py +++ b/llmfoundry/callbacks/__init__.py @@ -17,6 +17,7 @@ from llmfoundry.callbacks.async_eval_callback import AsyncEval from llmfoundry.callbacks.curriculum_learning_callback import CurriculumLearning +from llmfoundry.callbacks.dataset_swap_callback import DatasetSwap from llmfoundry.callbacks.env_logging_callback import EnvironmentLoggingCallback from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet from llmfoundry.callbacks.eval_output_logging_callback import EvalOutputLogging @@ -65,6 +66,7 @@ callbacks_with_config.register('async_eval', func=AsyncEval) callbacks_with_config.register('curriculum_learning', func=CurriculumLearning) +callbacks_with_config.register('dataset_swap', func=DatasetSwap) __all__ = [ 'FDiffMetrics', diff --git a/llmfoundry/callbacks/dataset_swap_callback.py b/llmfoundry/callbacks/dataset_swap_callback.py new file mode 100644 index 0000000000..415819e428 --- /dev/null +++ b/llmfoundry/callbacks/dataset_swap_callback.py @@ -0,0 +1,114 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Enable curriculum learning by resuming with a different dataset. + +This callback is currently experimental. The API may change without warning in +the future. 
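+It keeps the older dataset_index-based swap behavior that CurriculumLearning
+is moving away from with its schedule-based API.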
+""" + +import logging +from typing import Any + +from composer.core import State +from composer.loggers import Logger +from streaming import StreamingDataset +from torch.utils.data import DataLoader + +from llmfoundry.interfaces import CallbackWithConfig +from llmfoundry.utils.warnings import experimental_class + +log = logging.getLogger(__name__) + +__all__ = ['DatasetSwap'] + + +@experimental_class('DatasetSwap callback') +class DatasetSwap(CallbackWithConfig): + """Starts an epoch with a different dataset when resuming from a checkpoint. + + Args: + train_config (Dict): The configuration of the dataset currently + being used. Note that this is the full train config and must + contain the 'train_loader' key. + dataset_index (int): The index of the dataset currently being used. + """ + + def __init__(self, train_config: dict, dataset_index: int): + self.dataset_index = dataset_index + self.saved_dataset_index = 0 + self.all_dataset_configs = [] + self.current_dataset_state = {} + # The current dataset config is resolved and passed in train.py + self.current_dataset_config = train_config['train_loader'] + + def before_load(self, state: State, logger: Logger): + del logger + + # Save the current dataset state so we can restore it correctly + # if we are resuming with a new dataset. + train_loader = state.train_dataloader + # Check if we are using a DataLoader and StreamingDataset + if not isinstance(train_loader, DataLoader): + raise ValueError( + f'CurriculumLearning callback can only be used with a train ', + f'dataloader of type DataLoader, but got {type(train_loader)}.', + ) + dataset = train_loader.dataset + if not isinstance(dataset, StreamingDataset): + raise ValueError( + f'CurriculumLearning callback only supports StreamingDataset ', + f'because it requires loading and saving dataset state. ', + f'Instead, got a dataset of type {type(dataset)}', + ) + assert isinstance(dataset, StreamingDataset) + # Save the current dataset state so we can restore it if needed. + self.current_dataset_state = dataset.state_dict( # type: ignore + num_samples=0, from_beginning=False) + + def after_load(self, state: State, logger: Logger): + del logger + + # As saved_dataset_index is loaded from state_dict, this only runs when + # a user explicitly increments the dataset_index and not on any other + # resumption, including autoresume. + train_loader = state._train_dataloader + assert isinstance( + train_loader, + DataLoader, + ), 'CurriculumLearning callback requires a DataLoader.' + dataset = train_loader.dataset + assert isinstance( + dataset, + StreamingDataset, + ), 'CurriculumLearning callback requires a StreamingDataset.' + if self.saved_dataset_index < self.dataset_index: + # Ignore the dataset state that was read in from the checkpoint, and + # replace with the new dataset state. This preserves resumption info. + if self.current_dataset_state['epoch'] < 0: + # Make sure the epoch in the loaded state dict is not negative. + # Since `__iter__` has not yet been called on the dataset, the + # epoch index in the dataset will still be -1. We need to ensure + # that we set the epoch correctly to 0 in this case. + self.current_dataset_state['epoch'] = 0 + dataset.load_state_dict( # type: ignore + self.current_dataset_state) + # Start a new epoch since we are using a new dataset. + # This will also reset the sample_in_epoch written to checkpoint, + # making sure that subsequent resumptions proceed correctly. 
+ state.timestamp = state.timestamp.to_next_epoch() + # Append the new dataset config to the list of all dataset configs. + self.all_dataset_configs.append(self.current_dataset_config) + elif self.dataset_index == 0 and len(self.all_dataset_configs) == 0: + # Make sure to track our current dataset config if we are just starting training. + self.all_dataset_configs.append(self.current_dataset_config) + + def state_dict(self): + return { + 'dataset_index': self.dataset_index, + 'all_dataset_configs': self.all_dataset_configs, + } + + def load_state_dict(self, state: dict[str, Any]): + self.saved_dataset_index = state.get('dataset_index', 0) + self.all_dataset_configs = state.get('all_dataset_configs', []) diff --git a/tests/callbacks/test_dataset_swap_callback.py b/tests/callbacks/test_dataset_swap_callback.py new file mode 100644 index 0000000000..f54b0f6f5f --- /dev/null +++ b/tests/callbacks/test_dataset_swap_callback.py @@ -0,0 +1,14 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.utils.builders import build_callback + + +def test_dataset_swap_callback_builds(): + kwargs = {'dataset_index': 0} + callback = build_callback( + 'dataset_swap', + kwargs=kwargs, + train_config={'train_loader': {}}, + ) + assert callback is not None From b517297091d4ecfd6f2030d64e5cddb59cac7935 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 1 Oct 2024 14:02:38 -0400 Subject: [PATCH 31/42] Add error to catch more unknown example types (#1562) --- llmfoundry/data/finetuning/tasks.py | 2 ++ tests/data/test_template_tokenization.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e099ffe14a..c81856b8ba 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -174,6 +174,8 @@ def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]): if not isinstance(dictionary, Mapping): raise InvalidExampleTypeError(str(type(dictionary))) desired_keys = allowed_keys.intersection(dictionary.keys()) + if len(desired_keys) == 0: + raise UnknownExampleTypeError(str(set(dictionary.keys()))) return list(desired_keys)[0] diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py index fdf7233115..0697894bb2 100644 --- a/tests/data/test_template_tokenization.py +++ b/tests/data/test_template_tokenization.py @@ -53,11 +53,21 @@ def test_tokenize_chat_example_malformed(): } wrong_example_type = ['this is not a dictionary'] wrong_messages_type = {'messages': 'this is not a list of messages'} + wrong_role = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!', + }, { + 'role': 'misnamed_assistant', + 'content': 'user message not followed by an assistant label', + }], + } malformed_chat_examples = [ too_few_messages, no_content, ends_with_user_role, no_assistant_message, + wrong_role, ] my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) for example in malformed_chat_examples: From 8cf3d8718763e7d9760b6f4df5780e6c23e18e0f Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:12:55 -0400 Subject: [PATCH 32/42] Add FileExtensionNotFoundError (#1564) --- llmfoundry/data/finetuning/dataloader.py | 7 +++---- llmfoundry/utils/exceptions.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 612b8d6385..3e64360a67 
100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -28,6 +28,7 @@ from llmfoundry.data.text_data import build_streams from llmfoundry.utils.config_utils import to_dict_container from llmfoundry.utils.exceptions import ( + FinetuningFileNotFoundError, MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError, ) @@ -585,10 +586,8 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS ] - raise FileNotFoundError( - f'Could not find a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', + raise FinetuningFileNotFoundError( + files_searched=files_searched, ) from e else: log.debug( diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 9cbea2cac8..4a4321637f 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -481,3 +481,19 @@ def __reduce__(self): def __str__(self): return self.message + + +class FinetuningFileNotFoundError(UserError): + """Error thrown when a file can't be found with any supported extension.""" + + def __init__(self, files_searched: list[str]) -> None: + from llmfoundry.data.finetuning.tasks import SUPPORTED_EXTENSIONS + message = ( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}' + ) + super().__init__( + message, + files_searched=files_searched, + ) From a462f037c62fd6d23e7bdfa346d49e1598025052 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:37:16 -0400 Subject: [PATCH 33/42] Add InvalidConversationError (#1565) --- llmfoundry/data/finetuning/tasks.py | 7 ++++--- llmfoundry/utils/exceptions.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index c81856b8ba..a68a611c52 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -76,6 +76,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: DatasetTooSmallError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, + InvalidConversationError, InvalidExampleTypeError, InvalidFileExtensionError, InvalidLastChatMessageRoleError, @@ -270,17 +271,17 @@ def slice_out_last_turn( if conversation_through_previous_turn != full_conversation[:len( conversation_through_previous_turn, )]: - raise ValueError( + raise InvalidConversationError( f'The full conversation must start with the conversation through the previous turn. {conversation_through_previous_turn=}, {full_conversation=}', ) if conversation_through_previous_turn != prompt_with_history[:len( conversation_through_previous_turn, )]: - raise ValueError( + raise InvalidConversationError( f'The prompt_with_history must start with the conversation through the previous turn. {conversation_through_previous_turn=}, {prompt_with_history=}', ) if prompt_with_history != full_conversation[:len(prompt_with_history)]: - raise ValueError( + raise InvalidConversationError( f'prompt_with_history must be the first part of the full conversation. 
{prompt_with_history=}, {full_conversation=}', ) prompt = prompt_with_history[len(conversation_through_previous_turn):] diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 4a4321637f..81cfb21d11 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -497,3 +497,18 @@ def __init__(self, files_searched: list[str]) -> None: message, files_searched=files_searched, ) + + +class InvalidConversationError(UserError): + """Error thrown when the conversation is invalid.""" + + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (InvalidConversationError, (self.message,)) + + def __str__(self): + return self.message From 24fec7908cdd5b5de31a569049e2dd366d307251 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 2 Oct 2024 11:26:55 -0700 Subject: [PATCH 34/42] Release docker img (#1547) Co-authored-by: v-chen_data --- .github/workflows/release.yaml | 64 ++++++++++++++++++++++++++++++++++ Dockerfile | 9 +++-- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c09f9bb7a5..3617732c8f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -50,3 +50,67 @@ jobs: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ + + build-docker: + name: Build llm-foundry Release Docker Image + needs: + - code-quality + runs-on: mosaic-8wide + if: github.repository_owner == 'mosaicml' + steps: + - name: Checkout source + uses: actions/checkout@v3 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Define Docker tags + id: define-tags + run: | + BRANCH_NAME="${{ github.ref_name }}" + TAG_NAME=$(echo "${BRANCH_NAME}" | sed 's/\//_/g') + echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV + + echo "DOCKER_TAG=mosaicml/llm-foundry:release_${TAG_NAME}" >> $GITHUB_ENV + echo "AWS_DOCKER_TAG=mosaicml/llm-foundry:release_${TAG_NAME}_aws" >> $GITHUB_ENV + echo "LATEST_TAG=mosaicml/llm-foundry:release-latest" >> $GITHUB_ENV + echo "AWS_LATEST_TAG=mosaicml/llm-foundry:release_aws-latest" >> $GITHUB_ENV + + + - name: Build and push AWS Docker image + uses: docker/build-push-action@v3 + with: + context: . + file: Dockerfile + push: true + tags: | + ${{ env.AWS_DOCKER_TAG }} + ${{ env.AWS_LATEST_TAG }} + build-args: | + BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws + BRANCH_NAME=${{ env.BRANCH_NAME }} + TE_COMMIT=901e5d2 + DEP_GROUPS=[all] + KEEP_FOUNDRY=true + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . 
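+          # Non-AWS variant: same build args apart from the plain (non-AWS)
+          # CUDA base image below.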
+ file: Dockerfile + push: true + tags: | + ${{ env.DOCKER_TAG }} + ${{ env.LATEST_TAG }} + build-args: | + BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + BRANCH_NAME=${{ env.BRANCH_NAME }} + TE_COMMIT=901e5d2 + DEP_GROUPS=[all] + KEEP_FOUNDRY=true diff --git a/Dockerfile b/Dockerfile index ca52532395..a9d44bfa27 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,7 @@ FROM $BASE_IMAGE ARG BRANCH_NAME ARG DEP_GROUPS ARG TE_COMMIT +ARG KEEP_FOUNDRY=false ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0" @@ -21,5 +22,9 @@ RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install g # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" -RUN pip uninstall -y llm-foundry -RUN rm -rf llm-foundry + +# Conditionally uninstall llm-foundry and remove its directory +RUN if [ "$KEEP_FOUNDRY" != "true" ]; then \ + pip uninstall -y llm-foundry && \ + rm -rf /llm-foundry; \ + fi From 214305fb90070e6aeb054c971d5ab6e08e84671b Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Wed, 2 Oct 2024 14:47:57 -0700 Subject: [PATCH 35/42] Revert FT dataloader changes from #1561, keep #1564 (#1566) --- llmfoundry/data/finetuning/dataloader.py | 52 ++++++++++++++++-------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 3e64360a67..ca841979f9 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -576,24 +576,40 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - with dist.busy_wait_for_local_rank_zero(finetune_dir): - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' - for ext in SUPPORTED_EXTENSIONS - ] - raise FinetuningFileNotFoundError( - files_searched=files_searched, - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue + signal_file_path = os.path.join( + finetune_dir, + f'.node_{dist.get_node_rank()}_local_rank0_completed', + ) + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS + ] + raise FinetuningFileNotFoundError( + files_searched=files_searched, + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue + + os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_download') + + # Avoid the collective call until the local rank zero has finished trying to download the dataset + # so that we don't timeout for large downloads. 
This syncs all processes on the node + with dist.local_rank_zero_download_and_wait(signal_file_path): + # Then, wait to ensure every node has finished trying to download the dataset + dist.barrier() + + # clean up signal file + if dist.get_local_rank() == 0: + os.remove(signal_file_path) dist.barrier() break return finetune_dir From 4bbb4a5cddeef5fa22f6d7b1e47bbefb9f0edc30 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 2 Oct 2024 18:11:26 -0400 Subject: [PATCH 36/42] Cleanup TP (#1556) Co-authored-by: Eitan Turok --- llmfoundry/command_utils/train.py | 13 ++-- llmfoundry/utils/config_utils.py | 17 ++++- tests/data_utils.py | 1 + tests/tp/test_tp_strategies.py | 114 +++++++++++++++++++++++++++--- 4 files changed, 127 insertions(+), 18 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 29878714f6..9a5931ddba 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,7 +5,6 @@ import os import time import warnings -from copy import deepcopy from typing import Any, Optional, Union import torch @@ -19,7 +18,7 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import dist, get_device, reproducibility +from composer.utils import TPConfig, dist, get_device, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -332,7 +331,7 @@ def train(cfg: DictConfig) -> Trainer: ) # Optional tp config - tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + tp_config: Optional[Union[TPConfig, dict[str, Any]]] = train_cfg.tp_config # Warn if FSDP or TP is enabled but user only has 1 GPU if dist.get_world_size( @@ -351,7 +350,7 @@ def train(cfg: DictConfig) -> Trainer: # Initialize context init_context = process_init_device(model_config, fsdp_config, tp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) - logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) + logged_cfg.update({'tp_config': tp_config}, merge=True) # Build tokenizer log.info('Building tokenizer...') @@ -517,9 +516,9 @@ def train(cfg: DictConfig) -> Trainer: # TP config if tp_config is not None: - strategy = tp_config.pop('strategy', None) - assert isinstance(strategy, str), '`strategy` must be in `tp_config`.' - tp_config['layer_plan'] = build_tp_strategies(strategy, model) + strategy = tp_config.pop('strategy') + layer_plan = build_tp_strategies(strategy, model) + tp_config = TPConfig(**tp_config, layer_plan=layer_plan) # Parallelism config parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index c22495993c..18112c18aa 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -288,7 +288,6 @@ def apply_transforms_to_config( for transform in transform_functions: cfg = transform(cfg) - return cfg @@ -538,6 +537,22 @@ def process_init_device( # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + if tp_config is not None: + # Check tp_config has required fields + if 'strategy' not in tp_config or 'tensor_parallel_degree' not in tp_config: + raise ValueError( + "`tp_config` requires 'strategy' and 'tensor_parallel_degree' values. 
", + ) + + # Check we are not using tensor parallelism with MoEs + if 'ffn_config' in model_cfg and model_cfg['ffn_config'].get( + 'ffn_type', + None, + ) in ffns_with_megablocks: + raise ValueError( + 'Tensor Parallelism is not currently supported for MoE models.', + ) + # Check we are not using tensor parallelism with MoEs if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: diff --git a/tests/data_utils.py b/tests/data_utils.py index 1f6c26b72e..67c1be9f6e 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -251,6 +251,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str: shutil.copytree( os.path.join(c4_dir, 'val_xxsmall'), os.path.join(c4_dir, mocked_split), + dirs_exist_ok=True, ) assert os.path.exists(c4_dir) return c4_dir diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py index fd2fa384ce..6dfc30759b 100644 --- a/tests/tp/test_tp_strategies.py +++ b/tests/tp/test_tp_strategies.py @@ -1,10 +1,16 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import os +import pathlib +import shutil from pathlib import Path from tempfile import TemporaryDirectory +from typing import Optional import pytest +from composer import Trainer +from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.distributed._tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( @@ -96,9 +102,94 @@ def test_ffn_tp_strategy(): raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') +def get_cfg( + dataset_name: pathlib.Path, + tp_strategy: Optional[str] = None, + tp_degree: Optional[int] = None, + yaml_path: str = 'scripts/train/yamls/pretrain/testing.yaml', +): + # Read cfg from `testing.yaml` + from tests.fixtures.autouse import REPO_DIR + cfg_path: str = os.path.join(REPO_DIR, yaml_path) + with open(cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + assert isinstance(train_cfg, DictConfig) + + # Set the name, dataset, loggers + train_cfg.variables.run_name = 'fsdp-test' + train_cfg.variables.data_local = dataset_name + train_cfg.loggers = DictConfig({'inmemory': DictConfig({})}) + + # Set batch size, duration + train_cfg.global_train_batch_size = 16 + train_cfg.device_eval_batch_size = 2 + train_cfg.device_train_microbatch_size = 2 + train_cfg.max_duration = '1ep' + train_cfg.eval_interval = '1ep' + + # TP needs unfused qkv (even without TP, we unfuse qkv for a fair comparison) + train_cfg.model.attn_cfg = {'fused_qkv': False} + + if tp_strategy and tp_degree: + train_cfg.variables.run_name = 'tp-test' + train_cfg.tp_config = { + 'strategy': tp_strategy, + 'tensor_parallel_degree': tp_degree, + } + + return train_cfg + + +def get_loss_array(trainer: Trainer): + logger = trainer.logger.destinations[0] + loss_array = logger.get_timeseries('loss/train/total')['loss/train/total' + ] # type: ignore + return loss_array + + +@pytest.mark.gpu +@pytest.mark.world_size(4) +@pytest.mark.parametrize('tp_degree', [2]) +@pytest.mark.parametrize('tp_strategy', ['ffn']) +def test_tp_train(tp_degree: int, tp_strategy: str): + """Test that we can train with FSDP-TP.""" + my_dir = Path('/my-data-dir') + + try: + # create c4 dataset + if my_dir.is_dir() and my_dir.exists(): + shutil.rmtree(my_dir) + my_dir.mkdir(parents=True) + tp_dataset_name = create_c4_dataset_xxsmall(my_dir) + + # Train model with TP and get loss + tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree) + tp_trainer = 
train(tp_cfg)
+ tp_trainer.close()
+ tp_loss = get_loss_array(tp_trainer)
+
+ # Compare loss and expected loss for TP
+ import numpy as np
+ expected_tp_loss = np.array([
+ 12.02126884,
+ 11.96996498,
+ 12.02957344,
+ 11.97966957,
+ 11.99677086,
+ 11.96347618,
+ ])
+ np.testing.assert_allclose(tp_loss, expected_tp_loss)
+ except Exception as e:
+ raise e
+ finally:
+ # always remove the directory
+ if os.path.isdir(my_dir):
+ shutil.rmtree(my_dir)
+
+
@pytest.mark.gpu
-def test_no_tp_with_one_gpu():
- """Test that when we have one GPU, we use DDP and not FSDP-TP."""
+def test_tp_train_with_one_gpu():
+ """Test that when we have one GPU, we train with DDP and not FSDP-TP."""
 with TemporaryDirectory() as tmp_path:
 # Make `train_cfg`` with a tensor parallelism strategy
 dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
@@ -115,19 +206,22 @@ def test_no_tp_with_one_gpu():


@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies
-def test_no_tp_with_moes():
+@pytest.mark.parametrize('tp_degree', [2])
+@pytest.mark.parametrize('tp_strategy', ['ffn'])
+def test_tp_train_with_moes(tp_degree: int, tp_strategy: str):
 """Test that tensor parallelism is not compatible with MoEs."""
 # Make `cfg` for MoE model, fsdp, and tp
- train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml'
- with open(train_cfg_path, 'r', encoding='utf-8') as f:
- train_cfg = om.load(f)
- model_cfg = train_cfg.model
- fsdp_cfg = train_cfg.fsdp_config
- tp_cfg = {'strategy': 'ffn'}
+ moe_yaml_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml'
+ dataset_name = Path('') # dummy dataset path
+ train_cfg = get_cfg(dataset_name, tp_strategy, tp_degree, moe_yaml_path)

 # Expect an error
 with pytest.raises(
 ValueError,
 match='Tensor Parallelism is not currently supported for MoE models.',
 ):
- process_init_device(model_cfg, fsdp_cfg, tp_cfg)
+ process_init_device(
+ train_cfg.model,
+ train_cfg.fsdp_config,
+ train_cfg.tp_config,
+ )

From 788c1f59ca5832842007fca620ec6bbf3abe9611 Mon Sep 17 00:00:00 2001
From: Abhay Gupta
Date: Fri, 4 Oct 2024 13:36:55 -0700
Subject: [PATCH 37/42] Changes for dataset swap callback (#1569)

---
 llmfoundry/callbacks/dataset_swap_callback.py | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/llmfoundry/callbacks/dataset_swap_callback.py b/llmfoundry/callbacks/dataset_swap_callback.py
index 415819e428..d95846bd34 100644
--- a/llmfoundry/callbacks/dataset_swap_callback.py
+++ b/llmfoundry/callbacks/dataset_swap_callback.py
@@ -8,7 +8,7 @@
 """
 import logging
-from typing import Any
+from dataclasses import dataclass

 from composer.core import State
 from composer.loggers import Logger
@@ -23,6 +23,12 @@
 __all__ = ['DatasetSwap']

+@dataclass
+class DatasetSwapStateDict:
+ dataset_index: int
+ all_dataset_configs: list
+
+
 @experimental_class('DatasetSwap callback')
 class DatasetSwap(CallbackWithConfig):
 """Starts an epoch with a different dataset when resuming from a checkpoint.
@@ -105,10 +111,18 @@ def after_load(self, state: State, logger: Logger):

 def state_dict(self):
 return {
- 'dataset_index': self.dataset_index,
- 'all_dataset_configs': self.all_dataset_configs,
+ 'callback_state':
+ DatasetSwapStateDict(
+ dataset_index=self.dataset_index,
+ all_dataset_configs=self.all_dataset_configs,
+ ),
 }

- def load_state_dict(self, state: dict[str, Any]):
- self.saved_dataset_index = state.get('dataset_index', 0)
- self.all_dataset_configs = state.get('all_dataset_configs', [])
+ def load_state_dict(self, state: dict[str, DatasetSwapStateDict]):
+ _dummy_obj = DatasetSwapStateDict(
+ dataset_index=0,
+ all_dataset_configs=[],
+ )
+ _state_obj = state.get('callback_state', _dummy_obj)
+ self.saved_dataset_index = getattr(_state_obj, 'dataset_index')
+ self.all_dataset_configs = getattr(_state_obj, 'all_dataset_configs')

From 56e45732d52b2e5cdb3bf67a936c6e1cebbeadbd Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Tue, 10 Sep 2024 09:04:08 -0700
Subject: [PATCH 38/42] refactor hf download

---
 llmfoundry/data/finetuning/tasks.py | 115 +++++++++++++++++-----------
 1 file changed, 71 insertions(+), 44 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index a68a611c52..f894a22a6a 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -705,7 +705,73 @@ def state_dict(self, num_samples: int,
 num_samples=num_samples,
 from_beginning=from_beginning,
 )
+
+def download_hf_dataset_if_needed(
+ dataset_name: str,
+ hf_kwargs: Optional[dict[str, Any]] = None
+) -> str:
+ """
+ Download a HuggingFace dataset locally if it does not already exist.
+
+ Args:
+ dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s)
+ directory or object store bucket containing the file {split}.jsonl.
+ safe_load (bool): Whether to enforce safe loading of the dataset.
+ hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`.
+
+ Returns:
+ str: The local path to the dataset.
+ """
+ if hf_kwargs is None:
+ hf_kwargs = {}
+
+ if not os.path.isdir(dataset_name):
+ local_dataset_dir = os.path.join(
+ DOWNLOADED_FT_DATASETS_DIRPATH,
+ dataset_name,
+ )
+
+ if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+ # Safely load the dataset from HF Hub with restricted file types.
+ hf_hub.snapshot_download(
+ dataset_name,
+ repo_type='dataset',
+ allow_patterns=[
+ '*' + ext for ext in SUPPORTED_EXTENSIONS
+ ],
+ token=hf_kwargs.get('token', None),
+ revision=hf_kwargs.get('revision', None),
+ local_dir_use_symlinks=False,
+ local_dir=local_dataset_dir,
+ )
+ if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+ log.error("Failed to safely load the dataset from HF Hub.")
+ raise InvalidFileExtensionError(
+ dataset_name,
+ SUPPORTED_EXTENSIONS,
+ )
+ # Set dataset_name to the downloaded location.
+ dataset_name = local_dataset_dir
+
+ # Ensure dataset_name is a local directory path (using abspath to avoid confusion).
+ dataset_name = os.path.abspath(dataset_name)
+
+ # Check that the directory contains only allowed file types.
+ dataset_files = [ + f for _, _, files in os.walk(dataset_name) for f in files + ] + if not all( + Path(f).suffix in SUPPORTED_EXTENSIONS + + HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' + for f in dataset_files + ): + log.error(f"Invalid file extension found in dataset during safe load.") + raise InvalidFileExtensionError( + dataset_name, + SUPPORTED_EXTENSIONS, + ) + return dataset_name class DatasetConstructor: @@ -904,50 +970,11 @@ def build_from_hf( filtered_dataset = None try: if safe_load: - if not os.path.isdir(dataset_name): - # dataset_name is not a local dir path, download if needed. - local_dataset_dir = os.path.join( - DOWNLOADED_FT_DATASETS_DIRPATH, - dataset_name, - ) - - if _is_empty_or_nonexistent(dirpath=local_dataset_dir): - # Safely load a dataset from HF Hub with restricted file types. - hf_hub.snapshot_download( - dataset_name, - repo_type='dataset', - allow_patterns=[ - '*' + ext for ext in SUPPORTED_EXTENSIONS - ], - token=hf_kwargs.get('token', None), - revision=hf_kwargs.get('revision', None), - local_dir_use_symlinks=False, - local_dir=local_dataset_dir, - ) - if _is_empty_or_nonexistent(dirpath=local_dataset_dir): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) - # Set dataset_name to the downloaded location. - dataset_name = local_dataset_dir - - # dataset_name is a local dir path. Use the abspath to prevent confusion. - dataset_name = os.path.abspath(dataset_name) - - # Ensure that the local dir contains only allowed file types. - dataset_files = [ - f for _, _, files in os.walk(dataset_name) for f in files - ] - if not all( - Path(f).suffix in SUPPORTED_EXTENSIONS + - HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' - for f in dataset_files - ): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) + dataset_name = download_hf_dataset_if_needed( + dataset_name, + safe_load, + hf_kwargs, + ) dataset = hf_datasets.load_dataset( dataset_name, From 983a32d4d37a0ab7c79fd481b57916fa67c45d1b Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Thu, 12 Sep 2024 13:34:59 -0700 Subject: [PATCH 39/42] split_eval_set skeleton --- llmfoundry/command_utils/__init__.py | 36 +++++++------ .../command_utils/data_prep/split_eval_set.py | 37 +++++++++++++ scripts/data_prep/split_eval_set.py | 54 +++++++++++++++++++ 3 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/split_eval_set.py create mode 100644 scripts/data_prep/split_eval_set.py diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 0226c4f408..5407b723cc 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -20,6 +20,7 @@ convert_text_to_mds, convert_text_to_mds_from_args, ) +from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -33,21 +34,22 @@ ) __all__ = [ - 'train', - 'train_from_yaml', - 'TrainConfig', - 'TRAIN_CONFIG_KEYS', - 'validate_config', - 'evaluate', - 'eval_from_yaml', - 'convert_dataset_hf', - 'convert_dataset_hf_from_args', - 'convert_dataset_json', - 'convert_dataset_json_from_args', - 'convert_finetuning_dataset_from_args', - 'convert_finetuning_dataset', - 'convert_text_to_mds', - 'convert_text_to_mds_from_args', - 'convert_delta_to_json_from_args', - 'fetch_DT', + "train", + "train_from_yaml", + "TrainConfig", + "TRAIN_CONFIG_KEYS", + "validate_config", + "evaluate", + "eval_from_yaml", + 
"convert_dataset_hf", + "convert_dataset_hf_from_args", + "convert_dataset_json", + "convert_dataset_json_from_args", + "convert_finetuning_dataset_from_args", + "convert_finetuning_dataset", + "convert_text_to_mds", + "convert_text_to_mds_from_args", + "convert_delta_to_json_from_args", + "fetch_DT", + "split_eval_set_from_args", ] diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py new file mode 100644 index 0000000000..01205cba15 --- /dev/null +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -0,0 +1,37 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import json +from enum import Enum + +import datasets +from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed + + +class SupportedDataFormats(Enum): + REMOTE_JSONL = "jsonl" # UC JSONL + DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL + HF = "huggingface" + + +def validate_data_path(data_path: str) -> None: + """ + Validates the data path and returns the format of the data. + + Args: + data_path (str): Path to the training dataset + """ + + +def split_eval_set_from_args() -> None: + """ + Args: + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + output_path (str): Directory to save the split dataset + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset + """ + pass diff --git a/scripts/data_prep/split_eval_set.py b/scripts/data_prep/split_eval_set.py new file mode 100644 index 0000000000..ee8bfee453 --- /dev/null +++ b/scripts/data_prep/split_eval_set.py @@ -0,0 +1,54 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from argparse import ArgumentParser + +from llmfoundry.command_utils import split_eval_set_from_args + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Split training dataset into train and eval sets", + ) + parser.add_argument( + "--data_path_folder", required=True, type=str, help="Path to the training dataset folder" + ) + parser.add_argument( + "--data_path_split", required=True, type=str, help="Path to the training dataset split" + ) + parser.add_argument( + "--output_path", + required=True, + type=str, + help="Path to save the split dataset", + ) + parser.add_argument( + "--eval_split_ratio", + required=False, + type=float, + default=0.1, + help="Ratio of the dataset to use for evaluation. 
The remainder will be used for training", + ) + parser.add_argument( + "--max_eval_samples", + required=False, + type=int, + default=None, + help="Maximum number of samples to include in the eval set", + ) + parser.add_argument( + "--seed", + required=False, + type=int, + default=42, + help="Random seed for splitting the dataset", + ) + args = parser.parse_args() + split_eval_set_from_args( + data_path_folder=args.data_path_folder, + data_path_split=args.data_path_split, + output_path=args.output_path, + eval_split_ratio=args.eval_split_ratio, + max_eval_samples=args.max_eval_samples, + seed=args.seed, + ) From d3d587da142ac67c74fc6ba60f5a24e219910078 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Sun, 15 Sep 2024 16:22:57 -0700 Subject: [PATCH 40/42] splitting script --- .../command_utils/data_prep/split_eval_set.py | 162 ++++++++++++++++-- llmfoundry/data/finetuning/tasks.py | 6 +- 2 files changed, 152 insertions(+), 16 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py index 01205cba15..f6afc8722d 100644 --- a/llmfoundry/command_utils/data_prep/split_eval_set.py +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -1,31 +1,167 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import logging import os +import re import json -from enum import Enum +import contextlib +import datasets as hf_datasets +import numpy as np +from typing import Optional -import datasets -from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed +from composer.utils import get_file +from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data -class SupportedDataFormats(Enum): - REMOTE_JSONL = "jsonl" # UC JSONL - DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL - HF = "huggingface" +DELTA_JSONL_REGEX = re.compile(r"^tmp-t$") +REMOTE_OBJECT_STORE_FILE_REGEX = re.compile( + r"^((s3|oci|gs):\/\/|dbfs:\/Volumes\/)[/a-zA-Z0-9 ()_\-.]+$" +) +HF_REGEX = re.compile(r"^[/a-zA-Z0-9 ()_\-.]+$") +TEMP_DIR = "tmp-split" -def validate_data_path(data_path: str) -> None: +log = logging.getLogger(__name__) + +import sys + +log.setLevel(logging.DEBUG) +log.addHandler(logging.StreamHandler(sys.stdout)) + + +def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str: """ - Validates the data path and returns the format of the data. + Prepares dataset as a local JSONL file. Downloads from remote object store or HF if necessary. + + This function is intended to be invoked by DBX Finetuning. + Thus, it assumes the provided data is in one of three formats: + 1. A Delta table converted to JSONL at 'tmp-t/{data_path_split}-00000-of-00001.jsonl` + using the 'llmfoundry.scripts.convert_delta_to_json.py' script. + 2. A JSONL stored as a remote object store file (e.g. S3, OCI, GCS) + 3. 
A Hugging Face dataset Args: - data_path (str): Path to the training dataset + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + + Returns: + str: Path to the training dataset """ + os.makedirs(TEMP_DIR, exist_ok=True) + + if DELTA_JSONL_REGEX.match(data_path_folder): + data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl") + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl" + ) + remote_path = f"{data_path_folder}/{data_path_split}.jsonl" + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + try: + get_file(remote_path, data_path, overwrite=True) + except FileNotFoundError as e: + # TODO: error handling + raise e + + elif HF_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}" + ) + # TODO: maybe add support for HF kwargs + local_hf_path = maybe_safe_download_hf_data(data_path_folder) + # convert dataset split to JSONL + dataset = hf_datasets.load_dataset( + local_hf_path, + split=data_path_split, + ) + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + with open(data_path, "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") + + else: + # TODO: error handling + raise ValueError( + f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset." + ) + + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + return data_path + +@contextlib.contextmanager +def temp_seed(seed: int): + state = np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) -def split_eval_set_from_args() -> None: + +def _split_examples( + data_path: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int], + seed: Optional[int] = None, +) -> None: + """ + Splits the dataset into training and evaluation sets. + + Args: + data_path (str): Path to the training dataset (local jsonl file) + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. 
If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset """ + # first pass: count total number of lines and determine sample size + total_lines = 0 + with open(data_path, "r") as infile: + for _ in infile: + total_lines += 1 + sample_size = int(eval_split_ratio * total_lines) + if max_eval_samples is not None: + sample_size = min(sample_size, max_eval_samples) + + with temp_seed(seed) if seed is not None else contextlib.nullcontext(): + random_numbers = np.random.rand(total_lines) + sample_indices = set(np.argsort(random_numbers)[:sample_size]) + + # second pass: sample indices + with open(data_path, "r") as infile, open( + os.path.join(output_path, "train.jsonl"), "w" + ) as train_outfile, open(os.path.join(output_path, "eval.jsonl"), "w") as eval_outfile: + for idx, line in enumerate(infile): + if idx in sample_indices: + eval_outfile.write(line) + else: + train_outfile.write(line) + + log.info( + f"Split {data_path} into train set of size {total_lines - sample_size} and eval set of size {sample_size}." + ) + + +def split_eval_set_from_args( + data_path_folder: str, + data_path_split: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int] = None, + seed: Optional[int] = None, +) -> None: + """ + A wrapper for split_eval_set that parses arguments + Args: data_path_folder (str): Path to the training dataset folder data_path_split (str): Data split @@ -34,4 +170,6 @@ def split_eval_set_from_args() -> None: max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - pass + os.makedirs(output_path, exist_ok=True) + data_path = maybe_download_data_as_json(data_path_folder, data_path_split) + _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index f894a22a6a..f9ffaf4463 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -706,7 +706,7 @@ def state_dict(self, num_samples: int, from_beginning=from_beginning, ) -def download_hf_dataset_if_needed( +def maybe_safe_download_hf_data( dataset_name: str, hf_kwargs: Optional[dict[str, Any]] = None ) -> str: @@ -716,7 +716,6 @@ def download_hf_dataset_if_needed( Args: dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s) directory or object store bucket containing the file {split}.jsonl. - safe_load (bool): Whether to enforce safe loading of the dataset. hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`. 
Returns:

@@ -970,9 +969,8 @@ def build_from_hf(
 filtered_dataset = None
 try:
 if safe_load:
- dataset_name = download_hf_dataset_if_needed(
+ dataset_name = maybe_safe_download_hf_data(
 dataset_name,
- safe_load,
 hf_kwargs,
 )

 dataset = hf_datasets.load_dataset(
 dataset_name,

From b921b309829ea9c4f4b428a22cf07cf34a2333e0 Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Mon, 16 Sep 2024 00:58:53 -0700
Subject: [PATCH 41/42] error handling and testing

---
 llmfoundry/command_utils/__init__.py | 6 +-
 .../command_utils/data_prep/split_eval_set.py | 38 ++--
 .../data_prep/test_split_eval_set.py | 163 ++++++++++++++++++
 3 files changed, 183 insertions(+), 24 deletions(-)
 create mode 100644 tests/a_scripts/data_prep/test_split_eval_set.py

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
index 5407b723cc..8757f3b1bc 100644
--- a/llmfoundry/command_utils/__init__.py
+++ b/llmfoundry/command_utils/__init__.py
@@ -20,7 +20,10 @@
 convert_text_to_mds,
 convert_text_to_mds_from_args,
 )
-from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args
+from llmfoundry.command_utils.data_prep.split_eval_set import (
+ split_eval_set_from_args,
+ split_examples,
+)
 from llmfoundry.command_utils.eval import (
 eval_from_yaml,
 evaluate,
@@ -52,4 +55,5 @@
 "convert_delta_to_json_from_args",
 "fetch_DT",
 "split_eval_set_from_args",
+ "split_examples",
 ]
diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py
index f6afc8722d..b4b150f81f 100644
--- a/llmfoundry/command_utils/data_prep/split_eval_set.py
+++ b/llmfoundry/command_utils/data_prep/split_eval_set.py
@@ -10,7 +10,7 @@
 import numpy as np
 from typing import Optional

-from composer.utils import get_file
+import composer.utils as utils
 from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data

@@ -24,11 +24,6 @@

 log = logging.getLogger(__name__)

-import sys
-
-log.setLevel(logging.DEBUG)
-log.addHandler(logging.StreamHandler(sys.stdout))
-

 def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str:
 """
@@ -51,22 +46,16 @@ def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) ->
 os.makedirs(TEMP_DIR, exist_ok=True)

 if DELTA_JSONL_REGEX.match(data_path_folder):
+ log.info(f"Dataset is converted from Delta table. Using local file {data_path_folder}")
 data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl")
- if not os.path.exists(data_path):
- # TODO: error handling
- raise FileNotFoundError(f"File {data_path} does not exist.")

- if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
+ elif REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
 log.info(
 f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl"
 )
 remote_path = f"{data_path_folder}/{data_path_split}.jsonl"
 data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
- try:
- get_file(remote_path, data_path, overwrite=True)
- except FileNotFoundError as e:
- # TODO: error handling
- raise e
+ utils.get_file(remote_path, data_path, overwrite=True)

 elif HF_REGEX.match(data_path_folder):
 log.info(
@@ -85,20 +74,21 @@ def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) ->
 f.write(json.dumps(example) + "\n")

 else:
- raise ValueError(
- f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset."
+ f"Encountered unknown data path format when splitting dataset: {data_path_folder} with split {data_path_split}" ) if not os.path.exists(data_path): - # TODO: error handling - raise FileNotFoundError(f"File {data_path} does not exist.") + raise FileNotFoundError( + f"Expected dataset file at {data_path} for splitting, but it does not exist." + ) return data_path @contextlib.contextmanager def temp_seed(seed: int): + log.info(f"Setting random seed to {seed}") state = np.random.get_state() np.random.seed(seed) try: @@ -107,11 +97,11 @@ def temp_seed(seed: int): np.random.set_state(state) -def _split_examples( +def split_examples( data_path: str, output_path: str, eval_split_ratio: float, - max_eval_samples: Optional[int], + max_eval_samples: Optional[int] = None, seed: Optional[int] = None, ) -> None: """ @@ -119,10 +109,13 @@ def _split_examples( Args: data_path (str): Path to the training dataset (local jsonl file) + output_path (str): Directory to save the split dataset eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ + os.makedirs(output_path, exist_ok=True) + # first pass: count total number of lines and determine sample size total_lines = 0 with open(data_path, "r") as infile: @@ -170,6 +163,5 @@ def split_eval_set_from_args( max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - os.makedirs(output_path, exist_ok=True) data_path = maybe_download_data_as_json(data_path_folder, data_path_split) - _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) + split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/tests/a_scripts/data_prep/test_split_eval_set.py b/tests/a_scripts/data_prep/test_split_eval_set.py new file mode 100644 index 0000000000..a1b80b91cd --- /dev/null +++ b/tests/a_scripts/data_prep/test_split_eval_set.py @@ -0,0 +1,163 @@ +import os +import json +import pytest +import hashlib +from unittest.mock import patch + +from llmfoundry.command_utils import split_eval_set_from_args, split_examples + +# Default values +OUTPUT_DIR = "tmp-split" +TMPT_DIR = "tmp-t" +DATA_PATH_SPLIT = "train" +EVAL_SPLIT_RATIO = 0.1 +DEFAULT_FILE = TMPT_DIR + "/train-00000-of-00001.jsonl" + + +def calculate_file_hash(filepath: str) -> str: + with open(filepath, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + return file_hash + + +def count_lines(filepath: str) -> int: + with open(filepath, "r") as f: + return sum(1 for _ in f) + + +@pytest.fixture(scope="module", autouse=True) +def setup_and_teardown_module(): + # Setup: create local testing file + os.makedirs(TMPT_DIR, exist_ok=True) + with open(DEFAULT_FILE, "w") as f: + for i in range(1000): + f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + yield + + # Teardown: clean up output and tmp directories + os.system(f"rm -rf {OUTPUT_DIR}") + os.system(f"rm -rf {TMPT_DIR}") + + +def test_basic_split(): + """Test basic functionality on local file""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + assert 
os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + + +def test_basic_split_output_exists(): + """Test that split overwrites existing files in output directory""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + os.makedirs(output_path, exist_ok=True) + train_file = os.path.join(output_path, "train.jsonl") + eval_file = os.path.join(output_path, "eval.jsonl") + with open(train_file, "w") as f: + f.write("existing file train") + with open(eval_file, "w") as f: + f.write("existing file eval") + old_train_hash = calculate_file_hash(train_file) + old_eval_hash = calculate_file_hash(eval_file) + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + ) + assert calculate_file_hash(train_file) != old_train_hash + assert calculate_file_hash(eval_file) != old_eval_hash + + +def test_max_eval_samples(): + """Test case where max_eval_samples < eval_split_ratio * total samples""" + output_path = os.path.join(OUTPUT_DIR, "max-eval-test") + max_eval_samples = 50 + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + max_eval_samples, + ) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert eval_lines == max_eval_samples + + +def test_eval_split_ratio(): + """Test case where max_eval_samples is not used""" + output_path = os.path.join(OUTPUT_DIR, "eval-split-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + original_data_lines = count_lines(DEFAULT_FILE) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert abs(eval_lines - EVAL_SPLIT_RATIO * original_data_lines) < 1 # allow for rounding errors + + +def test_seed_consistency(): + """Test if the same seed generates consistent splits""" + output_path_1 = os.path.join(OUTPUT_DIR, "seed-test-1") + output_path_2 = os.path.join(OUTPUT_DIR, "seed-test-2") + split_examples(DEFAULT_FILE, output_path_1, EVAL_SPLIT_RATIO, seed=12345) + split_examples(DEFAULT_FILE, output_path_2, EVAL_SPLIT_RATIO, seed=12345) + train_hash_1 = calculate_file_hash(os.path.join(output_path_1, "train.jsonl")) + train_hash_2 = calculate_file_hash(os.path.join(output_path_2, "train.jsonl")) + eval_hash_1 = calculate_file_hash(os.path.join(output_path_1, "eval.jsonl")) + eval_hash_2 = calculate_file_hash(os.path.join(output_path_2, "eval.jsonl")) + + assert train_hash_1 == train_hash_2 + assert eval_hash_1 == eval_hash_2 + + output_path_3 = os.path.join(OUTPUT_DIR, "seed-test-3") + split_examples(DEFAULT_FILE, output_path_3, EVAL_SPLIT_RATIO, seed=54321) + train_hash_3 = calculate_file_hash(os.path.join(output_path_3, "train.jsonl")) + eval_hash_3 = calculate_file_hash(os.path.join(output_path_3, "eval.jsonl")) + + assert train_hash_1 != train_hash_3 + assert eval_hash_1 != eval_hash_3 + + +def test_hf_data_split(): + """Test splitting a dataset from Hugging Face""" + output_path = os.path.join(OUTPUT_DIR, "hf-split-test") + split_eval_set_from_args( + "databricks/databricks-dolly-15k", "train", output_path, EVAL_SPLIT_RATIO + ) + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def _mock_get_file(remote_path: str, data_path: str, overwrite: bool): + with open(data_path, "w") as f: + for i in range(1000): + 
f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + + +def test_remote_store_data_split(): + """Test splitting a dataset from a remote store""" + output_path = os.path.join(OUTPUT_DIR, "remote-split-test") + with patch("composer.utils.get_file", side_effect=_mock_get_file) as mock_get_file: + split_eval_set_from_args( + "dbfs:/Volumes/test/test/test.jsonl", + "unique-split-name", + output_path, + EVAL_SPLIT_RATIO, + ) + mock_get_file.assert_called() + + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def test_missing_delta_file_error(): + # expects file 'TMPT_DIR/missing-00000-of-00001.jsonl + with pytest.raises(FileNotFoundError): + split_eval_set_from_args(TMPT_DIR, "missing", OUTPUT_DIR, EVAL_SPLIT_RATIO) + + +def test_unknown_file_format_error(): + with pytest.raises(ValueError): + split_eval_set_from_args("s3:/path/to/file.jsonl", "train", OUTPUT_DIR, EVAL_SPLIT_RATIO) From 792500186e4b233a062085318e32f777a6261692 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Mon, 16 Sep 2024 01:08:53 -0700 Subject: [PATCH 42/42] undo autoformat --- llmfoundry/command_utils/__init__.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 8757f3b1bc..4f74fe6ec9 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -37,23 +37,23 @@ ) __all__ = [ - "train", - "train_from_yaml", - "TrainConfig", - "TRAIN_CONFIG_KEYS", - "validate_config", - "evaluate", - "eval_from_yaml", - "convert_dataset_hf", - "convert_dataset_hf_from_args", - "convert_dataset_json", - "convert_dataset_json_from_args", - "convert_finetuning_dataset_from_args", - "convert_finetuning_dataset", - "convert_text_to_mds", - "convert_text_to_mds_from_args", - "convert_delta_to_json_from_args", - "fetch_DT", - "split_eval_set_from_args", - "split_examples", + 'train', + 'train_from_yaml', + 'TrainConfig', + 'TRAIN_CONFIG_KEYS', + 'validate_config', + 'evaluate', + 'eval_from_yaml', + 'convert_dataset_hf', + 'convert_dataset_hf_from_args', + 'convert_dataset_json', + 'convert_dataset_json_from_args', + 'convert_finetuning_dataset_from_args', + 'convert_finetuning_dataset', + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', + 'convert_delta_to_json_from_args', + 'fetch_DT', + 'split_eval_set_from_args', + 'split_examples', ]
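
A minimal usage sketch of the eval-split API this series adds (patches 39-42), assuming the series is applied as-is. The input/output paths and parameter values below are illustrative assumptions, not values taken from the patches:

    from llmfoundry.command_utils import split_eval_set_from_args, split_examples

    # End-to-end helper: resolves the input (a Delta table converted to JSONL
    # under 'tmp-t', a remote object store file, or a Hugging Face dataset),
    # then writes train.jsonl and eval.jsonl under output_path.
    split_eval_set_from_args(
        data_path_folder="tmp-t",      # expects tmp-t/train-00000-of-00001.jsonl
        data_path_split="train",
        output_path="splits/my-run",   # hypothetical output directory
        eval_split_ratio=0.1,          # 10% of examples go to eval.jsonl
        max_eval_samples=500,          # optional cap on the eval set size
        seed=42,                       # optional seed for a reproducible split
    )

    # Lower-level helper: split a JSONL file that is already local.
    split_examples(
        data_path="data/train.jsonl",  # hypothetical local JSONL file
        output_path="splits/local-run",
        eval_split_ratio=0.05,
        seed=42,
    )

The same split can be driven from the command line via the script added in patch 39, again with illustrative argument values:

    python scripts/data_prep/split_eval_set.py \
        --data_path_folder tmp-t \
        --data_path_split train \
        --output_path splits/my-run \
        --eval_split_ratio 0.1 \
        --seed 42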