From fe27b8db9ef639bbd8f9aa32cea17bcff2f09137 Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Tue, 10 Sep 2024 09:04:08 -0700
Subject: [PATCH 01/42] refactor hf download

---
 llmfoundry/data/finetuning/tasks.py | 115 +++++++++++++++++-----------
 1 file changed, 71 insertions(+), 44 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index e8f6484ef2..824b7b3bd6 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -702,7 +702,73 @@ def state_dict(self, num_samples: int,
             num_samples=num_samples,
             from_beginning=from_beginning,
         )
+
+def download_hf_dataset_if_needed(
+    dataset_name: str,
+    hf_kwargs: Optional[dict[str, Any]] = None
+) -> str:
+    """
+    Download a HuggingFace dataset locally if it does not already exist.
+
+    Args:
+        dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s)
+            directory or object store bucket containing the file {split}.jsonl.
+        safe_load (bool): Whether to enforce safe loading of the dataset.
+        hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`.
+
+    Returns:
+        str: The local path to the dataset.
+    """
+    if hf_kwargs is None:
+        hf_kwargs = {}
+
+    if not os.path.isdir(dataset_name):
+        local_dataset_dir = os.path.join(
+            DOWNLOADED_FT_DATASETS_DIRPATH,
+            dataset_name,
+        )
+
+        if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+            # Safely load the dataset from HF Hub with restricted file types.
+            hf_hub.snapshot_download(
+                dataset_name,
+                repo_type='dataset',
+                allow_patterns=[
+                    '*' + ext for ext in SUPPORTED_EXTENSIONS
+                ],
+                token=hf_kwargs.get('token', None),
+                revision=hf_kwargs.get('revision', None),
+                local_dir_use_symlinks=False,
+                local_dir=local_dataset_dir,
+            )
+            if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+                log.error("Failed to safely load the dataset from HF Hub.")
+                raise InvalidFileExtensionError(
+                    dataset_name,
+                    SUPPORTED_EXTENSIONS,
+                )
+        # Set dataset_name to the downloaded location.
+        dataset_name = local_dataset_dir
+
+    # Ensure dataset_name is a local directory path (using abspath to avoid confusion).
+    dataset_name = os.path.abspath(dataset_name)
+
+    # Check that the directory contains only allowed file types.
+    dataset_files = [
+        f for _, _, files in os.walk(dataset_name) for f in files
+    ]
+    if not all(
+        Path(f).suffix in SUPPORTED_EXTENSIONS +
+        HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore'
+        for f in dataset_files
+    ):
+        log.error("Invalid file extension found in dataset during safe load.")
+        raise InvalidFileExtensionError(
+            dataset_name,
+            SUPPORTED_EXTENSIONS,
+        )
+    return dataset_name
 
 
 class DatasetConstructor:
 
@@ -901,50 +967,11 @@ def build_from_hf(
         filtered_dataset = None
         try:
             if safe_load:
-                if not os.path.isdir(dataset_name):
-                    # dataset_name is not a local dir path, download if needed.
-                    local_dataset_dir = os.path.join(
-                        DOWNLOADED_FT_DATASETS_DIRPATH,
-                        dataset_name,
-                    )
-
-                    if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
-                        # Safely load a dataset from HF Hub with restricted file types.
-                        hf_hub.snapshot_download(
-                            dataset_name,
-                            repo_type='dataset',
-                            allow_patterns=[
-                                '*' + ext for ext in SUPPORTED_EXTENSIONS
-                            ],
-                            token=hf_kwargs.get('token', None),
-                            revision=hf_kwargs.get('revision', None),
-                            local_dir_use_symlinks=False,
-                            local_dir=local_dataset_dir,
-                        )
-                        if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
-                            raise InvalidFileExtensionError(
-                                dataset_name,
-                                SUPPORTED_EXTENSIONS,
-                            )
-                    # Set dataset_name to the downloaded location.
- dataset_name = local_dataset_dir - - # dataset_name is a local dir path. Use the abspath to prevent confusion. - dataset_name = os.path.abspath(dataset_name) - - # Ensure that the local dir contains only allowed file types. - dataset_files = [ - f for _, _, files in os.walk(dataset_name) for f in files - ] - if not all( - Path(f).suffix in SUPPORTED_EXTENSIONS + - HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' - for f in dataset_files - ): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) + dataset_name = download_hf_dataset_if_needed( + dataset_name, + safe_load, + hf_kwargs, + ) dataset = hf_datasets.load_dataset( dataset_name, From 18859b15df3cee80435583947f9cbf49880a358d Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Thu, 12 Sep 2024 13:34:59 -0700 Subject: [PATCH 02/42] split_eval_set skeleton --- llmfoundry/command_utils/__init__.py | 36 +++++++------ .../command_utils/data_prep/split_eval_set.py | 37 +++++++++++++ scripts/data_prep/split_eval_set.py | 54 +++++++++++++++++++ 3 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/split_eval_set.py create mode 100644 scripts/data_prep/split_eval_set.py diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 0226c4f408..5407b723cc 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -20,6 +20,7 @@ convert_text_to_mds, convert_text_to_mds_from_args, ) +from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -33,21 +34,22 @@ ) __all__ = [ - 'train', - 'train_from_yaml', - 'TrainConfig', - 'TRAIN_CONFIG_KEYS', - 'validate_config', - 'evaluate', - 'eval_from_yaml', - 'convert_dataset_hf', - 'convert_dataset_hf_from_args', - 'convert_dataset_json', - 'convert_dataset_json_from_args', - 'convert_finetuning_dataset_from_args', - 'convert_finetuning_dataset', - 'convert_text_to_mds', - 'convert_text_to_mds_from_args', - 'convert_delta_to_json_from_args', - 'fetch_DT', + "train", + "train_from_yaml", + "TrainConfig", + "TRAIN_CONFIG_KEYS", + "validate_config", + "evaluate", + "eval_from_yaml", + "convert_dataset_hf", + "convert_dataset_hf_from_args", + "convert_dataset_json", + "convert_dataset_json_from_args", + "convert_finetuning_dataset_from_args", + "convert_finetuning_dataset", + "convert_text_to_mds", + "convert_text_to_mds_from_args", + "convert_delta_to_json_from_args", + "fetch_DT", + "split_eval_set_from_args", ] diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py new file mode 100644 index 0000000000..01205cba15 --- /dev/null +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -0,0 +1,37 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import json +from enum import Enum + +import datasets +from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed + + +class SupportedDataFormats(Enum): + REMOTE_JSONL = "jsonl" # UC JSONL + DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL + HF = "huggingface" + + +def validate_data_path(data_path: str) -> None: + """ + Validates the data path and returns the format of the data. 
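+    The recognized formats, mirroring ``SupportedDataFormats`` above, are a
+    remote (UC) JSONL file, a Delta table preprocessed to JSONL, and a
+    Hugging Face dataset.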
+ + Args: + data_path (str): Path to the training dataset + """ + + +def split_eval_set_from_args() -> None: + """ + Args: + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + output_path (str): Directory to save the split dataset + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset + """ + pass diff --git a/scripts/data_prep/split_eval_set.py b/scripts/data_prep/split_eval_set.py new file mode 100644 index 0000000000..ee8bfee453 --- /dev/null +++ b/scripts/data_prep/split_eval_set.py @@ -0,0 +1,54 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from argparse import ArgumentParser + +from llmfoundry.command_utils import split_eval_set_from_args + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Split training dataset into train and eval sets", + ) + parser.add_argument( + "--data_path_folder", required=True, type=str, help="Path to the training dataset folder" + ) + parser.add_argument( + "--data_path_split", required=True, type=str, help="Path to the training dataset split" + ) + parser.add_argument( + "--output_path", + required=True, + type=str, + help="Path to save the split dataset", + ) + parser.add_argument( + "--eval_split_ratio", + required=False, + type=float, + default=0.1, + help="Ratio of the dataset to use for evaluation. The remainder will be used for training", + ) + parser.add_argument( + "--max_eval_samples", + required=False, + type=int, + default=None, + help="Maximum number of samples to include in the eval set", + ) + parser.add_argument( + "--seed", + required=False, + type=int, + default=42, + help="Random seed for splitting the dataset", + ) + args = parser.parse_args() + split_eval_set_from_args( + data_path_folder=args.data_path_folder, + data_path_split=args.data_path_split, + output_path=args.output_path, + eval_split_ratio=args.eval_split_ratio, + max_eval_samples=args.max_eval_samples, + seed=args.seed, + ) From f29ef67b306be24717e04053cb98890b1f33472e Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Sun, 15 Sep 2024 16:22:57 -0700 Subject: [PATCH 03/42] splitting script --- .../command_utils/data_prep/split_eval_set.py | 162 ++++++++++++++++-- llmfoundry/data/finetuning/tasks.py | 6 +- 2 files changed, 152 insertions(+), 16 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py index 01205cba15..f6afc8722d 100644 --- a/llmfoundry/command_utils/data_prep/split_eval_set.py +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -1,31 +1,167 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import logging import os +import re import json -from enum import Enum +import contextlib +import datasets as hf_datasets +import numpy as np +from typing import Optional -import datasets -from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed +from composer.utils import get_file +from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data -class SupportedDataFormats(Enum): - REMOTE_JSONL = "jsonl" # UC JSONL - DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL - HF = "huggingface" +DELTA_JSONL_REGEX = re.compile(r"^tmp-t$") 
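+# Matches only the literal staging folder name "tmp-t" that
+# convert_delta_to_json.py writes Delta-exported JSONL into.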
+REMOTE_OBJECT_STORE_FILE_REGEX = re.compile( + r"^((s3|oci|gs):\/\/|dbfs:\/Volumes\/)[/a-zA-Z0-9 ()_\-.]+$" +) +HF_REGEX = re.compile(r"^[/a-zA-Z0-9 ()_\-.]+$") +TEMP_DIR = "tmp-split" -def validate_data_path(data_path: str) -> None: +log = logging.getLogger(__name__) + +import sys + +log.setLevel(logging.DEBUG) +log.addHandler(logging.StreamHandler(sys.stdout)) + + +def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str: """ - Validates the data path and returns the format of the data. + Prepares dataset as a local JSONL file. Downloads from remote object store or HF if necessary. + + This function is intended to be invoked by DBX Finetuning. + Thus, it assumes the provided data is in one of three formats: + 1. A Delta table converted to JSONL at 'tmp-t/{data_path_split}-00000-of-00001.jsonl` + using the 'llmfoundry.scripts.convert_delta_to_json.py' script. + 2. A JSONL stored as a remote object store file (e.g. S3, OCI, GCS) + 3. A Hugging Face dataset Args: - data_path (str): Path to the training dataset + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + + Returns: + str: Path to the training dataset """ + os.makedirs(TEMP_DIR, exist_ok=True) + + if DELTA_JSONL_REGEX.match(data_path_folder): + data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl") + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl" + ) + remote_path = f"{data_path_folder}/{data_path_split}.jsonl" + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + try: + get_file(remote_path, data_path, overwrite=True) + except FileNotFoundError as e: + # TODO: error handling + raise e + + elif HF_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}" + ) + # TODO: maybe add support for HF kwargs + local_hf_path = maybe_safe_download_hf_data(data_path_folder) + # convert dataset split to JSONL + dataset = hf_datasets.load_dataset( + local_hf_path, + split=data_path_split, + ) + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + with open(data_path, "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") + + else: + # TODO: error handling + raise ValueError( + f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset." + ) + + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + return data_path + +@contextlib.contextmanager +def temp_seed(seed: int): + state = np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) -def split_eval_set_from_args() -> None: + +def _split_examples( + data_path: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int], + seed: Optional[int] = None, +) -> None: + """ + Splits the dataset into training and evaluation sets. + + Args: + data_path (str): Path to the training dataset (local jsonl file) + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. 
If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset """ + # first pass: count total number of lines and determine sample size + total_lines = 0 + with open(data_path, "r") as infile: + for _ in infile: + total_lines += 1 + sample_size = int(eval_split_ratio * total_lines) + if max_eval_samples is not None: + sample_size = min(sample_size, max_eval_samples) + + with temp_seed(seed) if seed is not None else contextlib.nullcontext(): + random_numbers = np.random.rand(total_lines) + sample_indices = set(np.argsort(random_numbers)[:sample_size]) + + # second pass: sample indices + with open(data_path, "r") as infile, open( + os.path.join(output_path, "train.jsonl"), "w" + ) as train_outfile, open(os.path.join(output_path, "eval.jsonl"), "w") as eval_outfile: + for idx, line in enumerate(infile): + if idx in sample_indices: + eval_outfile.write(line) + else: + train_outfile.write(line) + + log.info( + f"Split {data_path} into train set of size {total_lines - sample_size} and eval set of size {sample_size}." + ) + + +def split_eval_set_from_args( + data_path_folder: str, + data_path_split: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int] = None, + seed: Optional[int] = None, +) -> None: + """ + A wrapper for split_eval_set that parses arguments + Args: data_path_folder (str): Path to the training dataset folder data_path_split (str): Data split @@ -34,4 +170,6 @@ def split_eval_set_from_args() -> None: max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - pass + os.makedirs(output_path, exist_ok=True) + data_path = maybe_download_data_as_json(data_path_folder, data_path_split) + _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 824b7b3bd6..ea6004e01c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -703,7 +703,7 @@ def state_dict(self, num_samples: int, from_beginning=from_beginning, ) -def download_hf_dataset_if_needed( +def maybe_safe_download_hf_data( dataset_name: str, hf_kwargs: Optional[dict[str, Any]] = None ) -> str: @@ -713,7 +713,6 @@ def download_hf_dataset_if_needed( Args: dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s) directory or object store bucket containing the file {split}.jsonl. - safe_load (bool): Whether to enforce safe loading of the dataset. hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`. 
 
     Returns:

@@ -967,9 +966,8 @@ def build_from_hf(
         filtered_dataset = None
         try:
             if safe_load:
-                dataset_name = download_hf_dataset_if_needed(
+                dataset_name = maybe_safe_download_hf_data(
                     dataset_name,
-                    safe_load,
                     hf_kwargs,
                 )

From 3d9d51f2a66a1b3a8d4888b712b87101c6f1397b Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Mon, 16 Sep 2024 00:58:53 -0700
Subject: [PATCH 04/42] error handling and testing

---
 llmfoundry/command_utils/__init__.py          |   6 +-
 .../command_utils/data_prep/split_eval_set.py |  38 ++--
 .../data_prep/test_split_eval_set.py          | 163 ++++++++++++++++++
 3 files changed, 183 insertions(+), 24 deletions(-)
 create mode 100644 tests/a_scripts/data_prep/test_split_eval_set.py

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
index 5407b723cc..8757f3b1bc 100644
--- a/llmfoundry/command_utils/__init__.py
+++ b/llmfoundry/command_utils/__init__.py
@@ -20,7 +20,10 @@
     convert_text_to_mds,
     convert_text_to_mds_from_args,
 )
-from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args
+from llmfoundry.command_utils.data_prep.split_eval_set import (
+    split_eval_set_from_args,
+    split_examples,
+)
 from llmfoundry.command_utils.eval import (
     eval_from_yaml,
     evaluate,
@@ -52,4 +55,5 @@
     "convert_delta_to_json_from_args",
     "fetch_DT",
     "split_eval_set_from_args",
+    "split_examples",
 ]

diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py
index f6afc8722d..b4b150f81f 100644
--- a/llmfoundry/command_utils/data_prep/split_eval_set.py
+++ b/llmfoundry/command_utils/data_prep/split_eval_set.py
@@ -10,7 +10,7 @@
 import numpy as np
 from typing import Optional
 
-from composer.utils import get_file
+import composer.utils as utils
 from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data
 
 
@@ -24,11 +24,6 @@
 
 log = logging.getLogger(__name__)
 
-import sys
-
-log.setLevel(logging.DEBUG)
-log.addHandler(logging.StreamHandler(sys.stdout))
-
 
 def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str:
     """
 
     os.makedirs(TEMP_DIR, exist_ok=True)
 
     if DELTA_JSONL_REGEX.match(data_path_folder):
+        log.info(f"Dataset is converted from Delta table. Using local file {data_path_folder}")
         data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl")
-        if not os.path.exists(data_path):
-            # TODO: error handling
-            raise FileNotFoundError(f"File {data_path} does not exist.")
 
-    if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
+    elif REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
         log.info(
             f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl"
         )
         remote_path = f"{data_path_folder}/{data_path_split}.jsonl"
         data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
-        try:
-            get_file(remote_path, data_path, overwrite=True)
-        except FileNotFoundError as e:
-            # TODO: error handling
-            raise e
+        utils.get_file(remote_path, data_path, overwrite=True)
 
     elif HF_REGEX.match(data_path_folder):
         log.info(
             f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}"
         )
         # TODO: maybe add support for HF kwargs
         local_hf_path = maybe_safe_download_hf_data(data_path_folder)
         # convert dataset split to JSONL
         dataset = hf_datasets.load_dataset(
             local_hf_path,
             split=data_path_split,
         )
         data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
         with open(data_path, "w") as f:
             for example in dataset:
                 f.write(json.dumps(example) + "\n")
 
     else:
-        # TODO: error handling
         raise ValueError(
-            f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset."
+ f"Encountered unknown data path format when splitting dataset: {data_path_folder} with split {data_path_split}" ) if not os.path.exists(data_path): - # TODO: error handling - raise FileNotFoundError(f"File {data_path} does not exist.") + raise FileNotFoundError( + f"Expected dataset file at {data_path} for splitting, but it does not exist." + ) return data_path @contextlib.contextmanager def temp_seed(seed: int): + log.info(f"Setting random seed to {seed}") state = np.random.get_state() np.random.seed(seed) try: @@ -107,11 +97,11 @@ def temp_seed(seed: int): np.random.set_state(state) -def _split_examples( +def split_examples( data_path: str, output_path: str, eval_split_ratio: float, - max_eval_samples: Optional[int], + max_eval_samples: Optional[int] = None, seed: Optional[int] = None, ) -> None: """ @@ -119,10 +109,13 @@ def _split_examples( Args: data_path (str): Path to the training dataset (local jsonl file) + output_path (str): Directory to save the split dataset eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ + os.makedirs(output_path, exist_ok=True) + # first pass: count total number of lines and determine sample size total_lines = 0 with open(data_path, "r") as infile: @@ -170,6 +163,5 @@ def split_eval_set_from_args( max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - os.makedirs(output_path, exist_ok=True) data_path = maybe_download_data_as_json(data_path_folder, data_path_split) - _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) + split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/tests/a_scripts/data_prep/test_split_eval_set.py b/tests/a_scripts/data_prep/test_split_eval_set.py new file mode 100644 index 0000000000..a1b80b91cd --- /dev/null +++ b/tests/a_scripts/data_prep/test_split_eval_set.py @@ -0,0 +1,163 @@ +import os +import json +import pytest +import hashlib +from unittest.mock import patch + +from llmfoundry.command_utils import split_eval_set_from_args, split_examples + +# Default values +OUTPUT_DIR = "tmp-split" +TMPT_DIR = "tmp-t" +DATA_PATH_SPLIT = "train" +EVAL_SPLIT_RATIO = 0.1 +DEFAULT_FILE = TMPT_DIR + "/train-00000-of-00001.jsonl" + + +def calculate_file_hash(filepath: str) -> str: + with open(filepath, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + return file_hash + + +def count_lines(filepath: str) -> int: + with open(filepath, "r") as f: + return sum(1 for _ in f) + + +@pytest.fixture(scope="module", autouse=True) +def setup_and_teardown_module(): + # Setup: create local testing file + os.makedirs(TMPT_DIR, exist_ok=True) + with open(DEFAULT_FILE, "w") as f: + for i in range(1000): + f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + yield + + # Teardown: clean up output and tmp directories + os.system(f"rm -rf {OUTPUT_DIR}") + os.system(f"rm -rf {TMPT_DIR}") + + +def test_basic_split(): + """Test basic functionality on local file""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + assert 
os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + + +def test_basic_split_output_exists(): + """Test that split overwrites existing files in output directory""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + os.makedirs(output_path, exist_ok=True) + train_file = os.path.join(output_path, "train.jsonl") + eval_file = os.path.join(output_path, "eval.jsonl") + with open(train_file, "w") as f: + f.write("existing file train") + with open(eval_file, "w") as f: + f.write("existing file eval") + old_train_hash = calculate_file_hash(train_file) + old_eval_hash = calculate_file_hash(eval_file) + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + ) + assert calculate_file_hash(train_file) != old_train_hash + assert calculate_file_hash(eval_file) != old_eval_hash + + +def test_max_eval_samples(): + """Test case where max_eval_samples < eval_split_ratio * total samples""" + output_path = os.path.join(OUTPUT_DIR, "max-eval-test") + max_eval_samples = 50 + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + max_eval_samples, + ) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert eval_lines == max_eval_samples + + +def test_eval_split_ratio(): + """Test case where max_eval_samples is not used""" + output_path = os.path.join(OUTPUT_DIR, "eval-split-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + original_data_lines = count_lines(DEFAULT_FILE) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert abs(eval_lines - EVAL_SPLIT_RATIO * original_data_lines) < 1 # allow for rounding errors + + +def test_seed_consistency(): + """Test if the same seed generates consistent splits""" + output_path_1 = os.path.join(OUTPUT_DIR, "seed-test-1") + output_path_2 = os.path.join(OUTPUT_DIR, "seed-test-2") + split_examples(DEFAULT_FILE, output_path_1, EVAL_SPLIT_RATIO, seed=12345) + split_examples(DEFAULT_FILE, output_path_2, EVAL_SPLIT_RATIO, seed=12345) + train_hash_1 = calculate_file_hash(os.path.join(output_path_1, "train.jsonl")) + train_hash_2 = calculate_file_hash(os.path.join(output_path_2, "train.jsonl")) + eval_hash_1 = calculate_file_hash(os.path.join(output_path_1, "eval.jsonl")) + eval_hash_2 = calculate_file_hash(os.path.join(output_path_2, "eval.jsonl")) + + assert train_hash_1 == train_hash_2 + assert eval_hash_1 == eval_hash_2 + + output_path_3 = os.path.join(OUTPUT_DIR, "seed-test-3") + split_examples(DEFAULT_FILE, output_path_3, EVAL_SPLIT_RATIO, seed=54321) + train_hash_3 = calculate_file_hash(os.path.join(output_path_3, "train.jsonl")) + eval_hash_3 = calculate_file_hash(os.path.join(output_path_3, "eval.jsonl")) + + assert train_hash_1 != train_hash_3 + assert eval_hash_1 != eval_hash_3 + + +def test_hf_data_split(): + """Test splitting a dataset from Hugging Face""" + output_path = os.path.join(OUTPUT_DIR, "hf-split-test") + split_eval_set_from_args( + "databricks/databricks-dolly-15k", "train", output_path, EVAL_SPLIT_RATIO + ) + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def _mock_get_file(remote_path: str, data_path: str, overwrite: bool): + with open(data_path, "w") as f: + for i in range(1000): + 
f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + + +def test_remote_store_data_split(): + """Test splitting a dataset from a remote store""" + output_path = os.path.join(OUTPUT_DIR, "remote-split-test") + with patch("composer.utils.get_file", side_effect=_mock_get_file) as mock_get_file: + split_eval_set_from_args( + "dbfs:/Volumes/test/test/test.jsonl", + "unique-split-name", + output_path, + EVAL_SPLIT_RATIO, + ) + mock_get_file.assert_called() + + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def test_missing_delta_file_error(): + # expects file 'TMPT_DIR/missing-00000-of-00001.jsonl + with pytest.raises(FileNotFoundError): + split_eval_set_from_args(TMPT_DIR, "missing", OUTPUT_DIR, EVAL_SPLIT_RATIO) + + +def test_unknown_file_format_error(): + with pytest.raises(ValueError): + split_eval_set_from_args("s3:/path/to/file.jsonl", "train", OUTPUT_DIR, EVAL_SPLIT_RATIO) From 4e7b357079e4feeb0f0935d22dd3c5540e3a9d90 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Mon, 16 Sep 2024 01:08:53 -0700 Subject: [PATCH 05/42] undo autoformat --- llmfoundry/command_utils/__init__.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 8757f3b1bc..4f74fe6ec9 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -37,23 +37,23 @@ ) __all__ = [ - "train", - "train_from_yaml", - "TrainConfig", - "TRAIN_CONFIG_KEYS", - "validate_config", - "evaluate", - "eval_from_yaml", - "convert_dataset_hf", - "convert_dataset_hf_from_args", - "convert_dataset_json", - "convert_dataset_json_from_args", - "convert_finetuning_dataset_from_args", - "convert_finetuning_dataset", - "convert_text_to_mds", - "convert_text_to_mds_from_args", - "convert_delta_to_json_from_args", - "fetch_DT", - "split_eval_set_from_args", - "split_examples", + 'train', + 'train_from_yaml', + 'TrainConfig', + 'TRAIN_CONFIG_KEYS', + 'validate_config', + 'evaluate', + 'eval_from_yaml', + 'convert_dataset_hf', + 'convert_dataset_hf_from_args', + 'convert_dataset_json', + 'convert_dataset_json_from_args', + 'convert_finetuning_dataset_from_args', + 'convert_finetuning_dataset', + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', + 'convert_delta_to_json_from_args', + 'fetch_DT', + 'split_eval_set_from_args', + 'split_examples', ] From 83ab9c30e0a2432bcc6213e4cb8b55296b13e438 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 16 Sep 2024 13:54:10 -0700 Subject: [PATCH 06/42] Replace FSDP args (#1517) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 8 ++++++-- tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 +++-- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..eca16bd815 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], 
list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], + parallelism_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,6 +99,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -316,7 +320,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From 0114f33da83b5e2c43f6399f69acd8401525a9e8 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Mon, 16 Sep 2024 17:09:12 -0700 Subject: [PATCH 07/42] enable correct padding_idx for embedding layers (#1527) --- llmfoundry/models/mpt/modeling_mpt.py | 1 
+ llmfoundry/models/utils/param_init_fns.py | 3 +++ tests/models/utils/test_param_init_fns.py | 27 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..cfe1172634 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -396,6 +396,7 @@ def __init__(self, config: MPTConfig): self.wte = SharedEmbedding( config.vocab_size, config.d_model, + padding_idx=config.pad_token_id, device=config.init_device, ) if self.learned_pos_emb: diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index 180e7b894c..8ad6e77c57 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ -224,6 +224,9 @@ def embedding_init( emb_init_fn_ = init_fn_ emb_init_fn_(module.weight) + if module.padding_idx is not None: + with torch.no_grad(): + module.weight[module.padding_idx].fill_(0) return True diff --git a/tests/models/utils/test_param_init_fns.py b/tests/models/utils/test_param_init_fns.py index 0eaf60c869..11d9fba430 100644 --- a/tests/models/utils/test_param_init_fns.py +++ b/tests/models/utils/test_param_init_fns.py @@ -199,3 +199,30 @@ def test_emb_init(emb_init_cfg: Optional[tuple[str, Union[int, list[int]]]]): emb_init_uniform_lim, ) == 2 and emb_init_uniform_lim[0] == emb_init_uniform_lim[1]: assert (model.emb.weight == emb_init_uniform_lim[0]).all() + + +@pytest.mark.parametrize( + 'padding_idx', + [0, 2], +) +def test_emb_padding_init(padding_idx: int,): + cfg: dict[str, Union[int, list[int]]] = { + 'vocab_size': 64, + 'in_features': 16, + 'n_layers': 2, + 'padding_idx': padding_idx, + 'emb_init_std': 5, + } + dict_cfg = om.create(cfg) + + model = nn.Embedding( + dict_cfg.vocab_size, + dict_cfg.in_features, + dict_cfg.padding_idx, + ) + + model.apply(partial(param_init_fns.get('kaiming_normal_'), **dict_cfg)) + assert isinstance(model, torch.nn.Embedding) + + if dict_cfg.get('emb_init_std') is not None: + assert (model.weight[padding_idx] == 0).all() From 9a1b78b128a242590b00f364a99d2d2d735f9468 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 10:29:09 -0700 Subject: [PATCH 08/42] Revert "Replace FSDP args" (#1533) --- llmfoundry/command_utils/eval.py | 8 ++------ tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 ++--- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index eca16bd815..f622ca182d 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - parallelism_config: Optional[dict[str, Any]], + fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,10 +99,6 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) - fsdp_config = parallelism_config.get( - 'fsdp_config', - None, - ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ 
-320,7 +316,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,8 +1042,7 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - parallelism_config={'fsdp': fsdp_config} - if fsdp_state_dict_type is not None else None, + fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1470,7 +1469,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 8e6c113169..69ced673a1 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 01acc22a60..56cb36c8c1 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 366bcf7786..a41574538a 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, ) assert trainer.state.fsdp_enabled From 7a23f60ad5ce25e80c3d5f3ab3badfb413743daa Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Tue, 17 Sep 2024 12:54:28 -0700 Subject: [PATCH 09/42] Delete unneeded inner base model in PEFT HF Checkpointer (#1532) --- llmfoundry/callbacks/hf_checkpointer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 4e6a501f2f..65bdcb3b6c 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -585,6 +585,7 @@ def tensor_hook( new_base_model_instance, original_model.peft_config[active_adapter], ) + del new_base_model_instance else: new_model_instance = type(original_model)(new_config) new_model_instance.generation_config.update( From 
2e3d14f6130ebad5a149c1c52f53fd07628e1006 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 13:45:04 -0700 Subject: [PATCH 10/42] Add deprecation warning to fsdp_config (#1530) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 35 ++++- .../inference/test_convert_composer_to_hf.py | 5 +- tests/eval/test_eval_deprecation.py | 125 ++++++++++++++++++ tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 6 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 tests/eval/test_eval_deprecation.py diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..e644ad1f0f 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -4,6 +4,7 @@ import logging import os import time +import warnings from typing import Any, Optional, Union import pandas as pd @@ -11,7 +12,7 @@ from composer.core import Callback from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device, parallelism, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -36,6 +37,7 @@ process_init_device, ) from llmfoundry.utils.registry_utils import import_file +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -52,7 +54,6 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -62,9 +63,33 @@ def evaluate_model( callback_configs: Optional[dict[str, Any]], metadata: Optional[dict[str, str]], logged_config: dict[str, Any], + fsdp_config: Optional[dict[str, Any]] = None, + parallelism_config: Optional[dict[str, Any]] = None, should_log_config: bool = True, load_path: Optional[str] = None, ): + if parallelism_config: + deprecated_fsdp_args = list( + parallelism.FSDPConfig.__annotations__.keys(), + ) + for deprecated_arg in deprecated_fsdp_args: + if deprecated_arg in parallelism_config: + raise ValueError( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + ) + + if fsdp_config: + warnings.warn( + VersionedDeprecationWarning( + 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', + remove_version='0.13.0', + ), + ) + if fsdp_config and parallelism_config: + raise ValueError( + 'Both fsdp_config and parallelism_config cannot be provided at the same time. 
Please use parallelism_config.', + ) + log.info(f'Evaluating model: {model_name}') # Build tokenizer and model tokenizer_cfg = tokenizer @@ -99,6 +124,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else fsdp_config if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -146,7 +175,7 @@ def evaluate_model( callbacks=callbacks, loggers=loggers, precision=precision, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, load_path=load_path, load_weights_only=True, progress_bar=False, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/eval/test_eval_deprecation.py b/tests/eval/test_eval_deprecation.py new file mode 100644 index 0000000000..828186245a --- /dev/null +++ b/tests/eval/test_eval_deprecation.py @@ -0,0 +1,125 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import unittest +import warnings + +from llmfoundry.command_utils.eval import evaluate_model +from llmfoundry.utils.warnings import VersionedDeprecationWarning + + +class TestEvaluateModelDeprecation(unittest.TestCase): + + def setUp(self): + self.common_args = { # type: ignore + 'tokenizer': { + 'name': 'test_tokenizer', + }, + 'model': { + 'name': 'test_model', + }, + 'model_name': 'test', + 'dist_timeout': 60, + 'run_name': 'test_run', + 'seed': 42, + 'icl_tasks': [], + 'max_seq_len': 512, + 'device_eval_batch_size': 1, + 'eval_gauntlet_config': None, + 'eval_loader_config': None, + 'loggers': [], + 'python_log_level': None, + 'precision': 'fp32', + 'eval_gauntlet_df': None, + 'eval_subset_num_batches': 1, + 'icl_subset_num_batches': None, + 'callback_configs': None, + 'metadata': None, + 'logged_config': {}, + } + + def test_no_deprecation_warning(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + import composer.utils.parallelism + deprecated_fsdp_args = list( + composer.utils.parallelism.FSDPConfig.__annotations__.keys(), + ) + print(deprecated_fsdp_args) + + try: + parallelism_config = {'fsdp': {'verbose': True}} + evaluate_model( + **self.common_args, + parallelism_config=parallelism_config, + ) + except ValueError as ve: + if 'parallelism_config cannot contain deprecated fsdp_config arguments.' 
in str( + ve, + ): + self.fail( + 'Raised ValueError about deprecated fsdp_config arguments', + ) + elif 'Both fsdp_config and parallelism_config cannot be provided at the same time.' in str( + ve, + ): + self.fail( + 'Raised ValueError about both configs being provided', + ) + except Exception: + pass + + deprecation_warnings = [ + warning for warning in w + if isinstance(warning.message, VersionedDeprecationWarning) + ] + if deprecation_warnings: + self.fail('VersionedDeprecationWarning was raised') + + def test_deprecation_warning_with_deprecated_arg(self): + # Use assertRaises to catch the expected ValueError + with self.assertRaises(ValueError) as context: + # Directly call evaluate_model; do not use try-except here + evaluate_model( + **self.common_args, + parallelism_config={'activation_checkpointing': True}, + ) + + # Assert that the correct error message is in the exception + self.assertIn( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + str(context.exception), + ) + + def test_deprecation_warning_with_fsdp_config(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + try: + evaluate_model( + **self.common_args, + parallelism_config=None, + fsdp_config={'verbose': True}, + ) + except Exception: + pass + + self.assertTrue( + any( + issubclass(warning.category, VersionedDeprecationWarning) + for warning in w + ), + ) + + def test_error_with_both_fsdp_and_parallelism_config(self): + with self.assertRaises(ValueError) as context: + evaluate_model( + **self.common_args, + parallelism_config={'some_arg': True}, + fsdp_config={'some_arg': True}, + ) + + self.assertIn( + 'Both fsdp_config and parallelism_config cannot be provided at the same time.', + str(context.exception), + ) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From d7c78229e91129d4c35006209fabd5fb2f2252e9 Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Sun, 22 Sep 2024 14:03:42 -0400 Subject: [PATCH 11/42] Fix reuse kv cache for torch attention (#1539) --- llmfoundry/models/layers/attention.py | 3 +++ tests/models/layers/test_flash_torch.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index a1af2235cf..625327767e 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -656,6 +656,9 @@ def get_qkv( 'prev_layer_key_value is None, cannot reuse_prev_layer_kv.', ) key, value = prev_layer_key_value + if self.attn_impl == 'torch': + key = rearrange(key, 'b h d s -> b s (h d)') + value = rearrange(value, 'b h s d -> b s (h d)') query = self.Wq(x) if self.clip_qkv: diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 01a6a7576d..0a4b32a73a 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -188,7 +188,7 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) - if attn_impl != 'flash' and attn_uses_sequence_id and sequence_id is not None: + if attn_impl == 'torch' and attn_uses_sequence_id and sequence_id is not None: assert isinstance(attn_bias, torch.Tensor) # pyright attn_bias = apply_sequence_id( attn_bias, @@ -561,8 +561,10 @@ def test_grouped_query_invalid_heads(): }, }], ) +@pytest.mark.parametrize('attn_impl', ['flash', 'torch']) def test_reuse_prev_layer_kv_cache( pos_emb_config: dict, + attn_impl: str, device: str = 'cuda', ): """Checks reusing previous layer's kv cache.""" @@ -570,7 +572,7 @@ def test_reuse_prev_layer_kv_cache( rope = pos_emb_config['rope'] cfg = { - 'attn_impl': 'flash', + 'attn_impl': attn_impl, 'd_model': 64, 'n_heads': 4, 'attn_pdrop': 0, @@ -630,6 +632,13 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) + if attn_impl == 'torch': + assert isinstance(attn_bias, torch.Tensor) # pyright + attn_bias = apply_sequence_id( + attn_bias, + sequence_id, # type: ignore + s, + ) return attn_bias @@ -637,7 +646,7 @@ def gen_bias(attn_impl: str): sequence_id=sequence_id, S=s, attn_uses_sequence_id=True, - attn_impl='flash', + attn_impl=attn_impl, attention_mask=attention_mask, ) @@ -656,7 +665,7 @@ def gen_bias(attn_impl: str): x1.requires_grad = True with torch.autocast(x0.device.type): - attn_bias_0 = gen_bias('flash') + attn_bias_0 = gen_bias(attn_impl) alibi_slopes_0 = None if alibi: alibi_slopes_0 = gen_slopes( @@ -703,7 +712,7 @@ def gen_bias(attn_impl: str): flash_attn_padding_info=flash_attn_padding_info, alibi_slopes=alibi_slopes_0, ) - attn_bias_1 = gen_bias('flash') + attn_bias_1 = gen_bias(attn_impl) alibi_slopes_1 = None if alibi: alibi_slopes_1 = gen_slopes( From 14cff668750dc08eb4511ddee0d55b127e711dea Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 22 Sep 2024 19:49:21 -0400 Subject: [PATCH 12/42] Error on text dataset file not found (#1534) --- .../data_prep/convert_text_to_mds.py | 15 ++++++++++----- llmfoundry/utils/exceptions.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 9a1f8a912d..3ea5aeb5d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -32,6 +32,7 @@ CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, + InputFolderNotFound, OutputFolderNotEmptyError, ) @@ -125,11 +126,15 @@ def get_object_names(input_folder: str) -> list[str]: object_store = maybe_create_object_store_from_uri(input_folder) if object_store is not None: _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in 
object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - log.info(f'Found {len(names)} text files in remote storage') + try: + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + log.info(f'Found {len(names)} text files in remote storage') + except FileNotFoundError: + raise InputFolderNotFound(folder_prefix) + else: # input_folder is a local folder names = [ diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 11895564f2..900355dff5 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -348,6 +348,17 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) +class InputFolderNotFound(UserError): + """Error thrown when the a folder is not found.""" + + def __init__(self, folder_that_was_not_found: str) -> None: + message = f'{folder_that_was_not_found} not found.' + super().__init__( + message, + folder_that_was_not_found=folder_that_was_not_found, + ) + + class CannotUnicodeDecodeFile(UserError): """Error thrown when the input folder is missing data.""" From a2c0507795a887b6fb71d3ef975b714523fe2abb Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Sun, 22 Sep 2024 18:23:51 -0700 Subject: [PATCH 13/42] Make ICL tasks not required for eval (#1540) --- llmfoundry/command_utils/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index e644ad1f0f..70c4319ea8 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -262,7 +262,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: EvalConfig, EVAL_CONFIG_KEYS, transforms=[allow_toplevel_keys], - icl_tasks_required=True, + icl_tasks_required=False, ) model_configs = eval_config.models @@ -273,7 +273,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: # Mandatory Evaluation Parameters icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str if icl_tasks is None: - raise ValueError('icl_tasks must be specified in the config') + icl_tasks = [] # Optional Evaluation Parameters with default values eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders From 85403c086710bc0f62d03fc03c0fcbb2e5ffda1d Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:37:26 -0400 Subject: [PATCH 14/42] Bumping flash attention version to 2.6.3 and adding option for softcap in attention and lm_head logits. 
(#1374) --- llmfoundry/models/layers/attention.py | 24 +++++- llmfoundry/models/mpt/configuration_mpt.py | 14 +++ llmfoundry/models/mpt/modeling_mpt.py | 6 ++ llmfoundry/models/utils/config_defaults.py | 1 + setup.py | 2 +- tests/models/layers/test_flash_attn.py | 99 +++++++++++++++++++++- 6 files changed, 140 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 625327767e..612d6b9642 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -112,6 +112,7 @@ def scaled_multihead_dot_product_attention( dropout_p: float = 0.0, training: bool = False, needs_weights: bool = False, + attn_logit_softcapping: Optional[float] = None, sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: @@ -149,6 +150,11 @@ def scaled_multihead_dot_product_attention( attn_weight = q.matmul(k) * softmax_scale + if attn_logit_softcapping is not None: + attn_weight = attn_logit_softcapping * torch.tanh( + attn_weight / attn_logit_softcapping, + ) + if attn_bias is not None: # clamp to 0 necessary for torch 2.0 compile() _s_q = max(0, attn_bias.size(2) - s_q) @@ -264,6 +270,7 @@ def flash_attn_fn( sliding_window_size: int = -1, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, + attn_logit_softcapping: Optional[float] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: if key_padding_mask is not None: @@ -381,13 +388,17 @@ def flash_attn_fn( return_attn_probs=needs_weights, ) elif is_flash_v2_installed(): - alibi_kwargs = {} + extra_attn_kwargs = {} if check_alibi_support('flash'): - alibi_kwargs = {'alibi_slopes': alibi_slopes} + extra_attn_kwargs['alibi_slopes'] = alibi_slopes elif alibi_slopes is not None: raise ValueError( 'alibi_slopes is only supported for flash-attn>=2.4.2', ) + if is_flash_v2_installed( + v2_version='v2.6.2', + ) and attn_logit_softcapping is not None: + extra_attn_kwargs['softcap'] = attn_logit_softcapping output_unpad = flash_attn_interface.flash_attn_varlen_func( q=query_unpad, k=key_unpad, @@ -401,7 +412,7 @@ def flash_attn_fn( causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), - **alibi_kwargs, + **extra_attn_kwargs, ) else: raise RuntimeError( @@ -448,6 +459,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__() @@ -463,6 +475,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.sliding_window_size = sliding_window_size self.reuse_kv_layer_idx = reuse_kv_layer_idx + self.attn_logit_softcapping = attn_logit_softcapping self.kv_dim = kv_dim if kv_dim is not None else self.d_model self.head_dim = d_model // n_heads @@ -625,6 +638,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + attn_logit_softcapping=self.attn_logit_softcapping, sliding_window_size=self.sliding_window_size, **extra_attn_kwargs, ) @@ -853,6 +867,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -873,6 +888,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, 
reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) @@ -902,6 +918,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -922,6 +939,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 91b431e3b4..dbcabdf5f9 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -51,6 +51,7 @@ def __init__( tie_word_embeddings: bool = True, use_pad_tok_in_ffn: bool = True, block_overrides: Optional[dict[str, Any]] = None, + final_logit_softcapping: Optional[float] = None, **kwargs: Any, ): """The MPT configuration class. @@ -148,6 +149,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + final_logit_softcapping (float | None): Softcapping threshold for final logit. Set to None to disable (default value None). Please see https://arxiv.org/pdf/2403.08295 for more details. kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model @@ -181,6 +183,7 @@ def __init__( if block_overrides is not None: self._validate_block_overrides(block_overrides) self.block_overrides = block_overrides + self.final_logit_softcapping = final_logit_softcapping if isinstance(fc_type, str): fc_type = {'name': fc_type} @@ -325,6 +328,17 @@ def _validate_config(self) -> None: raise NotImplementedError( 'sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).', ) + if self.attn_config['attn_logit_softcapping'] is not None: + if self.attn_config['attn_logit_softcapping'] <= 0: + raise ValueError( + 'Attention attn_logit_softcapping should be positive.', + ) + if self.attn_config[ + 'attn_impl' + ] == 'flash' and not is_flash_v2_installed(v2_version='v2.6.2',): + raise NotImplementedError( + 'Attention attn_logit_softcapping is only implemented with torch attention or flash attention v2.6.2 (or higher).', + ) if self.attn_config['kv_dim'] is not None and self.attn_config[ 'fused_qkv']: raise ValueError( diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index cfe1172634..9212f5594d 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1071,6 +1071,7 @@ def __init__(self, config: MPTConfig): f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.", ) self.logit_scale = logit_scale + self.final_logit_softcapping = config.final_logit_softcapping @property def backbone_model_class(self) -> type[MPTModel]: @@ -1172,6 +1173,11 @@ def forward( ) logits *= self.logit_scale + if self.final_logit_softcapping is not None: + logits = self.final_logit_softcapping * torch.tanh( + logits / self.final_logit_softcapping, + ) + loss = None if labels is not None: _labels = torch.roll(labels, shifts=-1) diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index bd3b29a479..5550785149 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -18,6 +18,7 @@ 'softmax_scale': None, 'attn_uses_sequence_id': False, 
'sliding_window_size': -1, + 'attn_logit_softcapping': None, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, diff --git a/setup.py b/setup.py index 0a75c610b8..ebc66fdacf 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ # Flash 2 group kept for backwards compatibility extra_deps['gpu-flash2'] = [ - 'flash-attn>=2.5.8,<3', + 'flash-attn>=2.6.3,<3', ] extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py index 987ea7160a..666d93c9b4 100644 --- a/tests/models/layers/test_flash_attn.py +++ b/tests/models/layers/test_flash_attn.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import math +from typing import Optional import pytest import torch @@ -334,5 +335,99 @@ def gen_bias(): _assert_approx_equal(value_1.grad, value_2.grad) -def _assert_approx_equal(value1: torch.Tensor, value2: torch.Tensor): - assert torch.norm(value2 - value1) <= 1e-2 + 1e-2 * torch.norm(value2) +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.6.2'), + reason= + 'attn_logit_softcapping only supported by Flash Attention after v2.6.2.', +) +@pytest.mark.parametrize( + 'attn_logit_softcapping', + [None, 0.1, 1.0, 10.0, 100.0], +) +def test_attn_logit_softcapping(attn_logit_softcapping: Optional[float]): + # Test that attn_logit_softcapping in attention works as expected. + dtype = torch.bfloat16 + device = 'cuda' + d = 128 + seqlen_1 = 8 + bsz = 2 + n_heads = 4 + + query_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + value_1.requires_grad = True + output_1, _, _ = flash_attn_fn( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + flash_attn_padding_info=gen_flash_attn_padding_info( + bsz, + seqlen_1, + 0, + query_1.device, + None, + None, + ), + should_repeat_kv_for_gqa=True, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + output_2, _, _ = scaled_multihead_dot_product_attention( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_2.sum().backward() + + _assert_approx_equal(output_1, output_2) + assert (query_2.grad is not None) and (query_1.grad is not None) + _assert_approx_equal(query_1.grad, query_2.grad) + assert (key_2.grad is not None) and (key_1.grad is not None) + _assert_approx_equal(key_1.grad, key_2.grad) + assert (value_2.grad is not None) and (value_1.grad is not None) + _assert_approx_equal(value_1.grad, value_2.grad) + + +def _assert_approx_equal( + value1: torch.Tensor, + value2: torch.Tensor, + atol: float = 1e-2, + rtol: float = 1e-2, +): + actual_difference = torch.norm(value2 - value1) + allowed_difference = 
atol + rtol * torch.norm(value2) + assert actual_difference < allowed_difference, f'{actual_difference=}, {allowed_difference=}' From f377090dec102afc646fb29a4510ded6ae74ecf9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:00:07 -0700 Subject: [PATCH 15/42] Register mosaic logger (#1542) --- llmfoundry/loggers/__init__.py | 2 ++ tests/loggers/test_mosaic_ml_logger.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/loggers/test_mosaic_ml_logger.py diff --git a/llmfoundry/loggers/__init__.py b/llmfoundry/loggers/__init__.py index cd3f3fdc62..c60d9be2cd 100644 --- a/llmfoundry/loggers/__init__.py +++ b/llmfoundry/loggers/__init__.py @@ -4,6 +4,7 @@ from composer.loggers import ( InMemoryLogger, MLFlowLogger, + MosaicMLLogger, TensorboardLogger, WandBLogger, ) @@ -18,3 +19,4 @@ func=InMemoryLogger, ) # for backwards compatibility loggers.register('mlflow', func=MLFlowLogger) +loggers.register('mosaicml', func=MosaicMLLogger) diff --git a/tests/loggers/test_mosaic_ml_logger.py b/tests/loggers/test_mosaic_ml_logger.py new file mode 100644 index 0000000000..e9c003321b --- /dev/null +++ b/tests/loggers/test_mosaic_ml_logger.py @@ -0,0 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.loggers import MosaicMLLogger + +from llmfoundry.utils.builders import build_logger + + +def test_mosaic_ml_logger_constructs(): + mosaic_ml_logger = build_logger( + 'mosaicml', + kwargs={'ignore_exceptions': True}, + ) + + assert isinstance(mosaic_ml_logger, MosaicMLLogger) + assert mosaic_ml_logger.ignore_exceptions == True From d85c83b15d5b07a1b8cd00eaa7e400aaf7b22ea7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 23 Sep 2024 23:24:16 -0700 Subject: [PATCH 16/42] Hfcheckpointer optional generation config (#1543) Co-authored-by: v-chen_data --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++- .../inference/test_convert_composer_to_hf.py | 56 ++++++++++++++++++- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 65bdcb3b6c..4365a5b2e5 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -588,9 +588,10 @@ def tensor_hook( del new_base_model_instance else: new_model_instance = type(original_model)(new_config) - new_model_instance.generation_config.update( - **original_model.generation_config.to_dict(), - ) + if new_model_instance.generation_config is not None: + new_model_instance.generation_config.update( + **original_model.generation_config.to_dict(), + ) # Then load the state dict in with "assign" so that the state dict # is loaded properly even though the model is initially on meta device. 
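As context for the hunk above: some `transformers` model classes construct with `generation_config` set to `None`, so the checkpointer can only copy the original model's generation settings when the freshly built instance actually has a config object. A minimal sketch of that guard, assuming standard `transformers.GenerationConfig` objects (the helper name is illustrative, not part of this patch):

from typing import Optional

from transformers import GenerationConfig


def copy_generation_config(
    src: Optional[GenerationConfig],
    dst: Optional[GenerationConfig],
) -> Optional[GenerationConfig]:
    # Guard both sides: either model may have been built without a
    # generation config, in which case there is nothing to copy.
    if src is not None and dst is not None:
        dst.update(**src.to_dict())
    return dst

Calling code would pass `original_model.generation_config` as `src` and the new instance's config as `dst`, mirroring the `if new_model_instance.generation_config is not None` check above.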
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..bf5f2a970b 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -8,13 +8,14 @@ import pathlib import shutil from argparse import Namespace -from typing import Any, Callable, Optional, cast +from typing import Any, Callable, Optional, Union, cast from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue import pytest import torch +import torch.nn as nn import transformers from composer import ComposerModel, Trainer from composer.loggers import MLFlowLogger @@ -23,7 +24,13 @@ from omegaconf import OmegaConf as om from torch.distributed._tensor.api import DTensor from torch.utils.data import DataLoader -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import ( + AutoConfig, + GenerationConfig, + PretrainedConfig, + PreTrainedModel, + PreTrainedTokenizerBase, +) from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename @@ -1637,3 +1644,48 @@ def test_license_file_finder( found_path = _maybe_get_license_filename(str(tmp_path)) assert (found_path == license_file_name ) if license_file_name is not None else (found_path is None) + + +@pytest.mark.parametrize('generation_config', [None, {}, {'max_length': 200}]) +def test_generation_config_variants( + generation_config: Optional[Union[dict[str, Any], GenerationConfig]], +): + + class MockModel(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + # Ensure generation_config is always a GenerationConfig object + if isinstance(config.generation_config, dict): + self.generation_config = GenerationConfig( + **config.generation_config, + ) + else: + self.generation_config = config.generation_config + + config = AutoConfig.from_pretrained('gpt2') + # Convert dict to GenerationConfig if needed + if isinstance(generation_config, dict): + generation_config = GenerationConfig(**generation_config) + config.generation_config = generation_config + + mock_model = MockModel(config) + logger = MagicMock() + state = MagicMock() + state.timestamp.batch = 1 + state.is_model_ddp = False + state.model.model = mock_model + state.model.tokenizer = None + + checkpointer = HuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + ) + + checkpointer._save_checkpoint( + state=state, + logger=logger, + upload_to_save_folder=False, + register_to_mlflow=False, + ) From 275a2a40d86a36882cc7963e2677628e05aaaf01 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:57:21 -0700 Subject: [PATCH 17/42] Bump composer version to 0.25.0 (#1546) --- setup.py | 8 ++++---- tests/a_scripts/inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ebc66fdacf..48c1326b0d 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.24.1,<0.25', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.25.0,<0.26', 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.24.1,<0.25', + 'mosaicml[databricks]>=0.25.0,<0.26', 'numpy<2', 
'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.24.1,<0.25', + 'mosaicml[tensorboard]>=0.25.0,<0.26', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.24.1,<0.25', + 'mosaicml[peft]>=0.25.0,<0.26', ] extra_deps['openai'] = [ diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index bf5f2a970b..c25432dc48 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1563,6 +1563,8 @@ def test_mptmoe_huggingface_conversion_callback( # Check output equivalence loaded_model = loaded_model.cuda().bfloat16() # type: ignore + for k, v in batch.items(): + batch[k] = v.cuda() loaded_model_logits = loaded_model( input_ids=batch.get('input_ids', None), attention_mask=batch.get('attention_mask', None), From 151a2e297b603d84e1e4dfed389c3494990936e6 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:53:05 -0700 Subject: [PATCH 18/42] Bump streaming version to 0.9.0 (#1550) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 48c1326b0d..d1979faf63 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', - 'mosaicml-streaming>=0.8.1,<0.9', + 'mosaicml-streaming>=0.9.0,<0.10', 'torch>=2.4.0,<2.4.1', 'datasets>=2.19,<2.20', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 722526d420dab9adc5a5be18425d5e08c97ee0c8 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:25:27 -0700 Subject: [PATCH 19/42] Bump version to 0.13.0.dev0 (#1549) --- llmfoundry/_version.py | 2 +- llmfoundry/command_utils/eval.py | 2 +- llmfoundry/models/hf/model_wrapper.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py index 2f1f590b19..0cddcaf967 100644 --- a/llmfoundry/_version.py +++ b/llmfoundry/_version.py @@ -3,4 +3,4 @@ """The LLM Foundry Version.""" -__version__ = '0.12.0.dev0' +__version__ = '0.13.0.dev0' diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index 70c4319ea8..73127e8a07 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -82,7 +82,7 @@ def evaluate_model( warnings.warn( VersionedDeprecationWarning( 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) if fsdp_config and parallelism_config: diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index c8805e5d6d..f2b67db1ec 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -48,7 +48,7 @@ def __init__( warnings.warn( VersionedDeprecationWarning( '`HuggingFaceModelWithFSDP` is deprecated. 
In the future please use `BaseHuggingFaceModel`.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) super().__init__( From c786defb6b6175243cd9e4a1b69918488ba7e3b9 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 25 Sep 2024 14:34:40 -0700 Subject: [PATCH 20/42] Add proper user error for accessing schema (#1548) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 24 ++++++++++++- .../data_prep/test_convert_delta_to_json.py | 35 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 666d0278c6..d676fc2165 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -233,7 +233,27 @@ def run_query( elif method == 'dbconnect': if spark == None: raise ValueError(f'sparkSession is required for dbconnect') - df = spark.sql(query) + + try: + df = spark.sql(query) + except Exception as e: + from pyspark.errors import AnalysisException + if isinstance(e, AnalysisException): + if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore + match = re.search( + r"Schema\s+'([^']+)'", + e.message, # pyright: ignore + ) + if match: + schema_name = match.group(1) + action = f'using the schema {schema_name}' + else: + action = 'using the schema' + raise InsufficientPermissionsError(action=action,) from e + raise RuntimeError( + f'Error in querying into schema. Restart sparkSession and try again', + ) from e + if collect: return df.collect() return df @@ -461,6 +481,8 @@ def fetch( raise InsufficientPermissionsError( action=f'reading from {tablename}', ) from e + if isinstance(e, InsufficientPermissionsError): + raise e raise RuntimeError( f'Error in get rows from {tablename}. Restart sparkSession and try again', ) from e diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index e623467bf7..bbb03a26d9 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -1,12 +1,14 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import sys import unittest from argparse import Namespace from typing import Any from unittest.mock import MagicMock, mock_open, patch from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( + InsufficientPermissionsError, download, fetch_DT, format_tablename, @@ -17,6 +19,39 @@ class TestConvertDeltaToJsonl(unittest.TestCase): + def test_run_query_dbconnect_insufficient_permissions(self): + error_message = ( + '[INSUFFICIENT_PERMISSIONS] Insufficient privileges: User does not have USE SCHEMA ' + "on Schema 'main.oogabooga'. 
SQLSTATE: 42501"
+        )
+
+        class MockAnalysisException(Exception):
+
+            def __init__(self, message: str):
+                self.message = message
+
+        with patch.dict('sys.modules', {'pyspark.errors': MagicMock()}):
+            sys.modules[
+                'pyspark.errors'
+            ].AnalysisException = MockAnalysisException  # pyright: ignore
+
+            mock_spark = MagicMock()
+            mock_spark.sql.side_effect = MockAnalysisException(error_message)
+
+            with self.assertRaises(InsufficientPermissionsError) as context:
+                run_query(
+                    'SELECT * FROM table',
+                    method='dbconnect',
+                    cursor=None,
+                    spark=mock_spark,
+                )
+
+            self.assertIn(
+                'using the schema main.oogabooga',
+                str(context.exception),
+            )
+            mock_spark.sql.assert_called_once_with('SELECT * FROM table')
+
     @patch(
         'databricks.sql.connect',
     )
From e6b8d142c3c8133f21b9e1d7c05927201976b2e8 Mon Sep 17 00:00:00 2001
From: Vincent Chen
Date: Wed, 25 Sep 2024 15:47:48 -0700
Subject: [PATCH 21/42] Validate Cluster Access Mode (#1551)

Co-authored-by: v-chen_data
Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 .../data_prep/convert_delta_to_json.py        | 12 +++++++++++
 llmfoundry/utils/exceptions.py                | 13 ++++++++++++
 .../data_prep/test_convert_delta_to_json.py   | 20 +++++++++++++++----
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
index d676fc2165..fbbc5f2cd9 100644
--- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
+++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -20,6 +20,7 @@
 
 from llmfoundry.utils.exceptions import (
     ClusterDoesNotExistError,
+    ClusterInvalidAccessMode,
     FailedToConnectToDatabricksError,
     FailedToCreateSQLConnectionError,
     InsufficientPermissionsError,
@@ -568,6 +569,17 @@ def validate_and_get_cluster_info(
     if res is None:
         raise ClusterDoesNotExistError(cluster_id)
 
+    data_security_mode = str(
+        res.data_security_mode,
+    ).upper()[len('DATASECURITYMODE.'):]
+
+    # NONE stands for No Isolation Shared
+    if data_security_mode == 'NONE':
+        raise ClusterInvalidAccessMode(
+            cluster_id=cluster_id,
+            access_mode=data_security_mode,
+        )
+
     assert res.spark_version is not None
     stripped_runtime = re.sub(
         r'[a-zA-Z]',
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index 900355dff5..265b9bbe8f 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -318,6 +318,19 @@ def __init__(self, cluster_id: str) -> None:
         super().__init__(message, cluster_id=cluster_id)
 
 
+class ClusterInvalidAccessMode(NetworkError):
+    """Error thrown when the cluster has an invalid access mode."""
+
+    def __init__(self, cluster_id: str, access_mode: str) -> None:
+        message = f'Cluster with id {cluster_id} has access mode {access_mode}. ' + \
+            'Please make sure the cluster used has access mode Shared or Single User!'
+ super().__init__( + message, + cluster_id=cluster_id, + access_mode=access_mode, + ) + + class FailedToCreateSQLConnectionError( NetworkError, ): diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index bbb03a26d9..b1a9f1e878 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -264,7 +264,10 @@ def test_dbconnect_called( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.1.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.1.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response mock_remote = MagicMock() @@ -321,7 +324,10 @@ def test_sqlconnect_called_dbr13( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='13.0.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='13.0.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( @@ -373,7 +379,10 @@ def test_sqlconnect_called_dbr14( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.2.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( @@ -425,7 +434,10 @@ def test_sqlconnect_called_https( DATABRICKS_TOKEN = 'token' use_serverless = False - mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_cluster_response = Namespace( + spark_version='14.2.0-scala2.12', + data_security_mode='SINGLE_USER', + ) mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response fetch_DT( From dc58bb7eb95e52874774e1d5a7669a1a5f194429 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 26 Sep 2024 09:40:13 -0700 Subject: [PATCH 22/42] Update mcli yamls (#1552) --- mcli/mcli-1b-eval.yaml | 4 ++-- mcli/mcli-1b-max-seq-len-8k.yaml | 4 ++-- mcli/mcli-1b.yaml | 4 ++-- mcli/mcli-benchmark-mpt.yaml | 4 ++-- mcli/mcli-convert-composer-to-hf.yaml | 4 ++-- mcli/mcli-hf-eval.yaml | 4 ++-- mcli/mcli-hf-generate.yaml | 4 ++-- mcli/mcli-llama2-finetune.yaml | 4 ++-- mcli/mcli-openai-eval.yaml | 4 ++-- mcli/mcli-pretokenize-oci-upload.yaml | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index 4fcf8b3cb9..bd6a7b538a 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts/ composer eval/eval.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-eval compute: diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index fb96c576e0..b437bc5f0d 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: 
mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -17,7 +17,7 @@ command: | --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-ctx-8k-gpus-8 compute: diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 26255977f4..789fc4fc02 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -21,7 +21,7 @@ command: | eval_loader.dataset.split=val_small \ max_duration=100ba \ eval_interval=0 -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: mpt-1b-gpus-8 compute: diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index 3995598fd3..0c023f9a83 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -6,12 +6,12 @@ compute: # cluster: TODO # Name of the cluster to use for this run # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index 7b715f6792..a211e3baeb 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo @@ -13,7 +13,7 @@ command: | --hf_output_path s3://bucket/folder/hf/ \ --output_precision bf16 \ -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: convert-composer-hf compute: diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index 27f5938d67..9bcebfbea0 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -16,7 +16,7 @@ gpu_num: 8 # gpu_type: # cluster: # replace with your cluster here! 
-image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index cb3040e4ee..85a0f6b0e4 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -35,7 +35,7 @@ command: | "Here's a quick recipe for baking chocolate chip cookies: Start by" \ "The best 5 cities to visit in Europe are" -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: hf-generate compute: diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 7134e6204c..210e8942b5 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest name: llama2-finetune compute: diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index cd04d89f4e..987fc829a9 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo @@ -16,7 +16,7 @@ gpu_num: # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 5425ce9897..49fbbb08d8 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -1,5 +1,5 @@ name: c4-2k-pre-tokenized -image: mosaicml/llm-foundry:2.3.1_cu121-latest +image: mosaicml/llm-foundry:2.4.0_cu124-latest compute: gpus: 8 # Number of GPUs to use @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.11.0 + git_branch: v0.12.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo From 3b1fc4ae5c205118901fcf1557260952fe844e2e Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:23:34 -0400 Subject: [PATCH 23/42] Use `allenai/c4` instead of `c4` dataset (#1554) Co-authored-by: Eitan Turok --- README.md | 2 +- TUTORIAL.md | 4 ++-- .../data_prep/convert_dataset_hf.py | 4 ++-- .../data_prep/convert_dataset_json.py | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- mcli/mcli-1b.yaml | 2 +- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- scripts/data_prep/README.md | 2 +- scripts/train/README.md | 6 ++--- .../train/benchmarking/submit_benchmarks.py | 2 +- .../data_prep/test_convert_dataset_hf.py | 2 +- tests/a_scripts/eval/test_eval.py | 11 +++++----- tests/a_scripts/train/test_train.py | 22 ++++++++++--------- tests/data/test_dataloader.py | 6 ++--- tests/data_utils.py | 2 +- 15 files changed, 37 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 0fabb98653..bc4eff48fd 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ cd scripts # Convert C4 dataset to StreamingDataset format python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' diff --git a/TUTORIAL.md b/TUTORIAL.md index 3be4910c4f..d1751f62e3 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su ```bash python scripts/data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-adaptation-data --splits train_small val_small \ --concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \ --compression zstd @@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared ```bash python scripts/data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer gpt2 \ --eos_text '<|endoftext|>' \ diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py index 0ea94ac687..2667407110 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -158,7 +158,7 @@ def __init__( truncated_samples=100, ) -CONSTS = {'c4': c4constants, 'the_pile': pileconstants} +CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants} def build_hf_dataset( @@ -335,7 +335,7 @@ def convert_dataset_hf( dataset_constants = CONSTS[dataset] except KeyError: raise ValueError( - f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.', + f'Constants for dataset "{dataset}" not found. 
Currently only "the_pile" and "allenai/c4" are supported.', ) if concat_tokens is not None and tokenizer is not None: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py index 35d7e637e6..c6f7d51c02 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py @@ -43,7 +43,7 @@ def build_hf_dataset( no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset. - Typically "all" (The Pile) or "en" (c4). + Typically "all" (The Pile) or "en" (allenai/c4). Returns: An IterableDataset. diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index b437bc5f0d..1d48cd8105 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -13,7 +13,7 @@ integrations: command: | cd llm-foundry/scripts python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 789fc4fc02..71566d4c46 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -13,7 +13,7 @@ integrations: command: | cd llm-foundry/scripts python data_prep/convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py train/yamls/pretrain/mpt-1b.yaml \ diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 49fbbb08d8..a3e8c40b88 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -24,7 +24,7 @@ command: | # Run the dataset conversion python convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root ./my-copy-c4 \ --splits val_small val train_small train \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' diff --git a/scripts/data_prep/README.md b/scripts/data_prep/README.md index 3601cc865f..b72caeebc4 100644 --- a/scripts/data_prep/README.md +++ b/scripts/data_prep/README.md @@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`. ```bash # Convert C4 dataset to StreamingDataset format python convert_dataset_hf.py \ - --dataset c4 --data_subset en \ + --dataset allenai/c4 --data_subset en \ --out_root my-copy-c4 --splits train_small val_small \ --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \ --compression zstd diff --git a/scripts/train/README.md b/scripts/train/README.md index 6730cb793b..247814d782 100644 --- a/scripts/train/README.md +++ b/scripts/train/README.md @@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md# To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming. 
Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format. -As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here. +As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here. We first convert the dataset from its native format (a collection of zipped JSONs) to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files. @@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth. You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total. Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding ```bash -python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' +python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' ``` Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc. The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space! ```bash -python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' +python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' ``` For any of the above commands, you can also choose to compress the `.mds` files. 
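To sanity-check the converted output before training, the shards can be read back with StreamingDataset. A minimal sketch, assuming the local output path from the quickstart commands above and the converter's historical encoding of each sample as raw int64 token bytes under a 'tokens' key (adjust the field name and dtype if your version writes ndarrays instead):

import numpy as np
from streaming import StreamingDataset

# Read the converted val_small split straight from the local output directory.
dataset = StreamingDataset(local='./my-copy-c4', split='val_small', shuffle=False)

sample = dataset[0]
tokens = np.frombuffer(sample['tokens'], dtype=np.int64)
print(tokens.shape)  # expect (2048,) when converted with --concat_tokens 2048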
diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index fd7be1fc6d..27f5c26c7d 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -479,7 +479,7 @@ def run_config( if args.data_remote is None: command += f""" cd llm-foundry/scripts - python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>' + python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml """ else: diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py index e09c54ca70..da1e101ae7 100644 --- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py +++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py @@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path): # test calling it directly path = os.path.join(tmp_path, 'my-copy-c4-1') convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=['val_xsmall'], out_root=path, diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index fc0dc8a882..f1b76913d1 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -121,7 +121,7 @@ def test_loader_eval( # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' # Create second eval dataloader using the arxiv dataset. second_eval_loader = copy.deepcopy(first_eval_loader) second_eval_loader.label = 'arxiv' @@ -157,16 +157,17 @@ def test_loader_eval( print(inmemorylogger.data.keys()) # Checks for first eval dataloader - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 9af96f9868..b1bca9ebd0 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path): test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' # Create second eval dataloader using the arxiv dataset. 
second_eval_loader = copy.deepcopy(first_eval_loader) second_eval_loader.label = 'arxiv' @@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path): assert isinstance(inmemorylogger, InMemoryLogger) # Checks for first eval dataloader - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) @@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') first_eval_loader = test_cfg.eval_loader - first_eval_loader.label = 'c4' + first_eval_loader.label = 'allenai/c4' test_cfg.eval_loader = om.create([first_eval_loader]) test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches test_cfg.max_duration = '1ba' @@ -226,15 +227,16 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): 0] # pyright: ignore [reportGeneralTypeIssues] assert isinstance(inmemorylogger, InMemoryLogger) - assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'], list, ) assert len( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], ) > 0 assert isinstance( - inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], + inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1], tuple, ) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index d215d93542..7239bfe958 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -204,7 +204,7 @@ def test_correct_padding( shutil.rmtree(path, ignore_errors=True) if pretokenize: convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[split], out_root=path, @@ -219,7 +219,7 @@ def test_correct_padding( ) else: convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[split], out_root=path, @@ -233,7 +233,7 @@ def test_correct_padding( num_workers=None, ) if not os.path.isdir(path): - raise RuntimeError(f'c4 dataset at {path} not set up as expected') + raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected') test_cfg = get_config( conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml', diff --git a/tests/data_utils.py b/tests/data_utils.py index 117310b0cf..1f6c26b72e 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str: # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[downloaded_split], out_root=c4_dir, From 
ee456002a1dd86f3d9102ac5ade9f7436be51d82 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:39:39 -0400 Subject: [PATCH 24/42] Tensor Parallelism (#1521) Co-authored-by: Eitan Turok Co-authored-by: Mihir Patel --- llmfoundry/__init__.py | 2 + llmfoundry/command_utils/train.py | 32 +++++-- llmfoundry/registry.py | 22 +++++ llmfoundry/tp/__init__.py | 11 +++ llmfoundry/tp/ffn_tp_strategy.py | 56 +++++++++++++ llmfoundry/utils/builders.py | 29 +++++-- llmfoundry/utils/config_utils.py | 14 +++- tests/test_registry.py | 1 + tests/tp/__init__.py | 2 + tests/tp/test_tp_strategies.py | 133 ++++++++++++++++++++++++++++++ 10 files changed, 289 insertions(+), 13 deletions(-) create mode 100644 llmfoundry/tp/__init__.py create mode 100644 llmfoundry/tp/ffn_tp_strategy.py create mode 100644 tests/tp/__init__.py create mode 100644 tests/tp/test_tp_strategies.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index b851aaa559..07e8f35747 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -48,6 +48,7 @@ models, optim, tokenizers, + tp, utils, ) from llmfoundry._version import __version__ @@ -87,5 +88,6 @@ 'models', 'optim', 'tokenizers', + 'tp', 'utils', ] diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 14b7980d57..29878714f6 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,6 +5,7 @@ import os import time import warnings +from copy import deepcopy from typing import Any, Optional, Union import torch @@ -43,6 +44,7 @@ build_save_planner, build_scheduler, build_tokenizer, + build_tp_strategies, ) from llmfoundry.utils.config_utils import ( TRAIN_CONFIG_KEYS, @@ -329,16 +331,27 @@ def train(cfg: DictConfig) -> Trainer: changing autoresume default to True...', ) - # Warn if fsdp is enabled but user only has 1 GPU - if dist.get_world_size() == 1 and fsdp_config is not None: + # Optional tp config + tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + + # Warn if FSDP or TP is enabled but user only has 1 GPU + if dist.get_world_size( + ) == 1 and (fsdp_config is not None or tp_config is not None): + parallelism = '' + if fsdp_config is not None: + parallelism += 'FSDP' + if tp_config is not None: + parallelism += '+TP' if fsdp_config is not None else 'TP' warnings.warn( - 'FSDP is not applicable for single-GPU training. Reverting to DDP.', + f'{parallelism} is not applicable for single-GPU training. Reverting to DDP.', ) fsdp_config = None + tp_config = None # Initialize context - init_context = process_init_device(model_config, fsdp_config) + init_context = process_init_device(model_config, fsdp_config, tp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) + logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) # Build tokenizer log.info('Building tokenizer...') @@ -502,6 +515,15 @@ def train(cfg: DictConfig) -> Trainer: _log_num_params(model, logged_cfg) + # TP config + if tp_config is not None: + strategy = tp_config.pop('strategy', None) + assert isinstance(strategy, str), '`strategy` must be in `tp_config`.' 
+ tp_config['layer_plan'] = build_tp_strategies(strategy, model) + + # Parallelism config + parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} + # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') optimizer_cfg = train_cfg.optimizer @@ -546,7 +568,7 @@ def train(cfg: DictConfig) -> Trainer: precision=train_cfg.precision, algorithms=algorithms, device_train_microbatch_size=train_cfg.device_train_microbatch_size, - parallelism_config={'fsdp': fsdp_config}, + parallelism_config=parallelism_config, save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index cb2455a760..850c4f3bbd 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -7,6 +7,7 @@ from composer.models import ComposerModel from composer.optim import ComposerScheduler from torch.distributed.checkpoint import LoadPlanner, SavePlanner +from torch.distributed.tensor.parallel.style import ParallelStyle from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader from torch.utils.data import Dataset @@ -389,6 +390,26 @@ description=_save_planners_description, ) +_tp_strategies_description = ( + """The tp_strategies registry is used to register strategies for tensor parallelism. + + Args: + model (ComposerModel): The model. + + Returns: + layer_plan (Dict[str, ParallelStyle]): The plan used to parallelize the model. + model (ComposerModel): The model. + """ +) + +tp_strategies = create_registry( + 'llmfoundry', + 'tp_strategies', + generic_type=Callable[[ComposerModel], dict[str, ParallelStyle]], + entry_points=True, + description=_tp_strategies_description, +) + __all__ = [ 'loggers', 'callbacks', @@ -416,4 +437,5 @@ 'config_transforms', 'load_planners', 'save_planners', + 'tp_strategies', ] diff --git a/llmfoundry/tp/__init__.py b/llmfoundry/tp/__init__.py new file mode 100644 index 0000000000..323ae23727 --- /dev/null +++ b/llmfoundry/tp/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.registry import tp_strategies +from llmfoundry.tp.ffn_tp_strategy import ffn_tp_strategy + +tp_strategies.register('ffn', func=ffn_tp_strategy) + +__all__ = [ + 'ffn_tp_strategy', +] diff --git a/llmfoundry/tp/ffn_tp_strategy.py b/llmfoundry/tp/ffn_tp_strategy.py new file mode 100644 index 0000000000..1de92ef6ae --- /dev/null +++ b/llmfoundry/tp/ffn_tp_strategy.py @@ -0,0 +1,56 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.models import ComposerModel +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) +from torch.distributed.tensor.parallel.style import ParallelStyle + + +def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: + TP_LAYERS = {'ffn', 'ffn.up_proj', 'ffn.down_proj'} + + # Validate that all TP_LAYERS are in model + tp_layers_in_model = { + layer for layer in TP_LAYERS for name, _ in model.named_modules() + if layer in name + } + if tp_layers_in_model != TP_LAYERS: + raise RuntimeError( + f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. 
But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.', + ) + + # Generate layer plan + layer_plan: dict[str, ParallelStyle] = {} + for name, _ in model.named_modules(): + # Before the ffn layer starts, distribute the input data for proper TP use + # Inputs are currently sharded across the batch dimension (dim 0) as is done in standard DDP + # Inputs will be replicated across hidden dimension (dim 1) via allgather + if name.split('.')[-1] == 'ffn': + layer_plan[name] = PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ) + # Shard the ffn.up_proj weight matrix across its columns + # Inputs are already replicated across each TP group + # Outputs will be sharded along the hidden dimension (dim 1) via allgather + elif name.split('.')[-2:] == ['ffn', 'up_proj']: + layer_plan[name] = ColwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(-1), + ) + # Shard the ffn.down_proj weight matrix across its rows + # Inputs are sharded along the hidden dimension (dim 1) + # Outputs will be sharded along batch dimension (dim 0) via allreduce + elif name.split('.')[-2:] == ['ffn', 'down_proj']: + layer_plan[name] = RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ) + + return layer_plan diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f2d5cfc0f7..687b21b46d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -7,14 +7,9 @@ import logging import os import re +import warnings from collections import OrderedDict -from typing import ( - Any, - ContextManager, - Iterable, - Optional, - Union, -) +from typing import Any, ContextManager, Iterable, Optional, Union import torch from composer.core import Algorithm, Callback, Evaluator @@ -25,6 +20,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.distributed.checkpoint import LoadPlanner, SavePlanner +from torch.distributed.tensor.parallel.style import ParallelStyle from torch.optim.optimizer import Optimizer from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -37,6 +33,7 @@ ) from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry +from llmfoundry.utils.warnings import experimental_function log = logging.getLogger(__name__) @@ -52,6 +49,7 @@ 'build_tokenizer', 'build_composer_model', 'build_metric', + 'build_tp_strategies', ] @@ -701,3 +699,20 @@ def _validate_cfg(icl_cfg: dict[str, Any]): ) return evaluators, logger_keys + + +@experimental_function('Tensor Parallelism') +def build_tp_strategies( + name: str, + model: ComposerModel, +) -> dict[str, ParallelStyle]: + + warnings.warn( + 'Checkpointing is not currently supported for tensor parallelism due to this pytorch bug: https://github.com/pytorch/pytorch/issues/134095#issuecomment-2345018244', + ) + return construct_from_registry( + name=name, + registry=registry.tp_strategies, + partial_function=False, + kwargs={'model': model}, + ) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index ba5c5941b8..c22495993c 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -120,6 +120,7 @@ class TrainConfig: # Distributed training parameters dist_timeout: Union[int, float] = 600.0 fsdp_config: Optional[dict[str, Any]] = None + tp_config: Optional[dict[str, Any]] = None # Evaluation parameters eval_interval: Union[int, str] = 
1 @@ -501,7 +502,11 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: return cfg -def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): +def process_init_device( + model_cfg: dict[str, Any], + fsdp_config: Optional[dict] = None, + tp_config: Optional[dict] = None, +): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. @@ -533,6 +538,13 @@ def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + # Check we are not using tensor parallelism with MoEs + if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ + 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + raise ValueError( + 'Tensor Parallelism is not currently supported for MoE models.', + ) + # Set ffn_config.device_mesh using fsdp_config if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: diff --git a/tests/test_registry.py b/tests/test_registry.py index 5108a7d46c..90ef3bfaac 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -47,6 +47,7 @@ def test_expected_registries_exist(): 'config_transforms', 'load_planners', 'save_planners', + 'tp_strategies', } assert existing_registries == expected_registry_names diff --git a/tests/tp/__init__.py b/tests/tp/__init__.py new file mode 100644 index 0000000000..80950cb7b4 --- /dev/null +++ b/tests/tp/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py new file mode 100644 index 0000000000..fd2fa384ce --- /dev/null +++ b/tests/tp/test_tp_strategies.py @@ -0,0 +1,133 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +from omegaconf import OmegaConf as om +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) + +from llmfoundry.command_utils.train import train +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_tp_strategies +from llmfoundry.utils.config_utils import process_init_device +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg + + +@pytest.mark.gpu +@pytest.mark.filterwarnings( + 'ignore:tp_strategies is experimental and may change with future versions.', +) +def test_ffn_tp_strategy(): + """Test the FFN tensor parallelism strategy is correct.""" + # Create layer plan from fnn tp_strategy + tp_config = { + 'strategy': 'ffn', + } + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = build_tp_strategies(tp_config['strategy'], model) + + # Expected layer plan + _expected_layer_plan = { + 'ffn': + PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ), + 'ffn.down_proj': + RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ), + 'ffn.up_proj': + ColwiseParallel( + input_layouts=Replicate(), + 
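+                # Shard(-1) splits the up_proj output along the hidden
+                # dimension, mirroring the ColwiseParallel entry that
+                # ffn_tp_strategy builds above.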
output_layouts=Shard(-1), + ), + } + expected_layer_plan = { + f'model.transformer.blocks.{layer_idx}.{name}': layer_plan + for name, layer_plan in _expected_layer_plan.items() + for layer_idx in range(model_cfg['n_layers']) + } + + # Compare expected and actual layer plans + for (n1, lp1), (n2, lp2) in zip( + sorted(expected_layer_plan.items()), + sorted(layer_plan.items()), + ): + assert n1 == n2 + assert type(lp1) == type(lp2) + if isinstance( + lp1, + PrepareModuleInput, + ) and isinstance(lp2, PrepareModuleInput): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.desired_input_layouts == lp2.desired_input_layouts + assert lp1.use_local_output == lp2.use_local_output + elif ( + isinstance(lp1, ColwiseParallel) and + isinstance(lp2, ColwiseParallel) + ) or ( + isinstance(lp1, RowwiseParallel) and + isinstance(lp2, RowwiseParallel) + ): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.output_layouts == lp2.output_layouts + assert lp1.use_local_output == lp2.use_local_output + else: + raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') + + +@pytest.mark.gpu +def test_no_tp_with_one_gpu(): + """Test that when we have one GPU, we use DDP and not FSDP-TP.""" + with TemporaryDirectory() as tmp_path: + # Make `train_cfg`` with a tensor parallelism strategy + dataset_name = create_c4_dataset_xxsmall(Path(tmp_path)) + train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') + train_cfg.tp_config = {'strategy': 'ffn'} + + # Expect a warning + with pytest.warns( + UserWarning, + match= + r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.', + ): + train(train_cfg) + + +@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies +def test_no_tp_with_moes(): + """Test that tensor parallelism is not compatible with MoEs.""" + # Make `cfg` for MoE model, fsdp, and tp + train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml' + with open(train_cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + model_cfg = train_cfg.model + fsdp_cfg = train_cfg.fsdp_config + tp_cfg = {'strategy': 'ffn'} + + # Expect an error + with pytest.raises( + ValueError, + match='Tensor Parallelism is not currently supported for MoE models.', + ): + process_init_device(model_cfg, fsdp_cfg, tp_cfg) From 107d246a4c9c04f0a906f8f0fafcca1297d9e68e Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 27 Sep 2024 13:12:00 -0700 Subject: [PATCH 25/42] Insufficient Permissions Error when trying to access table (#1555) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 127 +++++++----------- llmfoundry/utils/exceptions.py | 13 +- .../data_prep/test_convert_delta_to_json.py | 23 ++-- tests/utils/test_exceptions.py | 39 ++++-- 4 files changed, 103 insertions(+), 99 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index fbbc5f2cd9..44e8651cdf 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -234,27 +234,7 @@ def run_query( elif method == 'dbconnect': if spark == None: raise ValueError(f'sparkSession is required for dbconnect') - - try: - df = spark.sql(query) - except Exception as e: - from pyspark.errors import AnalysisException - if isinstance(e, AnalysisException): - if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore - match = re.search( - r"Schema\s+'([^']+)'", - e.message, # pyright: ignore - ) - if match: - 
schema_name = match.group(1) - action = f'using the schema {schema_name}' - else: - action = 'using the schema' - raise InsufficientPermissionsError(action=action,) from e - raise RuntimeError( - f'Error in querying into schema. Restart sparkSession and try again', - ) from e - + df = spark.sql(query) if collect: return df.collect() return df @@ -469,71 +449,66 @@ def fetch( """ cursor = dbsql.cursor() if dbsql is not None else None try: - nrows = get_total_rows( - tablename, - method, - cursor, - sparkSession, - ) - except Exception as e: - from pyspark.errors import AnalysisException - if isinstance(e, AnalysisException): - if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore - raise InsufficientPermissionsError( - action=f'reading from {tablename}', - ) from e - if isinstance(e, InsufficientPermissionsError): - raise e - raise RuntimeError( - f'Error in get rows from {tablename}. Restart sparkSession and try again', - ) from e + # Get total rows + nrows = get_total_rows(tablename, method, cursor, sparkSession) - try: + # Get columns info columns, order_by, columns_str = get_columns_info( tablename, method, cursor, sparkSession, ) + + if method == 'dbconnect' and sparkSession is not None: + log.info(f'{processes=}') + df = sparkSession.table(tablename) + + # Running the query and collecting the data as arrow or json. + signed, _, _ = df.collect_cf('arrow') # pyright: ignore + log.info(f'len(signed) = {len(signed)}') + + args = get_args(signed, json_output_folder, columns) + + # Stopping the SparkSession to avoid spilling connection state into the subprocesses. + sparkSession.stop() + + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_starargs, args)) + + elif method == 'dbsql' and cursor is not None: + for start in range(0, nrows, batch_size): + log.warning(f'batch {start}') + end = min(start + batch_size, nrows) + fetch_data( + method, + cursor, + sparkSession, + start, + end, + order_by, + tablename, + columns_str, + json_output_folder, + ) + except Exception as e: - raise RuntimeError( - f'Error in get columns from {tablename}. Restart sparkSession and try again', - ) from e + from databricks.sql.exc import ServerOperationError + from pyspark.errors import AnalysisException - if method == 'dbconnect' and sparkSession is not None: - log.info(f'{processes=}') - df = sparkSession.table(tablename) - - # Running the query and collecting the data as arrow or json. - signed, _, _ = df.collect_cf('arrow') # pyright: ignore - log.info(f'len(signed) = {len(signed)}') - - args = get_args(signed, json_output_folder, columns) - - # Stopping the SparkSession to avoid spilling connection state into the subprocesses. 
- sparkSession.stop() - - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_starargs, args)) - - elif method == 'dbsql' and cursor is not None: - for start in range(0, nrows, batch_size): - log.warning(f'batch {start}') - end = min(start + batch_size, nrows) - fetch_data( - method, - cursor, - sparkSession, - start, - end, - order_by, - tablename, - columns_str, - json_output_folder, - ) + if isinstance(e, (AnalysisException, ServerOperationError)): + if 'INSUFFICIENT_PERMISSIONS' in str(e): + raise InsufficientPermissionsError(str(e)) from e + + if isinstance(e, InsufficientPermissionsError): + raise + + # For any other exception, raise a general error + raise RuntimeError(f'Error processing {tablename}: {str(e)}') from e - if cursor is not None: - cursor.close() + finally: + if cursor is not None: + cursor.close() def validate_and_get_cluster_info( diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 265b9bbe8f..242ac4f32c 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -456,6 +456,13 @@ def __init__( class InsufficientPermissionsError(UserError): """Error thrown when the user does not have sufficient permissions.""" - def __init__(self, action: str) -> None: - message = f'Insufficient permissions when {action}. Please check your permissions.' - super().__init__(message, action=action) + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (InsufficientPermissionsError, (self.message,)) + + def __str__(self): + return self.message diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index b1a9f1e878..981f5c1ed6 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -10,6 +10,7 @@ from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( InsufficientPermissionsError, download, + fetch, fetch_DT, format_tablename, iterative_combine_jsons, @@ -30,27 +31,33 @@ class MockAnalysisException(Exception): def __init__(self, message: str): self.message = message + def __str__(self): + return self.message + with patch.dict('sys.modules', {'pyspark.errors': MagicMock()}): sys.modules[ 'pyspark.errors' - ].AnalysisException = MockAnalysisException # pyright: ignore + ].AnalysisException = MockAnalysisException # type: ignore mock_spark = MagicMock() mock_spark.sql.side_effect = MockAnalysisException(error_message) with self.assertRaises(InsufficientPermissionsError) as context: - run_query( - 'SELECT * FROM table', + fetch( method='dbconnect', - cursor=None, - spark=mock_spark, + tablename='main.oogabooga', + json_output_folder='/fake/path', + batch_size=1, + processes=1, + sparkSession=mock_spark, + dbsql=None, ) - self.assertIn( - 'using the schema main.oogabooga', + self.assertEqual( str(context.exception), + error_message, ) - mock_spark.sql.assert_called_once_with('SELECT * FROM table') + mock_spark.sql.assert_called() @patch( 'databricks.sql.connect', diff --git a/tests/utils/test_exceptions.py b/tests/utils/test_exceptions.py index 8bfc7287ab..564dfa2f14 100644 --- a/tests/utils/test_exceptions.py +++ b/tests/utils/test_exceptions.py @@ -4,7 +4,7 @@ import contextlib import inspect import pickle -from typing import Any, Optional +from typing import Any, Optional, get_type_hints 
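+# Unlike raw __annotations__, get_type_hints resolves string annotations
+# (e.g. those produced under `from __future__ import annotations`).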
import pytest @@ -14,16 +14,30 @@ def create_exception_object( exception_class: type[foundry_exceptions.BaseContextualError], ): - # get required arg types of exception class by inspecting its __init__ method - if hasattr(inspect, 'get_annotations'): - required_args = inspect.get_annotations( # type: ignore - exception_class.__init__, - ) # type: ignore - else: - required_args = exception_class.__init__.__annotations__ # python 3.9 and below - - # create a dictionary of required args with default values + def get_init_annotations(cls: type): + try: + return get_type_hints(cls.__init__) + except (AttributeError, TypeError): + # Handle cases where __init__ does not exist or has no annotations + return {} + + # First, try to get annotations from the class itself + required_args = get_init_annotations(exception_class) + + # If the annotations are empty, look at parent classes + if not required_args: + for parent in exception_class.__bases__: + if parent == object: + break + parent_args = get_init_annotations(parent) + if parent_args: + required_args = parent_args + break + + # Remove self, return, and kwargs + required_args.pop('self', None) + required_args.pop('return', None) required_args.pop('kwargs', None) def get_default_value(arg_type: Optional[type] = None): @@ -51,8 +65,6 @@ def get_default_value(arg_type: Optional[type] = None): return [{'key': 'value'}] raise ValueError(f'Unsupported arg type: {arg_type}') - required_args.pop('self', None) - required_args.pop('return', None) kwargs = { arg: get_default_value(arg_type) for arg, arg_type in required_args.items() @@ -80,6 +92,7 @@ def filter_exceptions(possible_exceptions: list[str]): def test_exception_serialization( exception_class: type[foundry_exceptions.BaseContextualError], ): + print(f'Testing serialization for {exception_class.__name__}') excluded_base_classes = [ foundry_exceptions.InternalError, foundry_exceptions.UserError, @@ -88,6 +101,7 @@ def test_exception_serialization( ] exception = create_exception_object(exception_class) + print(f'Created exception object: {exception}') expect_reduce_error = exception.__class__ in excluded_base_classes error_context = pytest.raises( @@ -95,6 +109,7 @@ def test_exception_serialization( ) if expect_reduce_error else contextlib.nullcontext() exc_str = str(exception) + print(f'Exception string: {exc_str}') with error_context: pkl = pickle.dumps(exception) unpickled_exc = pickle.loads(pkl) From 4202a063ea744f1713bba3c4aa52955913974418 Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Mon, 30 Sep 2024 10:32:21 -0700 Subject: [PATCH 26/42] Add NoOp optimizer (#1560) --- llmfoundry/optim/__init__.py | 3 +++ llmfoundry/optim/no_op.py | 44 +++++++++++++++++++++++++++++++++ tests/optim/test_no_op.py | 48 ++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 llmfoundry/optim/no_op.py create mode 100644 tests/optim/test_no_op.py diff --git a/llmfoundry/optim/__init__.py b/llmfoundry/optim/__init__.py index 0b55944338..ce93487aef 100644 --- a/llmfoundry/optim/__init__.py +++ b/llmfoundry/optim/__init__.py @@ -10,6 +10,7 @@ from llmfoundry.optim.adaptive_lion import DecoupledAdaLRLion, DecoupledClipLion from llmfoundry.optim.lion import DecoupledLionW +from llmfoundry.optim.no_op import NoOp from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler from llmfoundry.registry import optimizers, schedulers @@ -17,6 +18,7 @@ optimizers.register('clip_lion', func=DecoupledClipLion) optimizers.register('decoupled_lionw', func=DecoupledLionW) 
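+# The names registered here are what YAML configs reference; e.g. a minimal
+# `optimizer: {name: no_op}` block (a sketch) resolves through this registry.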
optimizers.register('decoupled_adamw', func=DecoupledAdamW) +optimizers.register('no_op', func=NoOp) schedulers.register('constant_with_warmup', func=ConstantWithWarmupScheduler) schedulers.register( @@ -33,5 +35,6 @@ 'DecoupledLionW', 'DecoupledClipLion', 'DecoupledAdaLRLion', + 'NoOp', 'InverseSquareRootWithWarmupScheduler', ] diff --git a/llmfoundry/optim/no_op.py b/llmfoundry/optim/no_op.py new file mode 100644 index 0000000000..416363c261 --- /dev/null +++ b/llmfoundry/optim/no_op.py @@ -0,0 +1,44 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Iterable, Optional + +import torch + + +class NoOp(torch.optim.Optimizer): + """Optimizer that performs no optimization steps.""" + + def __init__( + self, + params: Iterable[torch.Tensor], + ): + """Initialize NoOp optimizer. + + Args: + params (Iterable[torch.Tensor]): Model parameters for the optimizer. + """ + # LR schedulers expect param groups to have LR. Unused. + defaults = {'lr': 0.0} + super().__init__(params, defaults) + + def __setstate__(self, state: dict[str, dict[Any, Any]]) -> None: + super().__setstate__(state) + + def state_dict(self): + return super().state_dict() + + @torch.no_grad() + def step(self, closure: Optional[Callable] = None): + """Perform no-op optimization step where no parameters are updated. + + Args: + closure (Callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + return loss diff --git a/tests/optim/test_no_op.py b/tests/optim/test_no_op.py new file mode 100644 index 0000000000..27766d6eaf --- /dev/null +++ b/tests/optim/test_no_op.py @@ -0,0 +1,48 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Callable + +import torch +from composer.trainer import Trainer +from torch.utils.data import DataLoader + +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_optimizer + + +def test_no_op_does_nothing( + build_tiny_mpt: Callable[..., ComposerMPTCausalLM], + tiny_ft_dataloader: DataLoader, +): + + # Build MPT model + model = build_tiny_mpt( + loss_fn='torch_crossentropy', + attn_config={ + 'attn_impl': 'torch', + }, + ) + + # Build NoOp optimizer + no_op_optim = build_optimizer(model, 'no_op', optimizer_config={}) + + orig_model = copy.deepcopy(model) + + # build trainer + trainer = Trainer( + model=model, + train_dataloader=tiny_ft_dataloader, + max_duration=f'2ba', + optimizers=no_op_optim, + ) + trainer.fit() + + # Check that the model has not changed + for ( + (orig_name, orig_param), + (new_name, new_param), + ) in zip(orig_model.named_parameters(), model.named_parameters()): + print(f'Checking {orig_name} and {new_name}') + assert torch.equal(orig_param, new_param) From 0ad6ab4757bc5bd232a85278d65e7399efcf44dc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 30 Sep 2024 11:16:44 -0700 Subject: [PATCH 27/42] Deterministic GCRP Errors (#1559) Co-authored-by: v-chen_data --- .../data_prep/convert_delta_to_json.py | 31 ++++++---- llmfoundry/utils/exceptions.py | 15 +++++ .../data_prep/test_convert_delta_to_json.py | 58 +++++++++++++++++++ 3 files changed, 94 insertions(+), 10 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 44e8651cdf..2321d306ff 100644 --- 
a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -23,6 +23,7 @@ ClusterInvalidAccessMode, FailedToConnectToDatabricksError, FailedToCreateSQLConnectionError, + FaultyDataPrepCluster, InsufficientPermissionsError, ) @@ -660,16 +661,26 @@ def fetch_DT( ) formatted_delta_table_name = format_tablename(delta_table_name) - - fetch( - method, - formatted_delta_table_name, - json_output_folder, - batch_size, - processes, - sparkSession, - dbsql, - ) + import grpc + try: + fetch( + method, + formatted_delta_table_name, + json_output_folder, + batch_size, + processes, + sparkSession, + dbsql, + ) + except grpc.RpcError as e: + if e.code( + ) == grpc.StatusCode.INTERNAL and 'Job aborted due to stage failure' in e.details( + ): + raise FaultyDataPrepCluster( + message= + f'Faulty data prep cluster, please try swapping data prep cluster: {e.details()}', + ) from e + raise e if dbsql is not None: dbsql.close() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 242ac4f32c..9cbea2cac8 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -466,3 +466,18 @@ def __reduce__(self): def __str__(self): return self.message + + +class FaultyDataPrepCluster(UserError): + """Error thrown when the user uses faulty data prep cluster.""" + + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (FaultyDataPrepCluster, (self.message,)) + + def __str__(self): + return self.message diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index 981f5c1ed6..34a5b5ca55 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -7,7 +7,10 @@ from typing import Any from unittest.mock import MagicMock, mock_open, patch +import grpc + from llmfoundry.command_utils.data_prep.convert_delta_to_json import ( + FaultyDataPrepCluster, InsufficientPermissionsError, download, fetch, @@ -524,3 +527,58 @@ def test_format_tablename(self): format_tablename('hyphenated-catalog.schema.test_table'), '`hyphenated-catalog`.`schema`.`test_table`', ) + + @patch('llmfoundry.command_utils.data_prep.convert_delta_to_json.fetch') + @patch( + 'llmfoundry.command_utils.data_prep.convert_delta_to_json.validate_and_get_cluster_info', + ) + def test_fetch_DT_grpc_error_handling( + self, + mock_validate_cluster_info: MagicMock, + mock_fetch: MagicMock, + ): + # Arrange + # Mock the validate_and_get_cluster_info to return test values + mock_validate_cluster_info.return_value = ('dbconnect', None, None) + + # Create a grpc.RpcError with StatusCode.INTERNAL and specific details + grpc_error = grpc.RpcError() + grpc_error.code = lambda: grpc.StatusCode.INTERNAL + grpc_error.details = lambda: 'Job aborted due to stage failure: Task failed due to an error.' 
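+        # fetch_DT calls e.code() and e.details() on the error, so the mock
+        # supplies both as callables rather than plain attributes.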
+ + # Configure the fetch function to raise the grpc.RpcError + mock_fetch.side_effect = grpc_error + + # Test inputs + delta_table_name = 'test_table' + json_output_folder = '/tmp/to/jsonl' + http_path = None + cluster_id = None + use_serverless = False + DATABRICKS_HOST = 'https://test-host' + DATABRICKS_TOKEN = 'test-token' + + # Act & Assert + with self.assertRaises(FaultyDataPrepCluster) as context: + fetch_DT( + delta_table_name=delta_table_name, + json_output_folder=json_output_folder, + http_path=http_path, + cluster_id=cluster_id, + use_serverless=use_serverless, + DATABRICKS_HOST=DATABRICKS_HOST, + DATABRICKS_TOKEN=DATABRICKS_TOKEN, + ) + + # Verify that the FaultyDataPrepCluster contains the expected message + self.assertIn( + 'Faulty data prep cluster, please try swapping data prep cluster: ', + str(context.exception), + ) + self.assertIn( + 'Job aborted due to stage failure', + str(context.exception), + ) + + # Verify that fetch was called + mock_fetch.assert_called_once() From bdc58b3c9279485ea3cf242d34260261fd1af4bc Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:13:54 -0400 Subject: [PATCH 28/42] Simplify CL API (#1510) --- .../callbacks/curriculum_learning_callback.py | 78 ++++++++++++------- .../test_curriculum_learning_callback.py | 48 ++++++++---- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/llmfoundry/callbacks/curriculum_learning_callback.py b/llmfoundry/callbacks/curriculum_learning_callback.py index 449ab338bc..70e996e494 100644 --- a/llmfoundry/callbacks/curriculum_learning_callback.py +++ b/llmfoundry/callbacks/curriculum_learning_callback.py @@ -9,7 +9,8 @@ import copy import logging -from typing import Any +import warnings +from typing import Any, Optional, Union from composer import DataSpec from composer.core import State, Time, TimeUnit, ensure_time @@ -23,6 +24,7 @@ BaseContextualError, TrainDataLoaderLocation, ) +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -32,19 +34,21 @@ class CurriculumLearning(CallbackWithConfig): """Starts an epoch with a different dataset when resuming from a checkpoint. + Example duration: + tok Example schedule: [ { 'duration': tok, - 'train_loader': , # matches top level train_loader + 'dataset': , }, { 'duration': tok, - 'train_loader': , + 'dataset': , }, { 'duration': tok, - 'train_loader': , + 'dataset': , ], ] @@ -53,48 +57,59 @@ class CurriculumLearning(CallbackWithConfig): being used. Note that this is the full train config and must contain the 'train_loader', 'device_train_batch_size', and 'tokenizer' keys. + duration (Union[Time, str, int], optional): The duration of the first datamix + (which corresponds to the train_loader). Defaults to None. schedule (list[dict[str, Any]]): The list of datamixes to use and their durations. Duration units must match max_duration and be in terms of a TimeUnit that is supported by Iteration. The duration values must be positive. There must be at least one datamix in the schedule. The - first datamix in the schedule must match the train_loader in the - train_config. On resumption, previously trained on datamixes and - durations cannot be changed. The duration of the current datamix - must be greater than the saved timestamp. The dataset must be a - StreamingDataset. + first datamix during training is not included in the schedule. On + resumption, previously trained on datamixes and durations cannot be + changed. 
The duration of the current datamix must be greater than + the saved timestamp. The dataset must be a StreamingDataset. """ def __init__( self, train_config: dict[str, Any], schedule: list[dict[str, Any]], + duration: Optional[Union[Time, str, int]] = None, ): + if duration is None: + warnings.warn( + VersionedDeprecationWarning( + 'Specifying the full schedule in the CurriculumLearning ' + + 'callback is deprecated. Please specify the duration of ' + + 'the first datamix separately and change the schedule ' + + 'use datasets instead of dataloaders.', + remove_version='0.15.0', + ), + ) + # Ensure all duration units are in epochs or tokens and values are positive self._schedule = schedule if len(self._schedule) == 0: raise ValueError('The schedule must have at least one datamix.') - for index, datamix in enumerate(self._schedule): + if duration is not None: + first_datamix = { + 'duration': duration, + 'dataset': train_config['train_loader']['dataset'], + } + self._schedule.insert(0, first_datamix) + for datamix in self._schedule: self._validate_datamix(datamix) - if ( - index == 0 and - train_config['train_loader'] != datamix['train_loader'] - ): - raise ValueError(( - 'The first datamix in the schedule must match the ' - 'train_loader in the train_config.' - )) - self._schedule_index = 0 - self.device_train_batch_size = train_config['device_train_batch_size'] - self.tokenizer = None + self._train_loader_config: dict[str, Any] = train_config['train_loader'] + self._device_train_batch_size = train_config['device_train_batch_size'] + self._tokenizer = None def init(self, state: State, logger: Logger): del logger # unused if not hasattr(state.model, 'tokenizer'): raise ValueError('state.model must have a tokenizer attribute.') - self.tokenizer = state.model.tokenizer + self._tokenizer = state.model.tokenizer def before_load(self, state: State, logger: Logger): del logger # unused @@ -151,8 +166,13 @@ def iteration_start(self, state: State, logger: Logger): # which is stale clean_stale_shared_memory() datamix = copy.deepcopy(self._schedule[self._schedule_index]) + train_loader_config = copy.deepcopy(self._train_loader_config) + if 'dataset' in datamix: + train_loader_config['dataset'].update(datamix['dataset']) + else: + train_loader_config = datamix['train_loader'] data_spec = self._build_train_loader( - train_loader_config=datamix['train_loader'], + train_loader_config=train_loader_config, logger=logger, ) state.set_dataloader( @@ -211,18 +231,20 @@ def _build_train_loader( train_loader_config: dict[str, Any], logger: Logger, ) -> DataSpec: + del logger # unused + from llmfoundry.data.dataloader import build_dataloader # Copied from scripts/train/train.py log.info( f'Building train loader in CurriculumLearning callback for dataset {self._schedule_index}', ) - assert self.tokenizer is not None + assert self._tokenizer is not None try: return build_dataloader( train_loader_config, - self.tokenizer, - self.device_train_batch_size, + self._tokenizer, + self._device_train_batch_size, ) except BaseContextualError as e: e.location = TrainDataLoaderLocation @@ -260,5 +282,5 @@ def _validate_datamix(self, datamix: dict[str, Any]): 'Schedules can only be defined in terms of epochs or tokens.', ) - if 'train_loader' not in datamix: - raise ValueError('Each datamix must have a train_loader.') + if 'train_loader' not in datamix and 'dataset' not in datamix: + raise ValueError('Each datamix must have a dataset.') diff --git a/tests/callbacks/test_curriculum_learning_callback.py 
b/tests/callbacks/test_curriculum_learning_callback.py index 075698a4c0..0e6a6c1efe 100644 --- a/tests/callbacks/test_curriculum_learning_callback.py +++ b/tests/callbacks/test_curriculum_learning_callback.py @@ -22,7 +22,7 @@ [ (None, '1ep'), ({ - 'dataset': 'some_dataset', + 'hf_name': 'some_dataset', }, '1ep'), (None, '10tok'), (None, ''), @@ -36,23 +36,29 @@ def test_curriculum_learning_callback_init( ): test_cfg = _get_test_cfg() test_cfg['train_loader'] = tiny_ft_dataloader_cfg - train_loader = test_cfg['train_loader'] if datamix is None else datamix + if datamix is None: + train_loader = test_cfg['train_loader']['dataset'] + else: + train_loader = datamix kwargs = { 'schedule': [{ 'duration': duration, - 'train_loader': train_loader, + 'dataset': train_loader, }, { 'duration': '2ep', - 'train_loader': {}, + 'dataset': {}, }], } + + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + if duration == '': del kwargs['schedule'][0]['duration'] if datamix is not None and len(datamix) == 0: - del kwargs['schedule'][0]['train_loader'] + del kwargs['schedule'][0]['dataset'] context = nullcontext() - if datamix is not None or duration == '': + if (datamix is not None and len(datamix) == 0) or duration == '': context = pytest.raises(ValueError) with context: callback = build_callback( @@ -85,13 +91,15 @@ def test_curriculum_learning_callback_before_load( kwargs = { 'schedule': [{ 'duration': duration, - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -123,13 +131,15 @@ def test_curriculum_learning_callback_after_load(build_tiny_mpt: Callable,): kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -168,13 +178,15 @@ def test_curriculum_learning_callback_iteration( kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -208,13 +220,15 @@ def test_curriculum_learning_callback_state_dict(build_tiny_mpt: Callable,): kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, @@ -249,13 +263,15 @@ def test_curriculum_learning_callback_load_state_dict( kwargs = { 'schedule': [{ 'duration': '1ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], }, { 'duration': '2ep', - 'train_loader': test_cfg['train_loader'], + 'dataset': test_cfg['train_loader']['dataset'], 
}], } + kwargs['duration'] = kwargs['schedule'].pop(0)['duration'] + callback = build_callback( 'curriculum_learning', kwargs=kwargs, From 30cdd67f54581143722cbfcf1b775c32ebc56730 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:36:36 -0700 Subject: [PATCH 29/42] Reapply #1389 (#1561) --- llmfoundry/data/finetuning/dataloader.py | 56 +++++++++--------------- llmfoundry/data/finetuning/tasks.py | 2 +- llmfoundry/models/hf/hf_base.py | 2 +- llmfoundry/utils/builders.py | 2 +- 4 files changed, 23 insertions(+), 39 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 69051a2d51..612b8d6385 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -575,42 +575,26 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join( - finetune_dir, - f'.node_{dist.get_node_rank()}_local_rank0_completed', - ) - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS - ] - raise FileNotFoundError( - f'Could not find a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue - - os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) - with open(signal_file_path, 'wb') as f: - f.write(b'local_rank0_completed_download') - - # Avoid the collective call until the local rank zero has finished trying to download the dataset - # so that we don't timeout for large downloads. This syncs all processes on the node - with dist.local_rank_zero_download_and_wait(signal_file_path): - # Then, wait to ensure every node has finished trying to download the dataset - dist.barrier() - - # clean up signal file - if dist.get_local_rank() == 0: - os.remove(signal_file_path) + with dist.busy_wait_for_local_rank_zero(finetune_dir): + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' + for ext in SUPPORTED_EXTENSIONS + ] + raise FileNotFoundError( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}', + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue dist.barrier() break return finetune_dir diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e8f6484ef2..e099ffe14a 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -877,7 +877,7 @@ def build_from_hf( if tokenizer is None: raise ValueError('A tokenizer must be provided.') - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' + signal_file_path = dist.get_node_signal_file_name() # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. 
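    # (the pattern, sketched: local rank 0 writes the file named by
    # dist.get_node_signal_file_name() when it finishes, and the other ranks
    # block until that file appears)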
# Once local rank 0 is done, the datasets are all cached on disk, and all other ranks diff --git a/llmfoundry/models/hf/hf_base.py b/llmfoundry/models/hf/hf_base.py index d193e1067f..2ec9bbaa98 100644 --- a/llmfoundry/models/hf/hf_base.py +++ b/llmfoundry/models/hf/hf_base.py @@ -356,7 +356,7 @@ def build_inner_model( f'init_device="{init_device}" must be either "cpu" or "meta".', ) - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' + signal_file_path = dist.get_node_signal_file_name() if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 687b21b46d..ae04b68ee5 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -494,7 +494,7 @@ def build_tokenizer( os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' - signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' + signal_file_path = dist.get_node_signal_file_name() if dist.is_available() and dist.is_initialized( ) and dist.get_world_size() > 1: From ec4cafd4faa417b370bd189811aef85cc9506cc9 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:14:49 -0400 Subject: [PATCH 30/42] Add dataset swap callback (#1536) --- llmfoundry/callbacks/__init__.py | 2 + llmfoundry/callbacks/dataset_swap_callback.py | 114 ++++++++++++++++++ tests/callbacks/test_dataset_swap_callback.py | 14 +++ 3 files changed, 130 insertions(+) create mode 100644 llmfoundry/callbacks/dataset_swap_callback.py create mode 100644 tests/callbacks/test_dataset_swap_callback.py diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py index fe84efa316..8a7e1312eb 100644 --- a/llmfoundry/callbacks/__init__.py +++ b/llmfoundry/callbacks/__init__.py @@ -17,6 +17,7 @@ from llmfoundry.callbacks.async_eval_callback import AsyncEval from llmfoundry.callbacks.curriculum_learning_callback import CurriculumLearning +from llmfoundry.callbacks.dataset_swap_callback import DatasetSwap from llmfoundry.callbacks.env_logging_callback import EnvironmentLoggingCallback from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet from llmfoundry.callbacks.eval_output_logging_callback import EvalOutputLogging @@ -65,6 +66,7 @@ callbacks_with_config.register('async_eval', func=AsyncEval) callbacks_with_config.register('curriculum_learning', func=CurriculumLearning) +callbacks_with_config.register('dataset_swap', func=DatasetSwap) __all__ = [ 'FDiffMetrics', diff --git a/llmfoundry/callbacks/dataset_swap_callback.py b/llmfoundry/callbacks/dataset_swap_callback.py new file mode 100644 index 0000000000..415819e428 --- /dev/null +++ b/llmfoundry/callbacks/dataset_swap_callback.py @@ -0,0 +1,114 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Enable curriculum learning by resuming with a different dataset. + +This callback is currently experimental. The API may change without warning in +the future. 
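+It keeps the older dataset_index-based swap behavior that CurriculumLearning
+is moving away from with its schedule-based API.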
+""" + +import logging +from typing import Any + +from composer.core import State +from composer.loggers import Logger +from streaming import StreamingDataset +from torch.utils.data import DataLoader + +from llmfoundry.interfaces import CallbackWithConfig +from llmfoundry.utils.warnings import experimental_class + +log = logging.getLogger(__name__) + +__all__ = ['DatasetSwap'] + + +@experimental_class('DatasetSwap callback') +class DatasetSwap(CallbackWithConfig): + """Starts an epoch with a different dataset when resuming from a checkpoint. + + Args: + train_config (Dict): The configuration of the dataset currently + being used. Note that this is the full train config and must + contain the 'train_loader' key. + dataset_index (int): The index of the dataset currently being used. + """ + + def __init__(self, train_config: dict, dataset_index: int): + self.dataset_index = dataset_index + self.saved_dataset_index = 0 + self.all_dataset_configs = [] + self.current_dataset_state = {} + # The current dataset config is resolved and passed in train.py + self.current_dataset_config = train_config['train_loader'] + + def before_load(self, state: State, logger: Logger): + del logger + + # Save the current dataset state so we can restore it correctly + # if we are resuming with a new dataset. + train_loader = state.train_dataloader + # Check if we are using a DataLoader and StreamingDataset + if not isinstance(train_loader, DataLoader): + raise ValueError( + f'CurriculumLearning callback can only be used with a train ', + f'dataloader of type DataLoader, but got {type(train_loader)}.', + ) + dataset = train_loader.dataset + if not isinstance(dataset, StreamingDataset): + raise ValueError( + f'CurriculumLearning callback only supports StreamingDataset ', + f'because it requires loading and saving dataset state. ', + f'Instead, got a dataset of type {type(dataset)}', + ) + assert isinstance(dataset, StreamingDataset) + # Save the current dataset state so we can restore it if needed. + self.current_dataset_state = dataset.state_dict( # type: ignore + num_samples=0, from_beginning=False) + + def after_load(self, state: State, logger: Logger): + del logger + + # As saved_dataset_index is loaded from state_dict, this only runs when + # a user explicitly increments the dataset_index and not on any other + # resumption, including autoresume. + train_loader = state._train_dataloader + assert isinstance( + train_loader, + DataLoader, + ), 'CurriculumLearning callback requires a DataLoader.' + dataset = train_loader.dataset + assert isinstance( + dataset, + StreamingDataset, + ), 'CurriculumLearning callback requires a StreamingDataset.' + if self.saved_dataset_index < self.dataset_index: + # Ignore the dataset state that was read in from the checkpoint, and + # replace with the new dataset state. This preserves resumption info. + if self.current_dataset_state['epoch'] < 0: + # Make sure the epoch in the loaded state dict is not negative. + # Since `__iter__` has not yet been called on the dataset, the + # epoch index in the dataset will still be -1. We need to ensure + # that we set the epoch correctly to 0 in this case. + self.current_dataset_state['epoch'] = 0 + dataset.load_state_dict( # type: ignore + self.current_dataset_state) + # Start a new epoch since we are using a new dataset. + # This will also reset the sample_in_epoch written to checkpoint, + # making sure that subsequent resumptions proceed correctly. 
+ state.timestamp = state.timestamp.to_next_epoch() + # Append the new dataset config to the list of all dataset configs. + self.all_dataset_configs.append(self.current_dataset_config) + elif self.dataset_index == 0 and len(self.all_dataset_configs) == 0: + # Make sure to track our current dataset config if we are just starting training. + self.all_dataset_configs.append(self.current_dataset_config) + + def state_dict(self): + return { + 'dataset_index': self.dataset_index, + 'all_dataset_configs': self.all_dataset_configs, + } + + def load_state_dict(self, state: dict[str, Any]): + self.saved_dataset_index = state.get('dataset_index', 0) + self.all_dataset_configs = state.get('all_dataset_configs', []) diff --git a/tests/callbacks/test_dataset_swap_callback.py b/tests/callbacks/test_dataset_swap_callback.py new file mode 100644 index 0000000000..f54b0f6f5f --- /dev/null +++ b/tests/callbacks/test_dataset_swap_callback.py @@ -0,0 +1,14 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.utils.builders import build_callback + + +def test_dataset_swap_callback_builds(): + kwargs = {'dataset_index': 0} + callback = build_callback( + 'dataset_swap', + kwargs=kwargs, + train_config={'train_loader': {}}, + ) + assert callback is not None From b517297091d4ecfd6f2030d64e5cddb59cac7935 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 1 Oct 2024 14:02:38 -0400 Subject: [PATCH 31/42] Add error to catch more unknown example types (#1562) --- llmfoundry/data/finetuning/tasks.py | 2 ++ tests/data/test_template_tokenization.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e099ffe14a..c81856b8ba 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -174,6 +174,8 @@ def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]): if not isinstance(dictionary, Mapping): raise InvalidExampleTypeError(str(type(dictionary))) desired_keys = allowed_keys.intersection(dictionary.keys()) + if len(desired_keys) == 0: + raise UnknownExampleTypeError(str(set(dictionary.keys()))) return list(desired_keys)[0] diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py index fdf7233115..0697894bb2 100644 --- a/tests/data/test_template_tokenization.py +++ b/tests/data/test_template_tokenization.py @@ -53,11 +53,21 @@ def test_tokenize_chat_example_malformed(): } wrong_example_type = ['this is not a dictionary'] wrong_messages_type = {'messages': 'this is not a list of messages'} + wrong_role = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!', + }, { + 'role': 'misnamed_assistant', + 'content': 'user message not followed by an assistant label', + }], + } malformed_chat_examples = [ too_few_messages, no_content, ends_with_user_role, no_assistant_message, + wrong_role, ] my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) for example in malformed_chat_examples: From 8cf3d8718763e7d9760b6f4df5780e6c23e18e0f Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:12:55 -0400 Subject: [PATCH 32/42] Add FileExtensionNotFoundError (#1564) --- llmfoundry/data/finetuning/dataloader.py | 7 +++---- llmfoundry/utils/exceptions.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 612b8d6385..3e64360a67 
100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -28,6 +28,7 @@ from llmfoundry.data.text_data import build_streams from llmfoundry.utils.config_utils import to_dict_container from llmfoundry.utils.exceptions import ( + FinetuningFileNotFoundError, MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError, ) @@ -585,10 +586,8 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS ] - raise FileNotFoundError( - f'Could not find a file with any of ' + \ - f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ - f'at {files_searched}', + raise FinetuningFileNotFoundError( + files_searched=files_searched, ) from e else: log.debug( diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 9cbea2cac8..4a4321637f 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -481,3 +481,19 @@ def __reduce__(self): def __str__(self): return self.message + + +class FinetuningFileNotFoundError(UserError): + """Error thrown when a file can't be found with any supported extension.""" + + def __init__(self, files_searched: list[str]) -> None: + from llmfoundry.data.finetuning.tasks import SUPPORTED_EXTENSIONS + message = ( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}' + ) + super().__init__( + message, + files_searched=files_searched, + ) From a462f037c62fd6d23e7bdfa346d49e1598025052 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:37:16 -0400 Subject: [PATCH 33/42] Add InvalidConversationError (#1565) --- llmfoundry/data/finetuning/tasks.py | 7 ++++--- llmfoundry/utils/exceptions.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index c81856b8ba..a68a611c52 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -76,6 +76,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: DatasetTooSmallError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, + InvalidConversationError, InvalidExampleTypeError, InvalidFileExtensionError, InvalidLastChatMessageRoleError, @@ -270,17 +271,17 @@ def slice_out_last_turn( if conversation_through_previous_turn != full_conversation[:len( conversation_through_previous_turn, )]: - raise ValueError( + raise InvalidConversationError( f'The full conversation must start with the conversation through the previous turn. {conversation_through_previous_turn=}, {full_conversation=}', ) if conversation_through_previous_turn != prompt_with_history[:len( conversation_through_previous_turn, )]: - raise ValueError( + raise InvalidConversationError( f'The prompt_with_history must start with the conversation through the previous turn. {conversation_through_previous_turn=}, {prompt_with_history=}', ) if prompt_with_history != full_conversation[:len(prompt_with_history)]: - raise ValueError( + raise InvalidConversationError( f'prompt_with_history must be the first part of the full conversation. 
{prompt_with_history=}, {full_conversation=}', ) prompt = prompt_with_history[len(conversation_through_previous_turn):] diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 4a4321637f..81cfb21d11 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -497,3 +497,18 @@ def __init__(self, files_searched: list[str]) -> None: message, files_searched=files_searched, ) + + +class InvalidConversationError(UserError): + """Error thrown when the conversation is invalid.""" + + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + def __reduce__(self): + # Return a tuple of class, a tuple of arguments, and optionally state + return (InvalidConversationError, (self.message,)) + + def __str__(self): + return self.message From 24fec7908cdd5b5de31a569049e2dd366d307251 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 2 Oct 2024 11:26:55 -0700 Subject: [PATCH 34/42] Release docker img (#1547) Co-authored-by: v-chen_data --- .github/workflows/release.yaml | 64 ++++++++++++++++++++++++++++++++++ Dockerfile | 9 +++-- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c09f9bb7a5..3617732c8f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -50,3 +50,67 @@ jobs: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ + + build-docker: + name: Build llm-foundry Release Docker Image + needs: + - code-quality + runs-on: mosaic-8wide + if: github.repository_owner == 'mosaicml' + steps: + - name: Checkout source + uses: actions/checkout@v3 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Define Docker tags + id: define-tags + run: | + BRANCH_NAME="${{ github.ref_name }}" + TAG_NAME=$(echo "${BRANCH_NAME}" | sed 's/\//_/g') + echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV + + echo "DOCKER_TAG=mosaicml/llm-foundry:release_${TAG_NAME}" >> $GITHUB_ENV + echo "AWS_DOCKER_TAG=mosaicml/llm-foundry:release_${TAG_NAME}_aws" >> $GITHUB_ENV + echo "LATEST_TAG=mosaicml/llm-foundry:release-latest" >> $GITHUB_ENV + echo "AWS_LATEST_TAG=mosaicml/llm-foundry:release_aws-latest" >> $GITHUB_ENV + + + - name: Build and push AWS Docker image + uses: docker/build-push-action@v3 + with: + context: . + file: Dockerfile + push: true + tags: | + ${{ env.AWS_DOCKER_TAG }} + ${{ env.AWS_LATEST_TAG }} + build-args: | + BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws + BRANCH_NAME=${{ env.BRANCH_NAME }} + TE_COMMIT=901e5d2 + DEP_GROUPS=[all] + KEEP_FOUNDRY=true + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . 
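+          # Non-AWS variant: same build args apart from the plain (non-AWS)
+          # CUDA base image below.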
+ file: Dockerfile + push: true + tags: | + ${{ env.DOCKER_TAG }} + ${{ env.LATEST_TAG }} + build-args: | + BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04 + BRANCH_NAME=${{ env.BRANCH_NAME }} + TE_COMMIT=901e5d2 + DEP_GROUPS=[all] + KEEP_FOUNDRY=true diff --git a/Dockerfile b/Dockerfile index ca52532395..a9d44bfa27 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,7 @@ FROM $BASE_IMAGE ARG BRANCH_NAME ARG DEP_GROUPS ARG TE_COMMIT +ARG KEEP_FOUNDRY=false ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0" @@ -21,5 +22,9 @@ RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=4 MAX_JOBS=4 pip install g # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" -RUN pip uninstall -y llm-foundry -RUN rm -rf llm-foundry + +# Conditionally uninstall llm-foundry and remove its directory +RUN if [ "$KEEP_FOUNDRY" != "true" ]; then \ + pip uninstall -y llm-foundry && \ + rm -rf /llm-foundry; \ + fi From 214305fb90070e6aeb054c971d5ab6e08e84671b Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Wed, 2 Oct 2024 14:47:57 -0700 Subject: [PATCH 35/42] Revert FT dataloader changes from #1561, keep #1564 (#1566) --- llmfoundry/data/finetuning/dataloader.py | 52 ++++++++++++++++-------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 3e64360a67..ca841979f9 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -576,24 +576,40 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - with dist.busy_wait_for_local_rank_zero(finetune_dir): - if dist.get_local_rank() == 0: - try: - get_file(path=name, destination=destination, overwrite=True) - except FileNotFoundError as e: - if extension == SUPPORTED_EXTENSIONS[-1]: - files_searched = [ - f'{name}/{split}{ext}' - for ext in SUPPORTED_EXTENSIONS - ] - raise FinetuningFileNotFoundError( - files_searched=files_searched, - ) from e - else: - log.debug( - f'Could not find {name}, looking for another extension', - ) - continue + signal_file_path = os.path.join( + finetune_dir, + f'.node_{dist.get_node_rank()}_local_rank0_completed', + ) + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{name}/{split}{ext}' for ext in SUPPORTED_EXTENSIONS + ] + raise FinetuningFileNotFoundError( + files_searched=files_searched, + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension', + ) + continue + + os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_download') + + # Avoid the collective call until the local rank zero has finished trying to download the dataset + # so that we don't timeout for large downloads. 
This syncs all processes on the node + with dist.local_rank_zero_download_and_wait(signal_file_path): + # Then, wait to ensure every node has finished trying to download the dataset + dist.barrier() + + # clean up signal file + if dist.get_local_rank() == 0: + os.remove(signal_file_path) dist.barrier() break return finetune_dir From 4bbb4a5cddeef5fa22f6d7b1e47bbefb9f0edc30 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 2 Oct 2024 18:11:26 -0400 Subject: [PATCH 36/42] Cleanup TP (#1556) Co-authored-by: Eitan Turok --- llmfoundry/command_utils/train.py | 13 ++-- llmfoundry/utils/config_utils.py | 17 ++++- tests/data_utils.py | 1 + tests/tp/test_tp_strategies.py | 114 +++++++++++++++++++++++++++--- 4 files changed, 127 insertions(+), 18 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 29878714f6..9a5931ddba 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,7 +5,6 @@ import os import time import warnings -from copy import deepcopy from typing import Any, Optional, Union import torch @@ -19,7 +18,7 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import dist, get_device, reproducibility +from composer.utils import TPConfig, dist, get_device, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -332,7 +331,7 @@ def train(cfg: DictConfig) -> Trainer: ) # Optional tp config - tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + tp_config: Optional[Union[TPConfig, dict[str, Any]]] = train_cfg.tp_config # Warn if FSDP or TP is enabled but user only has 1 GPU if dist.get_world_size( @@ -351,7 +350,7 @@ def train(cfg: DictConfig) -> Trainer: # Initialize context init_context = process_init_device(model_config, fsdp_config, tp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) - logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) + logged_cfg.update({'tp_config': tp_config}, merge=True) # Build tokenizer log.info('Building tokenizer...') @@ -517,9 +516,9 @@ def train(cfg: DictConfig) -> Trainer: # TP config if tp_config is not None: - strategy = tp_config.pop('strategy', None) - assert isinstance(strategy, str), '`strategy` must be in `tp_config`.' - tp_config['layer_plan'] = build_tp_strategies(strategy, model) + strategy = tp_config.pop('strategy') + layer_plan = build_tp_strategies(strategy, model) + tp_config = TPConfig(**tp_config, layer_plan=layer_plan) # Parallelism config parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index c22495993c..18112c18aa 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -288,7 +288,6 @@ def apply_transforms_to_config( for transform in transform_functions: cfg = transform(cfg) - return cfg @@ -538,6 +537,22 @@ def process_init_device( # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + if tp_config is not None: + # Check tp_config has required fields + if 'strategy' not in tp_config or 'tensor_parallel_degree' not in tp_config: + raise ValueError( + "`tp_config` requires 'strategy' and 'tensor_parallel_degree' values. 
", + ) + + # Check we are not using tensor parallelism with MoEs + if 'ffn_config' in model_cfg and model_cfg['ffn_config'].get( + 'ffn_type', + None, + ) in ffns_with_megablocks: + raise ValueError( + 'Tensor Parallelism is not currently supported for MoE models.', + ) + # Check we are not using tensor parallelism with MoEs if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: diff --git a/tests/data_utils.py b/tests/data_utils.py index 1f6c26b72e..67c1be9f6e 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -251,6 +251,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str: shutil.copytree( os.path.join(c4_dir, 'val_xxsmall'), os.path.join(c4_dir, mocked_split), + dirs_exist_ok=True, ) assert os.path.exists(c4_dir) return c4_dir diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py index fd2fa384ce..6dfc30759b 100644 --- a/tests/tp/test_tp_strategies.py +++ b/tests/tp/test_tp_strategies.py @@ -1,10 +1,16 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import os +import pathlib +import shutil from pathlib import Path from tempfile import TemporaryDirectory +from typing import Optional import pytest +from composer import Trainer +from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.distributed._tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( @@ -96,9 +102,94 @@ def test_ffn_tp_strategy(): raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') +def get_cfg( + dataset_name: pathlib.Path, + tp_strategy: Optional[str] = None, + tp_degree: Optional[int] = None, + yaml_path: str = 'scripts/train/yamls/pretrain/testing.yaml', +): + # Read cfg from `testing.yaml` + from tests.fixtures.autouse import REPO_DIR + cfg_path: str = os.path.join(REPO_DIR, yaml_path) + with open(cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + assert isinstance(train_cfg, DictConfig) + + # Set the name, dataset, loggers + train_cfg.variables.run_name = 'fsdp-test' + train_cfg.variables.data_local = dataset_name + train_cfg.loggers = DictConfig({'inmemory': DictConfig({})}) + + # Set batch size, duration + train_cfg.global_train_batch_size = 16 + train_cfg.device_eval_batch_size = 2 + train_cfg.device_train_microbatch_size = 2 + train_cfg.max_duration = '1ep' + train_cfg.eval_interval = '1ep' + + # TP needs unfused qkv (even without TP, we unfuse qkv for a fair comparison) + train_cfg.model.attn_cfg = {'fused_qkv': False} + + if tp_strategy and tp_degree: + train_cfg.variables.run_name = 'tp-test' + train_cfg.tp_config = { + 'strategy': tp_strategy, + 'tensor_parallel_degree': tp_degree, + } + + return train_cfg + + +def get_loss_array(trainer: Trainer): + logger = trainer.logger.destinations[0] + loss_array = logger.get_timeseries('loss/train/total')['loss/train/total' + ] # type: ignore + return loss_array + + +@pytest.mark.gpu +@pytest.mark.world_size(4) +@pytest.mark.parametrize('tp_degree', [2]) +@pytest.mark.parametrize('tp_strategy', ['ffn']) +def test_tp_train(tp_degree: int, tp_strategy: str): + """Test that we can train with FSDP-TP.""" + my_dir = Path('/my-data-dir') + + try: + # create c4 dataset + if my_dir.is_dir() and my_dir.exists(): + shutil.rmtree(my_dir) + my_dir.mkdir(parents=True) + tp_dataset_name = create_c4_dataset_xxsmall(my_dir) + + # Train model with TP and get loss + tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree) + tp_trainer = 
train(tp_cfg)
+ tp_trainer.close()
+ tp_loss = get_loss_array(tp_trainer)
+
+ # Compare loss and expected loss for TP
+ import numpy as np
+ expected_tp_loss = np.array([
+ 12.02126884,
+ 11.96996498,
+ 12.02957344,
+ 11.97966957,
+ 11.99677086,
+ 11.96347618,
+ ])
+ np.testing.assert_allclose(tp_loss, expected_tp_loss)
+ except Exception as e:
+ raise e
+ finally:
+ # always remove the directory
+ if os.path.isdir(my_dir):
+ shutil.rmtree(my_dir)
+
+
@pytest.mark.gpu
-def test_no_tp_with_one_gpu():
- """Test that when we have one GPU, we use DDP and not FSDP-TP."""
+def test_tp_train_with_one_gpu():
+ """Test that when we have one GPU, we train with DDP and not FSDP-TP."""
 with TemporaryDirectory() as tmp_path:
 # Make `train_cfg`` with a tensor parallelism strategy
 dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
@@ -115,19 +206,22 @@ def test_no_tp_with_one_gpu():


@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies
-def test_no_tp_with_moes():
+@pytest.mark.parametrize('tp_degree', [2])
+@pytest.mark.parametrize('tp_strategy', ['ffn'])
+def test_tp_train_with_moes(tp_degree: int, tp_strategy: str):
 """Test that tensor parallelism is not compatible with MoEs."""
 # Make `cfg` for MoE model, fsdp, and tp
- train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml'
- with open(train_cfg_path, 'r', encoding='utf-8') as f:
- train_cfg = om.load(f)
- model_cfg = train_cfg.model
- fsdp_cfg = train_cfg.fsdp_config
- tp_cfg = {'strategy': 'ffn'}
+ moe_yaml_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml'
+ dataset_name = Path('') # dummy dataset path
+ train_cfg = get_cfg(dataset_name, tp_strategy, tp_degree, moe_yaml_path)

 # Expect an error
 with pytest.raises(
 ValueError,
 match='Tensor Parallelism is not currently supported for MoE models.',
 ):
- process_init_device(model_cfg, fsdp_cfg, tp_cfg)
+ process_init_device(
+ train_cfg.model,
+ train_cfg.fsdp_config,
+ train_cfg.tp_config,
+ )

From 788c1f59ca5832842007fca620ec6bbf3abe9611 Mon Sep 17 00:00:00 2001
From: Abhay Gupta
Date: Fri, 4 Oct 2024 13:36:55 -0700
Subject: [PATCH 37/42] Changes for dataset swap callback (#1569)

---
 llmfoundry/callbacks/dataset_swap_callback.py | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/llmfoundry/callbacks/dataset_swap_callback.py b/llmfoundry/callbacks/dataset_swap_callback.py
index 415819e428..d95846bd34 100644
--- a/llmfoundry/callbacks/dataset_swap_callback.py
+++ b/llmfoundry/callbacks/dataset_swap_callback.py
@@ -8,7 +8,7 @@
 """
 import logging
-from typing import Any
+from dataclasses import dataclass

 from composer.core import State
 from composer.loggers import Logger
@@ -23,6 +23,12 @@
 __all__ = ['DatasetSwap']

+@dataclass
+class DatasetSwapStateDict:
+ dataset_index: int
+ all_dataset_configs: list
+
+
 @experimental_class('DatasetSwap callback')
 class DatasetSwap(CallbackWithConfig):
 """Starts an epoch with a different dataset when resuming from a checkpoint.
@@ -105,10 +111,18 @@ def after_load(self, state: State, logger: Logger):

 def state_dict(self):
 return {
- 'dataset_index': self.dataset_index,
- 'all_dataset_configs': self.all_dataset_configs,
+ 'callback_state':
+ DatasetSwapStateDict(
+ dataset_index=self.dataset_index,
+ all_dataset_configs=self.all_dataset_configs,
+ ),
 }

- def load_state_dict(self, state: dict[str, Any]):
- self.saved_dataset_index = state.get('dataset_index', 0)
- self.all_dataset_configs = state.get('all_dataset_configs', [])
+ def load_state_dict(self, state: dict[str, DatasetSwapStateDict]):
+ _dummy_obj = DatasetSwapStateDict(
+ dataset_index=0,
+ all_dataset_configs=[],
+ )
+ _state_obj = state.get('callback_state', _dummy_obj)
+ self.saved_dataset_index = getattr(_state_obj, 'dataset_index')
+ self.all_dataset_configs = getattr(_state_obj, 'all_dataset_configs')

From 56e45732d52b2e5cdb3bf67a936c6e1cebbeadbd Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Tue, 10 Sep 2024 09:04:08 -0700
Subject: [PATCH 38/42] refactor hf download

---
 llmfoundry/data/finetuning/tasks.py | 115 +++++++++++++++++-----------
 1 file changed, 71 insertions(+), 44 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index a68a611c52..f894a22a6a 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -705,7 +705,73 @@ def state_dict(self, num_samples: int,
 num_samples=num_samples,
 from_beginning=from_beginning,
 )
+
+def download_hf_dataset_if_needed(
+ dataset_name: str,
+ hf_kwargs: Optional[dict[str, Any]] = None
+) -> str:
+ """
+ Download a HuggingFace dataset locally if it does not already exist.
+
+ Args:
+ dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s)
+ directory or object store bucket containing the file {split}.jsonl.
+ safe_load (bool): Whether to enforce safe loading of the dataset.
+ hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`.
+
+ Returns:
+ str: The local path to the dataset.
+ """
+ if hf_kwargs is None:
+ hf_kwargs = {}
+
+ if not os.path.isdir(dataset_name):
+ local_dataset_dir = os.path.join(
+ DOWNLOADED_FT_DATASETS_DIRPATH,
+ dataset_name,
+ )
+
+ if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+ # Safely load the dataset from HF Hub with restricted file types.
+ hf_hub.snapshot_download(
+ dataset_name,
+ repo_type='dataset',
+ allow_patterns=[
+ '*' + ext for ext in SUPPORTED_EXTENSIONS
+ ],
+ token=hf_kwargs.get('token', None),
+ revision=hf_kwargs.get('revision', None),
+ local_dir_use_symlinks=False,
+ local_dir=local_dataset_dir,
+ )
+ if _is_empty_or_nonexistent(dirpath=local_dataset_dir):
+ log.error("Failed to safely load the dataset from HF Hub.")
+ raise InvalidFileExtensionError(
+ dataset_name,
+ SUPPORTED_EXTENSIONS,
+ )
+ # Set dataset_name to the downloaded location.
+ dataset_name = local_dataset_dir
+
+ # Ensure dataset_name is a local directory path (using abspath to avoid confusion).
+ dataset_name = os.path.abspath(dataset_name)
+
+ # Check that the directory contains only allowed file types.
+ dataset_files = [ + f for _, _, files in os.walk(dataset_name) for f in files + ] + if not all( + Path(f).suffix in SUPPORTED_EXTENSIONS + + HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' + for f in dataset_files + ): + log.error(f"Invalid file extension found in dataset during safe load.") + raise InvalidFileExtensionError( + dataset_name, + SUPPORTED_EXTENSIONS, + ) + return dataset_name class DatasetConstructor: @@ -904,50 +970,11 @@ def build_from_hf( filtered_dataset = None try: if safe_load: - if not os.path.isdir(dataset_name): - # dataset_name is not a local dir path, download if needed. - local_dataset_dir = os.path.join( - DOWNLOADED_FT_DATASETS_DIRPATH, - dataset_name, - ) - - if _is_empty_or_nonexistent(dirpath=local_dataset_dir): - # Safely load a dataset from HF Hub with restricted file types. - hf_hub.snapshot_download( - dataset_name, - repo_type='dataset', - allow_patterns=[ - '*' + ext for ext in SUPPORTED_EXTENSIONS - ], - token=hf_kwargs.get('token', None), - revision=hf_kwargs.get('revision', None), - local_dir_use_symlinks=False, - local_dir=local_dataset_dir, - ) - if _is_empty_or_nonexistent(dirpath=local_dataset_dir): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) - # Set dataset_name to the downloaded location. - dataset_name = local_dataset_dir - - # dataset_name is a local dir path. Use the abspath to prevent confusion. - dataset_name = os.path.abspath(dataset_name) - - # Ensure that the local dir contains only allowed file types. - dataset_files = [ - f for _, _, files in os.walk(dataset_name) for f in files - ] - if not all( - Path(f).suffix in SUPPORTED_EXTENSIONS + - HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' - for f in dataset_files - ): - raise InvalidFileExtensionError( - dataset_name, - SUPPORTED_EXTENSIONS, - ) + dataset_name = download_hf_dataset_if_needed( + dataset_name, + safe_load, + hf_kwargs, + ) dataset = hf_datasets.load_dataset( dataset_name, From 983a32d4d37a0ab7c79fd481b57916fa67c45d1b Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Thu, 12 Sep 2024 13:34:59 -0700 Subject: [PATCH 39/42] split_eval_set skeleton --- llmfoundry/command_utils/__init__.py | 36 +++++++------ .../command_utils/data_prep/split_eval_set.py | 37 +++++++++++++ scripts/data_prep/split_eval_set.py | 54 +++++++++++++++++++ 3 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 llmfoundry/command_utils/data_prep/split_eval_set.py create mode 100644 scripts/data_prep/split_eval_set.py diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 0226c4f408..5407b723cc 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -20,6 +20,7 @@ convert_text_to_mds, convert_text_to_mds_from_args, ) +from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args from llmfoundry.command_utils.eval import ( eval_from_yaml, evaluate, @@ -33,21 +34,22 @@ ) __all__ = [ - 'train', - 'train_from_yaml', - 'TrainConfig', - 'TRAIN_CONFIG_KEYS', - 'validate_config', - 'evaluate', - 'eval_from_yaml', - 'convert_dataset_hf', - 'convert_dataset_hf_from_args', - 'convert_dataset_json', - 'convert_dataset_json_from_args', - 'convert_finetuning_dataset_from_args', - 'convert_finetuning_dataset', - 'convert_text_to_mds', - 'convert_text_to_mds_from_args', - 'convert_delta_to_json_from_args', - 'fetch_DT', + "train", + "train_from_yaml", + "TrainConfig", + "TRAIN_CONFIG_KEYS", + "validate_config", + "evaluate", + "eval_from_yaml", + 
"convert_dataset_hf", + "convert_dataset_hf_from_args", + "convert_dataset_json", + "convert_dataset_json_from_args", + "convert_finetuning_dataset_from_args", + "convert_finetuning_dataset", + "convert_text_to_mds", + "convert_text_to_mds_from_args", + "convert_delta_to_json_from_args", + "fetch_DT", + "split_eval_set_from_args", ] diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py new file mode 100644 index 0000000000..01205cba15 --- /dev/null +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -0,0 +1,37 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import json +from enum import Enum + +import datasets +from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed + + +class SupportedDataFormats(Enum): + REMOTE_JSONL = "jsonl" # UC JSONL + DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL + HF = "huggingface" + + +def validate_data_path(data_path: str) -> None: + """ + Validates the data path and returns the format of the data. + + Args: + data_path (str): Path to the training dataset + """ + + +def split_eval_set_from_args() -> None: + """ + Args: + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + output_path (str): Directory to save the split dataset + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset + """ + pass diff --git a/scripts/data_prep/split_eval_set.py b/scripts/data_prep/split_eval_set.py new file mode 100644 index 0000000000..ee8bfee453 --- /dev/null +++ b/scripts/data_prep/split_eval_set.py @@ -0,0 +1,54 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from argparse import ArgumentParser + +from llmfoundry.command_utils import split_eval_set_from_args + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Split training dataset into train and eval sets", + ) + parser.add_argument( + "--data_path_folder", required=True, type=str, help="Path to the training dataset folder" + ) + parser.add_argument( + "--data_path_split", required=True, type=str, help="Path to the training dataset split" + ) + parser.add_argument( + "--output_path", + required=True, + type=str, + help="Path to save the split dataset", + ) + parser.add_argument( + "--eval_split_ratio", + required=False, + type=float, + default=0.1, + help="Ratio of the dataset to use for evaluation. 
The remainder will be used for training", + ) + parser.add_argument( + "--max_eval_samples", + required=False, + type=int, + default=None, + help="Maximum number of samples to include in the eval set", + ) + parser.add_argument( + "--seed", + required=False, + type=int, + default=42, + help="Random seed for splitting the dataset", + ) + args = parser.parse_args() + split_eval_set_from_args( + data_path_folder=args.data_path_folder, + data_path_split=args.data_path_split, + output_path=args.output_path, + eval_split_ratio=args.eval_split_ratio, + max_eval_samples=args.max_eval_samples, + seed=args.seed, + ) From d3d587da142ac67c74fc6ba60f5a24e219910078 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Sun, 15 Sep 2024 16:22:57 -0700 Subject: [PATCH 40/42] splitting script --- .../command_utils/data_prep/split_eval_set.py | 162 ++++++++++++++++-- llmfoundry/data/finetuning/tasks.py | 6 +- 2 files changed, 152 insertions(+), 16 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py index 01205cba15..f6afc8722d 100644 --- a/llmfoundry/command_utils/data_prep/split_eval_set.py +++ b/llmfoundry/command_utils/data_prep/split_eval_set.py @@ -1,31 +1,167 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import logging import os +import re import json -from enum import Enum +import contextlib +import datasets as hf_datasets +import numpy as np +from typing import Optional -import datasets -from llmfoundry.data.finetuning.tasks import download_hf_dataset_if_needed +from composer.utils import get_file +from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data -class SupportedDataFormats(Enum): - REMOTE_JSONL = "jsonl" # UC JSONL - DELTA_JSONL = "delta_jsonl" # Delta table preprocessed to JSONL - HF = "huggingface" +DELTA_JSONL_REGEX = re.compile(r"^tmp-t$") +REMOTE_OBJECT_STORE_FILE_REGEX = re.compile( + r"^((s3|oci|gs):\/\/|dbfs:\/Volumes\/)[/a-zA-Z0-9 ()_\-.]+$" +) +HF_REGEX = re.compile(r"^[/a-zA-Z0-9 ()_\-.]+$") +TEMP_DIR = "tmp-split" -def validate_data_path(data_path: str) -> None: +log = logging.getLogger(__name__) + +import sys + +log.setLevel(logging.DEBUG) +log.addHandler(logging.StreamHandler(sys.stdout)) + + +def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str: """ - Validates the data path and returns the format of the data. + Prepares dataset as a local JSONL file. Downloads from remote object store or HF if necessary. + + This function is intended to be invoked by DBX Finetuning. + Thus, it assumes the provided data is in one of three formats: + 1. A Delta table converted to JSONL at 'tmp-t/{data_path_split}-00000-of-00001.jsonl` + using the 'llmfoundry.scripts.convert_delta_to_json.py' script. + 2. A JSONL stored as a remote object store file (e.g. S3, OCI, GCS) + 3. 
A Hugging Face dataset Args: - data_path (str): Path to the training dataset + data_path_folder (str): Path to the training dataset folder + data_path_split (str): Data split + + Returns: + str: Path to the training dataset """ + os.makedirs(TEMP_DIR, exist_ok=True) + + if DELTA_JSONL_REGEX.match(data_path_folder): + data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl") + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl" + ) + remote_path = f"{data_path_folder}/{data_path_split}.jsonl" + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + try: + get_file(remote_path, data_path, overwrite=True) + except FileNotFoundError as e: + # TODO: error handling + raise e + + elif HF_REGEX.match(data_path_folder): + log.info( + f"Downloading dataset from Hugging Face: {data_path_folder} with split {data_path_split}" + ) + # TODO: maybe add support for HF kwargs + local_hf_path = maybe_safe_download_hf_data(data_path_folder) + # convert dataset split to JSONL + dataset = hf_datasets.load_dataset( + local_hf_path, + split=data_path_split, + ) + data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl") + with open(data_path, "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") + + else: + # TODO: error handling + raise ValueError( + f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset." + ) + + if not os.path.exists(data_path): + # TODO: error handling + raise FileNotFoundError(f"File {data_path} does not exist.") + + return data_path + +@contextlib.contextmanager +def temp_seed(seed: int): + state = np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) -def split_eval_set_from_args() -> None: + +def _split_examples( + data_path: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int], + seed: Optional[int] = None, +) -> None: + """ + Splits the dataset into training and evaluation sets. + + Args: + data_path (str): Path to the training dataset (local jsonl file) + eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training + max_eval_samples (int): Maximum number of samples to include in the eval set. 
If None, all eval_split_ratio * train_dataset_size samples will be used + seed (int): Random seed for splitting the dataset """ + # first pass: count total number of lines and determine sample size + total_lines = 0 + with open(data_path, "r") as infile: + for _ in infile: + total_lines += 1 + sample_size = int(eval_split_ratio * total_lines) + if max_eval_samples is not None: + sample_size = min(sample_size, max_eval_samples) + + with temp_seed(seed) if seed is not None else contextlib.nullcontext(): + random_numbers = np.random.rand(total_lines) + sample_indices = set(np.argsort(random_numbers)[:sample_size]) + + # second pass: sample indices + with open(data_path, "r") as infile, open( + os.path.join(output_path, "train.jsonl"), "w" + ) as train_outfile, open(os.path.join(output_path, "eval.jsonl"), "w") as eval_outfile: + for idx, line in enumerate(infile): + if idx in sample_indices: + eval_outfile.write(line) + else: + train_outfile.write(line) + + log.info( + f"Split {data_path} into train set of size {total_lines - sample_size} and eval set of size {sample_size}." + ) + + +def split_eval_set_from_args( + data_path_folder: str, + data_path_split: str, + output_path: str, + eval_split_ratio: float, + max_eval_samples: Optional[int] = None, + seed: Optional[int] = None, +) -> None: + """ + A wrapper for split_eval_set that parses arguments + Args: data_path_folder (str): Path to the training dataset folder data_path_split (str): Data split @@ -34,4 +170,6 @@ def split_eval_set_from_args() -> None: max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - pass + os.makedirs(output_path, exist_ok=True) + data_path = maybe_download_data_as_json(data_path_folder, data_path_split) + _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index f894a22a6a..f9ffaf4463 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -706,7 +706,7 @@ def state_dict(self, num_samples: int, from_beginning=from_beginning, ) -def download_hf_dataset_if_needed( +def maybe_safe_download_hf_data( dataset_name: str, hf_kwargs: Optional[dict[str, Any]] = None ) -> str: @@ -716,7 +716,6 @@ def download_hf_dataset_if_needed( Args: dataset_name (str): The name of the HuggingFace dataset to use. Can be a remote http(s) directory or object store bucket containing the file {split}.jsonl. - safe_load (bool): Whether to enforce safe loading of the dataset. hf_kwargs (dict, optional): Additional kwargs to pass to `datasets.load_dataset`. 
Returns:

@@ -970,9 +969,8 @@ def build_from_hf(
 filtered_dataset = None
 try:
 if safe_load:
- dataset_name = download_hf_dataset_if_needed(
+ dataset_name = maybe_safe_download_hf_data(
 dataset_name,
- safe_load,
 hf_kwargs,
 )

 dataset = hf_datasets.load_dataset(
 dataset_name,

From b921b309829ea9c4f4b428a22cf07cf34a2333e0 Mon Sep 17 00:00:00 2001
From: Matthew Ding
Date: Mon, 16 Sep 2024 00:58:53 -0700
Subject: [PATCH 41/42] error handling and testing

---
 llmfoundry/command_utils/__init__.py | 6 +-
 .../command_utils/data_prep/split_eval_set.py | 38 ++--
 .../data_prep/test_split_eval_set.py | 163 ++++++++++++++++++
 3 files changed, 183 insertions(+), 24 deletions(-)
 create mode 100644 tests/a_scripts/data_prep/test_split_eval_set.py

diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py
index 5407b723cc..8757f3b1bc 100644
--- a/llmfoundry/command_utils/__init__.py
+++ b/llmfoundry/command_utils/__init__.py
@@ -20,7 +20,10 @@
 convert_text_to_mds,
 convert_text_to_mds_from_args,
 )
-from llmfoundry.command_utils.data_prep.split_eval_set import split_eval_set_from_args
+from llmfoundry.command_utils.data_prep.split_eval_set import (
+ split_eval_set_from_args,
+ split_examples,
+)
 from llmfoundry.command_utils.eval import (
 eval_from_yaml,
 evaluate,
@@ -52,4 +55,5 @@
 "convert_delta_to_json_from_args",
 "fetch_DT",
 "split_eval_set_from_args",
+ "split_examples",
 ]
diff --git a/llmfoundry/command_utils/data_prep/split_eval_set.py b/llmfoundry/command_utils/data_prep/split_eval_set.py
index f6afc8722d..b4b150f81f 100644
--- a/llmfoundry/command_utils/data_prep/split_eval_set.py
+++ b/llmfoundry/command_utils/data_prep/split_eval_set.py
@@ -10,7 +10,7 @@
 import numpy as np
 from typing import Optional

-from composer.utils import get_file
+import composer.utils as utils
 from llmfoundry.data.finetuning.tasks import maybe_safe_download_hf_data

@@ -24,11 +24,6 @@

 log = logging.getLogger(__name__)

-import sys
-
-log.setLevel(logging.DEBUG)
-log.addHandler(logging.StreamHandler(sys.stdout))
-

 def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) -> str:
 """
@@ -51,22 +46,16 @@ def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) ->
 os.makedirs(TEMP_DIR, exist_ok=True)

 if DELTA_JSONL_REGEX.match(data_path_folder):
+ log.info(f"Dataset is converted from Delta table. Using local file {data_path_folder}")
 data_path = os.path.join(data_path_folder, f"{data_path_split}-00000-of-00001.jsonl")
- if not os.path.exists(data_path):
- # TODO: error handling
- raise FileNotFoundError(f"File {data_path} does not exist.")

- if REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
+ elif REMOTE_OBJECT_STORE_FILE_REGEX.match(data_path_folder):
 log.info(
 f"Downloading dataset from remote object store: {data_path_folder}{data_path_split}.jsonl"
 )
 remote_path = f"{data_path_folder}/{data_path_split}.jsonl"
 data_path = os.path.join(TEMP_DIR, f"{data_path_split}.jsonl")
- try:
- get_file(remote_path, data_path, overwrite=True)
- except FileNotFoundError as e:
- # TODO: error handling
- raise e
+ utils.get_file(remote_path, data_path, overwrite=True)

 elif HF_REGEX.match(data_path_folder):
 log.info(
@@ -85,20 +74,21 @@ def maybe_download_data_as_json(data_path_folder: str, data_path_split: str) ->
 f.write(json.dumps(example) + "\n")

 else:
- raise ValueError(
- f"Unrecognized data_path_folder: {data_path_folder}. Must be a Delta table, remote object store file, or Hugging Face dataset."
+ f"Encountered unknown data path format when splitting dataset: {data_path_folder} with split {data_path_split}" ) if not os.path.exists(data_path): - # TODO: error handling - raise FileNotFoundError(f"File {data_path} does not exist.") + raise FileNotFoundError( + f"Expected dataset file at {data_path} for splitting, but it does not exist." + ) return data_path @contextlib.contextmanager def temp_seed(seed: int): + log.info(f"Setting random seed to {seed}") state = np.random.get_state() np.random.seed(seed) try: @@ -107,11 +97,11 @@ def temp_seed(seed: int): np.random.set_state(state) -def _split_examples( +def split_examples( data_path: str, output_path: str, eval_split_ratio: float, - max_eval_samples: Optional[int], + max_eval_samples: Optional[int] = None, seed: Optional[int] = None, ) -> None: """ @@ -119,10 +109,13 @@ def _split_examples( Args: data_path (str): Path to the training dataset (local jsonl file) + output_path (str): Directory to save the split dataset eval_split_ratio (float): Ratio of the dataset to use for evaluation. The remainder will be used for training max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ + os.makedirs(output_path, exist_ok=True) + # first pass: count total number of lines and determine sample size total_lines = 0 with open(data_path, "r") as infile: @@ -170,6 +163,5 @@ def split_eval_set_from_args( max_eval_samples (int): Maximum number of samples to include in the eval set. If None, all eval_split_ratio * train_dataset_size samples will be used seed (int): Random seed for splitting the dataset """ - os.makedirs(output_path, exist_ok=True) data_path = maybe_download_data_as_json(data_path_folder, data_path_split) - _split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) + split_examples(data_path, output_path, eval_split_ratio, max_eval_samples, seed) diff --git a/tests/a_scripts/data_prep/test_split_eval_set.py b/tests/a_scripts/data_prep/test_split_eval_set.py new file mode 100644 index 0000000000..a1b80b91cd --- /dev/null +++ b/tests/a_scripts/data_prep/test_split_eval_set.py @@ -0,0 +1,163 @@ +import os +import json +import pytest +import hashlib +from unittest.mock import patch + +from llmfoundry.command_utils import split_eval_set_from_args, split_examples + +# Default values +OUTPUT_DIR = "tmp-split" +TMPT_DIR = "tmp-t" +DATA_PATH_SPLIT = "train" +EVAL_SPLIT_RATIO = 0.1 +DEFAULT_FILE = TMPT_DIR + "/train-00000-of-00001.jsonl" + + +def calculate_file_hash(filepath: str) -> str: + with open(filepath, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest() + return file_hash + + +def count_lines(filepath: str) -> int: + with open(filepath, "r") as f: + return sum(1 for _ in f) + + +@pytest.fixture(scope="module", autouse=True) +def setup_and_teardown_module(): + # Setup: create local testing file + os.makedirs(TMPT_DIR, exist_ok=True) + with open(DEFAULT_FILE, "w") as f: + for i in range(1000): + f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + yield + + # Teardown: clean up output and tmp directories + os.system(f"rm -rf {OUTPUT_DIR}") + os.system(f"rm -rf {TMPT_DIR}") + + +def test_basic_split(): + """Test basic functionality on local file""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + assert 
os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + + +def test_basic_split_output_exists(): + """Test that split overwrites existing files in output directory""" + output_path = os.path.join(OUTPUT_DIR, "basic-test") + os.makedirs(output_path, exist_ok=True) + train_file = os.path.join(output_path, "train.jsonl") + eval_file = os.path.join(output_path, "eval.jsonl") + with open(train_file, "w") as f: + f.write("existing file train") + with open(eval_file, "w") as f: + f.write("existing file eval") + old_train_hash = calculate_file_hash(train_file) + old_eval_hash = calculate_file_hash(eval_file) + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + ) + assert calculate_file_hash(train_file) != old_train_hash + assert calculate_file_hash(eval_file) != old_eval_hash + + +def test_max_eval_samples(): + """Test case where max_eval_samples < eval_split_ratio * total samples""" + output_path = os.path.join(OUTPUT_DIR, "max-eval-test") + max_eval_samples = 50 + split_eval_set_from_args( + TMPT_DIR, + DATA_PATH_SPLIT, + output_path, + EVAL_SPLIT_RATIO, + max_eval_samples, + ) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert eval_lines == max_eval_samples + + +def test_eval_split_ratio(): + """Test case where max_eval_samples is not used""" + output_path = os.path.join(OUTPUT_DIR, "eval-split-test") + split_eval_set_from_args(TMPT_DIR, DATA_PATH_SPLIT, output_path, EVAL_SPLIT_RATIO) + original_data_lines = count_lines(DEFAULT_FILE) + eval_lines = count_lines(os.path.join(output_path, "eval.jsonl")) + assert abs(eval_lines - EVAL_SPLIT_RATIO * original_data_lines) < 1 # allow for rounding errors + + +def test_seed_consistency(): + """Test if the same seed generates consistent splits""" + output_path_1 = os.path.join(OUTPUT_DIR, "seed-test-1") + output_path_2 = os.path.join(OUTPUT_DIR, "seed-test-2") + split_examples(DEFAULT_FILE, output_path_1, EVAL_SPLIT_RATIO, seed=12345) + split_examples(DEFAULT_FILE, output_path_2, EVAL_SPLIT_RATIO, seed=12345) + train_hash_1 = calculate_file_hash(os.path.join(output_path_1, "train.jsonl")) + train_hash_2 = calculate_file_hash(os.path.join(output_path_2, "train.jsonl")) + eval_hash_1 = calculate_file_hash(os.path.join(output_path_1, "eval.jsonl")) + eval_hash_2 = calculate_file_hash(os.path.join(output_path_2, "eval.jsonl")) + + assert train_hash_1 == train_hash_2 + assert eval_hash_1 == eval_hash_2 + + output_path_3 = os.path.join(OUTPUT_DIR, "seed-test-3") + split_examples(DEFAULT_FILE, output_path_3, EVAL_SPLIT_RATIO, seed=54321) + train_hash_3 = calculate_file_hash(os.path.join(output_path_3, "train.jsonl")) + eval_hash_3 = calculate_file_hash(os.path.join(output_path_3, "eval.jsonl")) + + assert train_hash_1 != train_hash_3 + assert eval_hash_1 != eval_hash_3 + + +def test_hf_data_split(): + """Test splitting a dataset from Hugging Face""" + output_path = os.path.join(OUTPUT_DIR, "hf-split-test") + split_eval_set_from_args( + "databricks/databricks-dolly-15k", "train", output_path, EVAL_SPLIT_RATIO + ) + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def _mock_get_file(remote_path: str, data_path: str, overwrite: bool): + with open(data_path, "w") as f: + for i in range(1000): + 
f.write(json.dumps({"prompt": "hello world " + str(i), "response": "hi you!"}) + "\n") + + +def test_remote_store_data_split(): + """Test splitting a dataset from a remote store""" + output_path = os.path.join(OUTPUT_DIR, "remote-split-test") + with patch("composer.utils.get_file", side_effect=_mock_get_file) as mock_get_file: + split_eval_set_from_args( + "dbfs:/Volumes/test/test/test.jsonl", + "unique-split-name", + output_path, + EVAL_SPLIT_RATIO, + ) + mock_get_file.assert_called() + + assert os.path.isfile(os.path.join(output_path, "train.jsonl")) + assert os.path.isfile(os.path.join(output_path, "eval.jsonl")) + assert count_lines(os.path.join(output_path, "train.jsonl")) > 0 + assert count_lines(os.path.join(output_path, "eval.jsonl")) > 0 + + +def test_missing_delta_file_error(): + # expects file 'TMPT_DIR/missing-00000-of-00001.jsonl + with pytest.raises(FileNotFoundError): + split_eval_set_from_args(TMPT_DIR, "missing", OUTPUT_DIR, EVAL_SPLIT_RATIO) + + +def test_unknown_file_format_error(): + with pytest.raises(ValueError): + split_eval_set_from_args("s3:/path/to/file.jsonl", "train", OUTPUT_DIR, EVAL_SPLIT_RATIO) From 792500186e4b233a062085318e32f777a6261692 Mon Sep 17 00:00:00 2001 From: Matthew Ding Date: Mon, 16 Sep 2024 01:08:53 -0700 Subject: [PATCH 42/42] undo autoformat --- llmfoundry/command_utils/__init__.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llmfoundry/command_utils/__init__.py b/llmfoundry/command_utils/__init__.py index 8757f3b1bc..4f74fe6ec9 100644 --- a/llmfoundry/command_utils/__init__.py +++ b/llmfoundry/command_utils/__init__.py @@ -37,23 +37,23 @@ ) __all__ = [ - "train", - "train_from_yaml", - "TrainConfig", - "TRAIN_CONFIG_KEYS", - "validate_config", - "evaluate", - "eval_from_yaml", - "convert_dataset_hf", - "convert_dataset_hf_from_args", - "convert_dataset_json", - "convert_dataset_json_from_args", - "convert_finetuning_dataset_from_args", - "convert_finetuning_dataset", - "convert_text_to_mds", - "convert_text_to_mds_from_args", - "convert_delta_to_json_from_args", - "fetch_DT", - "split_eval_set_from_args", - "split_examples", + 'train', + 'train_from_yaml', + 'TrainConfig', + 'TRAIN_CONFIG_KEYS', + 'validate_config', + 'evaluate', + 'eval_from_yaml', + 'convert_dataset_hf', + 'convert_dataset_hf_from_args', + 'convert_dataset_json', + 'convert_dataset_json_from_args', + 'convert_finetuning_dataset_from_args', + 'convert_finetuning_dataset', + 'convert_text_to_mds', + 'convert_text_to_mds_from_args', + 'convert_delta_to_json_from_args', + 'fetch_DT', + 'split_eval_set_from_args', + 'split_examples', ]
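
A minimal usage sketch of the eval-split API this series adds (patches 39-42), assuming the series is applied as-is. The input/output paths and parameter values below are illustrative assumptions, not values taken from the patches:

    from llmfoundry.command_utils import split_eval_set_from_args, split_examples

    # End-to-end helper: resolves the input (a Delta table converted to JSONL
    # under 'tmp-t', a remote object store file, or a Hugging Face dataset),
    # then writes train.jsonl and eval.jsonl under output_path.
    split_eval_set_from_args(
        data_path_folder="tmp-t",      # expects tmp-t/train-00000-of-00001.jsonl
        data_path_split="train",
        output_path="splits/my-run",   # hypothetical output directory
        eval_split_ratio=0.1,          # 10% of examples go to eval.jsonl
        max_eval_samples=500,          # optional cap on the eval set size
        seed=42,                       # optional seed for a reproducible split
    )

    # Lower-level helper: split a JSONL file that is already local.
    split_examples(
        data_path="data/train.jsonl",  # hypothetical local JSONL file
        output_path="splits/local-run",
        eval_split_ratio=0.05,
        seed=42,
    )

The same split can be driven from the command line via the script added in patch 39, again with illustrative argument values:

    python scripts/data_prep/split_eval_set.py \
        --data_path_folder tmp-t \
        --data_path_split train \
        --output_path splits/my-run \
        --eval_split_ratio 0.1 \
        --seed 42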