From 813d50e7316458a51610f4625916d3bb980fc140 Mon Sep 17 00:00:00 2001
From: Irene Dea
Date: Wed, 9 Oct 2024 12:46:22 -0700
Subject: [PATCH] Handle long file names in convert text to mds (#1579)

---
 .../data_prep/convert_text_to_mds.py | 64 +++++++++----------
 llmfoundry/utils/data_prep_utils.py  |  8 ++-
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index 3ea5aeb5d4..9de13f9d5b 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -240,41 +240,39 @@ def download_and_convert(
     object_store = maybe_create_object_store_from_uri(input_folder)
 
     # Download file_names
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        log.info(f'Created temporary directory: {tmp_dir}')
-        downloading_iter = DownloadingIterable(
-            object_names=file_names,
-            output_folder=tmp_dir,
-            object_store=object_store,
-        )
-        log.info(f'Initializing tokenizer: {tokenizer_name}')
-        tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_name,
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
-
-        # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
-        # to the maximum sequence length
-        dataset = ConcatTokensFromFilesDataset(
-            files=downloading_iter,
-            max_length=concat_tokens,
-            tokenizer=tokenizer,
-            eos_text=eos_text,
-            bos_text=bos_text,
-            no_wrap=no_wrap,
-        )
+    downloading_iter = DownloadingIterable(
+        object_names=file_names,
+        output_folder=None,  # Downloads to temporary files.
+        object_store=object_store,
+    )
+    log.info(f'Initializing tokenizer: {tokenizer_name}')
+    tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_name,
+        trust_remote_code=trust_remote_code,
+    )
+    tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
+
+    # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
+    # to the maximum sequence length
+    dataset = ConcatTokensFromFilesDataset(
+        files=downloading_iter,
+        max_length=concat_tokens,
+        tokenizer=tokenizer,
+        eos_text=eos_text,
+        bos_text=bos_text,
+        no_wrap=no_wrap,
+    )
 
-        columns = {'tokens': 'ndarray:int32'}
+    columns = {'tokens': 'ndarray:int32'}
 
-        log.info('Converting to MDS format...')
-        with MDSWriter(
-            out=output_folder,
-            columns=columns,
-            compression=compression,
-        ) as out:
-            for sample in tqdm(dataset):
-                out.write(sample)
+    log.info('Converting to MDS format...')
+    with MDSWriter(
+        out=output_folder,
+        columns=columns,
+        compression=compression,
+    ) as out:
+        for sample in tqdm(dataset):
+            out.write(sample)
 
     log.info(f'Completed download and conversion for {len(file_names)} files')
 
diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py
index df67f3223a..7bbb80fa4c 100644
--- a/llmfoundry/utils/data_prep_utils.py
+++ b/llmfoundry/utils/data_prep_utils.py
@@ -3,6 +3,7 @@
 
 import json
 import os
+import tempfile
 from glob import glob
 from typing import Optional
 
@@ -105,7 +106,7 @@ class DownloadingIterable:
     def __init__(
         self,
         object_names: list[str],
-        output_folder: str,
+        output_folder: Optional[str],
         object_store: Optional[ObjectStore],
     ):
         """Iterable that downloads files before yielding the local filename.
@@ -114,7 +115,7 @@ def __init__(
 
         Args:
             object_names (List[str]): Names of objects to download
-            output_folder (str): Local folder to write downloaded files to
+            output_folder (Optional[str]): Local folder to write downloaded files to. If None, downloads to temporary files.
             object_store (Optional[ObjectStore]): Object store to download from
         """
         self.object_names = object_names
@@ -131,7 +132,8 @@ def __iter__(self):
             output_filename = os.path.join(
                 self.output_folder,
                 object_name.strip('/'),
-            )
+            ) if self.output_folder is not None else tempfile.NamedTemporaryFile(
+            ).name
 
             download_file(
                 object_store=self.object_store,
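
Note on why this fixes long file names: previously every downloaded object
was written to os.path.join(tmp_dir, object_name), so an object name longer
than the filesystem limit (commonly 255 bytes per path component on Linux)
could fail with "File name too long" errors. With output_folder=None, the
local target is tempfile.NamedTemporaryFile().name, a short generated path
that is independent of the object name's length. The sketch below is
illustrative only, not part of the patch; pick_output_filename and
long_object_name are hypothetical names.

    import os
    import tempfile
    from typing import Optional

    def pick_output_filename(object_name: str,
                             output_folder: Optional[str]) -> str:
        """Mirror the object name under output_folder, or fall back to a
        short temporary path (the pattern this patch introduces)."""
        if output_folder is not None:
            # Old behavior: the (possibly very long) object name becomes
            # part of the local path.
            return os.path.join(output_folder, object_name.strip('/'))
        # New behavior: a short, randomly generated name, immune to the
        # length of the object name. delete=False keeps the file around so
        # this standalone sketch is safe to run; the patch itself takes
        # just .name from the handle and lets download_file recreate it.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            return tmp.name

    # A 300-character object name would exceed NAME_MAX if mirrored locally.
    long_object_name = 'a' * 300 + '.txt'
    print(pick_output_filename(long_object_name, None))  # e.g. /tmp/tmpab12cd34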