Skip to content

Commit

Permalink
Handle long file names in convert text to mds (#1579)
Browse files Browse the repository at this point in the history
  • Loading branch information
irenedea authored Oct 9, 2024
1 parent 237886c commit 813d50e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 36 deletions.
64 changes: 31 additions & 33 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,41 +240,39 @@ def download_and_convert(
object_store = maybe_create_object_store_from_uri(input_folder)

# Download file_names
with tempfile.TemporaryDirectory() as tmp_dir:
log.info(f'Created temporary directory: {tmp_dir}')
downloading_iter = DownloadingIterable(
object_names=file_names,
output_folder=tmp_dir,
object_store=object_store,
)
log.info(f'Initializing tokenizer: {tokenizer_name}')
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
trust_remote_code=trust_remote_code,
)
tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace

# Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
# to the maximum sequence length
dataset = ConcatTokensFromFilesDataset(
files=downloading_iter,
max_length=concat_tokens,
tokenizer=tokenizer,
eos_text=eos_text,
bos_text=bos_text,
no_wrap=no_wrap,
)
downloading_iter = DownloadingIterable(
object_names=file_names,
output_folder=None, # Downloads to temporary files.
object_store=object_store,
)
log.info(f'Initializing tokenizer: {tokenizer_name}')
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
trust_remote_code=trust_remote_code,
)
tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace

# Use the ConcatTokensFromFilesDataset from LLM-foundry to concatenate sequences of tokens up
# to the maximum sequence length
dataset = ConcatTokensFromFilesDataset(
files=downloading_iter,
max_length=concat_tokens,
tokenizer=tokenizer,
eos_text=eos_text,
bos_text=bos_text,
no_wrap=no_wrap,
)

columns = {'tokens': 'ndarray:int32'}
columns = {'tokens': 'ndarray:int32'}

log.info('Converting to MDS format...')
with MDSWriter(
out=output_folder,
columns=columns,
compression=compression,
) as out:
for sample in tqdm(dataset):
out.write(sample)
log.info('Converting to MDS format...')
with MDSWriter(
out=output_folder,
columns=columns,
compression=compression,
) as out:
for sample in tqdm(dataset):
out.write(sample)

log.info(f'Completed download and conversion for {len(file_names)} files')

Expand Down
8 changes: 5 additions & 3 deletions llmfoundry/utils/data_prep_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import os
import tempfile
from glob import glob
from typing import Optional

Expand Down Expand Up @@ -105,7 +106,7 @@ class DownloadingIterable:
def __init__(
self,
object_names: list[str],
output_folder: str,
output_folder: Optional[str],
object_store: Optional[ObjectStore],
):
"""Iterable that downloads files before yielding the local filename.
Expand All @@ -114,7 +115,7 @@ def __init__(
Args:
object_names (List[str]): Names of objects to download
output_folder (str): Local folder to write downloaded files to
output_folder (Optional[str]): Local folder to write downloaded files to. If None, each download is written to a temporary file.
object_store (Optional[ObjectStore]): Object store to download from
"""
self.object_names = object_names
Expand All @@ -131,7 +132,8 @@ def __iter__(self):
output_filename = os.path.join(
self.output_folder,
object_name.strip('/'),
)
) if self.output_folder is not None else tempfile.NamedTemporaryFile(
).name

download_file(
object_store=self.object_store,
Expand Down

0 comments on commit 813d50e

Please sign in to comment.