
Commit

Adapt composer -> HF conversion script to support all causal lms (#526)
dakinggg authored Aug 18, 2023
1 parent 0b8c81d commit 9946fc6
Showing 2 changed files with 125 additions and 154 deletions.
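
For orientation, here is a minimal sketch of how the updated conversion entry point can be driven programmatically, mirroring the pattern used in `tests/test_hf_conversion_script.py` below. The checkpoint and output paths are placeholders, and the exact argument set is defined by `parse_args()` in the script, so treat this as illustrative rather than authoritative.

```python
# Illustrative sketch only: paths are placeholders; the exact set of
# Namespace fields is defined by parse_args() in convert_composer_to_hf.py.
from argparse import Namespace

from scripts.inference.convert_composer_to_hf import convert_composer_to_hf

args = Namespace(
    composer_path='checkpoint.pt',        # Composer checkpoint (placeholder path)
    hf_output_path='hf-output-folder',    # local folder or object-store URI
    output_precision='fp32',              # 'fp32', 'fp16', or 'bf16'
    local_checkpoint_save_location=None,
    hf_repo_for_upload=None,              # assumed flag; set to push the result to the Hub
    test_uploaded_model=False,
)
convert_composer_to_hf(args)
```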
211 changes: 74 additions & 137 deletions scripts/inference/convert_composer_to_hf.py
@@ -1,120 +1,34 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

# Note: This script is specifically for converting MPT composer checkpoints to HuggingFace format
# For composer checkpoints containing models that are in the transformers library, see
# https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.models.write_huggingface_pretrained_from_composer_checkpoint.html

import json
import os
import random
import string
import tempfile
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, Optional, Tuple, Union

import sentencepiece as spm
import torch
import transformers
from composer.models.huggingface import get_hf_config_from_composer_state_dict
from composer.utils import (get_file, maybe_create_object_store_from_uri,
parse_uri, safe_torch_load)
from transformers import (AutoConfig, AutoTokenizer, PretrainedConfig,
PreTrainedTokenizer)
PreTrainedTokenizer, PreTrainedTokenizerBase)

from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility


# TODO: maybe move this functionality to Composer
def get_hf_config_from_composer_state_dict(
state_dict: Dict[str, Any]) -> PretrainedConfig:
if 'state' not in state_dict:
raise RuntimeError(
'Unexpected composer state dictionary. Did you pass in a full composer checkpoint?'
)
if 'integrations' not in state_dict[
'state'] or 'huggingface' not in state_dict['state']['integrations']:
raise RuntimeError(
'Did not find HuggingFace related state (e.g., tokenizer) in the provided composer checkpoint!'
)
hf_config_dict = state_dict['state']['integrations']['huggingface'][
'model']['config']['content']

# Always set init_device='cpu'
hf_config_dict['init_device'] = 'cpu'

AutoConfig.register('mpt', MPTConfig)

# backwards compatibility changes
if hf_config_dict['model_type'] == 'mosaic_gpt':
hf_config_dict['model_type'] = 'mpt'

if 'attn_config' not in hf_config_dict:
attn_config = {}
attn_config['attn_type'] = 'multihead_attention'
attn_config['attn_pdrop'] = hf_config_dict['attn_pdrop']
del hf_config_dict['attn_pdrop']
attn_config['attn_impl'] = hf_config_dict['attn_impl']
del hf_config_dict['attn_impl']
attn_config['qk_ln'] = hf_config_dict['attn_qk_ln']
del hf_config_dict['attn_qk_ln']
attn_config['clip_qkv'] = hf_config_dict['attn_clip_qkv']
del hf_config_dict['attn_clip_qkv']
attn_config['softmax_scale'] = hf_config_dict['softmax_scale']
del hf_config_dict['softmax_scale']
attn_config['prefix_lm'] = hf_config_dict['prefix_lm']
del hf_config_dict['prefix_lm']
attn_config['attn_uses_sequence_id'] = hf_config_dict[
'attn_uses_sequence_id']
del hf_config_dict['attn_uses_sequence_id']
attn_config['alibi'] = hf_config_dict['alibi']
del hf_config_dict['alibi']
attn_config['alibi_bias_max'] = hf_config_dict['alibi_bias_max']
del hf_config_dict['alibi_bias_max']

hf_config_dict['attn_config'] = attn_config

if 'init_config' not in hf_config_dict:
init_config = {}

init_config['name'] = hf_config_dict['param_init_fn']
del hf_config_dict['param_init_fn']
init_config['fan_mode'] = hf_config_dict['fan_mode']
del hf_config_dict['fan_mode']
init_config['init_nonlinearity'] = hf_config_dict['init_nonlinearity']
del hf_config_dict['init_nonlinearity']
init_config['init_gain'] = hf_config_dict['init_gain']
del hf_config_dict['init_gain']
init_config['init_std'] = hf_config_dict['init_std']
del hf_config_dict['init_std']
init_config['init_div_is_residual'] = hf_config_dict[
'init_div_is_residual']
del hf_config_dict['init_div_is_residual']
init_config['emb_init_std'] = hf_config_dict['emb_init_std']
del hf_config_dict['emb_init_std']
init_config['emb_init_uniform_lim'] = hf_config_dict[
'emb_init_uniform_lim']
del hf_config_dict['emb_init_uniform_lim']

hf_config_dict['init_config'] = init_config

if 'mlp_ratio' in hf_config_dict:
hf_config_dict['expansion_ratio'] = hf_config_dict['mlp_ratio']
del hf_config_dict['mlp_ratio']

if 'low_precision_layernorm' in hf_config_dict:
if hf_config_dict['low_precision_layernorm']:
hf_config_dict['norm_type'] = 'low_precision_layernorm'
else:
hf_config_dict['norm_type'] = 'layernorm'
del hf_config_dict['low_precision_layernorm']

return AutoConfig.for_model(**hf_config_dict)


# TODO: maybe move this functionality to Composer
# TODO: move this functionality to composer once the bug fixes are upstreamed
def get_hf_tokenizer_from_composer_state_dict(
state_dict: Dict[str, Any]) -> Optional[PreTrainedTokenizer]:
state_dict: Dict[str, Any],
tokenizer_save_dir: Optional[str] = None
) -> Optional[PreTrainedTokenizer]:
if 'state' not in state_dict:
raise RuntimeError(
'Unexpected composer state dictionary. Did you pass in a full composer checkpoint?'
@@ -128,38 +42,51 @@ def get_hf_tokenizer_from_composer_state_dict(
'tokenizer']
hf_tokenizer = None
if hf_tokenizer_state != {}:
with tempfile.TemporaryDirectory() as _tmp_dir:
for filename, saved_content in hf_tokenizer_state.items():
tokenizer_file_path = Path(
_tmp_dir) / f'{filename}{saved_content["file_extension"]}'
if saved_content['file_extension'] == '.json':
with open(tokenizer_file_path, 'w') as _tmp_file:
json.dump(saved_content['content'], _tmp_file)
elif saved_content['file_extension'] == '.txt':
with open(tokenizer_file_path, 'w') as _tmp_file:
for line in saved_content['content']:
_tmp_file.write(line)
_tmp_file.write('\n')
elif saved_content['file_extension'] == '.model':
s = spm.SentencePieceProcessor()
s.load_from_serialized_proto(saved_content['content'])
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())
hf_tokenizer = AutoTokenizer.from_pretrained(_tmp_dir)

# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
hf_tokenizer.init_kwargs['name_or_path'] = ''
if tokenizer_save_dir is None:
unique_suffix = ''.join(
random.choices(string.ascii_letters + string.digits, k=6))
tokenizer_save_dir = os.path.join(
os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
os.makedirs(tokenizer_save_dir, exist_ok=True)

for filename, saved_content in hf_tokenizer_state.items():
# This cannot be a temporary directory because huggingface relies on the slow tokenizer file
# being persistent on disk
tokenizer_file_path = Path(
tokenizer_save_dir
) / f'{filename}{saved_content["file_extension"]}'
if saved_content['file_extension'] == '.json':
with open(tokenizer_file_path, 'w') as _tmp_file:
json.dump(saved_content['content'], _tmp_file)
elif saved_content['file_extension'] == '.txt':
with open(tokenizer_file_path, 'w') as _tmp_file:
for line in saved_content['content']:
_tmp_file.write(line)
_tmp_file.write('\n')
elif saved_content['file_extension'] == '.py':
with open(tokenizer_file_path, 'w') as _tmp_file:
_tmp_file.write(saved_content['content'])
elif saved_content['file_extension'] == '.model':
s = spm.SentencePieceProcessor()
s.load_from_serialized_proto(saved_content['content'])
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())

hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)

# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
hf_tokenizer.init_kwargs['name_or_path'] = ''

return hf_tokenizer


def write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path: Union[Path, str],
output_path: Union[Path, str],
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path,
str]] = None) -> None:
checkpoint_path: Union[Path, str],
output_path: Union[Path, str],
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path, str]] = None
) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
"""Convert a Composer checkpoint to a pretrained HF checkpoint folder.
Write a ``config.json`` and ``pytorch_model.bin``, like
@@ -274,12 +201,14 @@ def write_huggingface_pretrained_from_composer_checkpoint(
print('Done.')
print('#' * 30)

return hf_config, hf_tokenizer


def parse_args() -> Namespace:
"""Parse commandline arguments."""
parser = ArgumentParser(
description=
'Convert an MPT Composer checkpoint and Omegaconf model config into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
'Convert a HuggingFace causal LM in a Composer checkpoint into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
)
parser.add_argument('--composer_path', type=str, required=True)
parser.add_argument('--hf_output_path', type=str, required=True)
@@ -297,9 +226,16 @@ def parse_args() -> Namespace:


def convert_composer_to_hf(args: Namespace) -> None:
# Register MPT auto classes so that this script works with MPT
# This script will not work without modification for other custom models,
# but will work for other HuggingFace causal LMs
AutoConfig.register('mpt', MPTConfig)
MPTConfig.register_for_auto_class()
MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')

_, _, local_folder_path = parse_uri(args.hf_output_path)

write_huggingface_pretrained_from_composer_checkpoint(
config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path=args.composer_path,
output_path=local_folder_path,
output_precision=args.output_precision,
@@ -311,19 +247,18 @@ def convert_composer_to_hf(args: Namespace) -> None:
'bf16': torch.bfloat16,
}[args.output_precision]

# register config auto class
MPTConfig.register_for_auto_class()
print(f'Loading model from {local_folder_path}')
if config.model_type == 'mpt':
config.attn_config['attn_impl'] = 'torch'

# register model auto class
MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
if config.model_type == 'mpt':
loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
config=config,
torch_dtype=dtype)
else:
loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(
local_folder_path, config=config, torch_dtype=dtype)

print(f'Loading model from {local_folder_path}')
config = MPTConfig.from_pretrained(local_folder_path)
# You have to edit the config this way, because attn_config is a nested dictionary
config.attn_config['attn_impl'] = 'torch'
loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
config=config,
torch_dtype=dtype)
delattr(loaded_hf_model.config, '_name_or_path')

loaded_hf_model.save_pretrained(local_folder_path)
@@ -332,8 +267,10 @@ def convert_composer_to_hf(args: Namespace) -> None:
tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
tokenizer.save_pretrained(local_folder_path)

print('Editing files for HF compatibility...')
edit_files_for_hf_compatibility(local_folder_path)
# Only need to edit files for MPT because it has custom code
if config.model_type == 'mpt':
print('Editing files for HF compatibility...')
edit_files_for_hf_compatibility(local_folder_path)

object_store = maybe_create_object_store_from_uri(str(args.hf_output_path))

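As a usage note for the tokenizer change above: `get_hf_tokenizer_from_composer_state_dict` now takes an optional `tokenizer_save_dir` and writes the tokenizer files to a persistent directory (auto-generated if none is given) instead of a temporary one, since HuggingFace needs the slow tokenizer files to remain on disk. A minimal sketch, assuming the function is imported the same way the tests import `convert_composer_to_hf`; the checkpoint path and save directory are placeholders.

```python
# Sketch under assumptions: checkpoint path and save directory are placeholders.
from composer.utils import safe_torch_load

from scripts.inference.convert_composer_to_hf import \
    get_hf_tokenizer_from_composer_state_dict

# Load the full Composer state dict, then materialize the tokenizer files
# into a persistent directory (needed for slow/sentencepiece tokenizers).
state_dict = safe_torch_load('checkpoint.pt')
tokenizer = get_hf_tokenizer_from_composer_state_dict(
    state_dict, tokenizer_save_dir='extracted-tokenizer')
if tokenizer is not None:
    print(tokenizer('hello', return_tensors='pt')['input_ids'])
```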
68 changes: 51 additions & 17 deletions tests/test_hf_conversion_script.py
@@ -22,6 +22,7 @@
from omegaconf import DictConfig
from omegaconf import OmegaConf as om

from llmfoundry import COMPOSER_MODEL_REGISTRY
from scripts.inference.convert_composer_to_hf import convert_composer_to_hf


@@ -43,19 +44,44 @@ def get_config(
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
with open(conf_path) as f:
test_cfg = om.load(f)

return cast(DictConfig, test_cfg)


def test_convert_and_generate_torch(tmp_path: pathlib.Path):
@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2'])
def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
delete_transformers_cache()

cfg = get_config()
cfg['model']['init_device'] = 'cpu'
cfg['model']['attn_config']['attn_impl'] = 'torch'
om_cfg = None
if model == 'mpt':
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/testing.yaml')
elif model == 'neo':
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
om_cfg['model']['config_overrides']['hidden_size'] = 36
elif model == 'llama2':
if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
pytest.skip(
'The CI cluster does not have access to the Llama models, so skip this test.'
)
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
om_cfg['model'][
'pretrained_model_name_or_path'] = 'meta-llama/Llama-2-7b-hf'
om_cfg['model']['config_overrides']['num_hidden_layers'] = 2
om_cfg['model']['use_auth_token'] = True
om_cfg['tokenizer']['name'] = 'meta-llama/Llama-2-7b-hf'
else:
raise ValueError(f'Unknown model {model}')
assert om_cfg is not None

om_cfg['model']['init_device'] = 'cpu'
tokenizer = transformers.AutoTokenizer.from_pretrained(
'EleutherAI/gpt-neox-20b')
model = ComposerMPTCausalLM(cfg['model'], tokenizer)
trainer = Trainer(model=model)
om_cfg.tokenizer.name, use_auth_token=model == 'llama2')
original_model = COMPOSER_MODEL_REGISTRY[om_cfg['model'].name](
om_cfg['model'], tokenizer)
trainer = Trainer(model=original_model, device='cpu')
trainer.save_checkpoint(os.path.join(tmp_path, 'checkpoint.pt'))

args = Namespace(composer_path=os.path.join(tmp_path, 'checkpoint.pt'),
@@ -66,21 +92,29 @@ def test_convert_and_generate_torch(tmp_path: pathlib.Path):
test_uploaded_model=False)
convert_composer_to_hf(args)

config = transformers.AutoConfig.from_pretrained(os.path.join(
tmp_path, 'hf-output-folder'),
trust_remote_code=True)
config.attn_config['attn_impl'] = 'torch'
model = transformers.AutoModelForCausalLM.from_pretrained(
loaded_config = transformers.AutoConfig.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'), trust_remote_code=True)
loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'),
config=config,
config=loaded_config,
trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'), trust_remote_code=True)

output = model.generate(tokenizer('hello',
return_tensors='pt')['input_ids'],
max_new_tokens=1)
assert output.shape == (1, 2)
output = loaded_model.generate(tokenizer('hello',
return_tensors='pt')['input_ids'],
max_new_tokens=1)
assert output.shape == (1, 2 + (1 if model == 'llama2' else 0))

assert sum(p.numel() for p in original_model.model.parameters()) == sum(
p.numel() for p in loaded_model.parameters())
assert all(
str(type(module1)).split('.')[-1] == str(type(module2)).split('.')[-1]
for module1, module2 in zip(original_model.model.modules(),
loaded_model.modules()))
for p1, p2 in zip(original_model.model.parameters(),
loaded_model.parameters()):
assert torch.allclose(p1, p2)

delete_transformers_cache()
