
Commit

Adapt composer -> HF conversion script to support all causal lms (#526)
dakinggg authored Aug 18, 2023
1 parent 0b8c81d commit 9946fc6
Showing 2 changed files with 125 additions and 154 deletions.
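
For orientation, here is a minimal sketch of how the updated conversion entry point can be driven programmatically, mirroring the pattern used in `tests/test_hf_conversion_script.py` below. The checkpoint and output paths are placeholders, and the exact argument set is defined by `parse_args()` in the script, so treat this as illustrative rather than authoritative.

```python
# Illustrative sketch only: paths are placeholders; the exact set of
# Namespace fields is defined by parse_args() in convert_composer_to_hf.py.
from argparse import Namespace

from scripts.inference.convert_composer_to_hf import convert_composer_to_hf

args = Namespace(
    composer_path='checkpoint.pt',        # Composer checkpoint (placeholder path)
    hf_output_path='hf-output-folder',    # local folder or object-store URI
    output_precision='fp32',              # 'fp32', 'fp16', or 'bf16'
    local_checkpoint_save_location=None,
    hf_repo_for_upload=None,              # assumed flag; set to push the result to the Hub
    test_uploaded_model=False,
)
convert_composer_to_hf(args)
```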
211 changes: 74 additions & 137 deletions scripts/inference/convert_composer_to_hf.py
@@ -1,120 +1,34 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

# Note: This script is specifically for converting MPT composer checkpoints to HuggingFace format
# For composer checkpoints containing models that are in the transformers library, see
# https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.models.write_huggingface_pretrained_from_composer_checkpoint.html

import json
import os
import random
import string
import tempfile
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, Optional, Tuple, Union

import sentencepiece as spm
import torch
import transformers
from composer.models.huggingface import get_hf_config_from_composer_state_dict
from composer.utils import (get_file, maybe_create_object_store_from_uri,
parse_uri, safe_torch_load)
from transformers import (AutoConfig, AutoTokenizer, PretrainedConfig,
PreTrainedTokenizer)
PreTrainedTokenizer, PreTrainedTokenizerBase)

from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility


# TODO: maybe move this functionality to Composer
def get_hf_config_from_composer_state_dict(
state_dict: Dict[str, Any]) -> PretrainedConfig:
if 'state' not in state_dict:
raise RuntimeError(
'Unexpected composer state dictionary. Did you pass in a full composer checkpoint?'
)
if 'integrations' not in state_dict[
'state'] or 'huggingface' not in state_dict['state']['integrations']:
raise RuntimeError(
'Did not find HuggingFace related state (e.g., tokenizer) in the provided composer checkpoint!'
)
hf_config_dict = state_dict['state']['integrations']['huggingface'][
'model']['config']['content']

# Always set init_device='cpu'
hf_config_dict['init_device'] = 'cpu'

AutoConfig.register('mpt', MPTConfig)

# backwards compatibility changes
if hf_config_dict['model_type'] == 'mosaic_gpt':
hf_config_dict['model_type'] = 'mpt'

if 'attn_config' not in hf_config_dict:
attn_config = {}
attn_config['attn_type'] = 'multihead_attention'
attn_config['attn_pdrop'] = hf_config_dict['attn_pdrop']
del hf_config_dict['attn_pdrop']
attn_config['attn_impl'] = hf_config_dict['attn_impl']
del hf_config_dict['attn_impl']
attn_config['qk_ln'] = hf_config_dict['attn_qk_ln']
del hf_config_dict['attn_qk_ln']
attn_config['clip_qkv'] = hf_config_dict['attn_clip_qkv']
del hf_config_dict['attn_clip_qkv']
attn_config['softmax_scale'] = hf_config_dict['softmax_scale']
del hf_config_dict['softmax_scale']
attn_config['prefix_lm'] = hf_config_dict['prefix_lm']
del hf_config_dict['prefix_lm']
attn_config['attn_uses_sequence_id'] = hf_config_dict[
'attn_uses_sequence_id']
del hf_config_dict['attn_uses_sequence_id']
attn_config['alibi'] = hf_config_dict['alibi']
del hf_config_dict['alibi']
attn_config['alibi_bias_max'] = hf_config_dict['alibi_bias_max']
del hf_config_dict['alibi_bias_max']

hf_config_dict['attn_config'] = attn_config

if 'init_config' not in hf_config_dict:
init_config = {}

init_config['name'] = hf_config_dict['param_init_fn']
del hf_config_dict['param_init_fn']
init_config['fan_mode'] = hf_config_dict['fan_mode']
del hf_config_dict['fan_mode']
init_config['init_nonlinearity'] = hf_config_dict['init_nonlinearity']
del hf_config_dict['init_nonlinearity']
init_config['init_gain'] = hf_config_dict['init_gain']
del hf_config_dict['init_gain']
init_config['init_std'] = hf_config_dict['init_std']
del hf_config_dict['init_std']
init_config['init_div_is_residual'] = hf_config_dict[
'init_div_is_residual']
del hf_config_dict['init_div_is_residual']
init_config['emb_init_std'] = hf_config_dict['emb_init_std']
del hf_config_dict['emb_init_std']
init_config['emb_init_uniform_lim'] = hf_config_dict[
'emb_init_uniform_lim']
del hf_config_dict['emb_init_uniform_lim']

hf_config_dict['init_config'] = init_config

if 'mlp_ratio' in hf_config_dict:
hf_config_dict['expansion_ratio'] = hf_config_dict['mlp_ratio']
del hf_config_dict['mlp_ratio']

if 'low_precision_layernorm' in hf_config_dict:
if hf_config_dict['low_precision_layernorm']:
hf_config_dict['norm_type'] = 'low_precision_layernorm'
else:
hf_config_dict['norm_type'] = 'layernorm'
del hf_config_dict['low_precision_layernorm']

return AutoConfig.for_model(**hf_config_dict)


# TODO: maybe move this functionality to Composer
# TODO: move this functionality to composer once the bug fixes are upstreamed
def get_hf_tokenizer_from_composer_state_dict(
state_dict: Dict[str, Any]) -> Optional[PreTrainedTokenizer]:
state_dict: Dict[str, Any],
tokenizer_save_dir: Optional[str] = None
) -> Optional[PreTrainedTokenizer]:
if 'state' not in state_dict:
raise RuntimeError(
'Unexpected composer state dictionary. Did you pass in a full composer checkpoint?'
@@ -128,38 +42,51 @@ def get_hf_tokenizer_from_composer_state_dict(
'tokenizer']
hf_tokenizer = None
if hf_tokenizer_state != {}:
with tempfile.TemporaryDirectory() as _tmp_dir:
for filename, saved_content in hf_tokenizer_state.items():
tokenizer_file_path = Path(
_tmp_dir) / f'{filename}{saved_content["file_extension"]}'
if saved_content['file_extension'] == '.json':
with open(tokenizer_file_path, 'w') as _tmp_file:
json.dump(saved_content['content'], _tmp_file)
elif saved_content['file_extension'] == '.txt':
with open(tokenizer_file_path, 'w') as _tmp_file:
for line in saved_content['content']:
_tmp_file.write(line)
_tmp_file.write('\n')
elif saved_content['file_extension'] == '.model':
s = spm.SentencePieceProcessor()
s.load_from_serialized_proto(saved_content['content'])
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())
hf_tokenizer = AutoTokenizer.from_pretrained(_tmp_dir)

# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
hf_tokenizer.init_kwargs['name_or_path'] = ''
if tokenizer_save_dir is None:
unique_suffix = ''.join(
random.choices(string.ascii_letters + string.digits, k=6))
tokenizer_save_dir = os.path.join(
os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
os.makedirs(tokenizer_save_dir, exist_ok=True)

for filename, saved_content in hf_tokenizer_state.items():
# This cannot be a temporary directory because huggingface relies on the slow tokenizer file
# being persistent on disk
tokenizer_file_path = Path(
tokenizer_save_dir
) / f'{filename}{saved_content["file_extension"]}'
if saved_content['file_extension'] == '.json':
with open(tokenizer_file_path, 'w') as _tmp_file:
json.dump(saved_content['content'], _tmp_file)
elif saved_content['file_extension'] == '.txt':
with open(tokenizer_file_path, 'w') as _tmp_file:
for line in saved_content['content']:
_tmp_file.write(line)
_tmp_file.write('\n')
elif saved_content['file_extension'] == '.py':
with open(tokenizer_file_path, 'w') as _tmp_file:
_tmp_file.write(saved_content['content'])
elif saved_content['file_extension'] == '.model':
s = spm.SentencePieceProcessor()
s.load_from_serialized_proto(saved_content['content'])
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())

hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)

# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
hf_tokenizer.init_kwargs['name_or_path'] = ''

return hf_tokenizer


def write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path: Union[Path, str],
output_path: Union[Path, str],
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path,
str]] = None) -> None:
checkpoint_path: Union[Path, str],
output_path: Union[Path, str],
output_precision: str = 'fp32',
local_checkpoint_save_location: Optional[Union[Path, str]] = None
) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
"""Convert a Composer checkpoint to a pretrained HF checkpoint folder.
Write a ``config.json`` and ``pytorch_model.bin``, like
@@ -274,12 +201,14 @@ def write_huggingface_pretrained_from_composer_checkpoint(
print('Done.')
print('#' * 30)

return hf_config, hf_tokenizer


def parse_args() -> Namespace:
"""Parse commandline arguments."""
parser = ArgumentParser(
description=
'Convert an MPT Composer checkpoint and Omegaconf model config into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
'Convert a HuggingFace causal LM in a Composer checkpoint into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
)
parser.add_argument('--composer_path', type=str, required=True)
parser.add_argument('--hf_output_path', type=str, required=True)
@@ -297,9 +226,16 @@ def parse_args() -> Namespace:


def convert_composer_to_hf(args: Namespace) -> None:
# Register MPT auto classes so that this script works with MPT
# This script will not work without modification for other custom models,
# but will work for other HuggingFace causal LMs
AutoConfig.register('mpt', MPTConfig)
MPTConfig.register_for_auto_class()
MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')

_, _, local_folder_path = parse_uri(args.hf_output_path)

write_huggingface_pretrained_from_composer_checkpoint(
config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
checkpoint_path=args.composer_path,
output_path=local_folder_path,
output_precision=args.output_precision,
@@ -311,19 +247,18 @@ def convert_composer_to_hf(args: Namespace) -> None:
'bf16': torch.bfloat16,
}[args.output_precision]

# register config auto class
MPTConfig.register_for_auto_class()
print(f'Loading model from {local_folder_path}')
if config.model_type == 'mpt':
config.attn_config['attn_impl'] = 'torch'

# register model auto class
MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
if config.model_type == 'mpt':
loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
config=config,
torch_dtype=dtype)
else:
loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(
local_folder_path, config=config, torch_dtype=dtype)

print(f'Loading model from {local_folder_path}')
config = MPTConfig.from_pretrained(local_folder_path)
# You have to edit the config this way, because attn_config is a nested dictionary
config.attn_config['attn_impl'] = 'torch'
loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
config=config,
torch_dtype=dtype)
delattr(loaded_hf_model.config, '_name_or_path')

loaded_hf_model.save_pretrained(local_folder_path)
@@ -332,8 +267,10 @@ def convert_composer_to_hf(args: Namespace) -> None:
tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
tokenizer.save_pretrained(local_folder_path)

print('Editing files for HF compatibility...')
edit_files_for_hf_compatibility(local_folder_path)
# Only need to edit files for MPT because it has custom code
if config.model_type == 'mpt':
print('Editing files for HF compatibility...')
edit_files_for_hf_compatibility(local_folder_path)

object_store = maybe_create_object_store_from_uri(str(args.hf_output_path))

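As a usage note for the tokenizer change above: `get_hf_tokenizer_from_composer_state_dict` now takes an optional `tokenizer_save_dir` and writes the tokenizer files to a persistent directory (auto-generated if none is given) instead of a temporary one, since HuggingFace needs the slow tokenizer files to remain on disk. A minimal sketch, assuming the function is imported the same way the tests import `convert_composer_to_hf`; the checkpoint path and save directory are placeholders.

```python
# Sketch under assumptions: checkpoint path and save directory are placeholders.
from composer.utils import safe_torch_load

from scripts.inference.convert_composer_to_hf import \
    get_hf_tokenizer_from_composer_state_dict

# Load the full Composer state dict, then materialize the tokenizer files
# into a persistent directory (needed for slow/sentencepiece tokenizers).
state_dict = safe_torch_load('checkpoint.pt')
tokenizer = get_hf_tokenizer_from_composer_state_dict(
    state_dict, tokenizer_save_dir='extracted-tokenizer')
if tokenizer is not None:
    print(tokenizer('hello', return_tensors='pt')['input_ids'])
```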
68 changes: 51 additions & 17 deletions tests/test_hf_conversion_script.py
@@ -22,6 +22,7 @@
from omegaconf import DictConfig
from omegaconf import OmegaConf as om

from llmfoundry import COMPOSER_MODEL_REGISTRY
from scripts.inference.convert_composer_to_hf import convert_composer_to_hf


@@ -43,19 +44,44 @@ def get_config(
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
with open(conf_path) as f:
test_cfg = om.load(f)

return cast(DictConfig, test_cfg)


def test_convert_and_generate_torch(tmp_path: pathlib.Path):
@pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2'])
def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
delete_transformers_cache()

cfg = get_config()
cfg['model']['init_device'] = 'cpu'
cfg['model']['attn_config']['attn_impl'] = 'torch'
om_cfg = None
if model == 'mpt':
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/testing.yaml')
elif model == 'neo':
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
om_cfg['model']['config_overrides']['hidden_size'] = 36
elif model == 'llama2':
if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
pytest.skip(
'The CI cluster does not have access to the Llama models, so skip this test.'
)
om_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
om_cfg['model'][
'pretrained_model_name_or_path'] = 'meta-llama/Llama-2-7b-hf'
om_cfg['model']['config_overrides']['num_hidden_layers'] = 2
om_cfg['model']['use_auth_token'] = True
om_cfg['tokenizer']['name'] = 'meta-llama/Llama-2-7b-hf'
else:
raise ValueError(f'Unknown model {model}')
assert om_cfg is not None

om_cfg['model']['init_device'] = 'cpu'
tokenizer = transformers.AutoTokenizer.from_pretrained(
'EleutherAI/gpt-neox-20b')
model = ComposerMPTCausalLM(cfg['model'], tokenizer)
trainer = Trainer(model=model)
om_cfg.tokenizer.name, use_auth_token=model == 'llama2')
original_model = COMPOSER_MODEL_REGISTRY[om_cfg['model'].name](
om_cfg['model'], tokenizer)
trainer = Trainer(model=original_model, device='cpu')
trainer.save_checkpoint(os.path.join(tmp_path, 'checkpoint.pt'))

args = Namespace(composer_path=os.path.join(tmp_path, 'checkpoint.pt'),
@@ -66,21 +92,29 @@ def test_convert_and_generate_torch(tmp_path: pathlib.Path):
test_uploaded_model=False)
convert_composer_to_hf(args)

config = transformers.AutoConfig.from_pretrained(os.path.join(
tmp_path, 'hf-output-folder'),
trust_remote_code=True)
config.attn_config['attn_impl'] = 'torch'
model = transformers.AutoModelForCausalLM.from_pretrained(
loaded_config = transformers.AutoConfig.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'), trust_remote_code=True)
loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'),
config=config,
config=loaded_config,
trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(
os.path.join(tmp_path, 'hf-output-folder'), trust_remote_code=True)

output = model.generate(tokenizer('hello',
return_tensors='pt')['input_ids'],
max_new_tokens=1)
assert output.shape == (1, 2)
output = loaded_model.generate(tokenizer('hello',
return_tensors='pt')['input_ids'],
max_new_tokens=1)
assert output.shape == (1, 2 + (1 if model == 'llama2' else 0))

assert sum(p.numel() for p in original_model.model.parameters()) == sum(
p.numel() for p in loaded_model.parameters())
assert all(
str(type(module1)).split('.')[-1] == str(type(module2)).split('.')[-1]
for module1, module2 in zip(original_model.model.modules(),
loaded_model.modules()))
for p1, p2 in zip(original_model.model.parameters(),
loaded_model.parameters()):
assert torch.allclose(p1, p2)

delete_transformers_cache()
