diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py
index 091550a5f6..8a944e29c2 100644
--- a/composer/models/huggingface.py
+++ b/composer/models/huggingface.py
@@ -582,7 +582,18 @@ def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool:
         raise MissingConditionalImportError(extra_deps_group='nlp',
                                             conda_package='transformers',
                                             conda_channel='conda-forge') from e
 
-    causal_lm_classes = list(MODEL_FOR_CAUSAL_LM_MAPPING.values())
+
+    # This try/except is needed until https://github.com/huggingface/transformers/issues/26778
+    # is resolved in a release. This means that this attempt to automatically detect causal LMs
+    # does not currently work in an environment with flash attention <2 installed.
+    try:
+        causal_lm_classes = list(MODEL_FOR_CAUSAL_LM_MAPPING.values())
+    except RuntimeError as e:
+        if 'Failed to import transformers.models' in str(e):
+            MODEL_FOR_CAUSAL_LM_MAPPING = {}
+            return False
+        else:
+            raise e
 
     return any(isinstance(model, causal_lm_class) for causal_lm_class in causal_lm_classes)
diff --git a/setup.py b/setup.py
index 028b4b8029..6a0e873f9b 100644
--- a/setup.py
+++ b/setup.py
@@ -184,7 +184,7 @@ def package_files(prefix: str, directory: str, extension: str):
 ]
 
 extra_deps['nlp'] = [
-    'transformers>=4.11,<4.34',
+    'transformers>=4.11,<4.35,!=4.34.0',
     'datasets>=2.4,<3',
 ]
 
diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py
index 4b94ba6f3d..0f6076116f 100644
--- a/tests/models/test_hf_model.py
+++ b/tests/models/test_hf_model.py
@@ -261,6 +261,46 @@ def check_hf_tokenizer_equivalence(tokenizer1, tokenizer2):
     tokenizer1.__dict__['init_kwargs'].pop('auto_map', None)
     tokenizer2.__dict__['init_kwargs'].pop('auto_map', None)
 
+    # Additional special tokens do not match between original tokenizer and loaded tokenizer due to transformers
+    # constructor differences
+    additional_special_tokens_1 = {
+        t if isinstance(t, str) else t.content for t in tokenizer1.__dict__.pop('_additional_special_tokens', [])
+    }
+    additional_special_tokens_2 = {
+        t if isinstance(t, str) else t.content for t in tokenizer2.__dict__.pop('_additional_special_tokens', [])
+    }
+    # Also pop it out of init_kwargs
+    tokenizer1.__dict__['init_kwargs'].pop('additional_special_tokens', None)
+    tokenizer2.__dict__['init_kwargs'].pop('additional_special_tokens', None)
+    tokenizer1.__dict__['init_kwargs'].pop('added_tokens_decoder', None)
+    tokenizer2.__dict__['init_kwargs'].pop('added_tokens_decoder', None)
+    # If the additional special tokens are the same (or a subset of each other), or if one of them is empty, then we are good
+    assert additional_special_tokens_1.issubset(additional_special_tokens_2) or additional_special_tokens_2.issubset(
+        additional_special_tokens_1)
+
+    # The special token attributes may be strings or they may be AddedToken objects, so we just check string values
+    # First check that they have the same attrs
+    assert tokenizer1.SPECIAL_TOKENS_ATTRIBUTES == tokenizer2.SPECIAL_TOKENS_ATTRIBUTES
+    # Then check that the values are the same
+    for special_token_attr in tokenizer1.SPECIAL_TOKENS_ATTRIBUTES:
+        # Skip additional_special_tokens because we already checked it above
+        if special_token_attr == 'additional_special_tokens':
+            continue
+
+        # The init_kwargs can change between the original tokenizer and the loaded tokenizer,
+        # so we just pop them
+        tokenizer1.__dict__['init_kwargs'].pop(special_token_attr, None)
+        tokenizer2.__dict__['init_kwargs'].pop(special_token_attr, None)
+
+        attr1 = tokenizer1.__dict__.pop('_' + special_token_attr, None)
+        attr2 = tokenizer2.__dict__.pop('_' + special_token_attr, None)
+        if attr1 is None and attr2 is None:
+            continue
+
+        attr_value1 = attr1 if isinstance(attr1, str) else attr1.content
+        attr_value2 = attr2 if isinstance(attr2, str) else attr2.content
+        assert attr_value1 == attr_value2
+
     assert tokenizer1.__dict__ == tokenizer2.__dict__
 
 
@@ -559,7 +599,10 @@ def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Pa
     if modify_tokenizer:
         assert t0_pp_tokenizer is not None  # pyright
         t0_pp_tokenizer.add_special_tokens({'bos_token': '[NEWSPECIAL]'})
-        t0_pp_tokenizer.add_special_tokens({'additional_special_tokens': ['[MOSAICML']})
+        # This is apparently not allowed anymore
+        # It results in ValueError: Both extra_ids (100) and additional_special_tokens (['[MOSAICML'])
+        # are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokens
+        # t0_pp_tokenizer.add_special_tokens({'additional_special_tokens': ['[MOSAICML']})
         t0_pp_tokenizer.add_tokens(['totallyarealtoken', 'mosaicml'])
         tiny_t5_model.resize_token_embeddings(len(t0_pp_tokenizer))
 
@@ -585,6 +628,8 @@ def test_hf_loading_sentencepiece_tokenizer(modify_tokenizer: bool, tmp_path: Pa
 
 
 @pytest.mark.parametrize('modify_tokenizer', [False, True])
+# https://github.com/huggingface/transformers/issues/26777
+@pytest.mark.skip('This tokenizer no longer loads at all as of transformers 4.34')
 def test_hf_loading_tokenizer_with_python_file(modify_tokenizer: bool, tmp_path: Path, tiny_gpt2_model):
     transformers = pytest.importorskip('transformers')
     replit_tokenizer = transformers.AutoTokenizer.from_pretrained('replit/replit-code-v1-3b', trust_remote_code=True)
@@ -618,6 +663,7 @@ def test_hf_loading_llama_tokenizer(modify_tokenizer: bool, tmp_path: Path, tiny
         llama_tokenizer.add_special_tokens({'bos_token': '[NEWSPECIAL]'})
         llama_tokenizer.add_special_tokens({'additional_special_tokens': ['[MOSAICML']})
         llama_tokenizer.add_tokens(['totallyarealtoken', 'mosaicml'])
+        llama_tokenizer.update_post_processor()
 
     # we don't actually need the right model here, so avoiding adding llama
     tiny_gpt2_model.resize_token_embeddings(len(llama_tokenizer))
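
For context on the guard added to _is_registered_causal_lm above, here is a minimal standalone sketch of the fallback pattern, assuming a transformers version affected by issue 26778; the helper name is_causal_lm is hypothetical and not part of the patch:

    import transformers
    from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING


    def is_causal_lm(model: transformers.PreTrainedModel) -> bool:
        # Hypothetical helper mirroring the patched detection logic: enumerating
        # the causal-LM mapping triggers transformers' lazy imports, which can
        # raise a RuntimeError (huggingface/transformers#26778) when a model
        # module fails to import, e.g. with flash attention <2 installed.
        try:
            causal_lm_classes = list(MODEL_FOR_CAUSAL_LM_MAPPING.values())
        except RuntimeError as e:
            if 'Failed to import transformers.models' in str(e):
                # Fall back to "not a registered causal LM" instead of crashing
                return False
            raise
        return any(isinstance(model, cls) for cls in causal_lm_classes)

On a working install, a caller would pass any loaded PreTrainedModel, e.g. is_causal_lm(transformers.AutoModelForCausalLM.from_pretrained('gpt2')) returns True, which is why the patch only downgrades the RuntimeError case rather than removing the detection.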