You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
File ~/miniconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:563, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
--> 563 return model_class.from_pretrained(
564 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
565 )
566 raise ValueError(
567 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
568 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
569 )
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:3754, in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3746 def to_bettertransformer(self) -> "PreTrainedModel":
3747 """
3748 Converts the model to use PyTorch's native attention
3749 implementation, integrated to
3750 Transformers through Optimum library. Only a
3751 subset of all Transformers models are supported.
3752
3753 PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of nested
-> 3754 tensors. Detailed benchmarks can be found in this blog
3755 post.
3756
3757 Returns:
3758 [PreTrainedModel]: The model converted to BetterTransformer.
3759 """
3760 if not is_optimum_available():
3761 raise ImportError("The package optimum is required to use Better Transformer.")
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:4214, in _load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path)
4211 if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
4212 self.last_dropout = nn.Dropout(config.summary_last_dropout)
-> 4214 def forward(
4215 self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
4216 ) -> torch.FloatTensor:
4217 """
4218 Compute a single vector summary of a sequence hidden states.
4219
(...)
4227 torch.FloatTensor: The summary of the sequence hidden states.
4228 """
4229 if self.summary_type == "last":
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:889, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys)
885 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
886 return extended_attention_mask
888 def get_extended_attention_mask(
--> 889 self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
890 ) -> Tensor:
891 """
892 Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
893
(...)
901 torch.Tensor The extended attention mask, with a the same dtype as attention_mask.dtype.
902 """
903 if dtype is None:
运行OneKE快速上手时报错。bitsandbytes是requirement里的0.39.1,升级到0.44.1也不行
ValueError Traceback (most recent call last)
Cell In [21], line 26
16 # 4bit量化OneKE
17 quantization_config=BitsAndBytesConfig(
18 load_in_4bit=True,
19 llm_int8_threshold=6.0,
(...)
23 bnb_4bit_quant_type="nf4",
24 )
---> 26 model = AutoModelForCausalLM.from_pretrained(
27 model_path,
28 config=config,
29 device_map="auto",
30 quantization_config=quantization_config,
31 torch_dtype=torch.float16,
32 trust_remote_code=True,
33 )
34 model.eval()
34 37 system_prompt = '<<SYS>>\nYou are a helpful assistant. 你是一个乐于助人的助手。\n<</SYS>>\n\n'
File ~/miniconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:563, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
--> 563 return model_class.from_pretrained(
564 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
565 )
566 raise ValueError(
567 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
568 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
569 )
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:3754, in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3746 def to_bettertransformer(self) -> "PreTrainedModel":
3747 """
3748 Converts the model to use PyTorch's native attention
3749 implementation, integrated to
3750 Transformers through Optimum library. Only a
3751 subset of all Transformers models are supported.
3752
3753 PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of nested
-> 3754 tensors. Detailed benchmarks can be found in this blog
3755 post.
3756
3757 Returns:
3758 [`PreTrainedModel`]: The model converted to BetterTransformer.
3759 """
3760 if not is_optimum_available():
3761 raise ImportError("The package `optimum` is required to use Better Transformer.")
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:4214, in _load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path)
4211 if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
4212 self.last_dropout = nn.Dropout(config.summary_last_dropout)
-> 4214 def forward(
4215 self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
4216 ) -> torch.FloatTensor:
4217 """
4218 Compute a single vector summary of a sequence hidden states.
4219
(...)
4227 `torch.FloatTensor`: The summary of the sequence hidden states.
4228 """
4229 if self.summary_type == "last":
File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:889, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys)
885 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
886 return extended_attention_mask
888 def get_extended_attention_mask(
--> 889 self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
890 ) -> Tensor:
891 """
892 Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
893
(...)
901 `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
902 """
903 if dtype is None:
File ~/miniconda3/lib/python3.8/site-packages/transformers/quantizers/quantizer_bnb_4bit.py:216, in Bnb4BitHfQuantizer.create_quantized_param(self, model, param_value, param_name, target_device, state_dict, unexpected_keys)
213 new_value = new_value.T
215 kwargs = old_value.__dict__
--> 216 new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
218 module._parameters[tensor_name] = new_value
File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:176, in Params4bit.to(self, *args, **kwargs)
173 device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
175 if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
--> 176 return self.cuda(device)
177 else:
178 s = self.quant_state
File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:154, in Params4bit.cuda(self, device)
152 def cuda(self, device):
153 w = self.data.contiguous().half().cuda(device)
--> 154 w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
155 self.data = w_4bit
156 self.quant_state = quant_state
File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:786, in quantize_4bit(A, absmax, out, blocksize, compress_statistics, quant_type)
783 absmax -= offset
784 #code = create_custom_map().to(absmax.device)
785 #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
--> 786 qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
787 del absmax
788 state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:621, in quantize_blockwise(A, code, absmax, out, blocksize, nested)
619 lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
620 else:
--> 621 raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
622 post_call(A.device)
623 else:
624 # cpu
ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16
The text was updated successfully, but these errors were encountered: