
ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16 #591

Open
Kqdl159 opened this issue Oct 9, 2024 · 2 comments
Labels
question Further information is requested

Comments

@Kqdl159

Kqdl159 commented Oct 9, 2024

I get an error when running the OneKE quick start. bitsandbytes is the 0.39.1 pinned in the requirements; upgrading to 0.44.1 does not help either.


ValueError Traceback (most recent call last)
Cell In [21], line 26
16 # 4-bit quantize OneKE
17 quantization_config=BitsAndBytesConfig(
18 load_in_4bit=True,
19 llm_int8_threshold=6.0,
(...)
23 bnb_4bit_quant_type="nf4",
24 )
---> 26 model = AutoModelForCausalLM.from_pretrained(
27 model_path,
28 config=config,
29 device_map="auto",
30 quantization_config=quantization_config,
31 torch_dtype=torch.float16,
32 trust_remote_code=True,
33 )
34 model.eval()
37 system_prompt = '<<SYS>>\nYou are a helpful assistant. 你是一个乐于助人的助手。\n<</SYS>>\n\n'

File ~/miniconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:563, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
--> 563 return model_class.from_pretrained(
564 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
565 )
566 raise ValueError(
567 f"Unrecognized configuration class {config.class} for this kind of AutoModel: {cls.name}.\n"
568 f"Model type should be one of {', '.join(c.name for c in cls._model_mapping.keys())}."
569 )

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:3754, in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3746 def to_bettertransformer(self) -> "PreTrainedModel":
3747 """
3748 Converts the model to use PyTorch's native attention
3749 implementation, integrated to
3750 Transformers through Optimum library. Only a
3751 subset of all Transformers models are supported.
3752
3753 PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of nested
-> 3754 tensors. Detailed benchmarks can be found in this blog
3755 post.
3756
3757 Returns:
3758 [PreTrainedModel]: The model converted to BetterTransformer.
3759 """
3760 if not is_optimum_available():
3761 raise ImportError("The package optimum is required to use Better Transformer.")

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:4214, in _load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path)
4211 if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
4212 self.last_dropout = nn.Dropout(config.summary_last_dropout)
-> 4214 def forward(
4215 self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
4216 ) -> torch.FloatTensor:
4217 """
4218 Compute a single vector summary of a sequence hidden states.
4219
(...)
4227 torch.FloatTensor: The summary of the sequence hidden states.
4228 """
4229 if self.summary_type == "last":

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:889, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys)
885 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
886 return extended_attention_mask
888 def get_extended_attention_mask(
--> 889 self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
890 ) -> Tensor:
891 """
892 Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
893
(...)
901 torch.Tensor The extended attention mask, with a the same dtype as attention_mask.dtype.
902 """
903 if dtype is None:

File ~/miniconda3/lib/python3.8/site-packages/transformers/quantizers/quantizer_bnb_4bit.py:216, in Bnb4BitHfQuantizer.create_quantized_param(self, model, param_value, param_name, target_device, state_dict, unexpected_keys)
213 new_value = new_value.T
215 kwargs = old_value.__dict__
--> 216 new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
218 module._parameters[tensor_name] = new_value

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:176, in Params4bit.to(self, *args, **kwargs)
173 device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
175 if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
--> 176 return self.cuda(device)
177 else:
178 s = self.quant_state

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:154, in Params4bit.cuda(self, device)
152 def cuda(self, device):
153 w = self.data.contiguous().half().cuda(device)
--> 154 w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
155 self.data = w_4bit
156 self.quant_state = quant_state

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:786, in quantize_4bit(A, absmax, out, blocksize, compress_statistics, quant_type)
783 absmax -= offset
784 #code = create_custom_map().to(absmax.device)
785 #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
--> 786 qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
787 del absmax
788 state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:621, in quantize_blockwise(A, code, absmax, out, blocksize, nested)
619 lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
620 else:
--> 621 raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
622 post_call(A.device)
623 else:
624 # cpu

ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16
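The failing check can be reproduced in isolation with bitsandbytes alone (a sketch, assuming a CUDA device, since the GPU branch of quantize_blockwise is the one that raises in the traceback above):

import torch
import bitsandbytes.functional as F

# A bfloat16 tensor on the GPU hits the dtype check at functional.py:621,
# which only accepts fp16/fp32 on bitsandbytes 0.39.x.
absmax = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
F.quantize_blockwise(absmax, blocksize=256)
# -> ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16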

Kqdl159 added the question (Further information is requested) label on Oct 9, 2024
@guihonghao
Contributor

torch_dtype=torch.float16 needs to be changed to torch_dtype=torch.bfloat16.
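A minimal sketch of the suggested change, assuming the quick-start setup from the traceback above (the model id and config loading are assumptions; the BitsAndBytesConfig arguments elided there by "(...)" are not reproduced):

import torch
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

model_path = "zjunlp/OneKE"  # assumption: the quick-start model id
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

# Same visible arguments as in the traceback; the elided ones are omitted.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # the suggested change: bfloat16 instead of float16
    trust_remote_code=True,
)
model.eval()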

@Kqdl159
Author

Kqdl159 commented Oct 10, 2024

torch_dtype=torch.float16 needs to be changed to torch_dtype=torch.bfloat16.

I have tried both of these settings, and both give this same error.
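For completeness, a small sketch for recording the versions actually importable in the failing environment (it only imports packages already present in the traceback; nothing OneKE-specific):

import torch
import transformers
import bitsandbytes

# Mismatched source lines in the traceback above (e.g. the from_pretrained frame
# showing the to_bettertransformer docstring) can indicate that the kernel is
# running code from a different install than the files on disk, so restart the
# kernel before checking.
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)
print("CUDA available:", torch.cuda.is_available())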
