
ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16 #591

Open
Kqdl159 opened this issue Oct 9, 2024 · 2 comments
Labels
question Further information is requested

Comments

@Kqdl159

Kqdl159 commented Oct 9, 2024

I get an error when running the OneKE quick start. bitsandbytes is the 0.39.1 pinned in the requirements; upgrading to 0.44.1 does not help either.


ValueError Traceback (most recent call last)
Cell In [21], line 26
16 # 4-bit quantize OneKE
17 quantization_config=BitsAndBytesConfig(
18 load_in_4bit=True,
19 llm_int8_threshold=6.0,
(...)
23 bnb_4bit_quant_type="nf4",
24 )
---> 26 model = AutoModelForCausalLM.from_pretrained(
27 model_path,
28 config=config,
29 device_map="auto",
30 quantization_config=quantization_config,
31 torch_dtype=torch.float16,
32 trust_remote_code=True,
33 )
34 model.eval()
37 system_prompt = '<<SYS>>\nYou are a helpful assistant. 你是一个乐于助人的助手。\n<</SYS>>\n\n'

File ~/miniconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py:563, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
--> 563 return model_class.from_pretrained(
564 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
565 )
566 raise ValueError(
567 f"Unrecognized configuration class {config.class} for this kind of AutoModel: {cls.name}.\n"
568 f"Model type should be one of {', '.join(c.name for c in cls._model_mapping.keys())}."
569 )

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:3754, in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3746 def to_bettertransformer(self) -> "PreTrainedModel":
3747 """
3748 Converts the model to use PyTorch's native attention
3749 implementation, integrated to
3750 Transformers through Optimum library. Only a
3751 subset of all Transformers models are supported.
3752
3753 PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of nested
-> 3754 tensors. Detailed benchmarks can be found in this blog
3755 post.
3756
3757 Returns:
3758 [PreTrainedModel]: The model converted to BetterTransformer.
3759 """
3760 if not is_optimum_available():
3761 raise ImportError("The package optimum is required to use Better Transformer.")

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:4214, in _load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path)
4211 if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
4212 self.last_dropout = nn.Dropout(config.summary_last_dropout)
-> 4214 def forward(
4215 self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
4216 ) -> torch.FloatTensor:
4217 """
4218 Compute a single vector summary of a sequence hidden states.
4219
(...)
4227 torch.FloatTensor: The summary of the sequence hidden states.
4228 """
4229 if self.summary_type == "last":

File ~/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:889, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys)
885 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
886 return extended_attention_mask
888 def get_extended_attention_mask(
--> 889 self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
890 ) -> Tensor:
891 """
892 Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
893
(...)
901 torch.Tensor The extended attention mask, with a the same dtype as attention_mask.dtype.
902 """
903 if dtype is None:

File ~/miniconda3/lib/python3.8/site-packages/transformers/quantizers/quantizer_bnb_4bit.py:216, in Bnb4BitHfQuantizer.create_quantized_param(self, model, param_value, param_name, target_device, state_dict, unexpected_keys)
213 new_value = new_value.T
215 kwargs = old_value.__dict__
--> 216 new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
218 module._parameters[tensor_name] = new_value

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:176, in Params4bit.to(self, *args, **kwargs)
173 device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
175 if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
--> 176 return self.cuda(device)
177 else:
178 s = self.quant_state

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/nn/modules.py:154, in Params4bit.cuda(self, device)
152 def cuda(self, device):
153 w = self.data.contiguous().half().cuda(device)
--> 154 w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
155 self.data = w_4bit
156 self.quant_state = quant_state

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:786, in quantize_4bit(A, absmax, out, blocksize, compress_statistics, quant_type)
783 absmax -= offset
784 #code = create_custom_map().to(absmax.device)
785 #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
--> 786 qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
787 del absmax
788 state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]

File ~/miniconda3/lib/python3.8/site-packages/bitsandbytes/functional.py:621, in quantize_blockwise(A, code, absmax, out, blocksize, nested)
619 lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()))
620 else:
--> 621 raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
622 post_call(A.device)
623 else:
624 # cpu

ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16
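The failing check can be reproduced in isolation with bitsandbytes alone (a sketch, assuming a CUDA device, since the GPU branch of quantize_blockwise is the one that raises in the traceback above):

import torch
import bitsandbytes.functional as F

# A bfloat16 tensor on the GPU hits the dtype check at functional.py:621,
# which only accepts fp16/fp32 on bitsandbytes 0.39.x.
absmax = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
F.quantize_blockwise(absmax, blocksize=256)
# -> ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.bfloat16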

Kqdl159 added the question (Further information is requested) label on Oct 9, 2024
@guihonghao
Contributor

torch_dtype=torch.float16 needs to be changed to torch_dtype=torch.bfloat16.
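A minimal sketch of the suggested change, assuming the quick-start setup from the traceback above (the model id and config loading are assumptions; the BitsAndBytesConfig arguments elided there by "(...)" are not reproduced):

import torch
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

model_path = "zjunlp/OneKE"  # assumption: the quick-start model id
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

# Same visible arguments as in the traceback; the elided ones are omitted.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,  # the suggested change: bfloat16 instead of float16
    trust_remote_code=True,
)
model.eval()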

@Kqdl159
Author

Kqdl159 commented Oct 10, 2024

torch_dtype=torch.float16 needs to be changed to torch_dtype=torch.bfloat16.

I have tried both of these settings, and both give this same error.
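For completeness, a small sketch for recording the versions actually importable in the failing environment (it only imports packages already present in the traceback; nothing OneKE-specific):

import torch
import transformers
import bitsandbytes

# Mismatched source lines in the traceback above (e.g. the from_pretrained frame
# showing the to_bettertransformer docstring) can indicate that the kernel is
# running code from a different install than the files on disk, so restart the
# kernel before checking.
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)
print("CUDA available:", torch.cuda.is_available())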
