diff --git a/compiler/luci/pass/src/QuantizeWeightsWithGPTQPass.cpp b/compiler/luci/pass/src/QuantizeWeightsWithGPTQPass.cpp
index 47b3cb0ad26..6f20f083a23 100644
--- a/compiler/luci/pass/src/QuantizeWeightsWithGPTQPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWeightsWithGPTQPass.cpp
@@ -43,9 +43,9 @@ void iterate_per_channel_with_order(CircleConst *node, IterFunc func, bool rever
   loco::TensorShape dimension;
   dimension.rank(4);
   uint32_t indices[4] = {0};
-  int32_t channel_dim_index{0};
+  int32_t index_channel_dim{0};
   uint32_t num_dims[4];
-  if (!get_channel_dim_index(node, dimension, channel_dim_index))
+  if (!get_channel_dim_index(node, dimension, index_channel_dim))
   {
     throw std::runtime_error("GPTQPass: Failed to get channel dim index.");
   }
@@ -69,7 +69,7 @@ void iterate_per_channel_with_order(CircleConst *node, IterFunc func, bool rever
           indices[order[1]] = j;
           indices[order[2]] = s;
           indices[order[3]] = t;
-          func(indices, dimension, channel_dim_index);
+          func(indices, dimension, index_channel_dim);
         }
       }
     }
@@ -77,20 +77,20 @@ void iterate_per_channel_with_order(CircleConst *node, IterFunc func, bool rever
 
 size_t calculate_qauntized_value(CircleConst *node, uint32_t *indices, loco::TensorShape &dimension,
-                                 int channel_dim_index, std::vector<float> &scaling_factor,
+                                 int index_channel_dim, std::vector<float> &scaling_factor,
                                  std::vector<float> &max, std::vector<float> &min)
 {
   assert(node != nullptr);
 
-  int channel_idx = indices[channel_dim_index];
+  int idx_channel = indices[index_channel_dim];
 
-  assert(scaling_factor[channel_idx] > 0);
-  const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
+  assert(scaling_factor[idx_channel] > 0);
+  const float scaling_factor_inv = 1.0 / scaling_factor[idx_channel];
 
   auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
-  auto data_clipped = data < min[channel_idx] ? min[channel_idx] : data;
-  data_clipped = data_clipped > max[channel_idx] ? max[channel_idx] : data_clipped;
+  auto data_clipped = data < min[idx_channel] ? min[idx_channel] : data;
+  data_clipped = data_clipped > max[idx_channel] ? max[idx_channel] : data_clipped;
 
-  return static_cast<int32_t>(std::round((data_clipped - min[channel_idx]) * scaling_factor_inv));
+  return static_cast<int32_t>(std::round((data_clipped - min[idx_channel]) * scaling_factor_inv));
 }
 
 void cholesky_decomposition(std::vector<float> &src, uint32_t num_size)
@@ -220,31 +220,31 @@ void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vec
 {
   loco::TensorShape dimension;
   dimension.rank(4);
-  int32_t channel_dim_index{0};
+  int32_t index_channel_dim{0};
 
-  if (!get_channel_dim_index(node, dimension, channel_dim_index))
+  if (!get_channel_dim_index(node, dimension, index_channel_dim))
   {
     throw std::runtime_error("GPTQPass: Failed to get channel dim index.");
   }
-  auto size = dimension.dim(channel_dim_index).value();
+  auto size = dimension.dim(index_channel_dim).value();
 
   std::vector<bool> has_min_max_value(size, false);
   min.resize(size);
   max.resize(size);
 
-  auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
-    int channel_idx = indices[channel_dim_index];
+  auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
-    if (has_min_max_value[channel_idx])
+    if (has_min_max_value[idx_channel])
     {
-      min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx];
-      max[channel_idx] = data > max[channel_idx] ? data : max[channel_idx];
+      min[idx_channel] = data < min[idx_channel] ? data : min[idx_channel];
+      max[idx_channel] = data > max[idx_channel] ? data : max[idx_channel];
     }
     else
     {
-      min[channel_idx] = data;
-      max[channel_idx] = data;
-      has_min_max_value[channel_idx] = true;
+      min[idx_channel] = data;
+      max[idx_channel] = data;
+      has_min_max_value[idx_channel] = true;
     }
   };
 
@@ -325,6 +325,39 @@ void compute_asym_scale_zp(float min, float max, int32_t k_max_scale, float &sca
   zp = nudged_zero_point;
 }
 
+void apply_damping_to_hessian(std::vector<float> &hessian, uint32_t num_size)
+{
+  float damp = 0;
+  float percdamp = .01;
+
+  for (uint32_t i = 0; i < num_size; i++)
+  {
+    damp += hessian[i * num_size + i];
+  }
+
+  assert(num_size != 0);
+  damp /= num_size;
+  damp *= percdamp;
+
+  for (uint32_t i = 0; i < num_size; i++)
+  {
+    hessian[i * num_size + i] += damp;
+  }
+}
+
+void transpose_to_upper_triangular(std::vector<float> &matrix, uint32_t num_size)
+{
+  for (uint32_t i = 0; i < num_size; i++)
+  {
+    for (uint32_t j = 0; j < i; j++)
+    {
+      float tmp = matrix[i * num_size + j];
+      matrix[i * num_size + j] = matrix[j * num_size + i];
+      matrix[j * num_size + i] = tmp;
+    }
+  }
+}
+
 void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
                                    std::vector<float> &max, std::vector<float> &scaling_factor,
                                    std::vector<int64_t> &zp, std::vector<float> &nudged_min,
@@ -339,8 +372,8 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
   const int32_t kMinScale = 0;
   const int32_t kMaxScale = output_type == loco::DataType::U4 ? 15 : 255;
 
-  uint32_t size = node->size<loco::DataType::FLOAT32>();
-  std::vector<int32_t> quantized_values(size);
+  uint32_t input_size = node->size<loco::DataType::FLOAT32>();
+  std::vector<int32_t> quantized_values(input_size);
 
   for (size_t i = 0; i < min.size(); ++i)
   {
@@ -350,48 +383,24 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
 
   if (hessian.empty()) // Case where GPTQ is not applied
   {
-    quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+    quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
       quantized_values[cal_offset(dimension, indices)] = calculate_qauntized_value(
-        node, indices, dimension, channel_dim_index, scaling_factor, nudged_max, nudged_min);
+        node, indices, dimension, index_channel_dim, scaling_factor, nudged_max, nudged_min);
     };
     iterate_per_channel_with_order(node, quantize, false);
   }
  else // Case where GPTQ is applied
   {
     uint32_t size_hessian = static_cast<uint32_t>(sqrt(hessian.size()));
 
-    float percdamp = .01;
-    float damp = 0;
-
-    for (uint32_t i = 0; i < size_hessian; i++)
-    {
-      damp += hessian[i * size_hessian + i];
-    }
-    assert(size_hessian != 0);
-    damp /= size_hessian;
-    damp *= percdamp;
-
-    for (uint32_t i = 0; i < size_hessian; i++)
-    {
-      hessian[i * size_hessian + i] += damp;
-    }
-    // calculate hessian inverse
+    // Calculate hessian inverse
+    apply_damping_to_hessian(hessian, size_hessian);
     cholesky_decomposition(hessian, size_hessian);
     cholesky_inverse(hessian, size_hessian);
     cholesky_decomposition(hessian, size_hessian);
+    transpose_to_upper_triangular(hessian, size_hessian);
 
-    // transpose hessian to make upper triangular
-    for (uint32_t i = 0; i < size_hessian; i++)
-    {
-      for (uint32_t j = 0; j < i; j++)
-      {
-        float tmp = hessian[i * size_hessian + j];
-        hessian[i * size_hessian + j] = hessian[j * size_hessian + i];
-        hessian[j * size_hessian + i] = tmp;
-      }
-    }
-
-    std::vector<float> error(size);
+    std::vector<float> error(input_size);
 
     loco::TensorShape dimension_channel_last;
     dimension_channel_last.rank(4);
@@ -401,35 +410,34 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
     dimension_hessian.dim(0).set(size_hessian);
     dimension_hessian.dim(1).set(size_hessian);
 
-    quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
-      quantized_values[cal_offset(dimension, indices)] = calculate_qauntized_value(
-        node, indices, dimension, channel_dim_index, scaling_factor, nudged_max, nudged_min);
+    quantize = [&](uint32_t *indices, loco::TensorShape &dimension_input, int index_channel_dim) {
+      quantized_values[cal_offset(dimension_input, indices)] = calculate_qauntized_value(
+        node, indices, dimension_input, index_channel_dim, scaling_factor, nudged_max, nudged_min);
 
       uint32_t indices_channel_last[4] = {
        indices[0], indices[3], indices[1], indices[2] // ohwi -> oihw
      };
-
-      uint32_t dimension_channel_last[4] = {dimension.dim(0).value(), dimension.dim(3).value(),
-                                            dimension.dim(1).value(), dimension.dim(2).value()};
+      uint32_t dimension_channel_last[4] = {
+        dimension_input.dim(0).value(), dimension_input.dim(3).value(),
+        dimension_input.dim(1).value(), dimension_input.dim(2).value()};
 
      uint32_t idx_quant_column =
        dimension_channel_last[2] * dimension_channel_last[3] * indices_channel_last[1] +
       dimension_channel_last[3] * indices_channel_last[2] + indices_channel_last[3];
 
+      uint32_t idx_channel = indices[index_channel_dim];
      uint32_t indices_diag_hessian[2] = {idx_quant_column, idx_quant_column};
-      uint32_t channel_idx = indices[channel_dim_index];
+      auto idx_input_data = cal_offset(dimension_input, indices);
+      auto idx_hessian = cal_offset_2d(dimension_hessian, indices_diag_hessian);
 
-      auto data_indices = cal_offset(dimension, indices);
-      auto hessian_indices = cal_offset_2d(dimension_hessian, indices_diag_hessian);
-
-      auto data = node->at<loco::DataType::FLOAT32>(data_indices);
+      auto input_data = node->at<loco::DataType::FLOAT32>(idx_input_data);
 
      auto quantized_rvalue =
-        (quantized_values[data_indices] - zp[channel_idx]) * scaling_factor[channel_idx];
+        (quantized_values[idx_input_data] - zp[idx_channel]) * scaling_factor[idx_channel];
 
-      error[data_indices] = (data - quantized_rvalue) / hessian[hessian_indices];
+      error[idx_input_data] = (input_data - quantized_rvalue) / hessian[idx_hessian];
 
-      if (channel_idx == (dimension.dim(channel_dim_index).value() - 1))
+      if (idx_channel == (dimension_input.dim(index_channel_dim).value() - 1))
      {
        for (uint32_t o = 0; o < dimension_channel_last[0]; o++)
        {
@@ -439,19 +447,20 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
            {
              for (uint32_t w = 0; w < dimension_channel_last[3]; w++)
              {
-                // convert coordination
+                // Convert coordination
                uint32_t indices_channel_first[4] = {o, h, w, i};
                uint32_t indices_error[4] = {o, indices[1], indices[2], indices[3]};
                uint32_t idx_ihw = dimension_channel_last[2] * dimension_channel_last[3] * i +
                                   dimension_channel_last[3] * h + w;
                uint32_t indices_hessain[2] = {idx_quant_column, idx_ihw};
 
-                auto _h_indices = cal_offset_2d(dimension_hessian, indices_hessain);
-                auto _data_indices = cal_offset(dimension, indices_channel_first);
-                auto _error_indices = cal_offset(dimension, indices_error);
+                auto _idx_h = cal_offset_2d(dimension_hessian, indices_hessain);
+                auto _idx_input_data = cal_offset(dimension_input, indices_channel_first);
+                auto _idx_error = cal_offset(dimension_input, indices_error);
 
-                node->at<loco::DataType::FLOAT32>(_data_indices) -=
-                  error[_error_indices] * hessian[_h_indices];
+                // Compensate quantize error
+                node->at<loco::DataType::FLOAT32>(_idx_input_data) -=
+                  error[_idx_error] * hessian[_idx_h];
              }
            }
          }
@@ -461,9 +470,9 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
     iterate_per_channel_with_order(node, quantize, true);
   }
 
-  node->dtype(loco::DataType::U8);      // Change the type of tensor
-  node->size<loco::DataType::U8>(size); // Resize tensor
-  for (uint32_t i = 0; i < size; ++i)
+  node->dtype(loco::DataType::U8);            // Change the type of tensor
+  node->size<loco::DataType::U8>(input_size); // Resize tensor
+  for (uint32_t i = 0; i < input_size; ++i)
   {
     node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
   }
@@ -476,11 +485,11 @@ void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scal
   uint32_t size = node->size<loco::DataType::U8>();
   std::vector<float> dequantized_values(size);
 
-  auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
-    int channel_idx = indices[channel_dim_index];
+  auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
     auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
     dequantized_values[cal_offset(dimension, indices)] =
-      static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx];
+      static_cast<float>(data) * scaling_factor[idx_channel] + nudged_min[idx_channel];
   };
   iterate_per_channel_with_order(node, dequantize, false);
 
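Reviewer note (not part of the patch): the two helpers extracted above are pure refactors of the inline code they replace, and the call order in `asymmetric_wquant_per_channel` stays the same: damping, Cholesky decomposition, Cholesky inverse, Cholesky decomposition, transpose. Below is a minimal standalone sketch that exercises just the two new helpers; the helper bodies are copied from the hunk above, while the `main` driver and the 2x2 sample matrices are illustrative assumptions only. The 1% `percdamp` factor matches the value used by the inline code being removed.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Copied from the patch: add percdamp (1%) of the mean diagonal value to every
// diagonal entry, which keeps the approximate Hessian invertible.
void apply_damping_to_hessian(std::vector<float> &hessian, uint32_t num_size)
{
  float damp = 0;
  float percdamp = .01;

  for (uint32_t i = 0; i < num_size; i++)
  {
    damp += hessian[i * num_size + i];
  }

  assert(num_size != 0);
  damp /= num_size;
  damp *= percdamp;

  for (uint32_t i = 0; i < num_size; i++)
  {
    hessian[i * num_size + i] += damp;
  }
}

// Copied from the patch: swap the lower and upper triangles in place (an
// in-place transpose), turning a lower-triangular Cholesky factor into an
// upper-triangular one.
void transpose_to_upper_triangular(std::vector<float> &matrix, uint32_t num_size)
{
  for (uint32_t i = 0; i < num_size; i++)
  {
    for (uint32_t j = 0; j < i; j++)
    {
      float tmp = matrix[i * num_size + j];
      matrix[i * num_size + j] = matrix[j * num_size + i];
      matrix[j * num_size + i] = tmp;
    }
  }
}

int main()
{
  // Hypothetical 2x2 Hessian, row-major. Diagonal mean = (4 + 2) / 2 = 3, so
  // damping adds 0.03 to each diagonal entry.
  std::vector<float> hessian = {4.0f, 1.0f, 1.0f, 2.0f};
  apply_damping_to_hessian(hessian, 2);
  std::cout << hessian[0] << " " << hessian[3] << "\n"; // 4.03 2.03

  // Hypothetical lower-triangular factor; after the call it is upper-triangular.
  std::vector<float> factor = {2.0f, 0.0f, 1.0f, 3.0f};
  transpose_to_upper_triangular(factor, 2);
  std::cout << factor[1] << " " << factor[2] << "\n"; // 1 0
}
```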