diff --git a/compiler/record-hessian/CMakeLists.txt b/compiler/record-hessian/CMakeLists.txt new file mode 100644 index 00000000000..75281e6c8cb --- /dev/null +++ b/compiler/record-hessian/CMakeLists.txt @@ -0,0 +1,36 @@ +file(GLOB_RECURSE SOURCES "src/*.cpp") +file(GLOB_RECURSE TESTS "src/*.test.cpp") +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(record-hessian STATIC ${SOURCES}) + +target_include_directories(record-hessian PUBLIC include) +target_include_directories(record-hessian PRIVATE src) + +target_link_libraries(record-hessian luci_import) +target_link_libraries(record-hessian luci_env) +target_link_libraries(record-hessian luci_export) +target_link_libraries(record-hessian luci_interpreter) +target_link_libraries(record-hessian luci_log) +target_link_libraries(record-hessian dio_hdf5) + +install(TARGETS record-hessian DESTINATION lib) +install(DIRECTORY include/ DESTINATION include + FILES_MATCHING PATTERN "*.h") + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +nnas_find_package(GTest REQUIRED) + +GTest_AddTest(record_hessian_tests ${TESTS}) +target_include_directories(record_hessian_tests PRIVATE include) +target_include_directories(record_hessian_tests PRIVATE src) +target_link_libraries(record_hessian_tests luci_lang) +target_link_libraries(record_hessian_tests luci_pass) +target_link_libraries(record_hessian_tests loco) +target_link_libraries(record_hessian_tests dio_hdf5) +target_link_libraries(record_hessian_tests nncc_coverage) +target_link_libraries(record_hessian_tests luci_interpreter) +target_link_libraries(record_hessian_tests record-hessian) diff --git a/compiler/record-hessian/README.md b/compiler/record-hessian/README.md new file mode 100644 index 00000000000..49e6b2d9365 --- /dev/null +++ b/compiler/record-hessian/README.md @@ -0,0 +1,3 @@ +# record-hessian + +_record-hessian_ calculates the Hessian matrix of activations for quantization. 
diff --git a/compiler/record-hessian/include/record-hessian/HessianComputer.h b/compiler/record-hessian/include/record-hessian/HessianComputer.h new file mode 100644 index 00000000000..fc3cdebcb93 --- /dev/null +++ b/compiler/record-hessian/include/record-hessian/HessianComputer.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RECORD_HESSIAN_HESSIANCOMPUTER_H__ +#define __RECORD_HESSIAN_HESSIANCOMPUTER_H__ + +#include "record-hessian/HessianVector.h" + +#include +#include + +#include +#include +#include + +namespace record_hessian +{ +/** + * @brief Record approximated hessian matrix from + * GPTQ paper(https://arxiv.org/abs/2210.17323). 
+ */ +using HessianMap = std::unordered_map>; +using HessianVectorMap = std::unordered_map; + +class HessianComputer +{ +public: + // Record the hessian of node from input_tensor (FULLY_CONNECTED and CONV_2D only; throws otherwise) + void recordHessian(const luci::CircleNode *node, const luci_interpreter::Tensor *input_tensor); + + // Return a newly allocated map holding a copy of the accumulated hessian of each recorded node + std::unique_ptr getMap(); + +private: + HessianVectorMap _hessian_map; + const luci_interpreter::Tensor *_input_tensor = nullptr; + + void recordHessianForConv2D(const luci::CircleNode *node); + + void recordHessianForFullyConnected(const luci::CircleNode *node); +}; + +void unfold(std::vector &buf, uint32_t input_n, uint32_t input_h, uint32_t input_w, + uint32_t input_c, uint32_t stride_h, uint32_t stride_w, uint32_t dilation_h, + uint32_t dilation_w, uint32_t kernel_oc, uint32_t kernel_h, uint32_t kernel_w, + uint32_t kernel_ic); + +} // namespace record_hessian + +#endif // __RECORD_HESSIAN_HESSIANCOMPUTER_H__ diff --git a/compiler/record-hessian/include/record-hessian/HessianVector.h b/compiler/record-hessian/include/record-hessian/HessianVector.h new file mode 100644 index 00000000000..400b477616f --- /dev/null +++ b/compiler/record-hessian/include/record-hessian/HessianVector.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_HESSIAN_HESSIANVECTOR_H__ +#define __RECORD_HESSIAN_HESSIANVECTOR_H__ + +#include + +#include +#include + +namespace record_hessian +{ + +/** + * @brief Running element-wise mean of the hessians passed to update(). + */ +struct HessianVector +{ + std::vector hessian; // element-wise mean of every hessian folded in so far + size_t count; // number of update() calls folded into hessian + + HessianVector() : count(0) {} + + /** + * @brief Fold new_hessian into the running mean: + * hessian[i] = (hessian[i] * count + new_hessian[i]) / (count + 1). + */ + void update(const std::vector &new_hessian) + { + // NOTE(review): both branches below issue the same resize; when the size changes after the + // first update the values accumulated so far are kept, not reset - confirm this is intended. + if (count == 0) + { + hessian.resize(new_hessian.size()); + } + else if (hessian.size() != new_hessian.size()) + { + hessian.resize(new_hessian.size()); + } + + size_t numel = new_hessian.size(); + float alpha = 1.f / static_cast(count + 1); + + for (size_t i = 0; i < numel; ++i) + { + hessian[i] = (hessian[i] * count + new_hessian[i]) * alpha; + } + + count++; + }; +}; + +} // namespace record_hessian + +#endif // __RECORD_HESSIAN_HESSIANVECTOR_H__ diff --git a/compiler/record-hessian/requires.cmake b/compiler/record-hessian/requires.cmake new file mode 100644 index 00000000000..bfba787368f --- /dev/null +++ b/compiler/record-hessian/requires.cmake @@ -0,0 +1,3 @@ +require("luci") +require("luci-interpreter") +require("dio-hdf5") diff --git a/compiler/record-hessian/src/HessianComputer.cpp b/compiler/record-hessian/src/HessianComputer.cpp new file mode 100644 index 00000000000..6ae36cf0797 --- /dev/null +++ b/compiler/record-hessian/src/HessianComputer.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "record-hessian/HessianComputer.h" + +#include + +namespace record_hessian +{ + +/** + * @brief unfold the vector with NHWC shape, inherently acting in an in-place manner. + * @note (N, H, W, C) -> (N, L, K_h * K_w * C). + * See details(https://pytorch.org/docs/stable/generated/torch.nn.Unfold.html). + * Within each patch the values are laid out channel-first, i.e. in (C, K_h, K_w) order. + * No padding is applied; L = out_height * out_width. + * @throws std::runtime_error if input_c differs from kernel_ic. + * NOTE(review): the output extents are computed with unsigned operands, so a dilated kernel + * larger than the input would wrap around - confirm callers never pass such shapes. + */ +void unfold(std::vector &buf, uint32_t input_n, uint32_t input_h, uint32_t input_w, + uint32_t input_c, uint32_t stride_h, uint32_t stride_w, uint32_t dilation_h, + uint32_t dilation_w, uint32_t kernel_oc, uint32_t kernel_h, uint32_t kernel_w, + uint32_t kernel_ic) +{ + assert(input_n > 0 && input_h > 0 && input_w > 0 && input_c > 0); + assert(stride_h > 0 && stride_w > 0); + assert(kernel_oc > 0 && kernel_h > 0 && kernel_w > 0 && kernel_ic > 0); + + if (input_c != kernel_ic) + throw std::runtime_error("RecordHessian: Input channels do not match kernel channels."); + int out_height = (input_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int out_width = (input_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + int patch_size = kernel_h * kernel_w * kernel_ic; + std::vector unfolded_buf(input_n * out_height * out_width * patch_size, 0.0f); + + int index = 0; + int in_y, in_x; + for (int n = 0; n < input_n; ++n) + { + for (int y = 0; y < out_height; ++y) + { + for (int x = 0; x < out_width; ++x) + { + for (int in_c = 0; in_c < input_c; ++in_c) + { + for (int ky = 0; ky < kernel_h; ++ky) + { + for (int kx = 0; kx < kernel_w; ++kx) + { + in_y = y * stride_h + ky * dilation_h; + in_x = x * stride_w + kx * dilation_w; + // taps outside the input keep the 0.0f the buffer was filled with + if (in_y < input_h && in_x < input_w) + { + unfolded_buf[index] = buf[((n * input_h + in_y) * input_w + in_x) * input_c + in_c]; + } + index++; + } + } + } + } + } + } + + buf.swap(unfolded_buf); +} + +/** + * @brief Compute 2 * X^T * X for the stored input tensor, viewed as a [length, size_in_ch] + * matrix X, and fold it into the running hessian kept for node. + * @throws std::runtime_error if the input rank is neither 2 nor 3. + */ +void HessianComputer::recordHessianForFullyConnected(const luci::CircleNode *node) +{ + assert(_input_tensor->shape().num_dims() < 4); + assert(_input_tensor->element_type() == luci_interpreter::DataType::FLOAT32); + + uint32_t 
size_in_ch; + uint32_t length; + + const auto data = _input_tensor->data(); + const auto num_elements = _input_tensor->shape().num_elements(); + std::vector buf(data, data + num_elements); + + if (_input_tensor->shape().num_dims() == 3) + { + size_in_ch = _input_tensor->shape().dim(2); // input_tensor [batch, length, channel] + } + else if (_input_tensor->shape().num_dims() == 2) + { + size_in_ch = _input_tensor->shape().dim(1); // input_tensor [length, channel] + } + else + { + throw std::runtime_error("RecordHessian: Unsupported node rank"); + } + assert(size_in_ch != 0); + length = num_elements / size_in_ch; + + std::vector hessian(size_in_ch * size_in_ch, 0); + + for (int i = 0; i < size_in_ch; ++i) + { + for (int j = 0; j < size_in_ch; ++j) + { + float sum = 0; + for (int k = 0; k < length; ++k) + { + sum += buf[i + k * size_in_ch] * buf[j + k * size_in_ch]; + } + hessian[i * size_in_ch + j] = 2 * sum; + } + } + + HessianVector &vector = _hessian_map[node]; + vector.update(hessian); +} + +/** + * @brief Unfold the stored NHWC input into convolution patches and fold 2 * X^T * X of the + * patch matrix X ([length, size_in_ch]) into the running hessian kept for node. + * @note size_in_ch is the filter element count divided by dim(3) of the Conv2D node. + */ +void HessianComputer::recordHessianForConv2D(const luci::CircleNode *node) +{ + assert(_input_tensor->shape().num_dims() == 4); + assert(_input_tensor->element_type() == luci_interpreter::DataType::FLOAT32); + + const auto circle_conv2d = loco::must_cast(node); + const auto node_filter = loco::must_cast((circle_conv2d)->filter()); + assert(circle_conv2d->rank() >= 4); + assert(node_filter->dtype() == loco::DataType::FLOAT32); + assert(node_filter->rank() == 4); + + // NOTE(review): size_filter is never read below + uint32_t size_filter = node_filter->size(); + uint32_t size_in_ch = + node_filter->size() / circle_conv2d->dim(3).value(); + + uint32_t input_n = _input_tensor->shape().dim(0); + uint32_t input_h = _input_tensor->shape().dim(1); + uint32_t input_w = _input_tensor->shape().dim(2); + uint32_t input_c = _input_tensor->shape().dim(3); + + uint32_t stride_h = circle_conv2d->stride()->h(); + uint32_t stride_w = circle_conv2d->stride()->w(); + uint32_t dilation_h = circle_conv2d->dilation()->h(); + uint32_t dilation_w = 
circle_conv2d->dilation()->w(); + + uint32_t kernel_oc = node_filter->dim(0).value(); + uint32_t kernel_h = node_filter->dim(1).value(); + uint32_t kernel_w = node_filter->dim(2).value(); + uint32_t kernel_ic = node_filter->dim(3).value(); + + const auto data = _input_tensor->data(); + const auto num_elements = _input_tensor->shape().num_elements(); + assert(data != 0); + assert(num_elements != 0); + std::vector buf(data, data + num_elements); + + unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, dilation_w, + kernel_oc, kernel_h, kernel_w, kernel_ic); + assert(size_in_ch != 0); + uint32_t length = buf.size() / size_in_ch; + + std::vector hessian(size_in_ch * size_in_ch, 0); + for (int i = 0; i < size_in_ch; ++i) + { + for (int j = 0; j < size_in_ch; ++j) + { + float sum = 0; + for (int k = 0; k < length; ++k) + { + sum += buf[i + k * size_in_ch] * buf[j + k * size_in_ch]; + } + hessian[i * size_in_ch + j] = 2 * sum; + } + } + + HessianVector &vector = _hessian_map[node]; + vector.update(hessian); +} + +/** + * @brief Validate the arguments, remember input_tensor and dispatch on the opcode of node. + * @throws std::invalid_argument if node or input_tensor is null. + * @throws std::runtime_error if the tensor is not FLOAT32 or the opcode is neither + * FULLY_CONNECTED nor CONV_2D. + */ +void HessianComputer::recordHessian(const luci::CircleNode *node, + const luci_interpreter::Tensor *input_tensor) +{ + if (node == nullptr || input_tensor == nullptr) + throw std::invalid_argument("RecordHessian: node or input_tensor is null."); + + if (input_tensor->element_type() != luci_interpreter::DataType::FLOAT32) + throw std::runtime_error("RecordHessian: Unsupported dtype: only FLOAT32 is supported."); + + _input_tensor = input_tensor; + + switch (node->opcode()) + { + case luci::CircleOpcode::FULLY_CONNECTED: + recordHessianForFullyConnected(node); + break; + case luci::CircleOpcode::CONV_2D: + recordHessianForConv2D(node); + break; + default: + throw std::runtime_error("RecordHessian: " + node->name() + " is unsupported op."); + } +} + +/** + * @brief Build a new map holding a copy of the accumulated hessian of every recorded node. + */ +std::unique_ptr HessianComputer::getMap() +{ + auto hessian_map = std::make_unique(); + + // NOTE(review): iterating by value copies each entry before its hessian is copied again below + for (auto item : _hessian_map) + { + auto &vec = (*hessian_map)[item.first]; + vec = 
item.second.hessian; + } + + return hessian_map; +} + +} // namespace record_hessian diff --git a/compiler/record-hessian/src/HessianComputer.test.cpp b/compiler/record-hessian/src/HessianComputer.test.cpp new file mode 100644 index 00000000000..d64ab99678d --- /dev/null +++ b/compiler/record-hessian/src/HessianComputer.test.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "record-hessian/HessianComputer.h" + +#include +#include + +#include + +#include + +using namespace record_hessian; + +TEST(HessianComputerTest, recordHessianValidInput) +{ + luci::CircleFullyConnected node; + + std::vector input_data = {1.0, 2.0, 3.0, 4.0}; + + luci_interpreter::DataType data_type = luci_interpreter::DataType::FLOAT32; + luci_interpreter::Shape shape({1, 4}); + luci_interpreter::AffineQuantization quantization; + + std::string tensor_name = "input_tensor"; + + luci_interpreter::Tensor input_tensor(data_type, shape, quantization, tensor_name); + + size_t data_size = input_data.size() * sizeof(float); + std::vector buffer(data_size); + + input_tensor.set_data_buffer(buffer.data()); + input_tensor.writeData(input_data.data(), data_size); + + HessianComputer computer; + + EXPECT_NO_THROW(computer.recordHessian(&node, &input_tensor)); +} + +TEST(HessianComputerTest, recordHessian_wrong_op_NEG) +{ + luci::CircleAdd node; + + std::vector input_data = {1.0, 2.0, 
3.0, 4.0}; + + luci_interpreter::DataType data_type = luci_interpreter::DataType::FLOAT32; + luci_interpreter::Shape shape({1, 2, 2, 1}); + luci_interpreter::AffineQuantization quantization; + + std::string tensor_name = "input_tensor"; + + luci_interpreter::Tensor input_tensor(data_type, shape, quantization, tensor_name); + + size_t data_size = input_data.size() * sizeof(float); + std::vector buffer(data_size); + + input_tensor.set_data_buffer(buffer.data()); + input_tensor.writeData(input_data.data(), data_size); + + HessianComputer computer; + + EXPECT_ANY_THROW(computer.recordHessian(&node, &input_tensor)); +} + +TEST(HessianComputerTest, recordHessianNullTensor_NEG) +{ + luci::CircleAdd node; + HessianComputer computer; + EXPECT_ANY_THROW(computer.recordHessian(&node, nullptr)); +} + +TEST(HessianComputerTest, unfoldValidInput) +{ + std::vector buf = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + uint32_t input_n = 1, input_h = 2, input_w = 2, input_c = 2; + uint32_t stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1; + uint32_t kernel_oc = 1, kernel_h = 2, kernel_w = 2, kernel_ic = 2; + + unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, dilation_w, + kernel_oc, kernel_h, kernel_w, kernel_ic); + std::vector expected_output = {1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0}; + + EXPECT_EQ(buf, expected_output); +} + +TEST(HessianComputerTest, unfoldInvalidInput_NEG) +{ + std::vector buf = {1.0, 2.0, 3.0, 4.0}; + uint32_t input_n = 1, input_h = 2, input_w = 2, input_c = 1; + uint32_t stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1; + uint32_t kernel_oc = 1, kernel_h = 2, kernel_w = 2, kernel_ic = 2; + + EXPECT_ANY_THROW(unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, + dilation_w, kernel_oc, kernel_h, kernel_w, kernel_ic)); +}