From 6287a5e0bf8b28ec0d463a7b2f23b4e1e610f89f Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh <hseok82.oh@samsung.com>
Date: Mon, 19 Aug 2024 14:39:44 +0900
Subject: [PATCH] Draft: blockwise quantization

Blockwise quantization: uint4 / int8 with a block size of 32 and an fp16 delta per block.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <hseok82.oh@samsung.com>
---
 runtime/onert/core/include/ir/DataType.h      |  2 ++
 runtime/onert/core/src/ir/DataType.cc         |  4 ++++
 runtime/onert/core/src/loader/BaseLoader.h    |  9 ++++++++-
 runtime/onert/core/src/loader/CircleLoader.cc | 14 ++++++++++++++
 4 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/runtime/onert/core/include/ir/DataType.h b/runtime/onert/core/include/ir/DataType.h
index 0ec0e07119e..b269af8effe 100644
--- a/runtime/onert/core/include/ir/DataType.h
+++ b/runtime/onert/core/include/ir/DataType.h
@@ -39,6 +39,8 @@ enum class DataType
   QUANT_INT16_ASYMM = 10,
   QUANT_INT8_SYMM_PER_CHANNEL = 11,
   QUANT_INT16_SYMM = 12,
+  QUANT_UINT4_SYMM_BLOCK = 13, // Blockwise quantization - for Gather input and FC weight
+  QUANT_INT8_SYMM_BLOCK = 14   // Blockwise quantization - for FC weight
 };
 
 size_t sizeOfDataType(DataType data_type);
diff --git a/runtime/onert/core/src/ir/DataType.cc b/runtime/onert/core/src/ir/DataType.cc
index 07670c72081..5a5ab705233 100644
--- a/runtime/onert/core/src/ir/DataType.cc
+++ b/runtime/onert/core/src/ir/DataType.cc
@@ -52,6 +52,10 @@ size_t sizeOfDataType(DataType data_type)
       return sizeof(int16_t);
     case DataType::QUANT_INT16_SYMM:
       return sizeof(int16_t);
+    case DataType::QUANT_UINT4_SYMM_BLOCK:
+      return sizeof(uint8_t) * 32 / 2 + sizeof(uint16_t);
+    case DataType::QUANT_INT8_SYMM_BLOCK:
+      return sizeof(uint8_t) * 32 + sizeof(uint16_t);
     default:
       throw std::runtime_error{"Unsupported type size"};
   }
diff --git a/runtime/onert/core/src/loader/BaseLoader.h b/runtime/onert/core/src/loader/BaseLoader.h
index c3a50b0d8c9..e66f90ef4fc 100644
--- a/runtime/onert/core/src/loader/BaseLoader.h
+++ b/runtime/onert/core/src/loader/BaseLoader.h
@@ -94,6 +94,7 @@ template <typename LoaderDomain> class BaseLoader
 
   // Helper functions
   ir::Activation convertActivation(ActivationFunctionType type);
+  virtual ir::DataType getTensorDataType(const Tensor *tensor);
   ir::DataType tensorTypeToDataType(TensorType type);
   ir::OperandIndex tensorIdxToOperandIdx(int32_t tensorIdx);
   flexbuffers::Map getCustomOpAttrMap(const Operator *op);
@@ -295,6 +296,12 @@ BaseLoader<LoaderDomain>::BaseLoader::convertActivation(const ActivationFunction
   }
 }
 
+template <typename LoaderDomain>
+ir::DataType BaseLoader<LoaderDomain>::BaseLoader::getTensorDataType(const Tensor *tensor)
+{
+  return tensorTypeToDataType(tensor->type()); // default: type field only; overridable
+}
+
 template <typename LoaderDomain>
 ir::DataType BaseLoader<LoaderDomain>::BaseLoader::tensorTypeToDataType(const TensorType type)
 {
@@ -381,7 +388,7 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
   //       be used.
 
   // TypeInfo
-  ir::TypeInfo type_info(tensorTypeToDataType(tensor->type()));
+  ir::TypeInfo type_info(getTensorDataType(tensor));
   loadQuantization(tensor, type_info);
   loadSparsity(tensor, type_info);
 
diff --git a/runtime/onert/core/src/loader/CircleLoader.cc b/runtime/onert/core/src/loader/CircleLoader.cc
index 1d502308bea..f0778ffe1bf 100644
--- a/runtime/onert/core/src/loader/CircleLoader.cc
+++ b/runtime/onert/core/src/loader/CircleLoader.cc
@@ -87,6 +87,20 @@ class CircleLoader final : public loader::BaseLoader<LoaderDomain>
     }
   }
 
+protected:
+  ir::DataType getTensorDataType(const Tensor *tensor) override
+  {
+    auto type = tensor->type();
+    // Workaround: a UINT4/INT8 tensor without quantization parameters is treated as
+    //             blockwise quantized; the per-block scale is stored in the tensor data
+    // TODO Handle custom quantization parameter to represent blockwise quantization
+    if (type == TensorType::TensorType_UINT4 && !tensor->quantization())
+      return ir::DataType::QUANT_UINT4_SYMM_BLOCK;
+    if (type == TensorType::TensorType_INT8 && !tensor->quantization())
+      return ir::DataType::QUANT_INT8_SYMM_BLOCK;
+    return tensorTypeToDataType(type);
+  }
+
 private:
   std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) override
   {