diff --git a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc
index 00c200a9286..8f6b4fa355f 100644
--- a/runtime/onert/core/src/backend/builtin/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc
@@ -103,9 +103,14 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
   // Add PermuteLayer
   std::vector<ITensor *> output_tensors{getTensor(output_index)};
   std::vector<ITensor *> input_tensors{getTensor(input_index)};
+  std::vector<ir::PermuteType> permute_types;
 
-  auto fn =
-    std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, _external_context);
+  // Layout in graph is always NHWC, so layout is not changed
+  for (uint32_t i = 0; i < input_tensors.size(); i++)
+    permute_types.emplace_back(ir::PermuteType::COPY);
+
+  auto fn = std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, permute_types,
+                                                   _external_context);
 
   _return_fn = std::move(fn);
 }
diff --git a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc
index d46f0fea516..1dfa20720f4 100644
--- a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc
@@ -31,26 +31,18 @@ namespace kernel
 
 PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
                            const std::vector<ITensor *> &dst_tensors,
+                           const std::vector<ir::PermuteType> &types,
                            const std::shared_ptr<ExternalContext> &external_context)
   : _external_context{external_context}, _tasks_map{}
 {
   assert(src_tensors.size() == dst_tensors.size());
+  assert(src_tensors.size() == types.size());
   _src_tensors = src_tensors;
   _dst_tensors = dst_tensors;
+  _permute_types = types;
   _src_tensors_offsets.resize(src_tensors.size());
   _dst_tensors_offsets.resize(dst_tensors.size());
   _permute_types.resize(src_tensors.size());
-
-  // TODO Get from constructor parameter
-  for (uint32_t i = 0; i < src_tensors.size(); i++)
-  {
-    if (src_tensors[i]->layout() == dst_tensors[i]->layout())
-      _permute_types[i] = ir::PermuteType::COPY;
-    else if (src_tensors[i]->layout() == ir::Layout::NHWC)
-      _permute_types[i] = ir::PermuteType::NHWC_TO_NCHW;
-    else
-      _permute_types[i] = ir::PermuteType::NCHW_TO_NHWC;
-  }
 }
 
 void PermuteLayer::optimize()
diff --git a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h
index cf25f5447c1..6ae6eb9bd62 100644
--- a/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h
+++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h
@@ -35,6 +35,7 @@ class PermuteLayer : public onert::exec::IPermuteFunction
 {
 public:
   PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors,
+               const std::vector<ir::PermuteType> &types,
                const std::shared_ptr<ExternalContext> &external_context);
 
   void optimize() override;
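For reference, the per-pair inference that both PermuteLayer constructors used to perform, and that this diff deletes in favor of an explicit constructor argument, boils down to the following self-contained sketch. The enums are minimal stand-ins for onert's ir::Layout and ir::PermuteType, and the function name is ours, not the runtime's:

```cpp
#include <cassert>

// Stand-ins for onert's ir::Layout and ir::PermuteType, for illustration only.
enum class Layout { NHWC, NCHW };
enum class PermuteType { COPY, NHWC_TO_NCHW, NCHW_TO_NHWC };

// Mirrors the removed "TODO Get from constructor parameter" block: pick a
// permute type by comparing the source and destination tensor layouts.
PermuteType inferPermuteType(Layout src, Layout dst)
{
  if (src == dst)
    return PermuteType::COPY;
  return src == Layout::NHWC ? PermuteType::NHWC_TO_NCHW : PermuteType::NCHW_TO_NHWC;
}

int main()
{
  assert(inferPermuteType(Layout::NHWC, Layout::NHWC) == PermuteType::COPY);
  assert(inferPermuteType(Layout::NCHW, Layout::NHWC) == PermuteType::NCHW_TO_NHWC);
  return 0;
}
```

Moving this decision to the call sites lets each caller state its intent directly; the kernel generator above always passes COPY because the graph-internal layout is uniformly NHWC.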
diff --git a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc
index 25941315db1..58835449640 100644
--- a/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc
@@ -84,10 +84,15 @@ void WhileLayer::run()
   std::vector<ITensor *> op_inputs(_input_tensors.begin(), _input_tensors.end());
   std::vector<ITensor *> op_outputs(_output_tensors.begin(), _output_tensors.end());
+  std::vector<ir::PermuteType> permute_types;
+  // Layout in graph is always NHWC, so layout is not changed
+  for (uint32_t i = 0; i < op_outputs.size(); i++)
+    permute_types.emplace_back(ir::PermuteType::COPY);
   // Copying body inputs to outputs when the loop body is never executed
   if (!getResultCond(cond_output_tensor.get()))
   {
-    PermuteLayer copy_body_inputs_to_op_outputs{op_inputs, op_outputs, _external_context};
+    PermuteLayer copy_body_inputs_to_op_outputs{op_inputs, op_outputs, permute_types,
+                                                _external_context};
     copy_body_inputs_to_op_outputs.run();
     return;
   }
 
@@ -105,7 +110,8 @@ void WhileLayer::run()
   }
   std::vector<ITensor *> body_outputs(temp_outputs.begin(), temp_outputs.end());
-  PermuteLayer copy_body_outputs_to_op_outputs{body_outputs, op_outputs, _external_context};
+  PermuteLayer copy_body_outputs_to_op_outputs{body_outputs, op_outputs, permute_types,
+                                               _external_context};
 
   const auto body_execute_with_op_inputs = [&]() {
     VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl;
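WhileLayer::run() builds one COPY-filled vector and reuses it for both PermuteLayer instances, which works because op_inputs, body_outputs, and op_outputs all have the same size. As a side note, the emplace_back loop is equivalent to std::vector's fill constructor; a minimal sketch with a stand-in enum:

```cpp
#include <cstddef>
#include <vector>

// Stand-in for onert's ir::PermuteType, for illustration only.
enum class PermuteType { COPY, NHWC_TO_NCHW, NCHW_TO_NHWC };

// One COPY entry per tensor pair, same result as the loop added in the diff.
std::vector<PermuteType> makeCopyTypes(std::size_t count)
{
  return std::vector<PermuteType>(count, PermuteType::COPY);
}
```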
diff --git a/runtime/onert/core/src/backend/builtin/train/KernelGenerator.cc b/runtime/onert/core/src/backend/builtin/train/KernelGenerator.cc
index 32032de4a7c..b162d05802d 100644
--- a/runtime/onert/core/src/backend/builtin/train/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/builtin/train/KernelGenerator.cc
@@ -61,12 +61,17 @@ void KernelGenerator::visit(const ir::train::operation::Permute &node)
 
   std::vector<ITensor *> output_back_prop_tensors;
   std::vector<ITensor *> input_back_prop_tensors;
+  std::vector<ir::PermuteType> permute_types;
 
   auto input_back_prop_tensor = getBackPropTensor(input_index);
   auto output_back_prop_tensor = getBackPropTensor(output_index);
   output_back_prop_tensors.emplace_back(output_back_prop_tensor);
   input_back_prop_tensors.emplace_back(input_back_prop_tensor);
 
+  // Layout in graph is always NHWC, so layout is not changed
+  for (uint32_t i = 0; i < input_tensors.size(); i++)
+    permute_types.emplace_back(ir::PermuteType::COPY);
+
   // NOTE The output buffers of IOTensors are not essential for training. If there
   //      is no output buffer provided by the user, permute is not performed.
   bool ignore_forward_in_training = false;
@@ -77,7 +82,7 @@ void KernelGenerator::visit(const ir::train::operation::Permute &node)
   }
 
   auto fn = std::make_unique<kernel::PermuteLayer>(
-    input_tensors, output_tensors, input_back_prop_tensors, output_back_prop_tensors,
+    input_tensors, output_tensors, input_back_prop_tensors, output_back_prop_tensors, permute_types,
     ignore_forward_in_training, _external_context);
 
   _return_fn = std::move(fn);
diff --git a/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.cc
index 495b51a85e3..28a00ed3e69 100644
--- a/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.cc
@@ -33,9 +33,10 @@ PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
                            const std::vector<ITensor *> &dst_tensors,
                            const std::vector<ITensor *> &input_back_prop_tensors,
                            const std::vector<ITensor *> &output_back_prop_tensors,
+                           const std::vector<ir::PermuteType> &types,
                            bool ignore_forward_in_training,
                            const std::shared_ptr<ExternalContext> &external_context)
-  : builtin::kernel::PermuteLayer{src_tensors, dst_tensors, external_context},
+  : builtin::kernel::PermuteLayer{src_tensors, dst_tensors, types, external_context},
     _input_back_prop_tensors{input_back_prop_tensors},
     _output_back_prop_tensors{output_back_prop_tensors},
     _ignore_forward_in_training{ignore_forward_in_training}
diff --git a/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.h
index 1dc221b0910..d3b92021fce 100644
--- a/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.h
+++ b/runtime/onert/core/src/backend/builtin/train/kernel/PermuteLayer.h
@@ -38,7 +38,7 @@ class PermuteLayer : public builtin::kernel::PermuteLayer, public exec::train::ITrainableFunction
   PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors,
                const std::vector<ITensor *> &input_back_prop_tensors,
                const std::vector<ITensor *> &output_back_prop_tensors,
-               bool ignore_forward_in_training,
+               const std::vector<ir::PermuteType> &types, bool ignore_forward_in_training,
                const std::shared_ptr<ExternalContext> &external_context);
 
   void optimize() override;
diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h
index 96ac3f35389..517d0dc6bee 100644
--- a/runtime/onert/core/src/exec/IPermuteFunction.h
+++ b/runtime/onert/core/src/exec/IPermuteFunction.h
@@ -252,23 +252,14 @@ class PermuteLayer : public onert::exec::IPermuteFunction
 {
 public:
   PermuteLayer(const std::vector<backend::ITensor *> &inputs,
-               const std::vector<backend::ITensor *> &outputs)
+               const std::vector<backend::ITensor *> &outputs,
+               const std::vector<ir::PermuteType> &types)
   {
     assert(inputs.size() == outputs.size());
+    assert(inputs.size() == types.size());
     _src_tensors = inputs;
     _dst_tensors = outputs;
-    _permute_types.resize(inputs.size());
-
-    // TODO Get from constructor parameter
-    for (uint32_t i = 0; i < inputs.size(); i++)
-    {
-      if (inputs[i]->layout() == outputs[i]->layout())
-        _permute_types[i] = ir::PermuteType::COPY;
-      else if (inputs[i]->layout() == ir::Layout::NHWC)
-        _permute_types[i] = ir::PermuteType::NHWC_TO_NCHW;
-      else
-        _permute_types[i] = ir::PermuteType::NCHW_TO_NHWC;
-    }
+    _permute_types = types;
   }
   virtual ~PermuteLayer() {}
   void optimize() override {}
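With the inference gone, exec::PermuteLayer and every other caller now name the permute type explicitly. For readers new to the codebase, the two non-COPY types transpose a 4-D tensor between channels-first and channels-last order; a self-contained sketch of NCHW_TO_NHWC on a dense float buffer (the real IPermuteFunction additionally handles strides, dynamic shapes, and type-aware quantization):

```cpp
#include <cstdint>
#include <vector>

// Dense NCHW -> NHWC relayout: dst[n][h][w][c] = src[n][c][h][w].
std::vector<float> nchwToNhwc(const std::vector<float> &src, uint32_t n, uint32_t c, uint32_t h,
                              uint32_t w)
{
  std::vector<float> dst(src.size());
  for (uint32_t in = 0; in < n; ++in)
    for (uint32_t ic = 0; ic < c; ++ic)
      for (uint32_t ih = 0; ih < h; ++ih)
        for (uint32_t iw = 0; iw < w; ++iw)
          dst[((in * h + ih) * w + iw) * c + ic] = src[((in * c + ic) * h + ih) * w + iw];
  return dst;
}
```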
diff --git a/runtime/onert/core/src/exec/MultiModelExecutors.cc b/runtime/onert/core/src/exec/MultiModelExecutors.cc
index 45fc4157454..011d500d6c3 100644
--- a/runtime/onert/core/src/exec/MultiModelExecutors.cc
+++ b/runtime/onert/core/src/exec/MultiModelExecutors.cc
@@ -186,6 +186,7 @@ void MultiModelExecutors::createEdgeQuantLayers()
 
       std::vector<backend::ITensor *> inputs;
       std::vector<backend::ITensor *> outputs;
+      std::vector<ir::PermuteType> permute_types;
       for (const auto &[from_iodesc, to_list] : _edge_map)
       {
         if (std::get<ir::ModelIndex>(from_iodesc) == model_index &&
@@ -212,13 +213,16 @@ void MultiModelExecutors::createEdgeQuantLayers()
               auto type_aware_quant_tensor = std::make_unique<EdgeTensor>(to_info, to_layout);
               outputs.emplace_back(type_aware_quant_tensor.get());
 
+              // No layout change on edge
+              permute_types.emplace_back(ir::PermuteType::COPY);
+
               _edge_quant_tensors[to_iodesc] = std::move(type_aware_quant_tensor);
             }
           }
         }
       }
 
-      auto layer = std::make_unique<PermuteLayer>(inputs, outputs);
+      auto layer = std::make_unique<PermuteLayer>(inputs, outputs, permute_types);
       layer->prepare();
       _edge_quant_layers[{model_index, subg_index}] = std::move(layer);
    }
@@ -282,6 +286,7 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
     }
     std::vector<backend::ITensor *> src_tensors;
     std::vector<backend::ITensor *> dst_tensors;
+    std::vector<ir::PermuteType> permute_types;
     for (const auto &pkg_input : pkg_inputs)
     {
       const auto &io_index = std::get<ir::IOIndex>(pkg_input);
@@ -294,7 +299,8 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
       // Create EdgeTensor for nnpkg input if type is different
       const auto &orig_info = executor->inputInfo(io_index.value());
       const auto orig_layout = executor->inputLayout(io_index.value());
-      if (input_desc->info.typeInfo().type() != orig_info.typeInfo().type())
+      if ((input_desc->info.typeInfo().type() != orig_info.typeInfo().type()) ||
+          (input_desc->layout == ir::Layout::NCHW))
       {
         auto pkg_input_edge_tensor = std::make_unique<EdgeTensor>(orig_info, orig_layout);
         _pkg_input_quant_tensors[pkg_input] = std::move(pkg_input_edge_tensor);
@@ -302,11 +308,16 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
         // Append type-aware quantization layer's inputs/outputs
         src_tensors.emplace_back(_pkg_input_tensors[pkg_input].get());
         dst_tensors.emplace_back(_pkg_input_quant_tensors[pkg_input].get());
+
+        if (input_desc->layout == ir::Layout::NCHW)
+          permute_types.emplace_back(ir::PermuteType::NCHW_TO_NHWC);
+        else
+          permute_types.emplace_back(ir::PermuteType::COPY);
       }
     }
 
     // Create type-aware quantization layer for nnpkg inputs
-    auto pkg_input_layer = std::make_unique<PermuteLayer>(src_tensors, dst_tensors);
+    auto pkg_input_layer = std::make_unique<PermuteLayer>(src_tensors, dst_tensors, permute_types);
     pkg_input_layer->prepare();
     _pkg_input_quant_layers[{model_index, subg_index}] = std::move(pkg_input_layer);
 
@@ -322,6 +333,7 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
     }
     src_tensors.clear();
     dst_tensors.clear();
+    permute_types.clear();
     // Create Tensors of nnpkg outputs for type-aware quantization
     for (const auto &pkg_output : pkg_outputs)
     {
@@ -335,7 +347,8 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
       // Create EdgeTensor for nnpkg output if type is different
       const auto &orig_info = executor->outputInfo(io_index.value());
       const auto orig_layout = executor->outputLayout(io_index.value());
-      if (output_desc->info.typeInfo().type() != orig_info.typeInfo().type())
+      if ((output_desc->info.typeInfo().type() != orig_info.typeInfo().type()) ||
+          (output_desc->layout == ir::Layout::NCHW))
       {
         auto pkg_output_edge_tensor = std::make_unique<EdgeTensor>(orig_info, orig_layout);
         _pkg_output_quant_tensors[pkg_output] = std::move(pkg_output_edge_tensor);
@@ -343,11 +356,16 @@ void MultiModelExecutors::createPkgIOQuantLayers(const IODescription &desc)
         // Append type-aware quantization layer's inputs/outputs
         src_tensors.emplace_back(_pkg_output_quant_tensors[pkg_output].get());
         dst_tensors.emplace_back(_pkg_output_tensors[pkg_output].get());
+
+        if (output_desc->layout == ir::Layout::NCHW)
+          permute_types.emplace_back(ir::PermuteType::NHWC_TO_NCHW);
+        else
+          permute_types.emplace_back(ir::PermuteType::COPY);
       }
     }
 
     // Create type-aware quantization layer for nnpkg outputs
-    auto pkg_output_layer = std::make_unique<PermuteLayer>(src_tensors, dst_tensors);
+    auto pkg_output_layer = std::make_unique<PermuteLayer>(src_tensors, dst_tensors, permute_types);
     pkg_output_layer->prepare();
     _pkg_output_quant_layers[{model_index, subg_index}] = std::move(pkg_output_layer);
   }
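The MultiModelExecutors hunks above encode a deliberate asymmetry: nnpkg inputs get NCHW_TO_NHWC while nnpkg outputs get NHWC_TO_NCHW, because the graph-internal layout is always NHWC and only the user-facing buffers may be NCHW. A compact sketch of that decision (stand-in enums, hypothetical function name):

```cpp
// Stand-ins for onert's ir::Layout and ir::PermuteType, for illustration only.
enum class Layout { NHWC, NCHW };
enum class PermuteType { COPY, NHWC_TO_NCHW, NCHW_TO_NHWC };

// Inputs convert user layout -> NHWC; outputs convert NHWC -> user layout.
PermuteType ioPermuteType(bool is_pkg_input, Layout user_layout)
{
  if (user_layout != Layout::NCHW)
    return PermuteType::COPY;
  return is_pkg_input ? PermuteType::NCHW_TO_NHWC : PermuteType::NHWC_TO_NCHW;
}
```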
diff --git a/runtime/onert/core/src/exec/SingleModelExecutors.cc b/runtime/onert/core/src/exec/SingleModelExecutors.cc
index 44c5e57425b..fcb8a371175 100644
--- a/runtime/onert/core/src/exec/SingleModelExecutors.cc
+++ b/runtime/onert/core/src/exec/SingleModelExecutors.cc
@@ -66,10 +66,12 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
   // Vector for input quantization I/O
   std::vector<backend::ITensor *> input_tensors;
   std::vector<backend::ITensor *> input_qtensors;
+  std::vector<ir::PermuteType> input_permute_types;
 
   // Vector for output dequantization I/O
   std::vector<backend::ITensor *> output_qtensors;
   std::vector<backend::ITensor *> output_tensors;
+  std::vector<ir::PermuteType> output_permute_types;
 
   // Prepare UserTensor and EdgeTensor for input quantization
   for (uint32_t i = 0; i < inputs.size(); i++)
@@ -87,7 +89,8 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
     auto user_type = desc->info.typeInfo().type();
     auto &model_info = entryExecutor()->inputInfo(i).typeInfo();
     auto model_type = model_info.type();
-    if (user_type != model_type && user_type == ir::DataType::FLOAT32)
+    if ((user_type != model_type && user_type == ir::DataType::FLOAT32) ||
+        (desc->layout == ir::Layout::NCHW))
     {
       auto quantized_info = desc->info;
       quantized_info.typeInfo(model_info);
@@ -98,6 +101,10 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
       input_tensors.push_back(tensorpool.back().get());
       input_qtensors.push_back(qtensorpool.back().get());
       inputs[i] = qtensorpool.back().get();
+      if (desc->layout == ir::Layout::NCHW)
+        input_permute_types.push_back(ir::PermuteType::NCHW_TO_NHWC);
+      else
+        input_permute_types.push_back(ir::PermuteType::COPY);
     }
     else
       inputs[i] = tensorpool.back().get();
@@ -118,7 +125,8 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
     auto user_type = desc->info.typeInfo().type();
     auto &model_info = entryExecutor()->outputInfo(i).typeInfo();
     auto model_type = model_info.type();
-    if (user_type != model_type && user_type == ir::DataType::FLOAT32)
+    if ((user_type != model_type && user_type == ir::DataType::FLOAT32) ||
+        (desc->layout == ir::Layout::NCHW))
     {
       auto quantized_info = desc->info;
       quantized_info.typeInfo(model_info);
@@ -129,6 +137,10 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
       output_qtensors.push_back(qtensorpool.back().get());
       output_tensors.push_back(tensorpool.back().get());
       outputs[i] = qtensorpool.back().get();
+      if (desc->layout == ir::Layout::NCHW)
+        output_permute_types.push_back(ir::PermuteType::NHWC_TO_NCHW);
+      else
+        output_permute_types.push_back(ir::PermuteType::COPY);
     }
     else
       outputs[i] = tensorpool.back().get();
@@ -137,7 +149,7 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
   // Run quantization
   if (input_tensors.size() > 0)
   {
-    auto input_quantize_layer = PermuteLayer(input_tensors, input_qtensors);
+    auto input_quantize_layer = PermuteLayer(input_tensors, input_qtensors, input_permute_types);
     input_quantize_layer.prepare();
     input_quantize_layer.run();
   }
@@ -148,7 +160,8 @@ void SingleModelExecutors::execute(const ExecutionContext &ctx)
   // Run dequantization
   if (output_tensors.size() != 0)
   {
-    auto output_dequantize_layer = PermuteLayer(output_qtensors, output_tensors);
+    auto output_dequantize_layer =
+      PermuteLayer(output_qtensors, output_tensors, output_permute_types);
     output_dequantize_layer.prepare();
     output_dequantize_layer.run();
   }
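End to end, this path is driven by the layout the user declares for each I/O buffer. A usage sketch through the public nnfw C API, assuming nnfw_set_input_layout() is what populates the desc->layout field checked in SingleModelExecutors::execute() (error handling elided):

```cpp
#include <nnfw.h>

// Feed a channels-first (NCHW) float buffer to a model whose graph-internal
// layout is NHWC; the runtime inserts an NCHW_TO_NHWC PermuteLayer.
void run_with_nchw_input(const char *pkg_path, float *in, size_t in_bytes, float *out,
                         size_t out_bytes)
{
  nnfw_session *session = nullptr;
  nnfw_create_session(&session);
  nnfw_load_model_from_file(session, pkg_path);
  nnfw_prepare(session);

  // Declare the caller's buffer as channels-first.
  nnfw_set_input_layout(session, 0, NNFW_LAYOUT_CHANNELS_FIRST);

  nnfw_set_input(session, 0, NNFW_TYPE_TENSOR_FLOAT32, in, in_bytes);
  nnfw_set_output(session, 0, NNFW_TYPE_TENSOR_FLOAT32, out, out_bytes);
  nnfw_run(session);
  nnfw_close_session(session);
}
```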