diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst index 4bf6a80d65a..e6ee1dc8de1 100644 --- a/compiler/circle2circle-dredd-recipe-test/test.lst +++ b/compiler/circle2circle-dredd-recipe-test/test.lst @@ -48,6 +48,10 @@ Add(Net_DwConv_BN_000 PASS fuse_batchnorm_with_dwconv) Add(Net_DwConv_BN_001 PASS fuse_batchnorm_with_dwconv) Add(Net_FC_Gelu_FC_000 PASS replace_with_fc_gelu_fc) Add(Net_FullyConnected_Add_000 PASS fold_fully_connected) +Add(Net_FullyConnected_Mul_000 PASS fuse_mul_with_fullyconnected) +Add(Net_FullyConnected_Mul_001 PASS fuse_mul_with_fullyconnected) +Add(Net_FullyConnected_Mul_002 PASS fuse_mul_with_fullyconnected) +Add(Net_FullyConnected_Mul_003 PASS fuse_mul_with_fullyconnected) Add(Net_Gelu_000 PASS fuse_gelu) Add(Net_Gelu_001 PASS fuse_gelu) Add(Net_Horizontal_FullyConnected_Add_000 PASS fuse_horizontal_fc_layers) diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp index 757c368f31d..ccea8c65788 100644 --- a/compiler/circle2circle/src/Circle2Circle.cpp +++ b/compiler/circle2circle/src/Circle2Circle.cpp @@ -118,6 +118,8 @@ int entry(int argc, char **argv) "This will fuse Mul operation with a preceding Conv if possible."); add_switch(arser, "--fuse_mul_with_div", "This will fuse Mul operation with a Div operation whose numerator is const."); + add_switch(arser, "--fuse_mul_with_fullyconnected", + "This will fuse Mul operator with a preceding FullyConnected operator."); add_switch(arser, "--fuse_slice_with_tconv", "This will fuse Slice operation with a preceding TConv if possible."); add_switch(arser, "--fuse_transpose_with_mean", @@ -326,6 +328,8 @@ int entry(int argc, char **argv) options->enable(Algorithms::FuseMulWithConv); if (arser.get("--fuse_mul_with_div")) options->enable(Algorithms::FuseMulWithDiv); + if (arser.get("--fuse_mul_with_fullyconnected")) + options->enable(Algorithms::FuseMulWithFullyConnected); if 
(arser.get("--make_batchnorm_gamma_positive")) options->enable(Algorithms::MakeBatchNormGammaPositive); if (arser.get("--fuse_preactivation_batchnorm")) diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst index e2c37517486..d610f980bf2 100644 --- a/compiler/luci-pass-value-py-test/test.lst +++ b/compiler/luci-pass-value-py-test/test.lst @@ -33,6 +33,10 @@ eval(Net_Dequantize_Add_000 fold_dequantize) eval(Net_DwConv_BN_000 fuse_batchnorm_with_dwconv) eval(Net_DwConv_BN_001 fuse_batchnorm_with_dwconv) eval(Net_FullyConnected_Add_000 fold_fully_connected) +eval(Net_FullyConnected_Mul_000 fuse_mul_with_fullyconnected) +eval(Net_FullyConnected_Mul_001 fuse_mul_with_fullyconnected) +eval(Net_FullyConnected_Mul_002 fuse_mul_with_fullyconnected) +eval(Net_FullyConnected_Mul_003 fuse_mul_with_fullyconnected) eval(Net_Horizontal_FullyConnected_Add_000 fuse_horizontal_fc_layers) eval(Net_InstanceNorm_001 fuse_instnorm) eval(Net_InstanceNorm_002 fuse_instnorm) diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h index 9cbd26f0da5..8a1eb6d4f78 100644 --- a/compiler/luci/pass/include/luci/CircleOptimizer.h +++ b/compiler/luci/pass/include/luci/CircleOptimizer.h @@ -49,6 +49,7 @@ class CircleOptimizer final FuseMeanWithMean, FuseMulWithConv, FuseMulWithDiv, + FuseMulWithFullyConnected, FuseTransposeWithMean, ResolveCustomOpAdd, ResolveCustomOpBatchMatMul, diff --git a/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h new file mode 100644 index 00000000000..718039f1c69 --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/FuseMulWithFullyConnectedPass.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__ +#define __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to fuse Mul into CircleFullyConnected + */ +struct FuseMulWithFullyConnectedPass final : public logo::Pass +{ + const char *name(void) const final { return "luci::FuseMulWithFullyConnectedPass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_FUSE_MUL_WITH_FULLYCONNECTED_PASS_H__ diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index 840c8dd25dd..e4bf84eeef9 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -48,6 +48,7 @@ #include "luci/Pass/FuseMeanWithMeanPass.h" #include "luci/Pass/FuseMulWithConvPass.h" #include "luci/Pass/FuseMulWithDivPass.h" +#include "luci/Pass/FuseMulWithFullyConnectedPass.h" #include "luci/Pass/FusePreActivationBatchNormPass.h" #include "luci/Pass/FusePReluPass.h" #include "luci/Pass/FuseGeluPass.h" @@ -278,6 +279,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); + if (_options->query(Options::Algorithm::FuseMulWithFullyConnected)) + { + phase.emplace_back(std::make_unique()); + } if (_options->query(Options::Algorithm::CommonSubExpressionElimination)) { 
phase.emplace_back(std::make_unique()); @@ -310,6 +315,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const { phase.emplace_back(std::make_unique()); } + if (_options->query(Options::Algorithm::FuseMulWithFullyConnected)) + { + phase.emplace_back(std::make_unique()); + } if (_options->query(Options::Algorithm::ResolveCustomOpMaxPoolWithArgmax)) { phase.emplace_back(std::make_unique()); diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp new file mode 100644 index 00000000000..d4fb75953ed --- /dev/null +++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/FuseMulWithFullyConnectedPass.h" + +#include "helpers/NodeFiller.h" + +#include +#include +#include + +namespace +{ + +#define RETURN_FALSE_UNLESS(cond) \ + if (not(cond)) \ + return false; + +inline bool is_single_element(const luci::CircleConst *node) +{ + return ((node->rank() == 1 || node->rank() == 0) && node->size() == 1); +} + +inline void update_with_single_element(luci::CircleConst *fused_node, + const luci::CircleConst *multiplication) +{ + for (uint32_t i = 0; i < fused_node->size(); i++) + { + fused_node->at(i) *= multiplication->at(0); + } +} + +luci::CircleConst *gen_fused_weights(luci::CircleConst *weights, + const luci::CircleConst *multiplication) +{ + auto fused_weights = luci::clone(weights); + // Single element multiplication: + if (is_single_element(multiplication)) + { + update_with_single_element(fused_weights, multiplication); + } + // N-size multiplication: + else + { + // Go along channels, multiplication size is ensured to be compatible with channels. + auto count = fused_weights->dim(0).value(); + auto size = fused_weights->dim(fused_weights->rank() - 1).value(); + float val; + for (uint32_t c = 0; c < count; c++) + { + val = multiplication->at(c); + for (uint32_t i = 0; i < size; i++) + { + fused_weights->at(c * size + i) *= val; + } + } + } + return fused_weights; +} + +luci::CircleConst *gen_fused_bias(luci::CircleConst *bias, const luci::CircleConst *multiplication) +{ + auto fused_bias = luci::clone(bias); + // Single element multiplication: + if (is_single_element(multiplication)) + { + update_with_single_element(fused_bias, multiplication); + } + // N-size multiplication: + else + { + // Go along channels, multiplication size is ensured to be compatible with channels. 
+ for (uint32_t i = 0; i < fused_bias->size(); i++) + { + fused_bias->at(i) *= multiplication->at(i); + } + } + return fused_bias; +} + +/** + * Fuse Mul to FullyConnected if the multiplied value is a channel(last dimension)-wise constant + * + * BEFORE + * | + * [CircleFullyConnected] + * | + * [CircleMul] + * | + * + * AFTER + * | + * [CircleFullyConnected] [CircleMul] (dead) + * | + * + */ +bool fuse_mul_with_fc(luci::CircleMul *mul) +{ + // Sanity check: + RETURN_FALSE_UNLESS(mul); + // Allow Mul node only with FLOAT32 data type: + RETURN_FALSE_UNLESS(mul->dtype() == loco::DataType::FLOAT32); + // Check if any FC node connects to Mul. + // Find the pattern of Mul(FC, CircleConst): + luci::CircleFullyConnected *fc = nullptr; + luci::CircleConst *multiplication = nullptr; + RETURN_FALSE_UNLESS(luci::fill(&fc, &multiplication).with_commutative_args_of(mul)); + /** + * Make sure that FullyConnected has only one successor. + * + * If the FullyConnected output is connected to more nodes, + * this pass will replace node with new fused FullyConnected. + * Thus pass success will only introduce extra FullyConnected + * without reducing overall number of nodes. + * Which tends to increase model's size and degrades model's performance. + * Thus one successor is required to benefit from this pass. + * + * Example graph that illustrates the described scenario: + * + * BEFORE + * | + * [CircleFullyConnected] + * | + * +-------+----------------+ + * | | + * | | + * [Other Node] [CircleMul] + * | | + * + * AFTER + * | + * [CircleFullyConnected] + * | + * +-------+-----------------------+ + * | | + * | | + * [Other Node] [New CircleFullyConnected Fused with Mul] + * | | + * + */ + RETURN_FALSE_UNLESS(loco::succs(fc).size() == 1); + // Allow only FLOAT32 data type: + RETURN_FALSE_UNLESS(fc->dtype() == loco::DataType::FLOAT32); + // Allow only without activation functions as values are going to + // be multiplied before activation function. 
+ RETURN_FALSE_UNLESS(fc->fusedActivationFunction() == luci::FusedActFunc::NONE); + // Check for weights being Constant: + auto weights = dynamic_cast(fc->weights()); + RETURN_FALSE_UNLESS(weights); + // Get rank of multiplication: + auto rank = multiplication->rank(); + // Check that all dimensions are ones, checks broadcast capabilities. + // Last dimension of multiplication must be compatible with FC. + // N-D case (N>1): + if (multiplication->rank() > 1) + { + // Check channel-wise broadcasting: + for (uint32_t i = 0; i < rank - 1; i++) + RETURN_FALSE_UNLESS(multiplication->dim(i).value() == 1); + // Check the last dimension of Mul is the same with the first dimension of FullyConnected + RETURN_FALSE_UNLESS(multiplication->dim(rank - 1) == weights->dim(0)); + } + // 1-D or scalar case: + else if (multiplication->rank() == 1) + { + RETURN_FALSE_UNLESS(multiplication->size() == 1 || + multiplication->size() == weights->dim(0)); + } + else if (multiplication->rank() == 0) + { + RETURN_FALSE_UNLESS(multiplication->size() == 1); + } + + // Only supports: + // (1) constant bias + // (2) no bias + auto bias = loco::must_cast(fc->bias()); + if (bias->opcode() == luci::CircleOpcode::CIRCLECONST) + { + // Create new bias to be updated with values: + auto const_bias = dynamic_cast(fc->bias()); + RETURN_FALSE_UNLESS(const_bias) + RETURN_FALSE_UNLESS(const_bias->dtype() == loco::DataType::FLOAT32); + // Create new bias with updated values and replace: + auto fused_bias = gen_fused_bias(const_bias, multiplication); + fc->bias(fused_bias); + } + else if (bias->opcode() != luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE) + { + return false; + } + + // Create new weights with updated values and replace: + auto fused_weights = gen_fused_weights(weights, multiplication); + fc->weights(fused_weights); + + // Set origin and copy Activation Function if existing: + fc->fusedActivationFunction(mul->fusedActivationFunction()); + luci::add_origin(fc, luci::get_origin(mul)); + +
replace(mul).with(fc); + + return true; +} + +} // namespace + +namespace luci +{ + +bool FuseMulWithFullyConnectedPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + if (auto mul = dynamic_cast(node)) + { + if (fuse_mul_with_fc(mul)) + changed = true; + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp new file mode 100644 index 00000000000..a4f9d6bf087 --- /dev/null +++ b/compiler/luci/pass/src/FuseMulWithFullyConnectedPass.test.cpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/FuseMulWithFullyConnectedPass.h" +#include "helpers/CreateCircleConst.h" + +#include +#include + +#include + +#define DIM_ONE 8 +#define DIM_TWO 4 +#define MUL_VAL 2.0f + +namespace +{ + +using namespace luci::test; + +/** + * Graph for this test + * + * BEFORE (without extra_fc_successor) + * + * [FC] + * | + * [Mul w/ Relu] + * + * BEFORE (with extra_fc_successor) + * + * [FC] + * | + * |------------------- + * | | + * | | + * [Mul w/ Relu] [other FC] + * + * AFTER (if pass applied) + * + * [FC w/ Relu] (weights and bias updated) + * + */ +class FCMulGraphlet +{ +public: + void init(loco::Graph *g, luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias, + bool extra_successor) + { + _fc = g->nodes()->create(); + + std::vector weights_val(DIM_ONE * DIM_TWO); + for (uint32_t i = 0; i < DIM_ONE * DIM_TWO; i++) + weights_val.at(i) = i; + + _fc_f = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val); + _fc->weights(_fc_f); + + if (use_bias) + { + std::vector bias_val(DIM_ONE); + for (uint32_t i = 0; i < DIM_ONE; i++) + bias_val.at(i) = i; + + _fc_b = luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE}, bias_val); + } + else + { + // Create CircleOutputExclude -- no bias + _fc_b = g->nodes()->create(); + } + _fc->bias(_fc_b); + + _fc->fusedActivationFunction(fc_activation); + _fc->dtype(loco::DataType::FLOAT32); + _fc->shape({1, DIM_ONE}); + _fc->name("fc"); + + if (extra_successor) + { + _extra_succ = g->nodes()->create(); + // Set previous FC as input to bump number of successors for it: + _extra_succ->input(_fc); + std::vector weights_val(DIM_ONE * DIM_TWO); + _extra_f = + luci::create_const_node(g, loco::DataType::FLOAT32, {DIM_ONE, DIM_TWO}, weights_val); + _extra_succ->weights(_extra_f); + _extra_succ->bias(nullptr); + _extra_succ->fusedActivationFunction(luci::FusedActFunc::NONE); + _extra_succ->dtype(loco::DataType::FLOAT32); + _extra_succ->shape({1, DIM_ONE}); + 
_extra_succ->name("extra_fc"); + } + + std::vector mul_values; + + if (is_mul_scalar) + { + mul_values.push_back(static_cast(MUL_VAL)); + _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {}, mul_values); + } + else + { + for (uint32_t i = 0; i < DIM_ONE; i++) + { + mul_values.push_back(static_cast(i)); + } + _mul_c = luci::create_const_node(g, loco::DataType::FLOAT32, {1, 1, 1, DIM_ONE}, mul_values); + } + + _mul = g->nodes()->create(); + _mul->x(_fc); + _mul->y(_mul_c); + _mul->fusedActivationFunction(luci::FusedActFunc::RELU); + _mul->dtype(loco::DataType::FLOAT32); + if (is_mul_scalar) + { + _mul->shape({1, DIM_ONE}); + } + else + { + _mul->shape({1, 1, 1, DIM_ONE}); + } + _mul->name("mul"); + } + +public: + luci::CircleFullyConnected *fc() { return _fc; } + + void to_fm_bias(void) + { + assert(_fc != nullptr); + + auto new_fc = _fc->graph()->nodes()->create(); + _fc->bias(new_fc); + } + +protected: + luci::CircleFullyConnected *_fc = nullptr; + luci::CircleMul *_mul = nullptr; + luci::CircleConst *_fc_f = nullptr; + luci::CircleNode *_fc_b = nullptr; + luci::CircleConst *_mul_c = nullptr; + luci::CircleFullyConnected *_extra_succ = nullptr; + luci::CircleConst *_extra_f = nullptr; +}; + +class FuseMulWithFCTestGraph : public TestIOGraph, public FCMulGraphlet +{ +public: + void init(luci::FusedActFunc fc_activation, bool is_mul_scalar, bool use_bias, + bool extra_successor) + { + TestIOGraph::init({1, DIM_TWO}, {1, DIM_ONE}); + FCMulGraphlet::init(g(), fc_activation, is_mul_scalar, use_bias, extra_successor); + + _fc->input(input()); + + output()->from(_mul); + } +}; + +class FuseMulWithFullyConnectedPassTest : public ::testing::Test +{ +public: + FuseMulWithFCTestGraph g; + luci::FuseMulWithFullyConnectedPass pass; +}; + +} // namespace + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_tensor) +{ + g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */, + false /* extra_successor */); + + EXPECT_EQ(true, pass.run(g.g())); 
+ + auto fc = dynamic_cast(g.output()->from()); + EXPECT_NE(nullptr, fc); + + auto weights = loco::must_cast(g.fc()->weights()); + auto weights_n = weights->dim(0).value(); + auto weights_m = weights->dim(1).value(); + uint32_t offset = 0; + for (uint32_t i = 0; i < weights_n; i++) + { + for (uint32_t j = 0; j < weights_m; j++) + { + offset = i * weights_m + j; + EXPECT_EQ(i * offset, weights->at(offset)); + } + } + + auto bias = loco::must_cast(g.fc()->bias()); + for (uint32_t i = 0; i < bias->size(); i++) + { + EXPECT_EQ(i * i, bias->at(i)); + } +} + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_mul_scalar) +{ + g.init(luci::FusedActFunc::NONE, true /* is_mul_scalar */, true /* use_bias */, + false /* extra_successor */); + + EXPECT_EQ(true, pass.run(g.g())); + + auto fc = dynamic_cast(g.output()->from()); + EXPECT_NE(nullptr, fc); + + auto weights = loco::must_cast(g.fc()->weights()); + auto weights_n = weights->dim(0).value(); + auto weights_m = weights->dim(1).value(); + uint32_t offset = 0; + for (uint32_t i = 0; i < weights_n; i++) + { + for (uint32_t j = 0; j < weights_m; j++) + { + offset = i * weights_m + j; + EXPECT_EQ(MUL_VAL * offset, weights->at(offset)); + } + } + + auto bias = loco::must_cast(g.fc()->bias()); + for (uint32_t i = 0; i < bias->size(); i++) + { + EXPECT_EQ(MUL_VAL * i, bias->at(i)); + } +} + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_no_bias) +{ + g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, false /* use_bias */, + false /* extra_successor */); + + EXPECT_EQ(true, pass.run(g.g())); + + auto fc = dynamic_cast(g.output()->from()); + EXPECT_NE(nullptr, fc); + auto no_bias = dynamic_cast(fc->bias()); + ASSERT_NE(nullptr, no_bias); + + auto weights = loco::must_cast(g.fc()->weights()); + auto weights_n = weights->dim(0).value(); + auto weights_m = weights->dim(1).value(); + uint32_t offset = 0; + for (uint32_t i = 0; i < weights_n; i++) + { + for (uint32_t j = 0; j < weights_m; j++) + { + offset = i * weights_m + j; + 
EXPECT_EQ(i * offset, weights->at(offset)); + } + } +} + +TEST_F(FuseMulWithFullyConnectedPassTest, bias_feature_map_NEG) +{ + g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */, + false /* extra_successor */); + + // Bias cannot be fused as it's passed as feature map. + g.to_fm_bias(); + + EXPECT_EQ(false, pass.run(g.g())); +} + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_activation_NEG) +{ + g.init(luci::FusedActFunc::RELU, false /* is_mul_scalar */, true /* use_bias */, + false /* extra_successor */); + + EXPECT_EQ(false, pass.run(g.g())); +} + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_null_weights_NEG) +{ + g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */, + false /* extra_successor */); + + g.fc()->weights(nullptr); + + EXPECT_EQ(false, pass.run(g.g())); +} + +TEST_F(FuseMulWithFullyConnectedPassTest, fc_with_extra_successor_NEG) +{ + g.init(luci::FusedActFunc::NONE, false /* is_mul_scalar */, true /* use_bias */, + true /* extra_successor */); + + EXPECT_EQ(false, pass.run(g.g())); +} diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt index fefbabf9a17..d6656545ff8 100644 --- a/compiler/one-cmds/how-to-use-one-commands.txt +++ b/compiler/one-cmds/how-to-use-one-commands.txt @@ -174,6 +174,7 @@ Current transformation options are - fuse_mul_to_fullyconnected_weights : This fuses Mul operator to following FullyConnected operator weights - fuse_mul_with_conv: This fuses Mul with a preceding Convolution op if possible. - fuse_mul_with_div: This fuses Mul and Div op as Div. +- fuse_mul_with_fullyconnected: This fuses Mul operator with the preceding FullyConnected operator if possible. - fuse_slice_with_tconv: This fuses Slice with a preceding TConv if possible. 
- fuse_bcq: This enables Binary-Coded-bases Quantized DNNs - read https://arxiv.org/abs/2005.09904 for detailed information diff --git a/compiler/one-cmds/onelib/constant.py b/compiler/one-cmds/onelib/constant.py index 8c5de1b646d..a8dabf139d0 100644 --- a/compiler/one-cmds/onelib/constant.py +++ b/compiler/one-cmds/onelib/constant.py @@ -52,6 +52,7 @@ class CONSTANT: 'fuse_mean_with_mean', 'fuse_mul_with_conv', 'fuse_mul_with_div', + 'fuse_mul_with_fullyconnected', 'fuse_transpose_with_mean', 'fuse_slice_with_tconv', 'fuse_horizontal_fc_layers', @@ -131,6 +132,7 @@ class CONSTANT: ('fuse_mean_with_mean', 'fuse two consecutive Mean ops'), ('fuse_mul_with_conv', 'fuse Mul op to Convolution op'), ('fuse_mul_with_div', 'fuse Mul with Div as Div'), + ('fuse_mul_with_fullyconnected', 'fuse Mul op to FullyConnected op'), ('fuse_transpose_with_mean', 'fuse Mean with a preceding Transpose under certain conditions'), ('fuse_horizontal_fc_layers', diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe new file mode 100644 index 00000000000..84203a12d04 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.recipe @@ -0,0 +1,67 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 1 dim: 6 } +} +operand { + name: "fc_wgt" + type: FLOAT32 + shape { dim: 6 dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_bias" + type: FLOAT32 + shape { dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "B" + type: FLOAT32 + shape { dim: 1, dim: 1, dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_out" + type: FLOAT32 + shape: { dim: 1 dim: 1 dim: 6 } +} +operand { + name: "mul_out" + type: FLOAT32 + shape: { dim: 1 dim: 1 dim: 6 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: NONE + keep_num_dims: true + } + 
input: "ifm" + input: "fc_wgt" + input: "fc_bias" + output: "fc_out" +} +operation { + type: "Mul" + mul_options { + activation: NONE + } + input: "fc_out" + input: "B" + output: "mul_out" +} +input: "ifm" +output: "mul_out" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule new file mode 100644 index 00000000000..c1f2a827884 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_000/test.rule @@ -0,0 +1,12 @@ +# This checks if: +# Mul(FC(input, weights, bias), other) +# is converted to: +# FC(input, Mul(weights, other), Mul(bias, other)) +# and then Mul is fused to: +# FC(input, weights', bias') +# Here Mul is in shape of (1, 1, X). + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_MUL" $(op_count MUL) '=' 0 +RULE "FC_EXIST" $(op_count FULLY_CONNECTED) '=' 1 diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe new file mode 100644 index 00000000000..d446424c238 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.recipe @@ -0,0 +1,67 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 4 } +} +operand { + name: "fc_wgt" + type: FLOAT32 + shape { dim: 6 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_bias" + type: FLOAT32 + shape { dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "B" + type: FLOAT32 + shape { dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_out" + type: FLOAT32 + shape: { dim: 3 dim: 1 dim: 6 } +} +operand { + name: "mul_out" + type: FLOAT32 + shape: { dim: 3 dim: 1 dim: 6 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: NONE + keep_num_dims: true + } + input: "ifm" + input: "fc_wgt" + input: "fc_bias" + output: "fc_out" +} 
+operation { + type: "Mul" + mul_options { + activation: RELU + } + input: "fc_out" + input: "B" + output: "mul_out" +} +input: "ifm" +output: "mul_out" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule new file mode 100644 index 00000000000..acdd2d6a96b --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_001/test.rule @@ -0,0 +1,12 @@ +# This checks if: +# Mul(FC(input, weights, bias), other) +# is converted to: +# FC(input, Mul(weights, other), Mul(bias, other)) +# and then Mul is fused to: +# FC(input, weights', bias') +# Here Mul is in shape of (X). + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_MUL" $(op_count MUL) '=' 0 +RULE "FC_EXIST" $(op_count FULLY_CONNECTED) '=' 1 diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe new file mode 100644 index 00000000000..34e3cde4839 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.recipe @@ -0,0 +1,66 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 16 } +} +operand { + name: "fc_wgt" + type: FLOAT32 + shape { dim: 4 dim: 16 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_bias" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "B" + type: FLOAT32 + shape { dim: 1 } + filler { + tag: "constant" + arg: "2.0" + } +} +operand { + name: "fc_out" + type: FLOAT32 + shape: { dim: 1 dim: 4 } +} +operand { + name: "mul_out" + type: FLOAT32 + shape: { dim: 1 dim: 4 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: NONE + keep_num_dims: true + } + input: "ifm" + input: "fc_wgt" + input: "fc_bias" + output: "fc_out" +} +operation { + type: "Mul" + mul_options { + activation: NONE + } + input: "fc_out" + input: "B" + output: 
"mul_out" +} +input: "ifm" +output: "mul_out" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule new file mode 100644 index 00000000000..9cc8d5fd0a7 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_002/test.rule @@ -0,0 +1,12 @@ +# This checks if: +# Mul(FC(input, weights, bias), other) +# is converted to: +# FC(input, Mul(weights, other), Mul(bias, other)) +# and then Mul is fused to: +# FC(input, weights', bias') +# Here Mul is in shape of (1), it's a scalar. + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_MUL" $(op_count MUL) '=' 0 +RULE "FC_EXIST" $(op_count FULLY_CONNECTED) '=' 1 diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe new file mode 100644 index 00000000000..2883ebabdf0 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.recipe @@ -0,0 +1,57 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 3 dim: 1 dim: 4 } +} +operand { + name: "fc_wgt" + type: FLOAT32 + shape { dim: 6 dim: 4 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "scale" + type: FLOAT32 + shape { dim: 6 } + filler { + tag: "gaussian" + arg: "0.0" + arg: "1.0" + } +} +operand { + name: "fc_out" + type: FLOAT32 + shape: { dim: 3 dim: 1 dim: 6 } +} +operand { + name: "mul_out" + type: FLOAT32 + shape: { dim: 3 dim: 1 dim: 6 } +} +operation { + type: "FullyConnected" + fullyconnected_options { + activation: NONE + keep_num_dims: true + } + input: "ifm" + input: "fc_wgt" + input: "" + output: "fc_out" +} +operation { + type: "Mul" + mul_options { + activation: RELU + } + input: "fc_out" + input: "scale" + output: "mul_out" +} +input: "ifm" +output: "mul_out" diff --git a/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule new 
file mode 100644 index 00000000000..16bb2ff2788 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_FullyConnected_Mul_003/test.rule @@ -0,0 +1,13 @@ +# This checks if: +# Mul(FC(input, weights, _), other) +# is converted to: +# FC(input, Mul(weights, other), _) +# and then Mul is fused to: +# FC(input, weights', _) +# Here the bias is empty/excluded "_". +# Thus Mul is only fused with weights. + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_MUL" $(op_count MUL) '=' 0 +RULE "FC_EXIST" $(op_count FULLY_CONNECTED) '=' 1