Merge remote-tracking branch 'upstream/master' into mbencer/ReshapeAvoidCopy
mbencer committed Oct 16, 2024
2 parents 50036d0 + c5fd64a commit c790a5e
Showing 51 changed files with 822 additions and 150 deletions.
4 changes: 3 additions & 1 deletion compiler/circle2circle-dredd-recipe-test/test.lst
@@ -74,7 +74,7 @@ Add(Net_Mul_FullyConnected_001 PASS fuse_mul_to_fullyconnected_weights fold_mul)
 Add(Net_Mul_FullyConnected_002 PASS fuse_mul_to_fullyconnected_weights fold_mul)
 Add(Net_Preactivation_BN_000 PASS fuse_preactivation_batchnorm)
 Add(Net_Reshape_Reshape_000 PASS remove_redundant_reshape)
-#Add(Net_RmsNorm_000 PASS fuse_rmsnorm)
+Add(Net_RmsNorm_000 PASS fuse_rmsnorm)
 Add(Net_RoPE_000 PASS fuse_rope)
 Add(Net_Shape_Add_000 PASS fold_shape)
 Add(Net_Sqrt_Div_000 PASS transform_sqrt_div_to_rsqrt_mul)
@@ -152,3 +152,5 @@ Add(Inf_FullyConnected_001 PASS)
 Add(Inf_Mul_000 PASS)
 Add(Inf_Neg_000 PASS)
 Add(Inf_Pad_000 PASS)
+Add(Inf_StridedSlice_000 PASS)
+Add(Inf_StridedSlice_001 PASS)
3 changes: 3 additions & 0 deletions compiler/circle2circle/src/Circle2Circle.cpp
@@ -181,6 +181,8 @@ int entry(int argc, char **argv)
              "it only converts weights whose row is a multiple of 16");
   add_switch(arser, "--replace_non_const_fc_with_batch_matmul",
              "Replace FullyConnected with BatchMatMul when its weight is non-constant");
+  add_switch(arser, "--substitute_expand_dims_to_reshape",
+             "This will convert ExpandDims with constant axis to Reshape");
   add_switch(arser, "--substitute_pack_to_reshape",
              "This will convert single input Pack to Reshape");
   add_switch(arser, "--substitute_padv2_to_pad",
@@ -333,6 +335,7 @@ int entry(int argc, char **argv)
   option_str_to_enum["resolve_former_customop"] = Algorithms::ResolveFormerCustomOp;
   option_str_to_enum["shuffle_weight_to_16x1float32"] = Algorithms::ShuffleWeightTo16x1Float32;
   option_str_to_enum["replace_non_const_fc_with_batch_matmul"] = Algorithms::ReplaceNonConstFCWithBatchMatMul;
+  option_str_to_enum["substitute_expand_dims_to_reshape"] = Algorithms::SubstituteExpandDimsToReshape;
   option_str_to_enum["substitute_pack_to_reshape"] = Algorithms::SubstitutePackToReshape;
   option_str_to_enum["substitute_padv2_to_pad"] = Algorithms::SubstitutePadV2ToPad;
   option_str_to_enum["substitute_splitv_to_split"] = Algorithms::SubstituteSplitVToSplit;
5 changes: 2 additions & 3 deletions compiler/circlechef/circle/src/Op/RmsNorm.cpp
@@ -24,12 +24,11 @@ namespace circlechef
 void CircleOpRmsNorm::filler(const circle::Operator *op, CircleImport *import,
                              circlechef::ModelRecipe *model_recipe) const
 {
-  // index 1 and 2 maybe constant
+  // index 1 maybe constant
   const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
-  assert(inputs.size() == 3);
+  assert(inputs.size() == 2);
 
   import->set_tensor_filler(inputs[1]); // set gaussian filler
-  import->set_tensor_filler(inputs[2]);
 }
 
 circlechef::Operation *CircleOpRmsNorm::build(const circle::Operator *op, CircleImport *import,
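For context on the RmsNorm changes that run through this commit: the operator loses its beta (offset) input everywhere, leaving exactly two inputs, input and gamma. RMS normalization scales by a learned gamma but, unlike layer normalization, applies no additive shift, which is presumably why beta is dropped. As a formula over the normalized (last) axis of length n, assuming epsilon sits under the square root as the kernel changes below suggest:

    \mathrm{RmsNorm}(x)_i = \gamma_i \cdot \frac{x_i}{\mathrm{rms}(x)}, \qquad \mathrm{rms}(x) = \sqrt{\frac{1}{n} \sum_{j=1}^{n} x_j^2 + \epsilon}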
43 changes: 34 additions & 9 deletions compiler/common-artifacts/CMakeLists.txt
@@ -3,18 +3,43 @@
 # Ubuntu18.04; explictly installed python3.8 (default is python3.6)
 # Ubuntu20.04; default python3.8
 # Ubuntu22.04; default python3.10
+# Ubuntu24.04; explictly installed python3.8 (default is python3.12)
 # refer https://github.com/Samsung/ONE/issues/9962
-find_package(PythonInterp 3.8 QUIET)
-find_package(PythonLibs 3.8 QUIET)
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  find_package(PythonInterp 3.8 QUIET)
+  find_package(PythonLibs 3.8 QUIET)
 
-if(NOT ${PYTHONINTERP_FOUND})
-  message(STATUS "Build common-artifacts: FALSE (Python3 is missing)")
-  return()
-endif()
+  if(NOT ${PYTHONINTERP_FOUND})
+    message(STATUS "Build common-artifacts: FALSE (Python3 is missing)")
+    return()
+  endif()
 
-if(${PYTHON_VERSION_MINOR} LESS 8)
-  message(STATUS "Build common-artifacts: FALSE (You need to install Python version higher than 3.8)")
-  return()
+  if(${PYTHON_VERSION_MINOR} LESS 8)
+    message(STATUS "Build common-artifacts: FALSE (You need to install Python version higher than 3.8)")
+    return()
+  endif()
+else()
+  find_package(Python 3.8 EXACT COMPONENTS Interpreter QUIET)
+  if(NOT Python_FOUND)
+    find_package(Python 3.8 COMPONENTS Interpreter QUIET)
+  endif()
+
+  # tensorflow 2.12.1 supports Python 3.8 ~ 3.11
+  if(Python_VERSION VERSION_GREATER_EQUAL 3.12)
+    message(STATUS "Build common-artifacts: FALSE (Python version 3.12 or higher is not supported yet)")
+    return()
+  endif()
+  if(Python_VERSION VERSION_LESS 3.8)
+    message(STATUS "Build common-artifacts: FAILED (Install Python version 3.8 or 3.10)")
+    return()
+  endif()
+
+  if(NOT Python_Interpreter_FOUND)
+    message(STATUS "Build common-artifacts: FAILED (Python3 is missing)")
+    return()
+  endif()
+
+  set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
 endif()
 
 # Create python virtual environment with tensorflow 2.12.1
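For background on this hunk (and the matching dalgona one below): CMake 3.12 deprecated the FindPythonInterp/FindPythonLibs modules in favor of FindPython, so the build now branches on CMAKE_VERSION, keeps the legacy modules for older CMake, and on newer CMake mirrors the same version checks with FindPython before exporting PYTHON_EXECUTABLE for the rest of the script.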
2 changes: 0 additions & 2 deletions compiler/common-artifacts/exclude.lst
@@ -2,7 +2,6 @@
 ## TensorFlowLiteRecipes
 
 ## CircleRecipes
-optimize(RmsNorm_000)
 
 #[[ optimize : Exclude from circle optimization(circle2circle) ]]
 ## TensorFlowLiteRecipes
@@ -183,5 +182,4 @@ tcgenerate(CircleFullyConnected_U4_002)
 tcgenerate(GRU_000) # luci-interpreter does not support custom GRU
 tcgenerate(InstanceNorm_000)
 tcgenerate(InstanceNorm_001)
-tcgenerate(RmsNorm_000)
 tcgenerate(RoPE_000)
44 changes: 35 additions & 9 deletions compiler/dalgona/CMakeLists.txt
@@ -2,18 +2,44 @@
 # Ubuntu18.04; explictly installed python3.8 (default is python3.6)
 # Ubuntu20.04; default python3.8
 # Ubuntu22.04; default python3.10
+# Ubuntu24.04; explictly installed python3.8 (default is python3.12)
 # refer https://github.com/Samsung/ONE/issues/9962
-find_package(PythonInterp 3.8 QUIET)
-find_package(PythonLibs 3.8 QUIET)
+if(CMAKE_VERSION VERSION_LESS 3.12)
+  find_package(PythonInterp 3.8 QUIET)
+  find_package(PythonLibs 3.8 QUIET)
 
-if(NOT ${PYTHONINTERP_FOUND})
-  message(STATUS "Build dalgona: FAILED (Python3 is missing)")
-  return()
-endif()
+  if(NOT ${PYTHONINTERP_FOUND})
+    message(STATUS "Build dalgona: FAILED (Python3 is missing)")
+    return()
+  endif()
 
-if(${PYTHON_VERSION_MINOR} LESS 8)
-  message(STATUS "Build dalgona: FAILED (Install Python version higher than or equal to 3.8)")
-  return()
+  if(${PYTHON_VERSION_MINOR} LESS 8)
+    message(STATUS "Build dalgona: FAILED (Install Python version higher than or equal to 3.8)")
+    return()
+  endif()
+else()
+  find_package(Python 3.8 EXACT COMPONENTS Development QUIET)
+  if(NOT Python_FOUND)
+    find_package(Python 3.8 COMPONENTS Development QUIET)
+  endif()
+
+  # Require same python version of common-artifacts
+  if(Python_VERSION VERSION_GREATER_EQUAL 3.12)
+    message(STATUS "Build dalgona: FALSE (Python version 3.12 or higher is not supported yet)")
+    return()
+  endif()
+  if(Python_VERSION VERSION_LESS 3.8)
+    message(STATUS "Build dalgona: FAILED (Install Python version 3.8 or 3.10)")
+    return()
+  endif()
+
+  if(NOT Python_Development_FOUND)
+    message(STATUS "Build dalgona: FAILED (Python3 development package is missing)")
+    return()
+  endif()
+
+  set(PYTHON_INCLUDE_DIRS ${Python_INCLUDE_DIRS})
+  set(PYTHON_LIBRARIES ${Python_LIBRARIES})
 endif()
 
 nnas_find_package(Pybind11)
20 changes: 5 additions & 15 deletions compiler/luci-interpreter/src/kernels/RmsNorm.cpp
@@ -26,9 +26,9 @@ namespace luci_interpreter
 namespace kernels
 {
 
-RmsNorm::RmsNorm(const Tensor *input, const Tensor *gamma, const Tensor *beta, Tensor *output,
+RmsNorm::RmsNorm(const Tensor *input, const Tensor *gamma, Tensor *output,
                  const RmsNormParams &params)
-  : KernelWithParams<RmsNormParams>({input, gamma, beta}, {output}, params)
+  : KernelWithParams<RmsNormParams>({input, gamma}, {output}, params)
 {
 }
 
@@ -38,13 +38,9 @@ void RmsNorm::configure()
   LUCI_INTERPRETER_CHECK(num_dims == 3 || num_dims == 4);
   LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
   LUCI_INTERPRETER_CHECK(gamma()->element_type() == input()->element_type());
-  LUCI_INTERPRETER_CHECK(beta()->element_type() == input()->element_type());
   LUCI_INTERPRETER_CHECK(gamma()->shape().num_dims() == 1);
-  LUCI_INTERPRETER_CHECK(beta()->shape().num_dims() == 1);
   LUCI_INTERPRETER_CHECK((gamma()->shape().dim(0) == input()->shape().dim(num_dims - 1)) ||
                          (gamma()->shape().dim(0) == 1));
-  LUCI_INTERPRETER_CHECK((beta()->shape().dim(0) == input()->shape().dim(num_dims - 1)) ||
-                         (beta()->shape().dim(0) == 1));
 
   output()->resize(input()->shape());
 }
@@ -70,9 +66,6 @@ void RmsNorm::evalFloat() const
   const float *gamma_data = getTensorData<float>(gamma());
   auto gamma_shape = getTensorShape(gamma());
   bool single_gamma = gamma_shape.DimensionsCount() == 1 && gamma_shape.Dims(0) == 1;
-  const float *beta_data = getTensorData<float>(beta());
-  auto beta_shape = getTensorShape(beta());
-  bool single_beta = beta_shape.DimensionsCount() == 1 && beta_shape.Dims(0) == 1;
   float *output_data = getTensorData<float>(output());
 
   if (input_shape.DimensionsCount() == 4)
@@ -99,11 +92,9 @@
         for (int32_t channel = 0; channel < channels; channel++)
         {
           double gamma = single_gamma ? gamma_data[0] : gamma_data[channel];
-          double beta = single_beta ? beta_data[0] : beta_data[channel];
           output_data[tflite::Offset(output_shape, batch, height, width, channel)] =
-            (gamma *
-               (input_data[tflite::Offset(input_shape, batch, height, width, channel)] / rms) +
-             beta);
+            gamma *
+            (input_data[tflite::Offset(input_shape, batch, height, width, channel)] / rms);
         }
       }
     }
@@ -131,8 +122,7 @@ void RmsNorm::evalFloat() const
       for (int32_t i = 0; i < size; i++)
      {
         double gamma = single_gamma ? gamma_data[0] : gamma_data[i];
-        double beta = single_beta ? beta_data[0] : beta_data[i];
-        output_data[offset + i] = (gamma * (input_data[offset + i] / rms) + beta);
+        output_data[offset + i] = gamma * (input_data[offset + i] / rms);
      }
     }
   }
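To see the simplified math in one place, here is a minimal standalone C++ sketch of the per-row computation after this change. The helper below is hypothetical (it is not the interpreter's API); it only illustrates the gamma-scale-without-beta behavior the kernel now implements, assuming epsilon is added under the square root.

#include <cmath>
#include <cstdint>

// Hypothetical helper: normalize each contiguous row of `size` elements as
// out = gamma * x / rms(x), with rms(x) = sqrt(mean(x^2) + epsilon).
// Note there is no beta term: RmsNorm scales but does not shift.
void rms_norm_rows(const float *input, const float *gamma, bool single_gamma,
                   float *output, int32_t rows, int32_t size, float epsilon)
{
  for (int32_t row = 0; row < rows; ++row)
  {
    const float *in = input + row * size;
    float *out = output + row * size;

    // Mean of squares over the row, accumulated in double for headroom.
    double mean_square = 0.0;
    for (int32_t i = 0; i < size; ++i)
      mean_square += static_cast<double>(in[i]) * in[i];
    mean_square /= size;

    const double rms = std::sqrt(mean_square + epsilon);
    for (int32_t i = 0; i < size; ++i)
    {
      // A size-1 gamma broadcasts across the row, like `single_gamma` above.
      const double g = single_gamma ? gamma[0] : gamma[i];
      out[i] = static_cast<float>(g * (in[i] / rms));
    }
  }
}

Called with rows = batch * height * width and size = channels, this roughly reproduces the 4-D path in the kernel above.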
4 changes: 1 addition & 3 deletions compiler/luci-interpreter/src/kernels/RmsNorm.h
@@ -28,12 +28,10 @@ namespace kernels
 class RmsNorm : public KernelWithParams<RmsNormParams>
 {
 public:
-  RmsNorm(const Tensor *input, const Tensor *gamma, const Tensor *beta, Tensor *output,
-          const RmsNormParams &params);
+  RmsNorm(const Tensor *input, const Tensor *gamma, Tensor *output, const RmsNormParams &params);
 
   const Tensor *input() const { return _inputs[0]; }
   const Tensor *gamma() const { return _inputs[1]; }
-  const Tensor *beta() const { return _inputs[2]; }
   Tensor *output() const { return _outputs[0]; }
 
   void configure() override;
19 changes: 7 additions & 12 deletions compiler/luci-interpreter/src/kernels/RmsNorm.test.cpp
@@ -39,13 +39,12 @@ TEST_F(RmsNormTest, Simple)
   Tensor input_tensor =
     makeInputTensor<DataType::FLOAT32>({1, 2, 2, 1}, {0, 1, 2, 3}, _memory_manager.get());
   Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   RmsNormParams params{};
   params.epsilon = 0.00001f;
 
-  RmsNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
+  RmsNorm kernel(&input_tensor, &gamma_tensor, &output_tensor, params);
   kernel.configure();
   _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
@@ -54,18 +53,17 @@
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
 }
 
-TEST_F(RmsNormTest, Default_gamma_beta)
+TEST_F(RmsNormTest, Default_gamma)
 {
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7},
                                                            _memory_manager.get());
   Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   RmsNormParams params{};
   params.epsilon = 0.001f;
 
-  RmsNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
+  RmsNorm kernel(&input_tensor, &gamma_tensor, &output_tensor, params);
   kernel.configure();
   _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
@@ -81,13 +79,12 @@ TEST_F(RmsNormTest, Have_gamma)
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7},
                                                            _memory_manager.get());
   Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {2}, _memory_manager.get());
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   RmsNormParams params{};
   params.epsilon = 0.001f;
 
-  RmsNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
+  RmsNorm kernel(&input_tensor, &gamma_tensor, &output_tensor, params);
   kernel.configure();
   _memory_manager->allocate_memory(output_tensor);
   kernel.execute();
@@ -98,18 +95,17 @@
   EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 2}));
 }
 
-TEST_F(RmsNormTest, Wrong_gamma_beta_dim_NEG)
+TEST_F(RmsNormTest, Wrong_gamma_dim_NEG)
 {
   Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7},
                                                            _memory_manager.get());
   Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1, 1, 1}, _memory_manager.get());
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({3}, {0, 0, 0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   RmsNormParams params{};
   params.epsilon = 0.001f;
 
-  RmsNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
+  RmsNorm kernel(&input_tensor, &gamma_tensor, &output_tensor, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
 
@@ -118,13 +114,12 @@ TEST_F(RmsNormTest, Unsupported_dims_NEG)
   Tensor input_tensor =
     makeInputTensor<DataType::FLOAT32>({2, 2}, {0, 1, 2, 3}, _memory_manager.get());
   Tensor gamma_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1}, _memory_manager.get());
-  Tensor beta_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, _memory_manager.get());
   Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
 
   RmsNormParams params{};
   params.epsilon = 0.001f;
 
-  RmsNorm kernel(&input_tensor, &gamma_tensor, &beta_tensor, &output_tensor, params);
+  RmsNorm kernel(&input_tensor, &gamma_tensor, &output_tensor, params);
   EXPECT_ANY_THROW(kernel.configure());
 }
3 changes: 0 additions & 3 deletions compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -1101,20 +1101,17 @@ TEST_F(KernelBuilderTest, RmsNorm)
 {
   auto *input = createInputNode();
   auto *gamma = createInputNode();
-  auto *beta = createInputNode();
 
   auto *op = createNode<luci::CircleRmsNorm>();
   op->input(input);
   op->gamma(gamma);
-  op->beta(beta);
   op->epsilon(1e-06);
 
   auto kernel = buildKernel<kernels::RmsNorm>(op);
   ASSERT_THAT(kernel, NotNull());
 
   checkTensor(kernel->input(), input);
   checkTensor(kernel->gamma(), gamma);
-  checkTensor(kernel->beta(), beta);
   checkTensor(kernel->output(), op);
   EXPECT_THAT(kernel->params().epsilon, Eq(op->epsilon()));
 }
5 changes: 2 additions & 3 deletions compiler/luci-interpreter/src/loader/nodes/RmsNorm.cpp
@@ -25,18 +25,17 @@ std::unique_ptr<Kernel> build_kernel_CircleRmsNorm(const luci::CircleNode *circl
                                                    KernelBuilderHelper &helper)
 {
   const auto *node = loco::must_cast<const luci::CircleRmsNorm *>(circle_node);
-  assert(node->arity() == 3);
+  assert(node->arity() == 2);
 
   const Tensor *input = helper.getInputTensor(node->input());
   const Tensor *gamma = helper.getInputTensor(node->gamma());
-  const Tensor *beta = helper.getInputTensor(node->beta());
 
   Tensor *output = helper.getOutputTensor(node);
 
   RmsNormParams params{};
   params.epsilon = node->epsilon();
 
-  return std::make_unique<kernels::RmsNorm>(input, gamma, beta, output, params);
+  return std::make_unique<kernels::RmsNorm>(input, gamma, output, params);
 }
 
 } // namespace luci_interpreter
2 changes: 1 addition & 1 deletion compiler/luci-pass-value-py-test/test.lst
@@ -54,7 +54,7 @@ eval(Net_Mul_FullyConnected_002 fuse_mul_to_fullyconnected_weights)
 eval(Net_Reshape_Mean_000 forward_reshape_to_unaryop)
 eval(Net_Reshape_Neg_000 forward_reshape_to_unaryop)
 eval(Net_Reshape_Reshape_000 remove_redundant_reshape)
-#eval(Net_RmsNorm_000 fuse_rmsnorm)
+eval(Net_RmsNorm_000 fuse_rmsnorm)
 eval(Net_RoPE_000 fuse_rope)
 eval(Net_Shape_Add_000 fold_shape)
 eval(Net_Sqrt_Div_000 transform_sqrt_div_to_rsqrt_mul)
3 changes: 1 addition & 2 deletions compiler/luci/import/src/Nodes/CircleRmsNorm.cpp
@@ -26,7 +26,7 @@ namespace luci
 bool CircleRmsNormGraphBuilder::validate(const ValidateArgs &args) const
 {
   // TODO check dtypes
-  return GraphBuilder::validate(args, 3);
+  return GraphBuilder::validate(args, 2);
 }
 
 CircleNode *CircleRmsNormGraphBuilder::build_node(const circle::OperatorT &op,
@@ -36,7 +36,6 @@ CircleNode *CircleRmsNormGraphBuilder::build_node(const circle::OperatorT &op,
   auto *node = graph->nodes()->create<CircleRmsNorm>();
   node->input(inputs.at(0));
   node->gamma(inputs.at(1));
-  node->beta(inputs.at(2));
 
   const auto *options = op.builtin_options.AsRmsNormOptions();
   node->epsilon(options->epsilon);