diff --git a/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt b/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
index 521b23a1b7..5105ed66dc 100644
--- a/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
+++ b/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
@@ -19,6 +19,6 @@ if (WITH_CUDA)
   nv_test(test_multi_level_tiling SRCS multi_level_tiling_test.cc DEPS cinncore auto_gen_rule_test_helper test_program_builder)
 endif()
 
-cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore)
+cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper)
 cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore)
 cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore)
diff --git a/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc b/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
index 0334162520..061c3cd667 100644
--- a/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
+++ b/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc
@@ -22,6 +22,7 @@
 #include 
 
 #include "cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h"
+#include "cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h"
 #include "cinn/cinn.h"
 #include "cinn/frontend/net_builder.h"
 #include "cinn/hlir/framework/op_lowering.h"
@@ -37,6 +38,7 @@
 #include "cinn/poly/stage.h"
 #include "cinn/runtime/flags.h"
 #include "cinn/utils/string.h"
+#include "tests/concrete_program_builder.h"
 
 DECLARE_bool(cinn_ir_schedule);
 
@@ -239,5 +241,253 @@ TEST(AutoInline, AddReluInline) {
   EXPECT_EQ(auto_inline.AnalyseApplyType(new_states[0], "var_2"), RuleApplyType::kCannotApply);
 }
 
+#ifdef CINN_WITH_CUDA
+class TestAutoInline : public TestAutoGenRuleBase {};
+
+/* A single-chain graph composed of multiple blocks can be inlined into one block.
+ *
+ * Before AutoInline: The output of each block is the input of the next block.
+ * Loop1:
+ *   x1 = Add()
+ * Loop2:
+ *   x2 = Multiply(x1)
+ * Loop3:
+ *   x3 = Add(x2)
+ * Loop4:
+ *   x4 = Relu(x3)
+ *
+ * After AutoInline: All loops are inlined into a single loop.
+ * Loop:
+ *   Relu(Add(Multiply(Add())))
+ */
+TEST_F(TestAutoInline, SingleChain) {
+  Target target = common::DefaultNVGPUTarget();
+  Initialize(target);
+  std::vector<std::string> input_names  = {"bias", "conv_output", "bn_scale", "bn_offset"};
+  std::vector<std::string> output_names = {"var_6", "var_5", "var_1", "var", "var_0", "var_4", "var_3"};
+  std::vector<int32_t> conv_output_shape = {1, 512, 56, 56};
+  int32_t channel = conv_output_shape[1];
+  std::vector<tests::VariableInfo> inputs_varinfo({{"conv_output", conv_output_shape},
+                                                   {"bias", {channel, 1, 1}},
+                                                   {"bn_scale", {channel, 1, 1}},
+                                                   {"bn_offset", {channel, 1, 1}}});
+
+  // Construct the computation graph and convert it to ir::Expr
+  Context::Global().ResetNameId();
+  ir::IRSchedule ir_schedule = MakeIRSchedule(tests::BiasBnReLUBuilder().Build(inputs_varinfo));
+  SearchState state(ir_schedule, 0, {});
+  std::vector<ir::Expr> func_bodys = ir_schedule.GetModule().GetExprs();
+  ASSERT_EQ(func_bodys.size(), 1UL);
+  VLOG(6) << "Original Expr:\n" << func_bodys[0];
+
+  // Apply AutoInline to every block that can be inlined
+  AutoInline auto_inline(target_, {output_names.front()});
+  EXPECT_EQ(auto_inline.AnalyseApplyType(state, "var_3"), RuleApplyType::kApplyAndPruneOtherRules);
+  auto new_states = auto_inline.ApplyOnBlock(state, "var_3");
+  std::vector<std::string> inline_block_names({"var_4", "var_5", "var_6", "var", "var_0", "var_1"});
+  for (const auto& inline_block_name : inline_block_names) {
+    new_states = auto_inline.ApplyOnBlock(new_states[0], inline_block_name);
+  }
+  std::vector<ir::Expr> exprs = new_states[0]->ir_schedule.GetModule().GetExprs();
+  EXPECT_EQ(exprs.size(), 1UL);
+  VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0];
+
+  // Build ir::Module and debug source code
+  auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule);
+  auto build_module_manually =
+      BuildIRModule(MakeIRSchedule(tests::BiasBnReLUBuilder().Build(inputs_varinfo), -1, true));
+  auto source_code_auto = GenSourceCode(build_module_auto);
+  VLOG(6) << " auto-schedule source code:\n" << source_code_auto;
+  auto source_code_manually = GenSourceCode(build_module_manually);
+  VLOG(6) << " manually-schedule source code:\n" << source_code_manually;
+
+  CheckResult(GenExecutableKernel(build_module_auto),
+              GenExecutableKernel(build_module_manually),
+              input_names,
+              output_names,
+              {{conv_output_shape[1], 1, 1}, conv_output_shape, conv_output_shape, conv_output_shape},
+              {conv_output_shape, {1}, {1}, {1}, {1}, {1}, {1}},
+              target);
+}
+
+/* An op can be inlined into multiple consumers at the same time.
+ *
+ * Before AutoInline: The output of Exp is used by Add and Multiply.
+ * Loop1:
+ *   x = Exp()
+ * Loop2:
+ *   y = Add(x)
+ * Loop3:
+ *   z = Multiply(x)
+ *
+ * After AutoInline: Exp is inlined into Add and Multiply.
+ * Loop:
+ *   y = Add(Exp())
+ *   z = Multiply(Exp())
+ */
+TEST_F(TestAutoInline, InlineToMultiConsumers) {
+  Target target = common::DefaultNVGPUTarget();
+  Initialize(target);
+  std::vector<std::string> input_names  = {"x"};
+  std::vector<std::string> output_names = {"var_2", "var_1", "var_0"};
+  std::vector<int32_t> input_shape{256, 256};
+  std::vector<tests::VariableInfo> inputs_varinfo({{"x", input_shape}});
+
+  // Construct the computation graph and convert it to ir::Expr
+  Context::Global().ResetNameId();
+  ir::IRSchedule ir_schedule = MakeIRSchedule(tests::ExpTwoConsumersOpBuilder().Build(inputs_varinfo));
+  SearchState state(ir_schedule, 0, {});
+  std::vector<ir::Expr> func_bodys = ir_schedule.GetModule().GetExprs();
+  ASSERT_EQ(func_bodys.size(), 1UL);
+  VLOG(6) << "Original Expr:\n" << func_bodys[0];
+
+  // Apply AutoInline to every block that can be inlined
+  AutoInline auto_inline(target_, {output_names.front()});
+  EXPECT_EQ(auto_inline.AnalyseApplyType(state, "var_0"), RuleApplyType::kApplyAndPruneOtherRules);
+  auto new_states = auto_inline.ApplyOnBlock(state, "var_1");
+  new_states      = auto_inline.ApplyOnBlock(new_states[0], "var_0");
+  std::vector<ir::Expr> exprs = new_states[0]->ir_schedule.GetModule().GetExprs();
+  EXPECT_EQ(exprs.size(), 1UL);
+  VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0];
+
+  // Build ir::Module and debug source code
+  auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule);
+  auto build_module_manually =
+      BuildIRModule(MakeIRSchedule(tests::ExpTwoConsumersOpBuilder().Build(inputs_varinfo), -1, true));
+  auto source_code_auto = GenSourceCode(build_module_auto);
+  VLOG(6) << " auto-schedule source code:\n" << source_code_auto;
+  auto source_code_manually = GenSourceCode(build_module_manually);
+  VLOG(6) << " manually-schedule source code:\n" << source_code_manually;
+
+  CheckResult(GenExecutableKernel(build_module_auto),
+              GenExecutableKernel(build_module_manually),
+              input_names,
+              output_names,
+              {input_shape},
+              {input_shape, {1}, {1}},
+              target);
+}
+
+/* Operators of elementwise or injective type can all be inlined.
+ *
+ * Before AutoInline: A graph of Gather, Add and Subtract.
+ * Loop1:
+ *   x1 = Gather()
+ * Loop2:
+ *   x2 = Add(x1)
+ * Loop3:
+ *   y1 = Gather()
+ * Loop4:
+ *   z1 = Subtract(y1, x2)
+ *
+ * After AutoInline: All loops are inlined into one.
+ *   z1 = Subtract(Gather(), Add(Gather()))
+ */
+TEST_F(TestAutoInline, OnlySpatialOp) {
+  Target target = common::DefaultNVGPUTarget();
+  Initialize(target);
+  std::vector<std::string> input_names  = {"x", "y"};
+  std::vector<std::string> output_names = {
+      "var_6", "var_4", "constant_idx_last", "constant_idx_first", "var_2", "var_5"};
+  std::vector<int32_t> input_shape{256, 256};
+  std::vector<tests::VariableInfo> inputs_varinfo({{"x", input_shape}, {"y", input_shape}});
+
+  // Construct the computation graph and convert it to ir::Expr
+  Context::Global().ResetNameId();
+  ir::IRSchedule ir_schedule = MakeIRSchedule(tests::GatherAddSubBuilder().Build(inputs_varinfo));
+  SearchState state(ir_schedule, 0, {});
+  std::vector<ir::Expr> func_bodys = ir_schedule.GetModule().GetExprs();
+  ASSERT_EQ(func_bodys.size(), 1UL);
+  VLOG(6) << "Original Expr:\n" << func_bodys[0];
+
+  // Apply AutoInline to every block that can be inlined
+  AutoInline auto_inline(target_, {output_names.front()});
+  EXPECT_EQ(auto_inline.AnalyseApplyType(state, "constant_idx_first"), RuleApplyType::kApplyAndPruneOtherRules);
+  auto new_states = auto_inline.ApplyOnBlock(state, "constant_idx_first");
+  std::vector<std::string> inline_block_names({"constant_idx_last", "var_2", "var_5", "var_4"});
+  for (const auto& inline_block_name : inline_block_names) {
+    new_states = auto_inline.ApplyOnBlock(new_states[0], inline_block_name);
+  }
+  std::vector<ir::Expr> exprs = new_states[0]->ir_schedule.GetModule().GetExprs();
+  EXPECT_EQ(exprs.size(), 1UL);
+  VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0];
+
+  // Build ir::Module and debug source code
+  auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule);
+  auto build_module_manually =
+      BuildIRModule(MakeIRSchedule(tests::GatherAddSubBuilder().Build(inputs_varinfo), -1, true));
+  auto source_code_auto = GenSourceCode(build_module_auto);
+  VLOG(6) << " auto-schedule source code:\n" << source_code_auto;
+  auto source_code_manually = GenSourceCode(build_module_manually);
+  VLOG(6) << " manually-schedule source code:\n" << source_code_manually;
+
+  CheckResult(GenExecutableKernel(build_module_auto),
+              GenExecutableKernel(build_module_manually),
+              input_names,
+              output_names,
+              {input_shape, input_shape},
+              {input_shape, {1}, {1}, {1}, {1}, {1}},
+              target);
+}
+
+/* An op that does not read data can be directly inlined.
+ *
+ * Before AutoInline: The fill_constant op is in a separate loop.
+ * Loop1:
+ *   x = fill_constant()
+ * Loop2:
+ *   y = Add(x)
+ *
+ * After AutoInline: The fill_constant op is inlined into the other loop.
+ * Loop:
+ *   y = Add(fill_constant())
+ */
+TEST_F(TestAutoInline, NoReadBufferOp) {
+  Target target = common::DefaultNVGPUTarget();
+  Initialize(target);
+  std::vector<std::string> input_names  = {"x"};
+  std::vector<std::string> output_names = {"var_0", "fill_constant"};
+  std::vector<int32_t> input_shape{256, 256};
+  std::vector<tests::VariableInfo> inputs_varinfo({{"x", input_shape}});
+
+  // Construct the computation graph and convert it to ir::Expr
+  ir::IRSchedule ir_schedule = MakeIRSchedule(tests::FillConstantAddBuilder().Build(inputs_varinfo));
+  SearchState state(ir_schedule, 0, {});
+  std::vector<ir::Expr> func_bodys = ir_schedule.GetModule().GetExprs();
+  ASSERT_EQ(func_bodys.size(), 1UL);
+  VLOG(6) << "Original Expr:\n" << func_bodys[0];
+
+  // Apply AutoInline to every block that can be inlined
+  AutoInline auto_inline(target_, {output_names.front()});
+  EXPECT_EQ(auto_inline.AnalyseApplyType(state, "fill_constant"), RuleApplyType::kApplyAndPruneOtherRules);
+  auto new_states = auto_inline.ApplyOnBlock(state, "fill_constant");
+  std::vector<ir::Expr> exprs = new_states[0]->ir_schedule.GetModule().GetExprs();
+  EXPECT_EQ(exprs.size(), 1UL);
+  VLOG(6) << "Expr after AutoInline applied on block: " << exprs[0];
+
+  // Build ir::Module and debug source code
+  auto build_module_auto = BuildIRModule(new_states[0]->ir_schedule);
+  auto build_module_manually =
+      BuildIRModule(MakeIRSchedule(tests::FillConstantAddBuilder().Build(inputs_varinfo), -1, true));
+  auto source_code_auto = GenSourceCode(build_module_auto);
+  VLOG(6) << " auto-schedule source code:\n" << source_code_auto;
+  auto source_code_manually = GenSourceCode(build_module_manually);
+  VLOG(6) << " manually-schedule source code:\n" << source_code_manually;
+
+  CheckResult(GenExecutableKernel(build_module_auto),
+              GenExecutableKernel(build_module_manually),
+              input_names,
+              output_names,
+              {input_shape},
+              {input_shape, {1}},
+              target);
+}
+
+/* An op can be inlined into multiple producers at the same time.
+ */
+// TEST_F(TestAutoInline, InlineToMultiProducers) {
+// TODO(6clc): Complete the unit test once ReverseComputeInline is ready.
+// }
+#endif
 }  // namespace auto_schedule
 }  // namespace cinn
diff --git a/tests/concrete_program_builder.h b/tests/concrete_program_builder.h
new file mode 100644
index 0000000000..5b70f76a97
--- /dev/null
+++ b/tests/concrete_program_builder.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "tests/program_builder.h"
+
+namespace cinn {
+namespace tests {
+
+/*
+ * Add --* Multiply --* Add --* Relu
+ */
+class BiasBnReLUBuilder : public ProgramBuilder {
+ public:
+  BiasBnReLUBuilder() : ProgramBuilder("bias_bn_relu_builder") {}
+  frontend::Program Build(const std::vector<VariableInfo>& inputs_varinfo, const utils::AttributeMap& attrs = {}) {
+    CHECK(inputs_varinfo.size() == 4);
+    auto conv_output = builder_.CreateInput(inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id);
+    auto bias        = builder_.CreateInput(inputs_varinfo[1].type, inputs_varinfo[1].shape, inputs_varinfo[1].id);
+    auto bn_scale    = builder_.CreateInput(inputs_varinfo[2].type, inputs_varinfo[2].shape, inputs_varinfo[2].id);
+    auto bn_offset   = builder_.CreateInput(inputs_varinfo[3].type, inputs_varinfo[3].shape, inputs_varinfo[3].id);
+
+    auto bias_add = builder_.Add(conv_output, bias);
+    auto bn_mul   = builder_.Multiply(bias_add, bn_scale);
+    auto bn_add   = builder_.Add(bn_mul, bn_offset);
+    builder_.Relu(bn_add);
+    return builder_.Build();
+  }
+};
+
+/*
+ * Exp --* Add
+ *    \
+ *     --* Multiply
+ */
+class ExpTwoConsumersOpBuilder : public ProgramBuilder {
+ public:
+  ExpTwoConsumersOpBuilder() : ProgramBuilder("exp_two_consumers_builder") {}
+  frontend::Program Build(const std::vector<VariableInfo>& inputs_varinfo, const utils::AttributeMap& attrs = {}) {
+    CHECK(inputs_varinfo.size() == 1);
+    auto x     = builder_.CreateInput(inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id);
+    auto exp_x = builder_.Exp(x);
+    auto add_x = builder_.Add(exp_x, x);
+    auto mul_1 = builder_.Multiply(exp_x, add_x);
+    return builder_.Build();
+  }
+};
+
+/*
+ * Gather --* Add --* Subtract
+ *                    *
+ *                   /
+ * Gather
+ */
+class GatherAddSubBuilder : public ProgramBuilder {
+ public:
+  GatherAddSubBuilder() : ProgramBuilder("gather_add_sub_builder") {}
+  frontend::Program Build(const std::vector<VariableInfo>& inputs_varinfo, const utils::AttributeMap& attrs = {}) {
+    CHECK(inputs_varinfo.size() == 2);
+    auto x = builder_.CreateInput(inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id);
+    auto y = builder_.CreateInput(inputs_varinfo[1].type, inputs_varinfo[1].shape, inputs_varinfo[1].id);
+    auto input_x_shape = inputs_varinfo[0].shape;
+    auto where_x_0 = builder_.Gather(x, builder_.FillConstant({input_x_shape[0]}, 0, "constant_idx_first"));
+    auto where_x_last =
+        builder_.Gather(x, builder_.FillConstant({input_x_shape[0]}, input_x_shape[0] - 1, "constant_idx_last"));
+    auto add_1 = builder_.Add(where_x_0, y);
+    builder_.Subtract(where_x_last, add_1);
+    return builder_.Build();
+  }
+};
+
+/*
+ * FillConstant --* Add
+ */
+class FillConstantAddBuilder : public ProgramBuilder {
+ public:
+  FillConstantAddBuilder() : ProgramBuilder("fill_constant_add_builder") {}
+  frontend::Program Build(const std::vector<VariableInfo>& inputs_varinfo, const utils::AttributeMap& attrs = {}) {
+    CHECK(inputs_varinfo.size() == 1);
+    auto x             = builder_.CreateInput(inputs_varinfo[0].type, inputs_varinfo[0].shape, inputs_varinfo[0].id);
+    auto fill_constant = builder_.FillConstant(inputs_varinfo[0].shape, 1.0f, "fill_constant");
+    builder_.Add(x, fill_constant);
+    return builder_.Build();
+  }
+};
+
+}  // namespace tests
+}  // namespace cinn
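Reviewer note: the new builders in tests/concrete_program_builder.h are intended to be reusable outside the AutoInline tests. Below is a minimal, hypothetical usage sketch (not part of the diff) showing the call pattern; it relies only on helpers already exercised above (`Initialize`, `MakeIRSchedule`, `BuildIRModule`, `GenSourceCode`) and assumes it lives next to the tests above, inside namespace cinn::auto_schedule of a CUDA-enabled test file, so those names resolve unqualified. The fixture and test names are illustrative.

```cpp
// Hypothetical sketch: building a program with one of the new builders and
// lowering it through the TestAutoGenRuleBase helpers used in this PR.
class TestBuilderUsage : public TestAutoGenRuleBase {};

TEST_F(TestBuilderUsage, FillConstantAddLowering) {
  Target target = common::DefaultNVGPUTarget();
  Initialize(target);

  // Describe the graph inputs, then let the builder assemble the frontend program.
  std::vector<tests::VariableInfo> inputs_varinfo({{"x", {256, 256}}});
  frontend::Program program = tests::FillConstantAddBuilder().Build(inputs_varinfo);

  // Lower to ir::IRSchedule and generate source code, mirroring the tests above.
  ir::IRSchedule ir_schedule = MakeIRSchedule(program);
  auto ir_module             = BuildIRModule(ir_schedule);
  VLOG(6) << "generated source:\n" << GenSourceCode(ir_module);
}
```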