diff --git a/cinn/auto_schedule/cost_model/feature_extractor.cc b/cinn/auto_schedule/cost_model/feature_extractor.cc
index 5f44b2e3f0..565ba3172e 100644
--- a/cinn/auto_schedule/cost_model/feature_extractor.cc
+++ b/cinn/auto_schedule/cost_model/feature_extractor.cc
@@ -158,6 +158,12 @@ VisitCountMemberPattern(Alloc, mem_alloc);
 VisitCountMemberPattern(Free, mem_free);
 VisitCountMemberPattern(Load, mem_read);
 VisitCountMemberPattern(Store, mem_write);
+VisitCountMemberPattern(LocalTemp, bool_op);
+VisitCountMemberPattern(Sqrt, bool_op);
+VisitCountMemberPattern(LoadIndex, bool_op);
+VisitCountMemberPattern(ReduceMax, bool_op);
+VisitCountMemberPattern(BlockLoad, bool_op);
+VisitCountMemberPattern(BlockStore, bool_op);

 /* Visit for loops */
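NOTE: The six new IR nodes are folded into the existing `bool_op` counter rather than given dedicated cost-model features, so the feature vector length stays fixed. A minimal sketch of what each line expands to, assuming `VisitCountMemberPattern` behaves like the neighbouring entries (the macro body below is an assumption, not copied from the file):

    // Hypothetical expansion of VisitCountMemberPattern(LocalTemp, bool_op):
    void FeatureExtractor::Visit(const ir::LocalTemp *x) {
      feature_.CurrentLoopBlock().bool_op += 1;  // counted as a generic bool op for now
    }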
diff --git a/cinn/backends/codegen_c.cc b/cinn/backends/codegen_c.cc
index 4454edbad4..010bb57f55 100644
--- a/cinn/backends/codegen_c.cc
+++ b/cinn/backends/codegen_c.cc
@@ -185,8 +185,40 @@ void CodeGenC::Visit(const ir::Not *op) {
   IrPrinter::Print(op->v());
   os() << ")";
 }
-void CodeGenC::Visit(const ir::Cast *op) { PrintCastExpr(op->type(), op->v()); }
+void CodeGenC::Visit(const ir::LocalTemp* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::Sqrt* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::LoadIndex* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::ReduceMax* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::BlockLoad* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::BlockStore* op) { IrPrinter::Visit(op); }
+
+void CodeGenC::Visit(const ir::Cast *op) {
+  // PrintCastExpr(op->type(), op->v());
+  // IrPrinter::Visit(op);
+  // Emit the target type so the generated cast is valid C++.
+  os() << "static_cast<" << GetTypeRepr(op->type()) << ">(";
+  Print(op->v());
+  // auto v = op->v().As<ir::Load>();
+  // Visit(v);
+  os() << ")";
+}
 void CodeGenC::Visit(const ir::For *op) {
+  // std::cerr << "visit loop" << std::endl;
   Expr extent = op->extent;
   Expr min    = op->min;
   int num_task = 1;
@@ -209,8 +241,13 @@ void CodeGenC::Visit(const ir::For *op) {
     extent = (task_id + 1) * n_per_task;
     DoIndent();
   }
+  if (op->is_unrolled()) {
+    os() << "#pragma unroll" << std::endl;
+  }
   os() << "for (";
-  os() << GetTypeRepr(Int(32));
+  // os() << GetTypeRepr(Int(32));
+  os() << "int";
   os() << " " << op->loop_var->name;
   os() << " = ";
   Print(min);
@@ -321,7 +358,7 @@ void CodeGenC::Visit(const ir::Block *op) {
   if (op->stmts.size() >= 1) {
     DoIndent();
     Print(op->stmts.back());
-    os() << ";";
+    os() << ";\n";
   }

   DecIndent();
@@ -438,33 +475,47 @@ void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED }
 void CodeGenC::Visit(const ir::_Var_ *op) { os() << op->name; }

 void CodeGenC::Visit(const ir::Load *op) {
-  Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
-  if (dense_strided_ramp.defined()) {  // Loading a continuous Ramp address.
-    CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
-    os() << "::"
-         << "Load(";
-    os() << op->tensor.As<ir::_Tensor_>()->name;
-    os() << ",";
-    Print(dense_strided_ramp);
-    os() << ")";
-  } else if (op->index().type().is_vector()) {
-    // gather
-    CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
-    os() << "::Load(";
-    os() << op->tensor.As<ir::_Tensor_>()->name;
-    os() << ",";
-    Print(op->index());
-    os() << ")";
-  } else if (op->is_addr_tensor()) {
-    auto *tensor = op->tensor.As<ir::_Tensor_>();
+  auto *tensor = op->tensor.As<ir::_Tensor_>();
   os() << tensor->name << "[";
-  Print(op->index());
+  // Print(op->index());
+  os() << op->indices.front();
+  for (int i = 1; i < op->indices.size(); ++i) {
+    os() << "][" << op->indices[i];
+  }
   os() << "]";
-  } else {
-    IrPrinter::Visit(op);
-  }
+  // Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
+  // if (dense_strided_ramp.defined()) {  // Loading a continuous Ramp address.
+  //   CHECK(op->type().is_vector());
+  //   PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+  //   os() << "::"
+  //        << "Load(";
+  //   os() << op->tensor.As<ir::_Tensor_>()->name;
+  //   os() << ",";
+  //   Print(dense_strided_ramp);
+  //   os() << ")";
+  // } else if (op->index().type().is_vector()) {
+  //   // gather
+  //   CHECK(op->type().is_vector());
+  //   PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+  //   os() << "::Load(";
+  //   os() << op->tensor.As<ir::_Tensor_>()->name;
+  //   os() << ",";
+  //   Print(op->index());
+  //   os() << ")";
+  // } else if (op->is_addr_tensor()) {
+  //   auto *tensor = op->tensor.As<ir::_Tensor_>();
+  //   os() << tensor->name << "[";
+  //   // Print(op->index());
+  //   os() << op->indices.front();
+  //   for (int i = 1; i < op->indices.size(); ++i) {
+  //     os() << "][" << op->indices[i];
+  //   }
+  //   os() << "]";
+  // } else {
+  //   IrPrinter::Visit(op);
+  // }
 }

 void CodeGenC::Visit(const ir::Store *op) {
@@ -473,7 +524,12 @@ void CodeGenC::Visit(const ir::Store *op) {
   auto *tensor = op->tensor.As<ir::_Tensor_>();
   CHECK(tensor);
   os() << tensor->name << "[";
-  Print(op->index());
+  // Print(op->index());
+  os() << op->indices.front();
+  for (int i = 1; i < op->indices.size(); ++i) {
+    os() << "][" << op->indices[i];
+  }
   os() << "]";
   os() << " = ";
   Print(op->value);
@@ -508,7 +564,10 @@ void CodeGenC::Visit(const ir::Let *op) {
     os() << "auto";
     is_vec = true;
   } else {
-    os() << GetTypeRepr(op->type());
+    if (op->with_dtype) {
+      os() << GetTypeRepr(op->type());
+    }
   }

   os() << " ";
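NOTE: The rewritten Load/Store printers emit one bracket pair per entry in `op->indices` instead of printing a flattened linear offset, so a 2-D access comes out as `A[i][j]` where `Print(op->index())` used to produce something like `A[((i * 32) + j)]` (illustrative shapes); the old vectorized Ramp/gather paths are parked in comments rather than deleted. A self-contained sketch of just the emission loop, with plain strings standing in for CINN `Expr` indices:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Emit name[i0][i1]...[ik] the way the patched CodeGenC::Visit(ir::Load*) does.
    std::string EmitSubscripts(const std::string& name, const std::vector<std::string>& indices) {
      std::ostringstream os;
      os << name << "[" << indices.front();
      for (std::size_t i = 1; i < indices.size(); ++i) os << "][" << indices[i];
      os << "]";
      return os.str();
    }

    int main() { std::cout << EmitSubscripts("A", {"i", "j"}) << "\n"; }  // prints A[i][j]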
diff --git a/cinn/backends/codegen_c.h b/cinn/backends/codegen_c.h
index 1949590c94..32c3e24140 100755
--- a/cinn/backends/codegen_c.h
+++ b/cinn/backends/codegen_c.h
@@ -55,7 +55,7 @@ class CodeGenC : public ir::IrPrinter {
   //! Disable inline the builtin codes(too large) for simpler string comparation.
   void SetInlineBuiltinCodes(bool x = true) { inline_builtin_codes_ = x; }

- protected:
+ public:
   std::string Compile(const ir::LoweredFunc& function);
   std::string Compile(const ir::Buffer& buffer);
@@ -111,7 +111,7 @@ class CodeGenC : public ir::IrPrinter {

   friend class ExternFunctionEmitter;

- protected:
+ public:
   Target target_;
   std::stringstream ss_;
   bool inline_builtin_codes_{true};
diff --git a/cinn/backends/codegen_cuda_dev.cc b/cinn/backends/codegen_cuda_dev.cc
index afc4da26a9..b0d37d9120 100644
--- a/cinn/backends/codegen_cuda_dev.cc
+++ b/cinn/backends/codegen_cuda_dev.cc
@@ -36,6 +36,7 @@ const std::string CodeGenCUDA_Dev::source_header_ =
 #include "float16.h"
 using cinn::common::float16;

+#include
 #include "cinn_cuda_runtime_source.cuh"
 )";
@@ -74,6 +75,9 @@ void CodeGenCUDA_Dev::Compile(const ir::Module &module, const Outputs &outputs)
 }

 std::string CodeGenCUDA_Dev::Compile(const ir::LoweredFunc &func) {
+  // std::cerr << "fun " << func << std::endl;
+  std::cerr << "!!!==============\n";
+  std::cerr << func << std::endl;
   Print(Expr(func));
   return ss_.str();
 }
@@ -223,7 +227,7 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, CodeGenC::OutputK
     PrintBuiltinCodes();

-    for (auto &func : module.functions()) {
+    for (auto &func : module.functions()) {
       Compile(func);
     }
   } else {
@@ -317,8 +321,11 @@ void CodeGenCUDA_Dev::Visit(const ir::Let *op) {
   // with customized_type::kcuda_builtin_vector_t prefix, and save their names
   if (op->type().is_customized() &&
       utils::Startswith(op->type().customized_type(), common::customized_type::kcuda_builtin_vector_t)) {
-    os() << GetTypeRepr(op->type());
-    os() << " ";
+    if (op->with_dtype) {
+      os() << GetTypeRepr(op->type());
+      os() << " ";
+    }
     Print(op->symbol);
     vectorized_tensor_names_.insert(utils::GetStreamCnt(op->symbol));
     os() << " = ";
diff --git a/cinn/backends/codegen_cuda_dev.h b/cinn/backends/codegen_cuda_dev.h
index 1543f4e2c5..a3bb23d6c6 100644
--- a/cinn/backends/codegen_cuda_dev.h
+++ b/cinn/backends/codegen_cuda_dev.h
@@ -67,7 +67,8 @@ class CodeGenCUDA_Dev : public CodeGenC {
   const std::string& GetSourceHeader() const;

- protected:
+ public:
+  void Visit(const ir::_Var_* op) override;
   void Visit(const ir::_LoweredFunc_* op) override;
   void Visit(const ir::Min* op) override;
diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc
index 44c37a0df7..914cfa21ee 100644
--- a/cinn/backends/compiler.cc
+++ b/cinn/backends/compiler.cc
@@ -112,6 +112,7 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code)
   VLOG(3) << "[CUDA] device module:\n" << device_module;
   CodeGenCUDA_Dev codegen(target_);
   auto source_code = codegen.Compile(device_module);
+  std::cerr << "source code here " << source_code << std::endl;
   VLOG(3) << "[CUDA] C:\n" << source_code;

   if (!code.empty()) source_code = code;
diff --git a/cinn/backends/llvm/codegen_llvm.cc b/cinn/backends/llvm/codegen_llvm.cc
index de54937dd2..a0b3a9b7f7 100644
--- a/cinn/backends/llvm/codegen_llvm.cc
+++ b/cinn/backends/llvm/codegen_llvm.cc
@@ -224,6 +224,42 @@ llvm::Value *CodeGenLLVM::Visit(const ir::IntImm *op) {
   return llvm::ConstantInt::get(type, op->value, true);
 }

+llvm::Value *CodeGenLLVM::Visit(const ir::LocalTemp *op) {
+  std::cerr << "not impl in llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, 1, true);
+}
+
+llvm::Value *CodeGenLLVM::Visit(const ir::Sqrt *op) {
+  std::cerr << "not impl in llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, 1, true);
+}
+
+llvm::Value *CodeGenLLVM::Visit(const ir::LoadIndex *op) {
+  std::cerr << "not impl in llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, op->reduce_block, true);
+}
+
+llvm::Value *CodeGenLLVM::Visit(const ir::ReduceMax *op) {
+  std::cerr << "not impl in reduceMax llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, op->axis, true);
+}
+
+llvm::Value *CodeGenLLVM::Visit(const ir::BlockLoad *op) {
+  std::cerr << "not impl in block load llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, 1, true);
+}
+
+llvm::Value *CodeGenLLVM::Visit(const ir::BlockStore *op) {
+  std::cerr << "not impl in block store llvm gen";
+  auto *type = b_->getIntNTy(op->type().bits());
+  return llvm::ConstantInt::get(type, 1, true);
+}
+
 llvm::Value *CodeGenLLVM::Visit(const ir::UIntImm *op) {
   if (op->type().is_bool()) {
     auto *type = b_->getInt1Ty();
diff --git a/cinn/backends/llvm/execution_engine.cc b/cinn/backends/llvm/execution_engine.cc
index 2010d271d7..bcafd28105 100644
--- a/cinn/backends/llvm/execution_engine.cc
+++ b/cinn/backends/llvm/execution_engine.cc
@@ -172,9 +172,9 @@ void ExecutionEngine::Link(const ir::Module &module) {
   LLVMModuleOptimizer optimize(machine.get(), 3, {}, true);
   optimize(m.get());
   CHECK(!llvm::verifyModule(*m, &llvm::errs())) << "Invalid optimized module detected";
-  for (auto &f : *m) {
-    VLOG(5) << "function: " << DumpToString(f);
-  }
+  // for (auto &f : *m) {
+  //   VLOG(5) << "function: " << DumpToString(f);
+  // }

   llvm::raw_svector_ostream rawstream(buffer_);
   llvm::legacy::PassManager pass_manager;
diff --git a/cinn/common/type.cc b/cinn/common/type.cc
index 5d13e8970f..5b3ca1282f 100644
--- a/cinn/common/type.cc
+++ b/cinn/common/type.cc
@@ -480,7 +480,8 @@ Type Str2Type(const std::string &type) {
 std::string Type2Str(const Type &type) {
   switch (type.type()) {
     case Type::type_t::Int:
-      return "int" + std::to_string(type.bits());
+      return "int";
+      // return "int" + std::to_string(type.bits());

     case Type::type_t::UInt:
       if (type.bits() == 1) {
@@ -490,7 +491,8 @@ std::string Type2Str(const Type &type) {
       }

     case Type::type_t::Float:
-      return "float" + std::to_string(type.bits());
+      return "float";
+      // return "float" + std::to_string(type.bits());

     case Type::type_t::Void:
       return "void";
diff --git a/cinn/frontend/net_builder.cc b/cinn/frontend/net_builder.cc
index 26b8c98805..084c31b758 100644
--- a/cinn/frontend/net_builder.cc
+++ b/cinn/frontend/net_builder.cc
@@ -80,6 +80,26 @@ void NetBuilder::InferShape(Instruction instr) const {
   }
 }

+Variable NetBuilder::Load(const Variable& input, const Variable& slice) {
+  Instruction instr("load", {input, slice});
+
+  auto& outs = instr->outputs;
+  outs.resize({1});
+  outs[0]->shape = slice->shape;
+
+  AppendInstruction(instr);
+  return instr.GetOutput(0);
+}
+
+Variable NetBuilder::Store(const Variable& output, const Variable& slice, const Variable& update_value) {
+  Instruction instr("store", {output, slice, update_value});
+
+  // Size the output like Load does so GetOutput(0) below stays in bounds.
+  auto& outs = instr->outputs;
+  outs.resize({1});
+  outs[0]->shape = output->shape;
+
+  AppendInstruction(instr);
+  return instr.GetOutput(0);
+}
+
 const std::vector<Variable>& NetBuilder::CustomInstr(const std::string& type,
                                                      const std::vector<Variable>& inputs,
                                                      const AttributeMap& attrs) {
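NOTE: `NetBuilder::Load` forwards the slice's shape to its single output, while `Store` simply records the instruction; both emit plain "load"/"store" instructions, so matching operators have to be registered for a program that uses them to compile. A hypothetical wiring of the two new ops (variable names and shapes are illustrative only):

    NetBuilder builder("load_store_demo");
    auto input  = builder.CreateInput(Float(32), {128, 128}, "input");
    auto slice  = builder.CreateInput(Float(32), {128}, "slice");
    auto update = builder.CreateInput(Float(32), {128}, "update");
    auto loaded = builder.Load(input, slice);           // output shape follows slice->shape
    auto stored = builder.Store(input, slice, update);  // writes update back through slice
    auto program = builder.Build();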
diff --git a/cinn/frontend/net_builder.h b/cinn/frontend/net_builder.h
index 6a5e534476..cee23008b9 100644
--- a/cinn/frontend/net_builder.h
+++ b/cinn/frontend/net_builder.h
@@ -152,6 +152,8 @@ class NetBuilder {
   const std::vector<Variable>& CustomInstr(const std::string& type,
                                            const std::vector<Variable>& inputs,
                                            const AttributeMap& attrs);

+  Variable Load(const Variable& input, const Variable& slice);
+  Variable Store(const Variable& output, const Variable& slice, const Variable& update_value);
+
 protected:
  /**
   * @brief Helper function of UnaryOp.
"max"; - std::vector ksize{3, 3}; - std::vector strides{2, 2}; - std::vector paddings{1, 1, 1, 1}; - bool ceil_mode = false; - bool exclusive = true; - bool global_pooling = false; - std::string data_format = "NCHW"; - bool adaptive = false; - std::string padding_algorithm = "EXPLICIT"; - Variable pool_out = builder.Pool2d(input, - pooling_type, - ksize, - strides, - paddings, - ceil_mode, - exclusive, - global_pooling, - data_format, - adaptive, - padding_algorithm); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - - std::unordered_set fetch_ids; - auto graph = Optimize(&program, fetch_ids, target); - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input.id())); - scope->Var(std::string(pool_out->id)); - - auto input_tensor = scope->GetTensor(std::string(input.id())); - SetRandData(input_tensor, target); - runtime_program->Execute(); -} - -TEST(net_build, program_execute_reverse) { - const int B = 16; - const int C = 3; - const int H = 224; - const int W = 224; - - NetBuilder builder("net_builder"); - Placeholder input = builder.CreateInput(Float(32), {B, C, H, W}, "Img"); - Variable reverse_out = builder.Reverse(input, {2, 3}); - auto program = builder.Build(); - -#ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); -#else - Target target = common::DefaultHostTarget(); -#endif - - std::unordered_set fetch_ids; - auto graph = Optimize(&program, fetch_ids, target); - LOG(INFO) << "graph:\n" << graph->Visualize(); - - auto scope = BuildScope(target, graph); - hlir::framework::GraphCompiler gc(target, scope, graph); - auto runtime_program = gc.Build(); - - scope->Var(std::string(input.id())); - scope->Var(std::string(reverse_out->id)); - - auto input_tensor = scope->GetTensor(std::string(input.id())); - SetRandData(input_tensor, target); - runtime_program->Execute(); -} +// TEST(net_build, program_execute_multi_elementwise_add) { +// auto program = CreateAddProgram(); +// #ifdef CINN_WITH_CUDA +// Target target = common::DefaultNVGPUTarget(); +// #else +// Target target = common::DefaultHostTarget(); +// #endif + +// std::unordered_set fetch_ids; +// auto graph = Optimize(&program, fetch_ids, target); +// LOG(INFO) << "graph:\n" << graph->Visualize(); + +// auto scope = BuildScope(target, graph); +// hlir::framework::GraphCompiler gc(target, scope, graph); +// auto runtime_program = gc.Build(); + +// scope->Var("A"); +// scope->Var("B"); + +// auto A = scope->GetTensor("A"); +// auto B = scope->GetTensor("B"); +// SetRandData(A, target); +// SetRandData(B, target); + +// runtime_program->Execute(); +// } +// #ifdef CINN_WITH_CUDA +// TEST(net_build, program_execute_fc) { +// constexpr int B = 10; // batch size +// constexpr int M = 32; +// constexpr int K = 18; +// constexpr int N = 24; + +// NetBuilder builder("net_builder"); +// auto a = builder.CreateInput(Float(32), {B * M, K}, "A"); +// auto w = builder.CreateInput(Float(32), {K, N}, "W"); // weight +// auto b = builder.CreateInput(Float(32), {N}, "B"); // bias + +// auto mul_out = builder.Matmul(a, w); +// auto add_out = builder.Add(mul_out, b); +// auto program = builder.Build(); + +// #ifdef CINN_WITH_CUDA +// Target target = common::DefaultNVGPUTarget(); +// #else +// Target target = common::DefaultHostTarget(); +// #endif + +// std::unordered_set fetch_ids; +// auto graph = 
+// TEST(net_build, program_execute_multi_elementwise_add) {
+//   auto program = CreateAddProgram();
+// #ifdef CINN_WITH_CUDA
+//   Target target = common::DefaultNVGPUTarget();
+// #else
+//   Target target = common::DefaultHostTarget();
+// #endif
+
+//   std::unordered_set<std::string> fetch_ids;
+//   auto graph = Optimize(&program, fetch_ids, target);
+//   LOG(INFO) << "graph:\n" << graph->Visualize();
+
+//   auto scope = BuildScope(target, graph);
+//   hlir::framework::GraphCompiler gc(target, scope, graph);
+//   auto runtime_program = gc.Build();
+
+//   scope->Var<hlir::framework::Tensor>("A");
+//   scope->Var<hlir::framework::Tensor>("B");
+
+//   auto A = scope->GetTensor("A");
+//   auto B = scope->GetTensor("B");
+//   SetRandData<float>(A, target);
+//   SetRandData<float>(B, target);
+
+//   runtime_program->Execute();
+// }
+// #ifdef CINN_WITH_CUDA
+// TEST(net_build, program_execute_fc) {
+//   constexpr int B = 10;  // batch size
+//   constexpr int M = 32;
+//   constexpr int K = 18;
+//   constexpr int N = 24;
+
+//   NetBuilder builder("net_builder");
+//   auto a = builder.CreateInput(Float(32), {B * M, K}, "A");
+//   auto w = builder.CreateInput(Float(32), {K, N}, "W");  // weight
+//   auto b = builder.CreateInput(Float(32), {N}, "B");     // bias
+
+//   auto mul_out = builder.Matmul(a, w);
+//   auto add_out = builder.Add(mul_out, b);
+//   auto program = builder.Build();
+
+// #ifdef CINN_WITH_CUDA
+//   Target target = common::DefaultNVGPUTarget();
+// #else
+//   Target target = common::DefaultHostTarget();
+// #endif
+
+//   std::unordered_set<std::string> fetch_ids;
+//   auto graph = Optimize(&program, fetch_ids, target);
+//   LOG(INFO) << "graph:\n" << graph->Visualize();
+
+//   auto scope = BuildScope(target, graph);
+//   hlir::framework::GraphCompiler gc(target, scope, graph);
+//   auto runtime_program = gc.Build();
+
+//   scope->Var<hlir::framework::Tensor>(std::string(a.id()));
+//   scope->Var<hlir::framework::Tensor>(std::string(w.id()));
+//   scope->Var<hlir::framework::Tensor>(std::string(b.id()));
+//   scope->Var<hlir::framework::Tensor>(std::string(mul_out->id));
+
+//   auto a_ten = scope->GetTensor(std::string(a.id()));
+//   auto w_ten = scope->GetTensor(std::string(w.id()));
+//   auto b_ten = scope->GetTensor(std::string(b.id()));
+//   auto fake_out_ten = scope->GetTensor(std::string(mul_out->id));
+//   auto add_out_ten = scope->GetTensor(std::string(add_out->id));
+//   SetRandData<float>(a_ten, target);
+//   SetRandData<float>(w_ten, target);
+//   SetRandData<float>(b_ten, target);
+
+//   runtime_program->Execute();
+// }
+// #endif
+
+// TEST(net_build, program_execute_pool2d) {
+//   const int B = 16;
+//   const int C = 64;
+//   const int H = 112;
+//   const int W = 112;
+
+//   NetBuilder builder("net_builder");
+//   Placeholder input = builder.CreateInput(Float(32), {B, C, H, W}, "Img");
+//   std::string pooling_type = "max";
+//   std::vector<int> ksize{3, 3};
+//   std::vector<int> strides{2, 2};
+//   std::vector<int> paddings{1, 1, 1, 1};
+//   bool ceil_mode = false;
+//   bool exclusive = true;
+//   bool global_pooling = false;
+//   std::string data_format = "NCHW";
+//   bool adaptive = false;
+//   std::string padding_algorithm = "EXPLICIT";
+//   Variable pool_out = builder.Pool2d(input,
+//                                      pooling_type,
+//                                      ksize,
+//                                      strides,
+//                                      paddings,
+//                                      ceil_mode,
+//                                      exclusive,
+//                                      global_pooling,
+//                                      data_format,
+//                                      adaptive,
+//                                      padding_algorithm);
+//   auto program = builder.Build();
+
+// #ifdef CINN_WITH_CUDA
+//   Target target = common::DefaultNVGPUTarget();
+// #else
+//   Target target = common::DefaultHostTarget();
+// #endif
+
+//   std::unordered_set<std::string> fetch_ids;
+//   auto graph = Optimize(&program, fetch_ids, target);
+//   auto scope = BuildScope(target, graph);
+//   hlir::framework::GraphCompiler gc(target, scope, graph);
+//   auto runtime_program = gc.Build();
+
+//   scope->Var<hlir::framework::Tensor>(std::string(input.id()));
+//   scope->Var<hlir::framework::Tensor>(std::string(pool_out->id));
+
+//   auto input_tensor = scope->GetTensor(std::string(input.id()));
+//   SetRandData<float>(input_tensor, target);
+//   runtime_program->Execute();
+// }
+
+// TEST(net_build, program_execute_reverse) {
+//   const int B = 16;
+//   const int C = 3;
+//   const int H = 224;
+//   const int W = 224;
+
+//   NetBuilder builder("net_builder");
+//   Placeholder input = builder.CreateInput(Float(32), {B, C, H, W}, "Img");
+//   Variable reverse_out = builder.Reverse(input, {2, 3});
+//   auto program = builder.Build();
+
+// #ifdef CINN_WITH_CUDA
+//   Target target = common::DefaultNVGPUTarget();
+// #else
+//   Target target = common::DefaultHostTarget();
+// #endif
+
+//   std::unordered_set<std::string> fetch_ids;
+//   auto graph = Optimize(&program, fetch_ids, target);
+//   LOG(INFO) << "graph:\n" << graph->Visualize();
+
+//   auto scope = BuildScope(target, graph);
+//   hlir::framework::GraphCompiler gc(target, scope, graph);
+//   auto runtime_program = gc.Build();
+
+//   scope->Var<hlir::framework::Tensor>(std::string(input.id()));
+//   scope->Var<hlir::framework::Tensor>(std::string(reverse_out->id));
+
+//   auto input_tensor = scope->GetTensor(std::string(input.id()));
+//   SetRandData<float>(input_tensor, target);
+//   runtime_program->Execute();
+// }

 TEST(net_build, program_execute_gather) {
   const int B = 4;
diff --git a/cinn/frontend/op_mapper_registry.cc b/cinn/frontend/op_mapper_registry.cc
index 10dcac21bb..b5e637c8c1 100644
--- a/cinn/frontend/op_mapper_registry.cc
+++ b/cinn/frontend/op_mapper_registry.cc
@@ -28,6 +28,10 @@ void OpMapperContext::AddVar(const std::string& origin_name, const Variable& var
             << " to new cinn var [" << var->id << "]";
   }
   (*var_map_)[origin_name] = var;
+  if (origin_name == "tmp_1") {
+    std::cerr << "add origin here tmp 1" << std::endl;
+  }
   VLOG(4) << "Add variable [" << origin_name << "] to [" << var->id << "] with shape=["
           << cinn::utils::Join(var->shape, ",") << "], dtype=" << var->type;
 }
diff --git a/cinn/frontend/pass/decomposer.cc b/cinn/frontend/pass/decomposer.cc
index ace930b7ec..b6357bb8af 100755
--- a/cinn/frontend/pass/decomposer.cc
+++ b/cinn/frontend/pass/decomposer.cc
@@ -42,14 +42,15 @@ class DecomposerPass : public ProgramPass {
     DecomposerContext context(&builder, &var_map);
     for (size_t i = 0; i < prog->size(); i++) {
       auto instr = (*prog)[i];
-      auto decomposer = InstrDecomposerRegistry::Global()->Find(instr->op_type, target);
-      if (decomposer) {
-        VLOG(3) << "Run decomposer of op " << instr->op_type;
-        decomposer->Run(instr, context);
-      } else {
+      // auto decomposer = InstrDecomposerRegistry::Global()->Find(instr->op_type, target);
+      // auto decomposer = nullptr;
+      // if (decomposer) {
+      //   VLOG(3) << "Run decomposer of op " << instr->op_type;
+      //   decomposer->Run(instr, context);
+      // } else {
       VLOG(3) << "Don't run decomposer of op " << instr->op_type;
       builder.AppendInstruction(instr);
-      }
+      // }
     }
     VLOG(3) << "Before builder.Build()";
     *prog = builder.Build();
diff --git a/cinn/frontend/syntax.h b/cinn/frontend/syntax.h
index 3e86bc2308..1c71b3e3af 100644
--- a/cinn/frontend/syntax.h
+++ b/cinn/frontend/syntax.h
@@ -70,6 +70,9 @@ struct Variable : public common::Shared<_Variable_> {
   _Variable_* operator->() { return get(); }
   const _Variable_* operator->() const { return get(); }
+
+  std::vector vec_data;
+  // vector<>
 };
diff --git a/cinn/hlir/framework/graph.h b/cinn/hlir/framework/graph.h
index 8b7e0fb719..06aa872738 100644
--- a/cinn/hlir/framework/graph.h
+++ b/cinn/hlir/framework/graph.h
@@ -206,12 +206,13 @@ class Graph : public cinn::common::Graph {
   void VisualizeGroupedGraph(const std::vector<std::vector<Node*>>& groups,
                              const std::unordered_set<std::string>& fetch_var_ids = {});

- private:
+
   void VisualizeGroups(const std::vector<std::vector<Node*>>& groups,
                        const std::unordered_set<std::string>& fetch_var_ids = {});

   std::vector<std::vector<Node*>> FusionGroupsToGroups();

+ private:
   std::string viz_path_;
   static std::atomic_size_t viz_count_;
diff --git a/cinn/hlir/framework/graph_compiler.cc b/cinn/hlir/framework/graph_compiler.cc
index 7f65ee3b81..807254f1c3 100644
--- a/cinn/hlir/framework/graph_compiler.cc
+++ b/cinn/hlir/framework/graph_compiler.cc
@@ -765,9 +765,12 @@ GraphCompiler::CompilationResult GraphCompiler::Build(const GraphCompiler::Compi
   ParallelCompiler::CompileOptions option;
   option.lowered_funcs = options.lowered_funcs;

+  std::cerr << "parallel compiler" << std::endl;
   parallel_compiler_ = std::make_shared<ParallelCompiler>(scope_, graph_, option, target_);
+  std::cerr << "parallel compiler 0" << std::endl;
   auto instructions = (*parallel_compiler_.get())();
+  std::cerr << "parallel compiler 1" << std::endl;

   if (options.remove_unused_variables) {
     RemoveInvalidVariables(instructions);
diff --git a/cinn/hlir/framework/instruction.cc b/cinn/hlir/framework/instruction.cc
index 9a669529da..f2e246cd3c 100644
--- a/cinn/hlir/framework/instruction.cc
+++ b/cinn/hlir/framework/instruction.cc
@@ -241,8 +241,18 @@ void Instruction::Run(const std::map<std::string, cinn_pod_value_t>* name2podarg
   } else {
     VLOG(3) << "Runing extern function " << function_name_;
     for (int idx = 0; idx < fn_ptrs_.size(); ++idx) {
+      auto& pod_args = args_cached_[idx];
       VLOG(3) << "Runing func name: " << fn_names_[idx];
-      auto& pod_args = args_cached_[idx];
+      // int N = 128 * 12 * 128 * 128;
+      // float *out = (float *)malloc(N * sizeof(float));
+      // cudaMemcpy(out, reinterpret_cast<void*>(pod_args[0].operator cinn_buffer_t*()->memory),
+      //            N * sizeof(float), cudaMemcpyDeviceToHost);
+      // for (int i = 0; i < 10; ++i) {
+      //   std::cerr << out[i] << ",";
+      // }
+      // std::cerr << "kernel out " << std::endl;
+
       CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by calling SetLoweredFunc method";
       if (!dryrun) {
         if (target_ == common::DefaultNVGPUTarget()) {
@@ -251,6 +261,14 @@ void Instruction::Run(const std::map<std::string, cinn_pod_value_t>* name2podarg
           ((lower_func_ptr_t)fn_ptrs_[idx])(static_cast<void*>(pod_args.data()), pod_args.size());
         }
       }
+
+      // cudaMemcpy(out, reinterpret_cast<void*>(pod_args[1].operator cinn_buffer_t*()->memory),
+      //            N * sizeof(float), cudaMemcpyDeviceToHost);
+      // for (int i = 0; i < 10; ++i) {
+      //   std::cerr << out[i] << ",";
+      // }
+      // std::cerr << "kernel out " << std::endl;
+
     }
     VLOG(3) << "Done Runing extern function " << function_name_;
   }
diff --git a/cinn/hlir/framework/node.h b/cinn/hlir/framework/node.h
index 33f034d43c..176a7dd1dd 100644
--- a/cinn/hlir/framework/node.h
+++ b/cinn/hlir/framework/node.h
@@ -95,8 +95,8 @@ class Node : public common::GraphNode {
   //! Get the output tensors in order to match tensors correctly. If do refresh, we will update the links.
   const std::vector<common::Shared<common::GraphEdge>> &outlinks_in_order(bool refresh = false) const;

-  inline const Operator *op() const { return this->attrs.op; }
+  inline const Operator *op() const { return this->attrs.op; }
   inline bool is_variable() { return (this->attrs.op == nullptr); }

   inline uint32_t num_outputs() {
diff --git a/cinn/hlir/framework/op.h b/cinn/hlir/framework/op.h
index 2a158c720e..7fb5f91a4f 100755
--- a/cinn/hlir/framework/op.h
+++ b/cinn/hlir/framework/op.h
@@ -64,6 +64,7 @@ enum OpPatternKind {
   kNonFusible = 8
 };

+
 struct OpRegistry : public Registry<Operator> {
   std::recursive_mutex mutex;
   std::atomic<int> op_counter{0};
diff --git a/cinn/hlir/framework/op_lowering.cc b/cinn/hlir/framework/op_lowering.cc
index eb7890c48f..38304a55cd 100644
--- a/cinn/hlir/framework/op_lowering.cc
+++ b/cinn/hlir/framework/op_lowering.cc
@@ -19,6 +19,8 @@
 #include "cinn/ir/ir_schedule.h"
 #include "cinn/optim/transform_gpu_forloop.h"

+#include "cinn/ir/thread_model.h"
+
 DECLARE_bool(cinn_ir_schedule);

 namespace cinn {
@@ -52,6 +54,7 @@ std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {
   VLOG(3) << "Lowering Group : " << group->group_id << " , Op Pattern : " << group->op_pattern_kind;
   group->input_names.clear();
   group->output_names.clear();
+
   if (FLAGS_cinn_ir_schedule) {
     switch (group->op_pattern_kind) {
       case framework::kElementWise:
@@ -72,6 +75,223 @@ std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {
   }
 }

+std::vector<ir::LoweredFunc> OpLowerer::ThreadModelTest(Graph* graph) {
+  auto topo_order = graph->topological_order();
+  auto& nodes     = std::get<0>(topo_order);
+
+  ir::CodeGenOption opt;
+  opt.flatten_block       = 32;
+  opt.reduce_block        = 128;
+  opt.num_warp            = 8;
+  opt.num_thread_per_warp = 32;
+
+  std::vector<Node*> vec_nodes;
+
+  opt.flatten_block = 1;
+  opt.reduce_block  = 1;
+  opt.op_type       = ir::OpType::kElementwise;
+
+  // Hoisted out of the loop so the max actually accumulates across nodes.
+  int max_elem_flatten_block = 0;
+
+  for (auto& n : nodes) {
+    auto node = n->safe_as<Node>();
+    if (!node || node->op() == nullptr) {
+      continue;
+    }
+
+    vec_nodes.push_back(node);
+    auto node_data = GetNodeData(node);
+
+    std::cerr << " process node: " << node->id() << " with op type: " << node->op()->name << std::endl;
+
+    if (node->op()->name == "reduce_sum" || node->op()->name == "reduce_max") {
+      auto reduce_axis = absl::get<std::vector<int>>(node->attrs.attr_store.at("dim"));
+      if (reduce_axis.size() != 1) {
+        std::cerr << "reduce only supports a single reduce dim" << std::endl;
+        throw std::runtime_error("reduce only supports a single reduce dim");
+      }
+      auto in_data = GetInputNodeData(node);
+      auto shape   = this->shape_dict_.at(in_data[0]->id());
+      int dim      = reduce_axis[0];
+      if (dim < 0) {
+        dim += shape.size();
+      }
+
+      int flatten_numel = 1;
+      int reduce_numel  = 1;
+
+      for (size_t i = 0; i < shape.size(); ++i) {
+        // only for reduce 1 dim
+        std::cerr << shape[i] << '\t';
+        if (i != dim) {
+          flatten_numel *= shape[i];
+        } else {
+          reduce_numel *= shape[i];
+        }
+      }
+      std::cerr << std::endl;
+      opt.reduce_numel  = reduce_numel;
+      opt.flatten_numel = flatten_numel;
+
+      std::cerr << "dim " << dim << std::endl;
+      if (dim + 1 == shape.size()) {
+        // contiguous (last-axis) reduce
+        opt.reduce_dim = shape[dim];
+        std::cerr << "shape dim " << shape[dim] << std::endl;
+        if (shape[dim] <= 128) {
+          opt.reduce_block  = 128;
+          opt.flatten_block = 32;
+          opt.op_type       = ir::OpType::kContiguousWarpReduce;
+        } else if (shape[dim] >= 512) {
+          opt.reduce_block  = 1024;
+          opt.flatten_block = 1;
+          opt.op_type       = ir::OpType::kContiguousBlockReduce;
+        }
+      } else {
+        std::cerr << " dim " << dim << std::endl;
+        for (auto& v : shape) {
+          std::cerr << v << std::endl;
+        }
+        std::cerr << std::endl;
+        std::cerr << "not support non-contiguous reduce" << std::endl;
+        throw std::runtime_error("not support non-contiguous reduce");
+      }
+    } else {
+      auto in_data = GetInputNodeData(node);
+      auto shape   = this->shape_dict_.at(in_data[0]->id());
+
+      int flatten_numel = 1;
+      int reduce_numel  = 1;
+
+      for (size_t i = 0; i < shape.size(); ++i) {
+        flatten_numel *= shape[i];
+      }
+      std::cerr << std::endl;
+      opt.reduce_numel = reduce_numel;
+      if (flatten_numel > max_elem_flatten_block) {
+        opt.flatten_numel      = flatten_numel;
+        max_elem_flatten_block = flatten_numel;
+      }
+    }
+    if (opt.reduce_block == 1 && opt.flatten_block == 1) {
+      opt.flatten_block = 1024;
+    }
+
+    std::cerr << node_data->id() << std::endl;
+    auto shape = this->shape_dict_.at(node_data->id());
+
+    for (auto& s : shape) {
+      std::cerr << s << ",";
+    }
+    std::cerr << std::endl;
+
+    auto in_data = GetInputNodeData(node);
+
+    for (auto& data : in_data) {
+      auto shape = this->shape_dict_.at(data->id());
+      for (auto& s : shape) {
+        std::cerr << s << ",";
+      }
+      std::cerr << std::endl;
+    }
+  }
+
+  auto group0 = graph->fusion_groups[0];
+
+  const std::unordered_set<std::string> fetch_var_ids;
+  auto get_all_out_names = [](const std::vector<Node*>& nodes) {
+    // collect all op's output var name in group
+    std::unordered_set<std::string> out_names;
+    for (auto* node : nodes) {
+      for (const auto& link : node->outlinks()) {
+        auto* out_node = link->sink()->safe_as<NodeData>();
+        out_names.emplace(out_node->id());
+      }
+    }
+    return out_names;
+  };
+  auto get_feed_list = [](const std::vector<Node*>& nodes, const std::unordered_set<std::string>& out_names) {
+    // if the op's input var name cannot be found in out_names, it is the group's feed var
+    std::unordered_set<std::string> feed_list;
+    for (auto* node : nodes) {
+      for (const auto& link : node->inlinks()) {
+        auto* in_node = link->source()->safe_as<NodeData>();
+        if (!out_names.count(in_node->id())) {
+          feed_list.emplace(in_node->id());
+        }
+      }
+    }
+    return std::vector<std::string>(feed_list.begin(), feed_list.end());
+  };
+  auto get_fetch_list = [&](const std::vector<Node*>& nodes, const std::unordered_set<std::string>& out_names) {
+    // if the fetch var is in out_names, it's the group's fetch var, otherwise not
+    std::unordered_set<std::string> in_names;
+    for (auto* node : nodes) {
+      for (const auto& link : node->inlinks()) {
+        auto* in_node = link->source()->safe_as<NodeData>();
+        in_names.emplace(in_node->id());
+      }
+    }
+    std::vector<std::string> fetch_list;
+    for (const auto& out : out_names) {
+      if (!in_names.count(out) || fetch_var_ids.count(out)) {
+        // if the var is not any op's input, or is in fetch_var_ids, it's in the group's fetch list
+        fetch_list.emplace_back(out);
+      }
+    }
+    return fetch_list;
+  };
+
+  const auto& out_names = get_all_out_names(vec_nodes);
+  const auto& feed_list = get_feed_list(vec_nodes, out_names);
+  auto fetch_name_list  = get_fetch_list(vec_nodes, out_names);
+  auto out = cinn::ir::process_warp_reduce(graph, opt, feed_list, fetch_name_list);
+
+  for (auto& name : feed_list) {
+    group0->input_names.push_back(name);
+  }
+
+  for (auto& name : fetch_name_list) {
+    group0->output_names.push_back(name);
+  }
+
+  return {out};
+}
+
 std::vector<ir::LoweredFunc> OpLowerer::LowerWithoutSchedule(GroupPtr& group) {
   VLOG(3) << "Lowering Group : " << group->group_id << " , Op Pattern : " << group->op_pattern_kind;
   if (FLAGS_cinn_ir_schedule) {
@@ -166,6 +386,8 @@ std::vector<ir::LoweredFunc> OpLowerer::IRLowerOp(IRComputeFunction compute,
   return {func};
 }

+
+
 std::vector<ir::LoweredFunc> OpLowerer::IRLowerOpWithoutSchedule(IRComputeFunction compute, GroupPtr& group) {
   poly::StageMap stages;
   std::vector<ir::Tensor> arg_tensors;
@@ -240,6 +462,7 @@ std::vector<ir::LoweredFunc> OpLowerer::IRLowerOpWithoutSchedule(IRComputeFuncti
   optim::OptimizeExprGPU(&(func_body));
 #endif

+  std::cerr << "group name " << group->GetFuncName() << std::endl;
   auto temp_buffers = lang::GetTempBuffers(arg_tensors, stages, func_body);
   auto func =
       ir::_LoweredFunc_::Make(group->GetFuncName(), func_args, ir_sch.GetModule().GetExprs().at(0), temp_buffers);
@@ -283,6 +506,7 @@ std::vector<ir::Expr> OpLowerer::IRElementwiseCompute(poly::StageMap& stages,
     CHECK_EQ(pack.size(), 2U);

     Expr expr = pack[0];
+
     poly::StageMap node_stages = pack.back();
     tensor_inputs.push_back(expr.as_tensor_ref());
     tensor_map[node_data->id()] = expr.as_tensor_ref();
@@ -290,6 +514,10 @@ std::vector<ir::Expr> OpLowerer::IRElementwiseCompute(poly::StageMap& stages,
     auto func = lang::LowerVec("fn_" + node->id(), node_stages, tensor_inputs, {}, {}, nullptr, this->target_, true);

     CHECK_EQ(func.size(), 1);
+    for (size_t i = 0; i < func.size(); ++i) {
+      std::cerr << "elementwise func " << i << "\t" << func[i] << std::endl;
+    }

     if (apply_impl_schedule) {
       std::vector<common::CINNValue> schedule_inputs;
       // collect tensor
@@ -366,7 +594,7 @@ std::vector<ir::Expr> OpLowerer::IRReduceCompute(poly::StageMap& stages,
   VLOG(2) << "ReduceCompute Group : " << sub_group->group_id;
   auto& cinn_strategy   = Operator::GetAttrs<StrategyFunction>("CINNStrategy");
   auto& op_pattern_dict = Operator::GetAttrs<OpPatternKind>("OpPattern");
-
+  std::cerr << "reduce compute" << std::endl;
   std::vector<ir::Expr> ast_exprs;
   for (auto& node : sub_group->nodes) {
     auto node_data = GetNodeData(node);
@@ -409,6 +637,10 @@ std::vector<ir::Expr> OpLowerer::IRReduceCompute(poly::StageMap& stages,
     }

     auto func = lang::LowerVec("fn_" + node->id(), tmp_stages, tensor_inputs, {}, {}, nullptr, this->target_, true);
+    // for (size_t i = 0; i < func.size(); ++i) {
+    //   std::cerr << "reduce func " << i << "\t" << func[i] << std::endl;
+    // }
+
     // node is kReduction
     if (op_pattern_dict[node->op()] == framework::kReduction && apply_impl_schedule) {
       std::vector<common::CINNValue> schedule_inputs;
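NOTE: With num_warp = 8 and num_thread_per_warp = 32 the default launch footprint is 256 threads per block. For a trailing-axis reduce the selection above keys off the reduce extent: shape {32, 128} with dim 1 takes the warp-reduce branch (reduce_block = 128, flatten_block = 32), shape {32, 1024} takes the block-reduce branch (reduce_block = 1024, flatten_block = 1), and extents strictly between 128 and 512 fall through with neither branch taken. A standalone mirror of that decision (not CINN API, just the logic above):

    #include <cstdio>
    #include <stdexcept>
    #include <vector>

    enum class OpType { kElementwise, kContiguousWarpReduce, kContiguousBlockReduce };
    struct Choice { OpType op_type; int reduce_block; int flatten_block; };

    Choice PickReduceConfig(const std::vector<int>& shape, int dim) {
      if (dim + 1 != static_cast<int>(shape.size())) throw std::runtime_error("non-contiguous reduce unsupported");
      if (shape[dim] <= 128) return {OpType::kContiguousWarpReduce, 128, 32};
      if (shape[dim] >= 512) return {OpType::kContiguousBlockReduce, 1024, 1};
      return {OpType::kElementwise, 1, 1};  // 128 < extent < 512: the patch keeps the defaults
    }

    int main() {
      auto c = PickReduceConfig({32, 1024}, 1);
      std::printf("reduce_block=%d flatten_block=%d\n", c.reduce_block, c.flatten_block);  // 1024 1
    }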
diff --git a/cinn/hlir/framework/op_lowering.h b/cinn/hlir/framework/op_lowering.h
index 6e291afeb6..6e4f152c47 100755
--- a/cinn/hlir/framework/op_lowering.h
+++ b/cinn/hlir/framework/op_lowering.h
@@ -57,10 +57,11 @@ class OpLowerer {
   OpLowerer(const absl::flat_hash_map<std::string, Type>&,
             const absl::flat_hash_map<std::string, shape_t>&,
             const Target&);
-  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);
+  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);
   std::vector<ir::LoweredFunc> LowerWithoutSchedule(GroupPtr& group);
-
+  std::vector<ir::LoweredFunc> ThreadModelTest(Graph* graph);
 private:
+
   std::vector<ir::LoweredFunc> IRLowerOp(IRComputeFunction, IRScheduleFunction, GroupPtr&);
   std::vector<ir::LoweredFunc> IRLowerNonFusibleOp(GroupPtr&, bool);
   std::vector<ir::LoweredFunc> IRLowerOpWithoutSchedule(IRComputeFunction, GroupPtr&);
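NOTE: The test driver below stops iterating fusion groups and instead lowers the whole graph through the new public entry point. The calling pattern, as exercised by Compile() in op_lowering_test.cc, is:

    OpLowerer op_lowerer(dtype_dict, shape_dict, target);
    auto res = op_lowerer.ThreadModelTest(graph.get());  // one LoweredFunc for the whole graph
    CodeGen(res[0]);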
diff --git a/cinn/hlir/framework/op_lowering_test.cc b/cinn/hlir/framework/op_lowering_test.cc
index 3b3601055a..c3aefafd9d 100644
--- a/cinn/hlir/framework/op_lowering_test.cc
+++ b/cinn/hlir/framework/op_lowering_test.cc
@@ -32,32 +32,34 @@ namespace framework {
 using namespace frontend;

 void CodeGen(ir::LoweredFunc& func) {
-#ifdef CINN_WITH_CUDA
+// #ifdef CINN_WITH_CUDA
   auto target = common::DefaultNVGPUTarget();
   Module::Builder builder("module_builder", target);
-
+  std::cerr << "func " << func << std::endl;
   builder.AddFunction(func);
+  std::cerr << "fin add func" << std::endl;
   auto module   = builder.Build();
   auto compiler = backends::Compiler::Create(target);

   std::string code = "";
   compiler->Build(module, code);
-#else
-  auto target = common::DefaultHostTarget();
-  ir::Module::Builder builder("Module_Builder", target);
-  builder.AddFunction(func);
+// #else
+//   auto target = common::DefaultHostTarget();
+//   ir::Module::Builder builder("Module_Builder", target);
+//   builder.AddFunction(func);

-  CodeGenCX86 codegen(target, CodeGenCX86::Feature::AVX512);
-  codegen.SetInlineBuiltinCodes(false);
-  auto source_code = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
-  LOG(INFO) << "compiled code of " << func->name << "is:\n\n\n" << source_code;
-#endif
+//   CodeGenCX86 codegen(target, CodeGenCX86::Feature::AVX512);
+//   codegen.SetInlineBuiltinCodes(false);
+//   auto source_code = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+//   LOG(INFO) << "compiled code of " << func->name << "is:\n\n\n" << source_code;
+// #endif
 }

 void Compile(NetBuilder& net_builder) {
   auto program = net_builder.Build();
+  std::cerr << program << std::endl;
   auto target  = common::DefaultTarget();
-  RunDecomposer(&program, target);
+  // RunDecomposer(&program, target);

   auto graph = std::make_shared<Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
@@ -66,1110 +68,1155 @@ void Compile(NetBuilder& net_builder) {
   auto& dtype_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, Type>>("inferdtype");
   auto& shape_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, shape_t>>("infershape");

+  std::cerr << graph->DebugGroupedGraph() << std::endl;
   OpLowerer op_lowerer(dtype_dict, shape_dict, target);
-  for (auto& fusion_op : graph->fusion_groups) {
-    auto lowered_func = op_lowerer.Lower(fusion_op);
-    CHECK_EQ(lowered_func.size(), 1);
-    CodeGen(lowered_func[0]);
-  }
-}
-
-TEST(OpFusionPass, Reduce_With_Last_Axis_1) {
-  NetBuilder net_builder("Reduce_With_Last_Axis_1");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {10, 100, 1}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 2});
-  }
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_With_Output) {
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_With_Output");
-  auto layer_norm_51__tmp_1 = net_builder.CreateInput(Float(32), {256}, "layer_norm_51__tmp_1");
-  auto var_3216 = net_builder.CreateInput(Float(32), {256, 60}, "var_3216");
-  auto var_3202 = net_builder.CreateInput(Float(32), {1, 60}, "var_3202");
-  auto var_3212 = net_builder.CreateInput(Float(32), {256, 60}, "var_3212");
-
-  auto var_3206 = net_builder.Reshape(layer_norm_51__tmp_1, {256, 1});
-  auto composite_tmp_8 = net_builder.FillConstant({256, 1}, 1e-5, "composite_tmp_8");
-  auto var_3214 = net_builder.Add(var_3206, composite_tmp_8);
-  auto composite_tmp_10 = net_builder.FillConstant({256, 1}, 1.0, "composite_tmp_10");
-  auto var_3220 = net_builder.Divide(composite_tmp_10, var_3214);
-  auto var_3226 = net_builder.Sqrt(var_3220);
-  auto var_3224 = net_builder.Scale(var_3220, -1.0, 0.0, true);
-  auto var_3366 = net_builder.BroadcastTo(var_3224, {256, 60});
-  auto var_3228 = net_builder.Multiply(var_3366, var_3216);
-  auto var_3368 = net_builder.BroadcastTo(var_3202, {256, 60});
-  auto var_3236 = net_builder.Multiply(var_3228, var_3212);
-  auto var_3244 = net_builder.Multiply(var_3236, var_3368);
-  auto var_3252 = net_builder.ReduceSum(var_3244, {1}, true);
-  auto var_3232 = net_builder.Scale(var_3226, 0.0166667, 0.0, true);
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_Layernorm) {
-  int h = 32, w = 1024;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_Layernorm");
-  // create model
-  {
-    // x
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    // x * x
-    auto B = net_builder.Multiply(A, A);
-    // sum x
-    auto C = net_builder.ReduceSum(A, {1});
-    // sum x*x
-    auto D = net_builder.ReduceSum(B, {1});
-    // constant w
-    auto E = net_builder.FillConstant({h}, 1024.0f, "E");
-    // mean
-    auto F  = net_builder.Divide(C, E);
-    auto FF = net_builder.BroadcastTo(F, {h, w}, {0});
-    // mean x*x
-    auto G = net_builder.Divide(D, E);
-    // mean * mean
-    auto H = net_builder.Multiply(F, F);
-    // var^2
-    auto I = net_builder.Subtract(G, H);
-    // eps
-    auto J = net_builder.FillConstant({h}, 1e-10f, "J");
-    // eps + delta
-    auto K = net_builder.Add(I, J);
-    // var
-    auto L  = net_builder.Sqrt(K);
-    auto LL = net_builder.BroadcastTo(L, {h, w}, {0});
-    // x - mean
-    auto M = net_builder.Subtract(A, FF);
-    // /var
-    auto N = net_builder.Divide(M, LL);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_Softmax) {
-  int h = 32, w = 1024;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_Softmax");
-  // create model
-  {
-    // softmax
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    // redece max
-    auto B = net_builder.ReduceMax(A, {1});
-    // broadcast
-    auto C = net_builder.BroadcastTo(B, {h, w}, {0});
-    // x - max(x)
-    auto D = net_builder.Subtract(A, C);
-    // exp(x)
-    auto E = net_builder.Exp(D);
-    // reduce sum
-    auto F = net_builder.ReduceSum(E, {1});
-    // broadcast
-    auto G = net_builder.BroadcastTo(F, {h, w}, {0});
-    // exp(x)/sum(exp(x))
-    auto H = net_builder.Divide(E, G);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_1) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_1");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h * w}, "A");
-    auto B = net_builder.ReduceSum(A, {0});
-    auto C = net_builder.BroadcastTo(B, {h * w}, {0});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_2) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_2");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 1});
-    auto C = net_builder.BroadcastTo(B, {h, w}, {1});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_3) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_3");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 2});
-    auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_4) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_4");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 2});
-    auto C = net_builder.BroadcastTo(B, {h, h, w}, {1});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_5) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_5");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 2});
-    auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
-    auto D = net_builder.ReduceSum(C, {1, 2});
-    auto E = net_builder.BroadcastTo(D, {h, h, w}, {0});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OpFusionPass, Reduce_Fuse_Broadcast_6) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("Reduce_Fuse_Broadcast_6");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 2});
-    auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
-    auto D = net_builder.CreateInput(Float(32), {h, w}, "B");
-    auto E = net_builder.BroadcastTo(D, {h, h, w}, {1, 2});
-    auto F = net_builder.Add(C, E);
-  }
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_0) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_0");
-  {
-    auto A = net_builder.CreateInput(Float(32), {1, 1000}, "A");
-    auto B = net_builder.CreateInput(Float(32), {1, 1000}, "B");
-    auto C = net_builder.Add(A, B);
-    auto D = net_builder.ReduceSum(C, {1}, false);
-    auto E = net_builder.ReduceSum(C, {1}, false);
-    auto F = net_builder.Add(D, E);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_1) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_1");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 32}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 1}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_2) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_2");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {1}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_3) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_3");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 1}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_4) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_4");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 32, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 2}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_5) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_5");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 32, 32, 256}, "A");
-    auto B = net_builder.ReduceSum(A, {0, 2, 3}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_6) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_6");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 32, 256}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 2});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Dim_Equal_One_7) {
-  NetBuilder net_builder("Reduce_Dim_Equal_One_7");
-  {
-    auto A =
net_builder.CreateInput(Float(32), {1, 1, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {2}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_0) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_0");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {2}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_1) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_1");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
-    auto B = net_builder.CreateInput(Float(32), {1, 64, 1, 1}, "B");
-    auto C = net_builder.ReduceSum(A, {0, 2, 3}, true);
-    auto D = net_builder.Add(B, C);
-  }
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_2) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_2");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
-    auto B = net_builder.CreateInput(Float(32), {16, 1, 112, 112}, "B");
-    auto C = net_builder.ReduceSum(A, {1}, true);
-    auto D = net_builder.Add(B, C);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_3) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_3");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
-    auto B = net_builder.ReduceSum(A, {2, 3}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_4) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_4");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
-    auto B = net_builder.ReduceSum(A, {2}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_5) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_5");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 2048}, "A");
-    auto B = net_builder.ReduceSum(A, {2}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_6) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_6");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {2}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_7) {
-  NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_7");
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, 64, 16, 1024}, "A");
-    auto B = net_builder.ReduceSum(A, {1, 3}, true);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_Test_Concat_Before_Reduce) {
-  NetBuilder net_builder("Elementwise_Test_Concat_Before_Reduce");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "A");
-    auto B = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "B");
-    auto C = net_builder.Concat({A, B}, 3);
-    auto D = net_builder.Reshape(C, {32, 32, 1024});
-    auto E = net_builder.ReduceSum(D, {2}, false);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_Test_Reshape_Before_Reduce) {
-  NetBuilder net_builder("Elementwise_Test_Reshape_Before_Reduce");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "A");
-    auto B = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "B");
-    auto C = net_builder.Add(A, B);
-    auto D = net_builder.Reshape(C, {32, 32, 512});
-    auto E = net_builder.CreateInput(Float(32), {32, 32, 512}, "E");
-    auto F = net_builder.Add(D, E);
-    auto G = net_builder.ReduceSum(F, {0, 1}, false);
-  }
-
Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_Test_Reshape_After_Reduce) {
-  NetBuilder net_builder("Elementwise_Test_Reshape_After_Reduce");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 32, 32}, "A");
-    auto B = net_builder.ReduceSum(A, {1}, false);
-    auto C = net_builder.CreateInput(Float(32), {16, 4, 16}, "C");
-    auto D = net_builder.Reshape(C, {32, 32});
-    auto E = net_builder.Transpose(D, {1, 0});
-    auto F = net_builder.CreateInput(Float(32), {32, 32}, "F");
-    auto G = net_builder.Add(E, F);
-    auto H = net_builder.Add(B, G);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_Test_Reshape_Fuse_Concat) {
-  NetBuilder net_builder("Elementwise_Test_Reshape_Fuse_Concat");
-  {
-    auto A  = net_builder.CreateInput(Float(32), {8, 8, 8, 8}, "A");
-    auto B  = net_builder.Reshape(A, {16, 16, 16});
-    auto C  = net_builder.CreateInput(Float(32), {16, 16}, "C");
-    auto D  = net_builder.CreateInput(Float(32), {16, 16}, "D");
-    auto DT = net_builder.Transpose(D, {1, 0});
-    auto E  = net_builder.Add(C, DT);
-    auto F  = net_builder.BroadcastTo(E, {16, 16, 16}, {1, 2});
-    auto G  = net_builder.Add(B, F);
-    auto H  = net_builder.CreateInput(Float(32), {16, 16, 16}, "H");
-    auto I  = net_builder.Concat({G, H}, 2);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_TEST_Split_0) {
-  NetBuilder net_builder("Elementwise_TEST_Split_0");
-  {
-    auto A = net_builder.CreateInput(Float(32), {32, 64}, "A");
-    auto B = net_builder.Split(A, {3, 5, 16, 2, 6}, 0);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_TEST_Split_1) {
-  NetBuilder net_builder("Elementwise_TEST_Split_1");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
-    auto B = net_builder.Split(A, {32, 32, 32, 32}, 1);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_TEST_Split_2) {
-  NetBuilder net_builder("Elementwise_TEST_Split_2");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
-    auto B = net_builder.Split(A, {64, 32, 32}, 1);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Elementwise_TEST_0) {
-  NetBuilder net_builder("Elementwise_TEST_0");
-  {
-    auto x  = net_builder.FillConstant({1}, 128.0, "x");
-    auto o1 = net_builder.Scale(x, -1.0, 0.0);
-    auto o2 = net_builder.Scale(x, -1.0, 0.0);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, NonFusibleOp_TEST_0) {
-  NetBuilder net_builder("NonFusibleOp_TEST_0");
-  {
-    auto A = net_builder.CreateInput(Float(32), {9801, 2}, "A");
-    auto B = net_builder.Reshape(A, {9801, 2});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, NonFusibleOp_TEST_1) {
-  NetBuilder net_builder("NonFusibleOp_TEST_1");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
-    auto B = net_builder.CreateInput(Float(32), {128, 128}, "B");
-    auto C = net_builder.Matmul(A, B);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, NonFusibleOp_TEST_2) {
-  NetBuilder net_builder("NonFusibleOp_TEST_2");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
-    auto B = net_builder.Matmul(A, A);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, NonFusibleOp_TEST_3) {
-  NetBuilder net_builder("NonFusibleOp_TEST_3");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 256}, "A");
-    auto C = net_builder.Split(A, {4}, 1);
-  }
-
-  Compile(net_builder);
-}
-
-#ifdef CINN_WITH_CUDA
-TEST(OP_LOWERING, NonFusibleOp_TEST_4) {
-  NetBuilder net_builder("NonFusibleOp_TEST_4");
-  {
-    auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
-    auto B = net_builder.CreateInput(Float(32), {128,
128}, "B"); - auto C = net_builder.CreateInput(Float(32), {128, 128}, "C"); - auto D = net_builder.Matmul(A, B); - auto E = net_builder.Add(C, D); - } - - Compile(net_builder); -} -#endif - -TEST(OP_LOWERING, Transform_TEST_0) { - NetBuilder net_builder("Transform_TEST_0"); - { - auto A = net_builder.CreateInput(Float(32), {128, 128}, "A"); - auto B = net_builder.CreateInput(Float(32), {128, 128}, "B"); - auto C = net_builder.CreateInput(Float(32), {128, 128}, "C"); - auto D = net_builder.Concat({A, B, C}, 1); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Elementwise_Test_0) { - int h = 32, w = 32; - NetBuilder net_builder("Elementwise_Test_0"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {h, w}, "C"); - auto D = net_builder.CreateInput(Float(32), {h, w}, "D"); - auto E = net_builder.Add(A, B); - auto F = net_builder.Add(C, D); - auto G = net_builder.Add(E, F); - } + auto res = op_lowerer.ThreadModelTest( graph.get() ); + CodeGen( res[0]); + // for (auto& fusion_op : graph->fusion_groups) { + // std::cerr << "fuse op" << fusion_op << std::endl; + // auto lowered_func = op_lowerer.Lower(fusion_op); + // //std::cerr << "final " << lowered_func[0] << std::endl; + // // CHECK_EQ(lowered_func.size(), 1); + // //CodeGen(lowered_func[0]); - Compile(net_builder); + + // } } -TEST(OP_LOWERING, Elementwise_Test_1) { - int h = 32, w = 32; - NetBuilder net_builder("Elementwise_Test_1"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {h, w}, "C"); - auto D = net_builder.CreateInput(Float(32), {h, w}, "D"); - auto E = net_builder.Add(A, B); - auto F = net_builder.Add(E, C); - auto G = net_builder.Add(E, D); - auto H = net_builder.Add(F, G); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Elementwise_Test_2) { - int h = 50, w = 10201; - NetBuilder net_builder("Elementwise_Test_2"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {h, w}, "C"); - auto D = net_builder.CreateInput(Float(32), {h, w}, "D"); - auto E = net_builder.Add(A, B); - auto F = net_builder.Add(E, C); - auto G = net_builder.Add(E, D); - auto H = net_builder.Add(F, G); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_0) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_0"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {w, h}, "A"); - auto B = net_builder.ReduceSum(A, {0, 1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_1) { - int c = 32, h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_1"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0, 1, 2}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_2) { - int c = 32, h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_2"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0, 1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_3) { - int c = 32, h = 16, w = 16; - NetBuilder net_builder("Reduce_Test_3"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A"); - auto B = 
net_builder.ReduceSum(A, {0, 1, 2}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_4) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_4"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {w, h}, "A"); - auto B = net_builder.ReduceSum(A, {0}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_5) { - int h = 32, w = 768; - NetBuilder net_builder("Reduce_Test_5"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_6) { - int h = 32, w = 2048; - NetBuilder net_builder("Reduce_Test_6"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_7) { - int h = 32, w = 512; - NetBuilder net_builder("Reduce_Test_7"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.ReduceSum(A, {1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_8) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_8"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w, w}, "A"); - auto B = net_builder.ReduceSum(A, {1, 2}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_9) { - int n = 16, c = 128, h = 56, w = 56; - NetBuilder net_builder("Reduce_Test_9"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0, 2, 3}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Test_10) { - int n = 16, c = 16, h = 32, w = 32; - NetBuilder net_builder("Reduce_Test_10"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A"); - auto B = net_builder.ReduceSum(A, {1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_0) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_0"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {w}, "B"); - - auto C = net_builder.ReduceSum(A, {0}); - auto D = net_builder.Add(B, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_1) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_1"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto D = net_builder.Add(A, B); - auto E = net_builder.ReduceSum(D, {1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_2) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_2"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {w}, "B"); - auto C = net_builder.CreateInput(Float(32), {w}, "C"); - auto D = net_builder.CreateInput(Float(32), {w}, "D"); - - auto E = net_builder.ReduceSum(A, {0}); - auto F = net_builder.Add(B, C); - auto G = net_builder.Add(D, F); - auto H = net_builder.Add(E, G); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_3) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_3"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.ReduceSum(A, {0}); - auto C = net_builder.ReduceSum(A, {0}); - auto D = net_builder.Add(B, C); 
- } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_4) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_4"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.Add(A, B); - - auto D = net_builder.ReduceSum(C, {0}); - auto E = net_builder.ReduceSum(C, {0}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_5) { - int h = 32, w = 32; - NetBuilder net_builder("Reduce_Fusion_Test_5"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.Add(A, B); - - auto D = net_builder.ReduceSum(C, {1}); - auto E = net_builder.ReduceSum(C, {1}); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_6) { - int h = 128, w = 128; - NetBuilder net_builder("Reduce_Fusion_Test_6"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {w}, "C"); - auto D = net_builder.Add(A, B); - auto E = net_builder.ReduceSum(D, {0}); - auto F = net_builder.ReduceSum(D, {0}); - auto G = net_builder.Add(E, C); - auto I = net_builder.Add(F, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_7) { - int h = 128, w = 128; - NetBuilder net_builder("Reduce_Fusion_Test_7"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {w}, "C"); - auto D = net_builder.Add(A, B); - auto E = net_builder.ReduceSum(D, {1}); - auto F = net_builder.ReduceSum(D, {1}); - auto G = net_builder.Add(E, C); - auto I = net_builder.Add(F, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_8) { - int h = 128, w = 128; - NetBuilder net_builder("Reduce_Fusion_Test_8"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {1}, "C"); - auto D = net_builder.Add(A, B); - auto E = net_builder.ReduceSum(D, {0, 1}); - auto F = net_builder.ReduceSum(D, {0, 1}); - auto G = net_builder.Add(E, C); - auto I = net_builder.Add(F, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_9) { - int c = 128, h = 128, w = 128; - NetBuilder net_builder("Reduce_Fusion_Test_9"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {c, h, w}, "B"); - auto C = net_builder.CreateInput(Float(32), {h}, "C"); - auto D = net_builder.Add(A, B); - auto E = net_builder.ReduceSum(D, {0, 2}); - auto F = net_builder.ReduceSum(D, {0, 2}); - auto G = net_builder.Add(E, C); - auto I = net_builder.Add(F, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_10) { - int h = 10201, w = 50; - NetBuilder net_builder("Reduce_Fusion_Test_10"); - // create model - { - auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - auto B = net_builder.CreateInput(Float(32), {w}, "B"); - auto C = net_builder.ReduceSum(A, {0}); - auto D = net_builder.Add(B, C); - } - - Compile(net_builder); -} - -TEST(OP_LOWERING, Reduce_Fusion_Test_11) { - int n = 128, c = 128, h = 16, w = 16; - NetBuilder net_builder("Reduce_Fusion_Test_11"); - // create 
-  {
-    auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0, 2, 3});
-    auto F = net_builder.ReduceSum(D, {0, 2, 3});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_12) {
-  int n = 128, c = 128, h = 112, w = 112;
-  NetBuilder net_builder("Reduce_Fusion_Test_12");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0, 2, 3});
-    auto F = net_builder.ReduceSum(D, {0, 2, 3});
-  }
-
-  Compile(net_builder);
-}
-/*
-TODO:exist coredump.
-TEST(OP_LOWERING, Reduce_Fusion_Test_13) {
-  int n = 8, c = 8, h = 8, w = 8;
-  NetBuilder net_builder("Reduce_Fusion_Test_13");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0, 1, 2});
-    auto F = net_builder.ReduceSum(D, {0, 1, 2});
-  }
-
-  auto program = net_builder.Build();
-  auto target = common::DefaultTarget();
-  RunDecomposer(&program, target);
-
-  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
-  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
-
-  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
-
-  auto& dtype_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, Type>>("inferdtype");
-  auto& shape_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, shape_t>>("infershape");
-
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
-  for (auto& fusion_op : graph->fusion_groups) {
-    auto lowered_func = op_lowerer.Lower(fusion_op);
-    CHECK_EQ(lowered_func.size(), 1);
-    LOG(INFO) << lowered_func[0];
-    CodeGen(lowered_func[0]);
-  }
-}
-*/
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_14) {
-  int n = 8, c = 8, h = 8, w = 8;
-  NetBuilder net_builder("Reduce_Fusion_Test_14");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {n, n, n, c, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {n, n, n, c, h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0, 3, 4});
-    auto F = net_builder.ReduceSum(D, {0, 3, 4});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_15) {
-  int h = 512, w = 32;
-  NetBuilder net_builder("Reduce_Fusion_Test_15");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0});
-    auto F = net_builder.ReduceSum(D, {0});
-  }
-
-  Compile(net_builder);
-}
-TEST(OP_LOWERING, Reduce_Fusion_Test_16) {
-  int n = 128, c = 128, h = 28, w = 28;
-  NetBuilder net_builder("Reduce_Fusion_Test_16");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
-    auto D = net_builder.Add(A, B);
-    auto E = net_builder.ReduceSum(D, {0, 2, 3});
-    auto F = net_builder.ReduceSum(D, {0, 2, 3});
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_17) {
-  int h = 128, w = 768;
-  NetBuilder net_builder("Reduce_Fusion_Test_17");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
-    auto E = net_builder.ReduceSum(A, {0});
-    auto F = net_builder.ReduceSum(B, {0});
-    auto G = net_builder.Add(E, F);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_18) {
-  int h = 128, w = 768;
-  NetBuilder net_builder("Reduce_Fusion_Test_18");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {16, h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {16, h * 2, w}, "B");
-    auto E = net_builder.ReduceSum(A, {1});
-    auto F = net_builder.ReduceSum(B, {1});
-    auto G = net_builder.Add(E, F);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_19) {
-  int h = 128, w = 128;
-  NetBuilder net_builder("Reduce_Fusion_Test_19");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
-    auto E = net_builder.ReduceSum(A, {0});
-    auto F = net_builder.ReduceSum(B, {0});
-    auto G = net_builder.Add(E, F);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_20) {
-  int h = 128, w = 128;
-  NetBuilder net_builder("Reduce_Fusion_Test_20");
-  // create model
-  {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
-    auto C = net_builder.CreateInput(Float(32), {h * 3, w}, "C");
-    auto D = net_builder.CreateInput(Float(32), {h * 4, w}, "D");
-    auto E = net_builder.ReduceSum(A, {0});
-    auto F = net_builder.ReduceSum(B, {0});
-    auto G = net_builder.ReduceSum(C, {0});
-    auto H = net_builder.ReduceSum(D, {0});
-    auto I = net_builder.Add(E, F);
-    auto J = net_builder.Add(G, I);
-    auto K = net_builder.Add(H, J);
-  }
-
-  Compile(net_builder);
-}
-
-TEST(OP_LOWERING, Reduce_Fusion_Test_21) {
-  int h = 128, w = 4;
-  NetBuilder net_builder("Reduce_Fusion_Test_21");
+TEST(OpFusionPass, Reduce_With_Last_Axis_1) {
+  NetBuilder net_builder("Reduce_With_Last_Axis_1");
   // create model
   {
-    auto A0 = net_builder.CreateInput(Float(32), {256, w}, "A0");
-    auto B0 = net_builder.CreateInput(Float(32), {256, w}, "B0");
-    auto C0 = net_builder.CreateInput(Float(32), {55200, w}, "C0");
-    auto D0 = net_builder.CreateInput(Float(32), {2750, w}, "D0");
-    auto A1 = net_builder.CreateInput(Float(32), {256, w}, "A1");
-    auto B1 = net_builder.CreateInput(Float(32), {256, w}, "B1");
-    auto C1 = net_builder.CreateInput(Float(32), {55200, w}, "C1");
-    auto D1 = net_builder.CreateInput(Float(32), {2750, w}, "D1");
-    auto AA = net_builder.Add(A0, A1);
-    auto BB = net_builder.Add(B0, B1);
-    auto CC = net_builder.Add(C0, C1);
-    auto DD = net_builder.Add(D0, D1);
-    auto E = net_builder.ReduceSum(AA, {0});
-    auto F = net_builder.ReduceSum(BB, {0});
-    auto G = net_builder.ReduceSum(CC, {0});
-    auto H = net_builder.ReduceSum(DD, {0});
-    auto I = net_builder.Add(E, F);
-    auto J = net_builder.Add(G, I);
-    auto K = net_builder.Add(H, J);
-    auto AAA = net_builder.Add(AA, A1);
-    auto BBB = net_builder.Add(BB, B1);
-    auto CCC = net_builder.Add(CC, C1);
-    auto DDD = net_builder.Add(DD, D1);
-  }
+    // auto A = net_builder.CreateInput(Float(32), {128, 12, 128, 128}, "A");
+    // auto Max = net_builder.ReduceMax(A, {-1}, true);
+    // auto sub = net_builder.Subtract(A, Max);
+    // auto exp = net_builder.Exp( sub );
+    // auto sum = net_builder.ReduceSum( exp, {-1}, true);
+    // auto out = net_builder.Divide( exp, sum);
+
+    auto A = net_builder.CreateInput(Float(32), {128, 128, 768}, "A");
+    auto rand = net_builder.UniformRandom( {128, 128, 768}, 0.0, 1.0);
+    auto prob = net_builder.FillConstant( {1}, 0.5, "prob" );
+    auto neg_prob = net_builder.FillConstant( {1}, 0.5, "neg_prob" );
+    auto mask = net_builder.GreaterEqual( rand, prob );
+    auto mask_f = net_builder.Cast(mask, "float32");
+    auto t1 = net_builder.Multiply( A, mask_f);
+    auto out = net_builder.Divide( t1, neg_prob);
+
+    // auto A = net_builder.CreateInput(Float(32), {128, 128, 768}, "A");
+    // auto scale = net_builder.CreateInput( Float(32), {768}, "scale" );
+    // auto bias = net_builder.CreateInput( Float(32), {768}, "bias" );
+    // auto run_mean = net_builder.CreateInput(Float(32), {768}, "run_mean");
+    // auto run_var = net_builder.CreateInput( Float(32), {768}, "run_var" );
+    // auto num = net_builder.FillConstant( {1}, 768.0, "num" );
+    // auto eps = net_builder.FillConstant( {1}, 1e-5, "eps" );
+    // auto sum1 = net_builder.ReduceSum(A, {2}, true);
+    // auto mean1 = net_builder.Divide( sum1, num);
+    // auto power = net_builder.Multiply(A, A);
+    // auto sum2 = net_builder.ReduceSum(power, {2}, true);
+    // auto mean2 = net_builder.Divide( sum2, num);
+    // auto mean_power = net_builder.Multiply( mean1, mean1);
+
+    // auto var = net_builder.Subtract(mean2, mean_power);
+
+    // auto sub = net_builder.Subtract( A, mean1);
+    // auto t1 = net_builder.Add( var, eps);
+    // auto t2 = net_builder.Sqrt( t1 );
+    // auto t3 = net_builder.Divide( sub, t2);
+    // auto t5 = net_builder.Multiply( t3, scale);
+    // auto out = net_builder.Add( t5, bias);
+  }
+  Compile(net_builder);
+}
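For reference, the active graph above wires up inverted-dropout masking: out = A * (rand >= prob) / neg_prob, where neg_prob holds the keep probability 1 - prob = 0.5. A minimal scalar sketch of the same computation (hypothetical reference code, not part of the patch; assumes contiguous float buffers, with `rand_vals` standing in for UniformRandom's output):

    // Hypothetical host-side reference for the masked-dropout graph above.
    void dropout_reference(const float* a, const float* rand_vals, float p, float* out, int n) {
      const float keep = 1.0f - p;  // the test's "neg_prob" constant (0.5)
      for (int i = 0; i < n; ++i) {
        float mask = (rand_vals[i] >= p) ? 1.0f : 0.0f;  // GreaterEqual + Cast("float32")
        out[i] = a[i] * mask / keep;                     // Multiply + Divide
      }
    }
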
+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_With_Output) {
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_With_Output");
+//   auto layer_norm_51__tmp_1 = net_builder.CreateInput(Float(32), {256}, "layer_norm_51__tmp_1");
+//   auto var_3216 = net_builder.CreateInput(Float(32), {256, 60}, "var_3216");
+//   auto var_3202 = net_builder.CreateInput(Float(32), {1, 60}, "var_3202");
+//   auto var_3212 = net_builder.CreateInput(Float(32), {256, 60}, "var_3212");

+//   auto var_3206 = net_builder.Reshape(layer_norm_51__tmp_1, {256, 1});
+//   auto composite_tmp_8 = net_builder.FillConstant({256, 1}, 1e-5, "composite_tmp_8");
+//   auto var_3214 = net_builder.Add(var_3206, composite_tmp_8);
+//   auto composite_tmp_10 = net_builder.FillConstant({256, 1}, 1.0, "composite_tmp_10");
+//   auto var_3220 = net_builder.Divide(composite_tmp_10, var_3214);
+//   auto var_3226 = net_builder.Sqrt(var_3220);
+//   auto var_3224 = net_builder.Scale(var_3220, -1.0, 0.0, true);
+//   auto var_3366 = net_builder.BroadcastTo(var_3224, {256, 60});
+//   auto var_3228 = net_builder.Multiply(var_3366, var_3216);
+//   auto var_3368 = net_builder.BroadcastTo(var_3202, {256, 60});
+//   auto var_3236 = net_builder.Multiply(var_3228, var_3212);
+//   auto var_3244 = net_builder.Multiply(var_3236, var_3368);
+//   auto var_3252 = net_builder.ReduceSum(var_3244, {1}, true);
+//   auto var_3232 = net_builder.Scale(var_3226, 0.0166667, 0.0, true);

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_Layernorm) {
+//   int h = 32, w = 1024;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_Layernorm");
+//   // create model
+//   {
+//     // x
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     // x * x
+//     auto B = net_builder.Multiply(A, A);
+//     // sum x
+//     auto C = net_builder.ReduceSum(A, {1});
+//     // sum x*x
+//     auto D = net_builder.ReduceSum(B, {1});
+//     // constant w
+//     auto E = net_builder.FillConstant({h}, 1024.0f, "E");
+//     // mean
+//     auto F = net_builder.Divide(C, E);
+//     auto FF = net_builder.BroadcastTo(F, {h, w}, {0});
+//     // mean x*x
+//     auto G = net_builder.Divide(D, E);
+//     // mean * mean
+//     auto H = net_builder.Multiply(F, F);
+//     // var^2
+//     auto I = net_builder.Subtract(G, H);
+//     // eps
+//     auto J = net_builder.FillConstant({h}, 1e-10f, "J");
+//     // eps + delta
+//     auto K = net_builder.Add(I, J);
+//     // var
+//     auto L = net_builder.Sqrt(K);
+//     auto LL = net_builder.BroadcastTo(L, {h, w}, {0});
+//     // x - mean
+//     auto M = net_builder.Subtract(A, FF);
+//     // /var
+//     auto N = net_builder.Divide(M, LL);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_Softmax) {
+//   int h = 32, w = 1024;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_Softmax");
+//   // create model
+//   {
+//     // softmax
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     // reduce max
+//     auto B = net_builder.ReduceMax(A, {1});
+//     // broadcast
+//     auto C = net_builder.BroadcastTo(B, {h, w}, {0});
+//     // x - max(x)
+//     auto D = net_builder.Subtract(A, C);
+//     // exp(x)
+//     auto E = net_builder.Exp(D);
+//     // reduce sum
+//     auto F = net_builder.ReduceSum(E, {1});
+//     // broadcast
+//     auto G = net_builder.BroadcastTo(F, {h, w}, {0});
+//     // exp(x)/sum(exp(x))
+//     auto H = net_builder.Divide(E, G);
+//   }

+//   Compile(net_builder);
+// }
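The commented softmax test above encodes the numerically stable formulation: subtracting the row max before exp() cannot change the result, since exp(x - m) / sum(exp(x - m)) equals exp(x) / sum(exp(x)), but it keeps exp() from overflowing. An illustrative scalar reference (not part of the patch):

    #include <algorithm>
    #include <cmath>
    // Row-wise softmax reference matching the commented test above.
    void softmax_rows(const float* x, float* y, int h, int w) {
      for (int r = 0; r < h; ++r) {
        const float* row = x + r * w;
        float m = row[0];
        for (int c = 1; c < w; ++c) m = std::max(m, row[c]);  // ReduceMax over axis 1
        float sum = 0.0f;
        for (int c = 0; c < w; ++c) {
          y[r * w + c] = std::exp(row[c] - m);  // Subtract + Exp
          sum += y[r * w + c];                  // ReduceSum over axis 1
        }
        for (int c = 0; c < w; ++c) y[r * w + c] /= sum;  // Divide
      }
    }
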
+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_1) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_1");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h * w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0});
+//     auto C = net_builder.BroadcastTo(B, {h * w}, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_2) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_2");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1});
+//     auto C = net_builder.BroadcastTo(B, {h, w}, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_3) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_3");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//     auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_4) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_4");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//     auto C = net_builder.BroadcastTo(B, {h, h, w}, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_5) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_5");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//     auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
+//     auto D = net_builder.ReduceSum(C, {1, 2});
+//     auto E = net_builder.BroadcastTo(D, {h, h, w}, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OpFusionPass, Reduce_Fuse_Broadcast_6) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fuse_Broadcast_6");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//     auto C = net_builder.BroadcastTo(B, {h, h, w}, {0});
+//     auto D = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto E = net_builder.BroadcastTo(D, {h, h, w}, {1, 2});
+//     auto F = net_builder.Add(C, E);
+//   }
+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_0) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_0");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {1, 1000}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {1, 1000}, "B");
+//     auto C = net_builder.Add(A, B);
+//     auto D = net_builder.ReduceSum(C, {1}, false);
+//     auto E = net_builder.ReduceSum(C, {1}, false);
+//     auto F = net_builder.Add(D, E);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_1) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_1");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 32}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_2) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_2");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {1}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_3) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_3");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_4) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_4");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 32, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 2}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_5) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_5");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 32, 32, 256}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 2, 3}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_6) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_6");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 32, 256}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Dim_Equal_One_7) {
+//   NetBuilder net_builder("Reduce_Dim_Equal_One_7");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {1, 1, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {2}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_0) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_0");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {2}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_1) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_1");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {1, 64, 1, 1}, "B");
+//     auto C = net_builder.ReduceSum(A, {0, 2, 3}, true);
+//     auto D = net_builder.Add(B, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_2) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_2");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {16, 1, 112, 112}, "B");
+//     auto C = net_builder.ReduceSum(A, {1}, true);
+//     auto D = net_builder.Add(B, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_3) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_3");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
+//     auto B = net_builder.ReduceSum(A, {2, 3}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_4) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_4");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 112, 112}, "A");
+//     auto B = net_builder.ReduceSum(A, {2}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_5) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_5");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 2048}, "A");
+//     auto B = net_builder.ReduceSum(A, {2}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_6) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_6");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {2}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Keep_Dim_Fuse_Elementwise_7) {
+//   NetBuilder net_builder("Reduce_Keep_Dim_Fuse_Elementwise_7");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, 64, 16, 1024}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 3}, true);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_Concat_Before_Reduce) {
+//   NetBuilder net_builder("Elementwise_Test_Concat_Before_Reduce");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "B");
+//     auto C = net_builder.Concat({A, B}, 3);
+//     auto D = net_builder.Reshape(C, {32, 32, 1024});
+//     auto E = net_builder.ReduceSum(D, {2}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_Reshape_Before_Reduce) {
+//   NetBuilder net_builder("Elementwise_Test_Reshape_Before_Reduce");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {32, 1, 32, 512}, "B");
+//     auto C = net_builder.Add(A, B);
+//     auto D = net_builder.Reshape(C, {32, 32, 512});
+//     auto E = net_builder.CreateInput(Float(32), {32, 32, 512}, "E");
+//     auto F = net_builder.Add(D, E);
+//     auto G = net_builder.ReduceSum(F, {0, 1}, false);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_Reshape_After_Reduce) {
+//   NetBuilder net_builder("Elementwise_Test_Reshape_After_Reduce");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 32, 32}, "A");
+//     auto B = net_builder.ReduceSum(A, {1}, false);
+//     auto C = net_builder.CreateInput(Float(32), {16, 4, 16}, "C");
+//     auto D = net_builder.Reshape(C, {32, 32});
+//     auto E = net_builder.Transpose(D, {1, 0});
+//     auto F = net_builder.CreateInput(Float(32), {32, 32}, "F");
+//     auto G = net_builder.Add(E, F);
+//     auto H = net_builder.Add(B, G);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_Reshape_Fuse_Concat) {
+//   NetBuilder net_builder("Elementwise_Test_Reshape_Fuse_Concat");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {8, 8, 8, 8}, "A");
+//     auto B = net_builder.Reshape(A, {16, 16, 16});
+//     auto C = net_builder.CreateInput(Float(32), {16, 16}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {16, 16}, "D");
+//     auto DT = net_builder.Transpose(D, {1, 0});
+//     auto E = net_builder.Add(C, DT);
+//     auto F = net_builder.BroadcastTo(E, {16, 16, 16}, {1, 2});
+//     auto G = net_builder.Add(B, F);
+//     auto H = net_builder.CreateInput(Float(32), {16, 16, 16}, "H");
+//     auto I = net_builder.Concat({G, H}, 2);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_TEST_Split_0) {
+//   NetBuilder net_builder("Elementwise_TEST_Split_0");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {32, 64}, "A");
+//     auto B = net_builder.Split(A, {3, 5, 16, 2, 6}, 0);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_TEST_Split_1) {
+//   NetBuilder net_builder("Elementwise_TEST_Split_1");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.Split(A, {32, 32, 32, 32}, 1);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_TEST_Split_2) {
+//   NetBuilder net_builder("Elementwise_TEST_Split_2");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.Split(A, {64, 32, 32}, 1);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_TEST_0) {
+//   NetBuilder net_builder("Elementwise_TEST_0");
+//   {
+//     auto x = net_builder.FillConstant({1}, 128.0, "x");
+//     auto o1 = net_builder.Scale(x, -1.0, 0.0);
+//     auto o2 = net_builder.Scale(x, -1.0, 0.0);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, NonFusibleOp_TEST_0) {
+//   NetBuilder net_builder("NonFusibleOp_TEST_0");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {9801, 2}, "A");
+//     auto B = net_builder.Reshape(A, {9801, 2});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, NonFusibleOp_TEST_1) {
+//   NetBuilder net_builder("NonFusibleOp_TEST_1");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {128, 128}, "B");
+//     auto C = net_builder.Matmul(A, B);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, NonFusibleOp_TEST_2) {
+//   NetBuilder net_builder("NonFusibleOp_TEST_2");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.Matmul(A, A);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, NonFusibleOp_TEST_3) {
+//   NetBuilder net_builder("NonFusibleOp_TEST_3");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 256}, "A");
+//     auto C = net_builder.Split(A, {4}, 1);
+//   }

+//   Compile(net_builder);
+// }

+// #ifdef CINN_WITH_CUDA
+// TEST(OP_LOWERING, NonFusibleOp_TEST_4) {
+//   NetBuilder net_builder("NonFusibleOp_TEST_4");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {128, 128}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {128, 128}, "C");
+//     auto D = net_builder.Matmul(A, B);
+//     auto E = net_builder.Add(C, D);
+//   }

+//   Compile(net_builder);
+// }
+// #endif

+// TEST(OP_LOWERING, Transform_TEST_0) {
+//   NetBuilder net_builder("Transform_TEST_0");
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {128, 128}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {128, 128}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {128, 128}, "C");
+//     auto D = net_builder.Concat({A, B, C}, 1);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_0) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Elementwise_Test_0");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {h, w}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {h, w}, "D");
+//     auto E = net_builder.Add(A, B);
+//     auto F = net_builder.Add(C, D);
+//     auto G = net_builder.Add(E, F);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_1) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Elementwise_Test_1");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {h, w}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {h, w}, "D");
+//     auto E = net_builder.Add(A, B);
+//     auto F = net_builder.Add(E, C);
+//     auto G = net_builder.Add(E, D);
+//     auto H = net_builder.Add(F, G);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Elementwise_Test_2) {
+//   int h = 50, w = 10201;
+//   NetBuilder net_builder("Elementwise_Test_2");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {h, w}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {h, w}, "D");
+//     auto E = net_builder.Add(A, B);
+//     auto F = net_builder.Add(E, C);
+//     auto G = net_builder.Add(E, D);
+//     auto H = net_builder.Add(F, G);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_0) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_0");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {w, h}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_1) {
+//   int c = 32, h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_1");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1, 2});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_2) {
+//   int c = 32, h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_2");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_3) {
+//   int c = 32, h = 16, w = 16;
+//   NetBuilder net_builder("Reduce_Test_3");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 1, 2});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_4) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_4");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {w, h}, "A");
+//     auto B = net_builder.ReduceSum(A, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_5) {
+//   int h = 32, w = 768;
+//   NetBuilder net_builder("Reduce_Test_5");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_6) {
+//   int h = 32, w = 2048;
+//   NetBuilder net_builder("Reduce_Test_6");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_7) {
+//   int h = 32, w = 512;
+//   NetBuilder net_builder("Reduce_Test_7");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_8) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_8");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1, 2});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_9) {
+//   int n = 16, c = 128, h = 56, w = 56;
+//   NetBuilder net_builder("Reduce_Test_9");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0, 2, 3});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Test_10) {
+//   int n = 16, c = 16, h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Test_10");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_0) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_0");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {w}, "B");

+//     auto C = net_builder.ReduceSum(A, {0});
+//     auto D = net_builder.Add(B, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_1) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_1");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_2) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_2");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {w}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {w}, "D");

+//     auto E = net_builder.ReduceSum(A, {0});
+//     auto F = net_builder.Add(B, C);
+//     auto G = net_builder.Add(D, F);
+//     auto H = net_builder.Add(E, G);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_3) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_3");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.ReduceSum(A, {0});
+//     auto C = net_builder.ReduceSum(A, {0});
+//     auto D = net_builder.Add(B, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_4) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_4");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.Add(A, B);

+//     auto D = net_builder.ReduceSum(C, {0});
+//     auto E = net_builder.ReduceSum(C, {0});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_5) {
+//   int h = 32, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_5");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.Add(A, B);

+//     auto D = net_builder.ReduceSum(C, {1});
+//     auto E = net_builder.ReduceSum(C, {1});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_6) {
+//   int h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_6");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {w}, "C");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0});
+//     auto F = net_builder.ReduceSum(D, {0});
+//     auto G = net_builder.Add(E, C);
+//     auto I = net_builder.Add(F, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_7) {
+//   int h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_7");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {w}, "C");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {1});
+//     auto F = net_builder.ReduceSum(D, {1});
+//     auto G = net_builder.Add(E, C);
+//     auto I = net_builder.Add(F, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_8) {
+//   int h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_8");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {1}, "C");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 1});
+//     auto F = net_builder.ReduceSum(D, {0, 1});
+//     auto G = net_builder.Add(E, C);
+//     auto I = net_builder.Add(F, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_9) {
+//   int c = 128, h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_9");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {c, h, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {h}, "C");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 2});
+//     auto F = net_builder.ReduceSum(D, {0, 2});
+//     auto G = net_builder.Add(E, C);
+//     auto I = net_builder.Add(F, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_10) {
+//   int h = 10201, w = 50;
+//   NetBuilder net_builder("Reduce_Fusion_Test_10");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {w}, "B");
+//     auto C = net_builder.ReduceSum(A, {0});
+//     auto D = net_builder.Add(B, C);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_11) {
+//   int n = 128, c = 128, h = 16, w = 16;
+//   NetBuilder net_builder("Reduce_Fusion_Test_11");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 2, 3});
+//     auto F = net_builder.ReduceSum(D, {0, 2, 3});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_12) {
+//   int n = 128, c = 128, h = 112, w = 112;
+//   NetBuilder net_builder("Reduce_Fusion_Test_12");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 2, 3});
+//     auto F = net_builder.ReduceSum(D, {0, 2, 3});
+//   }

+//   Compile(net_builder);
+// }
+// /*
+// TODO: fix existing coredump.
+// TEST(OP_LOWERING, Reduce_Fusion_Test_13) {
+//   int n = 8, c = 8, h = 8, w = 8;
+//   NetBuilder net_builder("Reduce_Fusion_Test_13");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 1, 2});
+//     auto F = net_builder.ReduceSum(D, {0, 1, 2});
+//   }

+//   auto program = net_builder.Build();
+//   auto target = common::DefaultTarget();
+//   RunDecomposer(&program, target);

+//   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
+//   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
+//   CHECK_EQ(graph->fusion_groups.size(), 3);

+//   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
+//   CHECK_EQ(graph->fusion_groups.size(), 1);

+//   auto& dtype_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, Type>>("inferdtype");
+//   auto& shape_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, shape_t>>("infershape");

+//   OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+//   for (auto& fusion_op : graph->fusion_groups) {
+//     auto lowered_func = op_lowerer.Lower(fusion_op);
+//     CHECK_EQ(lowered_func.size(), 1);
+//     LOG(INFO) << lowered_func[0];
+//     CodeGen(lowered_func[0]);
+//   }
+// }
+// */

+// TEST(OP_LOWERING, Reduce_Fusion_Test_14) {
+//   int n = 8, c = 8, h = 8, w = 8;
+//   NetBuilder net_builder("Reduce_Fusion_Test_14");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, n, n, c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {n, n, n, c, h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 3, 4});
+//     auto F = net_builder.ReduceSum(D, {0, 3, 4});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_15) {
+//   int h = 512, w = 32;
+//   NetBuilder net_builder("Reduce_Fusion_Test_15");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0});
+//     auto F = net_builder.ReduceSum(D, {0});
+//   }

+//   Compile(net_builder);
+// }
+// TEST(OP_LOWERING, Reduce_Fusion_Test_16) {
+//   int n = 128, c = 128, h = 28, w = 28;
+//   NetBuilder net_builder("Reduce_Fusion_Test_16");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {n, c, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {n, c, h, w}, "B");
+//     auto D = net_builder.Add(A, B);
+//     auto E = net_builder.ReduceSum(D, {0, 2, 3});
+//     auto F = net_builder.ReduceSum(D, {0, 2, 3});
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_17) {
+//   int h = 128, w = 768;
+//   NetBuilder net_builder("Reduce_Fusion_Test_17");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
+//     auto E = net_builder.ReduceSum(A, {0});
+//     auto F = net_builder.ReduceSum(B, {0});
+//     auto G = net_builder.Add(E, F);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_18) {
+//   int h = 128, w = 768;
+//   NetBuilder net_builder("Reduce_Fusion_Test_18");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {16, h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {16, h * 2, w}, "B");
+//     auto E = net_builder.ReduceSum(A, {1});
+//     auto F = net_builder.ReduceSum(B, {1});
+//     auto G = net_builder.Add(E, F);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_19) {
+//   int h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_19");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
+//     auto E = net_builder.ReduceSum(A, {0});
+//     auto F = net_builder.ReduceSum(B, {0});
+//     auto G = net_builder.Add(E, F);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_20) {
+//   int h = 128, w = 128;
+//   NetBuilder net_builder("Reduce_Fusion_Test_20");
+//   // create model
+//   {
+//     auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
+//     auto B = net_builder.CreateInput(Float(32), {h * 2, w}, "B");
+//     auto C = net_builder.CreateInput(Float(32), {h * 3, w}, "C");
+//     auto D = net_builder.CreateInput(Float(32), {h * 4, w}, "D");
+//     auto E = net_builder.ReduceSum(A, {0});
+//     auto F = net_builder.ReduceSum(B, {0});
+//     auto G = net_builder.ReduceSum(C, {0});
+//     auto H = net_builder.ReduceSum(D, {0});
+//     auto I = net_builder.Add(E, F);
+//     auto J = net_builder.Add(G, I);
+//     auto K = net_builder.Add(H, J);
+//   }

+//   Compile(net_builder);
+// }

+// TEST(OP_LOWERING, Reduce_Fusion_Test_21) {
+//   int h = 128, w = 4;
+//   NetBuilder net_builder("Reduce_Fusion_Test_21");
+//   // create model
+//   {
+//     auto A0 = net_builder.CreateInput(Float(32), {256, w}, "A0");
+//     auto B0 = net_builder.CreateInput(Float(32), {256, w}, "B0");
+//     auto C0 = net_builder.CreateInput(Float(32), {55200, w}, "C0");
+//     auto D0 = net_builder.CreateInput(Float(32), {2750, w}, "D0");
+//     auto A1 = net_builder.CreateInput(Float(32), {256, w}, "A1");
+//     auto B1 = net_builder.CreateInput(Float(32), {256, w}, "B1");
+//     auto C1 = net_builder.CreateInput(Float(32), {55200, w}, "C1");
+//     auto D1 = net_builder.CreateInput(Float(32), {2750, w}, "D1");
+//     auto AA = net_builder.Add(A0, A1);
+//     auto BB = net_builder.Add(B0, B1);
+//     auto CC = net_builder.Add(C0, C1);
+//     auto DD = net_builder.Add(D0, D1);
+//     auto E = net_builder.ReduceSum(AA, {0});
+//     auto F = net_builder.ReduceSum(BB, {0});
+//     auto G = net_builder.ReduceSum(CC, {0});
+//     auto H = net_builder.ReduceSum(DD, {0});
+//     auto I = net_builder.Add(E, F);
+//     auto J = net_builder.Add(G, I);
+//     auto K = net_builder.Add(H, J);
+//     auto AAA = net_builder.Add(AA, A1);
+//     auto BBB = net_builder.Add(BB, B1);
+//     auto CCC = net_builder.Add(CC, C1);
+//     auto DDD = net_builder.Add(DD, D1);
+//   }

+//   Compile(net_builder);
+// }

-  Compile(net_builder);
-}
 }  // namespace framework
 }  // namespace hlir
diff --git a/cinn/hlir/framework/parallel_compiler.cc b/cinn/hlir/framework/parallel_compiler.cc
index 0ca8cf6ba7..fd5dae706d 100644
--- a/cinn/hlir/framework/parallel_compiler.cc
+++ b/cinn/hlir/framework/parallel_compiler.cc
@@ -133,7 +133,7 @@ void ParallelCompiler::Task::Lowering() {
     VLOG(1) << "=============================================";
     VLOG(1) << "Lowering Group:\n" << graph->DebugGroupedGraph(group->CollectNodes());
     VLOG(1) << "=============================================";
-    lowered_funcs.emplace_back(std::move(op_lowerer.Lower(group)));
+    lowered_funcs.emplace_back(std::move(op_lowerer.ThreadModelTest( graph.get())));
     CHECK_EQ(lowered_funcs.back().size(), 1) << "Lowerd Function Is Not Equal 1!";
   }
 }
@@ -159,6 +159,7 @@ void ParallelCompiler::Task::CodegenAndJit() {
   backends::CodeGenCUDA_Dev codegen(target);
   auto cuda_c = codegen.Compile(dmodule);
+  std::cerr << cuda_c << std::endl;
   cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c);
 
   using runtime::cuda::CUDAModule;
diff --git a/cinn/hlir/pass/fusion_merge_pass_test.cc b/cinn/hlir/pass/fusion_merge_pass_test.cc
index eb5e0dac8d..ad3b32a02b 100755
--- a/cinn/hlir/pass/fusion_merge_pass_test.cc
+++ b/cinn/hlir/pass/fusion_merge_pass_test.cc
@@ -20,17 +20,38 @@ namespace cinn {
 namespace frontend {
 
 TEST(FusionMergePass, ElementWise_Fusion_0) {
-  int h = 32, w = 32;
-  NetBuilder net_builder("ElementWise_Fusion_0");
-  // create model
+
+  frontend::NetBuilder net_builder("softmax");
   {
-    auto A = net_builder.CreateInput(Float(32), {h, w}, "A");
-    auto B = net_builder.CreateInput(Float(32), {h, w}, "B");
-    auto C = net_builder.CreateInput(Float(32), {h, w}, "C");
-    auto D = net_builder.CreateInput(Float(32), {h, w}, "D");
-    auto E = net_builder.Add(A, B);
-    auto F = net_builder.Add(E, C);
-    auto G = net_builder.Add(E, D);
+    // auto A = net_builder.CreateInput(Float(32), {128, 12, 128, 128}, "A");
+    // auto Max = net_builder.ReduceMax(A, {3}, true);
+    // auto sub = net_builder.Subtract(A, Max);
+    // auto exp = net_builder.Exp( sub );
+    // auto sum = net_builder.ReduceSum( exp, {3}, true);
+    // auto out = net_builder.Divide( exp, sum);
+
+    auto A = net_builder.CreateInput(Float(32), {128, 112, 112, 64}, "A");
+    auto scale = net_builder.CreateInput( Float(32), {64}, "scale" );
+    auto bias = net_builder.CreateInput( Float(32), {64}, "bias" );
+    auto run_mean = net_builder.CreateInput(Float(32), {64}, "run_mean");
+    auto run_var = net_builder.CreateInput( Float(32), {64}, "run_var" );
+    auto num = net_builder.FillConstant( {1}, 768.0, "num" );
+    auto eps = net_builder.FillConstant( {1}, 1e-5, "eps" );
+    auto sum1 = net_builder.ReduceSum(A, {2}, true);
+    auto mean1 = net_builder.Divide( sum1, num);
+    auto power = net_builder.Multiply(A, A);
+    auto sum2 = net_builder.ReduceSum(power, {2}, true);
+    auto mean2 = net_builder.Divide( sum2, num);
+    auto mean_power = net_builder.Multiply( mean1, mean1);
+
+    auto var = net_builder.Subtract(mean2, mean_power);
+
+    auto sub = net_builder.Subtract( A, mean1);
+    auto t1 = net_builder.Add( var, eps);
+    auto t2 = net_builder.Sqrt( t1 );
+    auto t3 = net_builder.Divide( sub, t2);
+    auto t5 = net_builder.Multiply( t3, scale);
+    auto out = net_builder.Add( t5, bias);
   }
 
   auto program = net_builder.Build();
@@ -38,12 +59,18 @@ TEST(FusionMergePass, ElementWise_Fusion_0) {
   RunDecomposer(&program, target);
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
+  std::cerr << graph->DebugGroupedGraph() << std::endl;
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
-  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  std::cerr << graph->fusion_groups.size() << std::endl;
+
+  std::cerr << graph->DebugGroupedGraph() << std::endl;
+  //CHECK_EQ(graph->fusion_groups.size(), 3);
+  //hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
+  //CHECK_EQ(graph->fusion_groups.size(), 1);
+
 }
 
+/*
 TEST(FusionMergePass, ElementWise_Fusion_1) {
   int h = 32, w = 32;
   NetBuilder net_builder("ElementWise_Fusion_1");
@@ -482,6 +509,7 @@ TEST(FusionMergePass, Reduce_Test_5) {
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
   CHECK_EQ(graph->fusion_groups.size(), 1);
 }
+*/
 
 }  // namespace frontend
 }  // namespace cinn
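The rewritten test builds a batch-norm-style graph that derives variance from two moments, var = E[x*x] - E[x]*E[x], so both reductions read the same input and can fuse, instead of needing a second pass over centered values. A scalar sketch of the computation the graph encodes (illustrative only, not part of the patch; assumes a single reduced row of extent w and scalar scale/bias):

    #include <cmath>
    // Moment-based normalization reference for the graph above.
    void normalize_row(const float* x, float scale, float bias, float eps, float* y, int w) {
      float sum = 0.0f, sum_sq = 0.0f;
      for (int c = 0; c < w; ++c) {
        sum += x[c];            // ReduceSum(A)
        sum_sq += x[c] * x[c];  // ReduceSum(Multiply(A, A))
      }
      const float mean = sum / w;                     // Divide(sum1, num)
      const float var = sum_sq / w - mean * mean;     // Subtract(mean2, mean_power)
      const float inv = 1.0f / std::sqrt(var + eps);  // Sqrt(Add(var, eps))
      for (int c = 0; c < w; ++c) y[c] = scale * (x[c] - mean) * inv + bias;
    }
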
diff --git a/cinn/hlir/pass/op_fusion_pass.cc b/cinn/hlir/pass/op_fusion_pass.cc
index 026f2c6195..ee3b506576 100644
--- a/cinn/hlir/pass/op_fusion_pass.cc
+++ b/cinn/hlir/pass/op_fusion_pass.cc
@@ -44,9 +44,11 @@ class OpFusionPassHelper : public FusionHelperBase {
     InitFusionRelation();
     // filter node data, create group for each node
     auto nodes_inorder = std::get<0>(graph->topological_order());
-    for (auto graph_node : nodes_inorder) {
+    for (auto graph_node : nodes_inorder) {
       auto node = graph_node->safe_as<Node>();
+
       if (node) {
+        std::cerr << node->op()->name << std::endl;
         nodes_.push_back(node);
         auto group = std::make_shared<Graph::Group>();
         // init group
@@ -123,9 +125,9 @@ class OpFusionPassHelper : public FusionHelperBase {
   void DoOpFusion() {
     for (auto consumer : nodes_) {
       // kNonFusible op can't fuse any other op.
-      if (GetOpKind(consumer) == framework::kNonFusible) {
-        continue;
-      }
+      // if (GetOpKind(consumer) == framework::kNonFusible) {
+      //   continue;
+      // }
 
       // fusion op for consumer
       auto consumer_fusion = fusion_groups_[consumer];
@@ -147,22 +149,22 @@
       }
 
       // kNonFusible op can't fuse any other op.
-      if (GetOpKind(producer) == framework::kNonFusible) {
-        continue;
-      }
+      // if (GetOpKind(producer) == framework::kNonFusible) {
+      //   continue;
+      // }
       VLOG(3) << "Producer Op: " << producer->id() << ", Op Pattern: " << GetOpKind(producer)
               << " -> Consumer Op: " << consumer->id() << ", Op Pattern: " << GetOpKind(consumer);
       bool can_fuse = true;
       // checkout producer node outputs are all in fusion op
-      for (auto& link : producer_data->outlinks()) {
-        auto consumer_node = link->sink()->safe_as<Node>();
-        CHECK(consumer_node);
-        // if fusion group can't find node, can't merge
-        if (consumer_fusion->nodes_set.find(consumer_node) == consumer_fusion->nodes_set.end()) {
-          can_fuse = false;
-          break;
-        }
-      }
+      // for (auto& link : producer_data->outlinks()) {
+      //   auto consumer_node = link->sink()->safe_as<Node>();
+      //   CHECK(consumer_node);
+      //   // if fusion group can't find node, can't merge
+      //   if (consumer_fusion->nodes_set.find(consumer_node) == consumer_fusion->nodes_set.end()) {
+      //     can_fuse = false;
+      //     break;
+      //   }
+      // }
 
       if (!can_fuse || !CanFuse(producer, consumer)) continue;
       VLOG(3) << "Fuse Op " << producer->id() << " into Op " << consumer->id();
@@ -321,6 +323,41 @@
   }
 
   bool CanFuse(const Node* producer, const Node* consumer) {
+    static std::set<std::string> support_op_list = {
+        "reduce_max", "reduce_sum", "subtract", "add", "multiply", "mul", "divide", "exp", "sqrt",
+        "fill_constant", "elementwise_mul", "elementwise_add", "elementwise_div", "elementwise_sub",
+        "reshape", "scale", "cast", "greater_equal", "uniform_random"};
+
+    if ( support_op_list.count(producer->op()->name) && support_op_list.count( consumer->op()->name ) )
+    {
+      // supported op
+      return true;
+      // NOTE: the early return above makes the kind checks below unreachable.
+      // std::cerr << GetOpKind(producer) << "\t" << GetOpKind(consumer) << std::endl;
+      if( GetOpKind(producer) == framework::kElementWise )
+      {
+        if( GetOpKind(consumer) == framework::kElementWise )
+        {
+          return true;
+        }
+        if( GetOpKind(consumer) == framework::kReduction )
+        {
+          //auto value = absl::get<std::vector<int>>(consumer->attrs.attr_store.at("axis"));
+          return true;
+        }
+      }
+      else if ( GetOpKind(producer) == framework::kReduction )
+      {
+        if( GetOpKind(consumer) == framework::kElementWise )
+        {
+          return true;
+        }
+      }
+    }
+
+    std::cerr << "op fusion: unsupported op" << std::endl;
+
+    throw std::runtime_error( "op fusion does not support this op");
+
+
     auto& relation = fusion_relation_map_[GetOpKind(producer)];
     // first step: check producer can be fused into consumer
     if (relation.op_kind.count(GetOpKind(consumer))) {
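The whitelist gate added to CanFuse above accepts any producer/consumer pair whose op names are both on the supported list and throws otherwise; the kind-based pattern checks after the early `return true;` are currently dead code. A hypothetical refactor sketch of the same gate (not the committed code), pulling the name check into a helper:

    #include <set>
    #include <string>
    // Hypothetical helper: true when the thread-model path supports this op.
    static bool IsThreadModelSupported(const hlir::framework::Node* node) {
      static const std::set<std::string> kSupported = {
          "reduce_max", "reduce_sum", "subtract", "add", "multiply", "mul",
          "divide", "exp", "sqrt", "fill_constant", "elementwise_mul",
          "elementwise_add", "elementwise_div", "elementwise_sub",
          "reshape", "scale", "cast", "greater_equal", "uniform_random"};
      return kSupported.count(node->op()->name) > 0;
    }
    // CanFuse could then read:
    //   if (IsThreadModelSupported(producer) && IsThreadModelSupported(consumer)) return true;
    //   throw std::runtime_error("op fusion does not support this op");
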
@@ -443,7 +480,7 @@ void InsertBroadcastTo(Graph* graph) {
 
 void OpFusionPassInternal(Graph* graph) {
   VLOG(3) << "OpFusionPass...!";
-  InsertBroadcastTo(graph);
+  // InsertBroadcastTo(graph);
   auto op_fusion_helper = OpFusionPassHelper(graph);
 
   graph->fusion_groups = op_fusion_helper();
diff --git a/cinn/hlir/pe/reduction.cc b/cinn/hlir/pe/reduction.cc
index d1f58a0bab..2d5e880662 100644
--- a/cinn/hlir/pe/reduction.cc
+++ b/cinn/hlir/pe/reduction.cc
@@ -814,6 +814,7 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(const ir::Tensor& A,
     }
   }
   if (first_axes.size()) {
+    std::cerr << "reduce internal " << std::endl;
     VLOG(3) << "Do Reduce Internal!";
     results.push_back(
         reduce_func(results.size() ? results.back() : A, first_axes, keep_dim_first, output_name + "_internal"));
@@ -821,6 +822,7 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(const ir::Tensor& A,
   }
   if (second_axes.size()) {
     VLOG(3) << "Do Block Reduce!";
+    std::cerr << "block reduce" << std::endl;
     auto res = block_reduce_func(results.size() ? results.back() : A, second_axes, keep_dim_second, output_name);
     results.push_back(res[1]);
     results.push_back(res[0]);
@@ -833,6 +835,7 @@ std::vector<ir::Tensor> TwoStepBlockReduceSum(const ir::Tensor& A,
                                               const std::vector<int>& axes,
                                               const bool keep_dim,
                                               const std::string& output_name) {
+  std::cerr << "two step block reduce sum" << std::endl;
   return TwoStepBlockReduceInternal(
       A, axes, keep_dim, output_name, ReduceSum, BlockReduceSumInternal, ir::Zero(A->type()));
 }
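TwoStepBlockReduceInternal (instrumented above) splits a reduction in two: the "Reduce Internal" step folds the first axes serially per thread, and the "Block Reduce" step folds the remaining axes cooperatively across the thread block. Schematically, assuming a hypothetical `block_reduce_sum` helper:

    // Illustrative two-step reduction only; block_reduce_sum is hypothetical.
    __device__ float block_reduce_sum(float val);
    __device__ float two_step_reduce_sum(const float* x, int first_extent, int second_extent) {
      float acc = 0.0f;
      for (int i = 0; i < first_extent; ++i)  // step 1: "Do Reduce Internal!" (serial)
        acc += x[i * second_extent + threadIdx.x];
      return block_reduce_sum(acc);           // step 2: "Do Block Reduce!" (cooperative)
    }
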
"cinn/backends/codegen_cuda_util.h" + +#include "cinn/backends/nvrtc/nvrtc_util.h" + +#include "cinn/runtime/cuda/cuda_module.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" + +#include "cinn/frontend/net_builder.h" + +#include +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include +#include +#include +#include + +#include "cinn/lang/placeholder.h" + +#include "cinn/hlir/framework/visualize_helper.h" + +#include + +#define THREAD_PER_BLOCK 256 +#define WARP_SIZE 32 + +namespace cinn { +namespace ir { + +bool check( common::float16 *out, common::float16 *res,int n){ + for(int i=0;i(out[i]) - static_cast(res[i]) ) > 1e-5) + return false; + } + return true; +} + +int reduce_axis( const std::vector& first, const std::vector& second) +{ + + if( first[0] == 1 && second[0] != 1) + { + return 0; + } + if( first[1] != 1 && second[1] == 1) + { + return 1; + } + throw std::runtime_error("reduce_axis: error"); +} + +int is_same( const std::vector& first, const std::vector& second ) +{ + if( first.size() == second.size() ) + { + for( size_t i = 0; i < first.size(); ++i ) + { + if( first[i] != second[i] ) + { + return false; + } + } + + return true; + } + + return false; +} + +struct InputNode +{ + InputNode() {} + InputNode( std::string n, cinn::lang::Placeholder *p, std::vector dim) + : name(n), in_ptr(p), in_dim(dim) {} + std::string name; + cinn::lang::Placeholder* in_ptr; + std::vector in_dim; +}; + +void process_reduce_max( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + std::string temp_max_name = "tmp_max"; + Var temp_max_var( temp_max_name, type_of() ); + + int warp_round = 1; + int thread_round = 8; + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << 
std::endl; + + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1, 1}); + +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << "name " << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + // for( size_t i = 0;i < first_input.in_dim.size(); ++i) + // { + // std::cerr << first_input.in_dim[i] << std::endl; + // } + // std::cerr << "====================" << std::endl; + // for( size_t i = 0;i < second_input.in_dim.size(); ++i) + // { + // std::cerr << second_input.in_dim[i] << std::endl; + // } + + + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr inf(-100000.0); + + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_sub_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder( temp_max_name, std::vector{{1, 8}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( temp_max_name, sub, {1, 8}); + +} + + +void process_exp( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 8; + std::string temp_max_name = "exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = 
"max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("exp", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "exp", exp, {1, 8}); + +} + +void process_sqrt( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 8; + std::string temp_max_name = "sqrt"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sqrt"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Sqrt::Make( t_load); + + cinn::ir::IrPrinter printer(std::cout); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("sqrt", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "sqrt", exp, {1, 8}); + +} + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr zero(0.0); + std::string temp_max_name = "tmp_sum"; + Var temp_max_var( 
temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "BlockReduceSum", {sum1}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder("tmp_sum", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1, 1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + int broadcast_axis = -1; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + + if( first_input.in_dim.size() ==1 ) + { + broadcast_axis = 1; + } + //int + std::cerr << "broadcast " << broadcast_axis << std::endl; + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr inf(-100000.0); + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + Expr t_load; + + if( broadcast_axis != -1 ) + { + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) }); + } + else + { + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + } + else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto sub_store = Store::Make( 
ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( "divide", div, {1, 8}); + +} + +void process_add( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + int broadcast_first = -1; + int broadcast_second = -1; + if( first_input.in_dim.size() == 1) + { + broadcast_first = 1; + } + + if( second_input.in_dim.size() == 1 ) + { + broadcast_second = 1; + } + + std::cerr << broadcast_first << "\t" << broadcast_second << std::endl; + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr inf(-100000.0); + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + Expr t_load; + + if( broadcast_first == -1){ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + else{ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) }); + } + + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + } else if( broadcast_second != -1) { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)}); + } else { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Add::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 8}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + 
ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + std::cerr << "add fin" << std::endl; + (*input_map)[out_name] = InputNode( "add", div, {1, 8}); + +} + +void process_mul( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + + int broadcast_first = -1; + int broadcast_second = -1; + if( first_input.in_dim.size() == 1) + { + broadcast_first = 1; + } + + if( second_input.in_dim.size() == 1 ) + { + broadcast_second = 1; + } + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr inf(-100000.0); + + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_mul_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + Expr t_load; + + if( broadcast_first == -1){ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + else{ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) }); + } + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + } else if( broadcast_second != -1) + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)}); + } else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Mul::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( temp_max_name, div, {1, 8}); + +} + + +void process_fillconstant( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string out_name; + for (auto& outlink : 
node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << hlir::framework::DebugString(node) << std::endl; + auto* op = node->op(); + + auto value = absl::get(node->attrs.attr_store.at("value")); + + std::cerr << value << std::endl; + + auto dtype = absl::get(node->attrs.attr_store.at("value")); + + std::cerr << out_name << "\t" << dtype << std::endl; + + + + cinn::ir::Var out(out_name, type_of()); + + auto max_var = cinn::ir::Let::Make( out, Expr(value)); + + // cinn::ir::IrPrinter printer(std::cout); + + code_dev->Print( max_var ); + code_dev->ss_ << ";" << std::endl; + + // printer.Print( max_var); + + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( out_name, nullptr, {}); +} + +TEST(IrManul, basic) { + // temp = [64], batch_mean [64] + // x = [128, 112, 112, 64] + // batch_var = temp - batch_mean * batch_mean + // inv_std = pow((batch_var + epsilon), half) + // if data_layout == "NHWC": + // x_hat = (x - batch_mean) * inv_std + // else: + // x_hat = (x - reshape(batch_mean, stats_shape)) * reshape( + // inv_std, stats_shape + // ) + + // run_mean = momentum * run_mean + (1 - momentum) * batch_mean + // run_var = momentum * run_var + (1 - momentum) * batch_var + // y = scale * x_hat + bias + + frontend::NetBuilder net_builder("layer_norm"); + { + auto A = net_builder.CreateInput(Float(32), {896, 1792, 64}, "A"); + auto scale = net_builder.CreateInput( Float(32), {64}, "scale" ); + auto bias = net_builder.CreateInput( Float(32), {64}, "bias" ); + auto sum_mean = net_builder.CreateInput(Float(32), {64}, "sum_mean"); + auto sum_var = net_builder.CreateInput( Float(32), {64}, "sum_var" ); + auto num = net_builder.FillConstant( {1}, 1605632.0, "num" ); + auto eps = net_builder.FillConstant( {1}, 1e-5, "eps" ); + auto mean = net_builder.Divide(sum_mean, num); + auto var_mean = net_builder.Divide( sum_var, num); + auto sub1 = net_builder.Subtract( A, mean); + auto pow1 = net_builder.Multiply( mean, mean); + auto var = net_builder.Subtract( var_mean, pow1 ); + auto t1 = net_builder.Add( var, eps); + auto t2 = net_builder.Sqrt( t1 ); + auto x_hat = net_builder.Divide( sub1, t2 ); + auto t3 = net_builder.Multiply( x_hat, scale); + auto out = net_builder.Add( t3, bias); + + // auto mean1 = net_builder.Divide( sum1, num); + // auto sub = net_builder.Subtract(A, mean1); + // auto power = net_builder.Multiply( sub, sub ); + // auto sum2 = net_builder.ReduceSum( power, {2}, true); + // auto mean2 = net_builder.Divide( sum2, num); + // auto t1 = net_builder.Add( mean2, eps); + // auto t2 = net_builder.Sqrt( t1 ); + // auto t3 = net_builder.Divide( sub, t2); + // auto t5 = net_builder.Multiply( t3, weight); + // auto out = net_builder.Add( t5, bias); + } + + auto program = net_builder.Build(); + auto target = common::DefaultTarget(); + + auto graph = std::make_shared(program, target); + + std::cerr << "len " << graph->fusion_groups.size() << std::endl; + + std::cerr << graph->DebugGroupedGraph() << std::endl; + + //auto group0 = graph->FusionGroupsToGroups()[0]; + + auto topo_order = graph->topological_order(); + auto& nodes = std::get<0>(topo_order); + + // add input data + int reduce_block = 1; + int flatten_block = 1024; + + int num_warp = 4; + int num_thread_per_warp = 32; + int element_per_thread = 8; + + + Var block_id( "blockIdx.x", type_of() ); + Var flatten_id( "xid", type_of() ); + Var r_id( "rid", type_of() ); + Expr expr_flatten( flatten_block); + Expr expr_reduce( reduce_block); + + 
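// Thread-mapping note (derived from the launch config in this test:
+ // Block(128, 1, 1), flatten_block = 1024, element_per_thread = 8): each
+ // thread covers 8 elements strided by the block width, so the flattened
+ // element index used for the main load works out to
+ //   inner_index = blockIdx.x * 1024 + threadIdx.x + j * 128,  j in [0, 8).
+ 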
Var threadidx("threadIdx.x", type_of()); + Var index_i("i", type_of() ); + Var index_j("j", type_of() ); + Expr expr_warp( num_warp); + Expr expr_thread_per_warp( num_thread_per_warp ); + Expr expr_element_per_thread( element_per_thread ); + + auto warp_id = threadidx / expr_thread_per_warp; + + auto xid = warp_id * Expr(1); + auto inner_id = threadidx % expr_thread_per_warp; + auto inner_index = block_id *Expr(1024) + threadidx + index_j * Expr(128); + + auto inner_index2 = threadidx % Expr(64); + + // block reduce + auto warp_round = 1; + auto thread_round = 8; + + std::string temp_name = "tmp"; + Var temp_var( temp_name, type_of() ); + auto temp_out = LocalTemp::Make( temp_var, {warp_round, thread_round}); + + Var loop_var("i"); + + cinn::lang::Placeholder C("d_in", std::vector{{10, 10}}); + cinn::lang::Placeholder T("tmp", std::vector{{1,4}}); + cinn::lang::Placeholder mean_sum("sum_mean", std::vector{{10}}); + cinn::lang::Placeholder mean_sum_tmp("sum_mean_tmp", std::vector{{4}}); + cinn::lang::Placeholder power_sum("sum_var", std::vector{{10}}); + cinn::lang::Placeholder power_sum_tmp("sum_var_tmp", std::vector{{4}}); + cinn::lang::Placeholder scale("scale", std::vector{{10}}); + cinn::lang::Placeholder scale_tmp("scale_tmp", std::vector{{4}}); + cinn::lang::Placeholder bias("bias", std::vector{{10}}); + cinn::lang::Placeholder bias_tmp("bias_tmp", std::vector{{4}}); + //Placeholder A("A", std::vector{{10}}); + //Var input( "input", type_of( )); + Var loop_var_j("j"); + + backends::CodeGenCUDA_Dev cu_dev(target); + + auto t_load = ir::Load::Make( ir::Tensor(C), { inner_index }); + + Expr body = Store::Make( ir::Tensor(T), t_load, {Expr(0), Expr(loop_var_j)}); + + auto cond = ir::LT::Make( inner_index, Expr( 256 * 112 * 112 * 64) ); + auto filter = ir::IfThenElse::Make( cond, body, Expr()); + body = ir::Block::Make({filter}); + + + std::string head = R"ROC( + +#include + +extern "C" { + +__global__ void bn_test(half *d_in, float* sum_mean, float* sum_var, float* scale, float* bias, half *d_out ) { +)ROC"; + + cu_dev.ss_ << head << std::endl; + cu_dev.Print( temp_out ); + cu_dev.ss_ << "\n"; + + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + //printer.Print( load_inner_for ); + + std::cerr << "------------------------------\n"; + + //t_load = ir::Load::Make( ) + + cinn::ir::IrPrinter printer(std::cout); + //printer.Print( load_inner_for ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + temp_name = "sum_mean_tmp"; + Var sum_mean_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( sum_mean_temp_var, {warp_round}); + + t_load = ir::Load::Make( ir::Tensor(mean_sum), { inner_index2 }); + + body = Store::Make( ir::Tensor(mean_sum_tmp ), t_load, {Expr(0)}); + + body = ir::Block::Make({body}); + + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(1), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + cu_dev.Print( temp_out); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + t_load = ir::Load::Make( ir::Tensor(power_sum), { inner_index2 }); + + + + body = Store::Make( ir::Tensor(power_sum_tmp ), t_load, {Expr(0)}); + + body = ir::Block::Make({body}); + + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(1), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + 
temp_name = "sum_var_tmp"; + Var sum_var_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( sum_var_temp_var, {warp_round}); + cu_dev.Print( temp_out ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + t_load = ir::Load::Make( ir::Tensor(scale), { inner_index2 }); + + body = Store::Make( ir::Tensor(scale_tmp ), t_load, {Expr(0)}); + + body = ir::Block::Make({body}); + + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(1), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + temp_name = "scale_tmp"; + Var scale_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( scale_temp_var, {warp_round}); + cu_dev.Print( temp_out ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + t_load = ir::Load::Make( ir::Tensor(bias), { inner_index2 }); + + body = Store::Make( ir::Tensor(bias_tmp ), t_load, {Expr(0)}); + + body = ir::Block::Make({body}); + + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(1), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + temp_name = "bias_tmp"; + Var bias_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( bias_temp_var, {warp_round}); + cu_dev.Print( temp_out ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + + std::map map_input; + + + + + + + //std::cerr << cu_dev.ss_.str() << std::endl; + //std::cerr << "=======" << std::endl; + map_input["A"] = InputNode( "A", &T, {1, 8}); + map_input["sum_mean"] = InputNode( "sum_mean", &mean_sum_tmp, {1}); + map_input["sum_var"] = InputNode( "sum_var", &power_sum_tmp, {1}); + map_input["scale"] = InputNode("scale", &scale_tmp, {1}); + map_input["bias"] = InputNode("bias", &bias_tmp, {1}); + + for (auto* n : nodes) { + + auto node = n->safe_as(); + if (!node || node->op() == nullptr) { + continue; + } + std::cerr << node->op()->name << std::endl; + + if( node->op()->name == "reduce_max") + { + process_reduce_max( &map_input, node, &cu_dev); + }else if ( node->op()->name == "subtract" ) + { + process_sub( &map_input, node, &cu_dev); + }else if ( node->op()->name == "exp" ) + { + process_exp( &map_input, node, &cu_dev); + }else if ( node->op()->name == "reduce_sum" ) + { + process_reduce_sum( &map_input, node, &cu_dev); + }else if ( node->op()->name == "divide" ) + { + process_divide( &map_input, node, &cu_dev); + }else if ( node->op()->name == "elementwise_mul" ) + { + process_mul( &map_input, node, &cu_dev); + }else if ( node->op()->name == "elementwise_add" ) + { + process_add( &map_input, node, &cu_dev); + } + else if ( node->op()->name == "fill_constant" ) + { + process_fillconstant( &map_input, node, &cu_dev); + } + else if ( node->op()->name == "sqrt" ) + { + process_sqrt( &map_input, node, &cu_dev); + } + else{ + throw std::runtime_error("not support op"); + } + + } + + + + // name var_4 is output + auto var_out = map_input.at( "var_15"); + + t_load = ir::Load::Make( ir::Tensor( *(var_out.in_ptr) ), { Expr(0), Expr(loop_var_j) }); + //t_load = ir::Load::Make( ir::Tensor( T), { Expr(loop_var), Expr(loop_var_j) }); + cinn::lang::Placeholder OUT("d_out", std::vector{{10}}); + + // Expr num1(128); + // Expr num2( 32 ); + // Expr block_step( 1024); + // Expr parallel_size(4); + // auto index_var2 = block_x_var * block_step + thread_x_var / num2 * num1 + thread_x_var % num2; + std::cerr << "before cast" << std::endl; + t_load = ir::Cast::Make( common::Type( 
common::Type::type_t::UInt, 1, 2), t_load); + std::cerr << "after cast" << std::endl; + auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( inner_index ) }); + + +// //auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( index_var2 + loop_var_j * num2 ) }); + + body = ir::Block::Make( {out_store }); + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(8), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + // body = ir::Block::Make( {load_inner_for}); + + + // auto out_store_for = ir::For::Make(loop_var, + // common::make_const(0), + // common::make_const(warp_round), + // ir::ForType::Unrolled, + // ir::DeviceAPI::CUDA, + // body); + + // printer.Print( out_store_for ); + +// cu_dev.Print( out_store_for ); + + + +// // std::cerr << std::endl; + + // cond = ir::EQ::Make( threadidx, Expr(0) ); + // filter = ir::IfThenElse::Make( cond, load_inner_for, Expr()); + + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << "} \n }" << std::endl; + + std::cerr << cu_dev.ss_.str() << std::endl; + + + auto source_code = cu_dev.ss_.str(); + + backends::nvrtc::Compiler compiler; + + auto ptx = compiler(source_code); + + std::cerr << "source code" << source_code << std::endl; + + const int N= 256 * 112 * 112 * 64; + common::float16 *a=(common::float16 *)malloc(N*sizeof(common::float16)); + common::float16 *d_a; + cudaMalloc((void **)&d_a,N*sizeof(common::float16)); + + const int channel = 64; + float *sum_mean=(float *)malloc( channel *sizeof(float)); + float *d_sum_mean; + cudaMalloc((void **)&d_sum_mean, channel *sizeof(float)); + + + float *sum_var=(float *)malloc( channel*sizeof(float)); + float *d_sum_var; + cudaMalloc((void **)&d_sum_var, channel*sizeof(float)); + + float *p_scale=(float *)malloc(channel*sizeof(float)); + float *d_scale; + cudaMalloc((void **)&d_scale,channel*sizeof(float)); + + float *p_bias=(float *)malloc(channel*sizeof(float)); + float *d_bias; + cudaMalloc((void **)&d_bias,channel*sizeof(float)); + + const int num_warps = 4; + const int block_num = 256 * 112 * 112 * 64 / 1024; + const int NUM_PER_BLOCK = N / block_num; + const int NUM_PER_THREAD = NUM_PER_BLOCK/THREAD_PER_BLOCK; + common::float16 *out=( common::float16 *)malloc(N *sizeof(common::float16)); + float *d_out; + + int M = N; + cudaMalloc((void **)&d_out, M *sizeof(common::float16)); + common::float16 *res=(common::float16 *)malloc( M *sizeof(common::float16)); + + srand(0); + for(int i=0;i( rand() % 100 / 100.0 ); + } + + std::cerr << a[0] << std::endl; + + for(int i=0;i< 64;i++){ + + float sum1 = 0; + float sum2 = 0; + for( int k = 0; k < 256 * 112 * 112; ++k ) + { + auto t = static_cast(a[ i + k * 64 ]); + + sum1 += t; + sum2 += t*t; + } + + sum_mean[i] = sum1; + sum_var[i] = sum2; + } + + for( int i = 0; i < channel; ++i) + { + p_scale[i] = static_cast( rand() % 91 / 100.0 ); + p_bias[i] = static_cast( rand() % 151 / 100.0 ); + } + + std::cerr << sum_mean[0] << std::endl; + std::cerr << sum_var[0] << std::endl; + std::cerr << p_scale[0] << std::endl; + std::cerr << p_bias[0] << std::endl; + + + for(int i=0;i< 256 * 112 * 112;i++){ + for( int k = 0; k < 64; ++k){ + float mean = static_cast(sum_mean[k]) / (256 * 112 * 112); + float var = static_cast(sum_var[k]) / (256 * 112 * 112); + + auto sub1 = static_cast(a[ i * 64 + k ]) - mean; + var = var - mean * mean; + auto t1 = sqrt( var + 1e-5); + auto t2 = sub1 / t1; + res[i * 64 + k] = static_cast(t2 * static_cast(p_scale[k]) + static_cast(p_bias[k])); + + if( i == 0 && k % 64 == 0 ) + { + std::cerr << 
res[ i * 64 + k] << std::endl; + } + + } + } + std::cerr << "before copy" << std::endl; + cudaMemcpy(d_a,a,N*sizeof(common::float16),cudaMemcpyHostToDevice); + cudaMemcpy(d_sum_mean, sum_mean, channel *sizeof( float),cudaMemcpyHostToDevice); + cudaMemcpy(d_sum_var, sum_var, channel *sizeof( float),cudaMemcpyHostToDevice); + cudaMemcpy(d_scale, p_scale, channel *sizeof( float),cudaMemcpyHostToDevice); + cudaMemcpy(d_bias, p_bias, channel *sizeof(float),cudaMemcpyHostToDevice); + + dim3 Grid( block_num, 1, 1); + dim3 Block( 128, 1, 1); + + void* args[] = {&d_a, &d_sum_mean, &d_sum_var, &d_scale, &d_bias, &d_out }; + + cinn::runtime::cuda::CUDAModule cuda_module(ptx, cinn::runtime::cuda::CUDAModule::Kind::CUBIN); + + for ( int i = 0; i < 1000; ++i) + { + cuda_module.LaunchKernel(0, "bn_test", Grid, Block, args); + } + + std::cerr << "before copy" << std::endl; + cudaMemcpy(out,d_out, M *sizeof( common::float16),cudaMemcpyDeviceToHost); + + + + if(check(out,res,M))printf("the ans is right\n"); + else{ + printf("the ans is wrong\n"); + for(int i=0;i< M;i++){ + // printf("%lf ",out[i]); + if( abs( static_cast( out[i] ) - static_cast( res[i] ) ) > 1e-5 ){ + std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl; + break; + } + } + printf("\n"); + } + + cudaFree(d_a); + cudaFree(d_out); + + } + + + + + +} + +} // namespace cinn::ir + + diff --git a/cinn/ir/batch_norm_fp16_test.cc b/cinn/ir/batch_norm_fp16_test.cc new file mode 100644 index 0000000000..fc6d25351d --- /dev/null +++ b/cinn/ir/batch_norm_fp16_test.cc @@ -0,0 +1,896 @@ + +#include "cinn/ir/ir_verify.h" + +#include + +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir.h" +#include +#include "cinn/ir/tensor.h" +#include "cinn/lang/placeholder.h" + +#include "cinn/backends/codegen_c_x86.h" +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/codegen_cuda_util.h" + +#include "cinn/backends/nvrtc/nvrtc_util.h" + +#include "cinn/runtime/cuda/cuda_module.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" + +#include "cinn/frontend/net_builder.h" + +#include +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include +#include +#include +#include + +#include "cinn/lang/placeholder.h" + +#include "cinn/hlir/framework/visualize_helper.h" + +#include + +#define THREAD_PER_BLOCK 256 +#define WARP_SIZE 32 + +namespace cinn { +namespace ir { + +bool check( common::float16 *out, common::float16 *res,int n){ + for(int i=0;i(out[i]) - static_cast(res[i]) ) > 1e-5 ) + return false; + } + return true; +} + +int reduce_axis( const std::vector& first, const std::vector& second) +{ + + if( first[0] == 1 && second[0] != 1) + { + return 0; + } + if( first[1] != 1 && second[1] == 1) + { + return 1; + } + throw std::runtime_error("reduce_axis: error"); +} + + +struct InputNode +{ + InputNode() {} + InputNode( std::string n, cinn::lang::Placeholder *p, std::vector dim) + : name(n), in_ptr(p), in_dim(dim) {} + std::string name; + cinn::lang::Placeholder* in_ptr; + std::vector in_dim; +}; + +void process_reduce_max( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = 
outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + std::string temp_max_name = "tmp_max"; + Var temp_max_var( temp_max_name, type_of() ); + + int warp_round = 1; + int thread_round = 4; + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1, 1}); + +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "sub_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder("sub_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = 
ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( "sub", sub, {1, 4}); + +} + + +void process_exp( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 4; + std::string temp_max_name = "exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("exp", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "exp", exp, {1, 4}); + +} + + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 8; + Expr zero(0.0); + std::string temp_max_name = "tmp_sum"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "BlockReduceSum", {sum1}, {}, 
ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder("tmp_sum", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1, 1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder("div_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + // code_dev->Print( temp_max_out); + // code_dev->Print( max_outer_for); + // code_dev->ss_ << std::endl; + + + cinn::ir::IrPrinter printer(std::cout); + + printer.Print( temp_max_out ); + printer.Print( max_outer_for ); + std::cout << std::endl; + + (*input_map)[out_name] = InputNode( "divide", div, {1, 4}); + +} + +void process_fillconstant( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << hlir::framework::DebugString(node) << std::endl; + auto* op = node->op(); + + auto value = absl::get(node->attrs.attr_store.at("value")); + + std::cerr << value << std::endl; + + auto dtype = 
absl::get(node->attrs.attr_store.at("value")); + + std::cerr << out_name << "\t" << dtype << std::endl; + + + + cinn::ir::Var out(out_name, type_of()); + + auto max_var = cinn::ir::Let::Make( out, Expr(value)); + + cinn::ir::IrPrinter printer(std::cout); + + printer.Print( max_var); + + std::cout << std::endl; + + (*input_map)[out_name] = InputNode( out_name, nullptr, {}); +} + +TEST(IrManul, basic) { + // temp = [64], batch_mean [64] + // x = [128, 112, 112, 64] + // batch_var = temp - batch_mean * batch_mean + // inv_std = pow((batch_var + epsilon), half) + // if data_layout == "NHWC": + // x_hat = (x - batch_mean) * inv_std + // else: + // x_hat = (x - reshape(batch_mean, stats_shape)) * reshape( + // inv_std, stats_shape + // ) + + // run_mean = momentum * run_mean + (1 - momentum) * batch_mean + // run_var = momentum * run_var + (1 - momentum) * batch_var + // y = scale * x_hat + bias + + frontend::NetBuilder net_builder("layer_norm"); + { + auto A = net_builder.CreateInput(Float(32), {896, 1792, 64}, "A"); + auto scale = net_builder.CreateInput( Float(32), {64}, "scale" ); + auto bias = net_builder.CreateInput( Float(32), {64}, "bias" ); + auto run_mean = net_builder.CreateInput(Float(32), {64}, "run_mean"); + auto run_var = net_builder.CreateInput( Float(32), {64}, "run_var" ); + // auto num = net_builder.FillConstant( {1}, 768.0, "num" ); + // auto eps = net_builder.FillConstant( {1}, 1e-5, "eps" ); + auto sum1 = net_builder.ReduceSum(A, {1}, false); + // auto mean1 = net_builder.Divide( sum1, num); + // auto sub = net_builder.Subtract(A, mean1); + // auto power = net_builder.Multiply( sub, sub ); + // auto sum2 = net_builder.ReduceSum( power, {2}, true); + // auto mean2 = net_builder.Divide( sum2, num); + // auto t1 = net_builder.Add( mean2, eps); + // auto t2 = net_builder.Sqrt( t1 ); + // auto t3 = net_builder.Divide( sub, t2); + // auto t5 = net_builder.Multiply( t3, weight); + // auto out = net_builder.Add( t5, bias); + } + + auto program = net_builder.Build(); + auto target = common::DefaultTarget(); + + auto graph = std::make_shared(program, target); + + std::cerr << "len " << graph->fusion_groups.size() << std::endl; + + std::cerr << graph->DebugGroupedGraph() << std::endl; + + //auto group0 = graph->FusionGroupsToGroups()[0]; + + auto topo_order = graph->topological_order(); + auto& nodes = std::get<0>(topo_order); + + // add input data + int reduce_block = 2048; + int flatten_block = 1; + + std::vector reduce_range; + std::vector flatten_range; + + std::string name_blockx = "blockIdx.x"; + std::string name_threadx = "xid"; + std::string index_name = "index"; + Var block_x_var( name_blockx, type_of() ); + Var thread_x_var( name_threadx, type_of() ); + // Var index_var( index_name, type_of()); + + Var block_id( "blockIdx.x", type_of() ); + Var flatten_id( "xid", type_of() ); + Var r_id( "rid", type_of() ); + Expr expr_flatten( flatten_block); + Expr expr_reduce( reduce_block); + + cinn::ir::IrPrinter printer(std::cout); + // split the range + + int num_warp = 8; + int num_thread_per_warp = 32; + int element_per_thread = 8; + + Var threadidx("threadIdx.x", type_of()); + Var index_i("i", type_of() ); + Var index_j("j", type_of() ); + Expr expr_warp( num_warp); + Expr expr_thread_per_warp( num_thread_per_warp ); + Expr expr_element_per_thread( element_per_thread ); + + auto warp_id = threadidx / expr_thread_per_warp; + + auto xid = warp_id * Expr(1) + index_i; + auto inner_id = threadidx % expr_thread_per_warp; + auto inner_index = block_id % Expr(64) + ( xid * Expr(8) * 
expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp ) * Expr(64) + block_id / Expr(64) * Expr( 64 * 2048); + + // block reduce + auto warp_round = 1; + auto thread_round = reduce_block / ( num_thread_per_warp * num_warp); + + std::string temp_name = "tmp"; + Var temp_var( temp_name, type_of() ); + auto temp_out = LocalTemp::Make( temp_var, {warp_round, thread_round}); + + Var loop_var("i"); + + cinn::lang::Placeholder C("d_in", std::vector{{10, 10}}); + cinn::lang::Placeholder T("tmp", std::vector{{1,4}}); + //Placeholder A("A", std::vector{{10}}); + //Var input( "input", type_of( )); + Var loop_var_j("j"); + + auto t_load = ir::Load::Make( ir::Tensor(C), { inner_index }); + auto body1 = Store::Make( ir::Tensor(T), Expr(0.0), {Expr(loop_var), Expr(loop_var_j)} ); + + Expr body = Store::Make( ir::Tensor(T), t_load, {Expr(loop_var), Expr(loop_var_j)}); + + // auto cond = ir::LT::Make( xid * Expr(8) * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp, Expr( 1792) ); + // auto filter = ir::IfThenElse::Make( cond, body, Expr()); + // body = ir::Block::Make({body1, filter}); + + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + //printer.Print( load_inner_for ); + + std::cerr << "------------------------------\n"; + + body = ir::Block::Make( {load_inner_for}); + + auto load_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + printer.Print( load_outer_for ); + + + std::map map_input; + + backends::CodeGenCUDA_Dev cu_dev(target); + + + std::string head = R"ROC( + +#include + + + +template +__device__ __forceinline__ float warpReduceSum(float sum) { + if (blockSize >= 32)sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum += __shfl_down_sync(0xffffffff, sum, 8);// 0-8, 1-9, 2-10, etc. + if (blockSize >= 8)sum += __shfl_down_sync(0xffffffff, sum, 4);// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum += __shfl_down_sync(0xffffffff, sum, 2);// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum += __shfl_down_sync(0xffffffff, sum, 1);// 0-1, 2-3, 4-5, etc. + return sum; +} + +const int WARP_SIZE = 32; +__device__ __forceinline__ float BlockReduceSum(float sum) { + + // Shared mem for partial sums (one per warp in the block) + static __shared__ float warpLevelSums[WARP_SIZE]; + const int laneId = threadIdx.x % WARP_SIZE; + const int warpId = threadIdx.x / WARP_SIZE; + + sum = warpReduceSum<32>(sum); + + if( laneId == 0 ) warpLevelSums[warpId] = sum; + __syncthreads(); + + float final_sum = 0.0; + #pragma unroll + for( size_t i = 0; i < 7; ++i) + { + final_sum += warpLevelSums[i]; + } + + if (threadIdx.x == 0) warpLevelSums[0] = final_sum; + __syncthreads(); + return warpLevelSums[0]; + +} + + +template +__device__ __forceinline__ float warpReduce(float sum) { + if (blockSize >= 32)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16) ); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8) );// 0-8, 1-9, 2-10, etc. + if (blockSize >= 8)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4) );// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2) );// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1) );// 0-1, 2-3, 4-5, etc. 
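+ // Each __shfl_down_sync step halves the number of live lanes (offsets
+ // 16, 8, 4, 2, 1), so after five steps lane 0 holds the max of all 32
+ // lanes in the warp.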
+ return sum; +} + + + +extern "C" { + +__global__ void softmax_test(half *d_in, half *d_out ) { +)ROC"; + + cu_dev.ss_ << head << std::endl; + cu_dev.Print( temp_out ); + cu_dev.ss_ << "\n"; + cu_dev.Print( load_outer_for); + + std::cerr << cu_dev.ss_.str() << std::endl; + std::cerr << "=======" << std::endl; + map_input["A"] = InputNode( "A", &T, {1, 4}); + + for (auto* n : nodes) { + + auto node = n->safe_as(); + if (!node || node->op() == nullptr) { + continue; + } + std::cerr << node->op()->name << std::endl; + + if( node->op()->name == "reduce_max") + { + process_reduce_max( &map_input, node, &cu_dev); + }else if ( node->op()->name == "subtract" ) + { + process_sub( &map_input, node, &cu_dev); + }else if ( node->op()->name == "exp" ) + { + process_exp( &map_input, node, &cu_dev); + }else if ( node->op()->name == "reduce_sum" ) + { + process_reduce_sum( &map_input, node, &cu_dev); + }else if ( node->op()->name == "divide" ) + { + process_divide( &map_input, node, &cu_dev); + } + else if ( node->op()->name == "fill_constant" ) + { + process_fillconstant( &map_input, node, &cu_dev); + } + + } + + + + // name var_4 is output + auto var_out = map_input.at( "var_4"); + + t_load = ir::Load::Make( ir::Tensor( *(var_out.in_ptr) ), { Expr(loop_var_j) }); + //t_load = ir::Load::Make( ir::Tensor( T), { Expr(loop_var), Expr(loop_var_j) }); + cinn::lang::Placeholder OUT("d_out", std::vector{{10}}); + + // Expr num1(128); + // Expr num2( 32 ); + // Expr block_step( 1024); + // Expr parallel_size(4); + // auto index_var2 = block_x_var * block_step + thread_x_var / num2 * num1 + thread_x_var % num2; + auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( block_x_var ) }); + + +// //auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( index_var2 + loop_var_j * num2 ) }); + + body = ir::Block::Make( {out_store }); + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(1), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + // body = ir::Block::Make( {load_inner_for}); + + + // auto out_store_for = ir::For::Make(loop_var, + // common::make_const(0), + // common::make_const(warp_round), + // ir::ForType::Unrolled, + // ir::DeviceAPI::CUDA, + // body); + + // printer.Print( out_store_for ); + +// cu_dev.Print( out_store_for ); + + + +// // std::cerr << std::endl; + + auto cond = ir::EQ::Make( threadidx, Expr(0) ); + auto filter = ir::IfThenElse::Make( cond, load_inner_for, Expr()); + + cu_dev.Print( filter ); + cu_dev.ss_ << "} \n }" << std::endl; + + std::cerr << cu_dev.ss_.str() << std::endl; + + + auto source_code = cu_dev.ss_.str(); + + backends::nvrtc::Compiler compiler; + + auto ptx = compiler(source_code); + + std::cerr << "source code" << source_code << std::endl; + + const int N= 1792 * 1792 * 64; + common::float16 *a=(common::float16 *)malloc(N*sizeof(common::float16)); + common::float16 *d_a; + cudaMalloc((void **)&d_a,N*sizeof(common::float16)); + + const int num_warps = 8; + const int block_num = 1792 * 64; + const int NUM_PER_BLOCK = N / block_num; + const int NUM_PER_THREAD = NUM_PER_BLOCK/THREAD_PER_BLOCK; + common::float16 *out=( common::float16 *)malloc(N *sizeof(common::float16)); + float *d_out; + + int M = 1568 * 64; + cudaMalloc((void **)&d_out, M *sizeof(common::float16)); + common::float16 *res=(common::float16 *)malloc( M *sizeof(common::float16)); + + srand(0); + for(int i=0;i( rand() % 100 / 100.0 ); + } + + for(int i=0;i< 1568;i++){ + for( int k = 0; k < 64; ++k){ + float sum = 0; + + for( int j = 0; j < 2048; 
+  const int N = 1792 * 1792 * 64;
+  common::float16 *a = (common::float16 *)malloc(N * sizeof(common::float16));
+  common::float16 *d_a;
+  cudaMalloc((void **)&d_a, N * sizeof(common::float16));
+
+  const int num_warps      = 8;
+  const int block_num      = 1792 * 64;
+  const int NUM_PER_BLOCK  = N / block_num;
+  const int NUM_PER_THREAD = NUM_PER_BLOCK / THREAD_PER_BLOCK;
+  common::float16 *out = (common::float16 *)malloc(N * sizeof(common::float16));
+  common::float16 *d_out;
+
+  int M = 1568 * 64;
+  cudaMalloc((void **)&d_out, M * sizeof(common::float16));
+  common::float16 *res = (common::float16 *)malloc(M * sizeof(common::float16));
+
+  srand(0);
+  for (int i = 0; i < N; i++) {
+    a[i] = static_cast<common::float16>(rand() % 100 / 100.0);
+  }
+
+  for (int i = 0; i < 1568; i++) {
+    for (int k = 0; k < 64; ++k) {
+      float sum = 0;
+
+      for (int j = 0; j < 2048; ++j) {
+        sum += static_cast<float>(a[j * 2048 * 64 + i * 64 + k]);
+      }
+
+      res[i * 64 + k] = static_cast<common::float16>(sum);
+    }
+  }
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(d_a, a, N * sizeof(common::float16), cudaMemcpyHostToDevice);
+
+  dim3 Grid(block_num, 1, 1);
+  dim3 Block(THREAD_PER_BLOCK, 1, 1);
+
+  void* args[] = {&d_a, &d_out};
+
+  cinn::runtime::cuda::CUDAModule cuda_module(ptx, cinn::runtime::cuda::CUDAModule::Kind::CUBIN);
+
+  for (int i = 0; i < 1000; ++i) {
+    cuda_module.LaunchKernel(0, "softmax_test", Grid, Block, args);
+  }
+
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(out, d_out, M * sizeof(common::float16), cudaMemcpyDeviceToHost);
+
+  if (check(out, res, M))
+    printf("the ans is right\n");
+  else {
+    printf("the ans is wrong\n");
+    for (int i = 0; i < M; i++) {
+      // printf("%lf ", out[i]);
+      if (abs(static_cast<float>(out[i]) - static_cast<float>(res[i])) > 1e-5) {
+        std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl;
+        break;
+      }
+    }
+    printf("\n");
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_out);
+
+}
+
+}
+}  // namespace cinn::ir
+
diff --git a/cinn/ir/block_model_test.cc b/cinn/ir/block_model_test.cc
new file mode 100644
index 0000000000..219e5a19b5
--- /dev/null
+++ b/cinn/ir/block_model_test.cc
@@ -0,0 +1,385 @@
+
+#include "cinn/ir/ir_verify.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir.h"
+#include <vector>
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/backends/codegen_c_x86.h"
+#include "cinn/backends/codegen_cuda_dev.h"
+#include "cinn/backends/codegen_cuda_util.h"
+
+#include "cinn/backends/nvrtc/nvrtc_util.h"
+
+#include "cinn/runtime/cuda/cuda_module.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <time.h>
+#include <sys/time.h>
+#include <math.h>
+#include <iostream>
+
+
+#define THREAD_PER_BLOCK 256
+#define WARP_SIZE 32
+
+namespace cinn {
+namespace ir {
+
+bool check(float *out, float *res, int n) {
+  for (int i = 0; i < n; i++) {
+    if (abs(out[i] - res[i]) > 1e-5) return false;
+  }
+  return true;
+}
+
+TEST(IrManul, basic) {
+  // add input data
+  int reduce_block  = 128;
+  int flatten_block = 8;
+
+  std::vector<int> reduce_range;
+  std::vector<int> flatten_range;
+
+  for (int i = 0; i < reduce_block; ++i) {
+    reduce_range.push_back(i);
+  }
+
+  for (int i = 0; i < flatten_block; ++i) {
+    flatten_range.push_back(i);
+  }
+
+  std::string name_blockx  = "blockIdx.x";
+  std::string name_threadx = "xid";
+  std::string index_name   = "index";
+  Var block_x_var(name_blockx, type_of<int>());
+  Var thread_x_var(name_threadx, type_of<int>());
+  // Var index_var( index_name, type_of<int>());
+
+  Var block_id("blockIdx.x", type_of<int>());
+  Var flatten_id("xid", type_of<int>());
+  Var r_id("rid", type_of<int>());
+  Expr expr_flatten(flatten_block);
+  Expr expr_reduce(reduce_block);
+
+  auto index_var = (block_id * expr_flatten + flatten_id) * expr_reduce + r_id;
+
+  auto load_index = LoadIndex::Make(index_var, reduce_range, flatten_range, reduce_block, flatten_block);
+
+  Var input_var("input", type_of<float>());
+
+  auto block_load = BlockLoad::Make(input_var, load_index);
+
+  auto reduce_max = ReduceMax::Make(block_load, 1);
+
+  Var output_var("output", type_of<float>());
+
+  auto store_idx = block_id * expr_flatten + flatten_id;
+
+  auto store_index = LoadIndex::Make(store_idx, reduce_range, flatten_range, reduce_block, flatten_block);
+  auto block_store = BlockStore::Make(output_var, store_index, reduce_max);
+
+  cinn::ir::IrPrinter printer(std::cout);
+  // std::cerr << std::endl;
+  // printer.Print(index_var);
+  // std::cerr << std::endl;
+
+  // printer.Print( block_load );
+  // std::cerr << "======= block " << std::endl;
+  // printer.Print( reduce_max );
+
+  // std::cerr << "=== reduce max" << std::endl;
+
+  // printer.Print( block_store );
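The `index_var` built above linearizes a (row, column) pair as `(blockIdx.x * flatten_block + xid) * reduce_block + rid`. A quick host-side check that this mapping hits every element of the tile exactly once (constants mirror this test; the program is a standalone illustration, not part of the patch):

    #include <cassert>

    int main() {
      const int FLATTEN = 8, REDUCE = 128, BLOCKS = 4;
      bool seen[BLOCKS * FLATTEN * REDUCE] = {};
      for (int block = 0; block < BLOCKS; ++block)
        for (int xid = 0; xid < FLATTEN; ++xid)
          for (int rid = 0; rid < REDUCE; ++rid) {
            int global = (block * FLATTEN + xid) * REDUCE + rid;
            assert(!seen[global]);  // each triple maps to a distinct element
            seen[global] = true;
          }
      return 0;
    }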
+
+  // split the range
+
+  int num_warp            = 8;
+  int num_thread_per_warp = 32;
+  int element_per_thread  = 4;
+
+  Var threadidx("threadIdx.x", type_of<int>());
+  Var index_i("i", type_of<int>());
+  Var index_j("j", type_of<int>());
+  Expr expr_warp(num_warp);
+  Expr expr_thread_per_warp(num_thread_per_warp);
+  Expr expr_element_per_thread(element_per_thread);
+
+  auto warp_id = threadidx / expr_thread_per_warp;
+
+  auto xid         = warp_id + index_i * expr_warp;
+  auto inner_id    = threadidx % expr_thread_per_warp;
+  auto inner_index = xid * expr_element_per_thread * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp;
+
+  // printer.Print( inner_index );
+
+  // warp reduce
+  auto warp_round   = flatten_block / num_warp;
+  auto thread_round = reduce_block / num_thread_per_warp;
+
+  std::string temp_name = "tmp";
+  Var temp_var(temp_name, type_of<float>());
+  auto temp_out = LocalTemp::Make(temp_var, {warp_round, thread_round});
+
+  Var loop_var("i");
+
+  cinn::lang::Placeholder<float> C("d_in", std::vector<int>{{10, 10}});
+  cinn::lang::Placeholder<float> T("tmp", std::vector<int>{{1, 4}});
+  // Placeholder<float> A("A", std::vector<int>{{10}});
+  // Var input("input", type_of<float>());
+  Var loop_var_j("j");
+
+  auto t_load = ir::Load::Make(ir::Tensor(C), {inner_index});
+
+  Expr body = Store::Make(ir::Tensor(T), t_load, {Expr(loop_var), Expr(loop_var_j)});
+
+  body = ir::Block::Make({body});
+
+  auto load_inner_for = ir::For::Make(loop_var_j,
+                                      common::make_const(0),
+                                      common::make_const(thread_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  // printer.Print( load_inner_for );
+
+  std::cerr << "------------------------------\n";
+
+  body = ir::Block::Make({load_inner_for});
+
+  auto load_outer_for = ir::For::Make(loop_var,
+                                      common::make_const(0),
+                                      common::make_const(warp_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  printer.Print(load_outer_for);
+
+  std::cerr << std::endl;
+
+  Expr inf(-100000.0);
+  std::string temp_max_name = "tmp_max";
+  Var temp_max_var(temp_max_name, type_of<float>());
+  auto temp_max_out = LocalTemp::Make(temp_max_var, {warp_round});
+
+  std::string name1 = "max1";
+  cinn::ir::Var max_t(name1, type_of<float>());
+
+  auto max_var = cinn::ir::Let::Make(max_t, inf);
+
+  t_load = ir::Load::Make(ir::Tensor(T), {Expr(loop_var), Expr(loop_var_j)});
+
+  auto new_max = ir::Max::Make(max_t, t_load);
+  auto out_max = ir::Let::Make(max_t, new_max, false);
+
+  body = ir::Block::Make({out_max});
+  load_inner_for = ir::For::Make(loop_var_j,
+                                 common::make_const(0),
+                                 common::make_const(thread_round),
+                                 ir::ForType::Unrolled,
+                                 ir::DeviceAPI::CUDA,
+                                 body);
+
+  auto warp_call = Call::Make(Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern);
+
+  // auto warp_res = ir::Let::Make( max_t, warp_call, false);
+
+  cinn::lang::Placeholder<float> T_MAX("tmp_max", std::vector<int>{{1, 4}});
+  auto max_store = Store::Make(ir::Tensor(T_MAX), warp_call, {Expr(loop_var)});
+
+  body = ir::Block::Make({max_var, load_inner_for, max_store});
+
+  auto max_outer_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  // printer.Print( load_outer_for );
+
+  std::cerr << "\n";
+
+  t_load = ir::Load::Make(ir::Tensor(T_MAX), {Expr(loop_var)});
+  cinn::lang::Placeholder<float> OUT("d_out", std::vector<int>{{10}});
+  auto out_store = Store::Make(ir::Tensor(OUT), t_load, {Expr(block_id * Expr(8) + warp_id)});
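The `Let` pair above models a register accumulator: the first `Let` declares `max1` with its dtype, and the second, built with the `with_dtype = false` overload this patch adds to `Let::Make`, re-assigns it without re-declaring. A hand-written approximation of the CUDA this IR is expected to print, for orientation only (this is sample expected output, not generated text; `warp_round = 1` and `thread_round = 4` follow from the block sizes):

    // float tmp[1][4];   -- LocalTemp "tmp"
    // float tmp_max[1];  -- LocalTemp "tmp_max"
    #pragma unroll
    for (int i = 0; i < 1; i += 1) {
      float max1 = -100000.0f;                 // Let(max1, inf): declaration
      #pragma unroll
      for (int j = 0; j < 4; j += 1) {
        max1 = max(max1, tmp[i][j]);           // Let(max1, Max(...), false): assignment only
      }
      tmp_max[i] = warpReduceMax<128>(max1);   // extern Call
    }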
+  body = ir::Block::Make({out_store});
+  auto out_store_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  printer.Print(out_store_for);
+
+  std::cerr << std::endl;
+
+  auto cond   = ir::EQ::Make(threadidx % Expr(32), Expr(0));
+  auto filter = ir::IfThenElse::Make(cond, out_store_for, Expr());
+
+  // printer.Print( filter );
+
+  auto target = common::DefaultNVGPUTarget();
+  backends::CodeGenCUDA_Dev cu_dev(target);
+
+  std::string head = R"ROC(
+
+template <unsigned int blockSize>
+__device__ __forceinline__ float warpReduceSum(float sum) {
+  if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16);  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);   // 0-1, 2-3, 4-5, etc.
+  return sum;
+}
+
+template <unsigned int blockSize>
+__device__ __forceinline__ float warpReduceMax(float sum) {
+  if (blockSize >= 32) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16));  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8));   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4));   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2));   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1));   // 0-1, 2-3, 4-5, etc.
+  return sum;
+}
+
+extern "C" {
+
+__global__ void softmax_test(float *d_in, float *d_out) {
+)ROC";
+
+  cu_dev.ss_ << head << std::endl;
+  cu_dev.Print(temp_out);
+  cu_dev.ss_ << std::endl;
+  cu_dev.Print(load_outer_for);
+  cu_dev.ss_ << std::endl;
+  cu_dev.Print(temp_max_out);
+  cu_dev.ss_ << "\n";
+  cu_dev.Print(max_outer_for);
+  cu_dev.ss_ << ";" << std::endl;
+  cu_dev.Print(filter);
+  cu_dev.ss_ << ";" << std::endl;
+
+  cu_dev.ss_ << "} \n }" << std::endl;
+
+  std::cerr << cu_dev.ss_.str() << std::endl;
+
+  auto source_code = cu_dev.ss_.str();
+
+  backends::nvrtc::Compiler compiler;
+
+  auto ptx = compiler(source_code);
+
+  std::cerr << "source code" << source_code << std::endl;
+
+  const int N = 128 * 12 * 128 * 128;
+  float *a    = (float *)malloc(N * sizeof(float));
+  float *d_a;
+  cudaMalloc((void **)&d_a, N * sizeof(float));
+
+  const int num_warps      = 8;
+  const int block_num      = 128 * 12 * 128 / num_warps;
+  const int NUM_PER_BLOCK  = N / block_num;
+  const int NUM_PER_THREAD = NUM_PER_BLOCK / THREAD_PER_BLOCK;
+  float *out = (float *)malloc(N * sizeof(float));
+  float *d_out;
+
+  int M = 128 * 12 * 128;
+  cudaMalloc((void **)&d_out, M * sizeof(float));
+  float *res = (float *)malloc(M * sizeof(float));
+
+  srand(0);
+  for (int i = 0; i < N; i++) {
+    a[i] = rand() % 100 / 100.0;
+  }
+
+  for (int i = 0; i < 128 * 12 * 128; i++) {
+    float cur = -100000000;
+    for (int j = 0; j < 128; j++) {
+      if (cur < a[i * 128 + j]) {
+        cur = a[i * 128 + j];
+      }
+    }
+    res[i] = cur;
+  }
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
+
+  dim3 Grid(block_num, 1, 1);
+  dim3 Block(THREAD_PER_BLOCK, 1, 1);
+
+  void* args[] = {&d_a, &d_out};
+
+  cinn::runtime::cuda::CUDAModule cuda_module(ptx, cinn::runtime::cuda::CUDAModule::Kind::CUBIN);
+
+  for (int i = 0; i < 1000; ++i) {
+    cuda_module.LaunchKernel(0, "softmax_test", Grid, Block, args);
+  }
+
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(out, d_out, M * sizeof(float), cudaMemcpyDeviceToHost);
+
+  if (check(out, res, M))
+    printf("the ans is right\n");
+  else {
+    printf("the ans is wrong\n");
+    for (int i = 0; i < M; i++) {
+      // printf("%lf ", out[i]);
+      if (abs(out[i] - res[i]) > 1e-5) {
+        std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl;
+        break;
+      }
+    }
+    printf("\n");
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_out);
+
+}
+
+}
+}  // namespace cinn::ir
+
diff --git a/cinn/ir/fuse_block_model_fp16_test.cc b/cinn/ir/fuse_block_model_fp16_test.cc
new file mode 100644
index 0000000000..14b1b39139
--- /dev/null
+++ b/cinn/ir/fuse_block_model_fp16_test.cc
@@ -0,0 +1,828 @@
+
+#include "cinn/ir/ir_verify.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir.h"
+#include <vector>
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/placeholder.h"
+
+#include
"cinn/backends/codegen_c_x86.h" +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/codegen_cuda_util.h" + +#include "cinn/backends/nvrtc/nvrtc_util.h" + +#include "cinn/runtime/cuda/cuda_module.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" + +#include "cinn/frontend/net_builder.h" + +#include +#include +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include +#include +#include +#include + +#include "cinn/lang/placeholder.h" + +#include + +#define THREAD_PER_BLOCK 256 +#define WARP_SIZE 32 + +namespace cinn { +namespace ir { + +bool check( common::float16 *out, common::float16 *res,int n){ + for(int i=0;i(out[i]) - static_cast(res[i]) ) > 1e-5 ) + return false; + } + return true; +} + +int reduce_axis( const std::vector& first, const std::vector& second) +{ + + if( first[0] == 1 && second[0] != 1) + { + return 0; + } + if( first[1] != 1 && second[1] == 1) + { + return 1; + } + throw std::runtime_error("reduce_axis: error"); +} + + +struct InputNode +{ + InputNode() {} + InputNode( std::string n, cinn::lang::Placeholder *p, std::vector dim) + : name(n), in_ptr(p), in_dim(dim) {} + std::string name; + cinn::lang::Placeholder* in_ptr; + std::vector in_dim; +}; + +void process_reduce_max( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + std::string temp_max_name = "tmp_max"; + Var temp_max_var( temp_max_name, type_of() ); + + int warp_round = 4; + int thread_round = 4; + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + cinn::ir::IrPrinter printer(std::cout); + printer.Print( new_max ); + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1, 1}); + +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, 
backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 4; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "sub_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder("sub_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( "sub", sub, {1, 4}); + +} + + +void process_exp( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 4; + int thread_round = 4; + std::string temp_max_name = "exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("exp", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( 
{load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "exp", exp, {1, 4}); + +} + + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 4; + int thread_round = 4; + Expr zero(0.0); + std::string temp_max_name = "tmp_sum"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceSum<128>", {sum1}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder("tmp_sum", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1, 1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 4; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + 
+ + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder("div_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( "divide", div, {1, 4}); + +} + +TEST(IrManul, basic) { + + frontend::NetBuilder net_builder("softmax"); + { + auto A = net_builder.CreateInput(Float(32), {128, 12, 128, 128}, "A"); + auto Max = net_builder.ReduceMax(A, {3}, true); + auto sub = net_builder.Subtract(A, Max); + auto exp = net_builder.Exp( sub ); + auto sum = net_builder.ReduceSum( exp, {3}, true); + auto out = net_builder.Divide( exp, sum); + } + + auto program = net_builder.Build(); + auto target = common::DefaultTarget(); + + auto graph = std::make_shared(program, target); + + std::cerr << "len " << graph->fusion_groups.size() << std::endl; + + std::cerr << graph->DebugGroupedGraph() << std::endl; + + //auto group0 = graph->FusionGroupsToGroups()[0]; + + auto topo_order = graph->topological_order(); + auto& nodes = std::get<0>(topo_order); + + // add input data + int reduce_block = 128; + int flatten_block = 32; + + std::vector reduce_range; + std::vector flatten_range; + + std::string name_blockx = "blockIdx.x"; + std::string name_threadx = "xid"; + std::string index_name = "index"; + Var block_x_var( name_blockx, type_of() ); + Var thread_x_var( name_threadx, type_of() ); + // Var index_var( index_name, type_of()); + + Var block_id( "blockIdx.x", type_of() ); + Var flatten_id( "xid", type_of() ); + Var r_id( "rid", type_of() ); + Expr expr_flatten( flatten_block); + Expr expr_reduce( reduce_block); + + auto index_var =( block_id * expr_flatten + flatten_id ) * expr_reduce + r_id; + + auto load_index = LoadIndex::Make( index_var, reduce_range, flatten_range, reduce_block, flatten_block); + + Var input_var("input", type_of() ); + + auto block_load = BlockLoad::Make( input_var, load_index ); + + auto reduce_max = ReduceMax::Make( block_load, 1); + + Var output_var("output", type_of() ); + + auto store_idx = block_id * expr_flatten + flatten_id; + + auto store_index = LoadIndex::Make( store_idx, reduce_range, flatten_range, reduce_block, flatten_block); + auto block_store = BlockStore::Make( output_var, store_index, reduce_max ); + + cinn::ir::IrPrinter printer(std::cout); + // std::cerr << std::endl; + // printer.Print(index_var); + // std::cerr << std::endl; + + // printer.Print( block_load ); + // std::cerr << "======= block " << std::endl; + // printer.Print( reduce_max ); + + // std::cerr << "=== reduce max" << std::endl; + + // printer.Print( block_store ); + + + + // split the range + + int num_warp = 8; + int num_thread_per_warp = 32; + int element_per_thread = 8; + + Var threadidx("threadIdx.x", type_of()); + Var index_i("i", type_of() ); + Var index_j("j", type_of() ); + Expr expr_warp( num_warp); + Expr 
expr_thread_per_warp(num_thread_per_warp);
+  Expr expr_element_per_thread(element_per_thread);
+
+  auto warp_id = threadidx / expr_thread_per_warp;
+
+  auto xid         = warp_id * Expr(4) + index_i;
+  auto inner_id    = threadidx % expr_thread_per_warp;
+  auto inner_index = block_id * Expr(4096) + xid * Expr(4) * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp;
+
+  // auto inner_index = blockIdx.x * 1024 + (threadIdx.x / 32) * 128 + threadIdx.x % 32;
+  // auto inner_index = block_id * Expr(1024) + (threadidx / Expr(32) ) * Expr(128) + threadidx % Expr(32) + index_j * expr_thread_per_warp;
+  // printer.Print( inner_index );
+
+  // warp reduce
+  auto warp_round   = flatten_block / num_warp;
+  auto thread_round = reduce_block / num_thread_per_warp;
+
+  std::string temp_name = "tmp";
+  Var temp_var(temp_name, type_of<float>());
+  auto temp_out = LocalTemp::Make(temp_var, {warp_round, thread_round});
+
+  Var loop_var("i");
+
+  cinn::lang::Placeholder<common::float16> C("d_in", std::vector<int>{{10, 10}});
+  cinn::lang::Placeholder<float> T("tmp", std::vector<int>{{1, 4}});
+  // Placeholder<float> A("A", std::vector<int>{{10}});
+  // Var input("input", type_of<float>());
+  Var loop_var_j("j");
+
+  auto t_load = ir::Load::Make(ir::Tensor(C), {inner_index});
+
+  Expr body = Store::Make(ir::Tensor(T), t_load, {Expr(loop_var), Expr(loop_var_j)});
+
+  body = ir::Block::Make({body});
+
+  auto load_inner_for = ir::For::Make(loop_var_j,
+                                      common::make_const(0),
+                                      common::make_const(thread_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  // printer.Print( load_inner_for );
+
+  std::cerr << "------------------------------\n";
+
+  body = ir::Block::Make({load_inner_for});
+
+  auto load_outer_for = ir::For::Make(loop_var,
+                                      common::make_const(0),
+                                      common::make_const(warp_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  // printer.Print( load_outer_for );
+
+  std::map<std::string, InputNode> map_input;
+
+  backends::CodeGenCUDA_Dev cu_dev(target);
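It is worth sanity-checking the tiling arithmetic before generating code: 8 warps of 4 rows each cover the block's 32 rows, and each of a warp's 32 lanes strides its 128-wide row four times. A host-side mirror of `inner_index` with the block offset dropped (constants taken from this test; standalone illustration only):

    #include <cassert>

    int main() {
      bool seen[32 * 128] = {};
      for (int tid = 0; tid < 256; ++tid) {
        int warp = tid / 32, lane = tid % 32;
        for (int i = 0; i < 4; ++i)       // warp_round
          for (int j = 0; j < 4; ++j) {   // thread_round
            int xid = warp * 4 + i;       // row within the block
            int idx = xid * 4 * 32 + lane + j * 32;  // == xid * 128 + j * 32 + lane
            assert(!seen[idx]);
            seen[idx] = true;
          }
      }
      return 0;  // every element of the 32x128 tile touched exactly once
    }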
+
+  std::string head = R"ROC(
+
+#include <cuda_fp16.h>
+
+template <unsigned int blockSize>
+__device__ __forceinline__ float warpReduceSum(float sum) {
+  if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16);  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);   // 0-1, 2-3, 4-5, etc.
+  return sum;
+}
+
+template <unsigned int blockSize>
+__device__ __forceinline__ float warpReduceMax(float sum) {
+  if (blockSize >= 32) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16));  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8));   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4));   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2));   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1));   // 0-1, 2-3, 4-5, etc.
+  return sum;
+}
+
+extern "C" {
+
+__global__ void softmax_test(half *d_in, half *d_out) {
+)ROC";
+
+  cu_dev.ss_ << head << std::endl;
+  cu_dev.Print(temp_out);
+  cu_dev.ss_ << "\n";
+  cu_dev.Print(load_outer_for);
+
+  std::cerr << cu_dev.ss_.str() << std::endl;
+  std::cerr << "=======" << std::endl;
+  map_input["A"] = InputNode("A", &T, {1, 4});
+
+  for (auto* n : nodes) {
+    auto node = n->safe_as<hlir::framework::Node>();
+    if (!node || node->op() == nullptr) {
+      continue;
+    }
+    std::cerr << node->op()->name << std::endl;
+
+    if (node->op()->name == "reduce_max") {
+      process_reduce_max(&map_input, node, &cu_dev);
+    } else if (node->op()->name == "subtract") {
+      process_sub(&map_input, node, &cu_dev);
+    } else if (node->op()->name == "exp") {
+      process_exp(&map_input, node, &cu_dev);
+    } else if (node->op()->name == "reduce_sum") {
+      process_reduce_sum(&map_input, node, &cu_dev);
+    } else if (node->op()->name == "divide") {
+      process_divide(&map_input, node, &cu_dev);
+    }
+  }
+
+  // name var_4 is output
+  auto var_out = map_input.at("var_4");
+
+  t_load = ir::Load::Make(ir::Tensor(*(var_out.in_ptr)), {Expr(loop_var), Expr(loop_var_j)});
+  // t_load = ir::Load::Make( ir::Tensor(T), { Expr(loop_var), Expr(loop_var_j) });
+  cinn::lang::Placeholder<common::float16> OUT("d_out", std::vector<int>{{10}});
+
+  // Expr num1(128);
+  // Expr num2(32);
+  // Expr block_step(1024);
+  // Expr parallel_size(4);
+  // auto index_var2 = block_x_var * block_step + thread_x_var / num2 * num1 + thread_x_var % num2;
+  auto out_store = Store::Make(ir::Tensor(OUT), t_load, {Expr(inner_index)});
+
+  // auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( index_var2 + loop_var_j * num2 ) });
+
+  body = ir::Block::Make({out_store});
+  load_inner_for = ir::For::Make(loop_var_j,
+                                 common::make_const(0),
+                                 common::make_const(thread_round),
+                                 ir::ForType::Unrolled,
+                                 ir::DeviceAPI::CUDA,
+                                 body);
+
+  body = ir::Block::Make({load_inner_for});
+
+  auto out_store_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  // printer.Print( out_store_for );
+
+  cu_dev.Print(out_store_for);
+
+  // std::cerr << std::endl;
+
+  // auto cond = ir::EQ::Make( threadidx % Expr(32), Expr(0) );
+  // auto filter = ir::IfThenElse::Make( cond, out_store_for, Expr());
+
+  cu_dev.ss_ << "} \n }" << std::endl;
+
+  std::cerr << cu_dev.ss_.str() << std::endl;
+
+  auto source_code = cu_dev.ss_.str();
+
+  backends::nvrtc::Compiler compiler;
+
+  auto ptx = compiler(source_code);
+
+  std::cerr << "source code" << source_code << std::endl;
+
+  const int N = 128 * 12 * 128 * 128;
+  common::float16 *a = (common::float16 *)malloc(N * sizeof(common::float16));
+  common::float16 *d_a;
+  cudaMalloc((void **)&d_a, N * sizeof(common::float16));
+
+  const int num_warps      = 8;
+  const int block_num      = 128 * 12 * 128 / num_warps / 4;
+  const int NUM_PER_BLOCK  = N / block_num;
+  const int NUM_PER_THREAD = NUM_PER_BLOCK / THREAD_PER_BLOCK;
+  common::float16 *out = (common::float16 *)malloc(N * sizeof(common::float16));
+  common::float16 *d_out;
+
+  int M = N;
+  cudaMalloc((void **)&d_out, M * sizeof(common::float16));
+  common::float16 *res = (common::float16 *)malloc(M * sizeof(common::float16));
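One caveat on the upcoming comparison: `check` uses an absolute tolerance of 1e-5, but binary16 has only a 10-bit fraction, so the gap between adjacent representable values near 1.0 is about 9.8e-4, and even near the softmax outputs here (about 1/128) it is about 7.6e-6, leaving almost no headroom for rounding in the 128-term sum. A small standalone illustration of fp16 spacing (not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // Spacing of binary16 values near x: 2^(floor(log2 x) - 10),
    // since fp16 carries a 10-bit fraction.
    int main() {
      for (float x : {0.01f, 0.1f, 1.0f}) {
        float ulp = std::ldexp(1.0f, (int)std::floor(std::log2(x)) - 10);
        std::printf("x=%g  fp16 spacing ~ %g\n", x, ulp);
      }
      return 0;
    }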
+
+  srand(0);
+  for (int i = 0; i < N; i++) {
+    a[i] = static_cast<common::float16>(rand() % 100 / 100.0);
+  }
+
+  for (int i = 0; i < 128 * 12 * 128; i++) {
+    float cur = -100000000;
+    for (int j = 0; j < 128; j++) {
+      if (cur < static_cast<float>(a[i * 128 + j])) {
+        cur = static_cast<float>(a[i * 128 + j]);
+      }
+    }
+    float sum = 0;
+    float temp[128];
+    for (int j = 0; j < 128; ++j) {
+      temp[j] = exp(static_cast<float>(a[i * 128 + j]) - cur);
+      sum += temp[j];
+    }
+
+    for (int j = 0; j < 128; ++j) {
+      res[i * 128 + j] = static_cast<common::float16>(temp[j] / sum);
+    }
+  }
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(d_a, a, N * sizeof(common::float16), cudaMemcpyHostToDevice);
+
+  dim3 Grid(block_num, 1, 1);
+  dim3 Block(THREAD_PER_BLOCK, 1, 1);
+
+  void* args[] = {&d_a, &d_out};
+
+  cinn::runtime::cuda::CUDAModule cuda_module(ptx, cinn::runtime::cuda::CUDAModule::Kind::CUBIN);
+
+  for (int i = 0; i < 1000; ++i) {
+    cuda_module.LaunchKernel(0, "softmax_test", Grid, Block, args);
+  }
+
+  std::cerr << "before copy" << std::endl;
+  cudaMemcpy(out, d_out, M * sizeof(common::float16), cudaMemcpyDeviceToHost);
+
+  if (check(out, res, M))
+    printf("the ans is right\n");
+  else {
+    printf("the ans is wrong\n");
+    for (int i = 0; i < M; i++) {
+      // printf("%lf ", out[i]);
+      if (abs(static_cast<float>(out[i]) - static_cast<float>(res[i])) > 1e-5) {
+        std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl;
+        break;
+      }
+    }
+    printf("\n");
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_out);
+
+}
+
+}
+}  // namespace cinn::ir
+
diff --git a/cinn/ir/fuse_block_model_test.cc b/cinn/ir/fuse_block_model_test.cc
new file mode 100644
index 0000000000..5c4f22b7d3
--- /dev/null
+++ b/cinn/ir/fuse_block_model_test.cc
@@ -0,0 +1,822 @@
+
+#include "cinn/ir/ir_verify.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir.h"
+#include <vector>
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/backends/codegen_c_x86.h"
+#include "cinn/backends/codegen_cuda_dev.h"
+#include "cinn/backends/codegen_cuda_util.h"
+
+#include "cinn/backends/nvrtc/nvrtc_util.h"
+
+#include "cinn/runtime/cuda/cuda_module.h"
+#include "cinn/hlir/framework/op_lowering.h"
+#include "cinn/hlir/framework/pass.h"
+
+#include "cinn/frontend/net_builder.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <time.h>
+#include <sys/time.h>
+#include <math.h>
+#include <iostream>
+
+#include "cinn/lang/placeholder.h"
+
+#include <map>
+
+#define THREAD_PER_BLOCK 256
+#define WARP_SIZE 32
+
+namespace cinn {
+namespace ir {
+
+bool check(float *out, float *res, int n) {
+  for (int i = 0; i < n; i++) {
+    if (abs(out[i] - res[i]) > 1e-5)
+      return false;
+  }
+  return true;
+}
+
+int reduce_axis(const std::vector<int>& first, const std::vector<int>& second) {
+  if (first[0] == 1 && second[0] != 1) {
+    return 0;
+  }
+  if (first[1] != 1 && second[1] == 1) {
+    return 1;
+  }
+  throw std::runtime_error("reduce_axis: error");
+}
+
+struct InputNode {
+  InputNode() {}
+  InputNode(std::string n, cinn::lang::Placeholder<float>* p, std::vector<int> dim)
+      : name(n), in_ptr(p), in_dim(dim) {}
+  std::string name;
+  cinn::lang::Placeholder<float>* in_ptr;
+  std::vector<int> in_dim;
+};
+
+void process_reduce_max(std::map<std::string, InputNode>* input_map,
+                        hlir::framework::Node* node,
+                        backends::CodeGenCUDA_Dev* code_dev) {
+  std::string in_name;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      in_name = innode->id();
+    }
+  }
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  InputNode& input_node = input_map->at(in_name);
+
+  Var loop_var("i");
+  Var loop_var_j("j");
+  Expr inf(-100000.0);
+  std::string temp_max_name = "tmp_max";
+  Var temp_max_var(temp_max_name, type_of<float>());
+
+  int warp_round   = 1;
+  int thread_round = 4;
+  auto
temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1, 1}); + +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "sub_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder("sub_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( "sub", 
sub, {1, 4}); + +} + + +void process_exp( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 4; + std::string temp_max_name = "exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("exp", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "exp", exp, {1, 4}); + +} + + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr zero(0.0); + std::string temp_max_name = "tmp_sum"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceSum<128>", {sum1}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder("tmp_sum", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = 
ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1, 1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = "div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var) }); + + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder("div_tmp", std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( "divide", div, {1, 4}); + +} + +TEST(IrManul, basic) { + + frontend::NetBuilder net_builder("softmax"); + { + auto A = net_builder.CreateInput(Float(32), {128, 12, 128, 128}, "A"); + auto Max = net_builder.ReduceMax(A, {3}, true); + auto sub = net_builder.Subtract(A, Max); + auto exp = net_builder.Exp( sub ); + auto sum = net_builder.ReduceSum( exp, {3}, true); + auto out = net_builder.Divide( exp, sum); + } + + auto program = net_builder.Build(); + auto target = common::DefaultTarget(); + + auto graph = std::make_shared(program, target); + + std::cerr << "len " << graph->fusion_groups.size() << std::endl; + + std::cerr << graph->DebugGroupedGraph() << std::endl; + + //auto group0 = graph->FusionGroupsToGroups()[0]; + + auto topo_order = graph->topological_order(); + auto& nodes = std::get<0>(topo_order); + + // add input data + int reduce_block = 128; + int flatten_block = 8; + + std::vector reduce_range; + std::vector flatten_range; + + std::string name_blockx = "blockIdx.x"; + std::string name_threadx = "xid"; + std::string index_name = "index"; + Var block_x_var( name_blockx, type_of() ); + Var thread_x_var( name_threadx, 
type_of() ); + // Var index_var( index_name, type_of()); + + Var block_id( "blockIdx.x", type_of() ); + Var flatten_id( "xid", type_of() ); + Var r_id( "rid", type_of() ); + Expr expr_flatten( flatten_block); + Expr expr_reduce( reduce_block); + + auto index_var =( block_id * expr_flatten + flatten_id ) * expr_reduce + r_id; + + auto load_index = LoadIndex::Make( index_var, reduce_range, flatten_range, reduce_block, flatten_block); + + Var input_var("input", type_of() ); + + auto block_load = BlockLoad::Make( input_var, load_index ); + + auto reduce_max = ReduceMax::Make( block_load, 1); + + Var output_var("output", type_of() ); + + auto store_idx = block_id * expr_flatten + flatten_id; + + auto store_index = LoadIndex::Make( store_idx, reduce_range, flatten_range, reduce_block, flatten_block); + auto block_store = BlockStore::Make( output_var, store_index, reduce_max ); + + cinn::ir::IrPrinter printer(std::cout); + // std::cerr << std::endl; + // printer.Print(index_var); + // std::cerr << std::endl; + + // printer.Print( block_load ); + // std::cerr << "======= block " << std::endl; + // printer.Print( reduce_max ); + + // std::cerr << "=== reduce max" << std::endl; + + // printer.Print( block_store ); + + + + // split the range + + int num_warp = 8; + int num_thread_per_warp = 32; + int element_per_thread = 4; + + Var threadidx("threadIdx.x", type_of()); + Var index_i("i", type_of() ); + Var index_j("j", type_of() ); + Expr expr_warp( num_warp); + Expr expr_thread_per_warp( num_thread_per_warp ); + Expr expr_element_per_thread( element_per_thread ); + + auto warp_id = threadidx / expr_thread_per_warp; + + auto xid = warp_id + index_i * expr_warp; + auto inner_id = threadidx % expr_thread_per_warp; + auto inner_index = block_id *Expr(1024) + xid * expr_element_per_thread * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp; + + // auto inner_index = blockIdx.x * 1024 + (threadIdx.x / 32) * 128 + threadIdx.x % 32; + // auto inner_index = block_id * Expr(1024) + (threadidx / Expr(32) ) * Expr(128) + threadidx % Expr(32) + index_j * expr_thread_per_warp; + // printer.Print( inner_index ); + + // warp reduce + auto warp_round = flatten_block / num_warp; + auto thread_round = reduce_block / num_thread_per_warp; + + std::string temp_name = "tmp"; + Var temp_var( temp_name, type_of() ); + auto temp_out = LocalTemp::Make( temp_var, {warp_round, thread_round}); + + Var loop_var("i"); + + cinn::lang::Placeholder C("d_in", std::vector{{10, 10}}); + cinn::lang::Placeholder T("tmp", std::vector{{1,4}}); + //Placeholder A("A", std::vector{{10}}); + //Var input( "input", type_of( )); + Var loop_var_j("j"); + + auto t_load = ir::Load::Make( ir::Tensor(C), { inner_index }); + + Expr body = Store::Make( ir::Tensor(T), t_load, {Expr(loop_var), Expr(loop_var_j)}); + + body = ir::Block::Make({body}); + + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + //printer.Print( load_inner_for ); + + std::cerr << "------------------------------\n"; + + body = ir::Block::Make( {load_inner_for}); + + auto load_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_outer_for ); + + + std::map map_input; + + backends::CodeGenCUDA_Dev cu_dev(target); + + + std::string head = R"ROC( + +template +__device__ __forceinline__ float warpReduceSum(float sum) { + if 
(blockSize >= 32)sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum += __shfl_down_sync(0xffffffff, sum, 8);// 0-8, 1-9, 2-10, etc. + if (blockSize >= 8)sum += __shfl_down_sync(0xffffffff, sum, 4);// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum += __shfl_down_sync(0xffffffff, sum, 2);// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum += __shfl_down_sync(0xffffffff, sum, 1);// 0-1, 2-3, 4-5, etc. + return sum; +} + +template +__device__ __forceinline__ float warpReduceMax(float sum) { + if (blockSize >= 32)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16) ); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8) );// 0-8, 1-9, 2-10, etc. + if (blockSize >= 8)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4) );// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2) );// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1) );// 0-1, 2-3, 4-5, etc. + return sum; +} + +extern "C" { + +__global__ void softmax_test(float *d_in,float *d_out ) { +)ROC"; + + cu_dev.ss_ << head << std::endl; + cu_dev.Print( temp_out ); + cu_dev.ss_ << "\n"; + cu_dev.Print( load_outer_for); + + std::cerr << cu_dev.ss_.str() << std::endl; + std::cerr << "=======" << std::endl; + map_input["A"] = InputNode( "A", &T, {1, 4}); + + for (auto* n : nodes) { + + auto node = n->safe_as(); + if (!node || node->op() == nullptr) { + continue; + } + std::cerr << node->op()->name << std::endl; + + if( node->op()->name == "reduce_max") + { + process_reduce_max( &map_input, node, &cu_dev); + }else if ( node->op()->name == "subtract" ) + { + process_sub( &map_input, node, &cu_dev); + }else if ( node->op()->name == "exp" ) + { + process_exp( &map_input, node, &cu_dev); + }else if ( node->op()->name == "reduce_sum" ) + { + process_reduce_sum( &map_input, node, &cu_dev); + }else if ( node->op()->name == "divide" ) + { + process_divide( &map_input, node, &cu_dev); + } + + } + + + + // name var_4 is output + auto var_out = map_input.at( "var_4"); + + t_load = ir::Load::Make( ir::Tensor( *(var_out.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + //t_load = ir::Load::Make( ir::Tensor( T), { Expr(loop_var), Expr(loop_var_j) }); + cinn::lang::Placeholder OUT("d_out", std::vector{{10}}); + + // Expr num1(128); + // Expr num2( 32 ); + // Expr block_step( 1024); + // Expr parallel_size(4); + // auto index_var2 = block_x_var * block_step + thread_x_var / num2 * num1 + thread_x_var % num2; + auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( inner_index ) }); + + + //auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( index_var2 + loop_var_j * num2 ) }); + + body = ir::Block::Make( {out_store }); + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + body = ir::Block::Make( {load_inner_for}); + + + auto out_store_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + //printer.Print( out_store_for ); + + cu_dev.Print( out_store_for ); + + + + // std::cerr << std::endl; + + // auto cond = ir::EQ::Make( threadidx % Expr(32), Expr(0) ); + // auto filter = ir::IfThenElse::Make( cond, out_store_for, Expr()); + + cu_dev.ss_ << "} \n }" << std::endl; + + std::cerr << cu_dev.ss_.str() << std::endl; + + + auto source_code = cu_dev.ss_.str(); + + 
backends::nvrtc::Compiler compiler; + + auto ptx = compiler(source_code); + + std::cerr << "source code" << source_code << std::endl; + + const int N= 128 * 12 * 128 *128; + float *a=(float *)malloc(N*sizeof(float)); + float *d_a; + cudaMalloc((void **)&d_a,N*sizeof(float)); + + const int num_warps = 8; + const int block_num = 128 * 12 * 128 / num_warps; + const int NUM_PER_BLOCK = N / block_num; + const int NUM_PER_THREAD = NUM_PER_BLOCK/THREAD_PER_BLOCK; + float *out=(float *)malloc(N *sizeof(float)); + float *d_out; + + int M = N; + cudaMalloc((void **)&d_out, M *sizeof(float)); + float *res=(float *)malloc( M *sizeof(float)); + + srand(0); + for(int i=0;i 1e-5 ){ + std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl; + break; + } + } + printf("\n"); + } + + cudaFree(d_a); + cudaFree(d_out); + + } + + + + + +} + +} // namespace cinn::ir + + diff --git a/cinn/ir/ir.cc b/cinn/ir/ir.cc index 7db39fa704..5f2403edd2 100755 --- a/cinn/ir/ir.cc +++ b/cinn/ir/ir.cc @@ -32,9 +32,9 @@ namespace ir { using common::make_shared; Expr Cast::Make(Type t, Expr v) { - CHECK(!t.is_unk()); - CHECK(!(t.is_void() && !t.is_cpp_handle())) << "Void is not allowed to cast"; - CHECK(v.defined()); + // CHECK(!t.is_unk()); + // CHECK(!(t.is_void() && !t.is_cpp_handle())) << "Void is not allowed to cast"; + // CHECK(v.defined()); auto node = make_shared(); node->v() = v; @@ -185,7 +185,7 @@ void Not::Verify() const { CHECK_EQ(v().type(), type_of()); } Type Not::type() const { return type_; } -Expr Let::Make(Expr symbol, Expr body) { +Expr Let::Make(Expr symbol, Expr body, bool with_dtype) { auto *n = make_shared(); CHECK(symbol.type().valid()); if (body.defined()) { @@ -194,6 +194,7 @@ Expr Let::Make(Expr symbol, Expr body) { n->symbol = symbol; n->body = body; n->set_type(n->symbol->type()); + n->with_dtype = with_dtype; return Expr(n); } @@ -350,7 +351,7 @@ std::vector IfThenElse::expr_fields() { return {&condition, &true_case, std::vector IfThenElse::expr_fields() const { return {&condition, &true_case, &false_case}; } Expr Store::Make(Expr tensor, Expr value, const std::vector &indices) { - CHECK(tensor.As<_Tensor_>()) << "tensor should be _Tensor_ type"; + //CHECK(tensor.As<_Tensor_>()) << "tensor should be _Tensor_ type"; auto node = make_shared(); node->tensor = tensor; node->value = value; diff --git a/cinn/ir/ir.h b/cinn/ir/ir.h index 894ab40e4c..5525aa010c 100644 --- a/cinn/ir/ir.h +++ b/cinn/ir/ir.h @@ -298,8 +298,9 @@ struct Not : public UnaryOpNode { struct Let : public ExprNode { Expr symbol; Expr body; + bool with_dtype; - static Expr Make(Expr symbol, Expr body); + static Expr Make(Expr symbol, Expr body, bool with_dtype=true); Type type() const override; @@ -317,6 +318,149 @@ struct Let : public ExprNode { } }; +struct LocalTemp : public ExprNode{ + Expr symbol; + std::vector local_size; + + static Expr Make(Expr v, std::vector size) { + auto *n = make_shared(); + n->symbol = v; + n->local_size = size; + n->set_type(n->symbol->type()); + return Expr(n); + } + + Type type() const override { + return symbol.type(); + } + void Verify() const override {} + + static const IrNodeTy _node_type_ = IrNodeTy::LocalTemp; + + +}; + + + +struct Sqrt : public ExprNode{ + Expr symbol; + + static Expr Make(Expr v) { + auto *n = make_shared(); + n->symbol = v; + n->set_type(n->symbol->type()); + return Expr(n); + } + + Type type() const override { + return symbol.type(); + } + void Verify() const override {} + + static const IrNodeTy _node_type_ = IrNodeTy::Sqrt; + + +}; + + +struct ReduceMax : public 
ExprNode<ReduceMax> {
+  Expr input;
+  int axis;
+
+  static Expr Make(Expr in, int axis) {
+    auto *n = make_shared<ReduceMax>();
+    n->input = in;
+    n->axis  = axis;
+    n->set_type(n->input->type());
+    return Expr(n);
+  }
+
+  Type type() const override { return input.type(); }
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::ReduceMax;
+};
+
+struct LoadIndex : public ExprNode<LoadIndex> {
+  Expr index_expr;
+
+  std::vector<int> reduce_range;
+  std::vector<int> flatten_range;
+  int reduce_block;
+  int flatten_block;
+
+  static Expr Make(Expr id_expr,
+                   std::vector<int> reduce_range,
+                   std::vector<int> flatten_range,
+                   int reduce_block,
+                   int flatten_block) {
+    auto *n          = make_shared<LoadIndex>();
+    n->index_expr    = id_expr;
+    n->reduce_range  = reduce_range;
+    n->flatten_range = flatten_range;
+    n->reduce_block  = reduce_block;
+    n->flatten_block = flatten_block;
+    n->set_type(n->index_expr.type());
+
+    return Expr(n);
+  }
+
+  Type type() const override { return index_expr.type(); }
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::LoadIndex;
+};
+
+struct BlockLoad : public ExprNode<BlockLoad> {
+  Expr input;
+  Expr load_index;
+
+  static Expr Make(Expr in, Expr index) {
+    auto *n       = make_shared<BlockLoad>();
+    n->input      = in;
+    n->load_index = index;
+    n->set_type(n->input->type());
+    return Expr(n);
+  }
+
+  Type type() const override { return input.type(); }
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::BlockLoad;
+};
+
+struct BlockStore : public ExprNode<BlockStore> {
+  Expr input;
+  Expr load_index;
+  Expr value;
+
+  static Expr Make(Expr in, Expr index, Expr value) {
+    auto *n       = make_shared<BlockStore>();
+    n->input      = in;
+    n->load_index = index;
+    n->value      = value;
+    n->set_type(n->input->type());
+    return Expr(n);
+  }
+
+  Type type() const override { return input.type(); }
+  void Verify() const override {}
+
+  static const IrNodeTy _node_type_ = IrNodeTy::BlockStore;
+};
+
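Taken together, these nodes describe a block-tiled load, reduce, store pipeline, and the tests above assemble them exactly this way. A condensed sketch of the composition (`build_row_max` is a hypothetical helper written for illustration; the `Make` calls mirror block_model_test.cc):

    // Build IR for a per-row max over a [rows x reduce_block] tile.
    Expr build_row_max(Var input, Var output, int reduce_block, int flatten_block) {
      Var block("blockIdx.x", type_of<int>());
      Var xid("xid", type_of<int>());
      Var rid("rid", type_of<int>());

      // Linear address of element (row, col) = (block*flatten + xid, rid).
      auto idx  = (block * Expr(flatten_block) + xid) * Expr(reduce_block) + rid;
      auto load = BlockLoad::Make(input, LoadIndex::Make(idx, {}, {}, reduce_block, flatten_block));
      auto rmax = ReduceMax::Make(load, /*axis=*/1);

      // One output per row.
      auto out_idx = block * Expr(flatten_block) + xid;
      return BlockStore::Make(output, LoadIndex::Make(out_idx, {}, {}, reduce_block, flatten_block), rmax);
    }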
Extern = 0,

diff --git a/cinn/ir/ir_base.h b/cinn/ir/ir_base.h
index 56fa3c371a..297ecbae38 100755
--- a/cinn/ir/ir_base.h
+++ b/cinn/ir/ir_base.h
@@ -109,6 +109,12 @@ class ScheduleBlockRealize;
   macro__(_BufferRange_) \
   macro__(ScheduleBlock) \
   macro__(ScheduleBlockRealize) \
+  macro__(LocalTemp) \
+  macro__(LoadIndex) \
+  macro__(ReduceMax) \
+  macro__(BlockLoad) \
+  macro__(BlockStore) \
+  macro__(Sqrt) \

 #define NODETY_FORALL(__m) \

diff --git a/cinn/ir/ir_compare.cc b/cinn/ir/ir_compare.cc
index 16a0672d51..c63553482a 100644
--- a/cinn/ir/ir_compare.cc
+++ b/cinn/ir/ir_compare.cc
@@ -185,6 +185,43 @@ bool IrEqualVisitor::Visit(const Store* lhs, const Expr* other) {
   return Compare(lhs->tensor, rhs->tensor) && Compare(lhs->indices, rhs->indices);
 }

+bool IrEqualVisitor::Visit(const LocalTemp* lhs, const Expr* other) {
+  std::cerr << "not impl: LocalTemp equal visit";
+  return false;
+}
+
+bool IrEqualVisitor::Visit(const Sqrt* lhs, const Expr* other) {
+  std::cerr << "not impl: Sqrt equal visit";
+  return false;
+}
+
+bool IrEqualVisitor::Visit(const LoadIndex* lhs, const Expr* other) {
+  std::cerr << "not impl: LoadIndex equal visit";
+  return false;
+}
+
+bool IrEqualVisitor::Visit(const BlockLoad* lhs, const Expr* other) {
+  std::cerr << "not impl: BlockLoad equal visit";
+  return false;
+}
+
+bool IrEqualVisitor::Visit(const BlockStore* lhs, const Expr* other) {
+  std::cerr << "not impl: BlockStore equal visit";
+  return false;
+}
+
+bool IrEqualVisitor::Visit(const ReduceMax* lhs, const Expr* other) {
+  std::cerr << "not impl: ReduceMax equal visit";
+  return false;
+}
+
 bool IrEqualVisitor::Visit(const Alloc* lhs, const Expr* other) {
   auto* rhs = other->As<Alloc>();
   return Compare(lhs->destination, rhs->destination) && Compare(lhs->extents, rhs->extents) &&

diff --git a/cinn/ir/ir_mutator.h b/cinn/ir/ir_mutator.h
index 90098e3b35..20c51621fc 100755
--- a/cinn/ir/ir_mutator.h
+++ b/cinn/ir/ir_mutator.h
@@ -69,6 +69,25 @@
 template <typename T>
 void IRMutator<T>::Visit(const FloatImm *expr, T op) {}
 template <typename T>
 void IRMutator<T>::Visit(const StringImm *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const LocalTemp *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const Sqrt *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const LoadIndex *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const BlockLoad *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const BlockStore *expr, T op) {}
+
+template <typename T>
+void IRMutator<T>::Visit(const ReduceMax *expr, T op) {}
+
 template <typename T>
 void IRMutator<T>::Visit(const Cast *expr, T op) {
   auto *node = op->template As<Cast>();

diff --git a/cinn/ir/ir_printer.cc b/cinn/ir/ir_printer.cc
index 6d2672a451..4fb36d3048 100644
--- a/cinn/ir/ir_printer.cc
+++ b/cinn/ir/ir_printer.cc
@@ -31,7 +31,9 @@ namespace ir {

 using common::float16;

-void IrPrinter::Print(Expr e) { IRVisitor::Visit(&e); }
+void IrPrinter::Print(Expr e) {
+  // std::cerr << "print here" << std::endl;
+  IRVisitor::Visit(&e);
+}
 void IrPrinter::Print(const std::vector<Expr> &exprs, const std::string &splitter) {
   for (int i = 0; !exprs.empty() && i < exprs.size() - 1; i++) {
     Print(exprs[i]);
@@ -126,7 +128,7 @@ void IrPrinter::Visit(const Max *x) {
   os_ << ")";
 }
 void IrPrinter::Visit(const Minus *x) {
-  os_ << "-(";
+  // NOTE: Minus is repurposed so that ir::Minus prints as expf(); the tests
+  // below use it to emit the exp op.
+  os_ << "expf(";
   Print(x->v());
   os_ << ")";
 }
@@ -250,8 +252,13 @@ void IrPrinter::Visit(const Call *x) {
   os_ << ")";
 }
 void IrPrinter::Visit(const Cast *x) {
-  os() << x->type();
-  os() << "(";
+  // os() << x->type();
+  // os() << "(";
+  // os() << x->v();
+  // os() << ")";
+
+  os() << "reinterpret_cast(";
+  os() << x->v();
   os() << ")";
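// Editor's sketch (annotation): the IRMutator overloads added above are empty,
// so rewrite passes neither recurse into nor rebuild the new nodes; this is
// consistent with optim::Optimize being disabled in module.cc later in this
// patch. A pass that must see through one of them would override the visit
// itself, roughly (hypothetical pass, usual CINN mutator idiom assumed):
//
//   struct MyPass : public ir::IRMutator<> {
//     void Visit(const ir::BlockLoad *op, Expr *expr) override {
//       auto *node = expr->As<ir::BlockLoad>();
//       ir::IRMutator<>::Visit(&node->input, &node->input);  // recurse into operand
//     }
//   };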
}

@@ -356,7 +363,10 @@ void IrPrinter::Visit(const _LoweredFunc_ *f) {
 }
 void IrPrinter::Visit(const Let *f) {
   CHECK(f->type().valid());
-  os() << f->type() << " ";
+  if( f->with_dtype )
+  {
+    os() << f->type() << " ";
+  }
   Print(f->symbol);
   if (f->body.defined()) {
     os() << " = ";
@@ -417,6 +427,47 @@ void IrPrinter::Visit(const Broadcast *x) {
   os() << ")";
 }

+void IrPrinter::Visit(const LocalTemp *x) {
+  os() << "\n";
+  os() << x->type() << " ";
+  Print(x->symbol);
+  for( size_t i = 0; i < x->local_size.size(); ++i)
+  {
+    os() << "[";
+    os() << x->local_size[i];
+    os() << "]";
+  }
+  os() << ";\n";
+}
+
+void IrPrinter::Visit(const Sqrt *x) {
+  os() << "sqrtf(";
+  Print(x->symbol);
+  os() << ");\n";
+}
+
+void IrPrinter::Visit(const LoadIndex *x) {
+  os() << "reduce range " << x->reduce_range.front() << "\t" << x->reduce_range.back() << "\n";
+  os() << "flatten range " << x->flatten_range.front() << "\t" << x->flatten_range.back() << "\n";
+  os() << "reduce " << x->reduce_block << "\t" << x->flatten_block << "\n";
+  os() << "expr " << x->index_expr << "\n";
+}
+
+void IrPrinter::Visit(const ReduceMax *x) {
+  os() << "reduce max, input " << x->input << "\t" << x->axis << "\n";
+}
+
+void IrPrinter::Visit(const BlockLoad *x) {
+  os() << x->input << "\n";
+  Visit( x->load_index.As<LoadIndex>() );
+}
+
+void IrPrinter::Visit(const BlockStore *x) {
+  os() << x->input << "\n";
+  Visit( x->load_index.As<LoadIndex>() );
+  Visit( x->value.As() );
+}
+
 void IrPrinter::Visit(const FracOp *x) {
   os() << "(";
   Print(x->a());

diff --git a/cinn/ir/layer_norm_fp16_test.cc b/cinn/ir/layer_norm_fp16_test.cc
new file mode 100644
index 0000000000..02fcbc487e
--- /dev/null
+++ b/cinn/ir/layer_norm_fp16_test.cc
@@ -0,0 +1,1301 @@
+
+#include "cinn/ir/ir_verify.h"
+
+#include <gtest/gtest.h>
+
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir.h"
+#include
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/backends/codegen_c_x86.h"
+#include "cinn/backends/codegen_cuda_dev.h"
+#include "cinn/backends/codegen_cuda_util.h"
+
+#include "cinn/backends/nvrtc/nvrtc_util.h"
+
+#include "cinn/runtime/cuda/cuda_module.h"
+#include "cinn/hlir/framework/op_lowering.h"
+#include "cinn/hlir/framework/pass.h"
+
+#include "cinn/frontend/net_builder.h"
+
+#include
+
+#include
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include
+#include
+#include
+#include
+
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/hlir/framework/visualize_helper.h"
+
+#include
+
+#define THREAD_PER_BLOCK 256
+#define WARP_SIZE 32
+
+namespace cinn {
+namespace ir {
+
+bool check( common::float16 *out, common::float16 *res, int n){
+  for(int i=0;i<n;i++){
+    if( abs( static_cast<float>(out[i]) - static_cast<float>(res[i]) ) > 1e-2 )
+      return false;
+  }
+  return true;
+}
+
+int reduce_axis( const std::vector<int>& first, const std::vector<int>& second)
+{
+  if( first[0] == 1 && second[0] != 1)
+  {
+    return 0;
+  }
+  if( first[1] != 1 && second[1] == 1)
+  {
+    return 1;
+  }
+  throw std::runtime_error("reduce_axis: error");
+}
+
+struct InputNode
+{
+  InputNode() {}
+  InputNode( std::string n, cinn::lang::Placeholder *p, std::vector<int> dim)
+      : name(n), in_ptr(p), in_dim(dim) {}
+  std::string name;
+  cinn::lang::Placeholder* in_ptr;
+  std::vector<int> in_dim;
+};
+
+void process_reduce_max( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev)
+{
+  std::string in_name;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      in_name = innode->id();
+    }
+  }
+
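// Editor's sketch (annotation, not a quote of real output): with warp_round = 1
// and thread_round = 4, the IR assembled below is expected to print roughly the
// following CUDA, assuming Max prints as cinn_max() and the unrolled For prints
// with "#pragma unroll":
//
//   float tmp_max[1];
//   #pragma unroll
//   for (int i = 0; i < 1; i += 1) {
//     float max1 = -100000;
//     #pragma unroll
//     for (int j = 0; j < 4; j += 1) {
//       max1 = cinn_max(max1, input[i][j]);   // "input" stands for the bound tensor
//     }
//     tmp_max[i] = warpReduceMax<128>(max1);
//   }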
std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + std::string temp_max_name = "tmp_max"; + Var temp_max_var( temp_max_name, type_of() ); + + int warp_round = 1; + int thread_round = 4; + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "warpReduceMax<128>", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1, 1}); + +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << "name " << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + // for( size_t i = 0;i < first_input.in_dim.size(); ++i) + // { + // std::cerr << first_input.in_dim[i] << std::endl; + // } + // std::cerr << "====================" << std::endl; + // for( size_t i = 0;i < second_input.in_dim.size(); ++i) + // { + // std::cerr << second_input.in_dim[i] << std::endl; + // } + + + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_mul_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto t2_load = 
ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder( temp_max_name, std::vector{{1, 8}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + (*input_map)[out_name] = InputNode( temp_max_name, sub, {1, 8}); + +} + + +void process_exp( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 4; + std::string temp_max_name = "exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("exp", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "exp", exp, {1, 8}); + +} + +void process_sqrt( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = 1; + int thread_round = 4; + std::string temp_max_name = "sqrt"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, 
thread_round}); + + std::string name1 = "sqrt"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Sqrt::Make( t_load); + + cinn::ir::IrPrinter printer(std::cout); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("sqrt", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + (*input_map)[out_name] = InputNode( "sqrt", exp, {1, 8}); + +} + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr zero(0.0); + std::string temp_max_name = in_name + "_tmp_sum"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + auto warp_call = Call::Make( Float(32), "BlockReduceSum", {sum1}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = 
inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + int broadcast_first = -1; + int broadcast_second = -1; + if( first_input.in_dim.size() == 1) + { + broadcast_first = 1; + } + + if( second_input.in_dim.size() == 1 ) + { + broadcast_second = 1; + } + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + Expr t_load; + + if( broadcast_first == -1){ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + else{ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) }); + } + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + }else if( broadcast_second != -1) { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)}); + } + else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( "divide", div, {1, 8}); + +} + +void process_add( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( 
second_input.in_dim.size() == 0) + { + is_scalar = true; + } + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + } else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Add::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( "add", div, {1, 8}); + +} + +void process_mul( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = 1; + int thread_round = 4; + Expr inf(-100000.0); + + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_mul_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + } else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Mul::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 
4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + code_dev->Print( temp_max_out); + code_dev->Print( max_outer_for); + code_dev->ss_ << std::endl; + + + // cinn::ir::IrPrinter printer(std::cout); + + // printer.Print( temp_max_out ); + // printer.Print( max_outer_for ); + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( temp_max_name, div, {1, 8}); + +} + + +void process_fillconstant( std::map* input_map, hlir::framework::Node* node, backends::CodeGenCUDA_Dev* code_dev) +{ + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << hlir::framework::DebugString(node) << std::endl; + auto* op = node->op(); + + auto value = absl::get(node->attrs.attr_store.at("value")); + + std::cerr << value << std::endl; + + auto dtype = absl::get(node->attrs.attr_store.at("value")); + + std::cerr << out_name << "\t" << dtype << std::endl; + + + + cinn::ir::Var out(out_name, type_of()); + + auto max_var = cinn::ir::Let::Make( out, Expr(value)); + + // cinn::ir::IrPrinter printer(std::cout); + + code_dev->Print( max_var ); + code_dev->ss_ << ";" << std::endl; + + // printer.Print( max_var); + + // std::cout << std::endl; + + (*input_map)[out_name] = InputNode( out_name, nullptr, {}); +} + +TEST(IrManul, basic) { + + + + frontend::NetBuilder net_builder("layer_norm"); + { + auto A = net_builder.CreateInput(Float(32), {128, 112, 112, 64}, "A"); + auto scale = net_builder.CreateInput( Float(32), {64}, "scale" ); + auto bias = net_builder.CreateInput( Float(32), {64}, "bias" ); + auto run_mean = net_builder.CreateInput(Float(32), {64}, "run_mean"); + auto run_var = net_builder.CreateInput( Float(32), {64}, "run_var" ); + auto num = net_builder.FillConstant( {1}, 768.0, "num" ); + auto eps = net_builder.FillConstant( {1}, 1e-5, "eps" ); + auto sum1 = net_builder.ReduceSum(A, {2}, true); + auto mean1 = net_builder.Divide( sum1, num); + auto power = net_builder.Multiply(A, A); + auto sum2 = net_builder.ReduceSum(power, {2}, true); + auto mean2 = net_builder.Divide( sum2, num); + auto mean_power = net_builder.Multiply( mean1, mean1); + + auto var = net_builder.Subtract(mean2, mean_power); + + auto sub = net_builder.Subtract( A, mean1); + auto t1 = net_builder.Add( var, eps); + auto t2 = net_builder.Sqrt( t1 ); + auto t3 = net_builder.Divide( sub, t2); + auto t5 = net_builder.Multiply( t3, scale); + auto out = net_builder.Add( t5, bias); + } + + auto program = net_builder.Build(); + auto target = common::DefaultTarget(); + + auto graph = std::make_shared(program, target); + + std::cerr << "len " << graph->fusion_groups.size() << std::endl; + + std::cerr << graph->DebugGroupedGraph() << std::endl; + + //auto group0 = graph->FusionGroupsToGroups()[0]; + + auto topo_order = graph->topological_order(); + auto& nodes = std::get<0>(topo_order); + + // add input data + int reduce_block = 1; + int flatten_block = 1024; + + int num_warp = 4; + int num_thread_per_warp = 32; + int 
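// Editor's note on the index expressions that follow (derived from the constants
// in this test): each block owns one 768-wide row, threadIdx.x splits into
// warp_id = threadIdx.x / 32 and lane = threadIdx.x % 32, and with 4 elements
// per thread a warp covers 4 * 32 = 128 consecutive columns:
//
//   inner_index = blockIdx.x * 768 + warp_id * 128 + lane + j * 32
//
// so at each j-step the 32 lanes read 32 consecutive values (coalesced), and the
// 6 active warps cover 6 * 128 = 768 columns; threads with threadIdx.x * 4 >= 768
// are masked off by the LT(threadIdx.x * 4, 768) guard built further down.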
element_per_thread = 8; + + + Var block_id( "blockIdx.x", type_of() ); + Var flatten_id( "xid", type_of() ); + Var r_id( "rid", type_of() ); + Expr expr_flatten( flatten_block); + Expr expr_reduce( reduce_block); + + Var threadidx("threadIdx.x", type_of()); + Var index_i("i", type_of() ); + Var index_j("j", type_of() ); + Expr expr_warp( num_warp); + Expr expr_thread_per_warp( num_thread_per_warp ); + Expr expr_element_per_thread( element_per_thread ); + + auto warp_id = threadidx / expr_thread_per_warp; + + auto xid = warp_id * Expr(1); + auto inner_id = threadidx % expr_thread_per_warp; + auto inner_index = block_id *Expr(768) + xid * Expr(4) * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp; + + auto inner_index2 = xid * Expr(4) * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp; + + // block reduce + auto warp_round = 1; + auto thread_round = 4; + + std::string temp_name = "tmp"; + Var temp_var( temp_name, type_of() ); + auto temp_out = LocalTemp::Make( temp_var, {warp_round, thread_round}); + + Var loop_var("i"); + + cinn::lang::Placeholder C("d_in", std::vector{{10, 10}}); + cinn::lang::Placeholder T("tmp", std::vector{{1,4}}); + cinn::lang::Placeholder scale("scale", std::vector{{10, 10}}); + cinn::lang::Placeholder scale_tmp("scale_tmp", std::vector{{1,4}}); + cinn::lang::Placeholder bias("bias", std::vector{{10, 10}}); + cinn::lang::Placeholder bias_tmp("bias_tmp", std::vector{{1,4}}); + //Placeholder A("A", std::vector{{10}}); + //Var input( "input", type_of( )); + Var loop_var_j("j"); + + backends::CodeGenCUDA_Dev cu_dev(target); + + auto t_load = ir::Load::Make( ir::Tensor(C), { inner_index }); + Expr init_body = Store::Make( ir::Tensor(T), Expr(0.0), {Expr(0), Expr(loop_var_j)}); + Expr body = Store::Make( ir::Tensor(T), t_load, {Expr(0), Expr(loop_var_j)}); + + auto cond = ir::LT::Make( threadidx * Expr(4), Expr( 768) ); + auto filter = ir::IfThenElse::Make( cond, body, Expr()); + body = ir::Block::Make({init_body, filter}); + + + std::string head = R"ROC( + +#include + + +template +__device__ __forceinline__ float warpReduceSum(float sum) { + if (blockSize >= 32)sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum += __shfl_down_sync(0xffffffff, sum, 8);// 0-8, 1-9, 2-10, etc. + if (blockSize >= 8)sum += __shfl_down_sync(0xffffffff, sum, 4);// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum += __shfl_down_sync(0xffffffff, sum, 2);// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum += __shfl_down_sync(0xffffffff, sum, 1);// 0-1, 2-3, 4-5, etc. + return sum; +} + +const int WARP_SIZE = 32; +__device__ __forceinline__ float BlockReduceSum(float sum) { + + // Shared mem for partial sums (one per warp in the block) + static __shared__ float warpLevelSums[WARP_SIZE]; + const int laneId = threadIdx.x % WARP_SIZE; + const int warpId = threadIdx.x / WARP_SIZE; + + sum = warpReduceSum<32>(sum); + + if( laneId == 0 ) warpLevelSums[warpId] = sum; + __syncthreads(); + + float final_sum = 0.0; + #pragma unroll + for( size_t i = 0; i < 6; ++i) + { + final_sum += warpLevelSums[i]; + } + + if (threadIdx.x == 0) warpLevelSums[0] = final_sum; + __syncthreads(); + return warpLevelSums[0]; + +} + + +template +__device__ __forceinline__ float warpReduce(float sum) { + if (blockSize >= 32)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16) ); // 0-16, 1-17, 2-18, etc. + if (blockSize >= 16)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8) );// 0-8, 1-9, 2-10, etc. 
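// (editor's note) Each __shfl_down_sync step folds the upper half of the active
// lanes onto the lower half: strides 16, 8, 4, 2, 1 reduce a 32-lane warp in
// log2(32) = 5 steps, leaving the warp-wide maximum in lane 0. The ladder above
// (warpReduceSum) is the same pattern with += in place of max().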
+ if (blockSize >= 8)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4) );// 0-4, 1-5, 2-6, etc. + if (blockSize >= 4)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2) );// 0-2, 1-3, 4-6, 5-7, etc. + if (blockSize >= 2)sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1) );// 0-1, 2-3, 4-5, etc. + return sum; +} + + + +extern "C" { + +__global__ void ln_test(half *d_in, half* scale, half* bias, half *d_out ) { +)ROC"; + + cu_dev.ss_ << head << std::endl; + cu_dev.Print( temp_out ); + cu_dev.ss_ << "\n"; + + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + //printer.Print( load_inner_for ); + + std::cerr << "------------------------------\n"; + + //t_load = ir::Load::Make( ) + + cinn::ir::IrPrinter printer(std::cout); + //printer.Print( load_inner_for ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + t_load = ir::Load::Make( ir::Tensor(scale), { inner_index2 }); + + body = Store::Make( ir::Tensor(scale_tmp ), t_load, {Expr(0), Expr(loop_var_j)}); + + //body = ir::Block::Make({body}); + cond = ir::LT::Make( inner_index2, Expr( 768) ); + filter = ir::IfThenElse::Make( cond, body, Expr()); + + body = ir::Block::Make( {filter}); + + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + temp_name = "scale_tmp"; + Var scale_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( scale_temp_var, {warp_round, thread_round}); + cu_dev.Print( temp_out ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + t_load = ir::Load::Make( ir::Tensor(bias), { inner_index2 }); + + body = Store::Make( ir::Tensor(bias_tmp ), t_load, {Expr(0), Expr(loop_var_j)}); + + //body = ir::Block::Make({body}); + + cond = ir::LT::Make( inner_index2, Expr( 768) ); + filter = ir::IfThenElse::Make( cond, body, Expr()); + + body = ir::Block::Make( {filter}); + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + //printer.Print( load_inner_for ); + temp_name = "bias_tmp"; + Var bias_temp_var( temp_name, type_of() ); + temp_out = LocalTemp::Make( bias_temp_var, {warp_round, thread_round}); + cu_dev.Print( temp_out ); + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << std::endl; + + + std::map map_input; + + + //std::cerr << cu_dev.ss_.str() << std::endl; + //std::cerr << "=======" << std::endl; + map_input["A"] = InputNode( "A", &T, {1, 8}); + map_input["scale"] = InputNode("scale", &scale_tmp, {1, 8}); + map_input["bias"] = InputNode("bias", &bias_tmp, {1, 8}); + + for (auto* n : nodes) { + + auto node = n->safe_as(); + if (!node || node->op() == nullptr) { + continue; + } + std::cerr << node->op()->name << std::endl; + + if( node->op()->name == "reduce_max") + { + process_reduce_max( &map_input, node, &cu_dev); + }else if ( node->op()->name == "subtract" ) + { + process_sub( &map_input, node, &cu_dev); + }else if ( node->op()->name == "exp" ) + { + process_exp( &map_input, node, &cu_dev); + }else if ( node->op()->name == "reduce_sum" ) + { + process_reduce_sum( &map_input, node, &cu_dev); + }else if ( node->op()->name == "divide" ) + { + process_divide( &map_input, node, &cu_dev); + }else if ( node->op()->name == "elementwise_mul" ) + { + process_mul( &map_input, node, &cu_dev); + }else if ( node->op()->name == 
"elementwise_add" ) + { + process_add( &map_input, node, &cu_dev); + } + else if ( node->op()->name == "fill_constant" ) + { + process_fillconstant( &map_input, node, &cu_dev); + } + else if ( node->op()->name == "sqrt" ) + { + process_sqrt( &map_input, node, &cu_dev); + } + else{ + throw std::runtime_error("not support op"); + } + + } + + + +// // name var_4 is output + auto var_out = map_input.at( "var_18"); + + t_load = ir::Load::Make( ir::Tensor( *(var_out.in_ptr) ), { Expr(0), Expr(loop_var_j) }); + t_load = ir::Cast::Make( common::Type( common::Type::type_t::UInt, 1, 2), t_load); + //t_load = ir::Load::Make( ir::Tensor( T), { Expr(loop_var), Expr(loop_var_j) }); + cinn::lang::Placeholder OUT("d_out", std::vector{{10}}); + + // Expr num1(128); + // Expr num2( 32 ); + // Expr block_step( 1024); + // Expr parallel_size(4); + // auto index_var2 = block_x_var * block_step + thread_x_var / num2 * num1 + thread_x_var % num2; + auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( inner_index ) }); + + +// //auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( index_var2 + loop_var_j * num2 ) }); + + //body = ir::Block::Make( {out_store }); + + cond = ir::LT::Make(threadidx * Expr(4), Expr( 768) ); + filter = ir::IfThenElse::Make( cond, out_store, Expr()); + body = ir::Block::Make({filter}); + load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(4), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + // body = ir::Block::Make( {load_inner_for}); + + + + // auto out_store_for = ir::For::Make(loop_var, + // common::make_const(0), + // common::make_const(warp_round), + // ir::ForType::Unrolled, + // ir::DeviceAPI::CUDA, + // body); + + // printer.Print( out_store_for ); + +// cu_dev.Print( out_store_for ); + + + +// // std::cerr << std::endl; + + + cu_dev.Print( load_inner_for ); + cu_dev.ss_ << "} \n }" << std::endl; + + std::cerr << cu_dev.ss_.str() << std::endl; + + + auto source_code = cu_dev.ss_.str(); + + backends::nvrtc::Compiler compiler; + + auto ptx = compiler(source_code); + + std::cerr << "source code" << source_code << std::endl; + + const int N= 128 * 128 * 768; + common::float16 *a=(common::float16 *)malloc(N*sizeof(common::float16)); + common::float16 *d_a; + cudaMalloc((void **)&d_a,N*sizeof(common::float16)); + + const int channel = 768; + + common::float16 *p_scale=(common::float16 *)malloc(channel*sizeof(common::float16)); + common::float16 *d_scale; + cudaMalloc((void **)&d_scale,channel*sizeof(common::float16)); + + common::float16 *p_bias=(common::float16 *)malloc(channel*sizeof(common::float16)); + common::float16 *d_bias; + cudaMalloc((void **)&d_bias,channel*sizeof(common::float16)); + + const int num_warps = 8; + const int block_num = 128 * 128; + const int NUM_PER_BLOCK = N / block_num; + const int NUM_PER_THREAD = NUM_PER_BLOCK/THREAD_PER_BLOCK; + common::float16 *out=( common::float16 *)malloc(N *sizeof(common::float16)); + float *d_out; + + int M = N; + cudaMalloc((void **)&d_out, M *sizeof(common::float16)); + common::float16 *res=(common::float16 *)malloc( M *sizeof(common::float16)); + + srand(0); + for(int i=0;i( rand() % 100 / 100.0 ); + } + + for( int i = 0; i < channel; ++i) + { + p_scale[i] = static_cast( rand() % 100 / 100.0 ); + p_bias[i] = static_cast( rand() % 100 / 100.0 ); + } + + for(int i=0;i< 128 * 128;i++){ + float sum = 0.0; + for( int k = 0; k < 768; ++k){ + sum += static_cast( a[i * 768 +k]); + } + auto mean = sum / 768; + float sum_var = 0.0; + for( int k = 0; k < 768; ++k ) + 
{ + auto t = static_cast( a[i * 768 +k]) - mean; + sum_var += t * t; + } + auto var_mean = sum_var / 768; + auto t2 = sqrt( var_mean + 1e-5); + + for( int k = 0; k < 768; ++k ) + { + auto t = ( static_cast( a[i * 768 +k ] ) - mean) / t2; + + res[ i * 768 + k ] = static_cast(t * static_cast(p_scale[k]) + static_cast(p_bias[k]) ); + } + + + } + std::cerr << "bias " << p_bias[0] << std::endl; + std::cerr << "before copy" << std::endl; + cudaMemcpy(d_a,a,N*sizeof(common::float16),cudaMemcpyHostToDevice); + cudaMemcpy(d_scale, p_scale,channel *sizeof( common::float16),cudaMemcpyHostToDevice); + cudaMemcpy(d_bias, p_bias, channel *sizeof( common::float16),cudaMemcpyHostToDevice); + + dim3 Grid( block_num, 1, 1); + dim3 Block( THREAD_PER_BLOCK, 1, 1); + + void* args[] = {&d_a, &d_scale, &d_bias, &d_out }; + + cinn::runtime::cuda::CUDAModule cuda_module(ptx, cinn::runtime::cuda::CUDAModule::Kind::CUBIN); + + for ( int i = 0; i < 1000; ++i) + { + cuda_module.LaunchKernel(0, "ln_test", Grid, Block, args); + } + + std::cerr << "before copy" << std::endl; + cudaMemcpy(out,d_out, M *sizeof( common::float16),cudaMemcpyDeviceToHost); + + for( size_t i = 0;i < 32 ; ++i) + { + std::cerr << out[i] << "\t"; + } + std::cerr << std::endl; + + for( size_t i = 0;i < 32 ; ++i) + { + std::cerr << res[i] << "\t"; + } + std::cerr << std::endl; + + if(check(out,res,M))printf("the ans is right\n"); + else{ + printf("the ans is wrong\n"); + for(int i=0;i< M;i++){ + // printf("%lf ",out[i]); + if( abs( static_cast( out[i] ) - static_cast( res[i] ) ) > 1e-2 ){ + std::cout << i << "\t" << out[i] << "\t" << res[i] << std::endl; + break; + } + } + printf("\n"); + } + + cudaFree(d_a); + cudaFree(d_out); + + } + + + + + +} + +} // namespace cinn::ir + + diff --git a/cinn/ir/module.cc b/cinn/ir/module.cc index fc94b52805..dedc046915 100644 --- a/cinn/ir/module.cc +++ b/cinn/ir/module.cc @@ -26,7 +26,7 @@ void Module::Builder::AddFunction(ir::LoweredFunc func) { optim::Simplify(&(func->body)); optim::SimplifyForLoops(&(func->body)); optim::SimplifyBlocks(&(func->body)); - func->body = optim::Optimize(func->body, module_->target); + //func->body = optim::Optimize(func->body, module_->target); module_->functions.push_back(func); } @@ -55,7 +55,8 @@ Module Module::Builder::Build() { auto res = ir::Module(module_.get()); - return optim::Optimize(res, module_->target); + return res; + //return optim::Optimize(res, module_->target); } ir::_Module_ *Module::self() { return p_->as(); } diff --git a/cinn/ir/thread_model.cc b/cinn/ir/thread_model.cc new file mode 100644 index 0000000000..c169c22bb1 --- /dev/null +++ b/cinn/ir/thread_model.cc @@ -0,0 +1,1469 @@ + + +#include "cinn/ir/ir_verify.h" + +#include "cinn/ir/ir_operators.h" +#include "cinn/ir/ir_printer.h" +#include "cinn/ir/ir.h" +#include +#include "cinn/ir/tensor.h" +#include "cinn/lang/placeholder.h" + +#include "cinn/backends/codegen_c_x86.h" +#include "cinn/backends/codegen_cuda_dev.h" +#include "cinn/backends/codegen_cuda_util.h" + +#include "cinn/backends/nvrtc/nvrtc_util.h" + +#include "cinn/runtime/cuda/cuda_module.h" +#include "cinn/hlir/framework/op_lowering.h" +#include "cinn/hlir/framework/pass.h" + +#include "cinn/frontend/net_builder.h" + +#include + +#include +#include +#include +#include + +#include "cinn/lang/placeholder.h" + +#include "cinn/hlir/framework/visualize_helper.h" + +#include + +#include "cinn/ir/thread_model.h" +#include "cinn/hlir/framework/op_lowering_util.h" +#include "cinn/common/type.h" + +namespace cinn { +namespace ir { + +struct InputNode 
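// Editor's note: thread_model.cc lifts the per-op emitters from the test above
// into a reusable form; each process_* appends its LocalTemp declaration and
// loop nest to a caller-supplied vector instead of printing directly into a
// CodeGenCUDA_Dev, and the 1x4 tiling becomes a ThreadConfig parameter.
// Illustrative call shape (names assumed from the signatures below):
//
//   std::vector<ir::Expr> stmts;
//   ThreadConfig cfg{/*warp_round=*/1, /*thread_round=*/4};
//   process_reduce_max(&input_map, node, stmts, cfg);
//   for (auto& e : stmts) code_dev.Print(e);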
+{ + InputNode() {} + InputNode( std::string n, cinn::lang::Placeholder *p, std::vector dim) + : name(n), in_ptr(p), in_dim(dim) {} + std::string name; + cinn::lang::Placeholder* in_ptr; + std::vector in_dim; +}; + +struct ThreadConfig +{ + int warp_round; + int thread_round; +}; + +bool is_power2(int n) { + if(n <= 0) + return false; + return (n&(n-1)) == 0; +} + +void process_reduce_max( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << in_name << "\t" << input_map->count( in_name ) << std::endl; + InputNode& input_node = input_map->at(in_name); + std::cerr << in_name << " found" << std::endl; + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + std::string temp_max_name = "tmp_max"; + Var temp_max_var( temp_max_name, type_of() ); + + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + std::cerr << in_name << "found #1" << std::endl; + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + std::cerr << in_name << "found #3" << std::endl; + auto max_var = cinn::ir::Let::Make( max_t, inf); + std::cerr << in_name << "found #5" << std::endl; + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + std::cerr << in_name << "found 1" << std::endl; + auto new_max = ir::Max::Make( max_t, t_load); + auto out_max = ir::Let::Make( max_t, new_max, false); + + + + auto body = ir::Block::Make( {out_max }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + std::cerr << in_name << "found 2" << std::endl; + auto warp_call = Call::Make( Float(32), "warpReduceMax", {max_t}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + // for test here, memory leak + cinn::lang::Placeholder* T_MAX = new cinn::lang::Placeholder("tmp_max", std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_MAX), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + std::cerr << "inert max " << std::endl; + (*input_map)[out_name] = InputNode( "reduce_max", T_MAX, {1}); + + std::cerr << "inert max fin" << std::endl; +} + + +void process_sub( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << "name " << vec_in_names[0] << 
"\t" << vec_in_names[1] << std::endl; + + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + + bool first_is_scalar = false; + if( first_input.in_dim.size() == 0) + { + is_scalar = true; + } + + int broadcast_first = -1; + int broadcast_second = -1; + if( first_input.in_dim.size() == 1) + { + broadcast_first = 1; + } + + if( second_input.in_dim.size() == 1 ) + { + broadcast_second = 1; + } + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + Expr inf(-100000.0); + + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_mul_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sub_tmp"; + cinn::ir::Var max_t(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( max_t, inf); + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + if ( first_is_scalar ) + { + t_load = Var( first_input.name, type_of()); + } + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + }else if( broadcast_second != -1) { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)}); + } + else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + + + auto out = ir::Sub::Make( t_load, t2_load); + + cinn::lang::Placeholder* sub = new cinn::lang::Placeholder( temp_max_name, std::vector{{1, 8}}); + auto sub_store = Store::Make( ir::Tensor(*sub), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + + (*input_map)[out_name] = InputNode( temp_max_name, sub, {1, 8}); + +} + +void process_scale( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + float scale = absl::get(node->attrs.attr_store.at("scale")); + float bias = absl::get(node->attrs.attr_store.at("bias")); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + std::string temp_max_name = in_name + "_tmp_scale"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "max1"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( 
ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Mul::Make( t_load, Expr( scale )); + out = ir::Add::Make( out, Expr( bias )); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + (*input_map)[out_name] = InputNode( "scale", exp, {1, 8}); + +} + +void process_exp( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + std::string temp_max_name = in_name + "_tmp_exp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Minus::Make( t_load); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + (*input_map)[out_name] = InputNode( temp_max_name, exp, {1, 8}); + +} + +void process_sqrt( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& first_input = input_map->at( in_name); + + Var loop_var("i"); + Var loop_var_j("j"); + Expr inf(-100000.0); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + std::string temp_max_name = "sqrt"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = 
LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + std::string name1 = "sqrt"; + cinn::ir::Var max_t(name1, type_of()); + + + auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + auto out = ir::Sqrt::Make( t_load); + + cinn::ir::IrPrinter printer(std::cout); + + cinn::lang::Placeholder* exp = new cinn::lang::Placeholder("sqrt", std::vector{{1, 4}}); + auto exp_store = Store::Make( ir::Tensor(*exp), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { exp_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + (*input_map)[out_name] = InputNode( "sqrt", exp, {1, 8}); + +} + +void process_reduce_sum( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config, + const CodeGenOption& gen_opt ) +{ + std::string in_name; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + in_name = innode->id(); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + InputNode& input_node = input_map->at(in_name); + + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + Expr zero(0.0); + std::string temp_max_name = in_name + "_tmp_sum"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round}); + + std::string name1 = "sum1"; + cinn::ir::Var sum1(name1, type_of()); + + auto max_var = cinn::ir::Let::Make( sum1, zero); + + auto t_load = ir::Load::Make( ir::Tensor( *(input_node.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + + + auto new_sum = ir::Add::Make( sum1, t_load); + auto out_sum = ir::Let::Make( sum1, new_sum, false); + + + + auto body = ir::Block::Make( {out_sum }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + std::string reduce_func_name; + if( gen_opt.reduce_block <= 128 ) + { + reduce_func_name = "warpReduceSum"; + } + else + { + reduce_func_name = "BlockReduceSum"; + } + + auto warp_call = Call::Make( Float(32), reduce_func_name, {sum1}, {}, ir::CallType::Extern ); + + //auto warp_res = ir::Let::Make( max_t, warp_call, false); + + cinn::lang::Placeholder* T_SUM = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto max_store = Store::Make( ir::Tensor(*T_SUM), warp_call, {Expr(loop_var)}); + + + body = ir::Block::Make( {max_var, load_inner_for, max_store}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + (*input_map)[out_name] = InputNode( "reduce_sum", T_SUM, {1}); + +} + + +void process_divide( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, 
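// Editor's note on the reduce_func_name choice a few lines up: with the
// 4-elements-per-thread tiling used here, one 32-lane warp already covers
// 4 * 32 = 128 reduction elements, so reduce_block <= 128 can be finished with
// the shuffle-only warpReduceSum; longer rows need partial sums from several
// warps, hence the shared-memory BlockReduceSum.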
ThreadConfig& thread_config) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = false; + if( second_input.in_dim.size() == 0) + { + is_scalar = true; + } + //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim); + + int broadcast_first = -1; + int broadcast_second = -1; + if( first_input.in_dim.size() == 1) + { + broadcast_first = 1; + } + + if( second_input.in_dim.size() == 1 ) + { + broadcast_second = 1; + } + + Var loop_var("i"); + Var loop_var_j("j"); + int warp_round = thread_config.warp_round; + int thread_round = thread_config.thread_round; + Expr inf(-100000.0); + std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_div_tmp"; + Var temp_max_var( temp_max_name, type_of() ); + auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round}); + + Expr t_load; + + if( broadcast_first == -1){ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + else{ + t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) }); + } + + + Expr t2_load; + + if( is_scalar ) + { + t2_load = Var( second_input.name, type_of()); + }else if( broadcast_second != -1) { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)}); + } + else + { + t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) }); + } + std::cerr << "t2 load " << t2_load << std::endl; + + auto out = ir::Div::Make( t_load, t2_load); + + cinn::lang::Placeholder* div = new cinn::lang::Placeholder(temp_max_name, std::vector{{1, 4}}); + auto sub_store = Store::Make( ir::Tensor(*div), out, {Expr(loop_var), Expr(loop_var_j)}); + + + auto body = ir::Block::Make( { sub_store }); + auto load_inner_for = ir::For::Make(loop_var_j, + common::make_const(0), + common::make_const(thread_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + + + + + body = ir::Block::Make( {load_inner_for}); + + auto max_outer_for = ir::For::Make(loop_var, + common::make_const(0), + common::make_const(warp_round), + ir::ForType::Unrolled, + ir::DeviceAPI::CUDA, + body); + + vec_out.push_back( temp_max_out ); + vec_out.push_back( max_outer_for ); + + (*input_map)[out_name] = InputNode( "divide", div, {1, 8}); + +} + + +void process_greater_equal( std::map* input_map, hlir::framework::Node* node, std::vector& vec_out, ThreadConfig& thread_config) +{ + std::vector vec_in_names; + for (auto& inlink : node->inlinks_in_order(true)) { + auto* innode = inlink->source()->safe_as(); + if (innode) { + vec_in_names.push_back( innode->id() ); + } + } + + std::string out_name; + for (auto& outlink : node->outlinks_in_order(true)) { + auto* outnode = outlink->sink()->safe_as(); + if (outnode) { + out_name = outnode->id(); + } + } + + std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl; + InputNode& first_input = input_map->at( vec_in_names[0]); + InputNode& second_input = input_map->at( vec_in_names[1]); + + bool is_scalar = 
+
+void process_greater_equal( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                            ThreadConfig& thread_config)
+{
+  std::vector<std::string> vec_in_names;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      vec_in_names.push_back( innode->id() );
+    }
+  }
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl;
+  InputNode& first_input  = input_map->at( vec_in_names[0]);
+  InputNode& second_input = input_map->at( vec_in_names[1]);
+
+  bool is_scalar = false;
+  if( second_input.in_dim.size() == 0)
+  {
+    is_scalar = true;
+  }
+  //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim);
+
+  int broadcast_first  = -1;
+  int broadcast_second = -1;
+  if( first_input.in_dim.size() == 1)
+  {
+    broadcast_first = 1;
+  }
+
+  if( second_input.in_dim.size() == 1 )
+  {
+    broadcast_second = 1;
+  }
+
+  Var loop_var("i");
+  Var loop_var_j("j");
+  int warp_round   = thread_config.warp_round;
+  int thread_round = thread_config.thread_round;
+  Expr inf(-100000.0);
+  std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_ge_tmp";
+  Var temp_max_var( temp_max_name, type_of<float>() );
+  auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round});
+
+  Expr t_load;
+  if( broadcast_first == -1){
+    t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+  }
+  else{
+    t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var) });
+  }
+
+  Expr t2_load;
+  if( is_scalar )
+  {
+    t2_load = Var( second_input.name, type_of<float>());
+  }else if( broadcast_second != -1) {
+    t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var)});
+  }
+  else
+  {
+    t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+  }
+  std::cerr << "t2 load " << t2_load << std::endl;
+
+  // greater_equal lowers to GE on the two operands.
+  auto out = ir::GE::Make( t_load, t2_load);
+
+  cinn::lang::Placeholder<float>* cmp = new cinn::lang::Placeholder<float>(temp_max_name, std::vector<int>{{1, 4}});
+  auto sub_store = Store::Make( ir::Tensor(*cmp), out, {Expr(loop_var), Expr(loop_var_j)});
+
+  auto body = ir::Block::Make( { sub_store });
+  auto load_inner_for = ir::For::Make(loop_var_j,
+                                      common::make_const(0),
+                                      common::make_const(thread_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  body = ir::Block::Make( {load_inner_for});
+
+  auto max_outer_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  vec_out.push_back( temp_max_out );
+  vec_out.push_back( max_outer_for );
+
+  (*input_map)[out_name] = InputNode( "greater_equal", cmp, {1, 8});
+}
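+
+// Sketch of the statement this yields (names illustrative, second operand
+// broadcast along j):
+//   a_b_ge_tmp[i][j] = (a_tmp[i][j] >= b_tmp[i]);
+// The comparison result lives in the same float temp layout as the other ops.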
+
+void process_add( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                  ThreadConfig& thread_config)
+{
+  std::vector<std::string> vec_in_names;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      vec_in_names.push_back( innode->id() );
+    }
+  }
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl;
+  InputNode& first_input  = input_map->at( vec_in_names[0]);
+  InputNode& second_input = input_map->at( vec_in_names[1]);
+
+  bool is_scalar = false;
+  if( second_input.in_dim.size() == 0)
+  {
+    is_scalar = true;
+  }
+
+  Var loop_var("i");
+  Var loop_var_j("j");
+  int warp_round   = thread_config.warp_round;
+  int thread_round = thread_config.thread_round;
+  Expr inf(-100000.0);
+  std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_add_tmp";
+  Var temp_max_var( temp_max_name, type_of<float>() );
+  auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round});
+
+  auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+
+  Expr t2_load;
+  if( is_scalar )
+  {
+    t2_load = Var( second_input.name, type_of<float>());
+  } else
+  {
+    t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+  }
+  //std::cerr << "t2 load " << t2_load << std::endl;
+
+  auto out = ir::Add::Make( t_load, t2_load);
+
+  cinn::lang::Placeholder<float>* add_out = new cinn::lang::Placeholder<float>(temp_max_name, std::vector<int>{{1, 4}});
+  auto sub_store = Store::Make( ir::Tensor(*add_out), out, {Expr(loop_var), Expr(loop_var_j)});
+
+  auto body = ir::Block::Make( { sub_store });
+  auto load_inner_for = ir::For::Make(loop_var_j,
+                                      common::make_const(0),
+                                      common::make_const(thread_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  body = ir::Block::Make( {load_inner_for});
+
+  auto max_outer_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  vec_out.push_back( temp_max_out );
+  vec_out.push_back( max_outer_for );
+
+  (*input_map)[out_name] = InputNode( "add", add_out, {1, 8});
+}
+
+void process_cast( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                   ThreadConfig& thread_config)
+{
+  std::string in_name;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      in_name = innode->id();
+    }
+  }
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  // cast emits no IR: the output name simply aliases the input's temp buffer.
+  (*input_map)[out_name] = input_map->at( in_name );
+}
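+
+// Treating cast as a pure alias is only safe under this pipeline's assumption
+// that every temp is materialized as float. E.g. for  y = cast(x)  the map
+// afterwards satisfies
+//   input_map->at("y").in_ptr == input_map->at("x").in_ptr
+// and downstream ops keep reading x's buffer.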
+
+void process_mul( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                  ThreadConfig& thread_config)
+{
+  std::vector<std::string> vec_in_names;
+  for (auto& inlink : node->inlinks_in_order(true)) {
+    auto* innode = inlink->source()->safe_as<hlir::framework::NodeData>();
+    if (innode) {
+      vec_in_names.push_back( innode->id() );
+    }
+  }
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  //std::cerr << vec_in_names[0] << "\t" << vec_in_names[1] << std::endl;
+  InputNode& first_input  = input_map->at( vec_in_names[0]);
+  InputNode& second_input = input_map->at( vec_in_names[1]);
+
+  bool is_scalar = false;
+  if( second_input.in_dim.size() == 0)
+  {
+    is_scalar = true;
+  }
+  //int broadcast_axis = reduce_axis( first_input.in_dim, second_input.in_dim);
+
+  Var loop_var("i");
+  Var loop_var_j("j");
+  int warp_round   = thread_config.warp_round;
+  int thread_round = thread_config.thread_round;
+  Expr inf(-100000.0);
+
+  std::string temp_max_name = vec_in_names[0] + "_" + vec_in_names[1] + "_mul_tmp";
+  Var temp_max_var( temp_max_name, type_of<float>() );
+  auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round});
+
+  auto t_load = ir::Load::Make( ir::Tensor( *(first_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+
+  Expr t2_load;
+  if( is_scalar )
+  {
+    t2_load = Var( second_input.name, type_of<float>());
+  } else
+  {
+    t2_load = ir::Load::Make( ir::Tensor( *(second_input.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+  }
+  // std::cerr << "t2 load " << t2_load << std::endl;
+
+  auto out = ir::Mul::Make( t_load, t2_load);
+
+  cinn::lang::Placeholder<float>* mul_out = new cinn::lang::Placeholder<float>(temp_max_name, std::vector<int>{{1, 4}});
+  auto sub_store = Store::Make( ir::Tensor(*mul_out), out, {Expr(loop_var), Expr(loop_var_j)});
+
+  auto body = ir::Block::Make( { sub_store });
+  auto load_inner_for = ir::For::Make(loop_var_j,
+                                      common::make_const(0),
+                                      common::make_const(thread_round),
+                                      ir::ForType::Unrolled,
+                                      ir::DeviceAPI::CUDA,
+                                      body);
+
+  body = ir::Block::Make( {load_inner_for});
+
+  auto max_outer_for = ir::For::Make(loop_var,
+                                     common::make_const(0),
+                                     common::make_const(warp_round),
+                                     ir::ForType::Unrolled,
+                                     ir::DeviceAPI::CUDA,
+                                     body);
+
+  vec_out.push_back( temp_max_out );
+  vec_out.push_back( max_outer_for );
+
+  (*input_map)[out_name] = InputNode( temp_max_name, mul_out, {1, 8});
+}
+
+void process_fillconstant( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                           ThreadConfig& thread_config)
+{
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  //std::cerr << hlir::framework::DebugString(node) << std::endl;
+  auto* op = node->op();
+
+  // The "value" attribute may arrive as float or double depending on the frontend.
+  float value = 0.0f;
+  try{
+    value = absl::get<float>(node->attrs.attr_store.at("value"));
+  }
+  catch(...){
+    value = absl::get<double>(node->attrs.attr_store.at("value"));
+  }
+
+  //std::cerr << value << std::endl;
+
+  //auto dtype = absl::get(node->attrs.attr_store.at("value"));
+  // std::cerr << out_name << "\t" << dtype << std::endl;
+
+  cinn::ir::Var out(out_name, type_of<float>());
+
+  auto max_var = cinn::ir::Let::Make( out, Expr(value));
+
+  vec_out.push_back( max_var );
+
+  // No buffer and no dims: downstream ops treat this output as a scalar.
+  (*input_map)[out_name] = InputNode( out_name, nullptr, {});
+}
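+
+// fill_constant therefore collapses to a single scalar definition in the
+// kernel body, e.g. for value = 1.0f and an output named var_1 (assumed names):
+//   float var_1 = 1.0f;
+// Later ops pick it up through the scalar path (is_scalar == true).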
+
+void process_uniform( std::map<std::string, InputNode>* input_map, hlir::framework::Node* node, std::vector<ir::Expr>& vec_out,
+                      ThreadConfig& thread_config)
+{
+  //std::cerr << hlir::framework::DebugString(node) << std::endl;
+  auto* op = node->op();
+
+  // auto shape = absl::get<std::vector<int>>(node->attrs.attr_store.at("shape"));
+  // auto min   = absl::get<float>(node->attrs.attr_store.at("min"));
+  // auto max   = absl::get<float>(node->attrs.attr_store.at("max"));
+
+  std::string out_name;
+  for (auto& outlink : node->outlinks_in_order(true)) {
+    auto* outnode = outlink->sink()->safe_as<hlir::framework::NodeData>();
+    if (outnode) {
+      out_name = outnode->id();
+    }
+  }
+
+  Var loop_var("i");
+  Var loop_var_j("j");
+  int warp_round   = thread_config.warp_round;
+  int thread_round = thread_config.thread_round;
+  Expr inf(-100000.0);
+
+  std::string temp_max_name = out_name + "_random_tmp";
+  Var temp_max_var( temp_max_name, type_of<float>() );
+  auto temp_max_out = LocalTemp::Make( temp_max_var, {warp_round, thread_round});
+
+  cinn::lang::Placeholder<float>* rand_out = new cinn::lang::Placeholder<float>(temp_max_name, std::vector<int>{{1, 4}});
+
+  // Textual trick: a Var literally named "<temp>_random_tmp[0]" prints as the
+  // first row of the local array, which decays to the float* expected by the
+  // extern uniform_random call.
+  std::string temp_max_name_slice = out_name + "_random_tmp[0]";
+  Var temp_max_var_slice( temp_max_name_slice, type_of<float>() );
+  auto uniform_call = Call::Make( Float(32), "uniform_random", {temp_max_var_slice, Expr(warp_round * thread_round)}, {}, ir::CallType::Extern );
+
+  vec_out.push_back( temp_max_out );
+  vec_out.push_back( uniform_call );
+
+  (*input_map)[out_name] = InputNode( temp_max_name, rand_out, {1, 8});
+}
+
+ir::Expr generate_index( CodeGenOption gen_opt, bool last_dim)
+{
+  int reduce_block  = gen_opt.reduce_block;
+  int flatten_block = gen_opt.flatten_block;
+
+  std::vector<int> reduce_range;
+  std::vector<int> flatten_range;
+
+  std::string name_blockx  = "blockIdx.x";
+  std::string name_threadx = "xid";
+  std::string index_name   = "index";
+  Var block_x_var( name_blockx, type_of<int>() );
+  Var thread_x_var( name_threadx, type_of<int>() );
+
+  Var block_id( "blockIdx.x", type_of<int>() );
+  Var flatten_id( "xid", type_of<int>() );
+  Var r_id( "rid", type_of<int>() );
+  Expr expr_flatten( flatten_block);
+  Expr expr_reduce( reduce_block);
+
+  int num_warp            = gen_opt.num_warp;
+  int num_thread_per_warp = gen_opt.num_thread_per_warp;
+
+  Var threadidx("threadIdx.x", type_of<int>());
+  Var index_i("i", type_of<int>() );
+  Var index_j("j", type_of<int>() );
+  Expr expr_warp( num_warp);
+  Expr expr_thread_per_warp( num_thread_per_warp );
+
+  auto warp_id = threadidx / expr_thread_per_warp;
+
+  // warp reduce
+  auto warp_round   = 1;
+  auto thread_round = 1;
+  if( gen_opt.op_type == ir::OpType::kContiguousWarpReduce )
+  {
+    thread_round = reduce_block / num_thread_per_warp;
+    warp_round   = flatten_block / num_warp;
+  }
+  else if( gen_opt.op_type == ir::OpType::kElementwise )
+  {
+    thread_round = flatten_block / num_thread_per_warp / num_warp;
+  }
+  else
+  {
+    thread_round = reduce_block / num_thread_per_warp / num_warp;
+  }
+
+  auto xid         = warp_id * Expr( warp_round ) + index_i;
+  auto inner_id    = threadidx % expr_thread_per_warp;
+  auto inner_index = xid * Expr( thread_round ) * expr_thread_per_warp + inner_id + index_j * expr_thread_per_warp;
+  if( ! last_dim )
+  {
+    inner_index = block_id * Expr( gen_opt.reduce_dim * flatten_block ) + inner_index;
+  }
+
+  return inner_index;
+}
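+
+// Worked example of the index algebra above, assuming a contiguous warp
+// reduce with num_warp = 4, num_thread_per_warp = 32, flatten_block = 8 and
+// reduce_block = 128 (so warp_round = 2, thread_round = 4):
+//   warp_id     = threadIdx.x / 32
+//   xid         = warp_id * 2 + i            // which row this warp handles
+//   inner_id    = threadIdx.x % 32
+//   inner_index = xid * 4 * 32 + inner_id + j * 32
+// For a fixed j, the 32 lanes of a warp touch 32 consecutive elements, so the
+// unrolled loads coalesce.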
+
+void build_load( std::map<std::string, InputNode>* input_map, CodeGenOption gen_opt,
+                 const std::vector<std::string>& vec_input, std::vector<ir::Expr>& vec_out,
+                 hlir::framework::Graph * graph )
+{
+  int reduce_block  = gen_opt.reduce_block;
+  int flatten_block = gen_opt.flatten_block;
+  int num_warp            = gen_opt.num_warp;
+  int num_thread_per_warp = gen_opt.num_thread_per_warp;
+
+  auto warp_round   = 1;
+  auto thread_round = 1;
+  if( gen_opt.op_type == ir::OpType::kContiguousWarpReduce )
+  {
+    thread_round = reduce_block / num_thread_per_warp;
+    warp_round   = flatten_block / num_warp;
+  }
+  else if( gen_opt.op_type == ir::OpType::kElementwise )
+  {
+    thread_round = flatten_block / num_thread_per_warp / num_warp;
+  }
+  else
+  {
+    thread_round = reduce_block / num_thread_per_warp / num_warp;
+  }
+
+  std::cerr << "reduce block " << reduce_block << std::endl;
+  std::cerr << "num thread per warp " << num_thread_per_warp << std::endl;
+  std::cerr << reduce_block / num_thread_per_warp << std::endl;
+  std::cerr << thread_round << std::endl;
+
+  for( auto& name : vec_input )
+  {
+    Var loop_var("i");
+    std::string temp_name = name + "_tmp";
+    Var temp_var( temp_name, type_of<float>() );
+    auto temp_out = LocalTemp::Make( temp_var, {warp_round, thread_round});
+    cinn::lang::Placeholder<float> *C = new cinn::lang::Placeholder<float>( name, std::vector<int>{{10, 10}});
+    cinn::lang::Placeholder<float> *T = new cinn::lang::Placeholder<float>( temp_name, std::vector<int>{{1, 4}});
+    //Placeholder<float> A("A", std::vector<int>{{10}});
+    //Var input( "input", type_of<float>( ));
+    Var loop_var_j("j");
+
+    auto& shape_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+    auto shape1 = shape_dict.at( name);
+
+    std::cerr << "name " << name << " shape :" << std::endl;
+    for( auto& s : shape1)
+    {
+      std::cerr << s << "\t";
+    }
+    std::cerr << std::endl;
+
+    vec_out.push_back( temp_out );
+
+    Expr inner_index;
+    if( shape1.size() == 1 )
+    {
+      inner_index = generate_index( gen_opt, true);
+    }else
+    {
+      inner_index = generate_index( gen_opt, false);
+    }
+    auto t_load = ir::Load::Make( ir::Tensor(*C), { inner_index });
+
+    Expr body = Store::Make( ir::Tensor(*T), t_load, {Expr(loop_var), Expr(loop_var_j)});
+
+    if( ! is_power2( gen_opt.reduce_dim ) )
+    {
+      auto index2 = generate_index( gen_opt, true);
+      auto cond   = ir::LT::Make( index2, Expr( gen_opt.reduce_dim) );
+      auto filter = ir::IfThenElse::Make( cond, body, Expr());
+
+      body = ir::Block::Make({filter});
+    }
+    else
+    {
+      body = ir::Block::Make({body});
+    }
+
+    auto load_inner_for = ir::For::Make(loop_var_j,
+                                        common::make_const(0),
+                                        common::make_const(thread_round),
+                                        ir::ForType::Unrolled,
+                                        ir::DeviceAPI::CUDA,
+                                        body);
+
+    // simple process here
+    // if( shape1.size() == 1)
+    // {
+    //   vec_out.push_back( load_inner_for);
+    // }
+    // else
+    // {
+    body = ir::Block::Make( {load_inner_for});
+
+    auto load_outer_for = ir::For::Make(loop_var,
+                                        common::make_const(0),
+                                        common::make_const(warp_round),
+                                        ir::ForType::Unrolled,
+                                        ir::DeviceAPI::CUDA,
+                                        body);
+
+    vec_out.push_back( load_outer_for );
+    // }
+
+    (*input_map)[ name ] = InputNode( temp_name, T, {1, 4});
+  }
+}
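+
+// Tail masking, sketched: with reduce_dim = 100 (not a power of two) each
+// unrolled load is wrapped in the guard built above,
+//   if (index_in_last_dim < 100) {
+//     x_tmp[i][j] = x[inner_index];
+//   }
+// so lanes that fall past the logical extent simply skip the load.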
+
+void build_store( std::map<std::string, InputNode>* input_map, CodeGenOption gen_opt,
+                  const std::vector<std::string>& vec_output_name, std::vector<ir::Expr>& vec_out)
+{
+  int reduce_block  = gen_opt.reduce_block;
+  int flatten_block = gen_opt.flatten_block;
+  int num_warp            = gen_opt.num_warp;
+  int num_thread_per_warp = gen_opt.num_thread_per_warp;
+
+  auto warp_round   = 1;
+  auto thread_round = 1;
+  if( gen_opt.op_type == ir::OpType::kContiguousWarpReduce )
+  {
+    thread_round = reduce_block / num_thread_per_warp;
+    warp_round   = flatten_block / num_warp;
+  }
+  else if( gen_opt.op_type == ir::OpType::kElementwise )
+  {
+    thread_round = flatten_block / num_thread_per_warp / num_warp;
+  }
+  else
+  {
+    thread_round = reduce_block / num_thread_per_warp / num_warp;
+  }
+
+  for( auto & out_name : vec_output_name )
+  {
+    auto inner_index = generate_index( gen_opt, false);
+
+    Var loop_var("i");
+    Var loop_var_j("j");
+
+    auto var_out = input_map->at( out_name );
+
+    auto t_load = ir::Load::Make( ir::Tensor( *(var_out.in_ptr) ), { Expr(loop_var), Expr(loop_var_j) });
+    // Each iteration stores to its own output, not to vec_output_name[0].
+    cinn::lang::Placeholder<float> OUT(out_name, std::vector<int>{{10}});
+
+    auto out_store = Store::Make( ir::Tensor(OUT), t_load, { Expr( inner_index ) });
+
+    auto body = ir::Block::Make( {out_store });
+
+    if( ! is_power2( gen_opt.reduce_dim ) )
+    {
+      auto index2 = generate_index( gen_opt, true);
+      auto cond   = ir::LT::Make( index2, Expr( gen_opt.reduce_dim) );
+      auto filter = ir::IfThenElse::Make( cond, body, Expr());
+
+      body = ir::Block::Make({filter});
+    }
+    else
+    {
+      body = ir::Block::Make({body});
+    }
+
+    auto load_inner_for = ir::For::Make(loop_var_j,
+                                        common::make_const(0),
+                                        common::make_const(thread_round),
+                                        ir::ForType::Unrolled,
+                                        ir::DeviceAPI::CUDA,
+                                        body);
+
+    body = ir::Block::Make( {load_inner_for});
+
+    auto out_store_for = ir::For::Make(loop_var,
+                                       common::make_const(0),
+                                       common::make_const(warp_round),
+                                       ir::ForType::Unrolled,
+                                       ir::DeviceAPI::CUDA,
+                                       body);
+
+    vec_out.push_back( out_store_for);
+  }
+}
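+
+// The warp_round / thread_round split, concretely (same assumed config as the
+// indexing example: flatten_block = 8, reduce_block = 128, num_warp = 4,
+// num_thread_per_warp = 32, contiguous warp reduce):
+//   warp_round   = flatten_block / num_warp            = 2 rows per warp
+//   thread_round = reduce_block  / num_thread_per_warp = 4 elements per lane
+// Elementwise kernels instead spread flatten_block over every lane:
+//   thread_round = flatten_block / num_thread_per_warp / num_warp.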
+
+ir::LoweredFunc process_warp_reduce( hlir::framework::Graph * graph, CodeGenOption gen_opt,
+                                     const std::vector<std::string>& vec_input, const std::vector<std::string>& vec_output_name)
+{
+  std::cerr << "gen opt config " << std::endl;
+  std::cerr << gen_opt.flatten_block << std::endl;
+  std::cerr << gen_opt.reduce_block << std::endl;
+  std::cerr << gen_opt.op_type << std::endl;
+  std::cerr << gen_opt.reduce_dim << std::endl;
+  std::cerr << gen_opt.reduce_numel << std::endl;
+  std::cerr << gen_opt.flatten_numel << std::endl;
+
+  std::vector<ir::Expr> out_expr;
+  std::map<std::string, InputNode> map_input;
+
+  build_load( &map_input, gen_opt, vec_input, out_expr, graph );
+
+  auto topo_order = graph->topological_order();
+  auto& nodes     = std::get<0>(topo_order);
+
+  std::vector<ir::Tensor> func_args;
+  std::unordered_map<std::string, ir::Tensor> tensor_map;
+
+  auto& dtype_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, common::Type>>("inferdtype");
+  auto& shape_dict = graph->GetMutableAttrs<absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+
+  int reduce_block  = gen_opt.reduce_block;
+  int flatten_block = gen_opt.flatten_block;
+  auto num_warp            = gen_opt.num_warp;
+  auto num_thread_per_warp = gen_opt.num_thread_per_warp;
+  auto warp_round   = 1;
+  auto thread_round = 1;
+  if( gen_opt.op_type == ir::OpType::kContiguousWarpReduce )
+  {
+    thread_round = reduce_block / num_thread_per_warp;
+    warp_round   = flatten_block / num_warp;
+  }
+  else if( gen_opt.op_type == ir::OpType::kElementwise )
+  {
+    thread_round = flatten_block / num_thread_per_warp / num_warp;
+  }
+  else
+  {
+    thread_round = reduce_block / num_thread_per_warp / num_warp;
+  }
+
+  ThreadConfig thread_config;
+  thread_config.warp_round   = warp_round;
+  thread_config.thread_round = thread_round;
+
+  for (auto& n : nodes) {
+    auto node = n->safe_as<hlir::framework::Node>();
+    if (!node || node->op() == nullptr) {
+      continue;
+    }
+
+    std::vector<ir::Tensor> tensor_inputs =
+        std::move(hlir::framework::CollectInputTensor(node, func_args, tensor_map, dtype_dict, shape_dict));
+
+    std::cerr << " process node: " << node->id() << std::endl;
+    std::cerr << " with op type: " << node->op()->name << std::endl;
+    if( node->op()->name == "reduce_max")
+    {
+      process_reduce_max( &map_input, node, out_expr, thread_config );
+    }else if ( node->op()->name == "subtract" )
+    {
+      process_sub( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "exp" )
+    {
+      process_exp( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "reduce_sum" )
+    {
+      process_reduce_sum( &map_input, node, out_expr, thread_config, gen_opt);
+    }else if ( node->op()->name == "divide" )
+    {
+      process_divide( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "elementwise_mul" )
+    {
+      process_mul( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "elementwise_add" )
+    {
+      process_add( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "fill_constant" )
+    {
+      process_fillconstant( &map_input, node, out_expr, thread_config);
+    }else if ( node->op()->name == "sqrt" )
+    {
+      process_sqrt( &map_input, node, out_expr, thread_config );
+    }else if ( node->op()->name == "scale" )
+    {
+      process_scale( &map_input, node, out_expr, thread_config );
+    }else if( node->op()->name == "uniform_random")
+    {
+      process_uniform( &map_input, node, out_expr, thread_config );
+    }else if( node->op()->name == "greater_equal")
+    {
+      process_greater_equal( &map_input, node, out_expr, thread_config );
+    }else if( node->op()->name == "cast")
+    {
+      process_cast( &map_input, node, out_expr, thread_config );
+    }else if( node->op()->name == "identity")
+    {
+      std::cerr << "skip identity for now" << std::endl;
+    }
+    else{
+      std::cerr << "op name " << node->op()->name << std::endl;
+      throw std::runtime_error("not support op");
+    }
+  }
+
+  build_store( &map_input, gen_opt, vec_output_name, out_expr);
+
+  auto feed_list = vec_input;
+  std::vector<ir::Argument> test_func_args;
+  std::cerr << "feed list" << std::endl;
+  for( auto& name : feed_list )
+  {
+    std::cerr << name << std::endl;
+    test_func_args.emplace_back( tensor_map.at(name)->buffer, ir::Argument::IO::kInput );
+  }
+
+  auto fetch_name_list = vec_output_name;
+  std::cerr << "fetch list" << std::endl;
+  // build output
+  for( auto& name : fetch_name_list )
+  {
+    std::cerr << name << std::endl;
+    auto out = lang::Placeholder<float>( name, shape_dict.at( name ));
+    test_func_args.emplace_back( out->buffer , ir::Argument::IO::kOutput );
+  }
+
+  std::vector<ir::Buffer> temp_buffers;
+
+  auto group0 = graph->fusion_groups[0];
+  std::cerr << "fun name " << group0->GetFuncName() << std::endl;
+  auto func =
+      ir::_LoweredFunc_::Make( group0->GetFuncName() , test_func_args, cinn::ir::Block::Make( out_expr ), temp_buffers);
+
+  func->cuda_axis_info.set_grid_dim( 0, gen_opt.flatten_numel / gen_opt.flatten_block);
+  func->cuda_axis_info.set_block_dim( 0, gen_opt.num_warp * 32);
+  std::cerr << "grid " << gen_opt.flatten_numel / gen_opt.flatten_block << " block " << gen_opt.num_warp * 32 << std::endl;
+  // std::cerr << "func " << func << std::endl;
+
+  return func;
+}
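+
+// Launch-shape arithmetic, as one concrete instance (flatten_numel = 1024,
+// flatten_block = 8 and num_warp = 4 are assumed values):
+//   gridDim.x  = flatten_numel / flatten_block = 128
+//   blockDim.x = num_warp * 32                 = 128
+// i.e. the lowered function is launched roughly as  fn<<<128, 128>>>(...).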
+
+} //namespace ir
+} //namespace cinn
\ No newline at end of file
diff --git a/cinn/ir/thread_model.h b/cinn/ir/thread_model.h
new file mode 100644
index 0000000000..38d5f1f005
--- /dev/null
+++ b/cinn/ir/thread_model.h
@@ -0,0 +1,76 @@
+
+#include "cinn/ir/ir_verify.h"
+
+#include "cinn/ir/ir_operators.h"
+#include "cinn/ir/ir_printer.h"
+#include "cinn/ir/ir.h"
+#include
+#include "cinn/ir/tensor.h"
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/backends/codegen_c_x86.h"
+#include "cinn/backends/codegen_cuda_dev.h"
+#include "cinn/backends/codegen_cuda_util.h"
+
+#include "cinn/backends/nvrtc/nvrtc_util.h"
+
+#include "cinn/runtime/cuda/cuda_module.h"
+#include "cinn/hlir/framework/op_lowering.h"
+#include "cinn/hlir/framework/pass.h"
+
+#include "cinn/frontend/net_builder.h"
+
+#include
+#include
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include
+#include
+#include
+#include
+
+#include "cinn/lang/placeholder.h"
+
+#include "cinn/hlir/framework/visualize_helper.h"
+#include "cinn/hlir/framework/graph.h"
+
+#include
+
+#define THREAD_PER_BLOCK 256
+#define WARP_SIZE 32
+
+#pragma once
+
+namespace cinn {
+namespace ir {
+
+using GroupPtr = std::shared_ptr<hlir::framework::Graph::Group>;
+
+enum OpType {
+  kContiguousWarpReduce  = 0,
+  kContiguousBlockReduce = 1,
+  kNoConguousReduce      = 2,
+  kElementwise           = 3,
+};
+
+struct CodeGenOption
+{
+  OpType op_type;
+
+  int flatten_block;
+  int reduce_block;
+  int num_warp;
+  int num_thread_per_warp;
+  int reduce_dim;
+  int flatten_numel;
+  int reduce_numel;
+};
+
+ir::LoweredFunc process_warp_reduce( hlir::framework::Graph* graph, CodeGenOption gen_opt,
+                                     const std::vector<std::string>& vec_input, const std::vector<std::string>& vec_output);
+
+} //namespace ir
+} //namespace cinn
\ No newline at end of file
diff --git a/cinn/lang/lower.cc b/cinn/lang/lower.cc
index 5781c69e3b..49f301c96d 100755
--- a/cinn/lang/lower.cc
+++ b/cinn/lang/lower.cc
@@ -251,6 +251,7 @@ std::vector<ir::LoweredFunc> LowerVec(const std::string& name,
                                       Module::Builder* b,
                                       const Target& target,
                                       bool support_ir_schedule) {
+  std::cerr << "lowering vec" << std::endl;
   // Init the reduce tensors first before any process.
   for (auto& t : tensor_args) InitReduceTensor(stages, t, target);
   for (auto& t : temp_tensors) InitReduceTensor(stages, t, target);
diff --git a/cinn/lang/lower_impl.cc b/cinn/lang/lower_impl.cc
index 327bdf1e73..a4e051a74a 100644
--- a/cinn/lang/lower_impl.cc
+++ b/cinn/lang/lower_impl.cc
@@ -104,7 +104,7 @@ Expr LowerGroup(const poly::ScheduleGroup& group,
   // transform this to some realworld statement in CINN.
 
   VLOG(1) << "ast to expr: \n" << e << std::endl;
-
+  // std::cerr << "22 " << e << std::endl;
   // replace isl call to the corresponding CINN statement, we need to replace the axis at the same time.
   for (auto& statement : tuple_to_expr) {
     VLOG(2) << "LowerGroup working on statement: " << statement.first;
@@ -143,6 +143,7 @@ Expr LowerGroup(const poly::ScheduleGroup& group,
     mutator(&e);
   }
 
+  // std::cerr << "11 " << e << std::endl;
   // mark unroll.
   {
     std::map<std::string, std::set<int>> unrolls;
@@ -568,7 +569,11 @@ std::vector<ir::LoweredFunc> LowerImpl::operator()() {
   std::vector<ir::LoweredFunc> result;
   int num_func = 0;
+
   for (auto& func_iterator : func_body) {
+    // std::cerr << "body " << num_func << std::endl;
+    // std::cerr << func_iterator << std::endl;
+    // std::cerr << "==========================================" << std::endl;
     if (support_ir_schedule_) {
       // add ScheduleBlockRealize
       func_iterator = ir::ScheduleBlockRealize::Make(
@@ -778,6 +783,8 @@ std::vector<Expr> LowerImpl::GenerateFunctionBody(const poly::Schedule* schedule
       if (target_ == common::DefaultNVGPUTarget() && !all_temp_tensor) {
         exprs.push_back(group_expr);
         Expr body = ir::Block::Make(exprs);
+        // std::cerr << body << std::endl;
+        // std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
         result.push_back(body);
         exprs.clear();
       } else {
@@ -785,12 +792,15 @@ std::vector<Expr> LowerImpl::GenerateFunctionBody(const poly::Schedule* schedule
      }
    }
  }
+
+  // std::cerr << "last body " << ir::Block::Make(exprs) << std::endl;
   if (target_ == common::DefaultHostTarget()) {
     Expr body = ir::Block::Make(exprs);
     result.push_back(body);
     exprs.clear();
   } else if (!exprs.empty()) {
     Expr body = ir::Block::Make(exprs);
+
     result.push_back(body);
     exprs.clear();
   }
diff --git a/cinn/optim/ir_copy.cc b/cinn/optim/ir_copy.cc
index 6c7d5295d6..88e507afb6 100644
--- a/cinn/optim/ir_copy.cc
+++ b/cinn/optim/ir_copy.cc
@@ -57,6 +57,43 @@ struct IRCopyVisitor : public ir::IRVisitorBase<Expr> {
     return Select::Make(condition, true_value, false_value);
   }
 
+  Expr Visit( const LocalTemp* op) override {
+    auto sym = Visit( &op->symbol);
+    return LocalTemp::Make( sym, op->local_size);
+  }
+
+  Expr Visit( const Sqrt* op) override {
+    auto sym = Visit( &op->symbol);
+    return Sqrt::Make( sym );
+  }
+
+  Expr Visit( const BlockLoad* op) override {
+    auto sym = Visit( &op->input);
+    return BlockLoad::Make( sym, op->load_index);
+  }
+
+  Expr Visit( const BlockStore* op) override {
+    auto sym = Visit( &op->input);
+    return BlockStore::Make( sym, op->load_index, op->value);
+  }
+
+  Expr Visit( const LoadIndex* op) override {
+    auto sym = Visit( &op->index_expr);
+    return LoadIndex::Make( sym, op->reduce_range, op->flatten_range, op->reduce_block, op->flatten_block);
+  }
+
+  Expr Visit( const ReduceMax* op) override {
+    auto in = Visit( &op->input);
+    return ReduceMax::Make( in, op->axis);
+  }
+
   Expr Visit(const IfThenElse* op) override {
     auto condition = Visit(&op->condition);
     auto true_case = Visit(&op->true_case);
@@ -273,7 +310,7 @@ struct IRCopyVisitor : public ir::IRVisitorBase<Expr> {
   Expr Visit(const Let* op) override {
     auto value = Visit(&op->symbol);
     auto body  = Visit(&op->body);
-    return Let::Make(value, body);
+    return Let::Make(value, body, op->with_dtype );
   }
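+  // Forwarding with_dtype matters for the thread-model Lets: a re-assignment
+  // created as Let::Make(sum1, expr, false) must survive IRCopy as
+  //   sum1 = sum1 + x_tmp[i][j];
+  // rather than coming back as a fresh `float sum1 = ...` declaration.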
 
   Expr Visit(const Reduce* op) override {
@@ -450,6 +487,7 @@ Expr IRCopy(Expr x) {
   return copied;
 }
 
+
 std::vector<Expr> IRCopy(const std::vector<Expr>& x) {
   std::vector<Expr> res;
   for (auto& i : x) {
diff --git a/cinn/optim/ir_simplify.cc b/cinn/optim/ir_simplify.cc
index 0ed3d92c93..b67875a12b 100644
--- a/cinn/optim/ir_simplify.cc
+++ b/cinn/optim/ir_simplify.cc
@@ -88,21 +88,21 @@ struct SimplifyButStoreLoadMutator : public ir::IRMutator<ir::Expr*> {
   }
 
   void Visit(const For* op, Expr* expr) override {
-    auto* node = expr->As<For>();
-    Visit(&node->min, &node->min);
-    Visit(&node->extent, &node->extent);
-    auto* min_i    = op->min.As<IntImm>();
-    auto* extent_i = op->extent.As<IntImm>();
-    if (min_i && extent_i && extent_i->value > min_i->value) {
-      var_intervals.emplace(op->loop_var->name, common::CasInterval{min_i->value, extent_i->value - 1});
-    } else {
-      var_intervals.emplace(op->loop_var->name, common::CasInterval{op->min, op->extent - 1});
-    }
-
-    Visit(&node->body, &node->body);
-    if (min_i && extent_i) {
-      var_intervals.erase(op->loop_var->name);
-    }
+    // auto* node = expr->As<For>();
+    // Visit(&node->min, &node->min);
+    // Visit(&node->extent, &node->extent);
+    // auto* min_i    = op->min.As<IntImm>();
+    // auto* extent_i = op->extent.As<IntImm>();
+    // if (min_i && extent_i && extent_i->value > min_i->value) {
+    //   var_intervals.emplace(op->loop_var->name, common::CasInterval{min_i->value, extent_i->value - 1});
+    // } else {
+    //   var_intervals.emplace(op->loop_var->name, common::CasInterval{op->min, op->extent - 1});
+    // }
+
+    // Visit(&node->body, &node->body);
+    // if (min_i && extent_i) {
+    //   var_intervals.erase(op->loop_var->name);
+    // }
   }
 
   void Visit(const _Tensor_* op, Expr* expr) override {
@@ -310,25 +310,25 @@ struct SimplifyForLoopsMutator : public ir::IRMutator<> {
   using ir::IRMutator<>::Visit;
 
   void Visit(const For* op, Expr* expr) override {
-    auto* node = expr->As<For>();
-    Visit(&node->min, &node->min);
-    Visit(&node->extent, &node->extent);
-    auto* min_i    = node->min.As<IntImm>();
-    auto* extent_i = node->extent.As<IntImm>();
-    if (min_i && extent_i && extent_i->value > min_i->value && extent_i->value - min_i->value == 1) {
-      VLOG(6) << "Simplify current For Loop";
-      std::string var_name = node->loop_var->name;
-      var_intervals.emplace(var_name, common::CasInterval{min_i->value, extent_i->value - 1});
-      if (node->body.As<Block>() && node->body.As<Block>()->stmts.size() == 1) {
-        *expr = node->body.As<Block>()->stmts[0];
-      } else {
-        *expr = node->body;
-      }
-      Visit(expr, expr);
-      var_intervals.erase(var_name);
-    } else {
-      Visit(&node->body, &node->body);
-    }
+    // auto* node = expr->As<For>();
+    // Visit(&node->min, &node->min);
+    // Visit(&node->extent, &node->extent);
+    // auto* min_i    = node->min.As<IntImm>();
+    // auto* extent_i = node->extent.As<IntImm>();
+    // if (min_i && extent_i && extent_i->value > min_i->value && extent_i->value - min_i->value == 1) {
+    //   VLOG(6) << "Simplify current For Loop";
+    //   std::string var_name = node->loop_var->name;
+    //   var_intervals.emplace(var_name, common::CasInterval{min_i->value, extent_i->value - 1});
+    //   if (node->body.As<Block>() && node->body.As<Block>()->stmts.size() == 1) {
+    //     *expr = node->body.As<Block>()->stmts[0];
+    //   } else {
+    //     *expr = node->body;
+    //   }
+    //   Visit(expr, expr);
+    //   var_intervals.erase(var_name);
+    // } else {
+    //   Visit(&node->body, &node->body);
+    // }
   }
 
   void Visit(const _Var_* op, Expr* expr) override {
diff --git a/cinn/optim/optimize.cc b/cinn/optim/optimize.cc
index 1017c12072..0ebf237006 100755
--- a/cinn/optim/optimize.cc
+++ b/cinn/optim/optimize.cc
@@ -76,11 +76,18 @@ Expr Optimize(Expr e, Target target, bool runtime_debug_info, bool remove_gpu_fo
     LOG(WARNING) << "Turn on runtime debug information output";
     InsertDebugLogCallee(&copied);
   }
+
+  // std::cerr << copied << std::endl;
+  // std::cerr << "!!!!!!!!!!!!!!===================================" << std::endl;
+
   return copied;
 }
 
 ir::Module Optimize(const ir::Module& module, const Target& target) {
+  std::cerr << "opti " << std::endl;
   auto copied = IRCopy(Expr(module));
+  // std::cerr << "init " << copied << std::endl;
+  // std::cerr << "fin init ========================" << std::endl;
   if (FLAGS_cinn_ir_schedule) {
     UnrollLoop(&copied);
     VectorizeLoops(&copied, Target());
diff --git a/cinn/optim/transform_gpu_forloop.cc b/cinn/optim/transform_gpu_forloop.cc
index 7956b41371..7b199c9ffc 100644
--- a/cinn/optim/transform_gpu_forloop.cc
+++ b/cinn/optim/transform_gpu_forloop.cc
@@ -350,13 +350,16 @@ void TransformGpuForloops(const forloop_infos_t &forloop_infos,
                           std::unordered_map<std::string, std::vector<Expr>> &resized_buffer_cache,
                           Expr *expr) {
   VLOG(3) << "traverse_order=" << utils::Join(traverse_order, ",");
+  // std::cerr << "order " << utils::Join(traverse_order, ",") << std::endl;
   std::set<std::string> gpu_launch_axis;
   for (auto &i : traverse_order) {
     if (forloop_infos.count(i) == 0) continue;
     for (auto &f : forloop_infos.at(i)) {
       if (f.second.for_type == ir::ForType::GPUThread) {
+        // std::cerr << "gpu launch thread " << f.second.offset << std::endl;
         gpu_launch_axis.insert(backends::cuda_thread_axis_name(f.second.offset));
       } else if (f.second.for_type == ir::ForType::GPUBlock) {
+        // std::cerr << "gpu launch block " << f.second.offset << std::endl;
         gpu_launch_axis.insert(backends::cuda_block_axis_name(f.second.offset));
       }
     }
diff --git a/cinn/poly/ast_gen.cc b/cinn/poly/ast_gen.cc
index e8f4ca8c43..504e0c14d0 100644
--- a/cinn/poly/ast_gen.cc
+++ b/cinn/poly/ast_gen.cc
@@ -189,8 +189,10 @@ isl::ast_node AstGen::Build() {
   VLOG(4) << "transform schedule " << impl_->stages()[0]->transform();
   VLOG(4) << "schedule: " << schedule;
   VLOG(4) << "schedule_domain: " << schedule_domain;
-  isl::ast_node ast = ast_build.node_from_schedule_map(schedule_domain);
+
+  auto ast = ast_build.node_from_schedule_map(schedule_domain);
   VLOG(2) << "AST:\n" << isl_ast_node_to_C_str(ast.get());
+  // std::cerr << "ast \n " << isl_ast_node_to_C_str(ast.get()) << std::endl;
   return ast;
 }
 
diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
index 914b69d1ee..f97c188e91 100644
--- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
+++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -2,6 +2,93 @@
  * \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen.
  */
 extern "C" {
+
+const int WARP_SIZE = 32;
+
+__device__ __forceinline__ float warpReduceSum(float sum) {
+  int blockSize = 32;
+  if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16);  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);   // 0-1, 2-3, 4-5, etc.
+
+  static __shared__ float warpLevelSums[WARP_SIZE];
+  const int laneId = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+
+  if (laneId == 0) warpLevelSums[warpId] = sum;
+  __syncthreads();
+
+  return warpLevelSums[warpId];
+}
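+
+// After the five shuffle rounds only lane 0 of each warp holds the complete
+// sum; the shared-memory bounce then hands that value back to every lane of
+// the warp, so call sites may read the result from any lane. Usage sketch:
+//   float s = warpReduceSum(partial);  // same value in all 32 lanes of the warp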
+
+__device__ __forceinline__ float warpReduceMax(float sum) {
+  int blockSize = 32;
+  if (blockSize >= 32) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 16));  // 0-16, 1-17, 2-18, etc.
+  if (blockSize >= 16) sum = max(sum, __shfl_down_sync(0xffffffff, sum, 8));   // 0-8, 1-9, 2-10, etc.
+  if (blockSize >= 8)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 4));   // 0-4, 1-5, 2-6, etc.
+  if (blockSize >= 4)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 2));   // 0-2, 1-3, 4-6, 5-7, etc.
+  if (blockSize >= 2)  sum = max(sum, __shfl_down_sync(0xffffffff, sum, 1));   // 0-1, 2-3, 4-5, etc.
+
+  static __shared__ float warpLevelSums[WARP_SIZE];
+  const int laneId = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+
+  if (laneId == 0) warpLevelSums[warpId] = sum;
+  __syncthreads();
+
+  return warpLevelSums[warpId];
+}
+
+// Requires <curand_kernel.h> to be in scope; N must be a multiple of 4
+// because values are drawn four at a time.
+__device__ __forceinline__ void uniform_random(float* out, int N) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  curandStatePhilox4_32_10_t state;
+  curand_init(0, idx, N, &state);
+
+#pragma unroll
+  for (int j = 0; j < N; j += 4) {
+    auto res_tuple = curand_uniform4(&state);
+    out[j + 0] = static_cast<float>((&res_tuple.x)[0]);
+    out[j + 1] = static_cast<float>((&res_tuple.x)[1]);
+    out[j + 2] = static_cast<float>((&res_tuple.x)[2]);
+    out[j + 3] = static_cast<float>((&res_tuple.x)[3]);
+  }
+}
+
+__device__ __forceinline__ float BlockReduceSum(float sum) {
+  // Shared mem for partial sums (one per warp in the block)
+  static __shared__ float warpLevelSums[WARP_SIZE];
+  const int laneId = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+
+  sum = warpReduceSum(sum);
+
+  if (laneId == 0) warpLevelSums[warpId] = sum;
+  __syncthreads();
+
+  // Sum one slot per warp actually present in the block; a fixed bound would
+  // read stale shared memory whenever blockDim.x differs from it.
+  float final_sum = 0.0f;
+  const int num_warps = blockDim.x / WARP_SIZE;
+  #pragma unroll
+  for (int i = 0; i < num_warps; ++i) {
+    final_sum += warpLevelSums[i];
+  }
+
+  if (threadIdx.x == 0) warpLevelSums[0] = final_sum;
+  __syncthreads();
+  return warpLevelSums[0];
+}
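+
+// Usage sketch for the uniform_random path (assumed shapes): a kernel that
+// needs an 8-value random tile per thread declares
+//   float out_random_tmp[2][4];
+//   uniform_random(out_random_tmp[0], 8);  // 2 * 4 values, N divisible by 4
+// which matches the Expr(warp_round * thread_round) argument built in
+// process_uniform.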
+
 // *************************************************************** //
 // float32 unary and binary operator
 #define FN_FP32(func) cinn_nvgpu_##func##_fp32
diff --git a/cinn/runtime/cuda/cuda_module.cc b/cinn/runtime/cuda/cuda_module.cc
index 0ec5aa0bfe..636efcda6d 100644
--- a/cinn/runtime/cuda/cuda_module.cc
+++ b/cinn/runtime/cuda/cuda_module.cc
@@ -58,6 +58,7 @@ void CUDAModule::LaunchKernel(int device_id,
           << ", share_memory_size:" << share_memory_size;
   auto function = GetFunction(device_id, func_name);
   CHECK(function);
+
   CUDA_DRIVER_CALL(cuLaunchKernel(function,
                                   gridDim.x,
                                   gridDim.y,
@@ -69,11 +70,15 @@ void CUDAModule::LaunchKernel(int device_id,
                                   stream,
                                   args,
                                   nullptr));
+
+
 }
 
 CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name) {
   VLOG(5) << "GetFuncion : " << func_name << " with device_id : " << device_id;
+
   if (!module_per_card_[device_id]) {
+
     std::lock_guard<std::mutex> lock(mutex_);
     // Compilation with parameters
     const size_t jit_num_options = 5;
@@ -90,6 +95,8 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name)
     std::vector<char> log_buffer(log_buffer_size, '\0');
     jit_opt_vals[1] = log_buffer.data();
 
+    // std::cerr << "data " << data_ << std::endl;
+
     int value = 1;
     // Specifies whether to create debug information in output (-g)
     jit_options[2] = CU_JIT_GENERATE_DEBUG_INFO;
@@ -107,6 +114,7 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name)
         &module_per_card_[device_id], data_.c_str(), jit_num_options, jit_options.data(), jit_opt_vals.data());
     if (CUDA_SUCCESS != status) {
+      std::cerr << "load failed" << std::endl;
       RAW_LOG(ERROR, "PTX JIT ERROR LOG: %s\n.", log_buffer.data());
       const char* name;
       cuGetErrorName(status, &name);
@@ -115,9 +123,11 @@ CUfunction CUDAModule::GetFunction(int device_id, const std::string& func_name)
       RAW_LOG(FATAL, "The error `%s` occurs while compiling the ptx! And its message is `%s`.", name, msg);
     }
   }
+
   CUfunction func;
   CUDA_DRIVER_CALL(cuModuleGetFunction(&func, module_per_card_[device_id], func_name.c_str()));
+
   return func;
 }
 
diff --git a/cinn/runtime/cuda/cuda_util.cc b/cinn/runtime/cuda/cuda_util.cc
index 4fd3c99d85..0d890b695d 100644
--- a/cinn/runtime/cuda/cuda_util.cc
+++ b/cinn/runtime/cuda/cuda_util.cc
@@ -83,9 +83,13 @@ void cinn_call_cuda_kernel(void *kernel_fn,
                            int block_y,
                            int block_z,
                            void *stream) {
+  // grid_x = 128 * 12 * 128 / 32;
+  // block_x = 256;
   VLOG(3) << "cinn_call_cuda_kernel, grid_dim={" << grid_x << ", " << grid_y << ", " << grid_z
           << "}, block_dim={" << block_x << ", " << block_y << ", " << block_z << "}, num_args=" << num_args
           << ", stream=" << stream;
+  // grid_x = 128 * 12 * 128 / 32;
+  // block_x = 256;
   std::vector<void *> kernel_args;
   kernel_args.reserve(num_args);
   cinn_pod_value_t *args = static_cast<cinn_pod_value_t *>(v_args);