From 0f425c4fc13aa6d4396045bb3a13acd42460dd77 Mon Sep 17 00:00:00 2001 From: hzx Date: Wed, 4 Dec 2024 21:56:16 +0800 Subject: [PATCH] add tune_test --- transformers/llm/engine/CMakeLists.txt | 3 + transformers/llm/engine/app/tune_test.cpp | 182 ++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 transformers/llm/engine/app/tune_test.cpp diff --git a/transformers/llm/engine/CMakeLists.txt b/transformers/llm/engine/CMakeLists.txt index b7ccf1139..a5d39f643 100644 --- a/transformers/llm/engine/CMakeLists.txt +++ b/transformers/llm/engine/CMakeLists.txt @@ -28,12 +28,15 @@ endif() add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/app/llm_demo.cpp) add_executable(ppl_demo ${CMAKE_CURRENT_LIST_DIR}/app/ppl_demo.cpp) add_executable(embedding_demo ${CMAKE_CURRENT_LIST_DIR}/app/embedding_demo.cpp) +add_executable(tune_test ${CMAKE_CURRENT_LIST_DIR}/app/tune_test.cpp) IF (NOT MNN_SEP_BUILD) target_link_libraries(llm_demo ${MNN_DEPS}) target_link_libraries(ppl_demo ${MNN_DEPS}) target_link_libraries(embedding_demo ${MNN_DEPS}) + target_link_libraries(tune_test ${MNN_DEPS}) ELSE () target_link_libraries(llm_demo ${MNN_DEPS} llm) target_link_libraries(ppl_demo ${MNN_DEPS} llm) target_link_libraries(embedding_demo ${MNN_DEPS} llm) + target_link_libraries(tune_test ${MNN_DEPS} llm) ENDIF () diff --git a/transformers/llm/engine/app/tune_test.cpp b/transformers/llm/engine/app/tune_test.cpp new file mode 100644 index 000000000..6e8f3d716 --- /dev/null +++ b/transformers/llm/engine/app/tune_test.cpp @@ -0,0 +1,182 @@ +// +// MatMulSpeed.cpp +// MNNTests +// +// Created by MNN on 2019/09/17. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include +#include +#include "MNN_generated.h" +#define MNN_OPEN_TIME_TRACE +#include + +#include "llm/llm.hpp" +#include "evaluation/dataset.hpp" +#include +#include +#include +#include +#include +#include + +using namespace MNN::Express; +using namespace MNN::Transformer; + +static void trace_prepare(Llm* llm) { + MNN_PRINT("Prepare for resize opt Begin\n"); + llm->trace(true); + std::ostringstream cacheOs; + llm->generate(std::initializer_list{200, 200}, &cacheOs, ""); + MNN_PRINT("Prepare for resize opt End\n"); + llm->trace(false); + llm->reset(); +} + +static void fillFloat(float* dst, int h, int w, float offset = 0.0f) { + for (int y = 0; y < h; ++y) { + auto dstY = dst + w * y; + for (int x = 0; x < w; ++x) { + dstY[x] = ((float)x * 0.1f + (float)y + offset) / 10000.0f; + } + } +} + +static MNNForwardType backend_type_convert(const std::string& type_str) { + if (type_str == "cpu") return MNN_FORWARD_CPU; + if (type_str == "metal") return MNN_FORWARD_METAL; + if (type_str == "cuda") return MNN_FORWARD_CUDA; + if (type_str == "opencl") return MNN_FORWARD_OPENCL; + if (type_str == "opengl") return MNN_FORWARD_OPENGL; + if (type_str == "vulkan") return MNN_FORWARD_VULKAN; + if (type_str == "npu") return MNN_FORWARD_NN; + return MNN_FORWARD_AUTO; +} + +float profileMatMul(int e, int h, int l) { + // Test MatMul + // prepare MatMul config + std::unique_ptr op(new MNN::OpT); + op->type = MNN::OpType_MatMul; + op->main.type = MNN::OpParameter_MatMul; + op->main.value = new MNN::MatMulT; + auto matmulParam = op->main.AsMatMul(); + matmulParam->transposeA = false; + matmulParam->transposeB = false; + + // prepare input and output + auto x0 = _Input({}, NHWC, halide_type_of()); + auto x1 = _Input({}, NHWC, halide_type_of()); + x0->resize({e, l}); + x1->resize({l, h}); + auto y = Variable::create(Expr::create(op.get(), {x0, x1})); + Variable::prepareCompute({y}); + fillFloat(x0->writeMap(), e, l); + fillFloat(x1->writeMap(), l, h); + + // Test for 5 times + const auto time = 5; + MNN::Timer _t; + for (int t = 0; t < time; ++t) { + x0->writeMap(); + x1->writeMap(); + y->readMap(); + } + float timeCost = _t.durationInUs() / 1000.0f / (float)time; + float flops = (float)e * (float)l * (float)h / timeCost / 1000.0f / 1000.0f; + MNN_PRINT("[%d, %d, %d], Avg time: %f ms , flops: %f G\n", e, l, h, timeCost, flops); + return timeCost; +} + +float profileLLM(Llm* llm, int prefill_len) { + llm->trace(true); + std::vector test_prompt(prefill_len, 200); + MNN::Timer _t; + llm->forward(test_prompt, true); + float timeCost = _t.durationInUs() / 1000.0f; + llm->trace(false); + llm->reset(); + MNN_PRINT("[%d], LLM Prefill time: %f ms\n", prefill_len, timeCost); + return timeCost; +} + +std::shared_ptr init_tune(MNNForwardType type, int thread) { + int precision = (int)MNN::BackendConfig::Precision_Low; + int memory = (int)MNN::BackendConfig::Memory_Low; + const char* flag = ""; + MNN::BackendConfig config; + config.precision = (MNN::BackendConfig::PrecisionMode)precision; + config.memory = (MNN::BackendConfig::MemoryMode)memory; + auto exe = MNN::Express::Executor::newExecutor(type, config, thread); + if (exe == nullptr) { + MNN_PRINT("Can't create executor with type:%d, exit!\n", type); + } + return exe; +} + +void tune(Llm* llm) { +// void tune() { + // problems for direct tuning: 1. low frequency, 2. don't know if tempcache need to be stored for opencl. + // test: setCache for OpenCL + float inf = 10000.f; + int hidden_size = 1536; + int intermediate_size = hidden_size*3; + + std::vector prefill_len_list = {100, 200, 400, 600, 800, 1000, 1500}; + // std::vector prefill_len_list = {100}; + + float qkv_sim_cpu = inf; + float up_proj_sim_cpu = inf; + float down_proj_sim_cpu = inf; + float prefill_llm_sim_cpu = inf; + auto cpu_exe = init_tune(backend_type_convert("cpu"), 4); + for (auto prefill_len : prefill_len_list){ + MNN::Express::ExecutorScope scope(cpu_exe); + // qkv_sim_cpu = profileMatMul(prefill_len, hidden_size, hidden_size); + // up_proj_sim_cpu = profileMatMul(prefill_len, hidden_size, intermediate_size); + // down_proj_sim_cpu = profileMatMul(prefill_len, intermediate_size, hidden_size); + // prefill_llm_sim_cpu = profileLLM(llm, prefill_len); + } + + float qkv_sim_opencl = inf; + float up_proj_sim_opencl = inf; + float down_proj_sim_opencl = inf; + float prefill_llm_sim_opencl = inf; + auto opencl_exe = init_tune(backend_type_convert("opencl"), 68); + if (opencl_exe != nullptr) { + for (auto prefill_len : prefill_len_list) { + MNN::Express::ExecutorScope scope(opencl_exe); + // qkv_sim_opencl = profileMatMul(prefill_len, hidden_size, hidden_size); + // up_proj_sim_opencl = profileMatMul(prefill_len, hidden_size, intermediate_size); + // down_proj_sim_opencl = profileMatMul(prefill_len, intermediate_size, hidden_size); + prefill_llm_sim_opencl = profileLLM(llm, prefill_len); + } + } +} + +int main(int argc, const char* argv[]) { + if (argc < 2) { + std::cout << "Usage: " << argv[0] << " config.json [performance.txt]" << std::endl; + return 0; + } + std::string config_path = argv[1]; + std::cout << "config path is " << config_path << std::endl; + std::unique_ptr llm(Llm::createLLM(config_path)); + { + AUTOTIME; + llm->load(); + } + { + AUTOTIME; + trace_prepare(llm.get()); + } + tune(llm.get()); + // tune(); + // std::string prompt_file = argv[2]; + // std::unique_ptr perfOS(nullptr); + // if (argc == 4) { perfOS.reset(new std::ofstream(argv[3])); } + return 0; +} \ No newline at end of file