A tutorial on building ort-extensions from source as a static library (microsoft#703)

* A tutorial on building from source as a static library

* update test flag control

* add the tutorial
wenbingl authored May 1, 2024
1 parent 3b889fc commit 8645a84
Showing 5 changed files with 114 additions and 5 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -45,9 +45,14 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 include(CheckCXXCompilerFlag)
 include(CheckLanguage)
 
+set(_ORTX_STANDALONE_PROJECT OFF)
+if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(_ORTX_STANDALONE_PROJECT ON)
+endif()
+
 option(CC_OPTIMIZE "Allow compiler optimizations, Set to OFF to disable" ON)
 option(OCOS_ENABLE_PYTHON "Enable Python component building, (deprecated)" OFF)
-option(OCOS_ENABLE_CTEST "Enable C++ test" ON)
+option(OCOS_ENABLE_CTEST "Enable C++ test" ${_ORTX_STANDALONE_PROJECT})
 option(OCOS_ENABLE_CPP_EXCEPTIONS "Enable C++ Exception" ON)
 option(OCOS_ENABLE_TF_STRING "Enable String Operator Set" ON)
 option(OCOS_ENABLE_RE2_REGEX "Enable StringRegexReplace and StringRegexSplit" ON)
@@ -877,6 +882,7 @@ if(OCOS_BUILD_APPLE_FRAMEWORK)
   endif()
 endif()
 
+if (_ORTX_STANDALONE_PROJECT)
 # clean up the requirements.txt files from 3rd party project folder to suppress the code security false alarms
 file(GLOB_RECURSE NO_USE_FILES ${CMAKE_BINARY_DIR}/_deps/*requirements.txt)
 message(STATUS "Found the following requirements.txt: ${NO_USE_FILES}")
@@ -887,6 +893,7 @@ endforeach()
 
 # Run CPack to generate the NuGet package
 include(CPack)
+endif()
 
 if(OCOS_ENABLE_CTEST)
   include(ext_tests)
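With this change, the C++ tests and the CPack packaging step run only when onnxruntime-extensions is configured as the top-level project; when it is pulled in with FetchContent or add_subdirectory, `CMAKE_CURRENT_SOURCE_DIR` no longer equals `CMAKE_SOURCE_DIR`, so `_ORTX_STANDALONE_PROJECT` stays OFF. A minimal consumer-side sketch (the project name is illustrative, not part of the commit) of how an embedding build could still opt back in to the tests:

cmake_minimum_required(VERSION 3.25)
project(consumer_app)  # illustrative name

include(FetchContent)
FetchContent_Declare(
  ortx
  GIT_REPOSITORY https://github.com/microsoft/onnxruntime-extensions.git)

# Embedded builds now get OCOS_ENABLE_CTEST=OFF by default; forcing the
# cache entry before FetchContent_MakeAvailable opts back in.
set(OCOS_ENABLE_CTEST ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(ortx)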
8 changes: 4 additions & 4 deletions operators/tokenizer/bpe_tokenizer.hpp
@@ -114,7 +114,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue; // safe purpose.
       }
       if (i > id2token_map_.size()) {
@@ -183,7 +183,7 @@ class BpeModel {
 
     id2token_map_.resize(vocab_map_.size());
     for (const auto& [t, i] : vocab_map_) {
-      if (i > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
+      if (i > static_cast<uint32_t>((std::numeric_limits<int32_t>::max)())) {
         continue; // safe purpose.
       }
       if (i > id2token_map_.size()) {
@@ -256,7 +256,7 @@ class BpeModel {
   void PerformBPE(std::list<std::pair<uint32_t, uint32_t>>& vals) const {
     while (vals.size() >= 2) {
       auto pos_it = vals.end();
-      uint32_t minval = std::numeric_limits<uint32_t>::max();
+      uint32_t minval = (std::numeric_limits<uint32_t>::max)();
       uint32_t ori_id1 = 0, ori_id2 = 0;
       uint32_t aim_id = 0;
       int token_length = 0;
@@ -355,7 +355,7 @@ class BpeModel {
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;
 
-  uint32_t unk_id_ = std::numeric_limits<uint32_t>::max();
+  uint32_t unk_id_ = (std::numeric_limits<uint32_t>::max)();
   bpe::SpecialTokenMap special_tokens_;
   TrieTree<char32_t> added_tokens_;
 };
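The parentheses around `max` are the standard guard against the function-like `min`/`max` macros that `<windows.h>` defines when `NOMINMAX` is not set: a parenthesized name is not immediately followed by `(`, so the preprocessor leaves it alone. A self-contained sketch of the effect (the local macro below is a stand-in for the Windows one):

#include <cstdint>
#include <limits>

#define max(a, b) ((a) > (b) ? (a) : (b))  // stand-in for the windows.h macro

int main() {
  // std::uint32_t bad = std::numeric_limits<std::int32_t>::max();
  // ^ fails to preprocess: `max(` triggers the macro, which then
  //   complains about missing arguments.
  std::uint32_t ok = (std::numeric_limits<std::int32_t>::max)();
  // ^ here `max` is followed by `)`, so the macro cannot fire and the
  //   real member function is called.
  return ok == 2147483647u ? 0 : 1;
}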
19 changes: 19 additions & 0 deletions tutorials/ortx_api/CMakeLists.txt
@@ -0,0 +1,19 @@
cmake_minimum_required(VERSION 3.25)

project(ortx_api_test)

set(CMAKE_CXX_STANDARD 17)
include(FetchContent)

FetchContent_Declare(
  ortx
  GIT_REPOSITORY https://github.com/microsoft/onnxruntime-extensions.git
  GIT_TAG a7043c56e4f19c4bf11642d390f7b502f80a34ba)

set(OCOS_BUILD_PRESET token_api_only)
FetchContent_MakeAvailable(ortx)

file(GLOB_RECURSE SOURCES "src/*.cc")
add_executable(ortx_api_test ${SOURCES})
target_link_libraries(ortx_api_test onnxruntime_extensions)
target_include_directories(ortx_api_test PRIVATE ${ortx_SOURCE_DIR}/include)
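Note that `OCOS_BUILD_PRESET` is set before `FetchContent_MakeAvailable(ortx)`, so the value is already visible when the subproject configures and trims the build down to the tokenizer API. A variant (an assumption for illustration, not taken from the commit) that keeps the preset overridable from the command line:

# Without FORCE, an existing -DOCOS_BUILD_PRESET=<name> given at configure
# time takes precedence over this default.
set(OCOS_BUILD_PRESET "token_api_only" CACHE STRING "ort-extensions build preset")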
3 changes: 3 additions & 0 deletions tutorials/ortx_api/README.md
@@ -0,0 +1,3 @@
# Running the Demo

To run this demo, you'll need a developer tool such as Visual Studio Code or a command-line setup with CMake support to configure the project. Once configured, compile the C++ project `ortx_api_test` to build the test program, then run it.
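A typical command-line sequence, assuming CMake 3.25+ and a C++17 toolchain on PATH, run from the `tutorials/ortx_api` directory (the binary location depends on the generator):

cmake -S . -B build                   # configure; FetchContent clones ort-extensions
cmake --build build --config Release  # compile ortx_api_test
./build/ortx_api_test                 # single-config generators (Ninja, Makefiles)
# build\Release\ortx_api_test.exe     # multi-config generators (Visual Studio)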
80 changes: 80 additions & 0 deletions tutorials/ortx_api/src/main.cc
@@ -0,0 +1,80 @@
#include <iostream>
#include <string>
#include <vector>

#include "ortx_tokenizer.h"

extError_t tokenize_text(const OrtxTokenizer *tokenizer,
                         const char *text, std::string &decoded_text, std::vector<extTokenId_t> &ids)
{
  OrtxTokenId2DArray *tok_2d_output = NULL;
  const char *tok_input[] = {text};
  extError_t err = OrtxTokenize(tokenizer, tok_input, 1, &tok_2d_output);
  if (err != kOrtxOK)
  {
    return err;
  }

  size_t length = 0;
  const extTokenId_t *token_ids = NULL;
  OrtxTokenId2DArrayGetItem(tok_2d_output, 0, &token_ids, &length);

  OrtxStringArray *detok_output = NULL;
  err = OrtxDetokenize1D(tokenizer, token_ids, length, &detok_output);
  if (err != kOrtxOK)
  {
    ORTX_DISPOSE(tok_2d_output);
    return err;
  }
  ids.insert(ids.end(), token_ids, token_ids + length);

  const char *decoded_str = NULL;
  OrtxStringArrayGetItem(detok_output, 0, &decoded_str);
  decoded_text = decoded_str;

  ORTX_DISPOSE(tok_2d_output);
  ORTX_DISPOSE(detok_output);
  return kOrtxOK;
}

int main()
{
  int ver = OrtxGetAPIVersion();
  std::cout << "Ortx API version: " << ver << std::endl;
  OrtxTokenizer *tokenizer = NULL;

  std::cout << "Please specify the tokenizer model file path (like <root>/test/data/llama2)" << std::endl;
  std::string model_path;
  std::cin >> model_path;

  extError_t err = OrtxCreateTokenizer(&tokenizer, model_path.c_str());
  if (err != kOrtxOK)
  {
    std::cerr << "Failed to create tokenizer" << std::endl;
    return 1;
  }

  const char *input = "How many hours does it take a man to eat a Helicopter?";
  std::string decoded_text;
  std::vector<extTokenId_t> ids;
  err = tokenize_text(tokenizer, input, decoded_text, ids);
  if (err != kOrtxOK)
  {
    std::cerr << "Failed to tokenize text" << std::endl;
    return 1;
  }

  std::cout << "Input : " << input << std::endl;
  // output the token ids
  std::cout << "Token IDs: ";
  for (const auto &id : ids)
  {
    std::cout << id << " ";
  }
  std::cout << std::endl;

  std::cout << "Decoded: " << decoded_text << std::endl;

  OrtxDisposeOnly(tokenizer); // Clean up the tokenizer
  return 0;
}
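The demo releases each object manually with `ORTX_DISPOSE`/`OrtxDisposeOnly`, which gets error-prone once early returns multiply. A small RAII wrapper (a sketch layered on the calls used above, not an API the library ships) makes the cleanup automatic:

#include <memory>

#include "ortx_tokenizer.h"

// Deleter that routes unique_ptr cleanup through the C API.
struct OrtxTokenizerDeleter {
  void operator()(OrtxTokenizer *p) const { OrtxDisposeOnly(p); }
};
using TokenizerPtr = std::unique_ptr<OrtxTokenizer, OrtxTokenizerDeleter>;

// Usage sketch:
//   OrtxTokenizer *raw = NULL;
//   if (OrtxCreateTokenizer(&raw, model_path.c_str()) == kOrtxOK) {
//     TokenizerPtr tokenizer{raw};  // disposed automatically at scope exit
//   }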
