From 58b552388fd10787f1807558c0d4bf607bc95c88 Mon Sep 17 00:00:00 2001
From: Chester Liu <4710575+skyline75489@users.noreply.github.com>
Date: Thu, 20 Jun 2024 08:20:30 +0800
Subject: [PATCH 1/3] Fix several C5038 warnings (#748)

---
 include/ort_c_to_cpp.h        | 2 +-
 shared/api/image_processor.cc | 2 +-
 shared/api/runner.hpp         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ort_c_to_cpp.h b/include/ort_c_to_cpp.h
index 92c2fb01d..7969f080e 100644
--- a/include/ort_c_to_cpp.h
+++ b/include/ort_c_to_cpp.h
@@ -343,8 +343,8 @@ struct BaseKernel {
   OrtErrorCode GetErrorCodeAndRelease(OrtStatusPtr status) const noexcept;

   const OrtApi& api_;
-  OrtW::CustomOpApi ort_;
   const OrtKernelInfo& info_;
+  OrtW::CustomOpApi ort_;
 };

 // Deprecated: Use OrtW::CustomOpApi::KernelInfoGetAttribute instead
diff --git a/shared/api/image_processor.cc b/shared/api/image_processor.cc
index 028015972..9ecedf917 100644
--- a/shared/api/image_processor.cc
+++ b/shared/api/image_processor.cc
@@ -85,7 +85,7 @@ OrtxStatus ImageProcessor::Init(std::string_view processor_def) {
 }

 ImageProcessor::ImageProcessor()
-    : allocator_(&CppAllocator::Instance()), OrtxObjectImpl(kOrtxKindProcessor) {
+    : OrtxObjectImpl(kOrtxKindProcessor), allocator_(&CppAllocator::Instance()) {
 }

 template
diff --git a/shared/api/runner.hpp b/shared/api/runner.hpp
index b3170e0ae..ba5991400 100644
--- a/shared/api/runner.hpp
+++ b/shared/api/runner.hpp
@@ -278,8 +278,8 @@ class OrtxRunner {
   }

  private:
-  std::vector ops_;
   ortc::IAllocator* allocator_;
+  std::vector ops_;
 };

 } // namespace ort_extensions

From cbed8fd5758c277081fc0ff627906d5ee3bbce6d Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Thu, 20 Jun 2024 10:53:49 -0700
Subject: [PATCH 2/3] Add a generic image processor and its C API (#745)

* Add a generic image processor
* add more tests
* Fix the test failures
* Update runner.hpp
---
 .clang-format                                 |   5 +-
 cmake/ext_tests.cmake                         |   4 +-
 cmake/externals/json.cmake                    |   2 +
 include/custom_op/tensor_api.h                |  14 +
 include/ortx_c_helper.h                       |  97 +++++
 include/ortx_processor.h                      |  53 ++-
 include/ortx_utils.h                          |  44 ++-
 shared/api/c_api_processor.cc                 | 103 +++++
 shared/api/c_api_utils.cc                     |  64 ++-
 shared/api/c_api_utils.hpp                    |  79 ++--
 shared/api/image_processor.cc                 | 116 +++++-
 shared/api/image_processor.h                  |  29 +-
 shared/api/image_transforms.hpp               | 374 +++++++++---------
 shared/api/image_transforms_phi_3.hpp         | 209 ++++++++++
 shared/api/runner.hpp                         |  94 +++--
 test/data/processor/clip_image.json           |  59 +++
 ...{image_processor.json => phi_3_image.json} |   0
 test/pp_api_test/test_processor.cc            |  40 +-
 18 files changed, 1064 insertions(+), 322 deletions(-)
 create mode 100644 include/ortx_c_helper.h
 create mode 100644 shared/api/image_transforms_phi_3.hpp
 create mode 100644 test/data/processor/clip_image.json
 rename test/data/processor/{image_processor.json => phi_3_image.json} (100%)

diff --git a/.clang-format b/.clang-format
index 491a83575..747fcdd0a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,10 +1,7 @@
 ---
 # Defaults for all languages.
 BasedOnStyle: Google
-
-# Setting ColumnLimit to 0 so developer choices about where to break lines are maintained.
-# Developers are responsible for adhering to the 120 character maximum.
-ColumnLimit: 0 +ColumnLimit: 120 SortIncludes: false DerivePointerAlignment: false diff --git a/cmake/ext_tests.cmake b/cmake/ext_tests.cmake index 4e39e7bab..c300bc9d1 100644 --- a/cmake/ext_tests.cmake +++ b/cmake/ext_tests.cmake @@ -145,8 +145,8 @@ if (OCOS_ENABLE_C_API) "$" "$") - if (ORTX_TEST_DATA2) - file(TO_NATIVE_PATH "${ORTX_TEST_DATA2}/tests/data2" _TEST_DATA2) + if (ORTX_DATA_PATH) + file(TO_NATIVE_PATH "${ORTX_DATA_PATH}/tests/data2" _TEST_DATA2) add_custom_command(TARGET pp_api_test POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink ${_TEST_DATA2} ${onnxruntime_extensions_BINARY_DIR}/data2) endif() diff --git a/cmake/externals/json.cmake b/cmake/externals/json.cmake index a41676c06..cdf63407a 100644 --- a/cmake/externals/json.cmake +++ b/cmake/externals/json.cmake @@ -8,3 +8,5 @@ FetchContent_GetProperties(nlohmann_json) if(NOT nlohmann_json_POPULATED) FetchContent_Populate(nlohmann_json) endif() + +add_compile_definitions(JSON_HAS_CPP_17=1) diff --git a/include/custom_op/tensor_api.h b/include/custom_op/tensor_api.h index 42f23d5dd..e8b6f9f54 100644 --- a/include/custom_op/tensor_api.h +++ b/include/custom_op/tensor_api.h @@ -174,6 +174,8 @@ class TensorBase : public Arg { virtual int64_t NumberOfElement() const = 0; virtual const void* DataRaw() const = 0; virtual size_t SizeInBytes() const = 0; + + virtual std::byte* AllocateRaw(const std::vector& shape) = 0; }; template @@ -283,6 +285,10 @@ class Tensor : public TensorBase { return static_cast(buffer); } + std::byte* AllocateRaw(const std::vector& shape) override { + return reinterpret_cast(Allocate(shape)); + } + const Span& AsSpan() { if (!storage_) ORTX_CXX_API_THROW("tensor not initialized.", ORT_RUNTIME_EXCEPTION); @@ -448,6 +454,10 @@ class Tensor : public TensorBase { return ss[0].size(); } + std::byte* AllocateRaw(const std::vector& shape) override { + ORTX_CXX_API_THROW("AllocateRaw() not supported for string tensor", ORT_RUNTIME_EXCEPTION); + } + void SetStringOutput(const strings& ss, const std::vector& dims) { storage_->SetStringOutput(ss, dims); } @@ -522,6 +532,10 @@ class Tensor : public TensorBase { return ss[0].size(); } + std::byte* AllocateRaw(const std::vector& shape) override { + ORTX_CXX_API_THROW("AllocateRaw() not supported for string tensor", ORT_RUNTIME_EXCEPTION); + } + void SetStringOutput(const strings& ss, const std::vector& dims) { storage_->SetStringOutput(ss, dims); } diff --git a/include/ortx_c_helper.h b/include/ortx_c_helper.h new file mode 100644 index 000000000..ce001ca31 --- /dev/null +++ b/include/ortx_c_helper.h @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "ortx_utils.h" + +namespace ort_extensions { + +template +class OrtxDeleter { + public: + void operator()(T* p) const { + if (p) { + OrtxDisposeOnly(p); + } + } +}; + +/** + * @brief A smart pointer class that manages the lifetime of an OrtxObject. + * + * This class is derived from std::unique_ptr and provides additional functionality + * specific to OrtxObject. It automatically calls the OrtxDeleter to release the + * owned object when it goes out of scope. + * + * @tparam T The type of the object being managed. + */ +template +class OrtxObjectPtr : public std::unique_ptr> { + public: + /** + * @brief Default constructor. + * + * Constructs an OrtxObjectPtr with a null pointer. + */ + OrtxObjectPtr() : std::unique_ptr>(nullptr) {} + + /** + * @brief Constructor that creates an OrtxObjectPtr from a function call. 
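   *
   * A minimal usage sketch (illustrative only; the processor definition path
   * below is hypothetical and error handling is abbreviated):
   *
   *   ort_extensions::OrtxObjectPtr<OrtxProcessor> proc(OrtxCreateProcessor, "processor.json");
   *   if (proc.Code() != kOrtxOK) {
   *     // consult OrtxGetLastErrorMessage() for details
   *   }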
+ * + * This constructor calls the specified function with the given arguments to + * create an OrtxObject. If the function call succeeds, the created object is + * owned by the OrtxObjectPtr. + * + * @tparam TFn The type of the function pointer or function object. + * @tparam Args The types of the arguments to be passed to the function. + * @param fn The function pointer or function object used to create the OrtxObject. + * @param args The arguments to be passed to the function. + */ + template + OrtxObjectPtr(TFn fn, Args&&... args) { + OrtxObject* proc = nullptr; + err_ = fn(&proc, std::forward(args)...); + if (err_ == kOrtxOK) { + this->reset(static_cast(proc)); + } + } + + /** + * @brief Get the error code associated with the creation of the OrtxObject. + * + * @return The error code. + */ + extError_t Code() const { return err_; } + + private: + extError_t err_ = kOrtxOK; /**< The error code associated with the creation of the OrtxObject. */ +}; + +template +struct PointerAssigner { + OrtxObject* obj_{}; + OrtxObjectPtr& ptr_; + PointerAssigner(OrtxObjectPtr& ptr) : ptr_(ptr){}; + + ~PointerAssigner() { ptr_.reset(static_cast(obj_)); }; + + operator T**() { return reinterpret_cast(&obj_); }; +}; + +/** + * @brief A wrapper function for OrtxObjectPtr that can be used as a function parameter on creation. + * + * This function creates a PointerAssigner object for the given OrtxObjectPtr. The PointerAssigner + * object can be used to assign a pointer value to the OrtxObjectPtr. + * + * @tparam T The type of the object pointed to by the OrtxObjectPtr. + * @param ptr The OrtxObjectPtr to create the PointerAssigner for. + * @return A PointerAssigner object for the given OrtxObjectPtr. + */ +template +PointerAssigner ptr(OrtxObjectPtr& ptr) { + return PointerAssigner{ptr}; +}; + +} // namespace ort_extensions diff --git a/include/ortx_processor.h b/include/ortx_processor.h index d89f16460..6dcc5a84e 100644 --- a/include/ortx_processor.h +++ b/include/ortx_processor.h @@ -9,6 +9,8 @@ // typedefs to create/dispose function flood, and to make the API more C++ friendly with less casting typedef OrtxObject OrtxProcessor; +typedef OrtxObject OrtxRawImages; +typedef OrtxObject OrtxImageProcessorResult; #ifdef __cplusplus extern "C" { @@ -17,11 +19,58 @@ extern "C" { /** \brief Create a processor object with the specified processor definition * * \param processor Pointer to store the created processor object - * \param processor_def The processor definition, either a path to the processor directory or a JSON string, and is utf-8 encoded. - * \return Error code indicating the success or failure of the operation + * \param processor_def The processor definition, either a path to the processor directory or a JSON string, and is + * utf-8 encoded. \return Error code indicating the success or failure of the operation */ extError_t ORTX_API_CALL OrtxCreateProcessor(OrtxProcessor** processor, const char* processor_def); +/** + * @brief Loads a set of images from the specified image paths. + * + * This function loads a set of images from the given image paths and returns a pointer to the loaded images. + * The number of images loaded is also returned through the `num_images_loaded` parameter. + * + * @param[out] images A pointer to a pointer that will be set to the loaded images. + * @param[in] image_paths An array of image paths. + * @param[in] num_images The number of images to load. + * @param[out] num_images_loaded A pointer to a variable that will be set to the number of images loaded. 
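 *
 * A short usage sketch in C (the file names here are hypothetical and error
 * handling is reduced to a single check):
 *
 *   const char* image_paths[] = {"a.jpg", "b.png"};
 *   OrtxRawImages* images = NULL;
 *   size_t loaded = 0;
 *   if (OrtxLoadImages(&images, image_paths, 2, &loaded) != kOrtxOK) {
 *     return;  // consult OrtxGetLastErrorMessage() for the reason
 *   }
 *   // `images` can now be handed to OrtxImagePreProcess().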
+ * + * @return An error code indicating the status of the operation. + */ +extError_t ORTX_API_CALL OrtxLoadImages(OrtxRawImages** images, const char** image_paths, size_t num_images, + size_t* num_images_loaded); + +/** + * @brief Preprocesses the given raw images using the specified processor. + * + * This function applies preprocessing operations on the raw images using the provided processor. + * The result of the preprocessing is stored in the `OrtxImageProcessorResult` object. + * + * @param processor A pointer to the `OrtxProcessor` object used for preprocessing. + * @param images A pointer to the `OrtxRawImages` object containing the raw images to be processed. + * @param result A pointer to the `OrtxImageProcessorResult` object to store the preprocessing result. + * @return An `extError_t` value indicating the success or failure of the preprocessing operation. + */ +extError_t ORTX_API_CALL OrtxImagePreProcess(OrtxProcessor* processor, OrtxRawImages* images, + OrtxImageProcessorResult** result); + +/** + * @brief Retrieves the image processor result at the specified index. + * + * @param result Pointer to the OrtxImageProcessorResult structure to store the result. + * @param index The index of the result to retrieve. + * @return extError_t The error code indicating the success or failure of the operation. + */ +extError_t ORTX_API_CALL OrtxImageGetTensorResult(OrtxImageProcessorResult* result, size_t index, OrtxTensor** tensor); + +/** \brief Clear the outputs of the processor + * + * \param processor The processor object + * \param result The result object to clear + * \return Error code indicating the success or failure of the operation + */ +extError_t ORTX_API_CALL OrtxClearOutputs(OrtxProcessor* processor, OrtxImageProcessorResult* result); + #ifdef __cplusplus } #endif diff --git a/include/ortx_utils.h b/include/ortx_utils.h index 8ee7bf217..e6c0af9aa 100644 --- a/include/ortx_utils.h +++ b/include/ortx_utils.h @@ -5,6 +5,8 @@ #include "ortx_types.h" +const int API_VERSION = 1; + typedef enum { kOrtxKindUnknown = 0, @@ -14,7 +16,10 @@ typedef enum { kOrtxKindTokenId2DArray = 0x778A, kOrtxKindDetokenizerCache = 0x778B, kOrtxKindProcessor = 0x778C, - kOrtxKindProcessorResult = 0x778D, + kOrtxKindRawImages = 0x778D, + kOrtxKindImageProcessorResult = 0x778E, + kOrtxKindProcessorResult = 0x778F, + kOrtxKindTensor = 0x7790, kOrtxKindEnd = 0x7999 } extObjectKind_t; @@ -24,7 +29,7 @@ typedef struct { int ext_kind_; } OrtxObject; -const int API_VERSION = 1; +typedef OrtxObject OrtxTensor; // C, instead of C++ doesn't cast automatically, // so we need to use a macro to cast the object to the correct type @@ -72,6 +77,41 @@ extError_t ORTX_API_CALL OrtxDispose(OrtxObject** object); */ extError_t ORTX_API_CALL OrtxDisposeOnly(OrtxObject* object); +/** \brief Get the data from the tensor + * + * \param tensor The tensor object + * \param data Pointer to store the data + * \param shape Pointer to store the shape + * \param num_dims Pointer to store the number of dimensions + * \return Error code indicating the success or failure of the operation + */ +extError_t ORTX_API_CALL OrtxGetTensorData(OrtxTensor* tensor, const void** data, const int64_t** shape, + size_t* num_dims); +/** + * \brief Get the data from the tensor as int64_t type + * + * \param tensor The tensor object + * \param data Pointer to store the data + * \param shape Pointer to store the shape + * \param num_dims Pointer to store the number of dimensions + * \return Error code indicating the success or failure of the 
operation + */ + +extError_t ORTX_API_CALL OrtxGetTensorDataInt64(OrtxTensor* tensor, const int64_t** data, const int64_t** shape, + size_t* num_dims); + +/** + * \brief Get the data from the tensor as float type + * + * \param tensor The tensor object + * \param data Pointer to store the data + * \param shape Pointer to store the shape + * \param num_dims Pointer to store the number of dimensions + * \return Error code indicating the success or failure of the operation + */ +extError_t ORTX_API_CALL OrtxGetTensorDataFloat(OrtxTensor* tensor, const float** data, const int64_t** shape, + size_t* num_dims); + #ifdef __cplusplus } #endif diff --git a/shared/api/c_api_processor.cc b/shared/api/c_api_processor.cc index 82a45f652..2beb90a13 100644 --- a/shared/api/c_api_processor.cc +++ b/shared/api/c_api_processor.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "ortx_processor.h" #include "image_processor.h" using namespace ort_extensions; @@ -20,3 +21,105 @@ extError_t OrtxCreateProcessor(OrtxProcessor** processor, const char* def) { return status.Code(); } + +struct RawImagesObject : public OrtxObjectImpl { + public: + RawImagesObject() : OrtxObjectImpl(kOrtxKindRawImages) {} + std::unique_ptr images; + size_t num_images; +}; + +extError_t ORTX_API_CALL OrtxLoadImages(OrtxRawImages** images, const char** image_paths, size_t num_images, + size_t* num_images_loaded) { + if (images == nullptr || image_paths == nullptr) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + auto images_obj = std::make_unique(); + auto [img, num] = LoadRawImages(image_paths, image_paths + num_images); + images_obj->images = std::move(img); + images_obj->num_images = num; + if (num_images_loaded != nullptr) { + *num_images_loaded = num; + } + + *images = static_cast(images_obj.release()); + return extError_t(); +} + +extError_t ORTX_API_CALL OrtxImagePreProcess(OrtxProcessor* processor, OrtxRawImages* images, + OrtxImageProcessorResult** result) { + if (processor == nullptr || images == nullptr || result == nullptr) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + auto processor_ptr = static_cast(processor); + ReturnableStatus status(processor_ptr->IsInstanceOf(extObjectKind_t::kOrtxKindProcessor)); + if (!status.IsOk()) { + return status.Code(); + } + + auto images_ptr = static_cast(images); + status = images_ptr->IsInstanceOf(extObjectKind_t::kOrtxKindRawImages); + if (!status.IsOk()) { + return status.Code(); + } + + auto result_ptr = std::make_unique(); + status = + processor_ptr->PreProcess(ort_extensions::span(images_ptr->images.get(), images_ptr->num_images), *result_ptr); + if (status.IsOk()) { + *result = static_cast(result_ptr.release()); + } else { + *result = nullptr; + } + + return {}; +} + +extError_t ORTX_API_CALL OrtxImageGetTensorResult(OrtxImageProcessorResult* result, size_t index, OrtxTensor** tensor) { + if (result == nullptr || tensor == nullptr) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + auto result_ptr = static_cast(result); + ReturnableStatus status(result_ptr->IsInstanceOf(extObjectKind_t::kOrtxKindImageProcessorResult)); + if (!status.IsOk()) { + return status.Code(); + } + + if (index >= result_ptr->results.size()) { + ReturnableStatus::last_error_message_ = "Index out of range"; + return kOrtxErrorInvalidArgument; + } + + auto tensor_ptr 
= std::make_unique>(); + tensor_ptr->SetObject(result_ptr->results[index].get()); + *tensor = static_cast(tensor_ptr.release()); + return extError_t(); +} + +extError_t ORTX_API_CALL OrtxClearOutputs(OrtxProcessor* processor, OrtxImageProcessorResult* result) { + if (processor == nullptr || result == nullptr) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + const auto processor_ptr = static_cast(processor); + ReturnableStatus status(processor_ptr->IsInstanceOf(extObjectKind_t::kOrtxKindProcessor)); + if (!status.IsOk()) { + return status.Code(); + } + + auto result_ptr = static_cast(result); + status = result_ptr->IsInstanceOf(extObjectKind_t::kOrtxKindImageProcessorResult); + if (!status.IsOk()) { + return status.Code(); + } + + ImageProcessor::ClearOutputs(result_ptr); + return extError_t(); +} diff --git a/shared/api/c_api_utils.cc b/shared/api/c_api_utils.cc index c7963a20a..0345fdb23 100644 --- a/shared/api/c_api_utils.cc +++ b/shared/api/c_api_utils.cc @@ -6,6 +6,7 @@ #include "file_sys.h" #include "image_processor.h" #include "tokenizer_impl.h" +#include "ortx_utils.h" using namespace ort_extensions; @@ -13,19 +14,14 @@ thread_local std::string ReturnableStatus::last_error_message_; OrtxStatus OrtxObjectImpl::IsInstanceOf(extObjectKind_t kind) const { if (ext_kind_ != static_cast(kind)) { - return {extError_t::kOrtxErrorInvalidArgument, - "Object is not an instance of the requested type"}; + return {extError_t::kOrtxErrorInvalidArgument, "Object is not an instance of the requested type"}; } return {}; } -int ORTX_API_CALL OrtxGetAPIVersion() { - return API_VERSION; -} +int ORTX_API_CALL OrtxGetAPIVersion() { return API_VERSION; } -const char* OrtxGetLastErrorMessage() { - return ReturnableStatus::last_error_message_.c_str(); -} +const char* OrtxGetLastErrorMessage() { return ReturnableStatus::last_error_message_.c_str(); } extError_t ORTX_API_CALL OrtxCreate(extObjectKind_t kind, OrtxObject** object, ...) { if (object == nullptr) { @@ -50,8 +46,7 @@ extError_t ORTX_API_CALL OrtxCreate(extObjectKind_t kind, OrtxObject** object, . 
return extError_t(); } -extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, - const char* tokenizer_path) { +extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path) { // test if the tokenizer_path is a valid directory if (tokenizer_path == nullptr) { ReturnableStatus::last_error_message_ = "The tokenizer data directory is null"; @@ -86,17 +81,19 @@ extError_t ORTX_API_CALL OrtxDisposeOnly(OrtxObject* object) { } if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindStringArray) { - OrtxObjectFactory::Dispose(object); + OrtxObjectFactory::Dispose(object); } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindTokenId2DArray) { - OrtxObjectFactory::Dispose(object); + OrtxObjectFactory::Dispose(object); } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindDetokenizerCache) { - OrtxObjectFactory::DisposeForward(object); + OrtxObjectFactory::DisposeForward(object); } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindTokenizer) { - OrtxObjectFactory::Dispose(object); + OrtxObjectFactory::Dispose(object); } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindProcessorResult) { - OrtxObjectFactory::Dispose(object); + OrtxObjectFactory::Dispose(object); + } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindImageProcessorResult) { + OrtxObjectFactory::Dispose(object); } else if (Ortx_object->ortx_kind() == extObjectKind_t::kOrtxKindProcessor) { - OrtxObjectFactory::Dispose(object); + OrtxObjectFactory::Dispose(object); } return extError_t(); @@ -115,3 +112,38 @@ extError_t ORTX_API_CALL OrtxDispose(OrtxObject** object) { *object = nullptr; return err; } + +extError_t ORTX_API_CALL OrtxGetTensorData(OrtxTensor* tensor, const void** data, const int64_t** shape, + size_t* num_dims) { + if (tensor == nullptr) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + auto tensor_impl = static_cast*>(tensor); + if (tensor_impl->ortx_kind() != extObjectKind_t::kOrtxKindTensor) { + ReturnableStatus::last_error_message_ = "Invalid argument"; + return kOrtxErrorInvalidArgument; + } + + *data = tensor_impl->GetObject()->DataRaw(); + *shape = tensor_impl->GetObject()->Shape().data(); + *num_dims = tensor_impl->GetObject()->Shape().size(); + return extError_t(); +} + +extError_t ORTX_API_CALL OrtxGetTensorDataInt64(OrtxTensor* tensor, const int64_t** data, const int64_t** shape, + size_t* num_dims) { + const void* data_ptr; + auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims); + *data = reinterpret_cast(data_ptr); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) + return err; +} + +extError_t ORTX_API_CALL OrtxGetTensorDataFloat(OrtxTensor* tensor, const float** data, const int64_t** shape, + size_t* num_dims) { + const void* data_ptr; + auto err = OrtxGetTensorData(tensor, &data_ptr, shape, num_dims); + *data = reinterpret_cast(data_ptr); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast) + return err; +} diff --git a/shared/api/c_api_utils.hpp b/shared/api/c_api_utils.hpp index 99a2b7ba4..d7794b610 100644 --- a/shared/api/c_api_utils.hpp +++ b/shared/api/c_api_utils.hpp @@ -24,6 +24,30 @@ class OrtxObjectImpl : public OrtxObject { } return static_cast(ext_kind_); } + + template + struct Type2Kind { + static const extObjectKind_t value = kOrtxKindUnknown; + }; +}; + +template <> +struct OrtxObjectImpl::Type2Kind { + static const extObjectKind_t value = kOrtxKindTensor; +}; + +template +class OrtxObjectWrapper : 
public OrtxObjectImpl { + public: + OrtxObjectWrapper() : OrtxObjectImpl(OrtxObjectImpl::Type2Kind::value) {} + ~OrtxObjectWrapper() override = default; + + void SetObject(T* t) { stored_object_ = t; } + + [[nodiscard]] T* GetObject() const { return stored_object_; } + + private: + T* stored_object_{}; }; template @@ -39,7 +63,7 @@ class span { const T& operator[](size_t i) const { return data_[i]; } T& operator[](size_t i) { return data_[i]; } - + T* data() const { return data_; } [[nodiscard]] size_t size() const { return size_; } T* begin() const { return data_; } @@ -55,13 +79,9 @@ class TokenId2DArray : public OrtxObjectImpl { TokenId2DArray() : OrtxObjectImpl(extObjectKind_t::kOrtxKindTokenId2DArray) {} ~TokenId2DArray() override = default; - void SetTokenIds(std::vector>&& token_ids) { - token_ids_ = token_ids; - } + void SetTokenIds(std::vector>&& token_ids) { token_ids_ = token_ids; } - [[nodiscard]] const std::vector>& token_ids() const { - return token_ids_; - } + [[nodiscard]] const std::vector>& token_ids() const { return token_ids_; } private: std::vector> token_ids_; @@ -72,13 +92,9 @@ class StringArray : public OrtxObjectImpl { StringArray() : OrtxObjectImpl(extObjectKind_t::kOrtxKindStringArray) {} ~StringArray() override = default; - void SetStrings(std::vector&& strings) { - strings_ = strings; - } + void SetStrings(std::vector&& strings) { strings_ = strings; } - [[nodiscard]] const std::vector& strings() const { - return strings_; - } + [[nodiscard]] const std::vector& strings() const { return strings_; } private: std::vector strings_; @@ -109,10 +125,8 @@ struct ReturnableStatus { template class OrtxObjectFactory { - public: - static std::unique_ptr Create() { - return std::make_unique(); - } + public: + static std::unique_ptr Create() { return std::make_unique(); } static OrtxObject* CreateForward(); static void DisposeForward(OrtxObject* object); @@ -122,42 +136,15 @@ class OrtxObjectFactory { std::unique_ptr ptr(obj_ptr); ptr.reset(); } - }; class DetokenizerCache; // forward definition in tokenizer_impl.cc -class ProcessorResult; // forward definition in image_processor.h - -template -class OrtxDeleter { - public: - void operator()(T* p) const { - if (p) { - OrtxDisposeOnly(p); - } - } -}; - -template -class OrtxObjectPtr : public std::unique_ptr> { - public: - template - OrtxObjectPtr(TFn fn, const char* def) { - OrtxObject* proc = nullptr; - err_ = fn(&proc, def); - if (err_ == kOrtxOK) { - this->reset(static_cast(proc)); - } - } +class ProcessorResult; // forward definition in image_processor.h - int err_ = kOrtxOK; -}; class CppAllocator : public ortc::IAllocator { public: - void* Alloc(size_t size) override { - return std::make_unique(size).release(); - } + void* Alloc(size_t size) override { return std::make_unique(size).release(); } void Free(void* p) override { std::unique_ptr ptr(static_cast(p)); diff --git a/shared/api/image_processor.cc b/shared/api/image_processor.cc index 9ecedf917..1cbab6e10 100644 --- a/shared/api/image_processor.cc +++ b/shared/api/image_processor.cc @@ -9,17 +9,18 @@ #include "image_processor.h" #include "cv2/imgcodecs/imdecode.hpp" #include "image_transforms.hpp" +#include "image_transforms_phi_3.hpp" using namespace ort_extensions; using json = nlohmann::json; namespace ort_extensions { -std::tuple, size_t> -LoadRawImages(const std::initializer_list& image_paths) { - auto raw_images = std::make_unique(image_paths.size()); +template +std::tuple, size_t> LoadRawImages(It begin, It end) { + auto raw_images = std::make_unique(end - 
begin); size_t n = 0; - for (const auto& image_path : image_paths) { - std::ifstream ifs = path(image_path).open(std::ios::binary); + for (auto it = begin; it != end; ++it) { + std::ifstream ifs = path(*it).open(std::ios::binary); if (!ifs.is_open()) { break; } @@ -35,11 +36,23 @@ LoadRawImages(const std::initializer_list& image_paths) { return std::make_tuple(std::move(raw_images), n); } + +std::tuple, size_t> LoadRawImages( + const std::initializer_list& image_paths) { + return LoadRawImages(image_paths.begin(), image_paths.end()); +} + +template std::tuple, size_t> LoadRawImages(char const**, char const**); + } // namespace ort_extensions Operation::KernelRegistry ImageProcessor::kernel_registry_ = { {"DecodeImage", []() { return CreateKernelInstance(image_decoder); }}, - {"ConvertRGB", []() { return CreateKernelInstance(&ConvertToRGB::Compute); }}, + {"Resize", []() { return CreateKernelInstance(&Resize::Compute); }}, + {"Rescale", []() { return CreateKernelInstance(&Rescale::Compute); }}, + {"Normalize", []() { return CreateKernelInstance(&Normalize::Compute); }}, + {"CenterCrop", []() { return CreateKernelInstance(&CenterCrop::Compute); }}, + {"ConvertRGB", []() { return CreateKernelInstance(convert_to_rgb); }}, {"Phi3ImageTransform", []() { return CreateKernelInstance(phi3_hd_transform); }}, }; @@ -89,8 +102,7 @@ ImageProcessor::ImageProcessor() } template -static ortc::Tensor* -StackTensor(const std::vector& arg_lists, int axis, ortc::IAllocator* allocator) { +static ortc::Tensor* StackTensor(const std::vector& arg_lists, int axis, ortc::IAllocator* allocator) { using TT = ortc::Tensor; auto output = std::make_unique(allocator); @@ -124,12 +136,43 @@ StackTensor(const std::vector& arg_lists, int axis, ortc::IAllocator return output.release(); } -std::tuple -ImageProcessor::PreProcess( - ort_extensions::span image_data, - ortc::Tensor** pixel_values, - ortc::Tensor** image_sizes, - ortc::Tensor** num_img_takens) { +static OrtxStatus StackTensors(const std::vector& arg_lists, std::vector& outputs, + ortc::IAllocator* allocator) { + if (arg_lists.empty()) { + return {}; + } + + size_t batch_size = arg_lists.size(); + size_t num_outputs = arg_lists[0].size(); + for (size_t axis = 0; axis < num_outputs; ++axis) { + std::vector ts_ptrs; + ts_ptrs.reserve(arg_lists.size()); + std::vector shape = arg_lists[0][axis]->Shape(); + for (auto& ts : arg_lists) { + if (shape != ts[axis]->Shape()) { + return {kOrtxErrorInvalidArgument, "[StackTensors]: shapes of tensors to stack are not the same."}; + } + ts_ptrs.push_back(ts[axis]); + } + + std::vector output_shape = shape; + output_shape.insert(output_shape.begin(), batch_size); + std::byte* tensor_buf = outputs[axis]->AllocateRaw(output_shape); + for (size_t i = 0; i < batch_size; ++i) { + auto ts = ts_ptrs[i]; + const std::byte* ts_buff = reinterpret_cast(ts->DataRaw()); + auto ts_size = ts->SizeInBytes(); + std::memcpy(tensor_buf + i * ts_size, ts_buff, ts_size); + } + } + + return {}; +} + +std::tuple ImageProcessor::PreProcess(ort_extensions::span image_data, + ortc::Tensor** pixel_values, + ortc::Tensor** image_sizes, + ortc::Tensor** num_img_takens) const { ProcessorResult r; std::vector inputs; inputs.resize(image_data.size()); @@ -163,7 +206,39 @@ ImageProcessor::PreProcess( *image_sizes = r.image_sizes = StackTensor(outputs, 1, allocator_); *num_img_takens = r.num_img_takens = StackTensor(outputs, 2, allocator_); - return {status, r}; + return {status, std::move(r)}; +} + +OrtxStatus ImageProcessor::PreProcess(ort_extensions::span 
image_data, ImageProcessorResult& r) const { + std::vector inputs; + inputs.resize(image_data.size()); + for (size_t i = 0; i < image_data.size(); ++i) { + auto& ts_input = inputs[i]; + ImageRawData& image = image_data[i]; + std::vector shape = {static_cast(image.size())}; + ts_input.push_back(std::make_unique>(shape, image.data()).release()); + } + + std::vector outputs; + std::vector ops(operations_.size()); + std::transform(operations_.begin(), operations_.end(), ops.begin(), [](auto& op) { return op.get(); }); + OrtxRunner runner(allocator_, ops.data(), ops.size()); + auto status = runner.Run(inputs, outputs); + if (!status.IsOk()) { + return status; + } + + // clear the input tensors + for (auto& input : inputs) { + for (auto& ts : input) { + std::unique_ptr(ts).reset(); + } + } + + r.results = operations_.back()->AllocateOutputs(allocator_); + status = StackTensors(outputs, r.results, allocator_); + operations_.back()->ResetTensors(allocator_); + return status; } void ImageProcessor::ClearOutputs(ProcessorResult* r) { @@ -182,3 +257,14 @@ void ImageProcessor::ClearOutputs(ProcessorResult* r) { r->num_img_takens = nullptr; } } + +void ort_extensions::ImageProcessor::ClearOutputs(ImageProcessorResult* r) { + if (r == nullptr) { + return; + } + + for (auto& ts : r->results) { + ts.reset(); + } + r->results.clear(); // clear the vector +} diff --git a/shared/api/image_processor.h b/shared/api/image_processor.h index 5ff208d19..534e811d6 100644 --- a/shared/api/image_processor.h +++ b/shared/api/image_processor.h @@ -15,8 +15,12 @@ namespace ort_extensions { using ImageRawData = std::vector; -std::tuple, size_t> -LoadRawImages(const std::initializer_list& image_paths); + +template +std::tuple, size_t> LoadRawImages(It begin, It end); + +std::tuple, size_t> LoadRawImages( + const std::initializer_list& image_paths); class ProcessorResult : public OrtxObjectImpl { public: @@ -26,6 +30,12 @@ class ProcessorResult : public OrtxObjectImpl { ortc::Tensor* num_img_takens{}; }; +class ImageProcessorResult : public OrtxObjectImpl { + public: + ImageProcessorResult() : OrtxObjectImpl(kOrtxKindImageProcessorResult) {} + std::vector results; +}; + class ImageProcessor : public OrtxObjectImpl { public: ImageProcessor(); @@ -33,14 +43,15 @@ class ImageProcessor : public OrtxObjectImpl { OrtxStatus Init(std::string_view processor_def); - std::tuple - PreProcess( - ort_extensions::span image_data, - ortc::Tensor** pixel_values, - ortc::Tensor** image_sizes, - ortc::Tensor** num_img_takens); + std::tuple PreProcess(ort_extensions::span image_data, + ortc::Tensor** pixel_values, + ortc::Tensor** image_sizes, + ortc::Tensor** num_img_takens) const; + + OrtxStatus PreProcess(ort_extensions::span image_data, ImageProcessorResult& r) const; - void ClearOutputs(ProcessorResult* r); + static void ClearOutputs(ProcessorResult* r); + static void ClearOutputs(ImageProcessorResult* r); static Operation::KernelRegistry kernel_registry_; diff --git a/shared/api/image_transforms.hpp b/shared/api/image_transforms.hpp index 773d70cce..93f9ab120 100644 --- a/shared/api/image_transforms.hpp +++ b/shared/api/image_transforms.hpp @@ -5,232 +5,228 @@ #include "ocos.h" -constexpr int max_crops = 16; -constexpr int num_img_tokens = 144; -constexpr int image_resized_width = 336; -constexpr int image_resized_height = 336; +inline OrtxStatus convert_to_rgb(const ortc::Tensor& input, ortc::Tensor& output) { + auto& dimensions = input.Shape(); + if (dimensions.size() != 3ULL || dimensions[2] != 3) { + return 
{kOrtxErrorInvalidArgument, "[ConvertToRGB]: input is not (H, W, C)"}; + } + + std::uint8_t* p_output_image = output.Allocate(dimensions); + auto* input_data = input.Data(); + auto h = dimensions[0]; + auto w = dimensions[1]; + auto c = dimensions[2]; + + // convert BGR channel layouts to RGB + for (int64_t j = 0; j < h; ++j) { + for (int64_t k = 0; k < w; ++k) { + auto c0_index = j * w * c + k * c; + std::tie(p_output_image[c0_index], p_output_image[c0_index + 1], p_output_image[c0_index + 2]) = + std::make_tuple(input_data[c0_index + 2], input_data[c0_index + 1], input_data[c0_index]); + } + } + + return {}; +} + +struct Resize { + template + OrtxStatus Init(const DictT& attrs) { + for (const auto& [key, value] : attrs) { + if (key == "height") { + height_ = std::get(value); + } else if (key == "width") { + width_ = std::get(value); + } else if (key == "interpolation") { + interpolation_ = std::get(value); + if (interpolation_ != "NEAREST" && interpolation_ != "LINEAR" && interpolation_ != "CUBIC") { + return {kOrtxErrorInvalidArgument, "[Resize]: Invalid interpolation method"}; + } + } else { + return {kOrtxErrorInvalidArgument, "[Resize]: Invalid argument"}; + } + } + return {}; + } + + OrtxStatus Compute(const ortc::Tensor& input, ortc::Tensor& output) { + auto& dimensions = input.Shape(); + if (dimensions.size() != 3ULL) { + return {kOrtxErrorInvalidArgument, "[Resize]: Only raw image formats"}; + } + + auto* input_data = input.Data(); + int h = static_cast(dimensions[0]); + int w = static_cast(dimensions[1]); + int c = static_cast(dimensions[2]); + + cv::Mat image(h, w, CV_8UC3, const_cast(input_data)); + cv::Mat output_image; + cv::InterpolationFlags interp{}; + if (interpolation_ == "NEAREST") { + interp = cv::INTER_NEAREST; + } else if (interpolation_ == "LINEAR") { + interp = cv::INTER_LINEAR; + } else if (interpolation_ == "CUBIC") { + interp = cv::INTER_CUBIC; + } else { + return {kOrtxErrorInvalidArgument, "[Resize]: Invalid interpolation method"}; + } + + cv::resize(image, output_image, {static_cast(width_), static_cast(height_)}, 0.0, 0.0, interp); + + auto* p_output_image = output.Allocate({height_, width_, c}); + std::memcpy(p_output_image, output_image.data, height_ * width_ * c); + + return {}; + } + + private: + int64_t height_{256}; + int64_t width_{256}; + std::string interpolation_{"CUBIC"}; // LINEAR, NEAREST, CUBIC +}; + +struct Rescale { + template + OrtxStatus Init(const DictT& attrs) { + for (const auto& [key, value] : attrs) { + if (key == "scale") { + scale_ = static_cast(std::get(value)); + } else { + return {kOrtxErrorInvalidArgument, "[Rescale]: Invalid argument"}; + } + } -constexpr float OPENAI_CLIP_MEAN[] = {0.48145466f, 0.4578275f, 0.40821073f}; -constexpr float OPENAI_CLIP_STD[] = {0.26862954f, 0.26130258f, 0.27577711f}; + return {}; + } -struct ConvertToRGB { - OrtxStatus Compute(const ortc::Tensor& input, - ortc::Tensor& output) { + OrtxStatus Compute(const ortc::Tensor& input, ortc::Tensor& output) { auto& dimensions = input.Shape(); - if (dimensions.size() != 3ULL || dimensions[2] != 3) { - return {kOrtxErrorInvalidArgument, "[ConvertToRGB]: input is not (H, W, C)"}; + if (dimensions.size() != 3ULL) { // Only raw image formats + return {kOrtxErrorInvalidArgument, "[Rescale]: Only raw image formats"}; } - std::uint8_t* p_output_image = output.Allocate(dimensions); auto* input_data = input.Data(); auto h = dimensions[0]; auto w = dimensions[1]; auto c = dimensions[2]; + auto* p_output_image = output.Allocate({h, w, c}); - // convert BGR channel 
layouts to RGB for (int64_t j = 0; j < h; ++j) { for (int64_t k = 0; k < w; ++k) { auto c0_index = j * w * c + k * c; - std::tie(p_output_image[c0_index], p_output_image[c0_index + 1], p_output_image[c0_index + 2]) = - std::make_tuple(input_data[c0_index + 2], input_data[c0_index + 1], input_data[c0_index]); + for (int64_t l = 0; l < c; ++l) { + p_output_image[c0_index + l] = input_data[c0_index + l] * scale_; + } } } return {}; } -}; -inline cv::Mat padding_336(const cv::Mat& image) { - // def padding_336(b): - // width, height = b.size - // tar = int(np.ceil(height / 336) * 336) - // top_padding = int((tar - height)/2) - // bottom_padding = tar - height - top_padding - // left_padding = 0 - // right_padding = 0 - // b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255]) - - // return b - float height = static_cast(image.rows); - int32_t tar = static_cast(std::ceil(height / image_resized_height) * image_resized_height); - int32_t top_padding = static_cast((tar - height) / 2); - int32_t bottom_padding = tar - image.rows - top_padding; - - cv::Mat output; - cv::copyMakeBorder(image, output, top_padding, bottom_padding, 0, 0, cv::BORDER_CONSTANT, {255, 255, 255}); - return output; -} - -inline cv::Mat hd_transform(const cv::Mat& image, int hd_num) { - // width, height = img.size - auto [width, height] = std::make_tuple(image.cols, image.rows); + private: + float scale_{1.0f / 255.0f}; +}; - // ratio = width / height if width >= height else height / width - float ratio = 1.0f * width; - if (width >= height) { - ratio /= height; - } else { - ratio = 1.0f * height / width; - } +struct Normalize { + template + OrtxStatus Init(const DictT& attrs) { + for (const auto& [key, value] : attrs) { + if (key == "mean") { + auto mean = std::get>(value); + mean_ = {static_cast(mean[0]), static_cast(mean[1]), static_cast(mean[2])}; + } else if (key == "std") { + auto std = std::get>(value); + std_ = {static_cast(std[0]), static_cast(std[1]), static_cast(std[2])}; + } else { + return {kOrtxErrorInvalidArgument, "[Normalize]: Invalid argument"}; + } + } - // scale = 1 - // while scale * np.ceil(scale / ratio) <= hd_num: - // scale += 1 - // scale -= 1 - int scale = 1; - while (scale * std::ceil(scale / ratio) <= hd_num) { - scale += 1; + return {}; } - scale -= 1; - // new_w = int(scale * 336) - // new_h = int(new_w / ratio) - int64_t new_w = scale * image_resized_width; - int64_t new_h = static_cast(new_w / ratio); + OrtxStatus Compute(const ortc::Tensor& input, ortc::Tensor& output) { + auto& dimensions = input.Shape(); + if (dimensions.size() != 3ULL) { + return {kOrtxErrorInvalidArgument, "[Normalize]: Only raw image formats"}; + } - // if width < height: - // new_w, new_h = new_h, new_w - if (width < height) { - std::swap(new_w, new_h); - } + auto* input_data = input.Data(); + auto h = dimensions[0]; + auto w = dimensions[1]; + auto c = dimensions[2]; + auto* p_output_image = output.Allocate({h, w, c}); - // img = torchvision.transforms.functional.resize(img, [new_h, new_w]) - std::vector height_x_width{static_cast(new_h), // H - static_cast(new_w)}; // W + for (int64_t j = 0; j < h; ++j) { + for (int64_t k = 0; k < w; ++k) { + auto c0_index = j * w * c + k * c; + for (int64_t l = 0; l < c; ++l) { + p_output_image[c0_index + l] = (input_data[c0_index + l] - mean_[l]) / std_[l]; + } + } + } - cv::Mat output_image; - cv::resize(image, output_image, - {static_cast(new_w), static_cast(new_h)}, 0.0, 0.0, - cv::INTER_LINEAR); - // img = 
padding_336(img) - return padding_336(output_image); -} + return {}; + } -// Function to calculate 1D index from 3D indices -inline size_t Index3D(size_t i, size_t j, size_t k, size_t dim1, size_t dim2, size_t dim3) { - return i * dim2 * dim3 + j * dim3 + k; -} + private: + std::vector mean_{0.48145466f, 0.4578275f, 0.40821073f}; + std::vector std_{0.26862954f, 0.26130258f, 0.27577711f}; +}; -// Function to permute 3D array stored in 1D array from (X, Y, Z) to (Z, X, Y) -inline void Permute3DArray(const float* array, float* permutedArray, size_t X, size_t Y, size_t Z) { - for (size_t x = 0; x < X; ++x) { - for (size_t y = 0; y < Y; ++y) { - for (size_t z = 0; z < Z; ++z) { - size_t oldIndex = Index3D(x, y, z, X, Y, Z); - size_t newIndex = Index3D(z, x, y, Z, X, Y); - permutedArray[newIndex] = array[oldIndex]; +struct CenterCrop { + template + OrtxStatus Init(const DictT& attrs) { + for (const auto& [key, value] : attrs) { + if (key == "height") { + target_h_ = std::get(value); + } else if (key == "width") { + target_w_ = std::get(value); + } else { + return {kOrtxErrorInvalidArgument, "[CenterCrop]: Invalid attribute " + key}; } } - } -} -inline OrtxStatus phi3_hd_transform(const ortc::Tensor& input, - ortc::Tensor& pixel_values, - ortc::Tensor& image_sizes, - ortc::Tensor& num_img_takens) { - auto& dimensions = input.Shape(); - if (dimensions.size() != 3ULL) { - return {kOrtxErrorInvalidArgument, "[hd_transform]: Only raw image formats"}; + return {}; } - // Normalize the pixel value with mean and var - auto input_data = input.Data(); - int32_t h = static_cast(dimensions[0]); - int32_t w = static_cast(dimensions[1]); - int32_t c = static_cast(dimensions[2]); - std::vector height_x_width{static_cast(h), // H - static_cast(w)}; // W - - cv::Mat rgb_image(height_x_width, CV_8UC3, const_cast(input_data)); - // elems = [HD_transform(im, hd_num = self.num_crops) for im in images] - auto elem = hd_transform(rgb_image, max_crops); - // # tensor transform and normalize - // hd_images = [img_processor(im) for im in elems] - std::tie(w, h) = std::make_tuple(elem.cols, elem.rows); - auto elem_image = elem.data; - auto rgb_image_ptr = std::make_unique(h * w * c); - auto p_pixel_values = rgb_image_ptr.get(); - for (int64_t j = 0; j < h; ++j) { - for (int64_t k = 0; k < w; ++k) { - auto c0_index = j * w * c + k * c; - p_pixel_values[c0_index] = (static_cast(elem_image[c0_index]) / 255.f - OPENAI_CLIP_MEAN[0]) / OPENAI_CLIP_STD[0]; - p_pixel_values[c0_index + 1] = (static_cast(elem_image[c0_index + 1]) / 255.f - OPENAI_CLIP_MEAN[1]) / OPENAI_CLIP_STD[1]; - p_pixel_values[c0_index + 2] = (static_cast(elem_image[c0_index + 2]) / 255.f - OPENAI_CLIP_MEAN[2]) / OPENAI_CLIP_STD[2]; + // # T.CenterCrop(224), + // width, height = self.target_size, self.target_size + // img_h, img_w = img.shape[-2:] + // s_h = torch.div((img_h - height), 2, rounding_mode='trunc') + // s_w = torch.div((img_w - width), 2, rounding_mode='trunc') + // x = img[:, :, s_h:s_h + height, s_w:s_w + width] + + OrtxStatus Compute(const ortc::Tensor& input, ortc::Tensor& output) { + auto& dimensions = input.Shape(); + if (dimensions.size() != 3ULL) { + return {kOrtxErrorInvalidArgument, "[CenterCrop]: Only raw image formats"}; } - } - // Debug code to check the image parity - // auto rgb_image_ptr_debug = std::make_unique(h * w * c); - // Permute3DArray(p_pixel_values, rgb_image_ptr_debug.get(), h, w, c); - - cv::Mat hd_image(h, w, CV_32FC3, p_pixel_values); - // # create global image - // global_image = 
[torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images] - cv::Mat global_image; - cv::resize(hd_image, global_image, {image_resized_height, image_resized_width}, 0.0, 0.0, cv::INTER_CUBIC); - - int64_t shape[2]; - // # [(3, h, w)], where h, w is multiple of 336 - // shapes = [[im.size(1), im.size(2)] for im in hd_images] - { - auto shapes = image_sizes.Allocate({2}); - shapes[0] = shape[0] = hd_image.rows; - shapes[1] = shape[1] = hd_image.cols; - } - // num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes] - { - auto n_tokens = num_img_takens.Allocate({1}); - auto [h_t, w_t] = std::make_tuple(image_sizes.Data()[0], image_sizes.Data()[1]); - auto num_t = (static_cast( - static_cast(h_t / image_resized_height) * w_t / image_resized_width) + - 1) * - 144 + - 1 + static_cast(h_t / image_resized_height + 1) * 12; - *n_tokens = static_cast(num_t); - } - // # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336) - // # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336) - // hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)] - // # concat global image and local image - // hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)] - // # pad to max_num_crops - // image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape] - // image_transformed = torch.stack(image_transformed, dim=0) - // padded_images = image_transformed - std::vector padded_image_shape = {max_crops + 1, 3, image_resized_height, image_resized_width}; - float* output_pixel = pixel_values.Allocate(padded_image_shape); - // Copy the image pixel value from the global image - const int image_c_size = image_resized_height * image_resized_width * 3; - Permute3DArray(reinterpret_cast(global_image.data), output_pixel, image_resized_height, image_resized_width, 3); - auto num_crops = static_cast((shape[0] / image_resized_height) * (shape[1] / image_resized_width)); - float* image_transformed = reinterpret_cast(hd_image.data); - // for (int i = 0; i < num_crops; ++i) { - // Permute3DArray(image_transformed + i * image_c_size, output_pixel + (i + 1) * image_c_size, image_resized_height, image_resized_width, 3); - // } - - float* output_pixel_n_1 = output_pixel + image_c_size; - int m = static_cast(shape[0] / image_resized_height); - int n = static_cast(shape[1] / image_resized_width); - h = image_resized_height; - w = image_resized_width; - assert(m * n == num_crops); - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - int sub_index = (i * n + j) * image_c_size; - for (int x = 0; x < image_resized_height; ++x) { - for (int y = 0; y < image_resized_width; ++y) { - for (int k = 0; k < 3; ++k) { // Loop over channels - output_pixel_n_1[sub_index + k * h * w + x * w + y] = image_transformed[((i * h + x) * shape[1] + (j * w + y)) * 3 + k]; - } + auto* input_data = input.Data(); + auto h = dimensions[0]; + auto w = dimensions[1]; + auto c = dimensions[2]; + + auto* p_output_image = output.Allocate({target_h_, target_w_, c}); + auto s_h = (h - target_h_) / 2; + auto s_w = (w - target_w_) / 2; + + for (int64_t j = 0; j < target_h_; ++j) { + for (int64_t k = 0; k < target_w_; ++k) { + auto c0_index = (j + s_h) * w * c + (k + s_w) * c; + for (int64_t l = 0; 
l < c; ++l) { + p_output_image[j * target_w_ * c + k * c + l] = input_data[c0_index + l]; } } } - } - // padding the rest of the crops - // pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device) - memset(output_pixel_n_1 + num_crops * image_c_size, 0, image_c_size * (max_crops - num_crops) * sizeof(float)); + return {}; + } - // image_sizes = shapes - return {}; -} + private: + int64_t target_h_{224}; + int64_t target_w_{224}; +}; diff --git a/shared/api/image_transforms_phi_3.hpp b/shared/api/image_transforms_phi_3.hpp new file mode 100644 index 000000000..172793ae4 --- /dev/null +++ b/shared/api/image_transforms_phi_3.hpp @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "ocos.h" + +constexpr int max_crops = 16; +constexpr int num_img_tokens = 144; +constexpr int image_resized_width = 336; +constexpr int image_resized_height = 336; + +constexpr float OPENAI_CLIP_MEAN[] = {0.48145466f, 0.4578275f, 0.40821073f}; +constexpr float OPENAI_CLIP_STD[] = {0.26862954f, 0.26130258f, 0.27577711f}; + +inline cv::Mat padding_336(const cv::Mat& image) { + // def padding_336(b): + // width, height = b.size + // tar = int(np.ceil(height / 336) * 336) + // top_padding = int((tar - height)/2) + // bottom_padding = tar - height - top_padding + // left_padding = 0 + // right_padding = 0 + // b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255]) + + // return b + float height = static_cast(image.rows); + int32_t tar = static_cast(std::ceil(height / image_resized_height) * image_resized_height); + int32_t top_padding = static_cast((tar - height) / 2); + int32_t bottom_padding = tar - image.rows - top_padding; + + cv::Mat output; + cv::copyMakeBorder(image, output, top_padding, bottom_padding, 0, 0, cv::BORDER_CONSTANT, {255, 255, 255}); + return output; +} + +inline cv::Mat hd_transform(const cv::Mat& image, int hd_num) { + // width, height = img.size + auto [width, height] = std::make_tuple(image.cols, image.rows); + + // ratio = width / height if width >= height else height / width + float ratio = 1.0f * width; + if (width >= height) { + ratio /= height; + } else { + ratio = 1.0f * height / width; + } + + // scale = 1 + // while scale * np.ceil(scale / ratio) <= hd_num: + // scale += 1 + // scale -= 1 + int scale = 1; + while (scale * std::ceil(scale / ratio) <= hd_num) { + scale += 1; + } + scale -= 1; + + // new_w = int(scale * 336) + // new_h = int(new_w / ratio) + int64_t new_w = scale * image_resized_width; + int64_t new_h = static_cast(new_w / ratio); + + // if width < height: + // new_w, new_h = new_h, new_w + if (width < height) { + std::swap(new_w, new_h); + } + + // img = torchvision.transforms.functional.resize(img, [new_h, new_w]) + std::vector height_x_width{static_cast(new_h), // H + static_cast(new_w)}; // W + + cv::Mat output_image; + cv::resize(image, output_image, + {static_cast(new_w), static_cast(new_h)}, 0.0, 0.0, + cv::INTER_LINEAR); + // img = padding_336(img) + return padding_336(output_image); +} + +// Function to calculate 1D index from 3D indices +inline size_t Index3D(size_t i, size_t j, size_t k, size_t dim1, size_t dim2, size_t dim3) { + return i * dim2 * dim3 + j * dim3 + k; +} + +// Function to permute 3D array stored in 1D array from (X, Y, Z) to (Z, X, Y) +inline void Permute3DArray(const float* array, float* permutedArray, size_t X, size_t Y, size_t Z) { + for (size_t x = 0; x < X; 
++x) { + for (size_t y = 0; y < Y; ++y) { + for (size_t z = 0; z < Z; ++z) { + size_t oldIndex = Index3D(x, y, z, X, Y, Z); + size_t newIndex = Index3D(z, x, y, Z, X, Y); + permutedArray[newIndex] = array[oldIndex]; + } + } + } +} + +inline OrtxStatus phi3_hd_transform(const ortc::Tensor& input, + ortc::Tensor& pixel_values, + ortc::Tensor& image_sizes, + ortc::Tensor& num_img_takens) { + auto& dimensions = input.Shape(); + if (dimensions.size() != 3ULL) { + return {kOrtxErrorInvalidArgument, "[hd_transform]: Only raw image formats"}; + } + + // Normalize the pixel value with mean and var + auto input_data = input.Data(); + int32_t h = static_cast(dimensions[0]); + int32_t w = static_cast(dimensions[1]); + int32_t c = static_cast(dimensions[2]); + std::vector height_x_width{static_cast(h), // H + static_cast(w)}; // W + + cv::Mat rgb_image(height_x_width, CV_8UC3, const_cast(input_data)); + // elems = [HD_transform(im, hd_num = self.num_crops) for im in images] + auto elem = hd_transform(rgb_image, max_crops); + // # tensor transform and normalize + // hd_images = [img_processor(im) for im in elems] + std::tie(w, h) = std::make_tuple(elem.cols, elem.rows); + auto elem_image = elem.data; + auto rgb_image_ptr = std::make_unique(h * w * c); + auto p_pixel_values = rgb_image_ptr.get(); + for (int64_t j = 0; j < h; ++j) { + for (int64_t k = 0; k < w; ++k) { + auto c0_index = j * w * c + k * c; + p_pixel_values[c0_index] = (static_cast(elem_image[c0_index]) / 255.f - OPENAI_CLIP_MEAN[0]) / OPENAI_CLIP_STD[0]; + p_pixel_values[c0_index + 1] = (static_cast(elem_image[c0_index + 1]) / 255.f - OPENAI_CLIP_MEAN[1]) / OPENAI_CLIP_STD[1]; + p_pixel_values[c0_index + 2] = (static_cast(elem_image[c0_index + 2]) / 255.f - OPENAI_CLIP_MEAN[2]) / OPENAI_CLIP_STD[2]; + } + } + + // Debug code to check the image parity + // auto rgb_image_ptr_debug = std::make_unique(h * w * c); + // Permute3DArray(p_pixel_values, rgb_image_ptr_debug.get(), h, w, c); + + cv::Mat hd_image(h, w, CV_32FC3, p_pixel_values); + // # create global image + // global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images] + cv::Mat global_image; + cv::resize(hd_image, global_image, {image_resized_height, image_resized_width}, 0.0, 0.0, cv::INTER_CUBIC); + + int64_t shape[2]; + // # [(3, h, w)], where h, w is multiple of 336 + // shapes = [[im.size(1), im.size(2)] for im in hd_images] + { + auto shapes = image_sizes.Allocate({2}); + shapes[0] = shape[0] = hd_image.rows; + shapes[1] = shape[1] = hd_image.cols; + } + // num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes] + { + auto n_tokens = num_img_takens.Allocate({1}); + auto [h_t, w_t] = std::make_tuple(image_sizes.Data()[0], image_sizes.Data()[1]); + auto num_t = (static_cast( + static_cast(h_t / image_resized_height) * w_t / image_resized_width) + + 1) * + 144 + + 1 + static_cast(h_t / image_resized_height + 1) * 12; + *n_tokens = static_cast(num_t); + } + // # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336) + // # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336) + // hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)] + // # concat global image and local image + // hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)] 
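  // Worked example of the num_img_tokens formula above (illustrative numbers, not taken from the tests):
  // an HD-transformed image with h = 672, w = 1344 gives h//336 = 2 and w//336 = 4,
  // so (2*4 + 1)*144 + 1 + (2 + 1)*12 = 1296 + 1 + 36 = 1333 image tokens.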
+ // # pad to max_num_crops + // image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape] + // image_transformed = torch.stack(image_transformed, dim=0) + // padded_images = image_transformed + std::vector padded_image_shape = {max_crops + 1, 3, image_resized_height, image_resized_width}; + float* output_pixel = pixel_values.Allocate(padded_image_shape); + // Copy the image pixel value from the global image + const int image_c_size = image_resized_height * image_resized_width * 3; + Permute3DArray(reinterpret_cast(global_image.data), output_pixel, image_resized_height, image_resized_width, 3); + auto num_crops = static_cast((shape[0] / image_resized_height) * (shape[1] / image_resized_width)); + float* image_transformed = reinterpret_cast(hd_image.data); + // for (int i = 0; i < num_crops; ++i) { + // Permute3DArray(image_transformed + i * image_c_size, output_pixel + (i + 1) * image_c_size, image_resized_height, image_resized_width, 3); + // } + + float* output_pixel_n_1 = output_pixel + image_c_size; + int m = static_cast(shape[0] / image_resized_height); + int n = static_cast(shape[1] / image_resized_width); + h = image_resized_height; + w = image_resized_width; + assert(m * n == num_crops); + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + int sub_index = (i * n + j) * image_c_size; + for (int x = 0; x < image_resized_height; ++x) { + for (int y = 0; y < image_resized_width; ++y) { + for (int k = 0; k < 3; ++k) { // Loop over channels + output_pixel_n_1[sub_index + k * h * w + x * w + y] = image_transformed[((i * h + x) * shape[1] + (j * w + y)) * 3 + k]; + } + } + } + } + } + + // padding the rest of the crops + // pad = torch.zeros(max_crops - B, 3, H, W, dtype=images.dtype, device=images.device) + memset(output_pixel_n_1 + num_crops * image_c_size, 0, image_c_size * (max_crops - num_crops) * sizeof(float)); + + // image_sizes = shapes + return {}; +} diff --git a/shared/api/runner.hpp b/shared/api/runner.hpp index ba5991400..3590190bb 100644 --- a/shared/api/runner.hpp +++ b/shared/api/runner.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "nlohmann/json.hpp" @@ -16,6 +17,7 @@ namespace ort_extensions { using json = nlohmann::json; +using TensorPtr = std::unique_ptr; using TensorArgs = std::vector; class KernelDef { @@ -26,6 +28,9 @@ class KernelDef { virtual TensorArgs AllocateOutput(ortc::IAllocator* allocator) const = 0; virtual OrtxStatus Apply(TensorArgs& inputs, TensorArgs& output) const = 0; + using AttrType = std::variant>; + using AttrDict = std::unordered_map; + template using tuple_function_args = std::tuple::type*...>; @@ -50,14 +55,14 @@ class KernelDef { } template - static typename std::enable_if::value, ortc::TensorBase*>::type - AllocateTensor(ortc::IAllocator* allocator) { + static typename std::enable_if::value, ortc::TensorBase*>::type AllocateTensor( + ortc::IAllocator* allocator) { return nullptr; } template - static typename std::enable_if::value, ortc::TensorBase*>::type - AllocateTensor(ortc::IAllocator* allocator) { + static typename std::enable_if::value, ortc::TensorBase*>::type AllocateTensor( + ortc::IAllocator* allocator) { return std::make_unique(allocator).release(); } @@ -70,18 +75,17 @@ class KernelDef { static std::vector AllocateOutput(ortc::IAllocator* allocator) { using tuple_no_ref = std::tuple::type...>; auto result = AllocateTuple(allocator, (tuple_no_ref*)0); - return std::apply([](auto&&... 
-    return std::apply([](auto&&... elems) { return std::vector<ortc::TensorBase*>{std::forward(elems)...}; }, std::move(result));
+    return std::apply(
+        [](auto&&... elems) { return std::vector<ortc::TensorBase*>{std::forward(elems)...}; },
+        std::move(result));
   }
 
-  static auto CastOutputAllType(TensorArgs::iterator tensor) {
-    return std::make_tuple();
-  }
+  static auto CastOutputAllType(TensorArgs::iterator tensor) { return std::make_tuple(); }
 
   template <typename T, typename... Args>
   static auto CastOutputAllType(TensorArgs::iterator tensor, T& arg, Args&... args) {
     // return std::make_tuple(static_cast(*tensor), CastOutputAllType(args...));
-    return std::tuple_cat(CastOutputImpl(tensor),
-                          CastOutputAllType(tensor + 1, args...));
+    return std::tuple_cat(CastOutputImpl(tensor), CastOutputAllType(tensor + 1, args...));
   }
 
   template
@@ -115,15 +119,14 @@ class KernelFunction : public KernelDef {
     all_args.insert(all_args.end(), inputs.begin(), inputs.end());
     all_args.insert(all_args.end(), outputs.begin(), outputs.end());
     auto args_tuple = std::tuple_cat(CastTensors(all_args));
-    return std::apply([this](auto&&... args) { return this->Compute(std::forward(*args)...); }, std::move(args_tuple));
+    return std::apply([this](auto&&... args) { return this->Compute(std::forward(*args)...); },
+                      std::move(args_tuple));
   }
 
  private:
  std::function<OrtxStatus(Args...)> body_;
 
-  OrtxStatus Compute(Args... args) const {
-    return body_(std::forward(args)...);
-  }
+  OrtxStatus Compute(Args... args) const { return body_(std::forward(args)...); }
 };
 
 template
@@ -144,10 +147,34 @@ class KernelStruct : public KernelDef {
     return all_args;
   }
 
-  template <typename DT>
-  OrtxStatus Init(DT attr) {
+  OrtxStatus Init(std::string_view attr_str) override {
     instance_ = std::make_unique();
-    return instance_->Init(std::move(attr));
+
+    AttrDict attr_dict;
+    if (attr_str.empty()) {
+      return instance_->Init(attr_dict);
+    }
+
+    auto attr = json::parse(attr_str, nullptr, false);
+    if (attr.is_discarded()) {
+      return {kOrtxErrorCorruptData, "Failed to parse JSON for kernel attributes."};
+    }
+    attr_dict.reserve(attr.size());
+    for (auto& [key, value] : attr.items()) {
+      if (value.is_string()) {
+        attr_dict[key] = value.template get<std::string>();
+      } else if (value.is_number_integer() || value.is_number_unsigned()) {
+        attr_dict[key] = value.template get<int64_t>();
+      } else if (value.is_number_float()) {
+        attr_dict[key] = value.template get<float>();
+      } else if (value.is_array()) {
+        attr_dict[key] = value.template get<std::vector<float>>();
+      } else {
+        return {kOrtxErrorCorruptData, "Invalid attribute type."};
+      }
+    }
+
+    return instance_->Init(attr_dict);
   }
 
   OrtxStatus Apply(TensorArgs& inputs, TensorArgs& outputs) const override {
@@ -156,8 +183,9 @@
     all_args.insert(all_args.end(), inputs.begin(), inputs.end());
     all_args.insert(all_args.end(), outputs.begin(), outputs.end());
     auto args_tuple = std::tuple_cat(CastTensors(all_args));
-    return std::apply([this](auto&&... args) {
-      return (instance_.get()->*body_)(std::forward(*args)...); }, std::move(args_tuple));
+    return std::apply(
+        [this](auto&&... args) { return (instance_.get()->*body_)(std::forward(*args)...); },
+        std::move(args_tuple));
   }
 
  private:
@@ -207,32 +235,36 @@
     op_name_ = op_name;
     kernel_ = kernel_iter->second();
 
+    std::string attr_str;
     if (op_json.contains("attrs")) {
       auto attrs = op_json.at("attrs");
-      auto status = kernel_->Init(attrs.dump());
-      if (!status.IsOk()) {
-        return status;
-      }
+      attr_str = attrs.dump();
     }
 
-    return {};
+    return kernel_->Init(attr_str);
   }
 
-  virtual ~Operation() {
-    ResetTensors(allocator_);
-  }
+  virtual ~Operation() { ResetTensors(allocator_); }
 
-  std::tuple<OrtxStatus, std::vector<ortc::TensorBase*>>
-  Apply(ortc::IAllocator* allocator, std::vector<ortc::TensorBase*> inputs) {
+  std::tuple<OrtxStatus, std::vector<ortc::TensorBase*>> Apply(ortc::IAllocator* allocator,
+                                                               std::vector<ortc::TensorBase*> inputs) {
     auto outputs = kernel_->AllocateOutput(allocator);
     auto status = kernel_->Apply(inputs, outputs);
     return std::make_tuple(status, outputs);
   }
 
-  void ResetTensors(ortc::IAllocator* allocator) {
-    outputs_.clear();
+  std::vector<TensorPtr> AllocateOutputs(ortc::IAllocator* allocator) {
+    auto tensors = kernel_->AllocateOutput(allocator);
+    std::vector<TensorPtr> outputs;
+    for (auto& tensor : tensors) {
+      outputs.push_back(std::unique_ptr<ortc::TensorBase>(tensor));
+    }
+
+    return outputs;
   }
 
+  void ResetTensors(ortc::IAllocator* allocator) { outputs_.clear(); }
+
  private:
   std::vector<std::unique_ptr<ortc::TensorBase>> outputs_;
diff --git a/test/data/processor/clip_image.json b/test/data/processor/clip_image.json
new file mode 100644
index 000000000..6891de8e4
--- /dev/null
+++ b/test/data/processor/clip_image.json
@@ -0,0 +1,59 @@
+{
+  "processor": {
+    "name": "image_processing",
+    "transforms": [
+      {
+        "operation": {
+          "name": "decode_image",
+          "type": "DecodeImage",
+          "attrs": {
+            "color_space": "BGR"
+          }
+        }
+      },
+      {
+        "operation": {
+          "name": "convert_to_rgb",
+          "type": "ConvertRGB"
+        }
+      },
+      {
+        "operation": {
+          "name": "resize",
+          "type": "Resize",
+          "attrs": {
+            "interpolation": "CUBIC",
+            "width": 256,
+            "height": 256
+          }
+        }
+      },
+      {
+        "operation": {
+          "name": "center_crop",
+          "type": "CenterCrop",
+          "attrs": {
+            "width": 224,
+            "height": 224
+          }
+        }
+      },
+      {
+        "operation": {
+          "name": "re-scale",
+          "type": "Rescale"
+        }
+      },
+      {
+        "operation": {
+          "name": "normalize",
+          "type": "Normalize",
+          "attrs": {
+            "mean": [0.485, 0.456, 0.406],
+            "std": [0.229, 0.224, 0.225]
+          }
+        }
+      }
+    ]
+  }
+}
diff --git a/test/data/processor/image_processor.json b/test/data/processor/phi_3_image.json
similarity index 100%
rename from test/data/processor/image_processor.json
rename to test/data/processor/phi_3_image.json
diff --git a/test/pp_api_test/test_processor.cc b/test/pp_api_test/test_processor.cc
index 076c3b9d7..df06e54e8 100644
--- a/test/pp_api_test/test_processor.cc
+++ b/test/pp_api_test/test_processor.cc
@@ -7,6 +7,7 @@
 #include
 
 #include "gtest/gtest.h"
+#include "ortx_c_helper.h"
 #include "shared/api/image_processor.h"
 
 using namespace ort_extensions;
@@ -30,16 +31,13 @@ TEST(ProcessorTest, TestPhi3VImageProcessing) {
   auto [input_data, n_data] = ort_extensions::LoadRawImages(
       {"data/processor/standard_s.jpg", "data/processor/australia.jpg", "data/processor/exceltable.png"});
 
-  auto proc = OrtxObjectPtr(OrtxCreateProcessor, "data/processor/image_processor.json");
+  auto proc = OrtxObjectPtr(OrtxCreateProcessor, "data/processor/phi_3_image.json");
   ortc::Tensor<float>* pixel_values;
   ortc::Tensor<int64_t>* image_sizes;
   ortc::Tensor<int64_t>* num_img_tokens;
-  auto [status, r] = proc->PreProcess(
-      ort_extensions::span(input_data.get(), (size_t)n_data),
-      &pixel_values,
-      &image_sizes,
-      &num_img_tokens);
+  auto [status, r] = proc->PreProcess(ort_extensions::span(input_data.get(), (size_t)n_data), &pixel_values,
+                                      &image_sizes, &num_img_tokens);
   ASSERT_TRUE(status.IsOk());
 
   int64_t expected_image_size[] = {1344, 1344, 1008, 1344, 1008, 1680};
@@ -72,3 +70,33 @@
 
   proc->ClearOutputs(&r);
 }
+
+TEST(ProcessorTest, TestClipImageProcessing) {
+  const char* images_path[] = {"data/processor/standard_s.jpg", "data/processor/australia.jpg",
+                               "data/processor/exceltable.png"};
+  OrtxObjectPtr raw_images;
+  extError_t err = OrtxLoadImages(ort_extensions::ptr(raw_images), images_path, 3, nullptr);
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxObjectPtr processor;
+  err = OrtxCreateProcessor(ort_extensions::ptr(processor), "data/processor/clip_image.json");
+  if (err != kOrtxOK) {
+    std::cout << "Error: " << OrtxGetLastErrorMessage() << std::endl;
+  }
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxObjectPtr result;
+  err = OrtxImagePreProcess(processor.get(), raw_images.get(), ort_extensions::ptr(result));
+  ASSERT_EQ(err, kOrtxOK);
+
+  OrtxObjectPtr tensor;
+  err = OrtxImageGetTensorResult(result.get(), 0, ort_extensions::ptr(tensor));
+  ASSERT_EQ(err, kOrtxOK);
+
+  const float* data{};
+  const int64_t* shape{};
+  size_t num_dims;
+  err = OrtxGetTensorDataFloat(tensor.get(), &data, &shape, &num_dims);
+  ASSERT_EQ(err, kOrtxOK);
+  ASSERT_EQ(num_dims, 4);
+}

From 3b275b16bc7a11fb89f82546d1209a161ab057ca Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Thu, 20 Jun 2024 15:18:17 -0700
Subject: [PATCH 3/3] Upgrade pybind11 2.12 to support both numpy 1.x and 2.x (#750)

---
 cgmanifest.json                | 2 +-
 cmake/externals/pybind11.cmake | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cgmanifest.json b/cgmanifest.json
index df867353f..7eabed5a8 100644
--- a/cgmanifest.json
+++ b/cgmanifest.json
@@ -144,7 +144,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "80dc998efced8ceb2be59756668a7e90e8bef917",
+          "commitHash": "3e9dfa2866941655c56877882565e7577de6fc7b",
           "repositoryUrl": "https://github.com/pybind/pybind11.git"
         },
         "comments": "v2.10.1"
diff --git a/cmake/externals/pybind11.cmake b/cmake/externals/pybind11.cmake
index c6460d06a..e8064bfaf 100644
--- a/cmake/externals/pybind11.cmake
+++ b/cmake/externals/pybind11.cmake
@@ -1,7 +1,7 @@
 FetchContent_Declare(
   pybind11
-  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip
-  URL_HASH SHA1=769b6aa67a77f17a770960f604b727645b6f6a13
+  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.zip
+  URL_HASH SHA1=8482f57ed55c7b100672815a311d5450858723fb
 )
 
 FetchContent_GetProperties(pybind11)
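
Usage note (not part of the patch series): the new C API introduced in patch 2 is driven in the order exercised by TestClipImageProcessing above — load raw images, create a processor from a JSON transform definition, run OrtxImagePreProcess, then read back the resulting tensor. The sketch below is illustrative only; the opaque handle type names given to OrtxObjectPtr (OrtxRawImages, OrtxProcessor, OrtxImageProcessorResult, OrtxTensor) are assumptions standing in for the template arguments lost in the listing above, the file paths are placeholders, and error handling is reduced to early returns.

#include <iostream>

#include "ortx_c_helper.h"
#include "ortx_processor.h"

// Minimal sketch, assuming the handle type names marked below; mirrors TestClipImageProcessing.
int run_clip_pipeline() {
  using namespace ort_extensions;

  // 1. Load the raw image bytes; the path is a placeholder.
  const char* image_paths[] = {"data/processor/standard_s.jpg"};
  OrtxObjectPtr<OrtxRawImages> raw_images;  // assumed handle type
  extError_t err = OrtxLoadImages(ptr(raw_images), image_paths, 1, nullptr);
  if (err != kOrtxOK) {
    std::cerr << "LoadImages: " << OrtxGetLastErrorMessage() << std::endl;
    return 1;
  }

  // 2. Build the processor from the JSON pipeline definition (see clip_image.json above).
  OrtxObjectPtr<OrtxProcessor> processor;  // assumed handle type
  err = OrtxCreateProcessor(ptr(processor), "data/processor/clip_image.json");
  if (err != kOrtxOK) {
    std::cerr << "CreateProcessor: " << OrtxGetLastErrorMessage() << std::endl;
    return 1;
  }

  // 3. Run the transforms and fetch the first output tensor of the result.
  OrtxObjectPtr<OrtxImageProcessorResult> result;  // assumed handle type
  err = OrtxImagePreProcess(processor.get(), raw_images.get(), ptr(result));
  if (err != kOrtxOK) return 1;

  OrtxObjectPtr<OrtxTensor> tensor;  // assumed handle type
  err = OrtxImageGetTensorResult(result.get(), 0, ptr(tensor));
  if (err != kOrtxOK) return 1;

  // 4. Read the float data and shape; the test above expects a 4-D tensor for this pipeline.
  const float* data{};
  const int64_t* shape{};
  size_t num_dims{};
  err = OrtxGetTensorDataFloat(tensor.get(), &data, &shape, &num_dims);
  if (err != kOrtxOK) return 1;

  for (size_t i = 0; i < num_dims; ++i) {
    std::cout << shape[i] << (i + 1 < num_dims ? " x " : "\n");
  }
  return 0;  // each OrtxObjectPtr releases its handle on scope exit
}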