Remove ExecuTorch copy of Vectorized #7042

Open. Wants to merge 25 commits into base: gh/swolchok/121/base

Changes from all commits (25 commits):
9a552c8  Remove ExecuTorch copy of Vectorized (swolchok, Nov 23, 2024)
4af30de  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Nov 23, 2024)
cd2cc4e  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Nov 23, 2024)
6c5f798  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Nov 26, 2024)
ff59c8b  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 2, 2024)
33d922b  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 2, 2024)
3610c35  rebase on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 8, 2024)
aa58719  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 17, 2024)
6124ad5  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 17, 2024)
f0f7a22  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 18, 2024)
3c179ad  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 18, 2024)
a67cfd3  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Dec 20, 2024)
441a925  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 2, 2025)
f3eb465  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 3, 2025)
f8e4d16  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 4, 2025)
7428552  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 6, 2025)
91c258b  Update on "Remove ExecuTorch copy of Vectorized" (Jan 13, 2025)
fa5f813  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
086820e  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
ccafe18  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
4a3726e  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
73ccca7  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
0455b2b  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 15, 2025)
10508a5  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 17, 2025)
f98d55e  Update on "Remove ExecuTorch copy of Vectorized" (swolchok, Jan 24, 2025)
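Every file below applies the same mechanical change: includes and call sites that used ExecuTorch's private copy of Vectorized (executorch::vec) now use ATen's implementation (at::vec), which the CMakeLists.txt change makes visible through find_package_torch_headers() and TORCH_INCLUDE_DIRS. A minimal sketch of the before/after pattern (editor's illustration, not part of the diff; assumes a float buffer and ATen's vectorization headers on the include path):

// Sketch only: the namespace migration these diffs apply.
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

void scale_buffer(float* out, const float* in, int64_t n, float alpha) {
  using Vec = at::vec::Vectorized<float>;  // was executorch::vec::Vectorized<float>
  at::vec::map<float>(                     // was executorch::vec::map<float>
      [alpha](Vec x) { return x * Vec(alpha); },
      out,
      in,
      n);
}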
6 changes: 3 additions & 3 deletions extension/llm/custom_ops/CMakeLists.txt
@@ -54,10 +54,11 @@ else()
endif()

add_library(custom_ops ${_custom_ops__srcs})

find_package_torch_headers()
target_include_directories(custom_ops PUBLIC "${_common_include_directories}")
target_include_directories(
custom_ops PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
${TORCH_INCLUDE_DIRS}
)
target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core)

@@ -68,8 +69,6 @@ target_compile_options(
install(TARGETS custom_ops DESTINATION lib)

if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
# Add a AOT library
find_package(Torch CONFIG REQUIRED)
add_library(
custom_ops_aot_lib SHARED
${_custom_ops__srcs}
@@ -83,6 +82,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
)
target_include_directories(
custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
${TORCH_INCLUDE_DIRS}
)
# TODO: This only works if we install portable_lib.so to
# <site-packages>/executorch/extension/pybindings/.
15 changes: 4 additions & 11 deletions extension/llm/custom_ops/op_sdpa.cpp
@@ -8,9 +8,10 @@

#include <executorch/extension/llm/custom_ops/op_sdpa.h>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/blas/CPUBlas.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
// @lint-ignore CLANGTIDY facebook-unused-include-check
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
@@ -34,18 +35,10 @@ namespace util {
constexpr size_t kKVDim = 4;

template <typename T>
inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
src.store(dst);
}

/*
inline void _store(::Half* dst, at::vec::Vectorized<float> src) {
//fp16_ieee_to_fp32_value
auto res = at::vec::convert_float_half(src, src);
res.store(dst, at::vec::Vectorized<float>::size());
}
*/

template <typename T>
inline T data_index_init(T offset) {
return offset;
@@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
}

} // namespace util
namespace vec = ::executorch::vec;
namespace vec = ::at::vec;
using Tensor = exec_aten::Tensor;

namespace {
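op_sdpa.cpp already refers to the vector types through a local namespace alias, so only the alias target and the includes change; the kernel body is untouched. A condensed sketch of that design choice (editor's illustration using names from the diff, not the full file):

// Sketch: repointing the local alias keeps every vec::... call site stable.
#include <ATen/cpu/vec/vec.h>

namespace vec = ::at::vec;  // previously ::executorch::vec

template <typename T>
inline void _store(T* dst, vec::Vectorized<T> src) {
  src.store(dst);  // Vectorized<T>::store has the same shape in ATen
}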
28 changes: 14 additions & 14 deletions kernels/optimized/cpu/moments_utils.h
@@ -12,7 +12,7 @@
// for use in optimized ExecuTorch ops. Template specializations of BFloat16
// are excluded.

#include <executorch/kernels/optimized/vec/vec.h>
#include <ATen/cpu/vec/vec.h>

#include <executorch/kernels/optimized/utils/math_utils.h>
#include <executorch/runtime/platform/compiler.h>
@@ -47,12 +47,12 @@ void AddMoments(
template <typename T>
ET_INLINE void AddMomentsVec(
int64_t m0_add,
const executorch::vec::Vectorized<T>& m1_add,
const executorch::vec::Vectorized<T>& m2_add,
const at::vec::Vectorized<T>& m1_add,
const at::vec::Vectorized<T>& m2_add,
int64_t& m0,
executorch::vec::Vectorized<T>& m1,
executorch::vec::Vectorized<T>& m2) {
using Vec = executorch::vec::Vectorized<T>;
at::vec::Vectorized<T>& m1,
at::vec::Vectorized<T>& m2) {
using Vec = at::vec::Vectorized<T>;
const int64_t n = m0 + m0_add;
const T c =
n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
@@ -67,11 +67,11 @@ template <typename T>
inline void UpdateMomentsVec(
int64_t m0,
const T* X_ptr,
const std::array<executorch::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
const std::array<at::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
int64_t& m0_stk0,
executorch::vec::Vectorized<acc_t<T>>& m1_stk0,
executorch::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = executorch::vec::Vectorized<acc_t<T>>;
at::vec::Vectorized<acc_t<T>>& m1_stk0,
at::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = at::vec::Vectorized<acc_t<T>>;
Vec m1_vec(0);
Vec m2_vec(0);
for (int64_t j = 0; j < m0; ++j) {
@@ -92,13 +92,13 @@ std::pair<acc_t<T>, acc_t<T>>
RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
using T_ACC = acc_t<T>;

constexpr int64_t kVecSize = executorch::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = executorch::vec::Vectorized<T_ACC>::size();
constexpr int64_t kVecSize = at::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = at::vec::Vectorized<T_ACC>::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
const int64_t depth = executorch::utils::CeilLog2(m);

using Vec = executorch::vec::Vectorized<T_ACC>;
using Vec = at::vec::Vectorized<T_ACC>;
const Vec kZeroVec(T_ACC(0));
std::array<int64_t, kMaxDepth> m0_stk;
std::array<Vec, kMaxDepth> m1_stk;
@@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
template <typename T>
std::pair<acc_t<T>, acc_t<T>>
RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
using Vec = executorch::vec::Vectorized<T>;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t kVecSize = Vec::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
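moments_utils.h keeps its Welford-style accumulation; only the Vectorized type it accumulates into changes. The access pattern it relies on, sketched with ATen's primitives (editor's illustration over a plain float row, not taken from the diff):

// Sketch: process a row in chunks of Vec::size(), accumulate in a
// Vectorized register, then finish the lanes and the tail in scalar code.
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

float row_sum(const float* x, int64_t n) {
  using Vec = at::vec::Vectorized<float>;
  constexpr int64_t kVecSize = Vec::size();
  Vec acc(0.0f);
  int64_t i = 0;
  for (; i + kVecSize <= n; i += kVecSize) {
    acc = acc + Vec::loadu(x + i);  // unaligned vector load
  }
  float lanes[kVecSize];
  acc.store(lanes);  // spill lanes so they can be reduced in scalar code
  float sum = 0.0f;
  for (int64_t j = 0; j < kVecSize; ++j) {
    sum += lanes[j];
  }
  for (; i < n; ++i) {  // scalar tail for the remainder
    sum += x[i];
  }
  return sum;
}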
17 changes: 9 additions & 8 deletions kernels/optimized/cpu/op_add.cpp
@@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
@@ -99,8 +100,8 @@ Tensor& opt_add_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down Expand Up @@ -131,8 +132,8 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -166,7 +167,7 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out(
CTYPE alpha_val;
ET_EXTRACT_SCALAR(alpha, alpha_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_div.cpp
@@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
@@ -76,16 +77,16 @@ Tensor& opt_div_out(
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (a.numel() == 1) {
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
} else {
Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_scalar_casted_vec](Vec x) {
return x * inv_scalar_casted_vec;
},
@@ -111,8 +112,8 @@ Tensor& opt_div_out(
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x / y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
@@ -142,7 +143,7 @@ Tensor& opt_div_out(
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
@@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
Vec inv_b_casted_vec(CTYPE(1) / b_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
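op_div keeps its existing trick for dividing a tensor by a scalar: compute the reciprocal once, then vector-multiply, only now through at::vec. A minimal sketch of that pattern (editor's illustration with a hypothetical function name, not code from the diff):

// Sketch: scalar division as multiplication by a broadcast reciprocal,
// mirroring opt_div_scalar_out above. Illustrative only.
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

void div_by_scalar(float* out, const float* in, int64_t n, float b) {
  using Vec = at::vec::Vectorized<float>;
  Vec inv_b(1.0f / b);  // one scalar division, reused for every element
  at::vec::map<float>(
      [inv_b](Vec x) { return x * inv_b; },
      out,
      in,
      n);
}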
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_exp.cpp
@@ -8,8 +8,9 @@

#include <cmath>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
@@ -34,8 +35,8 @@ void exp_data(
const CTYPE_IN* in_data,
const size_t numel,
CTYPE_OUT* out_data) {
using Vec = executorch::vec::Vectorized<CTYPE_IN>;
executorch::vec::map<CTYPE_IN>(
using Vec = at::vec::Vectorized<CTYPE_IN>;
at::vec::map<CTYPE_IN>(
[](Vec x) { return x.exp(); }, out_data, in_data, numel);
}

11 changes: 6 additions & 5 deletions kernels/optimized/cpu/op_le.cpp
@@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
@@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out(
if (a_type == b_type && a_type == out_type) {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x.le(y); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
@@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out(
CTYPE_B b_val = 0;
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x.le(Vec(b_casted)); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_mul.cpp
@@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
@@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast(
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
const auto broadcast_size = out.size(out.dim() - 1);
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
@@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul(
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
@@ -203,8 +204,8 @@ Tensor& opt_mul_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
@@ -229,8 +230,8 @@ Tensor& opt_mul_out(
"Failed to resize output tensor.");

ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
@@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),