diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
index 27b4ec884..294b95542 100644
--- a/fbgemm_gpu/FbgemmGpu.cmake
+++ b/fbgemm_gpu/FbgemmGpu.cmake
@@ -473,6 +473,7 @@ set(fbgemm_gpu_sources_static_cpu
     src/layout_transform_ops/layout_transform_ops_cpu.cpp
     src/quantize_ops/quantize_ops_cpu.cpp
     src/quantize_ops/quantize_ops_meta.cpp
+    src/sparse_ops/sparse_async_cumsum.cpp
     src/sparse_ops/sparse_ops_cpu.cpp
     src/sparse_ops/sparse_ops_meta.cpp
     src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
diff --git a/fbgemm_gpu/src/sparse_ops/common.h b/fbgemm_gpu/src/sparse_ops/common.h
new file mode 100644
index 000000000..1cdd8ce9e
--- /dev/null
+++ b/fbgemm_gpu/src/sparse_ops/common.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+
+using Tensor = at::Tensor;
+
+namespace fbgemm_gpu {
+
+namespace {
+inline Tensor native_empty_like(const Tensor& self) {
+  return at::native::empty_like(
+      self,
+      c10::optTypeMetaToScalarType(self.options().dtype_opt()),
+      self.options().layout_opt(),
+      self.options().device_opt(),
+      self.options().pinned_memory_opt(),
+      c10::nullopt);
+}
+
+} // namespace
+
+}; // namespace fbgemm_gpu
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp
new file mode 100644
index 000000000..e3f04b58e
--- /dev/null
+++ b/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h"
+#include "fbgemm_gpu/utils/dispatch_macros.h"
+#include "fbgemm_gpu/utils/ops_utils.h"
+#include "fbgemm_gpu/utils/tensor_utils.h"
+
+using Tensor = at::Tensor;
+
+namespace fbgemm_gpu {
+
+// 1D exclusive scan: output[i] = input[i-1] + input[i-2] + ... + input[0]
+// Used as a helper to several functions below.
+template <class T, class U>
+U exclusive_scan_ptrs_cpu(
+    const int64_t N,
+    const T* const input,
+    U* const output) {
+  U cumsum = 0;
+  for (const auto i : c10::irange(N)) {
+    output[i] = cumsum;
+    cumsum += input[i];
+  }
+  return cumsum;
+}
+
+void asynchronous_exclusive_cumsum_cpu_out(Tensor& t_out, const Tensor& t_in) {
+  TENSOR_ON_CPU(t_in);
+  TENSOR_ON_CPU(t_out);
+
+  const auto t_in_contig = t_in.expect_contiguous();
+  at::native::resize_(t_out, t_in_contig->sizes(), c10::nullopt);
+
+  FBGEMM_DISPATCH_ALL_TYPES(
+      t_in_contig->scalar_type(),
+      "asynchronous_exclusive_cumsum_cpu_kernel",
+      [&] {
+        exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
+            t_in_contig->numel(),
+            t_in_contig->data_ptr<scalar_t>(),
+            t_out.data_ptr<scalar_t>());
+      });
+}
+
+Tensor asynchronous_exclusive_cumsum_cpu(const Tensor& t_in) {
+  TENSOR_ON_CPU(t_in);
+
+  const auto t_in_contig = t_in.expect_contiguous();
+  auto output = native_empty_like(*t_in_contig);
+  asynchronous_exclusive_cumsum_cpu_out(output, *t_in_contig);
+  return output;
+}
+
+Tensor asynchronous_inclusive_cumsum_cpu(const Tensor& t_in) {
+  TENSOR_ON_CPU(t_in);
+
+  const auto t_in_contig = t_in.expect_contiguous();
+  auto output = native_empty_like(*t_in_contig);
+  FBGEMM_DISPATCH_ALL_TYPES(
+      t_in_contig->scalar_type(),
+      "asynchronous_inclusive_cumsum_cpu_kernel",
+      [&] {
+        scalar_t cumsum = 0;
+        const auto* input_ptr = t_in_contig->data_ptr<scalar_t>();
+        const auto N = t_in_contig->numel();
+        auto* output_ptr = output.data_ptr<scalar_t>();
+
+        for (const auto i : c10::irange(N)) {
+          cumsum += input_ptr[i];
+          output_ptr[i] = cumsum;
+        }
+      });
+  return output;
+}
+
+Tensor asynchronous_complete_cumsum_cpu_out(Tensor& t_out, const Tensor& t_in) {
+  TENSOR_ON_CPU(t_in);
+  TENSOR_ON_CPU(t_out);
+  const auto num_dims = t_in.dim();
+  TORCH_CHECK(num_dims == 1 || num_dims == 2);
+  const auto t_in_contig = t_in.expect_contiguous();
+  const auto t_out_contig = t_out.expect_contiguous();
+
+  FBGEMM_DISPATCH_ALL_TYPES(
+      t_in_contig->scalar_type(),
+      "asynchronous_complete_cumsum_cpu_kernel",
+      [&] {
+        if (num_dims == 1) {
+          const auto N = t_in_contig->numel();
+          t_out.data_ptr<scalar_t>()[N] = exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
+              N, t_in_contig->data_ptr<scalar_t>(), t_out.data_ptr<scalar_t>());
+        } else {
+          const auto num_vecs = t_in_contig->size(0);
+          const auto N = t_in_contig->size(1);
+          at::parallel_for(0, num_vecs, 1, [&](int64_t start, int64_t end) {
+            for (const auto i : c10::irange(start, end)) {
+              scalar_t* out_ptr = t_out.data_ptr<scalar_t>() + i * (N + 1);
+              out_ptr[N] = exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
+                  N, t_in_contig->data_ptr<scalar_t>() + i * N, out_ptr);
+            }
+          });
+        }
+      });
+  return t_out;
+}
+
+Tensor asynchronous_complete_cumsum_cpu(const Tensor& t_in) {
+  const auto num_dims = t_in.dim();
+  TORCH_CHECK(num_dims == 1 || num_dims == 2);
+  auto output = num_dims == 1
+      ? at::empty({t_in.numel() + 1}, t_in.options())
+      : at::empty({t_in.size(0), t_in.size(1) + 1}, t_in.options());
+
+  return asynchronous_complete_cumsum_cpu_out(output, t_in);
+}
+
+} // namespace fbgemm_gpu
+
+TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
+  m.def(
+      "asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor",
+      {PT2_COMPLIANT_TAG});
+  m.def(
+      "asynchronous_inclusive_cumsum(Tensor t_in) -> Tensor",
+      {PT2_COMPLIANT_TAG});
+  m.def(
+      "asynchronous_complete_cumsum(Tensor t_in) -> Tensor",
+      {PT2_COMPLIANT_TAG});
+}
+
+TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
+  DISPATCH_TO_CPU(
+      "asynchronous_exclusive_cumsum",
+      fbgemm_gpu::asynchronous_exclusive_cumsum_cpu);
+  DISPATCH_TO_CPU(
+      "asynchronous_inclusive_cumsum",
+      fbgemm_gpu::asynchronous_inclusive_cumsum_cpu);
+  DISPATCH_TO_CPU(
+      "asynchronous_complete_cumsum",
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu);
+}
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
index a80eea05e..7734cc69a 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 
+#include "common.h"
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 #include "fbgemm_gpu/utils/ops_utils.h"
@@ -128,16 +129,6 @@ Tensor pack_segments_autograd(
   return PackSegments::apply(t_in, lengths, max_length)[0];
 }
 
-Tensor native_empty_like(const Tensor& self) {
-  return at::native::empty_like(
-      self,
-      c10::optTypeMetaToScalarType(self.options().dtype_opt()),
-      self.options().layout_opt(),
-      self.options().device_opt(),
-      self.options().pinned_memory_opt(),
-      c10::nullopt);
-}
-
 template <typename T>
 void prefix_sum(const int length, const T* const array, T* const presum) {
   presum[0] = 0;
@@ -1317,115 +1308,6 @@ bucketize_sparse_features_cpu(
   return {new_lengths, new_indices, new_weights, new_pos};
 }
 
-// 1D exclusive scan: output[i] = input[i-1] + input[i-2] + ... + input[0]
-// Used as a helper to several functions below.
-template <class T, class U>
-U exclusive_scan_ptrs_cpu(
-    const int64_t N,
-    const T* const input,
-    U* const output) {
-  U cumsum = 0;
-  for (const auto i : c10::irange(N)) {
-    output[i] = cumsum;
-    cumsum += input[i];
-  }
-  return cumsum;
-}
-
-void asynchronous_exclusive_cumsum_cpu_out(
-    at::Tensor& t_out,
-    const Tensor& t_in) {
-  TENSOR_ON_CPU(t_in);
-  TENSOR_ON_CPU(t_out);
-
-  const auto t_in_contig = t_in.expect_contiguous();
-  at::native::resize_(t_out, t_in_contig->sizes(), c10::nullopt);
-
-  FBGEMM_DISPATCH_ALL_TYPES(
-      t_in_contig->scalar_type(),
-      "asynchronous_exclusive_cumsum_cpu_kernel",
-      [&] {
-        exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
-            t_in_contig->numel(),
-            t_in_contig->data_ptr<scalar_t>(),
-            t_out.data_ptr<scalar_t>());
-      });
-}
-
-Tensor asynchronous_exclusive_cumsum_cpu(const Tensor& t_in) {
-  TENSOR_ON_CPU(t_in);
-
-  const auto t_in_contig = t_in.expect_contiguous();
-  auto output = native_empty_like(*t_in_contig);
-  asynchronous_exclusive_cumsum_cpu_out(output, *t_in_contig);
-  return output;
-}
-
-Tensor asynchronous_inclusive_cumsum_cpu(const Tensor& t_in) {
-  TENSOR_ON_CPU(t_in);
-
-  const auto t_in_contig = t_in.expect_contiguous();
-  auto output = native_empty_like(*t_in_contig);
-  FBGEMM_DISPATCH_ALL_TYPES(
-      t_in_contig->scalar_type(),
-      "asynchronous_inclusive_cumsum_cpu_kernel",
-      [&] {
-        scalar_t cumsum = 0;
-        const auto* input_ptr = t_in_contig->data_ptr<scalar_t>();
-        const auto N = t_in_contig->numel();
-        auto* output_ptr = output.data_ptr<scalar_t>();
-
-        for (const auto i : c10::irange(N)) {
-          cumsum += input_ptr[i];
-          output_ptr[i] = cumsum;
-        }
-      });
-  return output;
-}
-
-at::Tensor asynchronous_complete_cumsum_cpu_out(
-    at::Tensor& t_out,
-    const at::Tensor& t_in) {
-  TENSOR_ON_CPU(t_in);
-  TENSOR_ON_CPU(t_out);
-  const auto num_dims = t_in.dim();
-  TORCH_CHECK(num_dims == 1 || num_dims == 2);
-  const auto t_in_contig = t_in.expect_contiguous();
-  const auto t_out_contig = t_out.expect_contiguous();
-
-  FBGEMM_DISPATCH_ALL_TYPES(
-      t_in_contig->scalar_type(),
-      "asynchronous_complete_cumsum_cpu_kernel",
-      [&] {
-        if (num_dims == 1) {
-          const auto N = t_in_contig->numel();
-          t_out.data_ptr<scalar_t>()[N] = exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
-              N, t_in_contig->data_ptr<scalar_t>(), t_out.data_ptr<scalar_t>());
-        } else {
-          const auto num_vecs = t_in_contig->size(0);
-          const auto N = t_in_contig->size(1);
-          at::parallel_for(0, num_vecs, 1, [&](int64_t start, int64_t end) {
-            for (const auto i : c10::irange(start, end)) {
-              scalar_t* out_ptr = t_out.data_ptr<scalar_t>() + i * (N + 1);
-              out_ptr[N] = exclusive_scan_ptrs_cpu<scalar_t, scalar_t>(
-                  N, t_in_contig->data_ptr<scalar_t>() + i * N, out_ptr);
-            }
-          });
-        }
-      });
-  return t_out;
-}
-
-Tensor asynchronous_complete_cumsum_cpu(const Tensor& t_in) {
-  const auto num_dims = t_in.dim();
-  TORCH_CHECK(num_dims == 1 || num_dims == 2);
-  auto output = num_dims == 1
-      ? at::empty({t_in.numel() + 1}, t_in.options())
-      : at::empty({t_in.size(0), t_in.size(1) + 1}, t_in.options());
-
-  return asynchronous_complete_cumsum_cpu_out(output, t_in);
-}
-
 template <typename T>
 void reorder_batched_ad_lengths_(
     const Tensor& cat_ad_lengths,
@@ -3100,15 +2982,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       "block_bucketize_sparse_features_inference(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None, bool return_bucket_mapping=False, bool keep_orig_idx=False) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?, Tensor?)");
   m.def(
       "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)");
-  m.def(
-      "asynchronous_exclusive_cumsum(Tensor t_in) -> Tensor",
-      {PT2_COMPLIANT_TAG});
-  m.def(
-      "asynchronous_inclusive_cumsum(Tensor t_in) -> Tensor",
-      {PT2_COMPLIANT_TAG});
-  m.def(
-      "asynchronous_complete_cumsum(Tensor t_in) -> Tensor",
-      {PT2_COMPLIANT_TAG});
   m.def(
       "reorder_batched_sequence_embeddings(Tensor cat_sequence_embeddings_offsets, Tensor cat_sequence_embeddings, Tensor reordered_cat_sequence_embeddings_offsets, Tensor batch_offsets, SymInt num_items_in_batch) -> Tensor");
   m.def(
@@ -3214,15 +3087,6 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
       fbgemm_gpu::block_bucketize_sparse_features_inference_cpu);
   DISPATCH_TO_CPU(
       "bucketize_sparse_features", fbgemm_gpu::bucketize_sparse_features_cpu);
-  DISPATCH_TO_CPU(
-      "asynchronous_exclusive_cumsum",
-      fbgemm_gpu::asynchronous_exclusive_cumsum_cpu);
-  DISPATCH_TO_CPU(
-      "asynchronous_inclusive_cumsum",
-      fbgemm_gpu::asynchronous_inclusive_cumsum_cpu);
-  DISPATCH_TO_CPU(
-      "asynchronous_complete_cumsum",
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu);
   DISPATCH_TO_CPU(
       "reorder_batched_ad_lengths", fbgemm_gpu::reorder_batched_ad_lengths_cpu);
   DISPATCH_TO_CPU(
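
Illustrative usage (not part of the patch): a minimal standalone C++ sketch of the three CPU cumsum entry points this change moves into sparse_async_cumsum.cpp. It assumes that fbgemm_gpu/sparse_ops.h declares the *_cumsum_cpu functions (as the new file's includes suggest) and that the program links against the fbgemm_gpu library; from Python the same operators are reachable as torch.ops.fbgemm.asynchronous_{exclusive,inclusive,complete}_cumsum once the extension is loaded.

// cumsum_example.cpp -- hypothetical example, not part of the FBGEMM sources.
#include <iostream>

#include <ATen/ATen.h>

#include "fbgemm_gpu/sparse_ops.h"

int main() {
  // A small lengths tensor, as produced by jagged/sparse layouts.
  const auto lengths = at::tensor({2, 3, 4}, at::kInt);

  // Exclusive scan: [0, 2, 5] -- the starting offset of each segment.
  const auto exclusive = fbgemm_gpu::asynchronous_exclusive_cumsum_cpu(lengths);

  // Inclusive scan: [2, 5, 9] -- running totals including the current element.
  const auto inclusive = fbgemm_gpu::asynchronous_inclusive_cumsum_cpu(lengths);

  // Complete scan: [0, 2, 5, 9] -- exclusive scan plus the grand total,
  // so the output is one element longer than the input.
  const auto complete = fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths);

  std::cout << exclusive << "\n" << inclusive << "\n" << complete << std::endl;
  return 0;
}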