2024-10-04 nightly release (a0966e8)
pytorchbot committed Oct 4, 2024
1 parent 2127673 commit d1fa47f
Showing 7 changed files with 568 additions and 473 deletions.
1 change: 1 addition & 0 deletions fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache.cu
@@ -1437,6 +1437,7 @@ __global__ void dequantize_fp8_cache_kernel(
auto MAX_T = cache_K.size(1);
auto D_H = cache_K_dq.size(3);
auto D_H_q = cache_K.size(3);
// TODO: support D_H < 128 for small models used in testing.
CUDA_KERNEL_ASSERT(D_H == 128);

auto b = blockIdx.x;
@@ -0,0 +1,44 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <cstdlib>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <tuple>
#include <unordered_map>

#include <ATen/ATen.h>
#include <c10/hip/HIPStream.h>
#include <torch/torch.h>

#if defined(USE_ROCM)

#include "ck/ck.hpp"
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/flush_icache.hpp"

namespace fbgemm_gpu {

void flush_icache_ck()
{
hipDeviceProp_t deviceProps;
hip_check_error(hipGetDeviceProperties(&deviceProps, 0));
int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;

auto stream = at::cuda::getCurrentHIPStream().stream();

ck::flush_icache<<<dim3(gpu_block3), dim3(64), 0, stream>>>();
hip_check_error(hipGetLastError());
}

} // namespace fbgemm_gpu

#endif // defined(USE_ROCM)
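The new helper only flushes the GPU instruction cache; below is a minimal sketch of how it might be used when benchmarking ROCm kernels. The benchmark loop and launch_kernel_under_test() are hypothetical, and the sketch assumes it is compiled into the same ROCm build that defines flush_icache_ck().

#ifdef USE_ROCM
namespace fbgemm_gpu {
void flush_icache_ck(); // forward declaration of the helper defined above
} // namespace fbgemm_gpu

// Hypothetical benchmark loop: flush the instruction cache before each timed
// launch so every iteration starts from a cold icache rather than reusing
// code cached by the previous iteration.
void benchmark_with_cold_icache(int iterations) {
  for (int i = 0; i < iterations; ++i) {
    fbgemm_gpu::flush_icache_ck();
    // launch_kernel_under_test(); // hypothetical kernel being measured
  }
}
#endif // USE_ROCM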
10 changes: 10 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp
@@ -27,6 +27,11 @@

namespace fbgemm_gpu {

#ifdef USE_ROCM
// flush icache
void flush_icache_ck();
#endif

// SmoothQuant kernels
at::Tensor
i8i8bf16(at::Tensor XQ, at::Tensor WQ, double scale, int64_t split_k);
@@ -185,6 +190,11 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.impl(
"quantize_fp8_per_tensor_fixed_scale",
quantize_fp8_per_tensor_fixed_scale);

#ifdef USE_ROCM
m.def("flush_icache_hip() -> ()");
m.impl("flush_icache_hip", flush_icache_ck);
#endif
}

TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
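A hedged sketch of invoking the newly registered op from C++ through the PyTorch dispatcher; only the schema name fbgemm::flush_icache_hip comes from the registration above, and the surrounding function is illustrative.

#include <ATen/core/dispatch/Dispatcher.h>

void flush_icache_via_dispatcher() {
  // Look up the op registered by m.def("flush_icache_hip() -> ()") and call it.
  static auto op = at::Dispatcher::singleton()
                       .findSchemaOrThrow("fbgemm::flush_icache_hip", "")
                       .typed<void()>();
  op.call();
}

On a ROCm build this would presumably also be reachable from Python as torch.ops.fbgemm.flush_icache_hip().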
2 changes: 1 addition & 1 deletion fbgemm_gpu/include/fbgemm_gpu/config/feature_gates.h
@@ -11,7 +11,7 @@
#include <string>

#ifdef FBGEMM_FBCODE
#include "fbgemm_gpu/config/feature_gates_fb.h"
#include "deeplearning/fbgemm/fbgemm_gpu/fb/include/fbgemm_gpu/config/feature_gates_fb.h"
#endif

/// @defgroup fbgemm-gpu-config FBGEMM_GPU Configuration
41 changes: 41 additions & 0 deletions fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h
@@ -9,8 +9,11 @@
#pragma once

#include <ATen/ATen.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <c10/core/SymInt.h>
#include <c10/core/SymIntArrayRef.h>
#include <torch/csrc/autograd/custom_function.h>

#include <cstdint>

namespace fbgemm_gpu {
@@ -924,6 +927,44 @@ at::Tensor index_add_with_unique_indices_cuda(
const int consecutive_range_start,
const int consecutive_range_length);

torch::autograd::variable_list group_index_select_dim0_decomposed(
at::TensorList input_group,
at::TensorList indices_group);

torch::autograd::variable_list group_index_select_dim0_autograd_impl(
at::TensorList all_indices_input,
const int64_t group_size);

torch::autograd::variable_list group_index_select_dim0(
at::TensorList input_group,
at::TensorList indices_group);

torch::autograd::variable_list group_index_select_dim0_forward_impl_cpu(
at::TensorList all_indices_input,
const int64_t group_size);

torch::autograd::variable_list group_index_select_dim0_backward_impl_cpu(
at::TensorList all_inputs,
c10::SymIntArrayRef output_shape_group_ref);

std::pair<std::vector<at::Tensor>, std::vector<at::Tensor>>
group_index_select_dim0_unpack(
at::TensorList all_indices_input,
const int64_t group_size);

class GroupIndexSelectDim0Op
: public torch::autograd::Function<GroupIndexSelectDim0Op> {
public:
static torch::autograd::variable_list forward(
torch::autograd::AutogradContext* ctx,
at::TensorList all_indices_input,
const int64_t group_size);

static torch::autograd::variable_list backward(
torch::autograd::AutogradContext* ctx,
torch::autograd::variable_list grad_output_group);
};

///@ingroup sparse-data-cuda
void group_index_select_or_add_cuda(
const int64_t* input_ptrs,
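The declarations above imply a packed calling convention for the autograd path: a single TensorList of length 2 * group_size holding the indices tensors first and the input tensors second, which group_index_select_dim0_unpack later splits apart. A minimal sketch of that packing (the helper name pack_group is illustrative, not part of this diff):

#include <vector>
#include <ATen/ATen.h>

std::vector<at::Tensor> pack_group(
    at::TensorList input_group,
    at::TensorList indices_group) {
  std::vector<at::Tensor> packed;
  packed.reserve(indices_group.size() + input_group.size());
  // Indices first, then inputs:
  // [indices_0, ..., indices_{g-1}, input_0, ..., input_{g-1}]
  packed.insert(packed.end(), indices_group.begin(), indices_group.end());
  packed.insert(packed.end(), input_group.begin(), input_group.end());
  return packed;
}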
176 changes: 155 additions & 21 deletions fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -2851,19 +2851,84 @@ Tensor pack_segments_cpu(
const int64_t max_length) {
return pack_segments_forward_cpu(t_in, lengths, max_length);
}
namespace {
Tensor index_select_dim0(
const Tensor& input,
const Tensor& indices,
std::optional<int64_t> /*consecutive_range_start*/,
std::optional<int64_t> /*consecutive_range_length*/,
std::optional<bool> /*skip_indices_sorting_fwd*/) {
return at::index_select(input, 0, indices);

torch::autograd::variable_list group_index_select_dim0_autograd_impl(
at::TensorList all_indices_input,
const int64_t group_size) {
return GroupIndexSelectDim0Op::apply(all_indices_input, group_size);
}

std::pair<std::vector<Tensor>, std::vector<Tensor>>
group_index_select_dim0_unpack(
at::TensorList all_indices_input,
const int64_t group_size) {
std::vector<Tensor> indices_group;
std::vector<Tensor> input_group;

indices_group.reserve(group_size);
input_group.reserve(group_size);

for (const auto i : c10::irange(group_size)) {
indices_group.push_back(all_indices_input[i]);
input_group.push_back(all_indices_input[group_size + i]);
}

TORCH_CHECK(group_size == static_cast<int64_t>(indices_group.size()));

return std::make_pair(input_group, indices_group);
}

torch::autograd::variable_list group_index_select_dim0(
at::TensorList input_group,
at::TensorList indices_group) {
const auto group_size = indices_group.size();
std::vector<Tensor> output_group;

if (group_size == 0) {
return std::vector<Tensor>();
}

// Pack input_group and indices_group into TensorList
std::vector<Tensor> all_indices_input_vec;
all_indices_input_vec.reserve(group_size * 2);

for (const Tensor& index : indices_group) {
all_indices_input_vec.push_back(index);
}
for (const Tensor& input : input_group) {
all_indices_input_vec.push_back(input);
}

at::TensorList all_indices_input_tensor = all_indices_input_vec;

static auto forward_op =
at::Dispatcher::singleton()
.findSchemaOrThrow("fbgemm::group_index_select_dim0_gpu_impl", "")
.typed<decltype(group_index_select_dim0_autograd_impl)>();
auto res = forward_op.call(all_indices_input_tensor, group_size);
TORCH_CHECK(res.size() == group_size + 2);
// only return the outputs (the first group_size elements)
res.resize(group_size);
return res;
}

torch::autograd::variable_list group_index_select_dim0_forward_impl_cpu(
at::TensorList all_indices_input,
const int64_t group_size) {
throw std::runtime_error(
"group_index_select_dim0_forward_impl is not implemented for CPU");
}

torch::autograd::variable_list group_index_select_dim0_backward_impl_cpu(
at::TensorList all_inputs,
c10::SymIntArrayRef output_shape_group_ref) {
throw std::runtime_error(
"group_index_select_dim0_backward_impl is not implemented for CPU");
}

torch::autograd::variable_list group_index_select_dim0_decomposed(
at::TensorList input_group,
at::TensorList indices_group) {
int num_groups = input_group.size();
TORCH_CHECK(num_groups == (int)indices_group.size())
std::vector<Tensor> output_group;
@@ -2874,18 +2939,83 @@ torch::autograd::variable_list group_index_select_dim0(
return output_group;
}

torch::autograd::variable_list group_index_select_dim0_gpu_impl_cpu(
torch::autograd::variable_list GroupIndexSelectDim0Op::forward(
torch::autograd::AutogradContext* ctx,
at::TensorList all_indices_input,
const int64_t group_size) {
throw std::runtime_error(
"group_index_select_dim0_gpu_impl is not implemented for CPU");
at::AutoDispatchBelowADInplaceOrView guard;
static auto forward_op =
at::Dispatcher::singleton()
.findSchemaOrThrow("fbgemm::group_index_select_dim0_gpu_impl", "")
.typed<decltype(group_index_select_dim0_forward_impl_cpu)>();
auto result = forward_op.call(all_indices_input, group_size);
TORCH_CHECK(static_cast<int64_t>(result.size()) == group_size + 2);

auto [input_group, indices_group] =
group_index_select_dim0_unpack(all_indices_input, group_size);
const auto input_dim = input_group[0].dim();
std::vector<c10::SymInt> input_shape_group;
input_shape_group.reserve(group_size * input_dim);

for (const auto i : c10::irange(group_size)) {
const auto& input = input_group[i];
// Copy input shape
auto input_shape = input.sym_sizes().vec();
input_shape_group.insert(
input_shape_group.end(), input_shape.begin(), input_shape.end());
}

// save indices, args_tensor, saved_data
auto saved_tensors = std::vector<at::Tensor>(indices_group);
saved_tensors.insert(
saved_tensors.end(), result.cbegin() + group_size, result.cend());
saved_tensors.push_back(input_group[0]);
ctx->save_for_backward(saved_tensors);
ctx->saved_data["input_shape_group"] = input_shape_group;

return result;
}

torch::autograd::variable_list group_index_select_dim0_gpu_backward_cpu(
at::TensorList all_inputs,
c10::SymIntArrayRef output_shape_group_ref) {
throw std::runtime_error(
"group_index_select_dim0_gpu_backward is not implemented for CPU");
torch::autograd::variable_list GroupIndexSelectDim0Op::backward(
torch::autograd::AutogradContext* ctx,
torch::autograd::variable_list grad_output_group) {
TORCH_CHECK(grad_output_group.size() >= 2);
if (grad_output_group.size() == 2) {
// empty outputs
return torch::autograd::variable_list(1);
}
// remove redundant grads
auto group_size = grad_output_group.size() - 2;
grad_output_group.resize(group_size);

auto saved_tensors = ctx->get_saved_variables();
TORCH_CHECK(saved_tensors.size() == group_size + 3);
auto output_shape_group =
ctx->saved_data["input_shape_group"].toSymIntVector();
grad_output_group.insert(
grad_output_group.end(), saved_tensors.begin(), saved_tensors.end());
static auto backward_op =
at::Dispatcher::singleton()
.findSchemaOrThrow("fbgemm::group_index_select_dim0_gpu_backward", "")
.typed<decltype(group_index_select_dim0_backward_impl_cpu)>();
auto res = backward_op.call(grad_output_group, output_shape_group);
// 1) Add group_size Variable()'s for indices
// Replace all empty tensors with Variable(). This must be done after the
// op.call to make __torch_dispatch__ work for the backward op.
std::fill(res.begin(), res.begin() + group_size, torch::autograd::Variable());
// 3) Add 1 Variable() for group_size
res.push_back({});
return res;
}

namespace {
Tensor index_select_dim0(
const Tensor& input,
const Tensor& indices,
std::optional<int64_t> /*consecutive_range_start*/,
std::optional<int64_t> /*consecutive_range_length*/,
std::optional<bool> /*skip_indices_sorting_fwd*/) {
return at::index_select(input, 0, indices);
}

Tensor bottom_k_per_row(
Expand Down Expand Up @@ -3132,13 +3262,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) {
"pack_segments_backward", fbgemm_gpu::pack_segments_backward_cpu);
DISPATCH_TO_CPU("index_select_dim0", fbgemm_gpu::index_select_dim0);
DISPATCH_TO_CPU(
"group_index_select_dim0", fbgemm_gpu::group_index_select_dim0);
"group_index_select_dim0",
fbgemm_gpu::group_index_select_dim0_decomposed);
DISPATCH_TO_CPU(
"group_index_select_dim0_gpu_impl",
fbgemm_gpu::group_index_select_dim0_gpu_impl_cpu);
fbgemm_gpu::group_index_select_dim0_forward_impl_cpu);
DISPATCH_TO_CPU(
"group_index_select_dim0_gpu_backward",
fbgemm_gpu::group_index_select_dim0_gpu_backward_cpu);
fbgemm_gpu::group_index_select_dim0_backward_impl_cpu);
DISPATCH_TO_CPU("bottom_k_per_row", fbgemm_gpu::bottom_k_per_row);
}

@@ -3147,11 +3278,14 @@ TORCH_LIBRARY_IMPL(fbgemm, Autograd, m) {
}

TORCH_LIBRARY_IMPL(fbgemm, AutogradCPU, m) {
m.impl("group_index_select_dim0", &fbgemm_gpu::group_index_select_dim0);
m.impl(
"group_index_select_dim0",
&fbgemm_gpu::group_index_select_dim0_decomposed);
}

TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
// CPU group_index_select_dim0 is decomposable
m.impl(
"group_index_select_dim0", TORCH_FN(fbgemm_gpu::group_index_select_dim0));
"group_index_select_dim0",
TORCH_FN(fbgemm_gpu::group_index_select_dim0_decomposed));
}
