
Commit

* Pushing for build tests

* Contrib files

* Removing deprecated checks
mcarilli authored Sep 6, 2019
1 parent 1bf0d8d · commit 325f5a0
Showing 17 changed files with 281 additions and 266 deletions.
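
The substitution applied throughout the files below is mechanical: each deprecated Tensor::data<T>() accessor call becomes the DATA_PTR<T>() macro from csrc/compat.h, and each deprecated AT_CHECK becomes TORCH_CHECK. As a rough sketch of a call site before and after this change (the function and tensor names here are hypothetical, not taken from the commit):

#include <ATen/ATen.h>
#include "compat.h"  // provides DATA_PTR and a TORCH_CHECK fallback

void read_scale(const at::Tensor& scale) {
  // Before: AT_CHECK(scale.is_contiguous(), "scale must be contiguous");
  // Before: float* scale_ptr = scale.data<float>();   // deprecated in PyTorch >= 1.3
  TORCH_CHECK(scale.is_contiguous(), "scale must be contiguous");
  float* scale_ptr = scale.DATA_PTR<float>();  // expands to data_ptr<float>() on >= 1.3, data<float>() otherwise
  (void)scale_ptr;  // illustration only
}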
46 changes: 24 additions & 22 deletions apex/contrib/csrc/groupbn/batch_norm.cu
@@ -8,6 +8,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 #define cudaCheckErrors(msg) \
 do { \
 cudaError_t __err = cudaGetLastError(); \
@@ -72,7 +74,7 @@ at::Tensor nhwc_bn_fwd_train(
 const int C = x.size(3);
 
 // generating new magic number and use that for sync
-int* magic = magic_tensor.data<int>();
+int* magic = magic_tensor.DATA_PTR<int>();
 *magic = (*magic + 1) & 0xff;
 
 // Allocate output tensor
@@ -87,13 +89,13 @@ at::Tensor nhwc_bn_fwd_train(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
 nullptr,
-y.data<at::Half>(),
+y.DATA_PTR<at::Half>(),
 nullptr);
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -114,12 +116,12 @@ at::Tensor nhwc_bn_fwd_train(
 Workspace ws(total_workspace_bytes);
 
 std::vector<void *> workspace;
-workspace.push_back(minibatch_mean.data<float>());
-workspace.push_back(minibatch_inv_var.data<float>());
+workspace.push_back(minibatch_mean.DATA_PTR<float>());
+workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[2];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 workspace.push_back(retired_ctas);
 
@@ -165,13 +167,13 @@ at::Tensor nhwc_bn_fwd_eval(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
 nullptr,
-y.data<at::Half>(),
+y.DATA_PTR<at::Half>(),
 nullptr);
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -197,7 +199,7 @@ at::Tensor nhwc_bn_fwd_eval(
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[2];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 workspace.push_back(retired_ctas);
 
@@ -244,7 +246,7 @@ std::vector<at::Tensor> nhwc_bn_bwd(
 const int C = x.size(3);
 
 // generating new magic number and use that for sync
-int* magic = magic_tensor.data<int>();
+int* magic = magic_tensor.DATA_PTR<int>();
 *magic = (*magic + 1) & 0xff;
 
 // outputs
@@ -264,13 +266,13 @@ std::vector<at::Tensor> nhwc_bn_bwd(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
-x_grad.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+x_grad.DATA_PTR<at::Half>(),
 nullptr,
-dy.data<at::Half>());
+dy.DATA_PTR<at::Half>());
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -291,12 +293,12 @@ std::vector<at::Tensor> nhwc_bn_bwd(
 Workspace ws(total_workspace_bytes);
 
 std::vector<void *> workspace;
-workspace.push_back(minibatch_mean.data<float>());
-workspace.push_back(minibatch_inv_var.data<float>());
+workspace.push_back(minibatch_mean.DATA_PTR<float>());
+workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[2];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 workspace.push_back(retired_ctas);
 
56 changes: 29 additions & 27 deletions apex/contrib/csrc/groupbn/batch_norm_add_relu.cu
@@ -8,6 +8,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 //FIXME move the common stuff to common h file
 #define cudaCheckErrors(msg) \
 do { \
@@ -74,7 +76,7 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
 const int C = x.size(3);
 
 // generating new magic number and use that for sync
-int* magic = magic_tensor.data<int>();
+int* magic = magic_tensor.DATA_PTR<int>();
 *magic = (*magic + 1) & 0xff;
 
 // Allocate output tensor
@@ -89,15 +91,15 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
 nullptr,
-y.data<at::Half>(),
+y.DATA_PTR<at::Half>(),
 nullptr,
-z.data<at::Half>(),
+z.DATA_PTR<at::Half>(),
 nullptr);
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -118,13 +120,13 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
 Workspace ws(total_workspace_bytes);
 
 std::vector<void *> workspace;
-workspace.push_back(minibatch_mean.data<float>());
-workspace.push_back(minibatch_inv_var.data<float>());
-workspace.push_back(bitmask.data<int32_t>());
+workspace.push_back(minibatch_mean.DATA_PTR<float>());
+workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[3];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 
 workspace.push_back(retired_ctas);
@@ -171,15 +173,15 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
 nullptr,
-y.data<at::Half>(),
+y.DATA_PTR<at::Half>(),
 nullptr,
-z.data<at::Half>(),
+z.DATA_PTR<at::Half>(),
 nullptr);
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -206,7 +208,7 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[3];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 workspace.push_back(retired_ctas);
 
@@ -253,7 +255,7 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
 const int C = x.size(3);
 
 // generating new magic number and use that for sync
-int* magic = magic_tensor.data<int>();
+int* magic = magic_tensor.DATA_PTR<int>();
 *magic = (*magic + 1) & 0xff;
 
 // outputs
@@ -274,15 +276,15 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
 bn->setConstants(momentum, epsilon);
 
 // set pointers within the wrapper
-bn->setInputOutputPointers(x.data<at::Half>(),
-x_grad.data<at::Half>(),
+bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+x_grad.DATA_PTR<at::Half>(),
 nullptr,
-dy.data<at::Half>(),
+dy.DATA_PTR<at::Half>(),
 nullptr,
-z_grad.data<at::Half>());
+z_grad.DATA_PTR<at::Half>());
 
-bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
 // deal with workspace(s)
 auto workspace_bytes = bn->numWorkspaceBytes();
@@ -303,13 +305,13 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
 Workspace ws(total_workspace_bytes);
 
 std::vector<void *> workspace;
-workspace.push_back(minibatch_mean.data<float>());
-workspace.push_back(minibatch_inv_var.data<float>());
-workspace.push_back(bitmask.data<int32_t>());
+workspace.push_back(minibatch_mean.DATA_PTR<float>());
+workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
 auto stream = at::cuda::getCurrentCUDAStream().stream();
 const int retired_cta_bytes = workspace_bytes[3];
-void* retired_ctas = ret_cta.data<uint8_t>();
+void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
 assert(ret_cta.size(0)>=retired_cta_bytes);
 workspace.push_back(retired_ctas);
 
8 changes: 5 additions & 3 deletions apex/contrib/csrc/groupbn/ipc.cu
@@ -6,6 +6,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 
 #define cudaCheckErrors(msg) \
 do { \
@@ -114,17 +116,17 @@ int64_t get_buffer_size(const int bn_sync_steps) {
 
 void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
 cudaIpcMemHandle_t my_handle;
-memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
 return ipc_mem_registry.getPtr(my_handle, offset);
 }
 
 void close_remote_data(const at::Tensor& handle) {
 cudaIpcMemHandle_t my_handle;
-memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
 ipc_mem_registry.releasePtr(my_handle);
 }
 
 void* get_data_ptr(
 const at::Tensor& data) {
-return data.data<uint8_t>();
+return data.DATA_PTR<uint8_t>();
 }
25 changes: 13 additions & 12 deletions apex/contrib/csrc/xentropy/xentropy_kernel.cu
@@ -82,6 +82,7 @@
 #include <THC/THCThrustAllocator.cuh>
 
 #include "type_shim.h"
+#include "compat.h"
 
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
@@ -492,7 +493,7 @@ std::vector<Tensor> host_softmax_xentropy(
 inner_size *= input.size(i);
 // This kernel spawns a block per each element in the batch.
 // XXX: it assumes that inner_size == 1
-AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
 const int ILP = 2;
 dim3 grid(outer_size);
@@ -504,15 +505,15 @@
 if (!half_to_float) {
 cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
 <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-losses.data<accscalar_t>(), max_log_sum_exp.data<scalar_t_0>(),
-input.data<scalar_t_0>(), labels_.data<int64_t>(),
+losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
 dim_size, smoothing
 );
 } else {
 cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
 <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-losses.data<accscalar_t>(), max_log_sum_exp.data<accscalar_t>(),
-input.data<scalar_t_0>(), labels_.data<int64_t>(),
+losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<accscalar_t>(),
+input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
 dim_size, smoothing
 );
 }
@@ -561,7 +562,7 @@ Tensor host_softmax_xentropy_backward(
 inner_size *= logits.size(i);
 // See descriptions of kernels above.
 cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
 const int ILP = 2;
 dim3 grid(outer_size);
@@ -572,17 +573,17 @@
 if (!half_to_float) {
 cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
 <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-max_log_sum_exp.data<scalar_t_0>(),
-grad.data<scalar_t_0>(), labels.data<int64_t>(),
+gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+grad.DATA_PTR<scalar_t_0>(), labels.DATA_PTR<int64_t>(),
 smoothing, dim_size
 );
 } else {
 cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
 <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-max_log_sum_exp.data<accscalar_t>(),
-grad.data<accscalar_t>(), labels.data<int64_t>(),
+gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+max_log_sum_exp.DATA_PTR<accscalar_t>(),
+grad.DATA_PTR<accscalar_t>(), labels.DATA_PTR<int64_t>(),
 smoothing, dim_size
 );
 }
6 changes: 6 additions & 0 deletions csrc/compat.h
@@ -1,3 +1,9 @@
 #ifndef TORCH_CHECK
 #define TORCH_CHECK AT_CHECK
 #endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
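
compat.h itself does not define VERSION_GE_1_3; the choice between data_ptr and data is made at compile time, and the extension build is expected to supply that define (for example as a -DVERSION_GE_1_3 compile flag when building against PyTorch 1.3 or newer — an assumption about the build setup, not something shown in this diff). A minimal sketch of a consumer of the header, with a hypothetical helper name:

#include <torch/extension.h>
#include "compat.h"

// Hypothetical helper, not part of the commit: zero a contiguous float tensor
// in place through the raw pointer obtained via the compatibility macro.
void zero_float_tensor(at::Tensor& t) {
  TORCH_CHECK(t.scalar_type() == at::kFloat, "expected a float tensor");
  TORCH_CHECK(t.is_contiguous(), "expected a contiguous tensor");
  float* p = t.DATA_PTR<float>();  // data_ptr<float>() if VERSION_GE_1_3 is defined, data<float>() otherwise
  for (int64_t i = 0; i < t.numel(); ++i) {  // CPU-side illustration; the kernels above pass device pointers
    p[i] = 0.f;
  }
}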