Attempt to fix the CUDA version
arrufat committed Aug 30, 2024
1 parent 60f29fc commit 6118f80
Showing 7 changed files with 130 additions and 92 deletions.
7 changes: 4 additions & 3 deletions dlib/cuda/cpu_dlib.cpp
@@ -1350,7 +1350,9 @@ namespace dlib
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
)
{
const long num = src.nr() * src.nc();
@@ -1376,7 +1378,6 @@ namespace dlib
const auto p_invstds = invstds.host();
const auto p_means = means.host();

resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
@@ -1386,9 +1387,9 @@

for (long n = 0; n < src.num_samples(); ++n)
{
const float invstd_pow = -0.5 * std::pow(p_invstds[n], 3.0f);
for (long k = 0; k < src.k(); ++k)
{
const float invstd_pow = -0.5 * std::pow(p_invstds[n], 3.0f);
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[n]) * p_invstds[n];
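For reference (this note is not part of the commit): the hoisted `invstd_pow` factor depends only on the sample index `n`, which is why it can move out of the channel loop. It is the usual layer-norm backward term; with N = k·nr·nc values normalized per sample, dx_i the incoming gradient already scaled by gamma, and sigma_inv the stored inverse standard deviation:

\frac{\partial L}{\partial \sigma^2} = \sum_i dx_i\,(x_i-\mu)\left(-\tfrac{1}{2}\,\sigma_{\mathrm{inv}}^{3}\right)

\frac{\partial L}{\partial \mu} = -\sigma_{\mathrm{inv}}\sum_i dx_i + \frac{\partial L}{\partial \sigma^2}\cdot\frac{-2\sum_i (x_i-\mu)}{N}

\frac{\partial L}{\partial x_i} = dx_i\,\sigma_{\mathrm{inv}} + \frac{\partial L}{\partial \sigma^2}\cdot\frac{2(x_i-\mu)}{N} + \frac{1}{N}\,\frac{\partial L}{\partial \mu}

The first two sums are exactly what `dvars` and `dmeans` accumulate, and the factor -1/2·sigma_inv^3 is `invstd_pow`.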
4 changes: 3 additions & 1 deletion dlib/cuda/cpu_dlib.h
@@ -250,7 +250,9 @@ namespace dlib
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
);

// -----------------------------------------------------------------------------------
184 changes: 107 additions & 77 deletions dlib/cuda/cuda_dlib.cu
@@ -2085,84 +2085,50 @@ namespace dlib

// ----------------------------------------------------------------------------------------

__global__ void _cuda_layer_normalize(float* out, const float* s, float* m, float* v, const float* g, const float* b, float eps, size_t ns, size_t num)
__global__ void _cuda_layer_normalize(
float* out,
const float* s,
float* m,
float* v,
const float* g,
const float* b,
float eps,
size_t ns,
size_t k,
size_t num
)
{
// compute means and sum of squares
for (auto n : grid_stride_range_y(0, ns))
{
auto p = s + n * num;
const auto ps = s + n * k * num;
float means = 0;
float invstds = 0;
for (auto i : grid_stride_range(0, num))
for (auto i : grid_stride_range(0, k * num))
{
means += p[i];
invstds += p[i] * p[i];
means += ps[i];
invstds += ps[i] * ps[i];
}
warp_reduce_atomic_add(m[n], means/num);
warp_reduce_atomic_add(v[n], invstds/num);
warp_reduce_atomic_add(m[n], means / (k * num));
warp_reduce_atomic_add(v[n], invstds / (k * num));
}
__syncthreads();

// compute variances
for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, 1))
{
auto var = v[n] - m[n] * m[n];
v[n] = 1.0f / std::sqrt(var + eps);
}
v[n] = 1.0f / std::sqrt(v[n] - m[n] * m[n] + eps);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, num))
const auto ps = s + n * k * num;
const auto pout = out + n * k * num;
for (auto i : grid_stride_range(0, k * num))
{
const float val = (s[n*num+i]-m[n])*v[n];
out[n*num+i] = val*g[i]+b[i];
}
}
}

__global__ void _cuda_layer_normalize_gradient(float* out, float* gg, float* bg, const float* s, const float* gi, const float* m, const float* v, const float* g, float* dm, float* dv, float eps, size_t ns, size_t num)
{
for (auto n : grid_stride_range_y(0, ns))
{
float temp_dv = 0;
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float x_hat = (s[idx] - m[n])*v[n];
bg[i] += gi[idx];
gg[i] += gi[idx]*x_hat;

const float dx = gi[idx] * g[i];
temp_dv += dx*(s[idx] - m[n])*-0.5*v[n]*v[n]*v[n];
}
warp_reduce_atomic_add(dv[n], temp_dv);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
float temp_dm = 0;
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float dx = gi[idx]*g[i];
temp_dm += dx*-v[n] + dv[n] * -2*(s[idx] - m[n])/num;
}
warp_reduce_atomic_add(dm[n], temp_dm);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
for (auto i : grid_stride_range(0, num))
{
auto idx = n*num+i;
const float dx = gi[idx]*g[i];
out[idx] += dx*v[n] + dv[n] * 2*(s[idx] - m[n])/num + dm[n]/num;
pout[i] = (ps[i] - m[n]) * v[n];
pout[i] = pout[i] * g[i / num] + b[i / num];
}
}
}
@@ -2177,22 +2143,20 @@ namespace dlib
const tensor& beta
)
{
const long num = src.k() * src.nr() * src.nc();
const long num = src.nr() * src.nc();
DLIB_CASSERT(
have_same_dimensions(gamma, beta) &&
src.k() == gamma.k() &&
src.nr() == gamma.nr() &&
src.nc() == gamma.nc() &&
gamma.k() == src.k() &&
gamma.nr() == 1 &&
gamma.nc() == 1 &&
eps > 0,
"\nsrc.k(): " << src.k() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);

@@ -2201,8 +2165,73 @@ namespace dlib
invstds.set_size(src.num_samples());
means = 0;
invstds = 0;
launch_kernel(_cuda_layer_normalize, max_jobs(num, src.num_samples()), dest.device(), src.device(),
means.device(), invstds.device(), gamma.device(), beta.device(), eps, src.num_samples(), num);
launch_kernel(_cuda_layer_normalize, max_jobs(src.k() * num, src.num_samples()), dest.device(), src.device(),
means.device(), invstds.device(), gamma.device(), beta.device(), eps, src.num_samples(), src.k(), num);
}

// ----------------------------------------------------------------------------------------

__global__ void _cuda_layer_normalize_gradient(
float* out,
float* gg,
float* bg,
const float* s,
const float* gi,
const float* m,
const float* v,
const float* g,
float* dm,
float* dv,
float eps,
size_t ns,
size_t k,
size_t num)
{
for (auto n : grid_stride_range_y(0, ns))
{
const auto ps = s + n * k * num;
const auto pgi = gi + n * k * num;
const float invstd_pow = -0.5 * std::pow(v[n], 3.0f);
float temp_dv = 0;
for (auto i : grid_stride_range(0, k * num))
{
const float x_hat = (ps[i] - m[n]) * v[n];
bg[i / num] += pgi[i];
gg[i / num] += pgi[i] * x_hat;

const float dx = pgi[i] * g[i / num];
temp_dv += dx * (ps[i] - m[n]) * invstd_pow;
}
warp_reduce_atomic_add(dv[n], temp_dv);
}
__syncthreads();

const float invnum = 1.0f / (k * num);
for (auto n : grid_stride_range_y(0, ns))
{
const auto ps = s + n * k * num;
const auto pgi = gi + n * k * num;
float temp_dm = 0;
for (auto i : grid_stride_range(0, k * num))
{
const float dx = pgi[i] * g[i / num];
temp_dm += -dx * v[n] + dv[n] * -2 * (ps[i] - m[n]) * invnum;
}
warp_reduce_atomic_add(dm[n], temp_dm);
}
__syncthreads();

for (auto n : grid_stride_range_y(0, ns))
{
const auto ps = s + n * k * num;
const auto pgi = gi + n * k * num;
const auto pout = out + n * k * num;
for (auto i : grid_stride_range(0, k * num))
{
const float dx = pgi[i] * g[i / num];
pout[i] += dx * v[n] + dv[n] * 2 * (ps[i] - m[n]) * invnum + dm[n] * invnum;
}
}
}

void layer_normalize_gradient (
@@ -2214,32 +2243,33 @@ namespace dlib
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
)
{
const long num = src.k() * src.nr() * src.nc();
const long num = src.nr() * src.nc();
DLIB_CASSERT(src.num_samples() == means.size());
DLIB_CASSERT(src.num_samples() == invstds.size());
DLIB_CASSERT(src.k() == gamma.k());
DLIB_CASSERT(src.nr() == gamma.nr());
DLIB_CASSERT(src.nc() == gamma.nc());
DLIB_CASSERT(have_same_dimensions(gamma, gamma_grad));
DLIB_CASSERT(have_same_dimensions(gamma_grad, beta_grad));
DLIB_CASSERT(gamma.k() == src.k());
DLIB_CASSERT(gamma.nr() == 1);
DLIB_CASSERT(gamma.nc() == 1);
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(have_same_dimensions(gamma_grad, gamma));
DLIB_CASSERT(have_same_dimensions(gamma_grad, beta_grad));
DLIB_CASSERT(eps > 0);

beta_grad = 0;
gamma_grad = 0;
resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
launch_kernel(_cuda_layer_normalize_gradient, max_jobs(num, src.num_samples()),
launch_kernel(_cuda_layer_normalize_gradient, max_jobs(src.k() * num, src.num_samples()),
src_grad.device(), gamma_grad.device(), beta_grad.device(), src.device(),
gradient_input.device(), means.device(), invstds.device(), gamma.device(),
dmeans.device(), dvars.device(), eps, src.num_samples(), num);
dmeans.device(), dvars.device(), eps, src.num_samples(), src.k(), num);
}

// ----------------------------------------------------------------------------------------
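As a plain-C++ reference for what the rewritten forward kernel computes per sample (a sketch for checking only, not part of the commit; the function name is made up): each sample gets one mean and one inverse standard deviation over all k·nr·nc of its values, and gamma/beta are indexed per channel via `i / num`, exactly as in the kernel.

#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the layer-norm forward pass on a single sample.
// s holds k*num input values, g and b hold k per-channel values.
void layer_normalize_sample_ref(
    const std::vector<float>& s, std::vector<float>& out,
    const std::vector<float>& g, const std::vector<float>& b,
    std::size_t k, std::size_t num, float eps)
{
    const std::size_t total = k * num;
    out.resize(total);
    float mean = 0, sqmean = 0;
    for (std::size_t i = 0; i < total; ++i)
    {
        mean   += s[i] / total;
        sqmean += s[i] * s[i] / total;
    }
    // invstd from E[x^2] - E[x]^2, matching the kernel's v[n] update.
    const float invstd = 1.0f / std::sqrt(sqmean - mean * mean + eps);
    for (std::size_t i = 0; i < total; ++i)
        out[i] = (s[i] - mean) * invstd * g[i / num] + b[i / num];
}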
4 changes: 3 additions & 1 deletion dlib/cuda/cuda_dlib.h
@@ -357,7 +357,9 @@ namespace dlib
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
);

// -----------------------------------------------------------------------------------
8 changes: 5 additions & 3 deletions dlib/cuda/tensor_tools.cpp
@@ -684,13 +684,15 @@ namespace dlib { namespace tt
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
)
{
#ifdef DLIB_USE_CUDA
cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
#else
cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
#endif
}

12 changes: 6 additions & 6 deletions dlib/cuda/tensor_tools.h
@@ -814,13 +814,13 @@ namespace dlib { namespace tt
/*!
requires
- eps > 0
- src.num_samples() == gamma.size() == beta.size()
- src.k() == gamma.size() == beta.size()
- gamma.num_samples() == gamma.nr() == gamma.nc() == 1
- have_same_dimensions(gamma, beta) == true
- beta.num_samples() == beta.nr() == beta.nc() == 1
ensures
- have_same_dimensions(#dest, src) == true
- #means.size() == invstds.size() == src.num_samples()
- #dest == the normalized version of src.
- #dest == the normalized version of src, sample-wise.
- #means == the mean values of the contents of src.
- #invstds == 1/(the standard deviation values of the contents of src).
!*/
@@ -834,7 +834,9 @@ namespace dlib { namespace tt
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
tensor& beta_grad,
resizable_tensor& dmeans,
resizable_tensor& dvars
);
/*!
requires
Expand All @@ -847,8 +849,6 @@ namespace dlib { namespace tt
- have_same_dimensions(gamma, beta_grad) == true
- means.size() == src.num_samples()
- invstds.size() == src.num_samples()
- have_same_dimensions(means, gamma) == true
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
layer_normalize(eps,dest,means,invstds,src,gamma,beta))
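A minimal driver for the updated tensor_tools interface might look like this (a sketch only; shapes and the use of tt::tensor_rand are illustrative and assume the usual dlib tensor API, they are not taken from the commit). The practical difference is that the caller now owns the dmeans/dvars scratch tensors, presumably so they are allocated once and reused across backward passes.

#include <dlib/dnn.h>
using namespace dlib;

void layer_norm_roundtrip()
{
    resizable_tensor src(2, 3, 4, 5);                  // 2 samples, k = 3
    resizable_tensor gamma(1, 3, 1, 1), beta(1, 3, 1, 1);
    gamma = 1;
    beta = 0;
    tt::tensor_rand rnd;
    rnd.fill_gaussian(src);

    // Forward: one mean/invstd per sample, per-channel gamma and beta.
    resizable_tensor dest, means, invstds;
    tt::layer_normalize(1e-5, dest, means, invstds, src, gamma, beta);

    // Backward: dmeans and dvars are passed in as reusable scratch buffers.
    resizable_tensor gradient_input;
    gradient_input.copy_size(dest);
    gradient_input = 1;
    resizable_tensor src_grad;
    src_grad.copy_size(src);
    src_grad = 0;
    resizable_tensor gamma_grad, beta_grad;
    gamma_grad.copy_size(gamma);
    beta_grad.copy_size(beta);
    resizable_tensor dmeans, dvars;
    tt::layer_normalize_gradient(1e-5, gradient_input, means, invstds, src, gamma,
                                 src_grad, gamma_grad, beta_grad, dmeans, dvars);
}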
3 changes: 2 additions & 1 deletion dlib/dnn/layers.h
@@ -1426,7 +1426,7 @@ namespace dlib
auto g = gamma(params, 0);
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
tt::layer_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad);
tt::layer_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad, dmeans, dvars);
}

const tensor& get_layer_params() const { return params; };
@@ -1493,6 +1493,7 @@ namespace dlib
resizable_tensor params;
alias_tensor gamma, beta;
resizable_tensor means, invstds;
resizable_tensor dmeans, dvars;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
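At the network level the layer is used as before; a hedged usage sketch, assuming the usual layer_norm alias (add_layer<layer_norm_, SUBNET>) that accompanies layer_norm_ in layers.h:

#include <dlib/dnn.h>
using namespace dlib;

// A small classifier whose hidden fully connected block is layer-normalized.
using net_type =
    loss_multiclass_log<
    fc<10,
    relu<
    layer_norm<
    fc<64,
    input<matrix<float>>
    >>>>>;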
