Skip to content

Commit

Permalink
Use new atomics and add support for other backends.
Browse files Browse the repository at this point in the history
  • Loading branch information
richardmembarth committed Oct 22, 2020
1 parent 010c055 commit 46c8582
Show file tree
Hide file tree
Showing 12 changed files with 73 additions and 48 deletions.
3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ message(STATUS "Selected backend: ${BACKEND}")

set(BACKEND_FILE backend/backend_${BACKEND}.impala)

if(BACKEND STREQUAL "nvvm")
if(BACKEND STREQUAL "nvvm" OR BACKEND STREQUAL "amdgpu" OR BACKEND STREQUAL "cuda" OR BACKEND STREQUAL "opencl")
set(MAPPING_FILE backend/mapping_gpu.impala)
else()
set(MAPPING_FILE backend/mapping_cpu.impala)
Expand Down Expand Up @@ -54,7 +54,6 @@ set(MD_SRCS
core/thermo.impala
comm/comm.impala
comm/lb.impala
utils/atomic.impala
utils/utilities.impala
utils/print.impala
${MPI_FILE}
Expand Down
10 changes: 6 additions & 4 deletions backend/backend_amdgpu.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ fn @accelerator(dev: i32) -> Accelerator { amdgpu_accelerator(dev) }

static device_id = 1;
static gpu_math = amdgpu_intrinsics;
static atomic_add_global = amdgcn_atomic_add_global;
static atomic_add_shared = amdgcn_atomic_add_shared;
static atomic_min_global = amdgcn_atomic_min_global;
static atomic_min_shared = amdgcn_atomic_min_shared;
// Backend-neutral i32 atomic entry points, bound to the AMDGCN runtime intrinsics.
// NOTE(review): the *global* bindings use the `_i32`-suffixed intrinsic names while
// the *shared* ones do not — confirm the runtime really exports
// `amdgcn_atomic_add_shared` / `amdgcn_atomic_min_shared` without a suffix,
// otherwise these two lines will fail to resolve.
static atomic_add_global_i32 = amdgcn_atomic_add_global_i32;
static atomic_add_shared_i32 = amdgcn_atomic_add_shared;
static atomic_min_global_i32 = amdgcn_atomic_min_global_i32;
static atomic_min_shared_i32 = amdgcn_atomic_min_shared;
// Atomic float addition on global memory for the AMDGPU backend: delegate to
// the AMDGCN hardware intrinsics, casting the generic pointer to the
// addrspace(1) (global) pointer they expect. Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 {
    let global_addr = addr as &mut[1] f32;
    amdgcn_atomic_add_global_f32(global_addr, val)
}
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 {
    let global_addr = addr as &mut[1] f64;
    amdgcn_atomic_add_global_f64(global_addr, val)
}

fn @is_nvvm() -> bool { false }
fn @is_cuda() -> bool { false }
Expand Down
3 changes: 3 additions & 0 deletions backend/backend_avx.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ fn @get_vector_width() -> i32 { 4 }

// amount of full vector iterations that trigger loop vectorization
static simd_iter_threshold = 2;

// CPU fallback: Thorin's generic `atomic` builtin lowers to an LLVM `atomicrmw`
// instruction. 11u32 selects the operation (AtomicRMWInst::BinOp — 11 is FAdd),
// 7u32 the memory ordering (AtomicOrdering — 7 is sequentially consistent),
// and "" the default (system-wide) synchronization scope. Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 { atomic(11u32, addr, val, 7u32, "") }
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 { atomic(11u32, addr, val, 7u32, "") }
3 changes: 3 additions & 0 deletions backend/backend_avx512.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ fn @get_vector_width() -> i32 { 8 }

// amount of full vector iterations that trigger loop vectorization
static simd_iter_threshold = 2;

// CPU fallback: Thorin's generic `atomic` builtin lowers to an LLVM `atomicrmw`
// instruction. 11u32 selects the operation (AtomicRMWInst::BinOp — 11 is FAdd),
// 7u32 the memory ordering (AtomicOrdering — 7 is sequentially consistent),
// and "" the default (system-wide) synchronization scope. Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 { atomic(11u32, addr, val, 7u32, "") }
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 { atomic(11u32, addr, val, 7u32, "") }
3 changes: 3 additions & 0 deletions backend/backend_cpu.impala
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
// Plain scalar CPU backend: no SIMD, so the vector width is 1.
fn @get_vector_width() -> i32 { 1 }

// Float atomics via Thorin's generic `atomic` builtin (lowers to LLVM
// `atomicrmw`): op 11u32 is FAdd, ordering 7u32 is sequentially consistent,
// "" is the default synchronization scope. Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 { atomic(11u32, addr, val, 7u32, "") }
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 { atomic(11u32, addr, val, 7u32, "") }
10 changes: 6 additions & 4 deletions backend/backend_cuda.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ fn @accelerator(dev: i32) -> Accelerator { cuda_accelerator(dev) }

static device_id = 0;
static gpu_math = cuda_intrinsics;
static atomic_add_global = cuda_atomic_add_global;
static atomic_add_shared = cuda_atomic_add_shared;
static atomic_min_global = cuda_atomic_min_global;
static atomic_min_shared = cuda_atomic_min_shared;
// Backend-neutral i32 atomic entry points, bound to the CUDA runtime intrinsics.
// NOTE(review): the *global* bindings use the `_i32`-suffixed intrinsic names while
// the *shared* ones do not — confirm the runtime really exports
// `cuda_atomic_add_shared` / `cuda_atomic_min_shared` without a suffix.
static atomic_add_global_i32 = cuda_atomic_add_global_i32;
static atomic_add_shared_i32 = cuda_atomic_add_shared;
static atomic_min_global_i32 = cuda_atomic_min_global_i32;
static atomic_min_shared_i32 = cuda_atomic_min_shared;
// Atomic float addition on global memory for the CUDA backend: forward to the
// CUDA runtime intrinsics, casting the generic pointer to the addrspace(1)
// (global) pointer they expect. Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 {
    let global_addr = addr as &mut[1] f32;
    cuda_atomic_add_global_f32(global_addr, val)
}
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 {
    let global_addr = addr as &mut[1] f64;
    cuda_atomic_add_global_f64(global_addr, val)
}

fn @is_nvvm() -> bool { false }
fn @is_cuda() -> bool { true }
Expand Down
10 changes: 6 additions & 4 deletions backend/backend_nvvm.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ fn @accelerator(dev: i32) -> Accelerator { nvvm_accelerator(dev) }

static device_id = 0;
static gpu_math = nvvm_intrinsics;
static atomic_add_global = nvvm_atomic_add_global;
static atomic_add_shared = nvvm_atomic_add_shared;
static atomic_min_global = nvvm_atomic_min_global;
static atomic_min_shared = nvvm_atomic_min_shared;
// Backend-neutral i32 atomic entry points, bound to the NVVM runtime intrinsics.
// NOTE(review): the *global* bindings use the `_i32`-suffixed intrinsic names while
// the *shared* ones do not — confirm the runtime really exports
// `nvvm_atomic_add_shared` / `nvvm_atomic_min_shared` without a suffix.
static atomic_add_global_i32 = nvvm_atomic_add_global_i32;
static atomic_add_shared_i32 = nvvm_atomic_add_shared;
static atomic_min_global_i32 = nvvm_atomic_min_global_i32;
static atomic_min_shared_i32 = nvvm_atomic_min_shared;
// Atomic float addition on global memory for the NVVM backend. The generic
// pointer is cast to the addrspace(1) (global) pointer the runtime intrinsic
// expects. Returns the previous value, following the atomic-RMW convention.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 { nvvm_atomic_add_global_f32(addr as &mut[1] f32, val) }
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 { nvvm_atomic_add_global_f64(addr as &mut[1] f64, val) }

fn @is_nvvm() -> bool { true }
fn @is_cuda() -> bool { false }
Expand Down
42 changes: 38 additions & 4 deletions backend/backend_opencl.impala
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,44 @@ fn @accelerator(dev: i32) -> Accelerator { opencl_accelerator(dev) }

static device_id = 0;
static gpu_math = opencl_intrinsics;
static atomic_add_global = opencl_atomic_add_global;
static atomic_add_shared = opencl_atomic_add_shared;
static atomic_min_global = opencl_atomic_min_global;
static atomic_min_shared = opencl_atomic_min_shared;
// Backend-neutral i32 atomic entry points, bound to the OpenCL runtime intrinsics.
// Unlike the CUDA/NVVM/AMDGCN backends, these intrinsic names carry no `_i32`
// suffix — NOTE(review): confirm this matches the names the runtime exports.
static atomic_add_global_i32 = opencl_atomic_add_global;
static atomic_add_shared_i32 = opencl_atomic_add_shared;
static atomic_min_global_i32 = opencl_atomic_min_global;
static atomic_min_shared_i32 = opencl_atomic_min_shared;

extern "device" {
fn "atomic_cmpxchg" opencl_atomic_cmpxchg_u32(&mut[1]u32, u32, u32) -> u32;
fn "atom_cmpxchg" opencl_atomic_cmpxchg_u64(&mut[1]u64, u64, u64) -> u64;
}

// Generic atomic read-modify-write for f32, emulated with a 32-bit integer
// compare-and-swap loop: reinterpret the float's bits as u32, apply `op`, and
// retry until the CAS succeeds. Comparing `assumed`/`old` as integers (rather
// than as floats) also lets the loop terminate when the stored value is NaN,
// since NaN != NaN would spin forever under float comparison.
// `op` is invoked as op(b, current_value). Returns the previous value.
fn @atomic_op_f32(a: &mut[1]f32, b: f32, cmpxchg: fn(&mut[1]u32, u32, u32) -> u32, op: fn(f32, f32) -> f32) -> f32 {
    let addr_as_ui = bitcast[&mut[1]u32](a);
    let mut assumed = *addr_as_ui;
    // First attempt; cmpxchg returns the value that was actually stored.
    let mut old = cmpxchg(addr_as_ui, assumed, bitcast[u32](op(b, bitcast[f32](assumed))));

    // If another thread changed the value between our read and the swap,
    // `old` differs from `assumed`: adopt the fresh value and try again.
    while assumed != old {
        assumed = old;
        old = cmpxchg(addr_as_ui, assumed, bitcast[u32](op(b, bitcast[f32](assumed))));
    }

    bitcast[f32](old)
}

// 64-bit counterpart of atomic_op_f32: atomic read-modify-write for f64 via a
// u64 compare-and-swap loop on the value's bit pattern. Integer comparison of
// `assumed`/`old` keeps the loop NaN-safe (NaN != NaN as f64 would never exit).
// `op` is invoked as op(b, current_value). Returns the previous value.
fn @atomic_op_f64(a: &mut[1]f64, b: f64, cmpxchg: fn(&mut[1]u64, u64, u64) -> u64, op: fn(f64, f64) -> f64) -> f64 {
    let addr_as_ui = bitcast[&mut[1]u64](a);
    let mut assumed = *addr_as_ui;
    // First attempt; cmpxchg returns the value that was actually stored.
    let mut old = cmpxchg(addr_as_ui, assumed, bitcast[u64](op(b, bitcast[f64](assumed))));

    // Retry until no other thread raced us between the read and the swap.
    while assumed != old {
        assumed = old;
        old = cmpxchg(addr_as_ui, assumed, bitcast[u64](op(b, bitcast[f64](assumed))));
    }

    bitcast[f64](old)
}

// Public float atomics for the OpenCL backend: route through the CAS emulation
// above with addition as the combining operation. atomic_op_* calls the
// operation as op(val, current) — order is immaterial for commutative addition.
// Returns the previous value.
fn @atomic_add_f32(addr: &mut f32, val: f32) -> f32 { atomic_op_f32(addr as &mut[1]f32, val, opencl_atomic_cmpxchg_u32, @|a, b| a + b) }
fn @atomic_add_f64(addr: &mut f64, val: f64) -> f64 { atomic_op_f64(addr as &mut[1]f64, val, opencl_atomic_cmpxchg_u64, @|a, b| a + b) }

fn @is_nvvm() -> bool { false }
fn @is_cuda() -> bool { false }
Expand Down
10 changes: 5 additions & 5 deletions backend/mapping_gpu.impala
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,17 @@ fn @device() -> Device {
alloc: |size| { accelerator(device_id).alloc(size) },
alloc_mirror: |buf, size| { alloc_cpu(size) },
transfer: |from, to| { copy(from, to); },
sqrt: @|a| { nvvm_intrinsics.sqrt(a) },
sqrt: @|a| { gpu_math.sqrt(a) },
loop_1d: loop_1d,
add_iterator: |iterator| {},
atomic_add_i32: @|ptr, value| { nvvm_atomic_add_global(ptr as &mut[1]i32, value) }
atomic_add_i32: @|ptr, value| { atomic_add_global_i32(ptr as &mut[1]i32, value) }
}
}

fn @reserve_reduction_buffers(nelements: i32, elem_size: i32) -> () {
if reduction_buffers_.capacity < nelements * elem_size {
let new_capacity = (nelements + 20) * elem_size;
reallocate_array(&mut reduction_buffers_.red_array, new_capacity, 1, 1, false);
reallocate_array(&mut reduction_buffers_.red_array, new_capacity, 1, 1i64, false);
reduction_buffers_.capacity = new_capacity;
}
}
Expand All @@ -63,7 +63,7 @@ fn @reduce_i32(n: i32, b: i32, reduce: fn(i32, i32) -> i32, body: fn(i32) -> i32
let mut red = b;

if n > 0 {
reserve_reduction_buffers(nblocks, sizeof[i32]());
reserve_reduction_buffers(nblocks, sizeof[i32]() as i32);
let blocks_red_host = bitcast[&[i32]](array_host(reduction_buffers_.red_array).data);
let blocks_red_gpu = bitcast[&mut[i32]](array_dev(reduction_buffers_.red_array).data);

Expand Down Expand Up @@ -114,7 +114,7 @@ fn @reduce_aabb(n: i32, b: AABB, reduce: fn(AABB, AABB) -> AABB, body: fn(i32) -
let mut red = b;

if n > 0 {
reserve_reduction_buffers(nblocks, sizeof[AABB]());
reserve_reduction_buffers(nblocks, sizeof[AABB]() as i32);
let blocks_red_host = bitcast[&[AABB]](array_host(reduction_buffers_.red_array).data);
let blocks_red_gpu = bitcast[&mut[AABB]](array_dev(reduction_buffers_.red_array).data);

Expand Down
2 changes: 1 addition & 1 deletion core/layouts.impala
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ fn @add_fn(atomic: i32, array: ArrayData, index: i32, value: real_t) -> () {
let data = get_array_real_ref(array_dev, array);

if atomic != 0 {
atomic_op_real(&mut data(index), value, @|a, b| { a + b });
atomic_add_real(&mut data(index), value);
} else {
data(index) += value;
}
Expand Down
23 changes: 0 additions & 23 deletions utils/atomic.impala

This file was deleted.

2 changes: 1 addition & 1 deletion utils/utilities.impala
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
type real_t = f64;
static math = cpu_intrinsics;
static real_floor = math.floor;
static atomic_op_real = atomic_op_f64;
static atomic_add_real = atomic_add_f64;

struct AABB {
xmin: real_t,
Expand Down

0 comments on commit 46c8582

Please sign in to comment.