Skip to content

Commit

Permalink
Merge pull request #218 from pkestene/fix/examples
Browse files Browse the repository at this point in the history
Fix examples that fail because Kokkos::finalize is called too early
  • Loading branch information
NaderAlAwar authored Nov 30, 2023
2 parents ea78f3a + a9cff5a commit 01f920f
Show file tree
Hide file tree
Showing 17 changed files with 128 additions and 87 deletions.
13 changes: 8 additions & 5 deletions examples/BabelStream/functor/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
acc += self.a[index] * self.b[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand Down Expand Up @@ -92,7 +92,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
timings[4].append(timer.seconds())
timer.reset()

goldA = startA
goldA = startA
goldB = startB
goldC = startC

Expand All @@ -108,9 +108,9 @@ def dot(self, index: int, acc: pk.Acc[float]):
errB /= len(w.b)
errC = reduce(lambda s, val: s + abs(val - goldC), w.c)
errC /= len(w.c)
# epsi = sys.float_info.epsilon * 100
epsi = 1e-8

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
Expand Down Expand Up @@ -143,3 +143,6 @@ def dot(self, index: int, acc: pk.Acc[float]):
# bandwidth = 1.0e-9 * (total_bytes / runtime)
# print(f"Runtime (seconds): {runtime}")
# print(f"Bandwidth (GB/s): {bandwidth}")

if __name__ == "__main__":
run()
13 changes: 8 additions & 5 deletions examples/BabelStream/standalone/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def dot(index, acc, a_view, b_view):
acc += a_view[index] * b_view[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand Down Expand Up @@ -85,7 +85,7 @@ def dot(index, acc, a_view, b_view):
timings[4].append(timer.seconds())
timer.reset()

goldA = startA
goldA = startA
goldB = startB
goldC = startC

Expand All @@ -101,9 +101,9 @@ def dot(index, acc, a_view, b_view):
errB /= len(b)
errC = reduce(lambda s, val: s + abs(val - goldC), c)
errC /= len(c)
# epsi = sys.float_info.epsilon * 100
epsi = 1e-8

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
Expand Down Expand Up @@ -136,3 +136,6 @@ def dot(index, acc, a_view, b_view):
# bandwidth = 1.0e-9 * (total_bytes / runtime)
# print(f"Runtime (seconds): {runtime}")
# print(f"Bandwidth (GB/s): {bandwidth}")

if __name__ == "__main__":
run()
73 changes: 38 additions & 35 deletions examples/BabelStream/workload/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

@pk.workload
class KokkosStream:
def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
scalar: float, num_times: int):
self.array_size: int = ARRAY_SIZE

Expand All @@ -18,7 +18,7 @@ def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
self.initB: pk.double = initB
self.initC: pk.double = initC
self.scalar: pk.double = scalar
self.num_times: int = num_times
self.num_times: int = num_times
self.sum: pk.double = 0

self.runtime: float = 0
Expand Down Expand Up @@ -48,38 +48,38 @@ def run(self):

self.runtime = timer.seconds()

# @pk.callback
# def results(self):
# goldA = self.initA
# goldB = self.initB
# goldC = self.initC

# for i in range(self.num_times):
# goldC = goldA
# goldB = self.scalar * goldC
# goldC = goldA + goldB
# goldA = goldB + self.scalar * goldC

# errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
# errA /= len(self.a)
# errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
# errB /= len(self.b)
# errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
# errC /= len(self.c)
# # epsi = sys.float_info.epsilon * 100
# epsi = 1e-8
# if (errA > epsi):
# print(f"Validation failed on a[]. Average error {errA}")
# if (errB > epsi):
# print(f"Validation failed on b[]. Average error {errB}")
# if (errC > epsi):
# print(f"Validation failed on c[]. Average error {errC}")

# goldSum = goldA * goldB * self.array_size
# errSum = self.sum - goldSum
# if (abs(errSum) > 1e-8):
# print(f"Validation failed on sum. Error {errSum}")
@pk.callback
def results(self):
goldA = self.initA
goldB = self.initB
goldC = self.initC

for i in range(self.num_times):
goldC = goldA
goldB = self.scalar * goldC
goldC = goldA + goldB
goldA = goldB + self.scalar * goldC

errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
errA /= len(self.a)
errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
errB /= len(self.b)
errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
errC /= len(self.c)

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
print(f"Validation failed on b[]. Average error {errB}")
if (errC > epsi):
print(f"Validation failed on c[]. Average error {errC}")

goldSum = goldA * goldB * self.array_size
errSum = self.sum - goldSum
if (abs(errSum) > 1e-8):
print(f"Validation failed on sum. Error {errSum}")

# total_bytes = 3 * sys.getsizeof(0.0) * self.array_size * num_times;
# bandwidth = 1.0e-9 * (total_bytes / self.runtime)
Expand Down Expand Up @@ -114,7 +114,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
acc += self.a[index] * self.b[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand All @@ -138,3 +138,6 @@ def dot(self, index: int, acc: pk.Acc[float]):

pk.set_default_space(space)
pk.execute(space, KokkosStream(array_size, startA, startB, startC, startScalar, num_times))

if __name__ == "__main__":
run()
6 changes: 4 additions & 2 deletions examples/ParRes/workload/nstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, iterations, length, offset):
self.scalar: float = 3
self.asum: float = 0

self.nstream_time: float = 0
self.nstream_time: float = 0

@pk.main
def run(self):
Expand Down Expand Up @@ -66,7 +66,7 @@ def init(self, i: int):
self.B[i] = 2
self.C[i] = 2

if __name__ == "__main__":
def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('length', type=int)
Expand Down Expand Up @@ -100,3 +100,5 @@ def init(self, i: int):
print("Offset = " , offset)
pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset))

if __name__ == "__main__":
run()
20 changes: 11 additions & 9 deletions examples/ParRes/workload/stencil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ def __init__(self, iterations, n, tile_size, star, radius):
self.out: pk.View2D[pk.double] = pk.View([self.n, self.n], pk.double, layout=pk.Layout.LayoutRight)
self.norm: float = 0

self.stencil_time: float = 0
self.stencil_time: float = 0

@pk.main
def run(self):
t: int = tile_size
r: int = radius

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
self.init)
pk.fence()

Expand All @@ -34,7 +34,7 @@ def run(self):
for i in range(iterations):
if (i == 1):
pk.fence()

if r == 1:
# star1 stencil
pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star1)
Expand All @@ -45,8 +45,8 @@ def run(self):
# star3 stencil
pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star3)

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
self.increment)

pk.fence()
Expand All @@ -55,7 +55,7 @@ def run(self):
active_points: int = (n-2*r)*(n-2*r)

# verify correctness
self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]),
self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]),
self.norm_reduce)
pk.fence()
self.norm /= active_points
Expand All @@ -78,7 +78,7 @@ def increment(self, i: int, j: int):

@pk.workunit
def norm_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
acc += abs(self.out[i][j])
acc += abs(self.out[i][j])

# @pk.callback
# def print_result(self):
Expand Down Expand Up @@ -121,7 +121,7 @@ def star3(self, i: int, j: int):
+self.inp[i][j+2] * 0.08333333333333333 \
+self.inp[i][j+3] * 0.05555555555555555

if __name__ == "__main__":
def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('n', type=int)
Expand Down Expand Up @@ -169,9 +169,11 @@ def star3(self, i: int, j: int):

n = 2 ** n
print("Number of iterations = ", iterations)
print("Grid size = ", n)
print("Grid size = ", n)
print("Tile size = ", tile_size)
print("Type of stencil = ", "star" if star else "grid")
print("Radius of stencil = ", radius)
pk.execute(pk.ExecutionSpace.Default, main(iterations, n, tile_size, star, radius))

if __name__ == "__main__":
run()
15 changes: 9 additions & 6 deletions examples/ParRes/workload/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@ def __init__(self, iterations, order, tile_size, permute):
self.iterations: int = iterations
self.order: int = order
self.tile_size: int = tile_size
self.permute: int = permute
self.permute: int = permute

self.A: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)
self.B: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)

self.abserr: float = 0
self.transpose_time: float = 0
self.transpose_time: float = 0
self.addit: float = (self.iterations) * (0.5 * (self.iterations - 1))

@pk.main
def run(self):
pk.parallel_for(
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init)
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init)
pk.fence()

timer = pk.Timer()
Expand All @@ -39,7 +39,7 @@ def run(self):
self.transpose_time = timer.seconds()

self.abserr = pk.parallel_reduce(
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]),
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]),
self.abserr_reduce)

pk.printf("%f\n", self.abserr)
Expand Down Expand Up @@ -69,9 +69,9 @@ def abserr_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
def tranpose(self, i: int, j: int):
self.B[i][j] += self.A[j][i]
self.A[j][i] += 1


if __name__ == "__main__":

def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('order', type=int)
Expand Down Expand Up @@ -112,3 +112,6 @@ def tranpose(self, i: int, j: int):
print("Tile size = " , tile_size)
print("Permute loops = " , "yes" if permute else "no")
pk.execute(pk.ExecutionSpace.Default, main(iterations, order, tile_size, permute))

if __name__ == "__main__":
run()
22 changes: 13 additions & 9 deletions examples/kokkos-benchmarks/functor/bytes_and_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def benchmark(self, team: pk.TeamMember):
n: int = team.league_rank()
for r in range(self.R):
def team_for(i: int):
a1: pk.double = self.A[n][i][0]
a1: pk.double = self.A[n][i][0]
b: pk.double = self.B[n][i][0]
a2: pk.double = a1 * 1.3
a3: pk.double = a2 * 1.1
Expand All @@ -51,13 +51,13 @@ def team_for(i: int):

pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for)

if __name__ == "__main__":
def run() -> None:
# example args
# Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0
# Cache Bound : 2 100000 1024 64 1 1 8 512 0
# Compute Bound : 2 100000 1024 1 1 8 64 256 0
# Load Slots Used : 2 20000 256 32 16 8 1 256 0
# Inefficient Load: 2 20000 256 32 2 8 1 256 0
# Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0
# Cache Bound : 2 100000 1024 64 1 1 8 512 0
# Compute Bound : 2 100000 1024 1 1 8 64 256 0
# Load Slots Used : 2 20000 256 32 16 8 1 256 0
# Inefficient Load: 2 20000 256 32 2 8 1 256 0
# NOTE: P and U are hard-coded to double and 8, because otherwise we would have a lot of duplicates
parser = argparse.ArgumentParser()
parser.add_argument("P", type=int, help="Precision (1==float, 2==double)")
Expand All @@ -84,7 +84,7 @@ def team_for(i: int):
exit(1)
if args.S != 0:
print("S must be 0 (shared scratch memory not supported)")
exit(1)
exit(1)

space = pk.ExecutionSpace.OpenMP
if args.execution_space:
Expand All @@ -98,7 +98,7 @@ def team_for(i: int):
T = args.T
S = args.S
scalar_size = 8

pk.set_default_space(space)

r = pk.TeamPolicy(N, T)
Expand All @@ -113,3 +113,7 @@ def team_for(i: int):
print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " +
f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}")
print(w.C)


if __name__ == "__main__":
run()
Loading

0 comments on commit 01f920f

Please sign in to comment.