-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #195 from dodecatheon/add-intel-hpl
Intel MKL-optimized HPL
- Loading branch information
Showing 2 changed files with 526 additions and 76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,126 +11,184 @@ | |
import math | ||
|
||
|
||
def pad_value(val, desc):
    '''Format one input-file line: a value column followed by its description.

    The value is left-justified in a 14-character column and the
    description is appended after it.

    Args:
        val: Value to place in the first column (any object accepted by
            ``str.format``, e.g. a string or an int).
        desc: Descriptive text appended after the value column.

    Returns:
        The combined line as a string (without a trailing newline).
    '''
    field = '{:<14}'.format(val)
    # A value that fills (or overflows) the 14-character column receives no
    # padding at all, which would fuse it with the description. Guarantee at
    # least one separating space in that case; shorter values are unchanged.
    if not field.endswith(' '):
        field += ' '
    return field + desc
|
||
|
||
class Hpl(SpackApplication): | ||
'''Define HPL application''' | ||
name = 'hpl' | ||
|
||
maintainers('douglasjacobsen', 'dodecatheon') | ||
|
||
tags('benchmark-app', 'mini-app', 'benchmark') | ||
tags('benchmark-app', 'benchmark', 'linpack') | ||
|
||
default_compiler('gcc9', spack_spec='[email protected]') | ||
|
||
software_spec('impi2018', spack_spec='[email protected]') | ||
software_spec('impi_2018', spack_spec='[email protected]') | ||
|
||
software_spec('hpl', | ||
spack_spec='[email protected] +openmp', | ||
compiler='gcc9') | ||
software_spec('hpl', spack_spec='[email protected] +openmp', compiler='gcc9') | ||
|
||
required_package('hpl') | ||
|
||
executable('execute', 'xhpl', use_mpi=True) | ||
|
||
workload('standard', executables=['execute']) | ||
workload('calculator', executables=['execute']) | ||
|
||
workload_variable('output_file', default='HPL.out output file name (if any)', | ||
workload_variable('output_file', | ||
default=pad_value('HPL.out', 'output file name (if any)'), | ||
description='Output file name (if any)', | ||
workloads=['standard']) | ||
workload_variable('device_out', default='6 device out (6=stdout,7=stderr,file)', | ||
workload_variable('device_out', | ||
default=pad_value('6', 'device out (6=stdout,7=stderr,file)'), | ||
description='Output device', | ||
workloads=['standard']) | ||
workload_variable('N-Ns', default='4 # of problems sizes (N)', | ||
workload_variable('N-Ns', | ||
default=pad_value('4', 'Number of problems sizes (N)'), | ||
description='Number of problems sizes', | ||
workloads=['standard']) | ||
workload_variable('Ns', default='29 30 34 35 Ns', | ||
workload_variable('Ns', | ||
default=pad_value('29 30 34 35', 'Ns'), | ||
description='Problem sizes', | ||
workloads=['standard']) | ||
workload_variable('N-NBs', default='4 # of NBs', | ||
workload_variable('N-NBs', | ||
default=pad_value('4', 'Number of NBs'), | ||
description='Number of NBs', | ||
workloads=['standard']) | ||
workload_variable('NBs', default='1 2 3 4 NBs', | ||
workload_variable('NBs', | ||
default=pad_value('1 2 3 4', 'NBs'), | ||
description='NB values', | ||
workloads=['standard']) | ||
workload_variable('PMAP', default='0 PMAP process mapping (0=Row-,1=Column-major)', | ||
workload_variable('PMAP', | ||
default=pad_value('0', 'PMAP process mapping (0=Row-,1=Column-major)'), | ||
description='PMAP Process mapping. (0=Row-, 1=Column-Major)', | ||
workloads=['standard']) | ||
workload_variable('N-Grids', default='3 # of process grids (P x Q)', | ||
workload_variable('N-Grids', | ||
default=pad_value('3', 'Number of process grids (P x Q)'), | ||
description='Number of process grids (P x Q)', | ||
workloads=['standard']) | ||
workload_variable('Ps', default='2 1 4 Ps', | ||
workload_variable('Ps', | ||
default=pad_value('2 1 4', 'Ps'), | ||
description='P values', | ||
workloads=['standard']) | ||
workload_variable('Qs', default='2 4 1 Qs', | ||
workload_variable('Qs', | ||
default=pad_value('2 4 1', 'Qs'), | ||
description='Q values', | ||
workloads=['standard']) | ||
workload_variable('threshold', default='16.0 threshold', | ||
workload_variable('threshold', | ||
default=pad_value('16.0', 'threshold'), | ||
description='Residual threshold', | ||
workloads=['standard']) | ||
workload_variable('NPFACTS', default='3 # of panel fact', | ||
workload_variable('NPFACTs', | ||
default=pad_value('3', 'Number of PFACTs, panel fact'), | ||
description='Number of PFACTs', | ||
workloads=['standard']) | ||
workload_variable('PFACTS', default='0 1 2 PFACTs (0=left, 1=Crout, 2=Right)', | ||
workload_variable('PFACTs', | ||
default=pad_value('0 1 2', 'PFACTs (0=left, 1=Crout, 2=Right)'), | ||
description='PFACT Values', | ||
workloads=['standard']) | ||
workload_variable('N-NBMINs', default='2 # of recursive stopping criterium', | ||
workload_variable('N-NBMINs', | ||
default=pad_value('2', 'Number of NBMINs, recursive stopping criteria'), | ||
description='Number of NBMINs', | ||
workloads=['standard']) | ||
workload_variable('NBMINs', default='2 4 NBMINs (>= 1)', | ||
workload_variable('NBMINs', | ||
default=pad_value('2 4', 'NBMINs (>= 1)'), | ||
description='NBMIN values', | ||
workloads=['standard']) | ||
workload_variable('N-NDIVs', default='1 # of panels in recursion', | ||
workload_variable('N-NDIVs', | ||
default=pad_value('1', 'Number of NDIVs, panels in recursion'), | ||
description='Number of NDIVs', | ||
workloads=['standard']) | ||
workload_variable('NDIVs', default='2 NDIVs', | ||
workload_variable('NDIVs', | ||
default=pad_value('2', 'NDIVs'), | ||
description='NDIV values', | ||
workloads=['standard']) | ||
workload_variable('N-RFACTs', default='3 # of recursive panel fact.', | ||
workload_variable('N-RFACTs', | ||
default=pad_value('3', 'Number of RFACTs, recursive panel fact.'), | ||
description='Number of RFACTs', | ||
workloads=['standard']) | ||
workload_variable('RFACTs', default='0 1 2 RFACTs (0=left, 1=Crout, 2=Right)', | ||
workload_variable('RFACTs', | ||
default=pad_value('0 1 2', 'RFACTs (0=left, 1=Crout, 2=Right)'), | ||
description='RFACT values', | ||
workloads=['standard']) | ||
workload_variable('N-BCASTs', default='1 # of broadcast', | ||
workload_variable('N-BCASTs', | ||
default=pad_value('1', 'Number of BCASTs, broadcast'), | ||
description='Number of BCASTs', | ||
workloads=['standard']) | ||
workload_variable('BCASTs', default='0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)', | ||
workload_variable('BCASTs', | ||
default=pad_value('0', 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)'), | ||
description='BCAST values', | ||
workloads=['standard']) | ||
workload_variable('N-DEPTHs', default='1 # of lookahead depth', | ||
workload_variable('N-DEPTHs', | ||
default=pad_value('1', 'Number of DEPTHs, lookahead depth'), | ||
description='Number of DEPTHs', | ||
workloads=['standard']) | ||
workload_variable('DEPTHs', default='0 DEPTHs (>=0)', | ||
workload_variable('DEPTHs', | ||
default=pad_value('0', 'DEPTHs (>=0)'), | ||
description='DEPTH values', | ||
workloads=['standard']) | ||
workload_variable('SWAP', default='2 SWAP (0=bin-exch,1=long,2=mix)', | ||
workload_variable('SWAP', | ||
default=pad_value('2', 'SWAP (0=bin-exch,1=long,2=mix)'), | ||
description='Swapping algorithm', | ||
workloads=['standard']) | ||
workload_variable('swapping_threshold', default='64 swapping threshold', | ||
workload_variable('swapping_threshold', | ||
default=pad_value('64', 'swapping threshold'), | ||
description='Swapping threshold', | ||
workloads=['standard']) | ||
workload_variable('L1', default='0 L1 in (0=transposed,1=no-transposed) form', | ||
workload_variable('L1', | ||
default=pad_value('0', 'L1 in (0=transposed,1=no-transposed) form'), | ||
description='Storage for upper triangular portion of columns', | ||
workloads=['standard']) | ||
workload_variable('U', default='0 U in (0=transposed,1=no-transposed) form', | ||
workload_variable('U', | ||
default=pad_value('0', 'U in (0=transposed,1=no-transposed) form'), | ||
description='Storage for the rows of U', | ||
workloads=['standard']) | ||
workload_variable('Equilibration', default='1 Equilibration (0=no,1=yes)', | ||
workload_variable('Equilibration', | ||
default=pad_value('1', 'Equilibration (0=no,1=yes)'), | ||
description='Determines if equilibration should be enabled or disabled.', | ||
workloads=['standard']) | ||
workload_variable('mem_alignment', default='8 memory alignment in double (> 0)', | ||
workload_variable('mem_alignment', | ||
default=pad_value('8', 'memory alignment in double (> 0)'), | ||
description='Sets the alignment in doubles for memory addresses', | ||
workloads=['standard']) | ||
|
||
workload('calculator', executables=['execute']) | ||
# calculator workload-specific variables: | ||
|
||
workload_variable('percent_mem', default='85', | ||
description='Percent of memory to use (default 85)', | ||
workloads=['calculator']) | ||
|
||
workload_variable('memory_per_node', default='128', | ||
workload_variable('memory_per_node', default='240', | ||
description='Memory per node in GB', | ||
workloads=['calculator']) | ||
|
||
workload_variable('block_size', default='224', | ||
workload_variable('block_size', default='384', | ||
description='Size of each block', | ||
workloads=['calculator']) | ||
|
||
workload_variable('pfact', default='0', | ||
description='PFACT for optimized calculator', | ||
workloads=['calculator']) | ||
|
||
workload_variable('nbmin', default='2', | ||
description='NBMIN for optimized calculator', | ||
workloads=['calculator']) | ||
|
||
workload_variable('rfact', default='0', | ||
description='RFACT for optimized calculator', | ||
workloads=['calculator']) | ||
|
||
workload_variable('bcast', default='0', | ||
description='BCAST for optimized calculator', | ||
workloads=['calculator']) | ||
|
||
workload_variable('depth', default='0', | ||
description='DEPTH for optimized calculator', | ||
workloads=['calculator']) | ||
|
||
# FoMs: | ||
|
||
figure_of_merit('Time', log_file='{experiment_run_dir}/{experiment_name}.out', | ||
fom_regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n', | ||
group_name='time', units='s', contexts=['problem-name']) | ||
|
@@ -142,55 +200,109 @@ class Hpl(SpackApplication): | |
|
||
figure_of_merit_context('problem-name', regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n', output_format='{N}-{NB}-{P}-{Q}') | ||
|
||
# Integer sqrt | ||
def _isqrt(self, n): | ||
if n < 0: | ||
raise Exception | ||
elif n < 2: | ||
return (n) | ||
else: | ||
lo = self._isqrt(n >> 2) << 1 | ||
hi = lo + 1 | ||
if ((hi * hi) > n): | ||
return (lo) | ||
else: | ||
return (hi) | ||
|
||
def _calculate_values(self, workspace, expander): | ||
if expander.workload_name == 'calculator': | ||
memoryPerNode = int(expander.expand_var('{memory_per_node}')) | ||
# Find the best P and Q whose product is the number of available | ||
# cores, with P less than or equal to Q | |
nNodes = int(expander.expand_var('{n_nodes}')) | ||
processesPerNode = int(expander.expand_var('{processes_per_node}')) | ||
blockSize = int(expander.expand_var('{block_size}')) | ||
|
||
targetProblemSize = 0.80 * int(math.sqrt((memoryPerNode | ||
* nNodes * 1024 * 1024 * 1024) | ||
/ 8)) | ||
nBlocks = int(targetProblemSize / blockSize) | ||
nBlocks = nBlocks if (nBlocks % 2 == 0) else nBlocks - 1 | ||
problemSize = blockSize * nBlocks | ||
|
||
totalCores = nNodes * processesPerNode | ||
sqrtCores = int(math.sqrt(totalCores)) | ||
|
||
bestDist = totalCores - 1 | ||
bestP = 1 | ||
bestP = self._isqrt(totalCores) | ||
while ((totalCores % bestP) > 0): # stops at 1 because any int % 1 = 0 | ||
bestP -= 1 | ||
|
||
bestQ = totalCores // bestP | ||
|
||
for i in range(2, sqrtCores): | ||
if totalCores % i == 0: | ||
testDist = totalCores - i | ||
if testDist < bestDist: | ||
bestDist = testDist | ||
bestP = i | ||
# Find LCM(P,Q) | ||
P = int(bestP) | ||
Q = int(bestQ) | ||
lcmPQ = Q # Q is always the larger of P and Q | ||
while ((lcmPQ % P) > 0): | ||
lcmPQ += Q | ||
|
||
bestQ = int(totalCores / bestP) | ||
# HPL maintainers recommend basing the target problem size on | ||
# the square root of 80% of total memory in words. | ||
memoryPerNode = int(expander.expand_var(expander.expansion_str('memory_per_node'))) | ||
memFraction = int(expander.expand_var(expander.expansion_str('percent_mem'))) / 100 | ||
blockSize = int(expander.expand_var(expander.expansion_str('block_size'))) | ||
one_gb_mem_in_words = (1 << 30) / 8 | ||
|
||
fullMemWords = nNodes * memoryPerNode * one_gb_mem_in_words | ||
|
||
targetProblemSize = math.sqrt(fullMemWords * memFraction) | ||
|
||
# Ensure that N is divisible by NB * LCM(P,Q) | ||
problemSize = int(targetProblemSize) | ||
problemSize -= (problemSize % blockSize) | ||
nBlocks = problemSize // blockSize | ||
nBlocks -= nBlocks % lcmPQ | ||
problemSize = blockSize * nBlocks | ||
usedPercentage = int(problemSize**2 / fullMemWords * 100) | ||
|
||
for var, config in self.workload_variables['standard'].items(): | ||
self.variables[var] = config['default'] | ||
|
||
self.variables['N-Ns'] = '1' | ||
self.variables['Ns'] = int(problemSize) | ||
self.variables['N-NBs'] = '1' | ||
self.variables['NBs'] = blockSize | ||
self.variables['N-Grids'] = '1' | ||
self.variables['Ps'] = int(bestP) | ||
self.variables['Qs'] = int(bestQ) | ||
self.variables['NPFACTS'] = '1' | ||
self.variables['PFACTS'] = '2' | ||
self.variables['N-NBMINs'] = '1' | ||
self.variables['NBMINs'] = '4' | ||
self.variables['N-RFACTs'] = '1' | ||
self.variables['RFACTs'] = '1' | ||
self.variables['N-BCASTs'] = '1' | ||
self.variables['BCASTs'] = '1' | ||
self.variables['N-DEPTHs'] = '1' | ||
self.variables['DEPTHs'] = '1' | ||
pfact = expander.expand_var(expander.expansion_str('pfact')) | ||
nbmin = expander.expand_var(expander.expansion_str('nbmin')) | ||
rfact = expander.expand_var(expander.expansion_str('rfact')) | ||
bcast = expander.expand_var(expander.expansion_str('bcast')) | ||
depth = expander.expand_var(expander.expansion_str('depth')) | ||
|
||
self.variables['N-Ns'] = pad_value('1', 'Number of problems sizes (N)') # vs 4 | ||
|
||
# Calculated: | ||
self.variables['Ns'] = pad_value(int(problemSize), | ||
f"Ns (= {usedPercentage}% of total available memory)") | ||
|
||
self.variables['N-NBs'] = pad_value('1', 'Number of NBs') # vs 4 | ||
|
||
# Calculated: | ||
self.variables['NBs'] = pad_value(blockSize, "NBs") # calculated, vs 4 samples | ||
|
||
self.variables['N-Grids'] = pad_value('1', 'Number of Grids, process grids (P x Q)') # vs 3 | ||
|
||
# Calculated: | ||
self.variables['Ps'] = pad_value(int(bestP), "Ps") | ||
|
||
# Calculated: | ||
self.variables['Qs'] = pad_value(int(bestQ), "Qs") | ||
|
||
self.variables['NPFACTs'] = pad_value('1', 'Number of PFACTs, panel fact') # vs 3 | ||
|
||
# ramble.yaml configurable | ||
self.variables['PFACTs'] = pad_value(pfact, 'PFACT Values (0=left, 1=Crout, 2=Right)') # vs 0 1 2 | ||
|
||
self.variables['N-NBMINs'] = pad_value('1', 'Number of NBMINs, recursive stopping criteria') # vs 2 | ||
|
||
# ramble.yaml configurable | ||
self.variables['NBMINs'] = pad_value(nbmin, 'NBMINs (>= 1)') # vs '2 4' | ||
|
||
self.variables['N-RFACTs'] = pad_value('1', 'Number of RFACTS, recursive panel fact.') # vs '3' | ||
|
||
# ramble.yaml configurable | ||
self.variables['RFACTs'] = pad_value(rfact, 'RFACTs (0=left, 1=Crout, 2=Right)') # vs '0 1 2' | ||
|
||
# ramble.yaml configurable | ||
self.variables['BCASTs'] = pad_value(bcast, 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MKL BPUSH,7=AMD Hybrid Panel)') # vs '0' | ||
|
||
# ramble.yaml configurable | ||
self.variables['DEPTHs'] = pad_value(depth, 'DEPTHs (>=0)') # vs '0' | ||
|
||
def _make_experiments(self, workspace): | ||
super()._make_experiments(workspace) | ||
|
@@ -199,8 +311,8 @@ def _make_experiments(self, workspace): | |
input_path = self.expander.expand_var('{experiment_run_dir}/HPL.dat') | ||
|
||
settings = ['output_file', 'device_out', 'N-Ns', 'Ns', 'N-NBs', 'NBs', | ||
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTS', | ||
'PFACTS', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs', | ||
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTs', | ||
'PFACTs', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs', | ||
'RFACTs', 'N-BCASTs', 'BCASTs', 'N-DEPTHs', 'DEPTHs', 'SWAP', | ||
'swapping_threshold', 'L1', 'U', 'Equilibration', | ||
'mem_alignment'] | ||
|
@@ -210,4 +322,10 @@ def _make_experiments(self, workspace): | |
f.write('Innovative Computing Laboratory, University of Tennessee\n') | ||
|
||
for setting in settings: | ||
f.write(self.expander.expand_var('{' + setting + '}') + '\n') | ||
# This gets around an issue in expander where trailing comments | ||
# after '#' are not printed | ||
hash_replace_str = self.expander.expand_var(self.expander.expansion_str(setting)).replace('Number', '#') | ||
f.write(hash_replace_str + '\n') | ||
|
||
# Write some documentation at the bottom of the input file: | ||
f.write('##### This line (no. 32) is ignored (it serves as a separator). ######') |
Oops, something went wrong.