Skip to content

Commit

Permalink
Merge pull request #195 from dodecatheon/add-intel-hpl
Browse files Browse the repository at this point in the history
Intel MKL-optimized HPL
  • Loading branch information
douglasjacobsen authored Jul 12, 2023
2 parents cc730ce + bde7c1a commit 080c07d
Show file tree
Hide file tree
Showing 2 changed files with 526 additions and 76 deletions.
270 changes: 194 additions & 76 deletions var/ramble/repos/builtin/applications/hpl/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,126 +11,184 @@
import math


def pad_value(value, comment):
    """Format an HPL.dat entry: *value* left-justified in a 14-char
    field, followed immediately by its descriptive *comment*."""
    return f'{value:<14}{comment}'


class Hpl(SpackApplication):
'''Define HPL application'''
name = 'hpl'

maintainers('douglasjacobsen', 'dodecatheon')

tags('benchmark-app', 'mini-app', 'benchmark')
tags('benchmark-app', 'benchmark', 'linpack')

default_compiler('gcc9', spack_spec='[email protected]')

software_spec('impi2018', spack_spec='[email protected]')
software_spec('impi_2018', spack_spec='[email protected]')

software_spec('hpl',
spack_spec='[email protected] +openmp',
compiler='gcc9')
software_spec('hpl', spack_spec='[email protected] +openmp', compiler='gcc9')

required_package('hpl')

executable('execute', 'xhpl', use_mpi=True)

workload('standard', executables=['execute'])
workload('calculator', executables=['execute'])

workload_variable('output_file', default='HPL.out output file name (if any)',
workload_variable('output_file',
default=pad_value('HPL.out', 'output file name (if any)'),
description='Output file name (if any)',
workloads=['standard'])
workload_variable('device_out', default='6 device out (6=stdout,7=stderr,file)',
workload_variable('device_out',
default=pad_value('6', 'device out (6=stdout,7=stderr,file)'),
description='Output device',
workloads=['standard'])
workload_variable('N-Ns', default='4 # of problems sizes (N)',
workload_variable('N-Ns',
default=pad_value('4', 'Number of problems sizes (N)'),
description='Number of problems sizes',
workloads=['standard'])
workload_variable('Ns', default='29 30 34 35 Ns',
workload_variable('Ns',
default=pad_value('29 30 34 35', 'Ns'),
description='Problem sizes',
workloads=['standard'])
workload_variable('N-NBs', default='4 # of NBs',
workload_variable('N-NBs',
default=pad_value('4', 'Number of NBs'),
description='Number of NBs',
workloads=['standard'])
workload_variable('NBs', default='1 2 3 4 NBs',
workload_variable('NBs',
default=pad_value('1 2 3 4', 'NBs'),
description='NB values',
workloads=['standard'])
workload_variable('PMAP', default='0 PMAP process mapping (0=Row-,1=Column-major)',
workload_variable('PMAP',
default=pad_value('0', 'PMAP process mapping (0=Row-,1=Column-major)'),
description='PMAP Process mapping. (0=Row-, 1=Column-Major)',
workloads=['standard'])
workload_variable('N-Grids', default='3 # of process grids (P x Q)',
workload_variable('N-Grids',
default=pad_value('3', 'Number of process grids (P x Q)'),
description='Number of process grids (P x Q)',
workloads=['standard'])
workload_variable('Ps', default='2 1 4 Ps',
workload_variable('Ps',
default=pad_value('2 1 4', 'Ps'),
description='P values',
workloads=['standard'])
workload_variable('Qs', default='2 4 1 Qs',
workload_variable('Qs',
default=pad_value('2 4 1', 'Qs'),
description='Q values',
workloads=['standard'])
workload_variable('threshold', default='16.0 threshold',
workload_variable('threshold',
default=pad_value('16.0', 'threshold'),
description='Residual threshold',
workloads=['standard'])
workload_variable('NPFACTS', default='3 # of panel fact',
workload_variable('NPFACTs',
default=pad_value('3', 'Number of PFACTs, panel fact'),
description='Number of PFACTs',
workloads=['standard'])
workload_variable('PFACTS', default='0 1 2 PFACTs (0=left, 1=Crout, 2=Right)',
workload_variable('PFACTs',
default=pad_value('0 1 2', 'PFACTs (0=left, 1=Crout, 2=Right)'),
description='PFACT Values',
workloads=['standard'])
workload_variable('N-NBMINs', default='2 # of recursive stopping criterium',
workload_variable('N-NBMINs',
default=pad_value('2', 'Number of NBMINs, recursive stopping criteria'),
description='Number of NBMINs',
workloads=['standard'])
workload_variable('NBMINs', default='2 4 NBMINs (>= 1)',
workload_variable('NBMINs',
default=pad_value('2 4', 'NBMINs (>= 1)'),
description='NBMIN values',
workloads=['standard'])
workload_variable('N-NDIVs', default='1 # of panels in recursion',
workload_variable('N-NDIVs',
default=pad_value('1', 'Number of NDIVs, panels in recursion'),
description='Number of NDIVs',
workloads=['standard'])
workload_variable('NDIVs', default='2 NDIVs',
workload_variable('NDIVs',
default=pad_value('2', 'NDIVs'),
description='NDIV values',
workloads=['standard'])
workload_variable('N-RFACTs', default='3 # of recursive panel fact.',
workload_variable('N-RFACTs',
default=pad_value('3', 'Number of RFACTs, recursive panel fact.'),
description='Number of RFACTs',
workloads=['standard'])
workload_variable('RFACTs', default='0 1 2 RFACTs (0=left, 1=Crout, 2=Right)',
workload_variable('RFACTs',
default=pad_value('0 1 2', 'RFACTs (0=left, 1=Crout, 2=Right)'),
description='RFACT values',
workloads=['standard'])
workload_variable('N-BCASTs', default='1 # of broadcast',
workload_variable('N-BCASTs',
default=pad_value('1', 'Number of BCASTs, broadcast'),
description='Number of BCASTs',
workloads=['standard'])
workload_variable('BCASTs', default='0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)',
workload_variable('BCASTs',
default=pad_value('0', 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)'),
description='BCAST values',
workloads=['standard'])
workload_variable('N-DEPTHs', default='1 # of lookahead depth',
workload_variable('N-DEPTHs',
default=pad_value('1', 'Number of DEPTHs, lookahead depth'),
description='Number of DEPTHs',
workloads=['standard'])
workload_variable('DEPTHs', default='0 DEPTHs (>=0)',
workload_variable('DEPTHs',
default=pad_value('0', 'DEPTHs (>=0)'),
description='DEPTH values',
workloads=['standard'])
workload_variable('SWAP', default='2 SWAP (0=bin-exch,1=long,2=mix)',
workload_variable('SWAP',
default=pad_value('2', 'SWAP (0=bin-exch,1=long,2=mix)'),
description='Swapping algorithm',
workloads=['standard'])
workload_variable('swapping_threshold', default='64 swapping threshold',
workload_variable('swapping_threshold',
default=pad_value('64', 'swapping threshold'),
description='Swapping threshold',
workloads=['standard'])
workload_variable('L1', default='0 L1 in (0=transposed,1=no-transposed) form',
workload_variable('L1',
default=pad_value('0', 'L1 in (0=transposed,1=no-transposed) form'),
description='Storage for upper triangular portion of columns',
workloads=['standard'])
workload_variable('U', default='0 U in (0=transposed,1=no-transposed) form',
workload_variable('U',
default=pad_value('0', 'U in (0=transposed,1=no-transposed) form'),
description='Storage for the rows of U',
workloads=['standard'])
workload_variable('Equilibration', default='1 Equilibration (0=no,1=yes)',
workload_variable('Equilibration',
default=pad_value('1', 'Equilibration (0=no,1=yes)'),
description='Determines if equilibration should be enabled or disabled.',
workloads=['standard'])
workload_variable('mem_alignment', default='8 memory alignment in double (> 0)',
workload_variable('mem_alignment',
default=pad_value('8', 'memory alignment in double (> 0)'),
description='Sets the alignment in doubles for memory addresses',
workloads=['standard'])

workload('calculator', executables=['execute'])
# calculator workload-specific variables:

workload_variable('percent_mem', default='85',
description='Percent of memory to use (default 85)',
workloads=['calculator'])

workload_variable('memory_per_node', default='128',
workload_variable('memory_per_node', default='240',
description='Memory per node in GB',
workloads=['calculator'])

workload_variable('block_size', default='224',
workload_variable('block_size', default='384',
description='Size of each block',
workloads=['calculator'])

workload_variable('pfact', default='0',
description='PFACT for optimized calculator',
workloads=['calculator'])

workload_variable('nbmin', default='2',
description='NBMIN for optimized calculator',
workloads=['calculator'])

workload_variable('rfact', default='0',
description='RFACT for optimized calculator',
workloads=['calculator'])

workload_variable('bcast', default='0',
description='BCAST for optimized calculator',
workloads=['calculator'])

workload_variable('depth', default='0',
description='DEPTH for optimized calculator',
workloads=['calculator'])

# FoMs:

figure_of_merit('Time', log_file='{experiment_run_dir}/{experiment_name}.out',
fom_regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n',
group_name='time', units='s', contexts=['problem-name'])
Expand All @@ -142,55 +200,109 @@ class Hpl(SpackApplication):

figure_of_merit_context('problem-name', regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n', output_format='{N}-{NB}-{P}-{Q}')

# Integer sqrt
def _isqrt(self, n):
if n < 0:
raise Exception
elif n < 2:
return (n)
else:
lo = self._isqrt(n >> 2) << 1
hi = lo + 1
if ((hi * hi) > n):
return (lo)
else:
return (hi)

def _calculate_values(self, workspace, expander):
if expander.workload_name == 'calculator':
memoryPerNode = int(expander.expand_var('{memory_per_node}'))
# Find the best P and Q whose product is the number of available
# cores, with P less than Q
nNodes = int(expander.expand_var('{n_nodes}'))
processesPerNode = int(expander.expand_var('{processes_per_node}'))
blockSize = int(expander.expand_var('{block_size}'))

targetProblemSize = 0.80 * int(math.sqrt((memoryPerNode
* nNodes * 1024 * 1024 * 1024)
/ 8))
nBlocks = int(targetProblemSize / blockSize)
nBlocks = nBlocks if (nBlocks % 2 == 0) else nBlocks - 1
problemSize = blockSize * nBlocks

totalCores = nNodes * processesPerNode
sqrtCores = int(math.sqrt(totalCores))

bestDist = totalCores - 1
bestP = 1
bestP = self._isqrt(totalCores)
while ((totalCores % bestP) > 0): # stops at 1 because any int % 1 = 0
bestP -= 1

bestQ = totalCores // bestP

for i in range(2, sqrtCores):
if totalCores % i == 0:
testDist = totalCores - i
if testDist < bestDist:
bestDist = testDist
bestP = i
# Find LCM(P,Q)
P = int(bestP)
Q = int(bestQ)
lcmPQ = Q # Q is always the larger of P and Q
while ((lcmPQ % P) > 0):
lcmPQ += Q

bestQ = int(totalCores / bestP)
# HPL maintainers recommend basing the target problem size on
# the square root of 80% of total memory in words.
memoryPerNode = int(expander.expand_var(expander.expansion_str('memory_per_node')))
memFraction = int(expander.expand_var(expander.expansion_str('percent_mem'))) / 100
blockSize = int(expander.expand_var(expander.expansion_str('block_size')))
one_gb_mem_in_words = (1 << 30) / 8

fullMemWords = nNodes * memoryPerNode * one_gb_mem_in_words

targetProblemSize = math.sqrt(fullMemWords * memFraction)

# Ensure that N is divisible by NB * LCM(P,Q)
problemSize = int(targetProblemSize)
problemSize -= (problemSize % blockSize)
nBlocks = problemSize // blockSize
nBlocks -= nBlocks % lcmPQ
problemSize = blockSize * nBlocks
usedPercentage = int(problemSize**2 / fullMemWords * 100)

for var, config in self.workload_variables['standard'].items():
self.variables[var] = config['default']

self.variables['N-Ns'] = '1'
self.variables['Ns'] = int(problemSize)
self.variables['N-NBs'] = '1'
self.variables['NBs'] = blockSize
self.variables['N-Grids'] = '1'
self.variables['Ps'] = int(bestP)
self.variables['Qs'] = int(bestQ)
self.variables['NPFACTS'] = '1'
self.variables['PFACTS'] = '2'
self.variables['N-NBMINs'] = '1'
self.variables['NBMINs'] = '4'
self.variables['N-RFACTs'] = '1'
self.variables['RFACTs'] = '1'
self.variables['N-BCASTs'] = '1'
self.variables['BCASTs'] = '1'
self.variables['N-DEPTHs'] = '1'
self.variables['DEPTHs'] = '1'
pfact = expander.expand_var(expander.expansion_str('pfact'))
nbmin = expander.expand_var(expander.expansion_str('nbmin'))
rfact = expander.expand_var(expander.expansion_str('rfact'))
bcast = expander.expand_var(expander.expansion_str('bcast'))
depth = expander.expand_var(expander.expansion_str('depth'))

self.variables['N-Ns'] = pad_value('1', 'Number of problems sizes (N)') # vs 4

# Calculated:
self.variables['Ns'] = pad_value(int(problemSize),
f"Ns (= {usedPercentage}% of total available memory)")

self.variables['N-NBs'] = pad_value('1', 'Number of NBs') # vs 4

# Calculated:
self.variables['NBs'] = pad_value(blockSize, "NBs") # calculated, vs 4 samples

self.variables['N-Grids'] = pad_value('1', 'Number of Grids, process grids (P x Q)') # vs 3

# Calculated:
self.variables['Ps'] = pad_value(int(bestP), "Ps")

# Calculated:
self.variables['Qs'] = pad_value(int(bestQ), "Qs")

self.variables['NPFACTs'] = pad_value('1', 'Number of PFACTs, panel fact') # vs 3

# ramble.yaml configurable
self.variables['PFACTs'] = pad_value(pfact, 'PFACT Values (0=left, 1=Crout, 2=Right)') # vs 0 1 2

self.variables['N-NBMINs'] = pad_value('1', 'Number of NBMINs, recursive stopping criteria') # vs 2

# ramble.yaml configurable
self.variables['NBMINs'] = pad_value(nbmin, 'NBMINs (>= 1)') # vs '2 4'

self.variables['N-RFACTs'] = pad_value('1', 'Number of RFACTS, recursive panel fact.') # vs '3'

# ramble.yaml configurable
self.variables['RFACTs'] = pad_value(rfact, 'RFACTs (0=left, 1=Crout, 2=Right)') # vs '0 1 2'

# ramble.yaml configurable
self.variables['BCASTs'] = pad_value(bcast, 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MKL BPUSH,7=AMD Hybrid Panel)') # vs '0'

# ramble.yaml configurable
self.variables['DEPTHs'] = pad_value(depth, 'DEPTHs (>=0)') # vs '0'

def _make_experiments(self, workspace):
super()._make_experiments(workspace)
Expand All @@ -199,8 +311,8 @@ def _make_experiments(self, workspace):
input_path = self.expander.expand_var('{experiment_run_dir}/HPL.dat')

settings = ['output_file', 'device_out', 'N-Ns', 'Ns', 'N-NBs', 'NBs',
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTS',
'PFACTS', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs',
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTs',
'PFACTs', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs',
'RFACTs', 'N-BCASTs', 'BCASTs', 'N-DEPTHs', 'DEPTHs', 'SWAP',
'swapping_threshold', 'L1', 'U', 'Equilibration',
'mem_alignment']
Expand All @@ -210,4 +322,10 @@ def _make_experiments(self, workspace):
f.write('Innovative Computing Laboratory, University of Tennessee\n')

for setting in settings:
f.write(self.expander.expand_var('{' + setting + '}') + '\n')
# This gets around an issue in expander where trailing comments
# after '#' are not printed
hash_replace_str = self.expander.expand_var(self.expander.expansion_str(setting)).replace('Number', '#')
f.write(hash_replace_str + '\n')

# Write some documentation at the bottom of the input file:
f.write('##### This line (no. 32) is ignored (it serves as a separator). ######')
Loading

0 comments on commit 080c07d

Please sign in to comment.