Skip to content

Commit

Permalink
Merge pull request #195 from dodecatheon/add-intel-hpl
Browse files Browse the repository at this point in the history
Intel MKL-optimized HPL
  • Loading branch information
douglasjacobsen authored Jul 12, 2023
2 parents cc730ce + bde7c1a commit 080c07d
Show file tree
Hide file tree
Showing 2 changed files with 526 additions and 76 deletions.
270 changes: 194 additions & 76 deletions var/ramble/repos/builtin/applications/hpl/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,126 +11,184 @@
import math


def pad_value(value, comment):
    """Format an HPL.dat entry: *value* left-justified in a 14-char
    field, followed immediately by its descriptive *comment*."""
    return f'{value:<14}{comment}'


class Hpl(SpackApplication):
'''Define HPL application'''
name = 'hpl'

maintainers('douglasjacobsen', 'dodecatheon')

tags('benchmark-app', 'mini-app', 'benchmark')
tags('benchmark-app', 'benchmark', 'linpack')

default_compiler('gcc9', spack_spec='[email protected]')

software_spec('impi2018', spack_spec='[email protected]')
software_spec('impi_2018', spack_spec='[email protected]')

software_spec('hpl',
spack_spec='[email protected] +openmp',
compiler='gcc9')
software_spec('hpl', spack_spec='[email protected] +openmp', compiler='gcc9')

required_package('hpl')

executable('execute', 'xhpl', use_mpi=True)

workload('standard', executables=['execute'])
workload('calculator', executables=['execute'])

workload_variable('output_file', default='HPL.out output file name (if any)',
workload_variable('output_file',
default=pad_value('HPL.out', 'output file name (if any)'),
description='Output file name (if any)',
workloads=['standard'])
workload_variable('device_out', default='6 device out (6=stdout,7=stderr,file)',
workload_variable('device_out',
default=pad_value('6', 'device out (6=stdout,7=stderr,file)'),
description='Output device',
workloads=['standard'])
workload_variable('N-Ns', default='4 # of problems sizes (N)',
workload_variable('N-Ns',
default=pad_value('4', 'Number of problems sizes (N)'),
description='Number of problems sizes',
workloads=['standard'])
workload_variable('Ns', default='29 30 34 35 Ns',
workload_variable('Ns',
default=pad_value('29 30 34 35', 'Ns'),
description='Problem sizes',
workloads=['standard'])
workload_variable('N-NBs', default='4 # of NBs',
workload_variable('N-NBs',
default=pad_value('4', 'Number of NBs'),
description='Number of NBs',
workloads=['standard'])
workload_variable('NBs', default='1 2 3 4 NBs',
workload_variable('NBs',
default=pad_value('1 2 3 4', 'NBs'),
description='NB values',
workloads=['standard'])
workload_variable('PMAP', default='0 PMAP process mapping (0=Row-,1=Column-major)',
workload_variable('PMAP',
default=pad_value('0', 'PMAP process mapping (0=Row-,1=Column-major)'),
description='PMAP Process mapping. (0=Row-, 1=Column-Major)',
workloads=['standard'])
workload_variable('N-Grids', default='3 # of process grids (P x Q)',
workload_variable('N-Grids',
default=pad_value('3', 'Number of process grids (P x Q)'),
description='Number of process grids (P x Q)',
workloads=['standard'])
workload_variable('Ps', default='2 1 4 Ps',
workload_variable('Ps',
default=pad_value('2 1 4', 'Ps'),
description='P values',
workloads=['standard'])
workload_variable('Qs', default='2 4 1 Qs',
workload_variable('Qs',
default=pad_value('2 4 1', 'Qs'),
description='Q values',
workloads=['standard'])
workload_variable('threshold', default='16.0 threshold',
workload_variable('threshold',
default=pad_value('16.0', 'threshold'),
description='Residual threshold',
workloads=['standard'])
workload_variable('NPFACTS', default='3 # of panel fact',
workload_variable('NPFACTs',
default=pad_value('3', 'Number of PFACTs, panel fact'),
description='Number of PFACTs',
workloads=['standard'])
workload_variable('PFACTS', default='0 1 2 PFACTs (0=left, 1=Crout, 2=Right)',
workload_variable('PFACTs',
default=pad_value('0 1 2', 'PFACTs (0=left, 1=Crout, 2=Right)'),
description='PFACT Values',
workloads=['standard'])
workload_variable('N-NBMINs', default='2 # of recursive stopping criterium',
workload_variable('N-NBMINs',
default=pad_value('2', 'Number of NBMINs, recursive stopping criteria'),
description='Number of NBMINs',
workloads=['standard'])
workload_variable('NBMINs', default='2 4 NBMINs (>= 1)',
workload_variable('NBMINs',
default=pad_value('2 4', 'NBMINs (>= 1)'),
description='NBMIN values',
workloads=['standard'])
workload_variable('N-NDIVs', default='1 # of panels in recursion',
workload_variable('N-NDIVs',
default=pad_value('1', 'Number of NDIVs, panels in recursion'),
description='Number of NDIVs',
workloads=['standard'])
workload_variable('NDIVs', default='2 NDIVs',
workload_variable('NDIVs',
default=pad_value('2', 'NDIVs'),
description='NDIV values',
workloads=['standard'])
workload_variable('N-RFACTs', default='3 # of recursive panel fact.',
workload_variable('N-RFACTs',
default=pad_value('3', 'Number of RFACTs, recursive panel fact.'),
description='Number of RFACTs',
workloads=['standard'])
workload_variable('RFACTs', default='0 1 2 RFACTs (0=left, 1=Crout, 2=Right)',
workload_variable('RFACTs',
default=pad_value('0 1 2', 'RFACTs (0=left, 1=Crout, 2=Right)'),
description='RFACT values',
workloads=['standard'])
workload_variable('N-BCASTs', default='1 # of broadcast',
workload_variable('N-BCASTs',
default=pad_value('1', 'Number of BCASTs, broadcast'),
description='Number of BCASTs',
workloads=['standard'])
workload_variable('BCASTs', default='0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)',
workload_variable('BCASTs',
default=pad_value('0', 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)'),
description='BCAST values',
workloads=['standard'])
workload_variable('N-DEPTHs', default='1 # of lookahead depth',
workload_variable('N-DEPTHs',
default=pad_value('1', 'Number of DEPTHs, lookahead depth'),
description='Number of DEPTHs',
workloads=['standard'])
workload_variable('DEPTHs', default='0 DEPTHs (>=0)',
workload_variable('DEPTHs',
default=pad_value('0', 'DEPTHs (>=0)'),
description='DEPTH values',
workloads=['standard'])
workload_variable('SWAP', default='2 SWAP (0=bin-exch,1=long,2=mix)',
workload_variable('SWAP',
default=pad_value('2', 'SWAP (0=bin-exch,1=long,2=mix)'),
description='Swapping algorithm',
workloads=['standard'])
workload_variable('swapping_threshold', default='64 swapping threshold',
workload_variable('swapping_threshold',
default=pad_value('64', 'swapping threshold'),
description='Swapping threshold',
workloads=['standard'])
workload_variable('L1', default='0 L1 in (0=transposed,1=no-transposed) form',
workload_variable('L1',
default=pad_value('0', 'L1 in (0=transposed,1=no-transposed) form'),
description='Storage for upper triangular portion of columns',
workloads=['standard'])
workload_variable('U', default='0 U in (0=transposed,1=no-transposed) form',
workload_variable('U',
default=pad_value('0', 'U in (0=transposed,1=no-transposed) form'),
description='Storage for the rows of U',
workloads=['standard'])
workload_variable('Equilibration', default='1 Equilibration (0=no,1=yes)',
workload_variable('Equilibration',
default=pad_value('1', 'Equilibration (0=no,1=yes)'),
description='Determines if equilibration should be enabled or disabled.',
workloads=['standard'])
workload_variable('mem_alignment', default='8 memory alignment in double (> 0)',
workload_variable('mem_alignment',
default=pad_value('8', 'memory alignment in double (> 0)'),
description='Sets the alignment in doubles for memory addresses',
workloads=['standard'])

workload('calculator', executables=['execute'])
# calculator workload-specific variables:

workload_variable('percent_mem', default='85',
description='Percent of memory to use (default 85)',
workloads=['calculator'])

workload_variable('memory_per_node', default='128',
workload_variable('memory_per_node', default='240',
description='Memory per node in GB',
workloads=['calculator'])

workload_variable('block_size', default='224',
workload_variable('block_size', default='384',
description='Size of each block',
workloads=['calculator'])

workload_variable('pfact', default='0',
description='PFACT for optimized calculator',
workloads=['calculator'])

workload_variable('nbmin', default='2',
description='NBMIN for optimized calculator',
workloads=['calculator'])

workload_variable('rfact', default='0',
description='RFACT for optimized calculator',
workloads=['calculator'])

workload_variable('bcast', default='0',
description='BCAST for optimized calculator',
workloads=['calculator'])

workload_variable('depth', default='0',
description='DEPTH for optimized calculator',
workloads=['calculator'])

# FoMs:

figure_of_merit('Time', log_file='{experiment_run_dir}/{experiment_name}.out',
fom_regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n',
group_name='time', units='s', contexts=['problem-name'])
Expand All @@ -142,55 +200,109 @@ class Hpl(SpackApplication):

figure_of_merit_context('problem-name', regex=r'.*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+(?P<gflops>[0-9].*)\n', output_format='{N}-{NB}-{P}-{Q}')

# Integer sqrt
def _isqrt(self, n):
if n < 0:
raise Exception
elif n < 2:
return (n)
else:
lo = self._isqrt(n >> 2) << 1
hi = lo + 1
if ((hi * hi) > n):
return (lo)
else:
return (hi)

def _calculate_values(self, workspace, expander):
if expander.workload_name == 'calculator':
memoryPerNode = int(expander.expand_var('{memory_per_node}'))
# Find the best P and Q whose product is the number of available
# cores, with P less than Q
nNodes = int(expander.expand_var('{n_nodes}'))
processesPerNode = int(expander.expand_var('{processes_per_node}'))
blockSize = int(expander.expand_var('{block_size}'))

targetProblemSize = 0.80 * int(math.sqrt((memoryPerNode
* nNodes * 1024 * 1024 * 1024)
/ 8))
nBlocks = int(targetProblemSize / blockSize)
nBlocks = nBlocks if (nBlocks % 2 == 0) else nBlocks - 1
problemSize = blockSize * nBlocks

totalCores = nNodes * processesPerNode
sqrtCores = int(math.sqrt(totalCores))

bestDist = totalCores - 1
bestP = 1
bestP = self._isqrt(totalCores)
while ((totalCores % bestP) > 0): # stops at 1 because any int % 1 = 0
bestP -= 1

bestQ = totalCores // bestP

for i in range(2, sqrtCores):
if totalCores % i == 0:
testDist = totalCores - i
if testDist < bestDist:
bestDist = testDist
bestP = i
# Find LCM(P,Q)
P = int(bestP)
Q = int(bestQ)
lcmPQ = Q # Q is always the larger of P and Q
while ((lcmPQ % P) > 0):
lcmPQ += Q

bestQ = int(totalCores / bestP)
# HPL maintainers recommend basing the target problem size on
# the square root of 80% of total memory in words.
memoryPerNode = int(expander.expand_var(expander.expansion_str('memory_per_node')))
memFraction = int(expander.expand_var(expander.expansion_str('percent_mem'))) / 100
blockSize = int(expander.expand_var(expander.expansion_str('block_size')))
one_gb_mem_in_words = (1 << 30) / 8

fullMemWords = nNodes * memoryPerNode * one_gb_mem_in_words

targetProblemSize = math.sqrt(fullMemWords * memFraction)

# Ensure that N is divisible by NB * LCM(P,Q)
problemSize = int(targetProblemSize)
problemSize -= (problemSize % blockSize)
nBlocks = problemSize // blockSize
nBlocks -= nBlocks % lcmPQ
problemSize = blockSize * nBlocks
usedPercentage = int(problemSize**2 / fullMemWords * 100)

for var, config in self.workload_variables['standard'].items():
self.variables[var] = config['default']

self.variables['N-Ns'] = '1'
self.variables['Ns'] = int(problemSize)
self.variables['N-NBs'] = '1'
self.variables['NBs'] = blockSize
self.variables['N-Grids'] = '1'
self.variables['Ps'] = int(bestP)
self.variables['Qs'] = int(bestQ)
self.variables['NPFACTS'] = '1'
self.variables['PFACTS'] = '2'
self.variables['N-NBMINs'] = '1'
self.variables['NBMINs'] = '4'
self.variables['N-RFACTs'] = '1'
self.variables['RFACTs'] = '1'
self.variables['N-BCASTs'] = '1'
self.variables['BCASTs'] = '1'
self.variables['N-DEPTHs'] = '1'
self.variables['DEPTHs'] = '1'
pfact = expander.expand_var(expander.expansion_str('pfact'))
nbmin = expander.expand_var(expander.expansion_str('nbmin'))
rfact = expander.expand_var(expander.expansion_str('rfact'))
bcast = expander.expand_var(expander.expansion_str('bcast'))
depth = expander.expand_var(expander.expansion_str('depth'))

self.variables['N-Ns'] = pad_value('1', 'Number of problems sizes (N)') # vs 4

# Calculated:
self.variables['Ns'] = pad_value(int(problemSize),
f"Ns (= {usedPercentage}% of total available memory)")

self.variables['N-NBs'] = pad_value('1', 'Number of NBs') # vs 4

# Calculated:
self.variables['NBs'] = pad_value(blockSize, "NBs") # calculated, vs 4 samples

self.variables['N-Grids'] = pad_value('1', 'Number of Grids, process grids (P x Q)') # vs 3

# Calculated:
self.variables['Ps'] = pad_value(int(bestP), "Ps")

# Calculated:
self.variables['Qs'] = pad_value(int(bestQ), "Qs")

self.variables['NPFACTs'] = pad_value('1', 'Number of PFACTs, panel fact') # vs 3

# ramble.yaml configurable
self.variables['PFACTs'] = pad_value(pfact, 'PFACT Values (0=left, 1=Crout, 2=Right)') # vs 0 1 2

self.variables['N-NBMINs'] = pad_value('1', 'Number of NBMINs, recursive stopping criteria') # vs 2

# ramble.yaml configurable
self.variables['NBMINs'] = pad_value(nbmin, 'NBMINs (>= 1)') # vs '2 4'

self.variables['N-RFACTs'] = pad_value('1', 'Number of RFACTS, recursive panel fact.') # vs '3'

# ramble.yaml configurable
self.variables['RFACTs'] = pad_value(rfact, 'RFACTs (0=left, 1=Crout, 2=Right)') # vs '0 1 2'

# ramble.yaml configurable
self.variables['BCASTs'] = pad_value(bcast, 'BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MKL BPUSH,7=AMD Hybrid Panel)') # vs '0'

# ramble.yaml configurable
self.variables['DEPTHs'] = pad_value(depth, 'DEPTHs (>=0)') # vs '0'

def _make_experiments(self, workspace):
super()._make_experiments(workspace)
Expand All @@ -199,8 +311,8 @@ def _make_experiments(self, workspace):
input_path = self.expander.expand_var('{experiment_run_dir}/HPL.dat')

settings = ['output_file', 'device_out', 'N-Ns', 'Ns', 'N-NBs', 'NBs',
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTS',
'PFACTS', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs',
'PMAP', 'N-Grids', 'Ps', 'Qs', 'threshold', 'NPFACTs',
'PFACTs', 'N-NBMINs', 'NBMINs', 'N-NDIVs', 'NDIVs', 'N-RFACTs',
'RFACTs', 'N-BCASTs', 'BCASTs', 'N-DEPTHs', 'DEPTHs', 'SWAP',
'swapping_threshold', 'L1', 'U', 'Equilibration',
'mem_alignment']
Expand All @@ -210,4 +322,10 @@ def _make_experiments(self, workspace):
f.write('Innovative Computing Laboratory, University of Tennessee\n')

for setting in settings:
f.write(self.expander.expand_var('{' + setting + '}') + '\n')
# This gets around an issue in expander where trailing comments
# after '#' are not printed
hash_replace_str = self.expander.expand_var(self.expander.expansion_str(setting)).replace('Number', '#')
f.write(hash_replace_str + '\n')

# Write some documentation at the bottom of the input file:
f.write('##### This line (no. 32) is ignored (it serves as a separator). ######')
Loading

0 comments on commit 080c07d

Please sign in to comment.