diff --git a/loki/transformations/build_system/dependency.py b/loki/transformations/build_system/dependency.py index 27c616bd6..73acee66a 100644 --- a/loki/transformations/build_system/dependency.py +++ b/loki/transformations/build_system/dependency.py @@ -10,7 +10,8 @@ from loki.backend import fgen from loki.batch import Transformation from loki.ir import ( - CallStatement, Import, Interface, FindNodes, FindInlineCalls, Transformer + CallStatement, Import, Interface, FindNodes, FindInlineCalls, Transformer, + Pragma, get_pragma_parameters ) from loki.logging import warning from loki.module import Module @@ -131,6 +132,8 @@ def transform_module(self, module, **kwargs): ), ) + self.rename_omp_target_declare_pragmas(module) + targets = tuple(str(t).lower() for t in as_tuple(kwargs.get('targets'))) if self.replace_ignore_items and (item := kwargs.get('item')): targets += tuple(str(i).lower() for i in item.ignore) @@ -188,6 +191,17 @@ def transform_subroutine(self, routine, **kwargs): # Re-generate C-style interface header self.generate_interfaces(routine) + + def rename_omp_target_declare_pragmas(self, module): + """ + Update :any:`Pragma` `!$omp declare target data ` accordingly. 
+ """ + for pragma in FindNodes(Pragma).visit(module.spec): + pragma_parameters = get_pragma_parameters(pragma, starts_with='declare', only_loki_pragmas=False) + if 'target' in pragma_parameters: + pragma._update(content=f"declare target({pragma_parameters['target']}{self.suffix.lower()})") + + def remove_inactive_ir_nodes(self, body, transformed_scope_name, **kwargs): """ Utility to filter :any:`Scope` nodes in :data:`body` to include only diff --git a/loki/transformations/data_offload/offload.py b/loki/transformations/data_offload/offload.py index 13df21248..caa9c0362 100644 --- a/loki/transformations/data_offload/offload.py +++ b/loki/transformations/data_offload/offload.py @@ -28,6 +28,9 @@ class DataOffloadTransformation(Transformation): Parameters ---------- + directive : str, optional + Pragma/Directive language to be used (OpenACC via 'openacc' + or OpenMP offload via 'omp-gpu') remove_openmp : bool Remove any existing OpenMP pragmas inside the marked region. present_on_device : bool @@ -37,9 +40,11 @@ class DataOffloadTransformation(Transformation): is being managed outside of structured OpenACC data regions. 
""" - def __init__(self, **kwargs): + def __init__(self, directive='openacc', **kwargs): # We need to record if we actually added any, so # that down-stream processing can use that info + self.directive = directive + assert self.directive in ['openacc', 'omp-gpu'] self.has_data_regions = False self.remove_openmp = kwargs.get('remove_openmp', False) self.assume_deviceptr = kwargs.get('assume_deviceptr', False) @@ -148,28 +153,40 @@ def insert_data_offload_pragmas(self, routine, targets): outargs = tuple(dict.fromkeys(outargs)) inoutargs = tuple(dict.fromkeys(inoutargs)) - # Now generate the pre- and post pragmas (OpenACC) - if self.present_on_device: - if self.assume_deviceptr: - offload_args = inargs + outargs + inoutargs - if offload_args: - deviceptr = f' deviceptr({", ".join(offload_args)})' + # Now generate the pre- and post pragmas (OpenACC or OpenMP) + pragma = None + pragma_post = None + if self.directive == 'openacc': + if self.present_on_device: + if self.assume_deviceptr: + offload_args = inargs + outargs + inoutargs + if offload_args: + deviceptr = f' deviceptr({", ".join(offload_args)})' + else: + deviceptr = '' + pragma = Pragma(keyword='acc', content=f'data{deviceptr}') else: - deviceptr = '' - pragma = Pragma(keyword='acc', content=f'data{deviceptr}') + offload_args = inargs + outargs + inoutargs + if offload_args: + present = f' present({", ".join(offload_args)})' + else: + present = '' + pragma = Pragma(keyword='acc', content=f'data{present}') else: - offload_args = inargs + outargs + inoutargs - if offload_args: - present = f' present({", ".join(offload_args)})' - else: - present = '' - pragma = Pragma(keyword='acc', content=f'data{present}') - else: - copyin = f'copyin({", ".join(inargs)})' if inargs else '' - copy = f'copy({", ".join(inoutargs)})' if inoutargs else '' - copyout = f'copyout({", ".join(outargs)})' if outargs else '' - pragma = Pragma(keyword='acc', content=f'data {copyin} {copy} {copyout}') - pragma_post = Pragma(keyword='acc', 
content='end data') + copyin = f'copyin({", ".join(inargs)})' if inargs else '' + copy = f'copy({", ".join(inoutargs)})' if inoutargs else '' + copyout = f'copyout({", ".join(outargs)})' if outargs else '' + pragma = Pragma(keyword='acc', content=f'data {copyin} {copy} {copyout}') + pragma_post = Pragma(keyword='acc', content='end data') + elif self.directive == 'omp-gpu': + if self.present_on_device: + ... # TODO: OpenMP offload if self.present_on_device + else: + copyin = f'map(to: {", ".join(inargs)})' if inargs else '' + copy = f'map(tofrom:{", ".join(inoutargs)})' if inoutargs else '' + copyout = f'map(from: {", ".join(outargs)})' if outargs else '' + pragma = Pragma(keyword='omp', content=f'target data {copyin} {copy} {copyout}') + pragma_post = Pragma(keyword='omp', content='end target data') pragma_map[region.pragma] = pragma pragma_map[region.pragma_post] = pragma_post diff --git a/loki/transformations/pool_allocator.py b/loki/transformations/pool_allocator.py index f5e2d3208..479e513d9 100644 --- a/loki/transformations/pool_allocator.py +++ b/loki/transformations/pool_allocator.py @@ -451,6 +451,14 @@ def _get_stack_storage_and_size_var(self, routine, stack_size): body_prepend += [pragma_data_start] pragma_data_end = Pragma(keyword='acc', content='end data') body_append += [pragma_data_end] + elif self.directive == 'omp-gpu': + pragma_data_start = Pragma( + keyword='omp', + content=f'target enter data map(alloc: {stack_storage.name})' # pylint: disable=no-member + ) + body_prepend += [pragma_data_start] + pragma_data_end = Pragma(keyword='omp', content=f'target exit data map(delete: {stack_storage.name})') # pylint: disable=no-member + body_append += [pragma_data_end] body_append += [stack_dealloc] # Inject new variables and body nodes diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index fb82f7876..de13c792c 100644 --- a/loki/transformations/single_column/annotate.py +++ 
b/loki/transformations/single_column/annotate.py @@ -37,7 +37,7 @@ class SCCAnnotateTransformation(Transformation): to use for hoisted column arrays if hoisting is enabled. directive : string or None Directives flavour to use for parallelism annotations; either - ``'openacc'`` or ``None``. + ``'openacc'``, ``'omp-gpu'``or ``None``. """ def __init__(self, directive, block_dim): @@ -69,19 +69,40 @@ def annotate_vector_loops(self, routine): f'{[a.name for a in private_arrays]}' ) - with pragmas_attached(routine, ir.Loop): - for loop in FindNodes(ir.Loop).visit(routine.body): - for pragma in as_tuple(loop.pragma): - if is_loki_pragma(pragma, starts_with='loop vector reduction'): - # Turn reduction pragmas into `!$acc` equivalent - pragma._update(keyword='acc') - continue - - if is_loki_pragma(pragma, starts_with='loop vector'): - # Turn general vector pragmas into `!$acc` and add private clause - private_arrs = ', '.join(v.name for v in private_arrays) - private_clause = '' if not private_arrays else f' private({private_arrs})' - pragma._update(keyword='acc', content=f'loop vector{private_clause}') + if self.directive == 'openacc': + with pragmas_attached(routine, ir.Loop): + for loop in FindNodes(ir.Loop).visit(routine.body): + for pragma in as_tuple(loop.pragma): + if is_loki_pragma(pragma, starts_with='loop vector reduction'): + # Turn reduction pragmas into `!$acc` equivalent + pragma._update(keyword='acc') + continue + + if is_loki_pragma(pragma, starts_with='loop vector'): + # Turn general vector pragmas into `!$acc` and add private clause + private_arrs = ', '.join(v.name for v in private_arrays) + private_clause = '' if not private_arrays else f' private({private_arrs})' + pragma._update(keyword='acc', content=f'loop vector{private_clause}') + + if self.directive == 'omp-gpu': + with pragmas_attached(routine, ir.Loop): + for loop in FindNodes(ir.Loop).visit(routine.body): + for pragma in as_tuple(loop.pragma): + #TODO: how to handle vector reductions? 
+ + if is_loki_pragma(pragma, starts_with='loop vector'): + # TODO: need for privatizing variables/arrays? + pragma_new = ir.Pragma(keyword='omp', content='parallel do') + pragma_post = ir.Pragma(keyword='omp', content='end parallel do') + # pragma_new = ir.Pragma(keyword='omp', content='loop bind(parallel)') + # pragma_post = ir.Pragma(keyword='omp', content='end loop') + + # Replace existing loki pragma and add post-pragma + loop_pragmas = tuple(p for p in as_tuple(loop.pragma) if p is not pragma) + loop._update( + pragma=loop_pragmas + (pragma_new,), + pragma_post=(pragma_post,) + as_tuple(loop.pragma_post) + ) def annotate_sequential_loops(self, routine): """ @@ -93,19 +114,20 @@ def annotate_sequential_loops(self, routine): routine : :any:`Subroutine` The subroutine in which to annotate sequential loops """ - with pragmas_attached(routine, ir.Loop): - for loop in FindNodes(ir.Loop).visit(routine.body): - if not is_loki_pragma(loop.pragma, starts_with='loop seq'): - continue + if self.directive == 'openacc': + with pragmas_attached(routine, ir.Loop): + for loop in FindNodes(ir.Loop).visit(routine.body): + if not is_loki_pragma(loop.pragma, starts_with='loop seq'): + continue - # Replace internal `!$loki loop seq`` pragam with `!$acc` equivalent - loop._update(pragma=(ir.Pragma(keyword='acc', content='loop seq'),)) + # Replace internal `!$loki loop seq`` pragam with `!$acc` equivalent + loop._update(pragma=(ir.Pragma(keyword='acc', content='loop seq'),)) - # Warn if we detect vector insisde sequential loop nesting - nested_loops = FindNodes(ir.Loop).visit(loop.body) - loop_pragmas = flatten(as_tuple(l.pragma) for l in as_tuple(nested_loops)) - if any('loop vector' in pragma.content for pragma in loop_pragmas): - info(f'[Loki-SCC::Annotate] Detected vector loop in sequential loop in {routine.name}') + # Warn if we detect vector insisde sequential loop nesting + nested_loops = FindNodes(ir.Loop).visit(loop.body) + loop_pragmas = flatten(as_tuple(l.pragma) for l in 
as_tuple(nested_loops)) + if any('loop vector' in pragma.content for pragma in loop_pragmas): + info(f'[Loki-SCC::Annotate] Detected vector loop in sequential loop in {routine.name}') def annotate_kernel_routine(self, routine): """ @@ -118,27 +140,38 @@ def annotate_kernel_routine(self, routine): The subroutine to which annotations will be added """ - # Update `!$loki routine seq/vector` pragmas with `!$acc` - pragma_map = {} - for pragma in FindNodes(ir.Pragma).visit(routine.ir): - if is_loki_pragma(pragma, starts_with='routine'): - # We have to re-insert the pragma here, in case it was - # falsely attributed to the body! - routine.spec.append(pragma.clone(keyword='acc')) - pragma_map[pragma] = None - pragma_transformer = Transformer(pragma_map) - routine.spec = pragma_transformer.visit(routine.spec) - routine.body = pragma_transformer.visit(routine.body) - - # Get the names of all array and derived type arguments - args = [a for a in routine.arguments if isinstance(a, sym.Array)] - args += [a for a in routine.arguments if isinstance(a.type.dtype, DerivedType)] - argnames = [str(a.name) for a in args] - - if argnames: - routine.body.prepend(ir.Pragma(keyword='acc', content=f'data present({", ".join(argnames)})')) - # Add comment to prevent false-attachment in case it is preceded by an "END DO" statement - routine.body.append((ir.Comment(text=''), ir.Pragma(keyword='acc', content='end data'))) + if self.directive == 'openacc': + # Update `!$loki routine seq/vector` pragmas with `!$acc` + pragma_map = {} + for pragma in FindNodes(ir.Pragma).visit(routine.ir): + if is_loki_pragma(pragma, starts_with='routine'): + # We have to re-insert the pragma here, in case it was + # falsely attributed to the body! 
+ routine.spec.append(pragma.clone(keyword='acc')) + pragma_map[pragma] = None + pragma_transformer = Transformer(pragma_map) + routine.spec = pragma_transformer.visit(routine.spec) + routine.body = pragma_transformer.visit(routine.body) + + # Get the names of all array and derived type arguments + args = [a for a in routine.arguments if isinstance(a, sym.Array)] + args += [a for a in routine.arguments if isinstance(a.type.dtype, DerivedType)] + argnames = [str(a.name) for a in args] + + if argnames: + routine.body.prepend(ir.Pragma(keyword='acc', content=f'data present({", ".join(argnames)})')) + # Add comment to prevent false-attachment in case it is preceded by an "END DO" statement + routine.body.append((ir.Comment(text=''), ir.Pragma(keyword='acc', content='end data'))) + + if self.directive == 'omp-gpu': + # IF and only IF we would need to do it in the parent module for some compiler(s) + # problem would be that not all the relevant routines do have a module they live in ... + # try: + # scope = item.scope_ir + # scope.parent.spec.append(ir.Pragma(keyword='omp', content=f'declare target({routine.name.lower()})')) + # except: + # pass + routine.spec.append(ir.Pragma(keyword='omp', content='declare target')) def transform_subroutine(self, routine, **kwargs): """ @@ -166,14 +199,18 @@ def transform_subroutine(self, routine, **kwargs): role = kwargs['role'] targets = as_tuple(kwargs.get('targets')) - if not self.directive == 'openacc': + if not self.directive in ['openacc', 'omp-gpu']: return if role == 'kernel': # Bail if this routine has been processed before for p in FindNodes(ir.Pragma).visit(routine.ir): # Check if `!$acc routine` has already been added - if p.keyword.lower() == 'acc' and 'routine' in p.content.lower(): + if self.directive == 'openacc'\ + and p.keyword.lower() == 'acc' and 'routine' in p.content.lower(): + return + if self.directive == 'omp-gpu' and\ + p.keyword.lower() == 'omp' and 'declare target' in p.content.lower(): return # Mark all 
parallel vector loops as `!$acc loop vector` @@ -249,7 +286,7 @@ def find_acc_vars(self, routine, targets): return acc_vars @classmethod - def device_alloc_column_locals(cls, routine, column_locals): + def device_alloc_column_locals(cls, routine, column_locals, directive='openacc'): """ Add explicit OpenACC statements for creating device variables for hoisted column locals. @@ -263,11 +300,19 @@ def device_alloc_column_locals(cls, routine, column_locals): if column_locals: vnames = ', '.join(v.name for v in column_locals) - pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})') - pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})') - # Add comments around standalone pragmas to avoid false attachment - routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) - routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) + if directive == 'openacc': + pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})') + pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})') + # Add comments around standalone pragmas to avoid false attachment + routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) + routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) + if directive == 'omp-gpu': + pragma = ir.Pragma(keyword='omp', content=f'omp target enter data map(alloc: {vnames})') + routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) + pragma_post = ir.Pragma(keyword='omp', content=f'target exit data map(delete: {vnames})') + # Add comments around standalone pragmas to avoid false attachment + routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) + routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) def annotate_driver_loop(self, loop, acc_vars): """ @@ -311,3 +356,26 @@ def annotate_driver_loop(self, loop, acc_vars): pragma=loop_pragmas + (pragma_new,), pragma_post=(pragma_post,) + as_tuple(loop.pragma_post) ) + + if self.directive == 
'omp-gpu': + # TODO: no need to privatize arrays? + for pragma in as_tuple(loop.pragma): + if is_loki_pragma(pragma, starts_with='loop driver'): + # Replace `!$loki loop driver` pragma with OpenMP offload equivalent + params = get_pragma_parameters(loop.pragma, starts_with='loop driver') + vlength = params.get('vector_length') + vlength_clause = f' thread_limit({vlength})' if vlength else '' + + content = f'target teams distribute{vlength_clause}' + pragma_new = ir.Pragma(keyword='omp', content=content) + pragma_post = ir.Pragma(keyword='omp', content='end target teams distribute') + # content = f'target teams loop bind(teams){vlength_clause}' + # pragma_new = ir.Pragma(keyword='omp', content=content) + # pragma_post = ir.Pragma(keyword='omp', content='end target teams loop') + + # Replace existing loki pragma and add post-pragma + loop_pragmas = tuple(p for p in as_tuple(loop.pragma) if p is not pragma) + loop._update( + pragma=loop_pragmas + (pragma_new,), + pragma_post=(pragma_post,) + as_tuple(loop.pragma_post) + ) diff --git a/loki/transformations/single_column/base.py b/loki/transformations/single_column/base.py index 183b1d3e1..5499ab003 100644 --- a/loki/transformations/single_column/base.py +++ b/loki/transformations/single_column/base.py @@ -41,7 +41,7 @@ class methods can be called directly. def __init__(self, horizontal, directive=None): self.horizontal = horizontal - assert directive in [None, 'openacc'] + assert directive in [None, 'openacc', 'omp-gpu'] self.directive = directive # TODO: correct "definition" of a pure/elemental routine (take e.g. loki serial into account ...) 
diff --git a/loki/transformations/single_column/hoist.py b/loki/transformations/single_column/hoist.py index 290897100..9ca531809 100644 --- a/loki/transformations/single_column/hoist.py +++ b/loki/transformations/single_column/hoist.py @@ -30,10 +30,14 @@ class SCCHoistTemporaryArraysTransformation(HoistVariablesTransformation): block_dim : :any:`Dimension` :any:`Dimension` object to define the blocking dimension to use for hoisted array arguments on the driver side. + directive : str, optional + Pragma/directive to be used for offloading, either OpenACC via + `openacc` or OpenMP offload via `omp-gpu` """ - def __init__(self, block_dim=None, **kwargs): + def __init__(self, block_dim=None, directive='openacc', **kwargs): self.block_dim = block_dim + self.directive = directive super().__init__(**kwargs) def driver_variable_declaration(self, routine, variables): @@ -65,13 +69,19 @@ def driver_variable_declaration(self, routine, variables): # Add explicit device-side allocations/deallocations for hoisted temporaries vnames = ', '.join(v.name for v in variables) + pragma = None + pragma_post = None if vnames: - pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})') - pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})') - + if self.directive == 'openacc': + pragma = ir.Pragma(keyword='acc', content=f'enter data create({vnames})') + pragma_post = ir.Pragma(keyword='acc', content=f'exit data delete({vnames})') + if self.directive == 'omp-gpu': + pragma = ir.Pragma(keyword='omp', content=f'omp target enter data map(alloc: {vnames})') + pragma_post = ir.Pragma(keyword='omp', content=f'omp target exit data map(delete: {vnames})') # Add comments around standalone pragmas to avoid false attachment routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) - routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) + if pragma_post is not None: + routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) def 
driver_call_argument_remapping(self, routine, call, variables): """ diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 0ad4dcc01..7605fb061 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -19,17 +19,18 @@ ) from loki.tools import as_tuple, flatten from loki.types import BasicType - from loki.transformations.array_indexing import demote_variables from loki.transformations.utilities import ( get_integer_variable, get_loop_bounds, find_driver_loops, - get_local_arrays, check_routine_sequential + get_local_arrays, check_routine_sequential, + single_variable_declaration ) __all__ = [ 'SCCDevectorTransformation', 'SCCRevectorTransformation', - 'SCCDemoteTransformation', 'wrap_vector_section' + 'SCCDemoteTransformation', 'wrap_vector_section', + 'SCCRevectorSeqKernelsTransformation', 'SCCRevectorVectorKernelsTransformation' ] @@ -252,7 +253,7 @@ def process_driver(self, routine, targets=()): routine.body = Transformer(driver_loop_map).visit(routine.body) -def wrap_vector_section(section, routine, horizontal, insert_pragma=True): +def wrap_vector_section_from_dimension(section, routine, horizontal=None, insert_pragma=True): """ Wrap a section of nodes in a vector-level loop across the horizontal. @@ -268,9 +269,27 @@ def wrap_vector_section(section, routine, horizontal, insert_pragma=True): Adds a ``!$loki vector`` pragma around the created loop """ bounds = get_loop_bounds(routine, dimension=horizontal) + return wrap_vector_section(section, routine, bounds, horizontal.index, insert_pragma=insert_pragma) + +def wrap_vector_section(section, routine, bounds, index, insert_pragma=True): + """ + Wrap a section of nodes in a vector-level loop across the horizontal. 
+ The loop variable and its range are passed explicitly via ``index`` and ``bounds``. + + Parameters + ---------- + section : tuple of :any:`Node` + A section of nodes to be wrapped in a vector-level loop + routine : :any:`Subroutine` + The subroutine used to resolve the loop index variable. + bounds, index : loop range and loop index variable + Passed to ``sym.LoopRange`` and ``get_integer_variable``, respectively + insert_pragma: bool, optional + Adds a ``!$loki vector`` pragma around the created loop + """ # Create a single loop around the horizontal from a given body - index = get_integer_variable(routine, horizontal.index) + index = get_integer_variable(routine, index) bounds = sym.LoopRange(bounds) # Ensure we clone all body nodes, to avoid recursion issues @@ -285,7 +304,7 @@ def wrap_vector_section(section, routine, horizontal, insert_pragma=True): return (ir.Comment(''), vector_loop, ir.Comment('')) -class SCCRevectorTransformation(Transformation): +class SCCRevectorVectorKernelsTransformation(Transformation): """ A transformation to wrap thread-parallel IR sections within a horizontal loop. This transformation relies on markers placed by :any:`SCCDevectorTransformation`. @@ -297,9 +316,8 @@ class SCCRevectorTransformation(Transformation): to define the horizontal data dimension and iteration space. 
""" - def __init__(self, horizontal, remove_vector_section=False): + def __init__(self, horizontal): self.horizontal = horizontal - self.remove_vector_section = remove_vector_section def revector_section(self, routine, section): """ @@ -317,7 +335,7 @@ def revector_section(self, routine, section): """ # Wrap all thread-parallel sections into horizontal thread loops mapper = { - s: wrap_vector_section(s.body, routine, self.horizontal) + s: wrap_vector_section_from_dimension(s.body, routine, self.horizontal) for s in FindNodes(ir.Section).visit(section) if s.label == 'vector_section' } @@ -451,6 +469,262 @@ def transform_subroutine(self, routine, **kwargs): self.mark_driver_loop(routine, loop) +class SCCRevectorSeqKernelsTransformation(Transformation): + """ + A transformation to wrap thread-parallel IR sections within a horizontal loop + in a way that the horizontal loop is moved to the driver level. + This transformation relies on markers placed by :any:`SCCDevectorTransformation`. + + Parameters + ---------- + horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. + """ + + process_ignored_items = True + + def __init__(self, horizontal): + self.horizontal = horizontal + + def remove_vector_sections(self, section): + """ + Remove all thread-parallel :any:`Section` objects within a given + code section + + Parameters + ---------- + routine : :any:`Subroutine` + Subroutine to apply this transformation to. + section : tuple of :any:`Node` + Code section in which to replace vector-parallel + :any:`Section` objects. 
+ """ + # Replace each marked vector section by its bare body, dropping the marker + mapper = { + s: s.body + for s in FindNodes(ir.Section).visit(section) + if s.label == 'vector_section' + } + return Transformer(mapper).visit(section) + + def revector_section(self, routine, section): + """ + Wrap all thread-parallel :any:`Section` objects within a given + code section in a horizontal loop and mark interior loops as + ``!$loki loop seq``. + + Parameters + ---------- + routine : :any:`Subroutine` + Subroutine to apply this transformation to. + section : tuple of :any:`Node` + Code section in which to replace vector-parallel + :any:`Section` objects. + """ + # Wrap all thread-parallel sections into horizontal thread loops + mapper = { + s: wrap_vector_section_from_dimension(s.body, routine, self.horizontal) + for s in FindNodes(ir.Section).visit(section) + if s.label == 'vector_section' + } + return Transformer(mapper).visit(section) + + def mark_vector_reductions(self, routine, section): + """ + Mark vector-reduction loops in marked vector-reduction + regions. + + If a region explicitly marked with + ``!$loki vector-reduction()``/ + ``!$loki end vector-reduction`` is encountered, we replace + existing ``!$loki loop vector`` loop pragmas and add the + reduction keyword and clause. These will be turned into + OpenACC equivalents by :any:`SCCAnnotate`. 
+ """ + with pragma_regions_attached(routine): + for region in FindNodes(ir.PragmaRegion).visit(section): + if is_loki_pragma(region.pragma, starts_with='vector-reduction'): + if (reduction_clause := re.search(r'reduction\([\w:0-9 \t]+\)', region.pragma.content)): + + loops = FindNodes(ir.Loop).visit(region) + assert len(loops) == 1 + pragma = ir.Pragma(keyword='loki', content=f'loop vector {reduction_clause[0]}') + # Update loop and region in place to remove marker pragmas + loops[0]._update(pragma=(pragma,)) + region._update(pragma=None, pragma_post=None) + + + def mark_seq_loops(self, section): + """ + Mark interior sequential loops in a thread-parallel section + with ``!$loki loop seq`` for later annotation. + + This utility requires loop-pragmas to be attached via + :any:`pragmas_attached`. It also updates loops in-place. + + Parameters + ---------- + section : tuple of :any:`Node` + Code section in which to mark "seq loops". + """ + for loop in FindNodes(ir.Loop).visit(section): + + # Skip loops explicitly marked with `!$loki/claw nodep` + if loop.pragma and any('nodep' in p.content.lower() for p in as_tuple(loop.pragma)): + continue + + # Mark loop as sequential with `!$loki loop seq` + if loop.variable != self.horizontal.index: + loop._update(pragma=(ir.Pragma(keyword='loki', content='loop seq'),)) + + def mark_driver_loop(self, routine, loop): + """ + Add ``!$loki loop driver`` pragmas to outer block loops and + add ``vector_length(size)`` clause for later annotations. + + This method assumes that pragmas have been attached via + :any:`pragmas_attached`. 
+ """ + # Find a horizontal size variable to mark vector_length + symbol_map = routine.symbol_map + sizes = tuple( + symbol_map.get(size) for size in self.horizontal.size_expressions + if size in symbol_map + ) + vector_length = f' vector_length({sizes[0]})' if sizes else '' + + # Replace existing `!$loki loop driver markers, but leave all others + pragma = ir.Pragma(keyword='loki', content=f'loop driver{vector_length}') + loop_pragmas = tuple( + p for p in as_tuple(loop.pragma) if not is_loki_pragma(p, starts_with='driver-loop') + ) + loop._update(pragma=loop_pragmas + (pragma,)) + + + def convert_kwargs_to_args(self, routine, **kwargs): + targets = kwargs.get('targets', ()) + for call in FindNodes(ir.CallStatement).visit(routine.body): + if call.name in targets: + if not any(arg.type.optional for arg in call.routine.arguments): + call.convert_kwargs_to_args() + + def transform_subroutine(self, routine, **kwargs): + """ + Wrap vector-parallel sections in vector :any:`Loop` objects. + + This wraps all thread-parallel sections within "kernel" + routines or within the parallel loops in "driver" routines. + + The markers placed by :any:`SCCDevectorTransformation` are removed + + Parameters + ---------- + routine : :any:`Subroutine` + Subroutine to apply this transformation to. 
+ role : str + Must be either ``"kernel"`` or ``"driver"`` + targets : tuple or str + Tuple of target routine names for determining "driver" loops + """ + role = kwargs['role'] + targets = kwargs.get('targets', ()) + # ignore = kwargs.get('ignore', ()) + item = kwargs.get('item', None) + ignore = item.ignore if item else () + + if role == 'kernel': + # Skip if kernel is marked as `!$loki routine seq` + if check_routine_sequential(routine): + return + + if self.horizontal.index not in routine.variables: + jl = get_integer_variable(routine, self.horizontal.index) + routine.arguments += (jl.clone(type=jl.type.clone(intent='in')),) + else: + single_variable_declaration(routine, variables=(self.horizontal.index,)) + routine.symbol_attrs.update({self.horizontal.index:\ + routine.variable_map[self.horizontal.index].type.clone(intent='in')}) + if self.horizontal.index not in routine.arguments: + routine.arguments += (get_integer_variable(routine, self.horizontal.index),) + + # add horizontal.index as argument for calls/routines being in targets + call_map = {} + for call in FindNodes(ir.CallStatement).visit(routine.body): + if call.name in targets or call.routine.name.lower() in ignore: + if check_routine_sequential(call.routine): + continue + if self.horizontal.index not in call.arg_map: + new_kwarg = (self.horizontal.index, get_integer_variable(routine, self.horizontal.index)) + updated_call = call.clone(kwarguments=call.kwarguments + (new_kwarg,)) + call_map[call] = updated_call + if call.routine.name.lower() in ignore: + if self.horizontal.index not in call.routine.variables: + jl = get_integer_variable(call.routine, self.horizontal.index) + call.routine.arguments += (jl.clone(type=jl.type.clone(intent='in')),) + else: + single_variable_declaration(call.routine, variables=(self.horizontal.index,)) + call.routine.symbol_attrs.update({self.horizontal.index:\ + call.routine.variable_map[self.horizontal.index].type.clone(intent='in')}) + if self.horizontal.index not in 
call.routine.arguments: + call.routine.arguments += (get_integer_variable(call.routine, self.horizontal.index),) + routine.body = Transformer(call_map).visit(routine.body) + + # Revector all marked vector sections within the kernel body + routine.body = self.remove_vector_sections(routine.body) + + with pragmas_attached(routine, ir.Loop): + # Check for explicitly labelled vector-reduction regions + self.mark_vector_reductions(routine, routine.body) + + # Mark sequential loops inside vector sections + self.mark_seq_loops(routine.body) + + # Mark subroutine as vector parallel for later annotation + # routine.spec.append(ir.Pragma(keyword='loki', content='routine vector')) + routine.spec.append(ir.Pragma(keyword='loki', content='routine seq')) + + if role == 'driver': + + # add horizontal.index, e.g., 'jl' + index = get_integer_variable(routine, self.horizontal.index) + routine.variables += (index,) + + with pragmas_attached(routine, ir.Loop): + driver_loops = find_driver_loops(section=routine.body, targets=targets) + + for loop in driver_loops: + + # Wrap calls being in targets in a horizontal loop and add horizontal.index as argument + call_map = {} + for call in FindNodes(ir.CallStatement).visit(loop.body): + if call.name in targets: + if self.horizontal.index not in call.arg_map: + new_kwarg = (self.horizontal.index, + get_integer_variable(routine, self.horizontal.index)) + updated_call = call.clone(kwarguments=call.kwarguments + (new_kwarg,)) + call_arg_map = {k.name.lower(): v for (k, v) in call.arg_map.items()} + loop_bounds = (call_arg_map[self.horizontal.lower.lower()], + call_arg_map[self.horizontal.upper.lower()]) + call_map[call] = wrap_vector_section((updated_call,), routine, bounds=loop_bounds, + insert_pragma=True, index=self.horizontal.index) + + loop._update(body=Transformer(call_map).visit(loop.body)) + + # Revector all marked sections within the driver loop body + loop._update(body=self.revector_section(routine, loop.body)) + + # Check for 
explicitly labelled vector-reduction regions + self.mark_vector_reductions(routine, loop.body) + + # Mark sequential loops inside vector sections + self.mark_seq_loops(loop.body) + + # Mark outer driver loops + self.mark_driver_loop(routine, loop) + + # self.convert_kwargs_to_args(routine, **kwargs) + class SCCDemoteTransformation(Transformation): """ A set of utilities to determine which local arrays can be safely demoted in a @@ -584,3 +858,33 @@ def process_kernel(self, routine, demote_locals=True, preserve_arrays=None): routine, variable_names=variables, dimensions=self.horizontal.sizes ) + +class SCCRevectorTransformation(Transformation): + """ + A transformation to wrap thread-parallel IR sections within a horizontal loop + either by + + * revectoring in the kernels itself if ``'seq_kernels=False'`` + * or in a way that the horizontal loop is moved to the driver level ``'seq_kernels=True'`` + + This transformation relies on markers placed by :any:`SCCDevectorTransformation`. + + Parameters + ---------- + horizontal : :any:`Dimension` + :any:`Dimension` object describing the variable conventions used in code + to define the horizontal data dimension and iteration space. + seq_kernels : bool, optional + Whether to revector in the kernels itself (``'seq_kernels=False'``, being the default) + or move the horizontal loop to + the driver level (``'seq_kernels=True'``). + """ + def __init__(self, horizontal, seq_kernels=False): + self.seq_kernels = seq_kernels + if self.seq_kernels: + self.revector_trafo = SCCRevectorSeqKernelsTransformation(horizontal) + else: + self.revector_trafo = SCCRevectorVectorKernelsTransformation(horizontal) + + def transform_subroutine(self, routine, **kwargs): + self.revector_trafo.transform_subroutine(routine, **kwargs)