diff --git a/TODO b/TODO new file mode 100644 index 0000000000000000000000000000000000000000..9f0bf2ecf68f28a330012ff1c77c9931f81144ef --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +FORTRAN: +do/continue +case sensitivity + diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy new file mode 100644 index 0000000000000000000000000000000000000000..ea85b0c6b198c8bac6270098696504fc9006fd14 --- /dev/null +++ b/examples/fortran/matmul.floopy @@ -0,0 +1,26 @@ +subroutine dgemm(m,n,l,alpha,a,b,c) + implicit none + real*8 temp, a(m,l),b(l,n),c(m,n), alpha + integer m,n,k,i,j,l + + do j = 1,n + do k = 1,l + do i = 1,m + c(i,j) = c(i,j) + alpha*b(k,j)*a(i,k) + end do + end do + end do +end subroutine + +!$loopy begin transform +! dgemm = lp.split_iname(dgemm, "i", 16, +! outer_tag="g.0", inner_tag="l.1") +! dgemm = lp.split_iname(dgemm, "j", 8, +! outer_tag="g.1", inner_tag="l.0") +! dgemm = lp.split_iname(dgemm, "k", 32) +! +! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2") +! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") +! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner") +! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner") +!$loopy end transform diff --git a/loopy/array_buffer.py b/loopy/array_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c --- /dev/null +++ b/loopy/array_buffer.py @@ -0,0 +1,408 @@ +from __future__ import division, absolute_import +from six.moves import range, zip + +__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import islpy as isl +from islpy import dim_type +from loopy.symbolic import (get_dependencies, SubstitutionMapper) +from pymbolic.mapper.substitutor import make_subst_func + +from pytools import Record +from pymbolic import var + + +class AccessDescriptor(Record): + """ + .. attribute:: identifier + + An identifier under user control, used to connect this access descriptor + to the access that generated it. Any Python value. + """ + + __slots__ = [ + "identifier", + "expands_footprint", + "storage_axis_exprs", + ] + + +def to_parameters_or_project_out(param_inames, set_inames, set): + for iname in list(set.get_space().get_var_dict().keys()): + if iname in param_inames: + dt, idx = set.get_space().get_var_dict()[iname] + set = set.move_dims( + dim_type.param, set.dim(dim_type.param), + dt, idx, 1) + elif iname in set_inames: + pass + else: + dt, idx = set.get_space().get_var_dict()[iname] + set = set.project_out(dt, idx, 1) + + return set + + +# {{{ construct storage->sweep map + +def build_per_access_storage_to_domain_map(accdesc, domain, + storage_axis_names, + prime_sweep_inames): + + map_space = domain.space + stor_dim = len(storage_axis_names) + rn = map_space.dim(dim_type.out) + + map_space = map_space.add_dims(dim_type.in_, stor_dim) + for i, saxis in enumerate(storage_axis_names): + # arg names are initially primed, to be replaced with unprimed + # base-0 versions below + + map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'") + + # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn) + + set_space = map_space.move_dims( + dim_type.out, rn, + dim_type.in_, 0, stor_dim).range() + + # set_space: [domain](dup_sweep_index)[dup_sweep](rn)[stor_axes'] + + stor2sweep = None + + from loopy.symbolic import aff_from_expr + + for saxis, sa_expr in zip(storage_axis_names, accdesc.storage_axis_exprs): + cns = isl.Constraint.equality_from_aff( + aff_from_expr(set_space, + var(saxis+"'") - prime_sweep_inames(sa_expr))) + + cns_map = isl.BasicMap.from_constraint(cns) + if stor2sweep is None: + stor2sweep = cns_map + else: + stor2sweep = stor2sweep.intersect(cns_map) + + if stor2sweep is not None: + stor2sweep = stor2sweep.move_dims( + dim_type.in_, 0, + dim_type.out, rn, stor_dim) + + # stor2sweep is back in map_space + return stor2sweep + + +def move_to_par_from_out(s2smap, except_inames): + while True: + var_dict = s2smap.get_var_dict(dim_type.out) + todo_inames = set(var_dict) - except_inames + if todo_inames: + iname = todo_inames.pop() + + _, dim_idx = var_dict[iname] + s2smap = s2smap.move_dims( + dim_type.param, s2smap.dim(dim_type.param), + dim_type.out, dim_idx, 1) + else: + return s2smap + + +def build_global_storage_to_sweep_map(kernel, access_descriptors, + domain_dup_sweep, dup_sweep_index, + storage_axis_names, + sweep_inames, primed_sweep_inames, prime_sweep_inames): + # The storage map goes from storage axes to the domain. + # The first len(arg_names) storage dimensions are the rule's arguments. + + global_stor2sweep = None + + # build footprint + for accdesc in access_descriptors: + if accdesc.expands_footprint: + stor2sweep = build_per_access_storage_to_domain_map( + accdesc, domain_dup_sweep, + storage_axis_names, + prime_sweep_inames) + + if global_stor2sweep is None: + global_stor2sweep = stor2sweep + else: + global_stor2sweep = global_stor2sweep.union(stor2sweep) + + if isinstance(global_stor2sweep, isl.BasicMap): + global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep) + global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep) + + # space for global_stor2sweep: + # [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn) + + return global_stor2sweep + +# }}} + + +# {{{ compute storage bounds + +def find_var_base_indices_and_shape_from_inames( + domain, inames, cache_manager, context=None): + base_indices_and_sizes = [ + cache_manager.base_index_and_length(domain, iname, context) + for iname in inames] + return list(zip(*base_indices_and_sizes)) + + +def compute_bounds(kernel, domain, stor2sweep, + primed_sweep_inames, storage_axis_names): + + bounds_footprint_map = move_to_par_from_out( + stor2sweep, except_inames=frozenset(primed_sweep_inames)) + + # compute bounds for each storage axis + storage_domain = bounds_footprint_map.domain().coalesce() + + if not storage_domain.is_bounded(): + raise RuntimeError("sweep did not result in a bounded storage domain") + + return find_var_base_indices_and_shape_from_inames( + storage_domain, [saxis+"'" for saxis in storage_axis_names], + kernel.cache_manager, context=kernel.assumptions) + +# }}} + + +# {{{ array-to-buffer map + +class ArrayToBufferMap(object): + def __init__(self, kernel, domain, sweep_inames, access_descriptors, + storage_axis_count): + self.kernel = kernel + self.sweep_inames = sweep_inames + + storage_axis_names = self.storage_axis_names = [ + "_loopy_storage_%d" % i for i in range(storage_axis_count)] + + # {{{ duplicate sweep inames + + # The duplication is necessary, otherwise the storage fetch + # inames remain weirdly tied to the original sweep inames. + + self.primed_sweep_inames = [psin+"'" for psin in sweep_inames] + + from loopy.isl_helpers import duplicate_axes + dup_sweep_index = domain.space.dim(dim_type.out) + domain_dup_sweep = duplicate_axes( + domain, sweep_inames, + self.primed_sweep_inames) + + self.prime_sweep_inames = SubstitutionMapper(make_subst_func( + dict((sin, var(psin)) + for sin, psin in zip(sweep_inames, self.primed_sweep_inames)))) + + # # }}} + + self.stor2sweep = build_global_storage_to_sweep_map( + kernel, access_descriptors, + domain_dup_sweep, dup_sweep_index, + storage_axis_names, + sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames) + + storage_base_indices, storage_shape = compute_bounds( + kernel, domain, self.stor2sweep, self.primed_sweep_inames, + storage_axis_names) + + # compute augmented domain + + # {{{ filter out unit-length dimensions + + non1_storage_axis_flags = [] + non1_storage_shape = [] + + for saxis, bi, l in zip( + storage_axis_names, storage_base_indices, storage_shape): + has_length_non1 = l != 1 + + non1_storage_axis_flags.append(has_length_non1) + + if has_length_non1: + non1_storage_shape.append(l) + + # }}} + + # {{{ subtract off the base indices + # add the new, base-0 indices as new in dimensions + + sp = self.stor2sweep.get_space() + stor_idx = sp.dim(dim_type.out) + + n_stor = storage_axis_count + nn1_stor = len(non1_storage_shape) + + aug_domain = self.stor2sweep.move_dims( + dim_type.out, stor_idx, + dim_type.in_, 0, + n_stor).range() + + # aug_domain space now: + # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'] + + aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor) + + inew = 0 + for i, name in enumerate(storage_axis_names): + if non1_storage_axis_flags[i]: + aug_domain = aug_domain.set_dim_name( + dim_type.set, stor_idx + inew, name) + inew += 1 + + # aug_domain space now: + # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes] + + from loopy.symbolic import aff_from_expr + for saxis, bi, s in zip(storage_axis_names, storage_base_indices, + storage_shape): + if s != 1: + cns = isl.Constraint.equality_from_aff( + aff_from_expr(aug_domain.get_space(), + var(saxis) - (var(saxis+"'") - bi))) + + aug_domain = aug_domain.add_constraint(cns) + + # }}} + + # eliminate (primed) storage axes with non-zero base indices + aug_domain = aug_domain.project_out(dim_type.set, stor_idx+nn1_stor, n_stor) + + # eliminate duplicated sweep_inames + nsweep = len(sweep_inames) + aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index, nsweep) + + self.non1_storage_axis_flags = non1_storage_axis_flags + self.aug_domain = aug_domain + self.storage_base_indices = storage_base_indices + self.non1_storage_shape = non1_storage_shape + + def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, + boxify_sweep=False): + + renamed_aug_domain = self.aug_domain + first_storage_index = ( + renamed_aug_domain.dim(dim_type.set) + - len(self.non1_storage_shape)) + + inon1 = 0 + for i, old_name in enumerate(self.storage_axis_names): + if not self.non1_storage_axis_flags[i]: + continue + + new_name = new_non1_storage_axis_names[inon1] + + assert ( + renamed_aug_domain.get_dim_name( + dim_type.set, first_storage_index+inon1) + == old_name) + renamed_aug_domain = renamed_aug_domain.set_dim_name( + dim_type.set, first_storage_index+inon1, new_name) + + inon1 += 1 + + domain, renamed_aug_domain = isl.align_two(domain, renamed_aug_domain) + domain = domain & renamed_aug_domain + + from loopy.isl_helpers import convexify, boxify + if boxify_sweep: + return boxify(self.kernel.cache_manager, domain, + new_non1_storage_axis_names, self.kernel.assumptions) + else: + return convexify(domain) + + def is_access_descriptor_in_footprint(self, accdesc): + if accdesc.expands_footprint: + return True + + # Make all inames except the sweep parameters. (The footprint may depend on + # those.) (I.e. only leave sweep inames as out parameters.) + + global_s2s_par_dom = move_to_par_from_out( + self.stor2sweep, + except_inames=frozenset(self.primed_sweep_inames)).domain() + + arg_inames = ( + set(global_s2s_par_dom.get_var_names(dim_type.param)) + & self.kernel.all_inames()) + + for arg in accdesc.args: + arg_inames.update(get_dependencies(arg)) + arg_inames = frozenset(arg_inames) + + from loopy.kernel import CannotBranchDomainTree + try: + usage_domain = self.kernel.get_inames_domain(arg_inames) + except CannotBranchDomainTree: + return False + + for i in range(usage_domain.dim(dim_type.set)): + iname = usage_domain.get_dim_name(dim_type.set, i) + if iname in self.sweep_inames: + usage_domain = usage_domain.set_dim_name( + dim_type.set, i, iname+"'") + + stor2sweep = build_per_access_storage_to_domain_map(accdesc, + usage_domain, self.storage_axis_names, + self.prime_sweep_inames) + + if isinstance(stor2sweep, isl.BasicMap): + stor2sweep = isl.Map.from_basic_map(stor2sweep) + + stor2sweep = stor2sweep.intersect_range(usage_domain) + + stor2sweep = move_to_par_from_out(stor2sweep, + except_inames=frozenset(self.primed_sweep_inames)) + + s2s_domain = stor2sweep.domain() + s2s_domain, aligned_g_s2s_parm_dom = isl.align_two( + s2s_domain, global_s2s_par_dom) + + arg_restrictions = ( + aligned_g_s2s_parm_dom + .eliminate(dim_type.set, 0, + aligned_g_s2s_parm_dom.dim(dim_type.set)) + .remove_divs()) + + return (arg_restrictions & s2s_domain).is_subset( + aligned_g_s2s_parm_dom) + + +class NoOpArrayToBufferMap(object): + non1_storage_axis_names = () + storage_base_indices = () + non1_storage_shape = () + + def is_access_descriptor_in_footprint(self, accdesc): + # no index dependencies--every reference to the subst rule + # is necessarily in the footprint. + + return True + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 44426c3519e03c9f6cbfe5990324d824c43132b7..3927293ae82dc521327a8c17afae5e93be1e3aa9 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -195,11 +195,11 @@ def build_loop_nest(kernel, sched_index, codegen_state): sched_index_info_entries = [ ScheduleIndexInfo( schedule_index=i, - admissible_cond_inames= - get_admissible_conditional_inames_for(kernel, i), + admissible_cond_inames=( + get_admissible_conditional_inames_for(kernel, i)), required_predicates=get_required_predicates(kernel, i) ) - for i in my_sched_indices] + for i in my_sched_indices] # }}} @@ -279,11 +279,11 @@ def build_loop_nest(kernel, sched_index, codegen_state): current_iname_set = ( current_iname_set & sched_index_info_entries[candidate_group_length-1] - .admissible_cond_inames) + .admissible_cond_inames) current_pred_set = ( current_pred_set & sched_index_info_entries[candidate_group_length-1] - .required_predicates) + .required_predicates) # {{{ see which inames are actually used in group @@ -376,6 +376,4 @@ def build_loop_nest(kernel, sched_index, codegen_state): build_insn_group(sched_index_info_entries, codegen_state)) - - # vim: foldmethod=marker diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index ca1cce69c4cf1f87233c35e3dfc7987940f132fa..1bd977f3ea2e8e15e63424e89b1e347063da70e2 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -36,7 +36,6 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st bounds_checks = get_bounds_checks( domain, check_inames, codegen_state.implemented_domain, overapproximate=False) - bounds_check_set = isl.Set.universe(domain.get_space()) \ .add_constraints(bounds_checks) bounds_check_set, new_implemented_domain = isl.align_two( @@ -70,6 +69,7 @@ def generate_instruction_code(kernel, insn, codegen_state): raise RuntimeError("unexpected instruction type") insn_inames = kernel.insn_inames(insn) + insn_code, impl_domain = wrap_in_conditionals( codegen_state, kernel.get_inames_domain(insn_inames), insn_inames, diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 783dc32313883f092f15baacccf3e68d75af3b93..a6b5b422bcaa413033ecce4808be581139394876 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -221,6 +221,7 @@ class F2LoopyTranslator(FTreeWalkerBase): self.isl_context = isl.Context() self.insn_id_counter = 0 + self.condition_id_counter = 0 self.kernels = [] @@ -229,9 +230,34 @@ class F2LoopyTranslator(FTreeWalkerBase): self.in_transform_code = False self.instruction_tags = [] + self.conditions = [] self.transform_code_lines = [] + def add_expression_instruction(self, lhs, rhs): + scope = self.scope_stack[-1] + + new_id = "insn%d" % self.insn_id_counter + self.insn_id_counter += 1 + + if scope.previous_instruction_id: + insn_deps = frozenset([scope.previous_instruction_id]) + else: + insn_deps = frozenset() + + from loopy.kernel.data import ExpressionInstruction + insn = ExpressionInstruction( + lhs, rhs, + forced_iname_deps=frozenset( + scope.active_loopy_inames), + insn_deps=insn_deps, + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + scope.previous_instruction_id = new_id + scope.instructions.append(insn) + # {{{ map_XXX functions def map_BeginSource(self, node): @@ -385,28 +411,9 @@ class F2LoopyTranslator(FTreeWalkerBase): scope.use_name(lhs_name) - from loopy.kernel.data import ExpressionInstruction - rhs = scope.process_expression_for_loopy(self.parse_expr(node.expr)) - new_id = "insn%d" % self.insn_id_counter - self.insn_id_counter += 1 - - if scope.previous_instruction_id: - insn_deps = frozenset([scope.previous_instruction_id]) - else: - insn_deps = frozenset() - - insn = ExpressionInstruction( - lhs, rhs, - forced_iname_deps=frozenset( - scope.active_loopy_inames), - insn_deps=insn_deps, - id=new_id, - tags=tuple(self.instruction_tags)) - - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_expression_instruction(lhs, rhs) def map_Allocate(self, node): raise NotImplementedError("allocate") @@ -448,10 +455,31 @@ class F2LoopyTranslator(FTreeWalkerBase): # node.content[0] def map_IfThen(self, node): - raise NotImplementedError("if-then") + scope = self.scope_stack[-1] + + cond_name = "loopy_cond%d" % self.condition_id_counter + self.condition_id_counter += 1 + assert cond_name not in scope.type_map + + scope.type_map[cond_name] = np.int32 + + from pymbolic import var + cond_var = var(cond_name) + + self.add_expression_instruction( + cond_var, self.parse_expr(node.expr)) + + self.conditions.append(cond_name) + + for c in node.content: + self.rec(c) + + def map_Else(self, node): + cond_name = self.conditions.pop() + self.conditions.append("!" + cond_name) def map_EndIfThen(self, node): - return [] + self.conditions.pop() def map_Do(self, node): scope = self.scope_stack[-1] @@ -460,7 +488,8 @@ class F2LoopyTranslator(FTreeWalkerBase): loop_var, loop_bounds = node.loopcontrol.split("=") loop_var = loop_var.strip() scope.use_name(loop_var) - loop_bounds = [self.parse_expr(s) for s in loop_bounds.split(",")] + loop_bounds = self.parse_expr( + loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS) if len(loop_bounds) == 2: start, stop = loop_bounds @@ -560,7 +589,8 @@ class F2LoopyTranslator(FTreeWalkerBase): begin_tag_match = self.begin_tag_re.match(stripped_comment_line) end_tag_match = self.end_tag_re.match(stripped_comment_line) - faulty_loopy_pragma_match = self.faulty_loopy_pragma.match(stripped_comment_line) + faulty_loopy_pragma_match = self.faulty_loopy_pragma.match( + stripped_comment_line) if stripped_comment_line == "$loopy begin transform": if self.in_transform_code: @@ -653,6 +683,9 @@ class F2LoopyTranslator(FTreeWalkerBase): default_order="F" ) + from loopy.loop import fuse_loop_domains + knl = fuse_loop_domains(knl) + proc_dict[sub.subprogram_name] = lp.fold_constants(knl) transform_code = remove_common_indentation( diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index fe25435b79d73d89f84f3a7dc1e5e9a474ef8b6a..4291d98749cb84bf743c3e8c745052190000cd03 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -81,8 +81,8 @@ class FTreeWalkerBase(object): # {{{ expressions - def parse_expr(self, expr_str): - return self.expr_parser(expr_str) + def parse_expr(self, expr_str, **kwargs): + return self.expr_parser(expr_str, **kwargs) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d3089fc6a6baad741c1cfc2d1843aa541192cfb3..68f7f131c4c40d2f647fb6c6095959b4884a3c41 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -392,10 +392,9 @@ class LoopKernel(RecordWithoutPickling): iname_set_stack = [] result = [] - writer_map = self.writer_map() + from loopy.kernel.tools import is_domain_dependent_on_inames - for dom in self.domains: - parameters = set(dom.get_var_names(dim_type.param)) + for dom_idx, dom in enumerate(self.domains): inames = set(dom.get_var_names(dim_type.set)) # This next domain may be nested inside the previous domain. @@ -405,37 +404,11 @@ class LoopKernel(RecordWithoutPickling): discard_level_count = 0 while discard_level_count < len(iname_set_stack): - # {{{ check for parenthood by loop bound iname - last_inames = iname_set_stack[-1-discard_level_count] - if last_inames & parameters: - break - - # }}} - - # {{{ check for parenthood by written variable - is_parent_by_variable = False - for par in parameters: - if par in self.temporary_variables: - writer_insns = writer_map[par] - - if len(writer_insns) > 1: - raise RuntimeError("loop bound '%s' " - "may only be written to once" % par) - - writer_insn, = writer_insns - writer_inames = self.insn_inames(writer_insn) - - if writer_inames & last_inames: - is_parent_by_variable = True - break - - if is_parent_by_variable: + if is_domain_dependent_on_inames(self, dom_idx, last_inames): break - # }}} - discard_level_count += 1 if discard_level_count: diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ba405c15af18321cd8cae8fbd41816c6b9b14927..871f5359b925703a8e71729af7585e51d051f7ca 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -431,8 +431,10 @@ class InstructionBase(Record): .. attribute:: predicates - a :class:`frozenset` of variable names whose truth values (as defined - by C) determine whether this instruction should be run + a :class:`frozenset` of variable names the conjunction (logical and) of + whose truth values (as defined by C) determine whether this instruction + should be run. Each variable name may, optionally, be preceded by + an exclamation point, indicating negation. .. attribute:: forced_iname_deps_is_final @@ -678,7 +680,10 @@ class ExpressionInstruction(InstructionBase): for _, subscript in self.assignees_and_indices(): result = result | get_dependencies(subscript) - result = result | self.predicates + processed_predicates = frozenset( + pred.lstrip("!") for pred in self.predicates) + + result = result | processed_predicates return result diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index bd1fcb449320b5bab98b5d69cd0239621a61076a..2973cd9e69e46c875e15e6c8674150373be8be1e 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -539,4 +539,41 @@ class DomainParameterFinder(object): # }}} + +# {{{ is domain dependent on inames + +def is_domain_dependent_on_inames(kernel, domain_index, inames): + dom = kernel.domains[domain_index] + dom_parameters = set(dom.get_var_names(dim_type.param)) + + # {{{ check for parenthood by loop bound iname + + if inames & dom_parameters: + return True + + # }}} + + # {{{ check for parenthood by written variable + + for par in dom_parameters: + if par in kernel.temporary_variables: + writer_insns = kernel.writer_map()[par] + + if len(writer_insns) > 1: + raise RuntimeError("loop bound '%s' " + "may only be written to once" % par) + + writer_insn, = writer_insns + writer_inames = kernel.insn_inames(writer_insn) + + if writer_inames & inames: + return True + + # }}} + + return False + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/loop.py b/loopy/loop.py new file mode 100644 index 0000000000000000000000000000000000000000..4592463822a2321745aaf48a316d16c98d4efca3 --- /dev/null +++ b/loopy/loop.py @@ -0,0 +1,122 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import islpy as isl +import six + + +def potential_loop_nest_map(kernel): + """Returns a dictionary mapping inames to other inames that *could* + be nested around them. + + :seealso: :func:`loopy.schedule.loop_nest_map` + """ + + result = {} + + all_inames = kernel.all_inames() + iname_to_insns = kernel.iname_to_insns() + + # examine pairs of all inames--O(n**2), I know. + for inner_iname in all_inames: + inner_result = set() + for outer_iname in all_inames: + if inner_iname == outer_iname: + continue + + if iname_to_insns[inner_iname] <= iname_to_insns[outer_iname]: + inner_result.add(outer_iname) + + if inner_result: + result[inner_iname] = inner_result + + return result + + +def fuse_loop_domains(kernel): + from loopy.kernel.tools import is_domain_dependent_on_inames + + while True: + lnm = potential_loop_nest_map(kernel) + parents_per_domain = kernel.parents_per_domain() + all_parents_per_domain = kernel.all_parents_per_domain() + + new_domains = None + + for inner_iname, outer_inames in six.iteritems(lnm): + for outer_iname in outer_inames: + # {{{ check if it's safe to fuse + + inner_domain_idx = kernel.get_home_domain_index(inner_iname) + outer_domain_idx = kernel.get_home_domain_index(outer_iname) + + if inner_domain_idx == outer_domain_idx: + break + + if ( + outer_domain_idx in all_parents_per_domain[inner_domain_idx] + and not + outer_domain_idx == parents_per_domain[inner_domain_idx]): + # Outer domain is not a direct parent of the inner + # domain. Unable to fuse. + continue + + outer_dom = kernel.domains[outer_domain_idx] + inner_dom = kernel.domains[inner_domain_idx] + + outer_inames = set(outer_dom.get_var_names(isl.dim_type.set)) + if is_domain_dependent_on_inames(kernel, inner_domain_idx, + outer_inames): + # Bounds of inner domain depend on outer domain. + # Unable to fuse. + continue + + # }}} + + new_domains = kernel.domains[:] + min_idx = min(inner_domain_idx, outer_domain_idx) + max_idx = max(inner_domain_idx, outer_domain_idx) + + del new_domains[max_idx] + del new_domains[min_idx] + + outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom) + + new_domains.insert(min_idx, inner_dom & outer_dom) + break + + if new_domains: + break + + if not new_domains: + # Nothing was accomplished in the last loop trip, time to quit. + break + + kernel = kernel.copy(domains=new_domains) + + return kernel + + +# vim: fdm=marker diff --git a/loopy/precompute.py b/loopy/precompute.py index 984e24d1f89b5ea036636513acee7636bbec58dd..de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -1,8 +1,6 @@ -from __future__ import division -from __future__ import absolute_import +from __future__ import division, absolute_import import six -from six.moves import range -from six.moves import zip +from six.moves import range, zip __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -28,337 +26,35 @@ THE SOFTWARE. import islpy as isl -from islpy import dim_type from loopy.symbolic import (get_dependencies, SubstitutionMapper, ExpandingIdentityMapper) from pymbolic.mapper.substitutor import make_subst_func import numpy as np -from pytools import Record from pymbolic import var +from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap, + AccessDescriptor) -class InvocationDescriptor(Record): - __slots__ = [ - "args", - "expands_footprint", - "is_in_footprint", - # Remember where the invocation happened, in terms of the expansion - # call stack. - "expansion_stack", - ] +class RuleAccessDescriptor(AccessDescriptor): + __slots__ = ["args", "expansion_stack"] -def to_parameters_or_project_out(param_inames, set_inames, set): - for iname in list(set.get_space().get_var_dict().keys()): - if iname in param_inames: - dt, idx = set.get_space().get_var_dict()[iname] - set = set.move_dims( - dim_type.param, set.dim(dim_type.param), - dt, idx, 1) - elif iname in set_inames: - pass - else: - dt, idx = set.get_space().get_var_dict()[iname] - set = set.project_out(dt, idx, 1) - - return set - - -# {{{ construct storage->sweep map - -def build_per_access_storage_to_domain_map(invdesc, domain, - storage_axis_names, storage_axis_sources, - prime_sweep_inames): - - map_space = domain.get_space() - stor_dim = len(storage_axis_names) - rn = map_space.dim(dim_type.out) - - map_space = map_space.add_dims(dim_type.in_, stor_dim) - for i, saxis in enumerate(storage_axis_names): - # arg names are initially primed, to be replaced with unprimed - # base-0 versions below - - map_space = map_space.set_dim_name(dim_type.in_, i, saxis+"'") +def access_descriptor_id(args, expansion_stack): + return (args, expansion_stack) - # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn) - set_space = map_space.move_dims( - dim_type.out, rn, - dim_type.in_, 0, stor_dim).range() +def storage_axis_exprs(storage_axis_sources, args): + result = [] - # set_space: [domain](dup_sweep_index)[dup_sweep](rn)[stor_axes'] - - stor2sweep = None - - from loopy.symbolic import aff_from_expr - - for saxis, saxis_source in zip(storage_axis_names, storage_axis_sources): + for saxis_source in storage_axis_sources: if isinstance(saxis_source, int): - # an argument - cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, - var(saxis+"'") - - prime_sweep_inames(invdesc.args[saxis_source]))) - else: - # a 'bare' sweep iname - cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, - var(saxis+"'") - - prime_sweep_inames(var(saxis_source)))) - - cns_map = isl.BasicMap.from_constraint(cns) - if stor2sweep is None: - stor2sweep = cns_map - else: - stor2sweep = stor2sweep.intersect(cns_map) - - if stor2sweep is not None: - stor2sweep = stor2sweep.move_dims( - dim_type.in_, 0, - dim_type.out, rn, stor_dim) - - # stor2sweep is back in map_space - return stor2sweep - - -def move_to_par_from_out(s2smap, except_inames): - while True: - var_dict = s2smap.get_var_dict(dim_type.out) - todo_inames = set(var_dict) - except_inames - if todo_inames: - iname = todo_inames.pop() - - _, dim_idx = var_dict[iname] - s2smap = s2smap.move_dims( - dim_type.param, s2smap.dim(dim_type.param), - dim_type.out, dim_idx, 1) + result.append(args[saxis_source]) else: - return s2smap - - -def build_global_storage_to_sweep_map(kernel, invocation_descriptors, - domain_dup_sweep, dup_sweep_index, - storage_axis_names, storage_axis_sources, - sweep_inames, primed_sweep_inames, prime_sweep_inames): - """ - As a side effect, this fills out is_in_footprint in the - invocation descriptors. - """ - - # The storage map goes from storage axes to the domain. - # The first len(arg_names) storage dimensions are the rule's arguments. - - global_stor2sweep = None - - # build footprint - for invdesc in invocation_descriptors: - if invdesc.expands_footprint: - stor2sweep = build_per_access_storage_to_domain_map( - invdesc, domain_dup_sweep, - storage_axis_names, storage_axis_sources, - prime_sweep_inames) - - if global_stor2sweep is None: - global_stor2sweep = stor2sweep - else: - global_stor2sweep = global_stor2sweep.union(stor2sweep) - - invdesc.is_in_footprint = True - - if isinstance(global_stor2sweep, isl.BasicMap): - global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep) - global_stor2sweep = global_stor2sweep.intersect_range(domain_dup_sweep) - - # space for global_stor2sweep: - # [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep](rn) - - # {{{ check if non-footprint-building invocation descriptors fall into footprint - - # Make all inames except the sweep parameters. (The footprint may depend on - # those.) (I.e. only leave sweep inames as out parameters.) - global_s2s_par_dom = move_to_par_from_out( - global_stor2sweep, except_inames=frozenset(primed_sweep_inames)).domain() - - for invdesc in invocation_descriptors: - if not invdesc.expands_footprint: - arg_inames = ( - set(global_s2s_par_dom.get_var_names(dim_type.param)) - & kernel.all_inames()) - - for arg in invdesc.args: - arg_inames.update(get_dependencies(arg)) - arg_inames = frozenset(arg_inames) - - from loopy.kernel import CannotBranchDomainTree - try: - usage_domain = kernel.get_inames_domain(arg_inames) - except CannotBranchDomainTree: - # and that's the end of that. - invdesc.is_in_footprint = False - continue - - for i in range(usage_domain.dim(dim_type.set)): - iname = usage_domain.get_dim_name(dim_type.set, i) - if iname in sweep_inames: - usage_domain = usage_domain.set_dim_name( - dim_type.set, i, iname+"'") - - stor2sweep = build_per_access_storage_to_domain_map(invdesc, - usage_domain, storage_axis_names, storage_axis_sources, - prime_sweep_inames) - - if isinstance(stor2sweep, isl.BasicMap): - stor2sweep = isl.Map.from_basic_map(stor2sweep) - - stor2sweep = stor2sweep.intersect_range(usage_domain) - - stor2sweep = move_to_par_from_out(stor2sweep, - except_inames=frozenset(primed_sweep_inames)) - - s2s_domain = stor2sweep.domain() - s2s_domain, aligned_g_s2s_parm_dom = isl.align_two( - s2s_domain, global_s2s_par_dom) - - arg_restrictions = ( - aligned_g_s2s_parm_dom - .eliminate(dim_type.set, 0, - aligned_g_s2s_parm_dom.dim(dim_type.set)) - .remove_divs()) - - is_in_footprint = (arg_restrictions & s2s_domain).is_subset( - aligned_g_s2s_parm_dom) - - invdesc.is_in_footprint = is_in_footprint - - # }}} - - return global_stor2sweep - -# }}} - - -# {{{ compute storage bounds - -def find_var_base_indices_and_shape_from_inames( - domain, inames, cache_manager, context=None): - base_indices_and_sizes = [ - cache_manager.base_index_and_length(domain, iname, context) - for iname in inames] - return list(zip(*base_indices_and_sizes)) - - -def compute_bounds(kernel, domain, stor2sweep, - primed_sweep_inames, storage_axis_names): - - bounds_footprint_map = move_to_par_from_out( - stor2sweep, except_inames=frozenset(primed_sweep_inames)) - - # compute bounds for each storage axis - storage_domain = bounds_footprint_map.domain().coalesce() - - if not storage_domain.is_bounded(): - raise RuntimeError("sweep did not result in a bounded storage domain") - - return find_var_base_indices_and_shape_from_inames( - storage_domain, [saxis+"'" for saxis in storage_axis_names], - kernel.cache_manager, context=kernel.assumptions) - -# }}} + result.append(var(saxis_source)) - -def get_access_info(kernel, domain, - storage_axis_names, storage_axis_sources, - sweep_inames, invocation_descriptors): - - # {{{ duplicate sweep inames - - # The duplication is necessary, otherwise the storage fetch - # inames remain weirdly tied to the original sweep inames. - - primed_sweep_inames = [psin+"'" for psin in sweep_inames] - from loopy.isl_helpers import duplicate_axes - dup_sweep_index = domain.space.dim(dim_type.out) - domain_dup_sweep = duplicate_axes( - domain, sweep_inames, - primed_sweep_inames) - - prime_sweep_inames = SubstitutionMapper(make_subst_func( - dict((sin, var(psin)) - for sin, psin in zip(sweep_inames, primed_sweep_inames)))) - - # }}} - - stor2sweep = build_global_storage_to_sweep_map( - kernel, invocation_descriptors, - domain_dup_sweep, dup_sweep_index, - storage_axis_names, storage_axis_sources, - sweep_inames, primed_sweep_inames, prime_sweep_inames) - - storage_base_indices, storage_shape = compute_bounds( - kernel, domain, stor2sweep, primed_sweep_inames, - storage_axis_names) - - # compute augmented domain - - # {{{ filter out unit-length dimensions - - non1_storage_axis_names = [] - non1_storage_shape = [] - - for saxis, bi, l in zip(storage_axis_names, storage_base_indices, storage_shape): - if l != 1: - non1_storage_axis_names.append(saxis) - non1_storage_shape.append(l) - - # }}} - - # {{{ subtract off the base indices - # add the new, base-0 indices as new in dimensions - - sp = stor2sweep.get_space() - stor_idx = sp.dim(dim_type.out) - - n_stor = len(storage_axis_names) - nn1_stor = len(non1_storage_axis_names) - - aug_domain = stor2sweep.move_dims( - dim_type.out, stor_idx, - dim_type.in_, 0, - n_stor).range() - - # aug_domain space now: - # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'] - - aug_domain = aug_domain.insert_dims(dim_type.set, stor_idx, nn1_stor) - for i, name in enumerate(non1_storage_axis_names): - aug_domain = aug_domain.set_dim_name(dim_type.set, stor_idx+i, name) - - # aug_domain space now: - # [domain](dup_sweep_index)[dup_sweep](stor_idx)[stor_axes'][n1_stor_axes] - - from loopy.symbolic import aff_from_expr - for saxis, bi, s in zip(storage_axis_names, storage_base_indices, storage_shape): - if s != 1: - cns = isl.Constraint.equality_from_aff( - aff_from_expr(aug_domain.get_space(), - var(saxis) - (var(saxis+"'") - bi))) - - aug_domain = aug_domain.add_constraint(cns) - - # }}} - - # eliminate (primed) storage axes with non-zero base indices - aug_domain = aug_domain.project_out(dim_type.set, stor_idx+nn1_stor, n_stor) - - # eliminate duplicated sweep_inames - nsweep = len(sweep_inames) - aug_domain = aug_domain.project_out(dim_type.set, dup_sweep_index, nsweep) - - return (non1_storage_axis_names, aug_domain, - storage_base_indices, non1_storage_shape) + return result def simplify_via_aff(expr): @@ -369,7 +65,7 @@ def simplify_via_aff(expr): expr)) -class InvocationGatherer(ExpandingIdentityMapper): +class RuleInvocationGatherer(ExpandingIdentityMapper): def __init__(self, kernel, subst_name, subst_tag, within): ExpandingIdentityMapper.__init__(self, kernel.substitutions, kernel.get_var_name_generator()) @@ -383,7 +79,7 @@ class InvocationGatherer(ExpandingIdentityMapper): self.subst_tag = subst_tag self.within = within - self.invocation_descriptors = [] + self.access_descriptors = [] def map_substitution(self, name, tag, arguments, expn_state): process_me = name == self.subst_name @@ -407,6 +103,9 @@ class InvocationGatherer(ExpandingIdentityMapper): | get_dependencies(self.subst_expander( arg_val, insn_id=None, insn_tags=None))) + # FIXME: This is too strict--and the footprint machinery + # needs to be taught how to deal with locally constant + # variables. if not arg_deps <= self.kernel.all_inames(): from warnings import warn warn("Precompute arguments in '%s(%s)' do not consist exclusively " @@ -420,17 +119,21 @@ class InvocationGatherer(ExpandingIdentityMapper): return ExpandingIdentityMapper.map_substitution( self, name, tag, arguments, expn_state) - self.invocation_descriptors.append( - InvocationDescriptor( - args=[arg_context[arg_name] for arg_name in rule.arguments], - expansion_stack=expn_state.stack)) + args = [arg_context[arg_name] for arg_name in rule.arguments] + + # Do not set expands_footprint here, it is set below. + self.access_descriptors.append( + RuleAccessDescriptor( + identifier=access_descriptor_id(args, expn_state.stack), + args=args, + )) return 0 # exact value irrelevant -class InvocationReplacer(ExpandingIdentityMapper): +class RuleInvocationReplacer(ExpandingIdentityMapper): def __init__(self, kernel, subst_name, subst_tag, within, - invocation_descriptors, + access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, storage_base_indices, non1_storage_axis_names, target_var_name): @@ -446,7 +149,8 @@ class InvocationReplacer(ExpandingIdentityMapper): self.subst_tag = subst_tag self.within = within - self.invocation_descriptors = invocation_descriptors + self.access_descriptors = access_descriptors + self.array_base_map = array_base_map self.storage_axis_names = storage_axis_names self.storage_axis_sources = storage_axis_sources @@ -474,21 +178,21 @@ class InvocationReplacer(ExpandingIdentityMapper): return ExpandingIdentityMapper.map_substitution( self, name, tag, arguments, expn_state) - matching_invdesc = None - for invdesc in self.invocation_descriptors: - if invdesc.args == args and expn_state.stack: + matching_accdesc = None + for accdesc in self.access_descriptors: + if accdesc.identifier == access_descriptor_id(args, expn_state.stack): # Could be more than one, that's fine. - matching_invdesc = invdesc + matching_accdesc = accdesc break - assert matching_invdesc is not None + assert matching_accdesc is not None - invdesc = matching_invdesc - del matching_invdesc + accdesc = matching_accdesc + del matching_accdesc # }}} - if not invdesc.is_in_footprint: + if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): return ExpandingIdentityMapper.map_substitution( self, name, tag, arguments, expn_state) @@ -526,7 +230,8 @@ class InvocationReplacer(ExpandingIdentityMapper): def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={}, - default_tag="l.auto", dtype=None, fetch_bounding_box=False): + default_tag="l.auto", dtype=None, fetch_bounding_box=False, + temporary_is_local=None): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -642,11 +347,17 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, from loopy.context_matching import parse_stack_match within = parse_stack_match(within) + from loopy.kernel.data import parse_tag + default_tag = parse_tag(default_tag) + + subst = kernel.substitutions[subst_name] + c_subst_name = subst_name.replace(".", "_") + # }}} - # {{{ process invocations in footprint generators, start invocation_descriptors + # {{{ process invocations in footprint generators, start access_descriptors - invocation_descriptors = [] + access_descriptors = [] if footprint_generators: from pymbolic.primitives import Variable, Call @@ -660,35 +371,29 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, raise ValueError("footprint generator must " "be substitution rule invocation") - invocation_descriptors.append( - InvocationDescriptor(args=args, + access_descriptors.append( + RuleAccessDescriptor( + identifier=access_descriptor_id(args, None), expands_footprint=True, - expansion_stack=None)) + args=args + )) # }}} - c_subst_name = subst_name.replace(".", "_") + # {{{ gather up invocations in kernel code, finish access_descriptors - from loopy.kernel.data import parse_tag - default_tag = parse_tag(default_tag) - - subst = kernel.substitutions[subst_name] - arg_names = subst.arguments - - # {{{ gather up invocations in kernel code, finish invocation_descriptors - - invg = InvocationGatherer(kernel, subst_name, subst_tag, within) + invg = RuleInvocationGatherer(kernel, subst_name, subst_tag, within) import loopy as lp for insn in kernel.instructions: if isinstance(insn, lp.ExpressionInstruction): invg(insn.expression, insn.id, insn.tags) - for invdesc in invg.invocation_descriptors: - invocation_descriptors.append( - invdesc.copy(expands_footprint=footprint_generators is None)) + for accdesc in invg.access_descriptors: + access_descriptors.append( + accdesc.copy(expands_footprint=footprint_generators is None)) - if not invocation_descriptors: + if not access_descriptors: raise RuntimeError("no invocations of '%s' found" % subst_name) # }}} @@ -700,9 +405,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, expanding_usage_arg_deps = set() - for invdesc in invocation_descriptors: - if invdesc.expands_footprint: - for arg in invdesc.args: + for accdesc in access_descriptors: + if accdesc.expands_footprint: + for arg in accdesc.args: expanding_usage_arg_deps.update( get_dependencies(arg) & kernel.all_inames()) @@ -731,7 +436,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if storage_axes is None: storage_axes = ( list(extra_storage_axes) - + list(range(len(arg_names)))) + + list(range(len(subst.arguments)))) expr_subst_dict = {} @@ -780,6 +485,16 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} + # {{{ fill out access_descriptors[...].storage_axis_exprs + + access_descriptors = [ + accdesc.copy( + storage_axis_exprs=storage_axis_exprs( + storage_axis_sources, accdesc.args)) + for accdesc in access_descriptors] + + # }}} + expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps) assert expanding_inames <= kernel.all_inames() @@ -801,37 +516,26 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - (non1_storage_axis_names, new_domain, - storage_base_indices, non1_storage_shape) = \ - get_access_info(kernel, domch.domain, - storage_axis_names, storage_axis_sources, - sweep_inames, invocation_descriptors) - - from loopy.isl_helpers import convexify, boxify - if fetch_bounding_box: - new_domain = boxify(kernel.cache_manager, new_domain, - non1_storage_axis_names, kernel.assumptions) - else: - new_domain = convexify(new_domain) + abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames, + access_descriptors, len(storage_axis_names)) - for saxis in storage_axis_names: - if saxis not in non1_storage_axis_names: + non1_storage_axis_names = [] + for i, saxis in enumerate(storage_axis_names): + if abm.non1_storage_axis_flags[i]: + non1_storage_axis_names.append(saxis) + else: del new_iname_to_tag[saxis] - new_kernel_domains = domch.get_domains_with(new_domain) + new_kernel_domains = domch.get_domains_with( + abm.augment_domain_with_sweep( + domch.domain, non1_storage_axis_names, + boxify_sweep=fetch_bounding_box)) + else: # leave kernel domains unchanged new_kernel_domains = kernel.domains - non1_storage_axis_names = () - storage_base_indices = () - non1_storage_shape = () - - # no index dependencies--every reference to the subst rule - # is necessarily in the footprint. - - for invdesc in invocation_descriptors: - invdesc.is_in_footprint = True + abm = NoOpArrayToBufferMap() # {{{ set up compute insn @@ -852,7 +556,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_expr = (SubstitutionMapper( make_subst_func(dict( (arg_name, zero_length_1_arg(arg_name)+bi) - for arg_name, bi in zip(storage_axis_names, storage_base_indices) + for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices) ))) (compute_expr)) @@ -866,10 +570,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ substitute rule into expressions in kernel (if within footprint) - invr = InvocationReplacer(kernel, subst_name, subst_tag, within, - invocation_descriptors, + invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within, + access_descriptors, abm, storage_axis_names, storage_axis_sources, - storage_base_indices, non1_storage_axis_names, + abm.storage_base_indices, non1_storage_axis_names, target_var_name) kernel = invr.map_kernel(kernel) @@ -886,13 +590,16 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, import loopy as lp + if temporary_is_local is None: + temporary_is_local = lp.auto + new_temporary_variables = kernel.temporary_variables.copy() temp_var = lp.TemporaryVariable( name=target_var_name, dtype=dtype, - base_indices=(0,)*len(non1_storage_shape), - shape=tuple(non1_storage_shape), - is_local=lp.auto) + base_indices=(0,)*len(abm.non1_storage_shape), + shape=tuple(abm.non1_storage_shape), + is_local=temporary_is_local) new_temporary_variables[target_var_name] = temp_var diff --git a/loopy/schedule.py b/loopy/schedule.py index 6bba764842cc19a55f8e1bd02cccdfb71bacccc8..6ef5378d6456ca022f9dd27ee87e593810981e98 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -638,7 +638,7 @@ def generate_loop_schedules_internal(sched_state, loop_priority, schedule=[], if debug_mode: print(75*"=") - six.input("Hit Enter for next schedule:") + six.moves.input("Hit Enter for next schedule:") if not active_inames and not unscheduled_insn_ids: # if done, yield result @@ -1025,8 +1025,6 @@ def generate_loop_schedules(kernel, debug_args={}): from loopy.check import pre_schedule_checks pre_schedule_checks(kernel) - logger.info("%s: schedule start" % kernel.name) - schedule_count = 0 debug = ScheduleDebugger(**debug_args) @@ -1094,7 +1092,7 @@ def generate_loop_schedules(kernel, debug_args={}): print(" debug_args=dict(interactive=False)") print("to generate_loop_schedules().") print(75*"-") - six.input("Enter:") + six.moves.input("Enter:") print() print() @@ -1134,6 +1132,11 @@ def get_one_scheduled_kernel(kernel): kernel_count = 0 + from time import time + start_time = time() + + logger.info("%s: schedule start" % kernel.name) + for scheduled_kernel in generate_loop_schedules(kernel): kernel_count += 1 @@ -1145,6 +1148,9 @@ def get_one_scheduled_kernel(kernel): ambiguous = True break + logger.info("%s: scheduling done after %.2f s" % ( + kernel.name, time()-start_time)) + if ambiguous: from warnings import warn from loopy.diagnostic import LoopyWarning diff --git a/loopy/subst.py b/loopy/subst.py index 90c0a79162c77ac1cbd97ac8ffaf12fd8c8c36d0..d4dba4b5dfd9918926adaad2af9179813914e8be 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -43,10 +43,12 @@ class ExprDescriptor(Record): __slots__ = ["insn", "expr", "unif_var_dict"] -def extract_subst(kernel, subst_name, template, parameters): +def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. :arg template: Unification template expression. + :arg parameters: An iterable of parameters used in + *template*, or a comma-separated string of the same. All targeted subexpressions must match ('unify with') *template* The template may contain '*' wildcards that will have to match exactly across all @@ -57,6 +59,10 @@ def extract_subst(kernel, subst_name, template, parameters): from pymbolic import parse template = parse(template) + if isinstance(parameters, str): + parameters = tuple( + s.strip() for s in parameters.split(",")) + var_name_gen = kernel.get_var_name_generator() # {{{ replace any wildcards in template with new variables @@ -268,7 +274,6 @@ def temporary_to_subst(kernel, temp_name, within=None): """Extract an assignment to a temporary variable as a :ref:`substituion-rule`. The temporary may - :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. @@ -319,7 +324,6 @@ def temporary_to_subst(kernel, temp_name, within=None): return def_id usage_to_definition = {} - definition_insn_ids = set() for insn in kernel.instructions: if temp_name not in insn.read_dependency_names(): @@ -332,7 +336,11 @@ def temporary_to_subst(kernel, temp_name, within=None): % (temp_name, insn.id)) usage_to_definition[insn.id] = def_id - definition_insn_ids.add(def_id) + + definition_insn_ids = set() + for insn in kernel.instructions: + if temp_name in insn.write_dependency_names(): + definition_insn_ids.add(insn.id) # }}} @@ -350,7 +358,7 @@ def temporary_to_subst(kernel, temp_name, within=None): new_substs = kernel.substitutions.copy() for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name): - def_insn = id_to_insn[def_id] + def_insn = kernel.id_to_insn[def_id] (_, indices), = def_insn.assignees_and_indices() diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index f1d3bfdde83735a34b9f86f14c561e9aa1bb2124..a1b323d7cf0deaef308952d87b1523b0a020ad68 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -34,6 +34,9 @@ from loopy.target.opencl import OpenCLTarget import pyopencl as cl import pyopencl.characterize as cl_char +# This ensures the dtype registry is populated. +import pyopencl.tools # noqa + import logging logger = logging.getLogger(__name__) @@ -228,7 +231,7 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions): # }}} -# {{{ +# {{{ pyopencl tools class PyOpenCLTarget(OpenCLTarget): def __init__(self, device=None): @@ -236,9 +239,6 @@ class PyOpenCLTarget(OpenCLTarget): self.device = device - # This ensures the dtype registry is populated. - import pyopencl.tools # noqa - hash_fields = ["device"] comparison_fields = ["device"] diff --git a/test/test_dg.py b/test/test_dg.py index 291da44846f551e4576ec81326836191942dbfd8..581562da89210ea476700191c6d21ad2dbe7fd3d 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -25,6 +25,7 @@ THE SOFTWARE. import numpy as np import pyopencl as cl +import pyopencl.array # noqa import loopy as lp import logging # noqa diff --git a/test/test_fortran.py b/test/test_fortran.py index bf6ffe140289c13e5c04df02b0c226f44832d41b..33826e4063190f7d7d48b1bfcb4122931868bd1e 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -201,12 +201,122 @@ def test_temporary_to_subst_indices(ctx_factory): ref_knl = knl + assert "a" in knl.temporary_variables knl = lp.temporary_to_subst(knl, "a") + assert "a" not in knl.temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) +def test_if(ctx_factory): + fortran_src = """ + subroutine fill(out, out2, inp, n) + implicit none + + real*8 a, b, out(n), out2(n), inp(n) + integer n + + do i = 1, n + a = inp(i) + if (a.ge.3) then + b = 2*a + do j = 1,3 + b = 3 * b + end do + out(i) = 5*b + else + out(i) = 4*a + endif + end do + end + """ + + from loopy.frontend.fortran import f2loopy + knl, = f2loopy(fortran_src) + + ref_knl = knl + + knl = lp.temporary_to_subst(knl, "a") + + ctx = ctx_factory() + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + + +def test_tagged(ctx_factory): + fortran_src = """ + subroutine rot_norm(out, alpha, out2, inp, inp2, n) + implicit none + real*8 a, b, r, out(n), out2(n), inp(n), inp2(n) + real*8 alpha + integer n + + do i = 1, n + !$loopy begin tagged: input + a = cos(alpha)*inp(i) + sin(alpha)*inp2(i) + b = -sin(alpha)*inp(i) + cos(alpha)*inp2(i) + !$loopy end tagged: input + + r = sqrt(a**2 + b**2) + a = a/r + b = b/r + + out(i) = a + out2(i) = b + end do + end + """ + + from loopy.frontend.fortran import f2loopy + knl, = f2loopy(fortran_src) + + assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2 + + +def test_matmul(ctx_factory): + fortran_src = """ + subroutine dgemm(m,n,l,a,b,c) + implicit none + real*8 temp, a(m,l),b(l,n),c(m,n) + integer m,n,k,i,j,l + + do j = 1,n + do i = 1,m + temp = 0 + do k = 1,l + temp = temp + b(k,j)*a(i,k) + end do + c(i,j) = temp + end do + end do + end subroutine + + !$loopy begin transform + !$loopy end transform + """ + + from loopy.frontend.fortran import f2loopy + knl, = f2loopy(fortran_src) + + assert len(knl.domains) == 1 + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, + outer_tag="g.0", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 8, + outer_tag="g.1", inner_tag="l.0") + knl = lp.split_iname(knl, "k", 32) + + knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") + knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") + + ctx = ctx_factory() + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_linalg.py b/test/test_linalg.py index b1ebacb1e477e2cb2529fed263b94faa787e334d..d1a995b19fe22c818f03cadc7131e7732d3e1d58 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -183,32 +183,28 @@ def test_plain_matrix_mul(ctx_factory): def test_variable_size_matrix_mul(ctx_factory): - dtype = np.float32 ctx = ctx_factory() - order = "C" n = get_suitable_size(ctx) knl = lp.make_kernel( - "[n] -> {[i,j,k]: 0<=i,j,k<n}", - [ - "c[i, j] = sum(k, a[i, k]*b[k, j]) {id=labl}" - ], - [ - lp.GlobalArg("a", dtype, shape="n, n", order=order), - lp.GlobalArg("b", dtype, shape="n, n", order=order), - lp.GlobalArg("c", dtype, shape="n, n", order=order), - lp.ValueArg("n", np.int32, approximately=n), - ], - name="matmul", assumptions="n >= 16") + "{[i,j,k]: 0<=i,j,k<n}", + "c[i, j] = sum(k, a[i, k]*b[k, j])") + + knl = lp.add_dtypes(knl, { + "a": np.float32, + "b": np.float32, + }) ref_knl = knl knl = lp.split_iname(knl, "i", 16, - outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 8, - outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 32) + outer_tag="g.0", inner_tag="l.1", + slabs=(0, 1)) + knl = lp.split_iname(knl, "j", 16, + outer_tag="g.1", inner_tag="l.0", + slabs=(0, 1)) + knl = lp.split_iname(knl, "k", 8, slabs=(0, 1)) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) @@ -226,7 +222,7 @@ def test_funny_shape_matrix_mul(ctx_factory): l = m+12 knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ "c[i, j] = sum(k, a[i, k]*b[k, j])" ], @@ -245,8 +241,12 @@ def test_funny_shape_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) - knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) + #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) + #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) + knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") + knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") + knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") + knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[2*n**3/1e9], op_label=["GFlops"], diff --git a/test/test_loopy.py b/test/test_loopy.py index 17e28af6a05ef94abb5ba061271d9f7f5d4ec617..6a59eb7388afedcc65bb6aa3d42f57b2e7152ac3 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -63,14 +63,9 @@ def test_complicated_subst(ctx_factory): h(x) := 1 + g(x) + 20*g$two(x) a[i] = h$one(i) * h$two(i) - """, - [ - lp.GlobalArg("a", np.float32, shape=("n",)), - lp.ValueArg("n", np.int32), - ]) + """) - from loopy.subst import expand_subst - knl = expand_subst(knl, "g$two < h$two") + knl = lp.expand_subst(knl, "g$two < h$two") print(knl) @@ -84,6 +79,23 @@ def test_complicated_subst(ctx_factory): assert substs_with_letter == how_many +def test_extract_subst(ctx_factory): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + a[i] = 23*b[i]**2 + 25*b[i]**2 + """) + + knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha") + + print(knl) + + from loopy.symbolic import parse + + insn, = knl.instructions + assert insn.expression == parse("bsquare(23) + bsquare(25)") + + def test_type_inference_no_artificial_doubles(ctx_factory): ctx = ctx_factory() @@ -1694,13 +1706,11 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): assert (a_ref == a_knl).get().all() -def test_multiple_writes_to_local_temporary(ctx_factory): +def test_multiple_writes_to_local_temporary(): # Loopy would previously only handle barrier insertion correctly if exactly # one instruction wrote to each local temporary. This tests that multiple # writes are OK. - ctx = ctx_factory() - knl = lp.make_kernel( "{[i,e]: 0<=i<5 and 0<=e<nelements}", """ @@ -1709,13 +1719,13 @@ def test_multiple_writes_to_local_temporary(ctx_factory): """) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) + knl = lp.preprocess_kernel(knl) for k in lp.generate_loop_schedules(knl): code, _ = lp.generate_code(k) print(code) -def test_fd_demo(ctx_factory): +def test_fd_demo(): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \ @@ -1743,6 +1753,26 @@ def test_fd_demo(ctx_factory): assert "double" not in code +def test_fd_1d(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "result[i] = u[i+1]-u[i]") + + knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") + knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") + knl = lp.assume(knl, "n mod 16 = 0") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=2048)) + + def test_make_copy_kernel(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx)