diff --git a/MEMO b/MEMO index d650e7ea46aea8b01def996336810a2fe97a61fa..f4e5c34e48e62d5c951d01fcb212a9117e361def 100644 --- a/MEMO +++ b/MEMO @@ -142,7 +142,7 @@ Dealt with - How can one automatically generate something like microblocks? -> Some sort of axis-adding transform? -- ExpandingIdentityMapper +- RuleAwareIdentityMapper extract_subst -> needs WalkMapper [actually fine as is] padding [DONE] replace make_unique_var_name [DONE] diff --git a/doc/reference.rst b/doc/reference.rst index 595fd4b5fa5bac3ccd69cfb1b7a05ebb1c7b75f9..9d75a924b71847d04632b4945f9228c80c2e01f4 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -402,6 +402,8 @@ Caching, Precomputation and Prefetching .. autofunction:: add_prefetch +.. autofunction:: buffer_array + Influencing data access ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index 3027e10d38a98b4c86737502b7fd20f41c3cc3c3..450aa4ac45272a580afd7dc5fb933355b5a8dcfd 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -30,7 +30,8 @@ THE SOFTWARE. 
import islpy as isl from islpy import dim_type -from loopy.symbolic import ExpandingIdentityMapper, ExpandingSubstitutionMapper +from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError @@ -55,6 +56,7 @@ from loopy.kernel.creation import make_kernel, UniqueName from loopy.library.reduction import register_reduction_parser from loopy.subst import extract_subst, expand_subst, temporary_to_subst from loopy.precompute import precompute +from loopy.buffer import buffer_array from loopy.padding import (split_arg_axis, find_padding_multiple, add_padding) from loopy.preprocess import (preprocess_kernel, realize_reduction, @@ -82,7 +84,7 @@ __all__ = [ "register_reduction_parser", "extract_subst", "expand_subst", "temporary_to_subst", - "precompute", + "precompute", "buffer_array", "split_arg_axis", "find_padding_multiple", "add_padding", "get_dot_dependency_graph", @@ -125,11 +127,10 @@ __all__ = [ # {{{ split inames -class _InameSplitter(ExpandingIdentityMapper): - def __init__(self, kernel, within, +class _InameSplitter(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, within, split_iname, outer_iname, inner_iname, replacement_index): - ExpandingIdentityMapper.__init__(self, - kernel.substitutions, kernel.get_var_name_generator()) + super(_InameSplitter, self).__init__(rule_mapping_context) self.within = within @@ -140,7 +141,9 @@ class _InameSplitter(ExpandingIdentityMapper): self.replacement_index = replacement_index def map_reduction(self, expr, expn_state): - if self.split_iname in expr.inames and self.within(expn_state.stack): + if (self.split_iname in expr.inames + and self.split_iname not in expn_state.arg_context + and self.within(expn_state.stack)): new_inames = list(expr.inames) new_inames.remove(self.split_iname) new_inames.extend([self.outer_iname, self.inner_iname]) @@ -149,13 +152,15 @@ class _InameSplitter(ExpandingIdentityMapper): return 
Reduction(expr.operation, tuple(new_inames), self.rec(expr.expr, expn_state)) else: - return ExpandingIdentityMapper.map_reduction(self, expr, expn_state) + return super(_InameSplitter, self).map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): - if expr.name == self.split_iname and self.within(expn_state.stack): + if (expr.name == self.split_iname + and self.split_iname not in expn_state.arg_context + and self.within(expn_state.stack)): return self.replacement_index else: - return ExpandingIdentityMapper.map_variable(self, expr, expn_state) + return super(_InameSplitter, self).map_variable(expr, expn_state) def split_iname(kernel, split_iname, inner_length, @@ -272,10 +277,13 @@ def split_iname(kernel, split_iname, inner_length, from loopy.context_matching import parse_stack_match within = parse_stack_match(within) - ins = _InameSplitter(kernel, within, + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + ins = _InameSplitter(rule_mapping_context, within, split_iname, outer_iname, inner_iname, new_loop_index) kernel = ins.map_kernel(kernel) + kernel = rule_mapping_context.finish_kernel(kernel) if existing_tag is not None: kernel = tag_inames(kernel, @@ -288,10 +296,10 @@ def split_iname(kernel, split_iname, inner_length, # {{{ join inames -class _InameJoiner(ExpandingSubstitutionMapper): - def __init__(self, kernel, within, subst_func, joined_inames, new_iname): - ExpandingSubstitutionMapper.__init__(self, - kernel.substitutions, kernel.get_var_name_generator(), +class _InameJoiner(RuleAwareSubstitutionMapper): + def __init__(self, rule_mapping_context, within, subst_func, + joined_inames, new_iname): + super(_InameJoiner, self).__init__(rule_mapping_context, subst_func, within) self.joined_inames = set(joined_inames) @@ -299,7 +307,8 @@ class _InameJoiner(ExpandingSubstitutionMapper): def map_reduction(self, expr, expn_state): expr_inames = set(expr.inames) - overlap = 
self.join_inames & expr_inames + overlap = (self.join_inames & expr_inames + - set(expn_state.arg_context)) if overlap and self.within(expn_state.stack): if overlap != expr_inames: raise LoopyError( @@ -317,7 +326,7 @@ class _InameJoiner(ExpandingSubstitutionMapper): return Reduction(expr.operation, tuple(new_inames), self.rec(expr.expr, expn_state)) else: - return ExpandingIdentityMapper.map_reduction(self, expr, expn_state) + return super(_InameJoiner, self).map_reduction(expr, expn_state) def join_inames(kernel, inames, new_iname=None, tag=None, within=None): @@ -419,11 +428,14 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): within = parse_stack_match(within) from pymbolic.mapper.substitutor import make_subst_func - ijoin = _InameJoiner(kernel, within, + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + ijoin = _InameJoiner(rule_mapping_context, within, make_subst_func(subst_dict), inames, new_iname) - kernel = ijoin.map_kernel(kernel) + kernel = rule_mapping_context.finish_kernel( + ijoin.map_kernel(kernel)) if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) @@ -488,33 +500,37 @@ def tag_inames(kernel, iname_to_tag, force=False): # {{{ duplicate inames -class _InameDuplicator(ExpandingIdentityMapper): - def __init__(self, rules, make_unique_var_name, +class _InameDuplicator(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, old_to_new, within): - ExpandingIdentityMapper.__init__(self, - rules, make_unique_var_name) + super(_InameDuplicator, self).__init__(rule_mapping_context) self.old_to_new = old_to_new self.old_inames_set = set(six.iterkeys(old_to_new)) self.within = within def map_reduction(self, expr, expn_state): - if set(expr.inames) & self.old_inames_set and self.within(expn_state.stack): + if (set(expr.inames) & self.old_inames_set + and self.within(expn_state.stack)): new_inames = tuple( self.old_to_new.get(iname, iname) + if 
iname not in expn_state.arg_context + else iname for iname in expr.inames) from loopy.symbolic import Reduction return Reduction(expr.operation, new_inames, self.rec(expr.expr, expn_state)) else: - return ExpandingIdentityMapper.map_reduction(self, expr, expn_state) + return super(_InameDuplicator, self).map_reduction(expr, expn_state) def map_variable(self, expr, expn_state): new_name = self.old_to_new.get(expr.name) - if new_name is None or not self.within(expn_state.stack): - return ExpandingIdentityMapper.map_variable(self, expr, expn_state) + if (new_name is None + or expr.name in expn_state.arg_context + or not self.within(expn_state.stack)): + return super(_InameDuplicator, self).map_variable(expr, expn_state) else: from pymbolic import var return var(new_name) @@ -591,11 +607,14 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # {{{ change the inames in the code - indup = _InameDuplicator(knl.substitutions, name_gen, + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, name_gen) + indup = _InameDuplicator(rule_mapping_context, old_to_new=dict(list(zip(inames, new_inames))), within=within) - knl = indup.map_kernel(knl) + knl = rule_mapping_context.finish_kernel( + indup.map_kernel(knl)) # }}} @@ -694,8 +713,6 @@ def link_inames(knl, inames, new_iname, within=None, tag=None): all_equal = True first_proj = projections[0] for proj in projections[1:]: - print(proj.gist(first_proj)) - print(first_proj.gist(proj)) all_equal = all_equal and (proj <= first_proj and first_proj <= proj) if not all_equal: @@ -721,10 +738,13 @@ def link_inames(knl, inames, new_iname, within=None, tag=None): within = parse_stack_match(within) from pymbolic.mapper.substitutor import make_subst_func - ijoin = ExpandingSubstitutionMapper(knl.substitutions, var_name_gen, + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, var_name_gen) + ijoin = RuleAwareSubstitutionMapper(rule_mapping_context, 
make_subst_func(subst_dict), within) - knl = ijoin.map_kernel(knl) + knl = rule_mapping_context.finish_kernel( + ijoin.map_kernel(knl)) # }}} @@ -1176,16 +1196,20 @@ def tag_data_axes(knl, ary_names, dim_tags): # {{{ split_reduction -class _ReductionSplitter(ExpandingIdentityMapper): - def __init__(self, kernel, within, inames, direction): - ExpandingIdentityMapper.__init__(self, - kernel.substitutions, kernel.get_var_name_generator()) +class _ReductionSplitter(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, within, inames, direction): + super(_ReductionSplitter, self).__init__( + rule_mapping_context) self.within = within self.inames = inames self.direction = direction def map_reduction(self, expr, expn_state): + if set(expr.inames) & set(expn_state.arg_context): + # FIXME + raise NotImplementedError() + if self.inames <= set(expr.inames) and self.within(expn_state.stack): leftover_inames = set(expr.inames) - self.inames @@ -1201,7 +1225,7 @@ class _ReductionSplitter(ExpandingIdentityMapper): else: assert False else: - return ExpandingIdentityMapper.map_reduction(self, expr, expn_state) + return super(_ReductionSplitter, self).map_reduction(expr, expn_state) def _split_reduction(kernel, inames, direction, within=None): @@ -1215,8 +1239,12 @@ def _split_reduction(kernel, inames, direction, within=None): from loopy.context_matching import parse_stack_match within = parse_stack_match(within) - rsplit = _ReductionSplitter(kernel, within, inames, direction) - return rsplit.map_kernel(kernel) + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + rsplit = _ReductionSplitter(rule_mapping_context, + within, inames, direction) + return rule_mapping_context.finish_kernel( + rsplit.map_kernel(kernel)) def split_reduction_inward(kernel, inames, within=None): @@ -1313,11 +1341,13 @@ def _fix_parameter(kernel, name, value): from loopy.context_matching import parse_stack_match within = 
parse_stack_match(None) - from loopy.symbolic import ExpandingSubstitutionMapper - esubst_map = ExpandingSubstitutionMapper( - kernel.substitutions, kernel.get_var_name_generator(), - subst_func, within=within) - return (esubst_map.map_kernel(kernel) + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + esubst_map = RuleAwareSubstitutionMapper( + rule_mapping_context, subst_func, within=within) + return ( + rule_mapping_context.finish_kernel( + esubst_map.map_kernel(kernel)) .copy( domains=new_domains, args=new_args, @@ -1570,11 +1600,11 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): equations = [equations] import re - EQN_RE = re.compile(r"^([^=]+)=([^=]+)$") + eqn_re = re.compile(r"^([^=]+)=([^=]+)$") def parse_equation(eqn): if isinstance(eqn, str): - eqn_match = EQN_RE.match(eqn) + eqn_match = eqn_re.match(eqn) if not eqn_match: raise ValueError("invalid equation: %s" % eqn) @@ -1618,10 +1648,14 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): var_name_gen = kernel.get_var_name_generator() from pymbolic.mapper.substitutor import make_subst_func - old_to_new = ExpandingSubstitutionMapper(kernel.substitutions, var_name_gen, + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + old_to_new = RuleAwareSubstitutionMapper(rule_mapping_context, make_subst_func(subst_dict), within=lambda stack: True) - kernel = (old_to_new.map_kernel(kernel) + kernel = ( + rule_mapping_context.finish_kernel( + old_to_new.map_kernel(kernel)) .copy( applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict] )) diff --git a/loopy/array_buffer.py b/loopy/array_buffer_map.py similarity index 96% rename from loopy/array_buffer.py rename to loopy/array_buffer_map.py index c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c..9189421271ea28c95be861c785243aabbd7bd934 100644 --- a/loopy/array_buffer.py +++ b/loopy/array_buffer_map.py @@ -42,7 +42,6 @@ 
class AccessDescriptor(Record): __slots__ = [ "identifier", - "expands_footprint", "storage_axis_exprs", ] @@ -138,16 +137,15 @@ def build_global_storage_to_sweep_map(kernel, access_descriptors, # build footprint for accdesc in access_descriptors: - if accdesc.expands_footprint: - stor2sweep = build_per_access_storage_to_domain_map( - accdesc, domain_dup_sweep, - storage_axis_names, - prime_sweep_inames) + stor2sweep = build_per_access_storage_to_domain_map( + accdesc, domain_dup_sweep, + storage_axis_names, + prime_sweep_inames) - if global_stor2sweep is None: - global_stor2sweep = stor2sweep - else: - global_stor2sweep = global_stor2sweep.union(stor2sweep) + if global_stor2sweep is None: + global_stor2sweep = stor2sweep + else: + global_stor2sweep = global_stor2sweep.union(stor2sweep) if isinstance(global_stor2sweep, isl.BasicMap): global_stor2sweep = isl.Map.from_basic_map(global_stor2sweep) @@ -336,9 +334,6 @@ class ArrayToBufferMap(object): return convexify(domain) def is_access_descriptor_in_footprint(self, accdesc): - if accdesc.expands_footprint: - return True - # Make all inames except the sweep parameters. (The footprint may depend on # those.) (I.e. only leave sweep inames as out parameters.) @@ -350,7 +345,7 @@ class ArrayToBufferMap(object): set(global_s2s_par_dom.get_var_names(dim_type.param)) & self.kernel.all_inames()) - for arg in accdesc.args: + for arg in accdesc.storage_axis_exprs: arg_inames.update(get_dependencies(arg)) arg_inames = frozenset(arg_inames) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index ecd74c1a6b3d9ee793889d44976dce1f4bdc0a65..774e1f86e672cbf62eefb8bdffaeb3da2b6cc76f 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -26,11 +26,12 @@ THE SOFTWARE. 
""" from pytools import Record +from warnings import warn import numpy as np import loopy as lp -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, AutomaticTestFailure AUTO_TEST_SKIP_RUN = False @@ -76,7 +77,7 @@ class TestArgInfo(Record): # {{{ "reference" arguments -def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value): +def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -143,36 +144,36 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value): numpy_strides = [itemsize*s for s in strides] storage_array = cl_array.empty(queue, alloc_size, dtype) - ary = cl_array.as_strided(storage_array, shape, numpy_strides) - if is_output: - if arg.arg_class is ImageArg: - raise LoopyError("write-mode images not supported in " - "automatic testing") + if is_output and arg.arg_class is ImageArg: + raise LoopyError("write-mode images not supported in " + "automatic testing") - if is_dtype_supported(dtype): - storage_array.fill(fill_value) - else: - from warnings import warn - warn("Cannot pre-fill array of dtype '%s' with set " - "value--zeroing instead" % dtype) - storage_array.view(np.uint8).fill(0) + fill_rand(storage_array) - ref_args[arg.name] = ary + if arg.arg_class is ImageArg: + # must be contiguous + pre_run_ary = pre_run_storage_array = storage_array.copy() + + ref_args[arg.name] = cl.image_from_array( + queue.context, ary.get()) else: - fill_rand(storage_array) - if arg.arg_class is ImageArg: - # must be contiguous - ref_args[arg.name] = cl.image_from_array( - queue.context, ary.get()) - else: - ref_args[arg.name] = ary + pre_run_storage_array = storage_array.copy() + + ary = cl_array.as_strided(storage_array, shape, numpy_strides) + pre_run_ary = cl_array.as_strided( + pre_run_storage_array, shape, numpy_strides) + ref_args[arg.name] = ary ref_arg_data.append( TestArgInfo( name=arg.name, ref_array=ary, 
ref_storage_array=storage_array, + + ref_pre_run_array=pre_run_ary, + ref_pre_run_storage_array=pre_run_storage_array, + ref_shape=shape, ref_strides=strides, ref_alloc_size=alloc_size, @@ -188,8 +189,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters, fill_value): # {{{ "full-scale" arguments -def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters, - fill_value): +def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -224,7 +224,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters, # must be contiguous args[arg.name] = cl.image_from_array( - queue.context, arg_desc.ref_array.get()) + queue.context, arg_desc.ref_pre_run_array.get()) elif arg.arg_class is GlobalArg: shape = evaluate(arg.unvec_shape, parameters) @@ -238,50 +238,37 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters, alloc_size = sum(astrd*(alen-1) for alen, astrd in zip(shape, strides)) + 1 - if arg.base_name in kernel.get_written_variables(): - storage_array = cl_array.empty(queue, alloc_size, dtype) - ary = cl_array.as_strided(storage_array, shape, numpy_strides) + # use contiguous array to transfer to host + host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get() - if is_dtype_supported(dtype): - storage_array.fill(fill_value) - else: - from warnings import warn - warn("Cannot pre-fill array of dtype '%s'" % dtype) - storage_array.view(np.uint8).fill(0) + # use device shape/strides + from pyopencl.compyte.array import as_strided + host_ref_array = as_strided(host_ref_contig_array, + arg_desc.ref_shape, arg_desc.ref_numpy_strides) - args[arg.name] = ary - else: - # use contiguous array to transfer to host - host_ref_contig_array = arg_desc.ref_storage_array.get() - - # use device shape/strides - from pyopencl.compyte.array import as_strided - host_ref_array = as_strided(host_ref_contig_array, - arg_desc.ref_shape, arg_desc.ref_numpy_strides) - - # 
flatten the thing - host_ref_flat_array = host_ref_array.flatten() - - # create host array with test shape (but not strides) - host_contig_array = np.empty(shape, dtype=dtype) - - common_len = min( - len(host_ref_flat_array), - len(host_contig_array.ravel())) - host_contig_array.ravel()[:common_len] = \ - host_ref_flat_array[:common_len] - - # create host array with test shape and storage layout - host_storage_array = np.empty(alloc_size, dtype) - host_array = as_strided( - host_storage_array, shape, numpy_strides) - host_array[:] = host_contig_array - - host_contig_array = arg_desc.ref_storage_array.get() - storage_array = cl_array.to_device(queue, host_storage_array) - ary = cl_array.as_strided(storage_array, shape, numpy_strides) + # flatten the thing + host_ref_flat_array = host_ref_array.flatten() + + # create host array with test shape (but not strides) + host_contig_array = np.empty(shape, dtype=dtype) + + common_len = min( + len(host_ref_flat_array), + len(host_contig_array.ravel())) + host_contig_array.ravel()[:common_len] = \ + host_ref_flat_array[:common_len] + + # create host array with test shape and storage layout + host_storage_array = np.empty(alloc_size, dtype) + host_array = as_strided( + host_storage_array, shape, numpy_strides) + host_array[:] = host_contig_array - args[arg.name] = ary + host_contig_array = arg_desc.ref_storage_array.get() + storage_array = cl_array.to_device(queue, host_storage_array) + ary = cl_array.as_strided(storage_array, shape, numpy_strides) + + args[arg.name] = ary arg_desc.test_storage_array = storage_array arg_desc.test_array = ary @@ -324,7 +311,7 @@ def _default_check_result(result, ref_result): # }}} -# {{{ ref device finder +# {{{ find device for reference test def _enumerate_cl_devices_for_ref_test(): import pyopencl as cl @@ -332,8 +319,6 @@ def _enumerate_cl_devices_for_ref_test(): noncpu_devs = [] cpu_devs = [] - from warnings import warn - for pf in cl.get_platforms(): if pf.name == "Portable Computing 
Language": # pocl not mature enough yet, sadly @@ -368,7 +353,7 @@ def auto_test_vs_ref( ref_knl, ctx, test_knl, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, - fills_entire_output=True, do_check=True, check_result=None + fills_entire_output=None, do_check=True, check_result=None ): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. @@ -386,46 +371,29 @@ def auto_test_vs_ref( for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): if ref_arg.name != test_arg.name: - raise LoopyError("ref_knl and test_knl argument lists disagee at index " + raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) if ref_arg.dtype != test_arg.dtype: - raise LoopyError("ref_knl and test_knl argument lists disagee at index " + raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) from loopy.compiled import CompiledKernel, get_highlighted_cl_code if isinstance(op_count, (int, float)): - from warnings import warn warn("op_count should be a list", stacklevel=2) op_count = [op_count] if isinstance(op_label, str): - from warnings import warn warn("op_label should be a list", stacklevel=2) op_label = [op_label] - read_and_written_args = ( - ref_knl.get_read_variables() - & ref_knl.get_written_variables() - & set(ref_knl.arg_dict)) - - if read_and_written_args: - # FIXME: In principle, that's possible to test - raise LoopyError("kernel reads *and* writes argument(s) '%s' " - "and therefore cannot be automatically tested" - % ", ".join(read_and_written_args)) - from time import time if check_result is None: check_result = _default_check_result - if fills_entire_output: - fill_value_ref = -17 - fill_value = -18 - else: - fill_value_ref = -17 - fill_value = fill_value_ref + if fills_entire_output is not None: + warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2) # {{{ 
compile and run reference code @@ -460,8 +428,7 @@ def auto_test_vs_ref( try: ref_args, ref_arg_data = \ make_ref_args(ref_sched_kernel, ref_cl_kernel_info.impl_arg_info, - ref_queue, parameters, - fill_value=fill_value_ref) + ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: @@ -523,7 +490,6 @@ def auto_test_vs_ref( args = None from loopy.kernel import LoopKernel if not isinstance(test_knl, LoopKernel): - from warnings import warn warn("Passing an iterable of kernels to auto_test_vs_ref " "is deprecated--just pass the kernel instead. " "Scheduling will be performed in auto_test_vs_ref.", @@ -552,7 +518,7 @@ def auto_test_vs_ref( cl_kernel_info = compiled.cl_kernel_info(frozenset()) args = make_args(kernel, cl_kernel_info.impl_arg_info, - queue, ref_arg_data, parameters, fill_value=fill_value) + queue, ref_arg_data, parameters) args["out_host"] = False print(75*"-") @@ -593,7 +559,9 @@ def auto_test_vs_ref( test_ary = test_ary[:common_len] error_is_small, error = check_result(test_ary, ref_ary) - assert error_is_small, error + if not error_is_small: + raise AutomaticTestFailure(error) + need_check = False events = [] diff --git a/loopy/buffer.py b/loopy/buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..d155dba7e852e6a3978c3c92d4f4a7cba29bc4f0 --- /dev/null +++ b/loopy/buffer.py @@ -0,0 +1,412 @@ +from __future__ import division, absolute_import +from six.moves import range + +__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, 
subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, + AccessDescriptor) +from loopy.symbolic import (get_dependencies, + RuleAwareIdentityMapper, SubstitutionRuleMappingContext, + SubstitutionMapper) +from pymbolic.mapper.substitutor import make_subst_func + +from pymbolic import var + + +# {{{ replace array access + +class ArrayAccessReplacer(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, + var_name, within, array_base_map, buf_var): + super(ArrayAccessReplacer, self).__init__(rule_mapping_context) + + self.within = within + + self.array_base_map = array_base_map + + self.var_name = var_name + self.modified_insn_ids = set() + + self.buf_var = buf_var + + def map_variable(self, expr, expn_state): + result = None + if expr.name == self.var_name and self.within(expn_state): + result = self.map_array_access((), expn_state) + + if result is None: + return super(ArrayAccessReplacer, self).map_variable(expr, expn_state) + else: + self.modified_insn_ids.add(expn_state.insn_id) + return result + + def map_subscript(self, expr, expn_state): + result = None + if expr.aggregate.name == self.var_name and self.within(expn_state): + result = self.map_array_access(expr.index, expn_state) + + if result is None: + return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state) + else: + 
self.modified_insn_ids.add(expn_state.insn_id) + return result + + def map_array_access(self, index, expn_state): + accdesc = AccessDescriptor( + identifier=None, + storage_axis_exprs=index) + + if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): + return None + + abm = self.array_base_map + + index = expn_state.apply_arg_context(index) + + assert len(index) == len(abm.non1_storage_axis_flags) + + access_subscript = [] + for i in range(len(index)): + if not abm.non1_storage_axis_flags[i]: + continue + + ax_index = index[i] + from loopy.isl_helpers import simplify_via_aff + ax_index = simplify_via_aff( + ax_index - abm.storage_base_indices[i]) + + access_subscript.append(ax_index) + + result = self.buf_var + if access_subscript: + result = result.index(tuple(access_subscript)) + + # Can't possibly be nested, but recurse anyway to + # make sure substitution rules referenced below here + # do not get thrown away. + self.rec(result, expn_state.copy(arg_context={})) + + return result + +# }}} + + +def buffer_array(kernel, var_name, buffer_inames, init_expression=None, + store_expression=None, within=None, default_tag="l.auto", + temporary_is_local=None, fetch_bounding_box=False): + """ + :arg init_expression: Either *None* (indicating the prior value of the buffered + array should be read) or an expression optionally involving the + variable 'base' (which references the associated location in the array + being buffered). + :arg store_expression: Either *None* or an expression involving + variables 'base' and 'buffer' (without array indices). 
+ """ + + # {{{ process arguments + + if isinstance(init_expression, str): + from loopy.symbolic import parse + init_expression = parse(init_expression) + + if isinstance(store_expression, str): + from loopy.symbolic import parse + store_expression = parse(store_expression) + + if isinstance(buffer_inames, str): + buffer_inames = [s.strip() + for s in buffer_inames.split(",") if s.strip()] + + for iname in buffer_inames: + if iname not in kernel.all_inames(): + raise RuntimeError("sweep iname '%s' is not a known iname" + % iname) + + buffer_inames = list(buffer_inames) + buffer_inames_set = frozenset(buffer_inames) + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + if var_name in kernel.arg_dict: + var_descr = kernel.arg_dict[var_name] + elif var_name in kernel.temporary_variables: + var_descr = kernel.temporary_variables[var_name] + else: + raise ValueError("variable '%s' not found" % var_name) + + from loopy.kernel.data import ArrayBase + if isinstance(var_descr, ArrayBase): + var_shape = var_descr.shape + else: + var_shape = () + + if temporary_is_local is None: + import loopy as lp + temporary_is_local = lp.auto + + # }}} + + var_name_gen = kernel.get_var_name_generator() + within_inames = set() + + access_descriptors = [] + for insn in kernel.instructions: + if not within((insn.id, insn.tags)): + continue + + for assignee, index in insn.assignees_and_indices(): + if assignee == var_name: + within_inames.update( + (get_dependencies(index) & kernel.all_inames()) + - buffer_inames_set) + access_descriptors.append( + AccessDescriptor( + identifier=insn.id, + storage_axis_exprs=index)) + + # {{{ find fetch/store inames + + init_inames = [] + store_inames = [] + new_iname_to_tag = {} + + for i in range(len(var_shape)): + init_iname = var_name_gen("%s_init_%d" % (var_name, i)) + store_iname = var_name_gen("%s_store_%d" % (var_name, i)) + + new_iname_to_tag[init_iname] = default_tag + new_iname_to_tag[store_iname] = 
default_tag + + init_inames.append(init_iname) + store_inames.append(store_iname) + + # }}} + + # {{{ modify loop domain + + non1_init_inames = [] + non1_store_inames = [] + + if var_shape: + # {{{ find domain to be changed + + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel, buffer_inames_set | within_inames) + + if domch.leaf_domain_index is not None: + # If the sweep inames are at home in parent domains, then we'll add + # fetches with loops over copies of these parent inames that will end + # up being scheduled *within* loops over these parents. + + for iname in buffer_inames_set: + if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: + raise RuntimeError("buffer iname '%s' is not 'at home' in the " + "sweep's leaf domain" % iname) + + # }}} + + abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames, + access_descriptors, len(var_shape)) + + for i in range(len(var_shape)): + if abm.non1_storage_axis_flags[i]: + non1_init_inames.append(init_inames[i]) + non1_store_inames.append(store_inames[i]) + else: + del new_iname_to_tag[init_inames[i]] + del new_iname_to_tag[store_inames[i]] + + new_domain = domch.domain + new_domain = abm.augment_domain_with_sweep( + new_domain, non1_init_inames, + boxify_sweep=fetch_bounding_box) + new_domain = abm.augment_domain_with_sweep( + new_domain, non1_store_inames, + boxify_sweep=fetch_bounding_box) + new_kernel_domains = domch.get_domains_with(new_domain) + del new_domain + + else: + # leave kernel domains unchanged + new_kernel_domains = kernel.domains + + abm = NoOpArrayToBufferMap() + + # }}} + + # {{{ set up temp variable + + import loopy as lp + + buf_var_name = var_name_gen(based_on=var_name+"_buf") + + new_temporary_variables = kernel.temporary_variables.copy() + temp_var = lp.TemporaryVariable( + name=buf_var_name, + dtype=var_descr.dtype, + base_indices=(0,)*len(abm.non1_storage_shape), + shape=tuple(abm.non1_storage_shape), + is_local=temporary_is_local) + + 
new_temporary_variables[buf_var_name] = temp_var + + # }}} + + new_insns = [] + + buf_var = var(buf_var_name) + + # {{{ generate init instruction + + buf_var_init = buf_var + if non1_init_inames: + buf_var_init = buf_var_init.index( + tuple(var(iname) for iname in non1_init_inames)) + + init_base = var(var_name) + + init_subscript = [] + init_iname_idx = 0 + if var_shape: + for i in range(len(var_shape)): + ax_subscript = abm.storage_base_indices[i] + if abm.non1_storage_axis_flags[i]: + ax_subscript += var(non1_init_inames[init_iname_idx]) + init_iname_idx += 1 + init_subscript.append(ax_subscript) + + if init_subscript: + init_base = init_base.index(tuple(init_subscript)) + + if init_expression is None: + init_expression = init_base + else: + init_expression = init_expression + init_expression = SubstitutionMapper( + make_subst_func({ + "base": init_base, + }))(init_expression) + + init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name) + from loopy.kernel.data import ExpressionInstruction + init_instruction = ExpressionInstruction(id=init_insn_id, + assignee=buf_var_init, + expression=init_expression, + forced_iname_deps=frozenset(within_inames), + insn_deps=frozenset(), + insn_deps_is_final=True, + ) + + # }}} + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + aar = ArrayAccessReplacer(rule_mapping_context, var_name, + within, abm, buf_var) + kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel)) + + did_write = False + for insn_id in aar.modified_insn_ids: + insn = kernel.id_to_insn[insn_id] + if any(assignee_name == buf_var_name + for assignee_name, _ in insn.assignees_and_indices()): + did_write = True + + # {{{ add init_insn_id to insn_deps + + new_insns = [] + + def none_to_empty_set(s): + if s is None: + return frozenset() + else: + return s + + for insn in kernel.instructions: + if insn.id in aar.modified_insn_ids: + new_insns.append( + insn.copy( + 
insn_deps=( + none_to_empty_set(insn.insn_deps) + | frozenset([init_insn_id])))) + else: + new_insns.append(insn) + + # }}} + + # {{{ generate store instruction + + buf_var_store = buf_var + if non1_store_inames: + buf_var_store = buf_var_store.index( + tuple(var(iname) for iname in non1_store_inames)) + + store_subscript = [] + store_iname_idx = 0 + if var_shape: + for i in range(len(var_shape)): + ax_subscript = abm.storage_base_indices[i] + if abm.non1_storage_axis_flags[i]: + ax_subscript += var(non1_store_inames[store_iname_idx]) + store_iname_idx += 1 + store_subscript.append(ax_subscript) + + store_target = var(var_name) + if store_subscript: + store_target = store_target.index(tuple(store_subscript)) + + if store_expression is None: + store_expression = buf_var_store + else: + store_expression = SubstitutionMapper( + make_subst_func({ + "base": store_target, + "buffer": buf_var_store, + }))(store_expression) + + from loopy.kernel.data import ExpressionInstruction + store_instruction = ExpressionInstruction( + id=kernel.make_unique_instruction_id(based_on="store_"+var_name), + insn_deps=frozenset(aar.modified_insn_ids), + assignee=store_target, + expression=store_expression, + forced_iname_deps=frozenset(within_inames)) + + # }}} + + new_insns.append(init_instruction) + if did_write: + new_insns.append(store_instruction) + + kernel = kernel.copy( + domains=new_kernel_domains, + instructions=new_insns, + temporary_variables=new_temporary_variables) + + from loopy import tag_inames + kernel = tag_inames(kernel, new_iname_to_tag) + + return kernel + +# vim: foldmethod=marker diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index dc0d0d4538c618bf1115b8636e5b50e826882b11..56d7f6706c8932a9a97202768bdd0053c5c36d10 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -74,6 +74,10 @@ class TypeInferenceFailure(LoopyError): pass +class AutomaticTestFailure(LoopyError): + pass + + class DependencyTypeInferenceFailure(TypeInferenceFailure): def 
__init__(self, message, symbol): TypeInferenceFailure.__init__(self, message) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 93be120d2cca92e9e887ffb7908816da82b2c925..fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -354,4 +354,13 @@ def boxify(cache_manager, domain, box_inames, context): return convexify(result) + +def simplify_via_aff(expr): + from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + deps = get_dependencies(expr) + return aff_to_expr(aff_from_expr( + isl.Space.create_from_names(isl.Context(), list(deps)), + expr)) + + # vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 68f7f131c4c40d2f647fb6c6095959b4884a3c41..14562b384a89f1455a3a29ac161682889f1d3e01 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -79,7 +79,7 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class kernel_state: +class kernel_state: # noqa INITIAL = 0 PREPROCESSED = 1 SCHEDULED = 2 @@ -922,12 +922,6 @@ class LoopKernel(RecordWithoutPickling): line = "%s: %s" % (iname, self.iname_to_tag.get(iname)) lines.append(line) - if self.substitutions: - lines.append(sep) - lines.append("SUBSTIUTION RULES:") - for rule_name in sorted(six.iterkeys(self.substitutions)): - lines.append(str(self.substitutions[rule_name])) - if self.temporary_variables: lines.append(sep) lines.append("TEMPORARIES:") @@ -935,6 +929,12 @@ class LoopKernel(RecordWithoutPickling): key=lambda tv: tv.name): lines.append(str(tv)) + if self.substitutions: + lines.append(sep) + lines.append("SUBSTITUTION RULES:") + for rule_name in sorted(six.iterkeys(self.substitutions)): + lines.append(str(self.substitutions[rule_name])) + lines.append(sep) lines.append("INSTRUCTIONS:") loop_list_width = 35 diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index ee14042d11263b6711b022fc546e7d1979f0da6b..3f5340d645495de42e8e384dc07c34d138be87f4 
100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -342,6 +342,15 @@ def parse_array_dim_tags(dim_tags, use_increasing_target_axes=False): for nl in ta_nesting_levels), target_axis)) + ta_nesting_level_increment = -min(ta_nesting_levels) + for i in range(len(result)): + if (isinstance(result[i], _StrideArrayDimTagBase) + and result[i].target_axis == target_axis + and result[i].layout_nesting_level is not None): + result[i] = result[i].copy( + layout_nesting_level=result[i].layout_nesting_level + + ta_nesting_level_increment) + # }}} return result diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a21d983d7059499a16242f6c4857a2a7332a4f82..e99dedbf9b4ec36dfebe082da970ff260e2fb27e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -459,16 +459,16 @@ class ArgumentGuesser: (assignee_var_name, _), = insn.assignees_and_indices() self.all_written_names.add(assignee_var_name) self.all_names.update(get_dependencies( - self.submap(insn.assignee, insn.id, insn.tags))) + self.submap(insn.assignee))) self.all_names.update(get_dependencies( - self.submap(insn.expression, insn.id, insn.tags))) + self.submap(insn.expression))) def find_index_rank(self, name): irf = IndexRankFinder(name) for insn in self.instructions: insn.with_transformed_expressions( - lambda expr: irf(self.submap(expr, insn.id, insn.tags))) + lambda expr: irf(self.submap(expr))) if not irf.index_ranks: return 0 @@ -859,8 +859,7 @@ def guess_arg_shape_if_requested(kernel, default_order): from loopy.kernel.array import ArrayBase from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper - submap = SubstitutionRuleExpander(kernel.substitutions, - kernel.get_var_name_generator()) + submap = SubstitutionRuleExpander(kernel.substitutions) for arg in kernel.args: if isinstance(arg, ArrayBase) and arg.shape is lp.auto: @@ -869,11 +868,12 @@ def guess_arg_shape_if_requested(kernel, default_order): try: for insn in kernel.instructions: if isinstance(insn, 
lp.ExpressionInstruction): - armap(submap(insn.assignee, insn.id, insn.tags), - kernel.insn_inames(insn)) - armap(submap(insn.expression, insn.id, insn.tags), - kernel.insn_inames(insn)) + armap(submap(insn.assignee), kernel.insn_inames(insn)) + armap(submap(insn.expression), kernel.insn_inames(insn)) except TypeError as e: + from traceback import print_exc + print_exc() + from loopy.diagnostic import LoopyError raise LoopyError( "Failed to (automatically, as requested) find " diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 871f5359b925703a8e71729af7585e51d051f7ca..ab747b9a66f82044e0fe7dc4591867fdd34d3519 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -28,10 +28,10 @@ THE SOFTWARE. import numpy as np from pytools import Record, memoize_method from loopy.kernel.array import ArrayBase -from loopy.diagnostic import LoopyError # noqa +from loopy.diagnostic import LoopyError -class auto(object): +class auto(object): # noqa """A generic placeholder object for something that should be automatically detected. See, for example, the *shape* or *strides* argument of :class:`GlobalArg`. @@ -484,6 +484,10 @@ class InstructionBase(Record): if insn_deps_is_final is None: insn_deps_is_final = False + if insn_deps_is_final and not isinstance(insn_deps, frozenset): + raise LoopyError("Setting insn_deps_is_final to True requires " + "actually specifying insn_deps") + if tags is None: tags = () diff --git a/loopy/padding.py b/loopy/padding.py index 8f747d413e0182f2a863bbe13d745362b993d1ee..4951c68543bc143f25a2c376b183d0f76e5d6c18 100644 --- a/loopy/padding.py +++ b/loopy/padding.py @@ -25,12 +25,12 @@ THE SOFTWARE. 
""" -from loopy.symbolic import ExpandingIdentityMapper +from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -class ArgAxisSplitHelper(ExpandingIdentityMapper): - def __init__(self, rules, var_name_gen, arg_names, handler): - ExpandingIdentityMapper.__init__(self, rules, var_name_gen) +class ArgAxisSplitHelper(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, arg_names, handler): + super(ArgAxisSplitHelper, self).__init__(rule_mapping_context) self.arg_names = arg_names self.handler = handler @@ -38,7 +38,7 @@ class ArgAxisSplitHelper(ExpandingIdentityMapper): if expr.aggregate.name in self.arg_names: return self.handler(expr) else: - return ExpandingIdentityMapper.map_subscript(self, expr, expn_state) + return super(ArgAxisSplitHelper, self).map_subscript(expr, expn_state) def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): @@ -205,9 +205,11 @@ def split_arg_axis(kernel, args_and_axes, count, auto_split_inames=True): return expr.aggregate.index(tuple(idx)) - aash = ArgAxisSplitHelper(kernel.substitutions, var_name_gen, + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + aash = ArgAxisSplitHelper(rule_mapping_context, set(six.iterkeys(arg_to_rest)), split_access_axis) - kernel = aash.map_kernel(kernel) + kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) kernel = kernel.copy(args=new_args) diff --git a/loopy/precompute.py b/loopy/precompute.py index de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c..e516dde139d370f60b05fe3d6740d6fbd9d833e8 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -25,15 +25,15 @@ THE SOFTWARE. 
""" -import islpy as isl -from loopy.symbolic import (get_dependencies, SubstitutionMapper, - ExpandingIdentityMapper) +from loopy.symbolic import (get_dependencies, + RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) from pymbolic.mapper.substitutor import make_subst_func import numpy as np from pymbolic import var -from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap, +from loopy.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) @@ -57,18 +57,11 @@ def storage_axis_exprs(storage_axis_sources, args): return result -def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr - deps = get_dependencies(expr) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.Context(), list(deps)), - expr)) +# {{{ gather rule invocations - -class RuleInvocationGatherer(ExpandingIdentityMapper): - def __init__(self, kernel, subst_name, subst_tag, within): - ExpandingIdentityMapper.__init__(self, - kernel.substitutions, kernel.get_var_name_generator()) +class RuleInvocationGatherer(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within): + super(RuleInvocationGatherer, self).__init__(rule_mapping_context) from loopy.symbolic import SubstitutionRuleExpander self.subst_expander = SubstitutionRuleExpander( @@ -90,18 +83,17 @@ class RuleInvocationGatherer(ExpandingIdentityMapper): process_me = process_me and self.within(expn_state.stack) if not process_me: - return ExpandingIdentityMapper.map_substitution( - self, name, tag, arguments, expn_state) + return super(RuleInvocationGatherer, self).map_substitution( + name, tag, arguments, expn_state) - rule = self.old_subst_rules[name] + rule = self.rule_mapping_context.old_subst_rules[name] arg_context = self.make_new_arg_context( name, rule.arguments, arguments, expn_state.arg_context) arg_deps = set() for arg_val in six.itervalues(arg_context): arg_deps = 
(arg_deps - | get_dependencies(self.subst_expander( - arg_val, insn_id=None, insn_tags=None))) + | get_dependencies(self.subst_expander(arg_val))) # FIXME: This is too strict--and the footprint machinery # needs to be taught how to deal with locally constant @@ -116,12 +108,11 @@ class RuleInvocationGatherer(ExpandingIdentityMapper): ", ".join(arg_deps - self.kernel.all_inames()), )) - return ExpandingIdentityMapper.map_substitution( - self, name, tag, arguments, expn_state) + return super(RuleInvocationGatherer, self).map_substitution( + name, tag, arguments, expn_state) args = [arg_context[arg_name] for arg_name in rule.arguments] - # Do not set expands_footprint here, it is set below. self.access_descriptors.append( RuleAccessDescriptor( identifier=access_descriptor_id(args, expn_state.stack), @@ -130,21 +121,19 @@ class RuleInvocationGatherer(ExpandingIdentityMapper): return 0 # exact value irrelevant +# }}} + + +# {{{ replace rule invocation -class RuleInvocationReplacer(ExpandingIdentityMapper): - def __init__(self, kernel, subst_name, subst_tag, within, +class RuleInvocationReplacer(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, subst_name, subst_tag, within, access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, - storage_base_indices, non1_storage_axis_names, + non1_storage_axis_names, target_var_name): - ExpandingIdentityMapper.__init__(self, - kernel.substitutions, kernel.get_var_name_generator()) - - from loopy.symbolic import SubstitutionRuleExpander - self.subst_expander = SubstitutionRuleExpander( - kernel.substitutions, kernel.get_var_name_generator()) + super(RuleInvocationReplacer, self).__init__(rule_mapping_context) - self.kernel = kernel self.subst_name = subst_name self.subst_tag = subst_tag self.within = within @@ -154,55 +143,44 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): self.storage_axis_names = storage_axis_names self.storage_axis_sources = storage_axis_sources - 
self.storage_base_indices = storage_base_indices self.non1_storage_axis_names = non1_storage_axis_names self.target_var_name = target_var_name def map_substitution(self, name, tag, arguments, expn_state): - process_me = name == self.subst_name + if not ( + name == self.subst_name + and self.within(expn_state.stack) + and (self.subst_tag is None or self.subst_tag == tag)): + return super(RuleInvocationReplacer, self).map_substitution( + name, tag, arguments, expn_state) - if self.subst_tag is not None and self.subst_tag != tag: - process_me = False + # {{{ check if in footprint - process_me = process_me and self.within(expn_state.stack) - - # {{{ find matching invocation descriptor - - rule = self.old_subst_rules[name] + rule = self.rule_mapping_context.old_subst_rules[name] arg_context = self.make_new_arg_context( name, rule.arguments, arguments, expn_state.arg_context) args = [arg_context[arg_name] for arg_name in rule.arguments] - if not process_me: - return ExpandingIdentityMapper.map_substitution( - self, name, tag, arguments, expn_state) - - matching_accdesc = None - for accdesc in self.access_descriptors: - if accdesc.identifier == access_descriptor_id(args, expn_state.stack): - # Could be more than one, that's fine. 
- matching_accdesc = accdesc - break - - assert matching_accdesc is not None + accdesc = AccessDescriptor( + storage_axis_exprs=storage_axis_exprs( + self.storage_axis_sources, args)) - accdesc = matching_accdesc - del matching_accdesc + if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): + return super(RuleInvocationReplacer, self).map_substitution( + name, tag, arguments, expn_state) # }}} - if not self.array_base_map.is_access_descriptor_in_footprint(accdesc): - return ExpandingIdentityMapper.map_substitution( - self, name, tag, arguments, expn_state) - assert len(arguments) == len(rule.arguments) + abm = self.array_base_map + stor_subscript = [] for sax_name, sax_source, sax_base_idx in zip( self.storage_axis_names, self.storage_axis_sources, - self.storage_base_indices): + abm.storage_base_indices): if sax_name not in self.non1_storage_axis_names: continue @@ -213,6 +191,7 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): # an iname ax_index = var(sax_source) + from loopy.isl_helpers import simplify_via_aff ax_index = simplify_via_aff(ax_index - sax_base_idx) stor_subscript.append(ax_index) @@ -220,13 +199,14 @@ class RuleInvocationReplacer(ExpandingIdentityMapper): if stor_subscript: new_outer_expr = new_outer_expr.index(tuple(stor_subscript)) - # Can't possibly be nested, but recurse anyway to - # make sure substitution rules referenced below here - # do not get thrown away. - self.rec(rule.expression, expn_state.copy(arg_context={})) + # Can't possibly be nested, and no need to traverse + # further as compute expression has already been seen + # by rule_mapping_context. 
return new_outer_expr +# }}} + def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={}, @@ -301,6 +281,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, raise RuntimeError("sweep iname '%s' is not a known iname" % iname) + sweep_inames = list(sweep_inames) + sweep_inames_set = frozenset(sweep_inames) + if isinstance(storage_axes, str): storage_axes = storage_axes.split(",") @@ -357,11 +340,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ process invocations in footprint generators, start access_descriptors - access_descriptors = [] - if footprint_generators: from pymbolic.primitives import Variable, Call + access_descriptors = [] + for fpg in footprint_generators: if isinstance(fpg, Variable): args = () @@ -374,7 +357,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, access_descriptors.append( RuleAccessDescriptor( identifier=access_descriptor_id(args, None), - expands_footprint=True, args=args )) @@ -382,34 +364,33 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ gather up invocations in kernel code, finish access_descriptors - invg = RuleInvocationGatherer(kernel, subst_name, subst_tag, within) - - import loopy as lp - for insn in kernel.instructions: - if isinstance(insn, lp.ExpressionInstruction): - invg(insn.expression, insn.id, insn.tags) + if not footprint_generators: + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + invg = RuleInvocationGatherer( + rule_mapping_context, kernel, subst_name, subst_tag, within) + del rule_mapping_context - for accdesc in invg.access_descriptors: - access_descriptors.append( - accdesc.copy(expands_footprint=footprint_generators is None)) + import loopy as lp + for insn in kernel.instructions: + if isinstance(insn, lp.ExpressionInstruction): + invg(insn.assignee, insn.id, insn.tags) + 
invg(insn.expression, insn.id, insn.tags) - if not access_descriptors: - raise RuntimeError("no invocations of '%s' found" % subst_name) + access_descriptors = invg.access_descriptors + if not access_descriptors: + raise RuntimeError("no invocations of '%s' found" % subst_name) # }}} - sweep_inames = list(sweep_inames) - sweep_inames_set = frozenset(sweep_inames) - # {{{ find inames used in arguments expanding_usage_arg_deps = set() for accdesc in access_descriptors: - if accdesc.expands_footprint: - for arg in accdesc.args: - expanding_usage_arg_deps.update( - get_dependencies(arg) & kernel.all_inames()) + for arg in accdesc.args: + expanding_usage_arg_deps.update( + get_dependencies(arg) & kernel.all_inames()) # }}} @@ -424,7 +405,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, submap = SubstitutionRuleExpander(kernel.substitutions) value_inames = get_dependencies( - submap(subst.expression, insn_id=None, insn_tags=None) + submap(subst.expression) ) & kernel.all_inames() if value_inames - expanding_usage_arg_deps < extra_storage_axes: raise RuntimeError("unreferenced sweep inames specified: " @@ -438,7 +419,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, list(extra_storage_axes) + list(range(len(subst.arguments)))) - expr_subst_dict = {} + prior_storage_axis_name_dict = {} storage_axis_names = [] storage_axis_sources = [] # number for arg#, or iname @@ -471,18 +452,12 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, new_iname_to_tag[name] = storage_axis_to_tag.get( tag_lookup_saxis, default_tag) - expr_subst_dict[old_name] = var(name) + prior_storage_axis_name_dict[name] = old_name del storage_axis_to_tag del storage_axes del new_storage_axis_names - compute_expr = ( - SubstitutionMapper(make_subst_func(expr_subst_dict)) - (subst.expression)) - - del expr_subst_dict - # }}} # {{{ fill out access_descriptors[...].storage_axis_exprs @@ -535,48 +510,68 @@ def precompute(kernel, subst_use, sweep_inames=[], 
within=None, # leave kernel domains unchanged new_kernel_domains = kernel.domains + non1_storage_axis_names = [] abm = NoOpArrayToBufferMap() + kernel = kernel.copy(domains=new_kernel_domains) + # {{{ set up compute insn target_var_name = var_name_gen(based_on=c_subst_name) - assignee = var(target_var_name) if non1_storage_axis_names: assignee = assignee.index( tuple(var(iname) for iname in non1_storage_axis_names)) - def zero_length_1_arg(arg_name): + # {{{ process substitutions on compute instruction + + storage_axis_subst_dict = {} + + for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices): if arg_name in non1_storage_axis_names: - return var(arg_name) + arg = var(arg_name) else: - return 0 + arg = 0 + + storage_axis_subst_dict[ + prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi - compute_expr = (SubstitutionMapper( - make_subst_func(dict( - (arg_name, zero_length_1_arg(arg_name)+bi) - for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices) - ))) - (compute_expr)) + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + from loopy.context_matching import AllStackMatch + expr_subst_map = RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func(storage_axis_subst_dict), + within=AllStackMatch()) + + compute_expression = expr_subst_map(subst.expression, None, None) + + # }}} from loopy.kernel.data import ExpressionInstruction + compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name) compute_insn = ExpressionInstruction( - id=kernel.make_unique_instruction_id(based_on=c_subst_name), + id=compute_insn_id, assignee=assignee, - expression=compute_expr) + expression=compute_expression) # }}} # {{{ substitute rule into expressions in kernel (if within footprint) - invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within, + invr = RuleInvocationReplacer(rule_mapping_context, + subst_name, subst_tag, within, access_descriptors, 
abm, storage_axis_names, storage_axis_sources, - abm.storage_base_indices, non1_storage_axis_names, + non1_storage_axis_names, target_var_name) kernel = invr.map_kernel(kernel) + kernel = kernel.copy( + instructions=[compute_insn] + kernel.instructions) + kernel = rule_mapping_context.finish_kernel(kernel) # }}} @@ -603,13 +598,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, new_temporary_variables[target_var_name] = temp_var - # }}} - kernel = kernel.copy( - domains=new_kernel_domains, - instructions=[compute_insn] + kernel.instructions, temporary_variables=new_temporary_variables) + # }}} + from loopy import tag_inames return tag_inames(kernel, new_iname_to_tag) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 64736a40a210d526ddbfa01854daac76b5daa1e7..f9cd3f3eb2140354decf61b475ba1742d5ea6832 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -26,8 +26,8 @@ THE SOFTWARE. import six import numpy as np from loopy.diagnostic import ( - LoopyError, LoopyWarning, WriteRaceConditionWarning, warn, - LoopyAdvisory) + LoopyError, WriteRaceConditionWarning, warn, + LoopyAdvisory, DependencyTypeInferenceFailure) from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder @@ -88,8 +88,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not isinstance(writer_insn, lp.ExpressionInstruction): continue - expr = subst_expander(writer_insn.expression, - insn_id=writer_insn_id, insn_tags=writer_insn.tags) + expr = subst_expander(writer_insn.expression) try: debug(" via expr %s" % expr) @@ -139,12 +138,8 @@ def infer_unknown_types(kernel, expect_completion=False): def debug(s): logger.debug("%s: %s" % (kernel.name, s)) + unexpanded_kernel = kernel if kernel.substitutions: - from warnings import warn as py_warn - py_warn("type inference called when substitution " - "rules are still unexpanded, expanding", - LoopyWarning, stacklevel=2) - from loopy.subst import expand_subst kernel = 
expand_subst(kernel) @@ -175,8 +170,7 @@ def infer_unknown_types(kernel, expect_completion=False): ])) from loopy.symbolic import SubstitutionRuleExpander - subst_expander = SubstitutionRuleExpander(kernel.substitutions, - kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) # {{{ work on type inference queue @@ -240,7 +234,7 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} - return kernel.copy( + return unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) @@ -419,7 +413,11 @@ def realize_reduction(kernel, insn_id_filter=None): target_var_name = var_name_gen("acc_"+"_".join(expr.inames)) target_var = var(target_var_name) - arg_dtype = type_inf_mapper(expr.expr) + try: + arg_dtype = type_inf_mapper(expr.expr) + except DependencyTypeInferenceFailure: + raise LoopyError("failed to determine type of accumulator for " + "reduction '%s'" % expr) from loopy.kernel.data import ExpressionInstruction, TemporaryVariable @@ -845,8 +843,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): continue coeffs = CoefficientCollector()(iexpr_i) for var, coeff in six.iteritems(coeffs): - assert isinstance(var, Variable) - if var.name in auto_axis_inames: # excludes '1', i.e. the constant + if (isinstance(var, Variable) + and var.name in auto_axis_inames): + # excludes '1', i.e. the constant new_stride = coeff*stride old_stride = iname_to_stride_expr.get(var.name, None) if old_stride is None or new_stride < old_stride: diff --git a/loopy/subst.py b/loopy/subst.py index d4dba4b5dfd9918926adaad2af9179813914e8be..3b112a4fd44880f6cd66019aff1023ee426ae125 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -27,7 +27,7 @@ THE SOFTWARE. 
from loopy.symbolic import ( get_dependencies, SubstitutionMapper, - ExpandingIdentityMapper) + RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func @@ -200,19 +200,20 @@ def extract_subst(kernel, subst_name, template, parameters=()): # {{{ temporary_to_subst -class TemporaryToSubstChanger(ExpandingIdentityMapper): - def __init__(self, kernel, temp_name, definition_insn_ids, - usage_to_definition, within): - self.var_name_gen = kernel.get_var_name_generator() +class TemporaryToSubstChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, temp_name, definition_insn_ids, + usage_to_definition, extra_arguments, within): + self.var_name_gen = rule_mapping_context.make_unique_var_name - super(TemporaryToSubstChanger, self).__init__( - kernel.substitutions, self.var_name_gen) + super(TemporaryToSubstChanger, self).__init__(rule_mapping_context) - self.kernel = kernel self.temp_name = temp_name self.definition_insn_ids = definition_insn_ids self.usage_to_definition = usage_to_definition + from pymbolic import var + self.extra_arguments = tuple(var(arg) for arg in extra_arguments) + self.within = within self.definition_insn_id_to_subst_name = {} @@ -230,7 +231,8 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper): return subst_name def map_variable(self, expr, expn_state): - if expr.name == self.temp_name: + if (expr.name == self.temp_name + and expr.name not in expn_state.arg_context): result = self.transform_access(None, expn_state) if result is not None: return result @@ -239,7 +241,8 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper): expr, expn_state) def map_subscript(self, expr, expn_state): - if expr.aggregate.name == self.temp_name: + if (expr.aggregate.name == self.temp_name + and expr.aggregate.name not in expn_state.arg_context): result = self.transform_access(expr.index, expn_state) if result is not None: return result @@ -248,7 
+251,7 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper): expr, expn_state) def transform_access(self, index, expn_state): - my_insn_id = expn_state.stack[0][0] + my_insn_id = expn_state.insn_id if my_insn_id in self.definition_insn_ids: return None @@ -259,10 +262,14 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper): self.saw_unmatched_usage_sites[my_def_id] = True return None - my_insn_id = expn_state.stack[0][0] - subst_name = self.get_subst_name(my_def_id) + if self.extra_arguments: + if index is None: + index = self.extra_arguments + else: + index = index + self.extra_arguments + from pymbolic import var if index is None: return var(subst_name) @@ -270,9 +277,11 @@ class TemporaryToSubstChanger(ExpandingIdentityMapper): return var(subst_name)(*index) -def temporary_to_subst(kernel, temp_name, within=None): +def temporary_to_subst(kernel, temp_name, extra_arguments=(), within=None): """Extract an assignment to a temporary variable - as a :ref:`substituion-rule`. The temporary may + as a :ref:`substitution-rule`. The temporary may be an array, in + which case the array indices will become arguments to the substitution + rule. :arg within: a stack match as understood by :func:`loopy.context_matching.parse_stack_match`. @@ -284,6 +293,9 @@ def temporary_to_subst(kernel, temp_name, within=None): as the temporary variable is left in place. 
""" + if isinstance(extra_arguments, str): + extra_arguments = tuple(s.strip() for s in extra_arguments.split(",")) + # {{{ establish the relevant definition of temp_name for each usage site dep_kernel = expand_subst(kernel) @@ -347,10 +359,13 @@ def temporary_to_subst(kernel, temp_name, within=None): from loopy.context_matching import parse_stack_match within = parse_stack_match(within) - tts = TemporaryToSubstChanger(kernel, temp_name, definition_insn_ids, - usage_to_definition, within) + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + tts = TemporaryToSubstChanger(rule_mapping_context, + temp_name, definition_insn_ids, + usage_to_definition, extra_arguments, within) - kernel = tts.map_kernel(kernel) + kernel = rule_mapping_context.finish_kernel(tts.map_kernel(kernel)) from loopy.kernel.data import SubstitutionRule @@ -376,7 +391,7 @@ def temporary_to_subst(kernel, temp_name, within=None): new_substs[subst_name] = SubstitutionRule( name=subst_name, - arguments=tuple(arguments), + arguments=tuple(arguments) + extra_arguments, expression=def_insn.expression) # }}} @@ -410,19 +425,18 @@ def temporary_to_subst(kernel, temp_name, within=None): # }}} -def expand_subst(kernel, ctx_match=None): +def expand_subst(kernel, within=None): logger.debug("%s: expand subst" % kernel.name) - from loopy.symbolic import SubstitutionRuleExpander + from loopy.symbolic import RuleAwareSubstitutionRuleExpander from loopy.context_matching import parse_stack_match - submap = SubstitutionRuleExpander(kernel.substitutions, - kernel.get_var_name_generator(), - parse_stack_match(ctx_match)) - - kernel = submap.map_kernel(kernel) - if ctx_match is None: - return kernel.copy(substitutions={}) - else: - return kernel + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + submap = RuleAwareSubstitutionRuleExpander( + rule_mapping_context, + kernel.substitutions, + 
parse_stack_match(within)) + + return rule_mapping_context.finish_kernel(submap.map_kernel(kernel)) # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 4492f032d0d355e9268dab049b55c0613b22b47b..8897f35938ee29e50d70edfdc52945bc2ad856ac 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,11 +1,8 @@ """Pymbolic mappers for loopy.""" -from __future__ import division -from __future__ import absolute_import +from __future__ import division, absolute_import import six -from six.moves import range -from six.moves import zip -from functools import reduce +from six.moves import range, zip, reduce __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -197,6 +194,41 @@ class DependencyMapper(DependencyMapperBase): map_linear_subscript = DependencyMapperBase.map_subscript + +class SubstitutionRuleExpander(IdentityMapper): + def __init__(self, rules): + self.rules = rules + + def map_variable(self, expr): + if expr.name in self.rules: + return self.map_substitution(expr.name, self.rules[expr.name], ()) + else: + return super(SubstitutionRuleExpander, self).map_variable(expr) + + def map_call(self, expr): + if expr.function.name in self.rules: + return self.map_substitution( + expr.function.name, + self.rules[expr.function.name], + expr.parameters) + else: + return super(SubstitutionRuleExpander, self).map_call(expr) + + def map_substitution(self, name, rule, arguments): + if len(rule.arguments) != len(arguments): + from loopy.diagnostic import LoopyError + raise LoopyError("number of arguments to '%s' does not match " + "definition" % name) + + from pymbolic.mapper.substitutor import make_subst_func + submap = SubstitutionMapper( + make_subst_func( + dict(zip(rule.arguments, arguments)))) + + expr = submap(rule.expression) + + return self.rec(expr) + # }}} @@ -333,7 +365,7 @@ def get_dependencies(expr): return frozenset(dep.name for dep in dep_mapper(expr)) -# {{{ identity mapper that expands subst rules on the fly +# {{{ rule-aware 
mappers def parse_tagged_name(expr): if isinstance(expr, TaggedVariable): @@ -346,12 +378,26 @@ def parse_tagged_name(expr): class ExpansionState(Record): """ - :ivar stack: a tuple representing the current expansion stack, as a tuple + .. attribute:: stack + + a tuple representing the current expansion stack, as a tuple of (name, tag) pairs. At the top level, this should be initialized to a tuple with the id of the calling instruction. - :ivar arg_context: a dict representing current argument values + + .. attribute:: arg_context + + a dict representing current argument values """ + @property + def insn_id(self): + return self.stack[0][0] + + def apply_arg_context(self, expr): + from pymbolic.mapper.substitutor import make_subst_func + return SubstitutionMapper( + make_subst_func(self.arg_context))(expr) + class SubstitutionRuleRenamer(IdentityMapper): def __init__(self, renames): @@ -395,11 +441,7 @@ def rename_subst_rules_in_instructions(insns, renames): for insn in insns] -class ExpandingIdentityMapper(IdentityMapper): - """Note: the third argument dragged around by this mapper is the - current :class:`ExpansionState`. - """ - +class SubstitutionRuleMappingContext(object): def __init__(self, old_subst_rules, make_unique_var_name): self.old_subst_rules = old_subst_rules self.make_unique_var_name = make_unique_var_name @@ -428,9 +470,84 @@ class ExpandingIdentityMapper(IdentityMapper): self.subst_rule_use_count[key] = self.subst_rule_use_count.get(key, 0) + 1 return new_name + def _get_new_substitutions_and_renames(self): + """This makes a new dictionary of substitutions from the ones + encountered in mapping all the encountered expressions. + It tries hard to keep substitution names the same--i.e. + if all derivative versions of a substitution rule ended + up with the same mapped version, then this version should + retain the name that the substitution rule had previously. 
+ Unfortunately, this can't be done in a single pass, and so + the routine returns an additional dictionary *subst_renames* + of renamings to be performed on the processed expressions. + + The returned substitutions already have the rename applied + to them. + + :returns: (new_substitutions, subst_renames) + """ + + from loopy.kernel.data import SubstitutionRule + + orig_name_histogram = {} + for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): + if self.subst_rule_use_count.get(key, 0): + orig_name_histogram[orig_name] = \ + orig_name_histogram.get(orig_name, 0) + 1 + + result = {} + renames = {} + + for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): + args, body = key + + if self.subst_rule_use_count.get(key, 0): + if orig_name_histogram[orig_name] == 1 and name != orig_name: + renames[name] = orig_name + name = orig_name + + result[name] = SubstitutionRule( + name=name, + arguments=args, + expression=body) + + # {{{ perform renames on new substitutions + + subst_renamer = SubstitutionRuleRenamer(renames) + + renamed_result = {} + for name, rule in six.iteritems(result): + renamed_result[name] = rule.copy( + expression=subst_renamer(rule.expression)) + + # }}} + + return renamed_result, renames + + def finish_kernel(self, kernel): + new_substs, renames = self._get_new_substitutions_and_renames() + + new_insns = rename_subst_rules_in_instructions(kernel.instructions, renames) + + return kernel.copy( + substitutions=new_substs, + instructions=new_insns) + + +class RuleAwareIdentityMapper(IdentityMapper): + """Note: the third argument dragged around by this mapper is the + current :class:`ExpansionState`. + + Subclasses of this must be careful to not touch identifiers that + are in :attr:`ExpansionState.arg_context`. 
+ """ + + def __init__(self, rule_mapping_context): + self.rule_mapping_context = rule_mapping_context + def map_variable(self, expr, expn_state): name, tag = parse_tagged_name(expr) - if name not in self.old_subst_rules: + if name not in self.rule_mapping_context.old_subst_rules: return IdentityMapper.map_variable(self, expr, expn_state) else: return self.map_substitution(name, tag, (), expn_state) @@ -441,8 +558,8 @@ class ExpandingIdentityMapper(IdentityMapper): name, tag = parse_tagged_name(expr.function) - if name not in self.old_subst_rules: - return IdentityMapper.map_call(self, expr, expn_state) + if name not in self.rule_mapping_context.old_subst_rules: + return super(RuleAwareIdentityMapper, self).map_call(expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -459,7 +576,7 @@ class ExpandingIdentityMapper(IdentityMapper): for formal_arg_name, arg_value in zip(arg_names, arguments)) def map_substitution(self, name, tag, arguments, expn_state): - rule = self.old_subst_rules[name] + rule = self.rule_mapping_context.old_subst_rules[name] rec_arguments = self.rec(arguments, expn_state) @@ -475,7 +592,8 @@ class ExpandingIdentityMapper(IdentityMapper): result = self.rec(rule.expression, new_expn_state) - new_name = self.register_subst_rule(name, rule.arguments, result) + new_name = self.rule_mapping_context.register_subst_rule( + name, rule.arguments, result) if tag is None: sym = Variable(new_name) @@ -496,60 +614,6 @@ class ExpandingIdentityMapper(IdentityMapper): return IdentityMapper.__call__(self, expr, ExpansionState( stack=stack, arg_context={})) - def _get_new_substitutions_and_renames(self): - """This makes a new dictionary of substitutions from the ones - encountered in mapping all the encountered expressions. - It tries hard to keep substitution names the same--i.e. 
- if all derivative versions of a substitution rule ended - up with the same mapped version, then this version should - retain the name that the substitution rule had previously. - Unfortunately, this can't be done in a single pass, and so - the routine returns an additional dictionary *subst_renames* - of renamings to be performed on the processed expressions. - - The returned substitutions already have the rename applied - to them. - - :returns: (new_substitutions, subst_renames) - """ - - from loopy.kernel.data import SubstitutionRule - - orig_name_histogram = {} - for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): - if self.subst_rule_use_count.get(key, 0): - orig_name_histogram[orig_name] = \ - orig_name_histogram.get(orig_name, 0) + 1 - - result = {} - renames = {} - - for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): - args, body = key - - if self.subst_rule_use_count.get(key, 0): - if orig_name_histogram[orig_name] == 1 and name != orig_name: - renames[name] = orig_name - name = orig_name - - result[name] = SubstitutionRule( - name=name, - arguments=args, - expression=body) - - # {{{ perform renames on new substitutions - - subst_renamer = SubstitutionRuleRenamer(renames) - - renamed_result = {} - for name, rule in six.iteritems(result): - renamed_result[name] = rule.copy( - expression=subst_renamer(rule.expression)) - - # }}} - - return renamed_result, renames - def map_instruction(self, insn): return insn @@ -558,49 +622,40 @@ class ExpandingIdentityMapper(IdentityMapper): # While subst rules are not allowed in assignees, the mapper # may perform tasks entirely unrelated to subst rules, so # we must map assignees, too. 
- - insn.with_transformed_expressions(self, insn.id, insn.tags) + self.map_instruction( + insn.with_transformed_expressions(self, insn.id, insn.tags)) for insn in kernel.instructions] - new_substs, renames = self._get_new_substitutions_and_renames() - - new_insns = [self.map_instruction(insn) - for insn in rename_subst_rules_in_instructions( - new_insns, renames)] + return kernel.copy(instructions=new_insns) - return kernel.copy( - substitutions=new_substs, - instructions=new_insns) - -class ExpandingSubstitutionMapper(ExpandingIdentityMapper): - def __init__(self, rules, make_unique_var_name, subst_func, within): - ExpandingIdentityMapper.__init__(self, rules, make_unique_var_name) +class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, subst_func, within): + super(RuleAwareSubstitutionMapper, self).__init__(rule_mapping_context) self.subst_func = subst_func self.within = within def map_variable(self, expr, expn_state): + if (expr.name in expn_state.arg_context + or not self.within(expn_state.stack)): + return super(RuleAwareSubstitutionMapper, self).map_variable( + expr, expn_state) + result = self.subst_func(expr) - if result is not None or not self.within(expn_state.stack): + if result is not None: return result else: - return ExpandingIdentityMapper.map_variable(self, expr, expn_state) + return super(RuleAwareSubstitutionMapper, self).map_variable( + expr, expn_state) -# }}} - - -# {{{ substitution rule expander -class SubstitutionRuleExpander(ExpandingIdentityMapper): - def __init__(self, rules, make_unique_var=None, ctx_match=None): - ExpandingIdentityMapper.__init__(self, rules, make_unique_var) +class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, rules, within): + super(RuleAwareSubstitutionRuleExpander, self).__init__(rule_mapping_context) - if ctx_match is None: - from loopy.context_matching import AllStackMatch - ctx_match = AllStackMatch() - - 
self.ctx_match = ctx_match + self.rules = rules + self.within = within def map_substitution(self, name, tag, arguments, expn_state): if tag is None: @@ -610,9 +665,9 @@ class SubstitutionRuleExpander(ExpandingIdentityMapper): new_stack = expn_state.stack + ((name, tags),) - if self.ctx_match(new_stack): + if self.within(new_stack): # expand - rule = self.old_subst_rules[name] + rule = self.rules[name] new_expn_state = expn_state.copy( stack=new_stack, @@ -630,8 +685,8 @@ class SubstitutionRuleExpander(ExpandingIdentityMapper): else: # do not expand - return ExpandingIdentityMapper.map_substitution( - self, name, tag, arguments, expn_state) + return super(RuleAwareSubstitutionRuleExpander, self).map_substitution( + name, tag, arguments, expn_state) # }}} diff --git a/loopy/tools.py b/loopy/tools.py index 29005cfbb6aca798d6b0f51c3dabf9a2b75e53a4..0c8898bb1ce20a732dbd0d64a679e18c98609edb 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -68,7 +68,7 @@ class LoopyKeyBuilder(KeyBuilderBase): for dict_key in sorted(six.iterkeys(key)): self.rec(key_hash, (dict_key, key[dict_key])) - def update_for_BasicSet(self, key_hash, key): + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) getattr(prn, "print_"+key._base_name)(key) diff --git a/test/test_fortran.py b/test/test_fortran.py index 33826e4063190f7d7d48b1bfcb4122931868bd1e..d51411a6e65ba750ddf8cfc2ee844c4bc167ab82 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -130,7 +130,7 @@ def test_temporary_to_subst(ctx_factory): integer n do i = 1, n - a = inp(n) + a = inp(i) out(i) = 5*a out2(i) = 6*a end do @@ -142,7 +142,7 @@ def test_temporary_to_subst(ctx_factory): ref_knl = knl - knl = lp.temporary_to_subst(knl, "a") + knl = lp.temporary_to_subst(knl, "a", "i") ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) @@ -273,26 +273,25 @@ def test_tagged(ctx_factory): assert sum(1 for insn in lp.find_instructions(knl, 
"*$input")) == 2 -def test_matmul(ctx_factory): +@pytest.mark.parametrize("buffer_inames", [ + "", + "i_inner,j_inner", + ]) +def test_matmul(ctx_factory, buffer_inames): fortran_src = """ subroutine dgemm(m,n,l,a,b,c) implicit none - real*8 temp, a(m,l),b(l,n),c(m,n) + real*8 a(m,l),b(l,n),c(m,n) integer m,n,k,i,j,l do j = 1,n do i = 1,m - temp = 0 do k = 1,l - temp = temp + b(k,j)*a(i,k) + c(i,j) = c(i,j) + b(k,j)*a(i,k) end do - c(i,j) = temp end do end do end subroutine - - !$loopy begin transform - !$loopy end transform """ from loopy.frontend.fortran import f2loopy @@ -307,14 +306,71 @@ def test_matmul(ctx_factory): knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0") knl = lp.split_iname(knl, "k", 32) + knl = lp.assume(knl, "n mod 32 = 0") + knl = lp.assume(knl, "m mod 32 = 0") + knl = lp.assume(knl, "l mod 16 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") knl = lp.precompute(knl, "a_acc", "k_inner,i_inner") knl = lp.precompute(knl, "b_acc", "j_inner,k_inner") + knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, + init_expression="0", store_expression="base+buffer") + ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + + # # FIXME: Make r/w tests possible, reactivate the above + # knl = lp.preprocess_kernel(knl) + # for k in lp.generate_loop_schedules(knl): + # code, _ = lp.generate_code(k) + # print(code) + + +@pytest.mark.xfail +def test_batched_sparse(): + fortran_src = """ + subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y) + implicit none + + integer rowstarts(m+1), colindices(nvals) + real*8 values(nvals) + real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs) + + integer m, n, rowstart, rowend, length, nvals, nvecs + + do i = 1, m + rowstart = rowstarts(i) + rowend = rowstarts(i+1) + length 
= rowend - rowstart + + do k = 1, nvecs + rowsum(k) = 0 + enddo + do k = 1, nvecs + do j = 1, length + rowsum(k) = rowsum(k) + & + x(colindices(rowstart+j-1),k)*values(rowstart+j-1) + end do + end do + do k = 1, nvecs + y(i,k) = rowsum(k) + end do + end do + end + + """ + + from loopy.frontend.fortran import f2loopy + knl, = f2loopy(fortran_src) + + knl = lp.split_iname(knl, "i", 128) + knl = lp.tag_inames(knl, {"i_outer": "g.0"}) + knl = lp.tag_inames(knl, {"i_inner": "l.0"}) + knl = lp.add_prefetch(knl, "values") + knl = lp.add_prefetch(knl, "colindices") + knl = lp.fix_parameters(knl, nvecs=4) if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/test_linalg.py index d1a995b19fe22c818f03cadc7131e7732d3e1d58..9dbedf320eec358d29245e35ab009ded910f1b5c 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -615,8 +615,8 @@ def test_small_batched_matvec(ctx_factory): order = "C" - K = 9997 - Np = 36 + K = 9997 # noqa + Np = 36 # noqa knl = lp.make_kernel( "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np, diff --git a/test/test_loopy.py b/test/test_loopy.py index 6a59eb7388afedcc65bb6aa3d42f57b2e7152ac3..f99c0b3b3638d024530c339e4340e27d71043c4e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1816,6 +1816,69 @@ def test_affine_map_inames(): print(knl) +def test_precompute_confusing_subst_arguments(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + D(i):=a[i+1]-a[i] + b[i,j] = D(j) + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + +def test_precompute_nested_subst(ctx_factory): + ctx = ctx_factory() + + knl = 
lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + E:=a[i] + D:=E*E + b[i] = D + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D", "i_inner") + + # There's only one surviving 'E' rule. + assert len([ + rule_name + for rule_name in knl.substitutions + if rule_name.startswith("E")]) == 1 + + # That rule should use the newly created prefetch inames, + # not the prior 'i_inner' + assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + def test_poisson(ctx_factory): # Stolen from Peter Coogan and Rob Kirby for FEM assembly ctx = ctx_factory() @@ -1873,6 +1936,28 @@ def test_poisson(ctx_factory): parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) +def test_auto_test_can_detect_problems(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n}", + """ + a[i,j] = 25 + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.link_inames(knl, "i,j", "i0") + + from loopy.diagnostic import AutomaticTestFailure + with pytest.raises(AutomaticTestFailure): + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=123)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])