diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index eeb510f2f23a786135c719af081185e3045bade2..5ea075d194a9da75a1c18d180c65239be83eb85e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,6 +11,7 @@ Python 2.7 AMD CPU: - amd-cl-cpu except: - tags + Python 2.6 POCL: script: - export PY_EXE=python2.6 @@ -24,6 +25,7 @@ Python 2.6 POCL: - pocl except: - tags + Python 3.5 AMD CPU: script: - export PY_EXE=python3.5 @@ -38,6 +40,7 @@ Python 3.5 AMD CPU: - amd-cl-cpu except: - tags + Python 2.7 POCL: script: - export PY_EXE=python2.7 @@ -51,6 +54,7 @@ Python 2.7 POCL: - pocl except: - tags + Python 2.7 with legacy PyOpenCL: script: - export PY_EXE=python2.7 @@ -65,6 +69,22 @@ Python 2.7 with legacy PyOpenCL: - pocl except: - tags + +Python 3.6 POCL: + script: + - export PY_EXE=python3.6 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="numpy mako" + - export LOOPY_NO_CACHE=1 + - export NO_DOCTESTS=1 + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3.6 + - pocl + except: + - tags + # PyPy AMD CPU: # script: # - export PY_EXE=pypy diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 2d754dec23762b289d3bf30ed6a7740326b11817..8674434084077ba1f791c46123a083346715209e 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -249,16 +249,32 @@ These are usually key-value pairs. The following attributes are recognized: dependencies. * ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary - the instructions with identifiers ``id1`` and ``id2`` to the, even if - a dependency chain exists and variables are accessed in an apparently - racy way. + for the instructions with identifiers ``id1`` and ``id2``, even if a + dependency chain exists and variables are accessed in an apparently racy + way. Identifiers here are allowed to be wildcards as defined by the Python function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with ``id_prefix``. + Identifiers (including wildcards) accept an optional `@scope` suffix, + which prescribes that no synchronization at level `scope` is needed. + This does not preclude barriers at levels different from `scope`. + Allowable `scope` values are: + + * `local` + * `global` + * `any` + + As an example, ``nosync=id1@local:id2@global`` prescribes that no local + synchronization is needed with instruction ``id1`` and no global + synchronization is needed with instruction ``id2``. + + ``nosync=id1@any`` has the same effect as ``nosync=id1``. + * ``nosync_query=...`` provides an alternative way of specifying ``nosync``, - just like ``dep_query`` and ``dep``. + just like ``dep_query`` and ``dep``. As with ``nosync``, ``nosync_query`` + accepts an optional `@scope` suffix. * ``priority=integer`` sets the instructions priority to the value ``integer``. Instructions with higher priority will be scheduled sooner, diff --git a/doc/tutorial.rst b/doc/tutorial.rst index d44e8f250ac7cbc88ad3338e4031064002133a65..942c7d56e01f9d037b0e2b601f88bc8b96dda151 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -54,7 +54,9 @@ And some data on the host: .. }}} We'll also disable console syntax highlighting because it confuses -doctest:: +doctest: + +.. doctest:: >>> # not a documented interface >>> import loopy.options @@ -120,7 +122,7 @@ always see loopy's view of a kernel by printing it. 
i: None --------------------------------------------------------------------------- INSTRUCTIONS: - [i] out[i] <- 2*a[i] # insn + [i] out[i] <- 2*a[i] # insn --------------------------------------------------------------------------- You'll likely have noticed that there's quite a bit more information here @@ -454,8 +456,8 @@ control is the nesting of loops. For example, should the *i* loop be nested around the *j* loop, or the other way around, in the following simple zero-fill kernel? -It turns out that Loopy will typically choose a loop nesting for us, but it -does not like doing so. Loo.py will react to the following code +It turns out that Loopy will choose a loop nesting for us, but it might be +ambiguous. Consider the following code: .. doctest:: @@ -465,13 +467,8 @@ does not like doing so. Loo.py will react to the following code ... a[i,j] = 0 ... """) -By saying:: - - LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring - -And by picking one of the possible loop orderings at random. - -The warning (and the nondeterminism it warns about) is easily resolved: +Both nestings of the inames `i` and `j` result in a correct kernel. +This ambiguity can be resolved: .. doctest:: diff --git a/loopy/check.py b/loopy/check.py index 7562eacd76079b83a20e75e965787676c3d4eb9a..6a1e3dc33a33b826ad54c42a549b35ad275d9fe5 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -693,13 +693,13 @@ def check_implemented_domains(kernel, implemented_domains, code=None): parameter_inames = set( insn_domain.get_dim_name(dim_type.param, i) - for i in range(insn_domain.dim(dim_type.param))) + for i in range(insn_impl_domain.dim(dim_type.param))) lines = [] - for kind, diff_set, gist_domain in [ - ("implemented, but not desired", i_minus_d, + for bigger, smaller, diff_set, gist_domain in [ + ("implemented", "desired", i_minus_d, desired_domain.gist(insn_impl_domain)), - ("desired, but not implemented", d_minus_i, + ("desired", "implemented", d_minus_i, insn_impl_domain.gist(desired_domain))]: if diff_set.is_empty(): @@ -721,9 +721,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None): iname, pt.get_coordinate_val(tp, dim).to_python())) lines.append( - "sample point in %s: %s" % (kind, ", ".join(point_axes))) + "sample point in %s but not %s: %s" % ( + bigger, smaller, ", ".join(point_axes))) lines.append( - "gist of %s: %s" % (kind, gist_domain)) + "gist of constraints in %s but not %s: %s" % ( + smaller, bigger, gist_domain)) if code is not None: print(79*"-") diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 6f312ec798e13fa4b1d183c27578089857b13e3d..009dadc1a0d6236f092029dbc03ad0c035c7b8f8 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -506,6 +506,12 @@ def generate_code_v2(kernel): # }}} + # For faster unpickling in the common case when implemented_domains isn't needed. 
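+    # (A LazilyUnpicklingDictionary keeps its values pickled until they are first accessed.)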
+ from loopy.tools import LazilyUnpicklingDictionary + codegen_result = codegen_result.copy( + implemented_domains=LazilyUnpicklingDictionary( + codegen_result.implemented_domains)) + logger.info("%s: generate code: done" % kernel.name) if CACHING_ENABLED: diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index c490abb6ed1635c135fc77468f27cd833b1d57b2..3ef7c8f6ad6c8af09dd01bf9e1341179d2be0be7 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -126,10 +126,13 @@ def generate_assignment_instruction_code(codegen_state, insn): # }}} - from pymbolic.primitives import Variable, Subscript + from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript lhs = insn.assignee + if isinstance(lhs, Lookup): + lhs = lhs.aggregate + if isinstance(lhs, Variable): assignee_var_name = lhs.name assignee_indices = () @@ -145,6 +148,8 @@ def generate_assignment_instruction_code(codegen_state, insn): else: raise RuntimeError("invalid lvalue '%s'" % lhs) + del lhs + result = codegen_state.ast_builder.emit_assignment(codegen_state, insn) # {{{ tracing @@ -221,7 +226,7 @@ def generate_call_code(codegen_state, insn): if codegen_state.vectorization_info: if insn.atomicity: - raise Unvectorizable("function call") + raise Unvectorizable("atomic operation") # }}} diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ad80475c1d27f67b3df8a885f60dd96ff28efe6a..0110a06095fa0bd690045f050136027d7bed3a28 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -220,7 +220,7 @@ def intersect_kernel_with_slab(kernel, slab, iname): domch = DomainChanger(kernel, (iname,)) orig_domain = domch.get_original_domain() - orig_domain, slab = isl.align_two(orig_domain, slab) + orig_domain, slab = isl.align_two(slab, orig_domain) return domch.get_kernel_with(orig_domain & slab) @@ -376,10 +376,10 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # move inames that are usable into parameters moved_inames = [] - for iname in dom_and_slab.get_var_names(dim_type.set): - if iname in usable_inames: - moved_inames.append(iname) - dt, idx = dom_and_slab.get_var_dict()[iname] + for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)): + if das_iname in usable_inames: + moved_inames.append(das_iname) + dt, idx = dom_and_slab.get_var_dict()[das_iname] dom_and_slab = dom_and_slab.move_dims( dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx, 1) @@ -422,8 +422,9 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): impl_lbound, impl_ubound) - for iname in moved_inames: - dt, idx = impl_loop.get_var_dict()[iname] + for moved_iname in moved_inames: + # move moved_iname to 'set' dim_type in impl_loop + dt, idx = impl_loop.get_var_dict()[moved_iname] impl_loop = impl_loop.move_dims( dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1) @@ -432,7 +433,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state .intersect(impl_loop) .copy(kernel=intersect_kernel_with_slab( - kernel, slab, iname))) + kernel, slab, loop_iname))) inner = build_loop_nest(new_codegen_state, sched_index+1) @@ -464,12 +465,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): else: inner_ast = inner.current_ast(codegen_state) + + from loopy.isl_helpers import simplify_pw_aff + result.append( inner.with_new_ast( codegen_state, astb.emit_sequential_loop( codegen_state, loop_iname, kernel.index_dtype, - pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast))) + 
pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), + pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), + inner_ast))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 602830de38e457c5ff4a55d7685dc346a7b4de35..0ebe90fbca0d31c05eaee64321e2b73709292331 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -142,6 +142,55 @@ def iname_rel_aff(space, iname, rel, aff): raise ValueError("unknown value of 'rel': %s" % rel) +# {{{ simplify_pw_aff + +def simplify_pw_aff(pw_aff, context=None): + if context is not None: + pw_aff = pw_aff.gist_params(context) + + old_pw_aff = pw_aff + + while True: + restart = False + did_something = False + + pieces = pw_aff.get_pieces() + for i, (dom_i, aff_i) in enumerate(pieces): + for j, (dom_j, aff_j) in enumerate(pieces): + if i == j: + continue + + if aff_i.gist(dom_j).is_equal(aff_j): + # aff_i is sufficient to cover aff_j, eliminate aff_j + new_pieces = pieces[:] + if i < j: + new_pieces.pop(j) + new_pieces.pop(i) + else: + new_pieces.pop(i) + new_pieces.pop(j) + + pw_aff = isl.PwAff.alloc(dom_i | dom_j, aff_i) + for dom, aff in new_pieces: + pw_aff = pw_aff.union_max(isl.PwAff.alloc(dom, aff)) + + restart = True + did_something = True + break + + if restart: + break + + if not did_something: + break + + assert pw_aff.get_aggregate_domain() <= pw_aff.eq_set(old_pw_aff) + + return pw_aff + +# }}} + + # {{{ static_*_of_pw_aff def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5dff5e53c04521bcd2f53cb2fc971ec12227149c..793d31791a3295ef1d7c03132f43489ab828f089 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -199,7 +199,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # When kernels get intersected in slab decomposition, # their grid sizes shouldn't change. This provides # a way to forward sub-kernel grid size requests. - get_grid_sizes_for_insn_ids=None): + overridden_get_grid_sizes_for_insn_ids=None): if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager @@ -265,10 +265,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): if np.iinfo(index_dtype.numpy_dtype).min >= 0: raise TypeError("index_dtype must be signed") - if get_grid_sizes_for_insn_ids is not None: - # overwrites method down below - self.get_grid_sizes_for_insn_ids = get_grid_sizes_for_insn_ids - if state not in [ kernel_state.INITIAL, kernel_state.PREPROCESSED, @@ -302,7 +298,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): index_dtype=index_dtype, options=options, state=state, - target=target) + target=target, + overridden_get_grid_sizes_for_insn_ids=( + overridden_get_grid_sizes_for_insn_ids)) self._kernel_executor_cache = {} @@ -467,7 +465,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): discard_level_count = 0 while discard_level_count < len(iname_set_stack): - last_inames = iname_set_stack[-1-discard_level_count] + last_inames = ( + iname_set_stack[-1-discard_level_count]) + if discard_level_count + 1 < len(iname_set_stack): + last_inames = ( + last_inames - iname_set_stack[-2-discard_level_count]) if is_domain_dependent_on_inames(self, dom_idx, last_inames): break @@ -919,6 +921,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): *global_size* and *local_size* are :class:`islpy.PwAff` objects.
""" + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + ignore_auto=ignore_auto) + all_inames_by_insns = set() for insn_id in insn_ids: all_inames_by_insns |= self.insn_inames(insn_id) @@ -1071,6 +1078,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ pretty-printing + @memoize_method + def _get_iname_order_for_printing(self): + try: + from loopy.kernel.tools import get_visual_iname_order_embedding + embedding = get_visual_iname_order_embedding(self) + except ValueError: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(self, + "iname-order", + "get_visual_iname_order_embedding() could not determine a " + "consistent iname nesting order") + embedding = dict((iname, iname) for iname in self.all_inames()) + + return embedding + def stringify(self, what=None, with_dependencies=False): all_what = set([ "name", @@ -1112,6 +1134,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): sep = 75*"-" + def natorder(key): + # Return natural ordering for strings, as opposed to dictionary order. + # E.g. will result in + # 'abc1' < 'abc9' < 'abc10' + # rather than + # 'abc1' < 'abc10' < 'abc9' + # Based on + # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 + import re + return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] + + def natsorted(seq, key=lambda x: x): + return sorted(seq, key=lambda y: natorder(key(y))) + if "name" in what: lines.append(sep) lines.append("KERNEL: " + kernel.name) @@ -1119,7 +1155,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if "arguments" in what: lines.append(sep) lines.append("ARGUMENTS:") - for arg_name in sorted(kernel.arg_dict): + for arg_name in natsorted(kernel.arg_dict): lines.append(str(kernel.arg_dict[arg_name])) if "domains" in what: @@ -1131,21 +1167,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): if "tags" in what: lines.append(sep) lines.append("INAME IMPLEMENTATION TAGS:") - for iname in sorted(kernel.all_inames()): + for iname in natsorted(kernel.all_inames()): line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname)) lines.append(line) if "variables" in what and kernel.temporary_variables: lines.append(sep) lines.append("TEMPORARIES:") - for tv in sorted(six.itervalues(kernel.temporary_variables), + for tv in natsorted(six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): lines.append(str(tv)) if "rules" in what and kernel.substitutions: lines.append(sep) lines.append("SUBSTIUTION RULES:") - for rule_name in sorted(six.iterkeys(kernel.substitutions)): + for rule_name in natsorted(six.iterkeys(kernel.substitutions)): lines.append(str(kernel.substitutions[rule_name])) if "instructions" in what: @@ -1163,7 +1199,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return printed_insn_ids.add(insn.id) - for dep_id in sorted(insn.depends_on): + for dep_id in natsorted(insn.depends_on): insert_insn_into_order(kernel.id_to_insn[dep_id]) printed_insn_order.append(insn) @@ -1210,7 +1246,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): raise LoopyError("unexpected instruction type: %s" % type(insn).__name__) - loop_list = ",".join(sorted(kernel.insn_inames(insn))) + order = self._get_iname_order_for_printing() + loop_list = ",".join( + sorted(kernel.insn_inames(insn), key=lambda iname: order[iname])) options = [Fore.GREEN+insn.id+Style.RESET_ALL] if insn.priority: @@ -1226,9 +1264,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): options.append( "conflicts=%s" % 
":".join(insn.conflicts_with_groups)) if insn.no_sync_with: - # FIXME: Find a syntax to express scopes. - options.append("no_sync_with=%s" % ":".join(id for id, _ in - insn.no_sync_with)) + options.append("no_sync_with=%s" % ":".join( + "%s@%s" % entry for entry in sorted(insn.no_sync_with))) if lhs: core = "%s <- %s" % ( @@ -1239,14 +1276,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): core = Fore.MAGENTA+rhs+Style.RESET_ALL if len(loop_list) > loop_list_width: - lines.append("%s[%s]" % (arrows, loop_list)) - lines.append("%s%s%s # %s" % ( + lines.append("%s [%s]" % (arrows, loop_list)) + lines.append("%s %s%s # %s" % ( extender, (loop_list_width+2)*" ", core, ", ".join(options))) else: - lines.append("%s[%s]%s%s # %s" % ( + lines.append("%s [%s]%s%s # %s" % ( arrows, loop_list, " "*(loop_list_width-len(loop_list)), core, @@ -1280,6 +1317,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): return "\n".join(lines) def __str__(self): + if six.PY3: + return self.stringify() + else: + # Path of least resistance... + return self.stringify().encode("utf-8") + + def __unicode__(self): return self.stringify() # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 019b899594ee1670d1ee59654f58b93ff8afacb9..14b18150f5b84218f39ba23662eb6106ffb596a0 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -26,6 +26,8 @@ THE SOFTWARE. import numpy as np + +from pymbolic.mapper import CSECachingMapperMixin from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import IdentityMapper, WalkMapper from loopy.kernel.data import ( @@ -165,6 +167,11 @@ def get_default_insn_options_dict(): } +from collections import namedtuple + +_NosyncParseResult = namedtuple("_NosyncParseResult", "expr, scope") + + def parse_insn_options(opt_dict, options_str, assignee_names=None): if options_str is None: return opt_dict @@ -173,6 +180,20 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): result = opt_dict.copy() + def parse_nosync_option(opt_value): + if "@" in opt_value: + expr, scope = opt_value.split("@") + else: + expr = opt_value + scope = "any" + allowable_scopes = ("local", "global", "any") + if scope not in allowable_scopes: + raise ValueError( + "unknown scope for nosync option: '%s' " + "(allowable scopes are %s)" % + (scope, ', '.join("'%s'" % s for s in allowable_scopes))) + return _NosyncParseResult(expr, scope) + for option in options_str.split(","): option = option.strip() if not option: @@ -240,23 +261,24 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): raise LoopyError("'nosync' option may not be specified " "in a 'with' block") - # TODO: Come up with a syntax that allows the user to express - # different synchronization scopes. result["no_sync_with"] = result["no_sync_with"].union(frozenset( - (intern(dep.strip()), "any") - for dep in opt_value.split(":") if dep.strip())) + (option.expr.strip(), option.scope) + for option in ( + parse_nosync_option(entry) + for entry in opt_value.split(":")) + if option.expr.strip())) elif opt_key == "nosync_query" and opt_value is not None: if is_with_block: raise LoopyError("'nosync' option may not be specified " "in a 'with' block") + match_expr, scope = parse_nosync_option(opt_value) + from loopy.match import parse_match - match = parse_match(opt_value) - # TODO: Come up with a syntax that allows the user to express - # different synchronization scopes. 
+ match = parse_match(match_expr) result["no_sync_with"] = result["no_sync_with"].union( - frozenset([(match, "any")])) + frozenset([(match, scope)])) elif opt_key == "groups" and opt_value is not None: result["groups"] = frozenset( @@ -372,7 +394,7 @@ ELSE_RE = re.compile("^\s*else\s*$") INSN_RE = re.compile( "^" "\s*" - "(?P<lhs>.+?)" + "(?P<lhs>[^{]+?)" "\s*(?<!\:)=\s*" "(?P<rhs>.+?)" "\s*?" @@ -426,7 +448,7 @@ def parse_insn(groups, insn_options): "the following error occurred:" % groups["rhs"]) raise - from pymbolic.primitives import Variable, Subscript + from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import TypeAnnotation if not isinstance(lhs, tuple): @@ -447,11 +469,15 @@ def parse_insn(groups, insn_options): else: temp_var_types.append(None) + inner_lhs_i = lhs_i + if isinstance(inner_lhs_i, Lookup): + inner_lhs_i = inner_lhs_i.aggregate + from loopy.symbolic import LinearSubscript - if isinstance(lhs_i, Variable): - assignee_names.append(lhs_i.name) - elif isinstance(lhs_i, (Subscript, LinearSubscript)): - assignee_names.append(lhs_i.aggregate.name) + if isinstance(inner_lhs_i, Variable): + assignee_names.append(inner_lhs_i.name) + elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): + assignee_names.append(inner_lhs_i.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " "be variable or subscript" % (lhs_i,)) @@ -995,7 +1021,7 @@ def parse_domains(domains, defines): # {{{ guess kernel args (if requested) -class IndexRankFinder(WalkMapper): +class IndexRankFinder(CSECachingMapperMixin, WalkMapper): def __init__(self, arg_name): self.arg_name = arg_name self.index_ranks = [] @@ -1012,6 +1038,13 @@ class IndexRankFinder(WalkMapper): else: self.index_ranks.append(len(expr.index)) + def map_common_subexpression_uncached(self, expr): + if not self.visit(expr): + return + + self.rec(expr.child) + self.post_visit(expr) + class ArgumentGuesser: def __init__(self, domains, instructions, temporary_variables, @@ -1392,11 +1425,11 @@ def create_temporaries(knl, default_order): # {{{ determine shapes of temporaries -def find_var_shape(knl, var_name, feed_expression): - from loopy.symbolic import AccessRangeMapper, SubstitutionRuleExpander +def find_shapes_of_vars(knl, var_names, feed_expression): + from loopy.symbolic import BatchedAccessRangeMapper, SubstitutionRuleExpander submap = SubstitutionRuleExpander(knl.substitutions) - armap = AccessRangeMapper(knl, var_name) + armap = BatchedAccessRangeMapper(knl, var_names) def run_through_armap(expr, inames): armap(submap(expr), inames) @@ -1404,61 +1437,105 @@ def find_var_shape(knl, var_name, feed_expression): feed_expression(run_through_armap) - if armap.access_range is not None: - base_indices, shape = list(zip(*[ - knl.cache_manager.base_index_and_length( - armap.access_range, i) - for i in range(armap.access_range.dim(dim_type.set))])) - else: - if armap.bad_subscripts: - raise RuntimeError("cannot determine access range for '%s': " - "undetermined index in subscript(s) '%s'" - % (var_name, ", ".join( - str(i) for i in armap.bad_subscripts))) + var_to_base_indices = {} + var_to_shape = {} + var_to_error = {} + + from loopy.diagnostic import StaticValueFindingError + + for var_name in var_names: + access_range = armap.access_ranges[var_name] + bad_subscripts = armap.bad_subscripts[var_name] + + if access_range is not None: + try: + base_indices, shape = list(zip(*[ + knl.cache_manager.base_index_and_length( + access_range, i) + for i in 
range(access_range.dim(dim_type.set))])) + except StaticValueFindingError as e: + var_to_error[var_name] = str(e) + continue + + else: + if bad_subscripts: + raise RuntimeError("cannot determine access range for '%s': " + "undetermined index in subscript(s) '%s'" + % (var_name, ", ".join( + str(i) for i in bad_subscripts))) + + # no subscripts found, let's call it a scalar + base_indices = () + shape = () - # no subscripts found, let's call it a scalar - base_indices = () - shape = () + var_to_base_indices[var_name] = base_indices + var_to_shape[var_name] = shape - return base_indices, shape + return var_to_base_indices, var_to_shape, var_to_error def determine_shapes_of_temporaries(knl): new_temp_vars = knl.temporary_variables.copy() import loopy as lp - from loopy.diagnostic import StaticValueFindingError - new_temp_vars = {} + vars_needing_shape_inference = set() + for tv in six.itervalues(knl.temporary_variables): if tv.shape is lp.auto or tv.base_indices is lp.auto: - def feed_all_expressions(receiver): - for insn in knl.instructions: - insn.with_transformed_expressions( - lambda expr: receiver(expr, knl.insn_inames(insn))) + vars_needing_shape_inference.add(tv.name) - def feed_assignee_of_instruction(receiver): - for insn in knl.instructions: - for assignee in insn.assignees: - receiver(assignee, knl.insn_inames(insn)) + def feed_all_expressions(receiver): + for insn in knl.instructions: + insn.with_transformed_expressions( + lambda expr: receiver(expr, knl.insn_inames(insn))) - try: - base_indices, shape = find_var_shape( - knl, tv.name, feed_all_expressions) - except StaticValueFindingError as e: - warn_with_kernel(knl, "temp_shape_fallback", - "Had to fall back to legacy method of determining " - "shape of temporary '%s' because: %s" - % (tv.name, str(e))) + var_to_base_indices, var_to_shape, var_to_error = ( + find_shapes_of_vars( + knl, vars_needing_shape_inference, feed_all_expressions)) + + # {{{ fall back to legacy method + + if len(var_to_error) > 0: + vars_needing_shape_inference = set(var_to_error.keys()) + + from six import iteritems + for varname, err in iteritems(var_to_error): + warn_with_kernel(knl, "temp_shape_fallback", + "Had to fall back to legacy method of determining " + "shape of temporary '%s' because: %s" + % (varname, err)) + + def feed_assignee_of_instruction(receiver): + for insn in knl.instructions: + for assignee in insn.assignees: + receiver(assignee, knl.insn_inames(insn)) - base_indices, shape = find_var_shape( - knl, tv.name, feed_assignee_of_instruction) + var_to_base_indices_fallback, var_to_shape_fallback, var_to_error = ( + find_shapes_of_vars( + knl, vars_needing_shape_inference, feed_assignee_of_instruction)) - if tv.base_indices is lp.auto: - tv = tv.copy(base_indices=base_indices) - if tv.shape is lp.auto: - tv = tv.copy(shape=shape) + if len(var_to_error) > 0: + # No way around errors: propagate an exception upward. 
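+        # Gather all per-variable failures into one combined error message.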
+ formatted_errors = ( + "\n\n".join("'%s': %s" % (varname, var_to_error[varname]) + for varname in sorted(var_to_error.keys()))) + raise LoopyError("got the following exception(s) trying to find the " + "shape of temporary variables: %s" % formatted_errors) + + var_to_base_indices.update(var_to_base_indices_fallback) + var_to_shape.update(var_to_shape_fallback) + + # }}} + + new_temp_vars = {} + + for tv in six.itervalues(knl.temporary_variables): + if tv.base_indices is lp.auto: + tv = tv.copy(base_indices=var_to_base_indices[tv.name]) + if tv.shape is lp.auto: + tv = tv.copy(shape=var_to_shape[tv.name]) new_temp_vars[tv.name] = tv return knl.copy(temporary_variables=new_temp_vars) @@ -1551,15 +1628,30 @@ def _resolve_dependencies(knl, insn, deps): new_deps = [] for dep in deps: + found_any = False + if isinstance(dep, MatchExpressionBase): for new_dep in find_instructions(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) + found_any = True else: from fnmatch import fnmatchcase for other_insn in knl.instructions: if fnmatchcase(other_insn.id, dep): new_deps.append(other_insn.id) + found_any = True + + if not found_any and knl.options.check_dep_resolution: + raise LoopyError("instruction '%s' declared a dependency on '%s', " + "which did not resolve to any instruction present in the " + "kernel '%s'. Set the kernel option 'check_dep_resolution' " + "to False to disable this check." % (insn.id, dep, knl.name)) + + for dep_id in new_deps: + if dep_id not in knl.id_to_insn: + raise LoopyError("instruction '%s' depends on instruction id '%s', " + "which was not found" % (insn.id, dep_id)) return frozenset(new_deps) @@ -1574,7 +1666,7 @@ def resolve_dependencies(knl): (resolved_insn_id, nosync_scope) for nosync_dep, nosync_scope in insn.no_sync_with for resolved_insn_id in - _resolve_dependencies(knl, insn, nosync_dep)), + _resolve_dependencies(knl, insn, (nosync_dep,))), )) return knl.copy(instructions=new_insns) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 5195e0a107ad4c7f944f2eb56b5f3b248b5a4997..001dd06326edcad14d8ecd39e29229dd45de8ef2 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -379,10 +379,10 @@ class TemporaryVariable(ArrayBase): "offset must be 0 if initializer specified" % name) + from loopy.types import NumpyType, to_loopy_type if dtype is auto or dtype is None: - from loopy.types import NumpyType dtype = NumpyType(initializer.dtype) - elif dtype.numpy_dtype != initializer.dtype: + elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype): raise LoopyError( "temporary variable '%s': " "dtype of initializer does not match " diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 7c82de1358f9bd11eb552a54a1e834317eb59ce5..0d22dbb88ed99c7c92480d1d39b924cc2198cc3f 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -222,6 +222,10 @@ class InstructionBase(ImmutableRecord): if within_inames_is_final is None: within_inames_is_final = False + if isinstance(depends_on, str): + depends_on = frozenset( + s.strip() for s in depends_on.split(",") if s.strip()) + if depends_on_is_final is None: depends_on_is_final = False @@ -388,10 +392,8 @@ class InstructionBase(ImmutableRecord): if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: - # TODO: Come up with a syntax to express different kinds of - # synchronization scopes.
result.append("nosync="+":".join( - insn_id for insn_id, _ in self.no_sync_with)) + "%s@%s" % entry for entry in self.no_sync_with)) if self.groups: result.append("groups=%s" % ":".join(self.groups)) if self.conflicts_with_groups: @@ -440,7 +442,8 @@ class InstructionBase(ImmutableRecord): from loopy.tools import intern_frozenset_of_ids - self.id = intern(self.id) + if self.id is not None: + self.id = intern(self.id) self.depends_on = intern_frozenset_of_ids(self.depends_on) self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( @@ -452,9 +455,12 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): - from pymbolic.primitives import Variable, Subscript + from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript + if isinstance(expr, Lookup): + expr = expr.aggregate + if isinstance(expr, Variable): return expr.name @@ -474,9 +480,12 @@ def _get_assignee_var_name(expr): def _get_assignee_subscript_deps(expr): - from pymbolic.primitives import Variable, Subscript + from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript, get_dependencies + if isinstance(expr, Lookup): + expr = expr.aggregate + if isinstance(expr, Variable): return frozenset() elif isinstance(expr, Subscript): @@ -767,9 +776,9 @@ class Assignment(MultiAssignmentBase): if isinstance(expression, str): expression = parse(expression) - from pymbolic.primitives import Variable, Subscript + from pymbolic.primitives import Variable, Subscript, Lookup from loopy.symbolic import LinearSubscript - if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + if not isinstance(assignee, (Variable, Subscript, LinearSubscript, Lookup)): raise LoopyError("invalid lvalue '%s'" % assignee) self.assignee = assignee @@ -990,9 +999,21 @@ class CallInstruction(MultiAssignmentBase): if field_name in ["assignees", "expression"]: key_builder.update_for_pymbolic_expression( key_hash, getattr(self, field_name)) + elif field_name == "predicates": + preds = sorted(self.predicates, key=str) + for pred in preds: + key_builder.update_for_pymbolic_expression( + key_hash, pred) else: key_builder.rec(key_hash, getattr(self, field_name)) + @property + def atomicity(self): + # Function calls can impossibly be atomic, and even the result assignment + # is troublesome, especially in the case of multiple results. Avoid the + # issue altogether by disallowing atomicity. + return () + # }}} @@ -1287,7 +1308,7 @@ class NoOpInstruction(_DataObliviousInstruction): class BarrierInstruction(_DataObliviousInstruction): """An instruction that requires synchronization with all - concurrent work items of :attr:`kind. + concurrent work items of :attr:`kind`. .. attribute:: kind diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 7545070261a9be7a8b5e3e5dde4fdce3e815f44a..2033425236836ecf000d6c341c46dcb8b087a29a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -439,7 +439,8 @@ class DomainChanger: # Changing the domain might look like it wants to change grid # sizes. Not true. 
# (Relevant for 'slab decomposition') - get_grid_sizes_for_insn_ids=self.kernel.get_grid_sizes_for_insn_ids) + overridden_get_grid_sizes_for_insn_ids=( + self.kernel.get_grid_sizes_for_insn_ids)) # }}} @@ -527,7 +528,9 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): lines.append("%s -> %s" % (insn_2, insn_1)) if iname_cluster: - from loopy.schedule import EnterLoop, LeaveLoop, RunInstruction, Barrier + from loopy.schedule import ( + EnterLoop, LeaveLoop, RunInstruction, Barrier, + CallKernel, ReturnFromKernel) for sched_item in kernel.schedule: if isinstance(sched_item, EnterLoop): @@ -537,7 +540,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): lines.append("}") elif isinstance(sched_item, RunInstruction): lines.append(sched_item.insn_id) - elif isinstance(sched_item, Barrier): + elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)): pass else: raise LoopyError("schedule item not unterstood: %r" % sched_item) @@ -1098,6 +1101,120 @@ def guess_var_shape(kernel, var_name): # }}} +# {{{ loop nest tracker + +class SetTrie(object): + """ + Similar to a trie, but uses an unordered sequence as the key. + """ + + def __init__(self, children=(), all_items=None): + self.children = dict(children) + # all_items should be shared within a trie. + if all_items is None: + self.all_items = set() + else: + self.all_items = all_items + + def descend(self, on_found=lambda prefix: None, prefix=frozenset()): + on_found(prefix) + from six import iteritems + for prefix, child in sorted( + iteritems(self.children), + key=lambda it: sorted(it[0])): + child.descend(on_found, prefix=prefix) + + def check_consistent_insert(self, items_to_insert): + if items_to_insert & self.all_items: + raise ValueError("inconsistent nesting") + + def add_or_update(self, key): + if len(key) == 0: + return + + from six import iteritems + + for child_key, child in iteritems(self.children): + common = child_key & key + if common: + break + else: + # Key not found - insert new child + self.check_consistent_insert(key) + self.children[frozenset(key)] = SetTrie(all_items=self.all_items) + self.all_items.update(key) + return + + if child_key <= key: + # child is a prefix of key: + child.add_or_update(key - common) + elif key < child_key: + # key is a strict prefix of child: + # + # -[new child] + # | + # [child] + # + del self.children[child_key] + self.children[common] = SetTrie( + children={frozenset(child_key - common): child}, + all_items=self.all_items) + else: + # key and child share a common prefix: + # + # -[new placeholder] + # / \ + # [new child] [child] + # + self.check_consistent_insert(key - common) + + del self.children[child_key] + self.children[common] = SetTrie( + children={ + frozenset(child_key - common): child, + frozenset(key - common): SetTrie(all_items=self.all_items)}, + all_items=self.all_items) + self.all_items.update(key - common) + + +def get_visual_iname_order_embedding(kernel): + """ + Return :class:`dict` `embedding` mapping inames to a totally ordered set of + values, such that `embedding[iname1] < embedding[iname2]` when `iname2` + is nested inside `iname1`. + """ + from loopy.kernel.data import IlpBaseTag + # Ignore ILP tagged inames, since they do not have to form a strict loop + # nest. 
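+    # (ILP inames are re-attached innermost at the end of this function.)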
+ ilp_inames = frozenset( + iname for iname in kernel.iname_to_tag + if isinstance(kernel.iname_to_tag[iname], IlpBaseTag)) + + iname_trie = SetTrie() + + for insn in kernel.instructions: + within_inames = set( + iname for iname in insn.within_inames + if iname not in ilp_inames) + iname_trie.add_or_update(within_inames) + + embedding = {} + + def update_embedding(inames): + embedding.update( + dict((iname, (len(embedding), iname)) for iname in inames)) + + iname_trie.descend(update_embedding) + + for iname in ilp_inames: + # Nest ilp_inames innermost, so they don't interrupt visual order. + embedding[iname] = (len(embedding), iname) + + return embedding + +# }}} + + # {{{ find_recursive_dependencies def find_recursive_dependencies(kernel, insn_ids): @@ -1183,7 +1300,7 @@ def draw_dependencies_as_unicode_arrows( def make_extender(): result = n_columns[0] * [" "] for col, (_, updown) in six.iteritems(columns_in_use): - result[col] = do_flag_downward("│", updown) + result[col] = do_flag_downward(u"│", updown) return result @@ -1198,7 +1315,7 @@ def draw_dependencies_as_unicode_arrows( if dep_key not in dep_to_column: col = dep_to_column[dep_key] = find_free_column() columns_in_use[col] = (rdep, "up") - row[col] = "↱" + row[col] = u"↱" for dep in insn.depends_on: assert dep != insn.id @@ -1206,15 +1323,15 @@ def draw_dependencies_as_unicode_arrows( if dep_key not in dep_to_column: col = dep_to_column[dep_key] = find_free_column() columns_in_use[col] = (dep, "down") - row[col] = do_flag_downward("┌", "down") + row[col] = do_flag_downward(u"┌", "down") for col, (end, updown) in list(six.iteritems(columns_in_use)): if insn.id == end: del columns_in_use[col] if updown == "up": - row[col] = "└" + row[col] = u"└" else: - row[col] = do_flag_downward("↳", updown) + row[col] = do_flag_downward(u"↳", updown) extender = make_extender() @@ -1224,17 +1341,30 @@ def draw_dependencies_as_unicode_arrows( uniform_length = min(n_columns[0], max_columns) + added_ellipsis = [False] + def conform_to_uniform_length(s): if len(s) <= uniform_length: return s + " "*(uniform_length-len(s)) else: - return s[:uniform_length] + "..." + added_ellipsis[0] = True + return s[:uniform_length] + u"…" - return [ + rows = [ (conform_to_uniform_length(row), conform_to_uniform_length(extender)) for row, extender in rows] + if added_ellipsis[0]: + uniform_length += 1 + + rows = [ + (conform_to_uniform_length(row), + conform_to_uniform_length(extender)) + for row, extender in rows] + + return rows + # }}} # vim: foldmethod=marker diff --git a/loopy/options.py b/loopy/options.py index 7c778681dced904b31e0cc39cff529b8c026640d..25bb7014ce07a30c49f7f78d5a6325eaba36291d 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -95,6 +95,11 @@ class Options(ImmutableRecord): determining whether an iname duplication is necessary for the kernel to be schedulable. + .. attribute:: check_dep_resolution + + Whether loopy should issue an error if a dependency + expression does not match any instructions in the kernel. + .. rubric:: Invocation-related options .. 
attribute:: skip_arg_checks @@ -200,6 +205,7 @@ class Options(ImmutableRecord): allow_terminal_colors_def), disable_global_barriers=kwargs.get("disable_global_barriers", False), + check_dep_resolution=kwargs.get("check_dep_resolution", True), ) # {{{ legacy compatibility diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6b5488a20bc9d714fb5fde908b559ddebf4b9591..0d8e771954cf26cc11747e745946389420fa5e1b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -417,6 +417,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # }}} + neutral_var_names = [ + var_name_gen("neutral_"+red_iname) + for i in range(nresults)] acc_var_names = [ var_name_gen("acc_"+red_iname) for i in range(nresults)] @@ -429,6 +432,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): shape=outer_local_iname_sizes + (size,), dtype=dtype, scope=temp_var_scope.LOCAL) + for name, dtype in zip(neutral_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=(), + dtype=dtype, + scope=temp_var_scope.PRIVATE) base_iname_deps = outer_insn_inames - frozenset(expr.inames) @@ -446,6 +455,22 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): depends_on=frozenset()) generated_insns.append(init_insn) + def _strip_if_scalar(c): + if len(acc_vars) == 1: + return c[0] + else: + return c + + init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) + init_neutral_insn = make_assignment( + id=init_neutral_id, + assignees=tuple(var(nvn) for nvn in neutral_var_names), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset()) + generated_insns.append(init_neutral_insn) + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) transfer_insn = make_assignment( id=transfer_id, @@ -453,21 +478,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, neutral, expr.expr, expr.inames), + arg_dtype, + _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)), + expr.expr, expr.inames), within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([init_id]) | insn.depends_on, + depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, no_sync_with=frozenset([(init_id, "any")])) generated_insns.append(transfer_insn) - def _strip_if_scalar(c): - if len(acc_vars) == 1: - return c[0] - else: - return c - cur_size = 1 while cur_size < size: cur_size *= 2 @@ -493,15 +514,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): for acc_var in acc_vars), expression=expr.operation( arg_dtype, - _strip_if_scalar([ + _strip_if_scalar(tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars]), - _strip_if_scalar([ + for acc_var in acc_vars)), + _strip_if_scalar(tuple( acc_var[ outer_local_iname_vars + ( var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars]), + for acc_var in acc_vars)), expr.inames), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), @@ -518,7 +539,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): new_insn_add_depends_on.add(prev_id) new_insn_add_no_sync_with.add((prev_id, "any")) - 
new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) + new_insn_add_within_inames.add(base_exec_iname or stage_exec_iname) if nresults == 1: assert len(acc_vars) == 1 @@ -738,52 +759,43 @@ def find_idempotence(kernel): (insn.id, insn.read_dependency_names() & var_names) for insn in kernel.instructions) - non_idempotently_updated_vars = set() - - # FIXME: This can be made more efficient by simply starting - # from all written variables and not even considering - # instructions as the start of the first pass. + from collections import defaultdict + dep_graph = defaultdict(lambda: set()) - new_insns = [] for insn in kernel.instructions: - all_my_var_writers = set() - for var in reads_map[insn.id]: - var_writers = writer_map.get(var, set()) - all_my_var_writers |= var_writers + dep_graph[insn.id] = set(writer_id + for var in reads_map[insn.id] + for writer_id in writer_map.get(var, set())) - # {{{ find dependency loops, flag boostability + # Find SCCs of dep_graph. These are used for checking if the instruction is + # in a dependency cycle. + from loopy.tools import compute_sccs - while True: - last_all_my_var_writers = all_my_var_writers + sccs = dict((item, scc) + for scc in compute_sccs(dep_graph) + for item in scc) - for writer_insn_id in last_all_my_var_writers: - for var in reads_map[writer_insn_id]: - all_my_var_writers = \ - all_my_var_writers | writer_map.get(var, set()) - - if last_all_my_var_writers == all_my_var_writers: - break - - # }}} + non_idempotently_updated_vars = set() - boostable = insn.id not in all_my_var_writers + new_insns = [] + for insn in kernel.instructions: + boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id] if not boostable: non_idempotently_updated_vars.update( insn.assignee_var_names()) - insn = insn.copy(boostable=boostable) - - new_insns.append(insn) + new_insns.append(insn.copy(boostable=boostable)) # {{{ remove boostability from isns that access non-idempotently updated vars new2_insns = [] for insn in new_insns: - accessed_vars = insn.dependency_names() - boostable = insn.boostable and not bool( - non_idempotently_updated_vars & accessed_vars) - new2_insns.append(insn.copy(boostable=boostable)) + if insn.boostable and bool( + non_idempotently_updated_vars & insn.dependency_names()): + new2_insns.append(insn.copy(boostable=False)) + else: + new2_insns.append(insn) # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index e23e5f350864a623d2ff05551fc2d559c361d734..c078da2ec58dabbbf646bfcf593ea0138941cc85 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1384,172 +1384,251 @@ class DependencyRecord(ImmutableRecord): var_kind=var_kind) -def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): - """If there exists a dependency between target and source and the two access - a common variable of *var_kind* in a way that requires a barrier (essentially, - at least one write), then the function will return a tuple - ``(target, source, var_name)``. Otherwise, it will return *None*. - - This function finds direct or indirect instruction dependencies, but does - not attempt to guess dependencies that exist based on common access to - variables. - - :arg reverse: a :class:`bool` indicating whether - forward or reverse dependencies are sought. (see above) - :arg var_kind: "global" or "local", the kind of variable based on which - barrier-needing dependencies should be found. 
+class DependencyTracker(object): """ + A utility to help track dependencies originating from a set + of sources (as defined by :meth:`add_source`). For each target, + dependencies can then be obtained using :meth:`gen_dependencies_with_target_at`. - # If target or source are insn IDs, look up the actual instructions. - from loopy.kernel.data import InstructionBase - if not isinstance(source, InstructionBase): - source = kernel.id_to_insn[source] - if not isinstance(target, InstructionBase): - target = kernel.id_to_insn[target] - - if reverse: - source, target = target, source - - if source.id in kernel.get_nosync_set(target.id, var_kind): - return - - # {{{ check that a dependency exists - - dep_descr = None + .. automethod:: add_source + .. automethod:: gen_dependencies_with_target_at + """ - target_deps = kernel.recursive_insn_dep_map()[target.id] - if source.id in target_deps: - if reverse: - dep_descr = "{src} rev-depends on {tgt}" + def __init__(self, kernel, var_kind, reverse): + """ + :arg var_kind: "global" or "local", the kind of variable based on which + barrier-needing dependencies should be found. + :arg reverse: + In straight-line code, this only tracks 'b depends on + a'-type 'forward' dependencies. But a loop of the type:: + + for i in range(10): + A + B + + effectively glues multiple copies of 'A;B' one after the other:: + + A + B + A + B + ... + + Now, if B depends on (i.e. is required to be textually before) A in a + way requiring a barrier, then we will assume that the reverse + dependency exists as well, i.e. a barrier between the tail end of + execution of B and the next beginning of A is also needed. + + Setting *reverse* to *True* tracks these reverse (instead of forward) + dependencies. + """ + self.kernel = kernel + self.reverse = reverse + self.var_kind = var_kind + + if var_kind == "local": + self.relevant_vars = kernel.local_var_names() + elif var_kind == "global": + self.relevant_vars = kernel.global_var_names() else: - dep_descr = "{tgt} depends on {src}" - - grps = source.groups & target.conflicts_with_groups - if grps: - dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) - - grps = target.groups & source.conflicts_with_groups - if grps: - dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) + raise ValueError("unknown 'var_kind': %s" % var_kind) - if not dep_descr: - return None + from collections import defaultdict + self.writer_map = defaultdict(lambda: set()) + self.reader_map = defaultdict(lambda: set()) + self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map() - # }}} - - if var_kind == "local": - relevant_vars = kernel.local_var_names() - elif var_kind == "global": - relevant_vars = kernel.global_var_names() - else: - raise ValueError("unknown 'var_kind': %s" % var_kind) - - temp_to_base_storage = kernel.get_temporary_to_base_storage_map() - - def map_to_base_storage(var_names): + def map_to_base_storage(self, var_names): result = set(var_names) for name in var_names: - bs = temp_to_base_storage.get(name) + bs = self.temp_to_base_storage.get(name) if bs is not None: result.add(bs) return result - tgt_write = map_to_base_storage( - set(target.assignee_var_names()) & relevant_vars) - tgt_read = map_to_base_storage( - target.read_dependency_names() & relevant_vars) + def discard_all_sources(self): + self.writer_map.clear() + self.reader_map.clear() - src_write = map_to_base_storage( - set(source.assignee_var_names()) & relevant_vars) - src_read = map_to_base_storage( -
source.read_dependency_names() & relevant_vars) + def add_source(self, source): + """ + Specify that an instruction may be used as the source of a dependency edge. + """ + # If source is an insn ID, look up the actual instruction. + source = self.kernel.id_to_insn.get(source, source) - waw = tgt_write & src_write - raw = tgt_read & src_write - war = tgt_write & src_read + for written in self.map_to_base_storage( + set(source.assignee_var_names()) & self.relevant_vars): + self.writer_map[written].add(source.id) - for var_name in sorted(raw | war): - return DependencyRecord( - source=source, - target=target, - dep_descr=dep_descr, - variable=var_name, - var_kind=var_kind) + for read in self.map_to_base_storage( + source.read_dependency_names() & self.relevant_vars): + self.reader_map[read].add(source.id) - if source is target: - return None + def gen_dependencies_with_target_at(self, target): + """ + Generate :class:`DependencyRecord` instances for dependencies edges + whose target is the given instruction. - for var_name in sorted(waw): - return DependencyRecord( - source=source, - target=target, - dep_descr=dep_descr, - variable=var_name, - var_kind=var_kind) + :arg target: The ID of the instruction for which dependencies + with conflicting var access should be found. + """ + # If target is an insn ID, look up the actual instruction. + target = self.kernel.id_to_insn.get(target, target) + + tgt_write = self.map_to_base_storage( + set(target.assignee_var_names()) & self.relevant_vars) + tgt_read = self.map_to_base_storage( + target.read_dependency_names() & self.relevant_vars) + + for (accessed_vars, accessor_map) in [ + (tgt_read, self.writer_map), + (tgt_write, self.reader_map), + (tgt_write, self.writer_map)]: + + for dep in self.get_conflicting_accesses( + accessed_vars, accessor_map, target.id): + yield dep + + def get_conflicting_accesses( + self, accessed_vars, var_to_accessor_map, target): + + def determine_conflict_nature(source, target): + if (not self.reverse and source in + self.kernel.get_nosync_set(target, scope=self.var_kind)): + return None + if (self.reverse and target in + self.kernel.get_nosync_set(source, scope=self.var_kind)): + return None + return self.describe_dependency(source, target) + + for var in sorted(accessed_vars): + for source in sorted(var_to_accessor_map[var]): + dep_descr = determine_conflict_nature(source, target) + + if dep_descr is None: + continue - return None + yield DependencyRecord( + source=self.kernel.id_to_insn[source], + target=self.kernel.id_to_insn[target], + dep_descr=dep_descr, + variable=var, + var_kind=self.var_kind) + def describe_dependency(self, source, target): + dep_descr = None -def barrier_kind_more_or_equally_global(kind1, kind2): - return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") + source = self.kernel.id_to_insn[source] + target = self.kernel.id_to_insn[target] + if self.reverse: + source, target = target, source -def get_tail_starting_at_last_barrier(schedule, kind): - result = [] + target_deps = self.kernel.recursive_insn_dep_map()[target.id] + if source.id in target_deps: + if self.reverse: + dep_descr = "{tgt} rev-depends on {src}" + else: + dep_descr = "{tgt} depends on {src}" - for sched_item in reversed(schedule): - if isinstance(sched_item, Barrier): - if barrier_kind_more_or_equally_global(sched_item.kind, kind): - break + grps = source.groups & target.conflicts_with_groups + if not grps: + grps = target.groups & source.conflicts_with_groups - elif isinstance(sched_item, RunInstruction): - 
result.append(sched_item.insn_id) + if grps: + dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps) - elif isinstance(sched_item, (EnterLoop, LeaveLoop)): - pass + return dep_descr - elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): - pass - else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) +def barrier_kind_more_or_equally_global(kind1, kind2): + return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") - return reversed(result) +def insn_ids_reaching_end_without_intervening_barrier(schedule, kind): + return _insn_ids_reaching_end(schedule, kind, reverse=False) -def insn_ids_from_schedule(schedule): - result = [] - for sched_item in reversed(schedule): - if isinstance(sched_item, RunInstruction): - result.append(sched_item.insn_id) - elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel, - ReturnFromKernel)): - pass +def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): + return _insn_ids_reaching_end(schedule, kind, reverse=True) - else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) - return result +def _insn_ids_reaching_end(schedule, kind, reverse): + if reverse: + schedule = reversed(schedule) + enter_scope_item_kind = LeaveLoop + leave_scope_item_kind = EnterLoop + else: + enter_scope_item_kind = EnterLoop + leave_scope_item_kind = LeaveLoop + + insn_ids_alive_at_scope = [set()] + + for sched_item in schedule: + if isinstance(sched_item, enter_scope_item_kind): + insn_ids_alive_at_scope.append(set()) + elif isinstance(sched_item, leave_scope_item_kind): + innermost_scope = insn_ids_alive_at_scope.pop() + # Instructions in deeper scopes are alive but could be killed by + # barriers at a shallower level, e.g.: + # + # for i + # insn0 + # end + # barrier() <= kills insn0 + # + # Hence we merge this scope into the parent scope. + insn_ids_alive_at_scope[-1].update(innermost_scope) + elif isinstance(sched_item, Barrier): + # This barrier kills only the instruction ids that are alive at + # the current scope (or deeper). 
Without further analysis, we + # can't assume that instructions at shallower scope can be + # killed by deeper barriers, since loops might be empty, e.g.: + # + # insn0 <= isn't killed by barrier (i loop could be empty) + # for i + # insn1 <= is killed by barrier + # for j + # insn2 <= is killed by barrier + # end + # barrier() + # end + if barrier_kind_more_or_equally_global(sched_item.kind, kind): + insn_ids_alive_at_scope[-1].clear() + else: + insn_ids_alive_at_scope[-1] |= set( + insn_id for insn_id in sched_item_to_insn_id(sched_item)) + + assert len(insn_ids_alive_at_scope) == 1 + return insn_ids_alive_at_scope[-1] + + +def append_barrier_or_raise_error(schedule, dep, verify_only): + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no " + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + dep.var_kind)) + else: + comment = "for %s (%s)" % ( + dep.variable, dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id)) + schedule.append(Barrier(comment=comment, kind=dep.var_kind)) -def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): +def insert_barriers(kernel, schedule, kind, verify_only, level=0): """ - :arg reverse: a :class:`bool`. For ``level > 0``, this function should be - called twice, first with ``reverse=False`` to insert barriers for - forward dependencies, and then again with ``reverse=True`` to insert - reverse depedencies. This order is preferable because the forward pass - will limit the number of instructions that need to be considered as - depedency source candidates by already inserting some number of - barriers into *schedule*. - - Calling it with ``reverse==True and level==0` is not necessary, - since the root of the schedule is in no loop, therefore not repeated, - and therefore reverse dependencies don't need to be added. :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted. Generally, this function will be called once for each kind of barrier at the top level, where more global barriers should be inserted first. @@ -1557,184 +1636,118 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): missing. :arg level: the current level of loop nesting, 0 for outermost. """ - result = [] - - # In straight-line code, we have only 'b depends on a'-type 'forward' - # dependencies. But a loop of the type - # - # for i in range(10): - # A - # B - # - # effectively glues multiple copies of 'A;B' one after the other: - # - # A - # B - # A - # B - # ... - # - # Now, if B depends on (i.e. is required to be textually before) A in a way - # requiring a barrier, then we will assume that the reverse dependency exists - # as well, i.e. a barrier between the tail end fo execution of B and the next - # beginning of A is also needed. - - if level == 0 and reverse: - # The global schedule is in no loop, therefore not repeated, and - # therefore reverse dependencies don't need to be added. - return schedule - - # a list of instruction IDs that could lead to barrier-needing dependencies. 
- if reverse: - candidates = set(get_tail_starting_at_last_barrier(schedule, kind)) - else: - candidates = set() - - past_first_barrier = [False] - - def seen_barrier(): - past_first_barrier[0] = True - # We've just gone across a barrier, so anything that needed - # one from above just got one. + # {{{ insert barriers at outermost scheduling level - candidates.clear() + def insert_barriers_at_outer_level(schedule, reverse=False): + dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse) - def issue_barrier(dep): - seen_barrier() + if reverse: + # Populate the dependency tracker with sources from the tail end of + # the schedule block. + for insn_id in ( + insn_ids_reaching_end_without_intervening_barrier( + schedule, kind)): + dep_tracker.add_source(insn_id) - comment = None - if dep is not None: - comment = "for %s (%s)" % ( - dep.variable, dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id)) + result = [] - result.append(Barrier(comment=comment, kind=dep.var_kind)) + i = 0 + while i < len(schedule): + sched_item = schedule[i] - i = 0 - while i < len(schedule): - sched_item = schedule[i] + if isinstance(sched_item, EnterLoop): + subloop, new_i = gather_schedule_block(schedule, i) - if isinstance(sched_item, EnterLoop): - # {{{ recurse for nested loop + loop_head = ( + insn_ids_reachable_from_start_without_intervening_barrier( + subloop, kind)) - subloop, new_i = gather_schedule_block(schedule, i) - i = new_i + loop_tail = ( + insn_ids_reaching_end_without_intervening_barrier( + subloop, kind)) - # Run barrier insertion for inner loop - subresult = subloop[1:-1] - for sub_reverse in [False, True]: - subresult = insert_barriers( - kernel, subresult, - reverse=sub_reverse, kind=kind, - verify_only=verify_only, - level=level+1) + # Checks if a barrier is needed before the loop. This handles + # dependencies with targets that can be reached without an + # intervening barrier from the start of the loop: + # + # a[x] = ... <= source + # for i + # ... = a[y] <= target + # barrier() + # ... + from itertools import chain + for dep in chain.from_iterable( + dep_tracker.gen_dependencies_with_target_at(insn) + for insn in loop_head): + append_barrier_or_raise_error(result, dep, verify_only) + # This barrier gets inserted outside the loop, hence it is + # executed unconditionally and so kills all sources before + # the loop. + dep_tracker.discard_all_sources() + break - # {{{ find barriers in loop body + result.extend(subloop) - first_barrier_index = None - last_barrier_index = None + # Handle dependencies with sources not killed inside the loop: + # + # for i + # ... + # barrier() + # b[i] = ... <= source + # end for + # ... 
= f(b) <= target + for item in loop_tail: + dep_tracker.add_source(item) + + i = new_i + + elif isinstance(sched_item, Barrier): + result.append(sched_item) + if barrier_kind_more_or_equally_global(sched_item.kind, kind): + dep_tracker.discard_all_sources() + i += 1 + + elif isinstance(sched_item, RunInstruction): + for dep in dep_tracker.gen_dependencies_with_target_at( + sched_item.insn_id): + append_barrier_or_raise_error(result, dep, verify_only) + dep_tracker.discard_all_sources() + break + result.append(sched_item) + dep_tracker.add_source(sched_item.insn_id) + i += 1 - for j, sub_sched_item in enumerate(subresult): - if (isinstance(sub_sched_item, Barrier) and - barrier_kind_more_or_equally_global( - sub_sched_item.kind, kind)): + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + result.append(sched_item) + i += 1 - seen_barrier() - last_barrier_index = j - if first_barrier_index is None: - first_barrier_index = j + else: + raise ValueError("unexpected schedule item type '%s'" + % type(sched_item).__name__) - # }}} + return result - # {{{ check if a barrier is needed before the loop - - # (for leading (before-first-barrier) bit of loop body) - for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]): - search_set = sorted(candidates) - - for dep_src_insn_id in search_set: - dep = get_barrier_needing_dependency( - kernel, - target=insn_id, - source=dep_src_insn_id, - reverse=reverse, var_kind=kind) - if dep: - if verify_only: - from loopy.diagnostic import MissingBarrierError - raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " - "requires synchronization " - "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no" - "synchronization is needed)" - % ( - dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id), - dep.variable, - kind)) - else: - issue_barrier(dep=dep) - break + # }}} - # }}} + # {{{ recursively insert barriers in loops - # add trailing end (past-last-barrier) of loop body to candidates - if last_barrier_index is None: - candidates.update(insn_ids_from_schedule(subresult)) - else: - candidates.update( - insn_ids_from_schedule( - subresult[last_barrier_index+1:])) + result = [] + i = 0 + while i < len(schedule): + sched_item = schedule[i] + if isinstance(sched_item, EnterLoop): + subloop, new_i = gather_schedule_block(schedule, i) + new_subloop = insert_barriers( + kernel, subloop[1:-1], kind, verify_only, level + 1) result.append(subloop[0]) - result.extend(subresult) + result.extend(new_subloop) result.append(subloop[-1]) + i = new_i - # }}} - - elif isinstance(sched_item, Barrier): - i += 1 - - if barrier_kind_more_or_equally_global(sched_item.kind, kind): - seen_barrier() - - result.append(sched_item) - - elif isinstance(sched_item, RunInstruction): - i += 1 - - search_set = sorted(candidates) - - for dep_src_insn_id in search_set: - dep = get_barrier_needing_dependency( - kernel, - target=sched_item.insn_id, - source=dep_src_insn_id, - reverse=reverse, var_kind=kind) - if dep: - if verify_only: - from loopy.diagnostic import MissingBarrierError - raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " - "requires synchronization " - "by a %s barrier (add a 'no_sync_with' " - "instruction option to state that no " - "synchronization is needed)" - % ( - dep.dep_descr.format( - tgt=dep.target.id, src=dep.source.id), - dep.variable, - kind)) - - else: - issue_barrier(dep=dep) - break - - result.append(sched_item) - candidates.add(sched_item.insn_id) - - elif isinstance(sched_item, 
(CallKernel, ReturnFromKernel)):
+        elif isinstance(sched_item,
+                (Barrier, RunInstruction, CallKernel, ReturnFromKernel)):
             result.append(sched_item)
             i += 1
 
@@ -1742,13 +1755,13 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
 
-        if past_first_barrier[0] and reverse:
-            # We can quit here, because we're only trying add
-            # reverse-dep barriers to the beginning of the loop, up to
-            # the first barrier.
+    # }}}
+
+    result = insert_barriers_at_outer_level(result)
 
-            result.extend(schedule[i:])
-            break
+    # Level 0 is not inside any loop, hence not repeated, and so it needs
+    # no barriers for reverse (wrap-around) dependencies.
+    if level != 0:
+        result = insert_barriers_at_outer_level(result, reverse=True)
 
     return result
 
@@ -1897,11 +1910,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
     if not kernel.options.disable_global_barriers:
         logger.info("%s: barrier insertion: global" % kernel.name)
         gen_sched = insert_barriers(kernel, gen_sched,
-                reverse=False, kind="global", verify_only=True)
+                kind="global", verify_only=True)
 
     logger.info("%s: barrier insertion: local" % kernel.name)
-    gen_sched = insert_barriers(kernel, gen_sched,
-            reverse=False, kind="local", verify_only=False)
+    gen_sched = insert_barriers(kernel, gen_sched, kind="local",
+            verify_only=False)
 
     logger.info("%s: barrier insertion: done" % kernel.name)
 
     new_kernel = kernel.copy(
@@ -1959,7 +1972,7 @@ def get_one_scheduled_kernel(kernel):
 
     if CACHING_ENABLED:
         try:
-            result, ambiguous = schedule_cache[sched_cache_key]
+            result = schedule_cache[sched_cache_key]
 
             logger.debug("%s: schedule cache hit" % kernel.name)
             from_cache = True
@@ -1967,38 +1980,18 @@
             pass
 
     if not from_cache:
-        ambiguous = False
-
-        kernel_count = 0
-
         from time import time
         start_time = time()
 
         logger.info("%s: schedule start" % kernel.name)
 
-        for scheduled_kernel in generate_loop_schedules(kernel):
-            kernel_count += 1
-
-            if kernel_count == 1:
-                # use the first schedule
-                result = scheduled_kernel
-
-            if kernel_count == 2:
-                ambiguous = True
-                break
+        result = next(iter(generate_loop_schedules(kernel)))
 
         logger.info("%s: scheduling done after %.2f s" % (
            kernel.name, time()-start_time))
 
-    if ambiguous:
-        from warnings import warn
-        from loopy.diagnostic import LoopyWarning
-        warn("scheduling for kernel '%s' was ambiguous--more than one "
-                "schedule found, ignoring" % kernel.name, LoopyWarning,
-                stacklevel=2)
-
     if CACHING_ENABLED and not from_cache:
-        schedule_cache[sched_cache_key] = result, ambiguous
+        schedule_cache[sched_cache_key] = result
 
     return result
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 52fd6e57f92e7f9599a3a0fb4256f97347708303..50c891be476810887720c4e13c9659966b431f5d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -38,6 +38,7 @@ from pymbolic.mapper import (
         IdentityMapper as IdentityMapperBase,
         WalkMapper as WalkMapperBase,
         CallbackMapper as CallbackMapperBase,
+        CSECachingMapperMixin,
         )
 from pymbolic.mapper.evaluator import \
         EvaluationMapper as EvaluationMapperBase
@@ -113,10 +114,14 @@ class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
 
 
-class PartialEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
+class PartialEvaluationMapper(
+        EvaluationMapperBase, CSECachingMapperMixin, IdentityMapperMixin):
     def map_variable(self, expr):
         return expr
 
+    def map_common_subexpression_uncached(self, expr):
+        return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
+
 
 class WalkMapper(WalkMapperBase):
     def map_literal(self, expr, *args):
@@ 
-162,8 +167,10 @@ class CombineMapper(CombineMapperBase): map_linear_subscript = CombineMapperBase.map_subscript -class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin): - pass +class SubstitutionMapper( + CSECachingMapperMixin, SubstitutionMapperBase, IdentityMapperMixin): + def map_common_subexpression_uncached(self, expr): + return type(expr)(self.rec(expr.child), expr.prefix, expr.scope) class ConstantFoldingMapper(ConstantFoldingMapperBase, @@ -1471,12 +1478,13 @@ def get_access_range(domain, subscript, assumptions): # {{{ access range mapper -class AccessRangeMapper(WalkMapper): - def __init__(self, kernel, arg_name): +class BatchedAccessRangeMapper(WalkMapper): + + def __init__(self, kernel, arg_names): self.kernel = kernel - self.arg_name = arg_name - self.access_range = None - self.bad_subscripts = [] + self.arg_names = set(arg_names) + self.access_ranges = dict((arg, None) for arg in arg_names) + self.bad_subscripts = dict((arg, []) for arg in arg_names) def map_subscript(self, expr, inames): domain = self.kernel.get_inames_domain(inames) @@ -1484,38 +1492,58 @@ class AccessRangeMapper(WalkMapper): assert isinstance(expr.aggregate, p.Variable) - if expr.aggregate.name != self.arg_name: + if expr.aggregate.name not in self.arg_names: return + arg_name = expr.aggregate.name subscript = expr.index_tuple if not get_dependencies(subscript) <= set(domain.get_var_dict()): - self.bad_subscripts.append(expr) + self.bad_subscripts[arg_name].append(expr) return access_range = get_access_range(domain, subscript, self.kernel.assumptions) - if self.access_range is None: - self.access_range = access_range + if self.access_ranges[arg_name] is None: + self.access_ranges[arg_name] = access_range else: - if (self.access_range.dim(dim_type.set) + if (self.access_ranges[arg_name].dim(dim_type.set) != access_range.dim(dim_type.set)): raise RuntimeError( "error while determining shape of argument '%s': " "varying number of indices encountered" - % self.arg_name) + % arg_name) - self.access_range = self.access_range | access_range + self.access_ranges[arg_name] = ( + self.access_ranges[arg_name] | access_range) def map_linear_subscript(self, expr, inames): self.rec(expr.index, inames) - if expr.aggregate.name == self.arg_name: - self.bad_subscripts.append(expr) + if expr.aggregate.name in self.arg_names: + self.bad_subscripts[expr.aggregate.name].append(expr) def map_reduction(self, expr, inames): return WalkMapper.map_reduction(self, expr, inames | set(expr.inames)) + +class AccessRangeMapper(object): + + def __init__(self, kernel, arg_name): + self.arg_name = arg_name + self.inner_mapper = BatchedAccessRangeMapper(kernel, [arg_name]) + + def __call__(self, expr, inames): + return self.inner_mapper(expr, inames) + + @property + def access_range(self): + return self.inner_mapper.access_ranges[self.arg_name] + + @property + def bad_subscripts(self): + return self.inner_mapper.bad_subscripts[self.arg_name] + # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 61e8e4f396126e17123c1bf775dbfeee2fe21f0d..a8f47adb991e331f8a473c4eb14b1ea634c7a3b1 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -669,8 +669,6 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): import pyopencl as cl - logger.info("%s: opencl compilation start" % self.kernel.name) - cl_program = ( cl.Program(self.context, dev_code) .build(options=kernel.options.cl_build_options)) @@ -679,8 +677,6 @@ class 
PyOpenCLKernelExecutor(KernelExecutorBase): for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) - logger.info("%s: opencl compilation done" % self.kernel.name) - return _CLKernelInfo( kernel=kernel, cl_kernels=cl_kernels, diff --git a/loopy/tools.py b/loopy/tools.py index ae370d5aaac9ff75f530e1d0951a2f904b686e42..56b673b597fc3bf43a6b03f87607ea8d3db0866a 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -281,6 +281,128 @@ def empty_aligned(shape, dtype, order='C', n=64): # }}} +# {{{ compute SCCs with Tarjan's algorithm + +def compute_sccs(graph): + to_search = set(graph.keys()) + visit_order = {} + scc_root = {} + sccs = [] + + while to_search: + top = next(iter(to_search)) + call_stack = [(top, iter(graph[top]), None)] + visit_stack = [] + visiting = set() + + scc = [] + + while call_stack: + top, children, last_popped_child = call_stack.pop() + + if top not in visiting: + # Unvisited: mark as visited, initialize SCC root. + count = len(visit_order) + visit_stack.append(top) + visit_order[top] = count + scc_root[top] = count + visiting.add(top) + to_search.discard(top) + + # Returned from a recursion, update SCC. + if last_popped_child is not None: + scc_root[top] = min( + scc_root[top], + scc_root[last_popped_child]) + + for child in children: + if child not in visit_order: + # Recurse. + call_stack.append((top, children, child)) + call_stack.append((child, iter(graph[child]), None)) + break + if child in visiting: + scc_root[top] = min( + scc_root[top], + visit_order[child]) + else: + if scc_root[top] == visit_order[top]: + scc = [] + while visit_stack[-1] != top: + scc.append(visit_stack.pop()) + scc.append(visit_stack.pop()) + for item in scc: + visiting.remove(item) + sccs.append(scc) + + return sccs + +# }}} + + +# {{{ lazily unpickling dictionary + + +class _PickledObjectWrapper(object): + """ + A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`). + """ + + @classmethod + def from_object(cls, obj): + if isinstance(obj, cls): + return obj + from pickle import dumps + return cls(dumps(obj)) + + def __init__(self, objstring): + self.objstring = objstring + + def unpickle(self): + from pickle import loads + return loads(self.objstring) + + def __getstate__(self): + return {"objstring": self.objstring} + + +import collections + + +class LazilyUnpicklingDictionary(collections.MutableMapping): + """ + A dictionary-like object which lazily unpickles its values. 
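+
+    Pickling an instance (via :meth:`__getstate__`) stores each value in
+    pickled form; a stored value is unpickled, and the result cached in
+    place of the pickled string, the first time it is looked up.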
+ """ + + def __init__(self, *args, **kwargs): + self._map = dict(*args, **kwargs) + + def __getitem__(self, key): + value = self._map[key] + if isinstance(value, _PickledObjectWrapper): + value = self._map[key] = value.unpickle() + return value + + def __setitem__(self, key, value): + self._map[key] = value + + def __delitem__(self, key): + del self._map[key] + + def __len__(self): + return len(self._map) + + def __iter__(self): + return iter(self._map) + + def __getstate__(self): + return {"_map": dict( + (key, _PickledObjectWrapper.from_object(val)) + for key, val in six.iteritems(self._map))} + +# }}} + + def is_interned(s): return s is None or intern(s) is s diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 88e3898e2cceeeb62edea306283fcb718c3b088d..e7a86300f9d040cba1688e5bb0f3dcbbd926f783 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -63,7 +63,7 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.needs_batch_subscript(expr.aggregate.name): return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) - idx = expr.index + idx = self.rec(expr.index, expn_state) if not isinstance(idx, tuple): idx = (idx,) @@ -73,7 +73,7 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.needs_batch_subscript(expr.name): return super(_BatchVariableChanger, self).map_variable(expr, expn_state) - return expr.aggregate[self.batch_iname_expr] + return expr[self.batch_iname_expr] def _add_unique_dim_name(name, dim_names): diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 6871cdbf46e7b117ea4e9047f0095f303b25667e..575311b11716f5a52e4713aa51922eb348c839d9 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -209,7 +209,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, oooooooooo oooooooooo - Passing ``fetch_bounding_box=True` gives :mod:`loopy` permission + Passing ``fetch_bounding_box=True`` gives :mod:`loopy` permission to instead fetch the 'bounding box' of the footprint, i.e. this set in the stencil example:: diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c35b5064365293ac78cdd01af537c9d28bd67193..ea90abfe27c8de69daf39021b3d0ea5463a2e4c8 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -986,8 +986,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # Emit a warning that we needed boostable_into from warnings import warn from loopy.diagnostic import LoopyWarning - warn("Kernel '%s' required the deprecated 'boostable_into" - "field in order to be schedulable!" % knl.name, LoopyWarning) + warn("Kernel '%s' required the deprecated 'boostable_into' " + "instruction attribute in order to be schedulable!" 
% knl.name, + LoopyWarning) # Return to avoid yielding the duplication # options without boostable_into @@ -1198,7 +1199,8 @@ class _ReductionSplitter(RuleAwareIdentityMapper): return Reduction(expr.operation, tuple(self.inames), Reduction(expr.operation, tuple(leftover_inames), self.rec(expr.expr, expn_state), - expr.allow_simultaneous)) + expr.allow_simultaneous), + expr.allow_simultaneous) else: assert False else: diff --git a/loopy/transform/save.py b/loopy/transform/save.py index bc8981281a94bde2685be2efa659b8db01b493bb..8afc1695a38a37baf165f6ec6ef6567e2012173b 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -439,7 +439,8 @@ class TemporarySaver(object): kernel = self.kernel.copy( instructions=new_instructions, iname_to_tag=self.updated_iname_to_tag, - temporary_variables=self.updated_temporary_variables) + temporary_variables=self.updated_temporary_variables, + overridden_get_grid_sizes_for_insn_ids=None) from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index a31f011a0ce8e5403b54984eb45db0970a8370b0..4c1e423e93e104fecd0b49a2b1ef2b4a261e38e7 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -38,6 +38,12 @@ import logging logger = logging.getLogger(__name__) +def _debug(kernel, s, *args): + if logger.isEnabledFor(logging.DEBUG): + logstr = s % args + logger.debug("%s: %s" % (kernel.name, logstr)) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -112,32 +118,28 @@ class TypeInferenceMapper(CombineMapper): 0 <= len(dtype_set) <= 1 for dtype_set in dtype_sets) - if not all( - isinstance(dtype, NumpyType) + from pytools import is_single_valued + + dtypes = [dtype for dtype_set in dtype_sets - for dtype in dtype_set): - from pytools import is_single_valued, single_valued - if not is_single_valued( - dtype - for dtype_set in dtype_sets - for dtype in dtype_set): + for dtype in dtype_set] + + if not all(isinstance(dtype, NumpyType) for dtype in dtypes): + if not is_single_valued(dtypes): raise TypeInferenceFailure( "Nothing known about operations between '%s'" - % ", ".join(str(dtype) - for dtype_set in dtype_sets - for dtype in dtype_set)) + % ", ".join(str(dtype) for dtype in dtypes)) - return single_valued(dtype - for dtype_set in dtype_sets - for dtype in dtype_set) + return [dtypes[0]] - numpy_dtypes = [dtype.dtype - for dtype_set in dtype_sets - for dtype in dtype_set] + numpy_dtypes = [dtype.dtype for dtype in dtypes] if not numpy_dtypes: return [] + if is_single_valued(numpy_dtypes): + return [dtypes[0]] + result = numpy_dtypes.pop() while numpy_dtypes: other = numpy_dtypes.pop() @@ -179,7 +181,6 @@ class TypeInferenceMapper(CombineMapper): else: dtype_sets.append(dtype_set) - from pytools import all if all(dtype.is_integral() for dtype_set in dtype_sets for dtype in dtype_set): @@ -383,8 +384,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return [kernel.index_dtype], [] - def debug(s): - logger.debug("%s: %s" % (kernel.name, s)) + from functools import partial + debug = partial(_debug, kernel) dtype_sets = [] @@ -399,7 +400,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): expr = subst_expander(writer_insn.expression) - debug(" via expr %s" % expr) + debug(" via expr %s", expr) if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): 
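The SCC-driven ordering used in the hunks below can be previewed in isolation. A minimal sketch (the graph mirrors the new test_type_inference_with_type_dependencies in test/test_loopy.py further down; compute_sccs is the helper added to loopy/tools.py above, and the printed order is only one possibility):

    from loopy.tools import compute_sccs

    # written variable -> names whose types it depends on (i.e. names it reads)
    dep_graph = {
        "a": {"a"},         # a = a + 1
        "b": {"b", "c"},    # b = b + c + 1.0
        "c": {"b", "c"},    # c = b + c
        "d": {"b"},         # d = b + 2 + 1j
    }

    # Tarjan's algorithm pops "leaf" components first, so {a} and the
    # mutually dependent pair {b, c} are typed before d is visited; the
    # order among independent components may vary.
    print(compute_sccs(dep_graph))  # e.g. [['a'], ['c', 'b'], ['d']]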
@@ -421,7 +422,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
             if result_i is not None:
                 result.append(result_i)
 
-        debug(" result: %s" % result)
+        debug(" result: %s", result)
 
         dtype_sets.append(result)
 
@@ -462,8 +463,11 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     logger.debug("%s: infer types" % kernel.name)
 
-    def debug(s):
-        logger.debug("%s: %s" % (kernel.name, s))
+    from functools import partial
+    debug = partial(_debug, kernel)
+
+    import time
+    start_time = time.time()
 
     unexpanded_kernel = kernel
     if kernel.substitutions:
@@ -489,6 +493,27 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     # }}}
 
+    logger.debug("finding types for {count:d} names".format(
+        count=len(names_for_type_inference)))
+
+    writer_map = kernel.writer_map()
+
+    dep_graph = dict(
+        (written_var, set(
+            read_var
+            for insn_id in writer_map.get(written_var, [])
+            for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
+            if read_var in names_for_type_inference))
+        for written_var in names_for_type_inference)
+
+    from loopy.tools import compute_sccs
+
+    # To speed up processing, we sort the variables by computing the SCCs of
+    # the type dependency graph. Each SCC represents a set of variables whose
+    # types mutually depend on one another. The SCCs are returned and
+    # processed in topological order.
+    sccs = compute_sccs(dep_graph)
+
     item_lookup = _DictUnionView([
             new_temp_vars,
             new_arg_dict
@@ -502,75 +527,89 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
-    changed_during_last_queue_run = False
-    queue = names_for_type_inference[:]
-
-    failed_names = set()
-    while queue or changed_during_last_queue_run:
-        if not queue and changed_during_last_queue_run:
-            changed_during_last_queue_run = False
-            queue = names_for_type_inference[:]
-
-        name = queue.pop(0)
-        item = item_lookup[name]
-
-        debug("inferring type for %s %s" % (type(item).__name__, item.name))
-
-        result, symbols_with_unavailable_types = \
-            _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
+    for var_chain in sccs:
+        changed_during_last_queue_run = False
+        queue = var_chain[:]
+        failed_names = set()
+
+        while queue or changed_during_last_queue_run:
+            if not queue and changed_during_last_queue_run:
+                changed_during_last_queue_run = False
+                # Optimization: If there's a single variable in the SCC without
+                # a self-referential dependency, then the type is known after a
+                # single iteration (we don't need to look at the expressions
+                # again).
+                if len(var_chain) == 1:
+                    single_var, = var_chain
+                    if single_var not in dep_graph[single_var]:
+                        break
+                queue = var_chain[:]
+
+            name = queue.pop(0)
+            item = item_lookup[name]
+
+            debug("inferring type for %s %s", type(item).__name__, item.name)
+
+            result, symbols_with_unavailable_types = (
+                    _infer_var_type(
+                        kernel, item.name, type_inf_mapper, subst_expander))
+
+            failed = not result
+            if not failed:
+                new_dtype, = result
+                debug(" success: %s", new_dtype)
+                if new_dtype != item.dtype:
+                    debug(" changed from: %s", item.dtype)
+                    changed_during_last_queue_run = True
+
+                    if isinstance(item, TemporaryVariable):
+                        new_temp_vars[name] = item.copy(dtype=new_dtype)
+                    elif isinstance(item, KernelArgument):
+                        new_arg_dict[name] = item.copy(dtype=new_dtype)
+                    else:
+                        raise LoopyError("unexpected item type in type inference")
+            else:
+                debug(" failure")
+
+            if failed:
+                if item.name in failed_names:
+                    # this item has failed before, give up.
+ advice = "" + if symbols_with_unavailable_types: + advice += ( + " (need type of '%s'--check for missing arguments)" + % ", ".join(symbols_with_unavailable_types)) + + if expect_completion: + raise LoopyError( + "could not determine type of '%s'%s" + % (item.name, advice)) + + else: + # We're done here. + break - failed = not result - if not failed: - new_dtype, = result - debug(" success: %s" % new_dtype) - if new_dtype != item.dtype: - debug(" changed from: %s" % item.dtype) - changed_during_last_queue_run = True + # remember that this item failed + failed_names.add(item.name) - if isinstance(item, TemporaryVariable): - new_temp_vars[name] = item.copy(dtype=new_dtype) - elif isinstance(item, KernelArgument): - new_arg_dict[name] = item.copy(dtype=new_dtype) - else: - raise LoopyError("unexpected item type in type inference") - else: - debug(" failure") - - if failed: - if item.name in failed_names: - # this item has failed before, give up. - advice = "" - if symbols_with_unavailable_types: - advice += ( - " (need type of '%s'--check for missing arguments)" - % ", ".join(symbols_with_unavailable_types)) - - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. + if set(queue) == failed_names: + # We did what we could... + print(queue, failed_names, item.name) + assert not expect_completion break - # remember that this item failed - failed_names.add(item.name) - - if set(queue) == failed_names: - # We did what we could... - print(queue, failed_names, item.name) - assert not expect_completion - break - - # can't infer type yet, put back into queue - queue.append(name) - else: - # we've made progress, reset failure markers - failed_names = set() + # can't infer type yet, put back into queue + queue.append(name) + else: + # we've made progress, reset failure markers + failed_names = set() # }}} + end_time = time.time() + logger.debug("type inference took {dur:.2f} seconds".format( + dur=end_time - start_time)) + return unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], diff --git a/loopy/version.py b/loopy/version.py index 7de2d687794358f182776b90752ceb81ee2675e4..77d0e21bdd2ef5383c5f874656c25fe1ede21a70 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v50-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version diff --git a/test/test_apps.py b/test/test_apps.py index 9eab3fdb1fbc152b65344362d39766793d372d90..c4844d3a3c5d88e0c4eeccf0d67e9b4284fd744f 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -608,6 +608,61 @@ def test_poisson_fem(ctx_factory): parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) +def test_domain_tree_nesting(): + # From https://github.com/inducer/loopy/issues/78 + from loopy.kernel.data import temp_var_scope as scopes + + out_map = np.array([1, 2], dtype=np.int32) + if_val = np.array([-1, 0], dtype=np.int32) + vals = np.array([2, 3], dtype=np.int32) + num_vals = np.array([2, 4], dtype=np.int32) + num_vals_offset = np.array(np.cumsum(num_vals) - num_vals, dtype=np.int32) + + TV = lp.TemporaryVariable # noqa + + knl = lp.make_kernel(['{[i]: 0 <= i < 12}', + '{[j]: 0 <= j < 100}', + '{[a_count]: 0 <= a_count < a_end}', + '{[b_count]: 0 <= b_count < b_end}'], + """ + for j + for i + <> a_end = abs(if_val[i]) + + <>b_end = num_vals[i] + <>offset = num_vals_offset[i] {id=offset} + <>b_sum 
= 0 {id=b_init} + for b_count + <>val = vals[offset + b_count] {dep=offset} + end + b_sum = exp(b_sum) {id=b_final} + + out[j,i] = b_sum {dep=b_final} + end + end + """, + [ + TV('out_map', initializer=out_map, read_only=True, scope=scopes.PRIVATE), + TV('if_val', initializer=if_val, read_only=True, scope=scopes.PRIVATE), + TV('vals', initializer=vals, read_only=True, scope=scopes.PRIVATE), + TV('num_vals', initializer=num_vals, read_only=True, scope=scopes.PRIVATE), + TV('num_vals_offset', initializer=num_vals_offset, read_only=True, + scope=scopes.PRIVATE), + lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), + lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) + + parents_per_domain = knl.parents_per_domain() + + def depth(i): + if parents_per_domain[i] is None: + return 0 + else: + return 1 + depth(parents_per_domain[i]) + + for i in range(len(parents_per_domain)): + assert depth(i) < 2 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_domain.py b/test/test_domain.py index e01c3a937fd95984a27a7f2490edf4be12d9b57b..9d0379a50af188cc84de8e01f8278030b6cc04e2 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -229,6 +229,41 @@ def test_dependent_loop_bounds_3(ctx_factory): list(lp.generate_loop_schedules(knl_bad)) +def test_dependent_loop_bounds_4(): + # https://gitlab.tiker.net/inducer/loopy/issues/23 + import loopy as lp + + loopy_knl = lp.make_kernel( + [ + "{[a]: 0<=a<10}", + "{[b]: b_start<=b<b_end}", + "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}", + ], + """ + for a + <> b_start = 1 + <> b_end = 2 + for b + <> c_start = 1 + <> c_end = 2 + + for c + ... nop + end + + <>t[idim] = 1 + end + end + """, + "...", + seq_dependencies=True) + + loopy_knl = lp.fix_parameters(loopy_knl, dim=3) + + with lp.CacheMode(False): + lp.generate_code_v2(loopy_knl) + + def test_independent_multi_domain(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() diff --git a/test/test_linalg.py b/test/test_linalg.py index 96b95cf4b44c603e3ee1948c1d5eceecac277bce..772d536d1e00fedc0b7abcd2f8c05350fe3b633e 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -192,6 +192,10 @@ def test_plain_matrix_mul(ctx_factory): def test_variable_size_matrix_mul(ctx_factory): ctx = ctx_factory() + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + n = get_suitable_size(ctx) knl = lp.make_kernel( diff --git a/test/test_loopy.py b/test/test_loopy.py index e41d55b85e504bcd39db37bd888ddbedbf6122f4..94cdb499c096337db0657267d5afd472eef80a9c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -102,6 +102,28 @@ def test_type_inference_no_artificial_doubles(ctx_factory): assert "double" not in code +def test_type_inference_with_type_dependencies(): + knl = lp.make_kernel( + "{[i]: i=0}", + """ + <>a = 99 + a = a + 1 + <>b = 0 + <>c = 1 + b = b + c + 1.0 + c = b + c + <>d = b + 2 + 1j + """, + "...") + knl = lp.infer_unknown_types(knl) + + from loopy.types import to_loopy_type + assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) + assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) + assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) + assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + + def test_sized_and_complex_literals(ctx_factory): ctx = ctx_factory() @@ -1473,6 +1495,22 @@ def test_call_with_no_returned_value(ctx_factory): # }}} +# {{{ call with no return values and options 
+ +def test_call_with_options(): + knl = lp.make_kernel( + "{:}", + "f() {id=init}" + ) + + from library_for_test import no_ret_f_mangler + knl = lp.register_function_manglers(knl, [no_ret_f_mangler]) + + print(lp.generate_code_v2(knl).device_code()) + +# }}} + + def test_unschedulable_kernel_detection(): knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"], """ @@ -1910,8 +1948,8 @@ def test_tight_loop_bounds_codegen(): for_loop = \ "for (int j = " \ - "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ - "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" + "(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \ + "j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)" assert for_loop in cgr.device_code() @@ -1973,6 +2011,138 @@ def test_integer_reduction(ctx_factory): assert function(out) +def test_nosync_option_parsing(): + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + <>t = 1 {id=insn1,nosync=insn1} + t = 2 {id=insn2,nosync=insn1:insn2} + t = 3 {id=insn3,nosync=insn1@local:insn2@global:insn3@any} + t = 4 {id=insn4,nosync_query=id:insn*@local} + t = 5 {id=insn5,nosync_query=id:insn1} + """, + options=lp.Options(allow_terminal_colors=False)) + kernel_str = str(knl) + assert "# insn1,no_sync_with=insn1@any" in kernel_str + assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str + assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str + assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa + assert "# insn5,no_sync_with=insn1@any" in kernel_str + + +def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): + from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop) + watch_for_barrier = False + seen_barrier = False + loop_level = 0 + + for sched_item in knl.schedule: + if isinstance(sched_item, RunInstruction): + if sched_item.insn_id == id1: + watch_for_barrier = True + elif sched_item.insn_id == id2: + assert watch_for_barrier + assert seen_barrier + return + elif isinstance(sched_item, Barrier): + if watch_for_barrier and loop_level not in ignore_barriers_in_levels: + seen_barrier = True + elif isinstance(sched_item, EnterLoop): + loop_level += 1 + elif isinstance(sched_item, LeaveLoop): + loop_level -= 1 + + raise RuntimeError("id2 was not seen") + + +def test_barrier_insertion_near_top_of_loop(): + knl = lp.make_kernel( + "{[i,j]: 0 <= i,j < 10 }", + """ + for i + <>a[i] = i {id=ainit} + for j + <>t = a[(i + 1) % 10] {id=tcomp} + <>b[i,j] = a[i] + t {id=bcomp1} + b[i,j] = b[i,j] + 1 {id=bcomp2} + end + end + """, + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.set_temporary_scope(knl, "a", "local") + knl = lp.set_temporary_scope(knl, "b", "local") + knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + + print(knl) + + assert_barrier_between(knl, "ainit", "tcomp") + assert_barrier_between(knl, "tcomp", "bcomp1") + assert_barrier_between(knl, "bcomp1", "bcomp2") + + +def test_barrier_insertion_near_bottom_of_loop(): + knl = lp.make_kernel( + ["{[i]: 0 <= i < 10 }", + "[jmax] -> {[j]: 0 <= j < jmax}"], + """ + for i + <>a[i] = i {id=ainit} + for j + <>b[i,j] = a[i] + t {id=bcomp1} + b[i,j] = b[i,j] + 1 {id=bcomp2} + end + a[i] = i + 1 {id=aupdate} + end + """, + seq_dependencies=True) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.set_temporary_scope(knl, "a", "local") + knl = lp.set_temporary_scope(knl, "b", "local") + knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + 
+ print(knl) + + assert_barrier_between(knl, "bcomp1", "bcomp2") + assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) + + +def test_struct_assignment(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + bbhit = np.dtype([ + ("tmin", np.float32), + ("tmax", np.float32), + ("bi", np.int32), + ("hit", np.int32)]) + + bbhit, bbhit_c_decl = cl.tools.match_dtype_to_c_struct( + ctx.devices[0], "bbhit", bbhit) + bbhit = cl.tools.get_or_register_dtype('bbhit', bbhit) + + preamble = bbhit_c_decl + + knl = lp.make_kernel( + "{ [i]: 0<=i<N }", + """ + for i + result[i].hit = i % 2 + result[i].tmin = i + result[i].tmax = i+10 + result[i].bi = i + end + """, + [ + lp.GlobalArg("result", shape=("N",), dtype=bbhit), + "..."], + preambles=[("000", preamble)]) + + knl = lp.set_options(knl, write_cl=True) + knl(queue, N=200) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_misc.py b/test/test_misc.py new file mode 100644 index 0000000000000000000000000000000000000000..a22e424630255df4225586eeb9f0d62a03d5318f --- /dev/null +++ b/test/test_misc.py @@ -0,0 +1,154 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six # noqa +import pytest +from six.moves import range + +import sys + +import logging +logger = logging.getLogger(__name__) + + +def test_compute_sccs(): + from loopy.tools import compute_sccs + import random + + rng = random.Random(0) + + def generate_random_graph(nnodes): + graph = dict((i, set()) for i in range(nnodes)) + for i in range(nnodes): + for j in range(nnodes): + # Edge probability 2/n: Generates decently interesting inputs. + if rng.randint(0, nnodes - 1) <= 1: + graph[i].add(j) + return graph + + def verify_sccs(graph, sccs): + visited = set() + + def visit(node): + if node in visited: + return [] + else: + visited.add(node) + result = [] + for child in graph[node]: + result = result + visit(child) + return result + [node] + + for scc in sccs: + scc = set(scc) + assert not scc & visited + # Check that starting from each element of the SCC results + # in the same set of reachable nodes. 
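+            # Earlier entries of sccs are the "deeper" components, so every
+            # node reachable from this SCC outside of it is already in
+            # visited; a fresh traversal from any member therefore yields
+            # exactly this SCC.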
+ for scc_root in scc: + visited.difference_update(scc) + result = visit(scc_root) + assert set(result) == scc, (set(result), scc) + + for nnodes in range(10, 20): + for i in range(40): + graph = generate_random_graph(nnodes) + verify_sccs(graph, compute_sccs(graph)) + + +def test_SetTrie(): + from loopy.kernel.tools import SetTrie + + s = SetTrie() + s.add_or_update(set([1, 2, 3])) + s.add_or_update(set([4, 2, 1])) + s.add_or_update(set([1, 5])) + + result = [] + s.descend(lambda prefix: result.extend(prefix)) + assert result == [1, 2, 3, 4, 5] + + with pytest.raises(ValueError): + s.add_or_update(set([1, 4])) + + +class PicklableItem(object): + + flags = {"unpickled": False} + + def __getstate__(self): + return True + + def __setstate__(self, state): + PicklableItem.flags["unpickled"] = True + + +def test_LazilyUnpicklingDictionary(): + def is_unpickled(): + return PicklableItem.flags["unpickled"] + + from loopy.tools import LazilyUnpicklingDictionary + + mapping = LazilyUnpicklingDictionary({0: PicklableItem()}) + + assert not is_unpickled() + + from pickle import loads, dumps + + pickled_mapping = dumps(mapping) + + # {{{ test lazy loading + + mapping = loads(pickled_mapping) + assert not is_unpickled() + list(mapping.keys()) + assert not is_unpickled() + assert isinstance(mapping[0], PicklableItem) + assert is_unpickled() + + # }}} + + # {{{ test multi round trip + + mapping = loads(dumps(loads(pickled_mapping))) + assert isinstance(mapping[0], PicklableItem) + + # }}} + + # {{{ test empty map + + mapping = LazilyUnpicklingDictionary({}) + mapping = loads(dumps(mapping)) + assert len(mapping) == 0 + + # }}} + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_reduction.py b/test/test_reduction.py index 820c669da494f4d8863d274120cd5c0c7eb4420f..86e72c0c6644b7b9837a6d74da756c58344b1d6f 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -181,7 +181,7 @@ def test_recursive_nested_dependent_reduction(ctx_factory): # FIXME: Actually test functionality. -@pytest.mark.parametrize("size", [128, 5, 113, 67]) +@pytest.mark.parametrize("size", [128, 5, 113, 67, 1]) def test_local_parallel_reduction(ctx_factory, size): ctx = ctx_factory() @@ -393,6 +393,18 @@ def test_double_sum_made_unique(ctx_factory): assert b.get() == ref +def test_parallel_multi_output_reduction(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + max_val, max_indices = argmax(i, fabs(a[i])) + """) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.realize_reduction(knl) + print(knl) + # TODO: Add functional test + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])
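As a final illustration, the lazy-unpickling behavior exercised by test_LazilyUnpicklingDictionary above reduces to this round trip (a sketch; the stored list is an arbitrary stand-in for an expensive-to-unpickle value):

    from pickle import dumps, loads

    from loopy.tools import LazilyUnpicklingDictionary

    d = loads(dumps(LazilyUnpicklingDictionary({"x": [1, 2, 3]})))
    assert list(d) == ["x"]      # iterating keys does not unpickle values
    assert d["x"] == [1, 2, 3]   # the first lookup unpickles (and caches)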