diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index d151a2128754373559ea8a45730609782adb33d7..560facd63f183e1113ccb7cb94ff1953aced6e7d 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -217,9 +217,9 @@ These are usually key-value pairs. The following attributes are recognized: dependency is that the code generated for this instruction is required to appear textually after all of these dependees' generated code. - Identifiers here are allowed to be wildcards as defined by - the Python module :mod:`fnmatchcase`. This is helpful in conjunction - with ``id_prefix``. + Identifiers here are allowed to be wildcards as defined by the Python + function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with + ``id_prefix``. .. note:: @@ -242,6 +242,15 @@ These are usually key-value pairs. The following attributes are recognized: heuristic and indicate that the specified list of dependencies is exhaustive. +* ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary + between the instructions with identifiers ``id1`` and ``id2``, even if + a dependency chain exists and variables are accessed in an apparently + racy way. + + Identifiers here are allowed to be wildcards as defined by the Python + function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with + ``id_prefix``. + * ``priority=integer`` sets the instructions priority to the value ``integer``. Instructions with higher priority will be scheduled sooner, if possible. Note that the scheduler may still schedule a lower-priority @@ -284,6 +293,11 @@ Loopy's expressions are a slight superset of the expressions supported by TODO: Functions TODO: Reductions +Function Call Instructions +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: CallInstruction + C Block Instructions ^^^^^^^^^^^^^^^^^^^^ @@ -459,6 +473,8 @@ Targets .. automodule:: loopy.target +.. currentmodule:: loopy + Helper values ------------- @@ -470,6 +486,27 @@ Helper values ..
}}} +Libraries: Extending and Interfacing with External Functionality +---------------------------------------------------------------- + +.. _symbols: + +Symbols +^^^^^^^ + +.. _functions: + +Functions +^^^^^^^^^ + +.. autoclass:: CallMangleInfo + +.. _reductions: + +Reductions +^^^^^^^^^^ + + The Kernel Object ----------------- diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index f92cfbf67c0d2af723db8f6fa0ee67ebb14c08fe..d085c1215781f52a8f0c79359e8ac7061d94ffb7 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -68,6 +68,8 @@ Manipulating Instructions .. autofunction:: remove_instructions +.. autofunction:: replace_instruction_ids + .. autofunction:: tag_instructions Registering Library Routines diff --git a/doc/tutorial.rst b/doc/tutorial.rst index bc8912527dcefa2ddf43d5adb6a83173900d69d4..cde36163d09a5a7d31a1afc97e0bfcaa636cdda3 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -563,8 +563,9 @@ relation to loop nesting. For example, it's perfectly possible to request #define lid(N) ((int) get_local_id(N)) ... for (int i_inner = 0; i_inner <= 15; ++i_inner) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[i_inner + i_outer * 16] = 0.0f; + if (-1 + -1 * i_inner + n >= 0) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[i_inner + i_outer * 16] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -1105,7 +1106,7 @@ Attempting to create this kernel results in an error: ... # While trying to find shape axis 0 of argument 'out', the following exception occurred: Traceback (most recent call last): ... 
- StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n = 1; [(n)] : n >= 2; [(1)] : n <= 0 }' + StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }' The problem is that loopy cannot find a simple, universally valid expression for the length of *out* in this case. Notice how the kernel accesses both the diff --git a/loopy/__init__.py b/loopy/__init__.py index 22022c0f63540c56ad65b9813ea576e491dd7fde..10bd1100126e79ff2451eaf96c45bd4be47adf3a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -41,9 +41,12 @@ from loopy.kernel.data import ( KernelArgument, ValueArg, GlobalArg, ConstantArg, ImageArg, memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, - InstructionBase, Assignment, ExpressionInstruction, CInstruction, + InstructionBase, + MultiAssignmentBase, Assignment, ExpressionInstruction, + CallInstruction, CInstruction, temp_var_scope, TemporaryVariable, - SubstitutionRule) + SubstitutionRule, + CallMangleInfo) from loopy.kernel import LoopKernel, kernel_state from loopy.kernel.tools import ( @@ -67,7 +70,9 @@ from loopy.transform.iname import ( from loopy.transform.instruction import ( find_instructions, map_instructions, set_instruction_priority, add_dependency, - remove_instructions, tag_instructions) + remove_instructions, + replace_instruction_ids, + tag_instructions) from loopy.transform.data import ( add_prefetch, change_arg_to_image, tag_data_axes, @@ -103,7 +108,8 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, get_gmem_access_poly, get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping, - sum_mem_access_to_bytes) + sum_mem_access_to_bytes, + gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -131,8 
+137,11 @@ __all__ = [ "ValueArg", "GlobalArg", "ConstantArg", "ImageArg", "temp_var_scope", "TemporaryVariable", "SubstitutionRule", + "CallMangleInfo", - "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction", + "InstructionBase", + "MultiAssignmentBase", "Assignment", "ExpressionInstruction", + "CallInstruction", "CInstruction", "default_function_mangler", "single_arg_function_mangler", @@ -157,7 +166,9 @@ __all__ = [ "find_instructions", "map_instructions", "set_instruction_priority", "add_dependency", - "remove_instructions", "tag_instructions", + "remove_instructions", + "replace_instruction_ids", + "tag_instructions", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", "find_one_rule_matching", @@ -190,6 +201,7 @@ __all__ = [ "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", "get_DRAM_access_poly", "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes", + "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", @@ -278,6 +290,11 @@ def register_symbol_manglers(kernel, manglers): def register_function_manglers(kernel, manglers): + """ + :arg manglers: list of functions of signature ``(target, name, arg_dtypes)`` + returning a :class:`loopy.CallMangleInfo`. 
+ :returns: *kernel* with *manglers* registered + """ new_manglers = kernel.function_manglers[:] for m in manglers: if m not in new_manglers: diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 2ef5295c5cb157179a3eeaf952cebe4e65f2d5f2..f34a943831c68bd0d2cf6089d8b70ad4bc43022e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -64,9 +64,9 @@ def fill_rand(ary): real_dtype = ary.dtype.type(0).real.dtype real_ary = ary.view(real_dtype) - fill_rand(real_ary, luxury=0) + fill_rand(real_ary) else: - fill_rand(ary, luxury=0) + fill_rand(ary) class TestArgInfo(Record): diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index d3a7ae42c078fc39c5844e3cb8047900ae347dc1..7b95f59482c826473e069d9f07a80dd97d7ea0ce 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -61,10 +61,12 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st def generate_instruction_code(kernel, insn, codegen_state): - from loopy.kernel.data import Assignment, CInstruction + from loopy.kernel.data import Assignment, CallInstruction, CInstruction if isinstance(insn, Assignment): result = generate_expr_instruction_code(kernel, insn, codegen_state) + elif isinstance(insn, CallInstruction): + result = generate_call_code(kernel, insn, codegen_state) elif isinstance(insn, CInstruction): result = generate_c_instruction_code(kernel, insn, codegen_state) else: @@ -218,12 +220,32 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): return result +def generate_call_code(kernel, insn, codegen_state): + # {{{ vectorization handling + + if codegen_state.vectorization_info: + if insn.atomicity: + raise Unvectorizable("function call") + + # }}} + + result = kernel.target.generate_multiple_assignment( + codegen_state, insn) + + # {{{ tracing + + if kernel.options.trace_assignments or kernel.options.trace_assignment_values: + raise NotImplementedError("tracing of multi-output function calls") + + # }}} + + return 
result + + def generate_c_instruction_code(kernel, insn, codegen_state): if codegen_state.vectorization_info is not None: raise Unvectorizable("C instructions cannot be vectorized") - ecm = codegen_state.expression_to_code_mapper - body = [] from loopy.codegen import POD diff --git a/loopy/expression.py b/loopy/expression.py index 16c09b82821044f880e0f459af980da13526211b..6fd49661d617f65d2cdbc5773c3a59955da8c19c 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -199,7 +199,7 @@ class TypeInferenceMapper(CombineMapper): def map_linear_subscript(self, expr): return self.rec(expr.aggregate) - def map_call(self, expr): + def map_call(self, expr, multiple_types_ok=False): from pymbolic.primitives import Variable identifier = expr.function @@ -212,11 +212,20 @@ class TypeInferenceMapper(CombineMapper): arg_dtypes = tuple(self.rec(par) for par in expr.parameters) mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if mangle_result is not None: - return mangle_result[0] + if multiple_types_ok: + if mangle_result is not None: + return mangle_result.result_dtypes + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not multiple_types_ok: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct assignments") - raise RuntimeError("no type inference information on " - "function '%s'" % identifier) + return mangle_result.result_dtypes[0] + + raise RuntimeError("unable to resolve " + "function '%s' with %d given arguments" + % (identifier, len(arg_dtypes))) def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -267,7 +276,8 @@ class TypeInferenceMapper(CombineMapper): def map_lookup(self, expr): agg_result = self.rec(expr.aggregate) - dtype, offset = agg_result.numpy_dtype.fields[expr.name] + field = agg_result.numpy_dtype.fields[expr.name] + dtype = field[0] return NumpyType(dtype) def map_comparison(self, expr): @@ -285,9 +295,19 @@ class 
TypeInferenceMapper(CombineMapper): def map_local_hw_index(self, expr, *args): return self.kernel.index_dtype - def map_reduction(self, expr): - return expr.operation.result_dtype( - self.kernel.target, self.rec(expr.expr), expr.inames) + def map_reduction(self, expr, multiple_types_ok=False): + result = expr.operation.result_dtypes( + self.kernel, self.rec(expr.expr), expr.inames) + + if multiple_types_ok: + return result + + else: + if len(result) != 1 and not multiple_types_ok: + raise LoopyError("reductions with more or fewer than one " + "return value may only be used in direct assignments") + + return result[0] # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d7009900b5683d43effb594cf5b38498db8e09c1..7baea324346b79e40d7ca601a99644727dd66e95 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -313,7 +313,35 @@ class LoopKernel(RecordWithoutPickling): for mangler in manglers: mangle_result = mangler(self, identifier, arg_dtypes) if mangle_result is not None: - return mangle_result + from loopy.kernel.data import CallMangleInfo + if isinstance(mangle_result, CallMangleInfo): + assert len(mangle_result.arg_dtypes) == len(arg_dtypes) + return mangle_result + + assert isinstance(mangle_result, tuple) + + from warnings import warn + warn("'%s' returned a tuple instead of a CallMangleInfo instance. " + "This is deprecated." 
% mangler.__name__, + DeprecationWarning) + + if len(mangle_result) == 2: + result_dtype, target_name = mangle_result + return CallMangleInfo( + target_name=target_name, + result_dtypes=(result_dtype,), + arg_dtypes=None) + + elif len(mangle_result) == 3: + result_dtype, target_name, actual_arg_dtypes = mangle_result + return CallMangleInfo( + target_name=target_name, + result_dtypes=(result_dtype,), + arg_dtypes=actual_arg_dtypes) + + else: + raise ValueError("unexpected size of tuple returned by '%s'" + % mangler.__name__) return None @@ -1027,8 +1055,8 @@ class LoopKernel(RecordWithoutPickling): for dep_id in sorted(insn.depends_on): print_insn(kernel.id_to_insn[dep_id]) - if isinstance(insn, lp.Assignment): - lhs = str(insn.assignee) + if isinstance(insn, lp.MultiAssignmentBase): + lhs = ", ".join(str(a) for a in insn.assignees) rhs = str(insn.expression) trailing = [] elif isinstance(insn, lp.CInstruction): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index aedc1edc650d570cdeec1ecca8dc08989678df65..9fe0f5b7904c47b26b8d0b1b4c4ea5ce039da713 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -29,7 +29,9 @@ import numpy as np from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import IdentityMapper, WalkMapper from loopy.kernel.data import ( - InstructionBase, Assignment, SubstitutionRule) + InstructionBase, + MultiAssignmentBase, Assignment, + SubstitutionRule) from loopy.diagnostic import LoopyError import islpy as isl from islpy import dim_type @@ -147,7 +149,6 @@ def expand_defines_in_expr(expr, defines): # {{{ parse instructions INSN_RE = re.compile( - "\s*(?:\<(?P<temp_var_type>.*?)\>)?" 
"\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)" "\s*?(?:\{(?P<options>.+)\}\s*)?$" ) @@ -159,7 +160,8 @@ SUBST_RE = re.compile( def parse_insn(insn): """ :return: a tuple ``(insn, inames_to_dup)``, where insn is a - :class:`Assignment` or a :class:`SubstitutionRule` + :class:`Assignment`, a :class:`CallInstruction`, + or a :class:`SubstitutionRule` and *inames_to_dup* is None or a list of tuples `(old, new)`. """ @@ -192,21 +194,74 @@ def parse_insn(insn): "the following error occurred:" % groups["rhs"]) raise - from pymbolic.primitives import Variable, Subscript, Call - if isinstance(lhs, Variable): - assignee_name = lhs.name - elif isinstance(lhs, Subscript): - assignee_name = lhs.aggregate.name - elif isinstance(lhs, Call): - assignee_name = None - assert subst_match is not None - else: - raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % lhs) + from pymbolic.primitives import Variable, Call, Subscript + from loopy.symbolic import TypeAnnotation + + # {{{ deal with subst rules + + if subst_match is not None: + assert insn_match is None + if isinstance(lhs, Variable): + subst_name = lhs.name + arg_names = [] + elif isinstance(lhs, Call): + if not isinstance(lhs.function, Variable): + raise RuntimeError("Invalid substitution rule left-hand side") + subst_name = lhs.function.name + arg_names = [] + + for i, arg in enumerate(lhs.parameters): + if not isinstance(arg, Variable): + raise RuntimeError("Invalid substitution rule " + "left-hand side: %s--arg number %d " + "is not a variable" % (lhs, i)) + arg_names.append(arg.name) + else: + raise RuntimeError("Invalid substitution rule left-hand side") + + return SubstitutionRule( + name=subst_name, + arguments=tuple(arg_names), + expression=rhs), [] + + # }}} + + if not isinstance(lhs, tuple): + lhs = (lhs,) + + temp_var_types = [] + new_lhs = [] + assignee_names = [] + + for lhs_i in lhs: + if isinstance(lhs_i, TypeAnnotation): + if lhs_i.type is None: + temp_var_types.append(lp.auto) 
+ else: + temp_var_types.append(lhs_i.type) + + lhs_i = lhs_i.child + else: + temp_var_types.append(None) + + if isinstance(lhs_i, Variable): + assignee_names.append(lhs_i.name) + elif isinstance(lhs_i, Subscript): + assignee_names.append(lhs_i.aggregate.name) + else: + raise LoopyError("left hand side of assignment '%s' must " + "be variable or subscript" % (lhs_i,)) + + new_lhs.append(lhs_i) + + lhs = tuple(new_lhs) + temp_var_types = tuple(temp_var_types) + del new_lhs if insn_match is not None: depends_on = None depends_on_is_final = False + no_sync_with = None insn_groups = None conflicts_with_groups = None insn_id = None @@ -256,6 +311,11 @@ def parse_insn(insn): intern(dep.strip()) for dep in opt_value.split(":") if dep.strip()) + elif opt_key == "nosync" and opt_value is not None: + no_sync_with = frozenset( + intern(dep.strip()) for dep in opt_value.split(":") + if dep.strip()) + elif opt_key == "groups" and opt_value is not None: insn_groups = frozenset( intern(grp.strip()) for grp in opt_value.split(":") @@ -284,6 +344,11 @@ def parse_insn(insn): if tag.strip()) elif opt_key == "atomic": + if len(assignee_names) != 1: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + assignee_name, = assignee_names + if opt_value is None: atomicity = atomicity + ( lp.AtomicUpdate(assignee_name),) @@ -296,6 +361,7 @@ def parse_insn(insn): raise LoopyError("atomicity directive not " "understood: %s" % v) + del assignee_name else: raise ValueError( @@ -303,58 +369,27 @@ def parse_insn(insn): "(maybe a missing/extraneous =value?)" % opt_key) - if groups["temp_var_type"] is not None: - if groups["temp_var_type"]: - temp_var_type = np.dtype(groups["temp_var_type"]) - else: - import loopy as lp - temp_var_type = lp.auto - else: - temp_var_type = None - - return Assignment( + kwargs = dict( id=( intern(insn_id) if isinstance(insn_id, str) else insn_id), depends_on=depends_on, depends_on_is_final=depends_on_is_final, + 
no_sync_with=no_sync_with, groups=insn_groups, conflicts_with_groups=conflicts_with_groups, forced_iname_deps_is_final=forced_iname_deps_is_final, forced_iname_deps=forced_iname_deps, - assignee=lhs, expression=rhs, - temp_var_type=temp_var_type, - atomicity=atomicity, priority=priority, predicates=predicates, - tags=tags), inames_to_dup - - elif subst_match is not None: - from pymbolic.primitives import Variable, Call - - if isinstance(lhs, Variable): - subst_name = lhs.name - arg_names = [] - elif isinstance(lhs, Call): - if not isinstance(lhs.function, Variable): - raise RuntimeError("Invalid substitution rule left-hand side") - subst_name = lhs.function.name - arg_names = [] - - for i, arg in enumerate(lhs.parameters): - if not isinstance(arg, Variable): - raise RuntimeError("Invalid substitution rule " - "left-hand side: %s--arg number %d " - "is not a variable" % (lhs, i)) - arg_names.append(arg.name) - else: - raise RuntimeError("Invalid substitution rule left-hand side") + tags=tags, + atomicity=atomicity) - return SubstitutionRule( - name=subst_name, - arguments=tuple(arg_names), - expression=rhs), [] + from loopy.kernel.data import make_assignment + return make_assignment( + lhs, rhs, temp_var_types, **kwargs + ), inames_to_dup def parse_if_necessary(insn, defines): @@ -515,13 +550,13 @@ class ArgumentGuesser: self.all_written_names = set() from loopy.symbolic import get_dependencies for insn in instructions: - if isinstance(insn, Assignment): - (assignee_var_name, _), = insn.assignees_and_indices() - self.all_written_names.add(assignee_var_name) - self.all_names.update(get_dependencies( - self.submap(insn.assignee))) - self.all_names.update(get_dependencies( - self.submap(insn.expression))) + if isinstance(insn, MultiAssignmentBase): + for assignee_var_name, _ in insn.assignees_and_indices(): + self.all_written_names.add(assignee_var_name) + self.all_names.update(get_dependencies( + self.submap(insn.assignees))) + self.all_names.update(get_dependencies( 
+ self.submap(insn.expression))) def find_index_rank(self, name): irf = IndexRankFinder(name) @@ -583,10 +618,12 @@ class ArgumentGuesser: temp_var_names = set(six.iterkeys(self.temporary_variables)) for insn in self.instructions: - if isinstance(insn, Assignment): - if insn.temp_var_type is not None: - (assignee_var_name, _), = insn.assignees_and_indices() - temp_var_names.add(assignee_var_name) + if isinstance(insn, MultiAssignmentBase): + for (assignee_var_name, _), temp_var_type in zip( + insn.assignees_and_indices(), + insn.temp_var_types): + if temp_var_type is not None: + temp_var_names.add(assignee_var_name) # }}} @@ -780,7 +817,7 @@ def expand_cses(instructions, cse_prefix="cse_expr"): new_temp_vars = [] for insn in instructions: - if isinstance(insn, Assignment): + if isinstance(insn, MultiAssignmentBase): new_insns.append(insn.copy(expression=cseam(insn.expression))) else: new_insns.append(insn) @@ -799,29 +836,36 @@ def create_temporaries(knl, default_order): import loopy as lp for insn in knl.instructions: - if isinstance(insn, Assignment) \ - and insn.temp_var_type is not None: - (assignee_name, _), = insn.assignees_and_indices() - - if assignee_name in new_temp_vars: - raise RuntimeError("cannot create temporary variable '%s'--" - "already exists" % assignee_name) - if assignee_name in knl.arg_dict: - raise RuntimeError("cannot create temporary variable '%s'--" - "already exists as argument" % assignee_name) - - logger.debug("%s: creating temporary %s" - % (knl.name, assignee_name)) - - new_temp_vars[assignee_name] = lp.TemporaryVariable( - name=assignee_name, - dtype=insn.temp_var_type, - is_local=lp.auto, - base_indices=lp.auto, - shape=lp.auto, - order=default_order) - - insn = insn.copy(temp_var_type=None) + if isinstance(insn, MultiAssignmentBase): + for (assignee_name, _), temp_var_type in zip( + insn.assignees_and_indices(), + insn.temp_var_types): + + if temp_var_type is None: + continue + + if assignee_name in new_temp_vars: + raise 
RuntimeError("cannot create temporary variable '%s'--" + "already exists" % assignee_name) + if assignee_name in knl.arg_dict: + raise RuntimeError("cannot create temporary variable '%s'--" + "already exists as argument" % assignee_name) + + logger.debug("%s: creating temporary %s" + % (knl.name, assignee_name)) + + new_temp_vars[assignee_name] = lp.TemporaryVariable( + name=assignee_name, + dtype=temp_var_type, + is_local=lp.auto, + base_indices=lp.auto, + shape=lp.auto, + order=default_order) + + if isinstance(insn, Assignment): + insn = insn.copy(temp_var_type=None) + else: + insn = insn.copy(temp_var_types=None) new_insns.append(insn) @@ -925,8 +969,8 @@ def guess_arg_shape_if_requested(kernel, default_order): try: for insn in kernel.instructions: - if isinstance(insn, lp.Assignment): - armap(submap(insn.assignee), kernel.insn_inames(insn)) + if isinstance(insn, lp.MultiAssignmentBase): + armap(submap(insn.assignees), kernel.insn_inames(insn)) armap(submap(insn.expression), kernel.insn_inames(insn)) except TypeError as e: from traceback import print_exc @@ -1021,25 +1065,39 @@ def apply_default_order_to_args(kernel, default_order): # {{{ resolve wildcard insn dependencies +def find_matching_insn_ids(knl, dep): + from fnmatch import fnmatchcase + + return [ + other_insn.id + for other_insn in knl.instructions + if fnmatchcase(other_insn.id, dep)] + + +def resove_wildcard_insn_ids(knl, deps): + new_deps = [] + for dep in deps: + matches = find_matching_insn_ids(knl, dep) + + if matches: + new_deps.extend(matches) + else: + # Uh, best we can do + new_deps.append(dep) + + return frozenset(new_deps) + + def resolve_wildcard_deps(knl): new_insns = [] - from fnmatch import fnmatchcase for insn in knl.instructions: if insn.depends_on is not None: - new_deps = set() - for dep in insn.depends_on: - match_count = 0 - for other_insn in knl.instructions: - if fnmatchcase(other_insn.id, dep): - new_deps.add(other_insn.id) - match_count += 1 - - if match_count == 0: - # Uh, 
best we can do - new_deps.add(dep) - - insn = insn.copy(depends_on=frozenset(new_deps)) + insn = insn.copy( + depends_on=resove_wildcard_insn_ids(knl, insn.depends_on), + no_sync_with=resove_wildcard_insn_ids( + knl, insn.no_sync_with), + ) new_insns.append(insn) @@ -1112,10 +1170,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg default_offset: 0 or :class:`loopy.auto`. The default value of *offset* in :attr:`GlobalArg` for guessed arguments. Defaults to 0. - :arg function_manglers: list of functions of signature (name, arg_dtypes) - returning a tuple (result_dtype, c_name) - or a tuple (result_dtype, c_name, arg_dtypes), - where c_name is the C-level function to be called. + :arg function_manglers: list of functions of signature + ``(target, name, arg_dtypes)`` + returning a :class:`loopy.CallMangleInfo`. :arg symbol_manglers: list of functions of signature (name) returning a tuple (result_dtype, c_name), where c_name is the C-level symbol to be evaluated. diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 0e0638491f9314de786a83cb7ccb1fedf095b9f6..b477b5fee0b047d0fb74099174e2a8c6f08336d1 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -417,9 +417,7 @@ class SubstitutionRule(Record): # }}} -# {{{ instruction - -# {{{ base class +# {{{ instructions: base class class InstructionBase(Record): """A base class for all types of instruction that can occur in @@ -430,6 +428,8 @@ class InstructionBase(Record): An (otherwise meaningless) identifier that is unique within a :class:`loopy.kernel.LoopKernel`. + .. rubric:: Instruction ordering + .. attribute:: depends_on a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances @@ -460,6 +460,21 @@ class InstructionBase(Record): (see :class:`InstructionBase.groups`) may not be active when this instruction is scheduled. + .. attribute:: priority + + Scheduling priority, an integer. Higher means 'execute sooner'. + Default 0. + + .. 
rubric :: Synchronization + + .. attribute:: no_sync_with + + a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances + with which no barrier synchronization is necessary, even given the existence + of a dependency chain and apparently conflicting writes + + .. rubric:: Conditionals + .. attribute:: predicates a :class:`frozenset` of variable names the conjunction (logical and) of @@ -467,6 +482,8 @@ should be run. Each variable name may, optionally, be preceded by an exclamation point, indicating negation. + .. rubric:: Iname dependencies + .. attribute:: forced_iname_deps_is_final A :class:`bool` determining whether :attr:`forced_iname_deps` constitutes @@ -478,10 +495,7 @@ dependencies *or* constitute the entire list of iname dependencies, depending on the value of :attr:`forced_iname_deps_is_final`. - .. attribute:: priority - - Scheduling priority, an integer. Higher means 'execute sooner'. - Default 0. + .. rubric:: Boostability .. attribute:: boostable @@ -495,6 +509,8 @@ may need to be boosted, as a heuristic help for the scheduler. Also allowed to be *None*. + .. rubric:: Tagging + ..
attribute:: tags A tuple of string identifiers that can be used to identify groups @@ -512,12 +528,14 @@ class InstructionBase(Record): fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " + "no_sync_with " "predicates " "forced_iname_deps_is_final forced_iname_deps " "priority boostable boostable_into".split()) def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, + no_sync_with, forced_iname_deps_is_final, forced_iname_deps, priority, boostable, boostable_into, predicates, tags, insn_deps=None, insn_deps_is_final=None): @@ -541,6 +559,9 @@ class InstructionBase(Record): if conflicts_with_groups is None: conflicts_with_groups = frozenset() + if no_sync_with is None: + no_sync_with = frozenset() + if forced_iname_deps_is_final is None: forced_iname_deps_is_final = False @@ -574,6 +595,7 @@ class InstructionBase(Record): id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, + no_sync_with=no_sync_with, groups=groups, conflicts_with_groups=conflicts_with_groups, forced_iname_deps_is_final=forced_iname_deps_is_final, forced_iname_deps=forced_iname_deps, @@ -681,7 +703,7 @@ class InstructionBase(Record): result.append("priority=%d" % self.priority) if self.tags: result.append("tags=%s" % ":".join(self.tags)) - if self.atomicity: + if hasattr(self, "atomicity"): result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity)) return result @@ -762,6 +784,8 @@ def _get_assignee_and_index(expr): raise RuntimeError("invalid lvalue '%s'" % expr) +# {{{ atomic ops + class memory_ordering: """Ordering of atomic operations, defined as in C11 and OpenCL. 
@@ -903,10 +927,51 @@ class AtomicUpdate(VarAtomicity): memory_ordering.to_string(self.ordering), memory_scope.to_string(self.scope)) +# }}} + + +# {{{ instruction base class: expression rhs + +class MultiAssignmentBase(InstructionBase): + """An assignment instruction with an expression as a right-hand side.""" + + fields = InstructionBase.fields | set(["expression"]) + + @memoize_method + def read_dependency_names(self): + from loopy.symbolic import get_dependencies + result = get_dependencies(self.expression) + for _, subscript in self.assignees_and_indices(): + result = result | get_dependencies(subscript) + + processed_predicates = frozenset( + pred.lstrip("!") for pred in self.predicates) + + result = result | processed_predicates + + return result + + @memoize_method + def reduction_inames(self): + def map_reduction(expr, rec): + rec(expr.expr) + for iname in expr.inames: + result.add(iname) + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(map_reduction) + + result = set() + cb_mapper(self.expression) + + return result + +# }}} + -# {{{ assignment +# {{{ instruction: assignment -class Assignment(InstructionBase): +class Assignment(MultiAssignmentBase): """ .. attribute:: assignee @@ -957,8 +1022,8 @@ class Assignment(InstructionBase): .. 
automethod:: __init__ """ - fields = InstructionBase.fields | \ - set("assignee expression temp_var_type atomicity".split()) + fields = MultiAssignmentBase.fields | \ + set("assignee temp_var_type atomicity".split()) def __init__(self, assignee, expression, @@ -967,6 +1032,7 @@ class Assignment(InstructionBase): depends_on_is_final=None, groups=None, conflicts_with_groups=None, + no_sync_with=None, forced_iname_deps_is_final=None, forced_iname_deps=frozenset(), boostable=None, boostable_into=None, tags=None, @@ -974,12 +1040,13 @@ class Assignment(InstructionBase): priority=0, predicates=frozenset(), insn_deps=None, insn_deps_is_final=None): - InstructionBase.__init__(self, + super(Assignment, self).__init__( id=id, depends_on=depends_on, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, + no_sync_with=no_sync_with, forced_iname_deps_is_final=forced_iname_deps_is_final, forced_iname_deps=forced_iname_deps, boostable=boostable, @@ -994,7 +1061,7 @@ class Assignment(InstructionBase): if isinstance(assignee, str): assignee = parse(assignee) if isinstance(expression, str): - assignee = parse(expression) + expression = parse(expression) # FIXME: It may be worth it to enable this check eventually. 
# For now, it causes grief with certain 'checky' uses of the @@ -1011,35 +1078,6 @@ class Assignment(InstructionBase): # {{{ implement InstructionBase interface - @memoize_method - def read_dependency_names(self): - from loopy.symbolic import get_dependencies - result = get_dependencies(self.expression) - for _, subscript in self.assignees_and_indices(): - result = result | get_dependencies(subscript) - - processed_predicates = frozenset( - pred.lstrip("!") for pred in self.predicates) - - result = result | processed_predicates - - return result - - @memoize_method - def reduction_inames(self): - def map_reduction(expr, rec): - rec(expr.expr) - for iname in expr.inames: - result.add(iname) - - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) - - result = set() - cb_mapper(self.expression) - - return result - @memoize_method def assignees_and_indices(self): return [_get_assignee_and_index(self.assignee)] @@ -1078,6 +1116,18 @@ class Assignment(InstructionBase): else: key_builder.rec(key_hash, getattr(self, field_name)) + # {{{ for interface uniformity with CallInstruction + + @property + def temp_var_types(self): + return (self.temp_var_type,) + + @property + def assignees(self): + return (self.assignee,) + + # }}} + class ExpressionInstruction(Assignment): def __init__(self, *args, **kwargs): @@ -1090,6 +1140,162 @@ class ExpressionInstruction(Assignment): # }}} +# {{{ instruction: function call + +class CallInstruction(MultiAssignmentBase): + """An instruction capturing a function call. Unlike :class:`Assignment`, + this instruction supports functions with multiple return values. + + .. attribute:: assignees + + .. attribute:: expression + + The following attributes are only used until + :func:`loopy.make_kernel` is finished: + + .. attribute:: temp_var_types + + if not *None*, a type that will be assigned to the new temporary variable + created from the assignee + + .. 
automethod:: __init__ + """ + + fields = MultiAssignmentBase.fields | \ + set("assignees temp_var_types".split()) + + def __init__(self, + assignees, expression, + id=None, + depends_on=None, + depends_on_is_final=None, + groups=None, + conflicts_with_groups=None, + no_sync_with=None, + forced_iname_deps_is_final=None, + forced_iname_deps=frozenset(), + boostable=None, boostable_into=None, tags=None, + temp_var_types=None, + priority=0, predicates=frozenset(), + insn_deps=None, insn_deps_is_final=None): + + super(CallInstruction, self).__init__( + id=id, + depends_on=depends_on, + depends_on_is_final=depends_on_is_final, + groups=groups, + conflicts_with_groups=conflicts_with_groups, + no_sync_with=no_sync_with, + forced_iname_deps_is_final=forced_iname_deps_is_final, + forced_iname_deps=forced_iname_deps, + boostable=boostable, + boostable_into=boostable_into, + priority=priority, + predicates=predicates, + tags=tags, + insn_deps=insn_deps, + insn_deps_is_final=insn_deps_is_final) + + from pymbolic.primitives import Call + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, Reduction)) and expression is not None: + raise LoopyError("'expression' argument to CallInstruction " + "must be a function call") + + from loopy.symbolic import parse + if isinstance(assignees, str): + assignees = parse(assignees) + if isinstance(expression, str): + expression = parse(expression) + + # FIXME: It may be worth it to enable this check eventually. + # For now, it causes grief with certain 'checky' uses of the + # with_transformed_expressions(). 
(notably the access checker) + # + # from pymbolic.primitives import Variable, Subscript + # if not isinstance(assignee, (Variable, Subscript)): + # raise LoopyError("invalid lvalue '%s'" % assignee) + + self.assignees = assignees + self.expression = expression + self.temp_var_types = temp_var_types + + # {{{ implement InstructionBase interface + + @memoize_method + def assignees_and_indices(self): + return [_get_assignee_and_index(a) for a in self.assignees] + + def with_transformed_expressions(self, f, *args): + return self.copy( + assignees=f(self.assignees, *args), + expression=f(self.expression, *args)) + + # }}} + + def __str__(self): + result = "%s: %s <- %s" % (self.id, + ", ".join(str(a) for a in self.assignees), + self.expression) + + options = self.get_str_options() + if options: + result += " (%s)" % (": ".join(options)) + + if self.predicates: + result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) + return result + + def update_persistent_hash(self, key_hash, key_builder): + """Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. + """ + + # Order matters for hash forming--sort the fields. 
+ for field_name in sorted(self.fields): + if field_name in ["assignees", "expression"]: + key_builder.update_for_pymbolic_expression( + key_hash, getattr(self, field_name)) + else: + key_builder.rec(key_hash, getattr(self, field_name)) + +# }}} + + +def make_assignment(assignees, expression, temp_var_types=None, **kwargs): + if len(assignees) < 1: + raise LoopyError("every instruction must have a left-hand side") + elif len(assignees) > 1: + atomicity = kwargs.pop("atomicity", ()) + if atomicity: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + + from pymbolic.primitives import Call + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction") + + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + + else: + return Assignment( + assignee=assignees[0], + expression=expression, + temp_var_type=( + temp_var_types[0] + if temp_var_types is not None + else None), + **kwargs) + + # {{{ c instruction class CInstruction(InstructionBase): @@ -1121,9 +1327,10 @@ class CInstruction(InstructionBase): .. attribute:: assignees - A sequence of variable references (with or without subscript) as - :class:`pymbolic.primitives.Expression` instances that :attr:`code` - writes to. This is optional and only used for figuring out dependencies. + A sequence (typically a :class:`tuple`) of variable references (with or + without subscript) as :class:`pymbolic.primitives.Expression` instances + that :attr:`code` writes to. This is optional and only used for + figuring out dependencies. 
""" fields = InstructionBase.fields | \ @@ -1131,9 +1338,10 @@ class CInstruction(InstructionBase): def __init__(self, iname_exprs, code, - read_variables=frozenset(), assignees=frozenset(), + read_variables=frozenset(), assignees=tuple(), id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, + no_sync_with=None, forced_iname_deps_is_final=None, forced_iname_deps=frozenset(), priority=0, boostable=None, boostable_into=None, predicates=frozenset(), tags=None, @@ -1153,6 +1361,7 @@ class CInstruction(InstructionBase): depends_on=depends_on, depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, + no_sync_with=no_sync_with, forced_iname_deps_is_final=forced_iname_deps_is_final, forced_iname_deps=forced_iname_deps, boostable=boostable, @@ -1263,6 +1472,35 @@ class CInstruction(InstructionBase): # }}} + +# {{{ function call mangling + +class CallMangleInfo(Record): + """ + .. attribute:: target_name + + A string. The name of the function to be called in the + generated target code. + + .. attribute:: result_dtypes + + A tuple of :class:`LoopyType` instances indicating what + types of values the function returns. + + .. attribute:: arg_dtypes + + A tuple of :class:`LoopyType` instances indicating what + types of arguments the function actually receives. 
+ """ + + def __init__(self, target_name, result_dtypes, arg_dtypes): + assert isinstance(result_dtypes, tuple) + + super(CallMangleInfo, self).__init__( + target_name=target_name, + result_dtypes=result_dtypes, + arg_dtypes=arg_dtypes) + # }}} # vim: foldmethod=marker diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ef3cb3c8a4fdb25fa655c74d642f720dad825af2..0a02979ab32ad45cc4b1f6519c2bd327fa150812 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -423,11 +423,11 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): dep_graph = {} lines = [] - from loopy.kernel.data import Assignment, CInstruction + from loopy.kernel.data import MultiAssignmentBase, CInstruction for insn in kernel.instructions: - if isinstance(insn, Assignment): - op = "%s <- %s" % (insn.assignee, insn.expression) + if isinstance(insn, MultiAssignmentBase): + op = "%s <- %s" % (insn.assignees, insn.expression) if len(op) > 200: op = op[:200] + "..." @@ -657,8 +657,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): from pymbolic.primitives import Subscript - if isinstance(insn.assignee, Subscript): - ary_acc_exprs.append(insn.assignee) + for assignee in insn.assignees: + if isinstance(assignee, Subscript): + ary_acc_exprs.append(assignee) # }}} @@ -855,7 +856,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): import loopy as lp for insn in kernel.instructions: - if not isinstance(insn, lp.Assignment): + if not isinstance(insn, lp.MultiAssignmentBase): continue auto_axis_inames = [ diff --git a/loopy/library/function.py b/loopy/library/function.py index df623a4770f4f14a7952ee2e0edbf59939de1cfd..efa590371bb632cbc9776078ea6b5c64f626d46a 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -38,7 +38,9 @@ def default_function_mangler(kernel, name, arg_dtypes): def single_arg_function_mangler(kernel, name, arg_dtypes): if len(arg_dtypes) == 1: dtype, = arg_dtypes - return dtype, name + + from 
loopy.kernel.data import CallMangleInfo + return CallMangleInfo(name, (dtype,), (dtype,)) return None diff --git a/loopy/library/random123.py b/loopy/library/random123.py new file mode 100644 index 0000000000000000000000000000000000000000..7d04b8c7330f88af9ee1d79fe19fd87b29b70050 --- /dev/null +++ b/loopy/library/random123.py @@ -0,0 +1,222 @@ +"""Library integration with Random123.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from pytools import Record +from mako.template import Template +import numpy as np + + +# {{{ rng metadata + +class RNGInfo(Record): + @property + def full_name(self): + return "%s%dx%d" % (self.name, self.width, self.bits) + + +_philox_base_info = RNGInfo( + name="philox", + pyopencl_header="pyopencl-random123/philox.cl", + generic_header="Random123/philox.h", + key_width=2) + +_threefry_base_info = RNGInfo( + name="threefry", + pyopencl_header="pyopencl-random123/threefry.cl", + generic_header="Random123/threefry.h", + key_width=4) + +RNG_VARIANTS = [ + _philox_base_info.copy(width=2, bits=32), + _philox_base_info.copy(width=2, bits=64), + _philox_base_info.copy(width=4, bits=32), + _philox_base_info.copy(width=4, bits=64), + + _threefry_base_info.copy(width=2, bits=32), + _threefry_base_info.copy(width=2, bits=64), + _threefry_base_info.copy(width=4, bits=32), + _threefry_base_info.copy(width=4, bits=64), + ] + +FUNC_NAMES_TO_RNG = dict( + (v.full_name + suffix, v) + for v in RNG_VARIANTS + for suffix in [ + "", "_f32", "_f64", + ]) + +# }}} + + +# {{{ preamble + +PREAMBLE_TEMPLATE = Template(""" +%if is_pyopencl_target: +#include <${ rng_variant.pyopencl_header }> +%else: +#include <${ rng_variant.generic_header }> +%endif + +<% +name = rng_variant.full_name +width = rng_variant.width +if rng_variant.bits == 32: + counter_type = "uint%d" % width + key_type = "uint%d" % rng_variant.key_width +elif rng_variant.bits == 64: + counter_type = "ulong%d" % width + key_type = "ulong%d" % rng_variant.key_width +else: + assert False +%> + +typedef union { + ${ counter_type } v; + ${ name }_ctr_t c; +} ${ name }_ctr_vec_union; + + +${ counter_type } ${ name }_bump(${ counter_type } ctr) +{ + if (++ctr.x == 0) + if (++ctr.y == 0) + ++ctr.z; + return ctr; +} + +${ counter_type } ${ name }_gen( + ${ counter_type } ctr, + ${ key_type } key, + ${ counter_type } *new_ctr) +{ + ${ name }_ctr_vec_union result; + result.c = ${ name }( + *(${ name }_ctr_t *) &ctr, + *(${ 
name }_key_t *) &key); + *new_ctr = ${ name }_bump(ctr); + return result.v; +} + +float${ width } ${ name }_f32( + ${ counter_type } ctr, + ${ key_type } key, + ${ counter_type } *new_ctr) +{ + *new_ctr = ctr; + return + convert_float${ width }(${ name }_gen(*new_ctr, key, new_ctr)) + * ${ repr(1./2**32) }f; +} + +double${ width } ${ name }_f64( + ${ counter_type } ctr, + ${ key_type } key, + ${ counter_type } *new_ctr) +{ + *new_ctr = ctr; + %if rng_variant.bits == 32: + return + convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr)) + * ${ repr(1./2**32) } + + + convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr)) + * ${ repr(1./2**64) }; + + %elif rng_variant.bits == 64: + *new_ctr = ctr; + return + convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr)) + * ${ repr(1./2**64) }; + + %else: + #error Unrecognized bit width in RNG + + %endif +} + +""", strict_undefined=True) + +# }}} + + +def random123_preamble_generator(preamble_info): + for f in preamble_info.seen_functions: + try: + rng_variant = FUNC_NAMES_TO_RNG[f.name] + except KeyError: + continue + + from loopy.target.pyopencl import PyOpenCLTarget + yield ("90-random123-"+rng_variant.full_name, + PREAMBLE_TEMPLATE.render( + is_pyopencl_target=isinstance( + preamble_info.kernel.target, + PyOpenCLTarget), + rng_variant=rng_variant, + )) + + +def random123_function_mangler(kernel, name, arg_dtypes): + try: + rng_variant = FUNC_NAMES_TO_RNG[name] + except KeyError: + return None + + from loopy.types import NumpyType + target = kernel.target + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + from loopy.kernel.data import CallMangleInfo + fn = rng_variant.full_name + if name == fn: + return CallMangleInfo( + target_name=fn+"_gen", + result_dtypes=(ctr_dtype, ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif 
name == fn + "_f32": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float32), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif name == fn + "_f64": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float64), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + else: + return None + +# vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index b39115a355069cadf698b466b55c2fc3c6b5a4e0..8a38eebd55b003c624b386bcdf296d2b97e2c97c 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,7 +36,7 @@ class ReductionOperation(object): equality-comparable. """ - def result_dtype(self, target, arg_dtype, inames): + def result_dtypes(self, target, arg_dtype, inames): raise NotImplementedError def neutral_element(self, dtype, inames): @@ -82,11 +82,12 @@ class ScalarReductionOperation(ReductionOperation): """ self.forced_result_type = forced_result_type - def result_dtype(self, target, arg_dtype, inames): + def result_dtypes(self, kernel, arg_dtype, inames): if self.forced_result_type is not None: - return self.parse_result_type(target, self.forced_result_type) + return (self.parse_result_type( + kernel.target, self.forced_result_type),) - return arg_dtype + return (arg_dtype,) def __hash__(self): return hash((type(self), self.forced_result_type)) @@ -148,23 +149,12 @@ class MinReductionOperation(ScalarReductionOperation): # {{{ argmin/argmax -ARGEXT_STRUCT_DTYPES = {} - - class _ArgExtremumReductionOperation(ReductionOperation): def prefix(self, dtype): return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__) - def result_dtype(self, target, dtype, inames): - try: - return ARGEXT_STRUCT_DTYPES[dtype] - except KeyError: - struct_dtype = np.dtype([("value", dtype), ("index", np.int32)]) - ARGEXT_STRUCT_DTYPES[dtype] = NumpyType(struct_dtype, target) - - 
target.get_or_register_dtype(self.prefix(dtype)+"_result", - NumpyType(struct_dtype)) - return ARGEXT_STRUCT_DTYPES[dtype] + def result_dtypes(self, kernel, dtype, inames): + return (dtype, kernel.index_dtype) def neutral_element(self, dtype, inames): return ArgExtFunction(self, dtype, "init", inames)() @@ -179,7 +169,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): iname, = inames return ArgExtFunction(self, dtype, "update", inames)( - operand1, operand2, var(iname)) + *(operand1 + (operand2, var(iname)))) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -207,7 +197,7 @@ class ArgExtFunction(FunctionIdentifier): return (self.reduction_op, self.scalar_dtype, self.name, self.inames) -def get_argext_preamble(target, func_id): +def get_argext_preamble(kernel, func_id): op = func_id.reduction_op prefix = op.prefix(func_id.scalar_dtype) @@ -216,35 +206,32 @@ def get_argext_preamble(target, func_id): c_code_mapper = CCodeMapper() return (prefix, """ - typedef struct { - %(scalar_type)s value; - int index; - } %(type_name)s; - - inline %(type_name)s %(prefix)s_init() + inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out) { - %(type_name)s result; - result.value = %(neutral)s; - result.index = INT_MIN; - return result; + *index_out = INT_MIN; + return %(neutral)s; } - inline %(type_name)s %(prefix)s_update( - %(type_name)s state, %(scalar_type)s op2, int index) + inline %(scalar_t)s %(prefix)s_update( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) { - %(type_name)s result; - if (op2 %(comp)s state.value) + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else { - result.value = op2; - result.index = index; - return result; + *index_out = index1; + return op1; } - else return state; } """ % dict( - type_name=prefix+"_result", - scalar_type=target.dtype_to_typename(func_id.scalar_dtype), + scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), 
prefix=prefix, + index_t=kernel.target.dtype_to_typename(kernel.index_dtype), neutral=c_code_mapper( op.neutral_sign*get_le_neutral(func_id.scalar_dtype)), comp=op.update_comparison, @@ -302,14 +289,39 @@ def parse_reduction_op(name): def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtFunction): + if isinstance(func_id, ArgExtFunction) and func_id.name == "init": from loopy.target.opencl import OpenCLTarget if not isinstance(kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") op = func_id.reduction_op - return (op.result_dtype(kernel.target, func_id.scalar_dtype, func_id.inames), - "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name)) + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_init" % op.prefix(func_id.scalar_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.inames), + arg_dtypes=(), + ) + + elif isinstance(func_id, ArgExtFunction) and func_id.name == "update": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_update" % op.prefix(func_id.scalar_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.inames), + arg_dtypes=( + func_id.scalar_dtype, + kernel.index_dtype, + func_id.scalar_dtype, + kernel.index_dtype), + ) return None @@ -322,6 +334,6 @@ def reduction_preamble_generator(preamble_info): if not isinstance(preamble_info.kernel.target, OpenCLTarget): raise LoopyError("only OpenCL supported for now") - yield get_argext_preamble(preamble_info.kernel.target, func.name) + yield get_argext_preamble(preamble_info.kernel, func.name) # vim: fdm=marker diff --git a/loopy/maxima.py b/loopy/maxima.py index 
1a2d0770c78af6506664aaed17f99e90e200875c..29f974ff9d1f6f0a6e6d4311eca88201559e6854 100644 --- a/loopy/maxima.py +++ b/loopy/maxima.py @@ -82,7 +82,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix): if not isinstance(insn, InstructionBase): insn = kernel.id_to_insn[insn] if not isinstance(insn, Assignment): - raise RuntimeError("non-expression instructions not supported " + raise RuntimeError("non-single-output assignment not supported " "in maxima export") for dep in insn.depends_on: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e30d3bcb3f441036bab6558bc2d8691031a1c2e4..2fdadb48e790d2cd05cc00f4d04c31957505aaa9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -31,6 +31,7 @@ from loopy.diagnostic import ( from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from loopy.kernel.data import make_assignment import logging logger = logging.getLogger(__name__) @@ -123,14 +124,26 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): from loopy.diagnostic import DependencyTypeInferenceFailure for writer_insn_id in kernel.writer_map().get(var_name, []): writer_insn = kernel.id_to_insn[writer_insn_id] - if not isinstance(writer_insn, lp.Assignment): + if not isinstance(writer_insn, lp.MultiAssignmentBase): continue expr = subst_expander(writer_insn.expression) try: debug(" via expr %s" % expr) - result = type_inf_mapper(expr) + if isinstance(writer_insn, lp.Assignment): + result = type_inf_mapper(expr) + elif isinstance(writer_insn, lp.CallInstruction): + result_dtypes = type_inf_mapper(expr, multiple_types_ok=True) + + result = None + for (assignee, _), comp_dtype in zip( + writer_insn.assignees_and_indices(), result_dtypes): + if assignee == var_name: + result = comp_dtype + break + + assert result is not None debug(" result: %s" % result) @@ -319,9 +332,10 @@ def mark_local_temporaries(kernel): if 
isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase)) locparallel_assignee_inames = set(iname - for _, assignee_indices in insn.assignees_and_indices() - for iname in get_dependencies(assignee_indices) + for aname, aindices in insn.assignees_and_indices() + for iname in get_dependencies(aindices) & kernel.all_inames() + if aname == temp_var.name if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase)) assert locparallel_assignee_inames <= locparallel_compute_inames @@ -400,7 +414,9 @@ def add_default_dependencies(kernel): % var) if len(var_writers) == 1: - auto_deps.update(var_writers - set([insn.id])) + auto_deps.update( + var_writers + - set([insn.id])) # }}} @@ -436,35 +452,48 @@ def realize_reduction(kernel, insn_id_filter=None): new_insns = [] + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() from loopy.expression import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel) - def map_reduction(expr, rec): + def map_reduction(expr, rec, multiple_values_ok=False): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
- from pymbolic import var - - target_var_name = var_name_gen("acc_"+"_".join(expr.inames)) - target_var = var(target_var_name) - try: arg_dtype = type_inf_mapper(expr.expr) except DependencyTypeInferenceFailure: raise LoopyError("failed to determine type of accumulator for " "reduction '%s'" % expr) - from loopy.kernel.data import Assignment, TemporaryVariable + arg_dtype = arg_dtype.with_target(kernel.target) + + reduction_dtypes = expr.operation.result_dtypes( + kernel, arg_dtype, expr.inames) + reduction_dtypes = tuple( + dt.with_target(kernel.target) for dt in reduction_dtypes) + + ncomp = len(reduction_dtypes) + + from pymbolic import var + + acc_var_names = [ + var_name_gen("acc_"+"_".join(expr.inames)) + for i in range(ncomp)] + acc_vars = tuple(var(n) for n in acc_var_names) + + from loopy.kernel.data import TemporaryVariable - new_temporary_variables[target_var_name] = TemporaryVariable( - name=target_var_name, - shape=(), - dtype=expr.operation.result_dtype( - kernel.target, arg_dtype, expr.inames), - is_local=False) + for name, dtype in zip(acc_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=(), + dtype=dtype, + is_local=False) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -472,41 +501,52 @@ def realize_reduction(kernel, insn_id_filter=None): raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - init_id = temp_kernel.make_unique_instruction_id( - based_on="%s_%s_init" % (insn.id, "_".join(expr.inames)), - extra_used_ids=set(i.id for i in generated_insns)) + init_id = insn_id_gen( + "%s_%s_init" % (insn.id, "_".join(expr.inames))) - init_insn = Assignment( + init_insn = make_assignment( id=init_id, - assignee=target_var, + assignees=acc_vars, forced_iname_deps=outer_insn_inames - frozenset(expr.inames), + forced_iname_deps_is_final=insn.forced_iname_deps_is_final, 
depends_on=frozenset(), expression=expr.operation.neutral_element(arg_dtype, expr.inames)) generated_insns.append(init_insn) - update_id = temp_kernel.make_unique_instruction_id( - based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)), - extra_used_ids=set(i.id for i in generated_insns)) + update_id = insn_id_gen( + based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames) + if insn.forced_iname_deps_is_final: + update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames) - reduction_insn = Assignment( + reduction_insn = make_assignment( id=update_id, - assignee=target_var, + assignees=acc_vars, expression=expr.operation( - arg_dtype, target_var, expr.expr, expr.inames), + arg_dtype, + acc_vars if len(acc_vars) > 1 else acc_vars[0], + expr.expr, expr.inames), depends_on=frozenset([init_insn.id]) | insn.depends_on, - forced_iname_deps=temp_kernel.insn_inames(insn) | set(expr.inames)) + forced_iname_deps=update_insn_iname_deps, + forced_iname_deps_is_final=insn.forced_iname_deps_is_final) generated_insns.append(reduction_insn) new_insn_depends_on.add(reduction_insn.id) - return target_var + if multiple_values_ok: + return acc_vars + else: + assert len(acc_vars) == 1 + return acc_vars[0] from loopy.symbolic import ReductionCallbackMapper cb_mapper = ReductionCallbackMapper(map_reduction) insn_queue = kernel.instructions[:] + insn_id_replacements = {} temp_kernel = kernel @@ -518,24 +558,47 @@ def realize_reduction(kernel, insn_id_filter=None): insn = insn_queue.pop(0) if insn_id_filter is not None and insn.id != insn_id_filter \ - or not isinstance(insn, lp.Assignment): + or not isinstance(insn, lp.MultiAssignmentBase): new_insns.append(insn) continue # Run reduction expansion. 
- new_expression = cb_mapper(insn.expression) + from loopy.symbolic import Reduction + if isinstance(insn.expression, Reduction): + new_expressions = cb_mapper(insn.expression, multiple_values_ok=True) + else: + new_expressions = (cb_mapper(insn.expression),) if generated_insns: # An expansion happened, so insert the generated stuff plus # ourselves back into the queue. - insn = insn.copy( - expression=new_expression, - depends_on=insn.depends_on - | frozenset(new_insn_depends_on), - forced_iname_deps=temp_kernel.insn_inames(insn)) + kwargs = insn.get_copy_kwargs( + depends_on=insn.depends_on + | frozenset(new_insn_depends_on), + forced_iname_deps=temp_kernel.insn_inames(insn)) + + kwargs.pop("id") + kwargs.pop("expression") + kwargs.pop("assignee", None) + kwargs.pop("assignees", None) + kwargs.pop("temp_var_type", None) + kwargs.pop("temp_var_types", None) - insn_queue = generated_insns + [insn] + insn_queue + replacement_insns = [ + lp.Assignment( + id=insn_id_gen(insn.id), + assignee=assignee, + expression=new_expr, + **kwargs) + for assignee, new_expr in zip(insn.assignees, new_expressions)] + + insn_id_replacements[insn.id] = [ + rinsn.id for rinsn in replacement_insns] + + # FIXME: Track dep rename + + insn_queue = generated_insns + replacement_insns + insn_queue # The reduction expander needs an up-to-date kernel # object to find dependencies. Keep temp_kernel up-to-date. 
@@ -550,10 +613,12 @@ def realize_reduction(kernel, insn_id_filter=None): new_insns.append(insn) - return kernel.copy( + kernel = kernel.copy( instructions=new_insns, temporary_variables=new_temporary_variables) + return lp.replace_instruction_ids(kernel, insn_id_replacements) + # }}} diff --git a/loopy/schedule.py b/loopy/schedule.py index 056030cf56f7b1a6e00cddd64c81cced6fd37c8f..7ac8778378869f7c9cf6cfbe8d4be145e9fad520 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -355,7 +355,7 @@ def format_insn(kernel, insn_id): Style = kernel.options._style return "[%s] %s%s%s <- %s%s%s" % ( format_insn_id(kernel, insn_id), - Fore.BLUE, str(insn.assignee), Style.RESET_ALL, + Fore.BLUE, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) @@ -363,7 +363,7 @@ def dump_schedule(kernel, schedule): lines = [] indent = "" - from loopy.kernel.data import Assignment + from loopy.kernel.data import MultiAssignmentBase for sched_item in schedule: if isinstance(sched_item, EnterLoop): lines.append(indent + "LOOP %s" % sched_item.iname) @@ -379,13 +379,13 @@ def dump_schedule(kernel, schedule): lines.append(indent + "RETURN FROM KERNEL %s" % sched_item.kernel_name) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] - if isinstance(insn, Assignment): + if isinstance(insn, MultiAssignmentBase): insn_str = format_insn(kernel, sched_item.insn_id) else: insn_str = sched_item.insn_id lines.append(indent + insn_str) elif isinstance(sched_item, Barrier): - lines.append(indent + "---BARRIER---") + lines.append(indent + "---BARRIER:%s---" % sched_item.kind) else: assert False @@ -1057,6 +1057,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if reverse: source, target = target, source + if source.id in target.no_sync_with: + return None + # {{{ check that a dependency exists dep_descr = None diff --git a/loopy/statistics.py b/loopy/statistics.py index 
c5eb3142d73efe0b48960bc8ed7ef1aead20d0de..1a044b8d7f389fbb8e58d3fa597932f8c56ac97f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -31,8 +31,8 @@ import islpy as isl from pytools import memoize_in from pymbolic.mapper import CombineMapper from functools import reduce -from loopy.kernel.data import Assignment -from loopy.diagnostic import warn +from loopy.kernel.data import MultiAssignmentBase +from loopy.diagnostic import warn, LoopyError __doc__ = """ @@ -47,6 +47,9 @@ __doc__ = """ .. autofunction:: get_barrier_poly +.. autofunction:: gather_access_footprints +.. autofunction:: gather_access_footprint_bytes + """ @@ -415,9 +418,10 @@ class GlobalSubscriptCounter(CombineMapper): # {{{ AccessFootprintGatherer class AccessFootprintGatherer(CombineMapper): - def __init__(self, kernel, domain): + def __init__(self, kernel, domain, ignore_uncountable=False): self.kernel = kernel self.domain = domain + self.ignore_uncountable = ignore_uncountable @staticmethod def combine(values): @@ -456,10 +460,17 @@ class AccessFootprintGatherer(CombineMapper): self.kernel.assumptions) except isl.Error: # Likely: index was non-linear, nothing we can do. - return + if self.ignore_uncountable: + return {} + else: + raise LoopyError("failed to gather footprint: %s" % expr) + except TypeError: # Likely: index was non-linear, nothing we can do. - return + if self.ignore_uncountable: + return {} + else: + raise LoopyError("failed to gather footprint: %s" % expr) from pymbolic.primitives import Variable assert isinstance(expr.aggregate, Variable) @@ -838,8 +849,16 @@ def get_barrier_poly(knl): # {{{ gather_access_footprints -def gather_access_footprints(kernel): - # TODO: Docs +def gather_access_footprints(kernel, ignore_uncountable=False): + """Return a dictionary mapping ``(var_name, direction)`` + to :class:`islpy.Set` instances capturing which indices + of each the array *var_name* are read/written (where + *direction* is either ``read`` or ``write``. 
+
+    :arg ignore_uncountable: If *False*, an error will be raised for
+        accesses on which the footprint cannot be determined (e.g.
+        data-dependent or nonlinear indices)
+    """
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
@@ -849,7 +868,7 @@ def gather_access_footprints(kernel):
     read_footprints = []
 
     for insn in kernel.instructions:
-        if not isinstance(insn, Assignment):
+        if not isinstance(insn, MultiAssignmentBase):
             warn(kernel, "count_non_assignment",
                     "Non-assignment instruction encountered in "
                     "gather_access_footprints, not counted")
@@ -859,9 +878,11 @@ def gather_access_footprints(kernel):
         inames_domain = kernel.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(insn_inames,
             [dim_type.set]))
-        afg = AccessFootprintGatherer(kernel, domain)
+        afg = AccessFootprintGatherer(kernel, domain,
+                ignore_uncountable=ignore_uncountable)
 
-        write_footprints.append(afg(insn.assignee))
+        for assignee in insn.assignees:
+            write_footprints.append(afg(assignee))
         read_footprints.append(afg(insn.expression))
 
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
@@ -877,6 +898,35 @@ def gather_access_footprints(kernel):
 
     return result
 
+
+def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
+    """Return a dictionary mapping ``(var_name, direction)`` to
+    :class:`islpy.PwQPolynomial` instances capturing the number of bytes
+    read/written (where *direction* is either ``read`` or ``write``) on array
+    *var_name*.
+
+    :arg ignore_uncountable: If *False*, an error will be raised for
+        accesses on which the footprint cannot be determined (e.g.
+ data-dependent or nonlinear indices) + """ + + result = {} + fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable) + + for key, var_fp in fp.items(): + vname, direction = key + + var_descr = kernel.get_var_descriptor(vname) + bytes_transferred = ( + int(var_descr.dtype.numpy_dtype.itemsize) + * count(kernel, var_fp)) + if key in result: + result[key] += bytes_transferred + else: + result[key] = bytes_transferred + + return result + # }}} # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b887c703420d092d7f3c0fc9c729dd1d1f942a76..219d66d49a47882bf76bd8c89f7298c8597dfa0e 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -33,7 +33,7 @@ from pytools import memoize, memoize_method, Record import pytools.lex from pymbolic.primitives import ( - Leaf, AlgebraicLeaf, Variable, + Leaf, AlgebraicLeaf, Expression, Variable, CommonSubexpression) from pymbolic.mapper import ( @@ -98,6 +98,9 @@ class IdentityMapperMixin(object): # leaf, doesn't change return expr + def map_type_annotation(self, expr, *args): + return TypeAnnotation(expr.type, self.rec(expr.child)) + map_linear_subscript = IdentityMapperBase.map_subscript @@ -321,6 +324,18 @@ class TypedCSE(CommonSubexpression): return dict(dtype=self.dtype) +class TypeAnnotation(Expression): + def __init__(self, type, child): + super(TypeAnnotation, self).__init__() + self.type = type + self.child = child + + def __getinitargs__(self): + return (self.type, self.child) + + mapper_method = intern("map_type_annotation") + + class TaggedVariable(Variable): """This is an identifier with a tag, such as 'matrix$one', where 'one' identifies this specific use of the identifier. 
This mechanism @@ -882,7 +897,6 @@ class FunctionToPrimitiveMapper(IdentityMapper): # {{{ customization to pymbolic parser _open_dbl_bracket = intern("open_dbl_bracket") -_close_dbl_bracket = intern("close_dbl_bracket") TRAILING_FLOAT_TAG_RE = re.compile("^(.*?)([a-zA-Z]*)$") @@ -908,6 +922,26 @@ class LoopyParser(ParserBase): else: return float(val) # generic float + def parse_prefix(self, pstate): + from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + if pstate.is_next(_less): + pstate.advance() + if pstate.is_next(_greater): + typename = None + pstate.advance() + else: + pstate.expect(_identifier) + typename = pstate.next_str() + pstate.advance() + pstate.expect(_greater) + pstate.advance() + + return TypeAnnotation( + typename, + self.parse_expression(pstate, _PREC_UNARY)) + else: + return super(LoopyParser, self).parse_prefix(pstate) + def parse_postfix(self, pstate, min_precedence, left_exp): from pymbolic.parser import _PREC_CALL, _closebracket if pstate.next_tag() is _open_dbl_bracket and _PREC_CALL > min_precedence: @@ -1079,10 +1113,10 @@ class ReductionCallbackMapper(IdentityMapper): def __init__(self, callback): self.callback = callback - def map_reduction(self, expr): - result = self.callback(expr, self.rec) + def map_reduction(self, expr, **kwargs): + result = self.callback(expr, self.rec, **kwargs) if result is None: - return IdentityMapper.map_reduction(self, expr) + return IdentityMapper.map_reduction(self, expr, **kwargs) return result # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 42a9e5a3dc4c88cceadb3399b5399d609b877cfb..cad451c2ed041875f9dd9535d73ce20e8ac9e057 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -133,6 +133,9 @@ class TargetBase(object): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() + def generate_multiple_assignment(self, codegen_state, insn): + raise NotImplementedError() + def 
generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic update in target %s" % type(self).__name__) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 822ee1838d5c0bf56eed98e9c3b36f34583ce033..5e35075623c346f908468ae3855f78883da8d652 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -341,6 +341,71 @@ class CTarget(TargetBase): "++%s" % iname, inner) + def generate_multiple_assignment(self, codegen_state, insn): + ecm = codegen_state.expression_to_code_mapper + + from pymbolic.primitives import Variable + from pymbolic.mapper.stringifier import PREC_NONE + + func_id = insn.expression.function + parameters = insn.expression.parameters + + if isinstance(func_id, Variable): + func_id = func_id.name + + assignee_var_descriptors = [codegen_state.kernel.get_var_descriptor(a) + for a, _ in insn.assignees_and_indices()] + + par_dtypes = tuple(ecm.infer_type(par) for par in parameters) + + str_parameters = None + + mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) + if mangle_result is None: + raise RuntimeError("function '%s' unknown--" + "maybe you need to register a function mangler?" 
+ % func_id) + + assert mangle_result.arg_dtypes is not None + + from loopy.expression import dtype_to_type_context + str_parameters = [ + ecm(par, PREC_NONE, + dtype_to_type_context(self, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, mangle_result.arg_dtypes)] + + from loopy.codegen import SeenFunction + codegen_state.seen_functions.add( + SeenFunction(func_id, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + for i, (a, tgt_dtype) in enumerate( + zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): + if tgt_dtype != ecm.infer_type(a): + raise LoopyError("type mismatch in %d'th (1-based) left-hand " + "side of instruction '%s'" % (i+1, insn.id)) + str_parameters.append( + "&(%s)" % ecm(a, PREC_NONE, + dtype_to_type_context(self, tgt_dtype), + tgt_dtype)) + + result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + + result = ecm.wrap_in_typecast( + mangle_result.result_dtypes[0], + assignee_var_descriptors[0].dtype, + result) + + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + + from cgen import Assign + return Assign( + lhs_code, + result) + # }}} # vim: foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 63a053c586362a10fa2eea70b83b2659646f62d0..11686dcdc3b691dbab6dbdc716729637e6dc96b7 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -34,7 +34,7 @@ import islpy as isl from loopy.expression import dtype_to_type_context, TypeInferenceMapper -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, LoopyWarning from loopy.tools import is_integer from loopy.types import LoopyType @@ -94,22 +94,23 @@ class LoopyCCodeMapper(RecursiveMapper): return ary - def rec(self, expr, prec, type_context=None, needed_dtype=None): - if needed_dtype is None: - return RecursiveMapper.rec(self, expr, prec, type_context) - - actual_type = 
self.infer_type(expr) - + def wrap_in_typecast(self, actual_type, needed_dtype, s): if (actual_type.is_complex() and needed_dtype.is_complex() and actual_type != needed_dtype): - result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context) - return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), result) + return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), s) elif not actual_type.is_complex() and needed_dtype.is_complex(): - result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context) - return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), result) + return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), s) else: + return s + + def rec(self, expr, prec, type_context=None, needed_dtype=None): + if needed_dtype is None: return RecursiveMapper.rec(self, expr, prec, type_context) + return self.wrap_in_typecast( + self.infer_type(expr), needed_dtype, + RecursiveMapper.rec(self, expr, PREC_NONE, type_context)) + __call__ = rec # }}} @@ -419,37 +420,32 @@ class LoopyCCodeMapper(RecursiveMapper): # }}} - c_name = None if isinstance(identifier, Variable): identifier = identifier.name - c_name = identifier par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) str_parameters = None mangle_result = self.kernel.mangle_function(identifier, par_dtypes) - if mangle_result is not None: - if len(mangle_result) == 2: - result_dtype, c_name = mangle_result - elif len(mangle_result) == 3: - result_dtype, c_name, arg_tgt_dtypes = mangle_result - - str_parameters = [ - self.rec(par, PREC_NONE, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, arg_tgt_dtypes)] - else: - raise RuntimeError("result of function mangler " - "for function '%s' not understood" - % identifier) + if mangle_result is None: + raise RuntimeError("function '%s' unknown--" + "maybe you need to register a function mangler?" 
+ % identifier) - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, c_name, par_dtypes)) - if str_parameters is None: + if len(mangle_result.result_dtypes) != 1: + raise LoopyError("functions with more or fewer than one return value " + "may not be used in an expression") + + if mangle_result.arg_dtypes is not None: + str_parameters = [ + self.rec(par, PREC_NONE, + dtype_to_type_context(self.kernel.target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + + else: # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to # propagate the type context here. But for many others, it does # not. Using the inferred type as a stopgap for now. @@ -459,11 +455,18 @@ class LoopyCCodeMapper(RecursiveMapper): self.kernel.target, par_dtype)) for par, par_dtype in zip(expr.parameters, par_dtypes)] - if c_name is None: - raise RuntimeError("unable to find C name for function identifier '%s'" - % identifier) + from warnings import warn + warn("Calling function '%s' with unknown C signature--" + "return CallMangleInfo.arg_dtypes" + % identifier, LoopyWarning) + + from loopy.codegen import SeenFunction + self.codegen_state.seen_functions.add( + SeenFunction(identifier, + mangle_result.target_name, + mangle_result.arg_dtypes or par_dtypes)) - return "%s(%s)" % (c_name, ", ".join(str_parameters)) + return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) # {{{ deal with complex-valued variables diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index f083a70b0288d88d544285e0063eb2e677dfefcf..e839801fdfebaa40f6431db947200c32f880ee53 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,7 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper -from loopy.kernel.data import temp_var_scope +from 
loopy.kernel.data import temp_var_scope, CallMangleInfo # {{{ dtype registry wrappers @@ -137,11 +137,34 @@ def _register_vector_types(dtype_registry): # {{{ function mangler +_CL_SIMPLE_MULTI_ARG_FUNCTIONS = { + "clamp": 3, + } + + +VECTOR_LITERAL_FUNCS = dict( + ("make_%s%d" % (name, count), (name, dtype, count)) + for name, dtype in [ + ('char', np.int8), + ('uchar', np.uint8), + ('short', np.int16), + ('ushort', np.uint16), + ('int', np.int32), + ('uint', np.uint32), + ('long', np.int64), + ('ulong', np.uint64), + ('float', np.float32), + ('double', np.float64), + ] + for count in [2, 3, 4, 8, 16] + ) + + def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None - if name in ["max", "min"] and len(arg_dtypes) == 2: + if name in ["max", "min", "atan2"] and len(arg_dtypes) == 2: dtype = np.find_common_type( [], [dtype.numpy_dtype for dtype in arg_dtypes]) @@ -151,14 +174,49 @@ def opencl_function_mangler(kernel, name, arg_dtypes): if dtype.kind == "f": name = "f" + name - return NumpyType(dtype), name - - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + result_dtype = NumpyType(dtype) + return CallMangleInfo( + target_name=name, + result_dtypes=(result_dtype,), + arg_dtypes=2*(result_dtype,)) if name == "dot": scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return NumpyType(scalar_dtype), name + return CallMangleInfo( + target_name=name, + result_dtypes=(NumpyType(scalar_dtype),), + arg_dtypes=(arg_dtypes[0],)*2) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + if len(arg_dtypes) != num_args: + raise LoopyError("%s takes %d arguments (%d received)" + % (name, num_args, len(arg_dtypes))) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_dtypes]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + result_dtype = NumpyType(dtype) + return CallMangleInfo( + 
target_name=name, + result_dtypes=(result_dtype,), + arg_dtypes=(result_dtype,)*3) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + if count != len(arg_dtypes): + return None + + return CallMangleInfo( + target_name="(%s%d) " % (base_tp_name, count), + result_dtypes=(kernel.target.vector_dtype( + NumpyType(dtype), count),), + arg_dtypes=(NumpyType(dtype),)*count) return None diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index aea2473972c9f0cc4144e17e1943bf9008cb4f18..44b057f1a713a09bd2f493c8f3be67e790049e6e 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -29,6 +29,7 @@ from six.moves import range import numpy as np +from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget from loopy.types import NumpyType @@ -194,13 +195,18 @@ def pyopencl_function_mangler(target, name, arg_dtypes): "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj"]: - return arg_dtype, "%s_%s" % (tpname, name) + return CallMangleInfo( + target_name="%s_%s" % (tpname, name), + result_dtypes=(arg_dtype,), + arg_dtypes=(arg_dtype,)) if name in ["real", "imag", "abs"]: - return ( - NumpyType( + return CallMangleInfo( + target_name="%s_%s" % (tpname, name), + result_dtypes=(NumpyType( np.dtype(arg_dtype.numpy_dtype.type(0).real)), - "%s_%s" % (tpname, name)) + ), + arg_dtypes=(arg_dtype,)) return None @@ -270,14 +276,18 @@ class PyOpenCLTarget(OpenCLTarget): comparison_fields = ["device"] def function_manglers(self): + from loopy.library.random123 import random123_function_mangler return ( super(PyOpenCLTarget, self).function_manglers() + [ - pyopencl_function_mangler + pyopencl_function_mangler, + random123_function_mangler ]) def preamble_generators(self): + from loopy.library.random123 import random123_preamble_generator return ([ - pyopencl_preamble_generator + pyopencl_preamble_generator, + random123_preamble_generator, ] + super(PyOpenCLTarget, 
self).preamble_generators()) def preprocess(self, kernel): diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index d41222c26056300729b0c4005200ba6ea904010d..2c50b3f11a34ced81d6a93eb467482110f495f2f 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -158,7 +158,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): continue if not isinstance(insn, Assignment): - raise LoopyError("'%s' modified by non-expression instruction" + raise LoopyError("'%s' modified by non-single-assignment" % var_name) lhs = insn.assignee diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 25634c91975e686fed04223472ee9fbe2b416a4d..8b2fa370ad4214cbe16caec61ff45b684ed0a889 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -124,6 +124,27 @@ def remove_instructions(kernel, insn_ids): return kernel.copy( instructions=new_insns) + +def replace_instruction_ids(kernel, replacements): + new_insns = [] + + for insn in kernel.instructions: + changed = False + new_depends_on = [] + + for dep in insn.depends_on: + if dep in replacements: + new_depends_on.extend(replacements[dep]) + changed = True + else: + new_depends_on.append(dep) + + new_insns.append( + insn.copy(depends_on=frozenset(new_depends_on)) + if changed else insn) + + return kernel.copy(instructions=new_insns) + # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 5c9a286de2678c1c23bcb3df682110c431bc793c..6ea0c06e631c5ea321dfd3a11b42f00bb0480078 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -426,8 +426,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, import loopy as lp for insn in kernel.instructions: - if isinstance(insn, lp.Assignment): - invg(insn.assignee, kernel, insn) + if isinstance(insn, lp.MultiAssignmentBase): + for assignee in insn.assignees: + invg(assignee, kernel, insn) invg(insn.expression, 
kernel, insn) access_descriptors = invg.access_descriptors diff --git a/loopy/version.py b/loopy/version.py index 9fa881d02879dbdac67043f975bc3c00a6194f11..7716feea32079bb122557223b2a1e970ff630ecc 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v24-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v26-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index 5ed095315234339709309a1c55ec88c7fdab6bfa..30d6dfb636c4d5cea181da4a6b7e6df514adca9d 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ setup(name="loo.py", "islpy>=2016.1.2", "six>=1.8.0", "colorama", + "Mako", ], extras_require={ diff --git a/test/test_loopy.py b/test/test_loopy.py index cebbda4d9b9177bd0a9f63ff9b3126b3c188c76c..a61b6563ff254fdf6eaf6279158a97d20045a663 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -262,7 +262,7 @@ def test_join_inames(ctx_factory): knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) def test_divisibility_assumption(ctx_factory): @@ -439,26 +439,21 @@ def test_argmax(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) - order = "C" n = 10000 knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, - [ - "<> result = argmax(i, fabs(a[i]))", - "max_idx = result.index", - "max_val = result.value", - ], - [ - lp.GlobalArg("a", dtype, shape=(n,), order=order), - lp.GlobalArg("max_idx", np.int32, shape=(), order=order), - lp.GlobalArg("max_val", dtype, shape=(), order=order), - ]) + """ + max_val, max_idx = argmax(i, fabs(a[i])) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + print(lp.preprocess_kernel(knl)) + knl = lp.set_options(knl, write_cl=True, highlight_cl=True) a = np.random.randn(10000).astype(dtype) - cknl = lp.CompiledKernel(ctx, knl) - 
evt, (max_idx, max_val) = cknl(queue, a=a, out_host=True) + evt, (max_idx, max_val) = knl(queue, a=a, out_host=True) assert max_val == np.max(np.abs(a)) assert max_idx == np.where(np.abs(a) == max_val)[-1] @@ -2473,6 +2468,94 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) +def test_clamp(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 15 * 10**6 + x = cl.clrandom.rand(queue, n, dtype=np.float32) + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = clamp(x[i], a, b)") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) + + +def test_forced_iname_deps_and_reduction(): + # See https://github.com/inducer/loopy/issues/24 + + # This is (purposefully) somewhat un-idiomatic, to replicate the conditions + # under which the above bug was found. If assignees were phi[i], then the + # iname propagation heuristic would not assume that dependent instructions + # need to run inside of 'i', and hence the forced_iname_* bits below would not + # be needed. 
+ + i1 = lp.CInstruction("i", + "doSomethingToGetPhi();", + assignees="phi") + + from pymbolic.primitives import Subscript, Variable + i2 = lp.Assignment("a", + lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))), + forced_iname_deps=frozenset(), + forced_iname_deps_is_final=True) + + k = lp.make_kernel("{[i,j] : 0<=i,j<n}", + [i1, i2], + [ + lp.GlobalArg("a", dtype=np.float32, shape=()), + lp.ValueArg("n", dtype=np.int32), + lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)), + ], + target=lp.CTarget(), + ) + + k = lp.preprocess_kernel(k) + + assert 'i' not in k.insn_inames("insn_0_j_update") + print(k.stringify(with_dependencies=True)) + + +@pytest.mark.parametrize("tp", ["f32", "f64"]) +def test_random123(ctx_factory, tp): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + import pyopencl.version # noqa + if cl.version.VERSION < (2016, 2): + pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") + + n = 150000 + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + <> key2 = make_uint2(i, 324830944) {inames=i} + <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i} + <> real, ctr = philox4x32_TYPE(ctr, key2) + <> imag, ctr = threefry4x32_TYPE(ctr, key4) + + out[i, 0] = real.s0 + 1j * imag.s0 + out[i, 1] = real.s1 + 1j * imag.s1 + out[i, 2] = real.s2 + 1j * imag.s2 + out[i, 3] = real.s3 + 1j * imag.s3 + """.replace("TYPE", tp)) + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, n=n) + + out = out.get() + assert (out < 1).all() + assert (0 <= out).all() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])