diff --git a/.gitignore b/.gitignore index 7ae71dc52e28413d3da125b4e5e9a64c2c970a19..5c9e73c7b6ffc90a15059b96c59c461ca0157baf 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ htmlcov .ipynb_checkpoints lextab.py yacctab.py + +.cache diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d62a3c85a518ef0cb0cba138755f5c28a90027c2..bb15d7c9fe93a0f0307edb33c62f76e1f8deec5a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -88,3 +88,15 @@ CentOS binary: - build-helpers/loopy-centos6 tags: - docker + only: + - master + +Documentation: + script: + - EXTRA_INSTALL="numpy" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh + - ". ./build-docs.sh" + tags: + - python3.5 + only: + - master diff --git a/doc/misc.rst b/doc/misc.rst index a4c174e52da118143dc38f4d66dfa649eb395a50..d7abe80dda7999575b7d0386cce1bdcecb236c55 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -17,7 +17,7 @@ run this beforehand:: python get-pip.py For a more manual installation, `download the source -<http://pypi.python.org/pypi/Loopy>`_, unpack it, and say:: +<http://pypi.python.org/pypi/loo.py>`_, unpack it, and say:: python setup.py install @@ -310,5 +310,14 @@ Here's a Bibtex entry for your convenience:: doi = "{10.1145/2627373.2627387}", } +Acknowledgments +=============== +Andreas Klöckner's work on :mod:`loopy` was supported in part by +* US Navy ONR grant number N00014-14-1-0117 +* the US National Science Foundation under grant numbers DMS-1418961 and CCF-1524433. + +AK also gratefully acknowledges a hardware gift from Nvidia Corporation. The +views and opinions expressed herein do not necessarily reflect those of the +funding agencies. diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh index a95f9a68d7b1aed82bf92773422b22eb1336bc8a..a4691aecd31a14163b8f0c9bd30df137558eaa50 100755 --- a/doc/upload-docs.sh +++ b/doc/upload-docs.sh @@ -1,12 +1,3 @@ #! 
/bin/bash -cat > _build/html/.htaccess <<EOF -AuthUserFile /home/andreas/htpasswd -AuthGroupFile /dev/null -AuthName "Pre-Release Documentation" -AuthType Basic - -require user iliketoast -EOF - -rsync --progress --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy +rsync --verbose --archive --delete _build/html/{.*,*} doc-upload:doc/loopy diff --git a/loopy/__init__.py b/loopy/__init__.py index 778b3d49ef65df162857b8638d56b42ee8dee9d8..8975674442f049f51f724bfcb818fc6e8a441c88 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -66,7 +66,8 @@ from loopy.transform.iname import ( split_reduction_inward, split_reduction_outward, affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, - has_schedulable_iname_nesting, get_iname_duplication_options) + has_schedulable_iname_nesting, get_iname_duplication_options, + add_inames_to_insn) from loopy.transform.instruction import ( find_instructions, map_instructions, @@ -171,6 +172,7 @@ __all__ = [ "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", "has_schedulable_iname_nesting", "get_iname_duplication_options", + "add_inames_to_insn", "add_prefetch", "change_arg_to_image", "tag_array_axes", "tag_data_axes", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index b475a40f3580b3b0e3c746a0de90f33cf182aa4e..e992fa3d48a625c3cb982eed1409be847aadde5f 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -444,7 +444,7 @@ def auto_test_vs_ref( print(75*"-") print("Reference Code:") print(75*"-") - print(get_highlighted_cl_code(ref_compiled.code)) + print(get_highlighted_cl_code(ref_compiled.get_code())) print(75*"-") ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset()) diff --git a/loopy/check.py b/loopy/check.py index 76b86a7c3f0b4267a9a41b5be3e6fae3a78a01bb..2f48211da430b42e8000467f070b1633ab3a4f38 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -34,17 +34,41 @@ import logging logger = logging.getLogger(__name__) +# {{{ sanity checks run before 
preprocessing + +def check_identifiers_in_subst_rules(knl): + """Substitution rules may only refer to kernel-global quantities or their + own arguments. + """ + + from loopy.symbolic import get_dependencies + + allowed_identifiers = knl.all_variable_names() + + for rule in six.itervalues(knl.substitutions): + deps = get_dependencies(rule.expression) + rule_allowed_identifiers = allowed_identifiers | frozenset(rule.arguments) + + if not deps <= rule_allowed_identifiers: + raise LoopyError("kernel '%s': substitution rule '%s' refers to " + "identifier(s) '%s' which are neither rule arguments nor " + "kernel-global identifiers" + % (knl.name, ", ".join(deps-rule_allowed_identifiers))) + +# }}} + + # {{{ sanity checks run pre-scheduling def check_insn_attributes(kernel): all_insn_ids = set(insn.id for insn in kernel.instructions) for insn in kernel.instructions: - if not insn.forced_iname_deps <= kernel.all_inames(): + if not insn.within_inames <= kernel.all_inames(): raise LoopyError("insn '%s' has unknown forced iname " "dependencies: %s" % (insn.id, ", ".join( - insn.forced_iname_deps - kernel.all_inames()))) + insn.within_inames - kernel.all_inames()))) if insn.depends_on is not None and not insn.depends_on <= all_insn_ids: raise LoopyError("insn '%s' has unknown instruction " @@ -278,7 +302,12 @@ def check_bounds(kernel): continue acm = _AccessCheckMapper(kernel, domain, insn.id) - insn.with_transformed_expressions(acm) + + def run_acm(expr): + acm(expr) + return expr + + insn.with_transformed_expressions(run_acm) def check_write_destinations(kernel): @@ -324,7 +353,7 @@ def check_has_schedulable_iname_nesting(kernel): def pre_schedule_checks(kernel): try: - logger.info("pre-schedule check %s: start" % kernel.name) + logger.info("%s: pre-schedule check: start" % kernel.name) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) @@ -337,7 +366,7 @@ def pre_schedule_checks(kernel): check_write_destinations(kernel) 
check_has_schedulable_iname_nesting(kernel) - logger.info("pre-schedule check %s: done" % kernel.name) + logger.info("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: raise except: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d95ab928925655148775699e206ebe76429d7463..79d824a44fc04f479139f2797994621f09798297 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -402,7 +402,7 @@ def generate_code_v2(kernel): input_kernel = kernel try: result = code_gen_cache[input_kernel] - logger.info("%s: code generation cache hit" % kernel.name) + logger.debug("%s: code generation cache hit" % kernel.name) return result except KeyError: pass diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 8531b97a947f3c46b984cf7405b794bd68e2f638..5fa54aeee91b5c41531e0dd5bfd7ab97323b19ae 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -97,7 +97,7 @@ def generate_assignment_instruction_code(codegen_state, insn): ecm = codegen_state.expression_to_code_mapper - from loopy.expression import dtype_to_type_context, VectorizabilityChecker + from loopy.expression import VectorizabilityChecker # {{{ vectorization handling @@ -141,52 +141,19 @@ def generate_assignment_instruction_code(codegen_state, insn): else: raise RuntimeError("invalid lvalue '%s'" % lhs) - lhs_var = kernel.get_var_descriptor(assignee_var_name) - lhs_dtype = lhs_var.dtype - - if insn.atomicity is not None: - lhs_atomicity = [ - a for a in insn.atomicity if a.var_name == assignee_var_name] - assert len(lhs_atomicity) <= 1 - if lhs_atomicity: - lhs_atomicity, = lhs_atomicity - else: - lhs_atomicity = None - else: - lhs_atomicity = None - - from loopy.kernel.data import AtomicInit, AtomicUpdate - - lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None) - rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) - if lhs_atomicity is None: - result = codegen_state.ast_builder.emit_assignment( 
- codegen_state, - lhs_code, - ecm(insn.expression, prec=PREC_NONE, - type_context=rhs_type_context, - needed_dtype=lhs_dtype)) - - elif isinstance(lhs_atomicity, AtomicInit): - raise NotImplementedError("atomic init") - - elif isinstance(lhs_atomicity, AtomicUpdate): - codegen_state.seen_atomic_dtypes.add(lhs_dtype) - result = codegen_state.ast_builder.generate_atomic_update( - kernel, codegen_state, lhs_atomicity, lhs_var, - insn.assignee, insn.expression, - lhs_dtype, rhs_type_context) - - else: - raise ValueError("unexpected lhs atomicity type: %s" - % type(lhs_atomicity).__name__) + result = codegen_state.ast_builder.emit_assignment(codegen_state, insn) # {{{ tracing + lhs_dtype = codegen_state.kernel.get_var_descriptor(assignee_var_name).dtype + if kernel.options.trace_assignments or kernel.options.trace_assignment_values: if codegen_state.vectorization_info and is_vector: raise Unvectorizable("tracing does not support vectorization") + from pymbolic.mapper.stringifier import PREC_NONE + lhs_code = codegen_state.expression_to_code_mapper(insn.assignee, PREC_NONE) + from cgen import Statement as S # noqa gs, ls = kernel.get_grid_size_upper_bounds() diff --git a/loopy/compiled.py b/loopy/compiled.py index 5782ed7d9b897ad9ca3bf4da165e3a61da31ce40..b3e4fe0589dc9a62d7bdefd7152784560be0ca8a 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -1,6 +1,6 @@ -from __future__ import division, with_statement, absolute_import +from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,898 +22,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six -from six.moves import range, zip -import numpy as np -from pytools import Record, memoize_method -from loopy.diagnostic import ParameterFinderWarning -from pytools.py_codegen import ( - Indentation, PythonFunctionGenerator) -from loopy.diagnostic import LoopyError -from loopy.types import NumpyType +from loopy.target.pyopencl_execution import ( # noqa + PyOpenCLKernelExecutor, + get_highlighted_cl_code) -import logging -logger = logging.getLogger(__name__) +# {{{ compatibility -# {{{ object array argument packing - -class _PackingInfo(Record): - """ - .. attribute:: name - .. attribute:: sep_shape - - .. attribute:: subscripts_and_names - - A list of type ``[(index, unpacked_name), ...]``. - """ - - -class SeparateArrayPackingController(object): - """For argument arrays with axes tagged to be implemented as separate - arrays, this class provides preprocessing of the incoming arguments so that - all sub-arrays may be passed in one object array (under the original, - un-split argument name) and are unpacked into separate arrays before being - passed to the kernel. - - It also repacks outgoing arrays of this type back into an object array. 
- """ - - def __init__(self, kernel): - # map from arg name - self.packing_info = {} - - from loopy.kernel.array import ArrayBase - for arg in kernel.args: - if not isinstance(arg, ArrayBase): - continue - - if arg.shape is None or arg.dim_tags is None: - continue - - subscripts_and_names = arg.subscripts_and_names() - - if subscripts_and_names is None: - continue - - self.packing_info[arg.name] = _PackingInfo( - name=arg.name, - sep_shape=arg.sep_shape(), - subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) - - def unpack(self, kernel_kwargs): - if not self.packing_info: - return kernel_kwargs - - kernel_kwargs = kernel_kwargs.copy() - - for packing_info in six.itervalues(self.packing_info): - arg_name = packing_info.name - if packing_info.name in kernel_kwargs: - arg = kernel_kwargs[arg_name] - for index, unpacked_name in packing_info.subscripts_and_names: - assert unpacked_name not in kernel_kwargs - kernel_kwargs[unpacked_name] = arg[index] - del kernel_kwargs[arg_name] - - return kernel_kwargs - - def pack(self, outputs): - if not self.packing_info: - return outputs - - for packing_info in six.itervalues(self.packing_info): - if not packing_info.is_written: - continue - - result = outputs[packing_info.name] = \ - np.zeros(packing_info.sep_shape, dtype=np.object) - - for index, unpacked_name in packing_info.subscripts_and_names: - result[index] = outputs.pop(unpacked_name) - - return outputs - -# }}} - - -# {{{ invoker generation - -# /!\ This code runs in a namespace controlled by the user. -# Prefix all auxiliary variables with "_lpy". 
- - -def python_dtype_str(dtype): - import pyopencl.tools as cl_tools - if dtype.isbuiltin: - return "_lpy_np."+dtype.name - else: - return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" - % cl_tools.dtype_to_ctype(dtype)) - - -# {{{ integer arg finding from shapes - -def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info): - # a mapping from integer argument names to a list of tuples - # (arg_name, expression), where expression is a - # unary function of kernel.arg_dict[arg_name] - # returning the desired integer argument. - iarg_to_sources = {} - - from loopy.kernel.data import GlobalArg - from loopy.symbolic import DependencyMapper, StringifyMapper - dep_map = DependencyMapper() - - from pymbolic import var - for arg in implemented_data_info: - if arg.arg_class is GlobalArg: - sym_shape = var(arg.name).attr("shape") - for axis_nr, shape_i in enumerate(arg.shape): - if shape_i is None: - continue - - deps = dep_map(shape_i) - - if len(deps) == 1: - integer_arg_var, = deps - - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): - from pymbolic.algorithm import solve_affine_equations_for - try: - # friggin' overkill :) - iarg_expr = solve_affine_equations_for( - [integer_arg_var.name], - [(shape_i, sym_shape.index(axis_nr))] - )[integer_arg_var] - except Exception as e: - #from traceback import print_exc - #print_exc() - - # went wrong? 
oh well - from warnings import warn - warn("Unable to generate code to automatically " - "find '%s' from the shape of '%s':\n%s" - % (integer_arg_var.name, arg.name, str(e)), - ParameterFinderWarning) - else: - iarg_to_sources.setdefault(integer_arg_var.name, []) \ - .append((arg.name, iarg_expr)) - - gen("# {{{ find integer arguments from shapes") - gen("") - - for iarg_name, sources in six.iteritems(iarg_to_sources): - gen("if %s is None:" % iarg_name) - with Indentation(gen): - if_stmt = "if" - for arg_name, value_expr in sources: - gen("%s %s is not None:" % (if_stmt, arg_name)) - with Indentation(gen): - gen("%s = %s" - % (iarg_name, StringifyMapper()(value_expr))) - - if_stmt = "elif" - - gen("") - - gen("# }}}") - gen("") - -# }}} - - -# {{{ integer arg finding from offsets - -def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info): - options = kernel.options - - gen("# {{{ find integer arguments from offsets") - gen("") - - for arg in implemented_data_info: - impl_array_name = arg.offset_for_name - if impl_array_name is not None: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("if %s is None:" % impl_array_name) - with Indentation(gen): - gen("# Output variable, we'll be allocating " - "it, with zero offset.") - gen("%s = 0" % arg.name) - gen("else:") - with Indentation(gen): - if not options.no_numpy: - gen("_lpy_offset = getattr(%s, \"offset\", 0)" - % impl_array_name) - else: - gen("_lpy_offset = %s.offset" % impl_array_name) - - base_arg = kernel.impl_arg_to_arg[impl_array_name] - - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" - % (arg.name, base_arg.dtype.itemsize)) - - gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - "not divisible by its dtype itemsize\"" - % impl_array_name) - gen("del _lpy_remdr") - else: - gen("%s = _lpy_offset // %d" - % (arg.name, base_arg.dtype.itemsize)) - - if not options.skip_arg_checks: - gen("del _lpy_offset") - - gen("# }}}") - 
gen("") - -# }}} - - -# {{{ integer arg finding from strides - -def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info): - options = kernel.options - - gen("# {{{ find integer arguments from strides") - gen("") - - for arg in implemented_data_info: - if arg.stride_for_name_and_axis is not None: - impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis - - gen("if %s is None:" % arg.name) - with Indentation(gen): - if not options.skip_arg_checks: - gen("if %s is None:" % impl_array_name) - with Indentation(gen): - gen("raise RuntimeError(\"required stride '%s' for " - "argument '%s' not given or deducible from " - "passed array\")" - % (arg.name, impl_array_name)) - - base_arg = kernel.impl_arg_to_arg[impl_array_name] - - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" - % (arg.name, impl_array_name, stride_impl_axis, - base_arg.dtype.dtype.itemsize)) - - gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is " - "not divisible by its dtype itemsize\"" - % (stride_impl_axis, impl_array_name)) - gen("del _lpy_remdr") - else: - gen("%s = _lpy_offset // %d" - % (arg.name, base_arg.dtype.itemsize)) - - gen("# }}}") - gen("") - -# }}} - - -# {{{ check that value args are present - -def generate_value_arg_check(gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: - return - - from loopy.kernel.data import ValueArg - - gen("# {{{ check that value args are present") - gen("") - - for arg in implemented_data_info: - if not issubclass(arg.arg_class, ValueArg): - continue - - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise TypeError(\"value argument '%s' " - "was not given and could not be automatically " - "determined\")" % arg.name) - - gen("# }}}") - gen("") - -# }}} - - -# {{{ arg setup - -def generate_arg_setup(gen, kernel, implemented_data_info, options): - import loopy as lp - - from loopy.kernel.data import KernelArgument - from loopy.kernel.array 
import ArrayBase - from loopy.symbolic import StringifyMapper - from pymbolic import var - - gen("# {{{ set up array arguments") - gen("") - - if not options.no_numpy: - gen("_lpy_encountered_numpy = False") - gen("_lpy_encountered_dev = False") - gen("") - - args = [] - - strify = StringifyMapper() - - expect_no_more_arguments = False - - for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - - if not issubclass(arg.arg_class, KernelArgument): - expect_no_more_arguments = True - continue - - if expect_no_more_arguments: - raise LoopyError("Further arguments encountered after arg info " - "describing a global temporary variable") - - if not issubclass(arg.arg_class, ArrayBase): - args.append(arg.name) - continue - - gen("# {{{ process %s" % arg.name) - gen("") - - if not options.no_numpy: - gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) - with Indentation(gen): - gen("# synchronous, nothing to worry about") - gen("%s = _lpy_cl_array.to_device(" - "queue, %s, allocator=allocator)" - % (arg.name, arg.name)) - gen("_lpy_encountered_numpy = True") - gen("elif %s is not None:" % arg.name) - with Indentation(gen): - gen("_lpy_encountered_dev = True") - - gen("") - - if not options.skip_arg_checks and not is_written: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) - gen("") - - if (is_written - and arg.arg_class is lp.ImageArg - and not options.skip_arg_checks): - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"written image '%s' must " - "be supplied\")" % arg.name) - gen("") - - if is_written and arg.shape is None and not options.skip_arg_checks: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"written argument '%s' has " - "unknown shape and must be supplied\")" % arg.name) - 
gen("") - - possibly_made_by_loopy = False - - # {{{ allocate written arrays, if needed - - if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and arg.shape is not None: - - if not isinstance(arg.dtype, NumpyType): - raise LoopyError("do not know how to pass arg of type '%s'" - % arg.dtype) - - possibly_made_by_loopy = True - gen("_lpy_made_by_loopy = False") - gen("") - - gen("if %s is None:" % arg.name) - with Indentation(gen): - num_axes = len(arg.strides) - for i in range(num_axes): - gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) - - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - for i in range(num_axes): - gen("_lpy_strides_%d = %s" % (i, strify( - itemsize*arg.unvec_strides[i]))) - - if not options.skip_arg_checks: - for i in range(num_axes): - gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) - - sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) - sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) - - alloc_size_expr = (sum(astrd*(alen-1) - for alen, astrd in zip(sym_shape, sym_strides)) - + itemsize) - - gen("_lpy_alloc_size = %s" % strify(alloc_size_expr)) - gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, " - "%(dtype)s, strides=%(strides)s, " - "data=allocator(_lpy_alloc_size), allocator=allocator)" - % dict( - name=arg.name, - shape=strify(sym_shape), - strides=strify(sym_strides), - dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype))) - - if not options.skip_arg_checks: - for i in range(num_axes): - gen("del _lpy_shape_%d" % i) - gen("del _lpy_strides_%d" % i) - gen("del _lpy_alloc_size") - gen("") - - gen("_lpy_made_by_loopy = True") - gen("") - - # }}} - - # {{{ argument checking - - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and not options.skip_arg_checks: - if possibly_made_by_loopy: - gen("if not _lpy_made_by_loopy:") - else: - gen("if True:") - - with Indentation(gen): - gen("if %s.dtype != 
%s:" - % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype))) - with Indentation(gen): - gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" - % (arg.name, arg.dtype, arg.name)) - - # {{{ generate shape checking code - - def strify_allowing_none(shape_axis): - if shape_axis is None: - return "None" - else: - return strify(shape_axis) - - def strify_tuple(t): - if len(t) == 0: - return "()" - else: - return "(%s,)" % ", ".join( - strify_allowing_none(sa) - for sa in t) - - shape_mismatch_msg = ( - "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " - "%% (%s.shape, %s))" - % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - - if kernel_arg.shape is None: - pass - - elif any(shape_axis is None for shape_axis in kernel_arg.shape): - gen("if len(%s.shape) != %s:" - % (arg.name, len(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) - - for i, shape_axis in enumerate(arg.unvec_shape): - if shape_axis is None: - continue - - gen("if %s.shape[%d] != %s:" - % (arg.name, i, strify(shape_axis))) - with Indentation(gen): - gen(shape_mismatch_msg) - - else: # not None, no Nones in tuple - gen("if %s.shape != %s:" - % (arg.name, strify(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) - - # }}} - - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - sym_strides = tuple( - itemsize*s_i for s_i in arg.unvec_strides) - gen("if %s.strides != %s:" - % (arg.name, strify(sym_strides))) - with Indentation(gen): - gen("raise TypeError(\"strides mismatch on " - "argument '%s' (got: %%s, expected: %%s)\" " - "%% (%s.strides, %s))" - % (arg.name, arg.name, strify(sym_strides))) - - if not arg.allows_offset: - gen("if %s.offset:" % arg.name) - with Indentation(gen): - gen("raise ValueError(\"Argument '%s' does not " - "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." 
- "\")" % arg.name) - gen("") - - # }}} - - if possibly_made_by_loopy and not options.skip_arg_checks: - gen("del _lpy_made_by_loopy") - gen("") - - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - args.append("%s.base_data" % arg.name) - else: - args.append("%s" % arg.name) - - gen("") - - gen("# }}}") - gen("") - - gen("# }}}") - gen("") - - return args - -# }}} - - -def generate_invoker(kernel, codegen_result): - options = kernel.options - implemented_data_info = codegen_result.implemented_data_info - host_code = codegen_result.host_code() - - system_args = [ - "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None", - # ignored if options.no_numpy - "out_host=None" - ] - - from loopy.kernel.data import KernelArgument - gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, - system_args + [ - "%s=None" % idi.name - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) - ]) - - gen.add_to_preamble("from __future__ import division") - gen.add_to_preamble("") - gen.add_to_preamble("import pyopencl as _lpy_cl") - gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") - gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") - gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("") - gen.add_to_preamble(host_code) - gen.add_to_preamble("") - - gen("if allocator is None:") - with Indentation(gen): - gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)") - gen("") - - generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info) - generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info) - generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info) - generate_value_arg_check(gen, kernel, implemented_data_info) - - args = generate_arg_setup(gen, kernel, implemented_data_info, options) - - # {{{ generate invocation - - gen("_lpy_evt = {kernel_name}({args})" - .format( - kernel_name=codegen_result.host_program.name, - 
args=", ".join( - ["_lpy_cl_kernels", "queue"] - + args - + ["wait_for=wait_for"]))) - - # }}} - - # {{{ output - - if not options.no_numpy: - gen("if out_host is None and (_lpy_encountered_numpy " - "and not _lpy_encountered_dev):") - with Indentation(gen): - gen("out_host = True") - - gen("if out_host:") - with Indentation(gen): - gen("pass") # if no outputs (?!) - for arg in implemented_data_info: - if not issubclass(arg.arg_class, KernelArgument): - continue - - is_written = arg.base_name in kernel.get_written_variables() - if is_written: - gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) - - gen("") - - if options.return_dict: - gen("return _lpy_evt, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) - else: - out_args = [arg - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] - if out_args: - gen("return _lpy_evt, (%s,)" - % ", ".join(arg.name for arg in out_args)) - else: - gen("return _lpy_evt, ()") - - # }}} - - if options.write_wrapper: - output = gen.get() - if options.highlight_wrapper: - output = get_highlighted_python_code(output) - - if options.write_wrapper is True: - print(output) - else: - with open(options.write_wrapper, "w") as outf: - outf.write(output) - - return gen.get_function() - - -# }}} - - -# {{{ compiled kernel object - -class _CLKernelInfo(Record): - pass - - -class _CLKernels(object): - pass - - -class CompiledKernel: - """An object connecting a kernel to a :class:`pyopencl.Context` - for execution. - - .. automethod:: __init__ - .. 
automethod:: __call__ - """ - +class CompiledKernel(PyOpenCLKernelExecutor): def __init__(self, context, kernel): - """ - :arg context: a :class:`pyopencl.Context` - :arg kernel: may be a loopy.LoopKernel, a generator returning kernels - (a warning will be issued if more than one is returned). If the - kernel has not yet been loop-scheduled, that is done, too, with no - specific arguments. - """ - - self.context = context - - from loopy.target.pyopencl import PyOpenCLTarget - self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) - - self.packing_controller = SeparateArrayPackingController(kernel) - - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) - - @memoize_method - def get_typed_and_scheduled_kernel(self, var_to_dtype_set): - kernel = self.kernel - - from loopy.kernel.tools import add_dtypes - - if var_to_dtype_set: - var_to_dtype = {} - for var, dtype in var_to_dtype_set: - try: - dest_name = kernel.impl_arg_to_arg[var].name - except KeyError: - dest_name = var - - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) - - kernel = add_dtypes(kernel, var_to_dtype) - - from loopy.preprocess import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - - return kernel - - @memoize_method - def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) - - from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) - - dev_code = codegen_result.device_code() - - if self.kernel.options.write_cl: - output = dev_code - if 
self.kernel.options.highlight_cl: - output = get_highlighted_cl_code(output) - - if self.kernel.options.write_cl is True: - print(output) - else: - with open(self.kernel.options.write_cl, "w") as outf: - outf.write(output) - - if self.kernel.options.edit_cl: - from pytools import invoke_editor - dev_code = invoke_editor(dev_code, "code.cl") - - import pyopencl as cl - - logger.info("%s: opencl compilation start" % self.kernel.name) - - cl_program = ( - cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) - - cl_kernels = _CLKernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) - - logger.info("%s: opencl compilation done" % self.kernel.name) - - return _CLKernelInfo( - kernel=kernel, - cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=generate_invoker(kernel, codegen_result)) - - # {{{ debugging aids - - def get_code(self, arg_to_dtype=None): - if arg_to_dtype is not None: - arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) - - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) - - from loopy.codegen import generate_code_v2 - code = generate_code_v2(kernel) - return code.device_code() - - def get_highlighted_code(self, arg_to_dtype=None): - return get_highlighted_cl_code( - self.get_code(arg_to_dtype)) - - @property - def code(self): from warnings import warn - warn("CompiledKernel.code is deprecated. Use .get_code() instead.", + warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - return self.get_code() - - # }}} - - def __call__(self, queue, **kwargs): - """ - :arg allocator: a callable passed a byte count and returning - a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator - maybe. - :arg wait_for: A list of :class:`pyopencl.Event` instances - for which to wait. - :arg out_host: :class:`bool` - Decides whether output arguments (i.e. 
arguments - written by the kernel) are to be returned as - :mod:`numpy` arrays. *True* for yes, *False* for no. - - For the default value of *None*, if all (input) array - arguments are :mod:`numpy` arrays, defaults to - returning :mod:`numpy` arrays as well. - - :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event` - associated with the execution of the kernel, and - output is a tuple of output arguments (arguments that - are written as part of the kernel). The order is given - by the order of kernel arguments. If this order is unspecified - (such as when kernel arguments are inferred automatically), - enable :attr:`loopy.Options.return_dict` to make *output* a - :class:`dict` instead, with keys of argument names and values - of the returned arrays. - """ - - allocator = kwargs.pop("allocator", None) - wait_for = kwargs.pop("wait_for", None) - out_host = kwargs.pop("out_host", None) - - kwargs = self.packing_controller.unpack(kwargs) - - impl_arg_to_arg = self.kernel.impl_arg_to_arg - arg_to_dtype = {} - for arg_name, val in six.iteritems(kwargs): - arg = impl_arg_to_arg.get(arg_name, None) - - if arg is None: - # offsets, strides and such - continue - - if arg.dtype is None and val is not None: - try: - dtype = val.dtype - except AttributeError: - pass - else: - arg_to_dtype[arg_name] = dtype - - kernel_info = self.cl_kernel_info( - frozenset(six.iteritems(arg_to_dtype))) - - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, - out_host, **kwargs) + super(CompiledKernel, self).__init__(context, kernel) # }}} - - -def get_highlighted_python_code(text): - try: - from pygments import highlight - except ImportError: - return text - else: - from pygments.lexers import PythonLexer - from pygments.formatters import TerminalFormatter - - return highlight(text, PythonLexer(), TerminalFormatter()) - - -def get_highlighted_cl_code(text): - try: - from pygments import highlight - except ImportError: - return text - else: - from 
pygments.lexers import CLexer - from pygments.formatters import TerminalFormatter - - return highlight(text, CLexer(), TerminalFormatter()) - - -# vim: foldmethod=marker diff --git a/loopy/execution.py b/loopy/execution.py new file mode 100644 index 0000000000000000000000000000000000000000..802684247f9f95a4374838ddcfaaae0ddbadec2e --- /dev/null +++ b/loopy/execution.py @@ -0,0 +1,200 @@ +from __future__ import division, with_statement, absolute_import + +__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six +import numpy as np +from pytools import Record, memoize_method +from loopy.diagnostic import LoopyError + + +# {{{ object array argument packing + +class _PackingInfo(Record): + """ + .. attribute:: name + .. attribute:: sep_shape + + .. attribute:: subscripts_and_names + + A list of type ``[(index, unpacked_name), ...]``. 
+ """ + + +class SeparateArrayPackingController(object): + """For argument arrays with axes tagged to be implemented as separate + arrays, this class provides preprocessing of the incoming arguments so that + all sub-arrays may be passed in one object array (under the original, + un-split argument name) and are unpacked into separate arrays before being + passed to the kernel. + + It also repacks outgoing arrays of this type back into an object array. + """ + + def __init__(self, kernel): + # map from arg name + self.packing_info = {} + + from loopy.kernel.array import ArrayBase + for arg in kernel.args: + if not isinstance(arg, ArrayBase): + continue + + if arg.shape is None or arg.dim_tags is None: + continue + + subscripts_and_names = arg.subscripts_and_names() + + if subscripts_and_names is None: + continue + + self.packing_info[arg.name] = _PackingInfo( + name=arg.name, + sep_shape=arg.sep_shape(), + subscripts_and_names=subscripts_and_names, + is_written=arg.name in kernel.get_written_variables()) + + def unpack(self, kernel_kwargs): + if not self.packing_info: + return kernel_kwargs + + kernel_kwargs = kernel_kwargs.copy() + + for packing_info in six.itervalues(self.packing_info): + arg_name = packing_info.name + if packing_info.name in kernel_kwargs: + arg = kernel_kwargs[arg_name] + for index, unpacked_name in packing_info.subscripts_and_names: + assert unpacked_name not in kernel_kwargs + kernel_kwargs[unpacked_name] = arg[index] + del kernel_kwargs[arg_name] + + return kernel_kwargs + + def pack(self, outputs): + if not self.packing_info: + return outputs + + for packing_info in six.itervalues(self.packing_info): + if not packing_info.is_written: + continue + + result = outputs[packing_info.name] = \ + np.zeros(packing_info.sep_shape, dtype=np.object) + + for index, unpacked_name in packing_info.subscripts_and_names: + result[index] = outputs.pop(unpacked_name) + + return outputs + +# }}} + + +# {{{ KernelExecutorBase + +class KernelExecutorBase(object): 
+ """An object connecting a kernel to a :class:`pyopencl.Context` + for execution. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, kernel): + """ + :arg kernel: a loopy.LoopKernel + """ + + self.kernel = kernel + + self.packing_controller = SeparateArrayPackingController(kernel) + + self.output_names = tuple(arg.name for arg in self.kernel.args + if arg.name in self.kernel.get_written_variables()) + + self.has_runtime_typed_args = any( + arg.dtype is None + for arg in kernel.args) + + @memoize_method + def get_typed_and_scheduled_kernel(self, var_to_dtype_set): + kernel = self.kernel + + from loopy.kernel.tools import add_dtypes + + if var_to_dtype_set: + var_to_dtype = {} + for var, dtype in var_to_dtype_set: + try: + dest_name = kernel.impl_arg_to_arg[var].name + except KeyError: + dest_name = var + + try: + var_to_dtype[dest_name] = dtype + except KeyError: + raise LoopyError("cannot set type for '%s': " + "no known variable/argument with that name" + % var) + + kernel = add_dtypes(kernel, var_to_dtype) + + from loopy.preprocess import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + if kernel.schedule is None: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + + return kernel + + def arg_to_dtype_set(self, kwargs): + if not self.has_runtime_typed_args: + return None + + impl_arg_to_arg = self.kernel.impl_arg_to_arg + arg_to_dtype = {} + for arg_name, val in six.iteritems(kwargs): + arg = impl_arg_to_arg.get(arg_name, None) + + if arg is None: + # offsets, strides and such + continue + + if arg.dtype is None and val is not None: + try: + dtype = val.dtype + except AttributeError: + pass + else: + arg_to_dtype[arg_name] = dtype + + return frozenset(six.iteritems(arg_to_dtype)) + +# }}} + +# vim: foldmethod=marker diff --git 
a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index e6e3d07c3a93ab0d3b7a308505cae1549802e364..f0f978c43fec48e24d192f2626baf59e4657aa3d 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -234,7 +234,7 @@ class F2LoopyTranslator(FTreeWalkerBase): from loopy.kernel.data import Assignment insn = Assignment( lhs, rhs, - forced_iname_deps=frozenset( + within_inames=frozenset( scope.active_loopy_inames), depends_on=depends_on, id=new_id, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 077bc2dd12aeede3d59abd93ce04f795dad99b3f..c3c543d853b0f21db2ee7b1f5e63952f3cd4e716 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -303,6 +303,8 @@ class LoopKernel(RecordWithoutPickling): state=state, target=target) + self._kernel_executor_cache = {} + # }}} # {{{ function mangling @@ -671,7 +673,7 @@ class LoopKernel(RecordWithoutPickling): """ result = {} for insn in self.instructions: - result[insn.id] = insn.forced_iname_deps + result[insn.id] = insn.within_inames return result @@ -685,7 +687,7 @@ class LoopKernel(RecordWithoutPickling): def insn_inames(self, insn): if isinstance(insn, str): insn = self.id_to_insn[insn] - return insn.forced_iname_deps + return insn.within_inames @memoize_method def iname_to_insns(self): @@ -1262,14 +1264,15 @@ class LoopKernel(RecordWithoutPickling): # {{{ direct execution - @memoize_method - def get_compiled_kernel(self, ctx): - from loopy.compiled import CompiledKernel - return CompiledKernel(ctx, self) + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + kex = self._kernel_executor_cache[key] + except KeyError: + kex = self.target.get_kernel_executor(self, *args, **kwargs) + self._kernel_executor_cache[key] = kex - def __call__(self, queue, **kwargs): - cknl = self.get_compiled_kernel(queue.context) - return cknl(queue, **kwargs) + return kex(*args, **kwargs) # }}} @@ 
-1363,7 +1366,7 @@ class LoopKernel(RecordWithoutPickling): return False for set_a, set_b in zip(self.domains, other.domains): - if not set_a.plain_is_equal(set_b): + if not (set_a.plain_is_equal(set_b) or set_a.is_equal(set_b)): return False elif field_name == "assumptions": diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index ce8092b346f3a4232e7398e45799cda28a99c08c..cff92016dd4b07f937d443adbf8a3baa5fc5890a 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -782,7 +782,6 @@ class ArrayBase(Record): kwargs["strides"] = strides if dim_names is not None and not isinstance(dim_names, tuple): - pu.db from warnings import warn warn("dim_names is not a tuple when calling ArrayBase constructor", DeprecationWarning, stacklevel=2) @@ -1173,6 +1172,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): or *None*. """ + import loopy as lp + from pymbolic import var + def eval_expr_assert_integer_constant(i, expr): from pymbolic.mapper.evaluator import UnknownVariableError try: @@ -1191,6 +1193,16 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): return result + def apply_offset(sub): + if ary.offset: + offset_name = ary.offset + if offset_name is lp.auto: + offset_name = array_name+"_offset" + + return var(offset_name) + sub + else: + return sub + if not isinstance(index, tuple): index = (index,) @@ -1204,7 +1216,10 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): "'shape=None'?)" % ary.name) - return AccessInfo(array_name=array_name, subscripts=index, vector_index=None) + return AccessInfo( + array_name=array_name, + subscripts=(apply_offset(index[0]),), + vector_index=None) if len(ary.dim_tags) != len(index): raise LoopyError("subscript to '%s[%s]' has the wrong " @@ -1231,8 +1246,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): if isinstance(dim_tag, FixedStrideArrayDimTag): - import 
loopy as lp - stride = dim_tag.stride if is_integer(stride): @@ -1243,7 +1256,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): % (ary.name, i, dim_tag.stride, vector_size)) elif stride is lp.auto: - from pymbolic import var stride = var(array_name + "_stride%d" % i) subscripts[dim_tag.target_axis] += (stride // vector_size)*idx @@ -1278,11 +1290,7 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): if num_target_axes > 1: raise NotImplementedError("offsets for multiple image axes") - offset_name = ary.offset - if offset_name is lp.auto: - offset_name = array_name+"_offset" - - subscripts[0] = var(offset_name) + subscripts[0] + subscripts[0] = apply_offset(subscripts[0]) return AccessInfo( array_name=array_name, diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3583fdc689659cca3d240edf9be7f501672d9987..6aa2fcbbf6cabd4b5c8eea98dbbcbed35f0edd89 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -158,8 +158,8 @@ def get_default_insn_options_dict(): "insn_id": None, "inames_to_dup": [], "priority": 0, - "forced_iname_deps_is_final": False, - "forced_iname_deps": frozenset(), + "within_inames_is_final": False, + "within_inames": frozenset(), "predicates": frozenset(), "tags": frozenset(), "atomicity": (), @@ -215,10 +215,10 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): for value in opt_value.split(":"): arrow_idx = value.find("->") if arrow_idx >= 0: - result["inames_to_dup"].append( + result.setdefault("inames_to_dup", []).append( (value[:arrow_idx], value[arrow_idx+2:])) else: - result["inames_to_dup"].append((value, None)) + result.setdefault("inames_to_dup", []).append((value, None)) elif opt_key == "dep" and opt_value is not None: if is_with_block: @@ -254,12 +254,12 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): elif opt_key == "inames" and opt_value is not None: if opt_value.startswith("+"): - 
result["forced_iname_deps_is_final"] = False + result["within_inames_is_final"] = False opt_value = (opt_value[1:]).strip() else: - result["forced_iname_deps_is_final"] = True + result["within_inames_is_final"] = True - result["forced_iname_deps"] = intern_frozenset_of_ids( + result["within_inames"] = intern_frozenset_of_ids( opt_value.split(":")) elif opt_key == "if" and opt_value is not None: @@ -316,8 +316,14 @@ WITH_OPTIONS_RE = re.compile( FOR_RE = re.compile( "^" - "\s*for\s*" - "(?P<inames>[ ,\w]+)" + "\s*(for)\s+" + "(?P<inames>[ ,\w]*)" + "\s*$") + +IF_RE = re.compile( + "^" + "\s*if\s+" + "(?P<predicate>\w+)" "\s*$") INSN_RE = re.compile( @@ -329,6 +335,13 @@ INSN_RE = re.compile( "\s*?" "(?:\{(?P<options>.+)\}\s*)?$") +EMPTY_LHS_INSN_RE = re.compile( + "^" + "\s*" + "(?P<rhs>.+?)" + "\s*?" + "(?:\{(?P<options>.+)\}\s*)?$") + SUBST_RE = re.compile( r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$") @@ -342,14 +355,17 @@ def parse_insn(groups, insn_options): """ import loopy as lp - from loopy.symbolic import parse - try: - lhs = parse(groups["lhs"]) - except: - print("While parsing left hand side '%s', " - "the following error occurred:" % groups["lhs"]) - raise + + if "lhs" in groups: + try: + lhs = parse(groups["lhs"]) + except: + print("While parsing left hand side '%s', " + "the following error occurred:" % groups["lhs"]) + raise + else: + lhs = () try: rhs = parse(groups["rhs"]) @@ -508,8 +524,8 @@ def parse_instructions(instructions, defines): groups=frozenset(intern(grp) for grp in insn.groups), conflicts_with_groups=frozenset( intern(grp) for grp in insn.conflicts_with_groups), - forced_iname_deps=frozenset( - intern(iname) for iname in insn.forced_iname_deps), + within_inames=frozenset( + intern(iname) for iname in insn.within_inames), predicates=frozenset( intern(pred) for pred in insn.predicates), )) @@ -593,27 +609,27 @@ def parse_instructions(instructions, defines): for insn in instructions: if isinstance(insn, InstructionBase): - local_fids = 
insn_options_stack[-1]["forced_iname_deps"] + local_w_inames = insn_options_stack[-1]["within_inames"] - if insn.forced_iname_deps_is_final: + if insn.within_inames_is_final: if not ( - local_fids <= insn.forced_iname_deps): + local_w_inames <= insn.within_inames): raise LoopyError("non-parsed instruction '%s' without " "inames '%s' (but with final iname dependencies) " "found inside 'for'/'with' block for inames " "'%s'" % (insn.id, - ", ".join(local_fids - insn.forced_iname_deps), - insn_options_stack[-1].forced_iname_deps)) + ", ".join(local_w_inames - insn.within_inames), + insn_options_stack[-1].within_inames)) else: # not final, add inames from current scope insn = insn.copy( - forced_iname_deps=insn.forced_iname_deps | local_fids, - forced_iname_deps_is_final=( + within_inames=insn.within_inames | local_w_inames, + within_inames_is_final=( # If it's inside a for/with block, then it's # final now. - bool(local_fids)), + bool(local_w_inames)), tags=( insn.tags | insn_options_stack[-1]["tags"]), @@ -631,7 +647,7 @@ def parse_instructions(instructions, defines): new_instructions.append(insn) inames_to_dup.append([]) - del local_fids + del local_w_inames continue @@ -646,12 +662,32 @@ def parse_instructions(instructions, defines): for_match = FOR_RE.match(insn) if for_match is not None: options = insn_options_stack[-1].copy() - options["forced_iname_deps"] = ( - options.get("forced_iname_deps", frozenset()) - | frozenset( - iname.strip() - for iname in for_match.group("inames").split(","))) - options["forced_iname_deps_is_final"] = True + added_inames = frozenset( + iname.strip() + for iname in for_match.group("inames").split(",") + if iname.strip()) + if not added_inames: + raise LoopyError("'for' without inames encountered") + + options["within_inames"] = ( + options.get("within_inames", frozenset()) + | added_inames) + options["within_inames_is_final"] = True + + insn_options_stack.append(options) + del options + continue + + if_match = IF_RE.match(insn) + if 
if_match is not None: + options = insn_options_stack[-1].copy() + predicate = if_match.group("predicate") + if not predicate: + raise LoopyError("'if' without predicate encountered") + + options["predicates"] = ( + options.get("predicates", frozenset()) + | frozenset([predicate])) insn_options_stack.append(options) del options @@ -675,6 +711,14 @@ def parse_instructions(instructions, defines): inames_to_dup.append(insn_inames_to_dup) continue + insn_match = EMPTY_LHS_INSN_RE.match(insn) + if insn_match is not None: + insn, insn_inames_to_dup = parse_insn( + insn_match.groupdict(), insn_options_stack[-1]) + new_instructions.append(insn) + inames_to_dup.append(insn_inames_to_dup) + continue + raise LoopyError("instruction parse error: %s" % insn) if len(insn_options_stack) != 1: @@ -732,7 +776,7 @@ def parse_domains(domains, defines): parameters = (_gather_isl_identifiers(dom) - _find_inames_in_set(dom) - _find_existentially_quantified_inames(dom)) - dom = "[%s] -> %s" % (",".join(parameters), dom) + dom = "[%s] -> %s" % (",".join(sorted(parameters)), dom) try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) @@ -815,17 +859,21 @@ class ArgumentGuesser: if isinstance(insn, MultiAssignmentBase): for assignee_var_name in insn.assignee_var_names(): self.all_written_names.add(assignee_var_name) - self.all_names.update(get_dependencies( - self.submap(insn.assignees))) - self.all_names.update(get_dependencies( - self.submap(insn.expression))) + + self.all_names.update(get_dependencies( + self.submap(insn.assignees))) + self.all_names.update(get_dependencies( + self.submap(insn.expression))) def find_index_rank(self, name): irf = IndexRankFinder(name) + def run_irf(expr): + irf(self.submap(expr)) + return expr + for insn in self.instructions: - insn.with_transformed_expressions( - lambda expr: irf(self.submap(expr))) + insn.with_transformed_expressions(run_irf) if not irf.index_ranks: return 0 @@ -941,13 +989,13 @@ def check_for_duplicate_names(knl): def 
check_for_nonexistent_iname_deps(knl): for insn in knl.instructions: - if not set(insn.forced_iname_deps) <= knl.all_inames(): + if not set(insn.within_inames) <= knl.all_inames(): raise ValueError("In instruction '%s': " "cannot force dependency on inames '%s'--" "they don't exist" % ( insn.id, ",".join( - set(insn.forced_iname_deps)-knl.all_inames()))) + set(insn.within_inames)-knl.all_inames()))) def check_for_multiple_writes_to_loop_bounds(knl): @@ -1037,8 +1085,8 @@ def expand_cses(instructions, cse_prefix="cse_expr"): assignee=Variable(new_var_name), expression=expr, predicates=insn.predicates, - forced_iname_deps=insn.forced_iname_deps, - forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + within_inames=insn.within_inames, + within_inames_is_final=insn.within_inames_is_final, ) newly_created_insn_ids.add(new_insn.id) new_insns.append(new_insn) @@ -1352,6 +1400,27 @@ def resolve_wildcard_deps(knl): # }}} +# {{{ add used inames deps + +def add_used_inames(knl): + new_insns = [] + + for insn in knl.instructions: + deps = insn.read_dependency_names() | insn.write_dependency_names() + iname_deps = deps & knl.all_inames() + + new_within_inames = insn.within_inames | iname_deps + + if new_within_inames != insn.within_inames: + insn = insn.copy(within_inames=new_within_inames) + + new_insns.append(insn) + + return knl.copy(instructions=new_insns) + +# }}} + + # {{{ add inferred iname deps def add_inferred_inames(knl): @@ -1359,7 +1428,7 @@ def add_inferred_inames(knl): insn_inames = find_all_insn_inames(knl) return knl.copy(instructions=[ - insn.copy(forced_iname_deps=insn_inames[insn.id]) + insn.copy(within_inames=insn_inames[insn.id]) for insn in knl.instructions]) # }}} @@ -1549,10 +1618,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): **kwargs) from loopy import duplicate_inames + from loopy.match import Id for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup): for old_iname, new_iname in insn_inames_to_dup: 
knl = duplicate_inames(knl, old_iname, - within=insn.id, new_inames=new_iname) + within=Id(insn.id), new_inames=new_iname) check_for_nonexistent_iname_deps(knl) @@ -1563,6 +1633,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # Must create temporaries before inferring inames (because those temporaries # mediate dependencies that are then used for iname propagation.) # ------------------------------------------------------------------------- + knl = add_used_inames(knl) # NOTE: add_inferred_inames will be phased out and throws warnings if it # does something. knl = add_inferred_inames(knl) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index a884fb0f07f14072c75201b0fe4013e3d72f691c..5af3f0272073243417c3111a847b921a41afec6e 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -26,6 +26,7 @@ THE SOFTWARE. from six.moves import intern +from warnings import warn import numpy as np # noqa from pytools import Record, memoize_method from loopy.kernel.array import ArrayBase @@ -590,32 +591,13 @@ class InstructionBase(Record): .. rubric:: Iname dependencies - .. attribute:: forced_iname_deps_is_final + .. attribute:: within_inames - A :class:`bool` determining whether :attr:`forced_iname_deps` constitutes - the *entire* list of iname dependencies. - - .. attribute:: forced_iname_deps - - A :class:`frozenset` of inames that are added to the list of iname - dependencies *or* constitute the entire list of iname dependencies, - depending on the value of :attr:`forced_iname_deps_is_final`. + A :class:`frozenset` of inames identifying the loops within which this + instruction will be executed. .. rubric:: Iname dependencies - .. attribute:: boostable - - Whether the instruction may safely be executed inside more loops than - advertised without changing the meaning of the program. Allowed values - are *None* (for unknown), *True*, and *False*. - - .. 
attribute:: boostable_into - - A :class:`set` of inames into which the instruction - may need to be boosted, as a heuristic help for the scheduler. - Also allowed to be *None* to indicate that this hasn't been - decided yet. - .. rubric:: Tagging .. attribute:: tags @@ -623,6 +605,9 @@ class InstructionBase(Record): A :class:`frozenset` of string identifiers that can be used to identify groups of instructions. + Tags starting with exclamation marks (``!``) are reserved and may have + specific meanings defined by :mod:`loopy` or its targets. + .. automethod:: __init__ .. automethod:: assignee_var_names .. automethod:: assignee_subscript_deps @@ -632,31 +617,48 @@ class InstructionBase(Record): .. automethod:: copy """ + # within_inames_is_final, boostable and boostable_into are deprecated and + # will be removed in version 2017.x. + fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " "predicates " - "forced_iname_deps_is_final forced_iname_deps " + "within_inames_is_final within_inames " "priority boostable boostable_into".split()) def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, - forced_iname_deps_is_final, forced_iname_deps, + within_inames_is_final, within_inames, priority, boostable, boostable_into, predicates, tags, - insn_deps=None, insn_deps_is_final=None): + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, forced_iname_deps_is_final=None): + + # {{{ backwards compatibility goop if depends_on is not None and insn_deps is not None: - raise ValueError("may not specify both insn_deps and depends_on") + raise LoopyError("may not specify both insn_deps and depends_on") elif insn_deps is not None: - from warnings import warn warn("insn_deps is deprecated, use depends_on", DeprecationWarning, stacklevel=2) depends_on = insn_deps depends_on_is_final = insn_deps_is_final + if forced_iname_deps is not None and within_inames is not None: + raise 
LoopyError("may not specify both forced_iname_deps " + "and within_inames") + elif forced_iname_deps is not None: + warn("forced_iname_deps is deprecated, use within_inames", + DeprecationWarning, stacklevel=2) + + within_inames = forced_iname_deps + within_inames_is_final = forced_iname_deps_is_final + + # }}} + if depends_on is None: depends_on = frozenset() @@ -669,8 +671,11 @@ class InstructionBase(Record): if no_sync_with is None: no_sync_with = frozenset() - if forced_iname_deps_is_final is None: - forced_iname_deps_is_final = False + if within_inames is None: + within_inames = frozenset() + + if within_inames_is_final is None: + within_inames_is_final = False if depends_on_is_final is None: depends_on_is_final = False @@ -694,10 +699,10 @@ class InstructionBase(Record): # assert all(is_interned(dep) for dep in depends_on) # assert all(is_interned(grp) for grp in groups) # assert all(is_interned(grp) for grp in conflicts_with_groups) - # assert all(is_interned(iname) for iname in forced_iname_deps) + # assert all(is_interned(iname) for iname in within_inames) # assert all(is_interned(pred) for pred in predicates) - assert isinstance(forced_iname_deps, frozenset) + assert isinstance(within_inames, frozenset) assert isinstance(depends_on, frozenset) or depends_on is None assert isinstance(groups, frozenset) assert isinstance(conflicts_with_groups, frozenset) @@ -708,24 +713,45 @@ class InstructionBase(Record): depends_on_is_final=depends_on_is_final, no_sync_with=no_sync_with, groups=groups, conflicts_with_groups=conflicts_with_groups, - forced_iname_deps_is_final=forced_iname_deps_is_final, - forced_iname_deps=forced_iname_deps, + within_inames_is_final=within_inames_is_final, + within_inames=within_inames, priority=priority, boostable=boostable, boostable_into=boostable_into, predicates=predicates, tags=tags) - # legacy + # {{{ backwards compatibility goop + @property def insn_deps(self): + warn("insn_deps is deprecated, use depends_on", + DeprecationWarning, 
stacklevel=2) + return self.depends_on # legacy @property def insn_deps_is_final(self): + warn("insn_deps_is_final is deprecated, use depends_on_is_final", + DeprecationWarning, stacklevel=2) + return self.depends_on_is_final + @property + def forced_iname_deps(self): + warn("forced_iname_deps is deprecated, use within_inames", + DeprecationWarning, stacklevel=2) + return self.within_inames + + @property + def forced_iname_deps_is_final(self): + warn("forced_iname_deps_is_final is deprecated, use within_inames_is_final", + DeprecationWarning, stacklevel=2) + return self.within_inames_is_final + + # }}} + # {{{ abstract interface def read_dependency_names(self): @@ -877,8 +903,8 @@ class InstructionBase(Record): self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( intern_frozenset_of_ids(self.conflicts_with_groups)) - self.forced_iname_deps = ( - intern_frozenset_of_ids(self.forced_iname_deps)) + self.within_inames = ( + intern_frozenset_of_ids(self.within_inames)) self.predicates = ( intern_frozenset_of_ids(self.predicates)) @@ -1170,12 +1196,13 @@ class Assignment(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, - forced_iname_deps_is_final=None, - forced_iname_deps=frozenset(), + within_inames_is_final=None, + within_inames=None, boostable=None, boostable_into=None, tags=None, temp_var_type=None, atomicity=(), priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None): + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, forced_iname_deps_is_final=None): super(Assignment, self).__init__( id=id, @@ -1184,15 +1211,17 @@ class Assignment(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - forced_iname_deps_is_final=forced_iname_deps_is_final, - forced_iname_deps=forced_iname_deps, + within_inames_is_final=within_inames_is_final, + within_inames=within_inames, boostable=boostable, boostable_into=boostable_into, 
priority=priority, predicates=predicates, tags=tags, insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final) + insn_deps_is_final=insn_deps_is_final, + forced_iname_deps=forced_iname_deps, + forced_iname_deps_is_final=forced_iname_deps_is_final) from loopy.symbolic import parse if isinstance(assignee, str): @@ -1200,13 +1229,10 @@ class Assignment(MultiAssignmentBase): if isinstance(expression, str): expression = parse(expression) - # FIXME: It may be worth it to enable this check eventually. - # For now, it causes grief with certain 'checky' uses of the - # with_transformed_expressions(). (notably the access checker) - # - # from pymbolic.primitives import Variable, Subscript - # if not isinstance(assignee, (Variable, Subscript)): - # raise LoopyError("invalid lvalue '%s'" % assignee) + from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript + if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + raise LoopyError("invalid lvalue '%s'" % assignee) self.assignee = assignee self.expression = expression @@ -1288,6 +1314,8 @@ class CallInstruction(MultiAssignmentBase): .. attribute:: assignees + A :class:`tuple` of left-hand sides for the assignment + .. 
attribute:: expression The following attributes are only used until @@ -1312,12 +1340,14 @@ class CallInstruction(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, - forced_iname_deps_is_final=None, - forced_iname_deps=frozenset(), + within_inames_is_final=None, + within_inames=None, boostable=None, boostable_into=None, tags=None, temp_var_types=None, priority=0, predicates=frozenset(), - insn_deps=None, insn_deps_is_final=None): + insn_deps=None, insn_deps_is_final=None, + forced_iname_deps=None, + forced_iname_deps_is_final=None): super(CallInstruction, self).__init__( id=id, @@ -1326,15 +1356,17 @@ class CallInstruction(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - forced_iname_deps_is_final=forced_iname_deps_is_final, - forced_iname_deps=forced_iname_deps, + within_inames_is_final=within_inames_is_final, + within_inames=within_inames, boostable=boostable, boostable_into=boostable_into, priority=priority, predicates=predicates, tags=tags, insn_deps=insn_deps, - insn_deps_is_final=insn_deps_is_final) + insn_deps_is_final=insn_deps_is_final, + forced_iname_deps=forced_iname_deps, + forced_iname_deps_is_final=forced_iname_deps_is_final) from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -1345,16 +1377,19 @@ class CallInstruction(MultiAssignmentBase): from loopy.symbolic import parse if isinstance(assignees, str): assignees = parse(assignees) + if not isinstance(assignees, tuple): + raise LoopyError("'assignees' argument to CallInstruction " + "must be a tuple or a string parseable to a tuple" + "--got '%s'" % type(assignees).__name__) + if isinstance(expression, str): expression = parse(expression) - # FIXME: It may be worth it to enable this check eventually. - # For now, it causes grief with certain 'checky' uses of the - # with_transformed_expressions(). 
(notably the access checker) - # - # from pymbolic.primitives import Variable, Subscript - # if not isinstance(assignee, (Variable, Subscript)): - # raise LoopyError("invalid lvalue '%s'" % assignee) + from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript + for assignee in assignees: + if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + raise LoopyError("invalid lvalue '%s'" % assignee) self.assignees = assignees self.expression = expression @@ -1414,9 +1449,7 @@ class CallInstruction(MultiAssignmentBase): def make_assignment(assignees, expression, temp_var_types=None, **kwargs): - if len(assignees) < 1: - raise LoopyError("every instruction must have a left-hand side") - elif len(assignees) > 1: + if len(assignees) > 1 or len(assignees) == 0: atomicity = kwargs.pop("atomicity", ()) if atomicity: raise LoopyError("atomic operations with more than one " @@ -1491,7 +1524,7 @@ class CInstruction(InstructionBase): id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, - forced_iname_deps_is_final=None, forced_iname_deps=frozenset(), + within_inames_is_final=None, within_inames=None, priority=0, boostable=None, boostable_into=None, predicates=frozenset(), tags=None, insn_deps=None, insn_deps_is_final=None): @@ -1511,8 +1544,8 @@ class CInstruction(InstructionBase): depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - forced_iname_deps_is_final=forced_iname_deps_is_final, - forced_iname_deps=forced_iname_deps, + within_inames_is_final=within_inames_is_final, + within_inames=within_inames, boostable=boostable, boostable_into=boostable_into, priority=priority, predicates=predicates, tags=tags, diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index fa47cd275419c8ca9fbfdabb927ddad8bc25879c..7bf4d786613f5fb3b9d9633e1f810ccbc260d061 100644 --- a/loopy/kernel/tools.py +++ 
b/loopy/kernel/tools.py @@ -63,10 +63,11 @@ def _add_dtypes(knl, dtype_dict): dtype_dict = dtype_dict.copy() new_args = [] + from loopy.types import to_loopy_type for arg in knl.args: new_dtype = dtype_dict.pop(arg.name, None) if new_dtype is not None: - new_dtype = np.dtype(new_dtype) + new_dtype = to_loopy_type(new_dtype) if arg.dtype is not None and arg.dtype != new_dtype: raise RuntimeError( "argument '%s' already has a different dtype " @@ -142,7 +143,7 @@ def guess_iname_deps_based_on_var_use(kernel, insn, insn_id_to_inames=None): for writer_id in writer_map[tv_name]: writer_insn = kernel.id_to_insn[writer_id] if insn_id_to_inames is None: - writer_inames = writer_insn.forced_iname_deps + writer_inames = writer_insn.within_inames else: writer_inames = insn_id_to_inames[writer_id] @@ -180,12 +181,12 @@ def find_all_insn_inames(kernel): all_write_deps[insn.id] = write_deps = insn.write_dependency_names() deps = read_deps | write_deps - if insn.forced_iname_deps_is_final: - iname_deps = insn.forced_iname_deps + if insn.within_inames_is_final: + iname_deps = insn.within_inames else: iname_deps = ( deps & kernel.all_inames() - | insn.forced_iname_deps) + | insn.within_inames) assert isinstance(read_deps, frozenset), type(insn) assert isinstance(write_deps, frozenset), type(insn) @@ -217,7 +218,7 @@ def find_all_insn_inames(kernel): did_something = False for insn in kernel.instructions: - if insn.forced_iname_deps_is_final: + if insn.within_inames_is_final: continue # {{{ depdency-based propagation @@ -232,12 +233,12 @@ def find_all_insn_inames(kernel): did_something = True warn_with_kernel(kernel, "inferred_iname", - "The iname(s) '%s' on instruction '%s' in kernel '%s' " + "The iname(s) '%s' on instruction '%s' " "was/were automatically added. " "This is deprecated. Please add the iname " "to the instruction " "explicitly, e.g. 
by adding 'for' loops" - % (", ".join(inames_new-inames_old), insn.id, kernel.name)) + % (", ".join(inames_new-inames_old), insn.id)) # }}} @@ -273,7 +274,7 @@ def find_all_insn_inames(kernel): "automatically added. " "This is deprecated. Please add the iname " "to the instruction " - "explicitly, e.g. by adding '{inames=...}" + "explicitly, e.g. by adding 'for' loops" % (", ".join(inames_new-inames_old), insn.id)) # }}} diff --git a/loopy/maxima.py b/loopy/maxima.py index 738df86c4b94988b57d6dc01bcc22bb0ca62ac21..22d0c085c0edd5a96b6b45b457957f4be50e49d7 100644 --- a/loopy/maxima.py +++ b/loopy/maxima.py @@ -25,7 +25,8 @@ THE SOFTWARE. """ -from pymbolic.maxima import MaximaStringifyMapper as MaximaStringifyMapperBase +from pymbolic.interop.maxima import \ + MaximaStringifyMapper as MaximaStringifyMapperBase class MaximaStringifyMapper(MaximaStringifyMapperBase): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9738777ad1fd599732e39d419c3de43827f13470..bdb91d01d7988c1bf5517f699664e0090f97cd34 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -528,8 +528,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): init_insn = make_assignment( id=init_id, assignees=acc_vars, - forced_iname_deps=outer_insn_inames - frozenset(expr.inames), - forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(), expression=expr.operation.neutral_element(arg_dtype, expr.inames)) @@ -539,8 +539,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames) - if insn.forced_iname_deps_is_final: - update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames) + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | 
set(expr.inames) reduction_insn = make_assignment( id=update_id, @@ -550,8 +550,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_vars if len(acc_vars) > 1 else acc_vars[0], expr.expr, expr.inames), depends_on=frozenset([init_insn.id]) | insn.depends_on, - forced_iname_deps=update_insn_iname_deps, - forced_iname_deps_is_final=insn.forced_iname_deps_is_final) + within_inames=update_insn_iname_deps, + within_inames_is_final=insn.within_inames_is_final) generated_insns.append(reduction_insn) @@ -644,8 +644,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + (var(base_exec_iname),)] for acc_var in acc_vars), expression=neutral, - forced_iname_deps=base_iname_deps | frozenset([base_exec_iname]), - forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset()) generated_insns.append(init_insn) @@ -657,10 +657,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): for acc_var in acc_vars), expression=expr.operation( arg_dtype, neutral, expr.expr, expr.inames), - forced_iname_deps=( + within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), - forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, no_sync_with=frozenset([init_id])) generated_insns.append(transfer_insn) @@ -706,9 +706,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var(stage_exec_iname) + new_size,)] for acc_var in acc_vars]), expr.inames), - forced_iname_deps=( + within_inames=( base_iname_deps | frozenset([stage_exec_iname])), - forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), ) @@ 
-721,7 +721,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): new_insn_add_depends_on.add(prev_id) new_insn_add_no_sync_with.add(prev_id) - new_insn_add_forced_iname_deps.add(stage_exec_iname or base_exec_iname) + new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) if nresults == 1: assert len(acc_vars) == 1 @@ -831,7 +831,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): while insn_queue: new_insn_add_depends_on = set() new_insn_add_no_sync_with = set() - new_insn_add_forced_iname_deps = set() + new_insn_add_within_inames = set() generated_insns = [] @@ -860,9 +860,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): | frozenset(new_insn_add_depends_on), no_sync_with=insn.no_sync_with | frozenset(new_insn_add_no_sync_with), - forced_iname_deps=( + within_inames=( temp_kernel.insn_inames(insn) - | new_insn_add_forced_iname_deps)) + | new_insn_add_within_inames)) kwargs.pop("id") kwargs.pop("expression") @@ -871,13 +871,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kwargs.pop("temp_var_type", None) kwargs.pop("temp_var_types", None) - replacement_insns = [ - lp.Assignment( - id=insn_id_gen(insn.id), - assignee=assignee, - expression=new_expr, - **kwargs) - for assignee, new_expr in zip(insn.assignees, new_expressions)] + if isinstance(insn.expression, Reduction) and nresults > 1: + replacement_insns = [ + lp.Assignment( + id=insn_id_gen(insn.id), + assignee=assignee, + expression=new_expr, + **kwargs) + for assignee, new_expr in zip( + insn.assignees, new_expressions)] + + else: + new_expr, = new_expressions + replacement_insns = [ + make_assignment( + id=insn_id_gen(insn.id), + assignees=insn.assignees, + expression=new_expr, + **kwargs) + ] insn_id_replacements[insn.id] = [ rinsn.id for rinsn in replacement_insns] @@ -1054,7 +1066,7 @@ def preprocess_kernel(kernel, device=None): try: result = preprocess_cache[kernel] - 
logger.info("%s: preprocess cache hit" % kernel.name) + logger.debug("%s: preprocess cache hit" % kernel.name) return result except KeyError: pass @@ -1063,6 +1075,9 @@ def preprocess_kernel(kernel, device=None): logger.info("%s: preprocess start" % kernel.name) + from loopy.check import check_identifiers_in_subst_rules + check_identifiers_in_subst_rules(kernel) + # {{{ check that there are no l.auto-tagged inames from loopy.kernel.data import AutoLocalIndexTagBase @@ -1106,6 +1121,8 @@ def preprocess_kernel(kernel, device=None): kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel) kernel = find_temporary_scope(kernel) + + # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 3437c49177e87288c5f0a14a599618ee0144bf72..04882334f7c5ce9337a690277b16e258eac5327b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -27,7 +27,7 @@ import six from pytools import Record import sys import islpy as isl -from loopy.diagnostic import LoopyError # noqa +from loopy.diagnostic import warn_with_kernel, LoopyError # noqa from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder @@ -529,6 +529,11 @@ class SchedulerState(Record): A mapping from instruction group names to the number of instructions in them that are left to schedule. If a group name occurs in this mapping, that group is considered active. + + .. attribute:: uses_of_boostability + + Used to produce warnings about deprecated 'boosting' behavior + Should be removed along with boostability in 2017.x. """ @property @@ -624,6 +629,7 @@ def generate_loop_schedules_internal( # If insn is boostable, it may be placed inside a more deeply # nested loop without harm. + orig_have = have if allow_boost: # Note that the inames in 'insn.boostable_into' necessarily won't # be contained in 'want'. 
@@ -685,12 +691,21 @@ def generate_loop_schedules_internal( # }}} + new_uses_of_boostability = [] + if allow_boost: + if orig_have & insn.boostable_into: + new_uses_of_boostability.append( + (insn.id, orig_have & insn.boostable_into)) + new_sched_state = sched_state.copy( scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, schedule=( sched_state.schedule + (RunInstruction(insn_id=insn.id),)), active_group_counts=new_active_group_counts, + uses_of_boostability=( + sched_state.uses_of_boostability + + new_uses_of_boostability) ) # Don't be eager about entering/leaving loops--if progress has been @@ -875,6 +890,10 @@ def generate_loop_schedules_internal( writer_insn, = kernel.writer_map()[domain_par] if writer_insn not in sched_state.scheduled_insn_ids: data_dep_written = False + if debug_mode: + print("iname '%s' not scheduled because domain " + "parameter '%s' is not yet available" + % (iname, domain_par)) break if not data_dep_written: @@ -1000,6 +1019,15 @@ def generate_loop_schedules_internal( # if done, yield result debug.log_success(sched_state.schedule) + for boost_insn_id, boost_inames in sched_state.uses_of_boostability: + warn_with_kernel( + kernel, "used_boostability", + "instruction '%s' was implicitly nested inside " + "inames '%s' based on an idempotence heuristic. " + "This is deprecated and will stop working in loopy 2017.x." + % (boost_insn_id, ", ".join(boost_inames)), + DeprecationWarning) + yield sched_state.schedule else: @@ -1053,7 +1081,7 @@ class DependencyRecord(Record): def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): - """If there exists a depdency between target and source and the two access + """If there exists a dependency between target and source and the two access a common variable of *var_kind* in a way that requires a barrier (essentially, at least one write), then the function will return a tuple ``(target, source, var_name)``. 
Otherwise, it will return *None*. @@ -1437,7 +1465,9 @@ def generate_loop_schedules(kernel, debug_args={}): parallel_inames=parallel_inames - ilp_inames - vec_inames, group_insn_counts=group_insn_counts(kernel), - active_group_counts={}) + active_group_counts={}, + + uses_of_boostability=[]) generators = [ generate_loop_schedules_internal(sched_state, @@ -1546,7 +1576,7 @@ def get_one_scheduled_kernel(kernel): try: result, ambiguous = schedule_cache[sched_cache_key] - logger.info("%s: schedule cache hit" % kernel.name) + logger.debug("%s: schedule cache hit" % kernel.name) from_cache = True except KeyError: pass diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index c3a7d9b0427b660254c8d8210acaee85909a6546..ca782a3d8ca85ea6250f7c9317ca0947db28d5e8 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -95,11 +95,22 @@ def get_common_hw_inames(kernel, insn_ids): # Get the list of hardware inames in which the temporary is defined. 
if len(insn_ids) == 0: return set() - id_to_insn = kernel.id_to_insn - from six.moves import reduce - return reduce( - set.intersection, - (get_hw_inames(kernel, id_to_insn[id]) for id in insn_ids)) + return set.intersection( + *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids)) + + +def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): + from loopy.kernel.data import HardwareParallelTag + new_schedule = [] + + for item in kernel.schedule: + if isinstance(item, (EnterLoop, LeaveLoop)): + tag = kernel.iname_to_tag.get(item.iname) + if isinstance(tag, HardwareParallelTag): + continue + new_schedule.append(item) + + return kernel.copy(schedule=new_schedule) # }}} @@ -268,7 +279,7 @@ def compute_live_temporaries(kernel, schedule): live_in[idx] = live_out[idx] = live_in[idx + 1] idx -= 1 else: - raise LoopyError("unexepcted type of schedule item: %s" + raise LoopyError("unexpected type of schedule item: %s" % type(sched_item).__name__) # }}} @@ -333,6 +344,10 @@ class PromotedTemporary(Record): def determine_temporaries_to_promote(kernel, temporaries, name_gen): """ + For each temporary in the passed list of temporaries, construct a + :class:`PromotedTemporary` which describes how the temporary should + get promoted into global storage. + :returns: A :class:`dict` mapping temporary names from `temporaries` to :class:`PromotedTemporary` objects """ @@ -351,6 +366,18 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen): assert temporary.base_storage is None, \ "Cannot promote temporaries with base_storage to global" + # `hw_inames`: The set of hw-parallel tagged inames that this temporary + # is associated with. This is used for determining the shape of the + # global storage needed for saving and restoring the temporary across + # kernel calls. + # + # TODO: Make a policy decision about which dimensions to use. 
Currently, + # the code looks at each instruction that defines or uses the temporary, + # and takes the common set of hw-parallel tagged inames associated with + # these instructions. + # + # Furthermore, in the case of local temporaries, inames that are tagged + # hw-local do not contribute to the global storage shape. hw_inames = get_common_hw_inames(kernel, def_lists[temporary.name] + use_lists[temporary.name]) @@ -358,6 +385,8 @@ def determine_temporaries_to_promote(kernel, temporaries, name_gen): hw_inames = sorted(hw_inames, key=lambda iname: str(kernel.iname_to_tag[iname])) + # Calculate the sizes of the dimensions that get added in front for + # the global storage of the temporary. shape_prefix = [] backing_hw_inames = [] @@ -411,11 +440,15 @@ def augment_domain_for_temporary_promotion( new_iname = name_gen("{name}_{mode}_dim_{dim}". format(name=orig_temporary.name, mode=mode, - dim=orig_dim + t_idx)) + dim=t_idx)) domain = domain.set_dim_name( isl.dim_type.set, orig_dim + t_idx, new_iname) - #from loopy.kernel.data import auto - #iname_to_tag[new_iname] = auto + if orig_temporary.is_local: + # If the temporary is has local scope, then loads / stores can be + # done in parallel. + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() + dim_inames.append(new_iname) # Add size information. 
@@ -423,7 +456,7 @@ def augment_domain_for_temporary_promotion( domain &= aff[0].le_set(aff[new_iname]) size = orig_temporary.shape[t_idx] from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size)) + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) hw_inames = [] @@ -538,7 +571,7 @@ def restore_and_save_temporaries(kernel): for tval in tvals: from loopy.kernel.tools import DomainChanger tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(kernel, + dchg = DomainChanger(new_kernel, frozenset(sched_item.extra_inames + tval_hw_inames)) domain = dchg.domain @@ -572,7 +605,9 @@ def restore_and_save_temporaries(kernel): args = reversed(args) from loopy.kernel.data import Assignment - new_insn = Assignment(*args, id=insn_id) + new_insn = Assignment(*args, id=insn_id, + within_inames=frozenset(hw_inames + dim_inames), + within_inames_is_final=True) new_instructions.append(new_insn) @@ -620,17 +655,25 @@ def restore_and_save_temporaries(kernel): # }}} new_iname_to_tag.update(kernel.iname_to_tag) - new_temporary_variables = dict( + updated_temporary_variables = dict( (t.name, t.as_variable()) for t in new_temporaries.values()) - new_temporary_variables.update(kernel.temporary_variables) + updated_temporary_variables.update(kernel.temporary_variables) kernel = kernel.copy( iname_to_tag=new_iname_to_tag, - temporary_variables=new_temporary_variables, + temporary_variables=updated_temporary_variables, instructions=kernel.instructions + new_instructions, schedule=new_schedule ) + from loopy.kernel.tools import assign_automatic_axes + kernel = assign_automatic_axes(kernel) + + # Once assign_automatic_axes() does its job, loops in the schedule + # for newly hardware-tagged inames are no longer necessary (and in + # fact illegal), so remove them. 
+ kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) + return kernel @@ -722,7 +765,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): current_chunk.append(sched_item) i += 1 else: - raise LoopyError("unexepcted type of schedule item: %s" + raise LoopyError("unexpected type of schedule item: %s" % type(sched_item).__name__) if current_chunk and schedule_required_splitting: diff --git a/loopy/statistics.py b/loopy/statistics.py index 932acab50b29163345585b10edca0e020f8183b8..6fa4614b9711ce30c44333276970492979554c42 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -308,7 +308,8 @@ class GlobalSubscriptCounter(CombineMapper): from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): - coeffs = CoefficientCollector()(idx) + from loopy.symbolic import simplify_using_aff + coeffs = CoefficientCollector()(simplify_using_aff(self.knl, idx)) # check if he contains the lid 0 guy try: coeff_id0 = coeffs[Variable(local_id0)] @@ -890,7 +891,10 @@ def gather_access_footprints(kernel, ignore_uncountable=False): from loopy.preprocess import preprocess_kernel, infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - kernel = preprocess_kernel(kernel) + + from loopy.kernel import kernel_state + if kernel.state < kernel_state.PREPROCESSED: + kernel = preprocess_kernel(kernel) write_footprints = [] read_footprints = [] @@ -938,6 +942,13 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): data-dependent or nonlinear indices) """ + from loopy.preprocess import preprocess_kernel, infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + from loopy.kernel import kernel_state + if kernel.state < kernel_state.PREPROCESSED: + kernel = preprocess_kernel(kernel) + result = {} fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 
9a753a6630590f05bd4377c1e02863eebb0bf834..1eb7cbed68fc84fdcd5b6a933930373ae6bfb471 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1,8 +1,6 @@ """Pymbolic mappers for loopy.""" from __future__ import division, absolute_import -import six -from six.moves import range, zip, reduce __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,13 +25,14 @@ THE SOFTWARE. """ -from six.moves import intern +import six +from six.moves import range, zip, reduce, intern from pytools import memoize, memoize_method, Record import pytools.lex from pymbolic.primitives import ( - Leaf, AlgebraicLeaf, Expression, Variable, CommonSubexpression) + Leaf, Expression, Variable, CommonSubexpression) from pymbolic.mapper import ( CombineMapper as CombineMapperBase, @@ -102,6 +101,8 @@ class IdentityMapperMixin(object): map_linear_subscript = IdentityMapperBase.map_subscript + map_rule_argument = map_group_hw_index + class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -132,6 +133,8 @@ class WalkMapper(WalkMapperBase): map_linear_subscript = WalkMapperBase.map_subscript + map_rule_argument = map_group_hw_index + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant @@ -181,6 +184,9 @@ class StringifyMapper(StringifyMapperBase): type(expr).__name__, ", ".join(str(a) for a in expr.__getinitargs__())) + def map_rule_argument(self, expr, enclosing_prec): + return "<arg%d>" % expr.index + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -357,7 +363,7 @@ class TaggedVariable(Variable): mapper_method = intern("map_tagged_variable") -class Reduction(AlgebraicLeaf): +class Reduction(Expression): """Represents a reduction operation on :attr:`expr` across :attr:`inames`. 
@@ -437,7 +443,7 @@ class Reduction(AlgebraicLeaf): mapper_method = intern("map_reduction") -class LinearSubscript(AlgebraicLeaf): +class LinearSubscript(Expression): """Represents a linear index into a multi-dimensional array, completely ignoring any multi-dimensional layout. """ @@ -456,6 +462,26 @@ class LinearSubscript(AlgebraicLeaf): mapper_method = intern("map_linear_subscript") + +class RuleArgument(Expression): + """Represents a (numbered) argument of a :class:`loopy.SubstitutionRule`. + Only used internally in the rule-aware mappers to match subst rules + independently of argument names. + """ + + init_arg_names = ("index",) + + def __init__(self, index): + self.index = index + + def __getinitargs__(self): + return (self.index,) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_rule_argument") + # }}} @@ -544,32 +570,47 @@ def rename_subst_rules_in_instructions(insns, renames): class SubstitutionRuleMappingContext(object): + def _get_subst_rule_key(self, args, body): + subst_dict = dict( + (arg, RuleArgument(i)) + for i, arg in enumerate(args)) + + from pymbolic.mapper.substitutor import make_subst_func + arg_subst_map = SubstitutionMapper(make_subst_func(subst_dict)) + + return arg_subst_map(body) + def __init__(self, old_subst_rules, make_unique_var_name): self.old_subst_rules = old_subst_rules self.make_unique_var_name = make_unique_var_name # maps subst rule (args, bodies) to (names, original_name) self.subst_rule_registry = dict( - ((rule.arguments, rule.expression), (name, name)) + (self._get_subst_rule_key(rule.arguments, rule.expression), + (name, rule.arguments, rule.expression)) for name, rule in six.iteritems(old_subst_rules)) - # maps subst rule (args, bodies) to use counts - self.subst_rule_use_count = {} + # maps subst rule (args, bodies) to a list of old names, + # which doubles as (a) a histogram of uses and (b) a way + # to deterministically find the 'lexicographically earliest' + # name + 
self.subst_rule_old_names = {} def register_subst_rule(self, original_name, args, body): """Returns a name (as a string) for a newly created substitution rule. """ - key = (args, body) + key = self._get_subst_rule_key(args, body) reg_value = self.subst_rule_registry.get(key) if reg_value is None: - new_name = self.make_unique_var_name(original_name) - self.subst_rule_registry[key] = (new_name, original_name) + # These names are temporary and won't stick around. + new_name = self.make_unique_var_name("_lpy_tmp_"+original_name) + self.subst_rule_registry[key] = (new_name, args, body) else: - new_name, _ = reg_value + new_name, _, _ = reg_value - self.subst_rule_use_count[key] = self.subst_rule_use_count.get(key, 0) + 1 + self.subst_rule_old_names.setdefault(key, []).append(original_name) return new_name def _get_new_substitutions_and_renames(self): @@ -591,27 +632,33 @@ class SubstitutionRuleMappingContext(object): from loopy.kernel.data import SubstitutionRule - orig_name_histogram = {} - for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): - if self.subst_rule_use_count.get(key, 0): - orig_name_histogram[orig_name] = \ - orig_name_histogram.get(orig_name, 0) + 1 - result = {} renames = {} - for key, (name, orig_name) in six.iteritems(self.subst_rule_registry): - args, body = key + used_names = set() + + for key, (name, args, body) in six.iteritems( + self.subst_rule_registry): + orig_names = self.subst_rule_old_names.get(key, []) + + # If no orig_names are found, then this particular + # subst rule was never referenced, and so it's fine + # to leave out. 
+ + if not orig_names: + continue + + new_name = min(orig_names) + if new_name in used_names: + new_name = self.make_unique_var_name(new_name) - if self.subst_rule_use_count.get(key, 0): - if orig_name_histogram[orig_name] == 1 and name != orig_name: - renames[name] = orig_name - name = orig_name + renames[name] = new_name + used_names.add(new_name) - result[name] = SubstitutionRule( - name=name, - arguments=args, - expression=body) + result[new_name] = SubstitutionRule( + name=new_name, + arguments=args, + expression=body) # {{{ perform renames on new substitutions @@ -1005,16 +1052,11 @@ class ArrayAccessFinder(CombineMapper): # }}} -# {{{ aff <-> expr conversion +# {{{ (pw)aff to expr conversion -def aff_to_expr(aff, except_name=None, error_on_name=None): - if except_name is not None and error_on_name is not None: - raise ValueError("except_name and error_on_name may not be specified " - "at the same time") +def aff_to_expr(aff): from pymbolic import var - except_coeff = 0 - denom = aff.get_denominator_val().to_python() result = (aff.get_constant_val()*denom).to_python() @@ -1023,29 +1065,14 @@ def aff_to_expr(aff, except_name=None, error_on_name=None): coeff = (aff.get_coefficient_val(dt, i)*denom).to_python() if coeff: dim_name = aff.get_dim_name(dt, i) - if dim_name == except_name: - except_coeff += coeff - elif dim_name == error_on_name: - raise RuntimeError("'%s' occurred in this subexpression--" - "this is not allowed" % dim_name) - else: - result += coeff*var(dim_name) - - error_on_name = error_on_name or except_name + result += coeff*var(dim_name) for i in range(aff.dim(dim_type.div)): coeff = (aff.get_coefficient_val(dim_type.div, i)*denom).to_python() if coeff: - result += coeff*aff_to_expr(aff.get_div(i), error_on_name=error_on_name) + result += coeff*aff_to_expr(aff.get_div(i)) - if except_name is not None: - if except_coeff % denom != 0: - raise RuntimeError("coefficient of '%s' is not divisible by " - "aff denominator" % except_name) - - return 
result // denom, except_coeff // denom - else: - return result // denom + return result // denom def pw_aff_to_expr(pw_aff, int_ok=False): @@ -1064,21 +1091,81 @@ def pw_aff_to_expr(pw_aff, int_ok=False): (set, aff), = pieces return aff_to_expr(aff) +# }}} + + +# {{{ (pw)aff_from_expr + +class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): + def __init__(self, space, vars_to_zero): + self.zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space)) + + context = {} + for name, (dt, pos) in six.iteritems(space.get_var_dict()): + if dt == dim_type.set: + dt = dim_type.in_ + + context[name] = isl.PwAff.from_aff( + self.zero.set_coefficient_val(dt, pos, 1)) + + for v in vars_to_zero: + context[v] = self.zero + + self.pw_zero = isl.PwAff.from_aff(self.zero) -def aff_from_expr(space, expr, vars_to_zero=set()): - zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space)) - context = {} - for name, (dt, pos) in six.iteritems(space.get_var_dict()): - if dt == dim_type.set: - dt = dim_type.in_ + super(PwAffEvaluationMapper, self).__init__(context) - context[name] = zero.set_coefficient_val(dt, pos, 1) + def map_constant(self, expr): + return self.pw_zero + expr + + def map_min(self, expr): + from functools import reduce + return reduce( + lambda a, b: a.min(b), + (self.rec(ch) for ch in expr.children)) + + def map_max(self, expr): + from functools import reduce + return reduce( + lambda a, b: a.max(b), + (self.rec(ch) for ch in expr.children)) + + def map_quotient(self, expr): + raise TypeError("true division in '%s' not supported " + "for as-pwaff evaluation" % expr) + + def map_floor_div(self, expr): + num = self.rec(expr.numerator) + denom = self.rec(expr.denominator) + return num.div(denom).floor() + + def map_remainder(self, expr): + num = self.rec(expr.numerator) + denom = self.rec(expr.denominator) + if not denom.is_cst(): + raise TypeError("modulo non-constant in '%s' not supported " + "for as-pwaff evaluation" % expr) + + (s, denom_aff), = 
denom.get_pieces() + denom = denom_aff.get_constant_val() + + return num.mod_val(denom) + + +def aff_from_expr(space, expr, vars_to_zero=frozenset()): + pwaff = pwaff_from_expr(space, expr, vars_to_zero).coalesce() + + pieces = pwaff.get_pieces() + if len(pieces) == 1: + (s, aff), = pieces + return aff + else: + raise RuntimeError("expression '%s' could not be converted to a " + "non-piecewise quasi-affine expression" % expr) - for name in vars_to_zero: - context[name] = zero - from pymbolic import evaluate - return zero + evaluate(expr, context) +def pwaff_from_expr(space, expr, vars_to_zero=frozenset()): + return PwAffEvaluationMapper(space, vars_to_zero)(expr) # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 88e656a1e3a4bfeb25a250dbcb3a05d1f805bac8..c1b9f2302c67d56fd16f72170d70c7a869e6aae9 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -124,6 +124,20 @@ class TargetBase(object): # }}} + def get_kernel_executor_cache_key(self, *args, **kwargs): + """ + :returns: an immutable type to be used as the cache key for + kernel executor caching. + """ + raise NotImplementedError() + + def get_kernel_executor(self, kernel, *args, **kwargs): + """ + :returns: an immutable type to be used as the cache key for + kernel executor caching. + """ + raise NotImplementedError() + class ASTBuilderBase(object): """An interface for generating (host or device) ASTs. 
@@ -186,16 +200,12 @@ class ASTBuilderBase(object): def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError() - def emit_assignment(self, codegen_state, lhs, rhs): + def emit_assignment(self, codegen_state, insn): raise NotImplementedError() def emit_multiple_assignment(self, codegen_state, insn): raise NotImplementedError() - def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype): - raise NotImplementedError("atomic update in target %s" % type(self).__name__) - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, static_lbound, static_ubound, inner): raise NotImplementedError() @@ -254,16 +264,12 @@ class DummyHostASTBuilder(ASTBuilderBase): def ast_block_class(self): return _DummyASTBlock - def emit_assignment(self, codegen_state, lhs, rhs): + def emit_assignment(self, codegen_state, insn): return None def emit_multiple_assignment(self, codegen_state, insn): return None - def emit_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, - lhs_expr, rhs_expr, lhs_dtype): - return None - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, static_lbound, static_ubound, inner): return None diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 7580038f45e7b098dfddbb8031584ae4be398c44..6e1893b1a866532f1f97722a72d648c893a4e34a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -495,9 +495,57 @@ class CASTBuilder(ASTBuilderBase): return arg_decl - def emit_assignment(self, codegen_state, lhs, rhs): - from cgen import Assign - return Assign(lhs, rhs) + def emit_assignment(self, codegen_state, insn): + kernel = codegen_state.kernel + ecm = codegen_state.expression_to_code_mapper + + assignee_var_name, = insn.assignee_var_names() + + lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_dtype = lhs_var.dtype + + if insn.atomicity is not None: + lhs_atomicity = [ + a for 
a in insn.atomicity if a.var_name == assignee_var_name] + assert len(lhs_atomicity) <= 1 + if lhs_atomicity: + lhs_atomicity, = lhs_atomicity + else: + lhs_atomicity = None + else: + lhs_atomicity = None + + from loopy.kernel.data import AtomicInit, AtomicUpdate + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + + lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None) + rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) + if lhs_atomicity is None: + from cgen import Assign + return Assign( + lhs_code, + ecm(insn.expression, prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype)) + + elif isinstance(lhs_atomicity, AtomicInit): + raise NotImplementedError("atomic init") + + elif isinstance(lhs_atomicity, AtomicUpdate): + codegen_state.seen_atomic_dtypes.add(lhs_dtype) + return codegen_state.ast_builder.emit_atomic_update( + codegen_state, lhs_atomicity, lhs_var, + insn.assignee, insn.expression, + lhs_dtype, rhs_type_context) + + else: + raise ValueError("unexpected lhs atomicity type: %s" + % type(lhs_atomicity).__name__) + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype): + raise NotImplementedError("atomic updates in %s" % type(self).__name__) def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 51b5f5343b90ac644d9f1d222637bf428d833d6e..0b20c2d08416c931fe88535cbda47c78723c57ae 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -194,10 +194,26 @@ class ExpressionToCMapper(RecursiveMapper): from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable if isinstance(ary, ImageArg): - base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" - % (ary.name, ary.dimensions, - ", ".join(self.rec(idx, PREC_NONE, 'i') - 
for idx in expr.index[::-1]))) + extra_axes = 0 + + num_target_axes = ary.num_target_axes() + if num_target_axes in [1, 2]: + idx_vec_type = "float2" + extra_axes = 2-num_target_axes + elif num_target_axes == 3: + idx_vec_type = "float4" + extra_axes = 4-num_target_axes + else: + raise LoopyError("unsupported number (%d) of target axes in image" + % num_target_axes) + + idx_tuple = expr.index_tuple[::-1] + (0,) * extra_axes + + idx_str = ", ".join(self.rec(idx, PREC_NONE, 'i') + for idx in idx_tuple) + + base_access = ("read_imagef(%s, loopy_sampler, (%s) (%s))" + % (ary.name, idx_vec_type, idx_str)) if ary.dtype.numpy_dtype == np.float32: return base_access+".x" diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index acfb176ba0a619cf9ced560baad83f256a9971d1..a3f0fdc63e053e8c73a0e8e61fcf949e72b02a0d 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -327,7 +327,7 @@ class CUDACASTBuilder(CASTBuilder): return arg_decl - def get_image_arg_decl(self, name, shape, dtype, is_written): + def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError("not yet: texture arguments in CUDA") def get_constant_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9..8372abb0373bba7995fc79512df5f4b31631b032 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -356,6 +356,117 @@ class ISPCASTBuilder(CASTBuilder): from cgen.ispc import ISPCUniform return ISPCUniform(result) + def emit_assignment(self, codegen_state, insn): + kernel = codegen_state.kernel + ecm = codegen_state.expression_to_code_mapper + + assignee_var_name, = insn.assignee_var_names() + + lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name) + lhs_dtype = lhs_var.dtype + + if insn.atomicity: + raise NotImplementedError("atomic ops in ISPC") + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier 
import PREC_NONE + + rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype) + rhs_code = ecm(insn.expression, prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + lhs = insn.assignee + + # {{{ handle streaming stores + + if "!streaming_store" in insn.tags: + ary = ecm.find_array(lhs) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + + from loopy.symbolic import simplify_using_aff + index_tuple = tuple( + simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) + + access_info = get_access_info(kernel.target, ary, index_tuple, + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + codegen_state.vectorization_info) + + from loopy.kernel.data import GlobalArg, TemporaryVariable + + if not isinstance(ary, (GlobalArg, TemporaryVariable)): + raise LoopyError("array type not supported in ISPC: %s" + % type(ary).__name) + + if len(access_info.subscripts) != 1: + raise LoopyError("streaming stores must have a subscript") + subscript, = access_info.subscripts + + from pymbolic.primitives import Sum, flattened_sum, Variable + if isinstance(subscript, Sum): + terms = subscript.children + else: + terms = (subscript.children,) + + new_terms = [] + + from loopy.kernel.data import LocalIndexTag + from loopy.symbolic import get_dependencies + + saw_l0 = False + for term in terms: + if (isinstance(term, Variable) + and isinstance( + kernel.iname_to_tag.get(term.name), LocalIndexTag) + and kernel.iname_to_tag.get(term.name).axis == 0): + if saw_l0: + raise LoopyError("streaming store must have stride 1 " + "in local index, got: %s" % subscript) + saw_l0 = True + continue + else: + for dep in get_dependencies(term): + if ( + isinstance( + kernel.iname_to_tag.get(dep), LocalIndexTag) + and kernel.iname_to_tag.get(dep).axis == 0): + raise LoopyError("streaming store must have stride 1 " + "in local index, got: %s" % subscript) + + new_terms.append(term) + + if not saw_l0: + raise 
LoopyError("streaming store must have stride 1 in " + "local index, got: %s" % subscript) + + if access_info.vector_index is not None: + raise LoopyError("streaming store may not use a short-vector " + "data type") + + rhs_has_programindex = any( + isinstance( + kernel.iname_to_tag.get(dep), LocalIndexTag) + and kernel.iname_to_tag.get(dep).axis == 0 + for dep in get_dependencies(insn.expression)) + + if not rhs_has_programindex: + rhs_code = "broadcast(%s, 0)" % rhs_code + + from cgen import Statement + return Statement( + "streaming_store(%s + %s, %s)" + % ( + access_info.array_name, + ecm(flattened_sum(new_terms), PREC_NONE, 'i'), + rhs_code)) + + # }}} + + from cgen import Assign + return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, static_lbound, static_ubound, inner): ecm = codegen_state.expression_to_code_mapper diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 8e32c7fa187c384de261ad9f6444c7bd6f3b4171..104f5c654b9ac662e50c642a0135058ed38ef347 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -311,7 +311,7 @@ class OpenCLTarget(CTarget): """A target for the OpenCL C heterogeneous compute programming language. 
""" - def __init__(self, atomics_flavor="cl1"): + def __init__(self, atomics_flavor=None): """ :arg atomics_flavor: one of ``"cl1"`` (C11-style atomics from OpenCL 2.0), ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap @@ -320,6 +320,9 @@ class OpenCLTarget(CTarget): """ super(OpenCLTarget, self).__init__() + if atomics_flavor is None: + atomics_flavor = "cl1" + if atomics_flavor not in ["cl1", "cl2"]: raise ValueError("unsupported atomics flavor: %s" % atomics_flavor) @@ -495,7 +498,7 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ code generation for atomic update - def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var, + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b570cf4ab7e4ed543213b73716b6271e61e6b69b..bdd5773b3f68d631343871adee423144086ca71a 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -275,11 +275,13 @@ class PyOpenCLTarget(OpenCLTarget): host_program_name_prefix = "_lpy_host_" host_program_name_suffix = "" - def __init__(self, device=None, pyopencl_module_name="_lpy_cl"): + def __init__(self, device=None, pyopencl_module_name="_lpy_cl", + atomics_flavor=None): # This ensures the dtype registry is populated. 
import pyopencl.tools # noqa - super(PyOpenCLTarget, self).__init__() + super(PyOpenCLTarget, self).__init__( + atomics_flavor=atomics_flavor) self.device = device self.pyopencl_module_name = pyopencl_module_name @@ -375,6 +377,13 @@ class PyOpenCLTarget(OpenCLTarget): # }}} + def get_kernel_executor_cache_key(self, queue, **kwargs): + return queue.context + + def get_kernel_executor(self, kernel, queue, **kwargs): + from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor + return PyOpenCLKernelExecutor(queue.context, kernel) + # }}} @@ -615,7 +624,9 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from genpy import Assign, Comment, Line def alloc_nbytes(tv): - return tv.dtype.numpy_dtype.itemsize + from six.moves import reduce + from operator import mul + return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from loopy.kernel.data import temp_var_scope diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py new file mode 100644 index 0000000000000000000000000000000000000000..540cad00036de046484357826781353a927d7497 --- /dev/null +++ b/loopy/target/pyopencl_execution.py @@ -0,0 +1,774 @@ +from __future__ import division, with_statement, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +from six.moves import range, zip + +from pytools import Record, memoize_method +from loopy.diagnostic import ParameterFinderWarning +from pytools.py_codegen import ( + Indentation, PythonFunctionGenerator) +from loopy.diagnostic import LoopyError +from loopy.types import NumpyType +from loopy.execution import KernelExecutorBase + +import logging +logger = logging.getLogger(__name__) + + +# {{{ invoker generation + +# /!\ This code runs in a namespace controlled by the user. +# Prefix all auxiliary variables with "_lpy". + + +def python_dtype_str(dtype): + import pyopencl.tools as cl_tools + if dtype.isbuiltin: + return "_lpy_np."+dtype.name + else: + return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" + % cl_tools.dtype_to_ctype(dtype)) + + +# {{{ integer arg finding from shapes + +def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info): + # a mapping from integer argument names to a list of tuples + # (arg_name, expression), where expression is a + # unary function of kernel.arg_dict[arg_name] + # returning the desired integer argument. 
+ iarg_to_sources = {} + + from loopy.kernel.data import GlobalArg + from loopy.symbolic import DependencyMapper, StringifyMapper + dep_map = DependencyMapper() + + from pymbolic import var + for arg in implemented_data_info: + if arg.arg_class is GlobalArg: + sym_shape = var(arg.name).attr("shape") + for axis_nr, shape_i in enumerate(arg.shape): + if shape_i is None: + continue + + deps = dep_map(shape_i) + + if len(deps) == 1: + integer_arg_var, = deps + + if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + from pymbolic.algorithm import solve_affine_equations_for + try: + # friggin' overkill :) + iarg_expr = solve_affine_equations_for( + [integer_arg_var.name], + [(shape_i, sym_shape.index(axis_nr))] + )[integer_arg_var] + except Exception as e: + #from traceback import print_exc + #print_exc() + + # went wrong? oh well + from warnings import warn + warn("Unable to generate code to automatically " + "find '%s' from the shape of '%s':\n%s" + % (integer_arg_var.name, arg.name, str(e)), + ParameterFinderWarning) + else: + iarg_to_sources.setdefault(integer_arg_var.name, []) \ + .append((arg.name, iarg_expr)) + + gen("# {{{ find integer arguments from shapes") + gen("") + + for iarg_name, sources in six.iteritems(iarg_to_sources): + gen("if %s is None:" % iarg_name) + with Indentation(gen): + if_stmt = "if" + for arg_name, value_expr in sources: + gen("%s %s is not None:" % (if_stmt, arg_name)) + with Indentation(gen): + gen("%s = %s" + % (iarg_name, StringifyMapper()(value_expr))) + + if_stmt = "elif" + + gen("") + + gen("# }}}") + gen("") + +# }}} + + +# {{{ integer arg finding from offsets + +def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info): + options = kernel.options + + gen("# {{{ find integer arguments from offsets") + gen("") + + for arg in implemented_data_info: + impl_array_name = arg.offset_for_name + if impl_array_name is not None: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("if %s is 
None:" % impl_array_name) + with Indentation(gen): + gen("# Output variable, we'll be allocating " + "it, with zero offset.") + gen("%s = 0" % arg.name) + gen("else:") + with Indentation(gen): + if not options.no_numpy: + gen("_lpy_offset = getattr(%s, \"offset\", 0)" + % impl_array_name) + else: + gen("_lpy_offset = %s.offset" % impl_array_name) + + base_arg = kernel.impl_arg_to_arg[impl_array_name] + + if not options.skip_arg_checks: + gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" + % (arg.name, base_arg.dtype.itemsize)) + + gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " + "not divisible by its dtype itemsize\"" + % impl_array_name) + gen("del _lpy_remdr") + else: + gen("%s = _lpy_offset // %d" + % (arg.name, base_arg.dtype.itemsize)) + + if not options.skip_arg_checks: + gen("del _lpy_offset") + + gen("# }}}") + gen("") + +# }}} + + +# {{{ integer arg finding from strides + +def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info): + options = kernel.options + + gen("# {{{ find integer arguments from strides") + gen("") + + for arg in implemented_data_info: + if arg.stride_for_name_and_axis is not None: + impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis + + gen("if %s is None:" % arg.name) + with Indentation(gen): + if not options.skip_arg_checks: + gen("if %s is None:" % impl_array_name) + with Indentation(gen): + gen("raise RuntimeError(\"required stride '%s' for " + "argument '%s' not given or deducible from " + "passed array\")" + % (arg.name, impl_array_name)) + + base_arg = kernel.impl_arg_to_arg[impl_array_name] + + if not options.skip_arg_checks: + gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" + % (arg.name, impl_array_name, stride_impl_axis, + base_arg.dtype.dtype.itemsize)) + + gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is " + "not divisible by its dtype itemsize\"" + % (stride_impl_axis, impl_array_name)) + gen("del _lpy_remdr") + else: + gen("%s = _lpy_offset // %d" + % (arg.name, 
base_arg.dtype.itemsize)) + + gen("# }}}") + gen("") + +# }}} + + +# {{{ check that value args are present + +def generate_value_arg_check(gen, kernel, implemented_data_info): + if kernel.options.skip_arg_checks: + return + + from loopy.kernel.data import ValueArg + + gen("# {{{ check that value args are present") + gen("") + + for arg in implemented_data_info: + if not issubclass(arg.arg_class, ValueArg): + continue + + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise TypeError(\"value argument '%s' " + "was not given and could not be automatically " + "determined\")" % arg.name) + + gen("# }}}") + gen("") + +# }}} + + +# {{{ arg setup + +def generate_arg_setup(gen, kernel, implemented_data_info, options): + import loopy as lp + + from loopy.kernel.data import KernelArgument + from loopy.kernel.array import ArrayBase + from loopy.symbolic import StringifyMapper + from pymbolic import var + + gen("# {{{ set up array arguments") + gen("") + + if not options.no_numpy: + gen("_lpy_encountered_numpy = False") + gen("_lpy_encountered_dev = False") + gen("") + + args = [] + + strify = StringifyMapper() + + expect_no_more_arguments = False + + for arg_idx, arg in enumerate(implemented_data_info): + is_written = arg.base_name in kernel.get_written_variables() + kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + + if not issubclass(arg.arg_class, KernelArgument): + expect_no_more_arguments = True + continue + + if expect_no_more_arguments: + raise LoopyError("Further arguments encountered after arg info " + "describing a global temporary variable") + + if not issubclass(arg.arg_class, ArrayBase): + args.append(arg.name) + continue + + gen("# {{{ process %s" % arg.name) + gen("") + + if not options.no_numpy: + gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) + with Indentation(gen): + gen("# synchronous, nothing to worry about") + gen("%s = _lpy_cl_array.to_device(" + "queue, %s, allocator=allocator)" + % (arg.name, arg.name)) + 
gen("_lpy_encountered_numpy = True") + gen("elif %s is not None:" % arg.name) + with Indentation(gen): + gen("_lpy_encountered_dev = True") + + gen("") + + if not options.skip_arg_checks and not is_written: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"input argument '%s' must " + "be supplied\")" % arg.name) + gen("") + + if (is_written + and arg.arg_class is lp.ImageArg + and not options.skip_arg_checks): + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"written image '%s' must " + "be supplied\")" % arg.name) + gen("") + + if is_written and arg.shape is None and not options.skip_arg_checks: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"written argument '%s' has " + "unknown shape and must be supplied\")" % arg.name) + gen("") + + possibly_made_by_loopy = False + + # {{{ allocate written arrays, if needed + + if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + and arg.shape is not None: + + if not isinstance(arg.dtype, NumpyType): + raise LoopyError("do not know how to pass arg of type '%s'" + % arg.dtype) + + possibly_made_by_loopy = True + gen("_lpy_made_by_loopy = False") + gen("") + + gen("if %s is None:" % arg.name) + with Indentation(gen): + num_axes = len(arg.strides) + for i in range(num_axes): + gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) + + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + for i in range(num_axes): + gen("_lpy_strides_%d = %s" % (i, strify( + itemsize*arg.unvec_strides[i]))) + + if not options.skip_arg_checks: + for i in range(num_axes): + gen("assert _lpy_strides_%d > 0, " + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) + + sym_strides = tuple( + var("_lpy_strides_%d" % i) + for i in range(num_axes)) + sym_shape = tuple( + var("_lpy_shape_%d" % i) + for i in range(num_axes)) + + alloc_size_expr = (sum(astrd*(alen-1) + for alen, astrd in zip(sym_shape, 
sym_strides)) + + itemsize) + + gen("_lpy_alloc_size = %s" % strify(alloc_size_expr)) + gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, " + "%(dtype)s, strides=%(strides)s, " + "data=allocator(_lpy_alloc_size), allocator=allocator)" + % dict( + name=arg.name, + shape=strify(sym_shape), + strides=strify(sym_strides), + dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype))) + + if not options.skip_arg_checks: + for i in range(num_axes): + gen("del _lpy_shape_%d" % i) + gen("del _lpy_strides_%d" % i) + gen("del _lpy_alloc_size") + gen("") + + gen("_lpy_made_by_loopy = True") + gen("") + + # }}} + + # {{{ argument checking + + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + and not options.skip_arg_checks: + if possibly_made_by_loopy: + gen("if not _lpy_made_by_loopy:") + else: + gen("if True:") + + with Indentation(gen): + gen("if %s.dtype != %s:" + % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype))) + with Indentation(gen): + gen("raise TypeError(\"dtype mismatch on argument '%s' " + "(got: %%s, expected: %s)\" %% %s.dtype)" + % (arg.name, arg.dtype, arg.name)) + + # {{{ generate shape checking code + + def strify_allowing_none(shape_axis): + if shape_axis is None: + return "None" + else: + return strify(shape_axis) + + def strify_tuple(t): + if len(t) == 0: + return "()" + else: + return "(%s,)" % ", ".join( + strify_allowing_none(sa) + for sa in t) + + shape_mismatch_msg = ( + "raise TypeError(\"shape mismatch on argument '%s' " + "(got: %%s, expected: %%s)\" " + "%% (%s.shape, %s))" + % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) + + if kernel_arg.shape is None: + pass + + elif any(shape_axis is None for shape_axis in kernel_arg.shape): + gen("if len(%s.shape) != %s:" + % (arg.name, len(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) + + for i, shape_axis in enumerate(arg.unvec_shape): + if shape_axis is None: + continue + + gen("if %s.shape[%d] != %s:" + % (arg.name, i, strify(shape_axis))) + with 
Indentation(gen): + gen(shape_mismatch_msg) + + else: # not None, no Nones in tuple + gen("if %s.shape != %s:" + % (arg.name, strify(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) + + # }}} + + if arg.unvec_strides and kernel_arg.dim_tags: + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + sym_strides = tuple( + itemsize*s_i for s_i in arg.unvec_strides) + gen("if %s.strides != %s:" + % (arg.name, strify(sym_strides))) + with Indentation(gen): + gen("raise TypeError(\"strides mismatch on " + "argument '%s' (got: %%s, expected: %%s)\" " + "%% (%s.strides, %s))" + % (arg.name, arg.name, strify(sym_strides))) + + if not arg.allows_offset: + gen("if %s.offset:" % arg.name) + with Indentation(gen): + gen("raise ValueError(\"Argument '%s' does not " + "allow arrays with offsets. Try passing " + "default_offset=loopy.auto to make_kernel()." + "\")" % arg.name) + gen("") + + # }}} + + if possibly_made_by_loopy and not options.skip_arg_checks: + gen("del _lpy_made_by_loopy") + gen("") + + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: + args.append("%s.base_data" % arg.name) + else: + args.append("%s" % arg.name) + + gen("") + + gen("# }}}") + gen("") + + gen("# }}}") + gen("") + + return args + +# }}} + + +def generate_invoker(kernel, codegen_result): + options = kernel.options + implemented_data_info = codegen_result.implemented_data_info + host_code = codegen_result.host_code() + + system_args = [ + "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None", + # ignored if options.no_numpy + "out_host=None" + ] + + from loopy.kernel.data import KernelArgument + gen = PythonFunctionGenerator( + "invoke_%s_loopy_kernel" % kernel.name, + system_args + [ + "%s=None" % idi.name + for idi in implemented_data_info + if issubclass(idi.arg_class, KernelArgument) + ]) + + gen.add_to_preamble("from __future__ import division") + gen.add_to_preamble("") + gen.add_to_preamble("import pyopencl as _lpy_cl") + gen.add_to_preamble("import pyopencl.array 
as _lpy_cl_array") + gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") + gen.add_to_preamble("import numpy as _lpy_np") + gen.add_to_preamble("") + gen.add_to_preamble(host_code) + gen.add_to_preamble("") + + gen("if allocator is None:") + with Indentation(gen): + gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)") + gen("") + + generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info) + generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info) + generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info) + generate_value_arg_check(gen, kernel, implemented_data_info) + + args = generate_arg_setup(gen, kernel, implemented_data_info, options) + + # {{{ generate invocation + + gen("_lpy_evt = {kernel_name}({args})" + .format( + kernel_name=codegen_result.host_program.name, + args=", ".join( + ["_lpy_cl_kernels", "queue"] + + args + + ["wait_for=wait_for"]))) + + # }}} + + # {{{ output + + if not options.no_numpy: + gen("if out_host is None and (_lpy_encountered_numpy " + "and not _lpy_encountered_dev):") + with Indentation(gen): + gen("out_host = True") + + gen("if out_host:") + with Indentation(gen): + gen("pass") # if no outputs (?!) 
+ for arg in implemented_data_info: + if not issubclass(arg.arg_class, KernelArgument): + continue + + is_written = arg.base_name in kernel.get_written_variables() + if is_written: + gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) + + gen("") + + if options.return_dict: + gen("return _lpy_evt, {%s}" + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) + else: + out_args = [arg + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables()] + if out_args: + gen("return _lpy_evt, (%s,)" + % ", ".join(arg.name for arg in out_args)) + else: + gen("return _lpy_evt, ()") + + # }}} + + if options.write_wrapper: + output = gen.get() + if options.highlight_wrapper: + output = get_highlighted_python_code(output) + + if options.write_wrapper is True: + print(output) + else: + with open(options.write_wrapper, "w") as outf: + outf.write(output) + + return gen.get_function() + + +# }}} + + +# {{{ kernel executor + +class _CLKernelInfo(Record): + pass + + +class _CLKernels(object): + pass + + +class PyOpenCLKernelExecutor(KernelExecutorBase): + """An object connecting a kernel to a :class:`pyopencl.Context` + for execution. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, context, kernel): + """ + :arg context: a :class:`pyopencl.Context` + :arg kernel: may be a loopy.LoopKernel, a generator returning kernels + (a warning will be issued if more than one is returned). If the + kernel has not yet been loop-scheduled, that is done, too, with no + specific arguments. 
+ """ + + super(PyOpenCLKernelExecutor, self).__init__(kernel) + + self.context = context + + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(kernel.target, PyOpenCLTarget): + self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) + + @memoize_method + def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + + from loopy.codegen import generate_code_v2 + codegen_result = generate_code_v2(kernel) + + dev_code = codegen_result.device_code() + + if self.kernel.options.write_cl: + output = dev_code + if self.kernel.options.highlight_cl: + output = get_highlighted_cl_code(output) + + if self.kernel.options.write_cl is True: + print(output) + else: + with open(self.kernel.options.write_cl, "w") as outf: + outf.write(output) + + if self.kernel.options.edit_cl: + from pytools import invoke_editor + dev_code = invoke_editor(dev_code, "code.cl") + + import pyopencl as cl + + logger.info("%s: opencl compilation start" % self.kernel.name) + + cl_program = ( + cl.Program(self.context, dev_code) + .build(options=kernel.options.cl_build_options)) + + cl_kernels = _CLKernels() + for dp in codegen_result.device_programs: + setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + + logger.info("%s: opencl compilation done" % self.kernel.name) + + return _CLKernelInfo( + kernel=kernel, + cl_kernels=cl_kernels, + implemented_data_info=codegen_result.implemented_data_info, + invoker=generate_invoker(kernel, codegen_result)) + + # {{{ debugging aids + + def get_code(self, arg_to_dtype=None): + if arg_to_dtype is not None: + arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) + + kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + + from loopy.codegen import generate_code_v2 + code = generate_code_v2(kernel) + return code.device_code() + + def get_highlighted_code(self, arg_to_dtype=None): + return get_highlighted_cl_code( + self.get_code(arg_to_dtype)) + + # 
}}} + + def __call__(self, queue, **kwargs): + """ + :arg allocator: a callable passed a byte count and returning + a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator + maybe. + :arg wait_for: A list of :class:`pyopencl.Event` instances + for which to wait. + :arg out_host: :class:`bool` + Decides whether output arguments (i.e. arguments + written by the kernel) are to be returned as + :mod:`numpy` arrays. *True* for yes, *False* for no. + + For the default value of *None*, if all (input) array + arguments are :mod:`numpy` arrays, defaults to + returning :mod:`numpy` arrays as well. + + :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event` + associated with the execution of the kernel, and + output is a tuple of output arguments (arguments that + are written as part of the kernel). The order is given + by the order of kernel arguments. If this order is unspecified + (such as when kernel arguments are inferred automatically), + enable :attr:`loopy.Options.return_dict` to make *output* a + :class:`dict` instead, with keys of argument names and values + of the returned arrays. 
+ """ + + allocator = kwargs.pop("allocator", None) + wait_for = kwargs.pop("wait_for", None) + out_host = kwargs.pop("out_host", None) + + kwargs = self.packing_controller.unpack(kwargs) + + kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs)) + + return kernel_info.invoker( + kernel_info.cl_kernels, queue, allocator, wait_for, + out_host, **kwargs) + +# }}} + + +def get_highlighted_python_code(text): + try: + from pygments import highlight + except ImportError: + return text + else: + from pygments.lexers import PythonLexer + from pygments.formatters import TerminalFormatter + + return highlight(text, PythonLexer(), TerminalFormatter()) + + +def get_highlighted_cl_code(text): + try: + from pygments import highlight + except ImportError: + return text + else: + from pygments.lexers import CLexer + from pygments.formatters import TerminalFormatter + + return highlight(text, CLexer(), TerminalFormatter()) + + +# vim: foldmethod=marker diff --git a/loopy/target/python.py b/loopy/target/python.py index f2a529c34bc4c56f69db1693058308769919e27a..591161d818bf6691a0412b3a00d624f8b02dde5b 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -256,9 +256,18 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import If return If(condition_str, ast) - def emit_assignment(self, codegen_state, lhs, rhs): + def emit_assignment(self, codegen_state, insn): + ecm = codegen_state.expression_to_code_mapper + + if insn.atomicity: + raise NotImplementedError("atomic ops in Python") + + from pymbolic.mapper.stringifier import PREC_NONE from genpy import Assign - return Assign(lhs, rhs) + + return Assign( + ecm(insn.assignee, prec=PREC_NONE, type_context=None), + ecm(insn.expression, prec=PREC_NONE, type_context=None)) # }}} diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 181f5241137cc1abe3256506f3ebd59327e4c4f8..b7f47c38a6a0daf8e4495c16791ef2f955019649 100644 --- a/loopy/transform/arithmetic.py +++ 
b/loopy/transform/arithmetic.py @@ -226,6 +226,13 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): # }}} + common_factors = [ + (ik, cf) for ik, cf in common_factors + if cf] + + if not common_factors: + raise LoopyError("no common factors found") + # {{{ remove common factors new_insns = [] diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 40c2b66c05dd11bd69bfa4ff682c45a68d71d623..88e3898e2cceeeb62edea306283fcb718c3b088d 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -169,7 +169,7 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", batch_iname_set = frozenset([batch_iname]) kernel = kernel.copy( instructions=[ - insn.copy(forced_iname_deps=insn.forced_iname_deps | batch_iname_set) + insn.copy(within_inames=insn.within_inames | batch_iname_set) for insn in kernel.instructions]) return kernel diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 0c25b63930f99ebcb8b2b817367ec1572b15a1d4..2210530e997fae4898c113b88fd141bc07f56163 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -400,7 +400,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, init_instruction = Assignment(id=init_insn_id, assignee=buf_var_init, expression=init_expression, - forced_iname_deps=( + within_inames=( frozenset(within_inames) | frozenset(non1_init_inames)), depends_on=frozenset(), @@ -480,7 +480,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, no_sync_with=frozenset([init_insn_id]), assignee=store_target, expression=store_expression, - forced_iname_deps=( + within_inames=( frozenset(within_inames) | frozenset(non1_store_inames))) else: diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index a8c3b5849d90466d1163f4cc47777e1253cae079..d972f44cf6cdd4497c03b2725c4fe607f0e35842 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -311,8 +311,8 @@ class 
DifferentiationContext(object): assignee=var(new_var_name)[ lhs_ind + diff_iname_exprs], expression=diff_expr, - forced_iname_deps=( - orig_writer_insn.forced_iname_deps | frozenset(diff_inames))) + within_inames=( + orig_writer_insn.within_inames | frozenset(diff_inames))) self.new_instructions.append(insn) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 3a9941fa83fdad7cf9b11dee54e76499b6bacd6a..1914b8d677af4cd0a5683d90ff5ac1168a660063 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -70,6 +70,8 @@ __doc__ = """ .. autofunction:: make_reduction_inames_unique +.. autofunction:: add_inames_to_insn + """ @@ -86,7 +88,7 @@ def set_loop_priority(kernel, loop_priority): """ if isinstance(loop_priority, str): - loop_priority = [s.strip() for s in loop_priority.split(",")] + loop_priority = [s.strip() for s in loop_priority.split(",") if s.strip()] return kernel.copy(loop_priority=loop_priority) @@ -225,16 +227,16 @@ def _split_iname_backend(kernel, split_iname, new_insns = [] for insn in kernel.instructions: - if split_iname in insn.forced_iname_deps: - new_forced_iname_deps = ( - (insn.forced_iname_deps.copy() + if split_iname in insn.within_inames: + new_within_inames = ( + (insn.within_inames.copy() - frozenset([split_iname])) | frozenset([outer_iname, inner_iname])) else: - new_forced_iname_deps = insn.forced_iname_deps + new_within_inames = insn.within_inames insn = insn.copy( - forced_iname_deps=new_forced_iname_deps) + within_inames=new_within_inames) new_insns.append(insn) @@ -528,7 +530,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if within is None: new_domain = new_domain.project_out(iname_dt, iname_idx, 1) - def subst_forced_iname_deps(fid): + def subst_within_inames(fid): result = set() for iname in fid: if iname in inames: @@ -540,7 +542,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - 
forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps)) + within_inames=subst_within_inames(insn.within_inames)) for insn in kernel.instructions] kernel = (kernel @@ -579,7 +581,12 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or as a string as shown in :ref:`iname-tags`. May also be a dictionary - for backwards compatibility. + for backwards compatibility. *iname* may also be a wildcard using ``*`` + and ``?``. + + .. versionchanged:: 2016.3 + + Added wildcards. """ if isinstance(iname_to_tag, dict): @@ -614,14 +621,35 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase, ForceSequentialTag) - new_iname_to_tag = kernel.iname_to_tag.copy() + # {{{ globbing + + all_inames = kernel.all_inames() + + from loopy.match import re_from_glob + new_iname_to_tag = {} for iname, new_tag in iname_to_tag: - if iname not in kernel.all_inames(): - if ignore_nonexistent: - continue - else: - raise LoopyError("iname '%s' does not exist" % iname) + if '*' in iname or '?' 
in iname: + match_re = re_from_glob(iname) + for sub_iname in all_inames: + if match_re.match(sub_iname): + new_iname_to_tag[sub_iname] = new_tag + + else: + if iname not in all_inames: + if ignore_nonexistent: + continue + else: + raise LoopyError("iname '%s' does not exist" % iname) + new_iname_to_tag[iname] = new_tag + + iname_to_tag = new_iname_to_tag + del new_iname_to_tag + + # }}} + + knl_iname_to_tag = kernel.iname_to_tag.copy() + for iname, new_tag in six.iteritems(iname_to_tag): old_tag = kernel.iname_to_tag.get(iname) retag_ok = False @@ -652,9 +680,9 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): raise LoopyError("'%s' is already tagged '%s'--cannot retag" % (iname, old_tag)) - new_iname_to_tag[iname] = new_tag + knl_iname_to_tag[iname] = new_tag - return kernel.copy(iname_to_tag=new_iname_to_tag) + return kernel.copy(iname_to_tag=knl_iname_to_tag) # }}} @@ -709,8 +737,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): new_fid = frozenset( self.old_to_new.get(iname, iname) - for iname in insn.forced_iname_deps) - return insn.copy(forced_iname_deps=new_fid) + for iname in insn.within_inames) + return insn.copy(within_inames=new_fid) def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, @@ -1034,11 +1062,11 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): new_instructions = [] for insn in knl.instructions: - if (old_iname in insn.forced_iname_deps + if (old_iname in insn.within_inames and within(knl, insn, ())): insn = insn.copy( - forced_iname_deps=( - (insn.forced_iname_deps - frozenset([old_iname])) + within_inames=( + (insn.within_inames - frozenset([old_iname])) | frozenset([new_iname]))) new_instructions.append(insn) @@ -1410,8 +1438,8 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): return inames new_instructions = [ - insn.copy(forced_iname_deps=fix_iname_set( - insn.id, insn.forced_iname_deps)) + insn.copy(within_inames=fix_iname_set( + 
insn.id, insn.within_inames)) for insn in kernel.instructions] # }}} @@ -1590,4 +1618,45 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # }}} + +# {{{ add_inames_to_insn + +def add_inames_to_insn(knl, inames, insn_match): + """ + :arg inames: a frozenset of inames that will be added to the + instructions matched by *insn_match*, or a comma-separated + string that parses to such a frozenset. + :arg insn_match: An instruction match as understood by + :func:`loopy.match.parse_match`. + + :returns: a copy of *knl* in which *inames* have been added to + the within_inames set of every instruction matched by + *insn_match*. + + .. versionadded:: 2016.3 + """ + + if isinstance(inames, str): + inames = frozenset(s.strip() for s in inames.split(",")) + + if not isinstance(inames, frozenset): + raise TypeError("'inames' must be a frozenset") + + from loopy.match import parse_match + match = parse_match(insn_match) + + new_instructions = [] + + for insn in knl.instructions: + if match(knl, insn): + new_instructions.append( + insn.copy(within_inames=insn.within_inames | inames)) + else: + new_instructions.append(insn) + + return knl.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py index 2828b19f9e1c824be6b81ee92269bb317ad9b620..27fad67f8838622d24e276defc7b8c66eaf11306 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -307,8 +307,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, tuple, in which case names will be automatically created. May also equivalently be a comma-separated string. - :arg precompute_outer_inames: The inames within which the compute - instruction is nested. If *None*, make an educated guess. + :arg precompute_outer_inames: A :class:`frozenset` of inames within which + the compute instruction is nested. If *None*, make an educated guess. + May also be specified as a comma-separated string. 
:arg compute_insn_id: The ID of the instruction generated to perform the precomputation. @@ -361,6 +362,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if isinstance(precompute_inames, str): precompute_inames = [iname.strip() for iname in precompute_inames.split(",")] + if isinstance(precompute_outer_inames, str): + precompute_outer_inames = frozenset( + iname.strip() for iname in precompute_outer_inames.split(",")) + if isinstance(subst_use, str): subst_use = [subst_use] @@ -770,7 +775,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, id=compute_insn_id, assignee=assignee, expression=compute_expression, - # forced_iname_deps determined below + # within_inames determined below ) # }}} @@ -805,9 +810,12 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if not isinstance(precompute_outer_inames, frozenset): raise TypeError("precompute_outer_inames must be a frozenset") + precompute_outer_inames = precompute_outer_inames \ + | frozenset(non1_storage_axis_names) + kernel = kernel.copy( instructions=[ - insn.copy(forced_iname_deps=precompute_outer_inames) + insn.copy(within_inames=precompute_outer_inames) if insn.id == compute_insn_id else insn for insn in kernel.instructions]) diff --git a/loopy/version.py b/loopy/version.py index 026ff2e4e5090005e9ce80c1894861048492dc61..12b2fedbec2669205de2a82a8d5eca42678e9353 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v40-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v42-islpy%s" % _islpy_version diff --git a/test/gnuma_loopy_transforms.py b/test/gnuma_loopy_transforms.py index 2107d50b03bdf99dad1b8c5bd7f31dfa7062579c..49a2a99bb8d4d518ac23134ba9d1a1ff3d6e8053 100644 --- a/test/gnuma_loopy_transforms.py +++ b/test/gnuma_loopy_transforms.py @@ -23,17 +23,17 @@ def fix_euler_parameters(kernel, p_p0, p_Gamma, p_R): def set_q_storage_format(kernel, name): - 
kernel = lp.set_array_dim_names(kernel, name, "i,j,k,field,e") + kernel = lp.set_array_axis_names(kernel, name, "i,j,k,field,e") kernel = lp.split_array_dim( kernel, (name, 3, "F"), 4, auto_split_inames=False) - kernel = lp.tag_data_axes(kernel, name, "N0,N1,N2,vec,N4,N3") + kernel = lp.tag_array_axes(kernel, name, "N0,N1,N2,vec,N4,N3") return kernel def set_D_storage_format(kernel): - return lp.tag_data_axes(kernel, "D", "f,f") + return lp.tag_array_axes(kernel, "D", "f,f") def set_up_volume_loop(kernel, Nq): diff --git a/test/test_diff.py b/test/test_diff.py index d6e2ae156f23259d16cb06040f48fefc0d565661..95471f9b126fd6b763530d115c21509d14d2ba47 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -65,6 +65,8 @@ def test_diff(ctx_factory): dknl, diff_map = diff_kernel(knl, "z", "x") dknl = lp.remove_unused_arguments(dknl) + dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") + print(dknl) n = 50 diff --git a/test/test_linalg.py b/test/test_linalg.py index b7a1e059cb4499fe136f962b5d5fdcd539fc05f1..7db8566084423ab85cc3870448ddefd60b0bc27b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -286,6 +286,7 @@ def test_rank_one(ctx_factory): knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") knl = lp.set_loop_priority(knl, ["i", "j"]) + knl = lp.add_inames_to_insn(knl, "i", "writes:b_fetch") return knl def variant_2(knl): @@ -330,7 +331,12 @@ def test_rank_one(ctx_factory): seq_knl = knl - for variant in [variant_1, variant_2, variant_3, variant_4]: + for variant in [ + variant_1, + variant_2, + variant_3, + variant_4 + ]: lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], parameters={"n": n}) diff --git a/test/test_loopy.py b/test/test_loopy.py index 078e89ff4911c2ff8cb3a71e5d394cf4f1705b59..dfc50fca04c7c4e26c11dbdc7a20b6a1ec96d729 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -989,7 +989,7 @@ def test_atomic(ctx_factory, dtype): 
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) -def test_forced_iname_deps_and_reduction(): +def test_within_inames_and_reduction(): # See https://github.com/inducer/loopy/issues/24 # This is (purposefully) somewhat un-idiomatic, to replicate the conditions @@ -1005,8 +1005,8 @@ def test_forced_iname_deps_and_reduction(): from pymbolic.primitives import Subscript, Variable i2 = lp.Assignment("a", lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))), - forced_iname_deps=frozenset(), - forced_iname_deps_is_final=True) + within_inames=frozenset(), + within_inames_is_final=True) k = lp.make_kernel("{[i,j] : 0<=i,j<n}", [i1, i2], @@ -1106,13 +1106,16 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): knl = lp.make_kernel( "{ [i,k]: 0<=i<n and 0<=k<3 }", """ - <> t_private = a[k,i+1] + <> t_private_scalar = a[k,i+1] + <> t_private_array[i % 2] = a[k,i+1] c[k,i] = a[k,i+1] - out[k,i] = c[k,i] + t_private + out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + knl = lp.set_temporary_scope(knl, "t_private_scalar", "private") + knl = lp.set_temporary_scope(knl, "t_private_array", "private") ref_knl = knl @@ -1138,6 +1141,46 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i,k]: 0<=i<n and 0<=k<3 }", + """ + <> t_local[i % 8,k] = i % 8 + c[k,i] = a[k,i+1] + out[k,i] = c[k,i] + t_local[i % 8,k] + """) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + + knl = lp.set_temporary_scope(knl, "t_local", "local") + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") + + # schedule + from loopy.preprocess 
import preprocess_kernel + knl = preprocess_kernel(knl) + + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(knl) + + # map schedule onto host or device + print(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 2 + + print(cgr.device_code()) + print(cgr.host_code()) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8)) + + def test_global_temporary(ctx_factory): ctx = ctx_factory() @@ -1269,7 +1312,7 @@ def test_call_with_no_returned_value(ctx_factory): knl = lp.make_kernel( "{:}", - [lp.CallInstruction([], p.Call(p.Variable("f"), ()))] + [lp.CallInstruction((), p.Call(p.Variable("f"), ()))] ) knl = lp.register_function_manglers(knl, [_f_mangler]) @@ -1310,6 +1353,16 @@ def test_unschedulable_kernel_detection(): assert len(list(lp.get_iname_duplication_options(knl))) == 10 +def test_regression_no_ret_call_removal(ctx_factory): + # https://github.com/inducer/loopy/issues/32 + knl = lp.make_kernel( + "{[i] : 0<=i<n}", + "f(sum(i, x[i]))") + knl = lp.add_and_infer_dtypes(knl, {"x": np.float32}) + knl = lp.preprocess_kernel(knl) + assert len(knl.instructions) == 3 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 389151f173cc835c0e01f2cbc3c640952cf3c575..e4f303f78bd6442ce7d956b66b3d99bffc1d6252 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -140,9 +140,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): precompute_inames=flux_precomp_inames + flux_ilp_inames, default_tag=None) if flux_var.endswith("_s"): - hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?") + hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?") else: - hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?") + hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?") n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "") if n_iname.endswith("_0"): @@ -190,9 +190,9 @@ def 
test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): Q_dim_field_outer="unr")) # buffer axes need to be vectorized in order for this to work - hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c") - hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c") - hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f") + hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c") + hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c") + hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f") hsv = lp.tag_inames(hsv, {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"}, ignore_nonexistent=True) diff --git a/test/test_reduction.py b/test/test_reduction.py index b751dd515b5978ba3c9ccedab06016d19e746546..29dda7419ae870e9a3d773aefeafb540567e94a1 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -126,10 +126,14 @@ def test_multi_nested_dependent_reduction(ctx_factory): "{[isrc_box]: 0 <= isrc_box < nboxes}", "{[isrc]: 0 <= isrc < npart}" ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "a[itgt] = sum((isrc_box, isrc), 1)", - ], + """ + for itgt + for isrc_box + <> npart = nparticles_per_box[isrc_box] + end + a[itgt] = sum((isrc_box, isrc), 1) + end + """, [ lp.ValueArg("n", np.int32), lp.GlobalArg("a", dtype, ("n",)), @@ -154,11 +158,15 @@ def test_recursive_nested_dependent_reduction(ctx_factory): "{[isrc_box]: 0 <= isrc_box < nboxes}", "{[isrc]: 0 <= isrc < npart}" ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "<> boxsum = sum(isrc, isrc+isrc_box+itgt)", - "a[itgt] = sum(isrc_box, boxsum)", - ], + """ + for itgt + for isrc_box + <> npart = nparticles_per_box[isrc_box] + <> boxsum = sum(isrc, isrc+isrc_box+itgt) + end + a[itgt] = sum(isrc_box, boxsum) + end + """, [ lp.ValueArg("n", np.int32), lp.GlobalArg("a", dtype, ("n",)),