Compare revisions

Matt Wala · Andreas Klöckner · Andreas Klöckner · Andreas Klöckner · Andreas Klöckner · Andreas Klöckner
--- a/README.rst
+++ b/README.rst
@@ -25,7 +25,7 @@ It can capture the following types of optimizations:

 * Vector and multi-core parallelism in the OpenCL/CUDA model
 * Data layout transformations (structure of arrays to array of structures)
-* Loopy Unrolling
+* Loop unrolling
 * Loop tiling with efficient handling of boundary cases
 * Prefetching/copy optimizations
 * Instruction level parallelism

--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -111,9 +111,9 @@ always see loopy's view of a kernel by printing it.
    KERNEL: loopy_kernel
    ---------------------------------------------------------------------------
    ARGUMENTS:
-    a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
-    n: ValueArg, type: <runtime>
-    out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
+    a: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
+    n: ValueArg, type: <auto/runtime>
+    out: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
    ---------------------------------------------------------------------------
    DOMAINS:
    [n] -> { [i] : 0 <= i < n }
@@ -154,7 +154,7 @@ following:
  See :ref:`specifying-arguments`.

 * Loopy has not determined the type of ``a`` and ``out``. The data type is
-  given as ``<runtime>``, which means that these types will be determined
+  given as ``<auto/runtime>``, which means that these types will be determined
  by the data passed in when the kernel is invoked. Loopy generates (and
  caches!) a copy of the kernel for each combination of types passed in.


--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1081,7 +1081,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
            warn_with_kernel(self,
                "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
-                "consistent iname nesting order")
+                "consistent iname nesting order. This is a possible indication "
+                "that the kernel may not schedule successfully, but for now "
+                "it only impacts printing of the kernel.")
            embedding = dict((iname, iname) for iname in self.all_inames())

        return embedding

--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -549,15 +549,55 @@ class ArrayBase(ImmutableRecord):
    .. attribute :: name

    .. attribute :: dtype
+
+        The :class:`loopy.LoopyType` of the array.
+        If this is *None*, :mod:`loopy` will try to continue without
+        knowing the type of this array, where the idea is that precise
+        knowledge of the type will become available at invocation time.
+        :class:`loopy.CompiledKernel` (and thereby
+        :meth:`loopy.LoopKernel.__call__`) automatically add this type
+        information based on invocation arguments.
+
+        Note that some transformations, such as :func:`loopy.add_padding`
+        cannot be performed without knowledge of the exact *dtype*.

    .. attribute :: shape

+        May be one of the following:
+
+        * *None*. In this case, no shape is intended to be specified,
+          only the strides will be used to access the array. Bounds checking
+          will not be performed.
+
+        * :class:`loopy.auto`. The shape will be determined by finding the
+          access footprint.
+
+        * a tuple like :attr:`numpy.ndarray.shape`.
+
+          Each entry of the tuple is also allowed to be a :mod:`pymbolic`
+          expression involving kernel parameters, or a (potentially
+          comma-separated) string that can be parsed to such an expression.
+
+          Any element of the shape tuple not used to compute strides
+          may be *None*.
+
    .. attribute:: dim_tags

        See :ref:`data-dim-tags`.

    .. attribute:: offset

+        Offset from the beginning of the buffer to the point from
+        which the strides are counted. May be one of
+
+        * 0 or None
+        * a string (that is interpreted as an argument name).
+        * a pymbolic expression
+        * :class:`loopy.auto`, in which case an offset argument
+          is added automatically, immediately following this argument.
+          :class:`loopy.CompiledKernel` is even smarter in its treatment of
+          this case and will compile custom versions of the kernel based on
+          whether the passed arrays have offsets or not.
+
    .. attribute:: dim_names

        A tuple of strings providing names for the array axes, or *None*.
@@ -568,6 +608,21 @@ class ArrayBase(ImmutableRecord):
        to generate more informative names than could be achieved by
        axis numbers.

+    .. attribute:: alignment
+
+        Memory alignment of the array in bytes. For temporary arrays,
+        this ensures they are allocated with this alignment. For arguments,
+        this entails a promise that the incoming array obeys this alignment
+        restriction.
+
+        Defaults to *None*.
+
+        If an integer N is given, the array would be declared
+        with ``__attribute__((aligned(N)))`` in code generation for
+        :class:`loopy.CTarget`.
+
+        .. versionadded:: 2018.1
+
    .. automethod:: __init__
    .. automethod:: __eq__
    .. automethod:: num_user_axes
@@ -584,46 +639,18 @@ class ArrayBase(ImmutableRecord):

    def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
            dim_names=None, strides=None, order=None, for_atomic=False,
-            target=None,
+            target=None, alignment=None,
            **kwargs):
        """
        All of the following (except *name*) are optional.
        Specify either strides or shape.

-        :arg name: May contain multiple names separated by
-            commas, in which case multiple arguments,
-            each with identical properties, are created
-            for each name.
-        :arg dtype: the :class:`numpy.dtype` of the array.
-            If this is *None*, :mod:`loopy` will try to continue without
-            knowing the type of this array, where the idea is that precise
-            knowledge of the type will become available at invocation time.
-            :class:`loopy.CompiledKernel` (and thereby
-            :meth:`loopy.LoopKernel.__call__`) automatically add this type
-            information based on invocation arguments.
-
-            Note that some transformations, such as :func:`loopy.add_padding`
-            cannot be performed without knowledge of the exact *dtype*.
+        :arg name: When passed to :func:`loopy.make_kernel`, this may contain
+            multiple names separated by commas, in which case one argument
+            with identical properties is created for each name.

-        :arg shape: May be one of the following:
-
-            * *None*. In this case, no shape is intended to be specified,
-              only the strides will be used to access the array. Bounds checking
-              will not be performed.
-
-            * :class:`loopy.auto`. The shape will be determined by finding the
-              access footprint.
-
-            * a tuple like like :attr:`numpy.ndarray.shape`.
-
-              Each entry of the tuple is also allowed to be a :mod:`pymbolic`
-              expression involving kernel parameters, or a (potentially-comma
-              separated) or a string that can be parsed to such an expression.
-
-              Any element of the shape tuple not used to compute strides
-              may be *None*.
-
-            * A string which can be parsed into the previous form.
+        :arg shape: May be any of the things specified under :attr:`shape`,
+            or a string which can be parsed into the previous form.

        :arg dim_tags: A comma-separated list of tags as understood by
            :func:`parse_array_dim_tag`.
@@ -649,17 +676,9 @@ class ArrayBase(ImmutableRecord):
        :arg for_atomic:
            Whether the array is declared for atomic access, and, if necessary,
            using atomic-capable data types.
-        :arg offset: Offset from the beginning of the buffer to the point from
-            which the strides are counted. May be one of
+        :arg offset: (See :attr:`offset`)
+        :arg alignment: memory alignment in bytes

-            * 0 or None
-            * a string (that is interpreted as an argument name).
-            * a pymbolic expression
-            * :class:`loopy.auto`, in which case an offset argument
-              is added automatically, immediately following this argument.
-              :class:`loopy.CompiledKernel` is even smarter in its treatment of
-              this case and will compile custom versions of the kernel based on
-              whether the passed arrays have offsets or not.
        """

        for kwarg_name in kwargs:
@@ -672,6 +691,14 @@ class ArrayBase(ImmutableRecord):
        dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
                for_atomic=for_atomic, target=target)

+        if dtype is lp.auto:
+            from warnings import warn
+            warn("Argument/temporary data type should be None if unspecified, "
+                    "not auto. This usage will be disallowed in 2018.",
+                    DeprecationWarning, stacklevel=2)
+
+            dtype = None
+
        strides_known = strides is not None and strides is not lp.auto
        shape_known = shape is not None and shape is not lp.auto

@@ -805,6 +832,7 @@ class ArrayBase(ImmutableRecord):
                offset=offset,
                dim_names=dim_names,
                order=order,
+                alignment=alignment,
                **kwargs)

    def __eq__(self, other):
@@ -832,10 +860,10 @@ class ArrayBase(ImmutableRecord):
        if include_typename:
            info_entries.append(type(self).__name__)

-        if self.dtype is lp.auto:
-            type_str = "<auto>"
-        elif self.dtype is None:
-            type_str = "<runtime>"
+        assert self.dtype is not lp.auto
+
+        if self.dtype is None:
+            type_str = "<auto/runtime>"
        else:
            type_str = str(self.dtype)


--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1004,7 +1004,7 @@ def _find_existentially_quantified_inames(dom_str):


 def parse_domains(domains, defines):
-    if isinstance(domains, str):
+    if isinstance(domains, (isl.BasicSet, str)):
        domains = [domains]

    result = []
@@ -1106,6 +1106,9 @@ class ArgumentGuesser:
        self.all_written_names = set()
        from loopy.symbolic import get_dependencies
        for insn in instructions:
+            for pred in insn.predicates:
+                self.all_names.update(get_dependencies(self.submap(pred)))
+
            if isinstance(insn, MultiAssignmentBase):
                for assignee_var_name in insn.assignee_var_names():
                    self.all_written_names.add(assignee_var_name)

--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -219,9 +219,20 @@ class KernelArgument(ImmutableRecord):

        dtype = kwargs.pop("dtype", None)
        from loopy.types import to_loopy_type
-        kwargs["dtype"] = to_loopy_type(
+        dtype = to_loopy_type(
                dtype, allow_auto=True, allow_none=True, target=target)

+        import loopy as lp
+        if dtype is lp.auto:
+            from warnings import warn
+            warn("Argument/temporary data type should be None if unspecified, "
+                    "not auto. This usage will be disallowed in 2018.",
+                    DeprecationWarning, stacklevel=2)
+
+            dtype = None
+
+        kwargs["dtype"] = dtype
+
        ImmutableRecord.__init__(self, **kwargs)


@@ -268,10 +279,10 @@ class ValueArg(KernelArgument):

    def __str__(self):
        import loopy as lp
-        if self.dtype is lp.auto:
-            type_str = "<auto>"
-        elif self.dtype is None:
-            type_str = "<runtime>"
+        assert self.dtype is not lp.auto
+
+        if self.dtype is None:
+            type_str = "<auto/runtime>"
        else:
            type_str = str(self.dtype)

@@ -449,7 +460,7 @@ class TemporaryVariable(ArrayBase):
                    % name)

        ArrayBase.__init__(self, name=intern(name),
-                dtype=dtype, shape=shape,
+                dtype=dtype, shape=shape, strides=strides,
                dim_tags=dim_tags, offset=offset, dim_names=dim_names,
                order=order,
                base_indices=base_indices, scope=scope,

--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -107,7 +107,7 @@ def get_arguments_with_incomplete_dtype(knl):
            if arg.dtype is None]


-def add_and_infer_dtypes(knl, dtype_dict):
+def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False):
    processed_dtype_dict = {}

    for k, v in six.iteritems(dtype_dict):
@@ -119,7 +119,7 @@ def add_and_infer_dtypes(knl, dtype_dict):
    knl = add_dtypes(knl, processed_dtype_dict)

    from loopy.type_inference import infer_unknown_types
-    return infer_unknown_types(knl, expect_completion=True)
+    return infer_unknown_types(knl, expect_completion=expect_completion)


 def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):

--- a/loopy/match.py
+++ b/loopy/match.py
@@ -134,6 +134,12 @@ class All(MatchExpressionBase):
    def __call__(self, kernel, matchable):
        return True

+    def __str__(self):
+        return "all"
+
+    def __repr__(self):
+        return "%s()" % (type(self).__name__)
+
    def update_persistent_hash(self, key_hash, key_builder):
        key_builder.rec(key_hash, "all_match_expr")

@@ -144,18 +150,21 @@ class All(MatchExpressionBase):
        return hash(type(self))


-class And(MatchExpressionBase):
+class MultiChildMatchExpressionBase(MatchExpressionBase):
    def __init__(self, children):
        self.children = children

-    def __call__(self, kernel, matchable):
-        return all(ch(kernel, matchable) for ch in self.children)
-
    def __str__(self):
-        return "(%s)" % (" and ".join(str(ch) for ch in self.children))
+        joiner = " %s " % type(self).__name__.lower()
+        return "(%s)" % (joiner.join(str(ch) for ch in self.children))
+
+    def __repr__(self):
+        return "%s(%s)" % (
+                type(self).__name__,
+                ", ".join(repr(ch) for ch in self.children))

    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "and_match_expr")
+        key_builder.rec(key_hash, type(self).__name__)
        key_builder.rec(key_hash, self.children)

    def __eq__(self, other):
@@ -166,26 +175,14 @@ class And(MatchExpressionBase):
        return hash((type(self), self.children))


-class Or(MatchExpressionBase):
-    def __init__(self, children):
-        self.children = children
-
+class And(MultiChildMatchExpressionBase):
    def __call__(self, kernel, matchable):
-        return any(ch(kernel, matchable) for ch in self.children)
-
-    def __str__(self):
-        return "(%s)" % (" or ".join(str(ch) for ch in self.children))
-
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "or_match_expr")
-        key_builder.rec(key_hash, self.children)
+        return all(ch(kernel, matchable) for ch in self.children)

-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.children == other.children)

-    def __hash__(self):
-        return hash((type(self), self.children))
+class Or(MultiChildMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return any(ch(kernel, matchable) for ch in self.children)


 class Not(MatchExpressionBase):
@@ -198,6 +195,9 @@ class Not(MatchExpressionBase):
    def __str__(self):
        return "(not %s)" % str(self.child)

+    def __repr__(self):
+        return "%s(%r)" % (type(self).__name__, self.child)
+
    def update_persistent_hash(self, key_hash, key_builder):
        key_builder.rec(key_hash, "not_match_expr")
        key_builder.rec(key_hash, self.child)
@@ -222,6 +222,9 @@ class GlobMatchExpressionBase(MatchExpressionBase):
        descr = type(self).__name__
        return descr.lower() + ":" + self.glob

+    def __repr__(self):
+        # Debug representation: the concrete class name plus the glob pattern.
+        return "%s(%r)" % (type(self).__name__, self.glob)
+
    def update_persistent_hash(self, key_hash, key_builder):
        key_builder.rec(key_hash, type(self).__name__)
        key_builder.rec(key_hash, self.glob)
@@ -273,7 +276,7 @@ def parse_match(expr):
    """Syntax examples::

    * ``id:yoink and writes:a_temp``
-    * ``id:yoink and (not writes:a_temp or tagged:input)``
+    * ``id:yoink and (not writes:a_temp or tag:input)``
    """
    if not expr:
        return All()

--- a/loopy/options.py
+++ b/loopy/options.py
@@ -112,6 +112,15 @@ class Options(ImmutableRecord):
        Do not check for or accept :mod:`numpy` arrays as
        arguments.

+        Defaults to *False*.
+
+    .. attribute:: cl_exec_manage_array_events
+
+        Within the PyOpenCL executor, respect and update
+        :attr:`pyopencl.array.Array.events`.
+
+        Defaults to *True*.
+
    .. attribute:: return_dict

        Have kernels return a :class:`dict` instead of a tuple as
@@ -196,6 +205,7 @@ class Options(ImmutableRecord):

                skip_arg_checks=kwargs.get("skip_arg_checks", False),
                no_numpy=kwargs.get("no_numpy", False),
+                # Read the option from its own key (not from "no_numpy").
+                cl_exec_manage_array_events=kwargs.get(
+                    "cl_exec_manage_array_events", True),
                return_dict=kwargs.get("return_dict", False),
                write_wrapper=kwargs.get("write_wrapper", False),
                write_code=kwargs.get("write_code", False),

--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -797,11 +797,10 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):

            newly_added_assignments_ids.add(new_assignment_id)

-            import loopy as lp
            new_temporaries[new_assignee_name] = (
                    TemporaryVariable(
                        name=new_assignee_name,
-                        dtype=lp.auto,
+                        dtype=None,
                        scope=temp_var_scope.PRIVATE))

            from pymbolic import var
@@ -987,7 +986,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
-                    dtype=lp.auto,
+                    dtype=None,
                    scope=temp_var_scope.PRIVATE)

        from pymbolic import var

--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -194,7 +194,6 @@ def generate_array_literal(codegen_state, array, value):

    ecm = codegen_state.expression_to_code_mapper

-    from pymbolic.mapper.stringifier import PREC_NONE
    from loopy.expression import dtype_to_type_context
    from loopy.symbolic import ArrayLiteral

@@ -203,7 +202,7 @@ def generate_array_literal(codegen_state, array, value):
            codegen_state.ast_builder.get_c_expression_to_code_mapper(),
            ArrayLiteral(
                tuple(
-                    ecm(d_i, PREC_NONE, type_context, array.dtype).expr
+                    ecm.map_constant(d_i, type_context)
                    for d_i in data)))

 # }}}
@@ -710,13 +709,18 @@ class CASTBuilder(ASTBuilderBase):
                    ecm(p.flattened_product(decl_info.shape),
                        prec=PREC_NONE, type_context="i"))

+        if temp_var.alignment:
+            from cgen import AlignedAttribute
+            temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
+
        return temp_var_decl

    def wrap_temporary_decl(self, decl, scope):
        return decl

    def wrap_global_constant(self, decl):
-        return decl
+        from cgen import Static
+        return Static(decl)

    def get_value_arg_decl(self, name, shape, dtype, is_written):
        assert shape == ()

--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -105,12 +105,23 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                        kernel_arg.dtype.numpy_dtype),
                    order=order))

+        expected_strides = tuple(
+                var("_lpy_expected_strides_%s" % i)
+                for i in range(num_axes))
+
+        gen("%s = %s.strides" % (strify(expected_strides), arg.name))
+
        #check strides
        if not skip_arg_checks:
-            gen("assert %(strides)s == %(name)s.strides, "
+            strides_check_expr = self.get_strides_check_expr(
+                    (strify(s) for s in sym_shape),
+                    (strify(s) for s in sym_strides),
+                    (strify(s) for s in expected_strides))
+            gen("assert %(strides_check)s, "
                    "'Strides of loopy created array %(name)s, "
                    "do not match expected.'" %
-                    dict(name=arg.name,
+                    dict(strides_check=strides_check_expr,
+                         name=arg.name,
                         strides=strify(sym_strides)))
            for i in range(num_axes):
                gen("del _lpy_shape_%d" % i)
@@ -133,11 +144,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):

    # {{{ generate invocation

-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
        gen("for knl in _lpy_c_kernels:")
        with Indentation(gen):
            gen('knl({args})'.format(
                args=", ".join(args)))
+
    # }}}

    # {{{

--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -351,6 +351,13 @@ class ExecutionWrapperGeneratorBase(object):
    def get_arg_pass(self, arg):
        raise NotImplementedError()

+    def get_strides_check_expr(self, shape, strides, sym_strides):
+        """Return Python source text of an expression that checks strides.
+
+        :arg shape: sequence of strings, one per axis, giving the axis length.
+        :arg strides: sequence of strings, one per axis, compared against
+            the matching entry of *sym_strides*.
+        :arg sym_strides: sequence of strings, one per axis.
+        :returns: a string of the form
+            ``(shape == 1 or stride == sym_stride) and ...``, i.e. a stride
+            comparison is skipped for every axis of length one. If no axes
+            are given, ``"True"`` is returned.
+        """
+        # Fall back to "True" when there are no axes: an empty string would
+        # turn the generated "if not <expr>:" / "assert <expr>, ..." lines
+        # into invalid Python.
+        return " and ".join(
+                "(%s == 1 or %s == %s)" % elem
+                for elem in zip(shape, strides, sym_strides)) or "True"
+
    # {{{ arg setup

    def generate_arg_setup(
@@ -516,13 +523,34 @@ class ExecutionWrapperGeneratorBase(object):
                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                        sym_strides = tuple(
                                itemsize*s_i for s_i in arg.unvec_strides)
-                        gen("if %s.strides != %s:"
-                                % (arg.name, strify(sym_strides)))
+
+                        ndim = len(arg.unvec_shape)
+                        shape = ["_lpy_shape_%d" % i for i in range(ndim)]
+                        strides = ["_lpy_stride_%d" % i for i in range(ndim)]
+
+                        gen("(%s,) = %s.shape" % (", ".join(shape), arg.name))
+                        gen("(%s,) = %s.strides" % (", ".join(strides), arg.name))
+
+                        gen("if not %s:"
+                                % self.get_strides_check_expr(
+                                    shape, strides,
+                                    (strify(s) for s in sym_strides)))
                        with Indentation(gen):
+                            gen("_lpy_got = tuple(stride "
+                                    "for (dim, stride) in zip(%s.shape, %s.strides) "
+                                    "if dim > 1)"
+                                    % (arg.name, arg.name))
+                            gen("_lpy_expected = tuple(stride "
+                                    "for (dim, stride) in zip(%s.shape, %s) "
+                                    "if dim > 1)"
+                                    % (arg.name, strify_tuple(sym_strides)))
+
                            gen("raise TypeError(\"strides mismatch on "
-                                    "argument '%s' (got: %%s, expected: %%s)\" "
-                                    "%% (%s.strides, %s))"
-                                    % (arg.name, arg.name, strify(sym_strides)))
+                                    "argument '%s' "
+                                    "(after removing unit length dims, "
+                                    "got: %%s, expected: %%s)\" "
+                                    "%% (_lpy_got, _lpy_expected))"
+                                    % arg.name)

                    if not arg.allows_offset:
                        gen("if hasattr(%s, 'offset') and %s.offset:" % (
@@ -571,7 +599,8 @@ class ExecutionWrapperGeneratorBase(object):

    # {{{ generate invocation

-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
        raise NotImplementedError()

    # }}}
@@ -632,7 +661,8 @@ class ExecutionWrapperGeneratorBase(object):
        args = self.generate_arg_setup(
            gen, kernel, implemented_data_info, options)

-        self.generate_invocation(gen, codegen_result.host_program.name, args)
+        self.generate_invocation(gen, codegen_result.host_program.name, args,
+                kernel, implemented_data_info)

        self.generate_output_handler(gen, options, kernel, implemented_data_info)


--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -61,6 +61,11 @@ def adjust_local_temp_var_storage(kernel, device):
                    temp_var.copy(storage_shape=temp_var.shape)
            continue

+        if not temp_var.shape:
+            # scalar, no need to mess with storage shape
+            new_temp_vars[temp_var.name] = temp_var
+            continue
+
        other_loctemp_nbytes = [
                tv.nbytes
                for tv in six.itervalues(kernel.temporary_variables)
@@ -441,7 +446,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
-                "may not be enabled when needed"
+                "may not be enabled when needed. To avoid this, "
+                "pass target=lp.PyOpenCLTarget(dev) when creating "
+                "the kernel."
                .format(knl_name=kernel.name))

    if any(count_bug_per_dev):

--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -151,7 +151,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):

    # {{{ generate invocation

-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
+        if kernel.options.cl_exec_manage_array_events:
+            gen("""
+                if wait_for is None:
+                    wait_for = []
+                """)
+
+            gen("")
+            from loopy.kernel.data import GlobalArg
+            for arg in implemented_data_info:
+                if issubclass(arg.arg_class, GlobalArg):
+                    gen(
+                            "wait_for.extend({arg_name}.events)"
+                            .format(arg_name=arg.name))
+
+            gen("")
+
        gen("_lpy_evt = {kernel_name}({args})"
        .format(
            kernel_name=kernel_name,
@@ -160,6 +177,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                + args
                + ["wait_for=wait_for"])))

+        if kernel.options.cl_exec_manage_array_events:
+            gen("")
+            from loopy.kernel.data import GlobalArg
+            for arg in implemented_data_info:
+                if (issubclass(arg.arg_class, GlobalArg)
+                        and arg.base_name in kernel.get_written_variables()):
+                    gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name))
+
    # }}}

    # {{{

--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -38,6 +38,20 @@ __doc__ = """

 # {{{ to_batched

+def temp_needs_batching_if_not_sequential(tv, batch_varying_args):
+    """Return whether the temporary *tv* needs a batch axis.
+
+    :arg tv: the temporary variable to examine; its ``name``,
+        ``initializer``, ``read_only`` and ``scope`` attributes are read.
+    :arg batch_varying_args: container of names that are always batched.
+    :returns: *True* if ``tv.name`` is in *batch_varying_args*. Otherwise
+        *False* for temporaries that have an initializer and are read-only,
+        *False* for temporaries in private scope, and *True* for the rest.
+    """
+    from loopy.kernel.data import temp_var_scope
+    if tv.name in batch_varying_args:
+        return True
+    if tv.initializer is not None and tv.read_only:
+        # do not batch read_only temps if not in
+        # `batch_varying_args`
+        return False
+    if tv.scope == temp_var_scope.PRIVATE:
+        # do not batch private temps if not in `batch_varying args`
+        return False
+    return True
+
+
 class _BatchVariableChanger(RuleAwareIdentityMapper):
    def __init__(self, rule_mapping_context, kernel, batch_varying_args,
            batch_iname_expr, sequential):
@@ -50,14 +64,17 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):

    def needs_batch_subscript(self, name):
+        """Return whether accesses to *name* must gain a batch subscript.
+
+        Names in ``self.batch_varying_args`` always do. Other names only do
+        when the batch is not sequential, *name* is a temporary variable of
+        the kernel, and :func:`temp_needs_batching_if_not_sequential`
+        says that temporary is batched.
+        """
        tv = self.kernel.temporary_variables.get(name)
-        return (
-                (not self.sequential
-                    and (tv is not None
-                        and not (
-                            tv.initializer is not None
-                            and tv.read_only)))
-                or
-                name in self.batch_varying_args)
+
+        # Explicitly batch-varying names always receive a batch subscript.
+        if name in self.batch_varying_args:
+            return True
+
+        # Everything else is only batched when it is a temporary in a
+        # non-sequential batch; in sequential mode, and for non-temporaries,
+        # no subscript is added (matching the expression this replaces).
+        if self.sequential or tv is None:
+            return False
+
+        return temp_needs_batching_if_not_sequential(
+                tv, self.batch_varying_args)

    def map_subscript(self, expr, expn_state):
        if not self.needs_batch_subscript(expr.aggregate.name):
@@ -89,6 +106,10 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
        sequential=False):
    """Takes in a kernel that carries out an operation and returns a kernel
    that carries out a batch of these operations.
+
+    .. note::
+       For temporaries in a kernel that are private or read-only
+       globals and if `sequential=False`, loopy does not batch these
+       variables unless explicitly mentioned in `batch_varying_args`.

    :arg nbatches: the number of batches. May be a constant non-negative
        integer or a string, which will be added as an integer argument.
@@ -144,13 +165,13 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
        new_temps = {}

        for temp in six.itervalues(knl.temporary_variables):
-            if temp.initializer is not None and temp.read_only:
-                new_temps[temp.name] = temp
-            else:
+            if temp_needs_batching_if_not_sequential(temp, batch_varying_args):
                new_temps[temp.name] = temp.copy(
                        shape=(nbatches_expr,) + temp.shape,
                        dim_tags=("c",) * (len(temp.shape) + 1),
                        dim_names=_add_unique_dim_name("ibatch", temp.dim_names))
+            else:
+                new_temps[temp.name] = temp

        knl = knl.copy(temporary_variables=new_temps)
    else:

--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -854,23 +854,23 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,

 # {{{ iname duplication for schedulability

-def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
-    # Remove common inames of the current insn_deps, as they are not relevant
+def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset([])):
+    # Remove common inames of the current insn_iname_sets, as they are not relevant
    # for splitting.
-    common = frozenset([]).union(*insn_deps).intersection(*insn_deps)
+    common = frozenset([]).union(*insn_iname_sets).intersection(*insn_iname_sets)

    # If common inames were found, we reduce the problem and go into recursion
    if common:
        # Remove the common inames from the instruction dependencies
-        insn_deps = (
-            frozenset(dep - common for dep in insn_deps)
+        insn_iname_sets = (
+            frozenset(iname_set - common for iname_set in insn_iname_sets)
            -
            frozenset([frozenset([])]))
        # Join the common inames with those previously found
        common = common.union(old_common_inames)

        # Go into recursion
-        for option in _get_iname_duplication_options(insn_deps, common):
+        for option in _get_iname_duplication_options(insn_iname_sets, common):
            yield option
        # Do not yield anything beyond here!
        return
@@ -880,7 +880,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
    def join_sets_if_not_disjoint(sets):
        for s1 in sets:
            for s2 in sets:
-                if s1 != s2 and s1.intersection(s2):
+                if s1 != s2 and s1 & s2:
                    return (
                        (sets - frozenset([s1, s2]))
                        | frozenset([s1 | s2])
@@ -888,7 +888,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):

        return sets, True

-    partitioning = insn_deps
+    partitioning = insn_iname_sets
    stop = False
    while not stop:
        partitioning, stop = join_sets_if_not_disjoint(partitioning)
@@ -897,7 +897,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
    # subproblems
    if len(partitioning) > 1:
        for part in partitioning:
-            working_set = frozenset(s for s in insn_deps if s.issubset(part))
+            working_set = frozenset(s for s in insn_iname_sets if s <= part)
            for option in _get_iname_duplication_options(working_set,
                                                         old_common_inames):
                yield option
@@ -908,7 +908,9 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
        # There are splitting options for all inames
        for iname in inames:
            iname_insns = frozenset(
-                    insn for insn in insn_deps if frozenset([iname]).issubset(insn))
+                    insn
+                    for insn in insn_iname_sets
+                    if frozenset([iname]) <= insn)

            import itertools as it
            # For a given iname, the set of instructions containing this iname
@@ -919,7 +921,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
                    for l in range(1, len(iname_insns))):
                yield (
                    iname,
-                    tuple(insn.union(old_common_inames) for insn in insns_to_dup))
+                    tuple(insn | old_common_inames for insn in insns_to_dup))

    # If partitioning was empty, we have recursed successfully and yield nothing

@@ -951,12 +953,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
    * duplicating j in instruction i2
    * duplicating i in instruction i2 and i3

-    Use :func:`has_schedulable_iname_nesting` to decide, whether an iname needs to be
+    Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be
    duplicated in a given kernel.
    """
    # First we extract the minimal necessary information from the kernel
    if use_boostable_into:
-        insn_deps = (
+        insn_iname_sets = (
            frozenset(insn.within_inames.union(
                insn.boostable_into if insn.boostable_into is not None
                else frozenset([]))
@@ -964,20 +966,20 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
            -
            frozenset([frozenset([])]))
    else:
-        insn_deps = (
+        insn_iname_sets = (
            frozenset(insn.within_inames for insn in knl.instructions)
            -
            frozenset([frozenset([])]))

    # Get the duplication options as a tuple of iname and a set
-    for iname, insns in _get_iname_duplication_options(insn_deps):
+    for iname, insns in _get_iname_duplication_options(insn_iname_sets):
        # Check whether this iname has a parallel tag and discard it if so
        from loopy.kernel.data import ConcurrentTag
        if (iname in knl.iname_to_tag
                    and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
            continue

-        # If we find a duplication option and fo not use boostable_into
+        # If we find a duplication option and do not use boostable_into
        # information, we restart this generator with use_boostable_into=True
        if not use_boostable_into and not knl.options.ignore_boostable_into:
            for option in get_iname_duplication_options(knl, True):

--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -312,15 +312,8 @@ class TypeInferenceMapper(CombineMapper):

        from loopy.kernel.data import TemporaryVariable, KernelArgument
        import loopy as lp
-        if isinstance(obj, TemporaryVariable):
-            result = [obj.dtype]
-            if result[0] is lp.auto:
-                self.symbols_with_unknown_types.add(expr.name)
-                return []
-            else:
-                return result
-
-        elif isinstance(obj, KernelArgument):
+        if isinstance(obj, (KernelArgument, TemporaryVariable)):
+            assert obj.dtype is not lp.auto
            result = [obj.dtype]
            if result[0] is None:
                self.symbols_with_unknown_types.add(expr.name)
@@ -515,10 +508,12 @@ def infer_unknown_types(kernel, expect_completion=False):

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
-        if tv.dtype is lp.auto:
+        assert tv.dtype is not lp.auto
+        if tv.dtype is None:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
+        assert arg.dtype is not lp.auto
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

@@ -588,6 +583,9 @@ def infer_unknown_types(kernel, expect_completion=False):
            failed = not result
            if not failed:
                new_dtype, = result
+                if new_dtype.target is None:
+                    new_dtype = new_dtype.with_target(kernel.target)
+
                debug("     success: %s", new_dtype)
                if new_dtype != item.dtype:
                    debug("     changed from: %s", item.dtype)

--- a/loopy/types.py
+++ b/loopy/types.py
@@ -177,13 +177,20 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}


-def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False,
+def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
        target=None):
    from loopy.kernel.data import auto
-    if allow_none and dtype is None:
-        return dtype
-    elif allow_auto and dtype is auto:
-        return dtype
+    if dtype is None:
+        if allow_none:
+            return None
+        else:
+            raise LoopyError("dtype may not be none")
+
+    elif dtype is auto:
+        if allow_auto:
+            return dtype
+        else:
+            raise LoopyError("dtype may not be auto")

    numpy_dtype = None


--- a/loopy/version.py
+++ b/loopy/version.py
@@ -21,7 +21,7 @@ THE SOFTWARE.
 """


-VERSION = (2017, 2)
+VERSION = (2017, 2, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS

@@ -32,4 +32,4 @@ except ImportError:
 else:
    _islpy_version = islpy.version.VERSION_TEXT

-DATA_MODEL_VERSION = "v72-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version
No results found