diff --git a/README.rst b/README.rst
index 0e551fbede0460a2e7c76167b54d672afdf81286..94b485a64c18b5e9e727a561f70127269d04a16c 100644
--- a/README.rst
+++ b/README.rst
@@ -25,7 +25,7 @@ It can capture the following types of optimizations:
 
 * Vector and multi-core parallelism in the OpenCL/CUDA model
 * Data layout transformations (structure of arrays to array of structures)
-* Loopy Unrolling
+* Loop unrolling
 * Loop tiling with efficient handling of boundary cases
 * Prefetching/copy optimizations
 * Instruction level parallelism
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 69f89548618e86b408a31af240bee84678c859c1..7196dad863474d9b6ea9df9d9d0ae90b3e14986d 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -111,9 +111,9 @@ always see loopy's view of a kernel by printing it.
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
     ARGUMENTS:
-    a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
-    n: ValueArg, type: <runtime>
-    out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
+    a: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
+    n: ValueArg, type: <auto/runtime>
+    out: GlobalArg, type: <auto/runtime>, shape: (n), dim_tags: (N0:stride:1)
     ---------------------------------------------------------------------------
     DOMAINS:
     [n] -> { [i] : 0 <= i < n }
@@ -154,7 +154,7 @@ following:
   See :ref:`specifying-arguments`.
 
 * Loopy has not determined the type of ``a`` and ``out``. The data type is
-  given as ``<runtime>``, which means that these types will be determined
+  given as ``<auto/runtime>``, which means that these types will be determined
   by the data passed in when the kernel is invoked. Loopy generates (and
   caches!) a copy of the kernel for each combination of types passed in.
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 88a5717642af6d9ebc1bd7770936ae44e8cbf44b..038ef23ac08ce3bbc71a1fd1fce40181c6f8d9bb 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1081,7 +1081,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             warn_with_kernel(self,
                 "iname-order",
                 "get_visual_iname_order_embedding() could not determine a "
-                "consistent iname nesting order")
+                "consistent iname nesting order. This is a possible indication "
+                "that the kernel may not schedule successfully, but for now "
+                "it only impacts printing of the kernel.")
             embedding = dict((iname, iname) for iname in self.all_inames())
 
         return embedding
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 5d4240b9ab3e1ce2ad356a93b5e21b3bbf4d499e..b672f0227b1b8ba931b844b80a24b75c9625286d 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -549,15 +549,55 @@ class ArrayBase(ImmutableRecord):
     .. attribute :: name
 
     .. attribute :: dtype
+        The :class:`loopy.LoopyType` of the array.
+        If this is *None*, :mod:`loopy` will try to continue without
+        knowing the type of this array, where the idea is that precise
+        knowledge of the type will become available at invocation time.
+        :class:`loopy.CompiledKernel` (and thereby
+        :meth:`loopy.LoopKernel.__call__`) automatically add this type
+        information based on invocation arguments.
+
+        Note that some transformations, such as :func:`loopy.add_padding`
+        cannot be performed without knowledge of the exact *dtype*.
 
     .. attribute :: shape
 
+        May be one of the following:
+
+        * *None*. In this case, no shape is intended to be specified,
+          only the strides will be used to access the array. Bounds checking
+          will not be performed.
+
+        * :class:`loopy.auto`. The shape will be determined by finding the
+          access footprint.
+
+        * a tuple like :attr:`numpy.ndarray.shape`.
+
+          Each entry of the tuple is also allowed to be a :mod:`pymbolic`
+          expression involving kernel parameters, or a (potentially
+          comma-separated) string that can be parsed to such an expression.
+
+          Any element of the shape tuple not used to compute strides
+          may be *None*.
+
     .. attribute:: dim_tags
 
         See :ref:`data-dim-tags`.
 
     .. attribute:: offset
 
+        Offset from the beginning of the buffer to the point from
+        which the strides are counted. May be one of
+
+        * 0 or None
+        * a string (that is interpreted as an argument name).
+        * a pymbolic expression
+        * :class:`loopy.auto`, in which case an offset argument
+          is added automatically, immediately following this argument.
+          :class:`loopy.CompiledKernel` is even smarter in its treatment of
+          this case and will compile custom versions of the kernel based on
+          whether the passed arrays have offsets or not.
+
     .. attribute:: dim_names
 
         A tuple of strings providing names for the array axes, or *None*.
@@ -568,6 +608,21 @@ class ArrayBase(ImmutableRecord):
         to generate more informative names than could be achieved by
         axis numbers.
 
+    .. attribute:: alignment
+
+        Memory alignment of the array in bytes. For temporary arrays,
+        this ensures they are allocated with this alignment. For arguments,
+        this entails a promise that the incoming array obeys this alignment
+        restriction.
+
+        Defaults to *None*.
+
+        If an integer N is given, the array would be declared
+        with ``__attribute__((aligned(N)))`` in code generation for
+        :class:`loopy.CTarget`.
+
+        .. versionadded:: 2018.1
+
     .. automethod:: __init__
     .. automethod:: __eq__
     .. automethod:: num_user_axes
@@ -584,46 +639,18 @@ class ArrayBase(ImmutableRecord):
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
             dim_names=None, strides=None, order=None, for_atomic=False,
-            target=None,
+            target=None, alignment=None,
             **kwargs):
         """
         All of the following (except *name*) are optional.
         Specify either strides or shape.
 
-        :arg name: May contain multiple names separated by
-            commas, in which case multiple arguments,
-            each with identical properties, are created
-            for each name.
-        :arg dtype: the :class:`numpy.dtype` of the array.
-            If this is *None*, :mod:`loopy` will try to continue without
-            knowing the type of this array, where the idea is that precise
-            knowledge of the type will become available at invocation time.
-            :class:`loopy.CompiledKernel` (and thereby
-            :meth:`loopy.LoopKernel.__call__`) automatically add this type
-            information based on invocation arguments.
-
-            Note that some transformations, such as :func:`loopy.add_padding`
-            cannot be performed without knowledge of the exact *dtype*.
+        :arg name: When passed to :class:`loopy.make_kernel`, this may contain
+            multiple names separated by commas, in which case multiple arguments,
+            each with identical properties, are created for each name.
 
-        :arg shape: May be one of the following:
-
-            * *None*. In this case, no shape is intended to be specified,
-              only the strides will be used to access the array. Bounds checking
-              will not be performed.
-
-            * :class:`loopy.auto`. The shape will be determined by finding the
-              access footprint.
-
-            * a tuple like like :attr:`numpy.ndarray.shape`.
-
-              Each entry of the tuple is also allowed to be a :mod:`pymbolic`
-              expression involving kernel parameters, or a (potentially-comma
-              separated) or a string that can be parsed to such an expression.
-
-              Any element of the shape tuple not used to compute strides
-              may be *None*.
-
-            * A string which can be parsed into the previous form.
+        :arg shape: May be any of the things specified under :attr:`shape`,
+            or a string which can be parsed into the previous form.
 
         :arg dim_tags: A comma-separated list of tags as understood by
             :func:`parse_array_dim_tag`.
@@ -649,17 +676,9 @@ class ArrayBase(ImmutableRecord):
         :arg for_atomic:
             Whether the array is declared for atomic access, and, if necessary,
             using atomic-capable data types.
-        :arg offset: Offset from the beginning of the buffer to the point from
-            which the strides are counted. May be one of
+        :arg offset: (See :attr:`offset`)
+        :arg alignment: memory alignment in bytes
 
-            * 0 or None
-            * a string (that is interpreted as an argument name).
-            * a pymbolic expression
-            * :class:`loopy.auto`, in which case an offset argument
-              is added automatically, immediately following this argument.
-              :class:`loopy.CompiledKernel` is even smarter in its treatment of
-              this case and will compile custom versions of the kernel based on
-              whether the passed arrays have offsets or not.
         """
 
         for kwarg_name in kwargs:
@@ -672,6 +691,14 @@ class ArrayBase(ImmutableRecord):
         dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
                 for_atomic=for_atomic, target=target)
 
+        if dtype is lp.auto:
+            from warnings import warn
+            warn("Argument/temporary data type should be None if unspecified, "
+                    "not auto. This usage will be disallowed in 2018.",
+                    DeprecationWarning, stacklevel=2)
+
+            dtype = None
+
         strides_known = strides is not None and strides is not lp.auto
         shape_known = shape is not None and shape is not lp.auto
 
@@ -805,6 +832,7 @@ class ArrayBase(ImmutableRecord):
                 offset=offset,
                 dim_names=dim_names,
                 order=order,
+                alignment=alignment,
                 **kwargs)
 
     def __eq__(self, other):
@@ -832,10 +860,10 @@ class ArrayBase(ImmutableRecord):
         if include_typename:
             info_entries.append(type(self).__name__)
 
-        if self.dtype is lp.auto:
-            type_str = "<auto>"
-        elif self.dtype is None:
-            type_str = "<runtime>"
+        assert self.dtype is not lp.auto
+
+        if self.dtype is None:
+            type_str = "<auto/runtime>"
         else:
             type_str = str(self.dtype)
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f7667ca639e649a8f25b6e5d8975710742aef9a6..4a08c28bd8091425293892384e01d20447413cd5 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1004,7 +1004,7 @@ def _find_existentially_quantified_inames(dom_str):
 
 
 def parse_domains(domains, defines):
-    if isinstance(domains, str):
+    if isinstance(domains, (isl.BasicSet, str)):
         domains = [domains]
 
     result = []
@@ -1106,6 +1106,9 @@ class ArgumentGuesser:
         self.all_written_names = set()
         from loopy.symbolic import get_dependencies
         for insn in instructions:
+            for pred in insn.predicates:
+                self.all_names.update(get_dependencies(self.submap(pred)))
+
             if isinstance(insn, MultiAssignmentBase):
                 for assignee_var_name in insn.assignee_var_names():
                     self.all_written_names.add(assignee_var_name)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index a4e6036cbac0235590d7cc66a201c47ac87d6030..c90e8a64b6f47a87e87c5e64d2ef930232d34894 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -219,9 +219,20 @@ class KernelArgument(ImmutableRecord):
 
         dtype = kwargs.pop("dtype", None)
         from loopy.types import to_loopy_type
-        kwargs["dtype"] = to_loopy_type(
+        dtype = to_loopy_type(
                 dtype, allow_auto=True, allow_none=True, target=target)
 
+        import loopy as lp
+        if dtype is lp.auto:
+            from warnings import warn
+            warn("Argument/temporary data type should be None if unspecified, "
+                    "not auto. This usage will be disallowed in 2018.",
+                    DeprecationWarning, stacklevel=2)
+
+            dtype = None
+
+        kwargs["dtype"] = dtype
+
         ImmutableRecord.__init__(self, **kwargs)
 
 
@@ -268,10 +279,10 @@ class ValueArg(KernelArgument):
 
     def __str__(self):
         import loopy as lp
-        if self.dtype is lp.auto:
-            type_str = "<auto>"
-        elif self.dtype is None:
-            type_str = "<runtime>"
+        assert self.dtype is not lp.auto
+
+        if self.dtype is None:
+            type_str = "<auto/runtime>"
         else:
             type_str = str(self.dtype)
 
@@ -449,7 +460,7 @@ class TemporaryVariable(ArrayBase):
                     % name)
 
         ArrayBase.__init__(self, name=intern(name),
-                dtype=dtype, shape=shape,
+                dtype=dtype, shape=shape, strides=strides,
                 dim_tags=dim_tags, offset=offset, dim_names=dim_names,
                 order=order,
                 base_indices=base_indices, scope=scope,
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index a65e7fb4ceefd28a909dcb6cee24ea437f15a60e..fbc4238c21e966cb61d1c074ce6924fd9af26084 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -107,7 +107,7 @@ def get_arguments_with_incomplete_dtype(knl):
             if arg.dtype is None]
 
 
-def add_and_infer_dtypes(knl, dtype_dict):
+def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False):
     processed_dtype_dict = {}
 
     for k, v in six.iteritems(dtype_dict):
@@ -119,7 +119,7 @@ def add_and_infer_dtypes(knl, dtype_dict):
     knl = add_dtypes(knl, processed_dtype_dict)
 
     from loopy.type_inference import infer_unknown_types
-    return infer_unknown_types(knl, expect_completion=True)
+    return infer_unknown_types(knl, expect_completion=expect_completion)
 
 
 def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
diff --git a/loopy/match.py b/loopy/match.py
index ab0038af8dc5e9189a382bb76115998f57aef74e..3c047e463939cd67a4878d202a754c0cab48058d 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -134,6 +134,12 @@ class All(MatchExpressionBase):
     def __call__(self, kernel, matchable):
         return True
 
+    def __str__(self):
+        return "all"
+
+    def __repr__(self):
+        return "%s()" % (type(self).__name__)
+
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, "all_match_expr")
 
@@ -144,18 +150,21 @@ class All(MatchExpressionBase):
         return hash(type(self))
 
 
-class And(MatchExpressionBase):
+class MultiChildMatchExpressionBase(MatchExpressionBase):
     def __init__(self, children):
         self.children = children
 
-    def __call__(self, kernel, matchable):
-        return all(ch(kernel, matchable) for ch in self.children)
-
     def __str__(self):
-        return "(%s)" % (" and ".join(str(ch) for ch in self.children))
+        joiner = " %s " % type(self).__name__.lower()
+        return "(%s)" % (joiner.join(str(ch) for ch in self.children))
+
+    def __repr__(self):
+        return "%s(%s)" % (
+                type(self).__name__,
+                ", ".join(repr(ch) for ch in self.children))
 
     def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "and_match_expr")
+        key_builder.rec(key_hash, type(self).__name__)
         key_builder.rec(key_hash, self.children)
 
     def __eq__(self, other):
@@ -166,26 +175,14 @@ class And(MatchExpressionBase):
         return hash((type(self), self.children))
 
 
-class Or(MatchExpressionBase):
-    def __init__(self, children):
-        self.children = children
-
+class And(MultiChildMatchExpressionBase):
     def __call__(self, kernel, matchable):
-        return any(ch(kernel, matchable) for ch in self.children)
-
-    def __str__(self):
-        return "(%s)" % (" or ".join(str(ch) for ch in self.children))
-
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "or_match_expr")
-        key_builder.rec(key_hash, self.children)
+        return all(ch(kernel, matchable) for ch in self.children)
 
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.children == other.children)
 
-    def __hash__(self):
-        return hash((type(self), self.children))
+class Or(MultiChildMatchExpressionBase):
+    def __call__(self, kernel, matchable):
+        return any(ch(kernel, matchable) for ch in self.children)
 
 
 class Not(MatchExpressionBase):
@@ -198,6 +195,9 @@ class Not(MatchExpressionBase):
     def __str__(self):
         return "(not %s)" % str(self.child)
 
+    def __repr__(self):
+        return "%s(%r)" % (type(self).__name__, self.child)
+
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, "not_match_expr")
         key_builder.rec(key_hash, self.child)
@@ -222,6 +222,9 @@ class GlobMatchExpressionBase(MatchExpressionBase):
         descr = type(self).__name__
         return descr.lower() + ":" + self.glob
 
+    def __repr__(self):
+        return "%s(%r)" % (type(self).__name__, self.glob)
+
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, type(self).__name__)
         key_builder.rec(key_hash, self.glob)
@@ -273,7 +276,7 @@ def parse_match(expr):
     """Syntax examples::
 
     * ``id:yoink and writes:a_temp``
-    * ``id:yoink and (not writes:a_temp or tagged:input)``
+    * ``id:yoink and (not writes:a_temp or tag:input)``
     """
     if not expr:
         return All()
diff --git a/loopy/options.py b/loopy/options.py
index 25bb7014ce07a30c49f7f78d5a6325eaba36291d..13d0b752dfcfa0f0da233880f27f09a963ab4c81 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -112,6 +112,15 @@ class Options(ImmutableRecord):
         Do not check for or accept :mod:`numpy` arrays as
         arguments.
 
+        Defaults to *False*.
+
+    .. attribute:: cl_exec_manage_array_events
+
+        Within the PyOpenCL executor, respect and update
+        :attr:`pyopencl.array.Array.event`.
+
+        Defaults to *True*.
+
     .. attribute:: return_dict
 
         Have kernels return a :class:`dict` instead of a tuple as
@@ -196,6 +205,7 @@ class Options(ImmutableRecord):
 
                 skip_arg_checks=kwargs.get("skip_arg_checks", False),
                 no_numpy=kwargs.get("no_numpy", False),
+                cl_exec_manage_array_events=kwargs.get("cl_exec_manage_array_events", True),
                 return_dict=kwargs.get("return_dict", False),
                 write_wrapper=kwargs.get("write_wrapper", False),
                 write_code=kwargs.get("write_code", False),
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f2b5e7a87022e01bd51368cc3ef3cc60d507d958..ad119e94e74b294e16cdc15c5ab1f723cf7f254b 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -797,11 +797,10 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
 
             newly_added_assignments_ids.add(new_assignment_id)
 
-            import loopy as lp
             new_temporaries[new_assignee_name] = (
                     TemporaryVariable(
                         name=new_assignee_name,
-                        dtype=lp.auto,
+                        dtype=None,
                         scope=temp_var_scope.PRIVATE))
 
             from pymbolic import var
@@ -987,7 +986,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             new_temporary_variables[name] = TemporaryVariable(
                     name=name,
                     shape=(),
-                    dtype=lp.auto,
+                    dtype=None,
                     scope=temp_var_scope.PRIVATE)
 
         from pymbolic import var
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 9536fc711a2266a5fae10e83d3d8de8974fc66c5..177daa02948b9c07ef1d9856dc04019e69e24897 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -194,7 +194,6 @@ def generate_array_literal(codegen_state, array, value):
 
     ecm = codegen_state.expression_to_code_mapper
 
-    from pymbolic.mapper.stringifier import PREC_NONE
     from loopy.expression import dtype_to_type_context
     from loopy.symbolic import ArrayLiteral
 
@@ -203,7 +202,7 @@ def generate_array_literal(codegen_state, array, value):
             codegen_state.ast_builder.get_c_expression_to_code_mapper(),
             ArrayLiteral(
                 tuple(
-                    ecm(d_i, PREC_NONE, type_context, array.dtype).expr
+                    ecm.map_constant(d_i, type_context)
                     for d_i in data)))
 
 # }}}
@@ -710,13 +709,18 @@ class CASTBuilder(ASTBuilderBase):
                     ecm(p.flattened_product(decl_info.shape),
                         prec=PREC_NONE, type_context="i"))
 
+        if temp_var.alignment:
+            from cgen import AlignedAttribute
+            temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
+
         return temp_var_decl
 
     def wrap_temporary_decl(self, decl, scope):
         return decl
 
     def wrap_global_constant(self, decl):
-        return decl
+        from cgen import Static
+        return Static(decl)
 
     def get_value_arg_decl(self, name, shape, dtype, is_written):
         assert shape == ()
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 5efc58bb7cd8692594018a5f7a9bcf75278a3b9b..d8b76d32afa64d308648420904f4f4bf8e2e2316 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -105,12 +105,23 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                         kernel_arg.dtype.numpy_dtype),
                     order=order))
 
+        expected_strides = tuple(
+                var("_lpy_expected_strides_%s" % i)
+                for i in range(num_axes))
+
+        gen("%s = %s.strides" % (strify(expected_strides), arg.name))
+
         #check strides
         if not skip_arg_checks:
-            gen("assert %(strides)s == %(name)s.strides, "
+            strides_check_expr = self.get_strides_check_expr(
+                    (strify(s) for s in sym_shape),
+                    (strify(s) for s in sym_strides),
+                    (strify(s) for s in expected_strides))
+            gen("assert %(strides_check)s, "
                     "'Strides of loopy created array %(name)s, "
                     "do not match expected.'" %
-                    dict(name=arg.name,
+                    dict(strides_check=strides_check_expr,
+                         name=arg.name,
                          strides=strify(sym_strides)))
             for i in range(num_axes):
                 gen("del _lpy_shape_%d" % i)
@@ -133,11 +144,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
         gen("for knl in _lpy_c_kernels:")
         with Indentation(gen):
             gen('knl({args})'.format(
                 args=", ".join(args)))
+
     # }}}
 
     # {{{
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 0304ec6f09eb2b014bb01a7b30889e24910e0dd9..3a3ea0a70fe9a9229aa3499ad0bdbfeb87f751ed 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -351,6 +351,13 @@ class ExecutionWrapperGeneratorBase(object):
     def get_arg_pass(self, arg):
         raise NotImplementedError()
 
+    def get_strides_check_expr(self, shape, strides, sym_strides):
+        # Returns an expression suitable for use for checking the strides of an
+        # argument. Arguments should be sequences of strings.
+        return " and ".join(
+                "(%s == 1 or %s == %s)" % elem
+                for elem in zip(shape, strides, sym_strides))
+
     # {{{ arg setup
 
     def generate_arg_setup(
@@ -516,13 +523,34 @@ class ExecutionWrapperGeneratorBase(object):
                         itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                         sym_strides = tuple(
                                 itemsize*s_i for s_i in arg.unvec_strides)
-                        gen("if %s.strides != %s:"
-                                % (arg.name, strify(sym_strides)))
+
+                        ndim = len(arg.unvec_shape)
+                        shape = ["_lpy_shape_%d" % i for i in range(ndim)]
+                        strides = ["_lpy_stride_%d" % i for i in range(ndim)]
+
+                        gen("(%s,) = %s.shape" % (", ".join(shape), arg.name))
+                        gen("(%s,) = %s.strides" % (", ".join(strides), arg.name))
+
+                        gen("if not %s:"
+                                % self.get_strides_check_expr(
+                                    shape, strides,
+                                    (strify(s) for s in sym_strides)))
                         with Indentation(gen):
+                            gen("_lpy_got = tuple(stride "
+                                    "for (dim, stride) in zip(%s.shape, %s.strides) "
+                                    "if dim > 1)"
+                                    % (arg.name, arg.name))
+                            gen("_lpy_expected = tuple(stride "
+                                    "for (dim, stride) in zip(%s.shape, %s) "
+                                    "if dim > 1)"
+                                    % (arg.name, strify_tuple(sym_strides)))
+
                             gen("raise TypeError(\"strides mismatch on "
-                                    "argument '%s' (got: %%s, expected: %%s)\" "
-                                    "%% (%s.strides, %s))"
-                                    % (arg.name, arg.name, strify(sym_strides)))
+                                    "argument '%s' "
+                                    "(after removing unit length dims, "
+                                    "got: %%s, expected: %%s)\" "
+                                    "%% (_lpy_got, _lpy_expected))"
+                                    % arg.name)
 
                     if not arg.allows_offset:
                         gen("if hasattr(%s, 'offset') and %s.offset:" % (
@@ -571,7 +599,8 @@ class ExecutionWrapperGeneratorBase(object):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
         raise NotImplementedError()
 
     # }}}
@@ -632,7 +661,8 @@ class ExecutionWrapperGeneratorBase(object):
         args = self.generate_arg_setup(
             gen, kernel, implemented_data_info, options)
 
-        self.generate_invocation(gen, codegen_result.host_program.name, args)
+        self.generate_invocation(gen, codegen_result.host_program.name, args,
+                kernel, implemented_data_info)
 
         self.generate_output_handler(gen, options, kernel, implemented_data_info)
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index f24b115fd5a35af94e4a6d437550bccf86b5bee0..744c03d8ed091bc0f05e4fc41aa14e88ec89276a 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -61,6 +61,11 @@ def adjust_local_temp_var_storage(kernel, device):
                     temp_var.copy(storage_shape=temp_var.shape)
             continue
 
+        if not temp_var.shape:
+            # scalar, no need to mess with storage shape
+            new_temp_vars[temp_var.name] = temp_var
+            continue
+
         other_loctemp_nbytes = [
                 tv.nbytes
                 for tv in six.itervalues(kernel.temporary_variables)
@@ -441,7 +446,9 @@ def generate_value_arg_setup(kernel, devices, implemented_data_info):
         warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                 "workarounds for broken OpenCL implementations "
                 "(such as those relating to complex numbers) "
-                "may not be enabled when needed"
+                "may not be enabled when needed. To avoid this, "
+                "pass target=lp.PyOpenCLTarget(dev) when creating "
+                "the kernel."
                 .format(knl_name=kernel.name))
 
     if any(count_bug_per_dev):
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index cc0b48a6ac17e23f318c5489d45fca6710bb3392..bef3152d03c193c14b11ce6f9ba3f20fdfcff6ad 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -151,7 +151,24 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args):
+    def generate_invocation(self, gen, kernel_name, args,
+            kernel, implemented_data_info):
+        if kernel.options.cl_exec_manage_array_events:
+            gen("""
+                if wait_for is None:
+                    wait_for = []
+                """)
+
+            gen("")
+            from loopy.kernel.data import GlobalArg
+            for arg in implemented_data_info:
+                if issubclass(arg.arg_class, GlobalArg):
+                    gen(
+                            "wait_for.extend({arg_name}.events)"
+                            .format(arg_name=arg.name))
+
+            gen("")
+
         gen("_lpy_evt = {kernel_name}({args})"
         .format(
             kernel_name=kernel_name,
@@ -160,6 +177,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                 + args
                 + ["wait_for=wait_for"])))
 
+        if kernel.options.cl_exec_manage_array_events:
+            gen("")
+            from loopy.kernel.data import GlobalArg
+            for arg in implemented_data_info:
+                if (issubclass(arg.arg_class, GlobalArg)
+                        and arg.base_name in kernel.get_written_variables()):
+                    gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name))
+
     # }}}
 
     # {{{
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index e7a86300f9d040cba1688e5bb0f3dcbbd926f783..7e6b03581e39d03bc06d2f6d37f65a1d4ac6a386 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -38,6 +38,20 @@ __doc__ = """
 
 # {{{ to_batched
 
+def temp_needs_batching_if_not_sequential(tv, batch_varying_args):
+    from loopy.kernel.data import temp_var_scope
+    if tv.name in batch_varying_args:
+        return True
+    if tv.initializer is not None and tv.read_only:
+        # do not batch read_only temps if not in
+        # `batch_varying_args`
+        return False
+    if tv.scope == temp_var_scope.PRIVATE:
+        # do not batch private temps if not in `batch_varying args`
+        return False
+    return True
+
+
 class _BatchVariableChanger(RuleAwareIdentityMapper):
     def __init__(self, rule_mapping_context, kernel, batch_varying_args,
             batch_iname_expr, sequential):
@@ -50,14 +64,17 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
 
     def needs_batch_subscript(self, name):
         tv = self.kernel.temporary_variables.get(name)
-        return (
-                (not self.sequential
-                    and (tv is not None
-                        and not (
-                            tv.initializer is not None
-                            and tv.read_only)))
-                or
-                name in self.batch_varying_args)
+
+        if name in self.batch_varying_args:
+            return True
+        if not self.sequential:
+            if tv is None:
+                return False
+            if not temp_needs_batching_if_not_sequential(tv,
+                    self.batch_varying_args):
+                return False
+
+        return True
 
     def map_subscript(self, expr, expn_state):
         if not self.needs_batch_subscript(expr.aggregate.name):
@@ -89,6 +106,10 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
         sequential=False):
     """Takes in a kernel that carries out an operation and returns a kernel
     that carries out a batch of these operations.
+    .. note::
+       For temporaries in a kernel that are private or read only
+       globals and if `sequential=True`, loopy does not batch these
+       variables unless explicitly mentioned in `batch_varying_args`.
 
     :arg nbatches: the number of batches. May be a constant non-negative
         integer or a string, which will be added as an integer argument.
@@ -144,13 +165,13 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
         new_temps = {}
 
         for temp in six.itervalues(knl.temporary_variables):
-            if temp.initializer is not None and temp.read_only:
-                new_temps[temp.name] = temp
-            else:
+            if temp_needs_batching_if_not_sequential(temp, batch_varying_args):
                 new_temps[temp.name] = temp.copy(
                         shape=(nbatches_expr,) + temp.shape,
                         dim_tags=("c",) * (len(temp.shape) + 1),
                         dim_names=_add_unique_dim_name("ibatch", temp.dim_names))
+            else:
+                new_temps[temp.name] = temp
 
         knl = knl.copy(temporary_variables=new_temps)
     else:
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index cd8ec409cce1a3f210554a05daf4bd358781fb20..2347cef3c04d2a44cef91782700e097a20e19712 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -854,23 +854,23 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
 
 # {{{ iname duplication for schedulability
 
-def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
-    # Remove common inames of the current insn_deps, as they are not relevant
+def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset([])):
+    # Remove common inames of the current insn_iname_sets, as they are not relevant
     # for splitting.
-    common = frozenset([]).union(*insn_deps).intersection(*insn_deps)
+    common = frozenset([]).union(*insn_iname_sets).intersection(*insn_iname_sets)
 
     # If common inames were found, we reduce the problem and go into recursion
     if common:
         # Remove the common inames from the instruction dependencies
-        insn_deps = (
-            frozenset(dep - common for dep in insn_deps)
+        insn_iname_sets = (
+            frozenset(iname_set - common for iname_set in insn_iname_sets)
             -
             frozenset([frozenset([])]))
         # Join the common inames with those previously found
         common = common.union(old_common_inames)
 
         # Go into recursion
-        for option in _get_iname_duplication_options(insn_deps, common):
+        for option in _get_iname_duplication_options(insn_iname_sets, common):
             yield option
         # Do not yield anything beyond here!
         return
@@ -880,7 +880,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
     def join_sets_if_not_disjoint(sets):
         for s1 in sets:
             for s2 in sets:
-                if s1 != s2 and s1.intersection(s2):
+                if s1 != s2 and s1 & s2:
                     return (
                         (sets - frozenset([s1, s2]))
                         | frozenset([s1 | s2])
@@ -888,7 +888,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
 
         return sets, True
 
-    partitioning = insn_deps
+    partitioning = insn_iname_sets
     stop = False
     while not stop:
         partitioning, stop = join_sets_if_not_disjoint(partitioning)
@@ -897,7 +897,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
     # subproblems
     if len(partitioning) > 1:
         for part in partitioning:
-            working_set = frozenset(s for s in insn_deps if s.issubset(part))
+            working_set = frozenset(s for s in insn_iname_sets if s <= part)
             for option in _get_iname_duplication_options(working_set,
                                                          old_common_inames):
                 yield option
@@ -908,7 +908,9 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
         # There are splitting options for all inames
         for iname in inames:
             iname_insns = frozenset(
-                    insn for insn in insn_deps if frozenset([iname]).issubset(insn))
+                    insn
+                    for insn in insn_iname_sets
+                    if frozenset([iname]) <= insn)
 
             import itertools as it
             # For a given iname, the set of instructions containing this iname
@@ -919,7 +921,7 @@ def _get_iname_duplication_options(insn_deps, old_common_inames=frozenset([])):
                     for l in range(1, len(iname_insns))):
                 yield (
                     iname,
-                    tuple(insn.union(old_common_inames) for insn in insns_to_dup))
+                    tuple(insn | old_common_inames for insn in insns_to_dup))
 
     # If partitioning was empty, we have recursed successfully and yield nothing
 
@@ -951,12 +953,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
     * duplicating j in instruction i2
     * duplicating i in instruction i2 and i3
 
-    Use :func:`has_schedulable_iname_nesting` to decide, whether an iname needs to be
+    Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be
     duplicated in a given kernel.
     """
     # First we extract the minimal necessary information from the kernel
     if use_boostable_into:
-        insn_deps = (
+        insn_iname_sets = (
             frozenset(insn.within_inames.union(
                 insn.boostable_into if insn.boostable_into is not None
                 else frozenset([]))
@@ -964,20 +966,20 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
             -
             frozenset([frozenset([])]))
     else:
-        insn_deps = (
+        insn_iname_sets = (
             frozenset(insn.within_inames for insn in knl.instructions)
             -
             frozenset([frozenset([])]))
 
     # Get the duplication options as a tuple of iname and a set
-    for iname, insns in _get_iname_duplication_options(insn_deps):
+    for iname, insns in _get_iname_duplication_options(insn_iname_sets):
         # Check whether this iname has a parallel tag and discard it if so
         from loopy.kernel.data import ConcurrentTag
         if (iname in knl.iname_to_tag
                     and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
             continue
 
-        # If we find a duplication option and fo not use boostable_into
+        # If we find a duplication option and do not use boostable_into
         # information, we restart this generator with use_boostable_into=True
         if not use_boostable_into and not knl.options.ignore_boostable_into:
             for option in get_iname_duplication_options(knl, True):
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 6ffc1dff5220ab48c6c87ec29fec6e44d57ba133..fcf8f965b68fd258b0c0f1eae94ec84a39a5b7ee 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -312,15 +312,8 @@ class TypeInferenceMapper(CombineMapper):
 
         from loopy.kernel.data import TemporaryVariable, KernelArgument
         import loopy as lp
-        if isinstance(obj, TemporaryVariable):
-            result = [obj.dtype]
-            if result[0] is lp.auto:
-                self.symbols_with_unknown_types.add(expr.name)
-                return []
-            else:
-                return result
-
-        elif isinstance(obj, KernelArgument):
+        if isinstance(obj, (KernelArgument, TemporaryVariable)):
+            assert obj.dtype is not lp.auto
             result = [obj.dtype]
             if result[0] is None:
                 self.symbols_with_unknown_types.add(expr.name)
@@ -515,10 +508,12 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     import loopy as lp
     for tv in six.itervalues(kernel.temporary_variables):
-        if tv.dtype is lp.auto:
+        assert tv.dtype is not lp.auto
+        if tv.dtype is None:
             names_for_type_inference.append(tv.name)
 
     for arg in kernel.args:
+        assert arg.dtype is not lp.auto
         if arg.dtype is None:
             names_for_type_inference.append(arg.name)
 
@@ -588,6 +583,9 @@ def infer_unknown_types(kernel, expect_completion=False):
             failed = not result
             if not failed:
                 new_dtype, = result
+                if new_dtype.target is None:
+                    new_dtype = new_dtype.with_target(kernel.target)
+
                 debug("     success: %s", new_dtype)
                 if new_dtype != item.dtype:
                     debug("     changed from: %s", item.dtype)
diff --git a/loopy/types.py b/loopy/types.py
index f095d1d58f9eaebb7dcc9c8d41afa73951f2ba84..8f0f310c305b3d5b24bd6e771b501bb6d9c69224 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -177,13 +177,20 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}
 
 
-def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False,
+def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
         target=None):
     from loopy.kernel.data import auto
-    if allow_none and dtype is None:
-        return dtype
-    elif allow_auto and dtype is auto:
-        return dtype
+    if dtype is None:
+        if allow_none:
+            return None
+        else:
+            raise LoopyError("dtype may not be none")
+
+    elif dtype is auto:
+        if allow_auto:
+            return dtype
+        else:
+            raise LoopyError("dtype may not be auto")
 
     numpy_dtype = None
 
diff --git a/loopy/version.py b/loopy/version.py
index c415c29a15063b6fd92335fdbaa37ba75ff4019f..7141a678297ded5e0d6e2f16f065f035a034d540 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -21,7 +21,7 @@ THE SOFTWARE.
 """
 
 
-VERSION = (2017, 2)
+VERSION = (2017, 2, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v72-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version
diff --git a/requirements.txt b/requirements.txt
index 1a23022821116aea068b76eab72f9a5596694eea..a3e88cfea99e7413211c35d11464932f98e23758 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ git+https://github.com/inducer/pymbolic.git
 git+https://github.com/inducer/genpy.git
 git+https://github.com/inducer/codepy.git
 
-hg+https://bitbucket.org/inducer/f2py
+git+https://github.com/inducer/f2py
 
 # Optional, needed for using the C preprocessor on Fortran
 ply>=3.6
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 8bb8f37f08b909444f384387e0007c51c8eee587..e36a4c2c3cb3f7e70a5b039ea631bbce20923be8 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2462,16 +2462,9 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel():
     vecsize = 16
     knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
 
-    # artifically expand via overridden_get_grid_sizes_for_insn_ids
-    class GridOverride(object):
-        def __init__(self, clean, vecsize=vecsize):
-            self.clean = clean
-            self.vecsize = vecsize
-
-        def __call__(self, insn_ids, ignore_auto=True):
-            gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
-            return gsize, (self.vecsize,)
+    from testlib import GridOverride
 
+    # artificially expand via overridden_get_grid_sizes_for_insn_ids
     knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
         knl.copy(), vecsize))
     # make sure we can generate the code
@@ -2741,6 +2734,36 @@ def test_preamble_with_separate_temporaries(ctx_factory):
         queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1])
 
 
+def test_arg_inference_for_predicates():
+    knl = lp.make_kernel("{[i]: 0 <= i < 10}",
+            """
+            if incr[i]
+              a = a + 1
+            end
+            """)
+
+    assert "incr" in knl.arg_dict
+    assert knl.arg_dict["incr"].shape == (10,)
+
+
+def test_relaxed_stride_checks(ctx_factory):
+    # Check that loopy is compatible with numpy's relaxed stride rules.
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel("{[i,j]: 0 <= i <= n and 0 <= j <= m}",
+             """
+             a[i] = sum(j, A[i,j] * b[j])
+             """)
+
+    with cl.CommandQueue(ctx) as queue:
+        mat = np.zeros((1, 10), order="F")
+        b = np.zeros(10)
+
+        evt, (a,) = knl(queue, A=mat, b=b)
+
+        assert a == 0
+
+
 def test_add_prefetch_works_in_lhs_index():
     knl = lp.make_kernel(
             "{ [n,k,l,k1,l1,k2,l2]: "
diff --git a/test/test_transform.py b/test/test_transform.py
index e50605b46672f8e9c1817431f1577742b1f6fb4c..0e10db362f36b7fc258059c2ec7ed1a344b97212 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -96,13 +96,65 @@ def test_to_batched(ctx_factory):
     knl = lp.make_kernel(
          ''' { [i,j]: 0<=i,j<n } ''',
          ''' out[i] = sum(j, a[i,j]*x[j])''')
+    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
+                                            x=np.float32,
+                                            a=np.float32))
 
     bknl = lp.to_batched(knl, "nbatches", "out,x")
 
+    ref_knl = lp.make_kernel(
+         ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
+         '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
+                                                    x=np.float32,
+                                                    a=np.float32))
+
+    a = np.random.randn(5, 5).astype(np.float32)
+    x = np.random.randn(7, 5).astype(np.float32)
+
+    # Running both the kernels
+    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
+    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)
+
+    # checking that the outputs are the same
+    assert np.linalg.norm(out1-out2) < 1e-15
+
+
+def test_to_batched_temp(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+         ''' { [i,j]: 0<=i,j<n } ''',
+         ''' cnst = 2.0
+         out[i] = sum(j, cnst*a[i,j]*x[j])''',
+         [lp.TemporaryVariable(
+             "cnst",
+             dtype=np.float32,
+             shape=(),
+             scope=lp.temp_var_scope.PRIVATE), '...'])
+    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
+                                            x=np.float32,
+                                            a=np.float32))
+    ref_knl = lp.make_kernel(
+         ''' { [i,j]: 0<=i,j<n } ''',
+         '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
+                                                    x=np.float32,
+                                                    a=np.float32))
+
+    bknl = lp.to_batched(knl, "nbatches", "out,x")
+    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
+
+    # checking that cnst is not being batched
+    assert bknl.temporary_variables['cnst'].shape == ()
+
     a = np.random.randn(5, 5)
     x = np.random.randn(7, 5)
 
-    bknl(queue, a=a, x=x)
+    # Checking that the program compiles and the logic is correct
+    lp.auto_test_vs_ref(
+            bref_knl, ctx, bknl,
+            parameters=dict(a=a, x=x, n=5, nbatches=7))
 
 
 def test_add_barrier(ctx_factory):
diff --git a/test/testlib.py b/test/testlib.py
index 3fae05a38ad0f0c414f42a182e36ed26c5b50da5..73de4199d31736230026eb7f2eb7939a93806369 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -1,6 +1,20 @@
 import loopy as lp
 
 
+# {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel
+
+class GridOverride(object):
+    def __init__(self, clean, vecsize):
+        self.clean = clean
+        self.vecsize = vecsize
+
+    def __call__(self, insn_ids, ignore_auto=True):
+        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+        return gsize, (self.vecsize,)
+
+# }}}
+
+
 # {{{ test_preamble_with_separate_temporaries
 
 class SeparateTemporariesPreambleTestHelper:
@@ -99,3 +113,5 @@ class SeparateTemporariesPreambleTestHelper:
         yield (desc, '\n'.join([str(decl), code]))
 
 # }}}
+
+# vim: foldmethod=marker