diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9f68c0d899c9b83c07acdcf83b9b0b83196a2d97..58fb78251d04afb0e7f613aae237799237a010b9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,6 +4,7 @@ Python 2.7 AMD CPU:
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
   - export LOOPY_NO_CACHE=1
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -18,6 +19,7 @@ Python 2.6 POCL:
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
   - export LOOPY_NO_CACHE=1
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -31,7 +33,6 @@ Python 3.5 AMD CPU:
   - export PY_EXE=python3.5
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
-  - export NO_DOCTESTS=1
   - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -47,6 +48,7 @@ Python 2.7 POCL:
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
   - export LOOPY_NO_CACHE=1
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -62,6 +64,7 @@ Python 2.7 with legacy PyOpenCL:
   - export EXTRA_INSTALL="numpy mako"
   - export REQUIREMENTS_TXT="requirements-old-pyopencl.txt"
   - export LOOPY_NO_CACHE=1
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -77,7 +80,6 @@ Python 3.6 POCL:
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
   - export LOOPY_NO_CACHE=1
-  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -91,7 +93,6 @@ Python 3.6 POCL Twice With Cache:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
-  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   - "cd .."
diff --git a/doc/misc.rst b/doc/misc.rst
index cd6fe102cb9c97a619d8b6512f103c9dcabe65b5..0cb1f1f7d2dbf0b30ddea9cce57b755b81a05cfe 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -90,7 +90,9 @@ regarding OpenCL drivers.
 User-visible Changes
 ====================
 
-Version 2017.2
+See also :ref:`language-versioning`.
+
+Version 2018.1
 --------------
 .. note::
 
@@ -342,6 +344,41 @@ This list is always growing, but here are a few pointers:
 
   Use :func:`loopy.join_inames`.
 
+In what sense does Loopy support vectorization?
+--------------------------------------------------
+
+There are really two ways in which the OpenCL/CUDA model of computation exposes
+vectorization:
+
+* "SIMT": The user writes scalar program instances and either the compiler or
+  the hardware joins the individual program instances into vectors of a
+  hardware-given length for execution.
+
+* "Short vectors": This type of vectorization is based on vector types,
+  e.g. ``float4``, which support arithmetic with implicit vector semantics
+  as well as a number of 'intrinsic' functions.
+
+Loopy supports both. The first one, SIMT, is accessible by tagging inames with,
+e.g., ``l.0``. Accessing the second one requires using both execution- and
+data-reshaping capabilities in loopy. To start with, you need an array that
+has an axis with the length of the desired vector. If that's not yet available,
+you may use :func:`loopy.split_array_axis` to produce one. Similarly, you need
+an iname whose bounds match those of the desired vector length. Again, if you
+don't already have one, :func:`loopy.split_iname` will easily produce one.
+Lastly, both the array axis and the iname need the implementation tag ``"vec"``.
+Here is an example of this machinery in action:
+
+.. literalinclude:: ../examples/python/vector-types.py
+    :language: python
+
+Note how the example slices off the last 'slab' of iterations to ensure that
+the bulk of the iteration does not require conditionals which would prevent
+successful vectorization. This generates the following code:
+
+.. literalinclude:: ../examples/python/vector-types.cl
+    :language: c
+
+
 Uh-oh. I got a scheduling error. Any hints?
 -------------------------------------------
 
diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst
index 92eff09c9e3ecacfd8bb9030a9e4b9f002fefc71..6b715033cce60fa3a369f2abc4edbecbf4c9a0d3 100644
--- a/doc/ref_creation.rst
+++ b/doc/ref_creation.rst
@@ -30,4 +30,6 @@ To Copy between Data Formats
 
 .. autofunction:: make_copy_kernel
 
+.. automodule:: loopy.version
+
 .. vim: tw=75:spell:fdm=marker
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 7196dad863474d9b6ea9df9d9d0ae90b3e14986d..217e1ef7c323ca13f8a1aaf81e8ea30c08b784a7 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -25,6 +25,7 @@ import a few modules and set up a :class:`pyopencl.Context` and a
 
     >>> import loopy as lp
     >>> lp.set_caching_enabled(False)
+    >>> from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1
 
     >>> from warnings import filterwarnings, catch_warnings
     >>> filterwarnings('error', category=lp.LoopyWarning)
@@ -1157,7 +1158,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted:
    >>> cgr = lp.generate_code_v2(knl)
    Traceback (most recent call last):
    ...
-   MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
+   loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
 
 The syntax for a inserting a global barrier instruction is
 ``... gbarrier``. :mod:`loopy` also supports manually inserting local
@@ -1200,7 +1201,7 @@ Here is what happens when we try to generate code for the kernel:
    >>> cgr = lp.generate_code_v2(knl)
    Traceback (most recent call last):
    ...
-   MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
+   loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
 
 This happens due to the kernel splitting done by :mod:`loopy`. The splitting
 happens when the instruction schedule is generated. To see the schedule, we
@@ -1396,7 +1397,7 @@ Attempting to create this kernel results in an error:
     ... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
     Traceback (most recent call last):
     ...
-    StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
+    loopy.diagnostic.StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
 
 The problem is that loopy cannot find a simple, universally valid expression
 for the length of *out* in this case. Notice how the kernel accesses both the
@@ -1462,7 +1463,7 @@ sign that something is amiss:
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
     Traceback (most recent call last):
     ...
-    WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
+    loopy.diagnostic.WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable)
 
 When we ask to see the code, the issue becomes apparent:
 
@@ -1545,20 +1546,18 @@ containing different types of data:
     >>> knl = lp.add_and_infer_dtypes(knl,
     ...     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
 
-Note that loopy will infer the data types for arrays c and e from the
+Note that loopy will infer the data types for arrays ``c`` and ``e`` from the
 information provided. Now we will count the operations:
 
 .. doctest::
 
     >>> op_map = lp.get_op_map(knl)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    <BLANKLINE>
+    Op(np:dtype('float32'), add) : ...
+
+Each line of output will look roughly like::
+
+    Op(np:dtype('float32'), add) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
@@ -1615,15 +1614,18 @@ together into keys containing only the specified fields:
 
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(lp.stringify_stats_mapping(op_map_dtype))
-    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
-    Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), None) : ...
     <BLANKLINE>
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                           ].eval_with_dict(param_dict)
     >>> print(f32op_count)
     1572864
 
+The lines of output above might look like::
+
+    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
+
 See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
 more information on these functions.
 
@@ -1638,13 +1640,15 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl)
     >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : ...
+    <BLANKLINE>
+
+Each line of output will look roughly like::
+
+
     MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
     MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
     MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
-    <BLANKLINE>
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1693,18 +1697,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : ...
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(None, None, None, load, None) : ...
+    MemAccess(None, None, None, store, None) : ...
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1714,6 +1713,15 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
     bytes loaded: 7340032
     bytes stored: 2621440
 
+The lines of output above might look like::
+
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+
 One can see how these functions might be useful in computing, for example,
 achieved memory bandwidth in byte/sec or performance in FLOP/sec.
 
@@ -1731,12 +1739,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : ...
+    MemAccess(global, np:dtype('float32'), 1, load, b) : ...
+    MemAccess(global, np:dtype('float32'), 1, store, c) : ...
+    MemAccess(global, np:dtype('float64'), 1, load, g) : ...
+    MemAccess(global, np:dtype('float64'), 1, load, h) : ...
+    MemAccess(global, np:dtype('float64'), 1, store, e) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1772,12 +1780,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : ...
+    MemAccess(global, np:dtype('float32'), 128, load, b) : ...
+    MemAccess(global, np:dtype('float32'), 128, store, c) : ...
+    MemAccess(global, np:dtype('float64'), 128, load, g) : ...
+    MemAccess(global, np:dtype('float64'), 128, load, h) : ...
+    MemAccess(global, np:dtype('float64'), 128, store, e) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 7c5de5a1b1d7042498a12204959a59021ac5e0d8..6fa9b5fd30b350a07e2d1d27fa36c930c9afb892 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -2,6 +2,7 @@ import numpy as np
 import loopy as lp
 import pyopencl as cl
 import pyopencl.array
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1
 
 # setup
 # -----
diff --git a/examples/python/vector-types.cl b/examples/python/vector-types.cl
new file mode 100644
index 0000000000000000000000000000000000000000..9e05994dbab82fdd1a60dcfb949549f06c425af2
--- /dev/null
+++ b/examples/python/vector-types.cl
@@ -0,0 +1,26 @@
+#define lid(N) ((int) get_local_id(N))
+#define gid(N) ((int) get_group_id(N))
+#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
+
+__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float4 const *__restrict__ a, int const n, __global float4 *__restrict__ out)
+{
+  /* bulk slab for 'i_outer' */
+  for (int i_outer = 0; i_outer <= -2 + int_floor_div_pos_b(3 + n, 4); ++i_outer)
+    out[i_outer] = 2.0f * a[i_outer];
+  /* final slab for 'i_outer' */
+  {
+    int const i_outer = -1 + n + -1 * int_floor_div_pos_b(3 * n, 4);
+
+    if (-1 + n >= 0)
+    {
+      if (-1 + -4 * i_outer + n >= 0)
+        out[i_outer].s0 = 2.0f * a[i_outer].s0;
+      if (-1 + -4 * i_outer + -1 + n >= 0)
+        out[i_outer].s1 = 2.0f * a[i_outer].s1;
+      if (-1 + -4 * i_outer + -1 * 2 + n >= 0)
+        out[i_outer].s2 = 2.0f * a[i_outer].s2;
+      if (-1 + -4 * i_outer + -1 * 3 + n >= 0)
+        out[i_outer].s3 = 2.0f * a[i_outer].s3;
+    }
+  }
+}
diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py
new file mode 100644
index 0000000000000000000000000000000000000000..328aea154bfee653a6fcbf3fc8ad74b08375e13d
--- /dev/null
+++ b/examples/python/vector-types.py
@@ -0,0 +1,21 @@
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.array
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+n = 15 * 10**6
+a = cl.array.arange(queue, n, dtype=np.float32)
+
+knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]")
+
+knl = lp.set_options(knl, write_code=True)
+knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
+knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
+knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+
+knl(queue, a=a.reshape(-1, 4), n=n)
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 5e8a3fb06b733183fb03c09eb6126a3eee98b916..0f4697f92e3f779b5670147c0fe7936989a317c4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -65,6 +65,8 @@ from loopy.library.reduction import register_reduction_parser
 
 # {{{ import transforms
 
+from loopy.version import VERSION, MOST_RECENT_LANGUAGE_VERSION
+
 from loopy.transform.iname import (
         set_loop_priority, prioritize_loops,
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
@@ -171,6 +173,8 @@ __all__ = [
 
         "register_reduction_parser",
 
+        "VERSION", "MOST_RECENT_LANGUAGE_VERSION",
+
         # {{{ transforms
 
         "set_loop_priority", "prioritize_loops",
diff --git a/loopy/check.py b/loopy/check.py
index 7e661b566b15c47ec99e03ffdeb035057602da76..83f529206c6d1d4cb058673162c2285a5ad1356a 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -250,6 +250,8 @@ def check_for_data_dependent_parallel_bounds(kernel):
                         % (i, par, ", ".join(par_inames)))
 
 
+# {{{ check access bounds
+
 class _AccessCheckMapper(WalkMapper):
     def __init__(self, kernel, domain, insn_id):
         self.kernel = kernel
@@ -277,7 +279,8 @@ class _AccessCheckMapper(WalkMapper):
             if not isinstance(subscript, tuple):
                 subscript = (subscript,)
 
-            from loopy.symbolic import get_dependencies, get_access_range
+            from loopy.symbolic import (get_dependencies, get_access_range,
+                    UnableToDetermineAccessRange)
 
             available_vars = set(self.domain.get_var_dict())
             shape_deps = set()
@@ -298,11 +301,8 @@ class _AccessCheckMapper(WalkMapper):
             try:
                 access_range = get_access_range(self.domain, subscript,
                         self.kernel.assumptions)
-            except isl.Error:
-                # Likely: index was non-linear, nothing we can do.
-                return
-            except TypeError:
-                # Likely: index was non-linear, nothing we can do.
+            except UnableToDetermineAccessRange:
+                # Likely: index was non-affine, nothing we can do.
                 return
 
             shape_domain = isl.BasicSet.universe(access_range.get_space())
@@ -340,6 +340,10 @@ def check_bounds(kernel):
 
         insn.with_transformed_expressions(run_acm)
 
+# }}}
+
+
+# {{{ check write destinations
 
 def check_write_destinations(kernel):
     for insn in kernel.instructions:
@@ -363,6 +367,10 @@ def check_write_destinations(kernel):
                     or wvar in kernel.arg_dict) and wvar not in kernel.all_params():
                 raise LoopyError
 
+# }}}
+
+
+# {{{ check_has_schedulable_iname_nesting
 
 def check_has_schedulable_iname_nesting(kernel):
     from loopy.transform.iname import (has_schedulable_iname_nesting,
@@ -382,6 +390,196 @@ def check_has_schedulable_iname_nesting(kernel):
 # }}}
 
 
+# {{{ check_variable_access_ordered
+
+class IndirectDependencyEdgeFinder(object):
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.dep_edge_cache = {}
+
+    def __call__(self, depender_id, dependee_id):
+        cache_key = (depender_id, dependee_id)
+
+        try:
+            return self.dep_edge_cache[cache_key]
+        except KeyError:
+            pass
+
+        depender = self.kernel.id_to_insn[depender_id]
+
+        if dependee_id in depender.depends_on:
+            self.dep_edge_cache[cache_key] = True
+            return True
+
+        for dep in depender.depends_on:
+            if self(dep, dependee_id):
+                self.dep_edge_cache[cache_key] = True
+                return True
+
+        return False
+
+
+def declares_nosync_with(kernel, var_scope, dep_a, dep_b):
+    from loopy.kernel.data import temp_var_scope
+    if var_scope == temp_var_scope.GLOBAL:
+        search_scopes = ["global", "any"]
+    elif var_scope == temp_var_scope.LOCAL:
+        search_scopes = ["local", "any"]
+    elif var_scope == temp_var_scope.PRIVATE:
+        search_scopes = ["any"]
+    else:
+        raise ValueError("unexpected value of 'temp_var_scope'")
+
+    ab_nosync = False
+    ba_nosync = False
+
+    for scope in search_scopes:
+        if (dep_a.id, scope) in dep_b.no_sync_with:
+            ab_nosync = True
+        if (dep_b.id, scope) in dep_a.no_sync_with:
+            ba_nosync = True
+
+    return ab_nosync and ba_nosync
+
+
+def check_variable_access_ordered(kernel):
+    """Checks that between each write to a variable and all other accesses to
+    the variable there is either:
+
+    * an (at least indirect) depdendency edge, or
+    * an explicit statement that no ordering is necessary (expressed
+      through a bi-directional :attr:`loopy.Instruction.no_sync_with`)
+    """
+    if kernel.options.enforce_variable_access_ordered not in [
+            "no_check",
+            True,
+            False]:
+        raise LoopyError("invalid value for option "
+                "'enforce_variable_access_ordered': %s"
+                % kernel.options.enforce_variable_access_ordered)
+
+    if kernel.options.enforce_variable_access_ordered == "no_check":
+        return
+
+    logger.debug("%s: check_variable_access_ordered: start" % kernel.name)
+
+    checked_variables = kernel.get_written_variables() & (
+            set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict))
+
+    wmap = kernel.writer_map()
+    rmap = kernel.reader_map()
+
+    from loopy.kernel.data import GlobalArg, ValueArg, temp_var_scope
+    from loopy.kernel.tools import find_aliasing_equivalence_classes
+
+    depfind = IndirectDependencyEdgeFinder(kernel)
+    aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel)
+
+    for name in checked_variables:
+        # This is a tad redundant in that this could probably be restructured
+        # to iterate only over equivalence classes and not individual variables.
+        # But then the access-range overlap check below would have to be smarter.
+        eq_class = aliasing_equiv_classes[name]
+
+        readers = set.union(
+                *[rmap.get(eq_name, set()) for eq_name in eq_class])
+        writers = set.union(
+                *[wmap.get(eq_name, set()) for eq_name in eq_class])
+        unaliased_readers = rmap.get(name, set())
+        unaliased_writers = wmap.get(name, set())
+
+        if not writers:
+            continue
+
+        if name in kernel.temporary_variables:
+            scope = kernel.temporary_variables[name].scope
+        else:
+            arg = kernel.arg_dict[name]
+            if isinstance(arg, GlobalArg):
+                scope = temp_var_scope.GLOBAL
+            elif isinstance(arg, ValueArg):
+                scope = temp_var_scope.PRIVATE
+            else:
+                # No need to consider ConstantArg and ImageArg (for now)
+                # because those won't be written.
+                raise ValueError("could not determine scope of '%s'" % name)
+
+        # Check even for PRIVATE scope, to ensure intentional program order.
+
+        from loopy.symbolic import do_access_ranges_overlap_conservative
+
+        for writer_id in writers:
+            for other_id in readers | writers:
+                if writer_id == other_id:
+                    continue
+
+                writer = kernel.id_to_insn[writer_id]
+                other = kernel.id_to_insn[other_id]
+
+                has_dependency_relationship = (
+                        declares_nosync_with(kernel, scope, other, writer)
+                        or
+                        depfind(writer_id, other_id)
+                        or
+                        depfind(other_id, writer_id)
+                        )
+
+                if has_dependency_relationship:
+                    continue
+
+                is_relationship_by_aliasing = not (
+                        writer_id in unaliased_writers
+                        and (other_id in unaliased_writers
+                            or other_id in unaliased_readers))
+
+                # Do not enforce ordering for disjoint access ranges
+                if (not is_relationship_by_aliasing
+                        and not do_access_ranges_overlap_conservative(
+                            kernel, writer_id, "w", other_id, "any",
+                            name)):
+                    continue
+
+                # Do not enforce ordering for aliasing-based relationships
+                # in different groups.
+                if (is_relationship_by_aliasing and (
+                        bool(writer.groups & other.conflicts_with_groups)
+                        or
+                        bool(other.groups & writer.conflicts_with_groups))):
+                    continue
+
+                msg = ("No dependency relationship found between "
+                        "'{writer_id}' which writes {var} and "
+                        "'{other_id}' which also accesses {var}. "
+                        "Either add a (possibly indirect) dependency "
+                        "between the two, or add them to each others' nosync "
+                        "set to indicate that no ordering is intended, or "
+                        "turn off this check by setting the "
+                        "'enforce_variable_access_ordered' option"
+                        .format(
+                            writer_id=writer_id,
+                            other_id=other_id,
+                            var=(
+                                "the variable '%s'" % name
+                                if len(eq_class) == 1
+                                else (
+                                    "the aliasing equivalence class '%s'"
+                                    % ", ".join(eq_class))
+                                )))
+                if kernel.options.enforce_variable_access_ordered:
+                    from loopy.diagnostic import VariableAccessNotOrdered
+                    raise VariableAccessNotOrdered(msg)
+                else:
+                    from loopy.diagnostic import warn_with_kernel
+                    warn_with_kernel(
+                            kernel, "variable_access_ordered", msg)
+
+    logger.debug("%s: check_variable_access_ordered: done" % kernel.name)
+
+# }}}
+
+# }}}
+
+
 def pre_schedule_checks(kernel):
     try:
         logger.debug("%s: pre-schedule check: start" % kernel.name)
@@ -397,6 +595,7 @@ def pre_schedule_checks(kernel):
         check_bounds(kernel)
         check_write_destinations(kernel)
         check_has_schedulable_iname_nesting(kernel)
+        check_variable_access_ordered(kernel)
 
         logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 4868f70af81ae54972e7d81282b62798da233407..c2b78f4d78698998cecb1f082aae2ed433310aed 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -115,6 +115,10 @@ class LoopyTypeError(LoopyError):
 class ExpressionNotAffineError(LoopyError):
     pass
 
+
+class VariableAccessNotOrdered(LoopyError):
+    pass
+
 # }}}
 
 
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index e801d09dcf10750ce09af647e0b14f4641fa1fb2..bcbe41874d8613eaabd84ae71dd65317558f0185 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -708,6 +708,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
             # }}}
 
+            from loopy.version import MOST_RECENT_LANGUAGE_VERSION
             knl = lp.make_kernel(
                     sub.index_sets,
                     sub.instructions,
@@ -717,6 +718,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
                     index_dtype=self.index_dtype,
                     target=self.target,
                     seq_dependencies=seq_dependencies,
+                    lang_version=MOST_RECENT_LANGUAGE_VERSION
                     )
 
             from loopy.loop import fuse_loop_domains
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 4a08c28bd8091425293892384e01d20447413cd5..0daf327f441031662b46a4a83b4fc40e73eb5688 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1666,7 +1666,7 @@ def _is_wildcard(s):
     return any(c in s for c in WILDCARD_SYMBOLS)
 
 
-def _resolve_dependencies(knl, insn, deps):
+def _resolve_dependencies(what, knl, insn, deps):
     from loopy import find_instructions
     from loopy.match import MatchExpressionBase
 
@@ -1692,10 +1692,11 @@ def _resolve_dependencies(knl, insn, deps):
                 found_any = True
 
         if not found_any and knl.options.check_dep_resolution:
-            raise LoopyError("instruction '%s' declared a depency on '%s', "
+            raise LoopyError("instruction '%s' declared %s on '%s', "
                     "which did not resolve to any instruction present in the "
                     "kernel '%s'. Set the kernel option 'check_dep_resolution'"
-                    "to False to disable this check." % (insn.id, dep, knl.name))
+                    " to False to disable this check."
+                    % (insn.id, what, dep, knl.name))
 
     for dep_id in new_deps:
         if dep_id not in knl.id_to_insn:
@@ -1710,13 +1711,14 @@ def resolve_dependencies(knl):
 
     for insn in knl.instructions:
         new_insns.append(insn.copy(
-                    depends_on=_resolve_dependencies(knl, insn, insn.depends_on),
-                    no_sync_with=frozenset(
-                        (resolved_insn_id, nosync_scope)
-                        for nosync_dep, nosync_scope in insn.no_sync_with
-                        for resolved_insn_id in
-                        _resolve_dependencies(knl, insn, (nosync_dep,))),
-                    ))
+            depends_on=_resolve_dependencies(
+                "a dependency", knl, insn, insn.depends_on),
+            no_sync_with=frozenset(
+                (resolved_insn_id, nosync_scope)
+                for nosync_dep, nosync_scope in insn.no_sync_with
+                for resolved_insn_id in
+                _resolve_dependencies("nosync", knl, insn, (nosync_dep,))),
+            ))
 
     return knl.copy(instructions=new_insns)
 
@@ -1909,6 +1911,30 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         will be fixed to *value*. *name* may refer to :ref:`domain-parameters`
         or :ref:`arguments`. See also :func:`loopy.fix_parameters`.
 
+    :arg lang_version: The language version against which the kernel was
+        written, a tuple. To ensure future compatibility, copy the current value of
+        :data:`loopy.MOST_RECENT_LANGUAGE_VERSION` and pass that value.
+
+        (If you just pass :data:`loopy.MOST_RECENT_LANGUAGE_VERSION` directly,
+        breaking language changes *will* apply to your kernel without asking,
+        likely breaking your code.)
+
+        If not given, this value defaults to version **(2017, 2, 1)** and
+        a warning will be issued.
+
+        To set the kernel version for all :mod:`loopy` kernels in a (Python) source
+        file, you may simply say::
+
+            from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1
+
+        If *lang_version* is not explicitly given, that version value will be used.
+
+        See also :ref:`language-versioning`.
+
+    .. versionchanged:: 2017.2.1
+
+        *lang_version* added.
+
     .. versionchanged:: 2017.2
 
         *fixed_parameters* added.
@@ -1953,6 +1979,56 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     from loopy.options import make_options
     options = make_options(options)
 
+    lang_version = kwargs.pop("lang_version", None)
+    if lang_version is None:
+        # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION
+
+        from loopy.version import LANGUAGE_VERSION_SYMBOLS
+
+        # This *is* gross. But it seems like the right thing interface-wise.
+        import inspect
+        caller_globals = inspect.currentframe().f_back.f_globals
+
+        for ver_sym in LANGUAGE_VERSION_SYMBOLS:
+            try:
+                lang_version = caller_globals[ver_sym]
+                break
+            except KeyError:
+                pass
+
+        # }}}
+
+        import loopy.version
+        version_to_symbol = dict(
+                (getattr(loopy.version, lvs), lvs)
+                for lvs in LANGUAGE_VERSION_SYMBOLS)
+
+        if lang_version is None:
+            from warnings import warn
+            from loopy.diagnostic import LoopyWarning
+            from loopy.version import (
+                    MOST_RECENT_LANGUAGE_VERSION,
+                    FALLBACK_LANGUAGE_VERSION)
+            warn("'lang_version' was not passed to make_kernel(). "
+                    "To avoid this warning, pass "
+                    "lang_version={ver} in this invocation. "
+                    "(Or say 'from loopy.version import "
+                    "{sym_ver}' in "
+                    "the global scope of the calling frame.)"
+                    .format(
+                        ver=MOST_RECENT_LANGUAGE_VERSION,
+                        sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION]
+                        ),
+                    LoopyWarning, stacklevel=2)
+
+            lang_version = FALLBACK_LANGUAGE_VERSION
+
+        if lang_version not in version_to_symbol:
+            raise LoopyError("Language version '%s' is not known." % lang_version)
+
+    if lang_version >= (2018, 1):
+        options = options.copy(enforce_variable_access_ordered=True)
+
     if isinstance(silenced_warnings, str):
         silenced_warnings = silenced_warnings.split(";")
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 9d95408acc7f1a53f1f1a7616f7d6611249c796b..95001c78bb1f3ef0c6e823589075ddb6e3fbb506 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -91,7 +91,7 @@ class InstructionBase(ImmutableRecord):
 
     .. attribute:: no_sync_with
 
-        a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where
+        a :class:`frozenset` of tuples of the form ``(insn_id, scope)``, where
         `insn_id` refers to :attr:`id` of :class:`Instruction` instances
         and `scope` is one of the following strings:
 
@@ -99,13 +99,20 @@ class InstructionBase(ImmutableRecord):
            - `"global"`
            - `"any"`.
 
-        This indicates no barrier synchronization is necessary with the given
-        instruction using barriers of type `scope`, even given the existence of
-        a dependency chain and apparently conflicting access.
+        An element ``(insn_id, scope)`` means "do not consider any variable
+        access conflicting for variables of ``scope`` between this instruction
+        and ``insn_id``".
+        Specifically, loopy will not complain even if it detects that accesses
+        potentially requiring ordering (e.g. by dependencies) exist, and it
+        will not emit barriers to guard any dependencies from this
+        instruction on ``insn_id`` that may exist.
 
         Note, that :attr:`no_sync_with` allows instruction matching through wildcards
         and match expression, just like :attr:`depends_on`.
 
+        This data is used specifically by barrier insertion and
+        :func:`loopy.check.enforce_variable_access_ordered`.
+
     .. rubric:: Conditionals
 
     .. attribute:: predicates
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index fbc4238c21e966cb61d1c074ce6924fd9af26084..15db06ad78b448675b193a2d880ae5b50073e99d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1278,14 +1278,14 @@ def draw_dependencies_as_unicode_arrows(
 
     for insn in instructions:
         for dep in insn.depends_on:
-            reverse_deps.setdefault(dep, []).append(insn.id)
+            reverse_deps.setdefault(dep, set()).add(insn.id)
 
     # mapping of (from_id, to_id) tuples to column_index
     dep_to_column = {}
 
     # {{{ find column assignments
 
-    # mapping from column indices to (end_insn_id, updown)
+    # mapping from column indices to (end_insn_ids, pointed_at_insn_id)
     columns_in_use = {}
 
     n_columns = [0]
@@ -1299,47 +1299,101 @@ def draw_dependencies_as_unicode_arrows(
             row.append(" ")
         return i
 
-    def do_flag_downward(s, updown):
-        if flag_downward and updown == "down":
+    def do_flag_downward(s, pointed_at_insn_id):
+        if flag_downward and pointed_at_insn_id not in processed_ids:
             return fore.RED+s+style.RESET_ALL
         else:
             return s
 
     def make_extender():
         result = n_columns[0] * [" "]
-        for col, (_, updown) in six.iteritems(columns_in_use):
-            result[col] = do_flag_downward(u"│", updown)
+        for col, (_, pointed_at_insn_id) in six.iteritems(columns_in_use):
+            result[col] = do_flag_downward(u"│", pointed_at_insn_id)
 
         return result
 
+    processed_ids = set()
+
     rows = []
     for insn in instructions:
         row = make_extender()
 
-        for rdep in reverse_deps.get(insn.id, []):
-            assert rdep != insn.id
+        # {{{ add rdeps for already existing columns
 
-            dep_key = (rdep, insn.id)
-            if dep_key not in dep_to_column:
-                col = dep_to_column[dep_key] = find_free_column()
-                columns_in_use[col] = (rdep, "up")
-                row[col] = u"↱"
+        rdeps = reverse_deps.get(insn.id, set()).copy() - processed_ids
+        assert insn.id not in rdeps
+
+        if insn.id in dep_to_column:
+            columns_in_use[dep_to_column[insn.id]][0].update(rdeps)
+
+        # }}}
+
+        # {{{ add deps for already existing columns
+
+        for dep in insn.depends_on:
+            dep_key = dep
+            if dep_key in dep_to_column:
+                col = dep_to_column[dep]
+                columns_in_use[col][0].add(insn.id)
+
+        # }}}
+
+        for col, (starts, pointed_at_insn_id) in list(six.iteritems(columns_in_use)):
+            if insn.id == pointed_at_insn_id:
+                if starts:
+                    # will continue downward
+                    row[col] = do_flag_downward(u">", pointed_at_insn_id)
+                else:
+                    # stops here
+
+                    # placeholder, pending deletion
+                    columns_in_use[col] = None
+
+                    row[col] = do_flag_downward(u"↳", pointed_at_insn_id)
+
+            elif insn.id in starts:
+                starts.remove(insn.id)
+                if starts:
+                    # will continue downward
+                    row[col] = do_flag_downward(u"├", pointed_at_insn_id)
+
+                else:
+                    # stops here
+                    row[col] = u"â””"
+                    # placeholder, pending deletion
+                    columns_in_use[col] = None
+
+        # {{{ start arrows by reverse dep
+
+        dep_key = insn.id
+        if dep_key not in dep_to_column and rdeps:
+            col = dep_to_column[dep_key] = find_free_column()
+            columns_in_use[col] = (rdeps, insn.id)
+            row[col] = u"↱"
+
+        # }}}
+
+        # {{{ start arrows by forward dep
 
         for dep in insn.depends_on:
             assert dep != insn.id
-            dep_key = (insn.id, dep)
+            dep_key = dep
             if dep_key not in dep_to_column:
                 col = dep_to_column[dep_key] = find_free_column()
-                columns_in_use[col] = (dep, "down")
-                row[col] = do_flag_downward(u"┌", "down")
+                columns_in_use[col] = (set([insn.id]), dep)
+                row[col] = do_flag_downward(u"┌", dep)
 
-        for col, (end, updown) in list(six.iteritems(columns_in_use)):
-            if insn.id == end:
+        # }}}
+
+        # {{{ delete columns_in_use entry for end-of-life columns
+
+        for col, value in list(six.iteritems(columns_in_use)):
+            if value is None:
                 del columns_in_use[col]
-                if updown == "up":
-                    row[col] = u"â””"
-                else:
-                    row[col] = do_flag_downward(u"↳", updown)
+
+        # }}}
+
+        processed_ids.add(insn.id)
 
         extender = make_extender()
 
@@ -1731,4 +1785,84 @@ def get_subkernel_to_insn_id_map(kernel):
 # }}}
 
 
+# {{{ find aliasing equivalence classes
+
+class DisjointSets(object):
+    """
+    .. automethod:: __getitem__
+    .. automethod:: find_leader_or_create_group
+    .. automethod:: union
+    .. automethod:: union_many
+    """
+
+    # https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+    def __init__(self):
+        self.leader_to_group = {}
+        self.element_to_leader = {}
+
+    def __getitem__(self, item):
+        """
+        :arg item: A representative of an equivalence class.
+        :returns: the equivalence class, given as a set of elements
+        """
+        try:
+            leader = self.element_to_leader[item]
+        except KeyError:
+            return set([item])
+        else:
+            return self.leader_to_group[leader]
+
+    def find_leader_or_create_group(self, el):
+        try:
+            return self.element_to_leader[el]
+        except KeyError:
+            pass
+
+        self.element_to_leader[el] = el
+        self.leader_to_group[el] = set([el])
+        return el
+
+    def union(self, a, b):
+        leader_a = self.find_leader_or_create_group(a)
+        leader_b = self.find_leader_or_create_group(b)
+
+        if leader_a == leader_b:
+            return
+
+        new_leader = leader_a
+
+        for b_el in self.leader_to_group[leader_b]:
+            self.element_to_leader[b_el] = new_leader
+
+        self.leader_to_group[leader_a].update(self.leader_to_group[leader_b])
+        del self.leader_to_group[leader_b]
+
+    def union_many(self, relation):
+        """
+        :arg relation: an iterable of 2-tuples enumerating the elements of the
+            relation. The relation is assumed to be an equivalence relation
+            (transitive, reflexive, symmetric) but need not explicitly contain
+            all elements to make it that.
+
+            The first elements of the tuples become group leaders.
+
+        :returns: *self*
+        """
+
+        for a, b in relation:
+            self.union(a, b)
+
+        return self
+
+
+def find_aliasing_equivalence_classes(kernel):
+    return DisjointSets().union_many(
+            (tv.base_storage, tv.name)
+            for tv in six.itervalues(kernel.temporary_variables)
+            if tv.base_storage is not None)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/options.py b/loopy/options.py
index 13d0b752dfcfa0f0da233880f27f09a963ab4c81..63089d94d3487e77a1def39a98fe24631c508398 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -162,6 +162,16 @@ class Options(ImmutableRecord):
     .. rubric:: Features
 
     .. attribute:: disable_global_barriers
+
+    .. attribute:: enforce_variable_access_ordered
+
+        If *True*, require that
+        :func:`loopy.check.check_variable_access_ordered` passes.
+        Required for language versions 2018.1 and above. This check
+        helps find and eliminate unintentionally unordered access
+        to variables.
+
+        If equal to ``"no_check"``, then no check is performed.
     """
 
     _legacy_options_map = {
@@ -216,6 +226,9 @@ class Options(ImmutableRecord):
                 disable_global_barriers=kwargs.get("disable_global_barriers",
                     False),
                 check_dep_resolution=kwargs.get("check_dep_resolution", True),
+
+                enforce_variable_access_ordered=kwargs.get(
+                    "enforce_variable_access_ordered", False),
                 )
 
     # {{{ legacy compatibility
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ad119e94e74b294e16cdc15c5ab1f723cf7f254b..5e36e51a190f9ba93a48aa8eaab5e34c20153e47 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -1861,9 +1861,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             # An expansion happened, so insert the generated stuff plus
             # ourselves back into the queue.
 
+            result_assignment_dep_on = \
+                    insn.depends_on | frozenset(new_insn_add_depends_on)
             kwargs = insn.get_copy_kwargs(
-                    depends_on=insn.depends_on
-                    | frozenset(new_insn_add_depends_on),
                     no_sync_with=insn.no_sync_with
                     | frozenset(new_insn_add_no_sync_with),
                     within_inames=(
@@ -1871,6 +1871,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                         | new_insn_add_within_inames))
 
             kwargs.pop("id")
+            kwargs.pop("depends_on")
             kwargs.pop("expression")
             kwargs.pop("assignee", None)
             kwargs.pop("assignees", None)
@@ -1878,20 +1879,27 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             kwargs.pop("temp_var_types", None)
 
             if isinstance(insn.expression, Reduction) and nresults > 1:
+                result_assignment_ids = [
+                        insn_id_gen(insn.id) for i in range(nresults)]
                 replacement_insns = [
                         lp.Assignment(
-                            id=insn_id_gen(insn.id),
+                            id=result_assignment_ids[i],
+                            depends_on=(
+                                result_assignment_dep_on
+                                | (frozenset([result_assignment_ids[i-1]])
+                                    if i else frozenset())),
                             assignee=assignee,
                             expression=new_expr,
                             **kwargs)
-                        for assignee, new_expr in zip(
-                            insn.assignees, new_expressions)]
+                        for i, (assignee, new_expr) in enumerate(zip(
+                            insn.assignees, new_expressions))]
 
             else:
                 new_expr, = new_expressions
                 replacement_insns = [
                         make_assignment(
                             id=insn_id_gen(insn.id),
+                            depends_on=result_assignment_dep_on,
                             assignees=insn.assignees,
                             expression=new_expr,
                             **kwargs)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 850f0a61fcdc2878d43895bc0e024032532aa680..b196b343edebf0b9346b449bc5a44bcc065407a2 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1427,8 +1427,8 @@ class DependencyTracker(object):
             raise ValueError("unknown 'var_kind': %s" % var_kind)
 
         from collections import defaultdict
-        self.writer_map = defaultdict(set)
-        self.reader_map = defaultdict(set)
+        self.base_writer_map = defaultdict(set)
+        self.base_access_map = defaultdict(set)
         self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map()
 
     def map_to_base_storage(self, var_names):
@@ -1442,23 +1442,27 @@ class DependencyTracker(object):
         return result
 
     def discard_all_sources(self):
-        self.writer_map.clear()
-        self.reader_map.clear()
+        self.base_writer_map.clear()
+        self.base_access_map.clear()
+
+    # Anything with 'base' in the name in this class contains names normalized
+    # to their 'base_storage'.
 
     def add_source(self, source):
         """
-        Specify that an instruction may be used as the source of a dependency edge.
+        Specify that an instruction used as the source (depended-upon
+        part) of a dependency edge is of interest to this tracker.
         """
         # If source is an insn ID, look up the actual instruction.
         source = self.kernel.id_to_insn.get(source, source)
 
         for written in self.map_to_base_storage(
                 set(source.assignee_var_names()) & self.relevant_vars):
-            self.writer_map[written].add(source.id)
+            self.base_writer_map[written].add(source.id)
 
         for read in self.map_to_base_storage(
-                source.read_dependency_names() & self.relevant_vars):
-            self.reader_map[read].add(source.id)
+                source.dependency_names() & self.relevant_vars):
+            self.base_access_map[read].add(source.id)
 
     def gen_dependencies_with_target_at(self, target):
         """
@@ -1471,51 +1475,87 @@ class DependencyTracker(object):
         # If target is an insn ID, look up the actual instruction.
         target = self.kernel.id_to_insn.get(target, target)
 
-        tgt_write = self.map_to_base_storage(
-            set(target.assignee_var_names()) & self.relevant_vars)
-        tgt_read = self.map_to_base_storage(
-            target.read_dependency_names() & self.relevant_vars)
-
-        for (accessed_vars, accessor_map) in [
-                (tgt_read, self.writer_map),
-                (tgt_write, self.reader_map),
-                (tgt_write, self.writer_map)]:
+        for (
+                tgt_dir, src_dir, src_base_var_to_accessor_map
+                ) in [
+                ("any", "w", self.base_writer_map),
+                ("w", "any", self.base_access_map),
+                ]:
 
             for dep in self.get_conflicting_accesses(
-                    accessed_vars, accessor_map, target.id):
+                    target, tgt_dir, src_dir, src_base_var_to_accessor_map):
                 yield dep
 
-    def get_conflicting_accesses(
-            self, accessed_vars, var_to_accessor_map, target):
+    def get_conflicting_accesses(self, target, tgt_dir, src_dir,
+            src_base_var_to_accessor_map):
+
+        def get_written_names(insn):
+            return set(insn.assignee_var_names()) & self.relevant_vars
+
+        def get_accessed_names(insn):
+            return insn.dependency_names() & self.relevant_vars
+
+        dir_to_getter = {"w": get_written_names, "any": get_accessed_names}
+
+        def filter_var_set_for_base_storage(var_name_set, base_storage_name):
+            return set(
+                    name
+                    for name in var_name_set
+                    if (self.temp_to_base_storage.get(name, name)
+                        == base_storage_name))
+
+        tgt_accessed_vars = dir_to_getter[tgt_dir](target)
+        tgt_accessed_vars_base = self.map_to_base_storage(tgt_accessed_vars)
+
+        for race_var_base in sorted(tgt_accessed_vars_base):
+            for source_id in sorted(
+                    src_base_var_to_accessor_map[race_var_base]):
 
-        def determine_conflict_nature(source, target):
-            if (not self.reverse and source in
-                    self.kernel.get_nosync_set(target, scope=self.var_kind)):
-                return None
-            if (self.reverse and target in
-                    self.kernel.get_nosync_set(source, scope=self.var_kind)):
-                return None
-            return self.describe_dependency(source, target)
+                # {{{ no barrier if nosync
 
-        for var in sorted(accessed_vars):
-            for source in sorted(var_to_accessor_map[var]):
-                dep_descr = determine_conflict_nature(source, target)
+                if (not self.reverse and source_id in
+                        self.kernel.get_nosync_set(target.id, scope=self.var_kind)):
+                    continue
+                if (self.reverse and target.id in
+                        self.kernel.get_nosync_set(source_id, scope=self.var_kind)):
+                    continue
 
+                # }}}
+
+                dep_descr = self.describe_dependency(source_id, target)
                 if dep_descr is None:
                     continue
 
+                source = self.kernel.id_to_insn[source_id]
+                src_race_vars = filter_var_set_for_base_storage(
+                        dir_to_getter[src_dir](source), race_var_base)
+                tgt_race_vars = filter_var_set_for_base_storage(
+                        tgt_accessed_vars, race_var_base)
+
+                race_var = race_var_base
+
+                # Only one (non-base_storage) race variable name: Data is not
+                # being passed between aliases, so we may look at indices.
+                if src_race_vars == tgt_race_vars and len(src_race_vars) == 1:
+                    race_var, = src_race_vars
+
+                    from loopy.symbolic import do_access_ranges_overlap_conservative
+                    if not do_access_ranges_overlap_conservative(
+                            self.kernel, target.id, tgt_dir,
+                            source_id, src_dir, race_var):
+                        continue
+
                 yield DependencyRecord(
-                        source=self.kernel.id_to_insn[source],
-                        target=self.kernel.id_to_insn[target],
+                        source=source,
+                        target=target,
                         dep_descr=dep_descr,
-                        variable=var,
+                        variable=race_var,
                         var_kind=self.var_kind)
 
-    def describe_dependency(self, source, target):
+    def describe_dependency(self, source_id, target):
         dep_descr = None
 
-        source = self.kernel.id_to_insn[source]
-        target = self.kernel.id_to_insn[target]
+        source = self.kernel.id_to_insn[source_id]
 
         if self.reverse:
             source, target = target, source
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 9e16c3a598246aa71e125ce3d04f372d7c90f28e..242ba6ab79e7b58b99b0d416f0c4b3025fdb7f09 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1537,6 +1537,10 @@ class PrimeAdder(IdentityMapper):
 
 # {{{ get access range
 
+class UnableToDetermineAccessRange(Exception):
+    pass
+
+
 def get_access_range(domain, subscript, assumptions):
     domain, assumptions = isl.align_two(domain,
             assumptions)
@@ -1558,8 +1562,17 @@ def get_access_range(domain, subscript, assumptions):
     access_map = access_map.insert_dims(dim_type.set, dn, dims)
 
     for idim in range(dims):
-        idx_aff = aff_from_expr(access_map.get_space(),
-                subscript[idim])
+        sub_idim = subscript[idim]
+        with isl.SuppressedWarnings(domain.get_ctx()):
+            try:
+                idx_aff = aff_from_expr(access_map.get_space(), sub_idim)
+            except TypeError as e:
+                raise UnableToDetermineAccessRange(
+                        "%s: %s" % (type(e).__name__, str(e)))
+            except isl.Error as e:
+                raise UnableToDetermineAccessRange(
+                        "%s: %s" % (type(e).__name__, str(e)))
+
         idx_aff = idx_aff.set_coefficient_val(
                 dim_type.in_, dn+idim, -1)
 
@@ -1582,11 +1595,11 @@ def get_access_range(domain, subscript, assumptions):
 
 class BatchedAccessRangeMapper(WalkMapper):
 
-    def __init__(self, kernel, arg_names):
+    def __init__(self, kernel, var_names):
         self.kernel = kernel
-        self.arg_names = set(arg_names)
-        self.access_ranges = dict((arg, None) for arg in arg_names)
-        self.bad_subscripts = dict((arg, []) for arg in arg_names)
+        self.var_names = set(var_names)
+        self.access_ranges = dict((arg, None) for arg in var_names)
+        self.bad_subscripts = dict((arg, []) for arg in var_names)
 
     def map_subscript(self, expr, inames):
         domain = self.kernel.get_inames_domain(inames)
@@ -1594,7 +1607,7 @@ class BatchedAccessRangeMapper(WalkMapper):
 
         assert isinstance(expr.aggregate, p.Variable)
 
-        if expr.aggregate.name not in self.arg_names:
+        if expr.aggregate.name not in self.var_names:
             return
 
         arg_name = expr.aggregate.name
@@ -1604,7 +1617,12 @@ class BatchedAccessRangeMapper(WalkMapper):
             self.bad_subscripts[arg_name].append(expr)
             return
 
-        access_range = get_access_range(domain, subscript, self.kernel.assumptions)
+        try:
+            access_range = get_access_range(
+                    domain, subscript, self.kernel.assumptions)
+        except UnableToDetermineAccessRange:
+            self.bad_subscripts[arg_name].append(expr)
+            return
 
         if self.access_ranges[arg_name] is None:
             self.access_ranges[arg_name] = access_range
@@ -1622,7 +1640,7 @@ class BatchedAccessRangeMapper(WalkMapper):
     def map_linear_subscript(self, expr, inames):
         self.rec(expr.index, inames)
 
-        if expr.aggregate.name in self.arg_names:
+        if expr.aggregate.name in self.var_names:
             self.bad_subscripts[expr.aggregate.name].append(expr)
 
     def map_reduction(self, expr, inames):
@@ -1634,20 +1652,87 @@ class BatchedAccessRangeMapper(WalkMapper):
 
 class AccessRangeMapper(object):
 
-    def __init__(self, kernel, arg_name):
-        self.arg_name = arg_name
-        self.inner_mapper = BatchedAccessRangeMapper(kernel, [arg_name])
+    def __init__(self, kernel, var_name):
+        self.var_name = var_name
+        self.inner_mapper = BatchedAccessRangeMapper(kernel, [var_name])
 
     def __call__(self, expr, inames):
         return self.inner_mapper(expr, inames)
 
     @property
     def access_range(self):
-        return self.inner_mapper.access_ranges[self.arg_name]
+        return self.inner_mapper.access_ranges[self.var_name]
 
     @property
     def bad_subscripts(self):
-        return self.inner_mapper.bad_subscripts[self.arg_name]
+        return self.inner_mapper.bad_subscripts[self.var_name]
+
+# }}}
+
+
+# {{{ do_access_ranges_overlap_conservative
+
+def _get_access_range_conservative(kernel, insn_id, access_dir, var_name):
+    insn = kernel.id_to_insn[insn_id]
+    from loopy.kernel.instruction import MultiAssignmentBase
+
+    assert access_dir in ["w", "any"]
+
+    if not isinstance(insn, MultiAssignmentBase):
+        if access_dir == "any":
+            return var_name in insn.dependency_names()
+        else:
+            return var_name in insn.write_dependency_names()
+
+    exprs = list(insn.assignees)
+    if access_dir == "any":
+        exprs.append(insn.expression)
+        exprs.extend(insn.predicates)
+
+    arange = False
+    for expr in exprs:
+        arm = AccessRangeMapper(kernel, var_name)
+        arm(expr, kernel.insn_inames(insn))
+
+        if arm.bad_subscripts:
+            return True
+
+        expr_arange = arm.access_range
+        if expr_arange is None:
+            continue
+
+        if arange is False:
+            arange = expr_arange
+        else:
+            arange = arange | expr_arange
+
+    return arange
+
+
+def do_access_ranges_overlap_conservative(
+        kernel, insn1_id, insn1_dir, insn2_id, insn2_dir, var_name):
+    """Determine whether the access ranges to *var_name* in the two
+    given instructions overlap. This determination is made 'conservatively',
+    i.e. if precise information is unavailable, it is concluded that the
+    ranges overlap.
+
+    :arg insn1_dir: either ``"w"`` or ``"any"``, to indicate which
+        type of access is desired--writing or any
+    :arg insn2_dir: either ``"w"`` or ``"any"``
+    :returns: a :class:`bool`
+    """
+
+    insn1_arange = _get_access_range_conservative(
+            kernel, insn1_id, insn1_dir, var_name)
+    insn2_arange = _get_access_range_conservative(
+            kernel, insn2_id, insn2_dir, var_name)
+
+    if insn1_arange is False or insn2_arange is False:
+        return False
+    if insn1_arange is True or insn2_arange is True:
+        return True
+
+    return not (insn1_arange & insn2_arange).is_empty()
 
 # }}}
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 177daa02948b9c07ef1d9856dc04019e69e24897..8e69793e8079864a7e4c3117f267a20d6db3962f 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -314,11 +314,11 @@ class ExecutableCTarget(CTarget):
     """
     An executable CTarget that uses (by default) JIT compilation of C-code
     """
-    from loopy.target.c.c_execution import CCompiler
 
-    def __init__(self, compiler=CCompiler(), fortran_abi=False):
+    def __init__(self, compiler=None, fortran_abi=False):
         super(ExecutableCTarget, self).__init__(fortran_abi=fortran_abi)
-        self.compiler = compiler
+        from loopy.target.c.c_execution import CCompiler
+        self.compiler = compiler or CCompiler()
 
     def get_kernel_executor(self, knl, *args, **kwargs):
         from loopy.target.c.c_execution import CKernelExecutor
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index d8b76d32afa64d308648420904f4f4bf8e2e2316..3634cc71aa73f9079ce4f0180f546cfc4f344b3d 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -29,7 +29,8 @@ from loopy.target.execution import (KernelExecutorBase, _KernelInfo,
                              ExecutionWrapperGeneratorBase, get_highlighted_code)
 from pytools import memoize_method
 from pytools.py_codegen import (Indentation)
-from codepy.toolchain import guess_toolchain
+from pytools.prefork import ExecError
+from codepy.toolchain import guess_toolchain, ToolchainGuessError, GCCToolchain
 from codepy.jit import compile_from_string
 import six
 import ctypes
@@ -216,8 +217,27 @@ class CCompiler(object):
                  source_suffix='c'):
         # try to get a default toolchain
         # or subclass supplied version if available
-        self.toolchain = guess_toolchain() if toolchain is None else toolchain
-        self.source_suffix = source_suffix
+        self.toolchain = toolchain
+        if toolchain is None:
+            try:
+                self.toolchain = guess_toolchain()
+            except (ToolchainGuessError, ExecError):
+                # missing compiler python was built with (likely, Conda)
+                # use a default GCCToolchain
+                logger = logging.getLogger(__name__)
+                logger.warn('Default toolchain guessed from python config '
+                            'not found, replacing with default GCCToolchain.')
+                # this is ugly, but I'm not sure there's a clean way to copy the
+                # default args
+                self.toolchain = GCCToolchain(
+                    cc='gcc',
+                    cflags='-std=c99 -O3 -fPIC'.split(),
+                    ldflags='-shared'.split(),
+                    libraries=[],
+                    library_dirs=[],
+                    defines=[],
+                    source_suffix='c')
+
         if toolchain is None:
             # copy in all differing values
             diff = {'cc': cc,
@@ -229,9 +249,8 @@ class CCompiler(object):
                     'defines': defines}
             # filter empty and those equal to toolchain defaults
             diff = dict((k, v) for k, v in six.iteritems(diff)
-                    if v and
-                    not hasattr(self.toolchain, k) or
-                    getattr(self.toolchain, k) != v)
+                    if v and (not hasattr(self.toolchain, k) or
+                              getattr(self.toolchain, k) != v))
             self.toolchain = self.toolchain.copy(**diff)
         self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy")
         self.source_suffix = source_suffix
@@ -312,14 +331,14 @@ class CompiledCKernel(object):
     to automatically map argument types.
     """
 
-    def __init__(self, knl, idi, dev_code, target, comp=CCompiler()):
+    def __init__(self, knl, idi, dev_code, target, comp=None):
         from loopy.target.c import ExecutableCTarget
         assert isinstance(target, ExecutableCTarget)
         self.target = target
         self.name = knl.name
         # get code and build
         self.code = dev_code
-        self.comp = comp
+        self.comp = comp if comp is not None else CCompiler()
         self.dll = self.comp.build(self.name, self.code)
 
         # get the function declaration for interface with ctypes
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index caee73eb1c3320f03ceac66e55e8f5c0bfadbbc2..c111a02b75243b10de90b2d18d62e3759c575fa8 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -525,11 +525,17 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
             real_sum = p.flattened_sum([self.rec(r, type_context) for r in reals])
 
-            complex_sum = self.rec(complexes[0], type_context, tgt_dtype)
-            for child in complexes[1:]:
-                complex_sum = var("%s_add" % tgt_name)(
-                        complex_sum,
-                        self.rec(child, type_context, tgt_dtype))
+            c_applied = [self.rec(c, type_context, tgt_dtype) for c in complexes]
+
+            def binary_tree_add(start, end):
+                if start + 1 == end:
+                    return c_applied[start]
+                mid = (start + end)//2
+                lsum = binary_tree_add(start, mid)
+                rsum = binary_tree_add(mid, end)
+                return var("%s_add" % tgt_name)(lsum, rsum)
+
+            complex_sum = binary_tree_add(0, len(c_applied))
 
             if real_sum:
                 return var("%s_radd" % tgt_name)(real_sum, complex_sum)
@@ -569,11 +575,17 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             real_prd = p.flattened_product(
                     [self.rec(r, type_context) for r in reals])
 
-            complex_prd = self.rec(complexes[0], type_context, tgt_dtype)
-            for child in complexes[1:]:
-                complex_prd = var("%s_mul" % tgt_name)(
-                        complex_prd,
-                        self.rec(child, type_context, tgt_dtype))
+            c_applied = [self.rec(c, type_context, tgt_dtype) for c in complexes]
+
+            def binary_tree_mul(start, end):
+                if start + 1 == end:
+                    return c_applied[start]
+                mid = (start + end)//2
+                lsum = binary_tree_mul(start, mid)
+                rsum = binary_tree_mul(mid, end)
+                return var("%s_mul" % tgt_name)(lsum, rsum)
+
+            complex_prd = binary_tree_mul(0, len(complexes))
 
             if real_prd:
                 return var("%s_rmul" % tgt_name)(real_prd, complex_prd)
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 37c5d85a1ade5c8f7fadb2c6a785cf7cea3dde40..e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -228,7 +228,8 @@ def tag_instructions(kernel, new_tag, within=None):
 
 # {{{ add nosync
 
-def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
+def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False,
+        empty_ok=False):
     """Add a *no_sync_with* directive between *source* and *sink*.
     *no_sync_with* is only added if *sink* depends on *source* or
     if the instruction pair is in a conflicting group.
@@ -248,8 +249,16 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
     :arg force: A :class:`bool`. If *True*, add a *no_sync_with* directive
         even without the presence of a dependency edge or conflicting
         instruction group.
+    :arg empty_ok: If *True*, do not complain even if no *nosync* tags were
+        added as a result of the transformation.
 
     :return: The updated kernel
+
+    .. versionchanged:: 2018.1
+
+        If the transformation adds no *nosync* directives, it will complain.
+        This used to silently pass. This behavior can be restored using
+        *empty_ok*.
     """
 
     if isinstance(source, str) and source in kernel.id_to_insn:
@@ -264,6 +273,11 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
         sinks = frozenset(
                 sink.id for sink in find_instructions(kernel, sink))
 
+    if not sources and not empty_ok:
+        raise LoopyError("No match found for source specification '%s'." % source)
+    if not sinks and not empty_ok:
+        raise LoopyError("No match found for sink specification '%s'." % sink)
+
     def insns_in_conflicting_groups(insn1_id, insn2_id):
         insn1 = kernel.id_to_insn[insn1_id]
         insn2 = kernel.id_to_insn[insn2_id]
@@ -275,11 +289,12 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
     from collections import defaultdict
     nosync_to_add = defaultdict(set)
 
+    rec_dep_map = kernel.recursive_insn_dep_map()
     for sink in sinks:
         for source in sources:
 
             needs_nosync = force or (
-                    source in kernel.recursive_insn_dep_map()[sink]
+                    source in rec_dep_map[sink]
                     or insns_in_conflicting_groups(source, sink))
 
             if not needs_nosync:
@@ -289,6 +304,12 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
             if bidirectional:
                 nosync_to_add[source].add((sink, scope))
 
+    if not nosync_to_add and not empty_ok:
+        raise LoopyError("No nosync annotations were added as a result "
+                "of this call. add_nosync will (by default) only add them to "
+                "accompany existing dependencies or group exclusions. Maybe you want "
+                "to pass force=True?")
+
     new_instructions = list(kernel.instructions)
 
     for i, insn in enumerate(new_instructions):
diff --git a/loopy/version.py b/loopy/version.py
index 7141a678297ded5e0d6e2f16f065f035a034d540..aeb0b277a6c4de8a6db346aee97014699d591d03 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,76 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v77-islpy%s" % _islpy_version
+
+
+FALLBACK_LANGUAGE_VERSION = (2017, 2, 1)
+MOST_RECENT_LANGUAGE_VERSION = (2018, 1)
+
+LOOPY_USE_LANGUAGE_VERSION_2018_1 = (2018, 1)
+LOOPY_USE_LANGUAGE_VERSION_2017_2_1 = (2017, 2, 1)
+
+LANGUAGE_VERSION_SYMBOLS = [
+        "LOOPY_USE_LANGUAGE_VERSION_2018_1",
+        "LOOPY_USE_LANGUAGE_VERSION_2017_2_1",
+        ]
+
+__doc__ = """
+
+.. currentmodule:: loopy
+.. data:: VERSION
+
+    A tuple representing the current version number of loopy, for example
+    **(2017, 2, 1)**. Direct comparison of these tuples will always yield
+    valid version comparisons.
+
+.. _language-versioning:
+
+Loopy Language Versioning
+-------------------------
+
+At version 2018.1, :mod:`loopy` introduced a language versioning scheme to make
+it easier to evolve the language while retaining backward compatibility. What
+prompted this is the addition of
+:attr:`loopy.Options.enforce_variable_access_ordered`, which (despite
+its name) serves to enable a new check that helps ensure that all variable
+access in a kernel is ordered as intended. Since that has the potential to
+break existing programs, kernels now have to declare support for a given
+language version to let them take advantage of this check.
+
+As a result, :mod:`loopy` will now issue a warning when a call to
+:func:`loopy.make_kernel` does not declare a language version. Such kernels
+will (indefinitely) default to language version 2017.2.1.  If passing a
+language version to :func:`make_kernel` is impractical, you may also import
+one of the ``LOOPY_USE_LANGUAGE_VERSION_...`` symbols given below using::
+
+    from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1
+
+in the global namespace of the function calling :func:`make_kernel`. If
+*lang_version* in that call is not explicitly given, this value will be used.
+
+Language versions will generally reflect the version number of :mod:`loopy` in
+which they were introduced, though it is likely that most versions of
+:mod:`loopy` do not introduce language incompatibilities. In such
+situations, the previous language version number remains. (In fact, we
+will work hard to avoid backward-incompatible language changes.)
+
+.. data:: MOST_RECENT_LANGUAGE_VERSION
+
+    A tuple representing the most recent language version number of loopy, for
+    example **(2018, 1)**. Direct comparison of these tuples will always
+    yield valid version comparisons.
+
+
+History of Language Versions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. data:: LOOPY_USE_LANGUAGE_VERSION_2018_1
+
+    :attr:`loopy.Options.enforce_variable_access_ordered`
+    is turned on by default.
+
+.. data:: LOOPY_USE_LANGUAGE_VERSION_2017_2_1
+
+    Initial legacy language version.
+"""
diff --git a/test/test_apps.py b/test/test_apps.py
index c4844d3a3c5d88e0c4eeccf0d67e9b4284fd744f..12b59e18afc1ae956d6b7a4817e908a8bd89e7dc 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -49,6 +49,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 # {{{ convolutions
 
 def test_convolution(ctx_factory):
@@ -194,7 +197,7 @@ def test_rob_stroud_bernstein(ctx_factory):
 
                     for alpha2
                         tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
-                                {id=write_tmp}
+                                {id=write_tmp,dep=init_w:aind_init}
                         w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                 {id=update_w,dep=init_w:write_tmp}
                         aind = aind + 1 \
@@ -255,7 +258,7 @@ def test_rob_stroud_bernstein_full(ctx_factory):
                     <> w = s**(deg-alpha1) {id=init_w}
 
                     <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
-                            {id=write_tmp}
+                            {id=write_tmp,dep=init_w:aind_init}
                     for alpha2
                         w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                             {id=update_w,dep=init_w:write_tmp}
@@ -269,15 +272,16 @@ def test_rob_stroud_bernstein_full(ctx_factory):
                 <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                 <> s2 = 1-xi2
                 <> r2 = xi2/s2
-                <> w2 = s2**deg
+                <> w2 = s2**deg  {id=w2_init}
 
                 for alpha1_2
                     for i2_2
                         result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
-                                w2 * tmp[alpha1_2, i2_2]
+                                w2 * tmp[alpha1_2, i2_2]  {id=res2,dep=w2_init}
                     end
 
-                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
+                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)  \
+                            {id=w2_update, dep=res2}
                 end
             end
         end
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index d1b3c95caa034191b4b29c49076fc101cd318950..975227b435b2c0738031b5eba8308fc3efbf8fb9 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -40,6 +40,9 @@ else:
     faulthandler.enable()
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_c_target():
     from loopy.target.c import ExecutableCTarget
 
@@ -257,9 +260,10 @@ def test_c_caching():
     # setup test logger to check logs
     tl = TestingLogger()
     tl.start_capture()
-    # remake kernel to clear cache
-    knl = __get_knl()
-    assert np.allclose(knl(b=np.arange(10))[1], np.arange(10))
+    # copy kernel such that we share the same executor cache
+    knl = knl.copy()
+    # but use different args, so we can't cache the result
+    assert np.allclose(knl(b=np.arange(1, 11))[1], np.arange(1, 11))
     # and get logs
     logs = tl.stop_capture()
     # check that we didn't recompile
@@ -291,6 +295,54 @@ def test_c_execution_with_global_temporaries():
     assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))
 
 
+def test_missing_compilers():
+    from loopy.target.c import ExecutableCTarget, CTarget
+    from loopy.target.c.c_execution import CCompiler
+    from codepy.toolchain import GCCToolchain
+
+    def __test(evalfunc, target, **targetargs):
+        n = 10
+
+        knl = lp.make_kernel('{[i]: 0 <= i < n}',
+            """
+                a[i] = b[i]
+            """,
+            [lp.GlobalArg('a', shape=(n,), dtype=np.int32),
+             lp.GlobalArg('b', shape=(n,), dtype=np.int32)],
+            target=target(**targetargs))
+
+        knl = lp.fix_parameters(knl, n=n)
+        return evalfunc(knl)
+
+    assert __test(lambda knl: lp.generate_code_v2(knl).device_code(), CTarget)
+
+    from pytools.prefork import ExecError
+
+    def eval_tester(knl):
+        return np.allclose(knl(a=np.zeros(10, dtype=np.int32),
+                               b=np.arange(10, dtype=np.int32))[1], np.arange(10))
+    import os
+    path_store = os.environ["PATH"]
+    try:
+        # test with path wiped out such that we can't find gcc
+        with pytest.raises(ExecError):
+            os.environ["PATH"] = ''
+            __test(eval_tester, ExecutableCTarget)
+    finally:
+        # make sure we restore the path regardless for future testing
+        os.environ["PATH"] = path_store
+
+    # next test that some made up compiler can be specified
+    ccomp = CCompiler(cc='foo')
+    assert isinstance(ccomp.toolchain, GCCToolchain)
+    assert ccomp.toolchain.cc == 'foo'
+
+    # and that said made up compiler errors out
+
+    with pytest.raises(ExecError):
+        __test(eval_tester, ExecutableCTarget, compiler=ccomp)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_dg.py b/test/test_dg.py
index d65c68ed4c729582d511064d6495535efcf7a9a4..e96c76d88b0ab497c7ccdc7df94880a894607bc3 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -34,6 +34,9 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_dg_volume(ctx_factory):
     #logging.basicConfig(level=logging.DEBUG)
 
diff --git a/test/test_diff.py b/test/test_diff.py
index 95471f9b126fd6b763530d115c21509d14d2ba47..3d19721ac030ceccf819f4135b4e734594384e53 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -48,6 +48,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_diff(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
diff --git a/test/test_domain.py b/test/test_domain.py
index 9d0379a50af188cc84de8e01f8278030b6cc04e2..680ff299292e928c6286a168c3e71a23c60aac9b 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -52,6 +52,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_assume(ctx_factory):
     ctx = ctx_factory()
 
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 842a0127e3118ec8e7a0ea89ed17decc091e8566..ea2f68b617e4af41210dc05988a3eebb7f0e49a6 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -405,15 +405,11 @@ def test_fuse_kernels(ctx_factory):
             fortran_template.format(
                 inner=(xd_line + "\n" + yd_line), name="xyderiv"))
 
-    knl = lp.fuse_kernels((xderiv, yderiv))
+    knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
     knl = lp.prioritize_loops(knl, "e,i,j,k")
 
     assert len(knl.temporary_variables) == 2
 
-    # This is needed for correctness, otherwise ordering could foul things up.
-    knl = lp.assignment_to_subst(knl, "prev")
-    knl = lp.assignment_to_subst(knl, "prev_0")
-
     ctx = ctx_factory()
     lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 3d422f1d8b5a847d4445468978ee529db95c481f..accdebc1237c70f4227adc5bfcba6fa9cf88d190 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -62,6 +62,9 @@ def check_float4(result, ref_result):
                 ref_result[comp], result[comp], rtol=1e-3, atol=1e-3), None
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_axpy(ctx_factory):
     logging.basicConfig(level="INFO")
     ctx = ctx_factory()
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e36a4c2c3cb3f7e70a5b039ea631bbce20923be8..8581ae5b879a7d9a282b5fefb9f3155928e83631 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -52,6 +52,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_globals_decl_once_with_multi_subprogram(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -1085,12 +1088,12 @@ def test_atomic_load(ctx_factory, dtype):
             "{ [i,j]: 0<=i,j<n}",
             """
             for j
-                <> upper = 0
-                <> lower = 0
+                <> upper = 0  {id=init_upper}
+                <> lower = 0  {id=init_lower}
                 temp = 0 {id=init, atomic}
                 for i
-                    upper = upper + i * a[i] {id=sum0}
-                    lower = lower - b[i] {id=sum1}
+                    upper = upper + i * a[i] {id=sum0,dep=init_upper}
+                    lower = lower - b[i] {id=sum1,dep=init_lower}
                 end
                 temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\
                                            nosync=init}
@@ -1608,8 +1611,10 @@ def test_missing_temporary_definition_detection():
 def test_missing_definition_check_respects_aliases():
     # Based on https://github.com/inducer/loopy/issues/69
     knl = lp.make_kernel("{ [i] : 0<=i<n }",
-         ["a[i] = 0",
-          "c[i] = b[i]"],
+            """
+            a[i] = 0
+            c[i] = b[i]  {dep_query=writes:a}
+            """,
          temporary_variables={
              "a": lp.TemporaryVariable("a",
                         dtype=np.float64, shape=("n",), base_storage="base"),
@@ -1846,7 +1851,7 @@ def test_nop(ctx_factory):
                 <> z[i] = z[i+1] + z[i]  {id=wr_z}
                 <> v[i] = 11  {id=wr_v}
                 ... nop {dep=wr_z:wr_v,id=yoink}
-                z[i] = z[i] - z[i+1] + v[i]
+                z[i] = z[i] - z[i+1] + v[i]  {dep=yoink}
             end
             """)
 
@@ -1870,7 +1875,7 @@ def test_global_barrier(ctx_factory):
                     <> z[i] = z[i+1] + z[i]  {id=wr_z,dep=top}
                     <> v[i] = 11  {id=wr_v,dep=top}
                     ... gbarrier {dep=wr_z:wr_v,id=yoink}
-                    z[i] = z[i] - z[i+1] + v[i] {id=iupd}
+                    z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=wr_z}
                 end
                 ... gbarrier {dep=iupd,id=postloop}
                 z[i] = z[i] - z[i+1] + v[i]  {dep=postloop}
@@ -2107,11 +2112,11 @@ def test_if_else(ctx_factory):
             "{ [i]: 0<=i<50}",
             """
             if i % 3 == 0
-                a[i] = 15
+                a[i] = 15  {nosync_query=writes:a}
             elif i % 3 == 1
-                a[i] = 11
+                a[i] = 11  {nosync_query=writes:a}
             else
-                a[i] = 3
+                a[i] = 3  {nosync_query=writes:a}
             end
             """
             )
@@ -2131,14 +2136,14 @@ def test_if_else(ctx_factory):
             for i
                 if i % 2 == 0
                     if i % 3 == 0
-                        a[i] = 15
+                        a[i] = 15  {nosync_query=writes:a}
                     elif i % 3 == 1
-                        a[i] = 11
+                        a[i] = 11  {nosync_query=writes:a}
                     else
-                        a[i] = 3
+                        a[i] = 3  {nosync_query=writes:a}
                     end
                 else
-                    a[i] = 4
+                    a[i] = 4  {nosync_query=writes:a}
                 end
             end
             """
@@ -2159,17 +2164,17 @@ def test_if_else(ctx_factory):
                 if i < 25
                     for j
                         if j % 2 == 0
-                            a[i, j] = 1
+                            a[i, j] = 1  {nosync_query=writes:a}
                         else
-                            a[i, j] = 0
+                            a[i, j] = 0  {nosync_query=writes:a}
                         end
                     end
                 else
                     for j
                         if j % 2 == 0
-                            a[i, j] = 0
+                            a[i, j] = 0  {nosync_query=writes:a}
                         else
-                            a[i, j] = 1
+                            a[i, j] = 1  {nosync_query=writes:a}
                         end
                     end
                 end
@@ -2363,8 +2368,9 @@ def test_nosync_option_parsing():
     assert "id=insn5, no_sync_with=insn1@any" in kernel_str
 
 
-def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
-    from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop)
+def barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
+    from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop,
+            CallKernel, ReturnFromKernel)
     watch_for_barrier = False
     seen_barrier = False
     loop_level = 0
@@ -2374,9 +2380,7 @@ def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
             if sched_item.insn_id == id1:
                 watch_for_barrier = True
             elif sched_item.insn_id == id2:
-                assert watch_for_barrier
-                assert seen_barrier
-                return
+                return watch_for_barrier and seen_barrier
         elif isinstance(sched_item, Barrier):
             if watch_for_barrier and loop_level not in ignore_barriers_in_levels:
                 seen_barrier = True
@@ -2384,6 +2388,11 @@ def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
             loop_level += 1
         elif isinstance(sched_item, LeaveLoop):
             loop_level -= 1
+        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+            pass
+        else:
+            raise RuntimeError("schedule item type '%s' not understood"
+                    % type(sched_item).__name__)
 
     raise RuntimeError("id2 was not seen")
 
@@ -2410,9 +2419,9 @@ def test_barrier_insertion_near_top_of_loop():
 
     print(knl)
 
-    assert_barrier_between(knl, "ainit", "tcomp")
-    assert_barrier_between(knl, "tcomp", "bcomp1")
-    assert_barrier_between(knl, "bcomp1", "bcomp2")
+    assert barrier_between(knl, "ainit", "tcomp")
+    assert barrier_between(knl, "tcomp", "bcomp1")
+    assert barrier_between(knl, "bcomp1", "bcomp2")
 
 
 def test_barrier_insertion_near_bottom_of_loop():
@@ -2437,8 +2446,8 @@ def test_barrier_insertion_near_bottom_of_loop():
 
     print(knl)
 
-    assert_barrier_between(knl, "bcomp1", "bcomp2")
-    assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
+    assert barrier_between(knl, "bcomp1", "bcomp2")
+    assert barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
 def test_barrier_in_overridden_get_grid_size_expanded_kernel():
@@ -2570,10 +2579,10 @@ def test_struct_assignment(ctx_factory):
         "{ [i]: 0<=i<N }",
         """
         for i
-            result[i].hit = i % 2
-            result[i].tmin = i
-            result[i].tmax = i+10
-            result[i].bi = i
+            result[i].hit = i % 2  {nosync_query=writes:result}
+            result[i].tmin = i  {nosync_query=writes:result}
+            result[i].tmax = i+10  {nosync_query=writes:result}
+            result[i].bi = i  {nosync_query=writes:result}
         end
         """,
         [
@@ -2629,8 +2638,8 @@ def test_fixed_parameters(ctx_factory):
     knl = lp.make_kernel(
             "[n] -> {[i]: 0 <= i < n}",
             """
-            <>tmp[i] = i
-            tmp[0] = 0
+            <>tmp[i] = i  {id=init}
+            tmp[0] = 0  {dep=init}
             """,
             fixed_parameters=dict(n=1))
 
@@ -2788,6 +2797,65 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
+def test_check_for_variable_access_ordering():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            a[i] = 12
+            a[i+1] = 13
+            """)
+
+    knl = lp.preprocess_kernel(knl)
+
+    from loopy.diagnostic import VariableAccessNotOrdered
+    with pytest.raises(VariableAccessNotOrdered):
+        lp.get_one_scheduled_kernel(knl)
+
+
+def test_check_for_variable_access_ordering_with_aliasing():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            a[i] = 12
+            b[i+1] = 13
+            """,
+            [
+                lp.TemporaryVariable("a", shape="n+1", base_storage="tmp"),
+                lp.TemporaryVariable("b", shape="n+1", base_storage="tmp"),
+                ])
+
+    knl = lp.preprocess_kernel(knl)
+
+    from loopy.diagnostic import VariableAccessNotOrdered
+    with pytest.raises(VariableAccessNotOrdered):
+        lp.get_one_scheduled_kernel(knl)
+
+
+@pytest.mark.parametrize(("second_index", "expect_barrier"),
+        [
+            ("2*i", True),
+            ("2*i+1", False),
+            ])
+def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier):
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<128}",
+            """
+            a[2*i] = 12  {id=first}
+            a[%s] = 13  {id=second,dep=first}
+            """ % second_index,
+            [
+                lp.TemporaryVariable("a", lp.auto, shape=(256,),
+                    scope=lp.temp_var_scope.LOCAL),
+                ])
+
+    knl = lp.tag_inames(knl, "i:l.0")
+
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+
+    assert barrier_between(knl, "first", "second") == expect_barrier
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_misc.py b/test/test_misc.py
index 0273948b38b28b85e42a600bffb65fbf86dcc554..ec14770a912af978fbc6651110529a86b307df83 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -32,6 +32,9 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_compute_sccs():
     from loopy.tools import compute_sccs
     import random
diff --git a/test/test_nbody.py b/test/test_nbody.py
index e118b04b997020943d79ec1ba566eff85d56199a..f2a8fc1981ddc2066ff52a2b712df95b5d36ccd2 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -34,6 +34,9 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_nbody(ctx_factory):
     logging.basicConfig(level=logging.INFO)
 
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index eff3dbd0e07439bbec399479183a7e9ddb69b9ff..a287ad59d7697eef79336678afa831e73b81784b 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -44,6 +44,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 @pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("ilp_multiple", [1, 2])
 @pytest.mark.parametrize("opt_level", [11])
@@ -57,13 +60,14 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     source = source.replace("datafloat", "real*4")
 
     hsv_r, hsv_s = [
-           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
+           knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False)
            if "KernelR" in knl.name or "KernelS" in knl.name
            ]
     hsv_r = lp.tag_instructions(hsv_r, "rknl")
     hsv_s = lp.tag_instructions(hsv_s, "sknl")
     hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
     #hsv = hsv_s
+    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)
 
     from gnuma_loopy_transforms import (
           fix_euler_parameters,
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 909a800b29c75b13fad494b5a859186b9cd5587c..6b62bad5b50952a3d29beec49cfce4369d5a4acf 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -49,6 +49,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_nonsense_reduction(ctx_factory):
     ctx = ctx_factory()
 
diff --git a/test/test_scan.py b/test/test_scan.py
index 08754819c9a156403aba689cb3e9c238144e7905..44903611d27e14e502c0c8459be9378dbc77a9a4 100644
--- a/test/test_scan.py
+++ b/test/test_scan.py
@@ -56,6 +56,9 @@ __all__ = [
 # - scan(a) + scan(b)
 # - test for badly tagged inames
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 @pytest.mark.parametrize("n", [1, 2, 3, 16])
 @pytest.mark.parametrize("stride", [1, 2])
 def test_sequential_scan(ctx_factory, n, stride):
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index 0571e41910020aa0a60cd911a63b6ce2984ed939..ecb2352ae277bd0677af09801d7bf24ee30da6b9 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -31,6 +31,9 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_tim2d(ctx_factory):
     dtype = np.float32
     ctx = ctx_factory()
diff --git a/test/test_statistics.py b/test/test_statistics.py
index eeb4a5a288afdd5b9295b0b681abb61b5f021d97..e4232e613c569cb4a0d66b500a981643bf5bac05 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -34,6 +34,9 @@ import numpy as np
 from pymbolic.primitives import Variable
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_op_counter_basic():
 
     knl = lp.make_kernel(
diff --git a/test/test_target.py b/test/test_target.py
index d3cf2670cb0db0eb5d0046ce1d816b679d4a1ed8..15964987ab3d83d31c91ea266f29698a695c74a6 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -52,6 +52,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_ispc_target(occa_mode=False):
     from loopy.target.ispc import ISPCTarget
 
@@ -203,8 +206,8 @@ def test_random123(ctx_factory, tp):
             <> key2 = make_uint2(i, 324830944) {inames=i}
             <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
             <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
-            <> real, ctr = philox4x32_TYPE(ctr, key2)  {dep=init_ctr}
-            <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr}
+            <> real, ctr = philox4x32_TYPE(ctr, key2)  {id=realpart,dep=init_ctr}
+            <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr:realpart}
 
             out[i, 0] = real.s0 + 1j * imag.s0
             out[i, 1] = real.s1 + 1j * imag.s1
diff --git a/test/test_transform.py b/test/test_transform.py
index 0e10db362f36b7fc258059c2ec7ed1a344b97212..e1a58e30286141b4d0592debcd308552f32ff632 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -49,6 +49,9 @@ __all__ = [
         ]
 
 
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1  # noqa
+
+
 def test_chunk_iname(ctx_factory):
     ctx = ctx_factory()
 
@@ -75,7 +78,8 @@ def test_collect_common_factors(ctx_factory):
             """
             <float32> out_tmp = 0 {id=out_init,inames=i}
             out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
-            out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init}
+            out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] \
+                    {id=out_up2,dep=out_init,nosync=out_up1}
             out[i] = out_tmp {dep=out_up1:out_up2}
             """)
     knl = lp.add_and_infer_dtypes(knl,
@@ -492,7 +496,8 @@ def test_add_nosync():
     orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local")
 
     # No dependency present - don't add nosync
-    knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2")
+    knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2",
+            empty_ok=True)
     assert frozenset() == knl.id_to_insn["insn2"].no_sync_with
 
     # Dependency present