diff --git a/.gitignore b/.gitignore
index 322e3a7caa7f229bccba899cb9bc993d95364b86..c5865116226b2cdc15ed4978548416090265e115 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ distribute*tar.gz
 core
 .coverage
 htmlcov
+.ipynb_checkpoints
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..5e2805bd9cc8fbb64f63893093e45229d6803cb3
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,24 @@
+Python 2.7 AMD CPU:
+  script:
+  - py_version=2.7
+  - cl_dev=amd_pu
+  - EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python2.7
+  - amd-cl-cpu
+  except:
+  - tags
+Python 3.4 AMD CPU:
+  script:
+  - py_version=3.4
+  - cl_dev=amd_pu
+  - EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3.4
+  - amd-cl-cpu
+  except:
+  - tags
diff --git a/bin/loopy b/bin/loopy
index 2a657174d4e390f80331689068b8d21e8eab0de7..cb223c31a1eef3244bd21310722f4b84a539d77c 100644
--- a/bin/loopy
+++ b/bin/loopy
@@ -29,13 +29,13 @@ def to_python_literal(value):
 
 def defines_to_python_code(defines_str):
     import re
-    DEFINE_RE = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
+    define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
     result = []
     for l in defines_str.split("\n"):
         if not l.strip():
             continue
 
-        match = DEFINE_RE.match(l)
+        match = define_re.match(l)
         if match is None:
             raise RuntimeError("#define not understood: '%s'" % l)
 
@@ -71,23 +71,22 @@ def main():
         with open(args.infile, "r") as infile_fd:
             infile_content = infile_fd.read()
 
-    # {{{ path wrangling
+    if args.lang == "loopy":
+        # {{{ path wrangling
 
-    from os.path import dirname, abspath
-    from os import getcwd
+        from os.path import dirname, abspath
+        from os import getcwd
 
-    infile_dirname = dirname(args.infile)
-    if infile_dirname:
-        infile_dirname = abspath(infile_dirname)
-    else:
-        infile_dirname = getcwd()
+        infile_dirname = dirname(args.infile)
+        if infile_dirname:
+            infile_dirname = abspath(infile_dirname)
+        else:
+            infile_dirname = getcwd()
 
-    import sys
-    sys.path.append(infile_dirname)
+        sys.path.append(infile_dirname)
 
-    # }}}
+        # }}}
 
-    if args.lang == "loopy":
         data_dic = {}
         data_dic["lp"] = lp
         data_dic["np"] = np
@@ -131,9 +130,9 @@ def main():
                         defines_to_python_code(defines_fd.read())
                         + pre_transform_code)
 
-        from loopy.frontend.fortran import f2loopy
-        kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code,
-                use_c_preprocessor=(args.lang == "fpp"))
+        kernels = lp.parse_transformed_fortran(
+                infile_content, pre_transform_code=pre_transform_code,
+                filename=args.infile)
 
         if args.name is not None:
             kernels = [kernel for kernel in kernels
diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec
index ec420760abd4d43f16771d4745be6fd43f3cabb3..e24a24db9bda96f149e4279a6c37c1603476c3a4 100644
--- a/build-helpers/loopy.spec
+++ b/build-helpers/loopy.spec
@@ -2,9 +2,10 @@
 
 single_file = True
 
+from os.path import expanduser
 
 a = Analysis(['bin/loopy'],
-             pathex=['/home/andreas/src/loopy'],
+             pathex=[expanduser('~/src/loopy')],
              hiddenimports=[],
              hookspath=None,
              runtime_hooks=None,
diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh
index 660c20fd4be4be4513b36cbf7d51de1fcac3a388..50f9d85dccc503be2a2ccfb6c0e3d6aa28216981 100755
--- a/build-helpers/run-pyinstaller.sh
+++ b/build-helpers/run-pyinstaller.sh
@@ -2,7 +2,7 @@
 
 # run this from the loopy root directory
 
-rm -Rf dist/loopy
+rm -Rf dist build
 
 pyinstaller \
   --workpath=build/pyinstaller \
diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh
new file mode 100755
index 0000000000000000000000000000000000000000..57b8a873b9395954d76a8fd16f8ca9a261e8baa3
--- /dev/null
+++ b/build-helpers/upload.sh
@@ -0,0 +1,5 @@
+#! /bin/bash
+
+set -e
+
+scp "$1" tiker.net:public_html/pub/loopy-binaries/
diff --git a/doc/reference.rst b/doc/reference.rst
index 9d75a924b71847d04632b4945f9228c80c2e01f4..6ed83d43084b5578c817964b13097b4aa78294ef 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -334,8 +334,16 @@ function, which is responsible for creating kernels:
 
 .. autofunction:: make_kernel
 
+.. autofunction:: parse_fortran
+
+.. autofunction:: parse_transformed_fortran
+
 .. autofunction:: make_copy_kernel
 
+.. autofunction:: fuse_kernels
+
+.. autofunction:: c_preprocess
+
 Transforming Kernels
 --------------------
 
@@ -429,6 +437,8 @@ Manipulating Instructions
 
 .. autofunction:: remove_instructions
 
+.. autofunction:: tag_instructions
+
 Library interface
 ^^^^^^^^^^^^^^^^^
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 226868317b71474b9e1866a4c0ae5df82080f9dc..e566365f8d520d33f518bc95cbaa43e5af6e747c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -22,8 +22,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a
     >>> import pyopencl as cl
     >>> import pyopencl.array
     >>> import pyopencl.clrandom
+
     >>> import loopy as lp
-    >>> from pytools import StderrToStdout as IncludeWarningsInDoctest
+    >>> lp.set_caching_enabled(False)
+
+    >>> from warnings import filterwarnings, catch_warnings
+    >>> filterwarnings('error', category=lp.LoopyWarning)
 
     >>> ctx = cl.create_some_context(interactive=False)
     >>> queue = cl.CommandQueue(ctx)
@@ -97,9 +101,9 @@ always see loopy's view of a kernel by printing it.
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
     ARGUMENTS:
-    a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1)
+    a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
     n: ValueArg, type: <runtime>
-    out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1)
+    out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1)
     ---------------------------------------------------------------------------
     DOMAINS:
     [n] -> { [i] : i >= 0 and i <= -1 + n }
@@ -168,14 +172,13 @@ by passing :attr:`loopy.Options.write_cl`.
 
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
     <BLANKLINE>
-      for (int i = 0; i <= (-1 + n); ++i)
+      for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
     }
 
@@ -246,14 +249,13 @@ call :func:`loopy.generate_code`:
     >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl)
     >>> code, _ = lp.generate_code(typed_knl)
     >>> print code
-    <BLANKLINE>
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
     <BLANKLINE>
-      for (int i = 0; i <= (-1 + n); ++i)
+      for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
     }
 
@@ -361,16 +363,16 @@ Let us take a look at the generated code for the above kernel:
 .. doctest::
 
     >>> knl = lp.set_options(knl, "write_cl")
+    >>> knl = lp.set_loop_priority(knl, "i,j")
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
     <BLANKLINE>
-      for (int i = 0; i <= (-1 + n); ++i)
-        for (int j = 0; j <= (-1 + n); ++j)
+      for (int i = 0; i <= -1 + n; ++i)
+        for (int j = 0; j <= -1 + n; ++j)
         {
           out[n * j + i] = a[n * i + j];
           out[n * i + j] = 2.0f * out[n * i + j];
@@ -382,10 +384,8 @@ still not right:
 
 .. doctest::
 
-    >>> assert (out.get() == a_mat_dev.get().T*2).all()
-    Traceback (most recent call last):
-    ...
-    AssertionError
+    >>> print((out.get() == a_mat_dev.get().T*2).all())
+    False
 
 For the kernel to perform the desired computation, *all
 instances* (loop iterations) of the first instruction need to be completed,
@@ -406,6 +406,7 @@ with identical bounds, for the use of the transpose:
     ...     out[j,i] = a[i,j] {id=transpose}
     ...     out[ii,jj] = 2*out[ii,jj]  {dep=transpose}
     ...     """)
+    >>> knl = lp.set_loop_priority(knl, "i,j,ii,jj")
 
 :func:`loopy.duplicate_inames` can be used to achieve the same goal.
 Now the intended code is generated and our test passes.
@@ -414,18 +415,17 @@ Now the intended code is generated and our test passes.
 
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
     <BLANKLINE>
-      for (int i = 0; i <= (-1 + n); ++i)
-        for (int j = 0; j <= (-1 + n); ++j)
+      for (int i = 0; i <= -1 + n; ++i)
+        for (int j = 0; j <= -1 + n; ++j)
           out[n * j + i] = a[n * i + j];
-      for (int ii = 0; ii <= (-1 + n); ++ii)
-        for (int jj = 0; jj <= (-1 + n); ++jj)
+      for (int ii = 0; ii <= -1 + n; ++ii)
+        for (int jj = 0; jj <= -1 + n; ++jj)
           out[n * ii + jj] = 2.0f * out[n * ii + jj];
     }
     >>> assert (out.get() == a_mat_dev.get().T*2).all()
@@ -443,6 +443,10 @@ control is the nesting of loops. For example, should the *i* loop be nested
 around the *j* loop, or the other way around, in the following simple
 zero-fill kernel?
 
+It turns out that Loopy will typically choose a loop nesting for us, but it
+does not like doing so. In this tutorial (where we've turned Loopy's warnings
+into errors), we are told what is wrong in no uncertain terms::
+
 .. doctest::
 
     >>> knl = lp.make_kernel(
@@ -453,18 +457,9 @@ zero-fill kernel?
 
 
     >>> knl = lp.set_options(knl, "write_cl")
-    >>> with IncludeWarningsInDoctest():
-    ...     evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
-    ...
-      for (int i = 0; i <= (-1 + n); ++i)
-        for (int j = 0; j <= (-1 + n); ++j)
-          a[n * i + j] = 0.0f;
+    >>> evt, (out,) = knl(queue, a=a_mat_dev)
+    Traceback (most recent call last):
     ...
-
-Loopy has chosen a loop nesting for us, but it did not like doing so, as it
-also issued the following warning::
-
     LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
 
 This is easily resolved:
@@ -481,12 +476,11 @@ ambiguous.
 
 .. doctest::
 
-    >>> with IncludeWarningsInDoctest():
-    ...     evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
+    >>> evt, (out,) = knl(queue, a=a_mat_dev)
+    #define lid(N) ((int) get_local_id(N))
     ...
-      for (int j = 0; j <= (-1 + n); ++j)
-        for (int i = 0; i <= (-1 + n); ++i)
+      for (int j = 0; j <= -1 + n; ++j)
+        for (int i = 0; i <= -1 + n; ++i)
           a[n * i + j] = 0.0f;
     ...
 
@@ -536,15 +530,16 @@ Consider this example:
 
     >>> knl = lp.make_kernel(
     ...     "{ [i]: 0<=i<n }",
-    ...     "a[i] = 0", assumptions="n>=0")
+    ...     "a[i] = 0", assumptions="n>=1")
     >>> knl = lp.split_iname(knl, "i", 16)
+    >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
-      for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer)
+      for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer)
         for (int i_inner = 0; i_inner <= 15; ++i_inner)
-          if ((-1 + -1 * i_inner + -16 * i_outer + n) >= 0)
+          if (-1 + -1 * i_inner + -16 * i_outer + n >= 0)
             a[i_inner + i_outer * 16] = 0.0f;
     ...
 
@@ -567,12 +562,11 @@ relation to loop nesting. For example, it's perfectly possible to request
 
     >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
       for (int i_inner = 0; i_inner <= 15; ++i_inner)
-        if ((-1 + n) >= 0)
-          for (int i_outer = 0; i_outer <= (-1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16)); ++i_outer)
-            a[i_inner + i_outer * 16] = 0.0f;
+        for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+          a[i_inner + i_outer * 16] = 0.0f;
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
@@ -593,10 +587,10 @@ commonly called 'loop tiling':
     >>> knl = lp.set_loop_priority(knl, "i_outer,j_outer,i_inner")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
-      for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer)
-        for (int j_outer = 0; j_outer <= (-1 + ((15 + n) / 16)); ++j_outer)
+      for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer)
+        for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer)
           for (int i_inner = 0; i_inner <= 15; ++i_inner)
             for (int j_inner = 0; j_inner <= 15; ++j_inner)
               out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16];
@@ -632,11 +626,13 @@ loop's tag to ``"unr"``:
     >>> orig_knl = knl
     >>> knl = lp.split_iname(knl, "i", 4)
     >>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
+    >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
+    #define lid(N) ((int) get_local_id(N))
     ...
-      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+      for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
       {
         a[0 + i_outer * 4] = 0.0f;
         a[1 + i_outer * 4] = 0.0f;
@@ -708,12 +704,12 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     ...         outer_tag="g.0", inner_tag="l.0")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
     __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
     {
     <BLANKLINE>
-      if ((-1 + -128 * gid(0) + -1 * lid(0) + n) >= 0)
+      if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
         a[lid(0) + gid(0) * 128] = 0.0f;
     }
 
@@ -752,18 +748,19 @@ assumption:
     >>> orig_knl = knl
     >>> knl = lp.split_iname(knl, "i", 4)
     >>> knl = lp.tag_inames(knl, dict(i_inner="unr"))
+    >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
-      for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
+      for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer)
       {
         a[0 + i_outer * 4] = 0.0f;
-        if ((-2 + -4 * i_outer + n) >= 0)
+        if (-2 + -4 * i_outer + n >= 0)
           a[1 + i_outer * 4] = 0.0f;
-        if ((-3 + -4 * i_outer + n) >= 0)
+        if (-3 + -4 * i_outer + n >= 0)
           a[2 + i_outer * 4] = 0.0f;
-        if ((-4 + -4 * i_outer + n) >= 0)
+        if (-4 + -4 * i_outer + n >= 0)
           a[3 + i_outer * 4] = 0.0f;
       }
     ...
@@ -781,11 +778,12 @@ enabling some cost savings:
     >>> knl = orig_knl
     >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr")
     >>> knl = lp.set_options(knl, "write_cl")
+    >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
       /* bulk slab for 'i_outer' */
-      for (int i_outer = 0; i_outer <= (-2 + ((3 + n) / 4)); ++i_outer)
+      for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer)
       {
         a[0 + i_outer * 4] = 0.0f;
         a[1 + i_outer * 4] = 0.0f;
@@ -793,15 +791,15 @@ enabling some cost savings:
         a[3 + i_outer * 4] = 0.0f;
       }
       /* final slab for 'i_outer' */
-      for (int i_outer = (-1 + n + -1 * (3 * n / 4)); i_outer <= (-1 + ((3 + n) / 4)); ++i_outer)
-        if ((-1 + n) >= 0)
+      for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer)
+        if (-1 + n >= 0)
         {
           a[0 + i_outer * 4] = 0.0f;
-          if ((-2 + -4 * i_outer + n) >= 0)
+          if (-2 + -4 * i_outer + n >= 0)
             a[1 + i_outer * 4] = 0.0f;
-          if ((-3 + -4 * i_outer + n) >= 0)
+          if (-3 + -4 * i_outer + n >= 0)
             a[2 + i_outer * 4] = 0.0f;
-          if ((4 + 4 * i_outer + -1 * n) == 0)
+          if (4 + 4 * i_outer + -1 * n == 0)
             a[3 + i_outer * 4] = 0.0f;
         }
     ...
@@ -871,12 +869,12 @@ memory, local to each work item.
 
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
     {
       float a_temp;
     <BLANKLINE>
-      for (int i = 0; i <= (-1 + n); ++i)
+      for (int i = 0; i <= -1 + n; ++i)
       {
         a_temp = sin(a[i]);
         out2[i] = sqrt(1.0f + -1.0f * a_temp * a_temp);
@@ -929,19 +927,19 @@ Consider the following example:
     >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0"))
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
     {
       __local float a_temp[16];
       float acc_k;
     <BLANKLINE>
-      if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0)
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
         a_temp[lid(0)] = a[16 * gid(0) + lid(0)];
         acc_k = 0.0f;
       }
       barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */;
-      if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0)
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
         for (int k = 0; k <= 15; ++k)
           acc_k = acc_k + a_temp[k];
@@ -992,7 +990,7 @@ transformation exists in :func:`loopy.add_prefetch`:
     >>> knl = lp.set_options(knl, "write_cl")
     >>> knl_pf = lp.add_prefetch(knl, "a")
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
         a_fetch_0 = a[16 * gid(0) + lid(0)];
         for (int k = 0; k <= 15; ++k)
@@ -1015,12 +1013,12 @@ earlier:
 
     >>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"])
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
-    <BLANKLINE>
+    #define lid(N) ((int) get_local_id(N))
     ...
-      if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0)
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
         a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)];
       barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn_k_update depends on a_fetch) */;
-      if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0)
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
         for (int k = 0; k <= 15; ++k)
           acc_k = acc_k + a_fetch_0[lid(0)];
@@ -1145,18 +1143,20 @@ sign that something is amiss:
 
 .. doctest::
 
-    >>> with IncludeWarningsInDoctest():
-    ...     evt, (out,) = knl(queue, a=a_mat_dev)
-    /...: WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable)
-      warn(text, type)
+    >>> evt, (out,) = knl(queue, a=a_mat_dev)
+    Traceback (most recent call last):
+    ...
+    WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable)
 
 When we ask to see the code, the issue becomes apparent:
 
 .. doctest::
 
     >>> knl = lp.set_options(knl, "write_cl")
-    >>> evt, (out,) = knl(queue, a=a_mat_dev)
-    <BLANKLINE>
+    >>> from warnings import catch_warnings
+    >>> with catch_warnings():
+    ...     filterwarnings("always", category=lp.LoopyWarning)
+    ...     evt, (out,) = knl(queue, a=a_mat_dev)
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
diff --git a/examples/fortran/IPython Integration Demo.ipynb b/examples/fortran/IPython Integration Demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..c2b34f1d13f7b4971c0c2e88a0ae5013ed12ee23
--- /dev/null
+++ b/examples/fortran/IPython Integration Demo.ipynb	
@@ -0,0 +1,190 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "# Loopy IPython Integration Demo"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%load_ext loopy.ipython_ext"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## Without transform code"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%%fortran_kernel\n",
+      "\n",
+      "subroutine fill(out, a, n)\n",
+      "  implicit none\n",
+      "\n",
+      "  real*8 a, out(n)\n",
+      "  integer n, i\n",
+      "\n",
+      "  do i = 1, n\n",
+      "    out(i) = a\n",
+      "  end do\n",
+      "end"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print(fill)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "---------------------------------------------------------------------------\n",
+        "KERNEL: fill\n",
+        "---------------------------------------------------------------------------\n",
+        "ARGUMENTS:\n",
+        "a: ValueArg, type: float64\n",
+        "n: ValueArg, type: int32\n",
+        "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
+        "---------------------------------------------------------------------------\n",
+        "DOMAINS:\n",
+        "[n] -> { [i] : i >= 0 and i <= -1 + n }\n",
+        "---------------------------------------------------------------------------\n",
+        "INAME IMPLEMENTATION TAGS:\n",
+        "i: None\n",
+        "---------------------------------------------------------------------------\n",
+        "INSTRUCTIONS:\n",
+        "[i]                                  out[i] <- a   # insn0\n",
+        "---------------------------------------------------------------------------\n"
+       ]
+      }
+     ],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "## With transform code"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "split_amount = 128"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%%transformed_fortran_kernel\n",
+      "\n",
+      "subroutine tr_fill(out, a, n)\n",
+      "  implicit none\n",
+      "\n",
+      "  real*8 a, out(n)\n",
+      "  integer n, i\n",
+      "\n",
+      "  do i = 1, n\n",
+      "    out(i) = a\n",
+      "  end do\n",
+      "end\n",
+      "\n",
+      "!$loopy begin\n",
+      "!\n",
+      "! tr_fill, = lp.parse_fortran(SOURCE)\n",
+      "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n",
+      "!     outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
+      "! RESULT = [tr_fill]\n",
+      "!\n",
+      "!$loopy end"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print(tr_fill)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "---------------------------------------------------------------------------\n",
+        "KERNEL: tr_fill\n",
+        "---------------------------------------------------------------------------\n",
+        "ARGUMENTS:\n",
+        "a: ValueArg, type: float64\n",
+        "n: ValueArg, type: int32\n",
+        "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n",
+        "---------------------------------------------------------------------------\n",
+        "DOMAINS:\n",
+        "[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n",
+        "---------------------------------------------------------------------------\n",
+        "INAME IMPLEMENTATION TAGS:\n",
+        "i_inner: l.0\n",
+        "i_outer: g.0\n",
+        "---------------------------------------------------------------------------\n",
+        "INSTRUCTIONS:\n",
+        "[i_inner,i_outer]                    out[i_inner + i_outer*128] <- a   # insn0\n",
+        "---------------------------------------------------------------------------\n"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
diff --git a/examples/fortran/foo.floopy b/examples/fortran/foo.floopy
index 7d2d3eef0570e448ad8e9e7113e2470e1d4ef64d..6b8741e1150a05226f3aa9de849db8abb1915002 100644
--- a/examples/fortran/foo.floopy
+++ b/examples/fortran/foo.floopy
@@ -1,24 +1,30 @@
 subroutine fill(out, a, n)
   implicit none
 
-  real*8 a, out(n)
-  integer n
+  real_type a, out(n)
+  integer n, i
 
   do i = 1, n
     out(i) = a
   end do
   do i = 1, n
-    out(i) = out(i) * 2
+    out(i) = out(i) * factor
   end do
 end
 
-!$loopy begin transform
+!$loopy begin
 !
+! SOURCE = lp.c_preprocess(SOURCE, [
+!       "factor 4.0",
+!       "real_type real*8",
+!       ])
+! fill, = lp.parse_fortran(SOURCE, FILENAME)
 ! fill = lp.split_iname(fill, "i", 128,
 !     outer_tag="g.0", inner_tag="l.0")
 ! fill = lp.split_iname(fill, "i_1", 128,
 !     outer_tag="g.0", inner_tag="l.0")
+! RESULT = [fill]
 !
-!$loopy end transform
+!$loopy end
 
 ! vim:filetype=floopy
diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy
index ea85b0c6b198c8bac6270098696504fc9006fd14..3352449d7bfcc406e5349f36c5cb7ebb3e3613b6 100644
--- a/examples/fortran/matmul.floopy
+++ b/examples/fortran/matmul.floopy
@@ -12,7 +12,8 @@ subroutine dgemm(m,n,l,alpha,a,b,c)
   end do
 end subroutine
 
-!$loopy begin transform
+!$loopy begin
+! dgemm, = lp.parse_fortran(SOURCE, FILENAME)
 ! dgemm = lp.split_iname(dgemm, "i", 16,
 !         outer_tag="g.0", inner_tag="l.1")
 ! dgemm = lp.split_iname(dgemm, "j", 8,
@@ -23,4 +24,5 @@ end subroutine
 ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
 ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner")
 ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner")
-!$loopy end transform
+! RESULT = [dgemm]
+!$loopy end
diff --git a/examples/fortran/outerprod.py b/examples/fortran/outerprod.py
deleted file mode 100644
index 4122c84376fcc676023b3a4a01b0728c70b18b61..0000000000000000000000000000000000000000
--- a/examples/fortran/outerprod.py
+++ /dev/null
@@ -1,8 +0,0 @@
-lp_knl = lp.make_kernel(
-	"{[i,j]: 0<=i,j<n}",
-	"c[i,j] = a[i]*b[j]")
-
-lp_knl = lp.add_dtypes(lp_knl, {"a": np.float64, "b": np.float64})
-lp_knl = lp.split_iname(lp_knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
-lp_knl = lp.split_iname(lp_knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
-
diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy
index 924e0aa4abe51c4dd84b01cad0cb83b56122c97d..18542e6b0403a7ab475b3e357f18489847367c3d 100644
--- a/examples/fortran/sparse.floopy
+++ b/examples/fortran/sparse.floopy
@@ -6,6 +6,7 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
   real*8 x(n), y(n), rowsum
 
   integer m, n, rowstart, rowend, length, nvals
+  integer i, j
 
   do i = 1, m
     rowstart = rowstarts(i)
@@ -21,10 +22,12 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y)
   end do
 end
 
-!$loopy begin transform
+!$loopy begin
+! sparse, = lp.parse_fortran(SOURCE, FILENAME)
 ! sparse = lp.split_iname(sparse, "i", 128)
 ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"})
 ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"})
 ! sparse = lp.split_iname(sparse, "j", 4)
 ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"})
-!$loopy end transform
+! RESULT = [sparse]
+!$loopy end
diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy
index f4b4e28eab3ddd544c791a088279718ef5221bb9..40b487528cdd788f26fb4b600869e4bf81f090a8 100644
--- a/examples/fortran/tagging.floopy
+++ b/examples/fortran/tagging.floopy
@@ -2,7 +2,7 @@ subroutine fill(out, a, n)
   implicit none
 
   real*8 a, out(n)
-  integer n
+  integer n, i
 
 !$loopy begin tagged: init
   do i = 1, n
@@ -14,11 +14,12 @@ subroutine fill(out, a, n)
   end do
 end
 
-!$loopy begin transform
-!
+!$loopy begin
+! fill, = lp.parse_fortran(SOURCE, FILENAME)
 ! fill = lp.split_iname(fill, "i", 128,
 !     outer_tag="g.0", inner_tag="l.0")
 ! fill = lp.split_iname(fill, "i_1", 128,
 !     outer_tag="g.0", inner_tag="l.0")
-!$loopy end transform
+! RESULT = [fill]
+!$loopy end
 ! vim:filetype=floopy
diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy
index 953432d2253e90e396385cd53f4c0f2d81d70e5a..c5784b63492063bfd2a9604c42dbf65b2ecb86bf 100644
--- a/examples/fortran/volumeKernel.floopy
+++ b/examples/fortran/volumeKernel.floopy
@@ -65,8 +65,9 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ  )
 
 end subroutine volumeKernel
 
-!$loopy begin transform
+!$loopy begin
 !
+! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME)
 ! volumeKernel = lp.split_iname(volumeKernel,
 !     "e", 32, outer_tag="g.1", inner_tag="g.0")
 ! volumeKernel = lp.fix_parameters(volumeKernel,
@@ -75,5 +76,6 @@ end subroutine volumeKernel
 !     i="l.0", j="l.1", k="l.2",
 !     i_1="l.0", j_1="l.1", k_1="l.2"
 !     ))
+! RESULT = [volumeKernel]
 !
-!$loopy end transform
+!$loopy end
diff --git a/examples/fortran/volumeKernelSimple.floopy b/examples/fortran/volumeKernelSimple.floopy
index 67948020d6fce06e858718fa9cca366d78b33295..afc3321b8fa18ff532b3c2898dbaea8f99e3700c 100644
--- a/examples/fortran/volumeKernelSimple.floopy
+++ b/examples/fortran/volumeKernelSimple.floopy
@@ -27,10 +27,12 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ  )
 
 end subroutine volumeKernel
 
-!$loopy begin transform
+!$loopy begin
 !
+! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME)
 ! volumeKernel = lp.fix_parameters(volumeKernel,
 !     Nq=5, Ndim=3)
 ! volumeKernel = lp.tag_inames(volumeKernel, dict(i="l.0"))
+! RESULT = [volumeKernel]
 !
-!$loopy end transform
+!$loopy end
diff --git a/examples/python/hello-loopy-lp.py b/examples/python/hello-loopy-lp.py
new file mode 100644
index 0000000000000000000000000000000000000000..382aadc097352ab7328b9ef956e0c0379d06a2cb
--- /dev/null
+++ b/examples/python/hello-loopy-lp.py
@@ -0,0 +1,11 @@
+# This is a version of hello-loopy.py that can be run through
+# a loopy binary using
+#
+# ./loopy --lang=loopy hello-loopy-lp.py -
+
+knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]")
+
+knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+lp_knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 12329bbf6472867c873958f1a18f20ba1016b5f7..dcd8e27c1eaf02a0f7aee133598b0e2fd34f70b9 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,8 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -27,13 +23,16 @@ THE SOFTWARE.
 """
 
 
+import six
+from six.moves import range, zip
+
 import islpy as isl
 from islpy import dim_type
 
 from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext,
         TaggedVariable, Reduction, LinearSubscript, )
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, LoopyWarning
 
 
 # {{{ imported user interface
@@ -58,6 +57,7 @@ from loopy.library.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst, temporary_to_subst
 from loopy.precompute import precompute
 from loopy.buffer import buffer_array
+from loopy.fusion import fuse_kernels
 from loopy.padding import (split_arg_axis, find_padding_multiple,
         add_padding)
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
@@ -67,6 +67,8 @@ from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
 from loopy.auto_test import auto_test_vs_ref
+from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
+        parse_fortran)
 
 __all__ = [
         "TaggedVariable", "Reduction", "LinearSubscript",
@@ -88,6 +90,7 @@ __all__ = [
 
         "extract_subst", "expand_subst", "temporary_to_subst",
         "precompute", "buffer_array",
+        "fuse_kernels",
         "split_arg_axis", "find_padding_multiple", "add_padding",
 
         "get_dot_dependency_graph",
@@ -106,6 +109,9 @@ __all__ = [
         "Options",
 
         "make_kernel",
+        "c_preprocess", "parse_transformed_fortran", "parse_fortran",
+
+        "LoopyError", "LoopyWarning",
 
         # {{{ from this file
 
@@ -539,7 +545,7 @@ class _InameDuplicator(RuleAwareIdentityMapper):
             return var(new_name)
 
     def map_instruction(self, insn):
-        if not self.within(((insn.id, None),)):
+        if not self.within(((insn.id, insn.tags),)):
             return insn
 
         new_fid = frozenset(
@@ -1458,7 +1464,8 @@ def register_function_manglers(kernel, manglers):
 
 # {{{ cache control
 
-CACHING_ENABLED = True
+import os
+CACHING_ENABLED = "LOOPY_NO_CACHE" not in os.environ
 
 
 def set_caching_enabled(flag):
@@ -1781,4 +1788,23 @@ def fold_constants(kernel):
 
 # }}}
 
+
+# {{{ tag_instructions
+
+def tag_instructions(kernel, new_tag, within=None):
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    new_insns = []
+    for insn in kernel.instructions:
+        if within(((insn.id, insn.tags),)):
+            new_insns.append(
+                    insn.copy(tags=insn.tags + (new_tag,)))
+        else:
+            new_insns.append(insn)
+
+    return kernel.copy(instructions=new_insns)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index dcea94690c2cca630b40a870f777d884715f1dd4..33cdc2118e389ae7b04e43e28a4c6d0e62011751 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -277,7 +277,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
 
     from loopy.codegen.bounds import get_usable_inames_for_conditional
 
-    # Note: this does note include loop_iname itself!
+    # Note: this does not include loop_iname itself!
     usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
     domain = kernel.get_inames_domain(loop_iname)
 
@@ -299,7 +299,6 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
         dom_and_slab, assumptions_non_param = isl.align_two(
                 dom_and_slab, assumptions_non_param)
         dom_and_slab = dom_and_slab & assumptions_non_param
-        del assumptions_non_param
 
         # move inames that are usable into parameters
         moved_inames = []
@@ -317,14 +316,23 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
                 static_min_of_pw_aff,
                 static_max_of_pw_aff)
 
-        static_lbound = static_min_of_pw_aff(
+        lbound = (
                 kernel.cache_manager.dim_min(
-                    dom_and_slab, loop_iname_idx).coalesce(),
-                constants_only=False, prefer_constants=False)
+                    dom_and_slab, loop_iname_idx)
+                .gist(kernel.assumptions)
+                .coalesce())
+        ubound = (
+            kernel.cache_manager.dim_max(
+                dom_and_slab, loop_iname_idx)
+            .gist(kernel.assumptions)
+            .coalesce())
+
+        static_lbound = static_min_of_pw_aff(
+                lbound,
+                constants_only=False)
         static_ubound = static_max_of_pw_aff(
-                kernel.cache_manager.dim_max(
-                    dom_and_slab, loop_iname_idx).coalesce(),
-                constants_only=False, prefer_constants=False)
+                ubound,
+                constants_only=False)
 
         # }}}
 
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 56d7f6706c8932a9a97202768bdd0053c5c36d10..93cbd125de96e0481a3f466c0a7e96dd04c89cd7 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -37,11 +37,11 @@ class LoopyAdvisory(LoopyWarningBase):
     pass
 
 
-class ParameterFinderWarning(LoopyWarningBase):
+class ParameterFinderWarning(LoopyWarning):
     pass
 
 
-class WriteRaceConditionWarning(LoopyWarningBase):
+class WriteRaceConditionWarning(LoopyWarning):
     pass
 
 # }}}
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 1a610b63cb35f4931e41446f00c16a182de8430a..1c3d1283541c33ce3944e358f4abd451e2c881ee 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -25,46 +25,209 @@ THE SOFTWARE.
 from loopy.diagnostic import LoopyError
 
 
-def f2loopy(source, free_form=True, strict=True,
-        pre_transform_code=None, pre_transform_code_context=None,
-        use_c_preprocessor=False,
-        file_name="<floopy code>"):
-    if use_c_preprocessor:
-        try:
-            import ply.lex as lex
-            import ply.cpp as cpp
-        except ImportError:
-            raise LoopyError("Using the C preprocessor requires PLY to be installed")
+def c_preprocess(source, defines=None, filename="<floopy source>"):
+    """
+    :arg source: a string, possibly containing C preprocessor constructs
+    :arg defines: a list of strings as they might occur after a
+        C-style ``#define`` directive, for example ``deg2rad(x) (x/180d0 * 3.14d0)``.
+    :return: a string
+    """
+    try:
+        import ply.lex as lex
+        import ply.cpp as cpp
+    except ImportError:
+        raise LoopyError("Using the C preprocessor requires PLY to be installed")
 
-        lexer = lex.lex(cpp)
+    lexer = lex.lex(cpp)
 
-        from ply.cpp import Preprocessor
-        p = Preprocessor(lexer)
-        p.parse(source, file_name)
+    from ply.cpp import Preprocessor
+    p = Preprocessor(lexer)
 
-        tokens = []
-        while True:
-            tok = p.token()
+    if defines:
+        for d in defines:
+            p.define(d)
 
-            if not tok:
-                break
+    p.parse(source, filename)
 
-            if tok.type == "CPP_COMMENT":
-                continue
+    tokens = []
+    while True:
+        tok = p.token()
 
-            tokens.append(tok.value)
+        if not tok:
+            break
 
-        source = "".join(tokens)
+        if tok.type == "CPP_COMMENT":
+            continue
 
+        tokens.append(tok.value)
+
+    return "".join(tokens)
+
+
+def _extract_loopy_lines(source):
+    lines = source.split("\n")
+
+    import re
+    comment_re = re.compile(r"^\s*\!(.*)$")
+
+    remaining_lines = []
+    loopy_lines = []
+
+    in_loopy_code = False
+    for l in lines:
+        comment_match = comment_re.match(l)
+
+        if comment_match is None:
+            if in_loopy_code:
+                raise LoopyError("non-comment source line in loopy block")
+
+            remaining_lines.append(l)
+            continue
+
+        cmt = comment_match.group(1)
+        cmt_stripped = cmt.strip()
+
+        if cmt_stripped == "$loopy begin":
+            if in_loopy_code:
+                raise LoopyError("can't enter loopy block twice")
+            in_loopy_code = True
+
+        elif cmt_stripped == "$loopy end":
+            if not in_loopy_code:
+                raise LoopyError("can't leave loopy block twice")
+            in_loopy_code = False
+
+        elif in_loopy_code:
+            loopy_lines.append(cmt)
+
+        else:
+            remaining_lines.append(l)
+
+    return "\n".join(remaining_lines), "\n".join(loopy_lines)
+
+
+def parse_transformed_fortran(source, free_form=True, strict=True,
+        pre_transform_code=None, transform_code_context=None,
+        filename="<floopy code>"):
+    """
+    :arg source: a string of Fortran source code which must include
+        a snippet of transform code as described below.
+    :arg pre_transform_code: code that is run in the same context
+        as the transform
+
+    *source* may contain snippets of loopy transform code between markers::
+
+        !$loopy begin
+        ! ...
+        !$loopy end
+
+    Within the transform code, the following symbols are predefined:
+
+    * ``lp``: a reference to the :mod:`loopy` package
+    * ``np``: a reference to the :mod:`numpy` package
+    * ``SOURCE``: the source code surrounding the transform block.
+      This may be processed using :func:`c_preprocess` and
+      :func:`parse_fortran`.
+    * ``FILENAME``: the file name of the code being processed
+
+    The transform code must define ``RESULT``, conventionally a list of
+    kernels, which is returned from this function unmodified.
+
+    An example of *source* may look as follows::
+
+        subroutine fill(out, a, n)
+          implicit none
+
+          real*8 a, out(n)
+          integer n, i
+
+          do i = 1, n
+            out(i) = a
+          end do
+        end
+
+        !$loopy begin
+        !
+        ! fill, = lp.parse_fortran(SOURCE, FILENAME)
+        ! fill = lp.split_iname(fill, "i", split_amount,
+        !     outer_tag="g.0", inner_tag="l.0")
+        ! RESULT = [fill]
+        !
+        !$loopy end
+    """
+
+    source, transform_code = _extract_loopy_lines(source)
+    if not transform_code:
+        raise LoopyError("no transform code found")
+
+    from loopy.tools import remove_common_indentation
+    transform_code = remove_common_indentation(
+            transform_code,
+            require_leading_newline=False)
+
+    if transform_code_context is None:
+        proc_dict = {}
+    else:
+        proc_dict = transform_code_context.copy()
+
+    import loopy as lp
+    import numpy as np
+
+    proc_dict["lp"] = lp
+    proc_dict["np"] = np
+
+    proc_dict["SOURCE"] = source
+    proc_dict["FILENAME"] = filename
+
+    from os.path import dirname, abspath
+    from os import getcwd
+
+    infile_dirname = dirname(filename)
+    if infile_dirname:
+        infile_dirname = abspath(infile_dirname)
+    else:
+        infile_dirname = getcwd()
+
+    import sys
+    prev_sys_path = sys.path
+    try:
+        if infile_dirname:
+            sys.path = prev_sys_path + [infile_dirname]
+
+        if pre_transform_code is not None:
+            proc_dict["_MODULE_SOURCE_CODE"] = pre_transform_code
+            exec(compile(pre_transform_code,
+                "<loopy pre-transform code>", "exec"), proc_dict)
+
+        proc_dict["_MODULE_SOURCE_CODE"] = transform_code
+        exec(compile(transform_code, filename, "exec"), proc_dict)
+
+    finally:
+        sys.path = prev_sys_path
+
+    if "RESULT" not in proc_dict:
+        raise LoopyError("transform code did not set RESULT")
+
+    return proc_dict["RESULT"]
+
+
+def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True):
+    """
+    :returns: a list of :class:`loopy.LoopKernel` objects
+    """
     from fparser import api
     tree = api.parse(source, isfree=free_form, isstrict=strict,
             analyze=False, ignore_comments=False)
 
+    if tree is None:
+        raise LoopyError("Fortran parser was unhappy with source code "
+                "and returned invalid data (Sorry!)")
+
     from loopy.frontend.fortran.translator import F2LoopyTranslator
-    f2loopy = F2LoopyTranslator(file_name)
+    f2loopy = F2LoopyTranslator(filename)
     f2loopy(tree)
 
-    return f2loopy.make_kernels(pre_transform_code=pre_transform_code,
-            pre_transform_code_context=pre_transform_code_context)
+    return f2loopy.make_kernels()
+
 
 # vim: foldmethod=marker
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index abd925c14fe9ebf14342ac5ac25e2062291feefc..895606aa38df3bac78e05d18822a5dbf494d0948 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -35,6 +35,7 @@ from loopy.frontend.fortran.diagnostic import (
 import islpy as isl
 from islpy import dim_type
 from loopy.symbolic import IdentityMapper
+from loopy.diagnostic import LoopyError
 from pymbolic.primitives import Wildcard
 
 
@@ -193,24 +194,6 @@ class Scope(object):
 # }}}
 
 
-def remove_common_indentation(lines):
-    while lines and lines[0].strip() == "":
-        lines.pop(0)
-    while lines and lines[-1].strip() == "":
-        lines.pop(-1)
-
-    if lines:
-        base_indent = 0
-        while lines[0][base_indent] in " \t":
-            base_indent += 1
-
-        for line in lines[1:]:
-            if line[:base_indent].strip():
-                raise ValueError("inconsistent indentation")
-
-    return "\n".join(line[base_indent:] for line in lines)
-
-
 # {{{ translator
 
 class F2LoopyTranslator(FTreeWalkerBase):
@@ -225,17 +208,15 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         self.kernels = []
 
-        # Flag to record whether 'loopy begin transform' comment
-        # has been seen.
-        self.in_transform_code = False
-
         self.instruction_tags = []
         self.conditions = []
 
-        self.transform_code_lines = []
-
         self.filename = filename
 
+        self.index_dtype = None
+
+        self.block_nest = []
+
     def add_expression_instruction(self, lhs, rhs):
         scope = self.scope_stack[-1]
 
@@ -276,6 +257,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         scope = Scope(node.name, list(node.args))
         self.scope_stack.append(scope)
 
+        self.block_nest.append("sub")
         for c in node.content:
             self.rec(c)
 
@@ -284,6 +266,11 @@ class F2LoopyTranslator(FTreeWalkerBase):
         self.kernels.append(scope)
 
     def map_EndSubroutine(self, node):
+        if not self.block_nest:
+            raise TranslationError("no subroutine started at this point")
+        if self.block_nest.pop() != "sub":
+            raise TranslationError("mismatched end subroutine")
+
         return []
 
     def map_Implicit(self, node):
@@ -471,6 +458,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         self.conditions.append(cond_name)
 
+        self.block_nest.append("if")
         for c in node.content:
             self.rec(c)
 
@@ -479,100 +467,119 @@ class F2LoopyTranslator(FTreeWalkerBase):
         self.conditions.append("!" + cond_name)
 
     def map_EndIfThen(self, node):
+        if not self.block_nest:
+            raise TranslationError("no if block started at end do")
+        if self.block_nest.pop() != "if":
+            raise TranslationError("mismatched end if")
+
         self.conditions.pop()
 
     def map_Do(self, node):
         scope = self.scope_stack[-1]
 
-        if node.loopcontrol:
-            loop_var, loop_bounds = node.loopcontrol.split("=")
-            loop_var = loop_var.strip()
-            scope.use_name(loop_var)
-            loop_bounds = self.parse_expr(
-                    node,
-                    loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS)
-
-            if len(loop_bounds) == 2:
-                start, stop = loop_bounds
-                step = 1
-            elif len(loop_bounds) == 3:
-                start, stop, step = loop_bounds
-            else:
-                raise RuntimeError("loop bounds not understood: %s"
-                        % node.loopcontrol)
+        if not node.loopcontrol:
+            raise NotImplementedError("unbounded do loop")
 
-            if step != 1:
-                raise NotImplementedError(
-                        "do loops with non-unit stride")
+        loop_var, loop_bounds = node.loopcontrol.split("=")
+        loop_var = loop_var.strip()
 
-            if not isinstance(step, int):
-                raise TranslationError(
-                        "non-constant steps not supported: %s" % step)
-
-            from loopy.symbolic import get_dependencies
-            loop_bound_deps = (
-                    get_dependencies(start)
-                    | get_dependencies(stop)
-                    | get_dependencies(step))
-
-            # {{{ find a usable loopy-side loop name
-
-            loopy_loop_var = loop_var
-            loop_var_suffix = None
-            while True:
-                already_used = False
-                for iset in scope.index_sets:
-                    if loopy_loop_var in iset.get_var_dict(dim_type.set):
-                        already_used = True
-                        break
-
-                if not already_used:
+        iname_dtype = scope.get_type(loop_var)
+        if self.index_dtype is None:
+            self.index_dtype = iname_dtype
+        else:
+            if self.index_dtype != iname_dtype:
+                raise LoopyError("type of '%s' (%s) does not agree with prior "
+                        "index type (%s)"
+                        % (loop_var, iname_dtype, self.index_dtype))
+
+        scope.use_name(loop_var)
+        loop_bounds = self.parse_expr(
+                node,
+                loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS)
+
+        if len(loop_bounds) == 2:
+            start, stop = loop_bounds
+            step = 1
+        elif len(loop_bounds) == 3:
+            start, stop, step = loop_bounds
+        else:
+            raise RuntimeError("loop bounds not understood: %s"
+                    % node.loopcontrol)
+
+        if step != 1:
+            raise NotImplementedError(
+                    "do loops with non-unit stride")
+
+        if not isinstance(step, int):
+            raise TranslationError(
+                    "non-constant steps not supported: %s" % step)
+
+        from loopy.symbolic import get_dependencies
+        loop_bound_deps = (
+                get_dependencies(start)
+                | get_dependencies(stop)
+                | get_dependencies(step))
+
+        # {{{ find a usable loopy-side loop name
+
+        loopy_loop_var = loop_var
+        loop_var_suffix = None
+        while True:
+            already_used = False
+            for iset in scope.index_sets:
+                if loopy_loop_var in iset.get_var_dict(dim_type.set):
+                    already_used = True
                     break
 
-                if loop_var_suffix is None:
-                    loop_var_suffix = 0
+            if not already_used:
+                break
 
-                loop_var_suffix += 1
-                loopy_loop_var = loop_var + "_%d" % loop_var_suffix
+            if loop_var_suffix is None:
+                loop_var_suffix = 0
 
-            # }}}
+            loop_var_suffix += 1
+            loopy_loop_var = loop_var + "_%d" % loop_var_suffix
 
-            space = isl.Space.create_from_names(self.isl_context,
-                    set=[loopy_loop_var], params=list(loop_bound_deps))
-
-            from loopy.isl_helpers import iname_rel_aff
-            from loopy.symbolic import aff_from_expr
-            index_set = (
-                    isl.BasicSet.universe(space)
-                    .add_constraint(
-                        isl.Constraint.inequality_from_aff(
-                            iname_rel_aff(space,
-                                loopy_loop_var, ">=",
-                                aff_from_expr(space, 0))))
-                    .add_constraint(
-                        isl.Constraint.inequality_from_aff(
-                            iname_rel_aff(space,
-                                loopy_loop_var, "<=",
-                                aff_from_expr(space, stop-start)))))
-
-            from pymbolic import var
-            scope.active_iname_aliases[loop_var] = \
-                    var(loopy_loop_var) + start
-            scope.active_loopy_inames.add(loopy_loop_var)
-
-            scope.index_sets.append(index_set)
-
-            for c in node.content:
-                self.rec(c)
-
-            del scope.active_iname_aliases[loop_var]
-            scope.active_loopy_inames.remove(loopy_loop_var)
+        # }}}
 
-        else:
-            raise NotImplementedError("unbounded do loop")
+        space = isl.Space.create_from_names(self.isl_context,
+                set=[loopy_loop_var], params=list(loop_bound_deps))
+
+        from loopy.isl_helpers import iname_rel_aff
+        from loopy.symbolic import aff_from_expr
+        index_set = (
+                isl.BasicSet.universe(space)
+                .add_constraint(
+                    isl.Constraint.inequality_from_aff(
+                        iname_rel_aff(space,
+                            loopy_loop_var, ">=",
+                            aff_from_expr(space, 0))))
+                .add_constraint(
+                    isl.Constraint.inequality_from_aff(
+                        iname_rel_aff(space,
+                            loopy_loop_var, "<=",
+                            aff_from_expr(space, stop-start)))))
+
+        from pymbolic import var
+        scope.active_iname_aliases[loop_var] = \
+                var(loopy_loop_var) + start
+        scope.active_loopy_inames.add(loopy_loop_var)
+
+        scope.index_sets.append(index_set)
+
+        self.block_nest.append("do")
+
+        for c in node.content:
+            self.rec(c)
+
+        del scope.active_iname_aliases[loop_var]
+        scope.active_loopy_inames.remove(loopy_loop_var)
 
     def map_EndDo(self, node):
-        pass
+        if not self.block_nest:
+            raise TranslationError("no do loop started at end do")
+        if self.block_nest.pop() != "do":
+            raise TranslationError("mismatched end do")
 
     def map_Continue(self, node):
         raise NotImplementedError("continue")
@@ -593,17 +600,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         faulty_loopy_pragma_match = self.faulty_loopy_pragma.match(
                 stripped_comment_line)
 
-        if stripped_comment_line == "$loopy begin transform":
-            if self.in_transform_code:
-                raise TranslationError("can't enter transform code twice")
-            self.in_transform_code = True
-
-        elif stripped_comment_line == "$loopy end transform":
-            if not self.in_transform_code:
-                raise TranslationError("can't leave transform code twice")
-            self.in_transform_code = False
-
-        elif begin_tag_match:
+        if begin_tag_match:
             tag = begin_tag_match.group(1)
             if tag in self.instruction_tags:
                 raise TranslationError("nested begin tag for tag '%s'" % tag)
@@ -616,9 +613,6 @@ class F2LoopyTranslator(FTreeWalkerBase):
                         "end tag without begin tag for tag '%s'" % tag)
             self.instruction_tags.remove(tag)
 
-        elif self.in_transform_code:
-            self.transform_code_lines.append(node.content)
-
         elif faulty_loopy_pragma_match is not None:
             from warnings import warn
             warn("The comment line '%s' was not recognized as a loopy directive"
@@ -628,18 +622,8 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
     # }}}
 
-    def make_kernels(self, pre_transform_code=None, pre_transform_code_context=None):
-        kernel_names = [
-                sub.subprogram_name
-                for sub in self.kernels]
-
-        if pre_transform_code_context is None:
-            proc_dict = {}
-        else:
-            proc_dict = pre_transform_code_context.copy()
-
-        proc_dict["lp"] = lp
-        proc_dict["np"] = np
+    def make_kernels(self):
+        result = []
 
         for sub in self.kernels:
             # {{{ figure out arguments
@@ -685,28 +669,17 @@ class F2LoopyTranslator(FTreeWalkerBase):
                     sub.instructions,
                     kernel_data,
                     name=sub.subprogram_name,
-                    default_order="F"
+                    default_order="F",
+                    index_dtype=self.index_dtype,
                     )
 
             from loopy.loop import fuse_loop_domains
             knl = fuse_loop_domains(knl)
+            knl = lp.fold_constants(knl)
 
-            proc_dict[sub.subprogram_name] = lp.fold_constants(knl)
-
-        transform_code = remove_common_indentation(
-                self.transform_code_lines)
-
-        if pre_transform_code is not None:
-            proc_dict["_MODULE_SOURCE_CODE"] = pre_transform_code
-            exec(compile(pre_transform_code,
-                "<loopy transforms>", "exec"), proc_dict)
-
-        proc_dict["_MODULE_SOURCE_CODE"] = transform_code
-        exec(compile(transform_code,
-            "<loopy transforms>", "exec"), proc_dict)
+            result.append(knl)
 
-        return [proc_dict[knl_name]
-                for knl_name in kernel_names]
+        return result
 
 # }}}
 
diff --git a/loopy/fusion.py b/loopy/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..4431c2c7f56009b8e205cb310060b96cfa210eae
--- /dev/null
+++ b/loopy/fusion.py
@@ -0,0 +1,291 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import six
+
+import islpy as isl
+from islpy import dim_type
+
+from loopy.diagnostic import LoopyError
+from pymbolic import var
+
+
+def _find_fusable_loop_domain_index(domain, other_domains):
+    my_inames = set(domain.get_var_dict(dim_type.set))
+
+    overlap_domains = []
+    for i, o_dom in enumerate(other_domains):
+        o_inames = set(o_dom.get_var_dict(dim_type.set))
+        if my_inames & o_inames:
+            overlap_domains.append(i)
+
+    if len(overlap_domains) >= 2:
+        raise LoopyError("more than one domain in one kernel has "
+                "overlapping inames with a "
+                "domain of the other kernel, cannot fuse: '%s'"
+                % domain)
+
+    if len(overlap_domains) == 1:
+        return overlap_domains[0]
+    else:
+        return None
+
+
+# {{{ generic merge helpers
+
+def _ordered_merge_lists(list_a, list_b):
+    result = list_a[:]
+    for item in list_b:
+        if item not in list_b:
+            result.append(item)
+
+    return result
+
+
+def _merge_dicts(item_name, dict_a, dict_b):
+    result = dict_a.copy()
+
+    for k, v in six.iteritems(dict_b):
+        if k in result:
+            if v != result[k]:
+                raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s"
+                        % (item_name, k, v, result[k]))
+
+        else:
+            result[k] = v
+
+    return result
+
+
+def _merge_values(item_name, val_a, val_b):
+    if val_a != val_b:
+        raise LoopyError("inconsistent %ss in merge: %s and %s"
+                % (item_name, val_a, val_b))
+
+    return val_a
+
+# }}}
+
+
+# {{{ two-kernel fusion
+
+def _fuse_two_kernels(knla, knlb):
+    from loopy.kernel import kernel_state
+    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
+        raise LoopyError("can only fuse kernels in INITIAL state")
+
+    # {{{ fuse domains
+
+    new_domains = knla.domains[:]
+
+    for dom_b in knlb.domains:
+        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
+        if i_fuse is None:
+            new_domains.append(dom_b)
+        else:
+            dom_a = new_domains[i_fuse]
+            dom_a, dom_b = isl.align_two(dom_a, dom_b)
+
+            shared_inames = list(
+                    set(dom_a.get_var_dict(dim_type.set))
+                    &
+                    set(dom_b.get_var_dict(dim_type.set)))
+
+            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
+            dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set])
+
+            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
+                raise LoopyError("kernels do not agree on domain of "
+                        "inames '%s'" % (",".join(shared_inames)))
+
+            new_domain = dom_a & dom_b
+
+            new_domains[i_fuse] = new_domain
+
+    # }}}
+
+    vng = knla.get_var_name_generator()
+    b_var_renames = {}
+
+    # {{{ fuse args
+
+    new_args = knla.args[:]
+    for b_arg in knlb.args:
+        if b_arg.name not in knla.arg_dict:
+            new_arg_name = vng(b_arg.name)
+
+            if new_arg_name != b_arg.name:
+                b_var_renames[b_arg.name] = var(new_arg_name)
+
+            new_args.append(b_arg.copy(name=new_arg_name))
+        else:
+            if b_arg != knla.arg_dict[b_arg.name]:
+                raise LoopyError(
+                        "argument '%s' has inconsistent definition between "
+                        "the two kernels being merged" % b_arg.name)
+
+    # }}}
+
+    # {{{ fuse temporaries
+
+    new_temporaries = knla.temporary_variables.copy()
+    for b_name, b_tv in six.iteritems(knlb.temporary_variables):
+        assert b_name == b_tv.name
+
+        new_tv_name = vng(b_name)
+
+        if new_tv_name != b_name:
+            b_var_renames[b_name] = var(new_tv_name)
+
+        assert new_tv_name not in new_temporaries
+        new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name)
+
+    # }}}
+
+    # {{{ apply renames in kernel b
+
+    from loopy.symbolic import (
+            SubstitutionRuleMappingContext,
+            RuleAwareSubstitutionMapper)
+    from pymbolic.mapper.substitutor import make_subst_func
+
+    srmc = SubstitutionRuleMappingContext(
+            knlb.substitutions, knlb.get_var_name_generator())
+    subst_map = RuleAwareSubstitutionMapper(
+            srmc, make_subst_func(b_var_renames), within=lambda stack: True)
+    knlb = subst_map.map_kernel(knlb)
+
+    # }}}
+
+    # {{{ fuse instructions
+
+    new_instructions = knla.instructions[:]
+    from pytools import UniqueNameGenerator
+    insn_id_gen = UniqueNameGenerator(
+            set([insna.id for insna in new_instructions]))
+
+    knl_b_instructions = []
+    old_b_id_to_new_b_id = {}
+    for insnb in knlb.instructions:
+        old_id = insnb.id
+        new_id = insn_id_gen(old_id)
+        old_b_id_to_new_b_id[old_id] = new_id
+
+        knl_b_instructions.append(
+                insnb.copy(id=new_id))
+
+    for insnb in knl_b_instructions:
+        new_instructions.append(
+                insnb.copy(
+                    insn_deps=frozenset(
+                        old_b_id_to_new_b_id[dep_id]
+                        for dep_id in insnb.insn_deps)))
+
+    # }}}
+
+    # {{{ fuse assumptions
+
+    assump_a = knla.assumptions
+    assump_b = knlb.assumptions
+    assump_a, assump_b = isl.align_two(assump_a, assump_b)
+
+    shared_param_names = list(
+            set(dom_a.get_var_dict(dim_type.set))
+            &
+            set(dom_b.get_var_dict(dim_type.set)))
+
+    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
+    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
+
+    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
+        raise LoopyError("assumptions do not agree on kernels to be merged")
+
+    new_assumptions = (assump_a & assump_b).params()
+
+    # }}}
+
+    from loopy.kernel import LoopKernel
+    return LoopKernel(
+            domains=new_domains,
+            instructions=new_instructions,
+            args=new_args,
+            name="%s_and_%s" % (knla.name, knlb.name),
+            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
+            preamble_generators=_ordered_merge_lists(
+                knla.preamble_generators, knlb.preamble_generators),
+            assumptions=new_assumptions,
+            local_sizes=_merge_dicts(
+                "local size", knla.local_sizes, knlb.local_sizes),
+            temporary_variables=new_temporaries,
+            iname_to_tag=_merge_dicts(
+                "iname-to-tag mapping",
+                knla.iname_to_tag,
+                knlb.iname_to_tag),
+            substitutions=_merge_dicts(
+                "substitution",
+                knla.substitutions,
+                knlb.substitutions),
+            function_manglers=_ordered_merge_lists(
+                knla.function_manglers,
+                knlb.function_manglers),
+            symbol_manglers=_ordered_merge_lists(
+                knla.symbol_manglers,
+                knlb.symbol_manglers),
+
+            iname_slab_increments=_merge_dicts(
+                "iname slab increment",
+                knla.iname_slab_increments,
+                knlb.iname_slab_increments),
+            loop_priority=_ordered_merge_lists(
+                knla.loop_priority,
+                knlb.loop_priority),
+            silenced_warnings=_ordered_merge_lists(
+                knla.silenced_warnings,
+                knlb.silenced_warnings),
+            applied_iname_rewrites=_ordered_merge_lists(
+                knla.applied_iname_rewrites,
+                knlb.applied_iname_rewrites),
+            index_dtype=_merge_values(
+                "index dtype",
+                knla.index_dtype,
+                knlb.index_dtype),
+            target=_merge_values(
+                "target",
+                knla.target,
+                knlb.target),
+            options=knla.options)
+
+# }}}
+
+
+def fuse_kernels(kernels):
+    kernels = list(kernels)
+    result = kernels.pop(0)
+    while kernels:
+        result = _fuse_two_kernels(result, kernels.pop(0))
+
+    return result
+
+# vim: foldmethod=marker
diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py
new file mode 100644
index 0000000000000000000000000000000000000000..12fd0354267e86707cc91de0027319e3584a08f0
--- /dev/null
+++ b/loopy/ipython_ext.py
@@ -0,0 +1,28 @@
+from __future__ import division
+
+from IPython.core.magic import (magics_class, Magics, cell_magic)
+
+import loopy as lp
+
+
+@magics_class
+class LoopyMagics(Magics):
+    @cell_magic
+    def fortran_kernel(self, line, cell):
+        result = lp.parse_fortran(cell.encode())
+
+        for knl in result:
+            self.shell.user_ns[knl.name] = knl
+
+    @cell_magic
+    def transformed_fortran_kernel(self, line, cell):
+        result = lp.parse_transformed_fortran(
+                cell.encode(),
+                transform_code_context=self.shell.user_ns)
+
+        for knl in result:
+            self.shell.user_ns[knl.name] = knl
+
+
+def load_ipython_extension(ip):
+    ip.register_magics(LoopyMagics)
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index b876513f754209f7ff498f4b9243a9ac54cce7f8..5c52988926e6714a83a620cfdad1e760fb25c4f2 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -137,8 +137,7 @@ def iname_rel_aff(space, iname, rel, aff):
         raise ValueError("unknown value of 'rel': %s" % rel)
 
 
-def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context,
-        prefer_constants):
+def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context):
     if context is not None:
         context = isl.align_spaces(context, pw_aff.get_domain_space(),
                 obj_bigger_ok=True).params()
@@ -163,21 +162,12 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context,
                 .is_bounded())
 
     # put constant bounds with unbounded validity first
-    # FIXME: Heuristi-hack.
-    if prefer_constants:
-        order = [
-                (True, False),  # constant, unbounded validity
-                (False, False),  # nonconstant, unbounded validity
-                (True, True),  # constant, bounded validity
-                (False, True),  # nonconstant, bounded validity
-                ]
-    else:
-        order = [
-                (False, False),  # nonconstant, unbounded validity
-                (True, False),  # constant, unbounded validity
-                (False, True),  # nonconstant, bounded validity
-                (True, True),  # constant, bounded validity
-                ]
+    order = [
+            (True, False),  # constant, unbounded validity
+            (False, False),  # nonconstant, unbounded validity
+            (True, True),  # constant, bounded validity
+            (False, True),  # nonconstant, bounded validity
+            ]
 
     pieces = flatten([
             [(set, aff) for set, aff in pieces
@@ -209,22 +199,19 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context,
             % (what, pw_aff))
 
 
-def static_min_of_pw_aff(pw_aff, constants_only, context=None,
-        prefer_constants=True):
+def static_min_of_pw_aff(pw_aff, constants_only, context=None):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.ge_set,
-            "minimum", context, prefer_constants)
+            "minimum", context)
 
 
-def static_max_of_pw_aff(pw_aff, constants_only, context=None,
-        prefer_constants=True):
+def static_max_of_pw_aff(pw_aff, constants_only, context=None):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.le_set,
-            "maximum", context, prefer_constants)
+            "maximum", context)
 
 
-def static_value_of_pw_aff(pw_aff, constants_only, context=None,
-        prefer_constants=True):
+def static_value_of_pw_aff(pw_aff, constants_only, context=None):
     return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set,
-            "value", context, prefer_constants)
+            "value", context)
 
 
 def duplicate_axes(isl_obj, duplicate_inames, new_inames):
@@ -337,7 +324,7 @@ def boxify(cache_manager, domain, box_inames, context):
     domain = domain.move_dims(
             dim_type.param, n_old_parameters, dim_type.set, 0, n_nonbox_inames)
 
-    result = domain.universe_like()
+    result = domain.universe(domain.space)
     zero = isl.Aff.zero_on_domain(result.space)
 
     for i in range(len(box_iname_indices)):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 14562b384a89f1455a3a29ac161682889f1d3e01..1f64e59e6c9e8ab6d599ea0cc61937df5a60ffcb 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1,10 +1,6 @@
 """Kernel object."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -28,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+from six.moves import range, zip
 
 import numpy as np
 from pytools import RecordWithoutPickling, Record, memoize_method
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9ba6bd3972985270f674e09e55ee2b439888d361..24ce751ff5bd79308ac54f7c00875293eed0aa4e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1,6 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2015 James Stevens"
 
@@ -24,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+
 import loopy as lp
 import warnings
 from islpy import dim_type
@@ -35,17 +35,20 @@ class TypeToOpCountMap:
 
     def __init__(self, init_dict=None):
         if init_dict is None:
-            self.dict = {}
-        else:
-            self.dict = init_dict
+            init_dict = {}
+
+        self.dict = init_dict
 
     def __add__(self, other):
-        return TypeToOpCountMap(dict(self.dict.items() + other.dict.items()
-                                     + [(k, self.dict[k] + other.dict[k])
-                                     for k in set(self.dict) & set(other.dict)]))
+        result = self.dict.copy()
+
+        for k, v in six.iteritems(other.dict):
+            result[k] = self.dict.get(k, 0) + v
+
+        return TypeToOpCountMap(result)
 
     def __radd__(self, other):
-        if (other != 0):
+        if other != 0:
             raise ValueError("TypeToOpCountMap: Attempted to add TypeToOpCountMap "
                                 "to {} {}. TypeToOpCountMap may only be added to "
                                 "0 and other TypeToOpCountMap objects."
@@ -81,7 +84,7 @@ class ExpressionOpCounter(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
-        from loopy.codegen.expression import TypeInferenceMapper
+        from loopy.expression import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl)
 
     def combine(self, values):
@@ -263,6 +266,36 @@ localid 0 is threadidx.x
 '''
 
 
+def count(kernel, bset):
+    try:
+        return bset.card()
+    except AttributeError:
+        pass
+
+    if not bset.is_box():
+        from loopy.diagnostic import warn
+        warn(kernel, "count_overestimate",
+                "Barvinok wrappers are not installed. "
+                "Counting routines may overestimate the "
+                "number of integer points in your loop "
+                "domain.")
+
+    result = None
+
+    for i in range(bset.dim(isl.dim_type.set)):
+        dmax = bset.dim_max(i)
+        dmin = bset.dim_min(i)
+
+        length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + 1)
+
+        if result is None:
+            result = length
+        else:
+            result = result * length
+
+    return result
+
+
 # to evaluate poly: poly.eval_with_dict(dictionary)
 def get_op_poly(knl):
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
@@ -278,7 +311,7 @@ def get_op_poly(knl):
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
         ops = op_counter(insn.expression)
-        op_poly = op_poly + ops*domain.card()
+        op_poly = op_poly + ops*count(knl, domain)
     return op_poly
 
 
@@ -290,6 +323,5 @@ def get_DRAM_access_poly(knl):  # for now just counting subscripts
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
-        poly += subscript_counter(insn.expression) * domain.card()
+        poly += subscript_counter(insn.expression) * count(knl, domain)
     return poly
-
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
index e68040c0983d5b6f144ae2b42eaa41899ef56b76..e4533b86dd24a8dca973ac9c8ffd022a4bed204b 100644
--- a/loopy/target/opencl/__init__.py
+++ b/loopy/target/opencl/__init__.py
@@ -214,10 +214,9 @@ class OpenCLTarget(CTarget):
 
     @memoize_method
     def get_dtype_registry(self):
-        from loopy.target.c.compyte import (
-                DTypeRegistry, fill_with_registry_with_c_types)
+        from loopy.target.c.compyte.dtypes import DTypeRegistry, fill_with_registry_with_c_types
         result = DTypeRegistry()
-        fill_with_registry_with_c_types(result)
+        fill_with_registry_with_c_types(result, respect_windows=False)
 
         # complex number support left out
 
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index a0a119dce42e1095ff327f38f6b836f80091b159..174506cd65a81b405053c212c9683f9dd2df2cc1 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -125,16 +125,16 @@ def adjust_local_temp_var_storage(kernel, device):
 def check_sizes(kernel, device):
     import loopy as lp
 
+    from loopy.diagnostic import LoopyAdvisory, LoopyError
+
     if device is None:
         from loopy.diagnostic import warn
         warn(kernel, "no_device_in_pre_codegen_checks",
                 "No device parameter was passed to the PyOpenCLTarget. "
                 "Perhaps you want to pass a device to benefit from "
-                "additional checking.")
+                "additional checking.", LoopyAdvisory)
         return
 
-    from loopy.diagnostic import LoopyAdvisory, LoopyError
-
     parameters = {}
     for arg in kernel.args:
         if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
diff --git a/loopy/tools.py b/loopy/tools.py
index 1f47160677c5255539a3a6c098c9ba6d8b89a18b..b3c610dcbcf5e23a1de5746db78d5fedc9d171ed 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -164,14 +164,14 @@ class PicklableDtype(object):
 
 # {{{ remove common indentation
 
-def remove_common_indentation(code):
+def remove_common_indentation(code, require_leading_newline=True):
     if "\n" not in code:
         return code
 
     # accommodate pyopencl-ish syntax highlighting
     code = code.lstrip("//CL//")
 
-    if not code.startswith("\n"):
+    if require_leading_newline and not code.startswith("\n"):
         return code
 
     lines = code.split("\n")
diff --git a/loopy/version.py b/loopy/version.py
index 8f68a4958da821981a6922bd14335a2bc487473a..e7d877aa136ef4ad15774388ba9c9cb37b828ed1 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -25,4 +25,4 @@ VERSION = (2014, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
-DATA_MODEL_VERSION = "v5"
+DATA_MODEL_VERSION = "v6"
diff --git a/test/test_fortran.py b/test/test_fortran.py
index d51411a6e65ba750ddf8cfc2ee844c4bc167ab82..d361b15dc3d5f280bd9a4dadd4dbd603d4470284 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -51,23 +51,27 @@ def test_fill(ctx_factory):
           implicit none
 
           real*8 a, out(n)
-          integer n
+          integer n, i
 
           do i = 1, n
             out(i) = a
           end do
         end
 
-        !$loopy begin transform
+        !$loopy begin
         !
-        ! fill = lp.split_iname(fill, "i", 128,
+        ! fill, = lp.parse_fortran(SOURCE)
+        ! fill = lp.split_iname(fill, "i", split_amount,
         !     outer_tag="g.0", inner_tag="l.0")
+        ! RESULT = [fill]
         !
-        !$loopy end transform
+        !$loopy end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_transformed_fortran(fortran_src,
+            pre_transform_code="split_amount = 128")
+
+    assert "i_inner" in knl.all_inames()
 
     ctx = ctx_factory()
 
@@ -80,7 +84,7 @@ def test_fill_const(ctx_factory):
           implicit none
 
           real*8 a, out(n)
-          integer n
+          integer n, i
 
           do i = 1, n
             out(i) = 3.45
@@ -88,8 +92,7 @@ def test_fill_const(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     ctx = ctx_factory()
 
@@ -102,7 +105,7 @@ def test_asterisk_in_shape(ctx_factory):
           implicit none
 
           real*8 a, out(n), out2(n), inp(*)
-          integer n
+          integer n, i
 
           do i = 1, n
             a = inp(n)
@@ -112,8 +115,7 @@ def test_asterisk_in_shape(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -127,7 +129,7 @@ def test_temporary_to_subst(ctx_factory):
           implicit none
 
           real*8 a, out(n), out2(n), inp(n)
-          integer n
+          integer n, i
 
           do i = 1, n
             a = inp(i)
@@ -137,8 +139,7 @@ def test_temporary_to_subst(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     ref_knl = knl
 
@@ -154,7 +155,7 @@ def test_temporary_to_subst_two_defs(ctx_factory):
           implicit none
 
           real*8 a, out(n), out2(n), inp(n)
-          integer n
+          integer n, i
 
           do i = 1, n
             a = inp(i)
@@ -165,8 +166,7 @@ def test_temporary_to_subst_two_defs(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     ref_knl = knl
 
@@ -182,7 +182,7 @@ def test_temporary_to_subst_indices(ctx_factory):
           implicit none
 
           real*8 a(n), out(n), out2(n), inp(n)
-          integer n
+          integer n, i
 
           do i = 1, n
             a(i) = 6*inp(i)
@@ -194,8 +194,7 @@ def test_temporary_to_subst_indices(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     knl = lp.fix_parameters(knl, n=5)
 
@@ -215,7 +214,7 @@ def test_if(ctx_factory):
           implicit none
 
           real*8 a, b, out(n), out2(n), inp(n)
-          integer n
+          integer n, i, j
 
           do i = 1, n
             a = inp(i)
@@ -232,8 +231,7 @@ def test_if(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     ref_knl = knl
 
@@ -249,7 +247,7 @@ def test_tagged(ctx_factory):
           implicit none
           real*8 a, b, r, out(n), out2(n), inp(n), inp2(n)
           real*8 alpha
-          integer n
+          integer n, i
 
           do i = 1, n
             !$loopy begin tagged: input
@@ -267,8 +265,7 @@ def test_tagged(ctx_factory):
         end
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2
 
@@ -294,8 +291,7 @@ def test_matmul(ctx_factory, buffer_inames):
         end subroutine
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     assert len(knl.domains) == 1
 
@@ -321,12 +317,6 @@ def test_matmul(ctx_factory, buffer_inames):
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
 
-    # # FIXME: Make r/w tests possible, reactivate the above
-    # knl = lp.preprocess_kernel(knl)
-    # for k in lp.generate_loop_schedules(knl):
-    #     code, _ = lp.generate_code(k)
-    #     print(code)
-
 
 @pytest.mark.xfail
 def test_batched_sparse():
@@ -339,6 +329,7 @@ def test_batched_sparse():
           real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)
 
           integer m, n, rowstart, rowend, length, nvals, nvecs
+          integer i, j, k
 
           do i = 1, m
             rowstart = rowstarts(i)
@@ -362,8 +353,7 @@ def test_batched_sparse():
 
         """
 
-    from loopy.frontend.fortran import f2loopy
-    knl, = f2loopy(fortran_src)
+    knl, = lp.parse_fortran(fortran_src)
 
     knl = lp.split_iname(knl, "i", 128)
     knl = lp.tag_inames(knl, {"i_outer": "g.0"})
@@ -373,6 +363,92 @@ def test_batched_sparse():
     knl = lp.fix_parameters(knl, nvecs=4)
 
 
+def test_fuse_kernels(ctx_factory):
+    fortran_template = """
+        subroutine {name}(nelements, ndofs, result, d, q)
+          implicit none
+          integer e, i, j, k
+          integer nelements, ndofs
+          real*8 result(nelements, ndofs, ndofs)
+          real*8 q(nelements, ndofs, ndofs)
+          real*8 d(ndofs, ndofs)
+          real*8 prev
+
+          do e = 1,nelements
+            do i = 1,ndofs
+              do j = 1,ndofs
+                do k = 1,ndofs
+                  {inner}
+                end do
+              end do
+            end do
+          end do
+        end subroutine
+        """
+
+    xd_line = """
+        prev = result(e,i,j)
+        result(e,i,j) = prev + d(i,k)*q(e,i,k)
+        """
+    yd_line = """
+        prev = result(e,i,j)
+        result(e,i,j) = prev + d(i,k)*q(e,k,j)
+        """
+
+    xderiv, = lp.parse_fortran(
+            fortran_template.format(inner=xd_line, name="xderiv"))
+    yderiv, = lp.parse_fortran(
+            fortran_template.format(inner=yd_line, name="yderiv"))
+    xyderiv, = lp.parse_fortran(
+            fortran_template.format(
+                inner=(xd_line + "\n" + yd_line), name="xyderiv"))
+
+    knl = lp.fuse_kernels((xderiv, yderiv))
+    knl = lp.set_loop_priority(knl, "e,i,j,k")
+
+    assert len(knl.temporary_variables) == 2
+
+    ctx = ctx_factory()
+    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
+
+
+def test_parse_and_fuse_two_kernels():
+    fortran_src = """
+        subroutine fill(out, a, n)
+          implicit none
+
+          real*8 a, out(n)
+          integer n, i
+
+          do i = 1, n
+            out(i) = a
+          end do
+        end
+
+        subroutine twice(out, n)
+          implicit none
+
+          real*8 out(n)
+          integer n, i
+
+          do i = 1, n
+            out(i) = 2*out(i)
+          end do
+        end
+
+        !$loopy begin
+        !
+        ! fill, twice = lp.parse_fortran(SOURCE)
+        ! knl = lp.fuse_kernels((fill, twice))
+        ! print(knl)
+        ! RESULT = [knl]
+        !
+        !$loopy end
+        """
+
+    knl, = lp.parse_transformed_fortran(fortran_src)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index e68700833b4a4b41bef5051b030aed92206622c6..bfcd199f17dd4fa9c176145ebc4f84d1a848b987 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -1,4 +1,4 @@
-from __future__ import division
+from __future__ import division, print_function
 
 __copyright__ = "Copyright (C) 2015 James Stevens"
 
@@ -23,14 +23,14 @@ THE SOFTWARE.
 """
 
 import sys
-from pyopencl.tools import (
+from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 from loopy.statistics import *  # noqa
 import numpy as np
 
 
-def test_op_counter_basic(ctx_factory):
+def test_op_counter_basic():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -56,7 +56,7 @@ def test_op_counter_basic(ctx_factory):
     assert i32 == n*m
 
 
-def test_op_counter_reduction(ctx_factory):
+def test_op_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -74,10 +74,10 @@ def test_op_counter_reduction(ctx_factory):
     assert f32 == 2*n*m*l
 
 
-def test_op_counter_logic(ctx_factory):
+def test_op_counter_logic():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
                 """
                 e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
@@ -98,10 +98,10 @@ def test_op_counter_logic(ctx_factory):
     assert i32 == n*m
 
 
-def test_op_counter_specialops(ctx_factory):
+def test_op_counter_specialops():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
                 """
                 c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
@@ -124,10 +124,10 @@ def test_op_counter_specialops(ctx_factory):
     assert i32 == n*m
 
 
-def test_op_counter_bitwise(ctx_factory):
+def test_op_counter_bitwise():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
                 """
                 c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
@@ -143,10 +143,43 @@ def test_op_counter_bitwise(ctx_factory):
     m = 256
     l = 128
     i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    print(poly.dict[np.dtype(np.int32)])
     not_there = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert i32 == 3*n*m+n*m*l
     assert not_there == 0
 
+
+def test_op_counter_triangular_domain():
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
+            """
+            a[i, j] = b[i,j] * 2
+            """,
+            name="bitwise", assumptions="n,m >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+            dict(b=np.float64))
+
+    expect_fallback = False
+    import islpy as isl
+    try:
+        isl.BasicSet.card
+    except AttributeError:
+        expect_fallback = True
+    else:
+        expect_fallback = False
+
+    poly = get_op_poly(knl)[np.dtype(np.float64)]
+    value_dict = dict(m=13, n=200)
+    flops = poly.eval_with_dict(value_dict)
+
+    if expect_fallback:
+        assert flops == 144
+    else:
+        assert flops == 78
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])