diff --git a/doc/reference.rst b/doc/reference.rst
index 9d75a924b71847d04632b4945f9228c80c2e01f4..af39de655c560f5a7be5e98ba9debfc5649b0fdb 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -336,6 +336,8 @@ function, which is responsible for creating kernels:
 
 .. autofunction:: make_copy_kernel
 
+.. autofunction:: fuse_kernels
+
 Transforming Kernels
 --------------------
 
@@ -429,6 +431,8 @@ Manipulating Instructions
 
 .. autofunction:: remove_instructions
 
+.. autofunction:: tag_instructions
+
 Library interface
 ^^^^^^^^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 12329bbf6472867c873958f1a18f20ba1016b5f7..453b4310339e05861fbe8a5db1d7d89e24dbbc65 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,8 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -27,6 +23,9 @@ THE SOFTWARE.
 """
 
 
+import six
+from six.moves import range, zip
+
 import islpy as isl
 from islpy import dim_type
 
@@ -58,6 +57,7 @@ from loopy.library.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst, temporary_to_subst
 from loopy.precompute import precompute
 from loopy.buffer import buffer_array
+from loopy.fusion import fuse_kernels
 from loopy.padding import (split_arg_axis, find_padding_multiple,
         add_padding)
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
@@ -88,6 +88,7 @@ __all__ = [
 
         "extract_subst", "expand_subst", "temporary_to_subst",
         "precompute", "buffer_array",
+        "fuse_kernels",
         "split_arg_axis", "find_padding_multiple", "add_padding",
 
         "get_dot_dependency_graph",
@@ -539,7 +540,7 @@ class _InameDuplicator(RuleAwareIdentityMapper):
             return var(new_name)
 
     def map_instruction(self, insn):
-        if not self.within(((insn.id, None),)):
+        if not self.within(((insn.id, insn.tags),)):
             return insn
 
         new_fid = frozenset(
@@ -1781,4 +1782,23 @@ def fold_constants(kernel):
 
 # }}}
 
+
+# {{{ tag_instructions
+
+def tag_instructions(kernel, new_tag, within=None):
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    new_insns = []
+    for insn in kernel.instructions:
+        if within(((insn.id, insn.tags),)):
+            new_insns.append(
+                    insn.copy(tags=insn.tags + (new_tag,)))
+        else:
+            new_insns.append(insn)
+
+    return kernel.copy(instructions=new_insns)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/fusion.py b/loopy/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..434297b1b430ec27f013be9e6838ba321d22f8b7
--- /dev/null
+++ b/loopy/fusion.py
@@ -0,0 +1,240 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import six
+
+import islpy as isl
+from islpy import dim_type
+
+from loopy.diagnostic import LoopyError
+
+
+def _find_fusable_loop_domain_index(domain, other_domains):
+    my_inames = set(domain.get_var_dict(dim_type.set))
+
+    overlap_domains = []
+    for i, o_dom in enumerate(other_domains):
+        o_inames = set(o_dom.get_var_dict(dim_type.set))
+        if my_inames & o_inames:
+            overlap_domains.append(i)
+
+    if len(overlap_domains) >= 2:
+        raise LoopyError("more than one domain in one kernel has "
+                "overlapping inames with a "
+                "domain of the other kernel, cannot fuse: '%s'"
+                % domain)
+
+    if len(overlap_domains) == 1:
+        return overlap_domains[0]
+    else:
+        return None
+
+
+# {{{ generic merge helpers
+
+def _ordered_merge_lists(list_a, list_b):
+    result = list_a[:]
+    for item in list_b:
+        if item not in list_b:
+            result.append(item)
+
+    return result
+
+
+def _merge_dicts(item_name, dict_a, dict_b):
+    result = dict_a.copy()
+
+    for k, v in six.iteritems(dict_b):
+        if k in result:
+            if v != result[k]:
+                raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s"
+                        % (item_name, k, v, result[k]))
+
+        else:
+            result[k] = v
+
+    return result
+
+
+def _merge_values(item_name, val_a, val_b):
+    if val_a != val_b:
+        raise LoopyError("inconsistent %ss in merge: %s and %s"
+                % (item_name, val_a, val_b))
+
+    return val_a
+
+# }}}
+
+
+# {{{ two-kernel fusion
+
+def _fuse_two_kernels(knla, knlb):
+    from loopy.kernel import kernel_state
+    if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL:
+        raise LoopyError("can only fuse kernels in INITIAL state")
+
+    # {{{ fuse instructions
+
+    new_instructions = knla.instructions[:]
+    from pytools import UniqueNameGenerator
+    insn_id_gen = UniqueNameGenerator(
+            set([insna.id for insna in new_instructions]))
+    for insnb in knlb.instructions:
+        new_instructions.append(
+                insnb.copy(id=insn_id_gen(insnb.id)))
+
+    # }}}
+
+    # {{{ fuse domains
+
+    new_domains = knla.domains[:]
+
+    for dom_b in knlb.domains:
+        i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains)
+        if i_fuse is None:
+            new_domains.append(dom_b)
+        else:
+            dom_a = new_domains[i_fuse]
+            dom_a, dom_b = isl.align_two(dom_a, dom_b)
+
+            shared_inames = list(
+                    set(dom_a.get_var_dict(dim_type.set))
+                    &
+                    set(dom_b.get_var_dict(dim_type.set)))
+
+            dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set])
+            dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set])
+
+            if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s):
+                raise LoopyError("kernels do not agree on domain of "
+                        "inames '%s'" % (",".join(shared_inames)))
+
+            new_domain = dom_a & dom_b
+
+            new_domains[i_fuse] = new_domain
+
+    # }}}
+
+    # {{{ fuse args
+
+    new_args = knla.args[:]
+    for b_arg in knlb.args:
+        if b_arg.name not in knla.arg_dict:
+            new_args.append(b_arg)
+        else:
+            if b_arg != knla.arg_dict[b_arg.name]:
+                raise LoopyError(
+                        "argument '%s' has inconsistent definition between "
+                        "the two kernels being merged" % b_arg.name)
+
+    # }}}
+
+    # {{{ fuse assumptions
+
+    assump_a = knla.assumptions
+    assump_b = knlb.assumptions
+    assump_a, assump_b = isl.align_two(assump_a, assump_b)
+
+    shared_param_names = list(
+            set(dom_a.get_var_dict(dim_type.set))
+            &
+            set(dom_b.get_var_dict(dim_type.set)))
+
+    assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
+    assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param])
+
+    if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s):
+        raise LoopyError("assumptions do not agree on kernels to be merged")
+
+    new_assumptions = (assump_a & assump_b).params()
+
+    # }}}
+
+    from loopy.kernel import LoopKernel
+    return LoopKernel(
+            domains=new_domains,
+            instructions=new_instructions,
+            args=new_args,
+            name="%s_and_%s" % (knla.name, knlb.name),
+            preambles=_ordered_merge_lists(knla.preambles, knlb.preambles),
+            preamble_generators=_ordered_merge_lists(
+                knla.preamble_generators, knlb.preamble_generators),
+            assumptions=new_assumptions,
+            local_sizes=_merge_dicts(
+                "local size", knla.local_sizes, knlb.local_sizes),
+            temporary_variables=_merge_dicts(
+                "temporary variable",
+                knla.temporary_variables,
+                knlb.temporary_variables),
+            iname_to_tag=_merge_dicts(
+                "iname-to-tag mapping",
+                knla.iname_to_tag,
+                knlb.iname_to_tag),
+            substitutions=_merge_dicts(
+                "substitution",
+                knla.substitutions,
+                knlb.substitutions),
+            function_manglers=_ordered_merge_lists(
+                knla.function_manglers,
+                knlb.function_manglers),
+            symbol_manglers=_ordered_merge_lists(
+                knla.symbol_manglers,
+                knlb.symbol_manglers),
+
+            iname_slab_increments=_merge_dicts(
+                "iname slab increment",
+                knla.iname_slab_increments,
+                knlb.iname_slab_increments),
+            loop_priority=_ordered_merge_lists(
+                knla.loop_priority,
+                knlb.loop_priority),
+            silenced_warnings=_ordered_merge_lists(
+                knla.silenced_warnings,
+                knlb.silenced_warnings),
+            applied_iname_rewrites=_ordered_merge_lists(
+                knla.applied_iname_rewrites,
+                knlb.applied_iname_rewrites),
+            index_dtype=_merge_values(
+                "index dtype",
+                knla.index_dtype,
+                knlb.index_dtype),
+            target=_merge_values(
+                "target",
+                knla.target,
+                knlb.target),
+            options=knla.options)
+
+# }}}
+
+
+def fuse_kernels(kernels):
+    kernels = list(kernels)
+    result = kernels.pop(0)
+    while kernels:
+        result = _fuse_two_kernels(result, kernels.pop(0))
+
+    return result
+
+# vim: foldmethod=marker
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 14562b384a89f1455a3a29ac161682889f1d3e01..1f64e59e6c9e8ab6d599ea0cc61937df5a60ffcb 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1,10 +1,6 @@
 """Kernel object."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -28,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+from six.moves import range, zip
 
 import numpy as np
 from pytools import RecordWithoutPickling, Record, memoize_method
diff --git a/loopy/loop.py b/loopy/loop.py
index 54030bd8f0ebaaded29078f28a16b1dcc66a98e4..4592463822a2321745aaf48a316d16c98d4efca3 100644
--- a/loopy/loop.py
+++ b/loopy/loop.py
@@ -104,8 +104,7 @@ def fuse_loop_domains(kernel):
 
                 outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom)
 
-                from loopy.isl_helpers import convexify
-                new_domains.insert(min_idx, convexify(inner_dom & outer_dom))
+                new_domains.insert(min_idx, inner_dom & outer_dom)
                 break
 
             if new_domains:
diff --git a/test/test_fortran.py b/test/test_fortran.py
index a2167cdafcf152dca282a39fff77d8b829c18f62..ce27424d8cfb7e4256e745a8806c3b6bedb6b4ac 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -321,12 +321,6 @@ def test_matmul(ctx_factory, buffer_inames):
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
 
-    # # FIXME: Make r/w tests possible, reactivate the above
-    # knl = lp.preprocess_kernel(knl)
-    # for k in lp.generate_loop_schedules(knl):
-    #     code, _ = lp.generate_code(k)
-    #     print(code)
-
 
 @pytest.mark.xfail
 def test_batched_sparse():
@@ -374,6 +368,47 @@ def test_batched_sparse():
     knl = lp.fix_parameters(knl, nvecs=4)
 
 
+def test_fuse_kernels(ctx_factory):
+    fortran_template = """
+        subroutine {name}(nelements, ndofs, result, d, q)
+          implicit none
+          integer e, i, j, k
+          integer nelements, ndofs
+          real*8 result(nelements, ndofs, ndofs)
+          real*8 q(nelements, ndofs, ndofs)
+          real*8 d(ndofs, ndofs)
+
+          do e = 1,nelements
+            do i = 1,ndofs
+              do j = 1,ndofs
+                do k = 1,ndofs
+                  {line}
+                end do
+              end do
+            end do
+          end do
+        end subroutine
+        """
+
+    xd_line = "result(e,i,j) = result(e,i,j) + d(i,k)*q(e,i,k)"
+    yd_line = "result(e,i,j) = result(e,i,j) + d(i,k)*q(e,k,j)"
+
+    from loopy.frontend.fortran import f2loopy
+    xderiv, = f2loopy(
+            fortran_template.format(line=xd_line, name="xderiv"))
+    yderiv, = f2loopy(
+            fortran_template.format(line=yd_line, name="yderiv"))
+    xyderiv, = f2loopy(
+            fortran_template.format(
+                line=(xd_line + "\n" + yd_line), name="xyderiv"))
+
+    knl = lp.fuse_kernels((xderiv, yderiv))
+    knl = lp.set_loop_priority(knl, "e,i,j,k")
+
+    ctx = ctx_factory()
+    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])