diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy
new file mode 100644
index 0000000000000000000000000000000000000000..ea85b0c6b198c8bac6270098696504fc9006fd14
--- /dev/null
+++ b/examples/fortran/matmul.floopy
@@ -0,0 +1,26 @@
+subroutine dgemm(m,n,l,alpha,a,b,c)
+  implicit none
+  real*8 temp, a(m,l),b(l,n),c(m,n), alpha
+  integer m,n,k,i,j,l
+
+  do j = 1,n
+    do k = 1,l
+      do i = 1,m
+        c(i,j) = c(i,j) + alpha*b(k,j)*a(i,k)
+      end do
+    end do
+  end do
+end subroutine
+
+!$loopy begin transform
+! dgemm = lp.split_iname(dgemm, "i", 16,
+!         outer_tag="g.0", inner_tag="l.1")
+! dgemm = lp.split_iname(dgemm, "j", 8,
+!         outer_tag="g.1", inner_tag="l.0")
+! dgemm = lp.split_iname(dgemm, "k", 32)
+!
+! dgemm = lp.extract_subst(dgemm, "a_acc", "a[i1,i2]", parameters="i1, i2")
+! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2")
+! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner")
+! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner")
+!$loopy end transform
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d3089fc6a6baad741c1cfc2d1843aa541192cfb3..68f7f131c4c40d2f647fb6c6095959b4884a3c41 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -392,10 +392,9 @@ class LoopKernel(RecordWithoutPickling):
         iname_set_stack = []
         result = []
 
-        writer_map = self.writer_map()
+        from loopy.kernel.tools import is_domain_dependent_on_inames
 
-        for dom in self.domains:
-            parameters = set(dom.get_var_names(dim_type.param))
+        for dom_idx, dom in enumerate(self.domains):
             inames = set(dom.get_var_names(dim_type.set))
 
             # This next domain may be nested inside the previous domain.
@@ -405,37 +404,11 @@ class LoopKernel(RecordWithoutPickling):
 
             discard_level_count = 0
             while discard_level_count < len(iname_set_stack):
-                # {{{ check for parenthood by loop bound iname
-
                 last_inames = iname_set_stack[-1-discard_level_count]
-                if last_inames & parameters:
-                    break
-
-                # }}}
-
-                # {{{ check for parenthood by written variable
 
-                is_parent_by_variable = False
-                for par in parameters:
-                    if par in self.temporary_variables:
-                        writer_insns = writer_map[par]
-
-                        if len(writer_insns) > 1:
-                            raise RuntimeError("loop bound '%s' "
-                                    "may only be written to once" % par)
-
-                        writer_insn, = writer_insns
-                        writer_inames = self.insn_inames(writer_insn)
-
-                        if writer_inames & last_inames:
-                            is_parent_by_variable = True
-                            break
-
-                if is_parent_by_variable:
+                if is_domain_dependent_on_inames(self, dom_idx, last_inames):
                     break
 
-                # }}}
-
                 discard_level_count += 1
 
             if discard_level_count:
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index bd1fcb449320b5bab98b5d69cd0239621a61076a..2973cd9e69e46c875e15e6c8674150373be8be1e 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -539,4 +539,41 @@ class DomainParameterFinder(object):
 
 # }}}
 
+
+# {{{ is domain dependent on inames
+
+def is_domain_dependent_on_inames(kernel, domain_index, inames):
+    dom = kernel.domains[domain_index]
+    dom_parameters = set(dom.get_var_names(dim_type.param))
+
+    # {{{ check for parenthood by loop bound iname
+
+    if inames & dom_parameters:
+        return True
+
+    # }}}
+
+    # {{{ check for parenthood by written variable
+
+    for par in dom_parameters:
+        if par in kernel.temporary_variables:
+            writer_insns = kernel.writer_map()[par]
+
+            if len(writer_insns) > 1:
+                raise RuntimeError("loop bound '%s' "
+                        "may only be written to once" % par)
+
+            writer_insn, = writer_insns
+            writer_inames = kernel.insn_inames(writer_insn)
+
+            if writer_inames & inames:
+                return True
+
+    # }}}
+
+    return False
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/loop.py b/loopy/loop.py
index d2ddece1980ec41c9e0062ccd4a091f3e3d5c301..4592463822a2321745aaf48a316d16c98d4efca3 100644
--- a/loopy/loop.py
+++ b/loopy/loop.py
@@ -23,6 +23,7 @@ THE SOFTWARE.
 """
 
 
+import islpy as isl
 import six
 
 
@@ -39,7 +40,6 @@ def potential_loop_nest_map(kernel):
     iname_to_insns = kernel.iname_to_insns()
 
     # examine pairs of all inames--O(n**2), I know.
-    from loopy.kernel.data import IlpBaseTag
     for inner_iname in all_inames:
         inner_result = set()
         for outer_iname in all_inames:
@@ -56,22 +56,67 @@ def potential_loop_nest_map(kernel):
 
 
 def fuse_loop_domains(kernel):
-    did_something = False
+    from loopy.kernel.tools import is_domain_dependent_on_inames
+
     while True:
         lnm = potential_loop_nest_map(kernel)
+        parents_per_domain = kernel.parents_per_domain()
+        all_parents_per_domain = kernel.all_parents_per_domain()
+
+        new_domains = None
 
         for inner_iname, outer_inames in six.iteritems(lnm):
             for outer_iname in outer_inames:
-                inner_do
+                # {{{ check if it's safe to fuse
+
+                inner_domain_idx = kernel.get_home_domain_index(inner_iname)
+                outer_domain_idx = kernel.get_home_domain_index(outer_iname)
+
+                if inner_domain_idx == outer_domain_idx:
+                    break
+
+                if (
+                        outer_domain_idx in all_parents_per_domain[inner_domain_idx]
+                        and not
+                        outer_domain_idx == parents_per_domain[inner_domain_idx]):
+                    # Outer domain is not a direct parent of the inner
+                    # domain. Unable to fuse.
+                    continue
+
+                outer_dom = kernel.domains[outer_domain_idx]
+                inner_dom = kernel.domains[inner_domain_idx]
 
+                outer_inames = set(outer_dom.get_var_names(isl.dim_type.set))
+                if is_domain_dependent_on_inames(kernel, inner_domain_idx,
+                        outer_inames):
+                    # Bounds of inner domain depend on outer domain.
+                    # Unable to fuse.
+                    continue
 
+                # }}}
 
-        print kernel
-        print lnm
-        1/0
+                new_domains = kernel.domains[:]
+                min_idx = min(inner_domain_idx, outer_domain_idx)
+                max_idx = max(inner_domain_idx, outer_domain_idx)
 
-        if not did_something:
+                del new_domains[max_idx]
+                del new_domains[min_idx]
+
+                outer_dom, inner_dom = isl.align_two(outer_dom, inner_dom)
+
+                new_domains.insert(min_idx, inner_dom & outer_dom)
+                break
+
+            if new_domains:
+                break
+
+        if not new_domains:
+            # Nothing was accomplished in the last loop trip, time to quit.
             break
 
+        kernel = kernel.copy(domains=new_domains)
+
     return kernel
 
+
+# vim: fdm=marker
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index f1d3bfdde83735a34b9f86f14c561e9aa1bb2124..a1b323d7cf0deaef308952d87b1523b0a020ad68 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -34,6 +34,9 @@ from loopy.target.opencl import OpenCLTarget
 import pyopencl as cl
 import pyopencl.characterize as cl_char
 
+# This ensures the dtype registry is populated.
+import pyopencl.tools  # noqa
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -228,7 +231,7 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
 # }}}
 
 
-# {{{
+# {{{ pyopencl tools
 
 class PyOpenCLTarget(OpenCLTarget):
     def __init__(self, device=None):
@@ -236,9 +239,6 @@ class PyOpenCLTarget(OpenCLTarget):
 
         self.device = device
 
-        # This ensures the dtype registry is populated.
-        import pyopencl.tools  # noqa
-
     hash_fields = ["device"]
     comparison_fields = ["device"]
 
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 9a207bf5fb3bfce216809317356730bf521a1ea8..33826e4063190f7d7d48b1bfcb4122931868bd1e 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -272,6 +272,51 @@ def test_tagged(ctx_factory):
 
     assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2
 
+
+def test_matmul(ctx_factory):
+    fortran_src = """
+        subroutine dgemm(m,n,l,a,b,c)
+          implicit none
+          real*8 temp, a(m,l),b(l,n),c(m,n)
+          integer m,n,k,i,j,l
+
+          do j = 1,n
+            do i = 1,m
+              temp = 0
+              do k = 1,l
+                temp = temp + b(k,j)*a(i,k)
+              end do
+              c(i,j) = temp
+            end do
+          end do
+        end subroutine
+
+        !$loopy begin transform
+        !$loopy end transform
+        """
+
+    from loopy.frontend.fortran import f2loopy
+    knl, = f2loopy(fortran_src)
+
+    assert len(knl.domains) == 1
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16,
+            outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", 8,
+            outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 32)
+
+    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
+    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
+    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
+    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")
+
+    ctx = ctx_factory()
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])