diff --git a/doc/reference.rst b/doc/reference.rst
index 595fd4b5fa5bac3ccd69cfb1b7a05ebb1c7b75f9..75592a63c4fec2f9bfb93b2eae8bb8e9f67af8e9 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -402,6 +402,8 @@ Caching, Precomputation and Prefetching
 
 .. autofunction:: add_prefetch
 
+.. autofunction:: buffer_write
+
 Influencing data access
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 3027e10d38a98b4c86737502b7fd20f41c3cc3c3..d9f67a43724a2b28d7192d3b79a04750920509ba 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -55,6 +55,7 @@ from loopy.kernel.creation import make_kernel, UniqueName
 from loopy.library.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst, temporary_to_subst
 from loopy.precompute import precompute
+from loopy.buffer_writes import buffer_write
 from loopy.padding import (split_arg_axis, find_padding_multiple,
         add_padding)
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
@@ -82,7 +83,7 @@ __all__ = [
         "register_reduction_parser",
 
         "extract_subst", "expand_subst", "temporary_to_subst",
-        "precompute",
+        "precompute", "buffer_write",
         "split_arg_axis", "find_padding_multiple", "add_padding",
 
         "get_dot_dependency_graph",
@@ -1570,11 +1571,11 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
         equations = [equations]
 
     import re
-    EQN_RE = re.compile(r"^([^=]+)=([^=]+)$")
+    eqn_re = re.compile(r"^([^=]+)=([^=]+)$")
 
     def parse_equation(eqn):
         if isinstance(eqn, str):
-            eqn_match = EQN_RE.match(eqn)
+            eqn_match = eqn_re.match(eqn)
             if not eqn_match:
                 raise ValueError("invalid equation: %s" % eqn)
 
diff --git a/loopy/array_buffer.py b/loopy/array_buffer.py
index c15935b733ba70d4b8410bb9eff7afb4d9b0ae9c..6d1c56eef27e6e809dfc6b4048f7a570d1c29a10 100644
--- a/loopy/array_buffer.py
+++ b/loopy/array_buffer.py
@@ -350,7 +350,7 @@ class ArrayToBufferMap(object):
                 set(global_s2s_par_dom.get_var_names(dim_type.param))
                 & self.kernel.all_inames())
 
-        for arg in accdesc.args:
+        for arg in accdesc.storage_axis_exprs:
             arg_inames.update(get_dependencies(arg))
         arg_inames = frozenset(arg_inames)
 
diff --git a/loopy/buffer_writes.py b/loopy/buffer_writes.py
new file mode 100644
index 0000000000000000000000000000000000000000..7083e3c269aea7d663c1c38605ecd8b581324b6d
--- /dev/null
+++ b/loopy/buffer_writes.py
@@ -0,0 +1,386 @@
+from __future__ import division, absolute_import
+import six
+from six.moves import range, zip
+
+__copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.array_buffer import (ArrayToBufferMap, NoOpArrayToBufferMap,
+        AccessDescriptor)
+from loopy.symbolic import ExpandingIdentityMapper, SubstitutionMapper
+from pymbolic.mapper.substitutor import make_subst_func
+
+from pymbolic import var
+
+
+# {{{ replace array access
+
+class ArrayAccessReplacer(ExpandingIdentityMapper):
+    def __init__(self, kernel, var_name, within, array_base_map, buf_var):
+        super(ArrayAccessReplacer, self).__init__(
+                kernel.substitutions, kernel.get_var_name_generator())
+
+        self.kernel = kernel
+        self.within = within
+
+        self.array_base_map = array_base_map
+
+        self.var_name = var_name
+        self.modified_insn_ids = set()
+
+        self.buf_var = buf_var
+
+    def map_variable(self, expr, expn_state):
+        result = None
+        if expr.name == self.var_name and self.within(expn_state):
+            result = self.map_array_access((), expn_state)
+
+        if result is None:
+            return super(ArrayAccessReplacer, self).map_variable(expr, expn_state)
+        else:
+            self.modified_insn_ids.add(expn_state.insn_id)
+            return result
+
+    def map_subscript(self, expr, expn_state):
+        result = None
+        if expr.aggregate.name == self.var_name and self.within(expn_state):
+            result = self.map_array_access(expr.index, expn_state)
+
+        if result is None:
+            return super(ArrayAccessReplacer, self).map_subscript(expr, expn_state)
+        else:
+            self.modified_insn_ids.add(expn_state.insn_id)
+            return result
+
+    def map_array_access(self, index, expn_state):
+        accdesc = AccessDescriptor(
+            identifier=None,
+            expands_footprint=False,
+            storage_axis_exprs=index)
+
+        if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
+            return None
+
+        abm = self.array_base_map
+
+        assert len(index) == len(abm.non1_storage_axis_flags)
+
+        access_subscript = []
+        for i in range(len(index)):
+            if not abm.non1_storage_axis_flags[i]:
+                continue
+
+            ax_index = index[i]
+            from loopy.isl_helpers import simplify_via_aff
+            ax_index = simplify_via_aff(
+                    ax_index - abm.storage_base_indices[i])
+
+            access_subscript.append(ax_index)
+
+        result = self.buf_var
+        if access_subscript:
+            result = result.index(tuple(access_subscript))
+
+        # Can't possibly be nested, but recurse anyway to
+        # make sure substitution rules referenced below here
+        # do not get thrown away.
+        self.rec(result, expn_state.copy(arg_context={}))
+
+        return result
+
+# }}}
+
+
+def buffer_write(kernel, var_name, buffer_inames, init_expression=None,
+        store_expression=None, within=None, default_tag="l.auto",
+        temporary_is_local=None, fetch_bounding_box=False,
+        within_inames=()):
+    """
+    :arg init_expression: Either *None* (indicating the prior value of the buffered
+        array should be read) or an expression optionally involving the
+        variable 'base' (which references the associated location in the array
+        being buffered).
+    :arg store_expression: Either *None* or an expression involving
+        variables 'base' and 'buffer' (without array indices).
+    """
+
+    # {{{ process arguments
+
+    if isinstance(init_expression, str):
+        from loopy.symbolic import parse
+        init_expression = parse(init_expression)
+
+    if isinstance(store_expression, str):
+        from loopy.symbolic import parse
+        store_expression = parse(store_expression)
+
+    if isinstance(buffer_inames, str):
+        buffer_inames = buffer_inames.split(",")
+
+    if isinstance(within_inames, str):
+        within_inames = within_inames.split(",")
+
+    for iname in buffer_inames:
+        if iname not in kernel.all_inames():
+            raise RuntimeError("sweep iname '%s' is not a known iname"
+                    % iname)
+
+    buffer_inames = list(buffer_inames)
+    buffer_inames_set = frozenset(buffer_inames)
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    if var_name in kernel.arg_dict:
+        var_descr = kernel.arg_dict[var_name]
+    elif var_name in kernel.temporary_variables:
+        var_descr = kernel.temporary_variables[var_name]
+    else:
+        raise ValueError("variable '%s' not found" % var_name)
+
+    from loopy.kernel.data import ArrayBase
+    if isinstance(var_descr, ArrayBase):
+        var_shape = var_descr.shape
+    else:
+        var_shape = ()
+
+    if temporary_is_local is None:
+        import loopy as lp
+        temporary_is_local = lp.auto
+
+    # }}}
+
+    var_name_gen = kernel.get_var_name_generator()
+
+    access_descriptors = []
+    for insn in kernel.instructions:
+        if not within((insn.id, insn.tags)):
+            continue
+
+        for assignee, index in insn.assignees_and_indices():
+            if assignee == var_name:
+                access_descriptors.append(
+                        AccessDescriptor(
+                            identifier=insn.id,
+                            expands_footprint=True,
+                            storage_axis_exprs=index))
+
+    # {{{ use given / find new storage_axes
+
+    init_inames = []
+    store_inames = []
+    new_iname_to_tag = {}
+
+    for i in range(len(var_shape)):
+        init_iname = var_name_gen("%s_init_%d" % (var_name, i))
+        store_iname = var_name_gen("%s_store_%d" % (var_name, i))
+
+        new_iname_to_tag[init_iname] = default_tag
+        new_iname_to_tag[store_iname] = default_tag
+
+        init_inames.append(init_iname)
+        store_inames.append(store_iname)
+
+    # }}}
+
+    # {{{ modify loop domains
+
+    non1_init_inames = []
+    non1_store_inames = []
+
+    if var_shape:
+        # {{{ find domain to be changed
+
+        from loopy.kernel.tools import DomainChanger
+        domch = DomainChanger(kernel, buffer_inames_set)
+
+        if domch.leaf_domain_index is not None:
+            # If the sweep inames are at home in parent domains, then we'll add
+            # fetches with loops over copies of these parent inames that will end
+            # up being scheduled *within* loops over these parents.
+
+            for iname in buffer_inames_set:
+                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
+                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
+                            "sweep's leaf domain" % iname)
+
+        # }}}
+
+        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
+                access_descriptors, len(var_shape))
+
+        for i in range(len(var_shape)):
+            if abm.non1_storage_axis_flags[i]:
+                non1_init_inames.append(init_inames[i])
+                non1_store_inames.append(store_inames[i])
+            else:
+                del new_iname_to_tag[init_inames[i]]
+                del new_iname_to_tag[store_inames[i]]
+
+        new_domain = domch.domain
+        new_domain = abm.augment_domain_with_sweep(
+                    new_domain, non1_init_inames,
+                    boxify_sweep=fetch_bounding_box)
+        new_domain = abm.augment_domain_with_sweep(
+                    new_domain, non1_store_inames,
+                    boxify_sweep=fetch_bounding_box)
+        new_kernel_domains = domch.get_domains_with(new_domain)
+        del new_domain
+
+    else:
+        # leave kernel domains unchanged
+        new_kernel_domains = kernel.domains
+
+        abm = NoOpArrayToBufferMap()
+
+    # }}}
+
+    # {{{ set up temp variable
+
+    import loopy as lp
+
+    buf_var_name = var_name_gen(based_on=var_name+"_buf")
+
+    new_temporary_variables = kernel.temporary_variables.copy()
+    temp_var = lp.TemporaryVariable(
+            name=buf_var_name,
+            dtype=var_descr.dtype,
+            base_indices=(0,)*len(abm.non1_storage_shape),
+            shape=tuple(abm.non1_storage_shape),
+            is_local=temporary_is_local)
+
+    new_temporary_variables[buf_var_name] = temp_var
+
+    # }}}
+
+    new_insns = []
+
+    buf_var = var(buf_var_name)
+
+    # {{{ generate init instruction
+
+    buf_var_init = buf_var
+    if non1_init_inames:
+        buf_var_init = buf_var_init.index(
+                tuple(var(iname) for iname in non1_init_inames))
+
+    init_base = var(var_name)
+
+    init_subscript = []
+    init_iname_idx = 0
+    if var_shape:
+        for i in range(len(var_shape)):
+            ax_subscript = abm.storage_base_indices[i]
+            if abm.non1_storage_axis_flags[i]:
+                ax_subscript += var(non1_init_inames[init_iname_idx])
+                init_iname_idx += 1
+            init_subscript.append(ax_subscript)
+
+    if init_subscript:
+        init_base = init_base.index(tuple(init_subscript))
+
+    if init_expression is None:
+        init_expression = init_base
+    else:
+        init_expression = init_expression
+        init_expression = SubstitutionMapper(
+                make_subst_func({
+                    "base": init_base,
+                    }))(init_expression)
+
+    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
+    from loopy.kernel.data import ExpressionInstruction
+    init_instruction = ExpressionInstruction(id=init_insn_id,
+                assignee=buf_var_init,
+                expression=init_expression,
+                forced_iname_deps=frozenset(within_inames))
+
+    # }}}
+
+    aar = ArrayAccessReplacer(kernel, var_name, within, abm, buf_var)
+    kernel = aar.map_kernel(kernel)
+
+    # {{{ add init_insn_id to insn_deps
+
+    new_insns = []
+
+    for insn in kernel.instructions:
+        if insn.id in aar.modified_insn_ids:
+            new_insns.append(
+                    insn.copy(
+                        insn_deps=insn.insn_deps | frozenset([init_insn_id])))
+        else:
+            new_insns.append(insn)
+
+    # }}}
+
+    # {{{ generate store instruction
+
+    buf_var_store = buf_var
+    if non1_store_inames:
+        buf_var_store = buf_var_store.index(
+                tuple(var(iname) for iname in non1_store_inames))
+
+    store_subscript = []
+    store_iname_idx = 0
+    if var_shape:
+        for i in range(len(var_shape)):
+            ax_subscript = abm.storage_base_indices[i]
+            if abm.non1_storage_axis_flags[i]:
+                ax_subscript += var(non1_store_inames[store_iname_idx])
+                store_iname_idx += 1
+            store_subscript.append(ax_subscript)
+
+    store_target = var(var_name)
+    if store_subscript:
+        store_target = store_target.index(tuple(store_subscript))
+
+    if store_expression is None:
+        store_expression = buf_var_store
+    else:
+        store_expression = SubstitutionMapper(
+                make_subst_func({
+                    "base": store_target,
+                    "buffer": buf_var_store,
+                    }))(store_expression)
+
+    from loopy.kernel.data import ExpressionInstruction
+    store_instruction = ExpressionInstruction(
+                id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
+                insn_deps=frozenset(aar.modified_insn_ids),
+                assignee=store_target,
+                expression=store_expression,
+                forced_iname_deps=frozenset(within_inames))
+
+    # }}}
+
+    kernel = kernel.copy(
+            domains=new_kernel_domains,
+            instructions=new_insns + [init_instruction, store_instruction],
+            temporary_variables=new_temporary_variables)
+
+    from loopy import tag_inames
+    kernel = tag_inames(kernel, new_iname_to_tag)
+
+    return kernel
+
+# vim: foldmethod=marker
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 93be120d2cca92e9e887ffb7908816da82b2c925..fdf38b768269dfc8e86c76de9fd1dcf05e4e86e6 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -354,4 +354,13 @@ def boxify(cache_manager, domain, box_inames, context):
 
     return convexify(result)
 
+
+def simplify_via_aff(expr):
+    from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies
+    deps = get_dependencies(expr)
+    return aff_to_expr(aff_from_expr(
+        isl.Space.create_from_names(isl.Context(), list(deps)),
+        expr))
+
+
 # vim: foldmethod=marker
diff --git a/loopy/precompute.py b/loopy/precompute.py
index de5cd9e02bf1a5e5d367cc565c6c6aef9efce52c..3044b8f7dcc50907b01c9710b0e7880aac93ba4c 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 """
 
 
-import islpy as isl
 from loopy.symbolic import (get_dependencies, SubstitutionMapper,
         ExpandingIdentityMapper)
 from pymbolic.mapper.substitutor import make_subst_func
@@ -57,13 +56,7 @@ def storage_axis_exprs(storage_axis_sources, args):
     return result
 
 
-def simplify_via_aff(expr):
-    from loopy.symbolic import aff_from_expr, aff_to_expr
-    deps = get_dependencies(expr)
-    return aff_to_expr(aff_from_expr(
-        isl.Space.create_from_names(isl.Context(), list(deps)),
-        expr))
-
+# {{{ gather rule invocations
 
 class RuleInvocationGatherer(ExpandingIdentityMapper):
     def __init__(self, kernel, subst_name, subst_tag, within):
@@ -130,20 +123,20 @@ class RuleInvocationGatherer(ExpandingIdentityMapper):
 
         return 0  # exact value irrelevant
 
+# }}}
+
+
+# {{{ replace rule invocation
 
 class RuleInvocationReplacer(ExpandingIdentityMapper):
     def __init__(self, kernel, subst_name, subst_tag, within,
             access_descriptors, array_base_map,
             storage_axis_names, storage_axis_sources,
-            storage_base_indices, non1_storage_axis_names,
+            non1_storage_axis_names,
             target_var_name):
         ExpandingIdentityMapper.__init__(self,
                 kernel.substitutions, kernel.get_var_name_generator())
 
-        from loopy.symbolic import SubstitutionRuleExpander
-        self.subst_expander = SubstitutionRuleExpander(
-                kernel.substitutions, kernel.get_var_name_generator())
-
         self.kernel = kernel
         self.subst_name = subst_name
         self.subst_tag = subst_tag
@@ -154,7 +147,6 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
 
         self.storage_axis_names = storage_axis_names
         self.storage_axis_sources = storage_axis_sources
-        self.storage_base_indices = storage_base_indices
         self.non1_storage_axis_names = non1_storage_axis_names
 
         self.target_var_name = target_var_name
@@ -198,11 +190,13 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
 
         assert len(arguments) == len(rule.arguments)
 
+        abm = self.array_base_map
+
         stor_subscript = []
         for sax_name, sax_source, sax_base_idx in zip(
                 self.storage_axis_names,
                 self.storage_axis_sources,
-                self.storage_base_indices):
+                abm.storage_base_indices):
             if sax_name not in self.non1_storage_axis_names:
                 continue
 
@@ -213,6 +207,7 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
                 # an iname
                 ax_index = var(sax_source)
 
+            from loopy.isl_helpers import simplify_via_aff
             ax_index = simplify_via_aff(ax_index - sax_base_idx)
             stor_subscript.append(ax_index)
 
@@ -227,6 +222,8 @@ class RuleInvocationReplacer(ExpandingIdentityMapper):
 
         return new_outer_expr
 
+# }}}
+
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, new_storage_axis_names=None, storage_axis_to_tag={},
@@ -301,6 +298,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
             raise RuntimeError("sweep iname '%s' is not a known iname"
                     % iname)
 
+    sweep_inames = list(sweep_inames)
+    sweep_inames_set = frozenset(sweep_inames)
+
     if isinstance(storage_axes, str):
         storage_axes = storage_axes.split(",")
 
@@ -398,9 +398,6 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
-    sweep_inames = list(sweep_inames)
-    sweep_inames_set = frozenset(sweep_inames)
-
     # {{{ find inames used in arguments
 
     expanding_usage_arg_deps = set()
@@ -573,7 +570,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     invr = RuleInvocationReplacer(kernel, subst_name, subst_tag, within,
             access_descriptors, abm,
             storage_axis_names, storage_axis_sources,
-            abm.storage_base_indices, non1_storage_axis_names,
+            non1_storage_axis_names,
             target_var_name)
 
     kernel = invr.map_kernel(kernel)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 4492f032d0d355e9268dab049b55c0613b22b47b..9309fd0418e453311240cce8e9f6c73a8d428f9c 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -352,6 +352,10 @@ class ExpansionState(Record):
     :ivar arg_context: a dict representing current argument values
     """
 
+    @property
+    def insn_id(self):
+        return self.stack[0][0]
+
 
 class SubstitutionRuleRenamer(IdentityMapper):
     def __init__(self, renames):
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 33826e4063190f7d7d48b1bfcb4122931868bd1e..c68e963a60585634437d8a80b5eb1cbb1e37e2c2 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -282,17 +282,12 @@ def test_matmul(ctx_factory):
 
           do j = 1,n
             do i = 1,m
-              temp = 0
               do k = 1,l
-                temp = temp + b(k,j)*a(i,k)
+                c(i,j) = c(i,j) + b(k,j)*a(i,k)
               end do
-              c(i,j) = temp
             end do
           end do
         end subroutine
-
-        !$loopy begin transform
-        !$loopy end transform
         """
 
     from loopy.frontend.fortran import f2loopy
@@ -307,15 +302,28 @@ def test_matmul(ctx_factory):
     knl = lp.split_iname(knl, "j", 8,
             outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", 32)
+    knl = lp.assume(knl, "n mod 32 = 0")
+    knl = lp.assume(knl, "m mod 32 = 0")
+    knl = lp.assume(knl, "l mod 16 = 0")
 
     knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
     knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
     knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
     knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")
 
-    ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10))
+    # FIXME: also test
+    # knl = lp.buffer_write(knl, "c", (), init_expression="0",
+    #         store_expression="base+buffer")
+    knl = lp.buffer_write(knl, "c", "i_inner,j_inner", init_expression="0",
+            store_expression="base+buffer", within_inames="i_outer,j_outer")
+
+    #ctx = ctx_factory()
+    #lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5, m=7, l=10))
 
+    knl = lp.preprocess_kernel(knl)
+    for k in lp.generate_loop_schedules(knl):
+        code, _ = lp.generate_code(k)
+        print(code)
 
 if __name__ == "__main__":
     if len(sys.argv) > 1: