diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..e251ef42edbed166614e320e1b179f1adb5ec252 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import six  # noqa
 
 from loopy.diagnostic import LoopyError
+from loopy.symbolic import CombineMapper
 
 
 # {{{ find_instructions
@@ -357,4 +358,132 @@ def uniquify_instruction_ids(kernel):
 # }}}
 
 
+# {{{ remove_work
+
+class _MemAccessGatherer(CombineMapper):
+    def __init__(self, kernel, address_space):
+        self.kernel = kernel
+        self.address_space = address_space
+
+    def combine(self, values):
+        from pytools import flatten
+        return set(flatten(values))
+
+    def map_constant(self, expr):
+        return set()
+
+    def map_algebraic_leaf(self, expr):
+        return set()
+
+    def _map_access(self, expr, name, index):
+        if name in self.kernel.all_inames():
+            return set()
+
+        descr = self.kernel.get_var_descriptor(name)
+        if descr.address_space == self.address_space:
+            result = set([expr])
+        else:
+            result = set()
+
+        return result | self.rec(index)
+
+    def map_variable(self, expr):
+        return self._map_access(expr, expr.name, ())
+
+    def map_subscript(self, expr):
+        import pymbolic.primitives as p
+        assert isinstance(expr.aggregate, p.Variable)
+        return self._map_access(expr, expr.aggregate.name, expr.index)
+
+
+def remove_work(kernel):
+    """This transform removes operations in a kernel, leaving only
+    accesses to global memory.
+
+    .. note::
+
+        This routine will currently not work correctly in the presence of
+        data-dependent flow control or memory access.
+    """
+    import loopy as lp
+    import pymbolic.primitives as p
+
+    kernel = lp.preprocess_kernel(kernel)
+
+    gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL)
+
+    from loopy.kernel.instruction import MultiAssignmentBase, make_assignment
+
+    # maps each old ID to a frozenset of new IDs
+    old_to_new_ids = {}
+    new_instructions = []
+    insn_id_gen = kernel.get_instruction_id_generator()
+
+    var_name_gen = kernel.get_var_name_generator()
+    private_var_name = var_name_gen()
+    new_temporary_variables = kernel.temporary_variables.copy()
+    new_temporary_variables[private_var_name] = lp.TemporaryVariable(
+            private_var_name, address_space=lp.AddressSpace.PRIVATE)
+
+    # {{{ rewrite instructions
+
+    for insn in kernel.instructions:
+        if not isinstance(insn, MultiAssignmentBase):
+            new_instructions.append(insn)
+            old_to_new_ids[insn.id] = frozenset([insn.id])
+            continue
+
+        writer_accesses = set.union(*[
+            gatherer(lhs) for lhs in insn.assignees])
+
+        reader_accesses = gatherer(insn.expression)
+
+        new_insn_ids = set()
+        for read_expr in reader_accesses:
+            new_id = insn_id_gen(insn.id)
+            new_instructions.append(
+                    make_assignment(
+                        (p.Variable(private_var_name),),
+                        p.Variable(private_var_name) + read_expr,
+                        id=new_id,
+                        within_inames=insn.within_inames,
+                        depends_on=insn.depends_on))
+            new_insn_ids.add(new_id)
+
+        for write_expr in writer_accesses:
+            new_id = insn_id_gen(insn.id)
+            new_instructions.append(
+                    make_assignment(
+                        (write_expr,),
+                        17,
+                        id=new_id,
+                        within_inames=insn.within_inames,
+                        depends_on=insn.depends_on))
+            new_insn_ids.add(new_id)
+
+        old_to_new_ids[insn.id] = frozenset(new_insn_ids)
+
+    # }}}
+
+    # {{{ rewrite dependencies for new IDs
+
+    new_instructions_2 = []
+
+    for insn in new_instructions:
+        new_instructions_2.append(
+                insn.copy(
+                    depends_on=frozenset(
+                        subdep
+                        for dep in insn.depends_on
+                        for subdep in old_to_new_ids[dep])))
+
+    # }}}
+
+    return kernel.copy(
+            state=lp.KernelState.INITIAL,
+            instructions=new_instructions_2,
+            temporary_variables=new_temporary_variables)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/test/test_transform.py b/test/test_transform.py
index ed184fb50c099d5fb2a6a0941d2f2c22c3b757bc..ed00ebd1bbd49d91676d53192f0434ddfd97ed4d 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -533,6 +533,38 @@ def test_uniquify_instruction_ids():
     assert all(isinstance(id, str) for id in insn_ids)
 
 
+def test_remove_work(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<= i,j < n}",
+            [
+                "a_offset(ii, jj) := a[ii+2, jj+2]",
+                "z[i,j] = -2*a_offset(i,j)"
+                " + a_offset(i,j-1)"
+                " + a_offset(i,j+1)"
+                " + a_offset(i-1,j)"
+                " + a_offset(i+1,j)"
+
+                " + a_offset(i,j-2)"
+                " + a_offset(i,j+2)"
+                " + a_offset(i-2,j)"
+                " + a_offset(i+2,j)"
+                ],
+            assumptions="n>=1")
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
+            fetch_bounding_box=True, default_tag="l.auto")
+    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+
+    from loopy.transform.instruction import remove_work
+    knl = remove_work(knl)
+
+    lp.auto_test_vs_ref(None, ctx, knl, print_ref_code=False)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])