diff --git a/loopy/__init__.py b/loopy/__init__.py
index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..c74c56768d8c803d9c6da750905ce3f9a5f81488 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -86,7 +86,8 @@ from loopy.transform.instruction import (
         remove_instructions,
         replace_instruction_ids,
         tag_instructions,
-        add_nosync)
+        add_nosync,
+        impose_only_read_after_write_deps)
 
 from loopy.transform.data import (
         add_prefetch, change_arg_to_image,
@@ -95,7 +96,8 @@ from loopy.transform.data import (
         remove_unused_arguments,
         alias_temporaries, set_argument_order,
         rename_argument,
-        set_temporary_scope)
+        set_temporary_scope, squeeze_axes_in_temporaries,
+        remove_axis)
 
 from loopy.transform.subst import (extract_subst,
         assignment_to_subst, expand_subst, find_rules_matching,
@@ -203,6 +205,8 @@ __all__ = [
         "remove_unused_arguments",
         "alias_temporaries", "set_argument_order",
         "rename_argument", "set_temporary_scope",
+        "squeeze_axes_in_temporaries",
+        "remove_axis",
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
@@ -210,6 +214,7 @@ __all__ = [
         "replace_instruction_ids",
         "tag_instructions",
         "add_nosync",
+        "impose_only_read_after_write_deps",
 
         "extract_subst", "expand_subst", "assignment_to_subst",
         "find_rules_matching", "find_one_rule_matching",
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index a6a2d7b4fe4ba94caa8cbe112a5cf90719ceb643..02456d64d18a12af7c2200c3d711734d79c681ae 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -30,6 +30,8 @@ from islpy import dim_type
 from loopy.kernel.data import ImageArg
 
 from pytools import MovedFunctionDeprecationWrapper
+from loopy.symbolic import (IdentityMapper, RuleAwareIdentityMapper,
+        SubstitutionRuleMappingContext)
 
 
 # {{{ convenience: add_prefetch
@@ -767,4 +769,116 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No
 # }}}
 
 
+# {{{ remove unused axes in temporaries
+
+class AxesSqueezer(IdentityMapper):
+    """Drops the (zero-valued) indices of the squeezed axes from subscripts.
+    *tv_to_removable_axes* maps a variable name to its removable axis numbers."""
+    def __init__(self, tv_to_removable_axes):
+        self.tv_to_removable_axes = tv_to_removable_axes
+
+    def map_subscript(self, expr):
+        removable = self.tv_to_removable_axes.get(expr.aggregate.name)
+        if not removable:
+            return super(AxesSqueezer, self).map_subscript(expr)
+
+        assert all(expr.index_tuple[axis] == 0 for axis in removable)
+        kept = tuple(self.rec(idx) for i, idx in enumerate(expr.index_tuple)
+                if i not in removable)
+
+        # no axes left: refer to the scalar directly, not with an empty subscript
+        return type(expr)(expr.aggregate, kept) if kept else expr.aggregate
+
+
+def squeeze_axes_in_temporaries(kernel):
+    """
+    Returns a copy of *kernel* in which every axis of length 1 has been
+    dropped from the shapes of all temporary variables and from the
+    subscripts in the instructions that refer to them. Useful as a
+    preparation for :func:`loopy.assignment_to_subst` when a temporary is
+    only ever referenced as ``var_name[0, i, j]``.
+
+    .. note::
+
+        For example, a temporary ``A`` of shape ``(1, 1, 3, 6)`` that is
+        accessed as ``A[0, 0, i0, i1]`` ends up with shape ``(3, 6)`` and is
+        accessed as ``A[i0, i1]``.
+    """
+    squeezed_temps = {}
+    name_to_unit_axes = {}
+    for temp in kernel.temporary_variables.values():
+        unit_axes = tuple(
+                iaxis for iaxis, length in enumerate(temp.shape) if length == 1)
+        if not unit_axes:
+            squeezed_temps[temp.name] = temp
+            continue
+
+        name_to_unit_axes[temp.name] = unit_axes
+        squeezed_temps[temp.name] = temp.copy(
+                shape=tuple(length for length in temp.shape if length != 1),
+                dim_tags=None)
+
+    squeezer = AxesSqueezer(name_to_unit_axes)
+    return kernel.copy(
+            instructions=[insn.with_transformed_expressions(squeezer)
+                for insn in kernel.instructions],
+            temporary_variables=squeezed_temps)
+
+# }}}
+
+
+# {{{ remove axis
+
+class AxisRemover(RuleAwareIdentityMapper):
+    """Removes index number *axis_num* from each subscript of *var_name*."""
+    def __init__(self, rule_mapping_context, var_name, axis_num):
+        self.var_name = var_name
+        self.axis_num = axis_num
+        super(AxisRemover, self).__init__(rule_mapping_context)
+
+    def map_subscript(self, expr, expn_state):
+        from pymbolic.primitives import Variable, Subscript
+        if expr.aggregate.name != self.var_name:
+            return super(AxisRemover, self).map_subscript(expr, expn_state)
+        if len(expr.index_tuple) == 1:
+            return Variable(self.var_name)  # only index removed: now a scalar
+        kept = expr.index_tuple[:self.axis_num] + expr.index_tuple[self.axis_num+1:]
+        # recurse so that references/rule calls nested in kept indices are mapped
+        return Subscript(expr.aggregate,
+                tuple(self.rec(idx, expn_state) for idx in kept))
+
+
+def remove_axis(kernel, var_name, axis_num):
+    """
+    Returns a copy of *kernel* with axis *axis_num* (zero-based) of the temporary
+    *var_name* removed from its shape and from all subscripts referring to it.
+
+    One might interpret this operation as the inverse of privatization.
+
+    :raises LoopyError: if *var_name* has no such (non-negative) axis.
+    """
+    from loopy import auto
+    from loopy.diagnostic import LoopyError
+
+    tv = kernel.temporary_variables.get(var_name)
+    if tv is None or not 0 <= axis_num < len(tv.shape):
+        raise LoopyError("temporary '%s' has no axis %d" % (var_name, axis_num))
+
+    rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions,
+            kernel.get_var_name_generator())
+    mapper = AxisRemover(rule_mapping_context, var_name, axis_num)
+    # finish_kernel installs the rewritten substitution rules into the kernel
+    kernel = rule_mapping_context.finish_kernel(mapper.map_kernel(kernel))
+    if len(tv.shape) == 1:
+        new_tv = tv.copy(shape=(), dim_tags=None)
+    else:
+        new_tv = tv.copy(strides=auto, dim_tags=None,
+                shape=tv.shape[:axis_num] + tv.shape[axis_num+1:])
+    new_temps = dict(kernel.temporary_variables)
+    new_temps[var_name] = new_tv
+    return kernel.copy(temporary_variables=new_temps)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..cb5c903a611d7a0a2701a7323d1336dee57ea605 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -73,28 +73,16 @@ def set_instruction_priority(kernel, insn_match, priority):
 # }}}
 
 
-# {{{ add_dependency
-
-def add_dependency(kernel, insn_match, depends_on):
-    """Add the instruction dependency *dependency* to the instructions matched
-    by *insn_match*.
-
-    *insn_match* and *depends_on* may be any instruction id match understood by
-    :func:`loopy.match.parse_match`.
-
-    .. versionchanged:: 2016.3
-
-        Third argument renamed to *depends_on* for clarity, allowed to
-        be not just ID but also match expression.
-    """
+# {{{ add/remove_dependency
 
+def _add_or_remove_dependency(kernel, insn_match, depends_on, adds):
     if isinstance(depends_on, str) and depends_on in kernel.id_to_insn:
-        added_deps = frozenset([depends_on])
+        dep_ids = frozenset([depends_on])
     else:
-        added_deps = frozenset(
+        dep_ids = frozenset(
                 dep.id for dep in find_instructions(kernel, depends_on))
 
-    if not added_deps:
+    if not dep_ids:  # depends_on is left untouched for the error message below
         raise LoopyError("no instructions found matching '%s' "
                 "(to add as dependencies)" % depends_on)
 
@@ -104,13 +92,26 @@ def add_dependency(kernel, insn_match, depends_on):
         new_deps = insn.depends_on
         matched[0] = True
         if new_deps is None:
-            new_deps = added_deps
+            new_deps = dep_ids
         else:
-            new_deps = new_deps | added_deps
+            new_deps = new_deps | dep_ids
 
         return insn.copy(depends_on=new_deps)
 
-    result = map_instructions(kernel, insn_match, add_dep)
+    def remove_dep(insn):
+        new_deps = insn.depends_on
+        matched[0] = True
+        if new_deps is None:
+            new_deps = None
+        else:
+            new_deps = new_deps - dep_ids
+
+        return insn.copy(depends_on=new_deps)
+
+    if adds:
+        result = map_instructions(kernel, insn_match, add_dep)
+    else:
+        result = map_instructions(kernel, insn_match, remove_dep)
 
     if not matched[0]:
         raise LoopyError("no instructions found matching '%s' "
@@ -118,6 +119,31 @@ def add_dependency(kernel, insn_match, depends_on):
 
     return result
 
+
+def add_dependency(kernel, insn_match, depends_on):
+    """Make every instruction matched by *insn_match* additionally depend on
+    the instructions matched by *depends_on*.
+
+    Both *insn_match* and *depends_on* accept any instruction id match that
+    :func:`loopy.match.parse_match` understands.
+
+    .. versionchanged:: 2016.3
+
+        The third argument was renamed to *depends_on* for clarity and may
+        now be a match expression rather than only an instruction ID.
+    """
+    return _add_or_remove_dependency(kernel, insn_match, depends_on, adds=True)
+
+
+def remove_dependency(kernel, insn_match, depends_on):
+    """Remove the instructions matched by *depends_on* from the dependencies
+    of the instructions matched by *insn_match*.
+
+    *insn_match* and *depends_on* may be any instruction id match understood by
+    :func:`loopy.match.parse_match`.
+    """
+    return _add_or_remove_dependency(kernel, insn_match, depends_on, adds=False)
+
 # }}}
 
 
@@ -357,4 +383,33 @@ def uniquify_instruction_ids(kernel):
 # }}}
 
 
+# {{{ impose_only_read_after_write_deps
+
+def impose_only_read_after_write_deps(kernel):
+    """
+    Returns a copy of *kernel* in which each instruction depends on exactly
+    those of its former direct or indirect dependencies that write to a
+    (non-iname) variable which the instruction reads.
+    """
+    from loopy.kernel.tools import find_recursive_dependencies
+
+    all_inames = kernel.all_inames()
+
+    def is_read_after_write(reader, writer_id):
+        written = kernel.id_to_insn[writer_id].write_dependency_names()
+        return bool(reader.read_dependency_names() & (written - all_inames))
+
+    new_insns = []
+
+    for insn in kernel.instructions:
+        # all direct or indirect dependencies of insn, without insn itself
+        all_deps = find_recursive_dependencies(kernel, [insn.id]) - {insn.id}
+        raw_deps = frozenset(
+                dep_id for dep_id in all_deps if is_read_after_write(insn, dep_id))
+        new_insns.append(insn.copy(depends_on=raw_deps))
+
+    return kernel.copy(instructions=new_insns)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/test/test_transform.py b/test/test_transform.py
index cdc0c14b8bacc4fe5279d000461c0ea2244af021..da0630ca6a3ea03f69d7d49cbaa98976d9ad12fb 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -570,6 +570,72 @@ def test_nested_substs_in_insns(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
+def test_impose_only_raw_deps():
+    # only insn_2 reads something (a) that another instruction (insn_0) writes
+    knl = lp.impose_only_read_after_write_deps(lp.make_kernel(
+            "{[i]: 0<=i<10}",
+            """
+            a[i]  = 2*b[i] {id=insn_0}
+            c[i] = 2*d[i] {id=insn_1}
+            e[i] = 2*a[i] {id=insn_2}
+            """, seq_dependencies=True))
+    assert knl.id_to_insn['insn_2'].depends_on == frozenset(['insn_0'])
+    assert knl.id_to_insn['insn_1'].depends_on == frozenset()
+
+
+def test_squeeze_axes_in_temps(ctx_factory):
+    knl = lp.make_kernel(
+            "{[n, i, j]: 0<=i, j<32 and 0<=n<100}",
+            """
+            # unnecessary temps which might exacerbate register pressure
+            <> temp_1[0, i] = 2*x[n, i]**2
+            <> temp_2[0, i] = x[n, i]**2
+            <> temp_3[0, i] = 12*x[n, i]**2
+            <> temp_4[0, i] = 0.2*x[n, i]**2
+            y[n, j] = temp_1[0, j]+2*temp_2[0, j]+11*temp_3[0, j]+2*temp_4[0, j]
+            """, [lp.GlobalArg('x, y', shape=(100, 32), dtype=float), '...'],
+            seq_dependencies=True)
+
+    ref_knl = knl.copy()
+
+    # squeeze away the axis that is always indexed with 0, then inline the temps
+    knl = lp.squeeze_axes_in_temporaries(knl)
+    for temp_name in ['temp_1', 'temp_2', 'temp_3', 'temp_4']:
+        knl = lp.assignment_to_subst(knl, temp_name)
+    lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl)
+
+
+def test_remove_axis(ctx_factory):
+    knl = lp.make_kernel(
+            "{[n, i1, i2, j, k]: 0<=n<100 and 0<=i1, i2<10 and  0<=j, k<6}",
+            """
+            # gather
+            tmp[i1, j] = x[n, i1, j]
+            # scatter
+            y[n, i2, k] = tmp[i2, k]
+            """,
+            [
+                lp.GlobalArg('x, y', shape=(100, 10, 6), dtype=float),
+                lp.TemporaryVariable('tmp', shape=(10, 6,), dtype=float,
+                    address_space=lp.AddressSpace.PRIVATE)
+            ],
+            seq_dependencies=True
+            )
+
+    knl = lp.tag_inames(knl, "n:g.0, j:l.0, k:l.0")
+
+    # the reference keeps the full-size private temporary
+    ref_knl = lp.set_options(knl.copy(), 'write_cl')
+    assert ref_knl.temporary_variables['tmp'].shape == (10, 6)
+
+    # axis 1 of tmp is only indexed by the local-tagged inames j and k, so it
+    # need not be stored in private memory
+    knl = lp.remove_axis(knl, 'tmp', 1)
+    assert knl.temporary_variables['tmp'].shape == (10,)
+
+    lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])