diff --git a/.gitignore b/.gitignore
index c5865116226b2cdc15ed4978548416090265e115..7ae71dc52e28413d3da125b4e5e9a64c2c970a19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ core
 .coverage
 htmlcov
 .ipynb_checkpoints
+lextab.py
+yacctab.py
diff --git a/bin/loopy b/bin/loopy
index c8cbea1c615c396d3903df24472822cc918d3461..b47127740b4d016d823de3874fd7398c031d8160 100644
--- a/bin/loopy
+++ b/bin/loopy
@@ -186,8 +186,8 @@ def main():
         code, impl_arg_info = generate_code(kernel)
         codes.append(code)
 
-    if args.outfile:
-        outfile, = args.outfile
+    if args.outfile is not None:
+        outfile = args.outfile
     else:
         outfile = "-"
 
diff --git a/doc/reference.rst b/doc/reference.rst
index e79f17554119c5efe9351f46b8c501c2c27d2387..52cd92bb8812565e71ba4cedf1826bf64633f247 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -430,6 +430,8 @@ Influencing data access
 
 .. autofunction:: tag_data_axes
 
+.. autofunction:: remove_unused_arguments
+
 Padding
 ^^^^^^^
 
@@ -537,6 +539,8 @@ Obtaining Kernel Statistics
 
 .. autofunction:: get_gmem_access_poly
 
+.. autofunction:: sum_mem_access_to_bytes
+
 .. autofunction:: get_barrier_poly
 
 .. vim: tw=75:spell
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index c1ce360c88b962ef438809500413d747a3cd4b43..0df707d0db82fcf42b60e8405d1f0d189fb31354 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1233,7 +1233,7 @@ the :class:`loopy.LoopKernel` *inames*). We'll print this map now:
 
 .. doctest::
 
-    >>> print(op_map)
+    >>> print(lp.stringify_stats_mapping(op_map))
     float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
     float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
     int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
@@ -1244,9 +1244,9 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 .. doctest::
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict)
-    >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict)
-    >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict)
+    >>> i32ops = op_map[np.dtype(np.int32)].eval_with_dict(param_dict)
+    >>> f32ops = op_map[np.dtype(np.float32)].eval_with_dict(param_dict)
+    >>> f64ops = op_map[np.dtype(np.float64)].eval_with_dict(param_dict)
     >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" %
     ...     (i32ops, f32ops, f64ops))
     integer ops: 65536
@@ -1264,7 +1264,7 @@ continue using the kernel from the previous example:
 
     >>> from loopy.statistics import get_gmem_access_poly
     >>> load_store_map = get_gmem_access_poly(knl)
-    >>> print(load_store_map)
+    >>> print(lp.stringify_stats_mapping(load_store_map))
     (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
     (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
     (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
@@ -1295,13 +1295,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "uniform", "load")
+    >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map.dict[(np.dtype(np.float64), "uniform", "store")
+    >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "uniform", "load")
+    >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map.dict[(np.dtype(np.float32), "uniform", "store")
+    >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
     ...     ].eval_with_dict(param_dict)
     >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
     ...     (f32ld, f32st, f64ld, f64st))
@@ -1322,8 +1322,8 @@ this time, so we'll print the mapping manually to make it more legible:
 
     >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
     >>> load_store_map = get_gmem_access_poly(knl_consec)
-    >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
+    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
+    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
     (dtype('float32'), 'consecutive', 'load') :
     [n, m, l] -> { ... }
     <BLANKLINE>
@@ -1345,13 +1345,13 @@ accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "consecutive", "load")
+    >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map.dict[(np.dtype(np.float64), "consecutive", "store")
+    >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store")
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "consecutive", "load")
+    >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map.dict[(np.dtype(np.float32), "consecutive", "store")
+    >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store")
     ...     ].eval_with_dict(param_dict)
     >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
     ...     (f32ld, f32st, f64ld, f64st))
@@ -1369,8 +1369,8 @@ our parallelization of the kernel:
 
     >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
     >>> load_store_map = get_gmem_access_poly(knl_nonconsec)
-    >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map.dict[key]))
+    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
+    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
     (dtype('float32'), 'nonconsecutive', 'load') :
     [n, m, l] -> { ... }
     <BLANKLINE>
@@ -1390,16 +1390,16 @@ elements in memory. The total number of array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld = load_store_map.dict[
+    >>> f64ld = load_store_map[
     ...     (np.dtype(np.float64), "nonconsecutive", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map.dict[
+    >>> f64st = load_store_map[
     ...     (np.dtype(np.float64), "nonconsecutive", "store")
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map.dict[
+    >>> f32ld = load_store_map[
     ...     (np.dtype(np.float32), "nonconsecutive", "load")
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map.dict[
+    >>> f32st = load_store_map[
     ...     (np.dtype(np.float32), "nonconsecutive", "store")
     ...     ].eval_with_dict(param_dict)
     >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8956856d4735a9554ed8c34741790ef1286d9e54..10fac9cfce3eb5a9a1bb6686fabf453ec6b83a8d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -64,7 +64,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, get_gmem_access_poly,
-        get_DRAM_access_poly, get_barrier_poly)
+        get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
+        sum_mem_access_to_bytes)
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -105,7 +106,7 @@ __all__ = [
         "generate_code", "generate_body",
 
         "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly",
-        "get_barrier_poly",
+        "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
 
         "CompiledKernel",
 
@@ -2089,4 +2090,22 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"):
 
 # }}}
 
+
+# {{{ remove_unused_arguments
+
+def remove_unused_arguments(knl):
+    new_args = []
+
+    refd_vars = set(knl.all_params())
+    for insn in knl.instructions:
+        refd_vars.update(insn.dependency_names())
+
+    for arg in knl.args:
+        if arg.name in refd_vars:
+            new_args.append(arg)
+
+    return knl.copy(args=new_args)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/check.py b/loopy/check.py
index 3401f7b8d86eb8e89d0bdb0629b5e85d57fff771..01a6e52c268a5c4e76f94653490d05c345e1a7d0 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -373,6 +373,8 @@ def pre_schedule_checks(kernel):
         check_write_destinations(kernel)
 
         logger.info("pre-schedule check %s: done" % kernel.name)
+    except KeyboardInterrupt:
+        raise
     except:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
diff --git a/loopy/diff.py b/loopy/diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..d72465e6fd379eee5b36043f20566e6575e902b4
--- /dev/null
+++ b/loopy/diff.py
@@ -0,0 +1,397 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import islpy as isl
+
+from pymbolic.mapper.differentiator import DifferentiationMapper
+
+import pymbolic.primitives as p
+var = p.Variable
+
+import loopy as lp
+from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
+from loopy.isl_helpers import make_slab
+from loopy.diagnostic import LoopyError
+
+
+# {{{ diff mapper
+
+def func_map(i, func, args):
+    if func.name == "exp":
+        return var("exp")(*args)
+    elif func.name == "log":
+        return 1/args[0]
+
+    elif func.name == "sin":
+        return var("cos")(*args)
+    elif func.name == "cos":
+        return -var("sin")(*args)
+    elif func.name == "tan":
+        return 1+var("tan")(*args)**2
+
+    elif func.name == "sinh":
+        return var("cosh")(*args)
+    elif func.name == "cosh":
+        return var("sinh")(*args)
+    elif func.name == "tanh":
+        return (1 - var("tanh")(*args))**2
+
+    else:
+        raise NotImplementedError("derivative of '%s'" % func.name)
+
+
+class LoopyDiffMapper(DifferentiationMapper, RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, diff_context, diff_inames):
+        RuleAwareIdentityMapper.__init__(self, rule_mapping_context)
+        self.diff_context = diff_context
+        self.diff_inames = diff_inames
+        self.diff_iname_exprs = tuple(var(diname) for diname in diff_inames)
+        self.function_map = func_map
+
+    def rec_undiff(self, expr, *args):
+        dc = self.diff_context
+
+        from loopy.symbolic import get_dependencies
+        deps = get_dependencies(expr)
+        for dep in deps:
+            assert isinstance(dep, str)
+            if (dep in dc.kernel.arg_dict
+                    or dep in dc.kernel.temporary_variables):
+                dc.import_output_var(dep)
+
+        return expr
+
+    def map_variable(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.name == dc.by_name:
+            assert len(self.diff_inames) == 0
+            return 1
+
+        elif (expr.name in dc.kernel.all_inames()
+                or expr.name in dc.kernel.all_params()):
+            return expr
+
+        else:
+            dvar_dby = dc.get_diff_var(expr.name)
+            if dvar_dby is None:
+                return 0
+
+            if self.diff_inames:
+                return var(dvar_dby)[self.diff_iname_exprs]
+            else:
+                return var(dvar_dby)
+
+    map_tagged_variable = map_variable
+
+    def map_subscript(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.aggregate.name == dc.by_name:
+            index = expr.index
+            if not isinstance(expr.index, tuple):
+                index = (expr.index,)
+
+            assert len(self.diff_inames) == len(index)
+
+            conds = [
+                p.Comparison(var(ti), "==", ei)
+                for ti, ei in zip(self.diff_inames, index)
+                ]
+
+            if len(conds) == 1:
+                and_conds, = conds
+            elif len(conds) > 1:
+                and_conds = p.LogicalAnd(tuple(conds))
+            else:
+                assert False
+
+            return p.If(and_conds, 1, 0)
+
+        else:
+            dvar_dby = dc.get_diff_var(expr.aggregate.name)
+            if dvar_dby is None:
+                return 0
+
+            idx = expr.index
+            if not isinstance(idx, tuple):
+                idx = (idx,)
+
+            return type(expr)(
+                    var(dvar_dby),
+                    expr.index + self.diff_inames)
+
+    def map_call(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.function.name in dc.kernel.substitutions:
+            # FIXME: Deal with subsitution rules
+            # Need to use chain rule here, too.
+            raise NotImplementedError("substitution rules in differentiation")
+        else:
+            return DifferentiationMapper.map_call(self, expr, *args)
+
+# }}}
+
+
+# {{{ diff context
+
+class DifferentiationContext(object):
+    def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix,
+            additional_shape):
+        self.kernel = kernel
+        self.by_name = by_name
+        self.diff_iname_prefix = diff_iname_prefix
+        self.additional_shape = additional_shape
+
+        self.imported_outputs = set()
+        self.output_to_diff_output = {}
+
+        self.generate_instruction_id = self.kernel.get_instruction_id_generator()
+
+        self.new_args = []
+        self.new_temporary_variables = {}
+        self.new_instructions = []
+        self.imported_instructions = set()
+        self.new_domains = []
+
+        self.rule_mapping_context = SubstitutionRuleMappingContext(
+                kernel.substitutions, var_name_gen)
+
+    def get_new_kernel(self):
+        knl = self.kernel
+
+        new_args = knl.args + self.new_args
+        new_temp_vars = knl.temporary_variables.copy()
+        new_temp_vars.update(self.new_temporary_variables)
+
+        knl = knl.copy(
+                args=new_args,
+                temporary_variables=new_temp_vars,
+                instructions=self.new_instructions,
+                domains=knl.domains + self.new_domains)
+
+        del new_args
+        del new_temp_vars
+
+        knl = self.rule_mapping_context.finish_kernel(knl)
+
+        return knl
+
+    # {{{ kernel gen entrypoints
+
+    def add_diff_inames(self):
+        diff_inames = tuple(
+            self.rule_mapping_context.make_unique_var_name(
+                self.diff_iname_prefix+str(i))
+            for i in range(len(self.additional_shape)))
+
+        diff_parameters = set()
+        from loopy.symbolic import get_dependencies
+        for s in self.additional_shape:
+            diff_parameters.update(get_dependencies(s))
+
+        diff_domain = isl.BasicSet(
+                "[%s] -> {[%s]}"
+                % (", ".join(diff_parameters), ", ".join(diff_inames)))
+
+        for i, diff_iname in enumerate(diff_inames):
+            diff_domain = diff_domain & make_slab(
+                diff_domain.space, diff_iname, 0, self.additional_shape[i])
+
+        self.new_domains.append(diff_domain)
+
+        return diff_inames
+
+    # }}}
+
+    def import_instruction_and_deps(self, insn_id):
+        if insn_id in self.imported_instructions:
+            return
+
+        insn = self.kernel.id_to_insn[insn_id]
+        self.new_instructions.append(insn)
+        self.imported_instructions.add(insn_id)
+
+        id_map = RuleAwareIdentityMapper(self.rule_mapping_context)
+
+        if isinstance(insn, lp.ExpressionInstruction):
+            id_map(insn.expression, self.kernel, insn)
+        else:
+            raise RuntimeError("do not know how to deal with "
+                    "instruction of type %s" % type(insn))
+
+        for dep in insn.insn_deps:
+            self.import_instruction_and_deps(dep)
+
+    def import_output_var(self, var_name):
+        writers = self.kernel.writer_map().get(var_name, [])
+
+        if len(writers) > 1:
+            raise LoopyError("%s is written in more than one place"
+                    % var_name)
+
+        if not writers:
+            return
+
+        insn_id, = writers
+        self.import_instruction_and_deps(insn_id)
+
+    def get_diff_var(self, var_name):
+        """
+        :return: a string containing the name of a new variable
+            holding the derivative of *var_name* by the desired
+            *diff_context.by_name*, or *None* if no dependency exists.
+        """
+        new_var_name = self.rule_mapping_context.make_unique_var_name(
+                var_name + "_d" + self.by_name)
+
+        writers = self.kernel.writer_map().get(var_name, [])
+
+        if not writers:
+            # FIXME: There should be hooks to supply earlier dvar_dby
+            # This would be the spot to think about them.
+            return None
+
+        if len(writers) > 1:
+            raise LoopyError("%s is written in more than one place"
+                    % var_name)
+
+        orig_writer_id, = writers
+        orig_writer_insn = self.kernel.id_to_insn[orig_writer_id]
+
+        diff_inames = self.add_diff_inames()
+        diff_iname_exprs = tuple(var(diname) for diname in diff_inames)
+
+        # {{{ write code
+
+        diff_mapper = LoopyDiffMapper(self.rule_mapping_context, self,
+                diff_inames)
+
+        diff_expr = diff_mapper(orig_writer_insn.expression,
+                self.kernel, orig_writer_insn)
+
+        if not diff_expr:
+            return None
+
+        (_, lhs_ind), = orig_writer_insn.assignees_and_indices()
+        new_insn_id = self.generate_instruction_id()
+        insn = lp.ExpressionInstruction(
+                id=new_insn_id,
+                assignee=var(new_var_name)[
+                    lhs_ind + diff_iname_exprs],
+                expression=diff_expr)
+
+        self.new_instructions.append(insn)
+
+        # }}}
+
+        # {{{ manage variable declaration
+
+        if var_name in self.kernel.arg_dict:
+            arg = self.kernel.arg_dict[var_name]
+            orig_shape = arg.shape
+
+        elif var_name in self.kernel.temporary_variables:
+            tv = self.kernel.temporary_variables[var_name]
+            orig_shape = tv.shape
+
+        else:
+            raise ValueError("%s: variable not found" % var_name)
+
+        shape = orig_shape + self.additional_shape
+        dim_tags = ("c",) * len(shape)
+
+        if var_name in self.kernel.arg_dict:
+            self.new_args.append(
+                lp.GlobalArg(
+                    new_var_name,
+                    arg.dtype,
+                    shape=shape,
+                    dim_tags=dim_tags,
+                ))
+
+        elif var_name in self.kernel.temporary_variables:
+            self.new_temporary_variables[new_var_name] = lp.TemporaryVariable(
+                    new_var_name,
+                    tv.dtype,
+                    shape=shape,
+                    dim_tags=dim_tags)
+
+        # }}}
+
+        return new_var_name
+
+    # }}}
+
+# }}}
+
+
+# {{{ entrypoint
+
+def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i",
+        batch_axes_in_by=frozenset(), copy_outputs=set()):
+    """
+
+    :arg batch_axes_in_by: a :class:`set` of axis indices in the variable named *by*
+        that are not part of the differentiation.
+    :return: a string containing the name of a new variable
+        holding the derivative of *var_name* by the desired
+        *diff_context.by_name*, or *None* if no dependency exists.
+    """
+
+    from loopy.preprocess import add_default_dependencies
+    knl = add_default_dependencies(knl)
+
+    if isinstance(diff_outputs, str):
+        diff_outputs = [
+                dout.strip() for dout in diff_outputs.split(",")
+                if dout.strip()]
+
+    by_arg = knl.arg_dict[by]
+    additional_shape = by_arg.shape
+
+    var_name_gen = knl.get_var_name_generator()
+
+    # {{{ differentiate instructions
+
+    diff_context = DifferentiationContext(
+            knl, var_name_gen, by, diff_iname_prefix=diff_iname_prefix,
+            additional_shape=additional_shape)
+
+    result = {}
+    for dout in diff_outputs:
+        result = diff_context.get_diff_var(dout)
+
+    for cout in copy_outputs:
+        diff_context.import_output_var(cout)
+
+    # }}}
+
+    return diff_context.get_new_kernel(), result
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 4e31db993517fc4426766a6e881fb7f56853915c..c5c0baec6d07fc3daf3adfa5a8e130d712575963 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -345,6 +345,11 @@ class LoopKernel(RecordWithoutPickling):
     def get_var_name_generator(self):
         return _UniqueVarNameGenerator(self.all_variable_names())
 
+    def get_instruction_id_generator(self, based_on="insn"):
+        used_ids = set(insn.id for insn in self.instructions)
+
+        return UniqueNameGenerator(used_ids)
+
     def make_unique_instruction_id(self, insns=None, based_on="insn",
             extra_used_ids=set()):
         if insns is None:
diff --git a/loopy/statistics.py b/loopy/statistics.py
index a6e7c33233728f6df01303ea683916bc1a335828..429a6a2116ab111f6cda85a95afe3255d73014bd 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -76,16 +76,17 @@ class ToCountMap:
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
-    def __str__(self):
-        result = ""
-        for key in sorted(self.dict.keys(), key=lambda k: str(k)):
-            result += ("%s : %s\n" % (key, self.dict[key]))
-        return result
-
     def __repr__(self):
         return repr(self.dict)
 
 
+def stringify_stats_mapping(m):
+    result = ""
+    for key in sorted(m.keys(), key=lambda k: str(k)):
+        result += ("%s : %s\n" % (key, m[key]))
+    return result
+
+
 class ExpressionOpCounter(CombineMapper):
 
     def __init__(self, knl):
@@ -447,8 +448,7 @@ def get_op_poly(knl):
         domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
         op_poly = op_poly + ops*count(knl, domain)
-
-    return op_poly
+    return op_poly.dict
 
 
 def get_gmem_access_poly(knl):  # for now just counting subscripts
@@ -463,10 +463,12 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
              - The :class:`numpy.dtype` specifies the type of the data being
                accessed.
 
-             - The first string in the map key specifies the DRAM access type as
+             - The first string in the map key specifies the global memory
+               access type as
                *consecutive*, *nonconsecutive*, or *uniform*.
 
-             - The second string in the map key specifies the DRAM access type as a
+             - The second string in the map key specifies the global memory
+               access type as a
                *load*, or a *store*.
 
              - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
@@ -515,8 +517,7 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
             for key, val in six.iteritems(subs_assignee.dict)))
 
         subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain)
-
-    return subs_poly
+    return subs_poly.dict
 
 
 def get_DRAM_access_poly(knl):
@@ -526,6 +527,27 @@ def get_DRAM_access_poly(knl):
     return get_gmem_access_poly(knl)
 
 
+def sum_mem_access_to_bytes(m):
+    """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping
+
+    **{(** :class:`string` **,** :class:`string` **)**
+    **:** :class:`islpy.PwQPolynomial` **}**
+
+    i.e., aggregate the transfer numbers for all types into a single byte count.
+    """
+
+    result = {}
+    for (dtype, kind, direction), v in m.items():
+        new_key = (kind, direction)
+        bytes_transferred = int(dtype.itemsize) * v
+        if new_key in result:
+            result[new_key] += bytes_transferred
+        else:
+            result[new_key] = bytes_transferred
+
+    return result
+
+
 def get_barrier_poly(knl):
 
     """Count the number of barriers each thread encounters in a loopy kernel.
diff --git a/setup.py b/setup.py
index 1f1ea68769f71663dc719d274dc38b01b1f602ed..c8c660666ffeed18f92e533b6042cff63988a096 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ setup(name="loo.py",
           "pymbolic>=2015.2.1",
           "cgen>=2013.1.2",
           "islpy>=2014.2",
-          "six",
+          "six>=1.8.0",
           ],
 
       scripts=["bin/loopy"],
diff --git a/test/test_diff.py b/test/test_diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..a13e6d16129ceb997b55bc2990914795ed146bee
--- /dev/null
+++ b/test/test_diff.py
@@ -0,0 +1,107 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np  # noqa
+import numpy.linalg as la
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clrandom  # noqa
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+def test_diff(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+         """{ [i,j]: 0<=i,j<n }""",
+         """
+         <> a = 1/(1+sinh(x[i] + y[j])**2)
+         z[i] = sum(j, exp(a * x[j]))
+         """)
+
+    knl = lp.fix_parameters(knl, n=50)
+
+    from loopy.diff import diff_kernel
+    dknl, diff_map = diff_kernel(knl, "z", "x")
+    dknl = lp.remove_unused_arguments(dknl)
+
+    print(dknl)
+
+    n = 50
+    x = np.random.randn(n)
+    y = np.random.randn(n)
+
+    dx = np.random.randn(n)
+
+    fac = 1e-1
+    h1 = 1e-4
+    h2 = h1 * fac
+
+    evt, (z0,) = knl(queue, x=x, y=y)
+    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
+    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)
+
+    dknl = lp.set_options(dknl, write_cl=True)
+    evt, (df,) = dknl(queue, x=x, y=y)
+
+    diff1 = (z1-z0)
+    diff2 = (z2-z0)
+
+    diff1_predicted = df.dot(h1*dx)
+    diff2_predicted = df.dot(h2*dx)
+
+    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
+    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
+    print(err1, err2)
+
+    assert (err2 < err1 * fac * 1.1).all()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_statistics.py b/test/test_statistics.py
index a58ce6d582a8d03d622028156adff35c61009bc0..a504761193fe4acb7dff9a4a9535efb7a74fe2a9 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -49,9 +49,9 @@ def test_op_counter_basic():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 3*n*m*l
     assert f64 == n*m
     assert i32 == n*m*2
@@ -71,7 +71,7 @@ def test_op_counter_reduction():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 2*n*m*l
 
 
@@ -91,9 +91,9 @@ def test_op_counter_logic():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == n*m
     assert f64 == 3*n*m
     assert i32 == n*m
@@ -117,9 +117,9 @@ def test_op_counter_specialops():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 4*n*m*l
     assert f64 == 3*n*m
     assert i32 == n*m
@@ -146,12 +146,11 @@ def test_op_counter_bitwise():
     n = 512
     m = 256
     l = 128
-    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i64 = poly.dict[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l})  # noqa
-    f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l})
+    i64 = poly[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l})  # noqa
+    assert np.dtype(np.float64) not in poly
     assert i32 == n*m+3*n*m*l
     assert i64 == 6*n*m
-    assert f64 == 0
 
 
 def test_op_counter_triangular_domain():
@@ -203,19 +202,19 @@ def test_gmem_access_counter_basic():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
                    ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == n*m*l
@@ -236,12 +235,12 @@ def test_gmem_access_counter_reduction():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 2*n*m*l
 
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == n*l
@@ -263,16 +262,16 @@ def test_gmem_access_counter_logic():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64 == n*m
@@ -296,19 +295,19 @@ def test_gmem_access_counter_specialops():
     n = 512
     m = 256
     l = 128
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly.dict[
+    f32 = poly[
                     (np.dtype(np.float32), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f64 = poly.dict[
+    f64 = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f32 == n*m*l
@@ -336,12 +335,12 @@ def test_gmem_access_counter_bitwise():
     n = 512
     m = 256
     l = 128
-    i32 = poly.dict[
+    i32 = poly[
                     (np.dtype(np.int32), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly.dict[
+    i32 = poly[
                     (np.dtype(np.int32), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert i32 == n*m+n*m*l
@@ -367,19 +366,19 @@ def test_gmem_access_counter_mixed():
     n = 512
     m = 256
     l = 128
-    f64uniform = poly.dict[
+    f64uniform = poly[
                     (np.dtype(np.float64), 'uniform', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32nonconsec = poly.dict[
+    f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64uniform == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly.dict[
+    f64uniform = poly[
                     (np.dtype(np.float64), 'uniform', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32nonconsec = poly.dict[
+    f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64uniform == n*m
@@ -406,19 +405,19 @@ def test_gmem_access_counter_nonconsec():
     n = 512
     m = 256
     l = 128
-    f64nonconsec = poly.dict[
+    f64nonconsec = poly[
                     (np.dtype(np.float64), 'nonconsecutive', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32nonconsec = poly.dict[
+    f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly.dict[
+    f64nonconsec = poly[
                     (np.dtype(np.float64), 'nonconsecutive', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32nonconsec = poly.dict[
+    f32nonconsec = poly[
                     (np.dtype(np.float32), 'nonconsecutive', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64nonconsec == n*m
@@ -445,19 +444,19 @@ def test_gmem_access_counter_consec():
     m = 256
     l = 128
 
-    f64consec = poly.dict[
+    f64consec = poly[
                     (np.dtype(np.float64), 'consecutive', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32consec = poly.dict[
+    f32consec = poly[
                     (np.dtype(np.float32), 'consecutive', 'load')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly.dict[
+    f64consec = poly[
                     (np.dtype(np.float64), 'consecutive', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32consec = poly.dict[
+    f32consec = poly[
                     (np.dtype(np.float32), 'consecutive', 'store')
                     ].eval_with_dict({'n': n, 'm': m, 'l': l})
     assert f64consec == n*m
@@ -531,10 +530,10 @@ def test_all_counters_parallel_matmul():
     assert barrier_count == 0
 
     op_map = get_op_poly(knl)
-    f32ops = op_map.dict[
+    f32ops = op_map[
                         np.dtype(np.float32)
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    i32ops = op_map.dict[
+    i32ops = op_map[
                         np.dtype(np.int32)
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})
 
@@ -542,17 +541,17 @@ def test_all_counters_parallel_matmul():
     assert i32ops == n*m*l*4 + l*n*4
 
     subscript_map = get_gmem_access_poly(knl)
-    f32uncoal = subscript_map.dict[
+    f32uncoal = subscript_map[
                         (np.dtype(np.float32), 'nonconsecutive', 'load')
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})
-    f32coal = subscript_map.dict[
+    f32coal = subscript_map[
                         (np.dtype(np.float32), 'consecutive', 'load')
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})
 
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
 
-    f32coal = subscript_map.dict[
+    f32coal = subscript_map[
                         (np.dtype(np.float32), 'consecutive', 'store')
                         ].eval_with_dict({'n': n, 'm': m, 'l': l})